forked from drdaviddelorenzo/ibd
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhaploscore.py
171 lines (149 loc) · 7.13 KB
/
haploscore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""
haploscore.py
Copyright (C) 2014 23andMe, Inc.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
If you have questions, please contact 23andMe, Inc. at
1390 Shorebird Way, Mountain View, CA 94043.
Authors:
Eric Y. Durand <edurand@23andme.com>
Cory Y. McLean <cmclean@23andme.com>
"""
import numpy as np
import os.path
import sys
from haplotypes import PlinkMap, PlinkPED
from ibd import GermlineIBDSegment
from threshold import ThresholdMatrix, DEFAULT_THRESHOLD_MATRIX
def get_score_thresholder(thresholdfn, filterval):
"""
Return a Thresholds object that will filter IBD segments based on their
genetic length and HaploScore to the desired filtering threshold.
"""
if thresholdfn is None:
threshmat = DEFAULT_THRESHOLD_MATRIX
else:
threshmat = ThresholdMatrix.from_file(thresholdfn)
return threshmat.threshold(filterval)
def compute_haploscore(segment, snpdata, inddata, geno_penalty, switch_penalty):
"""
Return the HaploScore of the given IBD segment based on the haplotypes of
the individuals and the expected genotype and switch error penalties.
"""
start, end = [snpdata.get_snp(rsid).index
for rsid in [segment.rstart, segment.rend]]
individual1 = inddata.get_individual(segment.iid1)
individual2 = inddata.get_individual(segment.iid2)
haps1 = individual1.haplotypes[:, start:(end+1)]
haps2 = individual2.haplotypes[:, start:(end+1)]
def _get_current_genotype_penalty(i):
_penalty = []
for h1 in xrange(len(haps1)):
m1 = haps1[h1, i]
for h2 in xrange(len(haps2)):
m2 = haps2[h2, i]
_penalty.append((m1 != m2) * geno_penalty)
return np.array(_penalty, dtype=np.float)
prevscore = _get_current_genotype_penalty(0)
# Calculate number of snps as the field
# reported by GERMLINE is occasionally incorrect.
nsnp = end - start + 1
for i in xrange(1, nsnp):
nextswitch = prevscore + switch_penalty
nextscore = _get_current_genotype_penalty(i) + nextswitch.min(1)
prevscore = nextscore
return min(prevscore) / float(nsnp)
def main(args):
if args.verbose:
sys.stderr.write("Loading PLINK data ...\n")
unordered_snpdata = PlinkMap.from_file(args.map)
unordered_inddata = PlinkPED.from_file(args.ped)
genome_order = unordered_snpdata.genome_order_mask()
snpdata = unordered_snpdata.ordered(genome_order)
inddata = unordered_inddata.ordered(genome_order)
genotype_penalty = 1./args.genotype_error
switch_penalty = 1./args.switch_error
switch_pen_mat = np.array([[0,1,1,2],
[1,0,2,1],
[1,2,0,1],
[2,1,1,0]], dtype=np.float) * switch_penalty
minscore = float("infinity")
nofilter = args.filter is None
if not nofilter:
thresholder = get_score_thresholder(args.threshold_file, args.filter)
if args.verbose:
sys.stderr.write("Scoring segments ...\n")
kept = 0
with open(args.ibd, "r") as ifs, open(args.out, "w") as ofs:
for i, line in enumerate(ifs):
segment = GermlineIBDSegment.from_line(line)
score = compute_haploscore(segment,
snpdata,
inddata,
genotype_penalty,
switch_pen_mat)
minscore = min(minscore, score)
if (nofilter or
thresholder.passes_threshold(segment.genetic_length, score)):
ofs.write("%s\t%s\n" % (line.strip(), "%.3f" % score))
kept += 1
if args.verbose and (i+1) % 1000 == 0:
sys.stderr.write("Processed [%d] segments\r" % (i+1))
if args.verbose:
sys.stderr.write("Kept %d of %d total input segments.\n" % (kept, i+1))
# Warn users if input produces worse results than expected
if minscore > 5:
msg = ("### Warning: Minimum HaploScore seen in %d segments=%f. ###\n"
"### Ensure your PED file contains phased genotype data. ###\n")
sys.stderr.write(msg % (i+1, minscore))
if __name__ == '__main__':
import argparse
usage = "%(prog)s [options] match_file ped_file map_file out_file"
description = ("Compute haploscore on provided ibd segments.\n"
"The score is added as an additional column to the segment "
"file.")
parser = argparse.ArgumentParser(usage=usage, description=description)
parser.add_argument("ibd")
parser.add_argument("ped")
parser.add_argument("map")
parser.add_argument("out")
calcgrp = parser.add_argument_group("HaploScore calculation options")
calcgrp.add_argument("--genotype_error", type=float, default=0.0075,
help="Genotyping error rate (#errors per marker) "
"(default: %(default)s)")
calcgrp.add_argument("--switch_error", type=float, default=0.003,
help="Switch error rate (#errors per marker) "
"(default: %(default)s)")
filtgrp = parser.add_argument_group("Segment filtering options",
"(Only use if you want to "
"simultaneously calculate HaploScore "
"and filter out poorly-scoring "
"reported IBD segments)")
filtgrp.add_argument("--filter", type=float, default=None,
help="Filter out segments not passing this mean "
"overlap threshold (default: all segments kept)")
filtgrp.add_argument("--threshold_file", default=None,
help="Use this file of HaploScore thresholds "
"(default: use thresholds from original "
"HaploScore publication)")
parser.add_argument("-v", "--verbose", default=False, action="store_true")
args = parser.parse_args()
# Validate arguments
for argname, val in [('Genotype error', args.genotype_error),
('Switch error', args.switch_error),
('Filter threshold', args.filter)]:
if val is not None and not (0 < val <= 1):
raise ValueError("%s must be in range (0, 1]: %f" % (argname, val))
for fn in [args.ibd, args.ped, args.map]:
if not os.path.isfile(fn):
raise ValueError("%s does not exist." % fn)
if (args.threshold_file is not None and
not os.path.isfile(args.threshold_file)):
raise ValueError("%s does not exist." % args.threshold_file)
main(args)