-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcfp_score.py
executable file
·374 lines (270 loc) · 12.8 KB
/
cfp_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
''' Module for computating clade fitness proxy (CFP) scores '''
import sys
import numpy as np
import numpy.linalg as linalg
from sklearn.cluster import MeanShift, estimate_bandwidth, KMeans
from sklearn import mixture
from sklearn.metrics import silhouette_score
import params as p
###############################################################################
def haplotype_CFP_scores(hap_mat, freqs, subset_col=None, present=None, norm=2):
''' Computes CFP scores for all haplotypes (rows) in given matrix
Returns np.array of scores (possibly containing np.nan)
'hap_mat' : haplotype matrix, numeric (0,1)
'freqs' : np.array of frequencies of corresponding matrix columns
'subset_col': (optional) column based on which to subset results
'present' : (optional) boolean, indicates if column '1's should be kept '0''s
'''
# initialize CFP scores vector
scores = np.zeros(len(hap_mat))
if p.cfp_exclude_fixed:
# exclude fixed alleles, converting 1.0 to 0.0
freqs_mod = np.copy(freqs)
freqs_mod[freqs_mod == 1.0] = 0.0
else:
freqs_mod = freqs
# de normalize, REMOVE??
freqs_mod = freqs_mod * len(hap_mat)
# compute CFP-score for each haplotype
for (i, hap) in enumerate(hap_mat):
scores[i] = np.nan # exclude by defualt
if subset_col is None:
# no subset, include
scores[i] = linalg.norm(np.multiply(hap, freqs_mod), norm)
else:
# subset, decide inclusion
if present and hap_mat[i, subset_col] == 1.0:
# include carrier i
scores[i] = linalg.norm(np.multiply(hap, freqs_mod), norm)
elif (not present) and hap_mat[i, subset_col] == 0.0:
# include non-carrier i
scores[i] = linalg.norm(np.multiply(hap, freqs_mod), norm)
return scores
###############################################################################
def haplotype_CFP_scores_ms(hap_mat, freqs, cfp_norm):
''' Compute un-normalized CFP scores for all haplotypes (rows) in given matrix.
Assumes no fixed sites in input (simulated by e.g. 'ms').
'''
# get sample size & init CFP-scores vector
n = len(hap_mat)
scores = np.zeros(n)
# No need to remove fixed sites from score (i.e., convert 1.0s to 0.0s)
# as this is intended strictly for use with ms output (as name implies)
# compute CFP score for each haplotype
for (i, hap) in enumerate(hap_mat):
# normalized freq., normx
# scores[i] = linalg.norm( np.multiply(hap, freqs), cfp_norm )
# un-normalized freq., norm
scores[i] = linalg.norm(np.multiply(hap, freqs*float(n)), cfp_norm)
# un-normalized freq., power
# scores[i] = np.sum(np.multiply(hap, freqs*float(n))**cfp_norm)
return scores
###############################################################################
def cluster_CFP_scores_GMM(scores):
# prep
scores = np.array(scores)
X = np.reshape(np.array(scores), (-1, 1))
# fit model
g = mixture.GMM(n_components=2)
g.fit(X)
# predict labels
labels = g.predict(X)
high_gaussian_ind = np.argmax(g.means_[:, 0])
low_gaussian_ind = np.argmin(g.means_[:, 0])
pred_labels = np.array(labels == high_gaussian_ind, dtype=float)
# class posteriors
probs = g.predict_proba(X)
p_carr = probs[:, high_gaussian_ind]
p_ncarr = probs[:, low_gaussian_ind]
return pred_labels, p_carr, p_ncarr, g.bic(X), g.aic(X), silhouette_score(X, pred_labels)
###############################################################################
def cluster_CFP_scores(scores):
''' Perform clustering of CFP scores, and infer carrier/non-carrier status.
Clustering algorithm is one of 'Mean Shift' or '2 Means'.
Returns: inferred binary labels, 0=non-carrier,1=carrier.
'''
algo = '2-means'
# algo = 'mean-shift'
# prep
scores = np.array(scores)
X = np.reshape(np.array(scores), (-1, 1))
# cluster
if algo == 'mean-shift':
bandwidth = estimate_bandwidth(X, quantile=0.3, n_samples=None)
est = MeanShift(bandwidth=bandwidth, bin_seeding=True)
est.fit(X)
elif(algo == '2-means'):
est = KMeans(n_clusters=2, init='k-means++', n_init=10)
est.fit(X)
# extract clustering info
labels, clust_centers = est.labels_, est.cluster_centers_
labels_unique, n_clusters = np.unique(labels), len(np.unique(labels))
pred_labels = np.zeros(len(scores))
if n_clusters > 2:
# reduce to 2 clusters accumulating points from the highest CFP cluster
# stop when >50% of data & report accumulated points as carriers
for center in sorted(clust_centers, reverse=True):
to_add = np.array(labels == np.where(clust_centers == center)[0][0], dtype=float)
pred_labels += to_add
if np.sum(pred_labels) > 0.5*len(scores):
break
else:
# assert exactly 2 clusters
if n_clusters == 1:
print "RE DOING WITH 2 MEANS!!"
# force second cluster with 2-means
est2 = KMeans(n_clusters=2, init='k-means++', n_init=10)
est2.fit(X)
labels, clust_centers = est2.labels_, est2.cluster_centers_
labels_unique, n_clusters = np.unique(labels), len(np.unique(labels))
# report top cluster
pred_labels = np.array(labels == np.argmax(clust_centers), dtype=float)
# compute silhouette score
sil_score = silhouette_score(X, pred_labels)
# return predicted labels & silhouette score
return pred_labels, sil_score
###############################################################################
def cluster_CFP_scores_report(scores, carrier_status):
'''
1. cluster CFP scores using the 'cluster_CFP_scores' function
2. generate report using the 'clustering_report' function
'''
# call clustering function
# predicted_status, clustering_score = cluster_CFP_scores(scores)
predicted_status, clustering_score = cluster_CFP_scores_GMM(scores)
# return report function
return clustering_report(predicted_status, carrier_status)
###############################################################################
def clustering_report(predicted_status, carrier_status):
''' return a report including: accuracy, tpr, fpr
'''
# sanity check
assert len(predicted_status) == len(carrier_status), "Error: unexpected # of cluster labels\n"
# prediction stats
true_pos, false_pos, true_neg, false_neg = .0, .0, .0, .0
for i, true_state in enumerate(carrier_status):
pred_state = predicted_status[i]
if pred_state and true_state:
true_pos += 1.0
elif (not pred_state) and true_state:
false_neg += 1.0
elif pred_state and (not true_state):
false_pos += 1.0
elif (not pred_state) and (not true_state):
true_neg += 1.0
# summarize
if true_pos + false_neg > 0:
tpr = true_pos / (true_pos + false_neg)
else:
tpr = 1.0
if false_pos + true_neg > 0:
fpr = false_pos / (false_pos + true_neg)
else:
fpr = 0.0
acc = (true_pos + true_neg) / len(carrier_status)
b_acc = 0.5*(true_pos/(true_pos + false_neg) + true_neg/(false_pos+true_neg))
return acc, b_acc, tpr, fpr
###############################################################################
def mutation_CFP_scores(hap_mat, col_freqs):
''' Convenience method to compute mutation CFP scores:
1. Computes haplotype_CFP_scores().
2. For each mutation (and its carriers), computes mut_score_from_carrier_scores().
'''
# initialize mutation scores
mut_cfp_scores = np.zeros(len(col_freqs))
# compute CFP score for individuals (rows)
hap_cfp_scores = haplotype_CFP_scores(hap_mat, col_freqs)
# compute mutation (column) scores
for (i, f_i) in enumerate(col_freqs):
carrier_scores = hap_cfp_scores[hap_mat[:, i] > 0]
mut_cfp_scores[i] = mut_score_from_carrier_scores(carrier_scores)
return mut_cfp_scores
###############################################################################
def mut_score_from_carrier_scores(carrier_scores):
''' Computes CFP score of mutation given CFP scores of its carriers '''
if len(carrier_scores) == 0:
# print "[cfp_score::mut_score_from_carrier_scores]::warning: 0 carriers"
return np.nan
# return np.median(carrier_scores)
return carrier_scores.mean()
###############################################################################
def score_mutations_and_haplotypes(hap_mat, h_score_func, m_score_func, k):
''' Compute k-scores of mutations & individuals in given haplotype matrix.
All scores at iteration k are computed using scores from iteration k-1.
'''
mut_scores_dict, hap_scores_dict = {}, {}
hap_scores = np.ones(hap_mat.shape[0])
for it in range(k):
# compuate mutation scores
mut_scores = []
for j in range(hap_mat.shape[1]):
mut_scores.append(m_score_func(hap_mat[:, j], hap_scores))
mut_scores = np.array(mut_scores)
mut_scores_dict[it] = mut_scores
# compute haplotype scores
hap_scores = []
for i in range(hap_mat.shape[0]):
hap_scores.append(h_score_func(hap_mat[i, :], mut_scores))
hap_scores = np.array(hap_scores)
hap_scores_dict[it] = hap_scores
return mut_scores_dict, hap_scores_dict
###############################################################################
def h_score(hap_row, mut_scores, n_order=1):
''' Computes the score of a haplotype given hap_row (its row in the haplotype matrix)
and mut_scores (the scores of all mutations in the haplotype matrix).
'''
# vector containing for each mutation in the sample
# its score if carried by the haplotype, and 0 othewise.
carried_mut_scores = np.multiply(hap_row, mut_scores)
# haplotype score is the (normalized) n_order norm of carried mutation scores
# n_order=1 normalized sum, =2 normzlized 2-norm, etc.
hap_score = linalg.norm(carried_mut_scores, n_order)
# hap_score = 1.0 should never happen
# since no haplotype carries all mutations
return hap_score
###############################################################################
def m_score(mut_col, hap_scores, n_order=1):
''' Computes the score of a mutation given mut_col (its column in the haplotype matrix)
and hap_scores (the scores of all haplotypes in the matrix).
when k=0 all haplotype scores are equal, giving the frequency.
when k>0 haplotype scores differ, so a mutation of freq. f carried mostly by high-scoring
haplotypes scores higher than a mutation of freq. f carried mostly by low-scoring haplotypes
'''
# vector containing for each haplotype in the sample its
# score if it is a carrier of the mutation and 0 otherwise
carrier_hap_scores = np.multiply(mut_col, hap_scores)
# mutation score is the (normalized) norm of carrier haplotype scores
# n_order=1 normalized sum, =2 normzlized 2-norm, etc.
mut_score = linalg.norm(carrier_hap_scores, n_order)
# mut_score = 1.0 can happen if unfolded matrix
if not p.fold_freq:
if mut_score == 1.0 or mut_score == linalg.norm(hap_scores, n_order):
mut_score = 0.0 # TODO think about this...
return mut_score
###############################################################################
# DEPRECATED
###############################################################################
def CFP_scores_freq_bin(hap_mat, col_freqs, min_f, max_f, subset_col=None, present=None):
''' Computes CFP scores, aggregated over carriers of mutations of requested frequencies.
'hap_mat' : np.array of haplotype vectors (float)
'min_f' : include mutations with greater frequency.
'max_f' : include mutations with smaller or equal frequency.
'subset_col' : (optional) integer, index of a mutation (column) to subset scores on.
'present' : (optional) boolean, if true carriers of the mutation included,
otherwise excluded.
'''
agg_scores = [] # aggregate of scores for frequency category
# sanity check
if hap_mat.size == 0:
return np.array([])
# cmopute CFP score for each individual (row) in matrix
cfp_scores = CFP_scores(hap_mat, col_freqs, subset_col, present)
# aggregate scores of appropriate carriers
for i, f in enumerate(col_freqs):
if f > min_f and f <= max_f:
# mutation in freq. bin, find carrier indices
carriers = np.nonzero(hap_mat[:, i])[0]
# get scores of those individuals (may contain np.nan)
agg_scores.extend(cfp_scores[carriers])
# return score aggregation
return np.array(agg_scores)