1
- #! /usr/bin/env python
1
+ #! /usr/bin/env python3
2
2
3
3
import matplotlib
4
4
# matplotlib.use('Agg')
14
14
from itertools import islice
15
15
from sklearn .metrics import r2_score
16
16
from scipy .optimize import curve_fit
17
+ import multiprocessing
17
18
18
19
#import uno_data as ud
19
20
@@ -120,83 +121,6 @@ def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS):
120
121
return popt , pcov
121
122
122
123
123
- # def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
124
- # if save:
125
- # font = {'family' : 'normal',
126
- # # 'weight' : 'bold',
127
- # 'size' : 14}
128
- # matplotlib.rc('font', **font)
129
- # plt.figure(figsize=(12, 6))
130
-
131
- # print(df_exp)
132
- # xdata = df_exp.DOSE.astype(float)
133
- # ydata = df_exp.GROWTH.astype(float)
134
- # # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
135
-
136
- # # print(xdata)
137
- # # print(ydata)
138
-
139
- # popt, pcov = response_curve_fit(xdata, ydata)
140
- # metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
141
-
142
- # if popt is None:
143
- # return metrics
144
-
145
- # dmin = dmin or xdata.min()
146
- # dmax = dmax or xdata.max()
147
- # xx = np.linspace(dmin, dmax, 100)
148
- # yy = response_curve(xx, *popt)
149
-
150
- # plt.xlim(dmax, dmin)
151
- # plt.ylim(0, np.max([105, np.max(yy)]))
152
- # plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
153
- # plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
154
- # plt.xlabel('Dose (-log10(M))')
155
- # plt.ylabel('Growth%')
156
- # plt.title(title)
157
- # plt.tight_layout()
158
- # plt.legend()
159
- # if save:
160
- # plt.savefig('exp.png', dpi=360)
161
- # plt.close()
162
- # else:
163
- # plt.show()
164
-
165
- # return metrics.to_frame(name='metrics').T
166
-
167
-
168
- def fit_response (df_all , cell , drug , source , study = None , save = False ):
169
- # cell_ids = ud.cell_name_to_ids(cell) or [cell]
170
- # drug_ids = ud.drug_name_to_ids(drug) or [drug]
171
-
172
- #df_exp = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
173
- df_exp = df_all [(df_all .improve_sample_id == cell ) & (df_all .Drug == drug )].copy ()
174
- df_exp .GROWTH = (df_exp .GROWTH / 2 + 0.5 )
175
- df_exp = df_exp [df_exp .SOURCE == source ]
176
-
177
- title = f'{ cell } treated with { drug } in { source } '
178
-
179
- studies = df_exp .STUDY .unique ()
180
- if len (studies ) > 1 :
181
- study = studies [study ] if type (study ) == int else study or studies [0 ]
182
- title += f' study { study } '
183
- df_exp = df_exp [df_exp .STUDY == study ]
184
-
185
- return fit_exp (df_exp , title , save = save )
186
-
187
-
188
- def show_dose_distribution (df_all ):
189
- sources = df_all .SOURCE .unique ()
190
- qs = [0 , 0.02 , 0.05 , 0.1 , 0.2 , 0.5 , 0.8 , 0.9 , 0.95 , 0.98 , 1 ]
191
- series = []
192
- for src in sources :
193
- s = df_all [df_all .SOURCE == src ].DOSE .quantile (qs )
194
- s .name = src
195
- series .append (s )
196
- df_dose = pd .concat (series , axis = 1 )
197
- return df_dose
198
-
199
-
200
124
def process_df (df , fname , sep = '\t ' , ngroups = None ):
201
125
# df = df1.copy()
202
126
i = 0
@@ -225,182 +149,32 @@ def process_df(df, fname, sep='\t', ngroups=None):
225
149
f .close ()
226
150
227
151
152
+ def process_single_drug (name_group_tuple ):
153
+ name , group = name_group_tuple
154
+ xdata = group .DOSE .astype (float )
155
+ ydata = group .GROWTH .clip (lower = 0 , upper = 1.0 ).astype (float )
156
+ popt , pcov = response_curve_fit (xdata , ydata )
157
+ metrics = compute_fit_metrics (xdata , ydata , popt , pcov )
158
+ return name , metrics
159
+
228
160
def process_df_part (df , fname , beataml = False , sep = '\t ' , start = 0 , count = None ):
229
- header = None
230
161
cols = ['source' , 'improve_sample_id' , 'Drug' , 'study' ,'time' ,'time_unit' ]
231
- if beataml == True :
232
- cols = ['SOURCE' , 'CELL' , 'DRUG' , 'STUDY' ]
233
162
groups = df .groupby (cols )
234
- # count = count or (len(groups) - start)
235
163
count = count or (4484081 - start )
236
164
groups = islice (groups , start , start + count )
237
- f = open (f'{ fname } .{ start } ' , 'w' )
238
- for name , group in tqdm (groups ):
239
- #print(name)
240
- name = [str (n ) for n in name ]
241
- xdata = group .DOSE .astype (float )
242
- # ydata = group.GROWTH
243
- # ydata.clip(lower=0, upper=1.0).astype(float)
244
- ydata = group .GROWTH .clip (lower = 0 , upper = 1.0 ).astype (float )
245
- # print(ydata)
246
- #add in multithreading here:
247
- popt , pcov = response_curve_fit (xdata , ydata )
248
- metrics = compute_fit_metrics (xdata , ydata , popt , pcov )
249
- if start == 0 and header is None :
250
- header = cols + metrics .index .tolist ()
251
- print (sep .join (header ), file = f )
252
- print (sep .join (name ), end = sep , file = f )
253
- print (sep .join ([f'{ x :.4g} ' for x in metrics ]), file = f )
254
- f .close ()
255
-
256
-
257
-
258
-
259
-
260
- def process_chem_partner_data ():
261
- df_cp = pd .read_csv ('curve/ChemPartner_dose_response' , sep = '\t ' )
262
- df_cp = df_cp [df_cp .DRUG2 .isnull () & df_cp .DOSE2 .isnull ()].drop (['DRUG2' , 'DOSE2' ], axis = 1 )
263
- df_cp = df_cp .rename (columns = {'DRUG1' :'DRUG' , 'DOSE1' :'DOSE' })
264
- df_cp .DOSE = - df_cp .DOSE
265
- # df_cp.GROWTH = df_cp.GROWTH/100
266
- df_cp .GROWTH = df_cp .GROWTH / 200 + 0.5
267
-
268
- # process_df(df_cp, 'curve/ChemPartner_single_response_agg', ngroups=10)
269
-
270
- process_df (df_cp , 'curve/ChemPartner_single_response_agg.new' )
271
-
272
-
273
-
274
- def fit_exp (df_exp , title = None , dmin = None , dmax = None , save = False ):
275
- if save :
276
- font = {'family' : 'normal' ,
277
- # 'weight' : 'bold',
278
- 'size' : 14 }
279
- matplotlib .rc ('font' , ** font )
280
- plt .figure (figsize = (12 , 6 ))
281
-
282
- print (df_exp )
283
- xdata = df_exp .DOSE .astype (float )
284
- ydata = df_exp .GROWTH .astype (float )
285
- # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
286
-
287
- # print(xdata)
288
- # print(ydata)
289
-
290
- popt , pcov = response_curve_fit (xdata , ydata )
291
- metrics = compute_fit_metrics (xdata , ydata , popt , pcov )
292
-
293
- if popt is None :
294
- return metrics
295
-
296
- dmin = dmin or xdata .min ()
297
- dmax = dmax or xdata .max ()
298
- xx = np .linspace (dmin , dmax , 100 )
299
- yy = response_curve (xx , * popt )
300
-
301
- plt .xlim (dmax , dmin )
302
- plt .ylim (0 , np .max ([105 , np .max (yy )]))
303
- plt .plot (xx , yy * 100 , 'r-' , label = 'fit: Einf=%.3f, EC50=%.3f, HS=%.3f' % tuple (popt ))
304
- plt .plot (xdata , ydata .clip (lower = 0 , upper = 1.0 )* 100 , 'b*' , label = '' )
305
- plt .xlabel ('Dose (-log10(M))' )
306
- plt .ylabel ('Growth%' )
307
- plt .title (title )
308
- plt .tight_layout ()
309
- plt .legend ()
310
- if save :
311
- plt .savefig ('exp.png' , dpi = 360 )
312
- plt .close ()
313
- else :
314
- plt .show ()
315
-
316
- return metrics .to_frame (name = 'metrics' ).T
317
-
318
-
319
- def get_tableau20_colors ():
320
- # tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
321
- # (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
322
- # (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
323
- # (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
324
- # (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
325
- tableau20 = [(31 , 119 , 180 ), (174 , 199 , 232 ), (255 , 127 , 14 ), (255 , 187 , 120 ),
326
- (148 , 103 , 189 ), (44 , 160 , 44 ), (214 , 39 , 40 ), (255 , 152 , 150 ),
327
- (152 , 223 , 138 ), (197 , 176 , 213 ), (140 , 86 , 75 ), (196 , 156 , 148 ),
328
- (227 , 119 , 194 ), (247 , 182 , 210 ), (127 , 127 , 127 ), (199 , 199 , 199 ),
329
- (188 , 189 , 34 ), (219 , 219 , 141 ), (23 , 190 , 207 ), (158 , 218 , 229 )]
330
- # Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
331
- for i in range (len (tableau20 )):
332
- r , g , b = tableau20 [i ]
333
- tableau20 [i ] = (r / 255. , g / 255. , b / 255. )
334
- return tableau20
335
-
336
-
337
- def plot_curves (df_all , cell = 'LOXIMVI' , drug = 'paclitaxel' , study = None , max_reps = 2 , dmin = 4 , dmax = 10 , out = None ):
338
- # cell_ids = ud.cell_name_to_ids(cell)
339
- # drug_ids = ud.drug_name_to_ids(drug)
340
-
341
- #df_exps = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
342
- df_exps = df_all [(df_all ['CELL' ]== cell ) & (df_all ['Drug' ]== drug )].copy ()
343
- df_exps .GROWTH = (df_exps .GROWTH / 2 + 0.5 )
344
-
345
- title = f'{ cell } treated with { drug } '
346
- out = out or f'{ cell } -{ drug } '
347
-
348
- # font = {'family': 'normal', 'size': 14}
349
- font = {'size' : 14 }
350
- matplotlib .rc ('font' , ** font )
351
- plt .figure (figsize = (12 , 6 ))
352
- colors = get_tableau20_colors ()
353
-
354
- dmin = dmin or df_exps .DOSE .min ()
355
- dmax = dmax or df_exps .DOSE .max ()
356
- xx = np .linspace (dmin - 0.1 , dmax + 0.1 , 100 )
357
-
358
- plt .xlim (dmax + 0.1 , dmin - 0.1 )
359
- plt .ylim (0 , 105 )
360
- plt .xlabel ('Dose (-log10(M))' )
361
- plt .ylabel ('Growth%' )
362
- plt .title (title )
363
-
364
- df_metrics = None
365
- rank = 0
366
- order = ['NCI60' , 'CTRP' , 'GDSC' , 'CCLE' , 'gCSI' ]
367
- sources = df_exps .SOURCE .unique ().tolist () if study is None else study
368
- sources = sorted (sources , key = lambda x :order .index (x ))
369
-
370
- for source in sources :
371
- studies = df_exps [df_exps .SOURCE == source ].STUDY .unique ()
372
- for i , study in enumerate (studies [:max_reps ]):
373
- df_exp = df_exps [(df_exps .SOURCE == source ) & (df_exps .STUDY == study )]
374
- xdata = df_exp .DOSE .astype (float )
375
- ydata = df_exp .GROWTH .astype (float )
376
- # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
377
- popt , pcov = response_curve_fit (xdata , ydata )
378
- metrics = compute_fit_metrics (xdata , ydata , popt , pcov )
379
- if popt is None :
380
- continue
381
- color = colors [rank ]
382
- rank = (rank + 1 ) % 20
383
- yy = response_curve (xx , * popt )
384
- label = source
385
- if len (studies ) > 1 :
386
- label += f' rep { i + 1 } '
387
- plt .plot (xx , yy * 100 , '-' , color = color , label = label )
388
- plt .plot (xdata , ydata .clip (lower = 0 , upper = 1.0 )* 100 , '.' , color = color , label = '' )
389
- if df_metrics is None :
390
- df_metrics = metrics .to_frame (name = label ).T
391
- else :
392
- df_metrics = pd .concat ([df_metrics , metrics .to_frame (name = label ).T ])
393
-
394
- plt .tight_layout ()
395
- plt .legend ()
396
- plt .savefig (f'{ out } .png' , dpi = 360 )
397
- plt .close ()
398
-
399
- df_metrics .index .name = 'source'
400
- df_metrics .to_csv (f'{ out } .csv' , float_format = '%.5g' )
401
- print (f'Saved { out } .png and { out } .csv.' )
165
+
166
+ with multiprocessing .Pool (processes = multiprocessing .cpu_count ()) as pool :
167
+ results = pool .map (process_single_drug , groups )
402
168
403
- return df_metrics
169
+ with open (f'{ fname } .{ start } ' , 'w' ) as f :
170
+ header = None
171
+ for result in results :
172
+ name , metrics = result
173
+ if header is None :
174
+ header = cols + metrics .index .tolist ()
175
+ print (sep .join (header ), file = f )
176
+ print (sep .join (str (n ) for n in name ), end = sep , file = f )
177
+ print (sep .join (f'{ x :.4g} ' for x in metrics ), file = f )
404
178
405
179
406
180
def main ():
@@ -414,7 +188,6 @@ def main():
414
188
df_all = pd .read_table (args .input )
415
189
#drop nas
416
190
df_all = df_all .dropna ()
417
- #print(df_all)
418
191
##pharmacoGX data is micromolar, we need log transformed molar
419
192
df_all .DOSE = np .log10 (df_all .DOSE * 1000000 )
420
193
##need data to be between 0 and 1, not 0 and 100
0 commit comments