Skip to content

Commit eb9c314

Browse files
committed
Multithreaded
1 parent 1e1541f commit eb9c314

File tree

2 files changed

+23
-257
lines changed

2 files changed

+23
-257
lines changed

build/beatAML/GetBeatAML.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -674,22 +674,16 @@ def generate_drug_list(drug_map_path,drug_path):
674674
updated_raw_drug_file = "beatAML_drug_raw.tsv"
675675
generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
676676
d_df = pd.read_csv(updated_raw_drug_file,sep='\t')
677-
678677
d_res = d_df.rename(columns={"CELL":"other_id","AUC":"fit_auc",'DRUG':'chem_name'})
679-
680-
# imp_samps = pd.read_csv(improve_map_file)
681678
d_res = d_res.merge(imp_samp_map, on='other_id')
682-
#print(d_res)
683-
#print(imp_drug_map)
684679
d_res = d_res.merge(imp_drug_map,on='chem_name')
685680
d_res = d_res.rename(columns = {'improve_drug_id':'Drug'}) ## stupid but we have to change aks later
686681
d_res.to_csv(updated_raw_drug_file,sep='\t')
687682

688683
print("Starting Curve Fitting Algorithm")
689-
##WHERE DO I GET THE CURVE DATA?
690684
# Run Curve fitting algorithm from scripts directory.
691685
# Note the file path to fit_curve.py may need to be changed.
692-
command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
686+
command = ['python3', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv', '--beataml']
693687
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
694688
if result.returncode == 0:
695689
print("Curve Fitting executed successfully!")
@@ -698,7 +692,6 @@ def generate_drug_list(drug_map_path,drug_path):
698692
print("Out:", result.stdout)
699693
print("Error:", result.stderr)
700694
print("Starting Experiment Data")
701-
#exp_res = map_exp_to_improve(d_res,improve_map_file)
702695
drug_path = "beatAML_drug_processed.tsv.0"
703696
exp_res = map_exp_to_improve(drug_path)
704697
exp_res.to_csv("/tmp/beataml_experiments.tsv", index=False, sep='\t')

build/utils/fit_curve.py

Lines changed: 22 additions & 249 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#! /usr/bin/env python
1+
#! /usr/bin/env python3
22

33
import matplotlib
44
# matplotlib.use('Agg')
@@ -14,6 +14,7 @@
1414
from itertools import islice
1515
from sklearn.metrics import r2_score
1616
from scipy.optimize import curve_fit
17+
import multiprocessing
1718

1819
#import uno_data as ud
1920

@@ -120,83 +121,6 @@ def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS):
120121
return popt, pcov
121122

122123

123-
# def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
124-
# if save:
125-
# font = {'family' : 'normal',
126-
# # 'weight' : 'bold',
127-
# 'size' : 14}
128-
# matplotlib.rc('font', **font)
129-
# plt.figure(figsize=(12, 6))
130-
131-
# print(df_exp)
132-
# xdata = df_exp.DOSE.astype(float)
133-
# ydata = df_exp.GROWTH.astype(float)
134-
# # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
135-
136-
# # print(xdata)
137-
# # print(ydata)
138-
139-
# popt, pcov = response_curve_fit(xdata, ydata)
140-
# metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
141-
142-
# if popt is None:
143-
# return metrics
144-
145-
# dmin = dmin or xdata.min()
146-
# dmax = dmax or xdata.max()
147-
# xx = np.linspace(dmin, dmax, 100)
148-
# yy = response_curve(xx, *popt)
149-
150-
# plt.xlim(dmax, dmin)
151-
# plt.ylim(0, np.max([105, np.max(yy)]))
152-
# plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
153-
# plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
154-
# plt.xlabel('Dose (-log10(M))')
155-
# plt.ylabel('Growth%')
156-
# plt.title(title)
157-
# plt.tight_layout()
158-
# plt.legend()
159-
# if save:
160-
# plt.savefig('exp.png', dpi=360)
161-
# plt.close()
162-
# else:
163-
# plt.show()
164-
165-
# return metrics.to_frame(name='metrics').T
166-
167-
168-
def fit_response(df_all, cell, drug, source, study=None, save=False):
169-
# cell_ids = ud.cell_name_to_ids(cell) or [cell]
170-
# drug_ids = ud.drug_name_to_ids(drug) or [drug]
171-
172-
#df_exp = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
173-
df_exp = df_all[(df_all.improve_sample_id == cell) & (df_all.Drug == drug)].copy()
174-
df_exp.GROWTH = (df_exp.GROWTH/2 + 0.5)
175-
df_exp = df_exp[df_exp.SOURCE == source]
176-
177-
title = f'{cell} treated with {drug} in {source}'
178-
179-
studies = df_exp.STUDY.unique()
180-
if len(studies) > 1:
181-
study = studies[study] if type(study) == int else study or studies[0]
182-
title += f' study {study}'
183-
df_exp = df_exp[df_exp.STUDY == study]
184-
185-
return fit_exp(df_exp, title, save=save)
186-
187-
188-
def show_dose_distribution(df_all):
189-
sources = df_all.SOURCE.unique()
190-
qs = [0, 0.02, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 0.98, 1]
191-
series = []
192-
for src in sources:
193-
s = df_all[df_all.SOURCE == src].DOSE.quantile(qs)
194-
s.name = src
195-
series.append(s)
196-
df_dose = pd.concat(series, axis=1)
197-
return df_dose
198-
199-
200124
def process_df(df, fname, sep='\t', ngroups=None):
201125
# df = df1.copy()
202126
i = 0
@@ -225,182 +149,32 @@ def process_df(df, fname, sep='\t', ngroups=None):
225149
f.close()
226150

227151

152+
def process_single_drug(name_group_tuple):
153+
name, group = name_group_tuple
154+
xdata = group.DOSE.astype(float)
155+
ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
156+
popt, pcov = response_curve_fit(xdata, ydata)
157+
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
158+
return name, metrics
159+
228160
def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
229-
header = None
230161
cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
231-
if beataml == True:
232-
cols = ['SOURCE', 'CELL', 'DRUG', 'STUDY']
233162
groups = df.groupby(cols)
234-
# count = count or (len(groups) - start)
235163
count = count or (4484081 - start)
236164
groups = islice(groups, start, start+count)
237-
f = open(f'{fname}.{start}', 'w')
238-
for name, group in tqdm(groups):
239-
#print(name)
240-
name = [str(n) for n in name]
241-
xdata = group.DOSE.astype(float)
242-
# ydata = group.GROWTH
243-
# ydata.clip(lower=0, upper=1.0).astype(float)
244-
ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
245-
# print(ydata)
246-
#add in multithreading here:
247-
popt, pcov = response_curve_fit(xdata, ydata)
248-
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
249-
if start == 0 and header is None:
250-
header = cols + metrics.index.tolist()
251-
print(sep.join(header), file=f)
252-
print(sep.join(name), end=sep, file=f)
253-
print(sep.join([f'{x:.4g}' for x in metrics]), file=f)
254-
f.close()
255-
256-
257-
258-
259-
260-
def process_chem_partner_data():
261-
df_cp = pd.read_csv('curve/ChemPartner_dose_response', sep='\t')
262-
df_cp = df_cp[df_cp.DRUG2.isnull() & df_cp.DOSE2.isnull()].drop(['DRUG2', 'DOSE2'], axis=1)
263-
df_cp = df_cp.rename(columns={'DRUG1':'DRUG', 'DOSE1':'DOSE'})
264-
df_cp.DOSE = -df_cp.DOSE
265-
# df_cp.GROWTH = df_cp.GROWTH/100
266-
df_cp.GROWTH = df_cp.GROWTH/200 + 0.5
267-
268-
# process_df(df_cp, 'curve/ChemPartner_single_response_agg', ngroups=10)
269-
270-
process_df(df_cp, 'curve/ChemPartner_single_response_agg.new')
271-
272-
273-
274-
def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
275-
if save:
276-
font = {'family' : 'normal',
277-
# 'weight' : 'bold',
278-
'size' : 14}
279-
matplotlib.rc('font', **font)
280-
plt.figure(figsize=(12, 6))
281-
282-
print(df_exp)
283-
xdata = df_exp.DOSE.astype(float)
284-
ydata = df_exp.GROWTH.astype(float)
285-
# ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
286-
287-
# print(xdata)
288-
# print(ydata)
289-
290-
popt, pcov = response_curve_fit(xdata, ydata)
291-
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
292-
293-
if popt is None:
294-
return metrics
295-
296-
dmin = dmin or xdata.min()
297-
dmax = dmax or xdata.max()
298-
xx = np.linspace(dmin, dmax, 100)
299-
yy = response_curve(xx, *popt)
300-
301-
plt.xlim(dmax, dmin)
302-
plt.ylim(0, np.max([105, np.max(yy)]))
303-
plt.plot(xx, yy*100, 'r-', label='fit: Einf=%.3f, EC50=%.3f, HS=%.3f' % tuple(popt))
304-
plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
305-
plt.xlabel('Dose (-log10(M))')
306-
plt.ylabel('Growth%')
307-
plt.title(title)
308-
plt.tight_layout()
309-
plt.legend()
310-
if save:
311-
plt.savefig('exp.png', dpi=360)
312-
plt.close()
313-
else:
314-
plt.show()
315-
316-
return metrics.to_frame(name='metrics').T
317-
318-
319-
def get_tableau20_colors():
320-
# tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
321-
# (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
322-
# (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
323-
# (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
324-
# (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
325-
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
326-
(148, 103, 189), (44, 160, 44), (214, 39, 40), (255, 152, 150),
327-
(152, 223, 138), (197, 176, 213), (140, 86, 75), (196, 156, 148),
328-
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
329-
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
330-
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
331-
for i in range(len(tableau20)):
332-
r, g, b = tableau20[i]
333-
tableau20[i] = (r / 255., g / 255., b / 255.)
334-
return tableau20
335-
336-
337-
def plot_curves(df_all, cell='LOXIMVI', drug='paclitaxel', study=None, max_reps=2, dmin=4, dmax=10, out=None):
338-
# cell_ids = ud.cell_name_to_ids(cell)
339-
# drug_ids = ud.drug_name_to_ids(drug)
340-
341-
#df_exps = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
342-
df_exps = df_all[(df_all['CELL']==cell) & (df_all['Drug']==drug)].copy()
343-
df_exps.GROWTH = (df_exps.GROWTH/2 + 0.5)
344-
345-
title = f'{cell} treated with {drug}'
346-
out = out or f'{cell}-{drug}'
347-
348-
# font = {'family': 'normal', 'size': 14}
349-
font = {'size': 14}
350-
matplotlib.rc('font', **font)
351-
plt.figure(figsize=(12, 6))
352-
colors = get_tableau20_colors()
353-
354-
dmin = dmin or df_exps.DOSE.min()
355-
dmax = dmax or df_exps.DOSE.max()
356-
xx = np.linspace(dmin-0.1, dmax+0.1, 100)
357-
358-
plt.xlim(dmax+0.1, dmin-0.1)
359-
plt.ylim(0, 105)
360-
plt.xlabel('Dose (-log10(M))')
361-
plt.ylabel('Growth%')
362-
plt.title(title)
363-
364-
df_metrics = None
365-
rank = 0
366-
order = ['NCI60', 'CTRP', 'GDSC', 'CCLE', 'gCSI']
367-
sources = df_exps.SOURCE.unique().tolist() if study is None else study
368-
sources = sorted(sources, key=lambda x:order.index(x))
369-
370-
for source in sources:
371-
studies = df_exps[df_exps.SOURCE == source].STUDY.unique()
372-
for i, study in enumerate(studies[:max_reps]):
373-
df_exp = df_exps[(df_exps.SOURCE == source) & (df_exps.STUDY == study)]
374-
xdata = df_exp.DOSE.astype(float)
375-
ydata = df_exp.GROWTH.astype(float)
376-
# ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
377-
popt, pcov = response_curve_fit(xdata, ydata)
378-
metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
379-
if popt is None:
380-
continue
381-
color = colors[rank]
382-
rank = (rank + 1) % 20
383-
yy = response_curve(xx, *popt)
384-
label = source
385-
if len(studies) > 1:
386-
label += f' rep {i+1}'
387-
plt.plot(xx, yy*100, '-', color=color, label=label)
388-
plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, '.', color=color, label='')
389-
if df_metrics is None:
390-
df_metrics = metrics.to_frame(name=label).T
391-
else:
392-
df_metrics = pd.concat([df_metrics, metrics.to_frame(name=label).T])
393-
394-
plt.tight_layout()
395-
plt.legend()
396-
plt.savefig(f'{out}.png', dpi=360)
397-
plt.close()
398-
399-
df_metrics.index.name = 'source'
400-
df_metrics.to_csv(f'{out}.csv', float_format='%.5g')
401-
print(f'Saved {out}.png and {out}.csv.')
165+
166+
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
167+
results = pool.map(process_single_drug, groups)
402168

403-
return df_metrics
169+
with open(f'{fname}.{start}', 'w') as f:
170+
header = None
171+
for result in results:
172+
name, metrics = result
173+
if header is None:
174+
header = cols + metrics.index.tolist()
175+
print(sep.join(header), file=f)
176+
print(sep.join(str(n) for n in name), end=sep, file=f)
177+
print(sep.join(f'{x:.4g}' for x in metrics), file=f)
404178

405179

406180
def main():
@@ -414,7 +188,6 @@ def main():
414188
df_all = pd.read_table(args.input)
415189
#drop nas
416190
df_all = df_all.dropna()
417-
#print(df_all)
418191
##pharmacoGX data is micromolar, we need log transformed molar
419192
df_all.DOSE = np.log10(df_all.DOSE*1000000)
420193
##need data to be between 0 and 1, not 0 and 100

0 commit comments

Comments
 (0)