Multithreaded: parallelize curve fitting in fit_curve.py with multiprocessing.Pool

jjacobson95 · jjacobson95 · commit eb9c31430d11 · 2024-04-26T16:38:10.000-07:00
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -674,22 +674,16 @@ def generate_drug_list(drug_map_path,drug_path):
             updated_raw_drug_file = "beatAML_drug_raw.tsv"
             generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
             d_df = pd.read_csv(updated_raw_drug_file,sep='\t')
-            
             d_res = d_df.rename(columns={"CELL":"other_id","AUC":"fit_auc",'DRUG':'chem_name'})
-
-           # imp_samps = pd.read_csv(improve_map_file)
             d_res = d_res.merge(imp_samp_map, on='other_id')
-            #print(d_res)
-            #print(imp_drug_map)
             d_res = d_res.merge(imp_drug_map,on='chem_name')
             d_res = d_res.rename(columns = {'improve_drug_id':'Drug'}) ## stupid but we have to change aks later
             d_res.to_csv(updated_raw_drug_file,sep='\t')
 
             print("Starting Curve Fitting Algorithm")
-            ##WHERE DO I GET THE CURVE DATA?
             # Run Curve fitting algorithm from scripts directory.
             # Note the file path to fit_curve.py may need to be changed.
-            command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
+            command = ['python3', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv', '--beataml']
             result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
             if result.returncode == 0:
                 print("Curve Fitting executed successfully!")
@@ -698,7 +692,6 @@ def generate_drug_list(drug_map_path,drug_path):
                 print("Out:", result.stdout)
                 print("Error:", result.stderr)
             print("Starting Experiment Data")
-            #exp_res = map_exp_to_improve(d_res,improve_map_file)
             drug_path = "beatAML_drug_processed.tsv.0"
             exp_res = map_exp_to_improve(drug_path)
             exp_res.to_csv("/tmp/beataml_experiments.tsv", index=False, sep='\t')
diff --git a/build/utils/fit_curve.py b/build/utils/fit_curve.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python
+#! /usr/bin/env python3
 
 import matplotlib
 # matplotlib.use('Agg')
@@ -14,6 +14,7 @@
 from itertools import islice
 from sklearn.metrics import r2_score
 from scipy.optimize import curve_fit
+import multiprocessing
 
 #import uno_data as ud
 
@@ -120,83 +121,6 @@ def response_curve_fit(xdata, ydata, bounds=HS_BOUNDS):
     return popt, pcov
 
 
-# def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
-#     if save:
-#         font = {'family' : 'normal',
-#                 # 'weight' : 'bold',
-#                 'size'   : 14}
-#         matplotlib.rc('font', **font)
-#         plt.figure(figsize=(12, 6))
-
-#     print(df_exp)
-#     xdata = df_exp.DOSE.astype(float)
-#     ydata = df_exp.GROWTH.astype(float)
-#     # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
-
-#     # print(xdata)
-#     # print(ydata)
-
-#     popt, pcov = response_curve_fit(xdata, ydata)
-#     metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
-
-#     if popt is None:
-#         return metrics
-
-#     dmin = dmin or xdata.min()
-#     dmax = dmax or xdata.max()
-#     xx = np.linspace(dmin, dmax, 100)
-#     yy = response_curve(xx, *popt)
-
-#     plt.xlim(dmax, dmin)
-#     plt.ylim(0, np.max([105, np.max(yy)]))
-#     plt.plot(xx, yy*100, 'r-', label='fit: einf=%.3f, ec50=%.3f, hs=%.3f' % tuple(popt))
-#     plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
-#     plt.xlabel('Dose (-log10(M))')
-#     plt.ylabel('Growth%')
-#     plt.title(title)
-#     plt.tight_layout()
-#     plt.legend()
-#     if save:
-#         plt.savefig('exp.png', dpi=360)
-#         plt.close()
-#     else:
-#         plt.show()
-
-#     return metrics.to_frame(name='metrics').T
-
-
-def fit_response(df_all, cell, drug, source, study=None, save=False):
-#    cell_ids = ud.cell_name_to_ids(cell) or [cell]
-#    drug_ids = ud.drug_name_to_ids(drug) or [drug]
-
-    #df_exp = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
-    df_exp = df_all[(df_all.improve_sample_id == cell) & (df_all.Drug == drug)].copy()
-    df_exp.GROWTH = (df_exp.GROWTH/2 + 0.5)
-    df_exp = df_exp[df_exp.SOURCE == source]
-
-    title = f'{cell} treated with {drug} in {source}'
-
-    studies = df_exp.STUDY.unique()
-    if len(studies) > 1:
-        study = studies[study] if type(study) == int else study or studies[0]
-        title += f' study {study}'
-        df_exp = df_exp[df_exp.STUDY == study]
-
-    return fit_exp(df_exp, title, save=save)
-
-
-def show_dose_distribution(df_all):
-    sources = df_all.SOURCE.unique()
-    qs = [0, 0.02, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 0.98, 1]
-    series = []
-    for src in sources:
-        s = df_all[df_all.SOURCE == src].DOSE.quantile(qs)
-        s.name = src
-        series.append(s)
-    df_dose = pd.concat(series, axis=1)
-    return df_dose
-
-
 def process_df(df, fname, sep='\t', ngroups=None):
     # df = df1.copy()
     i = 0
@@ -225,182 +149,32 @@ def process_df(df, fname, sep='\t', ngroups=None):
     f.close()
 
 
+def process_single_drug(name_group_tuple):
+    name, group = name_group_tuple
+    xdata = group.DOSE.astype(float)
+    ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
+    popt, pcov = response_curve_fit(xdata, ydata)
+    metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
+    return name, metrics
+
 def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
-    header = None
     cols = ['source', 'improve_sample_id', 'Drug', 'study','time','time_unit']
-    if beataml == True:
-        cols = ['SOURCE', 'CELL', 'DRUG', 'STUDY']
     groups = df.groupby(cols)
-    # count = count or (len(groups) - start)
     count = count or (4484081 - start)
     groups = islice(groups, start, start+count)
-    f = open(f'{fname}.{start}', 'w')
-    for name, group in tqdm(groups):
-        #print(name)
-        name = [str(n) for n in name]
-        xdata = group.DOSE.astype(float)
-#        ydata = group.GROWTH
-#        ydata.clip(lower=0, upper=1.0).astype(float)
-        ydata = group.GROWTH.clip(lower=0, upper=1.0).astype(float)
-        #        print(ydata)
-        #add in multithreading here:
-        popt, pcov = response_curve_fit(xdata, ydata)
-        metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
-        if start == 0 and header is None:
-            header = cols + metrics.index.tolist()
-            print(sep.join(header), file=f)
-        print(sep.join(name), end=sep, file=f)
-        print(sep.join([f'{x:.4g}' for x in metrics]), file=f)
-    f.close()
-
-
-
-
-
-def process_chem_partner_data():
-    df_cp = pd.read_csv('curve/ChemPartner_dose_response', sep='\t')
-    df_cp = df_cp[df_cp.DRUG2.isnull() & df_cp.DOSE2.isnull()].drop(['DRUG2', 'DOSE2'], axis=1)
-    df_cp = df_cp.rename(columns={'DRUG1':'DRUG', 'DOSE1':'DOSE'})
-    df_cp.DOSE = -df_cp.DOSE
-    # df_cp.GROWTH = df_cp.GROWTH/100
-    df_cp.GROWTH = df_cp.GROWTH/200 + 0.5
-
-    # process_df(df_cp, 'curve/ChemPartner_single_response_agg', ngroups=10)
-
-    process_df(df_cp, 'curve/ChemPartner_single_response_agg.new')
-
-
-
-def fit_exp(df_exp, title=None, dmin=None, dmax=None, save=False):
-    if save:
-        font = {'family' : 'normal',
-                # 'weight' : 'bold',
-                'size'   : 14}
-        matplotlib.rc('font', **font)
-        plt.figure(figsize=(12, 6))
-
-    print(df_exp)
-    xdata = df_exp.DOSE.astype(float)
-    ydata = df_exp.GROWTH.astype(float)
-    # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
-
-    # print(xdata)
-    # print(ydata)
-
-    popt, pcov = response_curve_fit(xdata, ydata)
-    metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
-
-    if popt is None:
-        return metrics
-
-    dmin = dmin or xdata.min()
-    dmax = dmax or xdata.max()
-    xx = np.linspace(dmin, dmax, 100)
-    yy = response_curve(xx, *popt)
-
-    plt.xlim(dmax, dmin)
-    plt.ylim(0, np.max([105, np.max(yy)]))
-    plt.plot(xx, yy*100, 'r-', label='fit: Einf=%.3f, EC50=%.3f, HS=%.3f' % tuple(popt))
-    plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, 'b*', label='')
-    plt.xlabel('Dose (-log10(M))')
-    plt.ylabel('Growth%')
-    plt.title(title)
-    plt.tight_layout()
-    plt.legend()
-    if save:
-        plt.savefig('exp.png', dpi=360)
-        plt.close()
-    else:
-        plt.show()
-
-    return metrics.to_frame(name='metrics').T
-
-
-def get_tableau20_colors():
-    # tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
-    #          (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
-    #          (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
-    #          (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
-    #          (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
-    tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
-                 (148, 103, 189), (44, 160, 44), (214, 39, 40), (255, 152, 150),
-                 (152, 223, 138), (197, 176, 213), (140, 86, 75), (196, 156, 148),
-                 (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
-                 (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
-    # Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
-    for i in range(len(tableau20)):
-        r, g, b = tableau20[i]
-        tableau20[i] = (r / 255., g / 255., b / 255.)
-    return tableau20
-
-
-def plot_curves(df_all, cell='LOXIMVI', drug='paclitaxel', study=None, max_reps=2, dmin=4, dmax=10, out=None):
-#    cell_ids = ud.cell_name_to_ids(cell)
-#    drug_ids = ud.drug_name_to_ids(drug)
-
-    #df_exps = df_all[df_all.CELL.isin(cell_ids) & df_all.DRUG.isin(drug_ids)].copy()
-    df_exps = df_all[(df_all['CELL']==cell) & (df_all['Drug']==drug)].copy()
-    df_exps.GROWTH = (df_exps.GROWTH/2 + 0.5)
-
-    title = f'{cell} treated with {drug}'
-    out = out or f'{cell}-{drug}'
-
-    # font = {'family': 'normal', 'size': 14}
-    font = {'size': 14}
-    matplotlib.rc('font', **font)
-    plt.figure(figsize=(12, 6))
-    colors = get_tableau20_colors()
-
-    dmin = dmin or df_exps.DOSE.min()
-    dmax = dmax or df_exps.DOSE.max()
-    xx = np.linspace(dmin-0.1, dmax+0.1, 100)
-
-    plt.xlim(dmax+0.1, dmin-0.1)
-    plt.ylim(0, 105)
-    plt.xlabel('Dose (-log10(M))')
-    plt.ylabel('Growth%')
-    plt.title(title)
-
-    df_metrics = None
-    rank = 0
-    order = ['NCI60', 'CTRP', 'GDSC', 'CCLE', 'gCSI']
-    sources = df_exps.SOURCE.unique().tolist() if study is None else study
-    sources = sorted(sources, key=lambda x:order.index(x))
-
-    for source in sources:
-        studies = df_exps[df_exps.SOURCE == source].STUDY.unique()
-        for i, study in enumerate(studies[:max_reps]):
-            df_exp = df_exps[(df_exps.SOURCE == source) & (df_exps.STUDY == study)]
-            xdata = df_exp.DOSE.astype(float)
-            ydata = df_exp.GROWTH.astype(float)
-            # ydata = df_exp.GROWTH.clip(lower=0, upper=1.0).astype(float)
-            popt, pcov = response_curve_fit(xdata, ydata)
-            metrics = compute_fit_metrics(xdata, ydata, popt, pcov)
-            if popt is None:
-                continue
-            color = colors[rank]
-            rank = (rank + 1) % 20
-            yy = response_curve(xx, *popt)
-            label = source
-            if len(studies) > 1:
-                label += f' rep {i+1}'
-            plt.plot(xx, yy*100, '-', color=color, label=label)
-            plt.plot(xdata, ydata.clip(lower=0, upper=1.0)*100, '.', color=color, label='')
-            if df_metrics is None:
-                df_metrics = metrics.to_frame(name=label).T
-            else:
-                df_metrics = pd.concat([df_metrics, metrics.to_frame(name=label).T])
-
-    plt.tight_layout()
-    plt.legend()
-    plt.savefig(f'{out}.png', dpi=360)
-    plt.close()
-
-    df_metrics.index.name = 'source'
-    df_metrics.to_csv(f'{out}.csv', float_format='%.5g')
-    print(f'Saved {out}.png and {out}.csv.')
+    
+    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
+        results = pool.map(process_single_drug, groups)
 
-    return df_metrics
+    with open(f'{fname}.{start}', 'w') as f:
+        header = None
+        for result in results:
+            name, metrics = result
+            if header is None:
+                header = cols + metrics.index.tolist()
+                print(sep.join(header), file=f)
+            print(sep.join(str(n) for n in name), end=sep, file=f)
+            print(sep.join(f'{x:.4g}' for x in metrics), file=f)
 
 
 def main():
@@ -414,7 +188,6 @@ def main():
     df_all = pd.read_table(args.input)
     #drop nas
     df_all = df_all.dropna()
-    #print(df_all)
     ##pharmacoGX data is micromolar, we need log transformed molar
     df_all.DOSE = np.log10(df_all.DOSE*1000000)
     ##need data to be between 0 and 1, not 0 and 100