
Commit e1dcd45

Merge branch 'main' into nci60-add
2 parents: cb5e19a + 13f5748

File tree: 5 files changed (+463, −10 lines)


README.md

Lines changed: 30 additions & 1 deletion
@@ -26,8 +26,37 @@ please see the [schema description](schema/README.md).
 
 ## Building the data package
 
-The data package is currently assembled via continuous automation,
+We have created a build script that executes each step of the build process and assembles a `local` folder with all the requisite files.
+
+The build requires both Python and Docker to be installed.
+
+To build the Docker images and run them all, run the following (note that this will take a while!):
+```
+python build/build_all.py --all
+```
+
+To build only the Docker images:
+```
+python build/build_all.py --docker
+```
+
+Then, to build the reference files (after the Docker images have been built):
+```
+python build/build_all.py --samples
+python build/build_all.py --drugs
+```
+
+Once the sample files have been created, we can collect the omics measurements:
+```
+python build/build_all.py --omics
+```
+
+Once the drug and sample files have been created, we can refit the dose-response curves:
+```
+python build/build_all.py --exp
+```
+
+Note: this only generates the data; it does not build the Python package.
 
 ## Data Source Reference List
 
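The staged flags above (`--all`, `--docker`, `--samples`, `--drugs`, `--omics`, `--exp`) can be sketched as a small argparse front end. The flag names come from the README; the stage list and dispatch logic below are illustrative assumptions, not the actual `build_all.py` implementation.

```python
# Illustrative sketch of flag handling for a staged build script.
# Flag names mirror the README commands; the dispatch logic is an
# assumption, not the real build_all.py.
import argparse

STAGES = ['docker', 'samples', 'drugs', 'omics', 'exp']

def select_stages(argv):
    """Map command-line flags to the ordered list of build stages to run."""
    parser = argparse.ArgumentParser(description='data package build sketch')
    parser.add_argument('--all', action='store_true',
                        help='run every stage in dependency order')
    for stage in STAGES:
        parser.add_argument(f'--{stage}', action='store_true')
    args = parser.parse_args(argv)
    if args.all:
        return list(STAGES)
    # preserve dependency order regardless of flag order on the command line
    return [s for s in STAGES if getattr(args, s)]
```

Running stages in a fixed order matters here because, per the README, the omics and experiment steps depend on the sample and drug files existing first.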

build/beatAML/GetBeatAML.py

Lines changed: 1 addition & 8 deletions
@@ -684,22 +684,16 @@ def generate_drug_list(drug_map_path,drug_path):
 updated_raw_drug_file = "beatAML_drug_raw.tsv"
 generate_raw_drug_file(original_drug_file,sample_mapping_file, updated_raw_drug_file,supplimentary_file)
 d_df = pd.read_csv(updated_raw_drug_file,sep='\t')
-
 d_res = d_df.rename(columns={"CELL":"other_id","AUC":"fit_auc",'DRUG':'chem_name'})
-
-# imp_samps = pd.read_csv(improve_map_file)
 d_res = d_res.merge(imp_samp_map, on='other_id')
-#print(d_res)
-#print(imp_drug_map)
 d_res = d_res.merge(imp_drug_map,on='chem_name')
 d_res = d_res.rename(columns = {'improve_drug_id':'Drug'}) ## stupid but we have to change aks later
 d_res.to_csv(updated_raw_drug_file,sep='\t')
 
 print("Starting Curve Fitting Algorithm")
-##WHERE DO I GET THE CURVE DATA?
 # Run Curve fitting algorithm from scripts directory.
 # Note the file path to fit_curve.py may need to be changed.
-command = ['python', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv']
+command = ['python3', 'fit_curve.py' ,'--input', 'beatAML_drug_raw.tsv', '--output', 'beatAML_drug_processed.tsv', '--beataml']
 result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 if result.returncode == 0:
     print("Curve Fitting executed successfully!")

@@ -708,7 +702,6 @@ def generate_drug_list(drug_map_path,drug_path):
     print("Out:", result.stdout)
     print("Error:", result.stderr)
 print("Starting Experiment Data")
-#exp_res = map_exp_to_improve(d_res,improve_map_file)
 drug_path = "beatAML_drug_processed.tsv.0"
 exp_res = map_exp_to_improve(drug_path)
 exp_res.to_csv("/tmp/beataml_experiments.tsv", index=False, sep='\t')
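The `subprocess.run` pattern in this hunk (explicit interpreter, captured output, return-code check) can be exercised on its own. In this sketch a trivial inline command stands in for `fit_curve.py`:

```python
# Sketch of the subprocess invocation pattern used above: run a child
# process with captured output and branch on the return code. The inline
# command is a stand-in for fit_curve.py.
import subprocess
import sys

def run_step(command):
    """Run a command, capture stdout/stderr as text, and report the result."""
    result = subprocess.run(command, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, text=True)
    if result.returncode == 0:
        print("Step executed successfully!")
    else:
        print("Step failed with code", result.returncode)
        print("Error:", result.stderr)
    return result

# sys.executable plays the role of the hard-coded 'python3' above,
# which avoids depending on what 'python3' resolves to on PATH
res = run_step([sys.executable, '-c', "print('curves fit')"])
```

Capturing `stderr` separately, as the script does, is what makes the `print("Error:", result.stderr)` diagnostics possible when curve fitting fails.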

build/build_all.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def main():
 
 datasets = args.datasets.split(',')
 
+
 ### Any new sample creation must happened here.
 ### Each sample file requires the previous one to be created
 ### current order is : DepMap, Sanger, CPTAC, HCMI, BeatAML, MPNST
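The comments in this hunk note that sample files must be built in a fixed order (DepMap, Sanger, CPTAC, HCMI, BeatAML, MPNST) because each depends on the previous one. A minimal sketch of enforcing that ordering on the comma-separated `--datasets` string; `order_datasets` is a hypothetical helper, not part of `build_all.py`:

```python
# Hypothetical helper enforcing the build order noted in the comments above:
# requested datasets are re-sorted into dependency order before building,
# regardless of how the user listed them.
BUILD_ORDER = ['depmap', 'sanger', 'cptac', 'hcmi', 'beataml', 'mpnst']

def order_datasets(datasets_arg):
    """Split a comma-separated dataset string and sort it into build order."""
    requested = {d.strip().lower() for d in datasets_arg.split(',')}
    return [d for d in BUILD_ORDER if d in requested]
```

Filtering against a canonical ordered list is a simple way to honor a dependency chain without a full topological sort.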

build/utils/fit_curve.py

Lines changed: 1 addition & 1 deletion
@@ -163,7 +163,7 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
 count = count or (4484081 - start)
 groups = islice(groups, start, start+count)
 cores = multiprocessing.cpu_count()
-poolsize = round(cores/2)
+poolsize = round(cores-1)
 print('we have '+str(cores)+' cores and '+str(poolsize)+' threads')
 with multiprocessing.Pool(processes=poolsize) as pool:
     results = pool.map(process_single_drug, groups)
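The one-line change above grows the worker pool from half the cores to all but one (`round(cores-1)` is just `cores-1` for integer inputs). The two sizing rules can be compared side by side; `pool_size` is an illustrative helper, not part of `fit_curve.py`, and only the two formulas come from the diff:

```python
# Compare the old and new pool-sizing rules from the fit_curve.py change.
# pool_size is an illustrative helper; the formulas mirror the diff.
import multiprocessing

def pool_size(cores, leave_one_free=True):
    """Return the worker-pool size for a given core count."""
    if leave_one_free:
        return max(1, cores - 1)      # new rule: round(cores-1)
    return max(1, round(cores / 2))   # old rule: round(cores/2)

cores = multiprocessing.cpu_count()
print('we have', cores, 'cores and', pool_size(cores), 'workers')
```

Leaving one core free keeps the machine responsive while nearly doubling throughput for this CPU-bound curve fitting relative to the old half-the-cores rule.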
