Pulled updates from main to this branch

jjacobson95 · jjacobson95 · commit 58fd4f96055d · 2024-05-21T09:21:54.000-07:00
diff --git a/README.md b/README.md
@@ -28,7 +28,15 @@ please see the [schema description](schema/README.md).
 
 We have created a build script that executes each step of the build process to enable the creation of a `local` folder with all the requisite folders.
 
-The build requires Python as well as Docker to be installed. 
+The build requires Python as well as Docker to be installed. To access
+the data on Synapse (MPNST, BeatAML proteomics), you will need to
+[register for a Synapse account](http://synapse.org/register) and
+request access to the [CoderData Build
+Team](https://www.synapse.org/#!Team:3503472). Next, create a
+[personal authentication
+token](https://www.synapse.org/#!PersonalAccessTokens:) with Download
+access, and set the `SYNAPSE_AUTH_TOKEN` environment variable to
+that token.
 
 To build the docker images and run them, simply run (though this will take a while!):
 ```
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -131,9 +131,10 @@ def generate_samples_file(prev_samples_path):
     prot_samples.rename(columns={"specimenType": "common_name"}, inplace=True)
     prot_samples["cancer_type"] = "Acute Myeloid Leukaemia"
     prot_samples["model_type"] = "ex vivo"
-    prot_samples["other_id_source"] = "beatAML"    
+    prot_samples["other_id_source"] = "beatAML"
     
     all_samples = pd.concat([prot_samples, full_samples])
+    all_samples['species'] = 'Homo sapiens'
     maxval = max(pd.read_csv(prev_samples_path).improve_sample_id)
     mapping = {labId: i for i, labId in enumerate(all_samples['other_id'].unique(), start=(int(maxval)+1))}
     all_samples['improve_sample_id'] = all_samples['other_id'].map(mapping)
diff --git a/build/broad_sanger/02-broadSangerOmics.R b/build/broad_sanger/02-broadSangerOmics.R
@@ -105,6 +105,15 @@ sanger_files<-function(fi,value){
        rm(exp_file)
 
         print('copy call')
+
+        ##rename SANGER value
+        # Amplification -> amp
+        # Deletion -> deep del
+        # Loss -> het loss
+        # Gain -> gain
+        # Neutral -> diploid
+        #
+        res$Sanger=sapply(res$Sanger,function(x) ifelse(x=='Amplification','amp',ifelse(x=='Deletion','deep del',ifelse(x=='Loss','het loss',ifelse(x=='Gain','gain','diploid')))))
       ##calibrate the copy call
       res<-res|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
         dplyr::mutate(IMPROVE=ifelse(copy_number<0.5210507,'deep del',
diff --git a/build/hcmi/01-createHCMISamplesFile.py b/build/hcmi/01-createHCMISamplesFile.py
@@ -43,6 +43,7 @@ def align_to_linkml_schema(input_df):
     }
 
     # Apply mapping
+    input_df['species'] = 'Homo sapiens' ##i assume they're all human?
     input_df['model_type'] = input_df['model_type'].map(mapping_dict)
     input_df.dropna(subset=['model_type'], inplace=True)
     
diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R
@@ -31,13 +31,13 @@ manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()
 ##first create samples for the original tumors
 tumorTable<-manifest|>
     dplyr::select(common_name='Sample')|>
-    dplyr::mutate(other_id_source='NF Data Portal',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='tumor')|>
+    dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='tumor')|>
     tidyr::unite(col='other_id',c('common_name','model_type'),sep=' ',remove=FALSE)
 
 ##then create samples for the PDX
 sampTable<-manifest|>
     dplyr::select(c(common_name='Sample',MicroTissueDrugFolder))|>
-    dplyr::mutate(other_id_source='NF Data Portal',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='patient derived xenograft')|>
+    dplyr::mutate(other_id_source='NF Data Portal',other_names='',cancer_type="Malignant peripheral nerve sheath tumor",species='Human',model_type='patient derived xenograft')|>
     tidyr::unite(col='other_id',c('common_name','model_type'),sep=' ',remove=FALSE)
 
 
diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R
@@ -157,7 +157,8 @@ cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,NA),function(x){
       subset(!is.na(entrez_id))|>
       dplyr::select(entrez_id,log2)|>
       dplyr::distinct()|>
-      dplyr::mutate(copy_number=2^log2)
+        dplyr::mutate(copy_number=2^log2)|>
+        dplyr::select(-log2)
 
   res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
       dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
diff --git a/build/mpnst/02_get_drug_data.R b/build/mpnst/02_get_drug_data.R
@@ -72,7 +72,7 @@ print(paste(alldrugs,collapse=','))
 
 
 ##copy old drug to new drug
-olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char=''))
+olddrugs<-do.call(rbind,lapply(unique(unlist(strsplit(olddrugfiles,split=','))),function(x) read.table(x,header=T,sep='\t',quote='',comment.char='')))
 olddrugs<-unique(olddrugs)
 
 print(paste('Read in ',nrow(olddrugs),'old drugs'))
diff --git a/build/utils/fit_curve.py b/build/utils/fit_curve.py
@@ -198,8 +198,8 @@ def main():
     fname = args.output or 'combined_single_response_agg'
     process_df_part(df_all, fname, beataml=args.beataml)#, start=args.start, count=args.count)
     
-    if args.beataml == False:
-        format_coderd_schema(fname+'.0')
+#    if args.beataml == False:
+    format_coderd_schema(fname+'.0')
 
 if __name__ == '__main__':
     main()

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ def align_to_linkml_schema(input_df):`
`43`	`43`	`}`
`44`	`44`
`45`	`45`	`# Apply mapping`
	`46`	`+ input_df['species'] = 'Homo sapiens' ##i assume they're all human?`
`46`	`47`	`input_df['model_type'] = input_df['model_type'].map(mapping_dict)`
`47`	`48`	`input_df.dropna(subset=['model_type'], inplace=True)`
`48`	`49`