
Commit 58aed7f

Author: RubyFore

adding in omics and drug file creation scripts

Addition of omics and drug file creation scripts (.py and .sh) whose output has been tested with `linkml`. Also adds a script ('obtainGSMidLink.R') that links GEO IDs in the transcriptomic data with study sample IDs so that they can be mapped to improve_ids.
1 parent 86e83ef commit 58aed7f
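For orientation, here is a minimal pandas sketch of the linkage these scripts set up: the GSM link table written by 'obtainGSMidLink.R' joins the expression matrix's column names to samplesheet other_ids, which carry the improve_sample_ids. All IDs and values below are hypothetical placeholders, not data from the study.

import pandas as pd

# hypothetical GSM link table, shaped like gsmlinkDf.csv from obtainGSMidLink.R
gsm_link = pd.DataFrame({
    "GSM": ["GSM2786000"],        # placeholder GSM accession
    "sampleid": ["Organoid_1"],   # sample name matching the samplesheet's other_id
    "RNAid": ["M1"],              # column name used in the expression matrix
})

# hypothetical samplesheet row as produced by the sample-file script
samples = pd.DataFrame({"other_id": ["Organoid_1"], "improve_sample_id": [101]})

# expression values keyed by the RNA column name, already melted to long format
expr = pd.DataFrame({"sample_name": ["M1"], "entrez_id": [7157], "value": [42.0]})

linked = (expr.merge(gsm_link, left_on="sample_name", right_on="RNAid", how="left")
              .merge(samples, left_on="sampleid", right_on="other_id", how="left"))
print(linked[["entrez_id", "value", "improve_sample_id"]])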

7 files changed: +277 -0 lines changed
build/bladderpdo/00_createBladderPDOSampleFile.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import synapseclient
import pandas as pd
import numpy as np
import argparse
import os


def get_bladder_pdo_samples(synLoginObject, maxval):

    # download the sample table from Synapse...
    samples_syn = synLoginObject.get('syn64765486')
    # ...and read the file
    samples_df = pd.read_csv(samples_syn.path, sep="\t")

    samples = samples_df[['Sample ID', 'Patient ID', 'Cancer Type Detailed', 'Sample Class']]
    samples = samples.rename({"Sample ID": 'other_id', 'Patient ID': 'common_name', 'Cancer Type Detailed': 'cancer_type', 'Sample Class': 'model_type'}, axis=1)

    samples.loc[:, 'species'] = 'Homo sapiens(Human)'
    samples.loc[:, 'other_id_source'] = 'Synapse'
    samples.loc[:, 'other_names'] = ''
    samples.loc[:, 'cancer_type'] = samples['cancer_type'].str.lower()
    samples.loc[:, 'model_type'] = samples['model_type'].str.lower()

    # continue improve_sample_id numbering from the previous samplesheet's maximum
    samples['improve_sample_id'] = range(maxval + 1, maxval + 1 + samples.shape[0])

    return samples


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Bladder PDO project into a single samplesheet")

    parser.add_argument('-t', '--token', type=str, help='Synapse Token')

    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Path to a previous sample file; improve_sample_ids will continue from its maximum")

    args = parser.parse_args()

    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)

    if args.prevSamples:
        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
    else:
        prev_max_improve_id = 0

    bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)

    bladder_pdo_samples.to_csv("bladderpdo_samples.csv", index=False)
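A quick way to sanity-check the generated samplesheet before feeding it to the omics step; the column names follow the script above, and the file path assumes the script was run from the current directory. This is a sketch, not part of the commit.

import pandas as pd

# path assumption: the script above writes its output to the working directory
samples = pd.read_csv("bladderpdo_samples.csv")

# required columns per the renaming/assignments in get_bladder_pdo_samples
expected = {"other_id", "common_name", "cancer_type", "model_type",
            "species", "other_id_source", "other_names", "improve_sample_id"}
assert expected.issubset(samples.columns), expected - set(samples.columns)

# improve_sample_ids should be unique and contiguous from the previous maximum
ids = samples["improve_sample_id"]
assert ids.is_unique
assert set(ids) == set(range(ids.min(), ids.max() + 1))
print(f"{len(samples)} samples, improve_sample_id {ids.min()}-{ids.max()}")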
build/bladderpdo/01_createBladderPDOOmicsFiles.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
import synapseclient
import pandas as pd
import numpy as np
import argparse
import os
import wget
import gzip
import subprocess
import math

def get_copy_call(a):
    """
    Helper function - determine the copy call for a copy-number value.
    """

    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    a_val = math.log2(float(a) + 0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'

def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):

    bladderpdo_url = 'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz'
    transcriptomic_txt = wget.download(bladderpdo_url)
    transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")
    # regenerate the GEO sample-id link table (writes ./gsmlinkDf.csv)
    subprocess.call(["/usr/local/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])

    GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")
    # normalize each sample column to counts per million
    fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
    transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)
    transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
    mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
    mapped_df = mapped_df.drop('other_id', axis=1)
    # transform data to long format
    value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
    melted_txomics = mapped_df.melt(id_vars="entrez_id", value_vars=value_variables, var_name='sample_name')
    # use info from GEO to get sample IDs
    txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how='left', left_on="sample_name", right_on='RNAid')
    # use samplesheet to link sample ids to improve ids
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
    txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")
    final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']].copy()
    final_transcriptomics['source'] = "Gene Expression Omnibus"
    final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
    final_transcriptomics = final_transcriptomics.rename({'value': 'transcriptomics'}, axis=1)
    # remove duplicates
    toreturn = final_transcriptomics.drop_duplicates()

    return toreturn

def get_bladder_pdo_mutations(synObject, samples, genes):
    print(samples.head())
    mutations = synObject.get("syn64765525")
    mutations_df = pd.read_csv(mutations.path, sep='\t')
    mutations_df['mutation'] = mutations_df['HGVSc'].str.split(":", expand=True)[1]
    #samplesheet = pd.read_csv(samples)
    selectioncols_mutations = mutations_df[['Entrez_Gene_Id', "Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
    merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
    merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id": 'entrez_id', 'Variant_Classification': "variant_classification"}, axis=1)
    print(merged_mutations_renamed.head())
    final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]].copy()
    final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
    print(final_mutations.head())
    return final_mutations

def get_bladder_pdo_copynumber(synObject, samples, genes):
    segfile = synObject.get("syn64765499")
    segfile_df = pd.read_csv(segfile.path, sep='\t')

    segfile_df.to_csv("bladder_segfile.csv")
    # annotate segments with gene-level Entrez IDs
    subprocess.call(["/usr/local/bin/Rscript", "--vanilla", "CNV-segfile-annotation.R", "bladder_segfile.csv", "bladder_annotated_segfile.csv"])
    copynumber = pd.read_csv("bladder_annotated_segfile.csv")
    # convert log2 segment score to absolute copy number
    copynumber['copy_number'] = np.exp2(copynumber['score'].div(2)) * 2
    copynumber['copy_call'] = [get_copy_call(a) for a in copynumber['copy_number']]
    copynumber_with_improveids = copynumber.merge(samples, left_on='ID', right_on='other_id', how='left')
    copynumber_with_correct_colnames = copynumber_with_improveids.rename({"ENTREZID": 'entrez_id'}, axis=1)
    final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']].copy()
    final_copynumber['source'] = "Synapse"
    final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"

    return final_copynumber


if __name__ == "__main__":
    print('in main')
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
    parser.add_argument('-s', '--samples', help='Path to sample file', default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default=None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
    parser.add_argument('-i', '--geolink', help=".csv file linking GEO sample IDs to sample names (output of 'obtainGSMidLink.R')")
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    print('gene file is:')
    print(args.genes)
    print('sample file is:')
    print(args.samples)
    genes = pd.read_csv(args.genes)
    samples = pd.read_csv(args.samples)

    if args.expression:
        get_bladder_pdo_transcriptomics(args.geolink, samples, genes).to_csv("bladderpdo_transcriptomics.csv", index=False)

    if args.mutation:
        get_bladder_pdo_mutations(synObject, samples, genes).to_csv('bladderpdo_mutations.csv', index=False)

    if args.copy:
        get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("bladderpdo_copynumber.csv", index=False)
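To make the copy-number step concrete, here is a small self-contained sketch of the same arithmetic used in get_bladder_pdo_copynumber and get_copy_call: seg scores (log2 ratios) are converted to absolute copy number and then binned by the thresholds above. The example scores are made up.

import math
import numpy as np

def get_copy_call(a):
    # same thresholds as the omics script above
    if a is None or math.isnan(a):
        return float('nan')
    a_val = math.log2(float(a) + 0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'

# made-up log2-ratio segment scores
scores = np.array([-2.0, -0.8, 0.0, 0.6, 1.5])
copy_number = np.exp2(scores / 2) * 2          # same conversion as the script
calls = [get_copy_call(c) for c in copy_number]
print(list(zip(scores, copy_number.round(2), calls)))
# -> deep del, het loss, diploid, gain, amp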
build/bladderpdo/02_createBladderPDODrugsFile.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import synapseclient
import pandas as pd
import numpy as np
import argparse
import os
# for testing locally
from utils.pubchem_retrieval import update_dataframe_and_write_tsv
# for building in docker
#from pubchem_retrieval import update_dataframe_and_write_tsv


def create_bladder_pdo_drugs_file(synObject, prevDrugFilepath, outputPath):
    bladder_dir = synObject.get('syn64765430')
    filenames = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))
    bladder_drugs = pd.DataFrame({'drugNames': pd.Series(dtype=str)})
    # '-4' - there are 4 non-drug files in this directory.
    for i in range(len(filenames) - 4):
        bladder_drugs.loc[i, 'drugNames'] = filenames[i]['name'].split(")")[1].split("(")[0].split(".")[0].strip()

    # get unique drugs
    newdrugnames = bladder_drugs['drugNames'].unique()
    # use helper functions in pubchem_retrieval.py
    alldrugs = []
    newdrugs = pd.DataFrame()
    if prevDrugFilepath is not None and prevDrugFilepath != "":
        prevdrugs = [pd.read_csv(t, sep='\t') for t in prevDrugFilepath.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()

        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

        ##write drugs
        newdrugs.to_csv(outputPath, sep='\t', index=False)

    if len(alldrugs) == 0 or len(newdrugnames) > len(set(newdrugs.improve_drug_id)):  # we have more names we didn't match
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames, outputPath)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Lee Bladder PDO project")
    parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', default=None)
    parser.add_argument('-o', '--outputPath', help='Output path for updated bladderpdo drug file', default="/tmp/bladderpdo_drugs.tsv")
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    create_bladder_pdo_drugs_file(synObject, args.prevDrugFilePath, args.outputPath)
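The drug names are parsed out of the raw Synapse filenames by the chained split in the loop above. A minimal sketch with a hypothetical filename; the real naming pattern is not shown in this commit and is only inferred from the split chain.

# hypothetical filename of the form "(<index>) <drug name> (<extra info>).<ext>"
name = "(3) Gemcitabine (dose response).xlsx"
drug = name.split(")")[1].split("(")[0].split(".")[0].strip()
print(drug)  # -> "Gemcitabine"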

build/bladderpdo/build_drugs.sh

File mode changed: 100644 → 100755
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
#!/bin/bash
set -euo pipefail

trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running script with token and drugFile $1"
# for running locally (from build directory):
python3 -m bladderpdo.02_createBladderPDODrugsFile --token $SYNAPSE_AUTH_TOKEN -d $1 -o ./bladderpdo/bladderpdo_drugs.tsv
#python3 02_createBladderPDODrugsFile.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/bladderpdo_drugs.tsv

echo "Running build_drug_desc.py..."
# for running locally:
python3 utils/build_drug_desc.py --drugtable ./bladderpdo/bladderpdo_drugs.tsv --desctable ./bladderpdo/bladderpdo_drug_descriptors.tsv.gz
#python3 build_drug_desc.py --drugtable /tmp/bladderpdo_drugs.tsv --desctable /tmp/bladderpdo_drug_descriptors.tsv.gz

build/bladderpdo/build_omics.sh

100644100755
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
5+
6+
echo "Running script with token, curSamples $2, and genes $1."
7+
# for mutation data (-m)
8+
#python3 01_createBladderPDOOmicsFiles.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -m
9+
# for expressiondata (-e)
10+
#python3 01_createBladderPDOOmicsFiles.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -e
11+
# for copynumber
12+
python3 01_createBladderPDOOmicsFiles.py --token $SYNAPSE_AUTH_TOKEN -s $2 -g $1 -c

build/bladderpdo/build_samples.sh

File mode changed: 100644 → 100755
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
#!/bin/bash
set -euo pipefail

trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running 00_createBladderPDOSampleFile.py with token and previous sample file $1"
python3 00_createBladderPDOSampleFile.py --token $SYNAPSE_AUTH_TOKEN -p $1

build/bladderpdo/obtainGSMidLink.R

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# retrieve GEO series metadata for the bladder PDO RNA-seq study (GSE103990)
gse <- GEOquery::getGEO("GSE103990", GSEMatrix = FALSE)
# print series metadata for logging
GEOquery::Meta(gse)
GSMs <- GEOquery::Meta(gse)$sample_id
gsmlinkDf <- data.frame(GSM = GSMs, sampleid = NA, RNAid = NA)
for (i in seq_along(GSMs)) {
  GSMinfo <- GEOquery::getGEO(gsmlinkDf[i, "GSM"])
  # the GSM title holds the sample name; description[2] holds the RNA id
  gsmlinkDf[i, 'sampleid'] <- GEOquery::Meta(GSMinfo)$title
  gsmlinkDf[i, 'RNAid'] <- GEOquery::Meta(GSMinfo)$description[2]
}

write.csv(gsmlinkDf, file = "./gsmlinkDf.csv", row.names = FALSE)
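The GEO titles recorded in 'sampleid' do not match the samplesheet's other_id values directly; the omics script rewrites them before merging. A tiny sketch of that rewrite, with made-up titles:

import pandas as pd

# made-up GEO titles of the form written to gsmlinkDf.csv
sampleid = pd.Series(["org12", "tumor12"])
sampleid = sampleid.str.replace("org", "Organoid_").str.replace("tumor", "Tumor")
print(list(sampleid))  # -> ['Organoid_12', 'Tumor12']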
