Skip to content

Commit 24816d3

Browse files
authored
Merge pull request #347 from PNNL-CompBio/bladder_pdo
Bladder PDO dataset addition
2 parents 0225c52 + 2a7413b commit 24816d3

16 files changed

+518
-2
lines changed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
8+
def get_bladder_pdo_samples(synLoginObject, maxval):
    """Build the bladder-PDO samplesheet.

    Downloads the clinical sample table from Synapse (syn64765486), keeps
    the identifier/annotation columns, renames them to the coderdata schema
    and assigns fresh ``improve_sample_id`` values continuing after *maxval*.

    Parameters
    ----------
    synLoginObject : synapseclient.Synapse
        Authenticated Synapse client.
    maxval : int
        Largest improve_sample_id already in use; new ids start at maxval + 1.

    Returns
    -------
    pd.DataFrame
        Columns: other_id, common_name, cancer_type, model_type, species,
        other_id_source, other_names, improve_sample_id.
    """
    # Fetch the tab-separated clinical table from Synapse and load it.
    samples_entity = synLoginObject.get('syn64765486')
    raw = pd.read_csv(samples_entity.path, sep="\t")

    # Keep only the columns the schema needs, under their schema names.
    column_map = {
        "Sample ID": 'other_id',
        'Patient ID': 'common_name',
        'Cancer Type Detailed': 'cancer_type',
        'Sample Class': 'model_type',
    }
    samples = raw[list(column_map)].rename(columns=column_map)

    # Constant annotations shared by every row of this dataset.
    samples['species'] = 'Homo sapiens(Human)'
    samples['other_id_source'] = 'Synapse'
    samples['other_names'] = ''

    # Normalize the free-text fields to lower case.
    samples['cancer_type'] = samples['cancer_type'].str.lower()
    samples['model_type'] = samples['model_type'].str.lower()

    # Continue the improve_sample_id sequence where the previous file stopped.
    samples['improve_sample_id'] = range(maxval + 1, maxval + 1 + len(samples))

    return samples
27+
28+
29+
if __name__ == "__main__":

    # BUG FIX: description previously said "Sarcoma PDO" — copy-paste from
    # the sarcoma pipeline; this script builds the Bladder PDO samplesheet.
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Bladder PDO project into a single samplesheet")

    parser.add_argument('-t', '--token', type=str, help='Synapse Token')

    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const = "", help = "Use this to provide previous sample file, will run sample file generation")

    args = parser.parse_args()

    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)

    # Continue improve_sample_id numbering from the previous samplesheet,
    # or start from 0 when no previous file is supplied.
    if (args.prevSamples):
        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
    else:
        prev_max_improve_id = 0

    bladder_pdo_samples = get_bladder_pdo_samples(synObject, prev_max_improve_id)

    bladder_pdo_samples.to_csv("/tmp/bladderpdo_samples.csv", index=False)
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
import wget
7+
import gzip
8+
import subprocess
9+
import math
10+
11+
def get_copy_call(a):
    """
    Helper function - determine the discrete copy-number call for a value.

    Parameters
    ----------
    a : float or None
        Copy-number estimate; None/NaN propagate to NaN.

    Returns
    -------
    str or float
        One of 'deep del', 'het loss', 'diploid', 'gain', 'amp', or NaN
        when the input is missing.
    """
    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    # Thresholds are applied on log2(a + eps); the epsilon guards log2(0).
    a_val = math.log2(float(a)+0.000001)
    if a_val < 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.422233:
        return 'gain'
    else:
        return 'amp'
    # BUG FIX: removed unreachable trailing line
    # `return pd.Series([get_copy_call(a) for a in arr])` — dead code after
    # the exhaustive if/elif/else, and it referenced an undefined name `arr`.
35+
36+
def get_bladder_pdo_transcriptomics(GEO_id_link_table, samples, genes):
    """Download GSE103990 normalized counts and format them into long form.

    Parameters
    ----------
    GEO_id_link_table : str
        Path to a GEO GSM-id link table.
        NOTE(review): this parameter is currently unused — the link table is
        regenerated below via obtainGSMidLink.R; confirm intent with caller.
    samples : pd.DataFrame
        Samplesheet mapping other_id -> improve_sample_id.
    genes : pd.DataFrame
        Gene table mapping other_id (Ensembl) -> entrez_id.

    Returns
    -------
    pd.DataFrame
        Long-format table with entrez_id, transcriptomics,
        improve_sample_id, source, study (duplicates removed).
    """
    bladderpdo_url ='https://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103990/suppl/GSE103990_Normalized_counts.txt.gz'
    transcriptomic_txt = wget.download(bladderpdo_url)
    transcriptomics = pd.read_csv(transcriptomic_txt, compression='gzip', sep="\t")

    # Generate the GSM-id link table with the companion R script, then read it.
    subprocess.call(["/usr/bin/Rscript", "--vanilla", "obtainGSMidLink.R"])
    GEO_ids_link = pd.read_csv("./gsmlinkDf.csv")

    # Scale each of the 42 sample columns (1:43) to counts-per-million.
    fpkm_totals = transcriptomics.iloc[:, 1:43].sum()
    transcriptomics.iloc[:, 1:43] = transcriptomics.iloc[:, 1:43].div(fpkm_totals).mul(1e6)

    # First token of the row identifier is the Ensembl gene id.
    transcriptomics['ensembl'] = transcriptomics['Unnamed: 0'].str.split("_", expand=True)[0]
    mapped_df = transcriptomics.merge(genes[['entrez_id', 'other_id']].drop_duplicates(), left_on='ensembl', right_on='other_id', how='left')
    # BUG FIX: drop() returns a new frame; the original discarded the result.
    mapped_df = mapped_df.drop('other_id', axis=1)

    # Transform data to long format (sample columns all contain "M").
    value_variables = transcriptomics.columns[transcriptomics.columns.str.contains("M")]
    melted_txomics = mapped_df.melt(id_vars = "entrez_id", value_vars = value_variables, var_name='sample_name')

    # Use info from GEO to get sample IDs.
    txomics_with_GEOid = melted_txomics.merge(GEO_ids_link, how = 'left', left_on = "sample_name", right_on='RNAid')

    # Rewrite GEO sample labels to match the samplesheet other_id spelling,
    # then link sample ids to improve ids.
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("org", "Organoid_")
    txomics_with_GEOid['sampleid'] = txomics_with_GEOid['sampleid'].str.replace("tumor", "Tumor")
    txomics_with_improveid = txomics_with_GEOid.merge(samples, left_on="sampleid", right_on="other_id", how="left")

    # .copy() so the added columns don't trigger SettingWithCopyWarning.
    final_transcriptomics = txomics_with_improveid[['entrez_id', 'value', 'improve_sample_id']].copy()
    final_transcriptomics['source'] = "Gene Expression Omnibus"
    final_transcriptomics['study'] = "Lee etal 2018 Bladder PDOs"
    # BUG FIX: the original `rename({'value': 'transcriptomics'})` was not
    # assigned back and lacked axis=1, so the output kept a 'value' column.
    final_transcriptomics = final_transcriptomics.rename({'value' : 'transcriptomics'}, axis=1)

    # Remove duplicates before returning.
    return final_transcriptomics.drop_duplicates()
67+
68+
def get_bladder_pdo_mutations(synObject, samples, genes):
    """Download the bladder-PDO mutation table (syn64765525) and format it.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        Authenticated Synapse client.
    samples : pd.DataFrame
        Samplesheet mapping other_id -> improve_sample_id.
    genes : pd.DataFrame
        Gene table. NOTE(review): currently unused in this function —
        Entrez ids come directly from the mutation file; confirm intent.

    Returns
    -------
    pd.DataFrame
        Columns: entrez_id, mutation, variant_classification,
        improve_sample_id, study.
    """
    mutations = synObject.get("syn64765525")
    mutations_df = pd.read_csv(mutations.path, sep='\t')
    # HGVSc looks like "<transcript>:<coding change>"; keep the coding change.
    mutations_df['mutation'] = mutations_df['HGVSc'].str.split(":", expand=True)[1]
    selectioncols_mutations = mutations_df[['Entrez_Gene_Id',"Variant_Classification", "Tumor_Sample_Barcode", "mutation"]]
    # Link each mutation row to an improve_sample_id via the samplesheet.
    merged_mutations = selectioncols_mutations.merge(samples, left_on="Tumor_Sample_Barcode", right_on="other_id", how="left")
    merged_mutations_renamed = merged_mutations.rename({"Entrez_Gene_Id" : 'entrez_id', 'Variant_Classification' : "variant_classification"}, axis=1)
    # BUG FIX: removed the debug `print(... .head)` calls — `.head` without
    # parentheses printed the bound-method repr, not data.
    # .copy() so the added column doesn't trigger SettingWithCopyWarning.
    final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]].copy()
    final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
    return final_mutations
82+
83+
def get_bladder_pdo_copynumber(synObject, samples, genes):
    """Download the bladder-PDO CNV segfile (syn64765499), annotate it with
    gene ids via the companion R script, and format discrete copy calls.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        Authenticated Synapse client.
    samples : pd.DataFrame
        Samplesheet mapping other_id -> improve_sample_id.
    genes : pd.DataFrame
        Gene table. NOTE(review): currently unused — Entrez ids come from
        the R annotation output; confirm intent.

    Returns
    -------
    pd.DataFrame
        Columns: entrez_id, improve_sample_id, copy_number, copy_call,
        source, study.
    """
    segfile = synObject.get("syn64765499")
    segfile_df = pd.read_csv(segfile.path, sep='\t')

    # Round-trip through CSV so the R script can annotate segments with genes.
    segfile_df.to_csv("bladder_segfile.csv")
    subprocess.call(["/usr/bin/Rscript", "--vanilla", "CNV-segfile-annotation.R", "bladder_segfile.csv", "bladder_annotated_segfile.csv"])
    copynumber = pd.read_csv("bladder_annotated_segfile.csv")

    # 'score' is the segment mean (log2 ratio); back-transform to an
    # absolute copy number, then bin it into a discrete call.
    copynumber['copy_number'] = np.exp2(copynumber['score'].div(2))*2
    copynumber['copy_call'] = [get_copy_call(a) for a in copynumber['copy_number']]

    # Link segfile sample IDs to improve ids and normalize column names.
    copynumber_with_improveids = copynumber.merge(samples, left_on='ID', right_on = 'other_id', how='left')
    copynumber_with_correct_colnames = copynumber_with_improveids.rename({"ENTREZID":'entrez_id'}, axis=1)
    # .copy() so the added columns don't trigger SettingWithCopyWarning.
    final_copynumber = copynumber_with_correct_colnames[['entrez_id', 'improve_sample_id', 'copy_number', 'copy_call']].copy()
    final_copynumber['source'] = "Synapse"
    final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"

    return final_copynumber
99+
100+
101+
102+
103+
if __name__ == "__main__":
    print('in main')
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project")
    parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-e', '--expression', help='Flag to capture transcriptomic data', action='store_true', default=False)
    # BUG FIX: original help text had a typo ("anotation"), an unclosed
    # quote, and named the CNV script; the transcriptomics step actually
    # consumes the GSM-id link table produced by obtainGSMidLink.R.
    parser.add_argument('-i', '--geolink', help=".csv file linking GEO GSM ids to sample ids (output of 'obtainGSMidLink.R')")
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    print('gene file is:')
    print(args.genes)
    print('sample file is :')
    print(args.samples)
    genes = pd.read_csv(args.genes)
    samples = pd.read_csv(args.samples)

    # Each omics type is opt-in via its flag; results land in /tmp.
    if args.expression:
        get_bladder_pdo_transcriptomics(args.geolink, samples, genes).to_csv("/tmp/bladderpdo_transcriptomics.csv", index=False)

    if args.mutation:
        get_bladder_pdo_mutations(synObject, samples, genes).to_csv('/tmp/bladderpdo_mutations.csv', index=False)

    if args.copy:
        get_bladder_pdo_copynumber(synObject, samples, genes).to_csv("/tmp/bladderpdo_copynumber.csv", index=False)
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
# for testing locally
7+
#from utils.pubchem_retrieval import update_dataframe_and_write_tsv
8+
# for building in docker
9+
from pubchem_retrieval import update_dataframe_and_write_tsv
10+
11+
12+
def create_bladder_pdo_drugs_file(synObject, prevDrugFilepath, outputPath):
    """Build the bladder-PDO drug file, reusing previous improve_drug_ids.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        Authenticated Synapse client.
    prevDrugFilepath : str or None
        Comma-separated path(s) to previous drug TSV file(s); empty/None
        means query PubChem for everything.
    outputPath : str
        Path of the TSV drug file to write.
    """
    bladder_dir = synObject.get('syn64765430')
    filenames = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))

    # '-4' - there are 4 nondrug files in this directory; the drug name is
    # the token between ')' and '(' in each remaining filename.
    # BUG FIX: the original seeded the frame with the type object `str`,
    # which survived as a bogus drug name when no drug files were present.
    drug_names = [
        f['name'].split(")")[1].split("(")[0].split(".")[0].strip()
        for f in filenames[:len(filenames) - 4]
    ]
    # Get unique drugs.
    newdrugnames = pd.Series(drug_names, dtype=object).unique()

    if prevDrugFilepath:
        # BUG FIX: original tested `prevDrugFilepath is not ""` — identity
        # comparison with a literal (SyntaxWarning, unreliable).
        prevdrugs = [pd.read_csv(t, sep='\t') for t in prevDrugFilepath.split(',')]
        alldrugs = pd.concat(prevdrugs).drop_duplicates()

        # Keep previous rows whose improve_drug_id matches any new drug name.
        imps = alldrugs[alldrugs.chem_name.isin(newdrugnames)]
        newdrugs = alldrugs[alldrugs.improve_drug_id.isin(imps.improve_drug_id)]

        ##write drugs
        newdrugs.to_csv(outputPath, sep='\t', index=False)

        if len(newdrugnames) > len(set(newdrugs.improve_drug_id)):  # we have more names we didn't match
            print('Missing drugs in existing file, querying pubchem')
            update_dataframe_and_write_tsv(newdrugnames, outputPath)
    else:
        # BUG FIX: with no previous file the original crashed on
        # `alldrugs.chem_name` (alldrugs was still a plain list);
        # query PubChem for all drug names directly.
        print('Missing drugs in existing file, querying pubchem')
        update_dataframe_and_write_tsv(newdrugnames, outputPath)
37+
38+
39+
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of drug data files for the Lee Bladder PDO project")
    # BUG FIX: help strings said 'sarcpdo' — copy-paste from the sarcoma
    # pipeline; this script processes the bladder PDO drug data.
    parser.add_argument('-d', '--prevDrugFilePath', help='Path to a previous drug file for bladderpdo', nargs="?", default = None)
    # NOTE(review): the default output path still says 'sarcpdo' — left
    # unchanged because build_drugs.sh passes -o explicitly; confirm before
    # renaming the default.
    parser.add_argument('-o', '--outputPath', help='Output path for updated bladderpdo drug file', default = "/tmp/sarcpdo_drugs.tsv")
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    if args.prevDrugFilePath:
        previousDrugs = args.prevDrugFilePath
    else:
        previousDrugs = None
    create_bladder_pdo_drugs_file(synObject, previousDrugs, args.outputPath)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import synapseclient
2+
import pandas as pd
3+
import argparse
4+
5+
6+
def get_bladder_pdo_experiments(synObject, samples, drugs):
    """Download per-drug dose-response tables and build one long table.

    Parameters
    ----------
    synObject : synapseclient.Synapse
        Authenticated Synapse client.
    samples : pd.DataFrame
        Samplesheet mapping common_name -> improve_sample_id.
    drugs : pd.DataFrame
        Drug file mapping chem_name -> improve_drug_id.

    Returns
    -------
    pd.DataFrame
        Columns: DOSE, GROWTH, improve_sample_id, Drug, study, source,
        time, time_unit — one row per dose/sample/drug measurement.
    """
    # Get list of syn id info.
    files = list(synObject.getChildren(parent='syn64765430', includeTypes=['file']))

    # NOTE(review): index 50 is assumed to be the sample-name conversion
    # table in this directory listing — confirm the ordering is stable.
    conversion_table = synObject.get(files[50]['id'])
    conversion_table_df = pd.read_excel(conversion_table.path, header=None)
    # Remove the trailing _T, then switch '.' separators to '_' so the names
    # match the samplesheet common_name spelling.
    conversion_table_df[2] = conversion_table_df[1].str.rsplit("_", expand=True)[0]
    conversion_table_df[3] = conversion_table_df[2].str.replace(".", "_")

    # BUG FIX: the original concatenated each drug's frame with an
    # always-empty `drug_df` and overwrote the result every iteration, so
    # only the LAST drug's data was returned. Accumulate frames instead.
    per_drug_frames = []
    # '-4' - the last 4 files in the directory are not drug tables.
    for i in range(len(files)-4):
        drug_table_syn = synObject.get(files[i]['id'])
        drug_table = pd.read_csv(drug_table_syn.path, sep="\t")

        # Melt wide (dose x sample) to long format.
        melted_single_drug = drug_table.melt(id_vars = 'Unnamed: 0', value_vars = drug_table.columns[1:], var_name="sample")
        melted_single_drug['linkID'] = melted_single_drug['sample'].str.split(".", expand=True)[0]

        # Link to the conversion table, then to the samplesheet.
        drugdata = melted_single_drug.merge(conversion_table_df, left_on = 'linkID', right_on = 0, how='left')[['Unnamed: 0', 'value', 3]]
        drugdata_with_improvesample = drugdata.merge(samples, left_on = 3, right_on='common_name')
        drugdata_with_improvesample = drugdata_with_improvesample[['Unnamed: 0', 'value', 'improve_sample_id']]
        drugdata_with_improvesample = drugdata_with_improvesample.rename({"Unnamed: 0" : "DOSE", 'value' : 'GROWTH'}, axis=1)

        # The drug name comes from the filename; map it to improve_drug_id.
        drugdata_with_improvesample['chem_name'] = files[i]['name'].split(")")[1].split("(")[0].split(".")[0].strip().lower()
        drugdata_with_both_improveIds = drugdata_with_improvesample.merge(drugs[['improve_drug_id', 'chem_name']], how='left')

        final_drugdata = drugdata_with_both_improveIds[['DOSE', 'GROWTH', 'improve_sample_id', 'improve_drug_id']]
        final_drugdata = final_drugdata.rename({'improve_drug_id' : "Drug"}, axis=1)
        final_drugdata['study'] = "Lee etal 2018 Bladder PDOs"
        final_drugdata['source'] = "Synapse"
        # Experiment duration is fixed for this dataset.
        final_drugdata['time'] = 6
        final_drugdata['time_unit'] = 'days'

        per_drug_frames.append(final_drugdata)

    dose_resp_df = pd.concat(per_drug_frames) if per_drug_frames else pd.DataFrame()
    return dose_resp_df
54+
55+
56+
if __name__ == "__main__":
    # CLI entry point: log in to Synapse, load the drug and sample mapping
    # files, build the dose-response table and write it as TSV.
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--token', help='Synapse authentication token')
    parser.add_argument('-s', '--curSampleFile', help='Sample mapping file for bladder pdo samples')
    parser.add_argument('-d', '--drugfile', help='Drug mapping file for bladder pdo samples')
    parser.add_argument('-o', '--output', default = '/tmp/bladderpdo_doserep.tsv',help='Output file to be read into curve fitting code')

    args = parser.parse_args()
    print("Logging into Synapse")
    PAT = args.token
    synObject = synapseclient.login(authToken=PAT)
    # Drug file is tab-separated; sample file is comma-separated.
    drug_df = pd.read_csv(args.drugfile, sep='\t')
    samples_df = pd.read_csv(args.curSampleFile)

    doseresponse_data = get_bladder_pdo_experiments(synObject, samples_df, drug_df)
    doseresponse_data.to_csv(args.output, sep='\t')
72+
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/usr/bin/env Rscript
# Annotate a CNV segfile with Entrez gene ids.
# Usage: Rscript CNV-segfile-annotation.R <input_segfile.csv> <output.csv>
# args[1]: input segfile CSV (chrom, loc.start, loc.end, seg.mean, ID)
# args[2]: output CSV path (ENTREZID, ID, score)
args = commandArgs(trailingOnly=TRUE)

# If BiocManager not installed, install it
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(version = "3.20")

library(GenomicRanges)
library(Homo.sapiens)


# function to return gene mappings for each coordinate range (row) in the segfile
splitColumnByOverlap <-
    function(query, subject, column="ENTREZID", ...)
{
    olaps <- findOverlaps(query, subject, ...)
    f1 <- factor(subjectHits(olaps),
                 levels=seq_len(subjectLength(olaps)))
    splitAsList(mcols(query)[[column]][queryHits(olaps)], f1)
}

segfile = read.csv(args[1])

# create genomic ranges object from segfile for use with findOverlaps()
# NOTE(review): strand is derived from sign(loc.start) + 2 — positive starts
# map to "+", which treats nearly every segment as plus-strand; confirm.
gr <- GRanges(seqnames = Rle(paste0('chr', segfile$chrom)),
              ranges = IRanges(segfile$loc.start, end = segfile$loc.end),
              strand = Rle(c("-", "0", "+")[sign(segfile$loc.start) +2]),
              score = segfile$seg.mean,
              ID = segfile$ID)

# create genes GRanges obj from database
genes<- genes(Homo.sapiens, columns =c("ENTREZID"))

geneHitsByRow <- splitColumnByOverlap(genes, gr, "ENTREZID")

# create matrices of annotations and scores/patient IDs in segfile
# with a catch if there are no gene hits found
matrixresults <- list()
noresults <- c()
for (i in 1:length(geneHitsByRow)){
    if(length(geneHitsByRow[[i]])>0){
        # one row per overlapping gene, repeating this segment's metadata
        matrixresults[[i]] <- data.frame(ENTREZID = as.matrix(geneHitsByRow[[i]]), rep(mcols(gr[i,]), length(geneHitsByRow[[i]])))
    }else{
        # no gene overlaps: keep the segment with NA so indices stay aligned
        noresults <- c(noresults, i)
        matrixresults[[i]] <- data.frame(ENTREZID = NA, mcols(gr)[i,])
    }
}
# concatenate annotated matrices
allCNV <- do.call(rbind, matrixresults)

# drop NAs
completeAllCNV <- allCNV[complete.cases(allCNV),]
# aggregate scores for the same genes (that came from different regions) for the same patient ID
aggregatedAllCNV <- aggregate(completeAllCNV$score, by = list(ENTREZID =completeAllCNV$ENTREZID, ID = completeAllCNV$ID), FUN=mean)
names(aggregatedAllCNV)[names(aggregatedAllCNV)=='x'] <- "score"
# write results to csv for further processing in Python
write.csv(aggregatedAllCNV, args[2], row.names=F, quote =F)

build/bladderpdo/README.md

Whitespace-only changes.

build/bladderpdo/build_drugs.sh

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/bin/bash
# Build the bladder-PDO drug file and its descriptors inside the container.
# $1: path to a previous drug file (passed through to -d).
# Requires SYNAPSE_AUTH_TOKEN in the environment.
set -euo pipefail

# Report the failing line and command before exiting on any error.
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR

echo "Running script with token and drugFile $1"
# for running locally (from build directory):
#python3 -m bladderpdo.02_createBladderPDODrugsFile --token $SYNAPSE_AUTH_TOKEN -d $1 -o ./bladderpdo/bladderpdo_drugs.tsv
python3 02_createBladderPDODrugsFile.py --token $SYNAPSE_AUTH_TOKEN -d $1 -o /tmp/bladderpdo_drugs.tsv

echo "Running build_drug_desc.py..."
#for running locally:
#python3 utils/build_drug_desc.py --drugtable ./bladderpdo/bladderpdo_drugs.tsv --desctable ./bladderpdo/bladderpdo_drug_descriptors.tsv.gz
python3 build_drug_desc.py --drugtable /tmp/bladderpdo_drugs.tsv --desctable /tmp/bladderpdo_drug_descriptors.tsv.gz

0 commit comments

Comments
 (0)