
Commit 5ca8997

Merge pull request #364 from PNNL-CompBio/cdc_organoids
CRC PDO Dataset
2 parents 64d08aa + 376970a commit 5ca8997

29 files changed: +1739 -4 lines changed

build/build_dataset.py

Lines changed: 4 additions & 2 deletions
@@ -46,7 +46,8 @@ def process_docker(dataset,validate):
         'cptac': ['cptac'],
         'sarcpdo': ['sarcpdo'],
         'genes': ['genes'],
-        'upload': ['upload'],
+        'upload': ['upload'],
+        'crcpdo': ['crcpdo'],
         'bladderpdo': ['bladderpdo']
     }

@@ -129,7 +130,8 @@ def process_omics(executor, dataset, should_continue):
         'mpnstpdx':['copy_number', 'mutations', 'proteomics', 'transcriptomics'],
         'sarcpdo': ['mutations', 'transcriptomics'],
         'pancpdo': ['transcriptomics'],
-        'bladderpdo': ['copy_number', 'mutations', 'transcriptomics']
+        'bladderpdo': ['copy_number', 'mutations', 'transcriptomics'],
+        'crcpdo':['copy_number', 'mutations', 'transcriptomics']
     }

     expected_omics = dataset_omics_files.get(dataset, [])
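With the new entry, the lookup shown above resolves 'crcpdo' to its expected omics types; a minimal illustrative sketch (the dictionary is abbreviated to the new entry only):

    dataset_omics_files = {'crcpdo': ['copy_number', 'mutations', 'transcriptomics']}
    expected_omics = dataset_omics_files.get('crcpdo', [])
    # expected_omics == ['copy_number', 'mutations', 'transcriptomics']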

build/crcPDO/01-samples-crcPDO.py

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
import pandas as pd
import numpy as np
import os
import gzip
import requests
import argparse
import synapseclient

###### NOTES ######
# * need to change all paths to paths relevant to docker image
# * add description to parser
# * run functions in ipynb to test they are working

def download_rnaseq(geo_url:str = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE65253&format=file&file=GSE65253%5Fcol%5Ftum%5Forg%5Fmerge%2Ecsv%2Egz", save_path:str = None):
    """
    Retrieve data from a given GEO URL and save it to a local path.

    This function downloads the file at the provided GEO URL with an HTTP GET request
    and writes the response content to save_path.

    Parameters
    ----------
    geo_url : str
        The GEO URL pointing to the data to be downloaded. Default is from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE65253

    save_path : string
        Local path where the downloaded file will be saved.

    Returns
    -------
    None
    """

    response = requests.get(geo_url)
    with open(save_path, 'wb') as f:
        f.write(response.content)
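
# Example usage, mirroring the call made in the __main__ block below:
# download_rnaseq(save_path="/tmp/GSE65253_col_tum_org_merge.csv.gz")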

def download_sequencing_data(synID:str , save_path:str = None, synToken:str = None):
    """
    Download sequencing data from Synapse at synapseID syn64961953. Requires a Synapse token, which requires you to make a Synapse account
    and create a Personal Access Token. More information here: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Parameters
    ----------
    synID : string
        SynapseID of dataset to download. Default is synapseID of the sequencing dataset.

    save_path : string
        Local path where the downloaded file will be saved.

    synToken : string
        Synapse Personal Access Token of user. Requires a Synapse account. More information at: https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Returns
    -------
    str
        Filepath to downloaded excel file
    """

    syn = synapseclient.Synapse()
    syn.login(authToken=synToken)

    # Obtain a pointer and download the data
    syn64961953 = syn.get(entity=synID, downloadLocation = save_path)

    # Get the path to the local copy of the data file
    sequencing_filepath = syn64961953.path
    return(sequencing_filepath)
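
# Example usage, mirroring the __main__ block below (the token value is a placeholder, not a real credential):
# sequencing_path = download_sequencing_data(synID="syn64961953", synToken="<personal access token>", save_path="/tmp")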

def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str = "") -> pd.DataFrame:
    """
    Creates sample file from sequencing data excel file. Checks the input samples against previous sample files to make sure
    there are no clashing sample names, and assigns improve_sample_id values starting from where the previous sample sheet left off.

    Parameters
    ----------
    sequencing_data_path : string
        Path to sequencing data from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#sec-4 . Supplementary Table S1

    prev_samples_path : string
        Path to previous sample sheet.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the combined samples data.

    """
    # reading in sequencing excel file
    sequencing_excel = pd.ExcelFile(open(sequencing_data_path, 'rb'))
    recurrent_mutations = pd.read_excel(sequencing_excel, 'TableS1I_Recurrent mutations') # table with recurrent mutation information
    somatic_mutations = pd.read_excel(sequencing_excel, 'TableS1J-Somatic mutations') # table with somatic mutation information
    copy_num = pd.read_excel(sequencing_excel, 'TableS1D-Segmented_CN')

    # reading in previous sample file
    if prev_samples_path != "":
        prev_samples = pd.read_csv(prev_samples_path)

    # reading in recurrent mutation info
    recurrent_tumor = pd.DataFrame({'other_id':recurrent_mutations['Tumor_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})
    recurrent_normal = pd.DataFrame({'other_id':recurrent_mutations['Matched_Norm_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})

    # merging somatic organoids too just in case recurrent excludes some
    somatic_tumor = pd.DataFrame({'other_id':somatic_mutations['Tumor_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})
    somatic_normal = pd.DataFrame({'other_id':somatic_mutations['Matched_Norm_Sample_Barcode'].str.split('-',n = 1,expand=True).iloc[:,1].unique()})

    # also merging from segmented CN because the other two exclude P18 Tumor biopsy
    copy_num_patients = pd.DataFrame({'other_id':copy_num['Sample'].str.split('.',n = 1,expand=True).iloc[:,1].str.replace(".","-").unique()})

    samples_df = pd.concat([recurrent_tumor,recurrent_normal, somatic_tumor, somatic_normal,copy_num_patients])

    # formatting the table
    samples_df = samples_df.drop_duplicates('other_id')
    samples_df = samples_df.reset_index()
    samples_df['common_name'] = samples_df['other_id'].str.split('-', n = 1,expand=True).iloc[:,0] + "-"
    samples_df['model_type'] = ""
    for index, row in samples_df.iterrows():
        if "Tumor-Organoid" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O"
            samples_df.loc[index, 'model_type'] = "organoid"
        if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B"
            samples_df.loc[index, 'model_type'] = "ex vivo"
        if "Normal-Organoid" in samples_df.loc[index, 'other_id']:
            samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O"
            samples_df.loc[index, 'model_type'] = "organoid"
    samples_df['other_id_source'] = "vandeWetering_2015"
    samples_df['cancer_type'] = "Colorectal Carcinoma"
    samples_df['species'] = "Homo sapiens (Human)"

    # check other_id doesn't clash with previous sample names
    if prev_samples_path != "":
        if prev_samples.other_id.isin(samples_df.other_id).any(): # any overlap between previous and new other_id values
            print("Duplicate id names detected. Cannot proceed with generating sample sheet until resolved.")
            exit()
    if prev_samples_path == "":
        maxval = 0
    else:
        maxval = max(prev_samples.improve_sample_id)
    samples_df['improve_sample_id'] = samples_df.index + maxval + 1 # take index plus 1 to create counter, start from max value
    samples_df = samples_df.drop(columns = 'index')
    return(samples_df)
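
# The returned DataFrame holds one row per unique other_id with the columns:
# other_id, common_name, model_type, other_id_source, cancer_type, species, improve_sample_id.
# Illustrative call (paths are placeholders for the files fetched by the download step):
# samples = generate_sample_file(sequencing_data_path="<TableS1 download>.xlsx", prev_samples_path="/tmp/previous_samples.csv")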


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='###')

    parser.add_argument('-D', '--download',action='store_true', default=False, help='Download RNA seq and sequencing data from GEO and supplemental materials from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#mmc2')
    parser.add_argument('-t', '--token', type=str, default=None, help='Synapse Token')
    parser.add_argument('-i', '--synapseID', type=str, default="syn64961953", help='SynapseID of data to download')

    parser.add_argument('-s', '--samples', action = 'store_true', help='Only generate samples, requires previous samples',default=False)
    parser.add_argument('-p', '--prevSamples', nargs='?',type=str, default='', const='', help='Use this to provide previous sample file')

    args = parser.parse_args()

    ###########################

    if args.download:
        if args.token is None:
            print("No synapse download token was provided. Cannot download data.")
            exit()
        else:
            print("Downloading Files from Synapse.")
            # Download RNA seq data
            download_rnaseq(save_path = "/tmp/GSE65253_col_tum_org_merge.csv.gz")
            # Download sequencing data
            sequencing_download_path = download_sequencing_data(synID = args.synapseID, synToken = args.token, save_path = "/tmp")

    if args.samples:
        # note: sequencing_download_path is only defined when --download is passed in the same run
        if args.prevSamples is None or args.prevSamples=='':
            print("No previous samples file provided. Starting improve_sample_id from 1. Running sample file generation")
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path)
        else:
            print("Previous sample sheet {} detected. Running sample file generation and checking for duplicate IDs.".format(args.prevSamples))
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path, prev_samples_path= args.prevSamples)
        sample_sheet.to_csv("/tmp/crcpdo_samples.csv", index=False)
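A typical invocation of this script inside the build container would look like the following (a sketch; the SYNAPSE_AUTH_TOKEN variable and the previous-samples path are assumptions, and the flags are those defined by the argparse block above):

    python 01-samples-crcPDO.py --download --token $SYNAPSE_AUTH_TOKEN --samples --prevSamples /tmp/previous_samples.csv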

0 commit comments
