|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import os |
| 4 | +import gzip |
| 5 | +import requests |
| 6 | +import argparse |
| 7 | +import synapseclient |
| 8 | + |
| 9 | +###### NOTES ###### |
| 10 | +# * need to change all paths to paths relevant to docker image |
| 11 | +# * add description to parser |
| 12 | +# * run functions in ipynb to test they are working |
| 13 | + |
def download_rnaseq(geo_url:str = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE65253&format=file&file=GSE65253%5Fcol%5Ftum%5Forg%5Fmerge%2Ecsv%2Egz", save_path:str = None, timeout:float = 60.0):
    """
    Download a file from a GEO URL and write it to ``save_path``.

    The response body is streamed to disk in chunks instead of being held
    in memory, and a non-2xx HTTP status raises instead of silently writing
    an error page to ``save_path``.

    Parameters
    ----------
    geo_url : str
        The GEO URL pointing to the data to be downloaded. Default is from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE65253

    save_path : string
        Local path where the downloaded file will be saved. Required.

    timeout : float
        Seconds passed to ``requests.get`` as its ``timeout``, so a stalled
        connection fails instead of hanging forever.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``save_path`` is missing or empty.
    requests.HTTPError
        If the server answers with an error status.
    """
    # Fail before touching the network: there is nowhere to write the result.
    if not save_path:
        raise ValueError("save_path must be provided to download_rnaseq")

    with requests.get(geo_url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
| 38 | + |
def download_sequencing_data(synID:str , save_path:str = None, synToken:str = None):
    """
    Fetch a Synapse entity and return the path of its local copy.

    Logging in requires a Synapse Personal Access Token, which in turn requires
    a Synapse account. More information here:
    https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Parameters
    ----------
    synID : string
        SynapseID of the dataset to download (the command-line entry point
        of this script defaults it to syn64961953).

    save_path : string
        Local directory handed to Synapse as the download location.

    synToken : string
        Synapse Personal Access Token of the user.

    Returns
    -------
    str
        Filepath to the downloaded file.
    """
    # Authenticate a client with the caller's token
    client = synapseclient.Synapse()
    client.login(authToken=synToken)

    # Resolving the entity also downloads it into save_path
    entity = client.get(entity=synID, downloadLocation=save_path)

    # The entity records where its local copy was written
    return entity.path
| 70 | + |
def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str = "") -> pd.DataFrame:
    """
    Build the sample sheet from the sequencing data excel file.

    Sample ids are collected from the recurrent-mutation, somatic-mutation and
    segmented copy-number sheets, de-duplicated, and annotated. If a previous
    sample sheet is given, the new ids are checked against it and
    ``improve_sample_id`` continues from its highest value; otherwise numbering
    starts at 1.

    Parameters
    ----------
    sequencing_data_path : string
        Path to sequencing data from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#sec-4 . Supplementary Table S1

    prev_samples_path : string
        Path to previous sample sheet (csv with ``other_id`` and
        ``improve_sample_id`` columns). Empty string or None means no
        previous sheet.

    Returns
    -------
    pd.DataFrame
        One row per unique sample with columns other_id, common_name,
        model_type, other_id_source, cancer_type, species, improve_sample_id.

    Raises
    ------
    ValueError
        If any new ``other_id`` already exists in the previous sample sheet.
    """
    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(sequencing_data_path) as workbook:
        recurrent_mutations = pd.read_excel(workbook, 'TableS1I_Recurrent mutations')  # recurrent mutation information
        somatic_mutations = pd.read_excel(workbook, 'TableS1J-Somatic mutations')  # somatic mutation information
        copy_num = pd.read_excel(workbook, 'TableS1D-Segmented_CN')

    prev_samples = pd.read_csv(prev_samples_path) if prev_samples_path else None

    # Somatic ids are merged in case the recurrent sheet excludes some; the
    # segmented CN sheet is merged because the other two exclude P18 Tumor biopsy.
    id_sources = [
        _strip_barcode_prefix(recurrent_mutations['Tumor_Sample_Barcode'], '-'),
        _strip_barcode_prefix(recurrent_mutations['Matched_Norm_Sample_Barcode'], '-'),
        _strip_barcode_prefix(somatic_mutations['Tumor_Sample_Barcode'], '-'),
        _strip_barcode_prefix(somatic_mutations['Matched_Norm_Sample_Barcode'], '-'),
        # regex=False: "." must be replaced literally, not as "any character"
        _strip_barcode_prefix(copy_num['Sample'], '.').str.replace('.', '-', regex=False),
    ]
    other_ids = pd.concat(id_sources, ignore_index=True).dropna().drop_duplicates()
    samples_df = pd.DataFrame({'other_id': other_ids.to_numpy()})

    # formatting the table
    samples_df['common_name'] = samples_df['other_id'].str.split('-', n=1).str[0] + "-"
    samples_df['model_type'] = ""
    sample_kinds = (
        ("Tumor-Organoid", "T-O", "organoid"),
        ("Tumor-Biopsy", "T-B", "ex vivo"),
        ("Normal-Organoid", "N-O", "organoid"),
    )
    for marker, suffix, model_type in sample_kinds:
        matches = samples_df['other_id'].str.contains(marker, regex=False, na=False)
        samples_df.loc[matches, 'common_name'] = samples_df.loc[matches, 'common_name'] + suffix
        samples_df.loc[matches, 'model_type'] = model_type
    samples_df['other_id_source'] = "vandeWetering_2015"
    samples_df['cancer_type'] = "Colorectal Carcinoma"
    samples_df['species'] = "Homo sapiens (Human)"

    # check other_id doesn't clash with previous sample names
    maxval = 0
    if prev_samples is not None:
        clashing = samples_df.loc[samples_df['other_id'].isin(prev_samples['other_id']), 'other_id']
        if not clashing.empty:
            raise ValueError(
                "Duplicate id names detected. Cannot proceed with generating sample sheet until resolved. "
                "Clashing ids: {}".format(", ".join(map(str, clashing)))
            )
        prev_max = prev_samples['improve_sample_id'].max()
        # An empty previous sheet has no maximum; numbering then starts at 1.
        if not pd.isna(prev_max):
            maxval = int(prev_max)

    # row position plus 1 is the counter, offset by the previous maximum
    samples_df['improve_sample_id'] = samples_df.index + maxval + 1
    return samples_df


def _strip_barcode_prefix(barcodes: pd.Series, sep: str) -> pd.Series:
    """Return everything after the first ``sep`` of each barcode (NaN when ``sep`` is absent)."""
    return barcodes.str.split(sep, n=1).str[1]
| 144 | + |
| 145 | + |
| 146 | + |
| 147 | + |
if __name__ == "__main__":
    # Command-line entry point: optionally download the raw data, then
    # optionally build the sample sheet from the downloaded sequencing file.
    parser = argparse.ArgumentParser(description='###')

    parser.add_argument('-D', '--download',action='store_true', default=False, help='Download RNA seq and sequencing data from GEO and supplemental materials from https://www.cell.com/cell/fulltext/S0092-8674(15)00373-6#mmc2')
    parser.add_argument('-t', '--token', type=str, default=None, help='Synapse Token')
    parser.add_argument('-i', '--synapseID', type=str, default="syn64961953", help='SynapseID of data to download')

    parser.add_argument('-s', '--samples', action = 'store_true', help='Only generate samples, requires previous samples',default=False)
    parser.add_argument('-p', '--prevSamples', nargs='?',type=str, default='', const='', help='Use this to provide previous sample file')

    args = parser.parse_args()

    ###########################

    # Stays None unless the download step runs; the samples step checks it so a
    # missing download is reported clearly instead of surfacing as a NameError.
    sequencing_download_path = None

    if args.download:
        if args.token is None:
            # parser.error exits with a non-zero status so calling pipelines see the failure
            parser.error("No synpase download tocken was provided. Cannot download data.")
        print("Downloading Files from Synapse.")
        # Download RNA seq data
        download_rnaseq(save_path = "/tmp/GSE65253_col_tum_org_merge.csv.gz")
        # Download sequencing data
        sequencing_download_path = download_sequencing_data(synID = args.synapseID, synToken = args.token, save_path = "/tmp")

    if args.samples:
        if sequencing_download_path is None:
            parser.error("Sample generation needs the sequencing data file; run with --download and --token as well.")
        if not args.prevSamples:
            print("No previous samples file provided. Starting improve_sample_id from 1. Running sample file generation")
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path)
        else:
            print("Previous sample sheet {} detected. Running sample file generation and checking for duplicate IDs.".format(args.prevSamples))
            sample_sheet = generate_sample_file(sequencing_data_path = sequencing_download_path, prev_samples_path= args.prevSamples)
        sample_sheet.to_csv("/tmp/crcpdo_samples.csv", index=False)
| 184 | + |
| 185 | + |
0 commit comments