Skip to content

Commit 3ecb6c8

Browse files
authored
Merge pull request #187 from PNNL-CompBio/cancer-mapping
updated HCMI to include more metadata
2 parents 8e0f415 + b00fa5d commit 3ecb6c8

File tree

4 files changed

+2252
-1343
lines changed

4 files changed

+2252
-1343
lines changed

build/hcmi/01-createHCMISamplesFile.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
import argparse
55
import numpy as np
66

7+
8+
9+
10+
711
def align_to_linkml_schema(input_df):
812
"""
913
Maps the 'model_type' column of the input DataFrame to a set of predefined categories
@@ -123,11 +127,18 @@ def fetch_metadata_for_samples(uuids):
123127
"fields": (
124128
"cases.sample_ids,"
125129
"cases.case_id,"
130+
"cases.submitter_id,"
131+
"cases.annotations.case_submitter_id,"
126132
"cases.samples.sample_id,"
127133
"cases.samples.portions.analytes.aliquots.aliquot_id,"
128134
"cases.samples.sample_type,"
135+
"cases.diagnoses.submitter_id,"
136+
"cases.diagnoses.diagnosis_id,"
137+
"cases.diagnoses.classification_of_tumor,"
129138
"cases.diagnoses.tissue_or_organ_of_origin,"
130139
"cases.diagnoses.primary_diagnosis,"
140+
"cases.diagnoses.treatments.treatment_id,"##getting these but ignoring for now
141+
"cases.diagnoses.treatments.submitter_id," ##getting these but ignoring for now
131142
"cases.samples.tumor_descriptor,"
132143
"cases.samples.composition"
133144
),
@@ -158,59 +169,73 @@ def extract_data(data):
158169
for idx, sample in enumerate(case['samples']):
159170
for portion in sample['portions']:
160171
for analyte in portion['analytes']:
172+
161173
for aliquot in analyte['aliquots']:
162174
if idx < len(case['diagnoses']):
163175
diagnosis = case['diagnoses'][idx]
164176
extracted.append({
165-
'id': hit['id'],
166-
'case_id': case['case_id'],
177+
'entry_id': hit['id'],
178+
'case_uuid': case['case_id'],
179+
'case_id': case['submitter_id'],
167180
'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
168181
'primary_diagnosis': diagnosis['primary_diagnosis'],
182+
'diagnosis_id':diagnosis['submitter_id'],
183+
'tumor_classification':diagnosis['classification_of_tumor'],
169184
'sample_id': sample['sample_id'],
170185
'sample_type': sample['sample_type'],
171-
'tumor_descriptor': sample.get('tumor_descriptor', None),
186+
#'tumor_descriptor': sample.get('tumor_descriptor', None),
172187
'composition': sample.get('composition', None),
173-
'aliquot_id': aliquot['aliquot_id']
188+
'id': aliquot['aliquot_id']
174189
})
175190
return pd.DataFrame(extracted)
176191

def filter_and_subset_data(df, sampfile, mapfile):
    """
    Filter and subset the full HCMI samples table to match the schema.

    Removes duplicate rows, attaches the common cancer-type name from the
    mapping file, melts the per-sample identifiers into long format
    (``other_id`` / ``other_id_source``), and assigns new
    ``improve_sample_id`` values continuing from the maximum id found in
    the previous sample file.

    Parameters
    ----------
    df : pandas.DataFrame
        Full samples table produced by extract_data(); must contain the
        columns 'id', 'case_id', 'composition',
        'tissue_or_organ_of_origin', 'primary_diagnosis', 'diagnosis_id',
        'tumor_classification' and 'sample_type'.
    sampfile : str
        Path to the previous samples CSV; its 'improve_sample_id' column
        seeds the numbering of the newly generated ids.
    mapfile : str
        Path to a CSV mapping (tissue_or_organ_of_origin,
        primary_diagnosis) pairs to a 'cancer_type' value.

    Returns
    -------
    pandas.DataFrame
        Long-format table with columns common_name, other_names,
        model_type, cancer_type, other_id_source, other_id and
        improve_sample_id.
    """
    # Drop rows that are duplicates in every column except the aliquot id.
    duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
    filt = df[~duplicates_mask]
    filt = filt.drop_duplicates()

    # Attach the common cancer-type name; the mapping file is Latin-1 encoded.
    cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
    filt = pd.merge(filt, cmap,
                    on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
                    how='left')

    filt = filt.rename(
        columns={"composition": "model_type",
                 "case_id": "common_name",
                 "id": "other_names"}
    )

    # Melt all the identifiers into other_id / other_id_source pairs.
    longtab = pd.melt(
        filt,
        id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'],
        value_vars=['diagnosis_id', 'tumor_classification', 'sample_type'])
    longtab = longtab.rename(
        columns={'variable': 'other_id_source',
                 'value': 'other_id'}).drop_duplicates()

    # Create new improve sample IDs, continuing after the largest id
    # already present in the previous samples file.
    maxval = max(pd.read_csv(sampfile).improve_sample_id)
    alluuids = list(set(longtab.other_names))
    mapping = pd.DataFrame.from_dict(
        {"other_names": [str(a) for a in alluuids],
         "improve_sample_id": range(int(maxval) + 1,
                                    int(maxval) + len(alluuids) + 1)
         })
    longtab = pd.merge(longtab, mapping, on='other_names', how='left')
    return longtab
214239

215240

216241
def main():
@@ -240,14 +265,15 @@ def main():
240265
"""
241266
parser = argparse.ArgumentParser()
242267
parser.add_argument('--samples',dest='samps',help='Previous sample file')
268+
parser.add_argument('--mapfile',dest='map',help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin',default='hcmi_cancer_types.csv')
243269
args = parser.parse_args()
244270
manifest_path = "full_manifest.txt"
245271
#manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/hcmi/full_manifest.txt"
246272
#download_from_github(manifest_url, manifest_path)
247273
uuids = extract_uuids_from_manifest(manifest_path)
248274
metadata = fetch_metadata_for_samples(uuids)
249275
df = extract_data(metadata)
250-
output = filter_and_subset_data(df,args.samps)
276+
output = filter_and_subset_data(df,args.samps,args.map)
251277
aligned = align_to_linkml_schema(output)
252278
print(aligned)
253279
aligned.to_csv("/tmp/hcmi_samples.csv",index=False)

build/hcmi/02-getHCMIData.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
430430
"""
431431
# samples_path = "/tmp/hcmi_samples.csv"
432432
samples = pl.read_csv(samples_path)
433-
samples = samples.drop(["cancer_type", "common_name", "other_names", "model_type", "other_id_source"])
433+
samples = samples.drop(["cancer_type", "common_name", "other_id", "model_type", "species","other_id_source"]).unique()
434434

435435
# Determine columns to select based on data_type
436436
columns = {
@@ -448,8 +448,8 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
448448
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
449449
chunk = chunk.select(selected_columns)
450450

451-
merged_chunk = samples.join(chunk, left_on='other_id', right_on='aliquot_id', how='inner')
452-
merged_chunk = merged_chunk.drop(["aliquot_id", "other_id"])
451+
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
452+
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])
453453

454454
# Append the processed chunk
455455
merged_data = pl.concat([merged_data, merged_chunk])

0 commit comments

Comments
 (0)