4 | 4 | import argparse
5 | 5 | import numpy as np
6 | 6 |
| 7 | +
| 8 | +
| 9 | +
| 10 | +
7 | 11 | def align_to_linkml_schema(input_df):
8 | 12 |     """
9 | 13 |     Maps the 'model_type' column of the input DataFrame to a set of predefined categories
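The body of align_to_linkml_schema lies outside this hunk; as a rough illustration of what the docstring describes, a mapping of 'model_type' values to a fixed category set might look like the sketch below (the category names are invented placeholders, not the schema's actual values):

import pandas as pd

def align_to_linkml_schema(input_df):
    # Hypothetical category map; the real values come from the LinkML schema.
    model_type_map = {
        '3d organoid': 'organoid',
        '2d modified conditionally reprogrammed cells': '2D modified conditionally reprogrammed cells',
    }
    input_df['model_type'] = input_df['model_type'].str.lower().map(model_type_map)
    return input_df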
@@ -123,11 +127,18 @@ def fetch_metadata_for_samples(uuids):
123 | 127 |         "fields": (
124 | 128 |             "cases.sample_ids,"
125 | 129 |             "cases.case_id,"
| 130 | +             "cases.submitter_id,"
| 131 | +             "cases.annotations.case_submitter_id,"
126 | 132 |             "cases.samples.sample_id,"
127 | 133 |             "cases.samples.portions.analytes.aliquots.aliquot_id,"
128 | 134 |             "cases.samples.sample_type,"
| 135 | +             "cases.diagnoses.submitter_id,"
| 136 | +             "cases.diagnoses.diagnosis_id,"
| 137 | +             "cases.diagnoses.classification_of_tumor,"
129 | 138 |             "cases.diagnoses.tissue_or_organ_of_origin,"
130 | 139 |             "cases.diagnoses.primary_diagnosis,"
| 140 | +             "cases.diagnoses.treatments.treatment_id,"  # fetched but ignored for now
| 141 | +             "cases.diagnoses.treatments.submitter_id,"  # fetched but ignored for now
131 | 142 |             "cases.samples.tumor_descriptor,"
132 | 143 |             "cases.samples.composition"
133 | 144 |         ),
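For orientation, a fields list like the one above is handed to the GDC API; a request using it might look roughly like the following sketch (the endpoint choice and filter structure are assumptions, since the rest of fetch_metadata_for_samples is outside this hunk):

import requests

GDC_URL = "https://api.gdc.cancer.gov/files"  # assumed endpoint

def fetch_metadata_for_samples(uuids):
    params = {
        "filters": {"op": "in",
                    "content": {"field": "files.file_id", "value": uuids}},
        "fields": "cases.case_id,cases.submitter_id,cases.samples.sample_id",  # abbreviated
        "format": "JSON",
        "size": str(len(uuids)),
    }
    # GDC accepts the filters object as a JSON POST body.
    response = requests.post(GDC_URL, json=params)
    return response.json()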
@@ -158,59 +169,73 @@ def extract_data(data):
158 | 169 |             for idx, sample in enumerate(case['samples']):
159 | 170 |                 for portion in sample['portions']:
160 | 171 |                     for analyte in portion['analytes']:
| 172 | +
161 | 173 |                         for aliquot in analyte['aliquots']:
162 | 174 |                             if idx < len(case['diagnoses']):
163 | 175 |                                 diagnosis = case['diagnoses'][idx]
164 | 176 |                                 extracted.append({
165 | | -                                   'id': hit['id'],
166 | | -                                   'case_id': case['case_id'],
| 177 | +                                   'entry_id': hit['id'],
| 178 | +                                   'case_uuid': case['case_id'],
| 179 | +                                   'case_id': case['submitter_id'],
167 | 180 |                                     'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
168 | 181 |                                     'primary_diagnosis': diagnosis['primary_diagnosis'],
| 182 | +                                   'diagnosis_id': diagnosis['submitter_id'],
| 183 | +                                   'tumor_classification': diagnosis['classification_of_tumor'],
169 | 184 |                                     'sample_id': sample['sample_id'],
170 | 185 |                                     'sample_type': sample['sample_type'],
171 | | -                                   'tumor_descriptor': sample.get('tumor_descriptor', None),
| 186 | +                                   # 'tumor_descriptor': sample.get('tumor_descriptor', None),
172 | 187 |                                     'composition': sample.get('composition', None),
173 | | -                                   'aliquot_id': aliquot['aliquot_id']
| 188 | +                                   'id': aliquot['aliquot_id']
174 | 189 |                                 })
175 | 190 |     return pd.DataFrame(extracted)
176 | 191 |
177 | | -def filter_and_subset_data(df,sampfile):
| 192 | +def filter_and_subset_data(df, sampfile, mapfile):
178 | 193 |     """
179 | 194 |     Filter and subset the data.
180 | 195 |
181 | 196 |     Take a pandas DataFrame containing all sample information, filter it to the desired columns, and rename them to match the schema.
182 | 197 |
183 | 198 |     Parameters
184 | 199 |     ----------
185 | | -    df : pandas dataframe
| 200 | +    df : a tidied pandas dataframe
186 | 201 |         full samples table
187 | 202 |
188 | 203 |     Returns
189 | 204 |     -------
190 | 205 |     Pandas DataFrame
191 | 206 |     """
192 | 207 |     duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
| 208 | +   cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
193 | 209 |     filt = df[~duplicates_mask]
194 | | -   filt= filt.drop_duplicates(subset='aliquot_id', keep=False)
| 210 | +   filt = filt.drop_duplicates()  # previously: subset='id', keep=False
| 211 | +   filt = pd.merge(filt, cmap, on=['tissue_or_organ_of_origin', 'primary_diagnosis'], how='left')
195 | 212 |     filt = filt.rename(
196 | | -       columns={"tissue_or_organ_of_origin":"common_name",
197 | | -                "primary_diagnosis": "cancer_type",
198 | | -                "composition": "model_type",
199 | | -                "case_id": "other_names",
200 | | -                "aliquot_id": "other_id"}
| 213 | +       columns={"composition": "model_type",
| 214 | +                "case_id": "common_name",
| 215 | +                "id": "other_names"}
| 216 | +                # "id": "sample_uuid"}
201 | 217 |     )
202 | | -   filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
203 | | -   filt["other_id_source"] = "HCMI"
| 218 | +   ## now we can melt all the identifiers into other_id and other_id_source
| 219 | +   longtab = pd.melt(filt, id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'], value_vars=['diagnosis_id', 'tumor_classification', 'sample_type'])
| 220 | +   longtab = longtab.rename(columns={'variable': 'other_id_source', 'value': 'other_id'}).drop_duplicates()
| 221 | +   # filt = filt[["cancer_type","common_name","other_names","other_id","model_type"]]
| 222 | +   # filt["other_id_source"] = "HCMI"
204 | 223 |     # Create new improve sample IDs
205 | 224 |
206 | 225 |     #Non-docker:
207 | 226 |     # maxval = max(pd.read_csv('../cptac/cptac_samples.csv').improve_sample_id)
208 | 227 |     # Docker:
209 | 228 |     maxval = max(pd.read_csv(sampfile).improve_sample_id)
210 | | -   mapping = {other_id: i for i, other_id in enumerate(filt['other_id'].unique(), start=(int(maxval)+1))}
| 229 | +   alluuids = list(set(longtab.other_names))
| 230 | +
| 231 | +   mapping = pd.DataFrame.from_dict(
| 232 | +       {"other_names": [str(a) for a in alluuids],
| 233 | +        "improve_sample_id": range(int(maxval)+1, int(maxval)+len(alluuids)+1)
| 234 | +       })
| 235 | +   longtab = pd.merge(longtab, mapping, on='other_names', how='left')
211 | 236 |     # Use the map method to create the new column based on the lab-id column
212 | | -   filt['improve_sample_id'] = filt['other_id'].map(mapping)
213 | | -   return filt
| 237 | +   # longtab['improve_sample_id'] = longtab['other_id'].map(mapping)
| 238 | +   return longtab
214 | 239 |
215 | 240 |
216 | 241 | def main():
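To see what the new melt-and-merge logic in filter_and_subset_data produces, here is a self-contained toy run; the column names and operations mirror the diff, but the input values are invented and maxval stands in for the maximum improve_sample_id read from the previous sample file:

import pandas as pd

# Toy wide-format table mimicking the columns produced above (values invented).
filt = pd.DataFrame({
    'common_name': ['HCM-1'],
    'other_names': ['aliquot-uuid-1'],
    'model_type': ['organoid'],
    'cancer_type': ['colon adenocarcinoma'],
    'diagnosis_id': ['HCM-1_diagnosis'],
    'tumor_classification': ['primary'],
    'sample_type': ['Primary Tumor'],
})

# Melt the identifier columns into (other_id_source, other_id) pairs.
longtab = pd.melt(filt,
                  id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'],
                  value_vars=['diagnosis_id', 'tumor_classification', 'sample_type'])
longtab = longtab.rename(columns={'variable': 'other_id_source', 'value': 'other_id'}).drop_duplicates()

# Assign sequential improve_sample_id values per unique other_names, then merge back.
maxval = 100  # placeholder for max(pd.read_csv(sampfile).improve_sample_id)
alluuids = list(set(longtab.other_names))
mapping = pd.DataFrame({'other_names': [str(a) for a in alluuids],
                        'improve_sample_id': range(maxval + 1, maxval + len(alluuids) + 1)})
longtab = pd.merge(longtab, mapping, on='other_names', how='left')
print(longtab)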
@@ -240,14 +265,15 @@ def main():
240 | 265 |     """
241 | 266 |     parser = argparse.ArgumentParser()
242 | 267 |     parser.add_argument('--samples',dest='samps',help='Previous sample file')
| 268 | +   parser.add_argument('--mapfile', dest='map', help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin', default='hcmi_cancer_types.csv')
243 | 269 |     args = parser.parse_args()
244 | 270 |     manifest_path = "full_manifest.txt"
245 | 271 |     #manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/hcmi/full_manifest.txt"
246 | 272 |     #download_from_github(manifest_url, manifest_path)
247 | 273 |     uuids = extract_uuids_from_manifest(manifest_path)
248 | 274 |     metadata = fetch_metadata_for_samples(uuids)
249 | 275 |     df = extract_data(metadata)
250 | | -   output = filter_and_subset_data(df,args.samps)
| 276 | +   output = filter_and_subset_data(df, args.samps, args.map)
251 | 277 |     aligned = align_to_linkml_schema(output)
252 | 278 |     print(aligned)
253 | 279 |     aligned.to_csv("/tmp/hcmi_samples.csv",index=False)
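With the new flag in place, a run of the updated script might look like the following (the script and sample-file names here are placeholders, not from the diff; hcmi_cancer_types.csv is the default --mapfile value):

python build_hcmi_samples.py --samples /tmp/cptac_samples.csv --mapfile hcmi_cancer_types.csv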