Skip to content

Commit b40fcb3

Browse files
committed
updated with better sample matching
still missing transcriptomics samples
1 parent 585035a commit b40fcb3

File tree

5 files changed

+41
-11
lines changed

5 files changed

+41
-11
lines changed

build/docker/Dockerfile.pancpdo

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ WORKDIR /usr/src/app
44

55
COPY build/pancpdo/01-createPancPDOSamplesFile.py .
66
COPY build/pancpdo/02-getPancPDOData.py .
7+
# Filename must match the script in build/pancpdo that build_omics.sh runs (PancPDOData).
COPY build/pancpdo/02a-getPancPDODataFromSynapse.py .
78
COPY build/pancpdo/03-getPancPDODrugs.py .
89
COPY build/pancpdo/04-getPancPDOExperiments.py .
910
COPY build/pancpdo/05-addPrecalcAUC.py .

build/pancpdo/01-createPancPDOSamplesFile.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,26 @@ def filter_and_subset_data(df, maxval, mapfile):
270270
# Convert 'other_names' to string to ensure consistency
271271
longtab['other_names'] = longtab['other_names'].astype(str)
272272

273+
#print(longtab)
273274
# Reassign 'improve_sample_id's at the end
274275
unique_other_names = longtab['other_names'].unique()
275276
print("Number of unique 'other_names' after filtering:", len(unique_other_names))
276277

278+
##UPDATE: assign them to common_names instead!
279+
unique_common_names = longtab['common_name'].unique()
280+
print("Number of unique 'common_names' after filtering:", len(unique_common_names))
277281
# Create a new mapping
282+
#mapping = pd.DataFrame({
283+
# 'other_names': unique_other_names,
284+
# 'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
285+
#})
278286
mapping = pd.DataFrame({
279-
'other_names': unique_other_names,
280-
'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
281-
})
287+
'common_name':unique_common_names,
288+
'improve_sample_id': range(int(maxval) +1, int(maxval) + len(unique_common_names)+1)
289+
})
282290

283291
# Merge the mapping back into 'longtab'
284-
longtab = pd.merge(longtab, mapping, on='other_names', how='left')
292+
longtab = pd.merge(longtab, mapping, on='common_name', how='left')
285293

286294
# Debugging: Check longtab after reassigning IDs
287295
print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)

build/pancpdo/02-getPancPDOData.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
183183

184184
# Initialize retry variables
185185
retries = 0
186-
max_retries = 5
186+
max_retries = 1
187187

188188
# Function to get downloaded file IDs
189189
def get_downloaded_ids(manifest_loc):

build/pancpdo/02a-getPancPDODataFromSynapse.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,13 @@ def main():
100100
genes = pd.read_csv(args.genes)
101101

102102
sc = synapseclient.login(args.token)
103-
103+
##to double check identifiers, we use transcriptomics data since that determines what samples were sequenced
104+
trans = pd.read_csv('/tmp/pancpdo_transcriptomics.csv.gz')
105+
tsamps = samps[samps.improve_sample_id.isin(trans.improve_sample_id)]
106+
print(samps.shape)
107+
print(tsamps.shape)
108+
109+
104110
missingsamples = []
105111
if args.copy:
106112
##query synapse view for files
@@ -112,11 +118,17 @@ def main():
112118
sname = row['name'].split('--')[0]
113119
print(sid,sname)
114120
path = sc.get(sid).path
115-
if sname in set(samps.other_id):
121+
if sname in set(tsamps.other_id):
122+
print(sname+' in transcriptomics, using that id')
123+
sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
124+
missingsamples.append('copy,trans,'+sname)
125+
elif sname in set(samps.other_id):
126+
print(sname+' in samples but not transcriptomics, using other id')
116127
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
128+
missingsamples.append("copy,notrans,"+sname)
117129
else:
118-
print('Missing sample id for '+sname)
119-
missingsamples.append('copy,'+sname)
130+
print('Missing sample id for '+sname,' skipping for now')
131+
missingsamples.append('copy,missed,'+sname)
120132
continue
121133
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
122134
res = parseCNVFile(path,sampid, genes)
@@ -132,8 +144,14 @@ def main():
132144
sname = row['name'].split('--')[0]
133145
sid = row.id
134146
print(sid,sname)
135-
if sname in set(samps.other_id):
147+
if sname in set(tsamps.other_id):
148+
print(sname+' in transcriptomics, using that id')
149+
sampid = tsamps.loc[tsamps.other_id==sname]['improve_sample_id'].values[0]
150+
missingsamples.append('mutation,trans,'+sname)
151+
elif sname in set(samps.other_id):
152+
print(sname+' in samples but not transcriptomics, using other id')
136153
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
154+
missingsamples.append('mutation,notrans,'+sname)
137155
else:
138156
print('Missing sample id for '+sname)
139157
missingsamples.append('mutation,'+sname)
@@ -144,6 +162,6 @@ def main():
144162
alldats.append(res)
145163
newmut = pd.concat(alldats)
146164
newmut.to_csv("/tmp/pancpdo_mutations.csv.gz",compression='gzip',index=False)
147-
missingsamples.to_csv('missing.csv')
165+
pd.DataFrame(missingsamples).to_csv('missing.csv',index=False,quoting=None,header=False)
148166
if __name__=='__main__':
149167
main()

build/pancpdo/build_omics.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
66
echo "Running 02-getPancPDOData.py for transcriptomics."
77
python 02-getPancPDOData.py -m full_manifest.txt -t transcriptomics -o /tmp/pancpdo_transcriptomics.csv.gz -g $1 -s $2
88

9+
echo 'Running 02a-getPancPDODataFromSynapse.py for copy number and mutations'
10+
python 02a-getPancPDODataFromSynapse.py -g $1 -s $2 -t $SYNAPSE_AUTH_TOKEN -c -m
11+
912
#echo "Running 02-getPancPDOData.py for copy_number."
1013
#python 02-getPancPDOData.py -m full_manifest.txt -t copy_number -o /tmp/pancpdo_copy_number.csv.gz -g $1 -s $2
1114

0 commit comments

Comments
 (0)