Skip to content

Commit 03d12e0

Browse files
authored
Merge pull request #459 from PNNL-CompBio/deduplication_updates
Deduplication of numerous files - looks good!
2 parents: 0093960 + 31d194b; commit: 03d12e0

File tree

8 files changed: 13 additions (+13) and 5 deletions (-5)

coderbuild/beatAML/GetBeatAML.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ def generate_samples_file(prev_samples_path):
124124
mapping = {labId: i for i, labId in enumerate(all_samples['other_id'].unique(), start=(int(maxval)+1))}
125125
all_samples['improve_sample_id'] = all_samples['other_id'].map(mapping)
126126
all_samples.insert(1, 'improve_sample_id', all_samples.pop('improve_sample_id'))
127+
all_samples.drop_duplicates(inplace=True)
127128
all_samples.to_csv("/tmp/beataml_samples.csv", index=False)
128129
return all_samples
129130

coderbuild/broad_sanger/02-broadSangerOmics.R

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -668,9 +668,9 @@ main<-function(){
668668

669669
lapply(alltypes,function(dt){
670670
print(dt)
671-
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
671+
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()|>dplyr::distinct()
672672
readr::write_csv(temps,file=paste0('/tmp/sanger_',dt,'.csv.gz'))
673-
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
673+
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()|>dplyr::distinct()
674674
readr::write_csv(tempd,file=paste0('/tmp/broad_',dt,'.csv.gz'))
675675

676676
# readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))

coderbuild/broad_sanger/04-drug_dosage_and_curves.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@
5555
for of in outfiles:
5656
final_file.append(pd.read_csv(of,sep='\t'))
5757

58-
pd.concat(final_file).to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
58+
pd.concat(final_file).drop_duplicates().to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
5959
#os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
6060
#os.system('gzip -f /tmp/experiments.tsv')
6161

coderbuild/colorectal/02-omics-colorectal.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -242,6 +242,7 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
242242
else:
243243
print("Starting transcriptomics data.")
244244
transcriptomics_df = map_transcriptomics(transciptomics_data = "/tmp/GSE65253_col_tum_org_merge.csv.gz", improve_id_data = "/tmp/colorectal_samples.csv", entrez_data = "/tmp/genes.csv")
245+
transcriptomics_df.drop_duplicates(inplace=True)
245246
transcriptomics_df.to_csv("/tmp/colorectal_transcriptomics.csv", index=False)
246247

247248
if args.mutations:

coderbuild/liver/02-omics-liver.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -362,6 +362,7 @@ def map_proteomics(proteomics_data, improve_id_data, entrez_data):
362362
else:
363363
print("Starting transcriptomics data.")
364364
transcriptomics_df = map_transcriptomics(transciptomics_data = "/tmp/raw_rnaseq_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
365+
transcriptomics_df.drop_duplicates(inplace=True)
365366
transcriptomics_df.to_csv("/tmp/liver_transcriptomics.csv", index=False)
366367

367368
if args.mutations:
@@ -385,8 +386,9 @@ def map_proteomics(proteomics_data, improve_id_data, entrez_data):
385386
exit()
386387
else:
387388
print("Starting copy number data.")
388-
mutation_df = map_copy_number(copy_number_data = "/tmp/raw_copynum_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
389-
mutation_df.to_csv("/tmp/liver_copy_number.csv", index=False)
389+
copy_number_df = map_copy_number(copy_number_data = "/tmp/raw_copynum_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
390+
copy_number_df.drop_duplicates(inplace=True)
391+
copy_number_df.to_csv("/tmp/liver_copy_number.csv", index=False)
390392

391393
if args.proteomics:
392394
if args.genes is None or args.genes=='':

coderbuild/novartis/02-omics-novartis.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,7 @@ def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
295295
else:
296296
print("Starting transcriptomics data.")
297297
transcriptomics_df_final = map_transcriptomics_novPDX(transcriptomics_data = "/tmp/raw_rnaseq_data.csv", improve_id_data = "/tmp/novartis_samples.csv", entrez_data = "/tmp/genes.csv")
298+
transcriptomics_df_final.drop_duplicates(inplace=True)
298299
transcriptomics_df_final.to_csv("/tmp/novartis_transcriptomics.csv", index=False)
299300

300301
if args.mutations:
@@ -307,6 +308,7 @@ def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
307308
else:
308309
print("Starting mutations data.")
309310
mutation_df_final = map_mutations_novPDX(mutation_data = "/tmp/raw_mutation_data.csv", improve_id_data = "/tmp/novartis_samples.csv", entrez_data = "/tmp/genes.csv")
311+
mutation_df_final.drop_duplicates(inplace=True)
310312
mutation_df_final.to_csv("/tmp/novartis_mutations.csv", index=False)
311313

312314
if args.copy_number:

coderbuild/pancreatic/02a-getPancreaticDataFromSynapse.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,7 @@ def main():
165165
res = parseMutFile(path,sampid, genes)
166166
alldats.append(res)
167167
newmut = pd.concat(alldats)
168+
newmut.drop_duplicates(inplace=True)
168169
newmut.to_csv("/tmp/pancreatic_mutations.csv.gz",compression='gzip',index=False)
169170
#pd.DataFrame(missingsamples).to_csv('missing.csv',index=False,quoting=None,header=False)
170171
if __name__=='__main__':

coderbuild/utils/pubchem_retrieval.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -441,6 +441,7 @@ def update_dataframe_and_write_tsv(unique_names,
441441
final_df = pd.DataFrame(columns=combined.columns)
442442

443443
# --- 10) write final filtered output ---
444+
final_df.drop_duplicates(inplace=True)
444445
final_df.to_csv(output_filename, sep="\t", index=False)
445446

446447
if os.path.exists(temp_file):

0 commit comments

Comments
 (0)