Commit dbb019b

add creation of mutation count mastertable
1 parent 1436bc6 commit dbb019b

File tree

1 file changed: +69 −0 lines changed


scripts/prepare_data_for_improve.py

Lines changed: 69 additions & 0 deletions
@@ -1,6 +1,7 @@
 
 import argparse
 from copy import deepcopy
+import functools as ft
 import logging
 from os import PathLike
 from pathlib import Path
@@ -398,6 +399,74 @@ def process_datasets(args):
     )


+    #-------------------------------------------------------------------
+    # create mutation count table
+    #-------------------------------------------------------------------
+
+    # combining all mutation data
+    dfs_to_merge = {}
+    for data_set in data_sets:
+        if data_sets[data_set].experiments is not None and data_sets[data_set].mutations is not None:
+            dfs_to_merge[data_set] = data_sets[data_set].mutations
+            dfs_to_merge[data_set]['dataset_origin'] = data_set
+    merged_mutations = ft.reduce(
+        lambda left_df, right_df: pd.merge(
+            left_df[['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin']],
+            right_df[['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin']],
+            on=['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin'],
+            how='outer'),
+        dfs_to_merge.values())
+
+    # retrieving unique mutations (the merge above creates duplicates)
+    unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()
+
+    # counting the mutations per entrez_id/improve_sample_id pair and
+    # aggregating it into a pivot table (also filling NAs with 0s)
+    mutation_counts = pd.pivot_table(unique_mutations, values='mutation', index='entrez_id', columns='improve_sample_id',
+                                     aggfunc='count')
+    mutation_counts.fillna(0, inplace=True)
+
+    # merging in the gene_symbol and ensembl_gene_id
+    mutation_counts = pd.merge(
+        mutation_counts,
+        data_gene_names[[
+            'entrez_id',
+            'ensembl_gene_id',
+            'gene_symbol'
+        ]],
+        how='left',
+        on='entrez_id',
+    )
+
+    # rearranging the columns such that entrez_id, gene_symbol and
+    # ensembl_gene_id are the first three rows after transposing the
+    # table
+    mutation_counts.insert(
+        1,
+        'ensembl_gene_id',
+        mutation_counts.pop('ensembl_gene_id')
+    )
+    mutation_counts.insert(
+        1,
+        'gene_symbol',
+        mutation_counts.pop('gene_symbol')
+    )
+
+    # removing rows where we don't have a 'gene_symbol' for the
+    # entrez_id
+    mutation_counts = mutation_counts[mutation_counts['gene_symbol'].notna()]
+
+    # writing the dataframe to the mutation counts mastertable
+    outfile_path = args.WORKDIR.joinpath(
+        "data_out",
+        "x_data",
+        "cancer_mutation_count.tsv"
+    )
+    mutation_counts.T.to_csv(
+        path_or_buf=outfile_path,
+        sep='\t',
+        header=False
+    )


 def split_data_sets(
     args: dict,
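As a quick illustration (not part of the commit), the following minimal sketch runs the same counting pattern on two made-up mutation tables. The DataFrames df_a and df_b, their values, and the dataset names are hypothetical; only the column names entrez_id, improve_sample_id, mutation and dataset_origin mirror the script.

import functools as ft

import pandas as pd

# two hypothetical per-dataset mutation tables (toy values)
df_a = pd.DataFrame({
    'entrez_id': [1, 1, 2],
    'improve_sample_id': ['s1', 's1', 's2'],
    'mutation': ['c.1A>G', 'c.5C>T', 'c.9G>A'],
    'dataset_origin': 'dataset_a',
})
df_b = pd.DataFrame({
    'entrez_id': [1, 2],
    'improve_sample_id': ['s1', 's2'],
    'mutation': ['c.1A>G', 'c.7T>C'],
    'dataset_origin': 'dataset_b',
})

cols = ['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin']

# same reduce-over-outer-merges pattern as in the diff: stack all
# per-dataset tables into one long table
merged = ft.reduce(
    lambda left_df, right_df: pd.merge(
        left_df[cols], right_df[cols], on=cols, how='outer'),
    [df_a, df_b],
)

# keep one row per unique (gene, sample, mutation) triple, regardless
# of which dataset(s) reported it
unique = merged[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates()

# genes as rows, samples as columns, each cell = number of distinct
# mutations for that gene/sample pair; missing pairs become 0
counts = pd.pivot_table(
    unique,
    values='mutation',
    index='entrez_id',
    columns='improve_sample_id',
    aggfunc='count',
).fillna(0)

print(counts)
# improve_sample_id   s1   s2
# entrez_id
# 1                  2.0  0.0
# 2                  0.0  2.0

The outer merge on all four columns effectively concatenates the per-dataset tables; dropping dataset_origin before drop_duplicates then collapses mutations reported by more than one dataset into a single row, so each mutation is counted once.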

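A second small sketch, again with hypothetical toy data (the gene_symbol and ensembl_gene_id values are made up), shows the column-reordering and write-out step: after moving gene_symbol and ensembl_gene_id next to entrez_id, transposing and writing with header=False yields a TSV whose first three rows carry the gene identifiers and whose remaining rows are the per-sample counts.

import io

import pandas as pd

# a toy, already-annotated counts table: entrez_id, per-sample counts,
# and the gene annotations merged in at the end (as in the diff)
mutation_counts = pd.DataFrame({
    'entrez_id': [1, 2],
    's1': [2.0, 0.0],
    's2': [0.0, 2.0],
    'ensembl_gene_id': ['ENSG0000000A', 'ENSG0000000B'],
    'gene_symbol': ['GENEA', 'GENEB'],
})

# move gene_symbol and ensembl_gene_id right after entrez_id, so that
# after transposing they become the first three rows of the table
mutation_counts.insert(1, 'ensembl_gene_id', mutation_counts.pop('ensembl_gene_id'))
mutation_counts.insert(1, 'gene_symbol', mutation_counts.pop('gene_symbol'))

# write the transposed table as a headerless, tab-separated file
buf = io.StringIO()
mutation_counts.T.to_csv(path_or_buf=buf, sep='\t', header=False)
print(buf.getvalue())
# entrez_id        1             2
# gene_symbol      GENEA         GENEB
# ensembl_gene_id  ENSG0000000A  ENSG0000000B
# s1               2.0           0.0
# s2               0.0           2.0
# (columns are tab-separated in the actual output)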