|
1 | 1 |
|
2 | 2 | import argparse
|
3 | 3 | from copy import deepcopy
|
| 4 | +import functools as ft |
4 | 5 | import logging
|
5 | 6 | from os import PathLike
|
6 | 7 | from pathlib import Path
|
@@ -398,6 +399,74 @@ def process_datasets(args):
|
398 | 399 | )
|
399 | 400 |
|
400 | 401 |
|
| 402 | + #------------------------------------------------------------------- |
| 403 | + # create mutation count table |
| 404 | + #------------------------------------------------------------------- |
| 405 | + |
| 406 | + # combining all mutation data |
| 407 | + dfs_to_merge = {} |
| 408 | + for data_set in data_sets: |
| 409 | + if data_sets[data_set].experiments is not None and data_sets[data_set].mutations is not None: |
| 410 | + dfs_to_merge[data_set] = data_sets[data_set].mutations |
| 411 | + dfs_to_merge[data_set]['dataset_origin'] = data_set |
| 412 | + merged_mutations = ft.reduce( |
| 413 | + lambda left_df, right_df: pd.merge( |
| 414 | + left_df[['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin']], |
| 415 | + right_df[['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin']], |
| 416 | + on=['entrez_id', 'improve_sample_id', 'mutation', 'dataset_origin'], |
| 417 | + how='outer'), |
| 418 | + dfs_to_merge.values()) |
| 419 | + |
| 420 | + # retrieving unique mutations (the above creates duplicates) |
| 421 | + unique_mutations = merged_mutations[['entrez_id', 'improve_sample_id', 'mutation']].drop_duplicates() |
| 422 | + |
| 423 | + # counting the mutations per entrez_id/improve_sample_id pair and |
| 424 | + # aggregating them into a pivot table (also filling NAs with 0s) |
| 425 | + mutation_counts = pd.pivot_table(unique_mutations, values='mutation', index='entrez_id', columns='improve_sample_id', |
| 426 | + aggfunc='count') |
| 427 | + mutation_counts.fillna(0, inplace=True) |
| 428 | + |
| 429 | + # merging in the gene_symbol and ensembl_gene_id |
| 430 | + mutation_counts = pd.merge( |
| 431 | + mutation_counts, |
| 432 | + data_gene_names[[ |
| 433 | + 'entrez_id', |
| 434 | + 'ensembl_gene_id', |
| 435 | + 'gene_symbol' |
| 436 | + ]], |
| 437 | + how='left', |
| 438 | + on='entrez_id', |
| 439 | + ) |
| 440 | + |
| 441 | + # rearranging the columns such that entrez_id, gene_symbol and |
| 442 | + # ensembl_gene_id are the first three rows after transposing the |
| 443 | + # table |
| 444 | + mutation_counts.insert( |
| 445 | + 1, |
| 446 | + 'ensembl_gene_id', |
| 447 | + mutation_counts.pop('ensembl_gene_id') |
| 448 | + ) |
| 449 | + mutation_counts.insert( |
| 450 | + 1, |
| 451 | + 'gene_symbol', |
| 452 | + mutation_counts.pop('gene_symbol') |
| 453 | + ) |
| 454 | + |
| 455 | + # removing some rows where we don't have a 'gene_symbol' for the |
| 456 | + # entrez id |
| 457 | + mutation_counts = mutation_counts[mutation_counts['gene_symbol'].notna()] |
| 458 | + |
| 459 | + # writing the dataframe to the mutation counts mastertable |
| 460 | + outfile_path = args.WORKDIR.joinpath( |
| 461 | + "data_out", |
| 462 | + "x_data", |
| 463 | + "cancer_mutation_count.tsv" |
| 464 | + ) |
| 465 | + mutation_counts.T.to_csv( |
| 466 | + path_or_buf=outfile_path, |
| 467 | + sep='\t', |
| 468 | + header=False |
| 469 | + ) |
401 | 470 |
|
402 | 471 | def split_data_sets(
|
403 | 472 | args: dict,
|
|
0 commit comments