Skip to content

Commit db5217b

Browse files
Changed os.system command in GetBeatAML.py script and updated tpmFromCounts.py
1 parent bab688a commit db5217b

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

build/beatAML/GetBeatAML.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
653653
# New Transcriptomics Data
654654
print("Starting Transcriptomics Data")
655655
##first run conversion tool
656-
os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
656+
os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file,'tpm_'+transcriptomics_file))
657657

658658

659659
t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')

build/utils/tpmFromCounts.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import argparse
1616
import pandas as pd
1717

18-
def main(counts_data, genome_link, gene_column, out_file):
18+
def main(counts_data, genome_link, gene_column, exclude_columns, out_file):
1919
"""
2020
Converts RNA count matrix to tpm matrix (transcripts per million).
2121
@@ -29,6 +29,9 @@ def main(counts_data, genome_link, gene_column, out_file):
2929
3030
gene_column : string
3131
Column name of column with gene name information. Defaults to "stable_id".
32+
33+
exclude_columns : string
34+
Comma-separated names of columns to exclude from the patient list (no spaces around the commas). Defaults to "stable_id,display_label,description,biotype".
3235
3336
out_file : string
3437
Path to output csv. No default.
@@ -42,8 +45,11 @@ def main(counts_data, genome_link, gene_column, out_file):
4245
counts = pd.read_csv(counts_data,sep='\t')
4346
counts.index=counts[gene_column]
4447

48+
# parse list of columns to exclude from patients list
49+
exclude_cols_array = exclude_columns.split(",") # split long string by commas
50+
4551
##get list of patients
46-
pats = set(counts.columns)-set(counts.select_dtypes(include='object')) # get patient names from column names, excluding columns were the datatype is a string
52+
# Patient columns = all columns, minus string-typed (annotation) columns,
# minus any columns explicitly listed in the exclude_columns argument.
# Use the parsed list (exclude_cols_array); adding the raw comma-separated
# string to the DataFrame would not remove any column names.
pats = set(counts.columns) - set(counts.select_dtypes(include='object').columns) - set(exclude_cols_array)
4753

4854

4955
##transcript info from grc37
@@ -84,9 +90,10 @@ def main(counts_data, genome_link, gene_column, out_file):
8490
parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix')
8591
parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build')
8692
parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names')
93+
# Comma-separated list is split on "," inside main(), so names must not contain spaces around commas.
parser.add_argument('--exclude_col', default="stable_id,display_label,description,biotype", help='Comma-separated names (no spaces) of non-patient columns to exclude from the patient list')
8794
parser.add_argument('--out_file', default=None, help='Output csv name.')
8895

8996

9097
args = parser.parse_args()
9198
print('Creating TPM from '+args.counts)
92-
main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, out_file = args.out_file)
99+
main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, exclude_columns = args.exclude_col, out_file = args.out_file)

0 commit comments

Comments
 (0)