Skip to content

Commit 0225c52

Browse files
authored
Merge pull request #344 from PNNL-CompBio/update_tpmFromCounts
Update tpm from counts
2 parents 4c8e1b5 + 6af210e commit 0225c52

File tree

2 files changed

+53
-16
lines changed

2 files changed

+53
-16
lines changed

build/beatAML/GetBeatAML.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ def generate_drug_list(drug_map_path,drug_path):
653653
# New Transcriptomics Data
654654
print("Starting Transcriptomics Data")
655655
##first run conversion tool
656-
os.system("python tpmFromCounts.py --counts "+transcriptomics_file)
656+
os.system("python tpmFromCounts.py --counts {} --out_file {}".format(transcriptomics_file,'tpm_'+transcriptomics_file))
657657

658658

659659
t_df = pd.read_csv('tpm_'+transcriptomics_file, sep = '\t')

build/utils/tpmFromCounts.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,48 +6,85 @@
66
import argparse
77
import pandas as pd
88

9-
def main():
10-
parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix")
11-
parser.add_argument('--counts')
9+
def main(counts_data, genome_link, gene_column, exclude_columns, out_file):
10+
"""
11+
Converts RNA count matrix to tpm matrix (transcripts per million).
12+
13+
Parameters
14+
----------
15+
counts_data : string
16+
Path to RNA sequencing counts data. No default
17+
18+
genome_link : string
19+
Link to human genome build. Defaults to "https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz"
20+
21+
gene_column : string
22+
Column name of column with gene name information. Defaults to "stable_id".
23+
24+
exclude_columns : string
25+
Column names of columns to exclude from patient list. NO SPACES. Defaults to "stable_id,display_label,description,biotype".
1226
27+
out_file : string
28+
Path to output csv. No default.
1329
14-
args = parser.parse_args()
15-
print('Creating TPM from '+args.counts)
16-
counts = pd.read_csv(args.counts,sep='\t')
17-
counts.index=counts.stable_id
30+
Returns
31+
-------
32+
None
33+
34+
"""
35+
# read in counts data
36+
counts = pd.read_csv(counts_data,sep='\t')
37+
counts.index=counts[gene_column]
38+
39+
# parse list of columns to exclude from patients list
40+
exclude_cols_array = exclude_columns.split(",") # split long string by commas
1841

1942
##get list of patients
20-
pats = set(counts.columns)-set(['stable_id','display_label','description','biotype'])
43+
pats = set(counts.columns)-set(counts.select_dtypes(include='object') + exclude_columns) # get patient names from column names, excluding columns were the datatype is a string and any columns in the exclude_columns arg
44+
2145

2246
##transcript info from grc37
2347
# gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') # the "current" dir no longer exists...
24-
gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#')
48+
gtf = pd.read_csv(genome_link,sep='\t',comment='#')
2549
gtf.index = [a.split(';')[0].split(' ')[1].strip('"') for a in gtf[gtf.columns[8]]]
2650
##first select only exons
2751
gtf = gtf[gtf.gene=='exon']
2852
##compute length and convert to kilobases
2953
length = abs(gtf[gtf.columns[4]]-gtf[gtf.columns[3]])/1000 ##get difference between start and end, then divide by kb
30-
3154
#gtf = gtf[gtf.gene=='exon'].groupby(level=0).sum() ##sum over all the exons for a particular gene. yes this doesn't really tell you about which exon
3255
length = length.groupby(level=0).sum() ##sum over all exon lengths for each gene
33-
56+
3457

3558
##set counts matrix
3659
X = counts[list(pats)]
37-
tg = [g for g in X. index if g in length.index]
60+
tg = [g for g in X.index if g in length.index]
61+
3862

3963
X = X.loc[tg].transpose()
4064
length = pd.Series(length)[X.columns]
4165
# length =length.loc[tg]
42-
66+
67+
4368
## df = pd.DataFrame(lengths=lengths,Genes=gnames)
4469
C = X.values
4570
L = length.values
4671
N = X.sum(axis=1).values.reshape(-1,1)
4772
rpk = C/L
4873
per_million_scaling_factor = (rpk.sum(axis=1)/1e6).reshape(-1,1)
4974
tpm = pd.DataFrame( rpk/per_million_scaling_factor, index=X.index, columns=X.columns).transpose()
50-
tpm.to_csv('tpm_'+args.counts,sep='\t')
75+
tpm.to_csv(out_file, sep='\t')
76+
5177

5278
if __name__=='__main__':
53-
main()
79+
parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix")
80+
81+
parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix')
82+
parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build')
83+
parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names')
84+
parser.add_argument('--exclude_col', default="stable_id,display_label,description,biotype", help='Name of column with gene names')
85+
parser.add_argument('--out_file', default=None, help='Output csv name.')
86+
87+
88+
args = parser.parse_args()
89+
print('Creating TPM from '+args.counts)
90+
main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, exclude_columns = args.exclude_col, out_file = args.out_file)

0 commit comments

Comments
 (0)