Skip to content

Commit b18ef86

Browse files
changed tpmFromCounts to take arguments such as genome build, gene_co, and out_file. Also, detects non-RNA counts data by exculding cols that contain dtype str
1 parent e65634b commit b18ef86

File tree

1 file changed

+35
-17
lines changed

1 file changed

+35
-17
lines changed

build/utils/tpmFromCounts.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,71 @@
11
'''
22
python script to read in counts matrix and gene lengths to calculate tpm
33
4+
Input: expects patient Id's on column names
5+
6+
Args:
7+
genome_link : str
8+
Link to human genome build. Defualts to "https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz"
9+
10+
cols_to_exclude : list
11+
List of column names to exclude. Script assumes that patient Id's are in column names. Put columns to exclude (that don't have patient info in them).
12+
413
'''
514

615
import argparse
716
import pandas as pd
817

9-
def main():
10-
parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix")
11-
parser.add_argument('--counts')
12-
13-
14-
args = parser.parse_args()
15-
print('Creating TPM from '+args.counts)
16-
counts = pd.read_csv(args.counts,sep='\t')
17-
counts.index=counts.stable_id
18+
def main(counts_data, genome_link, gene_column, out_file):
19+
# read in counts data
20+
if isinstance(counts_data, pd.DataFrame) == False:
21+
counts = pd.read_csv(counts_data,sep='\t')
22+
counts.index=counts[gene_column]
1823

1924
##get list of patients
20-
pats = set(counts.columns)-set(['stable_id','display_label','description','biotype'])
25+
pats = set(counts.columns)-set(counts.select_dtypes(include='object')) # get patient names from column names, excluding columns were the datatype is a string
26+
2127

2228
##transcript info from grc37
2329
# gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') # the "current" dir no longer exists...
24-
gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#')
30+
gtf = pd.read_csv(genome_link,sep='\t',comment='#')
2531
gtf.index = [a.split(';')[0].split(' ')[1].strip('"') for a in gtf[gtf.columns[8]]]
2632
##first select only exons
2733
gtf = gtf[gtf.gene=='exon']
2834
##compute length and convert to kilobases
2935
length = abs(gtf[gtf.columns[4]]-gtf[gtf.columns[3]])/1000 ##get difference between start and end, then divide by kb
30-
3136
#gtf = gtf[gtf.gene=='exon'].groupby(level=0).sum() ##sum over all the exons for a particular gene. yes this doesn't really tell you about which exon
3237
length = length.groupby(level=0).sum() ##sum over all exon lengths for each gene
33-
38+
3439

3540
##set counts matrix
3641
X = counts[list(pats)]
37-
tg = [g for g in X. index if g in length.index]
42+
tg = [g for g in X.index if g in length.index]
43+
3844

3945
X = X.loc[tg].transpose()
4046
length = pd.Series(length)[X.columns]
4147
# length =length.loc[tg]
42-
48+
49+
4350
## df = pd.DataFrame(lengths=lengths,Genes=gnames)
4451
C = X.values
4552
L = length.values
4653
N = X.sum(axis=1).values.reshape(-1,1)
4754
rpk = C/L
4855
per_million_scaling_factor = (rpk.sum(axis=1)/1e6).reshape(-1,1)
4956
tpm = pd.DataFrame( rpk/per_million_scaling_factor, index=X.index, columns=X.columns).transpose()
50-
tpm.to_csv('tpm_'+args.counts,sep='\t')
57+
tpm.to_csv(out_file, sep='\t')
58+
5159

5260
if __name__=='__main__':
53-
main()
61+
parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix")
62+
63+
parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix')
64+
parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build')
65+
parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names')
66+
parser.add_argument('--out_file', default=None, help='Output csv name.')
67+
68+
69+
args = parser.parse_args()
70+
print('Creating TPM from '+args.counts)
71+
main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, out_file = args.out_file)

0 commit comments

Comments
 (0)