|
1 | 1 | '''
|
2 | 2 | python script to read in counts matrix and gene lengths to calculate tpm
|
3 | 3 |
|
| 4 | +Input: expects patient Id's on column names |
| 5 | +
|
| 6 | +Args: |
| 7 | +genome_link : str |
| 8 | + Link to human genome build. Defualts to "https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz" |
| 9 | +
|
| 10 | +cols_to_exclude : list |
| 11 | + List of column names to exclude. Script assumes that patient Id's are in column names. Put columns to exclude (that don't have patient info in them). |
| 12 | +
|
4 | 13 | '''
|
5 | 14 |
|
6 | 15 | import argparse
|
7 | 16 | import pandas as pd
|
8 | 17 |
|
9 |
| -def main(): |
10 |
| - parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix") |
11 |
| - parser.add_argument('--counts') |
12 |
| - |
13 |
| - |
14 |
| - args = parser.parse_args() |
15 |
| - print('Creating TPM from '+args.counts) |
16 |
| - counts = pd.read_csv(args.counts,sep='\t') |
17 |
| - counts.index=counts.stable_id |
| 18 | +def main(counts_data, genome_link, gene_column, out_file): |
| 19 | + # read in counts data |
| 20 | + if isinstance(counts_data, pd.DataFrame) == False: |
| 21 | + counts = pd.read_csv(counts_data,sep='\t') |
| 22 | + counts.index=counts[gene_column] |
18 | 23 |
|
19 | 24 | ##get list of patients
|
20 |
| - pats = set(counts.columns)-set(['stable_id','display_label','description','biotype']) |
| 25 | + pats = set(counts.columns)-set(counts.select_dtypes(include='object')) # get patient names from column names, excluding columns were the datatype is a string |
| 26 | + |
21 | 27 |
|
22 | 28 | ##transcript info from grc37
|
23 | 29 | # gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') # the "current" dir no longer exists...
|
24 |
| - gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') |
| 30 | + gtf = pd.read_csv(genome_link,sep='\t',comment='#') |
25 | 31 | gtf.index = [a.split(';')[0].split(' ')[1].strip('"') for a in gtf[gtf.columns[8]]]
|
26 | 32 | ##first select only exons
|
27 | 33 | gtf = gtf[gtf.gene=='exon']
|
28 | 34 | ##compute length and convert to kilobases
|
29 | 35 | length = abs(gtf[gtf.columns[4]]-gtf[gtf.columns[3]])/1000 ##get difference between start and end, then divide by kb
|
30 |
| - |
31 | 36 | #gtf = gtf[gtf.gene=='exon'].groupby(level=0).sum() ##sum over all the exons for a particular gene. yes this doesn't really tell you about which exon
|
32 | 37 | length = length.groupby(level=0).sum() ##sum over all exon lengths for each gene
|
33 |
| - |
| 38 | + |
34 | 39 |
|
35 | 40 | ##set counts matrix
|
36 | 41 | X = counts[list(pats)]
|
37 |
| - tg = [g for g in X. index if g in length.index] |
| 42 | + tg = [g for g in X.index if g in length.index] |
| 43 | + |
38 | 44 |
|
39 | 45 | X = X.loc[tg].transpose()
|
40 | 46 | length = pd.Series(length)[X.columns]
|
41 | 47 | # length =length.loc[tg]
|
42 |
| - |
| 48 | + |
| 49 | + |
43 | 50 | ## df = pd.DataFrame(lengths=lengths,Genes=gnames)
|
44 | 51 | C = X.values
|
45 | 52 | L = length.values
|
46 | 53 | N = X.sum(axis=1).values.reshape(-1,1)
|
47 | 54 | rpk = C/L
|
48 | 55 | per_million_scaling_factor = (rpk.sum(axis=1)/1e6).reshape(-1,1)
|
49 | 56 | tpm = pd.DataFrame( rpk/per_million_scaling_factor, index=X.index, columns=X.columns).transpose()
|
50 |
| - tpm.to_csv('tpm_'+args.counts,sep='\t') |
| 57 | + tpm.to_csv(out_file, sep='\t') |
| 58 | + |
51 | 59 |
|
52 | 60 | if __name__=='__main__':
|
53 |
| - main() |
| 61 | + parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix") |
| 62 | + |
| 63 | + parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix') |
| 64 | + parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build') |
| 65 | + parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names') |
| 66 | + parser.add_argument('--out_file', default=None, help='Output csv name.') |
| 67 | + |
| 68 | + |
| 69 | + args = parser.parse_args() |
| 70 | + print('Creating TPM from '+args.counts) |
| 71 | + main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, out_file = args.out_file) |
0 commit comments