|
6 | 6 | import argparse
|
7 | 7 | import pandas as pd
|
8 | 8 |
|
9 |
| -def main(): |
10 |
| - parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix") |
11 |
| - parser.add_argument('--counts') |
| 9 | +def main(counts_data, genome_link, gene_column, exclude_columns, out_file): |
| 10 | + """ |
| 11 | + Converts RNA count matrix to tpm matrix (transcripts per million). |
| 12 | + |
| 13 | + Parameters |
| 14 | + ---------- |
| 15 | + counts_data : string |
| 16 | + Path to RNA sequencing counts data. No default |
| 17 | + |
| 18 | + genome_link : string |
| 19 | + Link to human genome build. Defaults to "https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz" |
| 20 | +
|
| 21 | + gene_column : string |
| 22 | + Column name of column with gene name information. Defaults to "stable_id". |
| 23 | + |
| 24 | + exclude_columns : string |
| 25 | + Column names of columns to exclude from patient list. NO SPACES. Defaults to "stable_id,display_label,description,biotype". |
12 | 26 |
|
| 27 | + out_file : string |
| 28 | + Path to output csv. No default. |
13 | 29 |
|
14 |
| - args = parser.parse_args() |
15 |
| - print('Creating TPM from '+args.counts) |
16 |
| - counts = pd.read_csv(args.counts,sep='\t') |
17 |
| - counts.index=counts.stable_id |
| 30 | + Returns |
| 31 | + ------- |
| 32 | + None |
| 33 | + |
| 34 | + """ |
| 35 | + # read in counts data |
| 36 | + counts = pd.read_csv(counts_data,sep='\t') |
| 37 | + counts.index=counts[gene_column] |
| 38 | + |
| 39 | + # parse list of columns to exclude from patients list |
| 40 | + exclude_cols_array = exclude_columns.split(",") # split long string by commas |
18 | 41 |
|
19 | 42 | ##get list of patients
|
20 |
| - pats = set(counts.columns)-set(['stable_id','display_label','description','biotype']) |
| 43 | + pats = set(counts.columns)-set(counts.select_dtypes(include='object') + exclude_columns) # get patient names from column names, excluding columns were the datatype is a string and any columns in the exclude_columns arg |
| 44 | + |
21 | 45 |
|
22 | 46 | ##transcript info from grc37
|
23 | 47 | # gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') # the "current" dir no longer exists...
|
24 |
| - gtf = pd.read_csv("https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz",sep='\t',comment='#') |
| 48 | + gtf = pd.read_csv(genome_link,sep='\t',comment='#') |
25 | 49 | gtf.index = [a.split(';')[0].split(' ')[1].strip('"') for a in gtf[gtf.columns[8]]]
|
26 | 50 | ##first select only exons
|
27 | 51 | gtf = gtf[gtf.gene=='exon']
|
28 | 52 | ##compute length and convert to kilobases
|
29 | 53 | length = abs(gtf[gtf.columns[4]]-gtf[gtf.columns[3]])/1000 ##get difference between start and end, then divide by kb
|
30 |
| - |
31 | 54 | #gtf = gtf[gtf.gene=='exon'].groupby(level=0).sum() ##sum over all the exons for a particular gene. yes this doesn't really tell you about which exon
|
32 | 55 | length = length.groupby(level=0).sum() ##sum over all exon lengths for each gene
|
33 |
| - |
| 56 | + |
34 | 57 |
|
35 | 58 | ##set counts matrix
|
36 | 59 | X = counts[list(pats)]
|
37 |
| - tg = [g for g in X. index if g in length.index] |
| 60 | + tg = [g for g in X.index if g in length.index] |
| 61 | + |
38 | 62 |
|
39 | 63 | X = X.loc[tg].transpose()
|
40 | 64 | length = pd.Series(length)[X.columns]
|
41 | 65 | # length =length.loc[tg]
|
42 |
| - |
| 66 | + |
| 67 | + |
43 | 68 | ## df = pd.DataFrame(lengths=lengths,Genes=gnames)
|
44 | 69 | C = X.values
|
45 | 70 | L = length.values
|
46 | 71 | N = X.sum(axis=1).values.reshape(-1,1)
|
47 | 72 | rpk = C/L
|
48 | 73 | per_million_scaling_factor = (rpk.sum(axis=1)/1e6).reshape(-1,1)
|
49 | 74 | tpm = pd.DataFrame( rpk/per_million_scaling_factor, index=X.index, columns=X.columns).transpose()
|
50 |
| - tpm.to_csv('tpm_'+args.counts,sep='\t') |
| 75 | + tpm.to_csv(out_file, sep='\t') |
| 76 | + |
51 | 77 |
|
52 | 78 | if __name__=='__main__':
|
53 |
| - main() |
| 79 | + parser = argparse.ArgumentParser("Quick script to get TPM from counts matrix") |
| 80 | + |
| 81 | + parser.add_argument('--counts', default=None, help='Transcriptomics counts matrix') |
| 82 | + parser.add_argument('--genome_build', default="https://ftp.ensembl.org/pub/grch37/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz", help='Link to human genome build') |
| 83 | + parser.add_argument('--gene_col', default="stable_id", help='Name of column with gene names') |
| 84 | + parser.add_argument('--exclude_col', default="stable_id,display_label,description,biotype", help='Name of column with gene names') |
| 85 | + parser.add_argument('--out_file', default=None, help='Output csv name.') |
| 86 | + |
| 87 | + |
| 88 | + args = parser.parse_args() |
| 89 | + print('Creating TPM from '+args.counts) |
| 90 | + main(counts_data = args.counts, genome_link = args.genome_build, gene_column = args.gene_col, exclude_columns = args.exclude_col, out_file = args.out_file) |
0 commit comments