From 2cb41db7069b7fe2a4d8a40c01fa786237687e2f Mon Sep 17 00:00:00 2001 From: Adetunji Date: Mon, 19 Apr 2021 16:00:24 -0500 Subject: [PATCH] Update BAM2GFF_gtftogenes --- bin/BAM2GFF_gtftogenes.py | 136 +++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 69 deletions(-) diff --git a/bin/BAM2GFF_gtftogenes.py b/bin/BAM2GFF_gtftogenes.py index 1dff362..1fda578 100755 --- a/bin/BAM2GFF_gtftogenes.py +++ b/bin/BAM2GFF_gtftogenes.py @@ -3,18 +3,10 @@ Generate genomic coordinates of all promoters, 5'UTR, 3'UTR, CDS ''' +import sys import os import argparse -if not os.path.exists('annotation'): - os.makedirs('annotation') - -#initialize outputfiles -PSEUDOGFF = open('annotation/genes.gff', 'w') -PROMOTERSGFF = open('annotation/promoters.gff', 'w') -UPSTREAMGFF = open('annotation/upstream.gff', 'w') -DOWNSTREAMGFF = open('annotation/downstream.gff', 'w') - def parse_genelocations(chromz, results, flank): """ Parse genomic regions Args: @@ -22,6 +14,12 @@ def parse_genelocations(chromz, results, flank): results (string) : Individual gene coordinates flank (int) : genomic distance from start/end site """ + + #initialize outputfiles + PROMOTERSGFF = open('annotation/promoters.gff', 'a') + UPSTREAMGFF = open('annotation/upstream.gff', 'a') + DOWNSTREAMGFF = open('annotation/downstream.gff', 'a') + lines = results.split("\t") lines[3] = int(lines[3]) lines[4] = int(lines[4]) @@ -60,79 +58,79 @@ def parse_genelocations(chromz, results, flank): downstart, downend, "\t".join(lines[5:]))) def main(): - usage = "usage: %prog -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] \ - -c [CHROMSIZES] -d [DISTANCE(bp) from start/end site]" + usage = "usage: " + sys.argv[0] + " -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] " \ + "-c [CHROMSIZES] -d [DISTANCE(bp) from start/end site] [-h help]" parser = argparse.ArgumentParser(usage=usage) parser.add_argument("-g", "--gtf", dest="gtf", required=True, help="Enter .gtf/gff file to be processed.") - parser.add_argument("-d", "--dist", dest="distance", default=2000, type=int, - help="Distance (bp) from site [TSS/TES].") + parser.add_argument("-d", "--dist", dest="distance", required=False, + default=2000, type=int, help="Distance (bp) from site [TSS/TES].") parser.add_argument("-c", "--chrom", dest="chrom", required=True, - help="Enter ucsc chrom sizes file to be processed.") + help="Enter UCSC chrom sizes file to be processed.") options = parser.parse_args() - print(options) + #print(options) + + if not os.path.exists('annotation'): + os.makedirs('annotation') flank = options.distance #flank distance from TSS / TES - if options.chrom and options.gtf: - chromsizefile = open(options.chrom, 'r') - chrom_sizes = {} - for line in chromsizefile: - line = line.split('\t') - chrom_sizes[line[0]] = line[1].rstrip("\n") + chromsizefile = open(options.chrom, 'r') + chrom_sizes = {} + for line in chromsizefile: + line = line.split('\t') + chrom_sizes[line[0]] = line[1].rstrip("\n") - #feature = options.feature - gtf_file = open(options.gtf, 'r') - - feature_dict = {} - if not options.gtf.split('.')[-1] == 'gtf': - sys.exit("ERROR :\tGTF required") - - #reading the feature type to be used - for line in gtf_file: - if not line.startswith('#'): - lines = line.split("\t") - feature_dict[lines[2]] = lines[2] + #feature = options.feature + gtf_name = options.gtf + gtf_file = open(options.gtf, 'r') + + feature_dict = {} - if 'transcript' in feature_dict: - feature = "transcript" - elif 'gene' in feature_dict: - feature = "gene" - else: - sys.exit("ERROR :\tGTF with either transcript/gene annotation is needed") - - print("NOTE :\tFeature type used is '%s'" %(feature)) + #reading the feature type to be used + for line in gtf_file: + if not line.startswith('#'): + lines = line.split("\t") + feature_dict[lines[2]] = lines[2] + + if 'transcript' in feature_dict: + feature = "transcript" + elif 'gene' in feature_dict: + feature = "gene" + else: + sys.exit("ERROR :\tGTF/GFF with either transcript/gene annotation is needed") - #reading the gff_file to parse regions - gff_file = open(options.gtf, 'r') + print("NOTE :\tFeature type used is '%s'" %(feature)) - if options.gtf.split('.')[-1] == 'gff': - for line in gff_file: - if not line.startswith('#'): - lines = line.split("\t") - if not lines[0].startswith('chr'): - lines[0] = "chr"+lines[0] - if lines[2] == feature: - results = ("{0}\t{1}".format(lines[0], "\t".join(lines[1:]))) - PSEUDOGFF.write(results+"\n") - parse_genelocations(chrom_sizes, results, flank) - elif options.gtf.split('.')[-1] == 'gtf': - for line in gff_file: - if not line.startswith('#'): - lines = line.split("\t") - if not lines[0].startswith('chr'): - lines[0] = "chr"+lines[0] - if lines[2] == feature: - newline = lines[8].split(' ') - results = ("{0}\t{1}\t{2}={3}".format(lines[0], - "\t".join(lines[1:8]), - newline[0], newline[1])) - PSEUDOGFF.write(results + "\n") - parse_genelocations(chrom_sizes, results, flank) - else: - parser.print_help() - + #reading the gff_file to parse regions + gff_file = open(options.gtf, 'r') + + #initialize output files + PSEUDOGFF = open('annotation/genes.gff', 'w') + PROMOTERSGFF = open('annotation/promoters.gff', 'w') + UPSTREAMGFF = open('annotation/upstream.gff', 'w') + DOWNSTREAMGFF = open('annotation/downstream.gff', 'w') + + for line in gff_file: + if not line.startswith('#'): + lines = line.rstrip("\n").split("\t") + if not lines[0].startswith('chr'): + lines[0] = "chr"+lines[0] + if lines[2] == feature: + if gtf_name.split('.')[-1] == 'gff' or gtf_name.split('.')[-1] == 'gff3': + newline = lines[8].split(';') + transcript = [s for s in newline if "transcript_id=" in s] + results = ("{0}\t{1}\t{2}".format(lines[0], "\t".join(lines[1:8]), transcript[0])) + elif gtf_name.split('.')[-1] == 'gtf': + newline = lines[8].split(' ') + results = ("{0}\t{1}\t{2}={3}".format(lines[0], + "\t".join(lines[1:8]), + newline[0], newline[1].strip('";'))) + else: + sys.exit("ERROR :\tFailed to process %s" %(gtf_name)) + PSEUDOGFF.write(results+"\n") + parse_genelocations(chrom_sizes, results, flank) if __name__ == "__main__": main()