From 2cb41db7069b7fe2a4d8a40c01fa786237687e2f Mon Sep 17 00:00:00 2001
From: Adetunji <madetunj@splprhpc06.cm.cluster>
Date: Mon, 19 Apr 2021 16:00:24 -0500
Subject: [PATCH] Update BAM2GFF_gtftogenes

---
 bin/BAM2GFF_gtftogenes.py | 136 +++++++++++++++++++-------------------
 1 file changed, 67 insertions(+), 69 deletions(-)

diff --git a/bin/BAM2GFF_gtftogenes.py b/bin/BAM2GFF_gtftogenes.py
index 1dff362..1fda578 100755
--- a/bin/BAM2GFF_gtftogenes.py
+++ b/bin/BAM2GFF_gtftogenes.py
@@ -3,18 +3,10 @@
     Generate genomic coordinates of all promoters, 5'UTR, 3'UTR, CDS
 '''
 
+import sys
 import os
 import argparse
 
-if not os.path.exists('annotation'):
-    os.makedirs('annotation')
-
-#initialize outputfiles
-PSEUDOGFF = open('annotation/genes.gff', 'w')
-PROMOTERSGFF = open('annotation/promoters.gff', 'w')
-UPSTREAMGFF = open('annotation/upstream.gff', 'w')
-DOWNSTREAMGFF = open('annotation/downstream.gff', 'w')
-
 def parse_genelocations(chromz, results, flank):
     """ Parse genomic regions
     Args:
@@ -22,6 +14,12 @@ def parse_genelocations(chromz, results, flank):
         results (string) : Individual gene coordinates
         flank (int) : genomic distance from start/end site
     """
+    
+    #initialize outputfiles
+    PROMOTERSGFF = open('annotation/promoters.gff', 'a')
+    UPSTREAMGFF = open('annotation/upstream.gff', 'a')
+    DOWNSTREAMGFF = open('annotation/downstream.gff', 'a')
+    
     lines = results.split("\t")
     lines[3] = int(lines[3])
     lines[4] = int(lines[4])
@@ -60,79 +58,79 @@ def parse_genelocations(chromz, results, flank):
                                                       downstart, downend, "\t".join(lines[5:])))
 
 def main():
-    usage = "usage: %prog -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] \
-             -c [CHROMSIZES] -d [DISTANCE(bp) from start/end site]"
+    usage = "usage: " + sys.argv[0] + " -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] " \
+            "-c [CHROMSIZES] -d [DISTANCE(bp) from start/end site] [-h help]"
     parser = argparse.ArgumentParser(usage=usage)
     parser.add_argument("-g", "--gtf", dest="gtf", required=True,
                         help="Enter .gtf/gff file to be processed.")
-    parser.add_argument("-d", "--dist", dest="distance", default=2000, type=int,
-                        help="Distance (bp) from site [TSS/TES].")
+    parser.add_argument("-d", "--dist", dest="distance", required=False,
+                        default=2000, type=int, help="Distance (bp) from site [TSS/TES].")
     parser.add_argument("-c", "--chrom", dest="chrom", required=True,
-                        help="Enter ucsc chrom sizes file to be processed.")
+                        help="Enter UCSC chrom sizes file to be processed.")
 
     options = parser.parse_args()
-    print(options)
+    #print(options)
+
+    if not os.path.exists('annotation'):
+        os.makedirs('annotation')
 
     flank = options.distance #flank distance from TSS / TES
 
-    if options.chrom and options.gtf:
-        chromsizefile = open(options.chrom, 'r')
-        chrom_sizes = {}
-        for line in chromsizefile:
-            line = line.split('\t')
-            chrom_sizes[line[0]] = line[1].rstrip("\n")
+    chromsizefile = open(options.chrom, 'r')
+    chrom_sizes = {}
+    for line in chromsizefile:
+        line = line.split('\t')
+        chrom_sizes[line[0]] = line[1].rstrip("\n")
 
-        #feature = options.feature
-        gtf_file = open(options.gtf, 'r')
-        
-        feature_dict = {}
-        if not options.gtf.split('.')[-1] == 'gtf':
-            sys.exit("ERROR :\tGTF required")
-            
-        #reading the feature type to be used
-        for line in gtf_file:
-            if not line.startswith('#'):
-                lines = line.split("\t")
-                feature_dict[lines[2]] = lines[2]
+    #feature = options.feature
+    gtf_name = options.gtf
+    gtf_file = open(options.gtf, 'r')
+    
+    feature_dict = {}
 
-        if 'transcript' in feature_dict:
-            feature = "transcript"
-        elif 'gene' in feature_dict:
-            feature = "gene"
-        else:
-            sys.exit("ERROR :\tGTF with either transcript/gene annotation is needed")
-            
-        print("NOTE :\tFeature type used is '%s'" %(feature))
+    #reading the feature type to be used
+    for line in gtf_file:
+        if not line.startswith('#'):
+            lines = line.split("\t")
+            feature_dict[lines[2]] = lines[2]
+
+    if 'transcript' in feature_dict:
+        feature = "transcript"
+    elif 'gene' in feature_dict:
+        feature = "gene"
+    else:
+        sys.exit("ERROR :\tGTF/GFF with either transcript/gene annotation is needed")
         
-        #reading the gff_file to parse regions
-        gff_file = open(options.gtf, 'r')
+    print("NOTE :\tFeature type used is '%s'" %(feature))
     
-        if options.gtf.split('.')[-1] == 'gff':
-            for line in gff_file:
-                if not line.startswith('#'):
-                    lines = line.split("\t")
-                    if not lines[0].startswith('chr'):
-                        lines[0] = "chr"+lines[0]
-                    if lines[2] == feature:
-                        results = ("{0}\t{1}".format(lines[0], "\t".join(lines[1:])))
-                        PSEUDOGFF.write(results+"\n")
-                        parse_genelocations(chrom_sizes, results, flank)
-        elif options.gtf.split('.')[-1] == 'gtf':
-            for line in gff_file:
-                if not line.startswith('#'):
-                    lines = line.split("\t")
-                    if not lines[0].startswith('chr'):
-                        lines[0] = "chr"+lines[0]
-                    if lines[2] == feature:
-                        newline = lines[8].split(' ')
-                        results = ("{0}\t{1}\t{2}={3}".format(lines[0],
-                                                                 "\t".join(lines[1:8]),
-                                                                 newline[0], newline[1]))
-                        PSEUDOGFF.write(results + "\n")
-                        parse_genelocations(chrom_sizes, results, flank)
-    else:
-        parser.print_help()
-
+    #reading the gff_file to parse regions
+    gff_file = open(options.gtf, 'r')
+    
+    #initialize output files
+    PSEUDOGFF = open('annotation/genes.gff', 'w')
+    PROMOTERSGFF = open('annotation/promoters.gff', 'w')
+    UPSTREAMGFF = open('annotation/upstream.gff', 'w')
+    DOWNSTREAMGFF = open('annotation/downstream.gff', 'w')
+    
+    for line in gff_file:
+        if not line.startswith('#'):
+            lines = line.rstrip("\n").split("\t")
+            if not lines[0].startswith('chr'):
+                lines[0] = "chr"+lines[0]
+            if lines[2] == feature:
+                if gtf_name.split('.')[-1] == 'gff' or gtf_name.split('.')[-1] == 'gff3':
+                    newline = lines[8].split(';')
+                    transcript = [s for s in newline if "transcript_id=" in s]
+                    results = ("{0}\t{1}\t{2}".format(lines[0], "\t".join(lines[1:8]), transcript[0]))
+                elif gtf_name.split('.')[-1] == 'gtf':
+                    newline = lines[8].split(' ')
+                    results = ("{0}\t{1}\t{2}={3}".format(lines[0],
+                                                          "\t".join(lines[1:8]),
+                                                          newline[0], newline[1].strip('";')))
+                else:
+                    sys.exit("ERROR :\tFailed to process %s" %(gtf_name))
+                PSEUDOGFF.write(results+"\n")
+                parse_genelocations(chrom_sizes, results, flank)
 
 if __name__ == "__main__":
     main()