Skip to content

Commit

Permalink
modified insert_classify related functions
Browse files Browse the repository at this point in the history
  • Loading branch information
friend1ws committed Apr 15, 2024
1 parent ab70214 commit c045d3d
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 6 deletions.
13 changes: 9 additions & 4 deletions nanomonsv/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,16 @@ def create_parser():
insert_classify.add_argument("reference_fasta", metavar = "reference.fa", default = None, type = str,
help = "Path to the reference genome sequence")

insert_classify.add_argument("--grc", default = False, action = 'store_true',
help = "Deprecated. This is not used any more. Convert chromosome names to Genome Reference Consortium nomenclature (default: %(default)s)")
# insert_classify.add_argument("--grc", default = False, action = 'store_true',
# help = "Deprecated. This is not used any more. Convert chromosome names to Genome Reference Consortium nomenclature (default: %(default)s)")
insert_classify.add_argument("gtf_file", metavar = "gencode.gtf.gz", default = None, type = str,
help = "Path to GFT file for transcript")

insert_classify.add_argument("--genome_id", choices = ["hg19", "hg38", "mm10"], default = "hg38",
help = "Genome id used for selecting UCSC-GRC chromosome name corresponding files (default: %(default)s)")
# insert_classify.add_argument("--genome_id", choices = ["hg19", "hg38", "mm10"], default = "hg38",
# help = "Genome id used for selecting UCSC-GRC chromosome name corresponding files (default: %(default)s)")

insert_classify.add_argument("LINE1_db", default = None, type = str,
help = "Path to LINE1 position file")

insert_classify.add_argument("--debug", default = False, action = 'store_true', help = "keep intermediate files")

Expand Down
49 changes: 47 additions & 2 deletions nanomonsv/insert_classify.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! /user/bin/env python3

import sys, re, pkg_resources
import sys, re, os, gzip, subprocess, pkg_resources
import pysam

from .swalign import *
Expand All @@ -24,6 +24,48 @@ def make_fasta_file(input_file, output_file, seq_id_file):
sid = sid + 1


def make_exon_bed_from_gtf(gtf_file, exon_bed_file):

if gtf_file.endswith(".gtf"):
hin = open(gtf_file, 'r')
elif gtf_file.endswith(".gtf.gz"):
hin = gzip.open(gtf_file, 'rt')
else:
print("The input GTF file should ends with 'gtf' or 'gtf.gz'", file = sys.stderr)
sys.exit(1)

hout = open(output_file + ".unsorted.tmp", 'w')
for line in hin:
if line.startswith('#'): continue
F = line.rstrip('\n').split('\t')
if F[2] != "exon": continue
infos = F[8].split('; ')

transcript_id = None
exon_number = None
for elm in infos:
if elm.startswith("transcript_id "):
transcript_id = elm.replace("transcript_id ", '').strip('"')
if elm.startswith("exon_number "):
exon_number = elm.replace("exon_number ", '').strip('"')


print(f'{F[0]}\t{int(F[3]) - 1}\t{F[4]}\t{transcript_id}\t{exon_number}\t{F[6]}', file = hout)

hin.close()
hout.close()

with open(output_file + ".sorted.tmp", 'w') as hout:
subprocess.run(["sort", "-k1,1", "-k2,2n", "-k3,3n", output_file + ".unsorted.tmp"], stdout = hout)

with open(output_file, 'w') as hout:
subprocess.run(["bgzip", "-f", "-c", output_file + ".sorted.tmp"], stdout = hout)

subprocess.run(["tabix", "-p", "bed", output_file])

os.remove(output_file + ".unsorted.tmp")
os.remove(output_file + ".sorted.tmp")


def sam2bed_split(input_file, output_file):

Expand Down Expand Up @@ -484,7 +526,7 @@ def summarize_bwa_alignment2(input_sam, seq_list, output_file):



def organize_info(rmsk_file, alignment_file, tsd_file, seq_list, output_file, genome_id):
def organize_info(rmsk_file, alignment_file, tsd_file, seq_list, output_file, LINE1_db_file):

key2rmsk = {}
with open(rmsk_file, 'r') as hin:
Expand All @@ -507,10 +549,13 @@ def organize_info(rmsk_file, alignment_file, tsd_file, seq_list, output_file, ge

keys = list(set(list(key2rmsk) + list(key2alignment) + list(key2tsd_polyAT)))

"""
if genome_id == "hg38":
line1_tb = pysam.TabixFile(pkg_resources.resource_filename("nanomonsv", "data/LINE1.hg38.bed.gz"))
else:
line1_tb = pysam.TabixFile(pkg_resources.resource_filename("nanomonsv", "data/LINE1.hg19.bed.gz"))
"""
line1_tb = pysam.TabixFile(LINE1_db_file)

with open(output_file, 'w') as hout:

Expand Down

0 comments on commit c045d3d

Please sign in to comment.