Skip to content
This repository has been archived by the owner on Feb 12, 2018. It is now read-only.

Commit

Permalink
tools in file with tools
Browse files Browse the repository at this point in the history
  • Loading branch information
katyasosa committed Dec 10, 2012
1 parent 5ab9f4b commit 11a10f9
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 1 deletion.
23 changes: 23 additions & 0 deletions katyasosa/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Data/ECO2_seq_gm*
Data/ECO3_seq_gm*
Data/ECO6_seq_gm*
Data/ECO6_seq_gm*
Data/MRU5_seq_gm*
Data/MRU6_seq_gm*
Data/MRU7_seq_gm*
Data/MRU9_seq_gm*
Data/PHE4_seq_gm*
Data/PHE6_seq_gm*
Data/PHE7_seq_gm*
gmsuite/*
Data/ECO2_seq_on*
Data/ECO3_seq_on*
Data/ECO6_seq_on*
Data/ECO6_seq_on*
Data/MRU5_seq_on*
Data/MRU6_seq_on*
Data/MRU7_seq_on*
Data/MRU9_seq_on*
Data/PHE4_seq_on*
Data/PHE6_seq_on*
Data/PHE7_seq_on*
1 change: 0 additions & 1 deletion katyasosa/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def count_metrics(genes_sam_path, predict_sam_path):
def compare_data(tools, genomes, data_dir):
for genome_name, genes_path in genomes.iteritems():
for tool in tools:
print tool.executable
path_result = tool.execute(genome_name)
path_prefix = make_sam_out(path_result, genes_path, genome_name,
data_dir, tool.name)
Expand Down
237 changes: 237 additions & 0 deletions katyasosa/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
from abc import abstractmethod, ABCMeta
import os
import re
import shutil
import subprocess
from tempfile import NamedTemporaryFile
from Bio import SeqIO
import itertools

__author__ = 'katya'

def common_gc_content(sequences):
"""
Example:
s = "AGCCT"
gc_content = count_gc_content(s)
print "GC content is %.2f%%"%gc_content
Result:
GC content is 60.00%
"""
GC_count, length = 0, 0
for seq_record in sequences:
sequence = seq_record.seq
GC_count += sequence.count("G") + sequence.count("C")
length += len(sequence) - sequence.count("N")
return round((1.0 * GC_count)/length * 100, 2)

class GeneFinder(object):
__metaclass__ = ABCMeta

executable = "gene_finder"
name = "gene_finder"

def __init__(self, data_dir, lib_dir):
self.lib_dir = lib_dir
self.data_dir = data_dir
self.install()

def execute(self, genome_name):
genome_path = os.path.join(self.data_dir, genome_name + ".fasta")
args = self.get_args(genome_path)
devnull = open(os.devnull, "w")

subprocess.check_call(
[os.path.join(self.lib_dir, self.executable)] + args,
stdout=devnull, stderr=devnull)
parsed_result_path = os.path.join(self.data_dir,
"{0}_{1}_genes.fasta".format(genome_name, self.name))

SeqIO.write(self.parse_result(genome_path), parsed_result_path, "fasta")
return parsed_result_path

__call__ = execute

@abstractmethod
def install(self):
pass

@abstractmethod
def get_args(self, _genome_path):
pass

@abstractmethod
def parse_result(self, _genome_path):
pass


class GeneMarkCommonGC(GeneFinder):
"""
./gm -rn -m heuristic_path genome_path
"""
executable = "gm"
name = "gm_commonGC"

def install(self):
gm_key_path = os.path.join(self.lib_dir, "gm_key")
if not os.path.isfile(os.path.expanduser("~/.gm_key")):
shutil.copyfile(gm_key_path, os.path.expanduser("~/.gm_key"))

def get_args(self, genome_path):
gc = common_gc_content(SeqIO.parse(genome_path, "fasta"))
gc = min(70, max(30, int(gc)))
heuristic_dir = os.path.join(self.lib_dir, "heuristic_mat")
heuristic_filename = os.path.join(heuristic_dir, "heu_11_{0}.mat".format(gc))
return ["-rn", "-m", heuristic_filename, genome_path]

def parse_result(self, genome_path):
result_path = genome_path + ".fasta.rgn"
if not os.path.isfile(os.path.expanduser(result_path)):
return

gm_result = SeqIO.parse(result_path, "fasta")

for seq_record in gm_result:
seq_record.description = ""
yield seq_record

class GeneMarkEveryGC(GeneMarkCommonGC):
"""
For every GC:
./gm -rn -m heuristic_for_the_gc_path contigs_with_the_gc_path
"""
executable = "gm"
name = "gm_everyGC"

def execute(self, genome_name):
genome_path = os.path.join(self.data_dir, genome_name + ".fasta")

counter = [0]
def rename(gene):
gene.id = gene.description = gene.name = self.name + "_" + str(counter[0])
counter[0] += 1
return gene

genes = []
execute = super(GeneMarkEveryGC, self).execute
for seq_record in SeqIO.parse(genome_path, "fasta"):
if len(seq_record.seq) > 200:
with NamedTemporaryFile(prefix=seq_record.id,
suffix=".fasta") as f:
SeqIO.write(seq_record, f, "fasta")
f.flush()
current_genes = SeqIO.parse(execute(f.name.replace(".fasta", "")),
"fasta")
genes.append(itertools.imap(rename, current_genes))

parsed_result_path = os.path.join(self.data_dir,
"{0}_{1}_genes.fasta".format(genome_name, self.name))
SeqIO.write(itertools.chain(*genes), parsed_result_path, "fasta")
return parsed_result_path

class GeneMarkHmmCommomGC(GeneFinder):
executable = "gmhmmp"
name = "gmhmmp_commonGC"

def install(self):
gm_key_path = os.path.join(self.lib_dir, "gm_key")
if not os.path.isfile(os.path.expanduser("~/.gm_key")):
shutil.copyfile(gm_key_path, os.path.expanduser("~/.gm_key"))

def get_args(self, genome_path):
gc = common_gc_content(SeqIO.parse(genome_path, "fasta"))
gc = min(70, max(30, int(gc)))
heuristic_dir = os.path.join(self.lib_dir, "heuristic_mod")
heuristic_filename = os.path.join(heuristic_dir, "heu_11_{0}.mod".format(gc))
result_path = genome_path + '.gmhmm'
return ['-d', '-p', '0','-m', heuristic_filename, '-o', result_path, genome_path]

def parse_result(self, genome_path):
result_path = genome_path + '.gmhmm'
reading_gene = False
with open(result_path) as f:
for line in f:
if line.startswith('>gene'):
reading_gene = True
seq = []
seq_id = re.sub(r'[\s>]', '', line)
# >gene_2|GeneMark.hmm|57_nt|+|1|57 >NODE_3_length_713_cov_1.25228
elif reading_gene:
if line.isspace():
reading_gene = False
seq = SeqIO.Seq(''.join(seq))
#genes.append(Gene(contig_id, strand, left_index, right_index, str_seq))
yield SeqIO.SeqRecord(seq, id='>' + seq_id, description = '', name='')
else:
seq.append(line.strip())

class GeneMarkHmmEveryGC(GeneMarkHmmCommomGC):
executable = 'gmhmmp'
name = 'gmhmmp_everyGC'

def execute(self, genome_name):
genome_path = os.path.join(self.data_dir, genome_name + ".fasta")

counter = [0]
def rename(gene):
gene.id = gene.description = gene.name = self.name + "_" + str(counter[0])
counter[0] += 1
return gene

genes = []
execute = super(GeneMarkHmmCommomGC, self).execute
for seq_record in SeqIO.parse(genome_path, "fasta"):
if len(seq_record.seq) > 200:
with NamedTemporaryFile(prefix=seq_record.id,
suffix=".fasta") as f:
SeqIO.write(seq_record, f, "fasta")
f.flush()
current_genes = SeqIO.parse(execute(f.name.replace(".fasta", "")),
"fasta")
genes.append(itertools.imap(rename, current_genes))

parsed_result_path = os.path.join(self.data_dir,
"{0}_{1}_genes.fasta".format(genome_name, self.name))
SeqIO.write(itertools.chain(*genes), parsed_result_path, "fasta")
return parsed_result_path

class GeneMarkS(GeneFinder):
executable = 'gmsn.pl'
name = 'gms'

def install(self):
gm_key_path = os.path.join(self.lib_dir, "gm_key")
if not os.path.isfile(os.path.expanduser("~/.gm_key")):
shutil.copyfile(gm_key_path, os.path.expanduser("~/.gm_key"))

def get_args(self, genome_path):
return ['--clean', genome_path]

def parse_result(self, genome_path):
result_path = genome_path + ".fasta.lst"
if not os.path.isfile(os.path.expanduser(result_path)):
return
contigs = dict([(s.id, s.seq) for s in SeqIO.parse(open(genome_path),
'fasta')])
with open(result_path, 'r') as f:
reading_genes = False
for line in f.readlines():
if line.startswith(' #'):
reading_genes = True
continue
if line.startswith('---') or line.strip() == '':
reading_genes = False
if reading_genes:
gene_sp = re.split(r'[\t ]+', line.strip())
seq_id = contig_id + '_gene_' + gene_sp[0]
l_index = int(gene_sp[2].replace('<', '')) -1
r_index = int(gene_sp[3].replace('>', ''))
seq = contig_seq[l_index:r_index]
yield SeqIO.SeqRecord(seq, id=seq_id, description='', name='')
if line.startswith('FASTA definition line'):
contig_id = line.strip().replace('FASTA definition line: ', '')
contig_seq = contigs[contig_id]
os.remove(result_path)
os.remove('gms.log')
os.remove('GeneMark_hmm.mod')

0 comments on commit 11a10f9

Please sign in to comment.