From 5ab9f4b5cc19b5ae99c9c5472a58ff12ac892037 Mon Sep 17 00:00:00 2001 From: katyasosa Date: Mon, 10 Dec 2012 13:07:15 +0400 Subject: [PATCH] New structure --- katyasosa/compare.py | 161 +++++-------------------------------------- 1 file changed, 19 insertions(+), 142 deletions(-) diff --git a/katyasosa/compare.py b/katyasosa/compare.py index eaccf15..fa30eb2 100644 --- a/katyasosa/compare.py +++ b/katyasosa/compare.py @@ -1,138 +1,10 @@ # -*- coding: utf-8 -*- -from abc import ABCMeta, abstractmethod + from collections import namedtuple import os -import shutil import subprocess -from tempfile import NamedTemporaryFile -from Bio import SeqIO -import itertools - - -class GeneFinder(object): - __metaclass__ = ABCMeta - - executable = "gene_finder" - name = "gene_finder" - - def __init__(self, data_dir, lib_dir): - self.lib_dir = lib_dir - self.data_dir = data_dir - self.install() - - def execute(self, genome_name): - self.genome_name = genome_name - genome_path = os.path.join(self.data_dir, genome_name + ".fasta") - args = self.get_args(genome_path) - devnull = open(os.devnull, "w") - - subprocess.check_call( - [os.path.join(self.lib_dir, self.executable)] + args, - stdout=devnull, stderr=devnull) - parsed_result_path = os.path.join(self.data_dir, - "{0}_{1}_genes.fasta".format(genome_name, self.name)) - - def rename(gene): - gene.id = gene.name = gene.description = self.name - result = itertools.imap(rename, self.parse_result()) - SeqIO.write(self.parse_result(), parsed_result_path, "fasta") - return parsed_result_path - - __call__ = execute - - @abstractmethod - def install(self): - pass - - @abstractmethod - def get_args(self, _genome_path): - pass - - @abstractmethod - def parse_result(self): - pass - - -class GeneMarkCommonGC(GeneFinder): - """ - ./gm -rn -m heuristic_path genome_path - """ - executable = "gm" - name = "gm_commonGC" - - def install(self): - gm_key_path = os.path.join(self.lib_dir, "gm_key") - if not os.path.isfile(os.path.expanduser("~/.gm_key")): - shutil.copyfile(gm_key_path, os.path.expanduser("~/.gm_key")) - - def get_args(self, genome_path): - gc = common_gc_content(SeqIO.parse(genome_path, "fasta")) - gc = min(70, max(30, int(gc))) - heuristic_dir = os.path.join(self.lib_dir, "heuristic_mat") - heuristic_filename = os.path.join(heuristic_dir, "heu_11_{0}.mat".format(gc)) - return ["-rn", "-m", heuristic_filename, genome_path] - - def parse_result(self): - result_path = os.path.join(self.data_dir, self.genome_name + ".fasta.rgn") - if not os.path.isfile(os.path.expanduser(result_path)): - return - - gm_result = SeqIO.parse(result_path, "fasta") - - for seq_record in gm_result: - seq_record.description = "" - yield seq_record - -class GeneMarkEveryGC(GeneMarkCommonGC): - """ - For every GC: - ./gm -rn -m heuristic_for_the_gc_path contigs_with_the_gc_path - """ - executable = "gm" - name = "gm_everyGC" - - def execute(self, genome_name): - genome_path = os.path.join(self.data_dir, genome_name + ".fasta") - - counter = [0] - def rename(gene): - gene.id = gene.description = gene.name = self.name + "_" + str(counter[0]) - counter[0] += 1 - return gene - - genes = [] - execute = super(GeneMarkEveryGC, self).execute - for seq_record in SeqIO.parse(genome_path, "fasta"): - if len(seq_record.seq) > 200: - with NamedTemporaryFile(prefix=seq_record.id, - suffix=".fasta") as f: - SeqIO.write(seq_record, f, "fasta") - f.flush() - current_genes = SeqIO.parse(execute(f.name.replace(".fasta", "")), - "fasta") - genes.append(itertools.imap(rename, current_genes)) - - parsed_result_path = os.path.join(self.data_dir, - "{0}_{1}_genes.fasta".format(genome_name, self.name)) - SeqIO.write(itertools.chain(*genes), parsed_result_path, "fasta") - return parsed_result_path - -def common_gc_content(sequences): - """ - Example: - s = "AGCCT" - gc_content = count_gc_content(s) - print "GC content is %.2f%%"%gc_content +from tools import GeneMarkCommonGC, GeneMarkEveryGC, GeneMarkHmmCommomGC, GeneMarkHmmEveryGC, GeneMarkS - Result: - GC content is 60.00% - """ - GC_count, length = 0, 0 - for seq_record in sequences: - sequence = seq_record.seq - GC_count += sequence.count("G") + sequence.count("C") - length += len(sequence) - sequence.count("N") - return round((1.0 * GC_count)/length * 100, 2) def make_sam_out(predict_path, genes_path, genome_name, data_dir, method_name): devnull = open(os.devnull, "w") @@ -151,18 +23,18 @@ def make_sam_out(predict_path, genes_path, genome_name, data_dir, method_name): def get_sam_map(sam_path): res = {} - keys = set() - with open(sam_path) as first: - for line in first: + qnames = set() + with open(sam_path) as sam_file: + for line in sam_file: if line.startswith('@'): pass else: sl = line.split() qname, rname, mapq = sl[0], sl[2], int(sl[4]) - keys.add(qname) + qnames.add(qname) if mapq >= 200: #mapq = -10*log(10, Pr{mapp. pos. is wrong}), if mapq = 200, Pr{..} = 10**(-20) res[qname] = rname - return res, len(keys) + return res, len(qnames) def count_metrics(genes_sam_path, predict_sam_path): predict_on_genes, predict_count = get_sam_map(predict_sam_path) @@ -179,6 +51,7 @@ def count_metrics(genes_sam_path, predict_sam_path): def compare_data(tools, genomes, data_dir): for genome_name, genes_path in genomes.iteritems(): for tool in tools: + print tool.executable path_result = tool.execute(genome_name) path_prefix = make_sam_out(path_result, genes_path, genome_name, data_dir, tool.name) @@ -191,15 +64,19 @@ def compare_data(tools, genomes, data_dir): if __name__ == "__main__": data_dir = 'Data' - lib_dir = 'genemark_suite_linux_64/gmsuite' - genomes = {'ECO2_seq': 'Data/eco_genes.fasta', 'ECO3_seq': 'Data/eco_genes.fasta', - 'ECO6_seq': 'Data/eco_genes.fasta', 'ECO7_seq': 'Data/eco_genes.fasta', - 'MRU5_seq': 'Data/mru_genes.fasta', 'MRU6_seq': 'Data/mru_genes.fasta', - 'MRU9_seq': 'Data/mru_genes.fasta', 'PHE4_seq': 'Data/phe_genes.fasta', - 'PHE6_seq': 'Data/phe_genes.fasta', 'PHE7_seq': 'Data/phe_genes.fasta'} + lib_dir = '/home/katya/CSC/python2012/katyasosa/gmsuite' + genomes = {'ECO2_seq': 'Data/eco_genes.fasta'} +# 'ECO3_seq': 'Data/eco_genes.fasta', +# 'ECO6_seq': 'Data/eco_genes.fasta', 'ECO7_seq': 'Data/eco_genes.fasta', +# 'MRU5_seq': 'Data/mru_genes.fasta', 'MRU6_seq': 'Data/mru_genes.fasta', +# 'MRU9_seq': 'Data/mru_genes.fasta', 'PHE4_seq': 'Data/phe_genes.fasta', +# 'PHE6_seq': 'Data/phe_genes.fasta', 'PHE7_seq': 'Data/phe_genes.fasta'} gm_common_gc = GeneMarkCommonGC(data_dir, lib_dir) gm_every_gc = GeneMarkEveryGC(data_dir, lib_dir) - tools = [gm_common_gc, gm_every_gc] + gm_hmm_common_gc = GeneMarkHmmCommomGC(data_dir, lib_dir) + gm_hmm_every_gc = GeneMarkHmmEveryGC(data_dir, lib_dir) + gm_s = GeneMarkS(data_dir, lib_dir) + tools = [gm_common_gc, gm_every_gc, gm_hmm_common_gc, gm_hmm_every_gc, gm_s] with open('compare_out', 'w') as out: for data in compare_data(tools, genomes, data_dir): out.write(','.join([str(x) for x in data]) + '\n') \ No newline at end of file