add test data

ktisha · Dec 10, 2012 · a776f19 · a776f19
1 parent cbe314d
commit a776f19
Show file tree

Hide file tree

Showing 20 changed files with 1,062,274 additions and 140 deletions.
diff --git a/katyasosa/Data/ECO2_seq.fasta b/katyasosa/Data/ECO2_seq.fasta
diff --git a/katyasosa/Data/ECO3_seq.fasta b/katyasosa/Data/ECO3_seq.fasta
diff --git a/katyasosa/Data/ECO6_seq.fasta b/katyasosa/Data/ECO6_seq.fasta
diff --git a/katyasosa/Data/ECO7_seq.fasta b/katyasosa/Data/ECO7_seq.fasta
diff --git a/katyasosa/Data/MRU5_seq.fasta b/katyasosa/Data/MRU5_seq.fasta
diff --git a/katyasosa/Data/MRU6_seq.fasta b/katyasosa/Data/MRU6_seq.fasta
diff --git a/katyasosa/Data/MRU7_seq.fasta b/katyasosa/Data/MRU7_seq.fasta
diff --git a/katyasosa/Data/MRU9_seq.fasta b/katyasosa/Data/MRU9_seq.fasta
diff --git a/katyasosa/Data/PHE4_seq.fasta b/katyasosa/Data/PHE4_seq.fasta
diff --git a/katyasosa/Data/PHE6_seq.fasta b/katyasosa/Data/PHE6_seq.fasta
diff --git a/katyasosa/Data/PHE7_seq.fasta b/katyasosa/Data/PHE7_seq.fasta
diff --git a/katyasosa/Data/eco_genes.fasta b/katyasosa/Data/eco_genes.fasta
diff --git a/katyasosa/Data/mru_genes.fasta b/katyasosa/Data/mru_genes.fasta
diff --git a/katyasosa/Data/phe_genes.fasta b/katyasosa/Data/phe_genes.fasta
diff --git a/katyasosa/README.md b/katyasosa/README.md
@@ -0,0 +1,55 @@
+```
+ _____       ______  _____
+|  __ \      |  ___|/  __ \
+| |  \/  ___ | |_   | /  \/  ___   _ __ ___   _ __
+| | __  / _ \|  _|  | |     / _ \ | '_ ` _ \ | '_ \
+| |_\ \|  __/| |    | \__/\| (_) || | | | | || |_) |
+ \____/ \___|\_|     \____/ \___/ |_| |_| |_|| .__/
+                                             | |
+                                             |_|
+
+                       -- all your genes belong to us!
+```
+
+GeFComp is a Python script for comparing performance metrics of different gene
+finding tools. Given a number of [genome assemblies] [ga-wiki] in FASTA format,
+GeFComp executes each available tool on each of the genomes and evaluates
+*Type I* and *Type II* errors, also known as false positives and false negatives.
+Results are then summarised
+
+Currently, GeFComp supports:
+
+* [GeneMark] [gm]
+* [GeneMark.hmm] [gm]
+* [GeneMark-S] [gm]
+
+[ga-wiki]: http://en.wikipedia.org/wiki/Genome_project#Genome_assembly
+[ggplot2]: http://ggplot2.org
+[gm]: http://exon.gatech.edu
+
+## Installation
+
+Python-only requirements can be installed via the usual `pip` boilerplate, but to do the
+evaluation you also have to make sure that the following tools are available in `$PATH`:
+
+* [BWA] [bwa], a popular short read aligner.
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Or, if you prefer Debian and system-wide installation:
+
+```bash
+# aptitude install python-biopython bwa
+```
+
+[bwa]: http://bio-bwa.sourceforge.net
+
+## Usage
+
+```bash
+$ gefcomp.py config.py
+$ ls *.csv
+summary.csv
+```
diff --git a/katyasosa/compare.py b/katyasosa/compare.py
diff --git a/katyasosa/config.py b/katyasosa/config.py
@@ -0,0 +1,22 @@
+from tools import *
+
+RESULT_DIR = ''
+
+DATA_DIR = 'Data'
+GENOMES = {'ECO2_seq': 'Data/eco_genes.fasta',
+                     'ECO3_seq': 'Data/eco_genes.fasta',
+                     'ECO6_seq': 'Data/eco_genes.fasta',
+                     'ECO7_seq': 'Data/eco_genes.fasta',
+                     'MRU5_seq': 'Data/mru_genes.fasta',
+                     'MRU6_seq': 'Data/mru_genes.fasta',
+                     'MRU9_seq': 'Data/mru_genes.fasta',
+                     'PHE4_seq': 'Data/phe_genes.fasta',
+                     'PHE6_seq': 'Data/phe_genes.fasta',
+                     'PHE7_seq': 'Data/phe_genes.fasta'}
+
+
+GENE_FINDER_TOOLS = [GeneMarkCommonGC(DATA_DIR, 'genemark_suite_linux_64/gmsuite'),
+                     GeneMarkEveryGC(DATA_DIR, 'genemark_suite_linux_64/gmsuite'),
+                     GeneMarkHmmCommomGC(DATA_DIR, 'genemark_suite_linux_64/gmsuite'),
+                     GeneMarkHmmEveryGC(DATA_DIR, 'genemark_suite_linux_64/gmsuite'),
+                     GeneMarkS(DATA_DIR, 'genemark_suite_linux_64/gmsuite')]
diff --git a/katyasosa/gefcomp.py b/katyasosa/gefcomp.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+
+from collections import namedtuple
+import csv
+import importlib
+import os
+import subprocess
+import itertools
+import sys
+import imp
+from config import *
+
+
+
+def cross_align(predict_path, genes_path, out_path_prefix, tool_name):
+    """Aligns predicted genes on known genes using BWA and vise versa.
+
+    :param predict_path: path to a FASTA file with genes, predicted
+                         by the tool, being evaluated.
+    :param genes_path: path to a FASTA file with **true** genes.
+    :param out_path_prefix: a prefix for resulting alignment-files in
+                            SAM format.
+    :param tool_name: human-readable name of the tool, being evaluated.
+    :return: a tuple, where the first argument is a path to a SAM file
+             with predicted genes, aligned on known genes, and second
+             is a SAM file with known genes, aligned on predicted genes.
+    """
+    devnull = open(os.devnull, 'w')
+    subprocess.call(['bwa', 'index', '-a', 'bwtsw', predict_path],
+        stdout=devnull, stderr=devnull)
+    subprocess.call(['bwa', 'index', '-a', 'bwtsw', genes_path],
+        stdout=devnull, stderr=devnull)
+
+    genes_sam_path = out_path_prefix + tool_name + '_on.sam'
+    predict_sam_path = out_path_prefix + 'on_' + tool_name + '.sam'
+    with open(genes_sam_path, 'w') as fout:
+        subprocess.call(['bwa', 'bwasw', predict_path, genes_path],
+            stdout=fout, stderr=devnull)
+    with open(predict_sam_path, 'w') as fout:
+        subprocess.call(['bwa', 'bwasw', genes_path, predict_path],
+            stdout=fout, stderr=devnull)
+
+    return genes_sam_path, predict_sam_path
+
+
+Hypothesis = namedtuple('Hypothesis', ['name', 'TP', 'FP', 'FN', 'precision', 'recall', 'f1_score'])
+
+def evaluate_alignments(genes_sam_path, predict_sam_path):
+    """Evaluates precision-recall and F1-score metrics for aligned genes."""
+    def unique_alignments(sam_path):
+        res, seen = {}, set()
+        with open(sam_path) as sam_file:
+            for line in sam_file:
+                if line.startswith('@'):
+                    continue
+
+                chunks = line.split()
+                qname, rname, mapq = chunks[0], chunks[2], int(chunks[4])
+                seen.add(qname)
+
+                # mapq = -10*log(10, Pr{mapp. pos. is wrong}), if mapq = 200, Pr{..} = 10**(-20)
+                if mapq >= 200:
+                    res[qname] = rname
+
+        return res, len(seen)
+
+    predict_on_genes, predict_count = unique_alignments(predict_sam_path)
+    genes_on_predict, genes_count = unique_alignments(genes_sam_path)
+
+    # Hypothesis #1: predicted gene is actually a known gene.
+    TP = len(predict_on_genes)
+    FP = predict_count - TP
+    FN = genes_count - len(set(predict_on_genes.values()))
+
+    precision = TP * 1. / (TP + FP)
+    recall = TP * 1. / (TP + FN)
+    f_score = 2 * precision * recall / (precision + recall)
+    H_predict_is_gene = Hypothesis('predict is gene', TP, FP, FN,
+        precision, recall, f_score)
+
+    # Hypothesis #2: a known gene was predicted correctly.
+    TP = len(genes_on_predict)
+    FP = genes_count - TP
+    FN = predict_count - len(set(genes_on_predict.values()))
+
+    precision = TP * 1. / (TP + FP)
+    recall = TP * 1. / (TP + FN)
+    f_score = 2 * precision * recall / (precision + recall)
+    H_gene_was_predicted = Hypothesis('gene was predicted', TP, FP, FN,
+        precision, recall, f_score)
+    return H_predict_is_gene, H_gene_was_predicted
+
+def compare_data(tools, genomes, data_dir):
+    for genome_name, tool in itertools.product(genomes, tools):
+        path_result = tool.execute(genome_name)
+        path_prefix = os.path.join(data_dir, genome_name) + '_'
+        genes_sam_path, predict_sam_path =\
+        cross_align(path_result, genomes[genome_name], path_prefix, tool.name)
+
+        hypothesises = evaluate_alignments(genes_sam_path, predict_sam_path)
+        for hypothesis in hypothesises:
+            yield {'hypothesis_name': hypothesis.name,
+                   'genome_name': genome_name,
+                   'tool_name': tool.name,
+                   'TP': hypothesis.TP,
+                   'FP': hypothesis.FP,
+                   'FN': hypothesis.FN,
+                   'precision': hypothesis.precision,
+                   'recall': hypothesis.recall,
+                   'f1_score': hypothesis.f1_score}
+
+if __name__ == '__main__':
+    config_path = sys.argv[1]
+    config = imp.load_source('config', config_path)
+
+    data_dir = config.DATA_DIR
+    genomes = config.GENOMES
+    tools = config.GENE_FINDER_TOOLS
+
+    csv_path = os.path.join(config.RESULT_DIR, 'summary.csv')
+    with open(csv_path, 'w') as f:
+        summary = csv.DictWriter(f, ['hypothesis_name', 'genome_name',
+                                     'tool_name', 'TP', 'FP', 'FN',
+                                     'precision', 'recall', 'f1_score'])
+        summary.writeheader()
+        summary.writerows(compare_data(tools, genomes, data_dir))
diff --git a/katyasosa/requirements.txt b/katyasosa/requirements.txt
@@ -0,0 +1,2 @@
+biopython==1.60
+bwa==0.5.9