Skip to content
This repository has been archived by the owner on Feb 12, 2018. It is now read-only.

Commit

Permalink
add test data
Browse files Browse the repository at this point in the history
  • Loading branch information
katyasosa committed Dec 10, 2012
1 parent cbe314d commit a776f19
Show file tree
Hide file tree
Showing 20 changed files with 1,062,274 additions and 140 deletions.
80,709 changes: 80,709 additions & 0 deletions katyasosa/Data/ECO2_seq.fasta

Large diffs are not rendered by default.

75,992 changes: 75,992 additions & 0 deletions katyasosa/Data/ECO3_seq.fasta

Large diffs are not rendered by default.

82,529 changes: 82,529 additions & 0 deletions katyasosa/Data/ECO6_seq.fasta

Large diffs are not rendered by default.

82,400 changes: 82,400 additions & 0 deletions katyasosa/Data/ECO7_seq.fasta

Large diffs are not rendered by default.

89,128 changes: 89,128 additions & 0 deletions katyasosa/Data/MRU5_seq.fasta

Large diffs are not rendered by default.

58,066 changes: 58,066 additions & 0 deletions katyasosa/Data/MRU6_seq.fasta

Large diffs are not rendered by default.

67,766 changes: 67,766 additions & 0 deletions katyasosa/Data/MRU7_seq.fasta

Large diffs are not rendered by default.

62,182 changes: 62,182 additions & 0 deletions katyasosa/Data/MRU9_seq.fasta

Large diffs are not rendered by default.

82,521 changes: 82,521 additions & 0 deletions katyasosa/Data/PHE4_seq.fasta

Large diffs are not rendered by default.

78,902 changes: 78,902 additions & 0 deletions katyasosa/Data/PHE6_seq.fasta

Large diffs are not rendered by default.

89,678 changes: 89,678 additions & 0 deletions katyasosa/Data/PHE7_seq.fasta

Large diffs are not rendered by default.

73,104 changes: 73,104 additions & 0 deletions katyasosa/Data/eco_genes.fasta

Large diffs are not rendered by default.

52,161 changes: 52,161 additions & 0 deletions katyasosa/Data/mru_genes.fasta

Large diffs are not rendered by default.

86,882 changes: 86,882 additions & 0 deletions katyasosa/Data/phe_genes.fasta

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions katyasosa/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
```
_____ ______ _____
| __ \ | ___|/ __ \
| | \/ ___ | |_ | / \/ ___ _ __ ___ _ __
| | __ / _ \| _| | | / _ \ | '_ ` _ \ | '_ \
| |_\ \| __/| | | \__/\| (_) || | | | | || |_) |
\____/ \___|\_| \____/ \___/ |_| |_| |_|| .__/
| |
|_|
-- all your genes belong to us!
```

GeFComp is a Python script for comparing performance metrics of different gene
finding tools. Given a number of [genome assemblies] [ga-wiki] in FASTA format,
GeFComp executes each available tool on each of the genomes and evaluates
*Type I* and *Type II* errors, also known as false positives and false negatives.
Results are then summarised in a CSV file (`summary.csv`).

Currently, GeFComp supports:

* [GeneMark] [gm]
* [GeneMark.hmm] [gm]
* [GeneMark-S] [gm]

[ga-wiki]: http://en.wikipedia.org/wiki/Genome_project#Genome_assembly
[ggplot2]: http://ggplot2.org
[gm]: http://exon.gatech.edu

## Installation

Python-only requirements can be installed via the usual `pip` boilerplate, but to do the
evaluation you also have to make sure that the following tools are available in `$PATH`:

* [BWA] [bwa], a popular short read aligner.

```bash
$ pip install -r requirements.txt
```

Or, if you prefer Debian and system-wide installation:

```bash
# aptitude install python-biopython bwa
```

[bwa]: http://bio-bwa.sourceforge.net

## Usage

```bash
$ gefcomp.py config.py
$ ls *.csv
summary.csv
```
91 changes: 0 additions & 91 deletions katyasosa/compare.py

This file was deleted.

22 changes: 22 additions & 0 deletions katyasosa/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from tools import *

# Directory where gefcomp.py writes 'summary.csv'; an empty string makes
# os.path.join() resolve to the current working directory.
RESULT_DIR = ''

# Directory handed to every gene finder tool and used as the location
# prefix for the SAM alignment files produced during evaluation.
DATA_DIR = 'Data'

# FASTA files with the known ("true") genes, one per organism group.
_ECO_GENES = 'Data/eco_genes.fasta'
_MRU_GENES = 'Data/mru_genes.fasta'
_PHE_GENES = 'Data/phe_genes.fasta'

# Maps a genome assembly name to the FASTA file with its known genes.
# NOTE(review): MRU7_seq is not listed here although Data/MRU7_seq.fasta
# appears to ship with the test data -- confirm whether that is intended.
GENOMES = {
    'ECO2_seq': _ECO_GENES,
    'ECO3_seq': _ECO_GENES,
    'ECO6_seq': _ECO_GENES,
    'ECO7_seq': _ECO_GENES,
    'MRU5_seq': _MRU_GENES,
    'MRU6_seq': _MRU_GENES,
    'MRU9_seq': _MRU_GENES,
    'PHE4_seq': _PHE_GENES,
    'PHE6_seq': _PHE_GENES,
    'PHE7_seq': _PHE_GENES,
}

# Second constructor argument shared by all GeneMark tool wrappers
# (presumably the GeneMark suite installation path -- verify in tools.py).
_GMSUITE_PATH = 'genemark_suite_linux_64/gmsuite'

# Gene finder tools to evaluate; each one is run on every genome above.
GENE_FINDER_TOOLS = [
    GeneMarkCommonGC(DATA_DIR, _GMSUITE_PATH),
    GeneMarkEveryGC(DATA_DIR, _GMSUITE_PATH),
    GeneMarkHmmCommomGC(DATA_DIR, _GMSUITE_PATH),
    GeneMarkHmmEveryGC(DATA_DIR, _GMSUITE_PATH),
    GeneMarkS(DATA_DIR, _GMSUITE_PATH),
]
126 changes: 126 additions & 0 deletions katyasosa/gefcomp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-

from collections import namedtuple
import csv
import importlib
import os
import subprocess
import itertools
import sys
import imp
from config import *



def cross_align(predict_path, genes_path, out_path_prefix, tool_name):
    """Align predicted genes on known genes using BWA, and vice versa.

    Both FASTA files are first indexed with ``bwa index -a bwtsw``; then
    each file is aligned against the other with ``bwa bwasw``. All BWA
    diagnostics are discarded.

    :param predict_path: path to a FASTA file with genes, predicted
                         by the tool, being evaluated.
    :param genes_path: path to a FASTA file with **true** genes.
    :param out_path_prefix: a prefix for resulting alignment-files in
                            SAM format.
    :param tool_name: human-readable name of the tool, being evaluated.
    :return: a tuple ``(genes_sam_path, predict_sam_path)``, where the
             first item is a path to a SAM file with known genes,
             aligned on predicted genes
             (``<prefix><tool_name>_on.sam``), and the second is a path
             to a SAM file with predicted genes, aligned on known genes
             (``<prefix>on_<tool_name>.sam``).
    """
    genes_sam_path = out_path_prefix + tool_name + '_on.sam'
    predict_sam_path = out_path_prefix + 'on_' + tool_name + '.sam'

    # The context manager guarantees the devnull handle is closed even
    # if launching BWA fails (e.g. the binary is not in $PATH).
    with open(os.devnull, 'w') as devnull:
        # NOTE: exit codes are deliberately not checked (best effort);
        # a failed run simply leaves an empty or partial SAM file.
        for fasta_path in (predict_path, genes_path):
            subprocess.call(['bwa', 'index', '-a', 'bwtsw', fasta_path],
                            stdout=devnull, stderr=devnull)

        # `bwa bwasw <reference> <query>`: known genes are the queries.
        with open(genes_sam_path, 'w') as fout:
            subprocess.call(['bwa', 'bwasw', predict_path, genes_path],
                            stdout=fout, stderr=devnull)

        # ... and here predicted genes are the queries.
        with open(predict_sam_path, 'w') as fout:
            subprocess.call(['bwa', 'bwasw', genes_path, predict_path],
                            stdout=fout, stderr=devnull)

    return genes_sam_path, predict_sam_path


# Evaluation result for a single hypothesis: raw counts plus derived metrics.
Hypothesis = namedtuple('Hypothesis', ['name', 'TP', 'FP', 'FN',
                                       'precision', 'recall', 'f1_score'])

# MAPQ = -10 * log10(Pr{mapping position is wrong}), so a threshold of 200
# corresponds to an error probability of 10 ** (-20).
_MIN_MAPQ = 200


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 if undefined."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def _unique_alignments(sam_path, min_mapq=_MIN_MAPQ):
    """Collect confident alignments from a SAM file.

    :return: a tuple of a ``{query name: reference name}`` dict holding
             the records with MAPQ of at least `min_mapq`, and the
             number of distinct query names seen in the file.
    """
    confident, seen = {}, set()
    with open(sam_path) as sam_file:
        for line in sam_file:
            # Skip header lines and blank lines (e.g. a trailing newline).
            if line.startswith('@') or not line.strip():
                continue

            chunks = line.split()
            qname, rname, mapq = chunks[0], chunks[2], int(chunks[4])
            seen.add(qname)
            if mapq >= min_mapq:
                confident[qname] = rname

    return confident, len(seen)


def _make_hypothesis(name, tp, fp, fn):
    """Build a `Hypothesis`, deriving precision, recall and F1-score."""
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)
    return Hypothesis(name, tp, fp, fn, precision, recall, f1_score)


def evaluate_alignments(genes_sam_path, predict_sam_path):
    """Evaluates precision-recall and F1-score metrics for aligned genes.

    Metrics whose denominator is zero (for example when no alignment
    reaches the MAPQ threshold) are reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    :param genes_sam_path: path to a SAM file with known genes, aligned
                           on predicted genes.
    :param predict_sam_path: path to a SAM file with predicted genes,
                             aligned on known genes.
    :return: a tuple of two `Hypothesis` records: 'predict is gene' and
             'gene was predicted'.
    """
    predict_on_genes, predict_count = _unique_alignments(predict_sam_path)
    genes_on_predict, genes_count = _unique_alignments(genes_sam_path)

    # Hypothesis #1: predicted gene is actually a known gene.
    TP = len(predict_on_genes)
    H_predict_is_gene = _make_hypothesis(
        'predict is gene',
        TP,
        predict_count - TP,
        genes_count - len(set(predict_on_genes.values())))

    # Hypothesis #2: a known gene was predicted correctly.
    TP = len(genes_on_predict)
    H_gene_was_predicted = _make_hypothesis(
        'gene was predicted',
        TP,
        genes_count - TP,
        predict_count - len(set(genes_on_predict.values())))

    return H_predict_is_gene, H_gene_was_predicted

def compare_data(tools, genomes, data_dir):
    """Run every tool on every genome and yield summary rows.

    For each (genome, tool) pair the tool is executed, its predictions
    are cross-aligned with the known genes of that genome, and the
    alignments are evaluated.

    :param tools: gene finder tools, each providing ``execute(genome_name)``
                  and a ``name`` attribute.
    :param genomes: mapping from a genome name to the path of a FASTA
                    file with its known genes.
    :param data_dir: directory used as a location for SAM files.
    :return: a generator of dicts, one per evaluated hypothesis, keyed
             by the columns of the summary table.
    """
    metric_fields = ('TP', 'FP', 'FN', 'precision', 'recall', 'f1_score')

    for genome_name, tool in itertools.product(genomes, tools):
        predicted_fasta = tool.execute(genome_name)
        sam_prefix = os.path.join(data_dir, genome_name) + '_'
        sam_paths = cross_align(predicted_fasta, genomes[genome_name],
                                sam_prefix, tool.name)

        # cross_align returns (genes_sam_path, predict_sam_path), which is
        # exactly the argument order evaluate_alignments expects.
        for outcome in evaluate_alignments(*sam_paths):
            row = {'hypothesis_name': outcome.name,
                   'genome_name': genome_name,
                   'tool_name': tool.name}
            for field in metric_fields:
                row[field] = getattr(outcome, field)
            yield row

if __name__ == '__main__':
    # The only required argument is a path to a Python config file,
    # defining DATA_DIR, GENOMES, GENE_FINDER_TOOLS and RESULT_DIR.
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit('usage: gefcomp.py config.py')

    config_path = sys.argv[1]
    config = imp.load_source('config', config_path)

    data_dir = config.DATA_DIR
    genomes = config.GENOMES
    tools = config.GENE_FINDER_TOOLS

    # One row per (genome, tool, hypothesis); rows are written as the
    # generator produces them.
    csv_path = os.path.join(config.RESULT_DIR, 'summary.csv')
    with open(csv_path, 'w') as f:
        summary = csv.DictWriter(f, ['hypothesis_name', 'genome_name',
                                     'tool_name', 'TP', 'FP', 'FN',
                                     'precision', 'recall', 'f1_score'])
        summary.writeheader()
        summary.writerows(compare_data(tools, genomes, data_dir))
2 changes: 2 additions & 0 deletions katyasosa/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
biopython==1.60
bwa==0.5.9
Loading

0 comments on commit a776f19

Please sign in to comment.