More unit testing and installation instructions
Function `hunter` in `wolfpack.py` is now tested. This helped fix a few
bugs that might have been critical on machines with more than 100 CPUs.
The test does not run on OS X because `seq -w 000 003`, for
example, returns `0 1 2 3` there rather than `000 001 002 003`.

Installation instructions have been updated, and we now try to install R
on Travis.
ozagord committed May 9, 2016
1 parent 2cdff0f commit d531aab
Showing 11 changed files with 114 additions and 8,129 deletions.
5 changes: 4 additions & 1 deletion .coveragerc
@@ -8,7 +8,10 @@ omit =
/usr/local/lib*

include =
virmet/*
virmet/covplot.py
virmet/common.py
virmet/wolfpack.py


source =
virmet
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@ viral_dates.txt
*pyc
.coverage
coverage.xml
old_data/
1 change: 1 addition & 0 deletions .travis.yml
@@ -15,6 +15,7 @@ addons:
- bwa
- tabix
- libwww-perl
- r-base

cache:
directories:
15 changes: 9 additions & 6 deletions README.md
@@ -3,7 +3,8 @@ VirMet

[![Build Status](https://travis-ci.org/ozagordi/VirMet.svg?branch=master)](https://travis-ci.org/ozagordi/VirMet)

[![codecov.io](https://codecov.io/github/ozagordi/VirMet/coverage.svg?branch=master)](https://codecov.io/github/ozagordi/VirMet?branch=master)
Watch out: only a few files are counted in coverage statistics.

Full documentation on [Read the Docs](http://virmet.rtfd.org/en/latest/).

@@ -24,10 +24,7 @@ A short help is obtained with `virmet subcommand -h`.

### Installation
VirMet relies on a number of third-party tools used to access databases, trim,
convert, filter and map reads. One can refer to the files [`.travis.yml`](./.travis.yml)
and [`install-dependencies.sh`](./install-dependencies.sh) for details or
further down in this README.
The dependencies are:
convert, filter and map reads. The dependencies are:

- bwa
- samtools 1.3
Expand All @@ -38,11 +36,16 @@ The dependencies are:
- blast+ 2.3.0
- python (3.x, it's 2016...) with pandas and Biopython
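
A quick sanity check that these tools are on the `PATH` could look like this
(a sketch; `prinseq-lite.pl`, `blastn` and `efetch` are assumed to be the
executables provided by prinseq-lite, blast+ and the edirect tools):

    for tool in bwa samtools tabix seqtk prinseq-lite.pl blastn efetch; do
        command -v "$tool" >/dev/null || echo "missing: $tool"
    done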

Once downloaded, VirMet can be installed with the classic `python setup.py install`.
Alternatively, the user can call `bin/virmet` in the package directory directly,
for example by adding the directory `virmet_package_directory/bin` to the `PATH`
and calling `virmet` on the command line.
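
As a concrete sketch (assuming the repository was cloned to `~/VirMet`, a
hypothetical path):

    cd ~/VirMet
    python setup.py install
    # or, without installing, put the bundled launcher on the PATH:
    export PATH=~/VirMet/bin:$PATH
    virmet fetch -h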


### Preparation: fetching databases

After installation, one needs to populate the database directory. By default
this will be `\data\virmet_databases` and will occupy about 60 GB. In order to
this will be `/data/virmet_databases` and will occupy about 60 GB. In order to
populate this, use the subcommand `fetch`, for example as follows

virmet fetch --viral n # this downloads viral sequences, nucleotide only
4,000 changes: 0 additions & 4,000 deletions data/read1.fastq

This file was deleted.

4,000 changes: 0 additions & 4,000 deletions data/read2.fastq

This file was deleted.

27 changes: 16 additions & 11 deletions docs/installation.md
@@ -1,13 +1,12 @@
### Installation
## Installation

The classic `python setup.py install` should work, provided the user has the
necessary permission.
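
For instance (a sketch; `--user` is the standard setuptools fallback when
system-wide permissions are not available):

    sudo python setup.py install     # system-wide
    python setup.py install --user   # per-user, no root needed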

VirMet relies on a number of third-party tools used to access databases, trim,
convert, filter and map reads. One can refer to the files
[`.travis.yml`](https://github.com/ozagordi/VirMet/blob/master/.travis.yml)
and
[`install-dependencies.sh`](https://github.com/ozagordi/VirMet/blob/master/install-dependencies.sh)
for details.
convert, filter and map reads.

The dependencies are:
The tools VirMet depends on are:

- bwa
- samtools 1.3
@@ -16,18 +15,22 @@ The dependencies are:
- prinseq-lite
- edirect command line tools
- blast+ 2.3.0

Moreover, the following languages are used

- python (3.x, it's 2016...) with pandas and Biopython
- R (for `covplot` only)


### Commands for Ubuntu
### Commands to install dependencies on Ubuntu
On Ubuntu 14.04 the following commands should provide a system-wide
installation, although on Travis we use a slightly different strategy.
installation of the tools mentioned above plus R, although on Travis a slightly
different strategy is used.

# system wide configuration available as Ubuntu packages
sudo apt-get update -qq
sudo apt-get install -qq -y build-essential ftp golang unzip \
bwa tabix seqtk libwww-perl
bwa tabix seqtk libwww-perl r-base

# NCBI edirect tools
cd /tmp
@@ -75,4 +78,6 @@ any 3.x should work), together with [pandas](http://pandas.pydata.org) and
[Biopython](http://biopython.org/wiki/Main_Page). Go to the respective
installation pages and choose your favourite method. For continuous
integration on Travis we used conda (see [`.travis.yml`](./.travis.yml)).
Finally, R is needed to run `covplot`.
Finally, R packages ggplot2 and ggthemes are needed to run `covplot`.
To install these, type `install.packages(c('ggplot2', 'ggthemes'))` inside the
R console.
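
Non-interactive equivalents of the Python and R package steps could look like
this (a sketch; package names as above, while the conda channel and the CRAN
mirror are assumptions):

    conda install pandas biopython
    Rscript -e "install.packages(c('ggplot2', 'ggthemes'), repos='https://cloud.r-project.org')"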
10 changes: 3 additions & 7 deletions setup.py
@@ -9,14 +9,11 @@
'console_scripts': [
'virmet = virmet.__main__:main'
]
# 'gui_scripts': [
# 'baz = my_package_gui:start_func',
# ]
},

# Project uses reStructuredText, so ensure that the docutils get
# installed or upgraded on the target machine

package_data = {
# If any package contains *.txt or *.rst files, include them:
'': ['*.txt', '*.rst'],
Expand All @@ -35,5 +32,4 @@
Set of tools for the analysis of sequencing data to identify and characterize
the viral fraction in metagenomic samples, especially in the clinical setting.
'''
# could also include long_description, download_url, classifiers, etc.
)
77 changes: 45 additions & 32 deletions tests/test_common.py
@@ -26,38 +26,39 @@ class TestFTPDown(unittest.TestCase):

def setUp(self):
self.tmpdir = tempfile.gettempdir()
self.remote_1 = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.primary_assembly.annotation.gtf.gz'
# big file, 39 MB
# self.remote_1 = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.primary_assembly.annotation.gtf.gz'
# small file, 335 KB
self.remote_1 = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24//gencode.v24.2wayconspseudos.gtf.gz'
# again small file
self.remote_2 = 'ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/_README.TXT'
# self.fasta = open(os.path.join(self.tmpdir, 'tmp.fasta'), 'w')
# self.fasta.write(">gi|1234|xyz\nAGCTAGC\n>gi|ABCD\nATCG\n")
# self.fasta.close()

# def test_nodecompress(self):
# out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt.gz')
# ftp_down(self.remote_1, out_file)
# ftl = run_child('file', out_file)
# os.remove(out_file)
# ft = parse_file_line(ftl)
# self.assertEqual(ft, 'gzipped')
#
# def test_decompress(self):
# out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt')
# ftp_down(self.remote_1, out_file)
# ftl = run_child('file', out_file)
# os.remove(out_file)
# ft = parse_file_line(ftl)
# self.assertEqual(ft, 'ascii')
#
# def test_append(self):
# out_file = os.path.join(tempfile.gettempdir(), 'README.TXT')
# ftp_down(self.remote_2, out_file)
# with open(out_file) as f:
# n_lines_1 = sum(1 for _ in f)
# ftp_down(self.remote_2, out_file)
# with open(out_file) as f:
# n_lines_2 = sum(1 for _ in f)
# self.assertEqual(n_lines_2, 2 * n_lines_1)
# os.remove(out_file)

def test_nodecompress(self):
out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt.gz')
ftp_down(self.remote_1, out_file)
ftl = run_child('file', out_file)
os.remove(out_file)
ft = parse_file_line(ftl)
self.assertEqual(ft, 'gzipped')

def test_decompress(self):
out_file = os.path.join(tempfile.gettempdir(), 'gtf.txt')
ftp_down(self.remote_1, out_file)
ftl = run_child('file', out_file)
os.remove(out_file)
ft = parse_file_line(ftl)
self.assertEqual(ft, 'ascii')

def test_append(self):
out_file = os.path.join(tempfile.gettempdir(), 'README.TXT')
ftp_down(self.remote_2, out_file)
with open(out_file) as f:
n_lines_1 = sum(1 for _ in f)
ftp_down(self.remote_2, out_file)
with open(out_file) as f:
n_lines_2 = sum(1 for _ in f)
self.assertEqual(n_lines_2, 2 * n_lines_1)
os.remove(out_file)

class TestMisc(unittest.TestCase):

@@ -73,7 +74,7 @@ def test_gids(self):
self.assertTrue('1234' in ids)
self.assertTrue('ABCD' in ids)

def test_bact_fung_query(self):
def test_bact_query(self):
all_urls = bact_fung_query(query_type='bacteria', download=True)
bac_lines = len(all_urls)
self.assertGreater(bac_lines, 100)
@@ -85,6 +86,18 @@ def test_bact_fung_query(self):
self.assertEqual(bac_lines, bac_lines_again)
os.remove('xyz.tsv')

def test_fung_query(self):
all_urls = bact_fung_query(query_type='fungi', download=True)
fung_lines = len(all_urls)
self.assertGreater(fung_lines, 100)
self.assertTrue(os.path.exists('fungi_refseq_info.tsv'))
os.rename('fungi_refseq_info.tsv', 'xyz.tsv')
urls_again = bact_fung_query(query_type='fungi', download=False,
info_file='xyz.tsv')
fung_lines_again = len(urls_again)
self.assertEqual(fung_lines, fung_lines_again)
os.remove('xyz.tsv')

def test_multi_download(self):
tmpf = 'tmp_multi_down.txt'
# download same file twice
48 changes: 2 additions & 46 deletions virmet/common.py
@@ -10,7 +10,7 @@
import pandas as pd

DB_DIR = '/data/virmet_databases/'
prinseq_exe = '/usr/local/bin/prinseq-lite.pl'
#prinseq_exe = '/usr/local/bin/prinseq-lite.pl'


def run_child(exe_name, arg_string, exe='/bin/sh'):
@@ -119,7 +119,7 @@ def bact_fung_query(query_type=None, download=True, info_file=None):
with urllib.request.urlopen(url) as f:
print(f.read().decode('utf-8'), file=bh)
bh.close()
querinfo = pd.read_csv(info_file, sep='\t', header=0)
querinfo = pd.read_csv(info_file, sep='\t', header=0, skiprows=1)
querinfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
if query_type == 'bacteria':
gb = querinfo[(querinfo.assembly_level == 'Complete Genome') &
@@ -137,50 +137,6 @@
return all_urls


# def bacterial_query(download=True, info_file='bacteria_refseq_info.tsv'):
# ''' download bacterial genomes in refseq as explained in FAQ 12 here
# http://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#asmsumfiles
# '''
# if download:
# url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
# bh = open(info_file, 'w')
# with urllib.request.urlopen(url) as f:
# print(f.read().decode('utf-8'), file=bh)
# bh.close()
# bactinfo = pd.read_csv(info_file, sep='\t', header=0)
# bactinfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
# gb = bactinfo[(bactinfo.assembly_level == 'Complete Genome') & (bactinfo.version_status == 'latest')]
# gb.set_index('assembly_accession')
# x = gb['ftp_path'].apply(lambda col: col + '/' + col.split('/')[5] + '_genomic.fna.gz')
# gb.loc[:, 'ftp_genome_path'] = pd.Series(x, index=gb.index)
# all_urls = list(gb['ftp_genome_path'])
# assert len(all_urls) == len(gb)
# return all_urls
#
#
# def fungal_query(download=True, info_file='fungi_refseq_info.tsv'):
# ''' download fungal genomes in refseq in a similar way to bacterial
# '''
# if download:
# url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/fungi/assembly_summary.txt'
# bh = open(info_file, 'w')
# with urllib.request.urlopen(url) as f:
# print(f.read().decode('utf-8'), file=bh)
# bh.close()
# funginfo = pd.read_csv(info_file, sep='\t', header=0)
# funginfo.rename(columns={'# assembly_accession': 'assembly_accession'}, inplace=True)
# gb = funginfo[(funginfo.refseq_category != 'na') &
# (funginfo.version_status == 'latest') &
# (funginfo.genome_rep == 'Full') &
# (funginfo.release_type == 'Major')]
# gb.set_index('assembly_accession')
# x = gb['ftp_path'].apply(lambda col: col + '/' + col.split('/')[5] + '_genomic.fna.gz')
# gb.loc[:, 'ftp_genome_path'] = pd.Series(x, index=gb.index)
# all_urls = list(gb['ftp_genome_path'])
# assert len(all_urls) == len(gb)
# return all_urls


def download_genomes(all_urls, prefix, n_files=1):
''' download genomes given a list of urls, randomly assigning them
to one of several (n_files) fasta files
