merging mg-scripts #105

Closed
3 changes: 3 additions & 0 deletions .coveragerc
@@ -0,0 +1,3 @@
[run]
branch = True
relative_files = True
66 changes: 66 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,66 @@
name: Sequence Processing Pipeline CI

# Controls when the action will run.
on:
  # Triggers the workflow on pull request and push events, only on the master
  # branch
  pull_request:
    branches: [ master ]
  push:
    branches: [ master ]

jobs:
  # name of job
  build:
    # The type of runner that the job will run on (available options are Windows/macOS/Linux)
    runs-on: ubuntu-latest
    # we can add more Python versions in the future
    strategy:
      matrix:
        python-version: ['3.9']

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # first grab the branch from GitHub
      - uses: actions/checkout@v2
        with:
          persist-credentials: false
          fetch-depth: 0

      - uses: conda-incubator/setup-miniconda@v2
        with:
          activate-environment: tester
          python-version: ${{ matrix.python-version }}
          conda-channels: anaconda, conda-forge, bioconda
          auto-update-conda: true

      - name: Main install
        shell: bash -l {0}
        run: |
          conda install bioconda::fastq-pair
          pip install -e ".[all]"

      - name: Python linter
        shell: bash -l {0}
        run: flake8 sequence_processing_pipeline/ setup.py

      - name: Run tests and measure coverage
        shell: bash -l {0}
        run: |
          nosetests --with-coverage --cover-inclusive --cover-package sequence_processing_pipeline

      - name: Coveralls
        uses: AndreMiras/coveralls-python-action@develop
        with:
          parallel: true
          flag-name: Unit Test

  coveralls_finish:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - name: Coveralls Finished
        uses: AndreMiras/coveralls-python-action@develop
        with:
          github-token: ${{ secrets.github_token }}
          parallel-finished: true
3 changes: 3 additions & 0 deletions .gitignore
@@ -130,3 +130,6 @@ dmypy.json

# ignore local changes
qp_klp/tests/data/sample-sheets/*/*/*.csv

# test output
sequence_processing_pipeline/tests/data/output_dir/
5 changes: 4 additions & 1 deletion LICENSE
@@ -1,6 +1,9 @@
BSD 3-Clause License

-Copyright (c) 2014--, Qiita Spots
+Copyright (c) 2014, Qiita Spots
+Copyright (c) 2020, biocore
+Copyright (c) 2021, jdereus

All rights reserved.

Redistribution and use in source and binary forms, with or without
70 changes: 70 additions & 0 deletions README.md
@@ -0,0 +1,70 @@
# Sequence Processing Pipeline

A Jupyter notebook to assist the wet lab shotgun pipeline.
A packaged, Python-based implementation of the Knight Lab's sequence processing pipeline.

## Installation

To install this package, first clone this repository from GitHub:

```bash
git clone https://github.com/biocore/mg-scripts.git
```

Create a Python3 Conda environment in which to run the notebook:

```bash
conda create --yes -n spp 'python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair
```

Activate the Conda environment:

```bash
source activate spp
```

Change directory to the cloned repository folder and install:

```bash
cd mg-scripts
pip install -e .
```

This will also install https://github.com/biocore/metagenomics_pooling_notebook.git automatically, as it is a dependency of mg-scripts and the sequence_processing_pipeline package.

## Running Unittests

Change directory to the cloned repository folder and run the tests:

```bash
cd mg-scripts
nosetests --with-coverage --cover-inclusive --cover-package sequence_processing_pipeline
```

## Getting Started

Review Pipeline.py and main.py to learn how to import and access package functionality:

```bash
cd mg-scripts/sequence_processing_pipeline
more Pipeline.py
more main.py
```
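
As a minimal sketch of the import path (assuming only that Pipeline.py exposes a `Pipeline` class, as the file listing above suggests; the exact constructor arguments are documented in Pipeline.py and exercised in main.py):

```python
# Minimal sketch: confirm the package imports and inspect the Pipeline class.
# Constructor arguments (configuration file, run directory, output path, ...)
# are described in Pipeline.py and demonstrated in main.py.
from sequence_processing_pipeline.Pipeline import Pipeline

help(Pipeline)
```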

Adjust configuration settings as needed:

```bash
cd mg-scripts/sequence_processing_pipeline
vi configuration.json
```

Please note that for QCJob the setting 'minimap2_databases' is expected to be a list of paths to individual .mmi files.
For NuQCJob, 'minimap2_databases' is expected to be the path to a directory containing two subdirectories, 'metagenomic'
and 'metatranscriptomic', each of which contains or symlinks to the appropriate .mmi files for that Assay type.
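
For illustration only, a hypothetical configuration.json fragment might look like the following; the job section names and paths are placeholders, so consult the example configuration files for the exact structure:

```json
{
  "QCJob": {
    "minimap2_databases": ["/dbs/mmi/human.mmi", "/dbs/mmi/phix.mmi"]
  },
  "NuQCJob": {
    "minimap2_databases": "/dbs/mmi"
  }
}
```

In this sketch, /dbs/mmi would contain the 'metagenomic' and 'metatranscriptomic' subdirectories, each holding or symlinking to the .mmi files for its Assay type.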

Additional TellSeq-related notes: directories such as 'spades-cloudspades-0.1' and 'tellread-release-novaseqX' must be
placed in a location available to SPP, and their paths must be declared in the configuration files (see the examples,
and the hypothetical fragment below, for details). Additional scripts found in sequence_processing_pipeline/contrib
were contributed by Daniel and Omar and can be located and configured in the same way.
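
As an illustration only (the key names below are hypothetical; the example configuration files shipped with SPP show the real ones), the TellSeq-related entries might resemble:

```json
{
  "TellReadJob": {
    "tellread_path": "/opt/tellread-release-novaseqX",
    "spades_cloudspades_path": "/opt/spades-cloudspades-0.1"
  }
}
```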

158 changes: 158 additions & 0 deletions sequence_processing_pipeline/Commands.py
@@ -0,0 +1,158 @@
import glob
import gzip
import os
from sequence_processing_pipeline.util import (iter_paired_files,
                                                determine_orientation)


def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
                            batch_prefix):
    '''Partitions input fastqs to coarse bins

    :param data_location_path: Path to the ConvertJob directory.
    :param max_file_list_size_in_gb: Upper threshold for file-size.
    :param batch_prefix: Path + file-name prefix for output-files.
    :return: The number of output-files created, size of largest bin.
    '''
    # to prevent issues w/filenames like the ones below from being mistaken
    # for R1 or R2 files, use determine_orientation().
    # LS_8_22_2014_R2_SRE_S2_L007_I1_001.fastq.gz
    # LS_8_22_2014_R1_SRE_S3_L007_I1_001.fastq.gz

    # since the names of all fastq files are being scanned for orientation,
    # collect all of them instead of mistakenly pre-filtering some files.
    fastq_paths = glob.glob(data_location_path + '/*/*.fastq.gz')
    fastq_paths = [x for x in fastq_paths
                   if determine_orientation(x) in ['R1', 'R2']]

    # convert from GB and halve as we sum R1
    max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2)

    split_offset = 0

    # ensure we are max-sized to start.
    current_size = max_size * 10
    fp = None

    bucket_size = 0
    max_bucket_size = 0

    for a, b in iter_paired_files(fastq_paths):
        r1_size = os.stat(a).st_size
        r2_size = os.stat(b).st_size

        output_base = os.path.dirname(a).split('/')[-1]
        if current_size + r1_size > max_size:
            # bucket is full.
            if bucket_size > max_bucket_size:
                max_bucket_size = bucket_size

            # reset bucket_size.
            bucket_size = r1_size + r2_size

            if fp is not None:
                fp.close()

            split_offset += 1
            current_size = r1_size
            fp = open(batch_prefix + '-%d' % split_offset, 'w')
        else:
            # add to bucket_size
            bucket_size += r1_size + r2_size
            current_size += r1_size

        fp.write("%s\t%s\t%s\n" % (a, b, output_base))

    if fp is not None:
        fp.close()

    if split_offset == 0:
        raise ValueError("No splits made")

    return split_offset, max_bucket_size


def demux_cmd(id_map_fp, fp_fp, out_d, task, maxtask):
    with open(id_map_fp, 'r') as f:
        id_map = f.readlines()
        id_map = [line.strip().split('\t') for line in id_map]

    # fp needs to be an open file handle.
    # ensure task and maxtask are proper ints when coming from cmd-line.
    with open(fp_fp, 'r') as fp:
        demux(id_map, fp, out_d, int(task), int(maxtask))


def demux(id_map, fp, out_d, task, maxtask):
    """Split input file data based on the provided map"""
    delimiter = '::MUX::'
    mode = 'wt'
    ext = '.fastq.gz'
    sep = '/'
    rec = '@'

    openfps = {}

    for offset, (idx, r1, r2, outbase) in enumerate(id_map):
        if offset % maxtask == task:
            idx = rec + idx

            # setup output locations
            outdir = out_d + sep + outbase
            fullname_r1 = outdir + sep + r1 + ext
            fullname_r2 = outdir + sep + r2 + ext

            os.makedirs(outdir, exist_ok=True)
            current_fp_r1 = gzip.open(fullname_r1, mode)
            current_fp_r2 = gzip.open(fullname_r2, mode)
            current_fp = {'1': current_fp_r1, '2': current_fp_r2}
            openfps[idx] = current_fp

    # setup a parser: four iterators over the same open handle, so each zip()
    # step below consumes one full FASTQ record (id, sequence, '+', quality).
    seq_id = iter(fp)
    seq = iter(fp)
    dumb = iter(fp)
    qual = iter(fp)

    for i, s, d, q in zip(seq_id, seq, dumb, qual):
        # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1'
        # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1 BX:Z:TATGACACATGCGGCCCT' # noqa
        # '@baz/1

        # NB: from 6d794a37-12cd-4f8e-95d6-72a4b8a1ec1c's only-adapter-filtered results: # noqa
        # @A00953:244:HYHYWDSXY:3:1101:14082:3740 1:N:0:CCGTAAGA+TCTAACGC

        fname_encoded, sid = i.split(delimiter, 1)

        if fname_encoded not in openfps:
            continue

        current_fp = openfps[fname_encoded]

        # remove '\n' from sid and split on all whitespace.
        tmp = sid.strip().split()

        if len(tmp) == 1:
            # sequence id line contains no optional metadata.
            # don't change sid.
            # -1 is \n
            orientation = sid[-2]
            sid = rec + sid
        elif len(tmp) == 2:
            sid = tmp[0]
            metadata = tmp[1]
            # no '\n'
            orientation = sid[-1]
            # hexdump confirms separator is ' ', not '\t'
            sid = rec + sid + ' ' + metadata + '\n'
        else:
            raise ValueError(f"'{sid}' is not a recognized form")

        current_fp[orientation].write(sid)
        current_fp[orientation].write(s)
        current_fp[orientation].write(d)
        current_fp[orientation].write(q)

    for d in openfps.values():
        for f in d.values():
            f.close()