Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/qiita-plugin-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
pip install .
pip --quiet install coveralls

export QP_KLP_CONFIG_FP=`pwd`/configuration.json
export QP_KLP_CONFIG_FP=`pwd`/qp_klp/tests/data/configuration.json

configure_qtp_job_output_folder --env-script "source /home/runner/.profile; conda activate klp" --ca-cert $QIITA_ROOTCA_CERT
configure_klp --env-script "source /home/runner/.profile; export QP_KLP_CONFIG_FP=$QP_KLP_CONFIG_FP; conda activate klp" --ca-cert $QIITA_ROOTCA_CERT
Expand Down Expand Up @@ -139,7 +139,7 @@ jobs:
conda activate klp
export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
export QP_KLP_CONFIG_FP=`pwd`/configuration.json
export QP_KLP_CONFIG_FP=`pwd`/qp_klp/tests/data/configuration.json
export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
nosetests --with-doctest --with-coverage -v --cover-package=qp_klp
- uses: codecov/codecov-action@v3
Expand All @@ -160,4 +160,4 @@ jobs:
- name: lint
run: |
pip install -q flake8
flake8 qp_klp
flake8 .
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ dependencies = [
"click>=3.3",
"future",
"pandas",
"lxml",
"qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip",
"qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip",
"sequence-processing-pipeline@https://github.com/biocore/mg-scripts/archive/master.zip"
Expand Down
138 changes: 137 additions & 1 deletion qp_klp/StandardMetagenomicWorkflow.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
from functools import partial
import sample_sheet
import pandas as pd
from os.path import basename, join
from os import symlink, makedirs
from datetime import datetime
from metapool import MetagenomicSampleSheetv90
from pathlib import Path

from .Protocol import Illumina
from sequence_processing_pipeline.Pipeline import Pipeline
from .Assays import Metagenomic
from .Assays import ASSAY_NAME_METAGENOMIC
from .FailedSamplesRecord import FailedSamplesRecord
from .Workflows import Workflow
from .Workflows import Workflow, WorkflowError


class StandardMetagenomicWorkflow(Workflow, Metagenomic, Illumina):
Expand Down Expand Up @@ -49,3 +58,130 @@ def __init__(self, **kwargs):
"type bool")

self.update = kwargs['update_qiita']


class PrepNuQC(StandardMetagenomicWorkflow):
    """Restart the metagenomic pipeline at the NuQC step for a Qiita prep.

    Given an existing preparation id (``parameters['prep_id']``), this
    workflow rebuilds the inputs the standard pipeline expects - a
    sample-sheet and a ConvertJob output folder - from the prep's raw
    artifact, then delegates to ``StandardMetagenomicWorkflow`` in
    restart mode so processing resumes after the (faked) conversion step.

    Raises
    ------
    WorkflowError
        If the prep's data type is not shotgun (Metagenomic /
        Metatranscriptomic), the prep has no numeric artifact id, the
        artifact has no html summary, or a run_prefix does not match
        exactly one forward-read file in the summary.
    """

    def __init__(self, **kwargs):
        qclient = kwargs['qclient']
        job_id = kwargs['job_id']
        parameters = kwargs['parameters']
        out_dir = kwargs['out_dir']
        config_fp = kwargs['config_fp']
        status_line = kwargs['status_line']

        out_path = partial(join, out_dir)
        self.final_results_path = out_path('final_results')
        makedirs(self.final_results_path, exist_ok=True)

        pid = parameters.pop('prep_id')

        # validate the prep: it must be shotgun data with a numeric
        # artifact id attached.
        prep_info = qclient.get(f'/qiita_db/prep_template/{pid}/')
        dt = prep_info['data_type']
        sid = prep_info['study']
        if dt not in {'Metagenomic', 'Metatranscriptomic'}:
            raise WorkflowError(f'Prep {pid} has a not valid data type: {dt}')
        aid = prep_info['artifact']
        if not str(aid).isnumeric():
            raise WorkflowError(f'Prep {pid} has a not valid artifact: {aid}')

        files, pt = qclient.artifact_and_preparation_files(aid)
        # the artifact's html summary provides the per-file read counts
        # needed to fake Demultiplex_Stats.csv below.
        html_summary = qclient.get_artifact_html_summary(aid)
        if html_summary is None:
            raise WorkflowError(f'Artifact {aid} does not have a summary, '
                                'please generate one.')
        df_summary = pd.read_html(html_summary)[0]
        pt.set_index('sample_name', inplace=True)

        project_name = f'qiita-{pid}-{aid}_{sid}'

        # build a minimal v90 metagenomic sample-sheet describing this
        # prep as a single-lane, single-project run.
        sheet = MetagenomicSampleSheetv90()
        sheet.Header['IEMFileVersion'] = '4'
        sheet.Header['Date'] = datetime.today().strftime('%m/%d/%y')
        sheet.Header['Workflow'] = 'GenerateFASTQ'
        sheet.Header['Application'] = 'FASTQ Only'
        sheet.Header['Assay'] = prep_info['data_type']
        sheet.Header['Description'] = f'prep_NuQCJob - {pid}'
        sheet.Header['Chemistry'] = 'Default'
        sheet.Header['SheetType'] = 'standard_metag'
        sheet.Header['SheetVersion'] = '90'
        sheet.Header['Investigator Name'] = 'Qiita'
        sheet.Header['Experiment Name'] = project_name

        sheet.Bioinformatics = pd.DataFrame(
            columns=['Sample_Project', 'ForwardAdapter', 'ReverseAdapter',
                     'library_construction_protocol',
                     'experiment_design_description',
                     'PolyGTrimming', 'HumanFiltering', 'QiitaID'],
            data=[[project_name, 'NA', 'NA', 'NA', 'NA',
                   'FALSE', 'TRUE', sid]])

        # only forward reads carry the counts we report; each prep row
        # is matched to its file by run_prefix.
        df_summary = df_summary[df_summary.file_type == 'raw_forward_seqs']
        data = []
        for k, vals in pt.iterrows():
            # sample names are prefixed with '<study_id>.'; drop it.
            k = k.split('.', 1)[-1]
            rp = vals['run_prefix']
            sample = {
                'Sample_Name': k,
                'Sample_ID': k.replace('.', '_'),
                'Sample_Plate': '',
                'well_id_384': '',
                'I7_Index_ID': '',
                'index': vals['index'],
                'I5_Index_ID': '',
                'index2': vals['index2'],
                'Sample_Project': project_name,
                'Well_description': '',
                'Sample_Well': '',
                'Lane': '1'}
            sheet.add_sample(sample_sheet.Sample(sample))
            _d = df_summary[
                df_summary.filename.str.startswith(rp)]
            if _d.shape[0] != 1:
                # bug fix: the exception was constructed but never
                # raised; a run_prefix must match exactly one file.
                raise WorkflowError(
                    f'The run_prefix {rp} from {k} has {_d.shape[0]} '
                    'matches with files')
            data.append({
                'Lane': '1', 'SampleID': rp, 'Sample_Project': project_name,
                'Index': vals['index'], '# Reads': _d.reads.values[0]})

        sheet.Contact = pd.DataFrame(
            columns=['Email', 'Sample_Project'],
            data=[['[email protected]', project_name]])

        new_sample_sheet = out_path('sample-sheet.csv')
        with open(new_sample_sheet, 'w') as f:
            sheet.write(f, 1)

        # now that we have a sample_sheet we can fake the
        # ConvertJob folder so we are ready for the restart
        convert_path = out_path('ConvertJob')
        project_folder = out_path('ConvertJob', project_name)
        makedirs(project_folder, exist_ok=True)
        # creating Demultiplex_Stats.csv
        reports_folder = out_path('ConvertJob', 'Reports')
        makedirs(reports_folder, exist_ok=True)
        pd.DataFrame(data).set_index('SampleID').to_csv(
            f'{reports_folder}/Demultiplex_Stats.csv')

        # link the raw files into the fake ConvertJob project folder,
        # renaming '.trimmed.fastq.gz' back to '.fastq.gz' so they look
        # like fresh conversion output.
        for fs in files.values():
            for f in fs:
                bn = basename(f['filepath']).replace(
                    '.trimmed.fastq.gz', '.fastq.gz')
                symlink(f['filepath'], f'{project_folder}/{bn}')

        # create job_completed file to skip this step
        Path(f'{convert_path}/job_completed').touch()

        kwargs = {'qclient': qclient,
                  'uif_path': new_sample_sheet,
                  'lane_number': "1",
                  'config_fp': config_fp,
                  # placeholder run id; presumably only its folder
                  # layout matters for a restart - TODO confirm
                  'run_identifier': '250225_LH00444_0301_B22N7T2LT4',
                  'output_dir': out_dir,
                  'job_id': job_id,
                  'status_update_callback': status_line.update_job_status,
                  # 'update_qiita' can be set to False to avoid updating
                  # the Qiita DB and copying files into the uploads dir;
                  # useful for testing. Here it stays True.
                  'update_qiita': True,
                  'is_restart': True}

        super().__init__(**kwargs)
15 changes: 14 additions & 1 deletion qp_klp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from qiita_client import QiitaPlugin, QiitaCommand
from .klp import sequence_processing_pipeline
from .klp import sequence_processing_pipeline, prep_NuQCJob


class QiitaPluginAdmin(QiitaPlugin):
Expand All @@ -17,6 +17,8 @@ class QiitaPluginAdmin(QiitaPlugin):

plugin = QiitaPluginAdmin('qp-klp', __version__, 'Knight Lab Processing')

# Adding SPP job

req_params = {
'run_identifier': ('string', ['']),
'sample_sheet': ('prep_template', ['']),
Expand All @@ -33,3 +35,14 @@ class QiitaPluginAdmin(QiitaPlugin):
dflt_param_set)

plugin.register_command(sequence_processing_pipeline_cmd)

# adding prep_NuQCJob job: expose NuQC re-processing of an existing
# prep as its own Qiita command.

# the only required input is the id of the prep to (re)process
req_params = {'prep_id': ('integer', [None])}
outputs = {'output': 'job-output-folder'}

prep_NuQCJob_cmd = QiitaCommand(
    'prep_NuQCJob', 'Apply NuQCJob to an existing Qiita prep raw data',
    prep_NuQCJob, req_params, dict(), outputs, dict())

plugin.register_command(prep_NuQCJob_cmd)
50 changes: 47 additions & 3 deletions qp_klp/klp.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
from functools import partial
from os import environ
from qiita_client import ArtifactInfo
from os import makedirs
from os import makedirs, environ
from os.path import join, exists
import traceback
from sequence_processing_pipeline.PipelineError import PipelineError
from metapool import load_sample_sheet
from .Workflows import WorkflowError
from .WorkflowFactory import WorkflowFactory

from .StandardMetagenomicWorkflow import PrepNuQC

CONFIG_FP = environ["QP_KLP_CONFIG_FP"]

Expand Down Expand Up @@ -152,9 +152,53 @@ def sequence_processing_pipeline(qclient, job_id, parameters, out_dir):
except (PipelineError, WorkflowError) as e:
# assume AttributeErrors are issues w/bad sample-sheets or
# mapping-files.
with open(f'{out_dir}/error-traceback.err', 'w') as f:
f.write(traceback.format_exc())
return False, None, str(e)

# return success, ainfo, and the last status message.
paths = [(f'{final_results_path}/', 'directory')]
return (True, [ArtifactInfo('output', 'job-output-folder', paths)],
status_line.msg)


def prep_NuQCJob(qclient, job_id, parameters, out_dir):
    """Run NuQCJob over an existing Qiita preparation's raw data.

    Builds a ``PrepNuQC`` workflow from the given parameters and
    executes the pipeline. On failure, the full traceback is written to
    ``<out_dir>/error-traceback.err`` for admins/devs, while only the
    short error message is returned for display in the GUI.

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values for this job; must include 'prep_id'
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    status_line = StatusUpdate(qclient, job_id)

    status_line.update_job_status("Setting up pipeline")

    kwargs = {'qclient': qclient, 'job_id': job_id,
              'parameters': parameters, 'out_dir': out_dir,
              'config_fp': CONFIG_FP, 'status_line': status_line}
    try:
        workflow = PrepNuQC(**kwargs)
        workflow.execute_pipeline()
    except (PipelineError, WorkflowError) as e:
        # keep the user-facing message short (str(e)); the complete
        # traceback is preserved on disk for debugging.
        with open(f'{out_dir}/error-traceback.err', 'w') as f:
            f.write(traceback.format_exc())
        return False, None, str(e)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The traceback is lost here:

In [4]: def foo():
   ...:     raise ValueError("stuff")
   ...: 

In [5]: try:
   ...:     foo()
   ...: except ValueError as e:
   ...:     print(str(e))
   ...: 
stuff

See this SO post. I think the intent is str(e) to instead be traceback.format_exc(), right? If so then import traceback is also needed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, the intent is completely the opposite, just raise/report the error in the GUI with minimal information for the user. I think that way, if it is something obvious and handled, like "sample sheet has wrong OverwriteCycles value", that's what's shown but if there is something less obvious, users will need to contact the admins/devs to investigate. FWIW, this has been a useful interaction for this specific plugin: wet/dry-lab interactions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do the devs know what line in the codebase is throwing the exception without the traceback?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! Each step is reported in the job's step in the db and as a new folder in the working directory, each folder has its own logs and details. In other words, via the jobs "step" & last folder written.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the developer will not know the exact line of code raising the exception. Won't that require the developer to then guess or perform a much more time expensive debugging process to determine what specifically failed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see what you are saying but in experience so far that's not the case. However, we might be missing something so I'll change and we can revert back if users get too annoyed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks! For users, the traceback could either be post processed, or an additional item in the tuple could be returned (the original str(e))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, decided to write a log in the outdir so devs can see the full traceback but keep it simple ("str(e)") for users.


status_line.update_job_status("SPP finished")
# return success, ainfo, and the last status message.
paths = [(f'{workflow.final_results_path}/', 'directory')]
return (True, [ArtifactInfo('output', 'job-output-folder', paths)], '')
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
this is a test
16 changes: 16 additions & 0 deletions qp_klp/tests/data/250225_LH00444_0301_B22N7T2LT4/RunInfo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
<Run Id="170523_M09999_0010_000000000-XXXXX" Number="10">
<Flowcell>000000000-XXXXX</Flowcell>
<Instrument>M09999</Instrument>
<Date>170523</Date>
<Reads>
<Read NumCycles="151" Number="1" IsIndexedRead="N" />
<Read NumCycles="8" Number="2" IsIndexedRead="Y" />
<Read NumCycles="8" Number="3" IsIndexedRead="Y" />
<Read NumCycles="151" Number="4" IsIndexedRead="N" />
</Reads>
<FlowcellLayout LaneCount="1" SurfaceCount="2" SwathCount="1" TileCount="14" />
</Run>
</RunInfo>
Binary file modified qp_klp/tests/data/dummy.fastq.gz
Binary file not shown.
36 changes: 36 additions & 0 deletions qp_klp/tests/data/summary.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th>filename</th>
<th>md5</th>
<th>file_type</th>
<th>reads</th>
</tr>
</thead>
<tbody>
<tr>
<td>S22205_S104_L001_R1_001.fastq.gz</td>
<td>9dcfb0c77674fdada176262963196db0</td>
<td>raw_forward_seqs</td>
<td>1000000</td>
</tr>
<tr>
<td>S22282_S102_L001_R1_001.fastq.gz</td>
<td>9dcfb0c77674fdada176262963196db0</td>
<td>raw_forward_seqs</td>
<td>1000000</td>
</tr>
<tr>
<td>S22205_S104_L001_R2_001.fastq.gz</td>
<td>9dcfb0c77674fdada176262963196db0</td>
<td>raw_reverse_seqs</td>
<td>1000000</td>
</tr>
<tr>
<td>S22282_S102_L001_R2_001.fastq.gz</td>
<td>9dcfb0c77674fdada176262963196db0</td>
<td>raw_reverse_seqs</td>
<td>1000000</td>
</tr>
</tbody>
</table>
Loading
Loading