From f1bee20d95573909efba65453d5eebadb0169ab2 Mon Sep 17 00:00:00 2001 From: Jorrit Boekel Date: Fri, 31 Jan 2025 20:56:04 +0100 Subject: [PATCH] Remove channels feature, to remove channels which are empty instead of --keepnapsms --- main.nf | 32 +++++++++++++++++++++++++++----- modules.nf | 4 ++++ nextflow.config | 1 + nextflow_schema.json | 3 +++ tests/tmt16_fast.sh | 1 + 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 1634666..377e5a0 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ include { paramsSummaryMap } from 'plugin/nf-schema' -include { msgf_info_map; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' +include { msgf_info_map; get_complement_field_nr; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' include { MSGFPERCO } from './workflows/msgf_perco.nf' include { SAGEPERCO } from './workflows/sage_perco.nf' include { PTMANALYSIS } from './workflows/ptms.nf' @@ -340,7 +340,7 @@ process splitPSMs { container params.__containers[tag][workflow.containerEngine] input: - tuple val(td), path('psms'), val(setnames) + tuple val(td), path('psms'), val(setnames), val(remove_channels) output: tuple val(td), path({listify(setnames).collect { "${it}.tsv" }}) optional true @@ -348,10 +348,19 @@ process splitPSMs { script: """ msstitch split -i psms --splitcol bioset + ${td == 'target' ? 
+ remove_channels.collect { setch -> setch[1].collect { ch -> "colnum=${get_complement_field_nr("${setch[0]}.tsv", ch)} && \ cut -f \$colnum ${setch[0]}.tsv > tmprm && mv tmprm ${setch[0]}.tsv" }.join(' && ') }.join(' && ') : ''} """ } + process splitTotalProteomePSMs { tag 'msstitch' @@ -454,13 +463,20 @@ process sampleTableCheckClean { container params.__containers[tag][workflow.containerEngine] input: - tuple path('sampletable'), val(do_deqms) + tuple path('sampletable'), val(do_deqms), val(remove_channels) output: tuple path('clean_sampletable'), path('sampletable_no_special_chars') script: """ + # Remove empty channels + ${remove_channels.collect { + setch -> setch[1].collect { + ch -> "grep -v '^${ch}\t${setch[0]}\t' sampletable > tmpst && mv tmpst sampletable" + }.join(' && ') + }.join(' && ') + } # First add NO__GROUP marker for no-samplegroups clean sampletable from special chars awk -v FS="\\t" -v OFS="\\t" \'{if (NF==3) print \$1,\$2,\$3,"NO__GROUP"; else print}\' sampletable > clean_sampletable # Check if there are samplegroups at all @@ -633,6 +649,12 @@ workflow { }.collectEntries() { x-> [x[0], x[2..-1]] } : [:] + // Remove channels from specific sets if those are empty: --remove_channels 'setA:126:127 setB:131' + rmch = params.remove_channels ? params.remove_channels.tokenize(' ') : false + remove_channels_psmtable = rmch ? rmch.collect { y -> y.tokenize(':') + }.collect { x -> [x[0], x[1..-1].collect { ch -> "${setisobaric[x[0]]}_${ch}" } ] } : [:] + remove_channels_sampletable = rmch ? 
rmch.collect { y -> y.tokenize(':') + }.collect { x -> [x[0], x[1..-1]] } : [:] do_ms1 = !params.noquant && !params.noms1quant do_normalize = (!params.noquant && (params.mediannormalize || params.deqms) && params.isobaric) @@ -854,7 +876,7 @@ workflow { psmtables_ch | filter { it[0] == 'decoy' } | concat(target_psmtable) - | map { [it[0], it[1], all_setnames] } + | map { [it[0], it[1], all_setnames, remove_channels_psmtable] } | splitPSMs | map{ it -> [it[0], listify(it[1]).collect() { it.baseName.replaceFirst(/\.tsv$/, "") }, it[1]]} // get setname from {setname}.tsv | transpose @@ -953,7 +975,7 @@ workflow { if (params.sampletable) { Channel.fromPath(params.sampletable) - | map { [it, params.deqms] } + | map { [it, params.deqms, remove_channels_sampletable] } | sampleTableCheckClean | set { sampletable_ch } } else { diff --git a/modules.nf b/modules.nf index a28198b..1aeafbe 100644 --- a/modules.nf +++ b/modules.nf @@ -11,6 +11,10 @@ def get_field_nr_multi(fn, fieldnames) { return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -En '(${fieldnames.join('|')})' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')" } +def get_complement_field_nr(fn, fieldname) { + /* return field nrs of all header columns except fieldname, comma separated like: 1,2,5,9 */ + return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -vwn '^${fieldname}\$' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')" +} def parse_isotype(isobtype) { return ['tmt16plex', 'tmt18plex'].any { it == isobtype } ? 
'tmtpro' : isobtype diff --git a/nextflow.config b/nextflow.config index 7047d2d..15d911b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,6 +30,7 @@ params { phospho = false maxvarmods = 2 isobaric = false + remove_channels = false instrument = 'qe' // Default instrument is Q-Exactive prectol = '10.0ppm' iso_err = '-1,2' diff --git a/nextflow_schema.json b/nextflow_schema.json index 5e0a123..ef25439 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -300,6 +300,9 @@ }, "msgf": { "type": "boolean" + }, + "remove_channels": { + "type": "string" } } } diff --git a/tests/tmt16_fast.sh b/tests/tmt16_fast.sh index 08f2002..c528a6d 100644 --- a/tests/tmt16_fast.sh +++ b/tests/tmt16_fast.sh @@ -20,5 +20,6 @@ $NXFCMD --name ${name} --outdir ${resultsdir} \ --locptms Phospho \ --psmconflvl 0.2 --pepconflvl 0.2 \ --deqms --keepnapsmsquant --genes \ + --remove_channels '0set-A:127C:128N' \ --hirief https://github.com/nf-core/test-datasets/raw/6defbf8a92a46b0ac48bb05f9ad96b62716b4a5d/testdata/formatted_known_peptides_ENSUniRefseq_TMT_predpi_20150825.txt # FIXME cannot run with carbamyl +43 -> -261 and Phospho, luciprep crash \