Remove channels feature, to remove channels which are empty instead of --keepnapsms
lehtiolab · Jan 31, 2025 · f1bee20 · f1bee20
1 parent 3f423a1
commit f1bee20
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 5 deletions.
diff --git a/main.nf b/main.nf
@@ -2,7 +2,7 @@
 
 include { paramsSummaryMap } from 'plugin/nf-schema'
 
-include { msgf_info_map; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' 
+include { msgf_info_map; get_complement_field_nr; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' 
 include { MSGFPERCO } from './workflows/msgf_perco.nf'
 include { SAGEPERCO } from './workflows/sage_perco.nf'
 include { PTMANALYSIS } from './workflows/ptms.nf'
@@ -340,18 +340,27 @@ process splitPSMs {
   container params.__containers[tag][workflow.containerEngine]
 
   input:
-  tuple val(td), path('psms'), val(setnames)
+  tuple val(td), path('psms'), val(setnames), val(remove_channels)
 
   output:
   tuple val(td), path({listify(setnames).collect { "${it}.tsv" }}) optional true
 
   script:
   """
   msstitch split -i psms --splitcol bioset
+  ${td == 'target' ?
+    remove_channels.collect {
+      setch -> setch[1].collect {
+        ch -> "colnum=${get_complement_field_nr("${setch[0]}.tsv", ch)} && \
+          cut -f \$colnum ${setch[0]}.tsv > tmprm && mv tmprm ${setch[0]}.tsv"
+        }.join(' && ')
+      }.join(' && ')
+  : ''}
   """
 }
 
 
+
 process splitTotalProteomePSMs {
 
   tag 'msstitch'
@@ -454,13 +463,20 @@ process sampleTableCheckClean {
   container params.__containers[tag][workflow.containerEngine]
 
   input:
-  tuple path('sampletable'), val(do_deqms)
+  tuple path('sampletable'), val(do_deqms), val(remove_channels)
 
   output:
   tuple path('clean_sampletable'), path('sampletable_no_special_chars')
 
   script:
   """
+  # Remove empty channels
+  ${remove_channels.collect {
+    setch -> setch[1].collect {
+      ch -> "grep -v '^${ch}\t${setch[0]}' sampletable > tmpst && mv tmpst sampletable"
+      }.join(' && ')
+    }.join(' && ')
+  }
   # First add NO__GROUP marker for no-samplegroups clean sampletable from special chars
   awk -v FS="\\t" -v OFS="\\t" \'{if (NF==3) print \$1,\$2,\$3,"NO__GROUP"; else print}\' sampletable > clean_sampletable
   # Check if there are samplegroups at all
@@ -633,6 +649,12 @@ workflow {
   }.collectEntries() {
     x-> [x[0], x[2..-1]]
   } : [:]
+  // Remove channels from specific sets if those are empty: --remove_channels 'setA:126:127 setB:131'
+  rmch = params.remove_channels ? params.remove_channels.tokenize(' ') : false
+  remove_channels_psmtable = rmch ? rmch.collect { y -> y.tokenize(':')
+  }.collect { x -> [x[0], x[1..-1].collect { ch -> "${setisobaric[x[0]]}_${ch}" } ] } : [:]
+  remove_channels_sampletable = rmch ? rmch.collect { y -> y.tokenize(':')
+  }.collect { x -> [x[0], x[1..-1]] } : [:]
 
   do_ms1 = !params.noquant && !params.noms1quant
   do_normalize = (!params.noquant && (params.mediannormalize || params.deqms) && params.isobaric)
@@ -854,7 +876,7 @@ workflow {
   psmtables_ch
   | filter { it[0] == 'decoy' }
   | concat(target_psmtable)
-  | map { [it[0], it[1], all_setnames] }
+  | map { [it[0], it[1], all_setnames, remove_channels_psmtable] }
   | splitPSMs
   | map{ it -> [it[0], listify(it[1]).collect() { it.baseName.replaceFirst(/\.tsv$/, "") }, it[1]]} // get setname from {setname}.tsv
   | transpose
@@ -953,7 +975,7 @@ workflow {
 
   if (params.sampletable) {
     Channel.fromPath(params.sampletable)
-    | map { [it, params.deqms] }
+    | map { [it, params.deqms, remove_channels_sampletable] }
     | sampleTableCheckClean
     | set { sampletable_ch }
   } else {

diff --git a/modules.nf b/modules.nf
@@ -11,6 +11,10 @@ def get_field_nr_multi(fn, fieldnames) {
     return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -En '(${fieldnames.join('|')})' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')"
 }
 
+def get_complement_field_nr(fn, fieldname) {
+    /*
+     * Build a shell command substitution string ("\$( ... )") for embedding in a
+     * process script. When the shell evaluates it, it reads the first line of
+     * file `fn`, splits it on tabs, and yields the 1-based numbers of every
+     * header field EXCEPT the one exactly equal to `fieldname`, comma separated
+     * like: 1,2,5,9 (suitable as an argument to `cut -f`).
+     *
+     * fn:        path of the tab-separated file whose header line is inspected
+     * fieldname: header name to exclude
+     *
+     * The name is matched as a fixed string against the whole header field
+     * (grep -F -x), so characters such as '.' in a field name are not treated
+     * as regex syntax; -e keeps a name starting with '-' from being parsed as
+     * an option. If no header field equals `fieldname`, all field numbers are
+     * returned.
+     */
+    return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -vnFx -e '${fieldname}' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')"
+}
 
 def parse_isotype(isobtype) {
   return ['tmt16plex', 'tmt18plex'].any { it == isobtype } ? 'tmtpro' : isobtype

diff --git a/nextflow.config b/nextflow.config
@@ -30,6 +30,7 @@ params {
   phospho = false
   maxvarmods = 2
   isobaric = false
+  remove_channels = false
   instrument = 'qe' // Default instrument is Q-Exactive
   prectol = '10.0ppm'
   iso_err = '-1,2'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -300,6 +300,9 @@
     },
     "msgf": {
       "type": "boolean"
+    },
+    "remove_channels": {
+      "type": "boolean"
     }
   }
 }
diff --git a/tests/tmt16_fast.sh b/tests/tmt16_fast.sh
@@ -20,5 +20,6 @@ $NXFCMD --name ${name} --outdir ${resultsdir} \
     --locptms Phospho \
     --psmconflvl 0.2 --pepconflvl 0.2 \
     --deqms --keepnapsmsquant --genes \
+    --remove_channels '0set-A:127C:128N' \
     --hirief https://github.com/nf-core/test-datasets/raw/6defbf8a92a46b0ac48bb05f9ad96b62716b4a5d/testdata/formatted_known_peptides_ENSUniRefseq_TMT_predpi_20150825.txt
     # FIXME cannot run with carbamyl +43 -> -261 and Phospho, luciprep crash \