From f1bee20d95573909efba65453d5eebadb0169ab2 Mon Sep 17 00:00:00 2001
From: Jorrit Boekel <jorrit.boekel@scilifelab.se>
Date: Fri, 31 Jan 2025 20:56:04 +0100
Subject: [PATCH] Remove channels feature, to remove channels which are empty
 instead of --keepnapsms

---
 main.nf              | 32 +++++++++++++++++++++++++++-----
 modules.nf           |  4 ++++
 nextflow.config      |  1 +
 nextflow_schema.json |  3 +++
 tests/tmt16_fast.sh  |  1 +
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 1634666..377e5a0 100644
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 include { paramsSummaryMap } from 'plugin/nf-schema'
 
-include { msgf_info_map; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' 
+include { msgf_info_map; get_complement_field_nr; listify; stripchars_infile; get_regex_specialchars; read_header } from './modules.nf' 
 include { MSGFPERCO } from './workflows/msgf_perco.nf'
 include { SAGEPERCO } from './workflows/sage_perco.nf'
 include { PTMANALYSIS } from './workflows/ptms.nf'
@@ -340,7 +340,7 @@ process splitPSMs {
   container params.__containers[tag][workflow.containerEngine]
 
   input:
-  tuple val(td), path('psms'), val(setnames)
+  tuple val(td), path('psms'), val(setnames), val(remove_channels)
 
   output:
   tuple val(td), path({listify(setnames).collect { "${it}.tsv" }}) optional true
@@ -348,10 +348,19 @@ process splitPSMs {
   script:
   """
   msstitch split -i psms --splitcol bioset
+  ${td == 'target' ?
+    remove_channels.collect {
+      setch -> setch[1].collect {
+        ch -> "colnum=${get_complement_field_nr("${setch[0]}.tsv", ch)} && \
+          cut -f \$colnum ${setch[0]}.tsv > tmprm && mv tmprm ${setch[0]}.tsv"
+        }.join(' && ')
+      }.join(' && ')
+  : ''}
   """
 }
 
 
+
 process splitTotalProteomePSMs {
 
   tag 'msstitch'
@@ -454,13 +463,20 @@ process sampleTableCheckClean {
   container params.__containers[tag][workflow.containerEngine]
  
   input:
-  tuple path('sampletable'), val(do_deqms)
+  tuple path('sampletable'), val(do_deqms), val(remove_channels)
 
   output:
   tuple path('clean_sampletable'), path('sampletable_no_special_chars')
   
   script:
   """
+  # Remove sampletable rows of empty channels; the trailing tab anchors the set name so set1 does not also hit set10
+  ${remove_channels.collect {
+    setch -> setch[1].collect {
+      ch -> "grep -v '^${ch}\t${setch[0]}\t' sampletable > tmpst && mv tmpst sampletable"
+      }.join(' && ')
+    }.join(' && ')
+  }
   # First add NO__GROUP marker for no-samplegroups clean sampletable from special chars
   awk -v FS="\\t" -v OFS="\\t" \'{if (NF==3) print \$1,\$2,\$3,"NO__GROUP"; else print}\' sampletable > clean_sampletable
   # Check if there are samplegroups at all
@@ -633,6 +649,12 @@ workflow {
   }.collectEntries() {
     x-> [x[0], x[2..-1]]
   } : [:]
+  // Remove channels from specific sets if those are empty: --remove_channels 'setA:126:127 setB:131'
+  rmch = params.remove_channels ? params.remove_channels.tokenize(' ') : false
+  remove_channels_psmtable = rmch ? rmch.collect { y -> y.tokenize(':')
+  }.collect { x -> [x[0], x[1..-1].collect { ch -> "${setisobaric[x[0]]}_${ch}" } ] } : [:]
+  remove_channels_sampletable = rmch ? rmch.collect { y -> y.tokenize(':')
+  }.collect { x -> [x[0], x[1..-1]] } : [:]
   
   do_ms1 = !params.noquant && !params.noms1quant
   do_normalize = (!params.noquant && (params.mediannormalize || params.deqms) && params.isobaric)
@@ -854,7 +876,7 @@ workflow {
   psmtables_ch
   | filter { it[0] == 'decoy' }
   | concat(target_psmtable)
-  | map { [it[0], it[1], all_setnames] }
+  | map { [it[0], it[1], all_setnames, remove_channels_psmtable] }
   | splitPSMs
   | map{ it -> [it[0], listify(it[1]).collect() { it.baseName.replaceFirst(/\.tsv$/, "") }, it[1]]} // get setname from {setname}.tsv
   | transpose
@@ -953,7 +975,7 @@ workflow {
 
   if (params.sampletable) {
     Channel.fromPath(params.sampletable)
-    | map { [it, params.deqms] }
+    | map { [it, params.deqms, remove_channels_sampletable] }
     | sampleTableCheckClean
     | set { sampletable_ch }
   } else {
diff --git a/modules.nf b/modules.nf
index a28198b..1aeafbe 100644
--- a/modules.nf
+++ b/modules.nf
@@ -11,6 +11,10 @@ def get_field_nr_multi(fn, fieldnames) {
     return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -En '(${fieldnames.join('|')})' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')"
 }
 
+def get_complement_field_nr(fn, fieldname) {
+    /* return a shell command substitution giving the header field nrs of fn for all fields except fieldname, comma separated like: 1,2,5,9 */
+    return "\$(head -n1 ${fn} | tr '\\t' '\\n' | grep -vwn '^${fieldname}\$' | cut -f 1 -d':' | tr '\\n' ',' | sed 's/\\,\$//')"
+}
 
 def parse_isotype(isobtype) {
   return ['tmt16plex', 'tmt18plex'].any { it == isobtype } ? 'tmtpro' : isobtype
diff --git a/nextflow.config b/nextflow.config
index 7047d2d..15d911b 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -30,6 +30,7 @@ params {
   phospho = false
   maxvarmods = 2
   isobaric = false
+  remove_channels = false
   instrument = 'qe' // Default instrument is Q-Exactive
   prectol = '10.0ppm'
   iso_err = '-1,2'
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5e0a123..ef25439 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -300,6 +300,9 @@
     },
     "msgf": {
       "type": "boolean"
+    },
+    "remove_channels": {
+      "type": "string"
     }
   }
 }
diff --git a/tests/tmt16_fast.sh b/tests/tmt16_fast.sh
index 08f2002..c528a6d 100644
--- a/tests/tmt16_fast.sh
+++ b/tests/tmt16_fast.sh
@@ -20,5 +20,6 @@ $NXFCMD --name ${name} --outdir ${resultsdir} \
     --locptms Phospho \
     --psmconflvl 0.2 --pepconflvl 0.2 \
     --deqms --keepnapsmsquant --genes \
+    --remove_channels '0set-A:127C:128N' \
     --hirief https://github.com/nf-core/test-datasets/raw/6defbf8a92a46b0ac48bb05f9ad96b62716b4a5d/testdata/formatted_known_peptides_ENSUniRefseq_TMT_predpi_20150825.txt
     # FIXME cannot run with carbamyl +43 -> -261 and Phospho, luciprep crash \