From f7266f626e8c017b0b44b4203f5ff58712b938a0 Mon Sep 17 00:00:00 2001
From: davidstew <daastewa@ucsc.edu>
Date: Fri, 18 May 2018 16:04:57 -0700
Subject: [PATCH] Make majority a user provided option (Resolves #286)

Resolves #286

Added snv_majority and indel_majority in a consensus tab under mutation_calling
in input_parameters.yaml. If values are not provided, the majority will be dynamically
figured out.
---
 MANUAL.md                                  |  5 +++++
 src/protect/common.py                      |  2 +-
 src/protect/mutation_calling/common.py     | 19 +++++++++++++------
 src/protect/pipeline/ProTECT.py            | 22 +++++++++++++++++++---
 src/protect/pipeline/defaults.yaml         |  3 +++
 src/protect/pipeline/input_parameters.yaml |  3 +++
 6 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/MANUAL.md b/MANUAL.md
index 4d536b31..d08d9da7 100644
--- a/MANUAL.md
+++ b/MANUAL.md
@@ -335,6 +335,11 @@ be substituted with S3 links. Descriptions for creating all files can be found i
             version: 1.2.0
 
     mutation_calling:
+        consensus:
+            indel_majority: None                                  -> Number of callers required to accept an indel.
+                                                                     A None value will dynamically compute the majority.
+            snv_majority: None                                    -> Number of callers required to accept an snv.
+                                                                     A None value will dynamically compute the majority.
         indexes:
             chromsosomes: canonical, canonical_chr, chr1, Y       -> A list of chromosomes to process.
                                                                      This options overrides the
diff --git a/src/protect/common.py b/src/protect/common.py
index 17d72b5e..1839a2a5 100644
--- a/src/protect/common.py
+++ b/src/protect/common.py
@@ -638,7 +638,7 @@ def email_report(job, univ_options):
         server = smtplib.SMTP('localhost')
     except socket.error as e:
         if e.errno == 111:
-            print('No mail utils on this maachine')
+            print('No mail utils on this machine')
         else:
             print('Unexpected error while attempting to send an email.')
         print('Could not send email report')
diff --git a/src/protect/mutation_calling/common.py b/src/protect/mutation_calling/common.py
index 301eaae2..d0c493b8 100644
--- a/src/protect/mutation_calling/common.py
+++ b/src/protect/mutation_calling/common.py
@@ -51,13 +51,14 @@ def chromosomes_from_fai(genome_fai):
     return chromosomes
 
 
-def run_mutation_aggregator(job, mutation_results, univ_options):
+def run_mutation_aggregator(job, mutation_results, univ_options, consensus_options):
     """
     Aggregate all the called mutations.
 
     :param dict mutation_results: Dict of dicts of the various mutation callers in a per chromosome
            format
     :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict consensus_options: options specific for consensus mutation calling
     :returns: fsID for the merged mutations file
     :rtype: toil.fileStore.FileID
     """
@@ -80,7 +81,7 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
     if chroms:
         for chrom in chroms:
             out[chrom] = job.addChildJobFn(merge_perchrom_mutations, chrom, mutation_results,
-                                           univ_options).rv()
+                                           univ_options, consensus_options).rv()
         merged_snvs = job.addFollowOnJobFn(merge_perchrom_vcfs, out, 'merged', univ_options)
         job.fileStore.logToMaster('Aggregated mutations for %s successfully' % univ_options['patient'])
         return merged_snvs.rv()
@@ -89,7 +90,7 @@ def run_mutation_aggregator(job, mutation_results, univ_options):
 
 
 
-def merge_perchrom_mutations(job, chrom, mutations, univ_options):
+def merge_perchrom_mutations(job, chrom, mutations, univ_options, consensus_options):
     """
     Merge the mutation calls for a single chromosome.
 
@@ -97,6 +98,7 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
     :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
            per chromosome job store ids for vcfs as value
     :param dict univ_options: Dict of universal options used by almost all tools
+    :param dict consensus_options: options specific for consensus mutation calling
     :returns fsID for vcf contaning merged calls for the given chromosome
     :rtype: toil.fileStore.FileID
     """
@@ -109,13 +111,13 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
     mutations.pop('indels')
     mutations['strelka_indels'] = mutations['strelka']['indels']
     mutations['strelka_snvs'] = mutations['strelka']['snvs']
-    vcf_processor = {'snvs': {'mutect': process_mutect_vcf,
+    vcf_processor = {'snv': {'mutect': process_mutect_vcf,
                               'muse': process_muse_vcf,
                               'radia': process_radia_vcf,
                               'somaticsniper': process_somaticsniper_vcf,
                               'strelka_snvs': process_strelka_vcf
                               },
-                     'indels': {'strelka_indels': process_strelka_vcf
+                     'indel': {'strelka_indels': process_strelka_vcf
                                 }
                      }
     accepted_hits = defaultdict(dict)
@@ -131,7 +133,12 @@ def merge_perchrom_mutations(job, chrom, mutations, univ_options):
         if 'strelka_' + mut_type in perchrom_mutations:
             perchrom_mutations['strelka'] = perchrom_mutations['strelka_' + mut_type]
             perchrom_mutations.pop('strelka_' + mut_type)
-        majority = 1 if len(perchrom_mutations) <= 2 else (len(perchrom_mutations) + 1) / 2
+        if consensus_options[mut_type + '_majority'] is not None:
+            majority = consensus_options[mut_type + '_majority']
+        elif len(perchrom_mutations) <= 2:
+            majority = 1
+        else:
+            majority = (len(perchrom_mutations) + 1) / 2
         # Read in each file to a dict
         vcf_lists = {caller: read_vcf(vcf_file) for caller, vcf_file in perchrom_mutations.items()}
         all_positions = list(set(itertools.chain(*vcf_lists.values())))
diff --git a/src/protect/pipeline/ProTECT.py b/src/protect/pipeline/ProTECT.py
index 2223f661..fd30ca5e 100644
--- a/src/protect/pipeline/ProTECT.py
+++ b/src/protect/pipeline/ProTECT.py
@@ -434,8 +434,24 @@ def _parse_config_file(job, config_file, max_cores=None):
     job.fileStore.logToMaster('Obtaining tool inputs')
     if (sample_without_variants and all(tool_options[k]['run'] is False
                                         for k in mutation_caller_list
-                                        if k not in ('indexes', 'fusion_inspector'))):
+                                        if k not in ('indexes', 'fusion_inspector', 'consensus'))):
         raise RuntimeError("Cannot run mutation callers if all callers are set to run = False.")
+
+    for mut_type in ('snv', 'indel'):
+        if tool_options['consensus'][mut_type + '_majority'] is not None:
+            if not isinstance(tool_options['consensus'][mut_type + '_majority'], int):
+                raise RuntimeError('Majorities have to be integers. Got %s for %s_majority.' %
+                                   (tool_options['consensus'][mut_type + '_majority'], mut_type))
+            if mut_type == 'snv':
+                count = sum(tool_options[k]['run'] is True
+                            for k in ('muse', 'mutect', 'somaticsniper', 'radia', 'strelka'))
+            else:
+                count = 1 if tool_options['strelka']['run'] is True else 0
+            if tool_options['consensus'][mut_type + '_majority'] > count:
+                raise RuntimeError('Majority cannot be greater than the number of callers. Got number of %s callers '
+                                   '= %s majority = %s.' % (mut_type, count,
+                                                            tool_options['consensus'][mut_type + '_majority']))
+            
     process_tool_inputs = job.addChildJobFn(get_all_tool_inputs, tool_options,
                                             mutation_caller_list=mutation_caller_list)
     job.fileStore.logToMaster('Obtained tool inputs')
@@ -687,7 +703,7 @@ def launch_protect(job, patient_data, univ_options, tool_options):
         bam_files['tumor_rna'].addChild(mutations['radia'])
         get_mutations = job.wrapJobFn(run_mutation_aggregator,
                                       {caller: cjob.rv() for caller, cjob in mutations.items()},
-                                      univ_options, disk='100M', memory='100M',
+                                      univ_options, tool_options['consensus'], disk='100M', memory='100M',
                                       cores=1).encapsulate()
         for caller in mutations:
             mutations[caller].addChild(get_mutations)
@@ -777,7 +793,7 @@ def get_all_tool_inputs(job, tools, outer_key='', mutation_caller_list=None):
         indexes = tools.pop('indexes')
         indexes['chromosomes'] = parse_chromosome_string(job, indexes['chromosomes'])
         for mutation_caller in mutation_caller_list:
-            if mutation_caller == 'indexes':
+            if mutation_caller in ('indexes', 'consensus'):
                 continue
             tools[mutation_caller].update(indexes)
     return tools
diff --git a/src/protect/pipeline/defaults.yaml b/src/protect/pipeline/defaults.yaml
index 046e043d..51be7ce0 100644
--- a/src/protect/pipeline/defaults.yaml
+++ b/src/protect/pipeline/defaults.yaml
@@ -55,6 +55,9 @@ expression_estimation:
         version: 1.2.20
 
 mutation_calling:
+    consensus:
+        indel_majority: None
+        snv_majority: None
     indexes:
         chromosomes:
     mutect:
diff --git a/src/protect/pipeline/input_parameters.yaml b/src/protect/pipeline/input_parameters.yaml
index 2d6d652e..cad56a84 100644
--- a/src/protect/pipeline/input_parameters.yaml
+++ b/src/protect/pipeline/input_parameters.yaml
@@ -83,6 +83,9 @@ expression_estimation:
         # version: 1.2.0
 
 mutation_calling:
+    consensus:
+        indel_majority: None
+        snv_majority: None
     indexes:
         chromosomes: canonical_chr, chrM
         genome_fasta: S3://protect-data/hg38_references/hg38.fa.tar.gz