diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 10932f577..7ab14c6a3 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -45,7 +45,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/minimap2.git cd minimap2 -git checkout 581f2d7123f651d04c9e103b1c7ea8f7051e909a +git checkout v2.28 # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} ${MINI_ARM_FLAGS} @@ -60,7 +60,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/minigraph.git pushd minigraph -git checkout v0.20 +git checkout v0.21 # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} @@ -75,7 +75,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/gfatools.git cd gfatools -git checkout v0.5 +git checkout c31be8a62efc6bdea4576029f7fbe84f345a6eed # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout 0c17bc4ae9a7cf174fa40805cde7f8f1f6de8225 +git checkout 1121e370880ee187ba2963f0e46e632e0e762cc5 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then @@ -235,7 +235,7 @@ fi # hal2vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/hal2vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/hal2vg chmod +x hal2vg if [[ $STATIC_CHECK -ne 1 || $(ldd hal2vg | grep so | wc -l) -eq 0 ]] then @@ -245,7 +245,7 @@ else fi # clip-vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/clip-vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/clip-vg chmod +x clip-vg if [[ $STATIC_CHECK -ne 1 || $(ldd clip-vg | grep so | wc -l) -eq 0 ]] 
then @@ -255,7 +255,7 @@ else fi # halRemoveDupes cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halRemoveDupes +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halRemoveDupes chmod +x halRemoveDupes if [[ $STATIC_CHECK -ne 1 || $(ldd halRemoveDupes | grep so | wc -l) -eq 0 ]] then @@ -265,7 +265,7 @@ else fi # halMergeChroms cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halMergeChroms +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halMergeChroms chmod +x halMergeChroms if [[ $STATIC_CHECK -ne 1 || $(ldd halMergeChroms | grep so | wc -l) -eq 0 ]] then @@ -276,7 +276,7 @@ fi # halUnclip cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halUnclip +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halUnclip chmod +x halUnclip if [[ $STATIC_CHECK -ne 1 || $(ldd halUnclip | grep so | wc -l) -eq 0 ]] then @@ -287,7 +287,7 @@ fi # filter-paf-deletions cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/filter-paf-deletions +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/filter-paf-deletions chmod +x filter-paf-deletions if [[ $STATIC_CHECK -ne 1 || $(ldd filter-paf-deletions | grep so | wc -l) -eq 0 ]] then diff --git a/doc/pangenome.md b/doc/pangenome.md index b54b4e49e..faffe55fc 100644 --- a/doc/pangenome.md +++ b/doc/pangenome.md @@ -133,6 +133,10 @@ The `--reference` option must be used to select a "reference" sample. This samp It is therefore extremely important that the reference sample's assembly be **chromosome** scale. 
If there are many small contigs in the addition to chromosomes in the reference assembly, then please consider specifying the chromosomes with `--refContigs`. If you still want to keep the other contigs, add `--otherContig chrOther` (see explanation below). +### Self-Alignment and the Collapse Option + +The `--collapse` option, added as an experimental prototype in [v2.9.1](https://github.com/ComparativeGenomicsToolkit/cactus/releases/tag/v2.9.1), can be used to incorporate self-alignments into the pangenome, *including the reference sample*. This will produce a more compact graph with, for example, tandem duplications being represented as cycles (like PGGB) rather than insertions (like minigraph). It also drops the invariant (see above) that the reference path be acyclic -- with this option, the reference path can contain cycles. The `--collapse` option is implemented by running `minimap2` using options found in the `minimapCollapseOptions` parameter in the configuration XML to align each input *contig* with itself. These self alignments are fed into Cactus alongside the usual sequence-to-minigraph alignments. + +#### Multiple Reference Samples The `--reference` option can accept multiple samples (separated by space). If multiple samples are specified beyond the first, they will be clipped as usual, but end up as "reference-sense" paths in the vg/gbz output. They can also be used as basis for VCF, and VCF files can be created based on them with the `--vcfReference` sample. 
diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index f3c9caa61..b3a952d57 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -320,6 +320,9 @@ + + + @@ -334,13 +337,17 @@ - + + diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 74cf0f695..6b21eea96 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -52,7 +52,9 @@ def main(): parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory") parser.add_argument("--reference", nargs='+', type=str, help = "Reference genome name. MAPQ filter will not be applied to it") parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)") - parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") + parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. 
Overrides graphmap cpu in configuration") + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) + parser.add_argument("--collapseRefPAF", help ="Incorporate given (reference-only) self-alignments in PAF format [Experimental]") #WDL hacks parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile") @@ -90,6 +92,14 @@ def main(): if options.reference: options.reference = options.reference[0] + if options.collapseRefPAF: + if not options.collapseRefPAF.endswith('.paf'): + raise RuntimeError('file passed to --collapseRefPAF must end with .paf') + if not options.reference: + raise RuntimeError('--reference must be used with --collapseRefPAF') + if options.collapse: + raise RuntimeError('--collapseRefPAF cannot be used with --collapse') + # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) @@ -101,6 +111,7 @@ def main(): run_time = end_time - start_time logger.info("cactus-graphmap has finished after {} seconds".format(run_time)) + def graph_map(options): with Toil(options) as toil: importSingularityImage(options) @@ -137,6 +148,12 @@ def graph_map(options): if options.batchSystem.lower() in ['single_machine', 'singleMachine']: mg_cores = min(mg_cores, cactus_cpu_count(), int(options.maxCores) if options.maxCores else sys.maxsize) findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores) + + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' + if options.collapseRefPAF: + assert options.reference + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference' # get the minigraph "virutal" assembly name graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") @@ -169,6 +186,12 @@ def graph_map(options): logger.info("Importing {}".format(options.minigraphGFA)) gfa_id = 
toil.importFile(makeURL(options.minigraphGFA)) + #import the reference collapse paf + ref_collapse_paf_id = None + if options.collapseRefPAF: + logger.info("Importing {}".format(options.collapseRefPAF)) + ref_collapse_paf_id = toil.importFile(options.collapseRefPAF) + #import the sequences (that we need to align for the given event, ie leaves and outgroups) seq_id_map = {} fa_id_map = {} @@ -185,7 +208,7 @@ def graph_map(options): # run the workflow paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log, paf_was_filtered = toil.start(Job.wrapJobFn( - minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event)) + minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event, True, ref_collapse_paf_id)) #export the paf toil.exportFile(paf_id, makeURL(options.outputPAF)) @@ -203,7 +226,7 @@ def graph_map(options): if options.outputFasta: add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event) -def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize=True): +def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize, ref_collapse_paf_id): """ Overall workflow takes command line options and returns (paf-id, (optional) fa-id) """ fa_id = None gfa_id_size = gfa_id.size @@ -222,6 +245,11 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa sanitize_job = root_job.addChildJobFn(sanitize_fasta_headers, seq_id_map, pangenome=True) seq_id_map = sanitize_job.rv() + # add unique prefixes to the input PAF + if ref_collapse_paf_id: + ref_collapse_paf_id = root_job.addChildJobFn(add_paf_prefixes, ref_collapse_paf_id, options.reference, + disk=2*ref_collapse_paf_id.size).rv() + zipped_gfa = options.minigraphGFA.endswith('.gz') if options.outputFasta: # convert GFA to fasta @@ -239,6 +267,7 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa paf_job = Job.wrapJobFn(minigraph_map_all, config, gfa_id, 
seq_id_map, graph_event) root_job.addFollowOn(paf_job) + collapse_paf_id = ref_collapse_paf_id if options.reference: # extract a PAF directly from the rGFAs tag for the given reference # if --refFromGFA is specified, we get the entire alignment from that, otherwise we just take contigs @@ -249,7 +278,16 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa root_job.addChild(gfa2paf_job) else: paf_job.addFollowOn(gfa2paf_job) - merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) + collapse_mode = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") + if collapse_mode in ['reference', 'all', 'nonref']: + collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference, collapse_mode) + + if ref_collapse_paf_id: + collapse_paf_id = collapse_job.addFollowOnJobFn(merge_pafs, {"1":collapse_job.rv(), "2":ref_collapse_paf_id}, + disk=gfa_id_size).rv() + else: + collapse_paf_id = collapse_job.rv() + merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) paf_job.addFollowOn(merge_paf_job) gfa2paf_job.addFollowOn(merge_paf_job) out_paf_id = merge_paf_job.rv() @@ -274,12 +312,27 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa out_paf_id = del_filter_job.rv(0) filtered_paf_log = del_filter_job.rv(1) paf_was_filtered = del_filter_job.rv(2) + prev_job = del_filter_job + + if collapse_paf_id: + # note: the collapse paf doesn't get merged into unfiltered_paf + merge_collapse_job = prev_job.addFollowOnJobFn(merge_pafs, {"1" : out_paf_id, "2" : collapse_paf_id}, disk=gfa_id_size) + out_paf_id = merge_collapse_job.rv() return out_paf_id, fa_id if options.outputFasta else None, paf_job.rv(1), unfiltered_paf_id, filtered_paf_log, paf_was_filtered - + +def add_paf_prefixes(job, paf_id, name): + """ Add prefixes to paf """ + work_dir = 
job.fileStore.getLocalTempDir() + paf_path = os.path.join(work_dir, name + ".paf") + job.fileStore.readGlobalFile(paf_id, paf_path) + renamed_paf_path = os.path.join(work_dir, name + '.prefixed.paf') + cmd = ['awk', 'BEGIN{{OFS=\" \"}} {{$1="id={}|"$1; $6="id={}|"$6; print}}'.format(name, name), paf_path] + cactus_call(parameters=cmd, outfile=renamed_paf_path) + return job.fileStore.writeGlobalFile(renamed_paf_path) + def make_minigraph_fasta(job, gfa_file_id, gfa_file_path, name): """ Use gfatools to make the minigraph "assembly" """ - # note: using the toil-vg convention of naming working files manually so that logging is more readable work_dir = job.fileStore.getLocalTempDir() gfa_path = os.path.join(work_dir, "mg.gfa") @@ -430,6 +483,54 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p cactus_call(parameters=cmd, outfile=paf_path) return job.fileStore.writeGlobalFile(paf_path) +def self_align_all(job, config, seq_id_map, reference, collapse_mode): + """ run self-alignment. 
if reference event given, just run on that, otherwise do all genomes """ + assert collapse_mode in ['reference', 'all', 'nonref'] + assert reference or collapse_mode == 'all' + root_job = Job() + job.addChild(root_job) + events = [] + for event in seq_id_map.keys(): + if collapse_mode == 'all' or (collapse_mode == 'reference' and event == reference) or \ + (collapse_mode == 'nonref' and event != reference): + events.append(event) + mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1) + paf_dict = {} + paf_size = 0 + for event in events: + paf_size += seq_id_map[event].size + collapse_job = root_job.addChildJobFn(self_align, config, event, seq_id_map[event], + disk=4*seq_id_map[event].size, + memory=4*seq_id_map[event].size, + cores=mg_cores) + paf_dict[event] = collapse_job.rv() + + merge_paf_job = root_job.addFollowOnJobFn(merge_pafs, paf_dict, + disk=4*paf_size) + return merge_paf_job.rv() + +def self_align(job, config, seq_name, seq_id): + """ run minimap2 self alignment on a single genome """ + work_dir = job.fileStore.getLocalTempDir() + seq_name = seq_name.split('|')[-1] + fa_path = os.path.join(work_dir, seq_name + '.fa') + job.fileStore.readGlobalFile(seq_id, fa_path) + cactus_call(parameters=['samtools', 'faidx', fa_path]) + contigs = [] + with open(fa_path + '.fai', 'r') as fai_file: + for line in fai_file: + if line.strip(): + contigs.append(line.split()[0]) + paf_path = os.path.join(work_dir, seq_name + '.self.paf') + xml_node = findRequiredNode(config.xmlRoot, "graphmap") + minimap_opts = getOptionalAttrib(xml_node, "minimapCollapseOptions", str, default="") + for contig in contigs: + contig_path = os.path.join(work_dir, '{}.{}.fa'.format(seq_name, contig)) + cactus_call(parameters=['samtools', 'faidx', fa_path, contig, '-o', contig_path]) + cmd = ['minimap2', '-t', str(job.cores)] + minimap_opts.split() + [contig_path, contig_path] + cactus_call(parameters=cmd, outfile=paf_path, outappend=True) + 
return job.fileStore.writeGlobalFile(paf_path) + def filter_paf(job, paf_id, config, reference=None): """ run basic paf-filtering. these are quick filters that are best to do on-the-fly when reading the paf and as such, they are called by cactus-graphmap-split and cactus-align, not here """ @@ -441,15 +542,21 @@ def filter_paf(job, paf_id, config, reference=None): min_block = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minGAFBlockLength", typeFn=int, default=0) min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ", typeFn=int, default=0) min_ident = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minIdentity", typeFn=float, default=0) - RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={}".format(min_block, min_mapq, min_ident)) + min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0) + max_collapse_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "maxCollapseDistanceRatio", typeFn=float, default=-1) + RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={} minScore={} maxCollapseDistanceRatio={}".format(min_block, min_mapq, min_ident, min_score, max_collapse_ratio)) with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file: for line in paf_file: toks = line.split('\t') - is_ref = reference and toks[0].startswith('id={}|'.format(reference)) + query_name = toks[0] + target_name = toks[5] + is_ref = reference and query_name.startswith('id={}|'.format(reference)) and query_name != target_name mapq = int(toks[11]) query_len = int(toks[1]) ident = float(toks[9]) / (float(toks[10]) + 0.00000001) bl = None + score = None + collapse_ratio = None for tok in toks[12:]: # this is a special tag that was written by gaf2paf in order to preserve the original gaf block length # we use it to be able to filter by the gaf block even after it's been broken 
in the paf @@ -458,12 +565,26 @@ def filter_paf(job, paf_id, config, reference=None): # we can also get the identity of the parent gaf block if tok.startswith('gi:i:'): ident = min(ident, float(toks[5:])) - if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident): + if tok.startswith('AS:i:'): + score = int(tok[5:]) + if query_name == target_name and max_collapse_ratio >= 0: + # compute the distance between the minimap2 self alignment intervals + # in order to see if they are too far apart via the max ratio + query_start, query_end = int(toks[2]), int(toks[3]) + target_start, target_end = int(toks[7]), int(toks[8]) + block_length = int(toks[10]) + dist = target_start - query_end if query_start < target_start else query_start - target_end + if block_length > 0 and dist > 0: + collapse_ratio = dist / block_length + if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \ + (score is None or score >= min_score) and (collapse_ratio is None or collapse_ratio <= max_collapse_ratio)): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) - length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0) - if overlap_ratio: + length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0) + allow_collapse = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") != "none" + + if overlap_ratio and not allow_collapse: overlap_filter_paf_path = filter_paf_path + ".overlap" cactus_call(parameters=['gaffilter', filter_paf_path, '-p', '-r', str(overlap_ratio), '-m', str(length_ratio), '-b', str(min_block), '-q', str(min_mapq), '-i', str(min_ident)], diff --git 
a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 6dc791ed4..dfcdaad51 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -149,6 +149,8 @@ def graphmap_join_options(parser): help="Memory in bytes for each indexing and vcf construction job job (defaults to an estimate based on the input data size). If specified will also be used to upper-bound per-chromosome memory estimates -- ie no job will request more than this much memory." "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) + def graphmap_join_validate_options(options): """ make sure the options make sense and fill in sensible defaults """ if options.hal and len(options.hal) != len(options.vg): @@ -347,6 +349,9 @@ def graphmap_join(options): configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) config.substituteAllPredefinedConstantsWithLiterals(options) + + if options.collapse: + findRequiredNode(configNode, "graphmap").attrib["collapse"] = 'all' # load up the vgs vg_ids = [] @@ -701,6 +706,10 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): clip_vg_cmd += ['-a', graph_event] clip_vg_cmd += ['-o', clipped_bed_path] + # disable reference cycle check if desired + if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ["all", "reference"]: + clip_vg_cmd += ['-c'] + cmd.append(clip_vg_cmd) # enforce chopping @@ -860,6 +869,8 @@ def vg_to_gfa(job, options, config, vg_path, vg_id, unchopped=False): input_path = '-' if unchopped else os.path.basename(vg_path) cmd = ['vg', 'convert', '-f', '-Q', options.reference[0], input_path, '-B'] + if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") not in ["all", 
"reference"]: + cmd += ['-Q', options.reference[0], '-B'] if unchopped: cmd = [['vg', 'mod', '-u', os.path.basename(vg_path)], cmd] diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py index 3cd4de952..1144631b1 100644 --- a/src/cactus/refmap/cactus_graphmap_split.py +++ b/src/cactus/refmap/cactus_graphmap_split.py @@ -55,6 +55,8 @@ def main(): parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp") parser.add_argument("--minIdentity", type=float, help = "Ignore PAF lines with identity (column 10/11) < this (overrides minIdentity in in config)") parser.add_argument("--permissiveContigFilter", nargs='?', const='0.25', default=None, type=float, help = "If specified, override the configuration to accept contigs so long as they have at least given fraction of coverage (0.25 if no fraction specified). This can increase sensitivity of very small, fragmented and/or diverse assemblies.") + + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", @@ -103,6 +105,9 @@ def cactus_graphmap_split(options): config = ConfigWrapper(config_node) config.substituteAllPredefinedConstantsWithLiterals(options) + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' + #Run the workflow if options.restart: wf_output = toil.restart() diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index c21e0736b..a14301362 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -81,7 +81,10 @@ def main(): parser.add_argument("--consMemory", type=human2bytesN, help="Memory in bytes for each cactus_consolidated job (defaults to an estimate based on the input data size). 
" "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) - + + # cactus-graphmap options + parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") + # cactus-graphmap-join options graphmap_join_options(parser) @@ -153,6 +156,14 @@ def main(): if options.maxCores is not None and options.maxCores < 2**20 and options.consCores > int(options.maxCores): raise RuntimeError('--consCores must be <= --maxCores') + if options.collapseRefPAF: + if not options.collapseRefPAF.endswith('.paf'): + raise RuntimeError('file passed to --collapseRefPAF must end with .paf') + if not options.reference: + raise RuntimeError('--reference must be used with --collapseRefPAF') + if options.collapse: + raise RuntimeError('--collapseRefPAF cannot be used with --collapse') + # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later # (but we want to do as much error-checking upfront as possible) @@ -198,7 +209,18 @@ def main(): else: if not options.mgCores: raise RuntimeError("--mgCores required run *not* running on single machine batch system") - + + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' + + #import the reference collapse paf + ref_collapse_paf_id = None + if options.collapseRefPAF: + logger.info("Importing {}".format(options.collapseRefPAF)) + ref_collapse_paf_id = toil.importFile(options.collapseRefPAF) + assert options.reference + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference' + #import the sequences input_seq_id_map = {} input_path_map = {} @@ -217,7 +239,7 @@ def main(): if genome not in options.reference: input_seq_order.append(genome) - toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, input_seq_id_map, input_path_map, input_seq_order)) + toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, 
input_seq_id_map, input_path_map, input_seq_order, ref_collapse_paf_id)) end_time = timeit.default_timer() run_time = end_time - start_time @@ -315,7 +337,7 @@ def export_join_wrapper(job, options, wf_output): """ toil join wrapper for cactus_graphmap_join """ export_join_data(job.fileStore, options, wf_output[0], wf_output[1], wf_output[2], wf_output[3], wf_output[4], wf_output[5]) -def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order): +def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order, ref_collapse_paf_id): """ chain the entire workflow together, doing exports after each step to mitigate annoyance of failures """ root_job = Job() job.addChild(root_job) @@ -341,7 +363,7 @@ def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_ options.outputFasta = gfa_fa_path graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") - graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, sanitize=False) + graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, False, ref_collapse_paf_id) paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log = graphmap_job.rv(0), graphmap_job.rv(1), graphmap_job.rv(2), graphmap_job.rv(3), graphmap_job.rv(4) graphmap_export_job = graphmap_job.addFollowOnJobFn(export_graphmap_wrapper, options, paf_id, paf_path, gaf_id, unfiltered_paf_id, paf_filter_log) diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index b49d27444..cb433aead 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -59,6 +59,9 @@ def main(): parser.add_argument("--pangenome", action="store_true", help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding 
several configuration settings." " The overridden configuration will be saved in .pg-conf.xml") + + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) + parser.add_argument("--singleCopySpecies", type=str, help="Filter out all self-alignments in given species") parser.add_argument("--barMaskFilter", type=int, default=None, @@ -165,7 +168,7 @@ def main(): # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) - + logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) logger.info('Cactus Commit: {}'.format(cactus_commit)) start_time = timeit.default_timer() @@ -256,6 +259,8 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): config_wrapper = ConfigWrapper(config_node) config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.initGPU(options) + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' config_wrapper.setSystemMemory(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper, @@ -331,6 +336,7 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): poaNode.attrib["partialOrderAlignmentDisableSeeding"] = "1" # import the PAF alignments + logger.info("Importing {}".format(options.pafFile)) paf_id = toil.importFile(makeURL(options.pafFile)) #import the sequences @@ -396,8 +402,9 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, sub_tree = get_subtree(mc_tree, root_name, config_wrapper, og_map, include_outgroups=False) # run the hal export + allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ['reference', 'all'] hal_job = cons_job.addFollowOnJobFn(export_hal, sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True, - checkpointInfo=checkpointInfo, 
acyclicEvent=referenceEvents[0] if referenceEvents else None, + checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents and not allow_collapse else None, memory_override=cons_memory) # clean out some of the intermediate jobstore files diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py index 315ae2667..4af78df69 100644 --- a/src/cactus/shared/common.py +++ b/src/cactus/shared/common.py @@ -1180,7 +1180,7 @@ def get_faidx_subpath_rename_cmd(): happy. But it does require conversion going into vg which wants chr[9-15] and hal2vg is updated to do this autmatically """ - return ['sed', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e'] + return ['sed', '-e', 's/\([^:]*\):\([0-9]+\)-\([0-9]+\)/echo "\\1_sub_$((\\2-1))_\\3"/e'] def clean_jobstore_files(job, file_id_maps=None, file_ids=None): """ clean some intermediate files that are no longer needed out of the jobstore """ diff --git a/test/evolverTest.py b/test/evolverTest.py index d3ac147ca..4789fb613 100644 --- a/test/evolverTest.py +++ b/test/evolverTest.py @@ -441,7 +441,7 @@ def _run_evolver_primates_pangenome(self, binariesMode): out_dir = os.path.dirname(self._out_hal(binariesMode)) out_name = os.path.splitext(os.path.basename(self._out_hal(binariesMode)))[0] cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), seq_file_path, '--reference', 'simHuman', 'simChimp', - '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo'] + '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo', '--collapse'] subprocess.check_call(cactus_pangenome_cmd + cactus_opts) # cactus-pangenome tacks on the .full to the output name