diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools
index 10932f577..7ab14c6a3 100755
--- a/build-tools/downloadPangenomeTools
+++ b/build-tools/downloadPangenomeTools
@@ -45,7 +45,7 @@ fi
cd ${pangenomeBuildDir}
git clone https://github.com/lh3/minimap2.git
cd minimap2
-git checkout 581f2d7123f651d04c9e103b1c7ea8f7051e909a
+git checkout v2.28
# hack in flags support
sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/'
make -j ${numcpu} ${MINI_ARM_FLAGS}
@@ -60,7 +60,7 @@ fi
cd ${pangenomeBuildDir}
git clone https://github.com/lh3/minigraph.git
pushd minigraph
-git checkout v0.20
+git checkout v0.21
# hack in flags support
sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/'
make -j ${numcpu}
@@ -75,7 +75,7 @@ fi
cd ${pangenomeBuildDir}
git clone https://github.com/lh3/gfatools.git
cd gfatools
-git checkout v0.5
+git checkout c31be8a62efc6bdea4576029f7fbe84f345a6eed
# hack in flags support
sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/'
make -j ${numcpu}
@@ -176,7 +176,7 @@ fi
cd ${pangenomeBuildDir}
git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git
cd cactus-gfa-tools
-git checkout 0c17bc4ae9a7cf174fa40805cde7f8f1f6de8225
+git checkout 1121e370880ee187ba2963f0e46e632e0e762cc5
make -j ${numcpu}
if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]]
then
@@ -235,7 +235,7 @@ fi
# hal2vg
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/hal2vg
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/hal2vg
chmod +x hal2vg
if [[ $STATIC_CHECK -ne 1 || $(ldd hal2vg | grep so | wc -l) -eq 0 ]]
then
@@ -245,7 +245,7 @@ else
fi
# clip-vg
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/clip-vg
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/clip-vg
chmod +x clip-vg
if [[ $STATIC_CHECK -ne 1 || $(ldd clip-vg | grep so | wc -l) -eq 0 ]]
then
@@ -255,7 +255,7 @@ else
fi
# halRemoveDupes
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halRemoveDupes
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halRemoveDupes
chmod +x halRemoveDupes
if [[ $STATIC_CHECK -ne 1 || $(ldd halRemoveDupes | grep so | wc -l) -eq 0 ]]
then
@@ -265,7 +265,7 @@ else
fi
# halMergeChroms
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halMergeChroms
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halMergeChroms
chmod +x halMergeChroms
if [[ $STATIC_CHECK -ne 1 || $(ldd halMergeChroms | grep so | wc -l) -eq 0 ]]
then
@@ -276,7 +276,7 @@ fi
# halUnclip
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halUnclip
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halUnclip
chmod +x halUnclip
if [[ $STATIC_CHECK -ne 1 || $(ldd halUnclip | grep so | wc -l) -eq 0 ]]
then
@@ -287,7 +287,7 @@ fi
# filter-paf-deletions
cd ${pangenomeBuildDir}
-wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/filter-paf-deletions
+wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/filter-paf-deletions
chmod +x filter-paf-deletions
if [[ $STATIC_CHECK -ne 1 || $(ldd filter-paf-deletions | grep so | wc -l) -eq 0 ]]
then
diff --git a/doc/pangenome.md b/doc/pangenome.md
index b54b4e49e..faffe55fc 100644
--- a/doc/pangenome.md
+++ b/doc/pangenome.md
@@ -133,6 +133,10 @@ The `--reference` option must be used to select a "reference" sample. This samp
It is therefore extremely important that the reference sample's assembly be **chromosome** scale. If there are many small contigs in the addition to chromosomes in the reference assembly, then please consider specifying the chromosomes with `--refContigs`. If you still want to keep the other contigs, add `--otherContig chrOther` (see explanation below).
+### Self-Alignment and the Collapse Option
+
+The `--collapse` option, added as an experimental prototype in [v2.9.1](https://github.com/ComparativeGenomicsToolkit/cactus/releases/tag/v2.9.1), can be used to incorporate self-alignments into the pangenome, *including the reference sample*. This will produce a more compact graph with, for example, tandem duplications being represented as cycles (like PGGB) rather than insertions (like minigraph). It also drops the invariant (see above) that the reference path be acyclic -- with this option, the reference path can contain cycles. The `--collapse` option is implemented by running `minimap2` using options found in the `minimapCollapseOptions` parameter in the configuration XML to align each input *contig* with itself. These self-alignments are fed into Cactus alongside the usual sequence-to-minigraph alignments.
+
#### Multiple Reference Samples
The `--reference` option can accept multiple samples (separated by space). If multiple samples are specified beyond the first, they will be clipped as usual, but end up as "reference-sense" paths in the vg/gbz output. They can also be used as basis for VCF, and VCF files can be created based on them with the `--vcfReference` sample.
diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml
index f3c9caa61..b3a952d57 100644
--- a/src/cactus/cactus_progressive_config.xml
+++ b/src/cactus/cactus_progressive_config.xml
@@ -320,6 +320,9 @@
+
+
+
@@ -334,13 +337,17 @@
-
+
+
diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py
index 74cf0f695..6b21eea96 100644
--- a/src/cactus/refmap/cactus_graphmap.py
+++ b/src/cactus/refmap/cactus_graphmap.py
@@ -52,7 +52,9 @@ def main():
parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
parser.add_argument("--reference", nargs='+', type=str, help = "Reference genome name. MAPQ filter will not be applied to it")
parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)")
- parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration")
+ parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration")
+ parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False)
+ parser.add_argument("--collapseRefPAF", help ="Incorporate given (reference-only) self-alignments in PAF format [Experimental]")
#WDL hacks
parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
@@ -90,6 +92,14 @@ def main():
if options.reference:
options.reference = options.reference[0]
+ if options.collapseRefPAF:
+ if not options.collapseRefPAF.endswith('.paf'):
+ raise RuntimeError('file passed to --collapseRefPAF must end with .paf')
+ if not options.reference:
+ raise RuntimeError('--reference must be used with --collapseRefPAF')
+ if options.collapse:
+ raise RuntimeError('--collapseRefPAF cannot be used with --collapse')
+
# Mess with some toil options to create useful defaults.
cactus_override_toil_options(options)
@@ -101,6 +111,7 @@ def main():
run_time = end_time - start_time
logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
+
def graph_map(options):
with Toil(options) as toil:
importSingularityImage(options)
@@ -137,6 +148,12 @@ def graph_map(options):
if options.batchSystem.lower() in ['single_machine', 'singleMachine']:
mg_cores = min(mg_cores, cactus_cpu_count(), int(options.maxCores) if options.maxCores else sys.maxsize)
findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores)
+
+ if options.collapse:
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all'
+ if options.collapseRefPAF:
+ assert options.reference
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference'
# get the minigraph "virutal" assembly name
graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_")
@@ -169,6 +186,12 @@ def graph_map(options):
logger.info("Importing {}".format(options.minigraphGFA))
gfa_id = toil.importFile(makeURL(options.minigraphGFA))
+ #import the reference collapse paf
+ ref_collapse_paf_id = None
+ if options.collapseRefPAF:
+ logger.info("Importing {}".format(options.collapseRefPAF))
+ ref_collapse_paf_id = toil.importFile(options.collapseRefPAF)
+
#import the sequences (that we need to align for the given event, ie leaves and outgroups)
seq_id_map = {}
fa_id_map = {}
@@ -185,7 +208,7 @@ def graph_map(options):
# run the workflow
paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log, paf_was_filtered = toil.start(Job.wrapJobFn(
- minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event))
+ minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event, True, ref_collapse_paf_id))
#export the paf
toil.exportFile(paf_id, makeURL(options.outputPAF))
@@ -203,7 +226,7 @@ def graph_map(options):
if options.outputFasta:
add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event)
-def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize=True):
+def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize, ref_collapse_paf_id):
""" Overall workflow takes command line options and returns (paf-id, (optional) fa-id) """
fa_id = None
gfa_id_size = gfa_id.size
@@ -222,6 +245,11 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa
sanitize_job = root_job.addChildJobFn(sanitize_fasta_headers, seq_id_map, pangenome=True)
seq_id_map = sanitize_job.rv()
+ # add unique prefixes to the input PAF
+ if ref_collapse_paf_id:
+ ref_collapse_paf_id = root_job.addChildJobFn(add_paf_prefixes, ref_collapse_paf_id, options.reference,
+ disk=2*ref_collapse_paf_id.size).rv()
+
zipped_gfa = options.minigraphGFA.endswith('.gz')
if options.outputFasta:
# convert GFA to fasta
@@ -239,6 +267,7 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa
paf_job = Job.wrapJobFn(minigraph_map_all, config, gfa_id, seq_id_map, graph_event)
root_job.addFollowOn(paf_job)
+ collapse_paf_id = ref_collapse_paf_id
if options.reference:
# extract a PAF directly from the rGFAs tag for the given reference
# if --refFromGFA is specified, we get the entire alignment from that, otherwise we just take contigs
@@ -249,7 +278,16 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa
root_job.addChild(gfa2paf_job)
else:
paf_job.addFollowOn(gfa2paf_job)
- merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size)
+ collapse_mode = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none")
+ if collapse_mode in ['reference', 'all', 'nonref']:
+ collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference, collapse_mode)
+
+ if ref_collapse_paf_id:
+ collapse_paf_id = collapse_job.addFollowOnJobFn(merge_pafs, {"1":collapse_job.rv(), "2":ref_collapse_paf_id},
+ disk=gfa_id_size).rv()
+ else:
+ collapse_paf_id = collapse_job.rv()
+ merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size)
paf_job.addFollowOn(merge_paf_job)
gfa2paf_job.addFollowOn(merge_paf_job)
out_paf_id = merge_paf_job.rv()
@@ -274,12 +312,27 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa
out_paf_id = del_filter_job.rv(0)
filtered_paf_log = del_filter_job.rv(1)
paf_was_filtered = del_filter_job.rv(2)
+ prev_job = del_filter_job
+
+ if collapse_paf_id:
+ # note: the collapse paf doesn't get merged into unfiltered_paf
+ merge_collapse_job = prev_job.addFollowOnJobFn(merge_pafs, {"1" : out_paf_id, "2" : collapse_paf_id}, disk=gfa_id_size)
+ out_paf_id = merge_collapse_job.rv()
return out_paf_id, fa_id if options.outputFasta else None, paf_job.rv(1), unfiltered_paf_id, filtered_paf_log, paf_was_filtered
-
+
+def add_paf_prefixes(job, paf_id, name):
+ """ Add prefixes to paf """
+ work_dir = job.fileStore.getLocalTempDir()
+ paf_path = os.path.join(work_dir, name + ".paf")
+ job.fileStore.readGlobalFile(paf_id, paf_path)
+ renamed_paf_path = os.path.join(work_dir, name + '.prefixed.paf')
+ cmd = ['awk', 'BEGIN{{OFS=\" \"}} {{$1="id={}|"$1; $6="id={}|"$6; print}}'.format(name, name), paf_path]
+ cactus_call(parameters=cmd, outfile=renamed_paf_path)
+ return job.fileStore.writeGlobalFile(renamed_paf_path)
+
def make_minigraph_fasta(job, gfa_file_id, gfa_file_path, name):
""" Use gfatools to make the minigraph "assembly" """
-
# note: using the toil-vg convention of naming working files manually so that logging is more readable
work_dir = job.fileStore.getLocalTempDir()
gfa_path = os.path.join(work_dir, "mg.gfa")
@@ -430,6 +483,54 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p
cactus_call(parameters=cmd, outfile=paf_path)
return job.fileStore.writeGlobalFile(paf_path)
+def self_align_all(job, config, seq_id_map, reference, collapse_mode):
+ """ run self-alignment. if reference event given, just run on that, otherwise do all genomes """
+ assert collapse_mode in ['reference', 'all', 'nonref']
+ assert reference or collapse_mode == 'all'
+ root_job = Job()
+ job.addChild(root_job)
+ events = []
+ for event in seq_id_map.keys():
+ if collapse_mode == 'all' or (collapse_mode == 'reference' and event == reference) or \
+ (collapse_mode == 'nonref' and event != reference):
+ events.append(event)
+ mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1)
+ paf_dict = {}
+ paf_size = 0
+ for event in events:
+ paf_size += seq_id_map[event].size
+ collapse_job = root_job.addChildJobFn(self_align, config, event, seq_id_map[event],
+ disk=4*seq_id_map[event].size,
+ memory=4*seq_id_map[event].size,
+ cores=mg_cores)
+ paf_dict[event] = collapse_job.rv()
+
+ merge_paf_job = root_job.addFollowOnJobFn(merge_pafs, paf_dict,
+ disk=4*paf_size)
+ return merge_paf_job.rv()
+
+def self_align(job, config, seq_name, seq_id):
+ """ run minimap2 self alignment on a single genome """
+ work_dir = job.fileStore.getLocalTempDir()
+ seq_name = seq_name.split('|')[-1]
+ fa_path = os.path.join(work_dir, seq_name + '.fa')
+ job.fileStore.readGlobalFile(seq_id, fa_path)
+ cactus_call(parameters=['samtools', 'faidx', fa_path])
+ contigs = []
+ with open(fa_path + '.fai', 'r') as fai_file:
+ for line in fai_file:
+ if line.strip():
+ contigs.append(line.split()[0])
+ paf_path = os.path.join(work_dir, seq_name + '.self.paf')
+ xml_node = findRequiredNode(config.xmlRoot, "graphmap")
+ minimap_opts = getOptionalAttrib(xml_node, "minimapCollapseOptions", str, default="")
+ for contig in contigs:
+ contig_path = os.path.join(work_dir, '{}.{}.fa'.format(seq_name, contig))
+ cactus_call(parameters=['samtools', 'faidx', fa_path, contig, '-o', contig_path])
+ cmd = ['minimap2', '-t', str(job.cores)] + minimap_opts.split() + [contig_path, contig_path]
+ cactus_call(parameters=cmd, outfile=paf_path, outappend=True)
+ return job.fileStore.writeGlobalFile(paf_path)
+
def filter_paf(job, paf_id, config, reference=None):
""" run basic paf-filtering. these are quick filters that are best to do on-the-fly when reading the paf and
as such, they are called by cactus-graphmap-split and cactus-align, not here """
@@ -441,15 +542,21 @@ def filter_paf(job, paf_id, config, reference=None):
min_block = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minGAFBlockLength", typeFn=int, default=0)
min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ", typeFn=int, default=0)
min_ident = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minIdentity", typeFn=float, default=0)
- RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={}".format(min_block, min_mapq, min_ident))
+ min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0)
+ max_collapse_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "maxCollapseDistanceRatio", typeFn=float, default=-1)
+ RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={} minScore={} maxCollapseDistanceRatio={}".format(min_block, min_mapq, min_ident, min_score, max_collapse_ratio))
with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file:
for line in paf_file:
toks = line.split('\t')
- is_ref = reference and toks[0].startswith('id={}|'.format(reference))
+ query_name = toks[0]
+ target_name = toks[5]
+ is_ref = reference and query_name.startswith('id={}|'.format(reference)) and query_name != target_name
mapq = int(toks[11])
query_len = int(toks[1])
ident = float(toks[9]) / (float(toks[10]) + 0.00000001)
bl = None
+ score = None
+ collapse_ratio = None
for tok in toks[12:]:
# this is a special tag that was written by gaf2paf in order to preserve the original gaf block length
# we use it to be able to filter by the gaf block even after it's been broken in the paf
@@ -458,12 +565,26 @@ def filter_paf(job, paf_id, config, reference=None):
# we can also get the identity of the parent gaf block
if tok.startswith('gi:i:'):
ident = min(ident, float(toks[5:]))
- if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident):
+ if tok.startswith('AS:i:'):
+ score = int(tok[5:])
+ if query_name == target_name and max_collapse_ratio >= 0:
+ # compute the distance between the minimap2 self alignment intervals
+ # in order to see if they are too far apart via the max ratio
+ query_start, query_end = int(toks[2]), int(toks[3])
+ target_start, target_end = int(toks[7]), int(toks[8])
+ block_length = int(toks[10])
+ dist = target_start - query_end if query_start < target_start else query_start - target_end
+ if block_length > 0 and dist > 0:
+ collapse_ratio = dist / block_length
+ if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \
+ (score is None or score >= min_score) and (collapse_ratio is None or collapse_ratio <= max_collapse_ratio)):
filter_paf_file.write(line)
overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0)
- length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0)
- if overlap_ratio:
+ length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0)
+ allow_collapse = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") != "none"
+
+ if overlap_ratio and not allow_collapse:
overlap_filter_paf_path = filter_paf_path + ".overlap"
cactus_call(parameters=['gaffilter', filter_paf_path, '-p', '-r', str(overlap_ratio), '-m', str(length_ratio),
'-b', str(min_block), '-q', str(min_mapq), '-i', str(min_ident)],
diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py
index 6dc791ed4..dfcdaad51 100644
--- a/src/cactus/refmap/cactus_graphmap_join.py
+++ b/src/cactus/refmap/cactus_graphmap_join.py
@@ -149,6 +149,8 @@ def graphmap_join_options(parser):
help="Memory in bytes for each indexing and vcf construction job job (defaults to an estimate based on the input data size). If specified will also be used to upper-bound per-chromosome memory estimates -- ie no job will request more than this much memory."
"Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None)
+ parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False)
+
def graphmap_join_validate_options(options):
""" make sure the options make sense and fill in sensible defaults """
if options.hal and len(options.hal) != len(options.vg):
@@ -347,6 +349,9 @@ def graphmap_join(options):
configNode = ET.parse(options.configFile).getroot()
config = ConfigWrapper(configNode)
config.substituteAllPredefinedConstantsWithLiterals(options)
+
+ if options.collapse:
+ findRequiredNode(configNode, "graphmap").attrib["collapse"] = 'all'
# load up the vgs
vg_ids = []
@@ -701,6 +706,10 @@ def clip_vg(job, options, config, vg_path, vg_id, phase):
clip_vg_cmd += ['-a', graph_event]
clip_vg_cmd += ['-o', clipped_bed_path]
+ # disable reference cycle check if desiired
+ if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ["all", "reference"]:
+ clip_vg_cmd += ['-c']
+
cmd.append(clip_vg_cmd)
# enforce chopping
@@ -860,6 +869,8 @@ def vg_to_gfa(job, options, config, vg_path, vg_id, unchopped=False):
input_path = '-' if unchopped else os.path.basename(vg_path)
cmd = ['vg', 'convert', '-f', '-Q', options.reference[0], input_path, '-B']
+ if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") not in ["all", "reference"]:
+ cmd += ['-Q', options.reference[0], '-B']
if unchopped:
cmd = [['vg', 'mod', '-u', os.path.basename(vg_path)], cmd]
diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py
index 3cd4de952..1144631b1 100644
--- a/src/cactus/refmap/cactus_graphmap_split.py
+++ b/src/cactus/refmap/cactus_graphmap_split.py
@@ -55,6 +55,8 @@ def main():
parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp")
parser.add_argument("--minIdentity", type=float, help = "Ignore PAF lines with identity (column 10/11) < this (overrides minIdentity in in config)")
parser.add_argument("--permissiveContigFilter", nargs='?', const='0.25', default=None, type=float, help = "If specified, override the configuration to accept contigs so long as they have at least given fraction of coverage (0.25 if no fraction specified). This can increase sensitivity of very small, fragmented and/or diverse assemblies.")
+
+ parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False)
#Progressive Cactus Options
parser.add_argument("--configFile", dest="configFile",
@@ -103,6 +105,9 @@ def cactus_graphmap_split(options):
config = ConfigWrapper(config_node)
config.substituteAllPredefinedConstantsWithLiterals(options)
+ if options.collapse:
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all'
+
#Run the workflow
if options.restart:
wf_output = toil.restart()
diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py
index c21e0736b..a14301362 100644
--- a/src/cactus/refmap/cactus_pangenome.py
+++ b/src/cactus/refmap/cactus_pangenome.py
@@ -81,7 +81,10 @@ def main():
parser.add_argument("--consMemory", type=human2bytesN,
help="Memory in bytes for each cactus_consolidated job (defaults to an estimate based on the input data size). "
"Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None)
-
+
+ # cactus-graphmap options
+ parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format")
+
# cactus-graphmap-join options
graphmap_join_options(parser)
@@ -153,6 +156,14 @@ def main():
if options.maxCores is not None and options.maxCores < 2**20 and options.consCores > int(options.maxCores):
raise RuntimeError('--consCores must be <= --maxCores')
+ if options.collapseRefPAF:
+ if not options.collapseRefPAF.endswith('.paf'):
+ raise RuntimeError('file passed to --collapseRefPAF must end with .paf')
+ if not options.reference:
+ raise RuntimeError('--reference must be used with --collapseRefPAF')
+ if options.collapse:
+ raise RuntimeError('--collapseRefPAF cannot be used with --collapse')
+
# Sort out the graphmap-join options, which can be rather complex
# pass in dummy values for now, they will get filled in later
# (but we want to do as much error-checking upfront as possible)
@@ -198,7 +209,18 @@ def main():
else:
if not options.mgCores:
raise RuntimeError("--mgCores required run *not* running on single machine batch system")
-
+
+ if options.collapse:
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all'
+
+ #import the reference collapse paf
+ ref_collapse_paf_id = None
+ if options.collapseRefPAF:
+ logger.info("Importing {}".format(options.collapseRefPAF))
+ ref_collapse_paf_id = toil.importFile(options.collapseRefPAF)
+ assert options.reference
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference'
+
#import the sequences
input_seq_id_map = {}
input_path_map = {}
@@ -217,7 +239,7 @@ def main():
if genome not in options.reference:
input_seq_order.append(genome)
- toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, input_seq_id_map, input_path_map, input_seq_order))
+ toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, input_seq_id_map, input_path_map, input_seq_order, ref_collapse_paf_id))
end_time = timeit.default_timer()
run_time = end_time - start_time
@@ -315,7 +337,7 @@ def export_join_wrapper(job, options, wf_output):
""" toil join wrapper for cactus_graphmap_join """
export_join_data(job.fileStore, options, wf_output[0], wf_output[1], wf_output[2], wf_output[3], wf_output[4], wf_output[5])
-def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order):
+def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order, ref_collapse_paf_id):
""" chain the entire workflow together, doing exports after each step to mitigate annoyance of failures """
root_job = Job()
job.addChild(root_job)
@@ -341,7 +363,7 @@ def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_
options.outputFasta = gfa_fa_path
graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_")
- graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, sanitize=False)
+ graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, False, ref_collapse_paf_id)
paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log = graphmap_job.rv(0), graphmap_job.rv(1), graphmap_job.rv(2), graphmap_job.rv(3), graphmap_job.rv(4)
graphmap_export_job = graphmap_job.addFollowOnJobFn(export_graphmap_wrapper, options, paf_id, paf_path, gaf_id, unfiltered_paf_id, paf_filter_log)
diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py
index b49d27444..cb433aead 100644
--- a/src/cactus/setup/cactus_align.py
+++ b/src/cactus/setup/cactus_align.py
@@ -59,6 +59,9 @@ def main():
parser.add_argument("--pangenome", action="store_true",
help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings."
" The overridden configuration will be saved in .pg-conf.xml")
+
+ parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False)
+
parser.add_argument("--singleCopySpecies", type=str,
help="Filter out all self-alignments in given species")
parser.add_argument("--barMaskFilter", type=int, default=None,
@@ -165,7 +168,7 @@ def main():
# Mess with some toil options to create useful defaults.
cactus_override_toil_options(options)
-
+
logger.info('Cactus Command: {}'.format(' '.join(sys.argv)))
logger.info('Cactus Commit: {}'.format(cactus_commit))
start_time = timeit.default_timer()
@@ -256,6 +259,8 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None):
config_wrapper = ConfigWrapper(config_node)
config_wrapper.substituteAllPredefinedConstantsWithLiterals(options)
config_wrapper.initGPU(options)
+ if options.collapse:
+ findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all'
config_wrapper.setSystemMemory(options)
mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper,
@@ -331,6 +336,7 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None):
poaNode.attrib["partialOrderAlignmentDisableSeeding"] = "1"
# import the PAF alignments
+ logger.info("Importing {}".format(options.pafFile))
paf_id = toil.importFile(makeURL(options.pafFile))
#import the sequences
@@ -396,8 +402,9 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map,
sub_tree = get_subtree(mc_tree, root_name, config_wrapper, og_map, include_outgroups=False)
# run the hal export
+ allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ['reference', 'all']
hal_job = cons_job.addFollowOnJobFn(export_hal, sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True,
- checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents else None,
+ checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents and not allow_collapse else None,
memory_override=cons_memory)
# clean out some of the intermediate jobstore files
diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py
index 315ae2667..4af78df69 100644
--- a/src/cactus/shared/common.py
+++ b/src/cactus/shared/common.py
@@ -1180,7 +1180,7 @@ def get_faidx_subpath_rename_cmd():
happy. But it does require conversion going into vg which wants chr[9-15] and
hal2vg is updated to do this autmatically
"""
- return ['sed', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e']
+ return ['sed', '-e', 's/\([^:]*\):\([0-9]+\)-\([0-9]+\)/echo "\\1_sub_$((\\2-1))_\\3"/e']
def clean_jobstore_files(job, file_id_maps=None, file_ids=None):
""" clean some intermediate files that are no longer needed out of the jobstore """
diff --git a/test/evolverTest.py b/test/evolverTest.py
index d3ac147ca..4789fb613 100644
--- a/test/evolverTest.py
+++ b/test/evolverTest.py
@@ -441,7 +441,7 @@ def _run_evolver_primates_pangenome(self, binariesMode):
out_dir = os.path.dirname(self._out_hal(binariesMode))
out_name = os.path.splitext(os.path.basename(self._out_hal(binariesMode)))[0]
cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), seq_file_path, '--reference', 'simHuman', 'simChimp',
- '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo']
+ '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo', '--collapse']
subprocess.check_call(cactus_pangenome_cmd + cactus_opts)
# cactus-pangenome tacks on the .full to the output name