From 215daa60eb7c579daecd2df28ccd83f3ffc21161 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Sat, 10 Aug 2024 14:43:12 -0400 Subject: [PATCH 01/18] start work on self-alignment option in pangenome --- build-tools/downloadPangenomeTools | 2 +- src/cactus/cactus_progressive_config.xml | 6 +++ src/cactus/refmap/cactus_graphmap.py | 50 +++++++++++++++++++++-- src/cactus/refmap/cactus_graphmap_join.py | 12 +++++- src/cactus/refmap/cactus_pangenome.py | 3 ++ src/cactus/setup/cactus_align.py | 6 ++- 6 files changed, 72 insertions(+), 7 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 1ea363317..46064e5a1 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout 0c17bc4ae9a7cf174fa40805cde7f8f1f6de8225 +git checkout 7d8599ab603a2dffc86be6155073c2b703a3b298 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index e5f363980..17e7766bf 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -320,6 +320,8 @@ + + @@ -341,6 +343,8 @@ assemblyName="_MINIGRAPH_" minigraphMapOptions="-c -xasm" minigraphConstructOptions="-c -xggs" + minimapCollapseOptions="-c -PD -k19 -w19 -m200" + minimapCollapseMode="none" minigraphConstructBatchSize="50" minigraphSortInput="mash" minMAPQ="5" @@ -397,6 +401,7 @@ + diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 74cf0f695..2ff19619e 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -52,7 +52,8 @@ def main(): parser.add_argument("--outputGAFDir", type=str, help = "Output GAF alignments (raw minigraph output before PAF conversion) to this directory") parser.add_argument("--reference", nargs='+', type=str, help = "Reference genome name. MAPQ filter will not be applied to it") parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)") - parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") + parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) #WDL hacks parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile") @@ -90,6 +91,11 @@ def main(): if options.reference: options.reference = options.reference[0] + if options.collapse and options.collapse not in ['reference', 'all', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, none}') + if options.collapse == 'reference' and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference') + # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) @@ -137,6 +143,9 @@ def graph_map(options): if options.batchSystem.lower() in ['single_machine', 'singleMachine']: mg_cores = min(mg_cores, cactus_cpu_count(), int(options.maxCores) if options.maxCores else sys.maxsize) findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores) + + if options.collapse: + findRequiredNode(config_node, "graphmap_join").attrib["minimapCollapseMode"] = options.collapse # get the minigraph "virutal" assembly name graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") @@ -249,7 +258,10 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa root_job.addChild(gfa2paf_job) else: paf_job.addFollowOn(gfa2paf_job) - merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) + if options.collapse in ['reference', 'all']: + collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.collapse, options.reference) + collapse_paf_id = ref_collapse_job.rv() + merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) paf_job.addFollowOn(merge_paf_job) gfa2paf_job.addFollowOn(merge_paf_job) out_paf_id = merge_paf_job.rv() @@ -274,6 +286,12 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa out_paf_id = del_filter_job.rv(0) filtered_paf_log = del_filter_job.rv(1) paf_was_filtered = del_filter_job.rv(2) + prev_job = del_filter_job + + if options.collapse in ['reference', 'all']: + # note: the collapse paf doesn't get merged into unfiltered_paf + merge_collapse_job = prev_job.addFollowOnJobFn(merge_pafs, {"1" : out_paf_id, "2" : collapse_paf_id}, disk=gfa_id_size) + out_paf_id = merge_collapse_job.rv() return out_paf_id, fa_id if options.outputFasta else None, paf_job.rv(1), unfiltered_paf_id, filtered_paf_log, paf_was_filtered @@ -430,6 +448,28 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p cactus_call(parameters=cmd, outfile=paf_path) return job.fileStore.writeGlobalFile(paf_path) +def ref_self_align(job, config, seq_id_map, reference): + """ run self-alignment of the reference genomee """ + work_dir = job.fileStore.getLocalTempDir() + ref_name = reference.split('|')[-1] + fa_path = os.path.join(work_dir, ref_name + '.fa') + job.fileStore.readGlobalFile(seq_id_map[reference], fa_path) + cactus_call(parameters=['samtools', 'faidx', fa_path]) + ref_contigs = [] + with open(fa_path + '.fai', 'r') as fai_file: + for line in fai_file: + if line.strip(): + ref_contigs.append(line.split()[0]) + paf_path = os.path.join(work_dir, ref_name + '.self.paf') + xml_node = findRequiredNode(config.xmlRoot, "graphmap") + minimap_opts = getOptionalAttrib(xml_node, "minimapCollapseOptions", str, default="") + for ref_contig in ref_contigs: + contig_path = os.path.join(work_dir, '{}.{}.fa'.format(ref_name, ref_contig)) + cactus_call(parameters=['samtools', 'faidx', fa_path, ref_contig, '-o', contig_path]) + cmd = ['minimap2'] + minimap_opts.split() + [contig_path, contig_path] + cactus_call(parameters=cmd, outfile=paf_path, outappend=True) + return job.fileStore.writeGlobalFile(paf_path) + def filter_paf(job, paf_id, config, reference=None): """ run basic paf-filtering. these are quick filters that are best to do on-the-fly when reading the paf and as such, they are called by cactus-graphmap-split and cactus-align, not here """ @@ -462,8 +502,10 @@ def filter_paf(job, paf_id, config, reference=None): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) - length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0) - if overlap_ratio: + length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0) + allow_collapse = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False) + + if overlap_ratio and not allow_collapse: overlap_filter_paf_path = filter_paf_path + ".overlap" cactus_call(parameters=['gaffilter', filter_paf_path, '-p', '-r', str(overlap_ratio), '-m', str(length_ratio), '-b', str(min_block), '-q', str(min_mapq), '-i', str(min_ident)], diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index c424e073c..66ddcae78 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -148,7 +148,10 @@ def graphmap_join_options(parser): "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) parser.add_argument("--chop", type=int, nargs='?', const=1024, default=None, - help="chop all graph nodes to be at most this long (default=1024 specified without value). By default, nodes are only chopped for GBZ-derived formats, but left unchopped in the GFA, VCF, etc. If this option is used, the GBZ and GFA should have consistent IDs") + help="chop all graph nodes to be at most this long (default=1024 specified without value). By default, nodes are only chopped for GBZ-derived formats, but left unchopped in the GFA, VCF, etc. If this option is used, the GBZ and GFA should have consistent IDs") + + parser.add_argument("--refCollapse", action='store_true', help = "Do not filter out self-alignments") + def graphmap_join_validate_options(options): """ make sure the options make sense and fill in sensible defaults """ @@ -336,6 +339,9 @@ def graphmap_join(options): configNode = ET.parse(options.configFile).getroot() config = ConfigWrapper(configNode) config.substituteAllPredefinedConstantsWithLiterals(options) + + if options.refCollapse: + findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" # load up the vgs vg_ids = [] @@ -674,6 +680,10 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): clip_vg_cmd += ['-a', graph_event] clip_vg_cmd += ['-o', clipped_bed_path] + # disable cycle check if desiired + if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False): + clip_vg_cmd += ['-c'] + cmd.append(clip_vg_cmd) if options.reference: # GFAFfix can leave uncovered nodes with --dont_collapse. We filter out here so they dont cause trouble later diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index c21e0736b..5802366b3 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -198,6 +198,9 @@ def main(): else: if not options.mgCores: raise RuntimeError("--mgCores required run *not* running on single machine batch system") + + if options.refCollapse: + findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" #import the sequences input_seq_id_map = {} diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index b49d27444..0f3cc4bf0 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -59,6 +59,7 @@ def main(): parser.add_argument("--pangenome", action="store_true", help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings." " The overridden configuration will be saved in .pg-conf.xml") + parser.add_argument("--refCollapse", action='store_true', help = "Do not filter out self-alignments in --pangenome mode") parser.add_argument("--singleCopySpecies", type=str, help="Filter out all self-alignments in given species") parser.add_argument("--barMaskFilter", type=int, default=None, @@ -256,6 +257,8 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): config_wrapper = ConfigWrapper(config_node) config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.initGPU(options) + if options.refCollapse: + findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" config_wrapper.setSystemMemory(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper, @@ -396,8 +399,9 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, sub_tree = get_subtree(mc_tree, root_name, config_wrapper, og_map, include_outgroups=False) # run the hal export + allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False) hal_job = cons_job.addFollowOnJobFn(export_hal, sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True, - checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents else None, + checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents and not allow_collapse else None, memory_override=cons_memory) # clean out some of the intermediate jobstore files From 7db4e1267080f24861fd6e0d503d721bb1b48653 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 12 Aug 2024 12:29:22 -0400 Subject: [PATCH 02/18] clean up collpase options --- src/cactus/refmap/cactus_graphmap.py | 52 ++++++++++++++++------- src/cactus/refmap/cactus_graphmap_join.py | 5 +++ src/cactus/refmap/cactus_pangenome.py | 17 ++++++-- src/cactus/setup/cactus_align.py | 1 + 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 2ff19619e..843c2928d 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -145,7 +145,7 @@ def graph_map(options): findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores) if options.collapse: - findRequiredNode(config_node, "graphmap_join").attrib["minimapCollapseMode"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["minimapCollapseMode"] = options.collapse # get the minigraph "virutal" assembly name graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") @@ -259,8 +259,9 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa else: paf_job.addFollowOn(gfa2paf_job) if options.collapse in ['reference', 'all']: - collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.collapse, options.reference) - collapse_paf_id = ref_collapse_job.rv() + collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, + options.reference if options.collapse == 'reference' else None) + collapse_paf_id = collapse_job.rv() merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) paf_job.addFollowOn(merge_paf_job) gfa2paf_job.addFollowOn(merge_paf_job) @@ -448,25 +449,46 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p cactus_call(parameters=cmd, outfile=paf_path) return job.fileStore.writeGlobalFile(paf_path) -def ref_self_align(job, config, seq_id_map, reference): - """ run self-alignment of the reference genomee """ +def self_align_all(job, config, seq_id_map, reference): + """ run self-alignment. if reference event given, just run on that, otherwise do all genomes """ + root_job = Job() + job.addChild(root_job) + events = [reference] if reference else seq_id_map.keys() + mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1) + paf_dict = {} + paf_size = 0 + for event in events: + paf_size += seq_id_map[event].size + collapse_job = root_job.addChildJobFn(self_align, config, event, seq_id_map[event], + disk=4*seq_id_map[event].size, + cores=mg_cores) + paf_dict[event] = collapse_job.rv() + + merge_paf_job = root_job.addFollowOnJobFn(merge_pafs, paf_dict, + disk=4*paf_size) + return merge_paf_job.rv() + +def self_align(job, config, seq_name, seq_id): + """ run minimap2 self alignment on a single genome """ work_dir = job.fileStore.getLocalTempDir() - ref_name = reference.split('|')[-1] - fa_path = os.path.join(work_dir, ref_name + '.fa') - job.fileStore.readGlobalFile(seq_id_map[reference], fa_path) + seq_name = seq_name.split('|')[-1] + fa_path = os.path.join(work_dir, seq_name + '.fa') + job.fileStore.readGlobalFile(seq_id, fa_path) cactus_call(parameters=['samtools', 'faidx', fa_path]) - ref_contigs = [] + contigs = [] with open(fa_path + '.fai', 'r') as fai_file: for line in fai_file: if line.strip(): - ref_contigs.append(line.split()[0]) - paf_path = os.path.join(work_dir, ref_name + '.self.paf') + contigs.append(line.split()[0]) + paf_path = os.path.join(work_dir, seq_name + '.self.paf') xml_node = findRequiredNode(config.xmlRoot, "graphmap") minimap_opts = getOptionalAttrib(xml_node, "minimapCollapseOptions", str, default="") - for ref_contig in ref_contigs: - contig_path = os.path.join(work_dir, '{}.{}.fa'.format(ref_name, ref_contig)) - cactus_call(parameters=['samtools', 'faidx', fa_path, ref_contig, '-o', contig_path]) - cmd = ['minimap2'] + minimap_opts.split() + [contig_path, contig_path] + for contig in contigs: + contig_path = os.path.join(work_dir, '{}.{}.fa'.format(seq_name, contig)) + cactus_call(parameters=['samtools', 'faidx', fa_path, contig, '-o', contig_path]) + cmd = ['minimap2', '-t', str(job.cores)] + minimap_opts.split() + [contig_path, contig_path] + # hack output to have have quality and primary or it will just get filtered out downstream + cmd = [cmd, ['sed', '-e', 's/tp:A:S/tp:A:P/g'], ['awk', 'OFS="\t" {$12="60"; print}']] cactus_call(parameters=cmd, outfile=paf_path, outappend=True) return job.fileStore.writeGlobalFile(paf_path) diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 66ddcae78..daaa32143 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -324,6 +324,11 @@ def graphmap_join_validate_options(options): if options.filter and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.filter = None + if options.collapse and options.collapse not in ['reference', 'all', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, none}') + if options.collapse == 'reference' and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference') + return options def graphmap_join(options): diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index 5802366b3..d5c9e81e0 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -81,7 +81,10 @@ def main(): parser.add_argument("--consMemory", type=human2bytesN, help="Memory in bytes for each cactus_consolidated job (defaults to an estimate based on the input data size). " "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) - + + # cactus-graphmap options + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) + # cactus-graphmap-join options graphmap_join_options(parser) @@ -153,6 +156,11 @@ def main(): if options.maxCores is not None and options.maxCores < 2**20 and options.consCores > int(options.maxCores): raise RuntimeError('--consCores must be <= --maxCores') + if options.collapse and options.collapse not in ['reference', 'all', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, none}') + if options.collapse == 'reference' and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference') + # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later # (but we want to do as much error-checking upfront as possible) @@ -198,10 +206,11 @@ def main(): else: if not options.mgCores: raise RuntimeError("--mgCores required run *not* running on single machine batch system") - - if options.refCollapse: + + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["minimapCollapseMode"] = options.collapse findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" - + #import the sequences input_seq_id_map = {} input_path_map = {} diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index 0f3cc4bf0..33a8e5bcb 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -334,6 +334,7 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): poaNode.attrib["partialOrderAlignmentDisableSeeding"] = "1" # import the PAF alignments + logger.info("Importing {}".format(options.pafFile)) paf_id = toil.importFile(makeURL(options.pafFile)) #import the sequences From f9cc133ac93efc75a644954369fa8340dc4d3030 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 12 Aug 2024 13:25:04 -0400 Subject: [PATCH 03/18] dont make rgfa when allowing cycles --- src/cactus/refmap/cactus_graphmap_join.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index daaa32143..b9b67c72e 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -831,7 +831,9 @@ def vg_to_gfa(job, options, config, vg_path, vg_id): job.fileStore.readGlobalFile(vg_id, vg_path) out_path = vg_path + '.gfa' - cmd = ['vg', 'convert', '-f', '-Q', options.reference[0], os.path.basename(vg_path), '-B'] + cmd = ['vg', 'convert', '-f', os.path.basename(vg_path)] + if not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False): + cmd += ['-Q', options.reference[0], '-B'] cactus_call(parameters=cmd, outfile=out_path, work_dir=work_dir, job_memory=job.memory) From 9642186a13cfbe085217a158e06cf5bd4aabdb57 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 12 Aug 2024 14:06:50 -0400 Subject: [PATCH 04/18] update deps --- build-tools/downloadPangenomeTools | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 46064e5a1..28afb1ee9 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout 7d8599ab603a2dffc86be6155073c2b703a3b298 +git checkout c82eb3537a128728315dc31bfbadce1886c232ce make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then @@ -235,7 +235,7 @@ fi # hal2vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/hal2vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/hal2vg chmod +x hal2vg if [[ $STATIC_CHECK -ne 1 || $(ldd hal2vg | grep so | wc -l) -eq 0 ]] then @@ -245,7 +245,7 @@ else fi # clip-vg cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/clip-vg +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/clip-vg chmod +x clip-vg if [[ $STATIC_CHECK -ne 1 || $(ldd clip-vg | grep so | wc -l) -eq 0 ]] then @@ -255,7 +255,7 @@ else fi # halRemoveDupes cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halRemoveDupes +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halRemoveDupes chmod +x halRemoveDupes if [[ $STATIC_CHECK -ne 1 || $(ldd halRemoveDupes | grep so | wc -l) -eq 0 ]] then @@ -265,7 +265,7 @@ else fi # halMergeChroms cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halMergeChroms +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halMergeChroms chmod +x halMergeChroms if [[ $STATIC_CHECK -ne 1 || $(ldd halMergeChroms | grep so | wc -l) -eq 0 ]] then @@ -276,7 +276,7 @@ fi # halUnclip cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/halUnclip +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/halUnclip chmod +x halUnclip if [[ $STATIC_CHECK -ne 1 || $(ldd halUnclip | grep so | wc -l) -eq 0 ]] then @@ -287,7 +287,7 @@ fi # filter-paf-deletions cd ${pangenomeBuildDir} -wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.7/filter-paf-deletions +wget -q https://github.com/ComparativeGenomicsToolkit/hal2vg/releases/download/v1.1.8/filter-paf-deletions chmod +x filter-paf-deletions if [[ $STATIC_CHECK -ne 1 || $(ldd filter-paf-deletions | grep so | wc -l) -eq 0 ]] then From c8707996668a0db0f4174828f704c3d36d5902e4 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 12 Aug 2024 15:24:30 -0400 Subject: [PATCH 05/18] add ref collapse file input --- build-tools/downloadPangenomeTools | 2 +- src/cactus/refmap/cactus_graphmap.py | 46 +++++++++++++++++++++++---- src/cactus/refmap/cactus_pangenome.py | 23 +++++++++++--- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 28afb1ee9..3e4f56850 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout c82eb3537a128728315dc31bfbadce1886c232ce +git checkout d02b569c7960aff92aea67c113767e9628dff841 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 843c2928d..e4bfe3ed7 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -54,6 +54,7 @@ def main(): parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)") parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) + parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") #WDL hacks parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile") @@ -95,6 +96,11 @@ def main(): raise RuntimeError('valid values for --collapse are {reference, all, none}') if options.collapse == 'reference' and not options.reference: raise RuntimeError('--reference must be used with --collapse reference') + if options.collapseRefPAF: + if not options.collapseRefPAF.endswith('.paf'): + raise RuntimeError('file passed to --collapseRefPaf must end with .paf') + if not options.reference: + raise RuntimeError('--reference must be used with --collapseRefPAF') # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) @@ -178,6 +184,12 @@ def graph_map(options): logger.info("Importing {}".format(options.minigraphGFA)) gfa_id = toil.importFile(makeURL(options.minigraphGFA)) + #import the reference collapse paf + ref_collapse_paf_id = None + if options.collapseRefPAF: + logger.info("Importing {}".format(options.collapseRefPAF)) + ref_collapse_paf_id = toil.importFile(options.collapseRefPAF) + #import the sequences (that we need to align for the given event, ie leaves and outgroups) seq_id_map = {} fa_id_map = {} @@ -194,7 +206,7 @@ def graph_map(options): # run the workflow paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log, paf_was_filtered = toil.start(Job.wrapJobFn( - minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event)) + minigraph_workflow, options, config_wrapper, seq_id_map, gfa_id, graph_event, True, ref_collapse_paf_id)) #export the paf toil.exportFile(paf_id, makeURL(options.outputPAF)) @@ -212,7 +224,7 @@ def graph_map(options): if options.outputFasta: add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event) -def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize=True): +def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sanitize, ref_collapse_paf_id): """ Overall workflow takes command line options and returns (paf-id, (optional) fa-id) """ fa_id = None gfa_id_size = gfa_id.size @@ -231,6 +243,11 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa sanitize_job = root_job.addChildJobFn(sanitize_fasta_headers, seq_id_map, pangenome=True) seq_id_map = sanitize_job.rv() + # add unique prefixes to the input PAF + if ref_collapse_paf_id: + ref_collapse_paf_id = root_job.addChildJobFn(add_paf_prefixes, ref_collapse_paf_id, options.reference, + disk=2*ref_collapse_paf_id.size).rv() + zipped_gfa = options.minigraphGFA.endswith('.gz') if options.outputFasta: # convert GFA to fasta @@ -248,6 +265,7 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa paf_job = Job.wrapJobFn(minigraph_map_all, config, gfa_id, seq_id_map, graph_event) root_job.addFollowOn(paf_job) + collapse_paf_id = ref_collapse_paf_id if options.reference: # extract a PAF directly from the rGFAs tag for the given reference # if --refFromGFA is specified, we get the entire alignment from that, otherwise we just take contigs @@ -261,7 +279,12 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa if options.collapse in ['reference', 'all']: collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference if options.collapse == 'reference' else None) - collapse_paf_id = collapse_job.rv() + + if ref_collapse_paf_id: + collapse_paf_id = collapse_job.addFollowOnJobFn(merge_pafs, {"1":collapse_job.rv(), "2":ref_collapse_paf_id}, + disk=gfa_id_size).rv() + else: + collapse_paf_id = collapse_job.rv() merge_paf_job = Job.wrapJobFn(merge_pafs, {"1" : paf_job.rv(0), "2" : gfa2paf_job.rv()}, disk=gfa_id_size) paf_job.addFollowOn(merge_paf_job) gfa2paf_job.addFollowOn(merge_paf_job) @@ -289,16 +312,25 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa paf_was_filtered = del_filter_job.rv(2) prev_job = del_filter_job - if options.collapse in ['reference', 'all']: + if collapse_paf_id: # note: the collapse paf doesn't get merged into unfiltered_paf merge_collapse_job = prev_job.addFollowOnJobFn(merge_pafs, {"1" : out_paf_id, "2" : collapse_paf_id}, disk=gfa_id_size) out_paf_id = merge_collapse_job.rv() return out_paf_id, fa_id if options.outputFasta else None, paf_job.rv(1), unfiltered_paf_id, filtered_paf_log, paf_was_filtered - + +def add_paf_prefixes(job, paf_id, name): + """ Add prefixes to paf """ + work_dir = job.fileStore.getLocalTempDir() + paf_path = os.path.join(work_dir, name + ".paf") + job.fileStore.readGlobalFile(paf_id, paf_path) + renamed_paf_path = os.path.join(work_dir, name + '.prefixed.paf') + cmd = ['awk', 'BEGIN{{OFS=\" \"}} {{$1="id={}|"$1; $6="id={}|"$6; print}}'.format(name, name), paf_path] + cactus_call(parameters=cmd, outfile=renamed_paf_path) + return job.fileStore.writeGlobalFile(renamed_paf_path) + def make_minigraph_fasta(job, gfa_file_id, gfa_file_path, name): """ Use gfatools to make the minigraph "assembly" """ - # note: using the toil-vg convention of naming working files manually so that logging is more readable work_dir = job.fileStore.getLocalTempDir() gfa_path = os.path.join(work_dir, "mg.gfa") @@ -488,7 +520,7 @@ def self_align(job, config, seq_name, seq_id): cactus_call(parameters=['samtools', 'faidx', fa_path, contig, '-o', contig_path]) cmd = ['minimap2', '-t', str(job.cores)] + minimap_opts.split() + [contig_path, contig_path] # hack output to have have quality and primary or it will just get filtered out downstream - cmd = [cmd, ['sed', '-e', 's/tp:A:S/tp:A:P/g'], ['awk', 'OFS="\t" {$12="60"; print}']] + cmd = [cmd, ['sed', '-e', 's/tp:A:S/tp:A:P/g'], ['awk', 'BEGIN{OFS=\" \"} {$12="60"; print}']] cactus_call(parameters=cmd, outfile=paf_path, outappend=True) return job.fileStore.writeGlobalFile(paf_path) diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index d5c9e81e0..e0b1ab887 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -84,6 +84,7 @@ def main(): # cactus-graphmap options parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) + parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") # cactus-graphmap-join options graphmap_join_options(parser) @@ -159,7 +160,12 @@ def main(): if options.collapse and options.collapse not in ['reference', 'all', 'none']: raise RuntimeError('valid values for --collapse are {reference, all, none}') if options.collapse == 'reference' and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference') + raise RuntimeError('--reference must be used with --collapse reference') + if options.collapseRefPAF: + if not options.collapseRefPAF.endswith('.paf'): + raise RuntimeError('file passed to --collapseRefPaf must end with .paf') + if not options.reference: + raise RuntimeError('--reference must be used with --collapseRefPAF') # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later @@ -209,8 +215,15 @@ def main(): if options.collapse: findRequiredNode(config_node, "graphmap").attrib["minimapCollapseMode"] = options.collapse + if options.collapse or options.collapseRefPAF: findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" - + + #import the reference collapse paf + ref_collapse_paf_id = None + if options.collapseRefPAF: + logger.info("Importing {}".format(options.collapseRefPAF)) + ref_collapse_paf_id = toil.importFile(options.collapseRefPAF) + #import the sequences input_seq_id_map = {} input_path_map = {} @@ -229,7 +242,7 @@ def main(): if genome not in options.reference: input_seq_order.append(genome) - toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, input_seq_id_map, input_path_map, input_seq_order)) + toil.start(Job.wrapJobFn(pangenome_end_to_end_workflow, options, config_wrapper, input_seq_id_map, input_path_map, input_seq_order, ref_collapse_paf_id)) end_time = timeit.default_timer() run_time = end_time - start_time @@ -327,7 +340,7 @@ def export_join_wrapper(job, options, wf_output): """ toil join wrapper for cactus_graphmap_join """ export_join_data(job.fileStore, options, wf_output[0], wf_output[1], wf_output[2], wf_output[3], wf_output[4], wf_output[5]) -def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order): +def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_path_map, seq_order, ref_collapse_paf_id): """ chain the entire workflow together, doing exports after each step to mitigate annoyance of failures """ root_job = Job() job.addChild(root_job) @@ -353,7 +366,7 @@ def pangenome_end_to_end_workflow(job, options, config_wrapper, seq_id_map, seq_ options.outputFasta = gfa_fa_path graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") - graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, sanitize=False) + graphmap_job = minigraph_job.addFollowOnJobFn(minigraph_workflow, options, config_wrapper, seq_id_map, sv_gfa_id, graph_event, False, ref_collapse_paf_id) paf_id, gfa_fa_id, gaf_id, unfiltered_paf_id, paf_filter_log = graphmap_job.rv(0), graphmap_job.rv(1), graphmap_job.rv(2), graphmap_job.rv(3), graphmap_job.rv(4) graphmap_export_job = graphmap_job.addFollowOnJobFn(export_graphmap_wrapper, options, paf_id, paf_path, gaf_id, unfiltered_paf_id, paf_filter_log) From aa0e989a5c9040279be848b7046a2f45bca50b09 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 12 Aug 2024 15:30:56 -0400 Subject: [PATCH 06/18] add --collapse to tests --- test/evolverTest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/evolverTest.py b/test/evolverTest.py index 55eb3e8e6..7d6ade8a7 100644 --- a/test/evolverTest.py +++ b/test/evolverTest.py @@ -441,7 +441,7 @@ def _run_evolver_primates_pangenome(self, binariesMode): out_dir = os.path.dirname(self._out_hal(binariesMode)) out_name = os.path.splitext(os.path.basename(self._out_hal(binariesMode)))[0] cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), seq_file_path, '--reference', 'simHuman', 'simChimp', - '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo'] + '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo', '--collapse', 'all'] subprocess.check_call(cactus_pangenome_cmd + cactus_opts) # cactus-pangenome tacks on the .full to the output name From 323b98e76b5ae9b95334cb2a7a72818b7977e0df Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 13 Aug 2024 08:32:53 -0400 Subject: [PATCH 07/18] fix bugs --- src/cactus/refmap/cactus_graphmap.py | 1 + src/cactus/refmap/cactus_graphmap_join.py | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index e4bfe3ed7..ec98ed256 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -493,6 +493,7 @@ def self_align_all(job, config, seq_id_map, reference): paf_size += seq_id_map[event].size collapse_job = root_job.addChildJobFn(self_align, config, event, seq_id_map[event], disk=4*seq_id_map[event].size, + memory=4*seq_id_map[event].size, cores=mg_cores) paf_dict[event] = collapse_job.rv() diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index b9b67c72e..84beb83c9 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -324,11 +324,6 @@ def graphmap_join_validate_options(options): if options.filter and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.filter = None - if options.collapse and options.collapse not in ['reference', 'all', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, none}') - if options.collapse == 'reference' and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference') - return options def graphmap_join(options): From 893806c3939fd17ab58348e547f445df61f39262 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 14 Aug 2024 17:37:38 -0400 Subject: [PATCH 08/18] fix self-alignment options --- src/cactus/cactus_progressive_config.xml | 4 ++-- src/cactus/refmap/cactus_graphmap.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index 17e7766bf..c12958a68 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -320,7 +320,7 @@ - + @@ -343,7 +343,7 @@ assemblyName="_MINIGRAPH_" minigraphMapOptions="-c -xasm" minigraphConstructOptions="-c -xggs" - minimapCollapseOptions="-c -PD -k19 -w19 -m200" + minimapCollapseOptions="-c -D -xasm5 -m500" minimapCollapseMode="none" minigraphConstructBatchSize="50" minigraphSortInput="mash" diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index ec98ed256..e12499b17 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -520,8 +520,6 @@ def self_align(job, config, seq_name, seq_id): contig_path = os.path.join(work_dir, '{}.{}.fa'.format(seq_name, contig)) cactus_call(parameters=['samtools', 'faidx', fa_path, contig, '-o', contig_path]) cmd = ['minimap2', '-t', str(job.cores)] + minimap_opts.split() + [contig_path, contig_path] - # hack output to have have quality and primary or it will just get filtered out downstream - cmd = [cmd, ['sed', '-e', 's/tp:A:S/tp:A:P/g'], ['awk', 'BEGIN{OFS=\" \"} {$12="60"; print}']] cactus_call(parameters=cmd, outfile=paf_path, outappend=True) return job.fileStore.writeGlobalFile(paf_path) From b8c943c0e4f7410c16d9ad1a84c838d4c101e742 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 22 Aug 2024 11:09:29 -0400 Subject: [PATCH 09/18] clean collapse logic into single config option --- src/cactus/cactus_progressive_config.xml | 6 ++--- src/cactus/refmap/cactus_graphmap.py | 29 +++++++++++++--------- src/cactus/refmap/cactus_graphmap_join.py | 17 ++++++++----- src/cactus/refmap/cactus_graphmap_split.py | 10 ++++++++ src/cactus/refmap/cactus_pangenome.py | 11 ++------ src/cactus/setup/cactus_align.py | 15 ++++++++--- 6 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index c12958a68..768ff2979 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -321,7 +321,7 @@ - + @@ -344,7 +344,7 @@ minigraphMapOptions="-c -xasm" minigraphConstructOptions="-c -xggs" minimapCollapseOptions="-c -D -xasm5 -m500" - minimapCollapseMode="none" + collapse="none" minigraphConstructBatchSize="50" minigraphSortInput="mash" minMAPQ="5" @@ -401,7 +401,6 @@ - diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index e12499b17..f51aa88b6 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -53,7 +53,7 @@ def main(): parser.add_argument("--reference", nargs='+', type=str, help = "Reference genome name. MAPQ filter will not be applied to it") parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)") parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") #WDL hacks @@ -92,10 +92,10 @@ def main(): if options.reference: options.reference = options.reference[0] - if options.collapse and options.collapse not in ['reference', 'all', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, none}') - if options.collapse == 'reference' and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference') + if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') + if options.collapse in ['reference', 'nonref'] and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference/nonref') if options.collapseRefPAF: if not options.collapseRefPAF.endswith('.paf'): raise RuntimeError('file passed to --collapseRefPaf must end with .paf') @@ -151,7 +151,7 @@ def graph_map(options): findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["minimapCollapseMode"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse # get the minigraph "virutal" assembly name graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") @@ -276,9 +276,8 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa root_job.addChild(gfa2paf_job) else: paf_job.addFollowOn(gfa2paf_job) - if options.collapse in ['reference', 'all']: - collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, - options.reference if options.collapse == 'reference' else None) + if options.collapse in ['reference', 'all', 'nonref']: + collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference, options.collapse) if ref_collapse_paf_id: collapse_paf_id = collapse_job.addFollowOnJobFn(merge_pafs, {"1":collapse_job.rv(), "2":ref_collapse_paf_id}, @@ -481,11 +480,17 @@ def extract_paf_from_gfa(job, gfa_id, gfa_path, ref_event, graph_event, ignore_p cactus_call(parameters=cmd, outfile=paf_path) return job.fileStore.writeGlobalFile(paf_path) -def self_align_all(job, config, seq_id_map, reference): +def self_align_all(job, config, seq_id_map, reference, collapse_mode): """ run self-alignment. if reference event given, just run on that, otherwise do all genomes """ + assert collapse_mode in ['reference', 'all', 'nonref'] + assert reference or collapse_mode == 'all' root_job = Job() job.addChild(root_job) - events = [reference] if reference else seq_id_map.keys() + events = [] + for event in seq_id_map.keys(): + if collapse_mode == 'all' or (collapse_mode == 'reference' and event == reference) or \ + (collapse_mode == 'nonref' and event != reference): + events.append(event) mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1) paf_dict = {} paf_size = 0 @@ -556,7 +561,7 @@ def filter_paf(job, paf_id, config, reference=None): overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) length_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterMinLengthRatio", typeFn=float, default=0) - allow_collapse = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False) + allow_collapse = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") != "none" if overlap_ratio and not allow_collapse: overlap_filter_paf_path = filter_paf_path + ".overlap" diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 84beb83c9..36119753b 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -150,7 +150,7 @@ def graphmap_join_options(parser): parser.add_argument("--chop", type=int, nargs='?', const=1024, default=None, help="chop all graph nodes to be at most this long (default=1024 specified without value). By default, nodes are only chopped for GBZ-derived formats, but left unchopped in the GFA, VCF, etc. If this option is used, the GBZ and GFA should have consistent IDs") - parser.add_argument("--refCollapse", action='store_true', help = "Do not filter out self-alignments") + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) def graphmap_join_validate_options(options): @@ -324,6 +324,11 @@ def graphmap_join_validate_options(options): if options.filter and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.filter = None + if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') + if options.collapse in ['reference', 'nonref'] and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference/nonref') + return options def graphmap_join(options): @@ -340,8 +345,8 @@ def graphmap_join(options): config = ConfigWrapper(configNode) config.substituteAllPredefinedConstantsWithLiterals(options) - if options.refCollapse: - findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse # load up the vgs vg_ids = [] @@ -680,8 +685,8 @@ def clip_vg(job, options, config, vg_path, vg_id, phase): clip_vg_cmd += ['-a', graph_event] clip_vg_cmd += ['-o', clipped_bed_path] - # disable cycle check if desiired - if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False): + # disable reference cycle check if desiired + if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ["all", "reference"]: clip_vg_cmd += ['-c'] cmd.append(clip_vg_cmd) @@ -827,7 +832,7 @@ def vg_to_gfa(job, options, config, vg_path, vg_id): out_path = vg_path + '.gfa' cmd = ['vg', 'convert', '-f', os.path.basename(vg_path)] - if not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False): + if getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") not in ["all", "reference"]: cmd += ['-Q', options.reference[0], '-B'] cactus_call(parameters=cmd, outfile=out_path, work_dir=work_dir, job_memory=job.memory) diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py index 3cd4de952..b8b50dcd9 100644 --- a/src/cactus/refmap/cactus_graphmap_split.py +++ b/src/cactus/refmap/cactus_graphmap_split.py @@ -55,6 +55,8 @@ def main(): parser.add_argument("--maskFilter", type=int, help = "Ignore softmasked sequence intervals > Nbp") parser.add_argument("--minIdentity", type=float, help = "Ignore PAF lines with identity (column 10/11) < this (overrides minIdentity in in config)") parser.add_argument("--permissiveContigFilter", nargs='?', const='0.25', default=None, type=float, help = "If specified, override the configuration to accept contigs so long as they have at least given fraction of coverage (0.25 if no fraction specified). This can increase sensitivity of very small, fragmented and/or diverse assemblies.") + + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", @@ -86,6 +88,11 @@ def main(): if options.reference: options.reference = options.reference[0] + if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') + if options.collapse in ['reference', 'nonref'] and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference/nonref') + logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) logger.info('Cactus Commit: {}'.format(cactus_commit)) start_time = timeit.default_timer() @@ -103,6 +110,9 @@ def cactus_graphmap_split(options): config = ConfigWrapper(config_node) config.substituteAllPredefinedConstantsWithLiterals(options) + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + #Run the workflow if options.restart: wf_output = toil.restart() diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index e0b1ab887..9f472663b 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -83,7 +83,6 @@ def main(): "Standard suffixes like K, Ki, M, Mi, G or Gi are supported (default=bytes))", default=None) # cactus-graphmap options - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\" and \"none\"", default=None) parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") # cactus-graphmap-join options @@ -157,15 +156,11 @@ def main(): if options.maxCores is not None and options.maxCores < 2**20 and options.consCores > int(options.maxCores): raise RuntimeError('--consCores must be <= --maxCores') - if options.collapse and options.collapse not in ['reference', 'all', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, none}') - if options.collapse == 'reference' and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference') if options.collapseRefPAF: if not options.collapseRefPAF.endswith('.paf'): raise RuntimeError('file passed to --collapseRefPaf must end with .paf') if not options.reference: - raise RuntimeError('--reference must be used with --collapseRefPAF') + raise RuntimeError('--reference must be used with --collapseRefPAF') # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later @@ -214,9 +209,7 @@ def main(): raise RuntimeError("--mgCores required run *not* running on single machine batch system") if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["minimapCollapseMode"] = options.collapse - if options.collapse or options.collapseRefPAF: - findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" + findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse #import the reference collapse paf ref_collapse_paf_id = None diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index 33a8e5bcb..dc4a1521f 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -59,7 +59,9 @@ def main(): parser.add_argument("--pangenome", action="store_true", help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings." " The overridden configuration will be saved in .pg-conf.xml") - parser.add_argument("--refCollapse", action='store_true', help = "Do not filter out self-alignments in --pangenome mode") + + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) + parser.add_argument("--singleCopySpecies", type=str, help="Filter out all self-alignments in given species") parser.add_argument("--barMaskFilter", type=int, default=None, @@ -166,6 +168,11 @@ def main(): # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) + + if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: + raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') + if options.collapse in ['reference', 'nonref'] and not options.reference: + raise RuntimeError('--reference must be used with --collapse reference/nonref') logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) logger.info('Cactus Commit: {}'.format(cactus_commit)) @@ -257,8 +264,8 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): config_wrapper = ConfigWrapper(config_node) config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.initGPU(options) - if options.refCollapse: - findRequiredNode(config_node, "graphmap_join").attrib["allowRefCollapse"] = "1" + if options.collapse: + findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse config_wrapper.setSystemMemory(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper, @@ -400,7 +407,7 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, sub_tree = get_subtree(mc_tree, root_name, config_wrapper, og_map, include_outgroups=False) # run the hal export - allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap_join"), "allowRefCollapse", typeFn=bool, default=False) + allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap_join"), "allowCollapse", typeFn=bool, default=False) hal_job = cons_job.addFollowOnJobFn(export_hal, sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True, checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents and not allow_collapse else None, memory_override=cons_memory) From 090b35a547f585e5b78aae51e0d63f87b5347a43 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 23 Aug 2024 11:10:49 -0400 Subject: [PATCH 10/18] update cactus-gfa-tools --- build-tools/downloadPangenomeTools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 3e4f56850..883d6ada4 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout d02b569c7960aff92aea67c113767e9628dff841 +git checkout 6fa3157cc2870a459e412f03519f4310a9d12796 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then From e49e700b4c630801c25d86a03d275e8f1b3aba65 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 27 Aug 2024 08:11:00 -0400 Subject: [PATCH 11/18] handle negative minimap2 scores --- src/cactus/cactus_progressive_config.xml | 6 ++++-- src/cactus/refmap/cactus_graphmap.py | 7 ++++++- src/cactus/shared/common.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index 768ff2979..9b97fbaef 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -336,7 +336,8 @@ - + + diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index f51aa88b6..8c0265adb 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -539,6 +539,7 @@ def filter_paf(job, paf_id, config, reference=None): min_block = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minGAFBlockLength", typeFn=int, default=0) min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ", typeFn=int, default=0) min_ident = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minIdentity", typeFn=float, default=0) + min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0) RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={}".format(min_block, min_mapq, min_ident)) with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file: for line in paf_file: @@ -548,6 +549,7 @@ def filter_paf(job, paf_id, config, reference=None): query_len = int(toks[1]) ident = float(toks[9]) / (float(toks[10]) + 0.00000001) bl = None + score = None for tok in toks[12:]: # this is a special tag that was written by gaf2paf in order to preserve the original gaf block length # we use it to be able to filter by the gaf block even after it's been broken in the paf @@ -556,7 +558,10 @@ def filter_paf(job, paf_id, config, reference=None): # we can also get the identity of the parent gaf block if tok.startswith('gi:i:'): ident = min(ident, float(toks[5:])) - if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident): + if tok.startsiwth('AS:i:'): + score = int(tok[5:]) + if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \ + (score is None or score < min_score)): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py index 315ae2667..4af78df69 100644 --- a/src/cactus/shared/common.py +++ b/src/cactus/shared/common.py @@ -1180,7 +1180,7 @@ def get_faidx_subpath_rename_cmd(): happy. But it does require conversion going into vg which wants chr[9-15] and hal2vg is updated to do this autmatically """ - return ['sed', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e'] + return ['sed', '-e', 's/\([^:]*\):\([0-9]+\)-\([0-9]+\)/echo "\\1_sub_$((\\2-1))_\\3"/e'] def clean_jobstore_files(job, file_id_maps=None, file_ids=None): """ clean some intermediate files that are no longer needed out of the jobstore """ From 263174ec13feb4a5117d33db0b9650e4533c83be Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 27 Aug 2024 11:21:57 -0400 Subject: [PATCH 12/18] make --collapse boolean; add to doc; update minimap2 --- build-tools/downloadPangenomeTools | 6 +++--- doc/pangenome.md | 4 ++++ src/cactus/cactus_progressive_config.xml | 2 +- src/cactus/refmap/cactus_graphmap.py | 25 ++++++++++++---------- src/cactus/refmap/cactus_graphmap_join.py | 9 ++------ src/cactus/refmap/cactus_graphmap_split.py | 9 ++------ src/cactus/refmap/cactus_pangenome.py | 8 +++++-- src/cactus/setup/cactus_align.py | 11 +++------- 8 files changed, 35 insertions(+), 39 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 883d6ada4..90ae9162b 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -45,7 +45,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/minimap2.git cd minimap2 -git checkout 581f2d7123f651d04c9e103b1c7ea8f7051e909a +git checkout v2.28 # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} ${MINI_ARM_FLAGS} @@ -60,7 +60,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/minigraph.git pushd minigraph -git checkout v0.20 +git checkout v0.21 # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} @@ -75,7 +75,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/lh3/gfatools.git cd gfatools -git checkout v0.5 +git checkout c31be8a62efc6bdea4576029f7fbe84f345a6eed # hack in flags support sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' make -j ${numcpu} diff --git a/doc/pangenome.md b/doc/pangenome.md index f54d1dee7..3f30c38d6 100644 --- a/doc/pangenome.md +++ b/doc/pangenome.md @@ -133,6 +133,10 @@ The `--reference` option must be used to select a "reference" sample. This samp It is therefore extremely important that the reference sample's assembly be **chromosome** scale. If there are many small contigs in the addition to chromosomes in the reference assembly, then please consider specifying the chromosomes with `--refContigs`. If you still want to keep the other contigs, add `--otherContig chrOther` (see explanation below). +### Self-Alignment and the Collapse Option + +The `--collapse` option, added as an experimental prototype in [v2.9.1](https://github.com/ComparativeGenomicsToolkit/cactus/releases/tag/v2.9.1), can be used to incorporate self-alignments into the pangenome, *including the reference sample*. This will produce a more compact graph with, for example, tandem duplications being represented as cycles (like PGGB) rather than insertions (like minigraph). It also drops the invariant (see above) that the reference path by acyclic -- with this option, the reference path can contain cycles. The `--collapse` option is implemented by running `minimap2` using options found in the `minimapCollapseOptions` parameter in the configuration XML to align each input *contig* with itself. These self alignments are fed into Cactus alongide the usual sequence-to-minigraph alignments. + #### Multiple Reference Samples The `--reference` option can accept multiple samples (separated by space). If multiple samples are specified beyond the first, they will be clipped as usual, but end up as "reference-sense" paths in the vg/gbz output. They can also be used as basis for VCF, and VCF files can be created based on them with the `--vcfReference` sample. diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index 9b97fbaef..df379fabb 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -321,7 +321,7 @@ - + diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 8c0265adb..addf22f55 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -53,8 +53,8 @@ def main(): parser.add_argument("--reference", nargs='+', type=str, help = "Reference genome name. MAPQ filter will not be applied to it") parser.add_argument("--refFromGFA", action="store_true", help = "Do not align reference (--reference) from seqfile, and instead extract its alignment from the rGFA tags (must have been used as reference for minigraph GFA construction)") parser.add_argument("--mapCores", type=int, help = "Number of cores for minigraph. Overrides graphmap cpu in configuration") - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) - parser.add_argument("--collapseRefPAF", help ="Incorporate given reference self-alignments in PAF format") + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) + parser.add_argument("--collapseRefPAF", help ="Incorporate given (reference-only) self-alignments in PAF format [Experimental]") #WDL hacks parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile") @@ -92,15 +92,13 @@ def main(): if options.reference: options.reference = options.reference[0] - if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') - if options.collapse in ['reference', 'nonref'] and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference/nonref') if options.collapseRefPAF: if not options.collapseRefPAF.endswith('.paf'): raise RuntimeError('file passed to --collapseRefPaf must end with .paf') if not options.reference: - raise RuntimeError('--reference must be used with --collapseRefPAF') + raise RuntimeError('--reference must be used with --collapseRefPAF') + if options.collapse: + raise RuntimeError('--collapseRefPaf cannot be used with --collapse') # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) @@ -113,6 +111,7 @@ def main(): run_time = end_time - start_time logger.info("cactus-graphmap has finished after {} seconds".format(run_time)) + def graph_map(options): with Toil(options) as toil: importSingularityImage(options) @@ -151,7 +150,10 @@ def graph_map(options): findRequiredNode(config_node, "graphmap").attrib["cpu"] = str(mg_cores) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' + if options.collapseRefPaf: + assert options.reference + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference' # get the minigraph "virutal" assembly name graph_event = getOptionalAttrib(findRequiredNode(config_node, "graphmap"), "assemblyName", default="_MINIGRAPH_") @@ -276,8 +278,9 @@ def minigraph_workflow(job, options, config, seq_id_map, gfa_id, graph_event, sa root_job.addChild(gfa2paf_job) else: paf_job.addFollowOn(gfa2paf_job) - if options.collapse in ['reference', 'all', 'nonref']: - collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference, options.collapse) + collapse_mode = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") + if collapse_mode in ['reference', 'all', 'nonref']: + collapse_job = paf_job.addChildJobFn(self_align_all, config, seq_id_map, options.reference, collapse_mode) if ref_collapse_paf_id: collapse_paf_id = collapse_job.addFollowOnJobFn(merge_pafs, {"1":collapse_job.rv(), "2":ref_collapse_paf_id}, @@ -558,7 +561,7 @@ def filter_paf(job, paf_id, config, reference=None): # we can also get the identity of the parent gaf block if tok.startswith('gi:i:'): ident = min(ident, float(toks[5:])) - if tok.startsiwth('AS:i:'): + if tok.startswith('AS:i:'): score = int(tok[5:]) if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \ (score is None or score < min_score)): diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index 36119753b..de4e7fb05 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -150,7 +150,7 @@ def graphmap_join_options(parser): parser.add_argument("--chop", type=int, nargs='?', const=1024, default=None, help="chop all graph nodes to be at most this long (default=1024 specified without value). By default, nodes are only chopped for GBZ-derived formats, but left unchopped in the GFA, VCF, etc. If this option is used, the GBZ and GFA should have consistent IDs") - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) def graphmap_join_validate_options(options): @@ -324,11 +324,6 @@ def graphmap_join_validate_options(options): if options.filter and 'filter' not in options.gfa + options.gbz + options.odgi + options.chrom_vg + options.chrom_og + options.vcf + options.giraffe + options.viz + options.draw: options.filter = None - if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') - if options.collapse in ['reference', 'nonref'] and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference/nonref') - return options def graphmap_join(options): @@ -346,7 +341,7 @@ def graphmap_join(options): config.substituteAllPredefinedConstantsWithLiterals(options) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' # load up the vgs vg_ids = [] diff --git a/src/cactus/refmap/cactus_graphmap_split.py b/src/cactus/refmap/cactus_graphmap_split.py index b8b50dcd9..1144631b1 100644 --- a/src/cactus/refmap/cactus_graphmap_split.py +++ b/src/cactus/refmap/cactus_graphmap_split.py @@ -56,7 +56,7 @@ def main(): parser.add_argument("--minIdentity", type=float, help = "Ignore PAF lines with identity (column 10/11) < this (overrides minIdentity in in config)") parser.add_argument("--permissiveContigFilter", nargs='?', const='0.25', default=None, type=float, help = "If specified, override the configuration to accept contigs so long as they have at least given fraction of coverage (0.25 if no fraction specified). This can increase sensitivity of very small, fragmented and/or diverse assemblies.") - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", @@ -88,11 +88,6 @@ def main(): if options.reference: options.reference = options.reference[0] - if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') - if options.collapse in ['reference', 'nonref'] and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference/nonref') - logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) logger.info('Cactus Commit: {}'.format(cactus_commit)) start_time = timeit.default_timer() @@ -111,7 +106,7 @@ def cactus_graphmap_split(options): config.substituteAllPredefinedConstantsWithLiterals(options) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' #Run the workflow if options.restart: diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index 9f472663b..32c86dc85 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -160,7 +160,9 @@ def main(): if not options.collapseRefPAF.endswith('.paf'): raise RuntimeError('file passed to --collapseRefPaf must end with .paf') if not options.reference: - raise RuntimeError('--reference must be used with --collapseRefPAF') + raise RuntimeError('--reference must be used with --collapseRefPAF') + if options.collapse: + raise RuntimeError('--collapseRefPaf cannot be used with --collapse') # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later @@ -209,13 +211,15 @@ def main(): raise RuntimeError("--mgCores required run *not* running on single machine batch system") if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' #import the reference collapse paf ref_collapse_paf_id = None if options.collapseRefPAF: logger.info("Importing {}".format(options.collapseRefPAF)) ref_collapse_paf_id = toil.importFile(options.collapseRefPAF) + assert options.reference + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference' #import the sequences input_seq_id_map = {} diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index dc4a1521f..cb433aead 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -60,7 +60,7 @@ def main(): help="Activate pangenome mode (suitable for star trees of closely related samples) by overriding several configuration settings." " The overridden configuration will be saved in .pg-conf.xml") - parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments. Valid values are \"reference\", \"all\", \"nonref\" and \"none\". [EXPERIMENTAL, especially \"reference\" and \"nonref\"]", default=None) + parser.add_argument("--collapse", help = "Incorporate minimap2 self-alignments.", action='store_true', default=False) parser.add_argument("--singleCopySpecies", type=str, help="Filter out all self-alignments in given species") @@ -169,11 +169,6 @@ def main(): # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) - if options.collapse and options.collapse not in ['reference', 'all', 'nonref', 'none']: - raise RuntimeError('valid values for --collapse are {reference, all, nonref, none}') - if options.collapse in ['reference', 'nonref'] and not options.reference: - raise RuntimeError('--reference must be used with --collapse reference/nonref') - logger.info('Cactus Command: {}'.format(' '.join(sys.argv))) logger.info('Cactus Commit: {}'.format(cactus_commit)) start_time = timeit.default_timer() @@ -265,7 +260,7 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): config_wrapper.substituteAllPredefinedConstantsWithLiterals(options) config_wrapper.initGPU(options) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = options.collapse + findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' config_wrapper.setSystemMemory(options) mc_tree, input_seq_map, og_candidates = parse_seqfile(options.seqFile, config_wrapper, @@ -407,7 +402,7 @@ def cactus_align(job, config_wrapper, mc_tree, input_seq_map, input_seq_id_map, sub_tree = get_subtree(mc_tree, root_name, config_wrapper, og_map, include_outgroups=False) # run the hal export - allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap_join"), "allowCollapse", typeFn=bool, default=False) + allow_collapse = getOptionalAttrib(findRequiredNode(config_wrapper.xmlRoot, "graphmap"), "collapse", typeFn=str, default="none") in ['reference', 'all'] hal_job = cons_job.addFollowOnJobFn(export_hal, sub_tree, config_wrapper.xmlRoot, new_seq_id_map, og_map, results, event=root_name, inMemory=True, checkpointInfo=checkpointInfo, acyclicEvent=referenceEvents[0] if referenceEvents and not allow_collapse else None, memory_override=cons_memory) From ba03b93942ad91f9a8d7e61663d4d1d76a2d10d3 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 27 Aug 2024 12:48:22 -0400 Subject: [PATCH 13/18] typos --- src/cactus/refmap/cactus_graphmap.py | 4 ++-- test/evolverTest.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index addf22f55..2f420ca8e 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -543,7 +543,7 @@ def filter_paf(job, paf_id, config, reference=None): min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ", typeFn=int, default=0) min_ident = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minIdentity", typeFn=float, default=0) min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0) - RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={}".format(min_block, min_mapq, min_ident)) + RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={} minScore={}".format(min_block, min_mapq, min_ident, min_score)) with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file: for line in paf_file: toks = line.split('\t') @@ -564,7 +564,7 @@ def filter_paf(job, paf_id, config, reference=None): if tok.startswith('AS:i:'): score = int(tok[5:]) if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \ - (score is None or score < min_score)): + (score is None or score >= min_score)): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) diff --git a/test/evolverTest.py b/test/evolverTest.py index 7d6ade8a7..a1ae9d82a 100644 --- a/test/evolverTest.py +++ b/test/evolverTest.py @@ -441,7 +441,7 @@ def _run_evolver_primates_pangenome(self, binariesMode): out_dir = os.path.dirname(self._out_hal(binariesMode)) out_name = os.path.splitext(os.path.basename(self._out_hal(binariesMode)))[0] cactus_pangenome_cmd = ['cactus-pangenome', self._job_store(binariesMode), seq_file_path, '--reference', 'simHuman', 'simChimp', - '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo', '--collapse', 'all'] + '--outDir', out_dir, '--outName', out_name, '--noSplit', '--odgi', '--chrom-og', '--viz', '--draw', '--haplo', '--collapse'] subprocess.check_call(cactus_pangenome_cmd + cactus_opts) # cactus-pangenome tacks on the .full to the output name From 65a6dbc75b32b7b68378942527723a009738671d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 27 Aug 2024 12:54:58 -0400 Subject: [PATCH 14/18] switch (back) to latest vg release --- build-tools/downloadPangenomeTools | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index 90ae9162b..a2997804d 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -298,8 +298,7 @@ fi # vg cd ${pangenomeBuildDir} -#wget -q https://github.com/vgteam/vg/releases/download/v1.59.0/vg -wget -q http://public.gi.ucsc.edu/~hickey/vg-patch/vg.c56049888367a34de69d1d467e9757efbb84b618 -O vg +wget -q https://github.com/vgteam/vg/releases/download/v1.59.0/vg chmod +x vg if [[ $STATIC_CHECK -ne 1 || $(ldd vg | grep so | wc -l) -eq 0 ]] then From 76f280a8c265ca15ec34162b5502b817d45b5376 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 29 Aug 2024 10:11:30 -0400 Subject: [PATCH 15/18] add filter for distane self-alignments --- src/cactus/cactus_progressive_config.xml | 2 ++ src/cactus/refmap/cactus_graphmap.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index df379fabb..e1a7e7630 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -322,6 +322,7 @@ + @@ -346,6 +347,7 @@ minigraphConstructOptions="-c -xggs" minimapCollapseOptions="-c -D -xasm5 -m500" collapse="none" + maxCollapseDistanceRatio="5" minigraphConstructBatchSize="50" minigraphSortInput="mash" minMAPQ="5" diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 2f420ca8e..8c47fd3aa 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -542,17 +542,21 @@ def filter_paf(job, paf_id, config, reference=None): min_block = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minGAFBlockLength", typeFn=int, default=0) min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ", typeFn=int, default=0) min_ident = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minIdentity", typeFn=float, default=0) - min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0) - RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={} minScore={}".format(min_block, min_mapq, min_ident, min_score)) + min_score = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minScore", typeFn=float, default=0) + max_collapse_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "maxCollapseDistanceRatio", typeFn=float, default=-1) + RealtimeLogger.info("Running PAF filter with minBlock={} minMAPQ={} minIdentity={} minScore={} maxCollapseDistanceRatio={}".format(min_block, min_mapq, min_ident, min_score, max_collapse_ratio)) with open(paf_path, 'r') as paf_file, open(filter_paf_path, 'w') as filter_paf_file: for line in paf_file: toks = line.split('\t') - is_ref = reference and toks[0].startswith('id={}|'.format(reference)) + query_name = toks[0] + target_name = toks[5] + is_ref = reference and query_name.startswith('id={}|'.format(reference)) and query_name != target_name mapq = int(toks[11]) query_len = int(toks[1]) ident = float(toks[9]) / (float(toks[10]) + 0.00000001) bl = None score = None + collapse_ratio = None for tok in toks[12:]: # this is a special tag that was written by gaf2paf in order to preserve the original gaf block length # we use it to be able to filter by the gaf block even after it's been broken in the paf @@ -563,8 +567,17 @@ def filter_paf(job, paf_id, config, reference=None): ident = min(ident, float(toks[5:])) if tok.startswith('AS:i:'): score = int(tok[5:]) + if query_name == target_name and max_collapse_ratio >= 0: + # compute the distance between the minimap2 self alignment intervals + # in order to see if they are too far apart via the max ratio + query_start, query_end = int(toks[2]), int(toks[3]) + target_start, target_end = int(toks[7]), int(toks[8]) + block_length = int(toks[10]) + dist = target_start - query_end if query_start < target_start else query_start - target_end + if block_length > 0 and dist > 0: + collapse_ratio = dist / block_length if is_ref or (mapq >= min_mapq and (bl is None or query_len <= min_block or bl >= min_block) and ident >= min_ident and \ - (score is None or score >= min_score)): + (score is None or score >= min_score) and (collapse_ratio is None or collapse_ratio <= max_collapse_ratio)): filter_paf_file.write(line) overlap_ratio = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "PAFOverlapFilterRatio", typeFn=float, default=0) From d7ae054a75f2301a57171ab346b24fbe138f2cde Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 29 Aug 2024 10:20:32 -0400 Subject: [PATCH 16/18] typo in (Doomed) collapseRefPAF option --- src/cactus/refmap/cactus_graphmap.py | 6 +++--- src/cactus/refmap/cactus_pangenome.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cactus/refmap/cactus_graphmap.py b/src/cactus/refmap/cactus_graphmap.py index 8c47fd3aa..6b21eea96 100644 --- a/src/cactus/refmap/cactus_graphmap.py +++ b/src/cactus/refmap/cactus_graphmap.py @@ -94,11 +94,11 @@ def main(): if options.collapseRefPAF: if not options.collapseRefPAF.endswith('.paf'): - raise RuntimeError('file passed to --collapseRefPaf must end with .paf') + raise RuntimeError('file passed to --collapseRefPAF must end with .paf') if not options.reference: raise RuntimeError('--reference must be used with --collapseRefPAF') if options.collapse: - raise RuntimeError('--collapseRefPaf cannot be used with --collapse') + raise RuntimeError('--collapseRefPAF cannot be used with --collapse') # Mess with some toil options to create useful defaults. cactus_override_toil_options(options) @@ -151,7 +151,7 @@ def graph_map(options): if options.collapse: findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' - if options.collapseRefPaf: + if options.collapseRefPAF: assert options.reference findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'reference' diff --git a/src/cactus/refmap/cactus_pangenome.py b/src/cactus/refmap/cactus_pangenome.py index 32c86dc85..a14301362 100644 --- a/src/cactus/refmap/cactus_pangenome.py +++ b/src/cactus/refmap/cactus_pangenome.py @@ -158,11 +158,11 @@ def main(): if options.collapseRefPAF: if not options.collapseRefPAF.endswith('.paf'): - raise RuntimeError('file passed to --collapseRefPaf must end with .paf') + raise RuntimeError('file passed to --collapseRefPAF must end with .paf') if not options.reference: raise RuntimeError('--reference must be used with --collapseRefPAF') if options.collapse: - raise RuntimeError('--collapseRefPaf cannot be used with --collapse') + raise RuntimeError('--collapseRefPAF cannot be used with --collapse') # Sort out the graphmap-join options, which can be rather complex # pass in dummy values for now, they will get filled in later From 43650c243e2f1fa58ad6e9c1e8a712141313201d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 29 Aug 2024 16:43:55 -0400 Subject: [PATCH 17/18] typo --- src/cactus/refmap/cactus_graphmap_join.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py index de4e7fb05..da1efc354 100644 --- a/src/cactus/refmap/cactus_graphmap_join.py +++ b/src/cactus/refmap/cactus_graphmap_join.py @@ -341,7 +341,7 @@ def graphmap_join(options): config.substituteAllPredefinedConstantsWithLiterals(options) if options.collapse: - findRequiredNode(config_node, "graphmap").attrib["collapse"] = 'all' + findRequiredNode(configNode, "graphmap").attrib["collapse"] = 'all' # load up the vgs vg_ids = [] From b437bca267fc887d90fe096fbe1121f7344caea0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 30 Aug 2024 16:21:31 -0400 Subject: [PATCH 18/18] update cactus-gfa-tools --- build-tools/downloadPangenomeTools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-tools/downloadPangenomeTools b/build-tools/downloadPangenomeTools index a2997804d..9c4d2d93f 100755 --- a/build-tools/downloadPangenomeTools +++ b/build-tools/downloadPangenomeTools @@ -176,7 +176,7 @@ fi cd ${pangenomeBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/cactus-gfa-tools.git cd cactus-gfa-tools -git checkout 6fa3157cc2870a459e412f03519f4310a9d12796 +git checkout 1121e370880ee187ba2963f0e46e632e0e762cc5 make -j ${numcpu} if [[ $STATIC_CHECK -ne 1 || $(ldd paf2lastz | grep so | wc -l) -eq 0 ]] then