Skip to content

Commit 1a07d0e

Browse files
authored
Merge pull request #66 from marbl/fix-one-to-one-prefix
Enable prefix-grouping for one-to-one filtering
2 parents cc49b9f + e83483f commit 1a07d0e

File tree

2 files changed

+73
-27
lines changed

2 files changed

+73
-27
lines changed

src/map/include/computeMap.hpp

Lines changed: 72 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <limits>
1111
#include <vector>
1212
#include <algorithm>
13+
#include <iterator>
1314
#include <unordered_map>
1415
#include <fstream>
1516
#include <zlib.h>
@@ -356,17 +357,49 @@ namespace skch
356357
//Filter over reference axis and report the mappings
357358
if (param.filterMode == filter::ONETOONE)
358359
{
359-
skch::Filter::ref::filterMappings(allReadMappings, this->refSketch,
360-
param.numMappingsForSegment - 1
361-
// (input->len < param.segLength ? param.shortSecondaryToKeep : param.secondaryToKeep)
362-
);
360+
// how many secondary mappings to keep
361+
int n_mappings = param.numMappingsForSegment - 1;
362+
363+
// Group sequences by query prefix, then pass to ref filter
364+
auto subrange_begin = allReadMappings.begin();
365+
auto subrange_end = allReadMappings.begin();
366+
MappingResultsVector_t tmpMappings;
367+
MappingResultsVector_t filteredMappings;
368+
369+
while (subrange_end != allReadMappings.end())
370+
{
371+
if (param.skip_prefix)
372+
{
373+
int currGroup = this->getRefGroup(qmetadata[subrange_begin->querySeqId].name);
374+
subrange_end = std::find_if_not(subrange_begin, allReadMappings.end(), [this, currGroup] (const auto& allReadMappings_candidate) {
375+
return currGroup == this->getRefGroup(this->qmetadata[allReadMappings_candidate.querySeqId].name);
376+
});
377+
}
378+
else
379+
{
380+
subrange_end = allReadMappings.end();
381+
}
382+
tmpMappings.insert(
383+
tmpMappings.end(),
384+
std::make_move_iterator(subrange_begin),
385+
std::make_move_iterator(subrange_end));
386+
387+
// tmpMappings now contains mappings from one group of query sequences to all reference groups
388+
// we now run filterByGroup, which filters based on the reference group.
389+
filterByGroup(tmpMappings, filteredMappings, n_mappings, true);
390+
tmpMappings.clear();
391+
subrange_begin = subrange_end;
392+
}
393+
allReadMappings = std::move(filteredMappings);
363394

364395
//Re-sort mappings by input order of query sequences
365396
//This order may be needed for any post analysis of output
366-
std::sort(allReadMappings.begin(), allReadMappings.end(), [](const MappingResult &a, const MappingResult &b)
367-
{
368-
return (a.querySeqId < b.querySeqId);
369-
});
397+
std::sort(
398+
allReadMappings.begin(), allReadMappings.end(),
399+
[](const MappingResult &a, const MappingResult &b) {
400+
return std::tie(a.querySeqId, a.queryStartPos, a.refSeqId, a.refStartPos)
401+
< std::tie(b.querySeqId, b.queryStartPos, b.refSeqId, b.refStartPos);
402+
});
370403

371404
reportReadMappings(allReadMappings, "", outstrm);
372405
}
@@ -460,17 +493,19 @@ namespace skch
460493
}
461494

462495
/**
463-
* @brief helper to main filtering function
464-
* @details filters mappings by group
465-
* @param[in] input unfiltered mappings
466-
* @param[in] output filtered mappings
467-
* @param[in] input num mappings per segment
468-
* @return void
496+
* @brief helper to main filtering function
497+
* @details filters mappings by group
498+
* @param[in,out] unfilteredMappings unfiltered mappings; elements may be moved out of this vector
499+
* @param[out] filteredMappings vector to which the filtered mappings are appended
500+
* @param[in] n_mappings num mappings per segment
501+
* @param[in] filter_ref if true, filter with Filter::ref; otherwise filter with Filter::query
502+
* @return void
469503
*/
470504
void filterByGroup(
471505
MappingResultsVector_t &unfilteredMappings,
472506
MappingResultsVector_t &filteredMappings,
473-
int n_mappings)
507+
int n_mappings,
508+
bool filter_ref)
474509
{
475510
filteredMappings.reserve(unfilteredMappings.size());
476511

@@ -480,6 +515,7 @@ namespace skch
480515
auto subrange_end = unfilteredMappings.begin();
481516
if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE)
482517
{
518+
std::vector<skch::MappingResult> tmpMappings;
483519
while (subrange_end != unfilteredMappings.end())
484520
{
485521
if (param.skip_prefix)
@@ -493,13 +529,25 @@ namespace skch
493529
{
494530
subrange_end = unfilteredMappings.end();
495531
}
496-
// TODO why are we filtering these before merging?
497-
std::vector<skch::MappingResult> tmpMappings(std::distance(subrange_begin, subrange_end));
498-
std::move(subrange_begin, subrange_end, tmpMappings.begin());
532+
tmpMappings.insert(
533+
tmpMappings.end(),
534+
std::make_move_iterator(subrange_begin),
535+
std::make_move_iterator(subrange_end));
499536
std::sort(tmpMappings.begin(), tmpMappings.end(), [](const auto& a, const auto& b)
500537
{ return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos) < std::tie(b.queryStartPos, b.refSeqId, b.refStartPos); });
501-
skch::Filter::query::filterMappings(tmpMappings, n_mappings);
502-
std::move(tmpMappings.begin(), tmpMappings.end(), std::back_inserter(filteredMappings));
538+
if (filter_ref)
539+
{
540+
skch::Filter::ref::filterMappings(tmpMappings, this->refSketch, n_mappings);
541+
}
542+
else
543+
{
544+
skch::Filter::query::filterMappings(tmpMappings, n_mappings);
545+
}
546+
filteredMappings.insert(
547+
filteredMappings.end(),
548+
std::make_move_iterator(tmpMappings.begin()),
549+
std::make_move_iterator(tmpMappings.end()));
550+
tmpMappings.clear();
503551
subrange_begin = subrange_end;
504552
}
505553
}
@@ -509,8 +557,6 @@ namespace skch
509557
[](const MappingResult &a, const MappingResult &b) {
510558
return std::tie(a.queryStartPos, a.refSeqId, a.refStartPos)
511559
< std::tie(b.queryStartPos, b.refSeqId, b.refStartPos);
512-
//return std::tie(a.refSeqId, a.refStartPos, a.queryStartPos)
513-
//< std::tie(b.refSeqId, b.refStartPos, b.queryStartPos);
514560
});
515561
}
516562

@@ -644,10 +690,10 @@ namespace skch
644690
}
645691

646692
if (param.filterMode == filter::MAP || param.filterMode == filter::ONETOONE) {
647-
MappingResultsVector_t tempMappings;
648-
tempMappings.reserve(output->readMappings.size());
649-
filterByGroup(unfilteredMappings, tempMappings, n_mappings);
650-
std::swap(tempMappings, unfilteredMappings);
693+
MappingResultsVector_t tmpMappings;
694+
tmpMappings.reserve(output->readMappings.size());
695+
filterByGroup(unfilteredMappings, tmpMappings, n_mappings, false);
696+
unfilteredMappings = std::move(tmpMappings);
651697
}
652698

653699
std::swap(output->readMappings, unfilteredMappings);

src/map/include/map_parameters.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ float confidence_interval = 0.95; // Confidence interval to re
9898
float percentage_identity = 0.85; // Percent identity in the mapping step
9999
float ANIDiff = 0.0; // Stage 1 ANI diff threshold
100100
float ANIDiffConf = 0.999; // ANI diff confidence
101-
std::string VERSION = "3.1.2"; // Version of MashMap
101+
std::string VERSION = "3.1.3"; // Version of MashMap
102102
}
103103
}
104104

0 commit comments

Comments
 (0)