Skip to content

Commit 760e65a

Browse files
committed
Do each extension as its own Alignment with no multipath logic
1 parent 6cb9e7b commit 760e65a

File tree

3 files changed

+169
-215
lines changed

3 files changed

+169
-215
lines changed

src/minimizer_mapper.cpp

+151-206
Original file line numberDiff line numberDiff line change
@@ -724,133 +724,87 @@ void MinimizerMapper::find_optimal_tail_alignments(const Alignment& aln, const v
724724
cerr << "Trying again find tail alignments for " << extended_seeds.size() << " extended seeds" << endl;
725725
#endif
726726

727-
// We're going to record source and sink path count distributions, for debugging
728-
vector<double> tail_path_counts;
729-
730-
// We're also going to record read sequence lengths for tails
731-
vector<double> tail_lengths;
732-
733-
// Make a MultipathAlignment and feed in all the extended seeds as subpaths
734-
MultipathAlignment mp;
735-
// Pull over all the non-alignment data (to get copied back out when linearizing)
736-
transfer_read_metadata(aln, mp);
727+
// Make paths for all the extensions
728+
vector<Path> extension_paths;
729+
vector<double> extension_path_scores;
730+
extension_paths.reserve(extended_seeds.size());
731+
extension_path_scores.reserve(extended_seeds.size());
737732
for (auto& extended_seed : extended_seeds) {
738-
Subpath* s = mp.add_subpath();
739-
// Copy in the path.
740-
*s->mutable_path() = extended_seed.to_path(gbwt_graph, out.sequence());
741-
// Score it
742-
s->set_score(get_regular_aligner()->score_partial_alignment(aln, gbwt_graph, s->path(),
733+
// Compute the path for each extension
734+
extension_paths.push_back(extended_seed.to_path(gbwt_graph, aln.sequence()));
735+
// And the extension's score
736+
extension_path_scores.push_back(get_regular_aligner()->score_partial_alignment(aln, gbwt_graph, extension_paths.back(),
743737
aln.sequence().begin() + extended_seed.read_interval.first));
744-
// The position in the read it occurs at will be handled by the multipath topology.
745-
if (extended_seed.read_interval.first == 0) {
746-
// But if it occurs at the very start of the read we need to mark that now.
747-
mp.add_start(mp.subpath_size() - 1);
748-
}
749738
}
750739

751-
// Handle left tails as a forest of trees
752-
753-
// Get the forests of all left tails by extension they belong to
754-
auto tails_by_extension = get_tail_forests(extended_seeds, aln.sequence().size(), true);
755-
756-
for (auto& kv : tails_by_extension) {
757-
// For each extension that has a nonempty tail
758-
auto& source = kv.first;
759-
auto& forest = kv.second;
760-
761-
// Grab the part of the read sequence that comes before it
762-
string before_sequence = aln.sequence().substr(0, extended_seeds[source].read_interval.first);
763-
764-
// Do right-pinned alignment
765-
pair<Path, int64_t> result = get_best_alignment_against_any_tree(forest, before_sequence,
766-
extended_seeds[source].starting_position(gbwt_graph), false);
767-
768-
// Grab the best path in backing graph space (which may be empty)
769-
Path& best_path = result.first;
770-
// And its score
771-
int64_t& best_score = result.second;
772-
773-
// Put it in the MultipathAlignment
774-
Subpath* s = mp.add_subpath();
775-
*s->mutable_path() = std::move(best_path);
776-
s->set_score(best_score);
777-
778-
// And make the edge from it to the correct source
779-
s->add_next(source);
780-
781-
#ifdef debug
782-
cerr << "Add leading tail " << (mp.subpath_size() - 1) << " -> " << source << endl;
783-
#endif
740+
// We will keep the winning alignment in out
741+
out.set_score(0);
742+
743+
// Handle each extension in the set
744+
process_until_threshold(extended_seeds, extension_path_scores,
745+
0, 0, numeric_limits<size_t>::max(),
746+
(function<double(size_t)>) [&](size_t extended_seed_num) {
747+
748+
// This extended seed looks good enough.
749+
750+
// We start with the path in extension_paths[extended_seed_num],
751+
// scored in extension_path_scores[extended_seed_num]
752+
753+
// We also have a left tail path and score
754+
pair<Path, int64_t> left_tail_result {{}, 0};
755+
// And a right tail path and score
756+
pair<Path, int64_t> right_tail_result {{}, 0};
757+
758+
if (extended_seeds[extended_seed_num].read_interval.first != 0) {
759+
// There is a left tail
760+
761+
// Get the forest of all left tail placements
762+
auto forest = get_tail_forest(extended_seeds[extended_seed_num], aln.sequence().size(), true);
763+
764+
// Grab the part of the read sequence that comes before the extension
765+
string before_sequence = aln.sequence().substr(0, extended_seeds[extended_seed_num].read_interval.first);
766+
767+
// Do right-pinned alignment
768+
left_tail_result = get_best_alignment_against_any_tree(forest, before_sequence,
769+
extended_seeds[extended_seed_num].starting_position(gbwt_graph), false);
770+
}
771+
772+
if (extended_seeds[extended_seed_num].read_interval.second != aln.sequence().size()) {
773+
// There is a right tail
774+
775+
// Get the forest of all right tail placements
776+
auto forest = get_tail_forest(extended_seeds[extended_seed_num], aln.sequence().size(), false);
777+
778+
// Find the sequence
779+
string trailing_sequence = aln.sequence().substr(extended_seeds[extended_seed_num].read_interval.second);
784780

785-
// And mark it as a start subpath
786-
mp.add_start(mp.subpath_size() - 1);
787-
}
781+
// Do left-pinned alignment
782+
right_tail_result = get_best_alignment_against_any_tree(forest, trailing_sequence,
783+
extended_seeds[extended_seed_num].tail_position(gbwt_graph), true);
784+
}
788785

789-
// We must have somewhere to start.
790-
assert(mp.start_size() > 0);
786+
// Compute total score
787+
size_t total_score = extension_path_scores[extended_seed_num] + left_tail_result.second + right_tail_result.second;
791788

792-
// Handle right tails as a forest of trees
789+
if (total_score > out.score() || out.score() == 0) {
790+
// This is the new best alignment seen so far.
791+
792+
// Save the score
793+
out.set_score(total_score);
794+
// And compose the full path
795+
*out.mutable_path() = concat_paths(concat_paths(left_tail_result.first, extension_paths[extended_seed_num]),
796+
right_tail_result.first);
797+
}
793798

794-
// Get the forests of all right tails by extension they belong to
795-
tails_by_extension = get_tail_forests(extended_seeds, aln.sequence().size(), false);
796-
797-
for (auto& kv : tails_by_extension) {
798-
// For each extension that has a nonempty tail
799-
auto& from = kv.first;
800-
auto& forest = kv.second;
801-
802-
#ifdef debug
803-
cerr << "Consider right tails for extension " << from << " with interval "
804-
<< extended_seeds[from].read_interval.first << " - " << extended_seeds[from].read_interval.second << endl;
805-
#endif
806-
807-
// Find the sequence
808-
string trailing_sequence = aln.sequence().substr(extended_seeds[from].read_interval.second);
809-
810-
// There should be actual trailing sequence to align on this escape path
811-
assert(!trailing_sequence.empty());
812-
813-
// Do left-pinned alignment
814-
pair<Path, int64_t> result = get_best_alignment_against_any_tree(forest, trailing_sequence,
815-
extended_seeds[from].tail_position(gbwt_graph), true);
816-
817-
// Grab the best path in backing graph space (which may be empty)
818-
Path& best_path = result.first;
819-
// And its score
820-
int64_t& best_score = result.second;
821-
822-
// Put it in the MultipathAlignment
823-
Subpath* s = mp.add_subpath();
824-
*s->mutable_path() = std::move(best_path);
825-
s->set_score(best_score);
826-
827-
// And make the edge to hook it up
828-
mp.mutable_subpath(from)->add_next(mp.subpath_size() - 1);
799+
return true;
800+
}, [&](size_t extended_seed_num) {
801+
// This extended seed isn't good enough by its own score.
802+
});
829803

830-
#ifdef debug
831-
cerr << "Add trailing tail " << from << " -> " << (mp.subpath_size() - 1) << endl;
832-
#endif
833-
834-
}
835-
836-
// Then we take the best linearization of the full MultipathAlignment.
837-
// Make sure to force source to sink
838-
topologically_order_subpaths(mp);
839-
840-
if (!validate_multipath_alignment(mp, gbwt_graph)) {
841-
// If we generated an invalid multipath alignment, we did something wrong and need to stop
842-
cerr << "error[vg::MinimizerMapper]: invalid MultipathAlignment generated: " << pb2json(mp) << endl;
843-
exit(1);
844-
}
804+
// Now the winning path and score over all extended seeds we looked at is in out.
845805

846-
// Linearize into the out alignment, copying path, score, and also sequence and other read metadata
847-
optimal_alignment(mp, out, true);
848806
// Compute the identity from the path.
849807
out.set_identity(identity(out.path()));
850-
851-
// Save all the tail alignment debugging statistics
852-
set_annotation(out, "tail_path_counts", tail_path_counts);
853-
set_annotation(out, "tail_lengths", tail_lengths);
854808
}
855809

856810
pair<Path, size_t> MinimizerMapper::get_best_alignment_against_any_tree(const vector<TreeSubgraph>& trees,
@@ -943,113 +897,104 @@ pair<Path, size_t> MinimizerMapper::get_best_alignment_against_any_tree(const ve
943897
return make_pair(best_path, best_score);
944898
}
945899

946-
unordered_map<size_t, vector<TreeSubgraph>> MinimizerMapper::get_tail_forests(const vector<GaplessExtension>& extended_seeds,
900+
vector<TreeSubgraph> MinimizerMapper::get_tail_forest(const GaplessExtension& extended_seed,
947901
size_t read_length, bool left_tails) const {
948902

949-
// We will fill this in with all the trees we return, by parent extension.
950-
unordered_map<size_t, vector<TreeSubgraph>> to_return;
903+
// We will fill this in with all the trees we return
904+
vector<TreeSubgraph> to_return;
951905

952-
for (size_t extension_number = 0; extension_number < extended_seeds.size(); extension_number++) {
953-
// Now for each extension that can have tails, walk the GBWT in the appropriate direction
954-
906+
// Now for this extension, walk the GBWT in the appropriate direction
907+
955908
#ifdef debug
956-
cerr << "Look for " << (left_tails ? "left" : "right") << " tails from extension " << extension_number << endl;
909+
cerr << "Look for " << (left_tails ? "left" : "right") << " tails from extension" << endl;
957910
#endif
958911

959-
// TODO: Come up with a better way to do this with more accessors on the extension and less get_handle
960-
// Get the Position reading out of the extension on the appropriate tail
961-
Position from;
962-
// And the length of that tail
963-
size_t tail_length;
964-
// And the GBWT search state we want to start with
965-
const gbwt::SearchState* base_state = nullptr;
966-
if (left_tails) {
967-
// Look right from start
968-
from = extended_seeds[extension_number].starting_position(gbwt_graph);
969-
// And then flip to look the other way at the prev base
970-
from = reverse(from, gbwt_graph.get_length(gbwt_graph.get_handle(from.node_id(), false)));
971-
972-
// Use the search state going backward
973-
base_state = &extended_seeds[extension_number].state.backward;
974-
975-
tail_length = extended_seeds[extension_number].read_interval.first;
976-
} else {
977-
// Look right from end
978-
from = extended_seeds[extension_number].tail_position(gbwt_graph);
979-
980-
// Use the search state going forward
981-
base_state = &extended_seeds[extension_number].state.forward;
982-
983-
tail_length = read_length - extended_seeds[extension_number].read_interval.second;
984-
}
985-
986-
if (tail_length == 0) {
987-
// Don't go looking for places to put no tail.
988-
continue;
989-
}
990-
991-
// Make sure we at least have an empty forest if we have any tail.
992-
to_return.emplace(extension_number, vector<TreeSubgraph>());
993-
994-
// This is one tree that we are filling in
995-
vector<pair<int64_t, handle_t>> tree;
996-
997-
// This is a stack of indexes at which we put parents in the tree
998-
list<int64_t> parent_stack;
999-
1000-
// Get the handle we are starting from
1001-
// TODO: is it cheaper to get this out of base_state?
1002-
handle_t start_handle = gbwt_graph.get_handle(from.node_id(), from.is_reverse());
912+
// TODO: Come up with a better way to do this with more accessors on the extension and less get_handle
913+
// Get the Position reading out of the extension on the appropriate tail
914+
Position from;
915+
// And the length of that tail
916+
size_t tail_length;
917+
// And the GBWT search state we want to start with
918+
const gbwt::SearchState* base_state = nullptr;
919+
if (left_tails) {
920+
// Look right from start
921+
from = extended_seed.starting_position(gbwt_graph);
922+
// And then flip to look the other way at the prev base
923+
from = reverse(from, gbwt_graph.get_length(gbwt_graph.get_handle(from.node_id(), false)));
924+
925+
// Use the search state going backward
926+
base_state = &extended_seed.state.backward;
927+
928+
tail_length = extended_seed.read_interval.first;
929+
} else {
930+
// Look right from end
931+
from = extended_seed.tail_position(gbwt_graph);
1003932

1004-
// Decide if the start node will end up included in the tree, or if we cut it all off with the offset.
1005-
bool start_included = (from.offset() < gbwt_graph.get_length(start_handle));
933+
// Use the search state going forward
934+
base_state = &extended_seed.state.forward;
1006935

1007-
// How long should we search? It should be the longest detectable gap plus the remaining sequence.
1008-
size_t search_limit = get_regular_aligner()->longest_detectable_gap(tail_length, read_length) + tail_length;
936+
tail_length = read_length - extended_seed.read_interval.second;
937+
}
938+
939+
if (tail_length == 0) {
940+
// Don't go looking for places to put no tail.
941+
return to_return;
942+
}
943+
944+
// This is one tree that we are filling in
945+
vector<pair<int64_t, handle_t>> tree;
946+
947+
// This is a stack of indexes at which we put parents in the tree
948+
list<int64_t> parent_stack;
949+
950+
// Get the handle we are starting from
951+
// TODO: is it cheaper to get this out of base_state?
952+
handle_t start_handle = gbwt_graph.get_handle(from.node_id(), from.is_reverse());
953+
954+
// Decide if the start node will end up included in the tree, or if we cut it all off with the offset.
955+
bool start_included = (from.offset() < gbwt_graph.get_length(start_handle));
956+
957+
// How long should we search? It should be the longest detectable gap plus the remaining sequence.
958+
size_t search_limit = get_regular_aligner()->longest_detectable_gap(tail_length, read_length) + tail_length;
959+
960+
// Do a DFS over the haplotypes in the GBWT out to that distance.
961+
dfs_gbwt(*base_state, from.offset(), search_limit, [&](const handle_t& entered) {
962+
// Enter a new handle.
1009963

1010-
// Do a DFS over the haplotypes in the GBWT out to that distance.
1011-
dfs_gbwt(*base_state, from.offset(), search_limit, [&](const handle_t& entered) {
1012-
// Enter a new handle.
964+
if (parent_stack.empty()) {
965+
// This is the root of a new tree in the forrest
1013966

1014-
if (parent_stack.empty()) {
1015-
// This is the root of a new tree in the forrest
1016-
1017-
if (!tree.empty()) {
1018-
// Save the old tree and start a new one.
1019-
// We need to cut off from.offset() from the root, unless we would cut off the whole root.
1020-
// In that case, the GBWT DFS will have skipped the empty root entirely, so we cut off nothing.
1021-
to_return[extension_number].emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0);
1022-
tree.clear();
1023-
}
1024-
1025-
// Add this to the tree with no parent
1026-
tree.emplace_back(-1, entered);
1027-
} else {
1028-
// Just say this is visitable from our parent.
1029-
tree.emplace_back(parent_stack.back(), entered);
967+
if (!tree.empty()) {
968+
// Save the old tree and start a new one.
969+
// We need to cut off from.offset() from the root, unless we would cut off the whole root.
970+
// In that case, the GBWT DFS will have skipped the empty root entirely, so we cut off nothing.
971+
to_return.emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0);
972+
tree.clear();
1030973
}
1031974

1032-
// Record the parent index
1033-
parent_stack.push_back(tree.size() - 1);
1034-
}, [&]() {
1035-
// Exit the last visited handle. Pop off the stack.
1036-
parent_stack.pop_back();
1037-
});
1038-
1039-
if (!tree.empty()) {
1040-
// Now save the last tree
1041-
to_return[extension_number].emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0);
1042-
tree.clear();
975+
// Add this to the tree with no parent
976+
tree.emplace_back(-1, entered);
977+
} else {
978+
// Just say this is visitable from our parent.
979+
tree.emplace_back(parent_stack.back(), entered);
1043980
}
1044981

982+
// Record the parent index
983+
parent_stack.push_back(tree.size() - 1);
984+
}, [&]() {
985+
// Exit the last visited handle. Pop off the stack.
986+
parent_stack.pop_back();
987+
});
988+
989+
if (!tree.empty()) {
990+
// Now save the last tree
991+
to_return.emplace_back(&gbwt_graph, std::move(tree), start_included ? from.offset() : 0);
992+
tree.clear();
993+
}
994+
1045995
#ifdef debug
1046-
if (to_return.count(extension_number)) {
1047-
cerr << "Found " << to_return[extension_number].size() << " trees" << endl;
1048-
} else {
1049-
cerr << "Found no trees in no forest" << endl;
1050-
}
996+
cerr << "Found " << to_return.size() << " trees" << endl;
1051997
#endif
1052-
}
1053998

1054999
// Now we have all the trees!
10551000
return to_return;

0 commit comments

Comments
 (0)