@@ -724,133 +724,87 @@ void MinimizerMapper::find_optimal_tail_alignments(const Alignment& aln, const v
724
724
cerr << " Trying again find tail alignments for " << extended_seeds.size () << " extended seeds" << endl;
725
725
#endif
726
726
727
- // We're going to record source and sink path count distributions, for debugging
728
- vector<double > tail_path_counts;
729
-
730
- // We're also going to record read sequence lengths for tails
731
- vector<double > tail_lengths;
732
-
733
- // Make a MultipathAlignment and feed in all the extended seeds as subpaths
734
- MultipathAlignment mp;
735
- // Pull over all the non-alignment data (to get copied back out when linearizing)
736
- transfer_read_metadata (aln, mp);
727
+ // Make paths for all the extensions
728
+ vector<Path> extension_paths;
729
+ vector<double > extension_path_scores;
730
+ extension_paths.reserve (extended_seeds.size ());
731
+ extension_path_scores.reserve (extended_seeds.size ());
737
732
for (auto & extended_seed : extended_seeds) {
738
- Subpath* s = mp.add_subpath ();
739
- // Copy in the path.
740
- *s->mutable_path () = extended_seed.to_path (gbwt_graph, out.sequence ());
741
- // Score it
742
- s->set_score (get_regular_aligner ()->score_partial_alignment (aln, gbwt_graph, s->path (),
733
+ // Compute the path for each extension
734
+ extension_paths.push_back (extended_seed.to_path (gbwt_graph, aln.sequence ()));
735
+ // And the extension's score
736
+ extension_path_scores.push_back (get_regular_aligner ()->score_partial_alignment (aln, gbwt_graph, extension_paths.back (),
743
737
aln.sequence ().begin () + extended_seed.read_interval .first ));
744
- // The position in the read it occurs at will be handled by the multipath topology.
745
- if (extended_seed.read_interval .first == 0 ) {
746
- // But if it occurs at the very start of the read we need to mark that now.
747
- mp.add_start (mp.subpath_size () - 1 );
748
- }
749
738
}
750
739
751
- // Handle left tails as a forest of trees
752
-
753
- // Get the forests of all left tails by extension they belong to
754
- auto tails_by_extension = get_tail_forests (extended_seeds, aln.sequence ().size (), true );
755
-
756
- for (auto & kv : tails_by_extension) {
757
- // For each extension that has a nonempty tail
758
- auto & source = kv.first ;
759
- auto & forest = kv.second ;
760
-
761
- // Grab the part of the read sequence that comes before it
762
- string before_sequence = aln.sequence ().substr (0 , extended_seeds[source].read_interval .first );
763
-
764
- // Do right-pinned alignment
765
- pair<Path, int64_t > result = get_best_alignment_against_any_tree (forest, before_sequence,
766
- extended_seeds[source].starting_position (gbwt_graph), false );
767
-
768
- // Grab the best path in backing graph space (which may be empty)
769
- Path& best_path = result.first ;
770
- // And its score
771
- int64_t & best_score = result.second ;
772
-
773
- // Put it in the MultipathAlignment
774
- Subpath* s = mp.add_subpath ();
775
- *s->mutable_path () = std::move (best_path);
776
- s->set_score (best_score);
777
-
778
- // And make the edge from it to the correct source
779
- s->add_next (source);
780
-
781
- #ifdef debug
782
- cerr << " Add leading tail " << (mp.subpath_size () - 1 ) << " -> " << source << endl;
783
- #endif
740
+ // We will keep the winning alignment in out
741
+ out.set_score (0 );
742
+
743
+ // Handle each extension in the set
744
+ process_until_threshold (extended_seeds, extension_path_scores,
745
+ 0 , 0 , numeric_limits<size_t >::max (),
746
+ (function<double (size_t )>) [&](size_t extended_seed_num) {
747
+
748
+ // This extended seed looks good enough.
749
+
750
+ // We start with the path in extension_paths[extended_seed_num],
751
+ // scored in extension_path_scores[extended_seed_num]
752
+
753
+ // We also have a left tail path and score
754
+ pair<Path, int64_t > left_tail_result {{}, 0 };
755
+ // And a right tail path and score
756
+ pair<Path, int64_t > right_tail_result {{}, 0 };
757
+
758
+ if (extended_seeds[extended_seed_num].read_interval .first != 0 ) {
759
+ // There is a left tail
760
+
761
+ // Get the forest of all left tail placements
762
+ auto forest = get_tail_forest (extended_seeds[extended_seed_num], aln.sequence ().size (), true );
763
+
764
+ // Grab the part of the read sequence that comes before the extension
765
+ string before_sequence = aln.sequence ().substr (0 , extended_seeds[extended_seed_num].read_interval .first );
766
+
767
+ // Do right-pinned alignment
768
+ left_tail_result = get_best_alignment_against_any_tree (forest, before_sequence,
769
+ extended_seeds[extended_seed_num].starting_position (gbwt_graph), false );
770
+ }
771
+
772
+ if (extended_seeds[extended_seed_num].read_interval .second != aln.sequence ().size ()) {
773
+ // There is a right tail
774
+
775
+ // Get the forest of all right tail placements
776
+ auto forest = get_tail_forest (extended_seeds[extended_seed_num], aln.sequence ().size (), false );
777
+
778
+ // Find the sequence
779
+ string trailing_sequence = aln.sequence ().substr (extended_seeds[extended_seed_num].read_interval .second );
784
780
785
- // And mark it as a start subpath
786
- mp.add_start (mp.subpath_size () - 1 );
787
- }
781
+ // Do left-pinned alignment
782
+ right_tail_result = get_best_alignment_against_any_tree (forest, trailing_sequence,
783
+ extended_seeds[extended_seed_num].tail_position (gbwt_graph), true );
784
+ }
788
785
789
- // We must have somewhere to start.
790
- assert (mp. start_size () > 0 ) ;
786
+ // Compute total score
787
+ size_t total_score = extension_path_scores[extended_seed_num] + left_tail_result. second + right_tail_result. second ;
791
788
792
- // Handle right tails as a forest of trees
789
+ if (total_score > out.score () || out.score () == 0 ) {
790
+ // This is the new best alignment seen so far.
791
+
792
+ // Save the score
793
+ out.set_score (total_score);
794
+ // And compose the full path
795
+ *out.mutable_path () = concat_paths (concat_paths (left_tail_result.first , extension_paths[extended_seed_num]),
796
+ right_tail_result.first );
797
+ }
793
798
794
- // Get the forests of all right tails by extension they belong to
795
- tails_by_extension = get_tail_forests (extended_seeds, aln.sequence ().size (), false );
796
-
797
- for (auto & kv : tails_by_extension) {
798
- // For each extension that has a nonempty tail
799
- auto & from = kv.first ;
800
- auto & forest = kv.second ;
801
-
802
- #ifdef debug
803
- cerr << " Consider right tails for extension " << from << " with interval "
804
- << extended_seeds[from].read_interval .first << " - " << extended_seeds[from].read_interval .second << endl;
805
- #endif
806
-
807
- // Find the sequence
808
- string trailing_sequence = aln.sequence ().substr (extended_seeds[from].read_interval .second );
809
-
810
- // There should be actual trailing sequence to align on this escape path
811
- assert (!trailing_sequence.empty ());
812
-
813
- // Do left-pinned alignment
814
- pair<Path, int64_t > result = get_best_alignment_against_any_tree (forest, trailing_sequence,
815
- extended_seeds[from].tail_position (gbwt_graph), true );
816
-
817
- // Grab the best path in backing graph space (which may be empty)
818
- Path& best_path = result.first ;
819
- // And its score
820
- int64_t & best_score = result.second ;
821
-
822
- // Put it in the MultipathAlignment
823
- Subpath* s = mp.add_subpath ();
824
- *s->mutable_path () = std::move (best_path);
825
- s->set_score (best_score);
826
-
827
- // And make the edge to hook it up
828
- mp.mutable_subpath (from)->add_next (mp.subpath_size () - 1 );
799
+ return true ;
800
+ }, [&](size_t extended_seed_num) {
801
+ // This extended seed isn't good enough by its own score.
802
+ });
829
803
830
- #ifdef debug
831
- cerr << " Add trailing tail " << from << " -> " << (mp.subpath_size () - 1 ) << endl;
832
- #endif
833
-
834
- }
835
-
836
- // Then we take the best linearization of the full MultipathAlignment.
837
- // Make sure to force source to sink
838
- topologically_order_subpaths (mp);
839
-
840
- if (!validate_multipath_alignment (mp, gbwt_graph)) {
841
- // If we generated an invalid multipath alignment, we did something wrong and need to stop
842
- cerr << " error[vg::MinimizerMapper]: invalid MultipathAlignment generated: " << pb2json (mp) << endl;
843
- exit (1 );
844
- }
804
+ // Now the winning path and score over all extended seeds we looked at is in out.
845
805
846
- // Linearize into the out alignment, copying path, score, and also sequence and other read metadata
847
- optimal_alignment (mp, out, true );
848
806
// Compute the identity from the path.
849
807
out.set_identity (identity (out.path ()));
850
-
851
- // Save all the tail alignment debugging statistics
852
- set_annotation (out, " tail_path_counts" , tail_path_counts);
853
- set_annotation (out, " tail_lengths" , tail_lengths);
854
808
}
855
809
856
810
pair<Path, size_t > MinimizerMapper::get_best_alignment_against_any_tree (const vector<TreeSubgraph>& trees,
@@ -943,113 +897,104 @@ pair<Path, size_t> MinimizerMapper::get_best_alignment_against_any_tree(const ve
943
897
return make_pair (best_path, best_score);
944
898
}
945
899
946
- unordered_map< size_t , vector<TreeSubgraph>> MinimizerMapper::get_tail_forests (const vector< GaplessExtension>& extended_seeds ,
900
+ vector<TreeSubgraph> MinimizerMapper::get_tail_forest (const GaplessExtension& extended_seed ,
947
901
size_t read_length, bool left_tails) const {
948
902
949
- // We will fill this in with all the trees we return, by parent extension.
950
- unordered_map< size_t , vector<TreeSubgraph> > to_return;
903
+ // We will fill this in with all the trees we return
904
+ vector<TreeSubgraph> to_return;
951
905
952
- for (size_t extension_number = 0 ; extension_number < extended_seeds.size (); extension_number++) {
953
- // Now for each extension that can have tails, walk the GBWT in the appropriate direction
954
-
906
+ // Now for this extension, walk the GBWT in the appropriate direction
907
+
955
908
#ifdef debug
956
- cerr << " Look for " << (left_tails ? " left" : " right" ) << " tails from extension " << extension_number << endl;
909
+ cerr << " Look for " << (left_tails ? " left" : " right" ) << " tails from extension" << endl;
957
910
#endif
958
911
959
- // TODO: Come up with a better way to do this with more accessors on the extension and less get_handle
960
- // Get the Position reading out of the extension on the appropriate tail
961
- Position from;
962
- // And the length of that tail
963
- size_t tail_length;
964
- // And the GBWT search state we want to start with
965
- const gbwt::SearchState* base_state = nullptr ;
966
- if (left_tails) {
967
- // Look right from start
968
- from = extended_seeds[extension_number].starting_position (gbwt_graph);
969
- // And then flip to look the other way at the prev base
970
- from = reverse (from, gbwt_graph.get_length (gbwt_graph.get_handle (from.node_id (), false )));
971
-
972
- // Use the search state going backward
973
- base_state = &extended_seeds[extension_number].state .backward ;
974
-
975
- tail_length = extended_seeds[extension_number].read_interval .first ;
976
- } else {
977
- // Look right from end
978
- from = extended_seeds[extension_number].tail_position (gbwt_graph);
979
-
980
- // Use the search state going forward
981
- base_state = &extended_seeds[extension_number].state .forward ;
982
-
983
- tail_length = read_length - extended_seeds[extension_number].read_interval .second ;
984
- }
985
-
986
- if (tail_length == 0 ) {
987
- // Don't go looking for places to put no tail.
988
- continue ;
989
- }
990
-
991
- // Make sure we at least have an empty forest if we have any tail.
992
- to_return.emplace (extension_number, vector<TreeSubgraph>());
993
-
994
- // This is one tree that we are filling in
995
- vector<pair<int64_t , handle_t >> tree;
996
-
997
- // This is a stack of indexes at which we put parents in the tree
998
- list<int64_t > parent_stack;
999
-
1000
- // Get the handle we are starting from
1001
- // TODO: is it cheaper to get this out of base_state?
1002
- handle_t start_handle = gbwt_graph.get_handle (from.node_id (), from.is_reverse ());
912
+ // TODO: Come up with a better way to do this with more accessors on the extension and less get_handle
913
+ // Get the Position reading out of the extension on the appropriate tail
914
+ Position from;
915
+ // And the length of that tail
916
+ size_t tail_length;
917
+ // And the GBWT search state we want to start with
918
+ const gbwt::SearchState* base_state = nullptr ;
919
+ if (left_tails) {
920
+ // Look right from start
921
+ from = extended_seed.starting_position (gbwt_graph);
922
+ // And then flip to look the other way at the prev base
923
+ from = reverse (from, gbwt_graph.get_length (gbwt_graph.get_handle (from.node_id (), false )));
924
+
925
+ // Use the search state going backward
926
+ base_state = &extended_seed.state .backward ;
927
+
928
+ tail_length = extended_seed.read_interval .first ;
929
+ } else {
930
+ // Look right from end
931
+ from = extended_seed.tail_position (gbwt_graph);
1003
932
1004
- // Decide if the start node will end up included in the tree, or if we cut it all off with the offset.
1005
- bool start_included = (from. offset () < gbwt_graph. get_length (start_handle)) ;
933
+ // Use the search state going forward
934
+ base_state = &extended_seed. state . forward ;
1006
935
1007
- // How long should we search? It should be the longest detectable gap plus the remaining sequence.
1008
- size_t search_limit = get_regular_aligner ()->longest_detectable_gap (tail_length, read_length) + tail_length;
936
+ tail_length = read_length - extended_seed.read_interval .second ;
937
+ }
938
+
939
+ if (tail_length == 0 ) {
940
+ // Don't go looking for places to put no tail.
941
+ return to_return;
942
+ }
943
+
944
+ // This is one tree that we are filling in
945
+ vector<pair<int64_t , handle_t >> tree;
946
+
947
+ // This is a stack of indexes at which we put parents in the tree
948
+ list<int64_t > parent_stack;
949
+
950
+ // Get the handle we are starting from
951
+ // TODO: is it cheaper to get this out of base_state?
952
+ handle_t start_handle = gbwt_graph.get_handle (from.node_id (), from.is_reverse ());
953
+
954
+ // Decide if the start node will end up included in the tree, or if we cut it all off with the offset.
955
+ bool start_included = (from.offset () < gbwt_graph.get_length (start_handle));
956
+
957
+ // How long should we search? It should be the longest detectable gap plus the remaining sequence.
958
+ size_t search_limit = get_regular_aligner ()->longest_detectable_gap (tail_length, read_length) + tail_length;
959
+
960
+ // Do a DFS over the haplotypes in the GBWT out to that distance.
961
+ dfs_gbwt (*base_state, from.offset (), search_limit, [&](const handle_t & entered) {
962
+ // Enter a new handle.
1009
963
1010
- // Do a DFS over the haplotypes in the GBWT out to that distance.
1011
- dfs_gbwt (*base_state, from.offset (), search_limit, [&](const handle_t & entered) {
1012
- // Enter a new handle.
964
+ if (parent_stack.empty ()) {
965
+ // This is the root of a new tree in the forrest
1013
966
1014
- if (parent_stack.empty ()) {
1015
- // This is the root of a new tree in the forrest
1016
-
1017
- if (!tree.empty ()) {
1018
- // Save the old tree and start a new one.
1019
- // We need to cut off from.offset() from the root, unless we would cut off the whole root.
1020
- // In that case, the GBWT DFS will have skipped the empty root entirely, so we cut off nothing.
1021
- to_return[extension_number].emplace_back (&gbwt_graph, std::move (tree), start_included ? from.offset () : 0 );
1022
- tree.clear ();
1023
- }
1024
-
1025
- // Add this to the tree with no parent
1026
- tree.emplace_back (-1 , entered);
1027
- } else {
1028
- // Just say this is visitable from our parent.
1029
- tree.emplace_back (parent_stack.back (), entered);
967
+ if (!tree.empty ()) {
968
+ // Save the old tree and start a new one.
969
+ // We need to cut off from.offset() from the root, unless we would cut off the whole root.
970
+ // In that case, the GBWT DFS will have skipped the empty root entirely, so we cut off nothing.
971
+ to_return.emplace_back (&gbwt_graph, std::move (tree), start_included ? from.offset () : 0 );
972
+ tree.clear ();
1030
973
}
1031
974
1032
- // Record the parent index
1033
- parent_stack.push_back (tree.size () - 1 );
1034
- }, [&]() {
1035
- // Exit the last visited handle. Pop off the stack.
1036
- parent_stack.pop_back ();
1037
- });
1038
-
1039
- if (!tree.empty ()) {
1040
- // Now save the last tree
1041
- to_return[extension_number].emplace_back (&gbwt_graph, std::move (tree), start_included ? from.offset () : 0 );
1042
- tree.clear ();
975
+ // Add this to the tree with no parent
976
+ tree.emplace_back (-1 , entered);
977
+ } else {
978
+ // Just say this is visitable from our parent.
979
+ tree.emplace_back (parent_stack.back (), entered);
1043
980
}
1044
981
982
+ // Record the parent index
983
+ parent_stack.push_back (tree.size () - 1 );
984
+ }, [&]() {
985
+ // Exit the last visited handle. Pop off the stack.
986
+ parent_stack.pop_back ();
987
+ });
988
+
989
+ if (!tree.empty ()) {
990
+ // Now save the last tree
991
+ to_return.emplace_back (&gbwt_graph, std::move (tree), start_included ? from.offset () : 0 );
992
+ tree.clear ();
993
+ }
994
+
1045
995
#ifdef debug
1046
- if (to_return.count (extension_number)) {
1047
- cerr << " Found " << to_return[extension_number].size () << " trees" << endl;
1048
- } else {
1049
- cerr << " Found no trees in no forest" << endl;
1050
- }
996
+ cerr << " Found " << to_return.size () << " trees" << endl;
1051
997
#endif
1052
- }
1053
998
1054
999
// Now we have all the trees!
1055
1000
return to_return;
0 commit comments