@@ -860,7 +860,7 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
860
860
return std::tie (m1._n_sId , m1.qStart , m1.qEnd , m1.sStart , m1.sEnd , m1.qFrameShift , m1.sFrameShift ) ==
861
861
std::tie (m2._n_sId , m2.qStart , m2.qEnd , m2.sStart , m2.sEnd , m2.qFrameShift , m2.sFrameShift );
862
862
});
863
- lH.stats .hitsDuplicate += before - record.matches .size ();
863
+ lH.stats .hitsDuplicate2 += before - record.matches .size ();
864
864
865
865
// sort by evalue before writing
866
866
record.matches .sort ([](auto const & m1, auto const & m2) { return m1.bitScore > m2.bitScore ; });
@@ -873,6 +873,14 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
873
873
}
874
874
lH.stats .hitsFinal += record.matches .size ();
875
875
876
+ /* count uniq qry-subj-pairs */
877
+ lH.uniqSubjIds .clear ();
878
+ lH.uniqSubjIds .reserve (record.matches .size ());
879
+ for (auto const & bm : record.matches )
880
+ lH.uniqSubjIds .insert (bm._n_sId );
881
+
882
+ lH.stats .pairs += lH.uniqSubjIds .size ();
883
+
876
884
// compute LCA
877
885
if (lH.options .computeLCA )
878
886
{
@@ -908,32 +916,25 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
908
916
// Function computeBlastMatch()
909
917
// --------------------------------------------------------------------------
910
918
911
- template <typename TBlastMatch, typename TLocalHolder>
912
- inline void _setupAlignInfix (TBlastMatch & bm, typename TLocalHolder::TMatch const & m, TLocalHolder & lH)
919
+ template <typename TLocalHolder>
920
+ inline void _widenMatch (Match & m, TLocalHolder const & lH)
913
921
{
914
- int64_t startMod = (int64_t )m.subjStart - (int64_t )m.qryStart ;
922
+ // move sStart as far left as needed to cover the part of query before qryStart
923
+ m.subjStart = (m.subjStart < m.qryStart ) ? 0 : m.subjStart - m.qryStart ;
915
924
916
- bm.qEnd = lH.transQrySeqs [m.qryId ].size ();
917
- decltype (bm.qEnd ) band = _bandSize (bm.qEnd );
918
- if (startMod >= 0 )
919
- {
920
- bm.sStart = startMod;
921
- bm.qStart = 0 ;
922
- }
923
- else
924
- {
925
- bm.sStart = 0 ;
926
- bm.qStart = -startMod;
927
- }
928
- bm.sEnd = std::min<size_t >(bm.sStart + bm.qEnd - bm.qStart + band, lH.gH .transSbjSeqs [m.subjId ].size ());
925
+ /* always align full query independent of hit-region */
926
+ m.qryStart = 0 ;
927
+ m.qryEnd = lH.transQrySeqs [m.qryId ].size ();
929
928
930
- if (bm.sStart >= band)
931
- bm.sStart -= band;
932
- else
933
- bm.sStart = 0 ;
929
+ // there is no band in computation but this value extends begin and end of Subj to account for gaps
930
+ uint64_t band = _bandSize (lH.transQrySeqs [m.qryId ].size ());
931
+
932
+ // end on subject is beginning plus full query length plus band
933
+ m.subjEnd =
934
+ std::min<size_t >(m.subjStart + lH.transQrySeqs [m.qryId ].size () + band, lH.gH .transSbjSeqs [m.subjId ].size ());
934
935
935
- seqan::assignSource (bm. alignRow0 , lH. transQrySeqs [m. qryId ] | bio::views::slice (bm. qStart , bm. qEnd ));
936
- seqan::assignSource (bm. alignRow1 , lH. gH . transSbjSeqs [m. subjId ] | bio::views::slice (bm. sStart , bm. sEnd )) ;
936
+ // account for band in subj start
937
+ m. subjStart = (band < m. subjStart ) ? m. subjStart - band : 0 ;
937
938
}
938
939
939
940
template <typename TBlastMatch, typename TLocalHolder>
@@ -1133,7 +1134,48 @@ inline void _performAlignment(TDepSetH & depSetH,
1133
1134
}
1134
1135
1135
1136
template <typename TLocalHolder>
1136
- inline void iterateMatchesFullSimd (TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
1137
+ inline void _widenAndPreprocessMatches (std::span<Match> & matches, TLocalHolder & lH)
1138
+ {
1139
+ auto before = matches.size ();
1140
+
1141
+ for (Match & m : matches)
1142
+ _widenMatch<TLocalHolder>(m, lH);
1143
+
1144
+ std::ranges::sort (matches);
1145
+
1146
+ if (matches.size () > 1 )
1147
+ {
1148
+ // pairwise merge from left to right
1149
+ for (auto it = matches.begin (); it < matches.end () - 1 ; ++it)
1150
+ {
1151
+ Match & l = *it;
1152
+ Match & r = *(it + 1 );
1153
+ if ((std::tie (l.qryId , l.subjId ) == std::tie (r.qryId , r.subjId )) && (l.subjEnd >= r.subjStart ))
1154
+ {
1155
+ l.subjEnd = r.subjEnd ;
1156
+ r.subjStart = l.subjStart ;
1157
+ }
1158
+ }
1159
+
1160
+ // pairwise "swallow" from right to left
1161
+ for (auto it = matches.rbegin (); it < matches.rend () - 1 ; ++it)
1162
+ {
1163
+ Match & r = *it;
1164
+ Match & l = *(it + 1 );
1165
+ if ((std::tie (r.qryId , r.subjId ) == std::tie (l.qryId , l.subjId )) && (r.subjStart < l.subjEnd ))
1166
+ {
1167
+ l = r;
1168
+ }
1169
+ }
1170
+
1171
+ auto [new_end, old_end] = std::ranges::unique (matches); // move non-uniq to the end
1172
+ matches = std::span<Match>{matches.begin (), new_end}; // "resize" of the span
1173
+ lH.stats .hitsDuplicate += (before - matches.size ());
1174
+ }
1175
+ }
1176
+
1177
+ template <typename TLocalHolder>
1178
+ inline void iterateMatchesFullSimd (std::span<Match> lambdaMatches, TLocalHolder & lH, bsDirection const dir)
1137
1179
{
1138
1180
using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
1139
1181
using TBlastMatch = typename TLocalHolder::TBlastMatch;
@@ -1143,7 +1185,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
1143
1185
// statistics
1144
1186
#ifdef LAMBDA_MICRO_STATS
1145
1187
++lH.stats .numQueryWithExt ;
1146
- lH.stats .numExtScore += seqan::length (lH. matches );
1188
+ lH.stats .numExtScore += seqan::length (lambdaMatches );
1147
1189
1148
1190
double start = sysTime ();
1149
1191
#endif
@@ -1152,58 +1194,37 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
1152
1194
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow0>::Type> depSetH;
1153
1195
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow1>::Type> depSetV;
1154
1196
1155
- // create blast matches
1197
+ // pre-sort and filter
1198
+ _widenAndPreprocessMatches (lambdaMatches, lH);
1199
+
1200
+ // create blast matches from Lambda matches
1156
1201
std::list<TBlastMatch> blastMatches;
1157
- for (auto it = lH. matches . begin (), itEnd = lH. matches . end (); it != itEnd; ++it )
1202
+ for (Match const & m : lambdaMatches )
1158
1203
{
1159
- // In BS-mode, skip those results that have wrong orientation
1160
- if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
1161
- {
1162
- if ((dir == bsDirection::fwd && (it->subjId % 2 )) || (dir == bsDirection::rev && !(it->subjId % 2 )))
1163
- continue ;
1164
- }
1165
1204
// create blastmatch in list without copy or move
1166
- blastMatches.emplace_back (lH.qryIds [it-> qryId / TGlobalHolder::qryNumFrames],
1167
- const_gH.indexFile .ids [it-> subjId / TGlobalHolder::sbjNumFrames]);
1205
+ blastMatches.emplace_back (lH.qryIds [m. qryId / TGlobalHolder::qryNumFrames],
1206
+ const_gH.indexFile .ids [m. subjId / TGlobalHolder::sbjNumFrames]);
1168
1207
1169
1208
TBlastMatch & bm = blastMatches.back ();
1170
1209
1171
- bm._n_qId = it-> qryId / TGlobalHolder::qryNumFrames;
1172
- bm._n_sId = it-> subjId / TGlobalHolder::sbjNumFrames;
1210
+ bm._n_qId = m. qryId / TGlobalHolder::qryNumFrames;
1211
+ bm._n_sId = m. subjId / TGlobalHolder::sbjNumFrames;
1173
1212
1174
- bm.qLength = // std::ranges::size(lH.transQrySeqs[it->qryId ]);
1175
- std::ranges::size (lH.qrySeqs [bm._n_qId ]);
1213
+ bm.qLength = std::ranges::size (lH.qrySeqs [bm. _n_qId ]);
1214
+ bm. sLength = std::ranges::size (lH.gH . indexFile . seqs [bm._n_sId ]);
1176
1215
1177
- bm.sLength = // std::ranges::size(lH.gH.transSbjSeqs[it->subjId]);
1178
- std::ranges::size (lH.gH .indexFile .seqs [bm._n_sId ]);
1216
+ bm.qStart = m.qryStart ;
1217
+ bm.qEnd = m.qryEnd ;
1218
+ bm.sStart = m.subjStart ;
1219
+ bm.sEnd = m.subjEnd ;
1220
+ seqan::assignSource (bm.alignRow0 , lH.transQrySeqs [m.qryId ] | bio::views::slice (bm.qStart , bm.qEnd ));
1221
+ seqan::assignSource (bm.alignRow1 , lH.gH .transSbjSeqs [m.subjId ] | bio::views::slice (bm.sStart , bm.sEnd ));
1179
1222
1180
- _setupAlignInfix (bm, *it, lH);
1181
-
1182
- _setFrames (bm, *it, lH);
1223
+ _setFrames (bm, m, lH);
1183
1224
1184
1225
if (lH.options .hasSTaxIds )
1185
1226
bm.sTaxIds = lH.gH .indexFile .sTaxIds [bm._n_sId ];
1186
1227
}
1187
- #ifdef LAMBDA_MICRO_STATS
1188
- lH.stats .timeExtend += sysTime () - start;
1189
-
1190
- // filter out duplicates
1191
- start = sysTime ();
1192
- #endif
1193
- auto before = seqan::length (blastMatches);
1194
- blastMatches.sort (
1195
- [](auto const & l, auto const & r)
1196
- {
1197
- return std::tie (l._n_qId , l._n_sId , l.sStart , l.sEnd , l.qStart , l.qEnd , l.qFrameShift , l.sFrameShift ) <
1198
- std::tie (r._n_qId , r._n_sId , r.sStart , r.sEnd , r.qStart , r.qEnd , r.qFrameShift , r.sFrameShift );
1199
- });
1200
- blastMatches.unique (
1201
- [](auto const & l, auto const & r)
1202
- {
1203
- return std::tie (l._n_qId , l._n_sId , l.sStart , l.sEnd , l.qStart , l.qEnd , l.qFrameShift , l.sFrameShift ) ==
1204
- std::tie (r._n_qId , r._n_sId , r.sStart , r.sEnd , r.qStart , r.qEnd , r.qFrameShift , r.sFrameShift );
1205
- });
1206
- lH.stats .hitsDuplicate += (before - seqan::length (blastMatches));
1207
1228
1208
1229
// sort by lengths to minimize padding in SIMD
1209
1230
blastMatches.sort (
@@ -1217,6 +1238,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
1217
1238
1218
1239
start = sysTime ();
1219
1240
#endif
1241
+
1220
1242
// fill batches
1221
1243
_setupDepSets (depSetH, depSetV, blastMatches);
1222
1244
@@ -1342,12 +1364,24 @@ inline void writeRecords(TLocalHolder & lH)
1342
1364
template <typename TLocalHolder>
1343
1365
inline void iterateMatches (TLocalHolder & lH)
1344
1366
{
1345
- iterateMatchesFullSimd (lH, bsDirection::fwd);
1346
1367
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
1347
1368
{
1348
- iterateMatchesFullSimd (lH, bsDirection::rev);
1369
+ std::ranges::sort (lH.matches ,
1370
+ [](Match const & l, Match const & r) {
1371
+ return std::tuple<bool , Match const &>{l.subjId % 2 , l} <
1372
+ std::tuple<bool , Match const &>{r.subjId % 2 , r};
1373
+ });
1374
+
1375
+ auto it = std::ranges::find_if (lH.matches , [](Match const & m) { return m.subjId % 2 ; });
1376
+
1377
+ iterateMatchesFullSimd (std::span{lH.matches .begin (), it}, lH, bsDirection::fwd);
1378
+ iterateMatchesFullSimd (std::span{it, lH.matches .end ()}, lH, bsDirection::rev);
1349
1379
lH.blastMatches .sort ([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId ; });
1350
1380
}
1381
+ else
1382
+ {
1383
+ iterateMatchesFullSimd (lH.matches , lH, bsDirection::fwd);
1384
+ }
1351
1385
}
1352
1386
1353
1387
// -----------------------------------------------------------------------
0 commit comments