Skip to content

Commit d995cb5

Browse files
authored
Merge pull request #222 from h-2/align_bug
[fix] alignment bug
2 parents 73f370d + 01b7d01 commit d995cb5

9 files changed

+599
-315
lines changed

src/mkindex.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,21 @@ void realMain(LambdaIndexerOptions const & options);
6868
int mkindexMain(int const argc, char const ** argv)
6969
{
7070
LambdaIndexerOptions options;
71+
72+
#ifdef NDEBUG
73+
try
74+
{
75+
parseCommandLine(options, argc, argv);
76+
}
77+
catch (sharg::parser_error const & ext) // catch user errors
78+
{
79+
std::cerr << "\n\nERROR: during command line parsing\n"
80+
<< " \"" << ext.what() << "\"\n";
81+
return -1;
82+
}
83+
#else
7184
parseCommandLine(options, argc, argv);
85+
#endif
7286

7387
#ifdef NDEBUG
7488
try

src/search.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,21 @@ void realMain(LambdaOptions const & options);
7373
int searchMain(int const argc, char const ** argv)
7474
{
7575
LambdaOptions options;
76+
77+
#ifdef NDEBUG
78+
try
79+
{
80+
parseCommandLine(options, argc, argv);
81+
}
82+
catch (sharg::parser_error const & ext) // catch user errors
83+
{
84+
std::cerr << "\n\nERROR: during command line parsing\n"
85+
<< " \"" << ext.what() << "\"\n";
86+
return -1;
87+
}
88+
#else
7689
parseCommandLine(options, argc, argv);
90+
#endif
7791

7892
#ifdef _OPENMP
7993
omp_set_num_threads(options.threads - options.lazyQryFile); // reserve one thread for I/O when lazy-loading

src/search_algo.hpp

Lines changed: 100 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,7 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
860860
return std::tie(m1._n_sId, m1.qStart, m1.qEnd, m1.sStart, m1.sEnd, m1.qFrameShift, m1.sFrameShift) ==
861861
std::tie(m2._n_sId, m2.qStart, m2.qEnd, m2.sStart, m2.sEnd, m2.qFrameShift, m2.sFrameShift);
862862
});
863-
lH.stats.hitsDuplicate += before - record.matches.size();
863+
lH.stats.hitsDuplicate2 += before - record.matches.size();
864864

865865
// sort by evalue before writing
866866
record.matches.sort([](auto const & m1, auto const & m2) { return m1.bitScore > m2.bitScore; });
@@ -873,6 +873,14 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
873873
}
874874
lH.stats.hitsFinal += record.matches.size();
875875

876+
/* count uniq qry-subj-pairs */
877+
lH.uniqSubjIds.clear();
878+
lH.uniqSubjIds.reserve(record.matches.size());
879+
for (auto const & bm : record.matches)
880+
lH.uniqSubjIds.insert(bm._n_sId);
881+
882+
lH.stats.pairs += lH.uniqSubjIds.size();
883+
876884
// compute LCA
877885
if (lH.options.computeLCA)
878886
{
@@ -908,32 +916,25 @@ inline void _writeRecord(TBlastRecord & record, TLocalHolder & lH)
908916
// Function computeBlastMatch()
909917
// --------------------------------------------------------------------------
910918

911-
template <typename TBlastMatch, typename TLocalHolder>
912-
inline void _setupAlignInfix(TBlastMatch & bm, typename TLocalHolder::TMatch const & m, TLocalHolder & lH)
919+
template <typename TLocalHolder>
920+
inline void _widenMatch(Match & m, TLocalHolder const & lH)
913921
{
914-
int64_t startMod = (int64_t)m.subjStart - (int64_t)m.qryStart;
922+
// move sStart as far left as needed to cover the part of query before qryStart
923+
m.subjStart = (m.subjStart < m.qryStart) ? 0 : m.subjStart - m.qryStart;
915924

916-
bm.qEnd = lH.transQrySeqs[m.qryId].size();
917-
decltype(bm.qEnd) band = _bandSize(bm.qEnd);
918-
if (startMod >= 0)
919-
{
920-
bm.sStart = startMod;
921-
bm.qStart = 0;
922-
}
923-
else
924-
{
925-
bm.sStart = 0;
926-
bm.qStart = -startMod;
927-
}
928-
bm.sEnd = std::min<size_t>(bm.sStart + bm.qEnd - bm.qStart + band, lH.gH.transSbjSeqs[m.subjId].size());
925+
/* always align full query independent of hit-region */
926+
m.qryStart = 0;
927+
m.qryEnd = lH.transQrySeqs[m.qryId].size();
929928

930-
if (bm.sStart >= band)
931-
bm.sStart -= band;
932-
else
933-
bm.sStart = 0;
929+
// there is no band in computation but this value extends begin and end of Subj to account for gaps
930+
uint64_t band = _bandSize(lH.transQrySeqs[m.qryId].size());
931+
932+
// end on subject is beginning plus full query length plus band
933+
m.subjEnd =
934+
std::min<size_t>(m.subjStart + lH.transQrySeqs[m.qryId].size() + band, lH.gH.transSbjSeqs[m.subjId].size());
934935

935-
seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
936-
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));
936+
// account for band in subj start
937+
m.subjStart = (band < m.subjStart) ? m.subjStart - band : 0;
937938
}
938939

939940
template <typename TBlastMatch, typename TLocalHolder>
@@ -1133,7 +1134,48 @@ inline void _performAlignment(TDepSetH & depSetH,
11331134
}
11341135

11351136
template <typename TLocalHolder>
1136-
inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bsDirection::fwd)
1137+
inline void _widenAndPreprocessMatches(std::span<Match> & matches, TLocalHolder & lH)
1138+
{
1139+
auto before = matches.size();
1140+
1141+
for (Match & m : matches)
1142+
_widenMatch<TLocalHolder>(m, lH);
1143+
1144+
std::ranges::sort(matches);
1145+
1146+
if (matches.size() > 1)
1147+
{
1148+
// pairwise merge from left to right
1149+
for (auto it = matches.begin(); it < matches.end() - 1; ++it)
1150+
{
1151+
Match & l = *it;
1152+
Match & r = *(it + 1);
1153+
if ((std::tie(l.qryId, l.subjId) == std::tie(r.qryId, r.subjId)) && (l.subjEnd >= r.subjStart))
1154+
{
1155+
l.subjEnd = r.subjEnd;
1156+
r.subjStart = l.subjStart;
1157+
}
1158+
}
1159+
1160+
// pairwise "swallow" from right to left
1161+
for (auto it = matches.rbegin(); it < matches.rend() - 1; ++it)
1162+
{
1163+
Match & r = *it;
1164+
Match & l = *(it + 1);
1165+
if ((std::tie(r.qryId, r.subjId) == std::tie(l.qryId, l.subjId)) && (r.subjStart < l.subjEnd))
1166+
{
1167+
l = r;
1168+
}
1169+
}
1170+
1171+
auto [new_end, old_end] = std::ranges::unique(matches); // move non-uniq to the end
1172+
matches = std::span<Match>{matches.begin(), new_end}; // "resize" of the span
1173+
lH.stats.hitsDuplicate += (before - matches.size());
1174+
}
1175+
}
1176+
1177+
template <typename TLocalHolder>
1178+
inline void iterateMatchesFullSimd(std::span<Match> lambdaMatches, TLocalHolder & lH, bsDirection const dir)
11371179
{
11381180
using TGlobalHolder = typename TLocalHolder::TGlobalHolder;
11391181
using TBlastMatch = typename TLocalHolder::TBlastMatch;
@@ -1143,7 +1185,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
11431185
// statistics
11441186
#ifdef LAMBDA_MICRO_STATS
11451187
++lH.stats.numQueryWithExt;
1146-
lH.stats.numExtScore += seqan::length(lH.matches);
1188+
lH.stats.numExtScore += seqan::length(lambdaMatches);
11471189

11481190
double start = sysTime();
11491191
#endif
@@ -1152,58 +1194,37 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
11521194
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow0>::Type> depSetH;
11531195
seqan::StringSet<typename seqan::Source<typename TLocalHolder::TAlignRow1>::Type> depSetV;
11541196

1155-
// create blast matches
1197+
// pre-sort and filter
1198+
_widenAndPreprocessMatches(lambdaMatches, lH);
1199+
1200+
// create blast matches from Lambda matches
11561201
std::list<TBlastMatch> blastMatches;
1157-
for (auto it = lH.matches.begin(), itEnd = lH.matches.end(); it != itEnd; ++it)
1202+
for (Match const & m : lambdaMatches)
11581203
{
1159-
// In BS-mode, skip those results that have wrong orientation
1160-
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
1161-
{
1162-
if ((dir == bsDirection::fwd && (it->subjId % 2)) || (dir == bsDirection::rev && !(it->subjId % 2)))
1163-
continue;
1164-
}
11651204
// create blastmatch in list without copy or move
1166-
blastMatches.emplace_back(lH.qryIds[it->qryId / TGlobalHolder::qryNumFrames],
1167-
const_gH.indexFile.ids[it->subjId / TGlobalHolder::sbjNumFrames]);
1205+
blastMatches.emplace_back(lH.qryIds[m.qryId / TGlobalHolder::qryNumFrames],
1206+
const_gH.indexFile.ids[m.subjId / TGlobalHolder::sbjNumFrames]);
11681207

11691208
TBlastMatch & bm = blastMatches.back();
11701209

1171-
bm._n_qId = it->qryId / TGlobalHolder::qryNumFrames;
1172-
bm._n_sId = it->subjId / TGlobalHolder::sbjNumFrames;
1210+
bm._n_qId = m.qryId / TGlobalHolder::qryNumFrames;
1211+
bm._n_sId = m.subjId / TGlobalHolder::sbjNumFrames;
11731212

1174-
bm.qLength = //std::ranges::size(lH.transQrySeqs[it->qryId]);
1175-
std::ranges::size(lH.qrySeqs[bm._n_qId]);
1213+
bm.qLength = std::ranges::size(lH.qrySeqs[bm._n_qId]);
1214+
bm.sLength = std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);
11761215

1177-
bm.sLength = // std::ranges::size(lH.gH.transSbjSeqs[it->subjId]);
1178-
std::ranges::size(lH.gH.indexFile.seqs[bm._n_sId]);
1216+
bm.qStart = m.qryStart;
1217+
bm.qEnd = m.qryEnd;
1218+
bm.sStart = m.subjStart;
1219+
bm.sEnd = m.subjEnd;
1220+
seqan::assignSource(bm.alignRow0, lH.transQrySeqs[m.qryId] | bio::views::slice(bm.qStart, bm.qEnd));
1221+
seqan::assignSource(bm.alignRow1, lH.gH.transSbjSeqs[m.subjId] | bio::views::slice(bm.sStart, bm.sEnd));
11791222

1180-
_setupAlignInfix(bm, *it, lH);
1181-
1182-
_setFrames(bm, *it, lH);
1223+
_setFrames(bm, m, lH);
11831224

11841225
if (lH.options.hasSTaxIds)
11851226
bm.sTaxIds = lH.gH.indexFile.sTaxIds[bm._n_sId];
11861227
}
1187-
#ifdef LAMBDA_MICRO_STATS
1188-
lH.stats.timeExtend += sysTime() - start;
1189-
1190-
// filter out duplicates
1191-
start = sysTime();
1192-
#endif
1193-
auto before = seqan::length(blastMatches);
1194-
blastMatches.sort(
1195-
[](auto const & l, auto const & r)
1196-
{
1197-
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) <
1198-
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
1199-
});
1200-
blastMatches.unique(
1201-
[](auto const & l, auto const & r)
1202-
{
1203-
return std::tie(l._n_qId, l._n_sId, l.sStart, l.sEnd, l.qStart, l.qEnd, l.qFrameShift, l.sFrameShift) ==
1204-
std::tie(r._n_qId, r._n_sId, r.sStart, r.sEnd, r.qStart, r.qEnd, r.qFrameShift, r.sFrameShift);
1205-
});
1206-
lH.stats.hitsDuplicate += (before - seqan::length(blastMatches));
12071228

12081229
// sort by lengths to minimize padding in SIMD
12091230
blastMatches.sort(
@@ -1217,6 +1238,7 @@ inline void iterateMatchesFullSimd(TLocalHolder & lH, bsDirection const dir = bs
12171238

12181239
start = sysTime();
12191240
#endif
1241+
12201242
// fill batches
12211243
_setupDepSets(depSetH, depSetV, blastMatches);
12221244

@@ -1342,12 +1364,24 @@ inline void writeRecords(TLocalHolder & lH)
13421364
template <typename TLocalHolder>
13431365
inline void iterateMatches(TLocalHolder & lH)
13441366
{
1345-
iterateMatchesFullSimd(lH, bsDirection::fwd);
13461367
if constexpr (TLocalHolder::TGlobalHolder::c_redAlph == AlphabetEnum::DNA3BS)
13471368
{
1348-
iterateMatchesFullSimd(lH, bsDirection::rev);
1369+
std::ranges::sort(lH.matches,
1370+
[](Match const & l, Match const & r) {
1371+
return std::tuple<bool, Match const &>{l.subjId % 2, l} <
1372+
std::tuple<bool, Match const &>{r.subjId % 2, r};
1373+
});
1374+
1375+
auto it = std::ranges::find_if(lH.matches, [](Match const & m) { return m.subjId % 2; });
1376+
1377+
iterateMatchesFullSimd(std::span{lH.matches.begin(), it}, lH, bsDirection::fwd);
1378+
iterateMatchesFullSimd(std::span{it, lH.matches.end()}, lH, bsDirection::rev);
13491379
lH.blastMatches.sort([](auto const & lhs, auto const & rhs) { return lhs._n_qId < rhs._n_qId; });
13501380
}
1381+
else
1382+
{
1383+
iterateMatchesFullSimd(lH.matches, lH, bsDirection::fwd);
1384+
}
13511385
}
13521386

13531387
//-----------------------------------------------------------------------

src/search_datastructures.hpp

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,16 +57,7 @@ struct Match
5757
TPos subjStart;
5858
TPos subjEnd;
5959

60-
inline bool operator==(Match const & m2) const
61-
{
62-
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) ==
63-
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
64-
}
65-
inline bool operator<(Match const & m2) const
66-
{
67-
return std::tie(qryId, subjId, qryStart, subjStart, qryEnd, subjEnd) <
68-
std::tie(m2.qryId, m2.subjId, m2.qryStart, m2.subjStart, m2.qryEnd, m2.subjEnd);
69-
}
60+
constexpr friend auto operator<=>(Match const &, Match const &) = default;
7061
};
7162

7263
// template <typename TAlph>
@@ -117,10 +108,12 @@ struct StatsHolder
117108
uint64_t hitsFailedExtendEValueTest;
118109
uint64_t hitsAbundant;
119110
uint64_t hitsDuplicate;
111+
uint64_t hitsDuplicate2;
120112

121113
// final
122114
uint64_t hitsFinal;
123115
uint64_t qrysWithHit;
116+
uint64_t pairs;
124117

125118
#ifdef LAMBDA_MICRO_STATS
126119
// times
@@ -155,9 +148,11 @@ struct StatsHolder
155148
hitsFailedExtendEValueTest = 0;
156149
hitsAbundant = 0;
157150
hitsDuplicate = 0;
151+
hitsDuplicate2 = 0;
158152

159153
hitsFinal = 0;
160154
qrysWithHit = 0;
155+
pairs = 0;
161156

162157
#ifdef LAMBDA_MICRO_STATS
163158
seedLengths.clear();
@@ -187,9 +182,11 @@ struct StatsHolder
187182
hitsFailedExtendEValueTest += rhs.hitsFailedExtendEValueTest;
188183
hitsAbundant += rhs.hitsAbundant;
189184
hitsDuplicate += rhs.hitsDuplicate;
185+
hitsDuplicate2 += rhs.hitsDuplicate2;
190186

191187
hitsFinal += rhs.hitsFinal;
192188
qrysWithHit += rhs.qrysWithHit;
189+
pairs += rhs.pairs;
193190

194191
#ifdef LAMBDA_MICRO_STATS
195192
seqan::append(seedLengths, rhs.seedLengths);
@@ -248,6 +245,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
248245
std::cout << "\n - failed %-identity test " << R << stats.hitsFailedExtendPercentIdentTest << RR
249246
<< (rem -= stats.hitsFailedExtendPercentIdentTest);
250247
std::cout << "\n - duplicates " << R << stats.hitsDuplicate << RR << (rem -= stats.hitsDuplicate);
248+
std::cout << "\n - late duplicates " << R << stats.hitsDuplicate2 << RR
249+
<< (rem -= stats.hitsDuplicate2);
251250
std::cout << "\n - abundant " << R << stats.hitsAbundant << "\033[1m" << RR
252251
<< (rem -= stats.hitsAbundant) << "\033[0m\n\n";
253252

@@ -289,7 +288,8 @@ void printStats(StatsHolder const & stats, LambdaOptions const & options)
289288
if (options.verbosity >= 1)
290289
{
291290
auto const w = seqan::_numberOfDigits(stats.hitsFinal);
292-
std::cout << "Number of valid hits: " << std::setw(w) << stats.hitsFinal
291+
std::cout << "Number of total hits: " << std::setw(w) << stats.hitsFinal
292+
<< "\nNumber of Query-Subject pairs: " << std::setw(w) << stats.pairs
293293
<< "\nNumber of Queries with at least one valid hit: " << std::setw(w) << stats.qrysWithHit << "\n";
294294
}
295295
}
@@ -481,7 +481,8 @@ class LocalDataHolder
481481
std::vector<std::string>, // not used
482482
std::string_view,
483483
uint32_t>;
484-
std::list<TBlastMatch> blastMatches;
484+
std::list<TBlastMatch> blastMatches;
485+
std::unordered_set<uint64_t> uniqSubjIds;
485486

486487
// regarding the gathering of stats
487488
StatsHolder stats{};

src/search_misc.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@ struct QueryException : public std::runtime_error
4343
// Alignment-related
4444
// ============================================================================
4545

46-
inline int _bandSize(uint64_t const seqLength)
46+
inline int64_t _bandSize(uint64_t const seqLength)
4747
{
4848
// Currently only used for padding of the subject sequences
49-
return 63ull - std::countl_zero(std::bit_ceil(seqLength));
49+
return static_cast<int64_t>(std::sqrt(seqLength)) + 1;
5050
}
5151

5252
// ----------------------------------------------------------------------------

0 commit comments

Comments
 (0)