Skip to content

Commit 4f4df5d

Browse files
authored
Merge pull request #58 from marbl/prefix-filter
MashMap v3.0.6
2 parents c6978dd + b440e04 commit 4f4df5d

File tree

8 files changed

+329
-88
lines changed

8 files changed

+329
-88
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ endif ()
3333
if (${CMAKE_BUILD_TYPE} MATCHES Debug)
3434
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O -g")
3535
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O -g")
36+
add_compile_options(-fsanitize=address)
37+
add_link_options(-fsanitize=address)
3638
else()
3739
# Use all standard-compliant optimizations - always add these:
3840
set (CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${PIC_FLAG} ${EXTRA_FLAGS}")

src/common/progress.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <cmath>
34
#include <iostream>
45
#include <string>
56
#include <atomic>

src/common/utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include <cstdint>
34
#include <cmath>
45
#include <string>
56

src/map/include/base_types.hpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <tuple>
1111
#include <vector>
1212
#include <chrono>
13+
#include "common/progress.hpp"
1314

1415
namespace skch
1516
{
@@ -165,6 +166,7 @@ namespace skch
165166

166167
//--for split read mapping
167168

169+
long double kmerComplexity; // Estimated sequence complexity
168170
int n_merged; // how many mappings we've merged into this one
169171
offset_t splitMappingId; // To identify split mappings that are chained
170172
uint8_t discard; // set to 1 for deletion
@@ -204,10 +206,11 @@ namespace skch
204206
//Container to save copy of kseq object
205207
struct InputSeqContainer
206208
{
207-
seqno_t seqCounter; //sequence counter
208-
offset_t len; //sequence length
209-
std::string seq; //sequence string
210-
std::string seqName; //sequence id
209+
seqno_t seqCounter; //sequence counter
210+
offset_t len; //sequence length
211+
std::string seq; //sequence string
212+
std::string seqName; //sequence id
213+
211214

212215
/*
213216
* @brief constructor
@@ -216,12 +219,30 @@ namespace skch
216219
* @param[in] len length of sequence
217220
*/
218221
InputSeqContainer(const std::string& s, const std::string& id, seqno_t seqcount)
219-
: seq(s)
220-
, seqName(id)
222+
: seqCounter(seqcount)
221223
, len(s.length())
222-
, seqCounter(seqcount) { }
224+
, seq(s)
225+
, seqName(id) { }
226+
};
227+
228+
struct InputSeqProgContainer : InputSeqContainer
229+
{
230+
using InputSeqContainer::InputSeqContainer;
231+
progress_meter::ProgressMeter& progress; //progress meter (shared)
232+
233+
234+
/*
235+
* @brief constructor
236+
* @param[in] kseq_seq complete read or reference sequence
237+
* @param[in] kseq_id sequence id name
238+
* @param[in] len length of sequence
239+
*/
240+
InputSeqProgContainer(const std::string& s, const std::string& id, seqno_t seqcount, progress_meter::ProgressMeter& pm)
241+
: InputSeqContainer(s, id, seqcount)
242+
, progress(pm) { }
223243
};
224244

245+
225246
//Output type of map function
226247
struct MapModuleOutput
227248
{
@@ -248,6 +269,8 @@ namespace skch
248269
std::string seqName; //sequence name
249270
MinmerVec minmerTableQuery; //Vector of minmers in the query
250271
MinmerVec seedHits; //Vector of minmers in the reference
272+
int refGroup; //Prefix group of sequence
273+
float kmerComplexity; //Estimated sequence complexity
251274
};
252275
}
253276

src/map/include/commonFunc.hpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,25 @@ namespace skch {
186186
ankerl::unordered_dense::map<hash_t, MinmerInfo> sketched_vals;
187187
std::vector<hash_t> sketched_heap;
188188
sketched_heap.reserve(sketchSize+1);
189+
190+
// Get distance until last "N"
191+
int ambig_kmer_count = 0;
192+
for (int i = kmerSize - 1; i >= 0; i--)
193+
{
194+
if (seq[i] == 'N')
195+
{
196+
ambig_kmer_count = i+1;
197+
break;
198+
}
199+
}
189200

190201
for(offset_t i = 0; i < len - kmerSize + 1; i++)
191202
{
203+
204+
if (seq[i+kmerSize-1] == 'N')
205+
{
206+
ambig_kmer_count = kmerSize;
207+
}
192208
//Hash kmers
193209
hash_t hashFwd = CommonFunc::getHash(seq + i, kmerSize);
194210
hash_t hashBwd;
@@ -199,7 +215,7 @@ namespace skch {
199215
hashBwd = std::numeric_limits<hash_t>::max(); //Pick a dummy high value so that it is ignored later
200216

201217
//Consider non-symmetric kmers only
202-
if(hashBwd != hashFwd)
218+
if(hashBwd != hashFwd && ambig_kmer_count == 0)
203219
{
204220
//Take minimum value of kmer and its reverse complement
205221
hash_t currentKmer = std::min(hashFwd, hashBwd);
@@ -237,6 +253,10 @@ namespace skch {
237253
}
238254
}
239255
}
256+
if (ambig_kmer_count > 0)
257+
{
258+
ambig_kmer_count--;
259+
}
240260
}
241261

242262
minmerIndex.resize(sketched_heap.size());
@@ -293,6 +313,10 @@ namespace skch {
293313

294314
//if(alphabetSize == 4) //not protein
295315
//CommonFunc::reverseComplement(seq, seqRev.get(), len);
316+
317+
// Get distance until last "N"
318+
int ambig_kmer_count = 0;
319+
296320

297321
for(offset_t i = 0; i < len - kmerSize + 1; i++)
298322
{
@@ -369,8 +393,12 @@ namespace skch {
369393
Q.pop_front();
370394
}
371395

396+
if (seq[i+kmerSize-1] == 'N')
397+
{
398+
ambig_kmer_count = kmerSize;
399+
}
372400
//Consider non-symmetric kmers only
373-
if(hashBwd != hashFwd)
401+
if(hashBwd != hashFwd && ambig_kmer_count == 0)
374402
{
375403
// Add current hash to window
376404
Q.push_back(std::make_tuple(currentKmer, currentStrand, i));
@@ -399,6 +427,11 @@ namespace skch {
399427
std::push_heap(heapWindow.begin(), heapWindow.end(), KIHeap_cmp);
400428
}
401429
}
430+
if (ambig_kmer_count > 0)
431+
{
432+
ambig_kmer_count--;
433+
}
434+
402435

403436

404437

0 commit comments

Comments
 (0)