-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStatsManager.hpp
executable file
·415 lines (367 loc) · 12.6 KB
/
StatsManager.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
/*
* StatsManager.hpp
*
* Created on: 2020年5月1日
* Author: fenghe
* Edited by: CRE 20210705
*/
#pragma once
#include <iostream>
#include <map>
#include <vector>
#include "StatsTracker.hpp"
#include <unordered_set>
#include <optional>
#define bam_read_no(_bp) (((((b)->core.flag&BAM_FREAD2) != 0) &&(((b)->core.flag&BAM_FREAD1) == 0)) ? 2 : 1)
char* bam_get_string_tag(bam1_t* _bp, char* tag);
//using function: StatsManager::simpleGetStats to simple get ISIZE
/// this object handles filtration of reads which are:
///
/// 1. not downstream or (if both reads start at same position) not the second in order in the bam file
/// 2. part of a high depth pileup region
///
struct ReadPairDepthFilter {
bool isFilterRead(bam1_t *b);
private:
std::string getKey(const char *initQname, const int initReadNo){
std::string s(initQname); s += std::to_string(initReadNo);
return s;
}
std::string getKey(bam1_t *b){
std::string s((char *)bam_get_qname(b)); s += std::to_string(bam_read_no(b));
return s;
}
static const unsigned maxMateSetSize = 100000;
typedef std::unordered_set<std::string> mateMap_t;
unsigned _posCount = 0;
mateMap_t _goodMates{maxMateSetSize};
int _lastTargetId = 0;
int _lastPos = 0;
};
struct CoreReadFilter {
bool isFilterRead(bam1_t *b);
private:
ReadPairDepthFilter pairFilter;
};
/// \return The BAM record's auxillary RG tag value, or an empty string if no RG tag exists.
inline const char* getReadGroup(bam1_t *b) {
char RG[5] = "RG";
const char *rgStr(bam_get_string_tag(b, RG));
return ((nullptr == rgStr) ? "" : rgStr);
}
/// manage the info structs for each RG
struct TrackerManager {
typedef std::map<StatLabel, StatsTracker> RGMapType;
TrackerManager(bool useRG, const std::string &statsBamFile,
const std::string &defaultStatsFilename) :
_useRG(useRG), _statsBamFile(statsBamFile), _defaultStatsFilename(
defaultStatsFilename) {
if(!_useRG) {//get default Tracker
std::pair<RGMapType::iterator, bool> retval = _rgTracker.insert(
std::make_pair(
StatLabel(_statsBamFile.c_str(), ""),
StatsTracker(_statsBamFile.c_str(), "",
_defaultStatsFilename)));
assert(retval.second);
_defaultTracker = &(retval.first->second);
}
}
StatsTracker& getTracker(bam1_t *b) {
if (!_useRG)
return *_defaultTracker;
const char *readGroup(getReadGroup(b));
StatLabel rgKey(_statsBamFile.c_str(), readGroup, false);
RGMapType::iterator rgIter(_rgTracker.find(rgKey));
if (rgIter == _rgTracker.end()) {
std::pair<RGMapType::iterator, bool> retval;
retval = _rgTracker.insert(
std::make_pair(
StatLabel(_statsBamFile.c_str(), readGroup),
StatsTracker(_statsBamFile.c_str(), readGroup,
_defaultStatsFilename)));
assert(retval.second);
rgIter = retval.first;
}
return rgIter->second;
}
/// check if all read groups have been sufficiently sampled for a slice
/// for each read group, either 100k samples has been collected, or the insert size distrubution has
/// converged
bool isFinishedSlice() {
for (auto &val : _rgTracker) {
if (!val.second.isCheckedOrConverged())
return false;
}
for (RGMapType::value_type &val : _rgTracker) {
val.second.clearChecked();
}
return true;
}
/// test if all read groups have converged or hit other stopping conditions
bool isStopEstimation() {
static const unsigned maxRecordCount(5000000);
for (RGMapType::value_type &val : _rgTracker) {
if (!(val.second.isInsertSizeConverged()
|| (val.second.insertSizeObservations() > maxRecordCount)))
return false;
}
return true;
}
const RGMapType& getMap() {
finalize();
return _rgTracker;
}
bool isFiltered(bam1_t *b)
{
return filter.isFilterRead(b);
}
private:
TrackerManager(const TrackerManager & c)
{
fprintf(stderr,"TrackerManager can`t be copy create!");
exit(1);
}
void finalize() {
if (_isFinalized)
return;
for (RGMapType::value_type &val : _rgTracker) {
val.second.finalize();
}
_isFinalized = true;
}
bool _useRG = false;
bool _isFinalized = false;
RGMapType _rgTracker;
StatsTracker * _defaultTracker = nullptr;
const std::string _statsBamFile;
std::string _defaultStatsFilename;
CoreReadFilter filter;
};
/// \brief Provides something like a map, but with sequential id numbers
/// assigned to each key starting from 0
///
/// The id numbers can be useful for faster lookup of the value, while
/// retaining the option of doing key lookup when required
///
template<typename K, typename V, typename COMPARE = std::less<K>>
struct id_map {
/// \brief Update map with (key,value) and return id
///
unsigned insert(const K &key, const V &value) {
const typename k2id_t::const_iterator i(_k2id.find(key));
if (i == _k2id.end()) {
const unsigned id(_id2kv.size());
_k2id[key] = id;
_id2kv.push_back(std::make_pair(key, value));
return id;
} else {
_id2kv[i->second] = std::make_pair(key, value);
return i->second;
}
}
/// \brief Test if key exists in map
bool test_key(const K &key) const {
return (_k2id.find(key) != _k2id.end());
}
/// \brief Get id of inserted key
std::optional<unsigned> get_optional_id(const K &key) const {
const typename k2id_t::const_iterator i(_k2id.find(key));
if (i == _k2id.end()) {
return std::optional<unsigned>();
}
return std::optional<unsigned>(i->second);
}
/// \brief Get id of inserted key
unsigned get_id(const K &key) const {
const typename k2id_t::const_iterator i(_k2id.find(key));
if (i == _k2id.end()) {
std::cerr << "id_map.get_id(): invalid key" << std::endl;
}
return i->second;
}
/// \brief Get pre-existing key
const K& get_key(const unsigned id) const {
if (id >= _id2kv.size()) {
std::cerr << "idmap.get_key(): invalid id" << std::endl;
}
return _id2kv[id].first;
}
/// \brief Get pre-existing key
const V& get_value(const unsigned id) const {
if (id >= _id2kv.size()) {
std::cerr << "idmap.get_value(): invalid id" << std::endl;
}
return _id2kv[id].second;
}
bool empty() const {
return _id2kv.empty();
}
unsigned size() const {
return _id2kv.size();
}
void clear() {
_k2id.clear();
_id2kv.clear();
}
private:
typedef std::map<K, unsigned, COMPARE> k2id_t;
k2id_t _k2id;
std::vector<std::pair<K, V>> _id2kv;
};
/***************************PART TWO: Output************************************/
/// \brief manages multiple read_group_stats
///
#define default_insert_size_min 0
#define default_insert_size_max 1300
struct StatsManager {
typedef StatLabel KeyType;
private:
id_map<KeyType, UniqueStats> _group;
const char * referenceFilename;
std::string defaultStatsFilename;
public:
StatsManager(const char * referenceFilename_, std::string defaultStatsFilename_):
referenceFilename(referenceFilename_), defaultStatsFilename(defaultStatsFilename_){}
StatsManager(){}
static void simpleGetStats(
const std::string &referenceFilename,
const std::string &alignmentFilename,
FILE* output) {
// calculate fragment size statistics for all read groups in all bams
StatsManager rstats(referenceFilename.c_str(), "");
rstats.handleBamCramStats(alignmentFilename.c_str());
uint32_t minInsertLen = rstats.getInsertLen(alignmentFilename.c_str(), 0.01f);
uint32_t middleInsertLen = rstats.getInsertLen(alignmentFilename.c_str(), 0.5f);
uint32_t maxInsertLen = rstats.getInsertLen(alignmentFilename.c_str(), 0.99f);
fprintf(output, "%d %d %d\n", minInsertLen, middleInsertLen, maxInsertLen);
}
bool empty() const { return _group.empty(); }
unsigned size() const { return _group.size(); }
/// \brief get the index of a read group
///
/// the index can be used for fast lookup of the
/// stats for that group
///
/// if the group does not exist, the returned value
/// evaluates to false per boost::optional
///
/// Each read group is identified as a combination of a bam filename and
/// an RG tag label. An empty label refers to the "default" read group
/// for the file (all records that had no RG tag).
std::optional<unsigned> getGroupIndex(const StatLabel &rgLabel) const { return _group.get_optional_id(rgLabel); }
/// get stats associated with index
const UniqueStats& getStats(const unsigned groupIndex) const { return _group.get_value(groupIndex); }
const KeyType& getKey(const unsigned groupIndex) const { return _group.get_key(groupIndex); }
/// set stats for index
void setStats(const StatLabel &rgLabel, const UniqueStats &rps) { _group.insert(rgLabel, rps); }
unsigned getMinInsertLen(const std::string &bamFilename){
std::optional<unsigned> idx = getGroupIndex(StatLabel(bamFilename.c_str(), ""));
if(idx) return getStats(idx.value()).fragStats.quantile(0.01f);
else {
std::cerr << "Warning: can`t properly get MIN insert_size for " << bamFilename <<
", using default_insert_size_min" << default_insert_size_min << "\n";
return default_insert_size_min;
}
}
unsigned getMaxInsertLen(const std::string &bamFilename){
std::optional<unsigned> idx = getGroupIndex(StatLabel(bamFilename.c_str(), ""));
if(idx) return getStats(idx.value()).fragStats.quantile(0.99f);
else{
std::cerr << "Warning: can`t properly get MAX insert_size for " << bamFilename <<
", using default_insert_size_max" << default_insert_size_max << "\n";
return default_insert_size_max;
}
}
unsigned getInsertLen(const char * bamFilename, float percent){
std::optional<unsigned> idx = getGroupIndex(StatLabel(bamFilename, ""));
if(idx) return getStats(idx.value()).fragStats.quantile(percent);
else{
if(percent < 0.5){
std::cerr << "Warning: can`t properly get isize for " << bamFilename <<
" @[" << percent <<
"], using default isize_min [" << default_insert_size_min << "]\n";
return default_insert_size_min;
}
else{
std::cerr << "Warning: can`t properly get isize for " << bamFilename <<
" @[" << percent <<
"], using default isize_max[" << default_insert_size_max << "]\n";
return default_insert_size_max;
}
}
}
void getBP_Distribution(const std::string &bamFilename, int read_len,
std::vector<float> &DR_bp_distribution, std::vector<float> &SH_bp_distribution,
std::vector<float> &UM_stPos_distribution, int &START_OFFSET_UM){
std::optional<unsigned> idx = getGroupIndex(StatLabel(bamFilename.c_str(), ""));
if(!idx){
std::cerr << "Warning: can`t properly get break point distribution for "
<< bamFilename << "\n";
return;
}
//for discordant /hard clip signals break point distribution
int totalPairedReadCount = getStats(idx.value()).readCounter.totalHighConfidenceReadPairCount();
int max_len = getStats(idx.value()).fragStats.quantile(0.99f);
int max_probability_size = max_len - 2*read_len;
DR_bp_distribution.clear();
const SizeDistribution &fragStats = getStats(idx.value()).fragStats;
if(max_probability_size > 50){
//dis.reserve(max_len);
DR_bp_distribution.resize(max_probability_size, 0);
for(int i = 1; i < max_probability_size; i++){
int index_count = fragStats.getSizeCount(i + 2*read_len);
float PI = (float)index_count/totalPairedReadCount;
float PI_pos = PI/i;
for(int j = 0; j < i; j++ ){
DR_bp_distribution[j] += PI_pos;
}
}
float sum_p = 0;
for(unsigned int i = 0; i < DR_bp_distribution.size(); i ++)
sum_p += DR_bp_distribution[i];
float levelup_rate = 1/sum_p;//sum will be 1
for(unsigned int i = 0; i < DR_bp_distribution.size(); i ++)
DR_bp_distribution[i] *= levelup_rate;
}
else{
max_probability_size = 50;
DR_bp_distribution.resize(max_probability_size, 0.02);
}
//for Soft/hard clip signals break point distribution
int max_probability_size_SH = 10;
SH_bp_distribution.resize(max_probability_size_SH, 0.1);
//for UM start position distribution:
int min_len = getStats(idx.value()).fragStats.quantile(0.03f);
max_len = getStats(idx.value()).fragStats.quantile(0.97f);
int min_um_distribution = min_len - read_len;
int max_um_distribution = max_len - read_len;
int um_distribution_size = max_um_distribution - min_um_distribution;
START_OFFSET_UM = min_um_distribution;
UM_stPos_distribution.resize(um_distribution_size, 0);
for(int i = 0; i < um_distribution_size; i++){
int index_count = fragStats.getSizeCount(i + min_len);
float PI = (float)index_count/totalPairedReadCount;
UM_stPos_distribution[i] = PI;
}
}
/// merge in the contents of another stats set object:
void merge(const StatsManager &rhs);
void save(const char *filename) const;
void load(const char *filename);
bool isEmpty() {
return _group.empty();
}
void handleBamCramStats(const char *alignmentFilename);
struct ChromInfo {
ChromInfo(int32_t chr_ID_, int32_t size_) :
chr_ID(chr_ID_), size(size_) {
}
int32_t chr_ID;
int32_t size;
int32_t highestPos = -1;
};
private:
void clear() {
_group.clear();
}
};