-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.h
141 lines (128 loc) · 4.19 KB
/
parse.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include <vector>
#include <string>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <string_view>
unsigned int chromosome_length = 0;
unsigned int total_ns = 0;
enum class Show { Genomewide,Hide };
std::string getenv_default(const std::string &env, const std::string &default_value) {
const char *value = getenv(env.c_str());
return(value ? value : default_value);
}
std::string SUMMARY = getenv_default("SUMMARY", "0");
std::string GENOMEWIDE = getenv_default("GENOMEWIDE", "0");
bool contains_N(const std::string_view &str) {
return(str.find("N") != std::string::npos || str.find("n") != std::string::npos);
}
class hitCollector {
public:
std::string_view chr;
int start;
int end;
hitCollector(const std::string_view chr_, int start_, int end_) {
chr = chr_;
start = start_;
end = end_;
}
void print(void) {
std::cout << chr << "\t" << start << "\t" << end << "\n";
}
int range(){
return(end-start+1);
}
};
void (hitCollector::*print)(void) = &hitCollector::print; // call member function of class instance
void report(std::vector<hitCollector*> &hc, const std::string_view name, const unsigned int chr_len, Show gw) {
if(!hc.empty()) {
if(SUMMARY == "1") {
int range = 0;
for (auto i: hc) range += i->range();
float ratio = (float(range) / chr_len) * 100;
std::cout << std::setprecision(3) << hc[0]->chr.substr(0,40) << "\t" << chr_len << "\t" << range << "\t" << ratio << "\n";
}
else if(GENOMEWIDE == "1") {
chromosome_length += chr_len;
for (auto i:hc) total_ns += i->range();
if (gw == Show::Genomewide) {
float total = (float(total_ns) / chromosome_length) * 100;
std::cout << "Genome size (bp): " << chromosome_length << "\t" << "N: " << total << " %" << "\n";
}
}
else {
for (auto i: hc) (i->*print)();
}
}
hc.clear();
}
void readSequence(std::istream &input) {
std::vector<hitCollector*> vec;
std::string line, name;
unsigned int position;
unsigned int start_position = 0;
unsigned int end_position = 0;
bool found_start = false;
bool found_end = false;
// loop through nucleotide data
while (std::getline(input, line).good()) {
const char *input_line = line.c_str();
if (input_line[0] == '>') {
// was the last character of the previous sequence an 'N'?
if (found_start == true) {
vec.push_back(new hitCollector(name,start_position,start_position));
found_start = false;
found_end = false;
}
// ouput requested format
report(vec, name, position, Show::Hide);
// preparation for next sequence
name = line.substr(1);
position = 0;
}
else if (input_line[0] == ';') continue;
else {
if (!line.empty()) {
if (!(contains_N(line))) {
position += line.length();
continue;
}
for (int i = 0; i < line.length(); i++) {
const char *currentChar = &line[i];
// the current char is 'N'?
// 78 = 'N' and 110 = 'n'
if (*currentChar == 78 || *currentChar == 110) {
// do we have a start position already?
// if no, this is the start
if (found_start == false) {
start_position = position;
found_start = true;
}
}
else {
// the current char is no 'N' ('n')
// but we already had a start position
if (found_start == true) {
// so this has to be the end of the N stretch
end_position = position - 1;
found_end = true;
}
}
// we do have both, start and end position
// we found the whole stretch -> print and clean up for next hit
if (found_start == true && found_end == true) {
vec.push_back(new hitCollector(name,start_position,end_position));
found_start = false;
found_end = false;
}
position++;
}
}
}
}
// if N is last character of file
if (found_start == true) {
vec.push_back(new hitCollector(name,start_position,start_position));
}
report(vec,name,position,Show::Genomewide);
}