|
8 | 8 | #include <vector>
|
9 | 9 | #include <algorithm>
|
10 | 10 | #include <chrono>
|
| 11 | +#include <bitset> |
| 12 | +#include <unordered_set> |
11 | 13 |
|
12 | 14 | #include "align.h"
|
13 | 15 | #include "common.h"
|
@@ -164,13 +166,102 @@ void stats(const string &ref_path, const string &bed_path)
|
164 | 166 |
|
165 | 167 | /******************************************************************************/
|
166 | 168 |
|
167 |
| -void stats_main(int argc, char **argv) |
| 169 | +#include <boost/dynamic_bitset.hpp> |
| 170 | + |
| 171 | +void get_differences() |
168 | 172 | {
|
169 |
| - if (argc < 2) { |
170 |
| - eprn("invalid usage"); |
171 |
| - return; |
| 173 | + map<string, boost::dynamic_bitset<>> sedef; |
| 174 | + map<string, boost::dynamic_bitset<>> wgac; |
| 175 | + |
| 176 | + string s; |
| 177 | + ifstream fin("results/out.hg19.bed", ifstream::in); |
| 178 | + while (getline(fin, s)) { |
| 179 | + string cigar; |
| 180 | + Hit h = Hit::from_bed(s, &cigar); |
| 181 | + |
| 182 | + auto c1 = fmt::format("{}{}", h.query->name, "+-"[h.query->is_rc]); |
| 183 | + auto c2 = fmt::format("{}{}", h.ref->name, "+-"[h.ref->is_rc]); |
| 184 | + if (sedef.find(c1)==sedef.end()) sedef[c1]=boost::dynamic_bitset<>(250000000); |
| 185 | + if (sedef.find(c2)==sedef.end()) sedef[c2]=boost::dynamic_bitset<>(250000000); |
| 186 | + for (int i = h.query_start; i < h.query_end; i++) sedef[c1].set(i); |
| 187 | + for (int i = h.ref_start; i < h.ref_end; i++) sedef[c2].set(i); |
| 188 | + } |
| 189 | + |
| 190 | + eprn("sedef done"); |
| 191 | + |
| 192 | + ifstream fiw("data/GRCh37GenomicSuperDup.tab"); |
| 193 | + getline(fiw, s); |
| 194 | + unordered_set<string> seen; |
| 195 | + while (getline(fiw, s)) { |
| 196 | + Hit h = Hit::from_wgac(s); |
| 197 | + auto c1 = fmt::format("{}{}", h.query->name, "+-"[h.query->is_rc]); |
| 198 | + auto c2 = fmt::format("{}{}", h.ref->name, "+-"[h.ref->is_rc]); |
| 199 | + if (c1.size() > 6 || c2.size() > 6) |
| 200 | + continue; |
| 201 | + |
| 202 | + if (seen.find(h.name) == seen.end()) { |
| 203 | + seen.insert(h.name); |
| 204 | + if (wgac.find(c1)==wgac.end()) wgac[c1]=boost::dynamic_bitset<>(250000000); |
| 205 | + if (wgac.find(c2)==wgac.end()) wgac[c2]=boost::dynamic_bitset<>(250000000); |
| 206 | + for (int i = h.query_start; i < h.query_end; i++) wgac[c1].set(i); |
| 207 | + for (int i = h.ref_start; i < h.ref_end; i++) wgac[c2].set(i); |
| 208 | + } |
| 209 | + } |
| 210 | + |
| 211 | + eprn("wgac done"); |
| 212 | + |
| 213 | + FastaReference fr("data/hg19/hg19.fa"); |
| 214 | + |
| 215 | + int intersect = 0, wgac_only = 0, wgac_span = 0, sedef_only = 0, sedef_span = 0; |
| 216 | + |
| 217 | + int sedef_extra_upper = 0; |
| 218 | + int miss_upper = 0; |
| 219 | + |
| 220 | + for (auto &p: sedef) { |
| 221 | + auto &s = p.second; |
| 222 | + auto &w = wgac[p.first]; |
| 223 | + |
| 224 | + auto seq = fr.get_sequence(p.first.substr(0, p.first.size()-1)); |
| 225 | + |
| 226 | + for (int i = 0; i < seq.size(); i++) { |
| 227 | + if ((s[i] & (~w[i])) && isupper(seq[i]) && seq[i] != 'N') { |
| 228 | + sedef_extra_upper++; |
| 229 | + } |
| 230 | + if ((w[i] & (~s[i])) && isupper(seq[i]) && seq[i] != 'N') { |
| 231 | + miss_upper++; |
| 232 | + } |
| 233 | + } |
| 234 | + |
| 235 | + intersect += (s & w).count(); |
| 236 | + wgac_only += (w & (~s)).count(); |
| 237 | + sedef_only += (s & (~w)).count(); |
| 238 | + sedef_span += s.count(); |
| 239 | + wgac_span += w.count(); |
| 240 | + } |
| 241 | + |
| 242 | + eprn("SEDEF: span {:12n}\n" |
| 243 | + " only {:12n}\n" |
| 244 | + " on/u {:12n}\n" |
| 245 | + " miss {:12n}\n" |
| 246 | + " mi/u {:12n}\n" |
| 247 | + "WGAC: span {:12n}\n" |
| 248 | + " intr {:12n}", sedef_span, sedef_only, sedef_extra_upper, wgac_only, miss_upper, wgac_span, intersect); |
| 249 | +} |
| 250 | + |
| 251 | +/******************************************************************************/ |
| 252 | + |
| 253 | +void stats_main(int argc, char **argv) |
| 254 | +{ |
| 255 | + if (argc < 3) { |
| 256 | + throw fmt::format("Not enough arguments to stats"); |
| 257 | + } |
| 258 | + |
| 259 | + string command = argv[0]; |
| 260 | + if (command == "generate") { |
| 261 | + stats(argv[1], argv[2]); |
| 262 | + } else if (command == "diff") { |
| 263 | + get_differences(); //(argv[1], argv[2], atoi(argv[3])); |
| 264 | + } else { |
| 265 | + throw fmt::format("Unknown stats command"); |
172 | 266 | }
|
173 |
| - string ref_path = argv[0]; |
174 |
| - string bed_path = argv[1]; |
175 |
| - stats(ref_path, bed_path); |
176 | 267 | }
|
0 commit comments