Skip to content

Commit a20e09b

Browse files
committed
Stats
1 parent 0f33345 commit a20e09b

File tree

2 files changed

+101
-95
lines changed

2 files changed

+101
-95
lines changed

Diff for: README.md

+3-88
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# SEDEF: SEgmental Duplication Evaluation Framework
22

3+
# UNDER CONSTRUCTION
4+
35
## Compile
46

57
Simple! Use
@@ -86,7 +88,7 @@ wc -l out.*bed
8688
>> 1656305 out/out.bed
8789
>> 1558896 out/out.init.bed
8890
>> 231472 out/out.final.bed
89-
>> 71204 out.hg19.bed
91+
>> 67467 out.hg19.bed
9092
```
9193

9294
Then analyse:
@@ -101,90 +103,3 @@ done
101103
>> 0m 58s (7.5G)
102104
```
103105

104-
105-
Misses & partials:
106-
107-
+-------------------+------------+-----+----------------+-------+
108-
+ Stage + Misses + Kbp + Partial misses + Kbp +
109-
+-------------------+------------+-----+----------------+-------+
110-
+ Seed + 167 (0.7%) + 652 + 1,348 (5.5%) + 6,163 +
111-
+ Merge & Extend + 133 (0.5%) + 505 + 73 (0.3%) + 384 +
112-
+ Chain + 135 (0.6%) + 515 + 215 (0.9%) + 1,536 +
113-
+-------------------+------------+-----+----------------+-------+
114-
115-
116-
### Mouse
117-
118-
```bash
119-
for i in `seq 1 19` X Y; do
120-
for j in `seq 1 19` X Y; do
121-
SI=`awk '$1=="chr'$i'" {print $2}' mm8.fa.fai`;
122-
SJ=`awk '$1=="chr'$j'" {print $2}' mm8.fa.fai`;
123-
if [ "$SI" -le "$SJ" ] ; then
124-
for m in y n ; do
125-
echo "~/mesa ./sedef search single mm8.fa chr$i chr$j $m >mouse_out/${i}_${j}_${m}.bed 2>mouse_out/log/${i}_${j}_${m}.log"
126-
done;
127-
fi
128-
done
129-
done | time parallel --will-cite -j 80 --eta
130-
>> 12m 17s
131-
grep Total mouse_out/log/*.log | wc -l
132-
>> 462
133-
grep Wall mouse_out/log/*.log | tr -d '(' | awk '{s+=$4}END{print s}'
134-
>> 46962.9s (13.05 h)
135-
~/mesa ./sedef align bucket mouse_out mouse_out/bins 1000
136-
>>
137-
for j in mouse_out/bins/bucket_???? ; do
138-
k=$(basename $j);
139-
echo "~/mesa ./sedef align generate mm8.fa $j 11 >${j}.bed 2>mouse_out/log/bins/${k}.log"
140-
done | time parallel --will-cite -j 80 --eta
141-
>>
142-
grep Finished mouse_out/log/bins/*.log | wc -l
143-
>> 1000
144-
grep Wall mouse_out/log/bins/*.log | tr -d '(' | awk '{s+=$4}END{print s}'
145-
>> 236095 (65.58 h)
146-
cat mouse_out/bins/bucket_???? > mouse_out.init.bed
147-
cat mouse_out/bins/*.bed > mouse_out.final.bed
148-
wc -l mouse_out.*bed
149-
>> 975511 mouse_out.final.bed
150-
>> 1656305 mouse_out.init.bed
151-
152-
# Merge logs and remove progress bars and headers
153-
154-
for S in G S X Y ; do
155-
for i in ${S}_*.e*; do grep -v '%' $i | tail -n+4 ; done > ${S}.log
156-
cat ${S}_*.o* > ${S}.bed
157-
rm -rf ${S}*.[eo]*
158-
done
159-
160-
for i in G X Y S ; do
161-
mkdir -p bins/${i}
162-
sedef/sedef align bucket ${i}.bed bins/${i} 2000
163-
done
164-
165-
for i in G X Y S ; do
166-
for j in bins/${i}/* ; do
167-
k=$(basename $j);
168-
echo "qsub -cwd -V -b y -N \"${i}_${k}\" -l h_vmem=10G -l h_rt=24:00:00 -l h_stack=8M " \
169-
"python2.7 mesa sedef/sedef align generate fasta/hg19.fa $j"
170-
done
171-
done
172-
173-
for S in G X Y S ; do
174-
for i in ${S}_*.e*; do grep -v '\.\.' $i | tail -n+4 ; done > ${S}.align.log
175-
cat ${S}_*.o* > ${S}.align.bed
176-
rm -rf ${S}_*.[eo]*
177-
done
178-
179-
# Count stuff
180-
for S in G S X Y ; do
181-
SEA=`cat ${S}.log | grep 'Wall' | awk '{print $4}' | tr -d '[()]' | awk '{s+=$1} END{print s}'`
182-
ALN=`cat ${S}.align.log | grep 'Wall' | awk '{print $4}' | tr -d '[()]' | awk '{s+=$1} END{print s}'`
183-
printf "%s: %5.1f %5.1f %5.1f\n" $S $((SEA/3600)) $((ALN/3600)) $(((SEA+ALN)/3600))
184-
done
185-
186-
#mkdir -p output/search
187-
#zmv '(sedefrun_*).o*' 'output/search/$1.bed'
188-
```
189-
190-

Diff for: src/stats_main.cc

+98-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include <vector>
99
#include <algorithm>
1010
#include <chrono>
11+
#include <bitset>
12+
#include <unordered_set>
1113

1214
#include "align.h"
1315
#include "common.h"
@@ -164,13 +166,102 @@ void stats(const string &ref_path, const string &bed_path)
164166

165167
/******************************************************************************/
166168

167-
void stats_main(int argc, char **argv)
169+
#include <boost/dynamic_bitset.hpp>
170+
171+
void get_differences()
168172
{
169-
if (argc < 2) {
170-
eprn("invalid usage");
171-
return;
173+
map<string, boost::dynamic_bitset<>> sedef;
174+
map<string, boost::dynamic_bitset<>> wgac;
175+
176+
string s;
177+
ifstream fin("results/out.hg19.bed", ifstream::in);
178+
while (getline(fin, s)) {
179+
string cigar;
180+
Hit h = Hit::from_bed(s, &cigar);
181+
182+
auto c1 = fmt::format("{}{}", h.query->name, "+-"[h.query->is_rc]);
183+
auto c2 = fmt::format("{}{}", h.ref->name, "+-"[h.ref->is_rc]);
184+
if (sedef.find(c1)==sedef.end()) sedef[c1]=boost::dynamic_bitset<>(250000000);
185+
if (sedef.find(c2)==sedef.end()) sedef[c2]=boost::dynamic_bitset<>(250000000);
186+
for (int i = h.query_start; i < h.query_end; i++) sedef[c1].set(i);
187+
for (int i = h.ref_start; i < h.ref_end; i++) sedef[c2].set(i);
188+
}
189+
190+
eprn("sedef done");
191+
192+
ifstream fiw("data/GRCh37GenomicSuperDup.tab");
193+
getline(fiw, s);
194+
unordered_set<string> seen;
195+
while (getline(fiw, s)) {
196+
Hit h = Hit::from_wgac(s);
197+
auto c1 = fmt::format("{}{}", h.query->name, "+-"[h.query->is_rc]);
198+
auto c2 = fmt::format("{}{}", h.ref->name, "+-"[h.ref->is_rc]);
199+
if (c1.size() > 6 || c2.size() > 6)
200+
continue;
201+
202+
if (seen.find(h.name) == seen.end()) {
203+
seen.insert(h.name);
204+
if (wgac.find(c1)==wgac.end()) wgac[c1]=boost::dynamic_bitset<>(250000000);
205+
if (wgac.find(c2)==wgac.end()) wgac[c2]=boost::dynamic_bitset<>(250000000);
206+
for (int i = h.query_start; i < h.query_end; i++) wgac[c1].set(i);
207+
for (int i = h.ref_start; i < h.ref_end; i++) wgac[c2].set(i);
208+
}
209+
}
210+
211+
eprn("wgac done");
212+
213+
FastaReference fr("data/hg19/hg19.fa");
214+
215+
int intersect = 0, wgac_only = 0, wgac_span = 0, sedef_only = 0, sedef_span = 0;
216+
217+
int sedef_extra_upper = 0;
218+
int miss_upper = 0;
219+
220+
for (auto &p: sedef) {
221+
auto &s = p.second;
222+
auto &w = wgac[p.first];
223+
224+
auto seq = fr.get_sequence(p.first.substr(0, p.first.size()-1));
225+
226+
for (int i = 0; i < seq.size(); i++) {
227+
if ((s[i] & (~w[i])) && isupper(seq[i]) && seq[i] != 'N') {
228+
sedef_extra_upper++;
229+
}
230+
if ((w[i] & (~s[i])) && isupper(seq[i]) && seq[i] != 'N') {
231+
miss_upper++;
232+
}
233+
}
234+
235+
intersect += (s & w).count();
236+
wgac_only += (w & (~s)).count();
237+
sedef_only += (s & (~w)).count();
238+
sedef_span += s.count();
239+
wgac_span += w.count();
240+
}
241+
242+
eprn("SEDEF: span {:12n}\n"
243+
" only {:12n}\n"
244+
" on/u {:12n}\n"
245+
" miss {:12n}\n"
246+
" mi/u {:12n}\n"
247+
"WGAC: span {:12n}\n"
248+
" intr {:12n}", sedef_span, sedef_only, sedef_extra_upper, wgac_only, miss_upper, wgac_span, intersect);
249+
}
250+
251+
/******************************************************************************/
252+
253+
void stats_main(int argc, char **argv)
254+
{
255+
if (argc < 3) {
256+
throw fmt::format("Not enough arguments to stats");
257+
}
258+
259+
string command = argv[0];
260+
if (command == "generate") {
261+
stats(argv[1], argv[2]);
262+
} else if (command == "diff") {
263+
get_differences(); //(argv[1], argv[2], atoi(argv[3]));
264+
} else {
265+
throw fmt::format("Unknown stats command");
172266
}
173-
string ref_path = argv[0];
174-
string bed_path = argv[1];
175-
stats(ref_path, bed_path);
176267
}

0 commit comments

Comments
 (0)