Skip to content

Commit 2f80055

Browse files
committed
resolve #6 by chopping blocks with repeat awareness
1 parent e7a01f4 commit 2f80055

File tree

7 files changed

+190
-14
lines changed

7 files changed

+190
-14
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,6 @@
4040
[submodule "deps/libbf"]
4141
path = deps/libbf
4242
url = https://github.com/mavam/libbf.git
43+
[submodule "deps/sautocorr"]
44+
path = deps/sautocorr
45+
url = https://github.com/ekg/sautocorr.git

CMakeLists.txt

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,15 @@ ExternalProject_Add(xoshiro
204204
ExternalProject_Get_property(xoshiro SOURCE_DIR)
205205
set(xoshiro_INCLUDE "${SOURCE_DIR}")
206206

207+
ExternalProject_Add(sautocorr
208+
SOURCE_DIR "${CMAKE_SOURCE_DIR}/deps/sautocorr"
209+
UPDATE_COMMAND ""
210+
INSTALL_COMMAND ""
211+
BUILD_COMMAND ""
212+
CONFIGURE_COMMAND "")
213+
ExternalProject_Get_property(sautocorr SOURCE_DIR)
214+
set(sautocorr_INCLUDE "${SOURCE_DIR}")
215+
207216
add_subdirectory(deps/spoa EXCLUDE_FROM_ALL)
208217
set(spoa_INCLUDE "${CMAKE_SOURCE_DIR}/deps/spoa/include")
209218

@@ -216,7 +225,8 @@ add_library(smoothxg_objs OBJECT
216225
src/prep.cpp
217226
src/blocks.cpp
218227
src/breaks.cpp
219-
src/smooth.cpp)
228+
src/smooth.cpp
229+
${sautocorr_INCLUDE}/sautocorr.cpp)
220230

221231
add_dependencies(smoothxg_objs handlegraph)
222232
add_dependencies(smoothxg_objs sdsl-lite)
@@ -232,6 +242,7 @@ add_dependencies(smoothxg_objs paryfor)
232242
add_dependencies(smoothxg_objs libbf)
233243
add_dependencies(smoothxg_objs dirtyzipf)
234244
add_dependencies(smoothxg_objs xoshiro)
245+
add_dependencies(smoothxg_objs sautocorr)
235246

236247
set_target_properties(smoothxg_objs PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
237248

@@ -278,7 +289,8 @@ set(smoothxg_INCLUDES
278289
"${libbf_INCLUDE}"
279290
"${random_dist_INCLUDE}"
280291
"${dirtyzipf_INCLUDE}"
281-
"${xoshiro_INCLUDE}")
292+
"${xoshiro_INCLUDE}"
293+
"${sautocorr_INCLUDE}")
282294

283295
set(smoothxg_LIBS
284296
"${sdsl-lite_LIB}/libsdsl.a"

deps/sautocorr

Submodule sautocorr added at e8a6aad

src/blocks.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ smoothable_blocks(
5757
std::vector<path_range_t> path_ranges;
5858
for (auto& step : traversals) {
5959
if (path_ranges.empty()) {
60-
path_ranges.push_back({step, step});
60+
path_ranges.push_back({step, step, 0});
6161
} else {
6262
auto& path_range = path_ranges.back();
6363
auto& last = path_range.end;
@@ -66,7 +66,7 @@ smoothable_blocks(
6666
- (graph.get_position_of_step(last) + graph.get_length(graph.get_handle_of_step(last)))
6767
> max_path_jump)) {
6868
// make a new range
69-
path_ranges.push_back({step, step});
69+
path_ranges.push_back({step, step, 0});
7070
} else {
7171
// extend the range
7272
last = step;
@@ -137,7 +137,7 @@ smoothable_blocks(
137137
block.max_path_length = std::max(included_path_length,
138138
block.max_path_length);
139139
}
140-
std::cerr << "max_path_length " << block.max_path_length << std::endl;
140+
//std::cerr << "max_path_length " << block.max_path_length << std::endl;
141141

142142
// order the path ranges from longest to shortest
143143
ips4o::parallel::sort(

src/breaks.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include "breaks.hpp"
2+
3+
namespace smoothxg {
4+
5+
using namespace handlegraph;
6+
7+
// break the path ranges at likely VNTR boundaries
8+
// and break the path ranges to be shorter than our "max" sequence size input to spoa
9+
void break_blocks(const xg::XG& graph,
10+
std::vector<block_t>& blocks,
11+
const uint64_t& max_poa_length,
12+
const uint64_t& min_copy_length,
13+
const uint64_t& max_copy_length,
14+
const uint64_t& min_autocorr_z,
15+
const uint64_t& autocorr_stride) {
16+
17+
const VectorizableHandleGraph& vec_graph = dynamic_cast<const VectorizableHandleGraph&>(graph);
18+
19+
std::cerr << "[smoothxg::break_blocks] cutting blocks that contain sequences above max-poa-length" << std::endl;
20+
21+
uint64_t n_cut_blocks = 0;
22+
uint64_t n_repeat_blocks = 0;
23+
for (auto& block : blocks) {
24+
// check if we have sequences that are too long
25+
bool to_break = false;
26+
for (auto& path_range : block.path_ranges) {
27+
if (path_range.length > max_poa_length) {
28+
to_break = true;
29+
break;
30+
}
31+
}
32+
if (!to_break) continue; // skip if we're spoa-able
33+
// otherwise let's see if we've got repeats that we can use to chop things up
34+
// find if there is a repeat
35+
std::vector<sautocorr::repeat_t> repeats;
36+
for (auto& path_range : block.path_ranges) {
37+
// steps in id space
38+
std::string seq;
39+
std::string name = graph.get_path_name(graph.get_path_handle_of_step(path_range.begin));
40+
for (step_handle_t step = path_range.begin;
41+
step != path_range.end;
42+
step = graph.get_next_step(step)) {
43+
seq.append(graph.get_sequence(graph.get_handle_of_step(step)));
44+
}
45+
if (seq.length() < 2*min_copy_length) continue;
46+
//std::cerr << "on " << name << "\t" << seq.length() << std::endl;
47+
std::vector<uint8_t> vec(seq.begin(), seq.end());
48+
sautocorr::repeat_t result = sautocorr::repeat(vec,
49+
min_copy_length,
50+
max_copy_length,
51+
min_copy_length,
52+
min_autocorr_z,
53+
autocorr_stride);
54+
repeats.push_back(result);
55+
/*
56+
std::cerr << name
57+
<< "\t" << seq.length()
58+
<< "\t" << result.length
59+
<< "\t" << result.z_score << std::endl;
60+
*/
61+
}
62+
// if there is, set the cut length to some fraction of it
63+
std::vector<double> lengths;
64+
for (auto& repeat : repeats) {
65+
if (repeat.length > 0) {
66+
lengths.push_back(repeat.length);
67+
}
68+
}
69+
uint64_t cut_length;
70+
bool found_repeat = !lengths.empty();
71+
if (found_repeat) {
72+
double repeat_length = sautocorr::vec_mean(lengths.begin(), lengths.end());
73+
cut_length = std::round(repeat_length / 2.0);
74+
++n_repeat_blocks;
75+
//std::cerr << "found repeat of " << repeat_length << " cutting to " << cut_length << std::endl;
76+
} else {
77+
// if not, chop blindly
78+
cut_length = max_poa_length;
79+
}
80+
++n_cut_blocks;
81+
std::vector<path_range_t> chopped_ranges;
82+
for (auto& path_range : block.path_ranges) {
83+
84+
if (!found_repeat && path_range.length < cut_length) {
85+
chopped_ranges.push_back(path_range);
86+
continue;
87+
}
88+
// now find outlier clusters based on stdev and mean
89+
// extract a minimum viable repeat length
90+
// scan across the step vector, looking for where the repeat region begins and ends
91+
// cut at the repeat boundaries
92+
93+
// Q: should we determine the repeat length for each sequence or all?
94+
// each is simple, but maybe expensive
95+
// all could provide higher precision, but it's muddier
96+
97+
// if this doesn't work, we're going to blindly cut anyway
98+
uint64_t last_cut = 0;
99+
step_handle_t last_end = path_range.begin;
100+
//path_range_t* new_range = nullptr;
101+
uint64_t pos = 0;
102+
step_handle_t step;
103+
for (step = path_range.begin;
104+
step != path_range.end;
105+
step = graph.get_next_step(step)) {
106+
//handle_t h = graph.get_handle_of_step(step);
107+
//uint64_t id = graph.get_id(h);
108+
//int64_t node_pos = vec_graph.node_vector_offset(id);
109+
pos += graph.get_length(graph.get_handle_of_step(step));
110+
if (pos - last_cut > cut_length) {
111+
step_handle_t next = graph.get_next_step(step);
112+
chopped_ranges.push_back({last_end, next, pos - last_cut});
113+
last_end = next;
114+
last_cut = pos;
115+
}
116+
}
117+
if (step != last_end) {
118+
chopped_ranges.push_back({last_end, step, pos - last_cut});
119+
}
120+
}
121+
block.path_ranges = chopped_ranges;
122+
}
123+
std::cerr << "[smoothxg::break_blocks] cut " << n_cut_blocks << " blocks of which " << n_repeat_blocks << " had repeats" << std::endl;
124+
}
125+
126+
}

src/breaks.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#pragma once
2+
3+
#include <iostream>
4+
#include <chrono>
5+
#include <vector>
6+
#include <algorithm>
7+
#include <cmath>
8+
#include <vector>
9+
#include <vector>
10+
#include <numeric>
11+
#include <cmath>
12+
#include "blocks.hpp"
13+
#include "sautocorr.hpp"
14+
#include "xg.hpp"
15+
16+
namespace smoothxg {
17+
18+
using namespace handlegraph;
19+
20+
// break the path ranges at likely VNTR boundaries
21+
// and break the path ranges to be shorter than our "max" sequence size input to spoa
22+
void break_blocks(const xg::XG& graph,
23+
std::vector<block_t>& blocks,
24+
const uint64_t& max_poa_length,
25+
const uint64_t& min_copy_length,
26+
const uint64_t& max_copy_length,
27+
const uint64_t& min_autocorr_z,
28+
const uint64_t& autocorr_stride);
29+
30+
}

src/main.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ int main(int argc, char** argv) {
3636
args::ValueFlag<uint64_t> _max_edge_jump(parser, "N", "maximum edge jump before breaking [default: 100]", {'e', "max-edge-jump"});
3737
args::ValueFlag<uint64_t> _min_copy_length(parser, "N", "minimum repeat length to collapse [default: 1000]", {'c', "min-copy-length"});
3838
args::ValueFlag<uint64_t> _max_copy_length(parser, "N", "maximum repeat length to attempt to detect [default: 20000]", {'m', "max-copy-length"});
39-
args::ValueFlag<uint64_t> _max_spoa_length(parser, "N", "maximum sequence length to put into spoa [default: 10000]", {'l', "max-spoa-length"});
39+
args::ValueFlag<uint64_t> _max_poa_length(parser, "N", "maximum sequence length to put into poa [default: 10000]", {'l', "max-poa-length"});
4040
args::ValueFlag<uint64_t> num_threads(parser, "N", "use this many threads during parallel steps", {'t', "threads"});
41-
args::ValueFlag<int> _poa_m(parser, "N", "spoa score for matching bases [default: 1]", {'M', "poa-match"});
42-
args::ValueFlag<int> _poa_n(parser, "N", "spoa score for mismatching bases [default: -4]", {'N', "poa-mismatch"});
43-
args::ValueFlag<int> _poa_g(parser, "N", "spoa gap opening penalty (must be negative) [default: -6]", {'G', "poa-gap-open"});
44-
args::ValueFlag<int> _poa_e(parser, "N", "spoa gap extension penalty (must be negative) [default: -2]", {'E', "poa-gap-extend"});
45-
args::ValueFlag<int> _poa_q(parser, "N", "spoa gap opening penalty of the second affine function (must be negative) [default: -8]", {'Q', "poa-2nd-gap-open"});
46-
args::ValueFlag<int> _poa_c(parser, "N", "spoa gap extension penalty of the second affine function (must be negative) [default: -1]", {'C', "poa-2nd-gap-extend"});
41+
args::ValueFlag<int> _poa_m(parser, "N", "poa score for matching bases [default: 1]", {'M', "poa-match"});
42+
args::ValueFlag<int> _poa_n(parser, "N", "poa score for mismatching bases [default: -4]", {'N', "poa-mismatch"});
43+
args::ValueFlag<int> _poa_g(parser, "N", "poa gap opening penalty (must be negative) [default: -6]", {'G', "poa-gap-open"});
44+
args::ValueFlag<int> _poa_e(parser, "N", "poa gap extension penalty (must be negative) [default: -2]", {'E', "poa-gap-extend"});
45+
args::ValueFlag<int> _poa_q(parser, "N", "poa gap opening penalty of the second affine function (must be negative) [default: -8]", {'Q', "poa-2nd-gap-open"});
46+
args::ValueFlag<int> _poa_c(parser, "N", "poa gap extension penalty of the second affine function (must be negative) [default: -1]", {'C', "poa-2nd-gap-extend"});
4747
args::ValueFlag<int> _prep_node_chop(parser, "N", "during prep, chop nodes to this length [default: 10]", {'X', "chop-to"});
4848
args::ValueFlag<float> _prep_sgd_min_term_updates(parser, "N", "path-guided SGD sort quality parameter (N * sum_path_length updates per iteration) for graph prep [default: 1]", {'U', "path-sgd-term-updates"});
4949
args::Flag no_toposort(parser, "no-toposort", "don't apply topological sorting in the sort pipeline", {'T', "no-toposort"});
@@ -100,7 +100,7 @@ int main(int argc, char** argv) {
100100
uint64_t max_edge_jump = _max_edge_jump ? args::get(_max_edge_jump) : 100;
101101
uint64_t min_copy_length = _min_copy_length ? args::get(_min_copy_length) : 1000;
102102
uint64_t max_copy_length = _max_copy_length ? args::get(_max_copy_length) : 20000;
103-
uint64_t max_spoa_length = _max_spoa_length ? args::get(_max_spoa_length) : 10000;
103+
uint64_t max_poa_length = _max_poa_length ? args::get(_max_poa_length) : 10000;
104104

105105
std::int8_t poa_m = 1;
106106
std::int8_t poa_n = -4;
@@ -122,11 +122,15 @@ int main(int argc, char** argv) {
122122
min_subpath,
123123
max_edge_jump);
124124

125+
uint64_t min_autocorr_z = 5;
126+
uint64_t autocorr_stride = 50;
125127
smoothxg::break_blocks(graph,
126128
blocks,
129+
max_poa_length,
127130
min_copy_length,
128131
max_copy_length,
129-
max_spoa_length);
132+
min_autocorr_z,
133+
autocorr_stride);
130134

131135
auto smoothed = smoothxg::smooth_and_lace(graph,
132136
blocks,

0 commit comments

Comments
 (0)