|
| 1 | +#include "breaks.hpp" |
| 2 | + |
| 3 | +namespace smoothxg { |
| 4 | + |
| 5 | +using namespace handlegraph; |
| 6 | + |
| 7 | +// break the path ranges at likely VNTR boundaries |
| 8 | +// and break the path ranges to be shorter than our "max" sequence size input to spoa |
| 9 | +void break_blocks(const xg::XG& graph, |
| 10 | + std::vector<block_t>& blocks, |
| 11 | + const uint64_t& max_poa_length, |
| 12 | + const uint64_t& min_copy_length, |
| 13 | + const uint64_t& max_copy_length, |
| 14 | + const uint64_t& min_autocorr_z, |
| 15 | + const uint64_t& autocorr_stride) { |
| 16 | + |
| 17 | + const VectorizableHandleGraph& vec_graph = dynamic_cast<const VectorizableHandleGraph&>(graph); |
| 18 | + |
| 19 | + std::cerr << "[smoothxg::break_blocks] cutting blocks that contain sequences above max-poa-length" << std::endl; |
| 20 | + |
| 21 | + uint64_t n_cut_blocks = 0; |
| 22 | + uint64_t n_repeat_blocks = 0; |
| 23 | + for (auto& block : blocks) { |
| 24 | + // check if we have sequences that are too long |
| 25 | + bool to_break = false; |
| 26 | + for (auto& path_range : block.path_ranges) { |
| 27 | + if (path_range.length > max_poa_length) { |
| 28 | + to_break = true; |
| 29 | + break; |
| 30 | + } |
| 31 | + } |
| 32 | + if (!to_break) continue; // skip if we're spoa-able |
| 33 | + // otherwise let's see if we've got repeats that we can use to chop things up |
| 34 | + // find if there is a repeat |
| 35 | + std::vector<sautocorr::repeat_t> repeats; |
| 36 | + for (auto& path_range : block.path_ranges) { |
| 37 | + // steps in id space |
| 38 | + std::string seq; |
| 39 | + std::string name = graph.get_path_name(graph.get_path_handle_of_step(path_range.begin)); |
| 40 | + for (step_handle_t step = path_range.begin; |
| 41 | + step != path_range.end; |
| 42 | + step = graph.get_next_step(step)) { |
| 43 | + seq.append(graph.get_sequence(graph.get_handle_of_step(step))); |
| 44 | + } |
| 45 | + if (seq.length() < 2*min_copy_length) continue; |
| 46 | + //std::cerr << "on " << name << "\t" << seq.length() << std::endl; |
| 47 | + std::vector<uint8_t> vec(seq.begin(), seq.end()); |
| 48 | + sautocorr::repeat_t result = sautocorr::repeat(vec, |
| 49 | + min_copy_length, |
| 50 | + max_copy_length, |
| 51 | + min_copy_length, |
| 52 | + min_autocorr_z, |
| 53 | + autocorr_stride); |
| 54 | + repeats.push_back(result); |
| 55 | + /* |
| 56 | + std::cerr << name |
| 57 | + << "\t" << seq.length() |
| 58 | + << "\t" << result.length |
| 59 | + << "\t" << result.z_score << std::endl; |
| 60 | + */ |
| 61 | + } |
| 62 | + // if there is, set the cut length to some fraction of it |
| 63 | + std::vector<double> lengths; |
| 64 | + for (auto& repeat : repeats) { |
| 65 | + if (repeat.length > 0) { |
| 66 | + lengths.push_back(repeat.length); |
| 67 | + } |
| 68 | + } |
| 69 | + uint64_t cut_length; |
| 70 | + bool found_repeat = !lengths.empty(); |
| 71 | + if (found_repeat) { |
| 72 | + double repeat_length = sautocorr::vec_mean(lengths.begin(), lengths.end()); |
| 73 | + cut_length = std::round(repeat_length / 2.0); |
| 74 | + ++n_repeat_blocks; |
| 75 | + //std::cerr << "found repeat of " << repeat_length << " cutting to " << cut_length << std::endl; |
| 76 | + } else { |
| 77 | + // if not, chop blindly |
| 78 | + cut_length = max_poa_length; |
| 79 | + } |
| 80 | + ++n_cut_blocks; |
| 81 | + std::vector<path_range_t> chopped_ranges; |
| 82 | + for (auto& path_range : block.path_ranges) { |
| 83 | + |
| 84 | + if (!found_repeat && path_range.length < cut_length) { |
| 85 | + chopped_ranges.push_back(path_range); |
| 86 | + continue; |
| 87 | + } |
| 88 | + // now find outlier clusters based on stdev and mean |
| 89 | + // extract a minimum viable repeat length |
| 90 | + // scan across the step vector, looking for where the repeat region begins and ends |
| 91 | + // cut at the repeat boundaries |
| 92 | + |
| 93 | + // Q: should we determine the repeat length for each sequence or all? |
| 94 | + // each is simple, but maybe expensive |
| 95 | + // all could provide higher precision, but it's muddier |
| 96 | + |
| 97 | + // if this doesn't work, we're going to blindly cut anyway |
| 98 | + uint64_t last_cut = 0; |
| 99 | + step_handle_t last_end = path_range.begin; |
| 100 | + //path_range_t* new_range = nullptr; |
| 101 | + uint64_t pos = 0; |
| 102 | + step_handle_t step; |
| 103 | + for (step = path_range.begin; |
| 104 | + step != path_range.end; |
| 105 | + step = graph.get_next_step(step)) { |
| 106 | + //handle_t h = graph.get_handle_of_step(step); |
| 107 | + //uint64_t id = graph.get_id(h); |
| 108 | + //int64_t node_pos = vec_graph.node_vector_offset(id); |
| 109 | + pos += graph.get_length(graph.get_handle_of_step(step)); |
| 110 | + if (pos - last_cut > cut_length) { |
| 111 | + step_handle_t next = graph.get_next_step(step); |
| 112 | + chopped_ranges.push_back({last_end, next, pos - last_cut}); |
| 113 | + last_end = next; |
| 114 | + last_cut = pos; |
| 115 | + } |
| 116 | + } |
| 117 | + if (step != last_end) { |
| 118 | + chopped_ranges.push_back({last_end, step, pos - last_cut}); |
| 119 | + } |
| 120 | + } |
| 121 | + block.path_ranges = chopped_ranges; |
| 122 | + } |
| 123 | + std::cerr << "[smoothxg::break_blocks] cut " << n_cut_blocks << " blocks of which " << n_repeat_blocks << " had repeats" << std::endl; |
| 124 | +} |
| 125 | + |
| 126 | +} |
0 commit comments