Skip to content

Commit

Permalink
Merge pull request #197 from pangenome/avoid_2_graphs_in_memory
Browse files Browse the repository at this point in the history
Avoid 2 graphs in memory, sample decompressed blocks, parallelize path embedding
  • Loading branch information
AndreaGuarracino authored Aug 28, 2023
2 parents 55f030e + f1e139e commit 4561345
Show file tree
Hide file tree
Showing 12 changed files with 1,252 additions and 611 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,9 @@ add_library(smoothxg_objs OBJECT
src/tempfile.cpp
deps/xxHash/xxhash.c
src/xg.cpp
src/dna.cpp
src/pos.cpp
src/seqindex.cpp
src/chain.cpp
src/prep.cpp
src/cleanup.cpp
Expand Down
64 changes: 64 additions & 0 deletions src/dna.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include "dna.hpp"

namespace smoothxg {

// Complement lookup table, indexed by the *unsigned* byte value of a base.
// Upper- and lower-case nucleotide codes map to their complement in the same
// case; '#', '$' and '-' have explicit entries; every other byte maps to 'N'.
//
// NOTE(review): 'S' maps to 'W' and 'W' maps to 'S' (likewise 's'/'w'). Under
// the IUPAC ambiguity codes S (C or G) and W (A or T) are each their own
// complement, so this looks like a swap. The entries are left untouched here;
// confirm whether the mapping is intentional before changing it.
static const char dna_complement[256] = {
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 0-7
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 8-15
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 16-23
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 24-31
    'N', 'N', 'N', '$', '#', 'N', 'N', 'N', // 32-39 GCSA stop/start characters
    'N', 'N', 'N', 'N', 'N', '-', 'N', 'N', // 40-47
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 48-55
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 56-63
    'N', 'T', 'V', 'G', 'H', 'N', 'N', 'C', // 64-71
    'D', 'N', 'N', 'M', 'N', 'K', 'N', 'N', // 72-79
    'N', 'Q', 'Y', 'W', 'A', 'A', 'B', 'S', // 80-87
    'N', 'R', 'N', 'N', 'N', 'N', 'N', 'N', // 88-95
    'N', 't', 'v', 'g', 'h', 'N', 'N', 'c', // 96-103
    'd', 'N', 'N', 'm', 'N', 'k', 'n', 'N', // 104-111
    'N', 'q', 'y', 'w', 'a', 'a', 'b', 's', // 112-119
    'N', 'r', 'N', 'N', 'N', 'N', 'N', 'N', // 120-127
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 128-135
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 136-143
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 144-151
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 152-159
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 160-167
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 168-175
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 176-183
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 184-191
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 192-199
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 200-207
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 208-215
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 216-223
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 224-231
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 232-239
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 240-247
    'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'  // 248-255
};

// Looks up the complement of a single byte.
// The cast to unsigned char is essential: where plain `char` is signed, a byte
// >= 0x80 is negative and would index *before* the start of the table.
static inline char complement_base(char c) {
    return dna_complement[static_cast<unsigned char>(c)];
}

// Returns the complement of the single base `c` ('N' for unmapped bytes).
char dna_reverse_complement(const char& c) {
    return complement_base(c);
}

// Returns a new string holding the reverse complement of `seq`;
// `seq` itself is left unmodified.
std::string dna_reverse_complement(const std::string& seq) {
    std::string rc(seq.rbegin(), seq.rend());
    for (auto& c : rc) {
        c = complement_base(c);
    }
    return rc;
}

// Reverse-complements `seq` in place without allocating a second string.
void dna_reverse_complement_in_place(std::string& seq) {
    const size_t n = seq.size();
    const size_t half = n / 2;

    // Swap mirrored positions, complementing both as they cross over.
    for (size_t i = 0; i < half; ++i) {
        const size_t j = n - 1 - i;
        const char front = seq[i];
        seq[i] = complement_base(seq[j]);
        seq[j] = complement_base(front);
    }

    // An odd-length sequence has a middle base that stays put but still
    // needs complementing.
    if (n % 2 != 0) {
        seq[half] = complement_base(seq[half]);
    }
}

}
14 changes: 14 additions & 0 deletions src/dna.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef DNA_HPP_INCLUDED
#define DNA_HPP_INCLUDED

#include <string>

namespace smoothxg {

// Returns the complement of the single base `c`, taken from the lookup table
// in dna.cpp; bytes without an explicit entry in that table yield 'N'.
char dna_reverse_complement(const char& c);
// Returns a newly built string containing the reverse complement of `seq`;
// `seq` is not modified.
std::string dna_reverse_complement(const std::string& seq);
// Replaces the contents of `seq` with its reverse complement.
void dna_reverse_complement_in_place(std::string& seq);

}

#endif
831 changes: 654 additions & 177 deletions src/main.cpp

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions src/pos.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "pos.hpp"

namespace smoothxg {

// Orders alignment positions lexicographically: first by `pos`, then by
// `aln_length`.
//
// The previous form (`a.pos < b.pos && a.aln_length < b.aln_length`) was only
// a partial order: two distinct values could be mutually "not less" without
// being equal, and that incomparability was not transitive. Such a comparator
// does not satisfy the strict-weak-ordering requirement of std::sort,
// std::set, std::map, etc., and it disagreed with operator==. With the
// lexicographic form, !(a < b) && !(b < a) holds exactly when a == b.
bool operator<(const aln_pos_t& a, const aln_pos_t& b) {
    if (a.pos != b.pos) {
        return a.pos < b.pos;
    }
    return a.aln_length < b.aln_length;
}

// Two alignment positions are equal when both their `pos` and their
// `aln_length` fields match.
bool operator==(const aln_pos_t& a, const aln_pos_t& b) {
    if (a.pos != b.pos) {
        return false;
    }
    return a.aln_length == b.aln_length;
}

// Packs an offset and a strand flag into a single pos_t.
// The lowest bit carries the is_rev flag; the offset occupies the remaining
// bits (it is shifted left by one, so its own top bit is discarded).
pos_t make_pos_t(uint64_t offset, bool is_rev) {
    const pos_t shifted_offset = offset << 1;
    const pos_t strand_bit = is_rev ? 1 : 0;
    return shifted_offset | strand_bit;
}

// Extracts the offset from a packed position by dropping the strand bit.
uint64_t offset(const pos_t& pos) {
    const uint64_t without_strand_bit = pos >> 1;
    return without_strand_bit;
}

bool is_rev(const pos_t& pos) {
return pos & (uint64_t)1;
}

// Moves `pos` one step along its strand: the offset grows by one on the
// forward strand and shrinks by one on the reverse strand. The step is 2
// because the offset sits above the strand bit. No bounds checking is done.
void incr_pos(pos_t& pos) {
    pos = is_rev(pos) ? pos - 2 : pos + 2;
}

// Moves `pos` by `by` steps along its strand: the offset grows on the forward
// strand and shrinks on the reverse strand. Each step is worth 2 in the packed
// value because the offset sits above the strand bit. No bounds checking.
void incr_pos(pos_t& pos, size_t by) {
    const pos_t delta = 2 * by;
    if (is_rev(pos)) {
        pos -= delta;
        return;
    }
    pos += delta;
}

// Moves `pos` one step against its strand: the offset shrinks by one on the
// forward strand and grows by one on the reverse strand. The step is 2 because
// the offset sits above the strand bit. No bounds checking is done.
void decr_pos(pos_t& pos) {
    pos = is_rev(pos) ? pos + 2 : pos - 2;
}

// Moves `pos` by `by` steps against its strand: the offset shrinks on the
// forward strand and grows on the reverse strand. Each step is worth 2 in the
// packed value because the offset sits above the strand bit. No bounds checking.
void decr_pos(pos_t& pos, size_t by) {
    const pos_t delta = 2 * by;
    if (is_rev(pos)) {
        pos += delta;
        return;
    }
    pos -= delta;
}

// Returns the same offset on the opposite strand.
pos_t rev_pos_t(const pos_t& pos) {
    const uint64_t same_offset = offset(pos);
    const bool opposite_strand = !is_rev(pos);
    return make_pos_t(same_offset, opposite_strand);
}

// Formats a packed position as its decimal offset followed by a strand
// marker: "-" for the reverse strand, "+" for the forward strand.
std::string pos_to_string(const pos_t& pos) {
    std::string text = std::to_string(offset(pos));
    text += is_rev(pos) ? "-" : "+";
    return text;
}

}
22 changes: 22 additions & 0 deletions src/pos.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include <cstdint>
#include <string>

namespace smoothxg {

// A position packed into 64 bits: the lowest bit is the strand flag
// (set = reverse) and the remaining bits hold the offset.
using pos_t = uint64_t;

// A packed position paired with an alignment length.
struct aln_pos_t {
    pos_t pos;
    uint64_t aln_length;
};

// Comparison operators for aln_pos_t (defined in pos.cpp).
bool operator<(const aln_pos_t& a, const aln_pos_t& b);
bool operator==(const aln_pos_t& a, const aln_pos_t& b);

// Builds a packed position from an offset and a strand flag.
pos_t make_pos_t(uint64_t offset, bool is_rev);

// Accessors for the two components of a packed position.
uint64_t offset(const pos_t& pos);
bool is_rev(const pos_t& pos);

// Step a position along its strand, by one or by `by`: the offset increases
// on the forward strand and decreases on the reverse strand.
void incr_pos(pos_t& pos);
void incr_pos(pos_t& pos, size_t by);

// Step a position against its strand, by one or by `by`: the inverse of
// incr_pos.
void decr_pos(pos_t& pos);
void decr_pos(pos_t& pos, size_t by);

// Returns the same offset on the opposite strand.
pos_t rev_pos_t(const pos_t& pos);

// Renders a position as "<offset>+" (forward) or "<offset>-" (reverse).
std::string pos_to_string(const pos_t& pos);

}
Loading

0 comments on commit 4561345

Please sign in to comment.