Skip to content

Commit

Permalink
updated merging
Browse files Browse the repository at this point in the history
  • Loading branch information
riasc committed Jun 1, 2024
1 parent dd5dfd8 commit df896d4
Show file tree
Hide file tree
Showing 9 changed files with 460 additions and 56 deletions.
4 changes: 2 additions & 2 deletions include/DataTypes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ namespace dtp {
using DNAVector = std::vector<seqan3::dna5>; // seqan3 also provides dna5_vector
using DNASpan = std::span<seqan3::dna5>;
using QualSpan = std::span<seqan3::phred42>;

using QualVector = std::vector<seqan3::phred42>;
using CigarVector = std::vector<seqan3::cigar>;

// FASTQ
using FASTQFormat = seqan3::type_list<std::string, DNASpan, QualSpan>;
Expand Down Expand Up @@ -77,7 +77,7 @@ namespace dtp {
int multSplitsCount;
int nSurvivedCount;
};
using StatsMap = std::map<std::string, std::vector<StatsFields>>;
using StatsMap = std::map<std::string, StatsFields>;
}

#endif //RNANUE_DATATYPES_HPP
82 changes: 79 additions & 3 deletions include/SplitReadCalling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,115 @@
// Standard
#include <iostream>
#include <bitset>
#include <stdlib.h>
#include <thread>
#include <mutex>
#include <future>
#include <vector>

// Boost
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <boost/property_tree/ptree.hpp>

// HTSlib
#include <htslib/sam.h>
#include <htslib/hts.h>

// Vienna
#include <ViennaRNA/fold.h>
//#include <ViennaRNA/fold.h>

// SeqAn3
#include <seqan3/io/sam_file/all.hpp>
#include <seqan3/core/debug_stream.hpp>
#include <seqan3/alphabet/cigar/cigar.hpp>

// Class
#include "IBPTree.hpp"
#include "Stats.hpp"

using types = seqan3::type_list<
std::string,
seqan3::sam_flag,
std::optional<int32_t>,
std::optional<int32_t>,
std::optional<uint8_t>,
dtp::CigarVector,
dtp::DNAVector,
seqan3::sam_tag_dictionary>;

using fields = seqan3::fields<
seqan3::field::id,
seqan3::field::flag,
seqan3::field::ref_id,
seqan3::field::ref_offset,
seqan3::field::mapq,
seqan3::field::cigar,
seqan3::field::seq,
seqan3::field::tags>;

// define tags
using seqan3::operator""_tag;
template <> struct seqan3::sam_tag_type<"XH"_tag> { using type = int32_t; }; // number of splits (SAM record)
template <> struct seqan3::sam_tag_type<"XX"_tag> { using type = int32_t; }; // begin of split
template <> struct seqan3::sam_tag_type<"XY"_tag> { using type = int32_t; }; // end of split
template <> struct seqan3::sam_tag_type<"XJ"_tag> { using type = int32_t; }; // number of splits (whole read)
template <> struct seqan3::sam_tag_type<"XN"_tag> { using type = int32_t; };

// cigar



using SAMrecord = seqan3::record<types, fields>;
using namespace seqan3::literals;
using seqan3::get;

namespace po = boost::program_options;
namespace fs = boost::filesystem;
namespace pt = boost::property_tree;

class ThreadPool {
public:
ThreadPool(size_t num_threads);
~ThreadPool();

template<class F, class... Args>
auto enqueue(F&& f, Args&&... args) -> std::future<std::invoke_result_t<F, Args...>>;

private:
// Worker threads
std::vector<std::thread> workers;
// Task queue
std::queue<std::function<void()>> tasks;

// Synchronization
std::mutex queue_mutex;
std::condition_variable condition;
bool stop;
};


class SplitReadCalling {
public:
SplitReadCalling(po::variables_map params);
~SplitReadCalling();

void start(pt::ptree sample, pt::ptree condition);
void iterate(std::string& matched, std::string& splits, std::string& multsplits);
void sort(const std::string& inputFile, const std::string& outputFile);
void iterate(std::string& matched, std::string& single, std::string& splits, std::string& multsplits);
template <typename T>
void process(std::vector<T>& readrecords, auto& singleOut, auto& splitsOut, auto& multsplitsOut,
auto& singleOutMutex, auto& splitsOutMutex, auto& multsplitsOutMutex);
bool filter(auto& sequence, uint32_t cigarmatch);
void storeSegments(auto& splitrecord, std::optional<int32_t>& refOffset, dtp::CigarVector& cigar,
dtp::DNAVector& seq, seqan3::sam_tag_dictionary &tags, std::vector<SAMrecord>& curated);

private:
po::variables_map params;
IBPTree features;
Stats stats;
//Stats stats;
std::shared_ptr<Stats> stats;
std::string condition; // stores the current condition
};

#endif //RNANUE_DETECT_HPP
2 changes: 1 addition & 1 deletion src/Align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ void Align::alignReads(std::string query, std::string mate, std::string matched)
std::cout << helper::getTime() << "Start Alignment\n";
std::string align = "segemehl.x";
align += " -S "; // split mode
align += " -b "; // output in bam format
align += " -b "; // output in bam format (and sorted)
align += " -A " + std::to_string(params["accuracy"].as<int>());
align += " -U " + std::to_string(params["minfragsco"].as<int>());
align += " -W " + std::to_string(params["minsplicecov"].as<int>());
Expand Down
6 changes: 6 additions & 0 deletions src/Base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ Base::Base(po::variables_map params) : params(params), paramsVal(params), data(p
//data.analysis();
} else {
if(subcall == "complete") {
data.preproc();
data.detect();
data.align();



//std::cout << "complete analysis" << std::endl;
/*
data.preproc();
Expand Down
11 changes: 6 additions & 5 deletions src/Data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ Data::Data(po::variables_map params) : params(params) {
}
}

Data::~Data() {}
Data::~Data() {
fs::path outDir = fs::path(params["outdir"].as<std::string>());
fs::path tmpDir = outDir / fs::path("tmp");
helper::deleteDir(tmpDir);
}

void Data::preprocDataPrep() {
// retrieve paths that contain the reads
Expand Down Expand Up @@ -340,19 +344,16 @@ void Data::callInAndOut(Callable f) {
for(unsigned i=0;i<groups.size();++i) {
// create directory for groups (e.g., ctrls, trtms)
outGroupDir = outSubcallDir / fs::path(groups[i]);
helper::createDir(outGroupDir, std::cout);

conditions = subcall.get_child(groups[i]);
std::cout << conditions.data() << std::endl;

BOOST_FOREACH(pt::ptree::value_type const &v, conditions.get_child("")) {
pt::ptree condition = v.second;

// create directory for condition (e.g., rpl_exp)
outConditionDir = outGroupDir / fs::path(condition.get<std::string>("condition"));
helper::createDir(outConditionDir, std::cout);

std::cout << condition.get<std::string>("condition") << std::endl;

samples = condition.get_child("samples");
// iterate over samples
BOOST_FOREACH(pt::ptree::value_type const &w, samples.get_child("")) {
Expand Down
5 changes: 2 additions & 3 deletions src/ParameterValidation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,15 @@ void ParameterValidation::validateDirs(std::string param) {
if (fs::exists(params[param].as<std::string>())) {
// if outdir already exists - check for overwrite flag
if (!params.count("overwrite")) {
std::cout << helper::getTime() << "Speciried output directory (--outdir ";
std::cout << helper::getTime() << "Specified output directory (--outdir ";
std::cout << params["outdir"].as<std::string>() << ") already exists\n";
std::cout << helper::getTime() << "Please provide a new directory or use the --overwrite flag\n";
}
} else {
// check if the input directory exists
if (!fs::exists(params[param].as<std::string>())) {
std::cout << helper::getTime() << "Specified input directory (--" << param;
std::cout << helper::getTime() << "Specified output directory (-- " << param;
std::cout << params[param].as<std::string>() << ") does not exist\n";
exit(EXIT_FAILURE);
}
}
}
Expand Down
Loading

0 comments on commit df896d4

Please sign in to comment.