From 8c0cc0213a63f34374138a550e6bc09a9ade4491 Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Mar 2025 15:06:10 -0600 Subject: [PATCH 1/8] changes to data structure --- CMakeLists.txt | 7 +++++-- include/Utility.hpp | 2 ++ src/Data.cpp | 3 ++- src/main.cpp | 27 ++++++++++++++++++++++++++- test/humanSE.cfg | 6 +++--- 5 files changed, 38 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e33a2dd6..4a6bf5bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.22.1) -project(RNAnue VERSION 0.2.3) +project(RNAnue VERSION 0.2.4) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CXX_FLAGS -fopenmp) @@ -11,7 +11,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/Config.hpp.in ${CMAKE_CURRENT ###### SeqAn ##### list (APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/seqan3/build_system") -find_package (seqan3 3.0 REQUIRED) +find_package (seqan3 3.3.0 REQUIRED) find_package(OpenMP) ###### Boost ###### @@ -38,6 +38,9 @@ target_link_libraries(RNAnue PkgConfig::HTSLIB) cmake_print_properties(TARGETS RNAnue PROPERTIES TARGET_INCLUDE_DIRECTORIES) ###### Tests ###### +# make tests optional +option(BUILD_TESTS "Building unit tests" OFF) + file(GLOB TEST_SOURCES "test/*.cpp") file(GLOB SOURCES "src/*.cpp") list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/main.cpp) diff --git a/include/Utility.hpp b/include/Utility.hpp index f01f07d3..60002832 100644 --- a/include/Utility.hpp +++ b/include/Utility.hpp @@ -6,6 +6,7 @@ #include #include #include +#include // Boost @@ -18,6 +19,7 @@ #include "DataTypes.hpp" namespace fs = boost::filesystem; +namespace stdfs = std::filesystem; // filesystem manipulation namespace helper { diff --git a/src/Data.cpp b/src/Data.cpp index 5009ef7a..0ed345a6 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -1,6 +1,7 @@ #include "Data.hpp" Data::Data(po::variables_map params) : params(params) { + std::cout << helper::getTime() << "Data object created\n"; std::string subcall = params["subcall"].as(); fs::path outDir = fs::path(params["outdir"].as()); @@ -67,7 +68,7 @@ void Data::detectDataPrep() { } void Data::clusteringDataPrep() { -fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; + fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; fs::path trtmsPath = fs::path(params["outdir"].as()) / "detect/trtms"; GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); diff --git a/src/main.cpp b/src/main.cpp index 6a053ec5..b972a239 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,6 +25,27 @@ void showVersion(std::ostream& _str) { _str << std::endl; } +void makePathAbs(po::variables_map& params, std::string param, fs::path& configFileDir) { + if(params.count(param)) { + fs::path filePath = fs::path(params[param].as()); + if(filePath.empty()) { return; } // if file path is empty, do nothing + if(!filePath.is_absolute()) { + fs::path newPath = configFileDir / filePath; + params.erase(param); // remove param from variables_map + // add corrected path to variables_map + params.insert(std::make_pair(param, po::variable_value((configFileDir / filePath).string(), false))); + } + } +} + +// correct the paths to absolute paths (if they are relative) +void correctPaths(po::variables_map& params, fs::path& configFileDir) { + std::vector paramsToCheck = {"ctrls", "trtms", "outdir", "adpt3", "adpt5", "dbref", "features"}; + for(auto& param : paramsToCheck) { + makePathAbs(params, param, configFileDir); + } +} + int main(int argc, char* argv[]) { try { std::string readType; @@ -155,7 +176,6 @@ int main(int argc, char* argv[]) { .add(clustering) .add(output); - // translate all positional options into subcall options po::positional_options_description p; p.add("subcall", -1); @@ -198,6 +218,11 @@ int main(int argc, char* argv[]) { notify(params); } } + + // correct the paths (if they are relative) + fs::path configFileDir = fs::path(configFile).parent_path(); + correctPaths(params, configFileDir); + Base base(params); } catch(po::error& e) { diff --git a/test/humanSE.cfg b/test/humanSE.cfg index 009c4c9c..a4ff7db9 100755 --- a/test/humanSE.cfg +++ b/test/humanSE.cfg @@ -4,7 +4,7 @@ readtype = SE # paired-end (PE) or single-end (SE) # absolute path of dirs containing the raw reads (additional dir for each library) trtms = data/human/trtms/ # treatments ctrls = data/human/ctrls/ # controls -outdir = human-test-outdir # dir +outdir = data/human-test-outdir # dir threads = 4 # number of threads quality = 20 # lower limit for the average quality (Phred Quality Score) of the reads @@ -18,7 +18,7 @@ modetrm = 1 # mode of the trimming: only 5' (=0) and 3' (=1) or both (=2) # sequence preceeding 5'-end (N for arbitrary bp) in .fa format adpt5 = # sequence succeeding 3'-end (N for arbitrary bp) in fa. format -adpt3 = build/adapters3.fa +adpt3 = ../build/adapters3.fa wtrim = 0 # on whether (=1) or not (=0) to include window quality trimming # rate of mismatches allowed when aligning adapters with read sequence mmrate = 0.1 # e.g., 0.1 on a sequence length of 10 results in @@ -26,7 +26,7 @@ wsize = 3 # window size minovlps = 10 # minimum overlaps required when merging paired-end reads ### ALIGNMENT (forwarded to segemehl.x) -dbref = GRCh38.primary_assembly.genome_20-22.fa +dbref = ref/GRCh38.primary_assembly.genome_20-22.fa accuracy = 90 # min percentage of matches per read in semi-global alignment minfragsco = 15 # min score of a spliced fragment minfraglen = 15 # min length of a spliced fragment From 2b12841bd160d69b99753586754828a672525875 Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Mar 2025 16:00:36 -0600 Subject: [PATCH 2/8] track subset reference --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..e9fd21d6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +test/ref/GRCh38.primary_assembly.genome_20-22.fa filter=lfs diff=lfs merge=lfs -text From e02d5ee61980769b22d814a5338f5f5513fb947a Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Mar 2025 16:01:43 -0600 Subject: [PATCH 3/8] added reference file --- test/ref/GRCh38.primary_assembly.genome_20-22.fa | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/ref/GRCh38.primary_assembly.genome_20-22.fa diff --git a/test/ref/GRCh38.primary_assembly.genome_20-22.fa b/test/ref/GRCh38.primary_assembly.genome_20-22.fa new file mode 100644 index 00000000..90e5ace1 --- /dev/null +++ b/test/ref/GRCh38.primary_assembly.genome_20-22.fa @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3abd192e8fb4b5348945287c3813eccbc79df12307a6becc655232540053cd61 +size 164672193 From 42a87d1322ee0ac1987cd0e8f73327f2717d90a8 Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Mar 2025 16:08:25 -0600 Subject: [PATCH 4/8] removed large file --- test/ref/GRCh38.primary_assembly.genome_20-22.fa | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 test/ref/GRCh38.primary_assembly.genome_20-22.fa diff --git a/test/ref/GRCh38.primary_assembly.genome_20-22.fa b/test/ref/GRCh38.primary_assembly.genome_20-22.fa deleted file mode 100644 index 90e5ace1..00000000 --- a/test/ref/GRCh38.primary_assembly.genome_20-22.fa +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3abd192e8fb4b5348945287c3813eccbc79df12307a6becc655232540053cd61 -size 164672193 From 634218fbc62615b7bbba075d1cecaee0f42d603e Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Mar 2025 16:10:04 -0600 Subject: [PATCH 5/8] untrack reference file --- .gitattributes | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index e9fd21d6..e69de29b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +0,0 @@ -test/ref/GRCh38.primary_assembly.genome_20-22.fa filter=lfs diff=lfs merge=lfs -text From 46c4050150d4d6460ab4174860209fac7a9bef31 Mon Sep 17 00:00:00 2001 From: riasc Date: Tue, 4 Mar 2025 10:47:07 -0600 Subject: [PATCH 6/8] version fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 17ed733d..d1554702 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ to detect the Boost libraries system-wide. However, Seqan3 is expected to be loc folder of RNAnue as specified in the CMakeLists.txt. Segemehl and the Vienna binaries need to be located in $PATH. -* [Boost C++ Libraries](https://www.boost.org/) (v1.7.2) +* [Boost C++ Libraries](https://www.boost.org/) (v1.72.x) * [SeqAn](https://github.com/seqan/seqan3) (v3.3.0) * [Segemehl](http://www.bioinf.uni-leipzig.de/Software/segemehl/) (v0.3.4) * [Vienna Package](https://www.tbi.univie.ac.at/RNA/#binary_packages) (>= v2.4.17) From 962c68229dc21a4519b0e4afa37333d3e995bd94 Mon Sep 17 00:00:00 2001 From: riasc Date: Tue, 4 Mar 2025 16:39:58 -0600 Subject: [PATCH 7/8] missing return values caused segfault --- src/SplitReadCalling.cpp | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/SplitReadCalling.cpp b/src/SplitReadCalling.cpp index 3ea35318..b9cb28b1 100644 --- a/src/SplitReadCalling.cpp +++ b/src/SplitReadCalling.cpp @@ -109,7 +109,6 @@ void SplitReadCalling::start(pt::ptree sample, pt::ptree condition) { } producer(inBam); // start the producer (and add the recordsreads to the queue) - for(auto& consumer : consumers) { consumer.join(); } @@ -144,6 +143,7 @@ void SplitReadCalling::producer(seqan3::sam_file_input<>& inputfile) { void SplitReadCalling::consumer(dtp::BAMOut& singleOut, dtp::BAMOut& splitsOut, dtp::BAMOut& multsplitsOut, std::mutex& singleOutMutex, std::mutex& splitsOutMutex, std::mutex& multsplitsOutMutex) { + using recordType = typename seqan3::sam_file_input<>::record_type; std::vector readrecords; while(true) { @@ -165,6 +165,7 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, auto& singleOutMutex, auto& splitsOutMutex, auto& multsplitsOutMutex) { + // define variables std::string qname = ""; seqan3::sam_flag flag = seqan3::sam_flag::none; @@ -195,6 +196,7 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, std::map> putative{}; for(auto& rec : readrecords) { + qname = rec.id(); flag = rec.flag(); ref_id = rec.reference_id(); @@ -227,6 +229,14 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, cigarMatch = 0; // reset matches count in cigar // check the cigar string for splits +// std::cout << "cigar: "; +// for(auto& cig: cigar) { // iterate over all cigar elements +// cigarSize = get(cig); // determine the size and operator of the cigar element +// cigarOp = get(cig); // print cigarOP +// std::cout << cigarSize << cigarOp.to_char(); +// } +// std::cout << std::endl; + for(auto& cig: cigar) { // determine the size and operator of the cigar element cigarSize = get(cig); @@ -253,7 +263,9 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, cigarSplit.clear(); // new split - new CIGAR ref_offset.value() += cigarSize + endPosRead + 1; // adjust left-most mapping position splitId++; // increase split ID - } else { // matches the splice junction + } else { // matches the splice junctiona +// endPosRead += cigarSize; +// cigarSplit.push_back(cig); xjtag--; // decrease the number of splits (basically merge the splits) } } else { @@ -277,6 +289,7 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, } } + auto subSeq = seq | seqan3::views::slice(startPosRead - 1, endPosRead); seqan3::sam_tag_dictionary newTags{}; newTags.get<"XX"_tag>() = startPosSplit; @@ -292,8 +305,10 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, curated.clear(); segNum = 0; ++splitId; + } decide(putative, splitsOut, multsplitsOut, splitsOutMutex, multsplitsOutMutex); + } void SplitReadCalling::decide(std::map>& putative, auto& splitsOut, @@ -401,23 +416,28 @@ bool SplitReadCalling::filter(auto& sequence, uint32_t cigarmatch) { return true; } +// checks if the detected splice junction is within the (annotated) splice site bool SplitReadCalling::matchSpliceSites(dtp::Interval& spliceSites, std::optional refId) { if(params["splicing"].as>() == std::bitset<1>("1")) { std::vector> ovlps = features.search(this->refIds[refId.value()], spliceSites); for(int i=0;igetJunctions(); for(auto& [name, junc] : junctions) { - for(int j=1;j 1) { + for(int j=1;j refPos, dtp::DNASpan& seq, From a74c3fbc9247434e7beda20ccb302aed3a2d242c Mon Sep 17 00:00:00 2001 From: riasc Date: Tue, 4 Mar 2025 23:52:14 -0600 Subject: [PATCH 8/8] update to working version without ctrls data --- CHANGELOG.md | 6 ++++++ README.md | 2 +- src/Data.cpp | 22 +++++++++++++++++----- test/humanSE.cfg | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee20bc3f..5111b5aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +# [0.2.4] + +# Fix +- Fixed incompatibility when no control sample is provided +- Fixed bug when relative paths are provided for test data + # [0.2.3] ## Fix diff --git a/README.md b/README.md index d1554702..7d376b70 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [![docker-release](https://github.com/Ibvt/RNAnue/actions/workflows/docker.yml/badge.svg)](https://github.com/Ibvt/RNAnue/actions/workflows/docker.yml) -# RNAnue - 0.2.3 +# RNAnue - 0.2.4 ## About RNAnue is a comprehensive analysis to detect RNA-RNA interactions from Direct-Duplex-Detection (DDD) data. diff --git a/src/Data.cpp b/src/Data.cpp index 0ed345a6..9af5f934 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -32,7 +32,7 @@ Data::~Data() { void Data::preprocDataPrep() { // retrieve paths that contain the reads - fs::path ctrlsPath = fs::path(this->params["ctrls"].as()); + fs::path ctrlsPath = fs::path(params["ctrls"].as()); fs::path trtmsPath = fs::path(this->params["trtms"].as()); GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); @@ -46,7 +46,10 @@ void Data::alignDataPrep() { } else { // make sure that data has been preprocessed (or at least selected) if(params["preproc"].as>() == std::bitset<1>("1")) { - fs::path ctrlsPath = fs::path(params["outdir"].as()) / "preproc/ctrls"; + fs::path ctrlsPath = ""; + if(params["ctrls"].as() != "") { + fs::path ctrlsPath = fs::path(params["outdir"].as()) / "preproc/ctrls"; + } fs::path trtmsPath = fs::path(params["outdir"].as()) / "preproc/trtms"; /* @@ -60,7 +63,10 @@ void Data::alignDataPrep() { } void Data::detectDataPrep() { - fs::path ctrlsPath = fs::path(params["outdir"].as()) / "align/ctrls"; + fs::path ctrlsPath = ""; + if(params["ctrls"].as() != "") { + fs::path ctrlsPath = fs::path(params["outdir"].as()) / "align/ctrls"; + } fs::path trtmsPath = fs::path(params["outdir"].as()) / "align/trtms"; GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); @@ -68,7 +74,10 @@ void Data::detectDataPrep() { } void Data::clusteringDataPrep() { - fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; + fs::path ctrlsPath = ""; + if(params["ctrls"].as() != "") { + fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; + } fs::path trtmsPath = fs::path(params["outdir"].as()) / "detect/trtms"; GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); @@ -76,7 +85,10 @@ void Data::clusteringDataPrep() { } void Data::analysisDataPrep() { - fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; + fs::path ctrlsPath = ""; + if(params["ctrls"].as() != "") { + fs::path ctrlsPath = fs::path(params["outdir"].as()) / "detect/ctrls"; + } fs::path trtmsPath = fs::path(params["outdir"].as()) / "detect/trtms"; GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); diff --git a/test/humanSE.cfg b/test/humanSE.cfg index a4ff7db9..00060790 100755 --- a/test/humanSE.cfg +++ b/test/humanSE.cfg @@ -46,7 +46,7 @@ clustdist = 0 # minimum distance between clusters ### ANALYSIS # specify the annotations of the organism of interest (optional) -features = gencode.v45.annotation.gff3 # GFF3 feature file +features = ref/gencode.v45.annotation.gff3 # GFF3 feature file # OUTPUT stats = 1 # produce a statistics of the libraries