From 60a160cbc046f9317d3bb4f4dc73f5088cf756ec Mon Sep 17 00:00:00 2001 From: riasc Date: Mon, 3 Jun 2024 01:01:16 -0500 Subject: [PATCH] bring repo back on track to have all changes included --- .github/workflows/docker.yml | 6 +- CHANGELOG.md | 19 +- CMakeLists.txt | 12 +- Dockerfile | 4 +- README.md | 152 ++++++++++++-- build/params.cfg | 5 +- include/Analysis.hpp | 7 +- include/FilterScores.hpp | 42 ++++ include/IBPTree.hpp | 4 + include/Node.hpp | 1 + include/SplitReadCalling.hpp | 22 +- src/Align.cpp | 1 + src/Analysis.cpp | 393 ++++++++++++++++++++++++++++++++++- src/Data.cpp | 1 - src/IBPTree.cpp | 98 ++++----- src/Node.cpp | 1 + src/SplitReadCalling.cpp | 187 +++++++++++------ src/Stats.cpp | 13 +- src/main.cpp | 2 +- 19 files changed, 809 insertions(+), 161 deletions(-) create mode 100644 include/FilterScores.hpp diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 320b8584..be5f7a35 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,8 +1,8 @@ name: docker-release on: push: - branches: - - "master" + tags: + - '*' pull_request: branches: - "master" @@ -41,5 +41,5 @@ jobs: context: "{{defaultContext}}" platforms: linux/amd64, linux/arm64 push: true - tags: cobirna/rnanue:${{ steps.extract_version.outputs.VERSION }} + tags: cobirna/rnanue:${{ steps.extract_version.outputs.VERSION }}, cobirna/rnanue:latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 10fbdcc3..97a1a91f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,17 +5,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +# [0.2.1] + +## Features + +- code cleanup +- fix in segmentation of the preprocessing + # [0.2.0] +## Features + +- update to C++20 and SeqAn 3.3.0 +- native support for concurrency + # [0.1.1] ## Fix - # [0.1.0] -# [0.0.1] - -## [0.0.0] - +### Features +- Initial implementation of RNAnue diff --git a/CMakeLists.txt b/CMakeLists.txt index 32acd4b0..f2b2bc18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.25) +cmake_minimum_required(VERSION 3.22.1) project(RNAnue VERSION 0.2.0) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -9,7 +9,7 @@ include(CMakePrintHelpers) # configure header file to pass the version number to the source code configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/Config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/include/Config.hpp) -###### Seqan ##### +###### SeqAn ##### list (APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/seqan3/build_system") find_package (seqan3 3.0 REQUIRED) find_package(OpenMP) @@ -22,11 +22,19 @@ if(Boost_FOUND) include_directories(${Boost_INCLUDE_DIRS}) endif() +###### HTSlib ###### +find_package(PkgConfig REQUIRED) +pkg_check_modules(HTSLIB REQUIRED IMPORTED_TARGET htslib) +include_directories(${HTSLIB_INCLUDE_DIRS}) + file(GLOB SOURCES "src/*.cpp") add_executable(RNAnue ${SOURCES}) target_include_directories(RNAnue PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") target_link_libraries(RNAnue seqan3::seqan3) target_link_libraries(RNAnue Boost::program_options Boost::filesystem) +target_link_libraries(RNAnue PkgConfig::HTSLIB) +#target_link_libraries(RNAnue -L/usr/local/include -lRNA -flto) + cmake_print_properties(TARGETS RNAnue PROPERTIES TARGET_INCLUDE_DIRECTORIES) ###### Tests ###### diff --git a/Dockerfile b/Dockerfile index e49e4913..4ecd85a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ LABEL authors="Richard A. Schaefer" RUN apt-get -y update && apt-get -y upgrade RUN apt-get install -y curl build-essential cmake git pkg-config RUN apt-get install -y libbz2-dev zlib1g-dev libncurses5-dev liblzma-dev -RUN apt-get install -y libboost-all-dev +RUN apt-get install -y libboost-all-dev gdb # install htslib WORKDIR / @@ -35,6 +35,8 @@ RUN export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig RUN make all RUN cp segemehl.x /usr/local/bin +# install ViennaRNA + # retrieve RNAnue WORKDIR / RUN git clone https://github.com/Ibvt/RNAnue.git diff --git a/README.md b/README.md index 45f2992d..f713cb18 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ RNAnue is a comprehensive analysis to detect RNA-RNA interactions from Direct-Du ### Dependencies RNAnue has the following dependencies, whereas the brackets indicate the version RNAnue has been build and tested on. Make sure the requirements are satified by your system. cmake is able -to detect the Boost libraries system-wide. However seqan is expected to be located in the current -folder of RNAnue as specified in the CMakeLists.txt. Segemehl and the Vienna binaries need to be +to detect the Boost libraries system-wide. However Seqan3 is expected to be located in the current +folder of RNAnue as specified in the CMakeLists.txt. egemehl and the Vienna binaries need to be located in $PATH. * [Boost C++ Libraries](https://www.boost.org/) (v1.7.2) @@ -18,8 +18,6 @@ located in $PATH. * [Segemehl](http://www.bioinf.uni-leipzig.de/Software/segemehl/) (v0.3.4) * [Vienna Package](https://www.tbi.univie.ac.at/RNA/#binary_packages) (v2.4.17) - - ### CMake RNAnue is build using CMake. At first, clone the repository and change into the source directory. ``` @@ -29,13 +27,6 @@ cd RNAnue In the next step, retrieve the SeqAn library and place it in the root folder of RNAnue ``` -``` - - - - - - CMake is a cross-platform Makefile generator. For that, we provide the [CMakeLists](./source/CMakeLists.txt) to simplify the build process. In particular, it utilizes the instructions given in the CMakeLists. It is recommended to create a "out-of-source build". For that, create a build folder (e.g., ./bin) @@ -81,7 +72,6 @@ RNAnue --config /path/to/params.cfg Here, subcall corresponds to positional arguments.In any case, the specifying parameters over the command lines has precedence over the config file. - ## Results In principle, the results of the analysis are stored in the specified output folder and its subfolders @@ -90,7 +80,7 @@ and the RNA-RNA interactions. RNAnue reports the split reads in SAM format. Addi scores and hybridization energies are stored in the tags FC and FE, respectively. We report the clusters in a custom format that includes the IDs of the clusters, its length, size and genomic coordinates. -### Split Reads (.SAM) +### Split Reads (.BAM) RNAnue reports the detected splits in .SAM format (RNAnue `detect`). In this file, pairs of rows represent the split reads, consisting of the individual segments, e.g @@ -114,9 +104,143 @@ duplex. ### Clustering results - +The `clustering` procedure reports a single clusters.tab file which is a tab-delimited file of the clustering results. +Here, each line represents a cluster that corresponds to overlapping split reads, defined by the two segments. The +columns are defined in the following: + +| Field | Description | +| ----- | ----------- | +| clustID | Unique identifier of the cluster | +| fst_seg_chr | Chromosome (accession) of the first segment | +| fst_seg_strd | Strand where the first segment is located | +| fst_seg_strt | Start position of the first segment in the cluster | +| fst_seg_end | End position of the first segment in the cluster | +| sec_seg_chr | Chromosome (accession) of the second segment | +| sec_seg_strd | Strand where the second segment is located | +| sec_seg_strt | Start position of the second segment in the cluster | +| sec_seg_end | End position of the second segment in the cluster | +| no_splits | Number of split reads in the cluster | +| fst_seg_len | Length of the first segment | +| sec_seg_len | Length of the second segment | ### Interaction table + +The `analysis` procedure generates `_interactions` files for each library in +which each line represents an annotated split read that is mapped to a +transcript interaction. The fields are defined as follows: + +| Field | Description | +| ----- | ----------- | +| qname | read/template identifier | +| fst_seg_strd | Strand where the first segment is located | +| fst_seg_strt | Start position of the first segment | +| fst_seg_end | End position of the first segment | +| fst_seg_ref | Reference name of the first segment corresponding to the seqid in GFF and/or clusterID | +| fst_seg_name | Name of the first segment that corresponds to gene name/symbol and/or clusterID | +| first_seg_bt | Biotype of the annotation transcript (if available) | +| fst_seg_anno_strd | Strand information of the transcript in the overlapping annotation | +| fst_seg_prod | Description of the transcript (if available in annotation) | +| fst_seg_ori | Orientation of the segment with respect to annotation (sense/antisense) | +| sec_seg_strd | Strand where the second segment is located | +| sec_seg_strt | Start position of the second segment | +| sec_seg_end | End position of the second segment | +| sec_seg_ref | Reference name of the second segment corresponding to the seqid in GFF and/or clusterID | +| sec_seg_name | Name of the second segment that corresponds to gene name/symbol and/or clusterID | +| sec_seg_bt | Biotype of the annotation transcript (if available) | +| sec_seg_anno_strd | Strand information of the transcript in the overlapping annotation | +| sec_seg_prod | Description of the transcript (if available in annotation) | +| sec_seg_ori | Orientation of the segment with respect to annotation (sense/antisense) | +| cmpl | Complementarity score of the interaction | +| fst_seg_compl_aln | Alignment results in the complementarity calculation of the first segment | +| sec_seg_cmpl_aln | Alignment results in the complementarity calculation of the second segment | +| mfe | Hybridisation energy of the interaction | +| mfe_struc | Minimum free energy (MFE) structure of interaction in dot-bracket notation | + +The main result of an RNAnue analysis are transcript interactions. +They are stored in the file `allints.txt` in the same directory. +Its entries are structured as described in the following where +columns with prefix are given for each sample specified in +the analysis (within the same file). + +| Field | Description | +| ----- | ----------- | +| fs_rna | Gene/Transcript name of the first interaction partner | +| sec_rna | Gene/Transcript name of the second interaction partner | +| fst_rna_ori | Orientation of the first interaction partner with respect to annotation (sense/antisense) | +| sec_rna_ori | Orientation of the second interaction partner with respect to annotation (sense/antisense) | +| _supp_reads | Number of (split)reads that support the interaction | +| _ges | Global energy score (gcs) of the interaction | +| _ghs | Global hybridisation score (ghs) of the interaction | +| _pval | Statistical significance (p-value) of the interaction | +| _padj | Benjamini-Hochberg adjusted p-value among the samples | + +If the option –outcnt is set to 1 RNAnue generates the count table `counts.txt` in the output directory. +It contains the counts of each interaction for each sample and can be used for differential expression +analysis. Similarly, –outjgf set to 1 generates a `graph.json` file that contains the detected interactions +in JSON graph format. Finally, –stats set to 1 creates a `stats.txt` file that contains basic statistics for +each step of the analysis. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ### Docker diff --git a/build/params.cfg b/build/params.cfg index 0eae0224..a72ab945 100644 --- a/build/params.cfg +++ b/build/params.cfg @@ -31,7 +31,10 @@ accuracy = 90 # min percentage of matches per read in semi-global alignment minfragsco = 15 # min score of a spliced fragment minfraglen = 15 # min length of a spliced fragment minsplicecov = 80 # min coverage for spliced transcripts -exclclipping = 0 # exclude soft clipping from +mapq = 20 # min mapping quality for spliced transcripts +exclclipping = 0 # exclude soft clipping from +unprd = 0 # only for paired-end reads: include unpaired reads in the subsequent analysis +unmrg = 0 # only for paired-end reads: include unmerged reads in the subsequent analysis ### SPLIT READ CALLING sitelenratio = 0.0 diff --git a/include/Analysis.hpp b/include/Analysis.hpp index b3d50834..c3280c8a 100644 --- a/include/Analysis.hpp +++ b/include/Analysis.hpp @@ -4,9 +4,14 @@ // Standard #include +// Boost +#include + +namespace po = boost::program_options; + class Analysis { public: - Analysis(); + Analysis(po::variables_map params); ~Analysis(); void start(); diff --git a/include/FilterScores.hpp b/include/FilterScores.hpp new file mode 100644 index 00000000..648f9fb9 --- /dev/null +++ b/include/FilterScores.hpp @@ -0,0 +1,42 @@ +#ifndef RNANUE_FILTERSCORES_HPP +#define RNANUE_FILTERSCORES_HPP + +#include "DataTypes.hpp" + +class Complementarity { + public: + Complementarity(); + ~Complementarity(); + + void compute(dtp::DNAVector seq1, dtp::DNAVector seq2); + + private: + double score; + int alignmentLength; + double siteLengthRatio; + int matches; +}; + +class Hybridization { + public: + Hybridization(); + ~Hybridization(); + + void compute(dtp::DNAVector seq1, dtp::DNAVector seq2); + + private: + double score; + int alignmentLength; + double siteLengthRatio; + int matches; +}; + + +class FilterScores { + public: + FilterScores(); + ~FilterScores(); + +}; + +#endif //RNANUE_FILTERSCORES_HPP diff --git a/include/IBPTree.hpp b/include/IBPTree.hpp index b6d49641..e60ae540 100644 --- a/include/IBPTree.hpp +++ b/include/IBPTree.hpp @@ -34,6 +34,10 @@ class IBPTree { void insert(IntervalData& data); void insertIter(Node* node, IntervalData& data); void splitNode(Node* node, int index); + std::vector search(std::string chrom, dtp::Interval interval); + void searchIter(Node* node, const dtp::Interval& interval, std::vector results); + bool isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2); + std::map getAttributes(std::string& attributes); std::string getTag(std::map attributes, const std::vector& keys); diff --git a/include/Node.hpp b/include/Node.hpp index a4c50454..f2e53c7f 100644 --- a/include/Node.hpp +++ b/include/Node.hpp @@ -39,6 +39,7 @@ class IntervalData { // operations void addJunction(std::string junction); bool isSubset(int start, int end); + bool isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2); void printNode(); private: diff --git a/include/SplitReadCalling.hpp b/include/SplitReadCalling.hpp index b6814b93..9c9de585 100644 --- a/include/SplitReadCalling.hpp +++ b/include/SplitReadCalling.hpp @@ -38,7 +38,7 @@ using types = seqan3::type_list< std::optional, std::optional, dtp::CigarVector, - dtp::DNAVector, + dtp::DNASpan, seqan3::sam_tag_dictionary>; using fields = seqan3::fields< @@ -51,6 +51,19 @@ using fields = seqan3::fields< seqan3::field::seq, seqan3::field::tags>; +using types2 = seqan3::type_list< + std::vector, + std::string, + std::vector>; +using fields2 = seqan3::fields< + seqan3::field::seq, + seqan3::field::id, + seqan3::field::cigar>; +using sam_record_type = seqan3::sam_record; + + + + // define tags using seqan3::operator""_tag; template <> struct seqan3::sam_tag_type<"XH"_tag> { using type = int32_t; }; // number of splits (SAM record) @@ -63,7 +76,7 @@ template <> struct seqan3::sam_tag_type<"XN"_tag> { using type = int32_t; }; -using SAMrecord = seqan3::record; +using SAMrecord = seqan3::sam_record; using namespace seqan3::literals; using seqan3::get; @@ -103,7 +116,9 @@ class SplitReadCalling { template void process(std::vector& readrecords, auto& singleOut, auto& splitsOut, auto& multsplitsOut, auto& singleOutMutex, auto& splitsOutMutex, auto& multsplitsOutMutex); - bool filter(auto& sequence, uint32_t cigarmatch); + void decide(std::map>& putative, auto& splitsOut, auto& multsplitsOut, + auto& splitsOutMutex, auto& multsplitsOutMutex); + bool filter(auto& sequence, uint32_t cigarmatch, std::string chrom, dtp::Interval interval); void storeSegments(auto& splitrecord, std::optional& refOffset, dtp::CigarVector& cigar, dtp::DNAVector& seq, seqan3::sam_tag_dictionary &tags, std::vector& curated); @@ -113,6 +128,7 @@ class SplitReadCalling { //Stats stats; std::shared_ptr stats; std::string condition; // stores the current condition + std::deque refIds; // stores the reference ids }; #endif //RNANUE_DETECT_HPP diff --git a/src/Align.cpp b/src/Align.cpp index db9fb9bd..d6c1cde0 100644 --- a/src/Align.cpp +++ b/src/Align.cpp @@ -49,6 +49,7 @@ void Align::alignReads(std::string query, std::string mate, std::string matched) align += " -p " + mate; } align += " -o " + matched; + std::cout << align << "\n"; const char* alignCallChar = align.c_str(); int result = system(alignCallChar); if(result != 0) { diff --git a/src/Analysis.cpp b/src/Analysis.cpp index 13d9fa6a..f8111559 100644 --- a/src/Analysis.cpp +++ b/src/Analysis.cpp @@ -1,5 +1,394 @@ #include "Analysis.hpp" -Analysis::Analysis() { - std::cout << "Analysis object created" << std::endl; +// +Analysis::Analysis(po::variables_map _params) : params(_params) { + std::string line; + std::ifstream anno; + + anno.open(params["features"].as()); + // parse annotations + if(!anno.is_open()) { + perror("Error open"); + exit(EXIT_FAILURE); + } + while(getline(anno, line)) { + if(line[0] == '#') { continue; } + + std::vector tokens; + std::istringstream iss(line); + std::string token; + while(std::getline(iss, token, '\t')) { + tokens.push_back(token); + } + + // boundaries + std::pair bnds = std::make_pair(std::stoi(tokens[3]),std::stoi(tokens[4])); + std::pair,std::string> con = std::make_pair(bnds,"strand="+tokens[6]+";"+tokens[8]); + + std::size_t start_position; + if(features.count(tokens[0]) == 0) { // check of RefName is in map + if(start_position != std::string::npos) { + features.insert(std::pair,std::string>>>(tokens[0],{con})); + } + } else { + features[tokens[0]].push_back(con); + } + } + + // read file + fs::path freqFile = fs::path(params["outdir"].as()) / fs::path("frequency.txt"); + std::ifstream freqHandle; + freqHandle.open(freqFile.string()); + + // iterate and store into frquency map + while(getline(freqHandle, line)) { + std::vector tokens; + std::istringstream iss(line); + std::string token; + while(std::getline(iss, token, '\t')) { + tokens.push_back(token); + } + frequency.insert(std::pair(tokens[0],std::stoi(tokens[1]))); + } } + +// +void Analysis::createCountTable() { + std::map,std::vector,std::vector>>> counts; + + std::ifstream intsfile; + for(unsigned i=0;i tokens; + std::istringstream iss(line); + std::string token; + while(std::getline(iss, token, '\t')) { + tokens.push_back(token); + } + + std::tuple key1; + std::tuple key2; + key1 = std::make_tuple(tokens[5],tokens[8],tokens[13],tokens[16]); + key2 = std::make_tuple(tokens[5],tokens[8],tokens[13],tokens[16]); + + // + if(counts.count(key1) == 0 && counts.count(key2) == 0) { + std::vector,std::vector>> content; + + for(unsigned j=0; j,std::vector> vals{0,{},{}}; + content.push_back(vals); + } + std::get<0>(content[i]) = 1; + std::vector en = std::get<1>(content[i]); + std::vector cm = std::get<2>(content[i]); + + en.push_back(std::stof(tokens[17])); + cm.push_back(std::stof(tokens[18])); + + std::get<1>(content[i]) = en; + std::get<2>(content[i]) = cm; + + counts.insert(std::pair,std::vector,std::vector>>>(key1,content)); + + } else { + if(counts.count(key1) > 0) { + std::tuple,std::vector> oldVal; + oldVal = counts[key1][i]; + std::get<0>(oldVal) = std::get<0>(oldVal) + 1; + + std::vector oldValEnergy = std::get<1>(oldVal); + std::vector oldValCmpl = std::get<2>(oldVal); + + oldValEnergy.push_back(std::stof(tokens[17])); + oldValCmpl.push_back(std::stof(tokens[18])); + + std::get<1>(oldVal) = oldValEnergy; + std::get<2>(oldVal) = oldValCmpl; + + counts[key1][i] = oldVal; + } else { + if(counts.count(key2) > 0) { + std::tuple,std::vector> oldVal; + oldVal = counts[key2][i]; + std::get<0>(oldVal) = std::get<0>(oldVal) + 1; + + std::vector oldValEnergy = std::get<1>(oldVal); + std::vector oldValCmpl = std::get<2>(oldVal); + + oldValEnergy.push_back(std::stof(tokens[17])); + oldValCmpl.push_back(std::stof(tokens[18])); + + std::get<1>(oldVal) = oldValEnergy; + std::get<2>(oldVal) = oldValCmpl; + + counts[key2][i] = oldVal; + } + } + } + } + } + + + // write back to file + std::string outfile = params["outdir"].as(); + fs::path outPath = fs::path(outfile) / "allints.txt"; + + std::ofstream outFileHandle; + outFileHandle.open(outPath.string()); + + outFileHandle << "RNA1\tRNA2\tRNA1orientation\tRNA2orientation\t"; + + for(unsigned i=0;i(it->first) << "\t"; + outFileHandle << std::get<2>(it->first) << "\t"; + outFileHandle << std::get<1>(it->first) << "\t"; + outFileHandle << std::get<3>(it->first) << "\t"; + + // + for(unsigned z=0;zsecond.size();++z) { + outFileHandle << std::get<0>(it->second[z]) << "\t"; + + std::vector vecNrg = std::get<1>(it->second[z]); + std::vector vecCpl = std::get<2>(it->second[z]); + + // determine ges + float ges = 0.0; + if(vecNrg.size() == 1) { + ges = vecNrg[0]; + } else { + if(vecNrg.size() > 1) { + auto m = vecNrg.begin() + vecNrg.size()/2; + std::nth_element(vecNrg.begin(), m, vecNrg.end()); + double min = *min_element(vecNrg.begin(), vecNrg.end()); + ges = vecNrg[vecNrg.size()/2]; + } + } + outFileHandle << ges << "\t"; + + // determine gcs + float gcs = 0.0; + if(vecCpl.size() == 1) { + gcs = vecCpl[0]; + } else { + if(vecCpl.size() > 1) { + auto m = vecCpl.begin() + vecCpl.size()/2; + std::nth_element(vecCpl.begin(), m, vecCpl.end()); + double min = *min_element(vecCpl.begin(), vecCpl.end()); + gcs = vecCpl[vecCpl.size()/2]; + } + } + outFileHandle << gcs << "\t"; + } + outFileHandle << "\n"; + } + outFileHandle.close(); +} + + + +// +std::string Analysis::retrieveTagValue(std::string tags, std::string tagName, std::string oldValue) { + std::size_t start_position = tags.find(tagName+"="); + // gene name + if(start_position != std::string::npos) { + std::string sub = tags.substr(start_position+tagName.size()+1,tags.length()); + std::size_t end_position = sub.find(";"); + oldValue = sub.substr(0,end_position); + } + return oldValue; +} + + +float Analysis::calc_pvalue(int x, int n, float p) { + // simulate draw from binomial distribution + std::binomial_distribution binomial(n, p); + // assign p value + float pval = 1.0 - cdf(binomial, x); + return pval; +} + +void Analysis::start(pt::ptree sample) { + // retrieve input and output files + pt::ptree input = sample.get_child("input"); + std::string splits = input.get("splits"); + pt::ptree output = sample.get_child("output"); + std::string interactions = output.get("interactions"); + + + interPaths.push_back(interactions); + + // input .sam record + seqan3::alignment_file_input fin{splits, + seqan3::fields{}}; + + std::vector flags; + std::vector refIDs; + std::vector> refOffsets; + + std::vector ref_lengths{}; + for(auto &info : fin.header().ref_id_info) { + ref_lengths.push_back(std::get<0>(info)); + } + std::deque ref_ids = fin.header().ref_ids(); + +// uint32_t flag, start, end; + + // open file + std::ofstream outInts; + outInts.open(interactions); + + std::string entry; // stores output to write to file + + // variables for interactions file + std::string qNAME, flag; + uint32_t start, end; + std::string geneID, geneName, product, annoStrand; + float hybnrg, cmpl; + + seqan3::sam_tag_dictionary tags; + + outInts << "#QNAME\tSegment1Strand\tSegment1Start\tSegment1End\tSegment1RefName" + outInts << "\tSegment1Name\tSegment1AnnoStrand\tSegment1Product\tSegment1Orientation" + outInts << "\tSegment2Strand\tSegment2Start\tSegment2End\tSegment2RefName\tSegment2Name" + outInts << "\tSegment2AnnoStrand\tSegment2Product\tSegment1Orientation\tenergy\tcomplementarity\n"; + + int segCnt = 0; + int segCntMatch = 0; + int segFound = 0; + + for(auto && rec : fin | seqan3::views::chunk(2)) { + entry = ""; + hybnrg = 0.0; + cmpl = 0.0; + for(auto & split : rec) { + // seqan3::debug_stream << split << std::endl; + + qNAME = seqan3::get(split); + flag = "+"; + if(static_cast(seqan3::get(split) + & seqan3::sam_flag::on_reverse_strand)) { + flag = "-"; + } + + // start & end + start = seqan3::get(split).value(); + end = start + seqan3::get(split).size()-1; + + // refID + std::optional refIDidx = seqan3::get(split); + std::string refID = ref_ids[refIDidx.value()]; + + tags = seqan3::get(split); + auto nrg = tags["XE"_tag]; + auto cpl = tags["XC"_tag]; + hybnrg = std::get(nrg); + cmpl = std::get(cpl); + + std::vector,std::string>> feat; + if(features.count(refID) > 0) { + feat = features[refID]; + + std::pair geneNames; + geneID = "."; + geneName = "."; + product = "."; + annoStrand = "."; + for(unsigned i=0;i= feat[i].first.first && start <= feat[i].first.second) || + (end >= feat[i].first.first && end <= feat[i].first.second)) { + + // make sure that overlap is in exon + if(params["splicing"].as() && feat[i].second.find("exon") == std::string::npos) { + continue; + } + geneID = retrieveTagValue(feat[i].second, "ID", geneName); + geneName = retrieveTagValue(feat[i].second, "gene", geneName); + product = retrieveTagValue(feat[i].second, "product", product); + annoStrand = retrieveTagValue(feat[i].second, "strand", annoStrand); + + segFound = 1; // found a match (in annotation) for segment + } + } + + // search for geneNames in frequency map + geneNames = std::make_pair(retrieveTagValue(feat[0].second, "gene", geneName), retrieveTagValue(feat[1].second, "gene", geneName)); + + float value; + // check if geneNames first in frequency map + if(frequency.count(geneNames.first) > 0 && frequency.count(geneNames.second) > 0) { + if (geneNames.first == geneNames.second) { + value = frequency[geneNames.first] * frequency[geneNames.second]; + } else { + value = 2 * (frequency[geneNames.first] * frequency[geneNames.second]); + } + } else{ + value = 0; + } + + // calculate p-value + int draws = frequency[geneNames.first] + frequency[geneNames.second]; + float pval = calc_pvalue(segCntMatch, draws, value); + + + if(segFound == 1) { + if(segCnt == 0) { entry += qNAME + "\t";} + entry += flag + "\t"; + entry += std::to_string(start) + "\t"; + entry += std::to_string(end) + "\t"; + entry += refID + "\t"; + entry += geneName + "\t"; + entry += annoStrand + "\t"; + entry += "\"" + product + "\"\t"; + + if(flag == annoStrand) { + entry += "sense\t"; + } else { + entry += "antisense\t"; + } + + entry += pval + "\t"; + + segCntMatch++; + segFound = 0; + } + } + segCnt++; // on to the next segment + } + + if(segCntMatch == 2) { + outInts << entry << hybnrg << "\t" << cmpl << "\n"; + } + + segCnt = 0; // reset segment + segCntMatch = 0; + } + + outInts.close(); +} + + diff --git a/src/Data.cpp b/src/Data.cpp index bf28e92f..7aa705e4 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -57,7 +57,6 @@ void Data::alignDataPrep() { * (ctrls -> "/Users/.../crtls") */ GroupsPath groups = getGroupsPath(ctrlsPath, trtmsPath); - getCondition(groups); } } diff --git a/src/IBPTree.cpp b/src/IBPTree.cpp index b59c9449..2067d1da 100644 --- a/src/IBPTree.cpp +++ b/src/IBPTree.cpp @@ -201,6 +201,50 @@ void IBPTree::splitNode(Node* parent, int index) { } } +std::vector IBPTree::search(std::string chrom, dtp::Interval interval) { + Node* root = this->rootnodes[chrom]; // search for the root node + std::vector result; + searchIter(root, interval, result); +} + +void IBPTree::searchIter(Node* node, const dtp::Interval& interval, std::vector results) { + if(node->isLeaf) { + Node* current = node; + for(auto& intvl : current->keys) { + if(isOverlapping(intvl.first, interval)) { + results.push_back(intvl.second); + } + if(!current->next || !isOverlapping(current->next->keys[0].first, interval)) { + break; + } + node = node->next; + } + } else { + int i = 0; + while(i < node->keys.size() && + (interval.first > node->keys[i].first.first && + (interval.first > node->keys[i].first.second))){ + i++; + } + if(node->children[i] != nullptr) { + searchIter(node->children[i], interval, results); + } + } +} + +bool IBPTree::isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2) { + if((intvl1.first <= intvl2.first && intvl1.second >= intvl2.second) || + (intvl1.first >= intvl2.first && intvl1.second <= intvl2.second) || + (intvl1.first <= intvl2.first && intvl1.second > intvl2.first) || + (intvl1.first > intvl2.first && intvl1.first < intvl2.second)) { + return true; + } +} + + + + + // get attribute from the attributes fields std::string IBPTree::getTag(std::map attributes, const std::vector& keys) { std::string element = ""; @@ -249,57 +293,3 @@ void IBPTree::traverse(Node* parent, Node* child, int link, std::ostream& ofs) c traverse(child, child->children[i], i, ofs); } } - -/* -// Function to search for intervals overlapping with the given interval -std::vector IBPTree::search(const Interval& interval) { - std::vector result; - if (root != nullptr) - searchHelper(root, interval, result); - return result; -} - -// Helper function for searching intervals -void IBPTree::searchHelper(Node* node, const Interval& interval, std::vector& result) { - if (node->isLeaf()) { - LeafNode* leaf = dynamic_cast(node); - for (const Interval& key : leaf->keys) { - if (interval.low <= key.high && interval.high >= key.low) - result.push_back(key); - } - } else { - InternalNode* internal = dynamic_cast(node); - int i = 0; - while (i < internal->keys.size() && interval.low > internal->keys[i]) - i++; - searchHelper(internal->children[i], interval, result); - } -} - - - -*/ -/* -if(root->getLeaf()) { // root is a leaf node - root->addInterval(data); - if(root->getKeys().size() > this->order-1) { - splitNode(root); - } -} else { - dtp::Interval interval = std::make_pair(5300,5400); - Node* child = traverse(interval, root); - if(child == nullptr) { - std::cout << "nullptr" << std::endl; - } else { - std::cout << "new Child: " << child->keysToString() << std::endl; - } - -*/ - - - - -/* -std::cout << "Child: " << child->keysToString() << std::endl; - */ - diff --git a/src/Node.cpp b/src/Node.cpp index 6e9227f6..77b7bf91 100644 --- a/src/Node.cpp +++ b/src/Node.cpp @@ -35,6 +35,7 @@ bool IntervalData::isSubset(int start, int end) { return (start >= this->interval.first && end <= this->interval.second); } + // create new node Node::Node(int k) : order{k}, keys{}, children{}, next{nullptr}, parent{nullptr}, isLeaf{false} {} diff --git a/src/SplitReadCalling.cpp b/src/SplitReadCalling.cpp index d46a5fa7..0e4f5c23 100644 --- a/src/SplitReadCalling.cpp +++ b/src/SplitReadCalling.cpp @@ -19,7 +19,6 @@ ThreadPool::ThreadPool(size_t num_threads) : stop(false) { } } - ThreadPool::~ThreadPool() { { std::unique_lock lock(queue_mutex); @@ -47,13 +46,14 @@ auto ThreadPool::enqueue(F&& f, Args&&... args) -> std::futureparams = params; // create IBPTree (if splicing need to be considered) - if(params["splicing"].as>() == std::bitset<1>("1")) { + if(params["splicing"].as>() == std::bitset<1>("1")) { // order of the IBPTree int k = 7; // can be later extracted from config file // create IBPT this->features = IBPTree(params, k); } this->condition = ""; // stores the current condition + this->refIds = std::deque(); // stores the reference ids // initialize stats this->stats = std::make_shared(); @@ -61,7 +61,12 @@ SplitReadCalling::SplitReadCalling(po::variables_map params) { const char* seq = "GGGAAAUCC"; } -SplitReadCalling::~SplitReadCalling() {} +SplitReadCalling::~SplitReadCalling() { + if(params["stats"].as>() == std::bitset<1>("1")) { + fs::path outdir = fs::path(params["outdir"].as()); + stats->writeStats(outdir); + } +} void SplitReadCalling::iterate(std::string& matched, std::string& single, std::string &splits, std::string &multsplits) { ThreadPool pool(params["threads"].as()); // initialize thread pool @@ -71,10 +76,11 @@ void SplitReadCalling::iterate(std::string& matched, std::string& single, std::s for(auto &info : fin.header().ref_id_info) { ref_lengths.push_back(std::get<0>(info)); } - std::deque ref_ids = fin.header().ref_ids(); - seqan3::sam_file_output singleOut{single, ref_ids, ref_lengths}; // initialize output file - seqan3::sam_file_output splitsOut{splits, ref_ids, ref_lengths}; // initialize output file - seqan3::sam_file_output multsplitsOut{multsplits, ref_ids, ref_lengths}; // initialize output file + this->refIds = fin.header().ref_ids(); + + seqan3::sam_file_output singleOut{single, this->refIds, ref_lengths}; // initialize output file + seqan3::sam_file_output splitsOut{splits, this->refIds, ref_lengths}; // initialize output file + seqan3::sam_file_output multsplitsOut{multsplits, this->refIds, ref_lengths}; // initialize output file // create mutex objects for output - had to be done here as class variables cause errors w/ std::bind std::mutex singleOutMutex; @@ -145,7 +151,7 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, aut // stores the putative splits/segments std::vector curated; - std::map> segments; + std::map> putative{}; for(auto& rec : readrecords) { qname = rec.id(); @@ -153,6 +159,7 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, aut ref_id = rec.reference_id(); ref_offset = rec.reference_position(); mapq = rec.mapping_quality(); + seq = rec.sequence(); // CIGAR string cigar = rec.cigar_sequence(); @@ -164,17 +171,18 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, aut auto xjtag = tags.get<"XJ"_tag>(); auto xhtag = tags.get<"XH"_tag>(); - seqan3::debug_stream << "QNAME: " << qname << "\n"; - seqan3::debug_stream << "FLAG: " << flag << "\n"; - - if(xjtag < 2) { continue; } // ignore reads with less than 2 splits + if(xjtag < 2 ) { // ignore reads with less than 2 splits (or no splits at all) + { + std::lock_guard lock(singleOutMutex); + singleOut.push_back(rec); + } + continue; + } startPosRead = 1; // initialize start/end of read endPosRead = 0; startPosSplit = xxtag; endPosSplit = xytag; - seqan3::debug_stream << cigar << std::endl; - cigarMatch = 0; // reset matches count in cigar // check the cigar string for splits for(auto& cig: cigar) { @@ -190,15 +198,11 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, aut newTags.get<"XY"_tag>() = endPosSplit; newTags.get<"XN"_tag>() = splitId; - // filter segments - if(filter(subSeq, cigarMatch)) { + // filter & store the segments + dtp::Interval intvl = {ref_offset.value(), ref_offset.value() + endPosRead}; + if(filter(subSeq, cigarMatch, this->refIds[ref_id.value()], intvl)) { storeSegments(rec, ref_offset, cigarSplit, seq, tags, curated); } - - - - - // settings to prepare for the next split startPosSplit = endPosSplit+1; startPosRead = endPosRead+1; @@ -232,49 +236,102 @@ void SplitReadCalling::process(std::vector& readrecords, auto& singleOut, aut newTags.get<"XY"_tag>() = endPosSplit; newTags.get<"XN"_tag>() = splitId; - // filter segements - - if(segNum) { - + // filter and store the segments + dtp::Interval intvl = {ref_offset.value(), ref_offset.value() + endPosRead}; + if(filter(subSeq, cigarMatch, this->refIds[ref_id.value()], intvl)) { + storeSegments(rec, ref_offset, cigarSplit, seq, tags, curated); } + if(segNum == xjtag) { + putative.insert(std::make_pair(splitId, curated)); + curated.clear(); + segNum = 0; + ++splitId; + } + } + decide(putative, splitsOut, multsplitsOut, splitsOutMutex, multsplitsOutMutex); +} - - /* - std::string recid = rec.id(); - dtp::DNAVector recseq = rec.sequence(); - dtp::CigarVector reccigar = rec.cigar_sequence(); - - SAMRec outrec{recid, recseq, reccigar};*/ - - - - - - // single output - /* - { - std::lock_guard lock(singleOutMutex); - singleOut.push_back(outrec); - }*/ - /* - // splits output - { - std::lock_guard lock(splitsOutMutex); - splitsOut.push_back(outrec); +void SplitReadCalling::decide(std::map>& putative, auto& splitsOut, auto& multsplitsOut, + auto& splitsOutMutex, auto& multsplitsOutMutex) { + std::vector p1, p2; + seqan3::sam_tag_dictionary p1Tags, p2Tags; + std::pair scores; + std::string p1QNAME, p2QNAME; + + if(putative.size() > 0) { + for(auto& [splitId, records] : putative) { + if(records.size() > 1) { // splits contain more than one segment + for(unsigned i=0;i(); + auto p1End = p1Tags.get<"XY"_tag>(); + auto p2Start = p2Tags.get<"XX"_tag>(); + auto p2End = p2Tags.get<"XY"_tag>(); + + // prevent overlap between read position -> same segment of read + if(p1.empty() && p2.empty()) { + TracebackResult initCmpl = complementarity(seqan3::get(splits[i]), seqan3::get(splits[j])); + double initHyb = hybridize(seqan3::get(splits[i]), seqan3::get(splits[j])); + if(!initCmpl.a.empty()) { // split read survived complementarity & sitelenratio cutoff + if(initHyb <= params["nrgmax"].as()) { + filters = std::make_pair(initCmpl.cmpl, initHyb); + addComplementarityToSamRecord(splits[i], splits[j], initCmpl); + addHybEnergyToSamRecord(splits[i], splits[j], initHyb); + splitSegments.push_back(std::make_pair(splits[i],splits[j])); + } + } + } else { + TracebackResult cmpl = complementarity(seqan3::get(splits[i]), seqan3::get(splits[j])); + double hyb = hybridize(seqan3::get(splits[i]), seqan3::get(splits[j])); + + if(!cmpl.a.empty()) { // check that there is data for complementarity / passed cutoffs + if(cmpl.cmpl > filters.first) { // complementarity is higher + splitSegments.clear(); + filters = std::make_pair(cmpl.cmpl, hyb); + addComplementarityToSamRecord(splits[i], splits[j], cmpl); + addHybEnergyToSamRecord(splits[i], splits[j], hyb); + splitSegments.push_back(std::make_pair(splits[i],splits[j])); + } else{ + if(cmpl.cmpl == filters.first) { + if(hyb > filters.second) { + splitSegments.clear(); + filters = std::make_pair(cmpl.cmpl, hyb); + addComplementarityToSamRecord(splits[i], splits[j], cmpl); + addHybEnergyToSamRecord(splits[i], splits[j], hyb); + splitSegments.push_back(std::make_pair(splits[i],splits[j])); + } + } else { + if(hyb == filters.second) { // same cmpl & hyb -> ambigious read! + addComplementarityToSamRecord(splits[i], splits[j], cmpl); + addHybEnergyToSamRecord(splits[i], splits[j], hyb); + splitSegments.push_back(std::make_pair(splits[i],splits[j])); + } + } + } + } + } + } + } + } } - // multsplits output - { - std::lock_guard lock(multsplitsOutMutex); - multsplitsOut.push_back(outrec); - }*/ + } } -bool SplitReadCalling::filter(auto& sequence, uint32_t cigarmatch) { - // filter segments +bool SplitReadCalling::filter(auto& sequence, uint32_t cigarmatch, std::string chrom, dtp::Interval intvl) { + //seqan3::debug_stream << sequence << std::endl; if(sequence.size() < params["minlen"].as()) { return false; } double matchrate = static_cast(cigarmatch) / static_cast(sequence.size()); if(matchrate < params["minfragmatches"].as()) { return false; } + if(params["splicing"].as>() == std::bitset<1>("1")) { + std::cout << "check for interval :" << intvl.first << " " << intvl.second << std::endl; + // check if the segment is spliced + //std::vector ovlps = features.search(chrom, intvl); + //std::cout << "overlap " << ovlps.size() << std::endl; + } return true; } @@ -282,20 +339,12 @@ void SplitReadCalling::storeSegments(auto& splitrecord, std::optional& dtp::CigarVector& cigar, dtp::DNAVector& seq, seqan3::sam_tag_dictionary& tags, std::vector& curated) { SAMrecord segment{}; // create new SAM Record - - using types = seqan3::type_list, std::string, std::vector>; - using fields = seqan3::fields; - using sam_record_type = seqan3::sam_record; - - - // write the following to the file - // r001 0 * 0 0 4M2I2M2D * 0 0 ACGTACGT * - sam_record_type record{}; - record.id() = "r001"; - record.sequence() = "ACGTACGT"_dna5; - - seqan3::debug_stream << splitrecord.id() << std::endl; - + segment.id() = splitrecord.id(); + seqan3::sam_flag flag{0}; + if(static_cast(splitrecord.flag() & seqan3::sam_flag::on_reverse_strand)) { + flag |= seqan3::sam_flag::on_reverse_strand; + } + segment.reference_id() = splitrecord.reference_id(); } diff --git a/src/Stats.cpp b/src/Stats.cpp index da4f74c4..fb48de49 100644 --- a/src/Stats.cpp +++ b/src/Stats.cpp @@ -26,9 +26,14 @@ void Stats::setMultSplitsCount(std::string condition, int increment) { stats[condition].multSplitsCount += increment; } } -void Stats::setNSurvivedCount(std::string condition, int nsurvivedcount) { - { - std::lock_guard lock(statsMutex); - stats[condition].nSurvivedCount = nsurvivedcount; + +void Stats::writeStats(fs::path outdir) { + fs::path statsFile = outdir / "stats.tsv"; + std::ofstream statsStream(statsFile); + statsStream << "Condition\tReads\tAligned\tSplits\tMultSplits\n"; + for (const auto& [condition, stat] : stats) { + statsStream << condition << "\t" << stat.readsCount << "\t"; + statsStream << stat.alignedCount << "\t" << stat.splitsCount << "\t"; + statsStream << stat.multSplitsCount << "\n"; } } \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index fa59d46b..2f35b4eb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { "minimum score of a spliced fragment") ("minfraglen", po::value()->default_value(20), "minimum length of a spliced fragment") - ("minfragmatches", po::value()->default_value(0.7), + ("minfragmatches", po::value()->default_value(0.5), "minimum percentage matches of a spliced fragment") ("minsplicecov", po::value()->default_value(80), "minimum coverage for spliced transcripts")