bring repo back on track to have all changes included

Ibvt · Jun 3, 2024 · 60a160c · 60a160c
1 parent df896d4
commit 60a160c
Show file tree

Hide file tree

Showing 19 changed files with 809 additions and 161 deletions.
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -1,8 +1,8 @@
 name: docker-release
 on:
   push:
-    branches:
-      - "master"
+    tags:
+      - '*'
   pull_request:
     branches:
       - "master"
@@ -41,5 +41,5 @@ jobs:
           context: "{{defaultContext}}"
           platforms: linux/amd64, linux/arm64
           push: true
-          tags: cobirna/rnanue:${{ steps.extract_version.outputs.VERSION }}
+          tags: cobirna/rnanue:${{ steps.extract_version.outputs.VERSION }}, cobirna/rnanue:latest
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,17 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+# [0.2.1]
+
+## Features
+
+- code cleanup 
+- fix in segmentation of the preprocessing
+
 # [0.2.0]
 
+## Features
+
+- update to C++20 and SeqAn 3.3.0
+- native support for concurrency 
+
 # [0.1.1]
 
 ## Fix
 
-
 # [0.1.0]
 
-# [0.0.1]
-
-## [0.0.0] 
-
+## Features
+- Initial implementation of RNAnue
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.25)
+cmake_minimum_required(VERSION 3.22.1)
 project(RNAnue VERSION 0.2.0)
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
@@ -9,7 +9,7 @@ include(CMakePrintHelpers)
 # configure header file to pass the version number to the source code
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/Config.hpp.in ${CMAKE_CURRENT_SOURCE_DIR}/include/Config.hpp)
 
-###### Seqan #####
+###### SeqAn #####
 list (APPEND CMAKE_PREFIX_PATH "${CMAKE_CURRENT_SOURCE_DIR}/seqan3/build_system")
 find_package (seqan3 3.0 REQUIRED)
 find_package(OpenMP)
@@ -22,11 +22,19 @@ if(Boost_FOUND)
     include_directories(${Boost_INCLUDE_DIRS})
 endif()
 
+###### HTSlib ######
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(HTSLIB REQUIRED IMPORTED_TARGET htslib)
+include_directories(${HTSLIB_INCLUDE_DIRS})
+
 file(GLOB SOURCES "src/*.cpp")
 add_executable(RNAnue ${SOURCES})
 target_include_directories(RNAnue PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
 target_link_libraries(RNAnue seqan3::seqan3)
 target_link_libraries(RNAnue Boost::program_options Boost::filesystem)
+target_link_libraries(RNAnue PkgConfig::HTSLIB)
+#target_link_libraries(RNAnue -L/usr/local/include -lRNA -flto)
+
 cmake_print_properties(TARGETS RNAnue PROPERTIES TARGET_INCLUDE_DIRECTORIES)
 
 ###### Tests ######

diff --git a/Dockerfile b/Dockerfile
@@ -15,7 +15,7 @@ LABEL authors="Richard A. Schaefer"
 RUN apt-get -y update && apt-get -y upgrade
 RUN apt-get install -y curl build-essential cmake git pkg-config
 RUN apt-get install -y libbz2-dev zlib1g-dev libncurses5-dev liblzma-dev
-RUN apt-get install -y libboost-all-dev
+RUN apt-get install -y libboost-all-dev gdb
 
 # install htslib
 WORKDIR /
@@ -35,6 +35,8 @@ RUN export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
 RUN make all
 RUN cp segemehl.x /usr/local/bin
 
+# install ViennaRNA
+
 # retrieve RNAnue
 WORKDIR /
 RUN git clone https://github.com/Ibvt/RNAnue.git

diff --git a/README.md b/README.md
@@ -9,17 +9,15 @@ RNAnue is a comprehensive analysis to detect RNA-RNA interactions from Direct-Du
 ### Dependencies
 RNAnue has the following dependencies, where the brackets indicate the version RNAnue has 
 been built and tested on. Make sure the requirements are satisfied by your system. CMake is able
-to detect the Boost libraries system-wide. However seqan is expected to be located in the current 
-folder of RNAnue as specified in the CMakeLists.txt. Segemehl and the Vienna binaries need to be
+to detect the Boost libraries system-wide. However, SeqAn3 is expected to be located in the current 
+folder of RNAnue as specified in the CMakeLists.txt. Segemehl and the Vienna binaries need to be
 located in $PATH.
 
 * [Boost C++ Libraries](https://www.boost.org/) (v1.7.2)
 * [SeqAn](https://github.com/seqan/seqan3) (v3.3.0)
 * [Segemehl](http://www.bioinf.uni-leipzig.de/Software/segemehl/) (v0.3.4)
 * [Vienna Package](https://www.tbi.univie.ac.at/RNA/#binary_packages) (v2.4.17)
 
-
-
 ### CMake
 RNAnue is built using CMake. First, clone the repository and change into the source directory.
 ```
@@ -29,13 +27,6 @@ cd RNAnue
 In the next step, retrieve the SeqAn library and place it in the root folder of RNAnue
 ```
 
-```
-
-
-
-
-
-
 CMake is a cross-platform Makefile generator. For that, we provide the [CMakeLists](./source/CMakeLists.txt) 
 to simplify the build process. In particular, it utilizes the instructions given in the CMakeLists.
 It is recommended to create an "out-of-source build". For that, create a build folder (e.g., ./bin)
@@ -81,7 +72,6 @@ RNAnue <subcall> --config /path/to/params.cfg
 Here, subcall corresponds to positional arguments. In any case, specifying parameters on the command line takes 
 precedence over the config file.
 
-
 ## Results
 
 In principle, the results of the analysis are stored in the specified output folder and its subfolders
@@ -90,7 +80,7 @@ and the RNA-RNA interactions. RNAnue reports the split reads in SAM format. Addi
 scores and hybridization energies are stored in the tags FC and FE, respectively. We report the clusters in a
 custom format that includes the IDs of the clusters, its length, size and genomic coordinates.
 
-### Split Reads (.SAM)
+### Split Reads (.BAM)
 
 RNAnue reports the detected splits in .SAM format (RNAnue `detect`). In this file, pairs of rows represent the
 split reads, consisting of the individual segments, e.g
@@ -114,9 +104,143 @@ duplex.
 
 ### Clustering results
 
-
+The `clustering` procedure reports a single clusters.tab file which is a tab-delimited file of the clustering results. 
+Here, each line represents a cluster that corresponds to overlapping split reads, defined by the two segments. The 
+columns are defined in the following:
+
+| Field | Description |
+| ----- | ----------- |
+| clustID | Unique identifier of the cluster |
+| fst_seg_chr | Chromosome (accession) of the first segment |
+| fst_seg_strd | Strand where the first segment is located |
+| fst_seg_strt | Start position of the first segment in the cluster |
+| fst_seg_end | End position of the first segment in the cluster |
+| sec_seg_chr | Chromosome (accession) of the second segment |
+| sec_seg_strd | Strand where the second segment is located |
+| sec_seg_strt | Start position of the second segment in the cluster |
+| sec_seg_end | End position of the second segment in the cluster |
+| no_splits | Number of split reads in the cluster |
+| fst_seg_len | Length of the first segment |
+| sec_seg_len | Length of the second segment |
 
 ### Interaction table
+
+The `analysis` procedure generates `_interactions` files for each library in 
+which each line represents an annotated split read that is mapped to a 
+transcript interaction. The fields are defined as follows:
+
+| Field | Description |
+| ----- | ----------- |
+| qname | read/template identifier |
+| fst_seg_strd | Strand where the first segment is located |
+| fst_seg_strt | Start position of the first segment |
+| fst_seg_end | End position of the first segment |
+| fst_seg_ref | Reference name of the first segment corresponding to the seqid in GFF and/or clusterID |
+| fst_seg_name | Name of the first segment that corresponds to gene name/symbol and/or clusterID |
+| first_seg_bt | Biotype of the annotation transcript (if available) |
+| fst_seg_anno_strd | Strand information of the transcript in the overlapping annotation |
+| fst_seg_prod | Description of the transcript (if available in annotation) |
+| fst_seg_ori | Orientation of the segment with respect to annotation (sense/antisense) |
+| sec_seg_strd | Strand where the second segment is located |
+| sec_seg_strt | Start position of the second segment |
+| sec_seg_end | End position of the second segment |
+| sec_seg_ref | Reference name of the second segment corresponding to the seqid in GFF and/or clusterID |
+| sec_seg_name | Name of the second segment that corresponds to gene name/symbol and/or clusterID |
+| sec_seg_bt | Biotype of the annotation transcript (if available) |
+| sec_seg_anno_strd | Strand information of the transcript in the overlapping annotation |
+| sec_seg_prod | Description of the transcript (if available in annotation) |
+| sec_seg_ori | Orientation of the segment with respect to annotation (sense/antisense) |
+| cmpl | Complementarity score of the interaction |
+| fst_seg_compl_aln | Alignment results in the complementarity calculation of the first segment |
+| sec_seg_cmpl_aln | Alignment results in the complementarity calculation of the second segment |
+| mfe | Hybridisation energy of the interaction |
+| mfe_struc | Minimum free energy (MFE) structure of interaction in dot-bracket notation |
+
+The main results of an RNAnue analysis are the transcript interactions. 
+They are stored in the file `allints.txt` in the same directory. 
+Its entries are structured as described in the following, where 
+columns with the prefix `<sample>` are given for each sample specified in 
+the analysis (within the same file).
+
+| Field | Description |
+| ----- | ----------- |
+| fs_rna | Gene/Transcript name of the first interaction partner |
+| sec_rna | Gene/Transcript name of the second interaction partner |
+| fst_rna_ori | Orientation of the first interaction partner with respect to annotation (sense/antisense) |
+| sec_rna_ori | Orientation of the second interaction partner with respect to annotation (sense/antisense) |
+| <sample>_supp_reads | Number of (split)reads that support the interaction |
+| <sample>_ges | Global energy score (ges) of the interaction |
+| <sample>_ghs | Global hybridisation score (ghs) of the interaction |
+| <sample>_pval | Statistical significance (p-value) of the interaction |
+| <sample>_padj | Benjamini-Hochberg adjusted p-value among the samples |
+
+If the option `--outcnt` is set to 1, RNAnue generates the count table `counts.txt` in the output directory. 
+It contains the counts of each interaction for each sample and can be used for differential expression 
+analysis. Similarly, `--outjgf` set to 1 generates a `graph.json` file that contains the detected interactions 
+in JSON graph format. Finally, `--stats` set to 1 creates a `stats.txt` file that contains basic statistics for 
+each step of the analysis. 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 
 ### Docker

diff --git a/build/params.cfg b/build/params.cfg
@@ -31,7 +31,10 @@ accuracy = 90 # min percentage of matches per read in semi-global alignment
 minfragsco = 15 # min score of a spliced fragment 
 minfraglen = 15 # min length of a spliced fragment
 minsplicecov = 80 # min coverage for spliced transcripts
-exclclipping = 0 # exclude soft clipping from 
+mapq = 20 # min mapping quality for spliced transcripts
+exclclipping = 0 # exclude soft clipping from
+unprd = 0 # only for paired-end reads: include unpaired reads in the subsequent analysis
+unmrg = 0 # only for paired-end reads: include unmerged reads in the subsequent analysis
 
 ### SPLIT READ CALLING 
 sitelenratio = 0.0

diff --git a/include/Analysis.hpp b/include/Analysis.hpp
@@ -4,9 +4,14 @@
 // Standard
 #include <iostream>
 
+// Boost
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
 class Analysis {
     public:
-        Analysis();
+        Analysis(po::variables_map params);
         ~Analysis();
 
         void start();

diff --git a/include/FilterScores.hpp b/include/FilterScores.hpp
@@ -0,0 +1,42 @@
+#ifndef RNANUE_FILTERSCORES_HPP
+#define RNANUE_FILTERSCORES_HPP
+
+#include "DataTypes.hpp"
+
+class Complementarity {
+    public:
+        Complementarity();
+        ~Complementarity();
+
+        void compute(dtp::DNAVector seq1, dtp::DNAVector seq2);
+
+    private:
+        double score;
+        int alignmentLength;
+        double siteLengthRatio;
+        int matches;
+};
+
+class Hybridization {
+    public:
+        Hybridization();
+        ~Hybridization();
+
+        void compute(dtp::DNAVector seq1, dtp::DNAVector seq2);
+
+    private:
+        double score;
+        int alignmentLength;
+        double siteLengthRatio;
+        int matches;
+};
+
+
+class FilterScores {
+    public:
+        FilterScores();
+        ~FilterScores();
+
+};
+
+#endif //RNANUE_FILTERSCORES_HPP
diff --git a/include/IBPTree.hpp b/include/IBPTree.hpp
@@ -34,6 +34,10 @@ class IBPTree {
         void insert(IntervalData& data);
         void insertIter(Node* node, IntervalData& data);
         void splitNode(Node* node, int index);
+        std::vector<IntervalData*> search(std::string chrom, dtp::Interval interval);
+        void searchIter(Node* node, const dtp::Interval& interval, std::vector<IntervalData*> results);
+        bool isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2);
+
 
         std::map<std::string, std::string> getAttributes(std::string& attributes);
         std::string getTag(std::map<std::string, std::string> attributes, const std::vector<std::string>& keys);

diff --git a/include/Node.hpp b/include/Node.hpp
@@ -39,6 +39,7 @@ class IntervalData {
         // operations
         void addJunction(std::string junction);
         bool isSubset(int start, int end);
+        bool isOverlapping(dtp::Interval intvl1, dtp::Interval intvl2);
         void printNode();
 
     private: