Skip to content

Commit 2d13c34

Browse files
committed
Vendored the specification validation library, added bindings to R.
This also prompted some fixes to the prepareDatabaseFiles examples with respect to the uniqueness of simulated gene indices for each set.
1 parent 7c3c6f4 commit 2d13c34

23 files changed

+1112
-6
lines changed

Diff for: .Rbuildignore

+2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
^\.github$
22
^\.gitignore$
3+
^vendor.sh$
4+
^_spec$

Diff for: .gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
*.swp
22
*.html
3+
_spec
4+
*.o
5+
*.so

Diff for: DESCRIPTION

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: gesel
2-
Version: 0.1.1
3-
Date: 2024-11-06
2+
Version: 0.1.2
3+
Date: 2024-11-15
44
Title: Search for Interesting Gene Sets
55
License: MIT + file LICENSE
66
Description:
@@ -13,12 +13,16 @@ Imports:
1313
utils,
1414
methods,
1515
rappdirs,
16-
httr2
16+
httr2,
17+
Rcpp
1718
Suggests:
1819
BiocStyle,
1920
knitr,
2021
testthat,
2122
rmarkdown
23+
LinkingTo:
24+
assorthead,
25+
Rcpp
2226
VignetteBuilder: knitr
2327
Encoding: UTF-8
2428
RoxygenNote: 7.3.2

Diff for: NAMESPACE

+4
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,13 @@ export(newConfig)
2525
export(prepareDatabaseFiles)
2626
export(searchGenes)
2727
export(searchSetText)
28+
export(validateDatabaseFiles)
29+
export(validateGeneFiles)
2830
import(httr2)
2931
import(methods)
32+
importFrom(Rcpp,sourceCpp)
3033
importFrom(rappdirs,user_cache_dir)
3134
importFrom(utils,URLencode)
3235
importFrom(utils,head)
3336
importFrom(utils,write.table)
37+
useDynLib(gesel)

Diff for: R/RcppExports.R

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
2+
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3+
4+
validate_database_files <- function(db_prefix, num_genes) {
5+
.Call('_gesel_validate_database_files', PACKAGE = 'gesel', db_prefix, num_genes)
6+
}
7+
8+
validate_gene_files <- function(gene_prefix, types) {
9+
.Call('_gesel_validate_gene_files', PACKAGE = 'gesel', gene_prefix, types)
10+
}
11+

Diff for: R/prepareDatabaseFiles.R

+3-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#' @param set.info Data frame of information about each gene set, where each row corresponds to a set.
1010
#' This data frame should contain the same columns as that returned by \code{\link{fetchAllSets}}.
1111
#' @param set.membership List of integer vectors, where each vector corresponds to a gene set and contains the indices of its constituent genes.
12-
#' All gene indices should be positive and no greater than \code{num.genes}.
12+
#' All gene indices should be positive, no greater than \code{num.genes}, and unique within each set.
1313
#' @param num.genes Integer scalar specifying the total number of genes available for this species.
1414
#'
1515
#' @return Several files are produced at \code{path} with the \code{<species>_} prefix.
@@ -48,6 +48,7 @@
4848
#' seq_len(nrow(set.info))
4949
#' )
5050
#' )
51+
#' set.membership <- lapply(set.membership, unique)
5152
#' set.info$size <- lengths(set.membership)
5253
#'
5354
#' # Now making the database files.
@@ -122,7 +123,7 @@ save_integer_list <- function(x, prefix, include.names = FALSE) {
122123
for (i in seq_along(x)) {
123124
z <- x[[i]]
124125
if (length(z)) {
125-
z <- sort(unique(z)) # convert to diffs to reduce integer size
126+
z <- sort(z) # convert to diffs to reduce integer size
126127
z <- c(z[1] - 1L, diff(z)) # get to 0-based indexing with delta encoding.
127128
lines[i] <- paste(z, collapse="\t")
128129
}

Diff for: R/validateDatabaseFiles.R

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#' Validate Gesel database files
2+
#'
3+
#' Validate Gesel database and gene mapping files against the specification at \url{https://github.com/gesel-inc/gesel-spec}.
4+
#'
5+
#' @param species String specifying the species in the form of its NCBI taxonomy ID.
6+
#' @param path String containing the path to a directory containing the database files or gene mapping files, for \code{validateDatabaseFiles} and \code{validateGeneFiles} respectively.
7+
#' @param num.genes Integer scalar specifying the total number of genes available for this species.
8+
#' @param types Character vector specifying the types of gene names to validate, e.g., \code{"symbol"}, \code{"entrez"}, or \code{"ensembl"}.
9+
#' If \code{NULL}, all detected files for \code{species} in \code{path} are checked.
10+
#'
11+
#' @return \code{validateDatabaseFiles} returns \code{NULL} invisibly.
12+
#'
13+
#' \code{validateGeneFiles} returns the number of genes, to be used as \code{num.genes} in \code{validateDatabaseFiles}.
14+
#'
15+
#' In both functions, invalid formatting will cause an error to be raised.
16+
#'
17+
#' @author Aaron Lun
18+
#'
19+
#' @examples
20+
#' example(prepareDatabaseFiles, echo=FALSE)
21+
#' validateDatabaseFiles(output, "9606", num.genes)
22+
#'
23+
#' @export
24+
#' @importFrom Rcpp sourceCpp
25+
#' @useDynLib gesel
26+
validateDatabaseFiles <- function(path, species, num.genes) {
27+
validate_database_files(file.path(path, paste0(species, "_")), num.genes)
28+
invisible(NULL)
29+
}
30+
31+
#' @export
32+
#' @rdname validateDatabaseFiles
33+
validateGeneFiles <- function(path, species, types=NULL) {
34+
validate_gene_files(file.path(path, paste0(species, "_")), types)
35+
}

Diff for: man/prepareDatabaseFiles.Rd

+2-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: man/validateDatabaseFiles.Rd

+39
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: src/RcppExports.cpp

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
2+
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3+
4+
#include <Rcpp.h>
5+
6+
using namespace Rcpp;
7+
8+
#ifdef RCPP_USE_GLOBAL_ROSTREAM
9+
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10+
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11+
#endif
12+
13+
// validate_database_files
14+
SEXP validate_database_files(std::string db_prefix, int num_genes);
15+
RcppExport SEXP _gesel_validate_database_files(SEXP db_prefixSEXP, SEXP num_genesSEXP) {
16+
BEGIN_RCPP
17+
Rcpp::RObject rcpp_result_gen;
18+
Rcpp::traits::input_parameter< std::string >::type db_prefix(db_prefixSEXP);
19+
Rcpp::traits::input_parameter< int >::type num_genes(num_genesSEXP);
20+
rcpp_result_gen = Rcpp::wrap(validate_database_files(db_prefix, num_genes));
21+
return rcpp_result_gen;
22+
END_RCPP
23+
}
24+
// validate_gene_files
25+
int validate_gene_files(std::string gene_prefix, Rcpp::Nullable<Rcpp::CharacterVector> types);
26+
RcppExport SEXP _gesel_validate_gene_files(SEXP gene_prefixSEXP, SEXP typesSEXP) {
27+
BEGIN_RCPP
28+
Rcpp::RObject rcpp_result_gen;
29+
Rcpp::traits::input_parameter< std::string >::type gene_prefix(gene_prefixSEXP);
30+
Rcpp::traits::input_parameter< Rcpp::Nullable<Rcpp::CharacterVector> >::type types(typesSEXP);
31+
rcpp_result_gen = Rcpp::wrap(validate_gene_files(gene_prefix, types));
32+
return rcpp_result_gen;
33+
END_RCPP
34+
}
35+
36+
static const R_CallMethodDef CallEntries[] = {
37+
{"_gesel_validate_database_files", (DL_FUNC) &_gesel_validate_database_files, 2},
38+
{"_gesel_validate_gene_files", (DL_FUNC) &_gesel_validate_gene_files, 2},
39+
{NULL, NULL, 0}
40+
};
41+
42+
RcppExport void R_init_gesel(DllInfo *dll) {
43+
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
44+
R_useDynamicSymbols(dll, FALSE);
45+
}

Diff for: src/gesel/check_collection_details.hpp

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#ifndef GESEL_CHECK_COLLECTION_DETAILS_HPP
2+
#define GESEL_CHECK_COLLECTION_DETAILS_HPP
3+
4+
#include <string>
5+
#include <cstdint>
6+
#include <vector>
7+
8+
#include "byteme/byteme.hpp"
9+
10+
#include "parse_field.hpp"
11+
12+
namespace gesel {
13+
14+
namespace internal {
15+
16+
inline void check_collection_details(const std::string& path, const std::vector<uint64_t>& ranges, const std::vector<uint64_t>& numbers) {
17+
byteme::RawFileReader raw_r(path);
18+
auto gzpath = path + ".gz";
19+
byteme::GzipFileReader gzip_r(gzpath);
20+
21+
byteme::PerByte raw_p(&raw_r);
22+
byteme::PerByte gzip_p(&gzip_r);
23+
24+
bool raw_valid = raw_p.valid();
25+
bool gzip_valid = gzip_p.valid();
26+
uint64_t line = 0;
27+
const uint64_t num_ranges = ranges.size();
28+
29+
while (raw_valid) {
30+
auto raw_pos = raw_p.position();
31+
auto title = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
32+
auto description = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
33+
auto species = parse_integer_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
34+
auto maintainer = parse_string_field<FieldType::MIDDLE>(raw_p, raw_valid, path, line);
35+
auto source = parse_string_field<FieldType::LAST>(raw_p, raw_valid, path, line);
36+
37+
if (line >= num_ranges) {
38+
throw std::runtime_error("number of lines in '" + path + "' exceeds that expected from its '*.ranges.gz' file " + append_line_number(line));
39+
}
40+
if (raw_p.position() - raw_pos - 1 != static_cast<size_t>(ranges[line])) {
41+
throw std::runtime_error("number of bytes per line in '" + path + "' is not the same as that expected from the '*.ranges.gz' file " + append_line_number(line));
42+
}
43+
44+
if (!gzip_valid) {
45+
throw std::runtime_error("early termination of the Gzipped version of '" + path + "'");
46+
}
47+
48+
auto gz_title = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
49+
if (gz_title != title) {
50+
throw std::runtime_error("different title in '" + path + "' compared to its Gzipped version " + append_line_number(line));
51+
}
52+
53+
auto gz_description = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
54+
if (gz_description != description) {
55+
throw std::runtime_error("different description in '" + path + "' compared to its Gzipped version " + append_line_number(line));
56+
}
57+
58+
auto gz_species = parse_integer_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
59+
if (gz_species != species) {
60+
throw std::runtime_error("different species in '" + path + "' compared to its Gzipped version " + append_line_number(line));
61+
}
62+
63+
auto gz_maintainer = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
64+
if (gz_maintainer != maintainer) {
65+
throw std::runtime_error("different maintainer in '" + path + "' compared to its Gzipped version " + append_line_number(line));
66+
}
67+
68+
auto gz_source = parse_string_field<FieldType::MIDDLE>(gzip_p, gzip_valid, path, line);
69+
if (gz_source != source) {
70+
throw std::runtime_error("different source in '" + path + "' compared to its Gzipped version " + append_line_number(line));
71+
}
72+
73+
auto gz_number = parse_integer_field<FieldType::LAST>(gzip_p, gzip_valid, path, line);
74+
if (gz_number != numbers[line]) {
75+
throw std::runtime_error("different number in '" + path + ".gz' compared to its '*.ranges.gz' file " + append_line_number(line));
76+
}
77+
78+
++line;
79+
}
80+
81+
if (line != num_ranges) {
82+
throw std::runtime_error("number of lines in '" + path + "' is less than that expected from its '*.ranges.gz' file " + append_line_number(line));
83+
}
84+
}
85+
86+
}
87+
88+
}
89+
90+
#endif

Diff for: src/gesel/check_genes.hpp

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#ifndef GESEL_CHECK_GENES_HPP
2+
#define GESEL_CHECK_GENES_HPP
3+
4+
#include <limits>
5+
#include <cstdint>
6+
#include <vector>
7+
#include <string>
8+
#include <stdexcept>
9+
#include <unordered_set>
10+
11+
#include "byteme/byteme.hpp"
12+
13+
#include "parse_field.hpp"
14+
#include "utils.hpp"
15+
16+
namespace gesel {
17+
18+
namespace internal {
19+
20+
inline uint64_t check_genes(const std::string& path) {
21+
byteme::GzipFileReader reader(path);
22+
byteme::PerByte pb(&reader);
23+
std::vector<uint64_t> output;
24+
25+
bool valid = pb.valid();
26+
uint64_t line = 0;
27+
constexpr uint64_t max_line = std::numeric_limits<uint64_t>::max();
28+
std::unordered_set<std::string> current_names;
29+
30+
while (valid) {
31+
if (pb.get() == '\n') {
32+
valid = pb.advance();
33+
} else {
34+
current_names.clear();
35+
do {
36+
auto parsed = parse_string_field<FieldType::UNKNOWN>(pb, valid, path, line);
37+
if (parsed.first == "") {
38+
throw std::runtime_error("empty name detected in '" + path + "' " + append_line_number(line));
39+
}
40+
if (current_names.find(parsed.first) != current_names.end()) {
41+
throw std::runtime_error("duplicated names detected in '" + path + "' " + append_line_number(line));
42+
}
43+
if (parsed.second) {
44+
break;
45+
}
46+
current_names.insert(parsed.first);
47+
} while (true);
48+
}
49+
50+
if (line == max_line) {
51+
throw std::runtime_error("number of lines should fit in a 32-bit integer");
52+
}
53+
++line;
54+
}
55+
56+
return line;
57+
}
58+
59+
}
60+
61+
}
62+
63+
#endif

0 commit comments

Comments
 (0)