From 16b4af917bb1e7002cb9d99d61c4e3f37e1ee49d Mon Sep 17 00:00:00 2001 From: LTLA Date: Wed, 12 Jun 2024 11:26:29 -0700 Subject: [PATCH] Cleaned up docstrings, fleshed out the README. --- CMakeLists.txt | 2 +- README.md | 36 ++++++++++++++++++----- include/knncolle_annoy/knncolle_annoy.hpp | 13 +++++--- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35b050f..55c0afe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.14) project(knncolle_annoy VERSION 0.1.0 - DESCRIPTION "Annoy bindings for the knncolle library" + DESCRIPTION "knncolle bindings for Annoy" LANGUAGES CXX) include(GNUInstallDirs) diff --git a/README.md b/README.md index a78557b..b9d1a31 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ For most applications involving large datasets, this is an acceptable trade-off. ## Quick start -The various `Annoy*` classes work directly the code chunks in the [**knncolle**](https://github.com/knncolle/knncolle) documentation. +Instances of the various `knncolle_annoy::Annoy*` classes can be used anywhere that accepts the corresponding **knncolle** interface. For example: ```cpp @@ -22,15 +22,28 @@ For example: // Wrap our data in a light SimpleMatrix. knncolle::SimpleMatrix mat(ndim, nobs, matrix.data()); -// Build a VP-tree index. -knncolle_annoy::AnnoyBuilder<> an_builder; -auto an_index = an_builder.build(mat); +// Build an Annoy index. +knncolle_annoy::AnnoyBuilder an_builder; +auto an_index = an_builder.build_unique(mat); -// Find 10 nearest neighbors of every element. -auto results = knncolle::find_nearest_neighbors(*anp_index, 10); +// Find 10 (approximate) nearest neighbors of every element. 
+auto results = knncolle::find_nearest_neighbors(*an_index, 10); ``` -We can also customize the construction of the `AnnoyBuilder`: +We could alternate between exact and approximate searches at run-time: + +```cpp +std::unique_ptr<knncolle::Prebuilt<int, int, double> > ptr; +if (use_exact) { + knncolle::KmknnBuilder<> kbuilder; + ptr = kbuilder.build_unique(mat); +} else { + knncolle_annoy::AnnoyBuilder<> abuilder; + ptr = abuilder.build_unique(mat); +} +``` + +We can also customize the construction of the `AnnoyBuilder` by passing in options: ```cpp knncolle_annoy::AnnoyOptions an_opts; @@ -92,3 +105,12 @@ See [`extern/CMakeLists.txt`](extern/CMakeLists.txt) to find compatible versions If you're not using CMake, the simple approach is to just copy the files in `include/` - either directly or with Git submodules - and include their path during compilation with, e.g., GCC's `-I`. This requires the external dependencies listed in [`extern/CMakeLists.txt`](extern/CMakeLists.txt), which also need to be made available during compilation. + +## Note on vectorization + +Annoy will attempt to perform manual vectorization based on SSE and/or AVX instructions. +This may result in differences in the results across machines due to changes in numeric precision across architectures with varying support for SSE/AVX intrinsics. +For the most part, such differences can be avoided by consistently compiling for the "near-lowest common denominator" (such as the typical `x86-64` default for GCC and Clang) +and ensuring that the more specific instruction subsets like SSE3 and AVX are not enabled (which are typically off by default anyway). +Nonetheless, if reproducibility across architectures is important, it can be achieved at the cost of some speed by defining the `NO_MANUAL_VECTORIZATION` macro, +which will instruct Annoy to disable its vectorized optimizations. 
diff --git a/include/knncolle_annoy/knncolle_annoy.hpp b/include/knncolle_annoy/knncolle_annoy.hpp index 6032e34..fd00748 100644 --- a/include/knncolle_annoy/knncolle_annoy.hpp +++ b/include/knncolle_annoy/knncolle_annoy.hpp @@ -42,7 +42,7 @@ class AnnoyPrebuilt; * * Instances of this class are usually constructed using `AnnoyPrebuilt::initialize()`. * - * @tparam Distance_ An **Annoy**-derived class to compute the distance between vectors. + * @tparam Distance_ An **Annoy** class to compute the distance between vectors, e.g., `Annoy::Euclidean`. * @tparam Dim_ Integer type for the number of dimensions. * @tparam Index_ Integer type for the indices. * @tparam Float_ Floating point type for the query data and output distances. @@ -214,7 +214,7 @@ class AnnoySearcher : public knncolle::Searcher { * Instances of this class are usually constructed using `AnnoyBuilder::build_raw()`. * The `initialize()` method will create an instance of the `AnnoySearcher` class. * - * @tparam Distance_ An **Annoy**-derived class to compute the distance between vectors. + * @tparam Distance_ An **Annoy** class to compute the distance between vectors, e.g., `Annoy::Euclidean`. * @tparam Dim_ Integer type for the number of dimensions. * For the output of `AnnoyBuilder::build_raw()`, this is set to `Matrix_::dimension_type`. * @tparam Index_ Integer type for the indices. @@ -297,7 +297,7 @@ class AnnoyPrebuilt : public knncolle::Prebuilt { * Annoy. * https://github.com/spotify/annoy * - * @tparam Distance_ An **Annoy**-derived class to compute the distance between vectors. + * @tparam Distance_ An **Annoy** class to compute the distance between vectors, e.g., `Annoy::Euclidean`, `Annoy::Manhattan`. * Note that this is not the same as `knncolle::MockDistance`. * @tparam Matrix_ Matrix-like object satisfying the `knncolle::MockMatrix` interface. * @tparam Float_ Floating point type for the query data and output distances. 
@@ -305,7 +305,12 @@ class AnnoyPrebuilt : public knncolle::Prebuilt { * @tparam InternalData_ Floating point type for the internal data in Annoy. * This defaults to a `float` instead of a `double` to sacrifice some accuracy for performance. */ -template<class Distance_ = Annoy::Euclidean, class Matrix_ = knncolle::SimpleMatrix<int, int, double>, typename Float_ = double, typename InternalIndex_ = typename Matrix_::index_type, typename InternalData_ = float> +template< + class Distance_ = Annoy::Euclidean, + class Matrix_ = knncolle::SimpleMatrix<int, int, double>, + typename Float_ = double, + typename InternalIndex_ = typename Matrix_::index_type, + typename InternalData_ = float> class AnnoyBuilder : public knncolle::Builder<Matrix_, Float_> { private: AnnoyOptions my_options;