From 941c41230743339c341ff771bc83b6bfb6afad20 Mon Sep 17 00:00:00 2001 From: Sergey Lisitsyn Date: Mon, 20 May 2024 16:40:44 +0100 Subject: [PATCH] Improve command line tools (#101) * Dispatch methods with maps instead of strcmp * Add "did you mean" for better experience * Support all the methods in go.py and add one more dataset * Check errors and add one more dataset * Add point highlight * Improve parameter descriptions * Simplify argument handling --- examples/go.py | 134 ++++++++++++++++++++++---------- src/cli/main.cpp | 147 +++++++++++++++++++---------------- src/cli/util.hpp | 198 ++++++++++++++++++++++++++++------------------- 3 files changed, 291 insertions(+), 188 deletions(-) diff --git a/examples/go.py b/examples/go.py index 9787c8d..494d211 100755 --- a/examples/go.py +++ b/examples/go.py @@ -4,90 +4,138 @@ import sys import os import subprocess +import re +import tempfile import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D -supported_methods = { - 'lle': 'Locally Linear Embedding', - 'ltsa': 'Local Tangent Space Alignment', - 'isomap': 'Isomap', - 'mds': 'Multidimensional Scaling', - 'pca': 'Principal Component Analysis', - 'kpca': 'Kernel Principal Component Analysis', - 't-sne': 't-distributed Stochastic Neighborhood Embedding', - 'dm': 'Diffusion Map', -} - -def generate_data(type, N=1000): +def generate_data(type, N=1000, random_state=None): + rng = np.random.RandomState(random_state) if type=='swissroll': - tt = np.array((3*np.pi/2)*(1+2*np.random.rand(N))) - height = np.array((np.random.rand(N)-0.5)) + tt = np.array((3*np.pi/2)*(1+2*rng.rand(N))) + height = np.array((rng.rand(N)-0.5)) X = np.array([tt*np.cos(tt), 10*height, tt*np.sin(tt)]) - return X,tt + return X, tt if type=='scurve': - tt = np.array((3*np.pi*(np.random.rand(N)-0.5))) - height = np.array((np.random.rand(N)-0.5)) + tt = np.array((3*np.pi*(rng.rand(N)-0.5))) + height = np.array((rng.rand(N)-0.5)) X = np.array([np.sin(tt), 10*height, np.sign(tt)*(np.cos(tt)-1)]) - return X,tt + return X, tt if type=='helix': tt = np.linspace(1,N,N).T / N tt = tt*2*np.pi X = np.r_[[(2+np.cos(8*tt))*np.cos(tt)], - [(2+np.cos(8*tt))*np.sin(tt)], - [np.sin(8*tt)]] - return X,tt + [(2+np.cos(8*tt))*np.sin(tt)], + [np.sin(8*tt)]] + return X, tt + if type=='twinpeaks': + X = rng.uniform(-1, 1, size=(N, 2)) + tt = np.sin(np.pi * X[:, 0]) * np.tanh(X[:, 1]) + tt += 0.1 * rng.normal(size=tt.shape) + X = np.vstack([X.T, tt]) + return X, tt + if type=='klein': + u = rng.uniform(0, 2 * np.pi, N) + v = rng.uniform(0, 2 * np.pi, N) + x = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.cos(u) + y = (2 + np.cos(u / 2) * np.sin(v) - np.sin(u / 2) * np.sin(2 * v)) * np.sin(u) + z = np.sin(u / 2) * np.sin(v) + np.cos(u / 2) * np.sin(2 * v) + + noise = 0.01 + x += noise * rng.normal(size=x.shape) + y += noise * rng.normal(size=y.shape) + z += noise * rng.normal(size=z.shape) + return np.vstack((x, y, z)), u raise Exception('Dataset is not supported') def embed(data,method): - if method not in supported_methods: - raise Exception('Method is not supported by this script') - - input_file = 'tapkee_input_data' - output_file = 'tapkee_output_data' - np.savetxt(input_file, data.T,delimiter=',') + input_file = tempfile.NamedTemporaryFile(prefix='tapkee_input') + output_file = tempfile.NamedTemporaryFile(prefix='tapkee_output') + np.savetxt(input_file.name, data.T,delimiter=',') tapkee_binary = 'bin/tapkee' - runner_string = '%s -i %s -o %s -m %s -k 20 --precompute --verbose --transpose-output --benchmark' % (tapkee_binary, input_file, output_file, method) - print('-- To reproduce this use the following command', runner_string) - output = subprocess.check_output(runner_string, shell=True) + runner_string = '%s -i %s -o %s -m %s -k 20 --precompute --debug --verbose --transpose-output --benchmark' % ( + tapkee_binary, input_file.name, output_file.name, method + ) + print('-- To reproduce this use the following command `{}`'.format(runner_string)) + process = subprocess.run(runner_string, shell=True, capture_output=True, text=True) + print(process.stderr) + if process.returncode != 0: + raise Exception('Failed to embed') + + if match := re.search(r'Parameter dimension reduction method = \[([a-zA-Z0-9() ]+)\]', process.stderr): + used_method = match.group(1) + else: + used_method = '' + + embedded_data = np.loadtxt(output_file, delimiter=',') - os.remove(input_file) - os.remove(output_file) - return embedded_data + return embedded_data, used_method def plot(data, embedded_data, colors='m', method=None): fig = plt.figure() fig.set_facecolor('white') - ax = fig.add_subplot(121, projection='3d') - ax.scatter(data[0], data[1], data[2], c=colors, cmap=plt.cm.Spectral, s=5) + ax_original = fig.add_subplot(121, projection='3d') + scatter_original = ax_original.scatter(data[0], data[1], data[2], c=colors, cmap=plt.cm.Spectral, s=5, picker=True) plt.axis('tight') plt.axis('off') plt.title('Original', fontsize=9) - ax = fig.add_subplot(122) - ax.scatter(embedded_data[0], embedded_data[1], c=colors, cmap=plt.cm.Spectral, s=5) + ax_embedding = fig.add_subplot(122) + scatter_embedding = ax_embedding.scatter(embedded_data[0], embedded_data[1], c=colors, cmap=plt.cm.Spectral, s=5, picker=True) plt.axis('tight') plt.axis('off') - plt.title('Embedding' + (' with ' + method) if method else '', fontsize=9) + plt.title('Embedding' + (' with ' + method) if method else '', fontsize=9, wrap=True) + + highlighted_points = [] # To store highlighted points + + # Function to highlight points on both plots + def highlight(index): + # Reset previous highlighted points + for point in highlighted_points: + point.remove() + highlighted_points.clear() + + # Highlight the current point on both scatter plots + point1 = ax_original.scatter([data[0][index]], [data[1][index]], [data[2][index]], color='white', s=25, edgecolor='black', zorder=3) + point2 = ax_embedding.scatter([embedded_data[0][index]], [embedded_data[1][index]], color='white', s=25, edgecolor='black', zorder=3) + highlighted_points.append(point1) + highlighted_points.append(point2) + fig.canvas.draw_idle() + + # Event handler for mouse motion + def on_hover(event): + if event.inaxes == ax_original: + cont, ind = scatter_original.contains(event) + elif event.inaxes == ax_embedding: + cont, ind = scatter_embedding.contains(event) + else: + return + + if cont: + index = ind['ind'][0] + highlight(index) + + fig.canvas.mpl_connect('motion_notify_event', on_hover) plt.show() if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='Graphical example of dimension reduction with Tapkee.') - parser.add_argument('dataset', type=str, nargs=1, help='A dataset to embed. One of the following: %s' % str(['swissroll', 'scurve', 'helix'])) - parser.add_argument('method', type=str, nargs=1, help='A method to use. One of the following %s' % str(list(supported_methods.keys()))) + parser.add_argument('dataset', type=str, nargs=1, help='A dataset to embed. One of the following: %s' % str(['swissroll', 'scurve', 'helix', 'twinpeaks'])) + parser.add_argument('method', type=str, nargs=1, help='A method to use. Any of the methods supported by Tapkee') args = parser.parse_args() dataset = args.dataset[0] method = args.method[0] print('-- Loading %s data' % dataset) data, colors = generate_data(dataset) - print('-- Embedding %s data with %s' % (dataset,method)) - embedded_data = embed(data, method) + print('-- Embedding %s data with %s' % (dataset, method)) + embedded_data, used_method = embed(data, method) print('-- Plotting embedded data') - plot(data, embedded_data, colors, supported_methods[method]) + plot(data, embedded_data, colors, used_method) diff --git a/src/cli/main.cpp b/src/cli/main.cpp index 62fc928..d203d25 100644 --- a/src/cli/main.cpp +++ b/src/cli/main.cpp @@ -46,20 +46,54 @@ template auto with_default(T defs) } static const char* INPUT_FILE_KEYWORD = "input-file"; +static const char* INPUT_FILE_DESCRIPTION = "Input filename to be used. Can be any file that can be opened for reading by the program. Expects delimiter-separated matrix of real values. See transposing options for more details on rows and columns."; + static const char* TRANSPOSE_INPUT_KEYWORD = "transpose-input"; +static const char* TRANSPOSE_INPUT_DESCRIPTION = "Whether input file should be considered transposed. By default a line means a row in a matrix (a single vector to be embedded)."; + static const char* TRANSPOSE_OUTPUT_KEYWORD = "transpose-output"; +static const char* TRANSPOSE_OUTPUT_DESCRIPTION = "Whether output file should be transposed. By default a line would be a row of embedding matrix (a single embedding vector)"; + static const char* OUTPUT_FILE_KEYWORD = "output-file"; +static const char* OUTPUT_FILE_DESCRIPTION = "Output filename to be used. Can be any file that can be opened for writing by the program"; + static const char* OUTPUT_PROJECTION_MATRIX_FILE_KEYWORD = "output-projection-matrix-file"; +static const char* OUTPUT_PROJECTION_MATRIX_FILE_DESCRIPTION = "Filename to store the projection matrix calculated by the selected algorithm. Usually supported by linear algorithms such as PCA."; + static const char* OUTPUT_PROJECTION_MEAN_FILE_KEYWORD = "output-projection-mean-file"; +static const char* OUTPUT_PROJECTION_MEAN_FILE_DESCRIPTION = "Filename to store the mean vector calculated by the selected algorithm. Usually supported by linear algorithms such as PCA"; + static const char* DELIMITER_KEYWORD = "delimiter"; +static const char* DELIMITER_DESCRIPTION = "Delimiter to be used in reading and writing matrices"; + static const char* HELP_KEYWORD = "help"; +static const char* HELP_DESCRIPTION = "Print usage of the program"; + static const char* BENCHMARK_KEYWORD = "benchmark"; +static const char* BENCHMARK_DESCRIPTION = "Output benchmarking information about the time of algorithm steps"; + static const char* VERBOSE_KEYWORD = "verbose"; +static const char* VERBOSE_DESCRIPTION = "Be more verbose in logging"; + static const char* DEBUG_KEYWORD = "debug"; +static const char* DEBUG_DESCRIPTION = "Output debugging information such as intermediary steps, parameters, and other internals"; + static const char* METHOD_KEYWORD = "method"; +static const std::string METHOD_DESCRIPTION = "Dimension reduction method. One of the following: " + + comma_separated_keys(DIMENSION_REDUCTION_METHODS.begin(), DIMENSION_REDUCTION_METHODS.end()); + static const char* NEIGHBORS_METHOD_KEYWORD = "neighbors-method"; +static const std::string NEIGHBORS_METHOD_DESCRIPTION = "Neighbors search method. One of the following: " + + comma_separated_keys(NEIGHBORS_METHODS.begin(), NEIGHBORS_METHODS.end()); + static const char* EIGEN_METHOD_KEYWORD = "eigen-method"; +static const std::string EIGEN_METHOD_DESCRIPTION = "Eigendecomposition method. One of the following: " + + comma_separated_keys(EIGEN_METHODS.begin(), EIGEN_METHODS.end()); + static const char* COMPUTATION_STRATEGY_KEYWORD = "computation-strategy"; +static const std::string COMPUTATION_STRATEGY_DESCRIPTION = "Computation strategy. One of the following: " + + comma_separated_keys(COMPUTATION_STRATEGIES.begin(), COMPUTATION_STRATEGIES.end()); + static const char* TARGET_DIMENSION_KEYWORD = "target-dimension"; static const char* NUM_NEIGHBORS_KEYWORD = "num-neighbors"; static const char* GAUSSIAN_WIDTH_KEYWORD = "gaussian-width"; @@ -80,7 +114,7 @@ int run(int argc, const char **argv) { srand(static_cast(time(NULL))); - cxxopts::Options options("tapkee", "Tapkee: a tool for dimension reduction"); + cxxopts::Options options("tapkee", "Tapkee: a tool for dimensionality reduction."); using namespace std::string_literals; @@ -90,73 +124,61 @@ int run(int argc, const char **argv) .add_options() ( either("i", INPUT_FILE_KEYWORD), - "Input file", + INPUT_FILE_DESCRIPTION, with_default("/dev/stdin"s) ) ( TRANSPOSE_INPUT_KEYWORD, - "Transpose input file if set" + TRANSPOSE_INPUT_DESCRIPTION ) ( TRANSPOSE_OUTPUT_KEYWORD, - "Transpose output file if set" + TRANSPOSE_OUTPUT_DESCRIPTION ) ( either("o", OUTPUT_FILE_KEYWORD), - "Output file", + OUTPUT_FILE_DESCRIPTION, with_default("/dev/stdout"s) ) ( either("opmat", OUTPUT_PROJECTION_MATRIX_FILE_KEYWORD), - "Output file for the projection matrix", + OUTPUT_PROJECTION_MATRIX_FILE_DESCRIPTION, with_default("/dev/null"s) ) ( either("opmean", OUTPUT_PROJECTION_MEAN_FILE_KEYWORD), - "Output file for the mean of data", + OUTPUT_PROJECTION_MEAN_FILE_DESCRIPTION, with_default("/dev/null"s) ) ( either("d", DELIMITER_KEYWORD), - "Delimiter", + DELIMITER_DESCRIPTION, with_default(","s) ) ( either("h", HELP_KEYWORD), - "Print usage" + HELP_DESCRIPTION ) ( BENCHMARK_KEYWORD, - "Output benchmark information" + BENCHMARK_DESCRIPTION ) ( VERBOSE_KEYWORD, - "Output more information" + VERBOSE_DESCRIPTION ) ( DEBUG_KEYWORD, - "Output debug information" + DEBUG_DESCRIPTION ) ( either("m", METHOD_KEYWORD), - "Dimension reduction method (default locally_linear_embedding). \n One of the following: \n" - "locally_linear_embedding (lle), neighborhood_preserving_embedding (npe), \n" - "local_tangent_space_alignment (ltsa), linear_local_tangent_space_alignment (lltsa), \n" - "hessian_locally_linear_embedding (hlle), laplacian_eigenmaps (la), locality_preserving_projections (lpp), \n" - "diffusion_map (dm), isomap, landmark_isomap (l-isomap), multidimensional_scaling (mds), \n" - "landmark_multidimensional_scaling (l-mds), stochastic_proximity_embedding (spe), \n" - "kernel_pca (kpca), pca, random_projection (ra), factor_analysis (fa), \n" - "t-stochastic_neighborhood_embedding (t-sne), manifold_sculpting (ms).", + METHOD_DESCRIPTION, with_default("locally_linear_embedding"s) ) ( either("nm", NEIGHBORS_METHOD_KEYWORD), - "Neighbors search method (default is 'covertree' if available, 'vptree' otherwise). One of the following: " - "brute,vptree" -#ifdef TAPKEE_USE_LGPL_COVERTREE - ",covertree" -#endif - ".", + NEIGHBORS_METHOD_DESCRIPTION, #ifdef TAPKEE_USE_LGPL_COVERTREE with_default("covertree"s) #else @@ -165,11 +187,7 @@ int run(int argc, const char **argv) ) ( either("em", EIGEN_METHOD_KEYWORD), - "Eigendecomposition method (default is 'arpack' if available, 'dense' otherwise). One of the following: " -#ifdef TAPKEE_WITH_ARPACK - "arpack, " -#endif - "randomized, dense.", + EIGEN_METHOD_DESCRIPTION, #ifdef TAPKEE_WITH_ARPACK with_default("arpack"s) #else @@ -178,11 +196,7 @@ int run(int argc, const char **argv) ) ( either("cs", COMPUTATION_STRATEGY_KEYWORD), - "Computation strategy (default is 'cpu'). One of the following: " -#ifdef TAPKEE_WITH_VIENNACL - "opencl, " -#endif - "cpu.", + COMPUTATION_STRATEGY_DESCRIPTION, with_default("cpu"s) ) ( @@ -298,9 +312,9 @@ int run(int argc, const char **argv) string method = opt[METHOD_KEYWORD].as(); try { - tapkee_method = parse_reduction_method(method.c_str()); + tapkee_method = parse_multiple(DIMENSION_REDUCTION_METHODS, method); } - catch (const std::exception &) + catch (const std::exception & ex) { tapkee::Logging::instance().message_error(string("Unknown method ") + method); return 1; @@ -312,7 +326,7 @@ int run(int argc, const char **argv) string method = opt[NEIGHBORS_METHOD_KEYWORD].as(); try { - tapkee_neighbors_method = parse_neighbors_method(method.c_str()); + tapkee_neighbors_method = parse_multiple(NEIGHBORS_METHODS, method); } catch (const std::exception &) { @@ -325,7 +339,7 @@ int run(int argc, const char **argv) string method = opt[EIGEN_METHOD_KEYWORD].as(); try { - tapkee_eigen_method = parse_eigen_method(method.c_str()); + tapkee_eigen_method = parse_multiple(EIGEN_METHODS, method); } catch (const std::exception &) { @@ -338,7 +352,7 @@ int run(int argc, const char **argv) string method = opt[COMPUTATION_STRATEGY_KEYWORD].as(); try { - tapkee_computation_strategy = parse_computation_strategy(method.c_str()); + tapkee_computation_strategy = parse_multiple(COMPUTATION_STRATEGIES, method); } catch (const std::exception &) { @@ -375,17 +389,6 @@ int run(int argc, const char **argv) tapkee::Logging::instance().message_error("Number of timesteps is negative."); return 1; } - double eigenshift = opt[EIGENSHIFT_KEYWORD].as(); - double landmark_rt = opt[LANDMARK_RATIO_KEYWORD].as(); - bool spe_global = opt.count(SPE_LOCAL_KEYWORD); - double spe_tol = opt[SPE_TOLERANCE_KEYWORD].as(); - int spe_num_upd = opt[SPE_NUM_UPDATES_KEYWORD].as(); - int max_iters = opt[MAX_ITERS_KEYWORD].as(); - double fa_eps = opt[FA_EPSILON_KEYWORD].as(); - double perplexity = opt[SNE_PERPLEXITY_KEYWORD].as(); - double theta = opt[SNE_THETA_KEYWORD].as(); - double squishing = opt[MS_SQUISHING_RATE_KEYWORD].as(); - // Load data string input_filename = opt[INPUT_FILE_KEYWORD].as(); string output_filename = opt[OUTPUT_FILE_KEYWORD].as(); @@ -412,23 +415,33 @@ int run(int argc, const char **argv) input_data.transposeInPlace(); } - std::stringstream ss; - ss << "Data contains " << input_data.cols() << " feature vectors with dimension of " << input_data.rows(); - tapkee::Logging::instance().message_info(ss.str()); + tapkee::Logging::instance().message_info(fmt::format("Data contains {} feature vectors with dimension of {}", input_data.cols(), input_data.rows())); tapkee::TapkeeOutput output; tapkee::ParametersSet parameters = - tapkee::kwargs[(tapkee::method = tapkee_method, tapkee::computation_strategy = tapkee_computation_strategy, - tapkee::eigen_method = tapkee_eigen_method, tapkee::neighbors_method = tapkee_neighbors_method, - tapkee::num_neighbors = k, tapkee::target_dimension = target_dim, - tapkee::diffusion_map_timesteps = timesteps, tapkee::gaussian_kernel_width = width, - tapkee::max_iteration = max_iters, tapkee::spe_global_strategy = spe_global, - tapkee::spe_num_updates = spe_num_upd, tapkee::spe_tolerance = spe_tol, - tapkee::landmark_ratio = landmark_rt, tapkee::nullspace_shift = eigenshift, - tapkee::check_connectivity = true, tapkee::fa_epsilon = fa_eps, - tapkee::sne_perplexity = perplexity, tapkee::sne_theta = theta, - tapkee::squishing_rate = squishing)]; + tapkee::kwargs[( + tapkee::method = tapkee_method, + tapkee::computation_strategy = tapkee_computation_strategy, + tapkee::eigen_method = tapkee_eigen_method, + tapkee::neighbors_method = tapkee_neighbors_method, + tapkee::num_neighbors = k, + tapkee::target_dimension = target_dim, + tapkee::diffusion_map_timesteps = timesteps, + tapkee::gaussian_kernel_width = width, + tapkee::max_iteration = opt[MAX_ITERS_KEYWORD].as(), + tapkee::spe_global_strategy = opt.count(SPE_LOCAL_KEYWORD), + tapkee::spe_num_updates = opt[SPE_NUM_UPDATES_KEYWORD].as(), + tapkee::spe_tolerance = opt[SPE_TOLERANCE_KEYWORD].as(), + tapkee::landmark_ratio = opt[LANDMARK_RATIO_KEYWORD].as(), + tapkee::nullspace_shift = opt[EIGENSHIFT_KEYWORD].as(), + tapkee::check_connectivity = true, + tapkee::fa_epsilon = opt[FA_EPSILON_KEYWORD].as(), + tapkee::sne_perplexity = opt[SNE_PERPLEXITY_KEYWORD].as(), + tapkee::sne_theta = opt[SNE_THETA_KEYWORD].as(), + tapkee::squishing_rate = opt[MS_SQUISHING_RATE_KEYWORD].as() + )]; + if (opt.count(PRECOMPUTE_KEYWORD)) { @@ -443,13 +456,13 @@ int run(int argc, const char **argv) { tapkee::tapkee_internal::timed_context context("[+] Distance matrix computation"); distance_matrix = matrix_from_callback(static_cast(input_data.cols()), - tapkee::eigen_distance_callback(input_data)); + tapkee::eigen_distance_callback(input_data)); } if (tapkee_method.needs_kernel) { tapkee::tapkee_internal::timed_context context("[+] Kernel matrix computation"); kernel_matrix = matrix_from_callback(static_cast(input_data.cols()), - tapkee::eigen_kernel_callback(input_data)); + tapkee::eigen_kernel_callback(input_data)); } } tapkee::precomputed_distance_callback dcb(distance_matrix); diff --git a/src/cli/util.hpp b/src/cli/util.hpp index 5a5531f..69b5c52 100644 --- a/src/cli/util.hpp +++ b/src/cli/util.hpp @@ -10,6 +10,7 @@ #include #include +#include using namespace std; @@ -22,6 +23,52 @@ inline bool is_wrong_char(char c) return false; } +int levenshtein_distance(const std::string& s1, const std::string& s2) +{ + const auto len1 = s1.size(); + const auto len2 = s2.size(); + + std::vector> d(len1 + 1, std::vector(len2 + 1)); + + d[0][0] = 0; + for (unsigned int i = 1; i <= len1; ++i) + { + d[i][0] = i; + } + for (unsigned int j = 1; j <= len2; ++j) + { + d[0][j] = j; + } + + for (unsigned int i = 1; i <= len1; ++i) + { + for (unsigned int j = 1; j <= len2; ++j) + { + d[i][j] = std::min({ + d[i - 1][j] + 1, + d[i][j - 1] + 1, + d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) + }); + } + } + + return d[len1][len2]; +} + +template +std::string comma_separated_keys(Iterator begin, Iterator end) { + std::ostringstream oss; + for (Iterator it = begin; it != end; ++it) + { + oss << it->first; + if (std::next(it) != end) + { + oss << ", "; + } + } + return oss.str(); +} + tapkee::DenseMatrix read_data(ifstream& ifs, char delimiter) { string str; @@ -95,92 +142,87 @@ void write_vector(tapkee::DenseVector* matrix, ofstream& of) } } -tapkee::DimensionReductionMethod parse_reduction_method(const char* str) -{ - if (!strcmp(str, "local_tangent_space_alignment") || !strcmp(str, "ltsa")) - return tapkee::KernelLocalTangentSpaceAlignment; - if (!strcmp(str, "locally_linear_embedding") || !strcmp(str, "lle")) - return tapkee::KernelLocallyLinearEmbedding; - if (!strcmp(str, "hessian_locally_linear_embedding") || !strcmp(str, "hlle")) - return tapkee::HessianLocallyLinearEmbedding; - if (!strcmp(str, "multidimensional_scaling") || !strcmp(str, "mds")) - return tapkee::MultidimensionalScaling; - if (!strcmp(str, "landmark_multidimensional_scaling") || !strcmp(str, "l-mds")) - return tapkee::LandmarkMultidimensionalScaling; - if (!strcmp(str, "isomap")) - return tapkee::Isomap; - if (!strcmp(str, "landmark_isomap") || !strcmp(str, "l-isomap")) - return tapkee::LandmarkIsomap; - if (!strcmp(str, "diffusion_map") || !strcmp(str, "dm")) - return tapkee::DiffusionMap; - if (!strcmp(str, "kernel_pca") || !strcmp(str, "kpca")) - return tapkee::KernelPrincipalComponentAnalysis; - if (!strcmp(str, "pca")) - return tapkee::PrincipalComponentAnalysis; - if (!strcmp(str, "random_projection") || !strcmp(str, "ra")) - return tapkee::RandomProjection; - if (!strcmp(str, "laplacian_eigenmaps") || !strcmp(str, "la")) - return tapkee::LaplacianEigenmaps; - if (!strcmp(str, "locality_preserving_projections") || !strcmp(str, "lpp")) - return tapkee::LocalityPreservingProjections; - if (!strcmp(str, "neighborhood_preserving_embedding") || !strcmp(str, "npe")) - return tapkee::NeighborhoodPreservingEmbedding; - if (!strcmp(str, "linear_local_tangent_space_alignment") || !strcmp(str, "lltsa")) - return tapkee::LinearLocalTangentSpaceAlignment; - if (!strcmp(str, "stochastic_proximity_embedding") || !strcmp(str, "spe")) - return tapkee::StochasticProximityEmbedding; - if (!strcmp(str, "passthru")) - return tapkee::PassThru; - if (!strcmp(str, "factor_analysis") || !strcmp(str, "fa")) - return tapkee::FactorAnalysis; - if (!strcmp(str, "t-stochastic_neighbor_embedding") || !strcmp(str, "t-sne")) - return tapkee::tDistributedStochasticNeighborEmbedding; - if (!strcmp(str, "manifold_sculpting") || !strcmp(str, "ms")) - return tapkee::ManifoldSculpting; - - throw std::exception(); - return tapkee::PassThru; -} - -tapkee::NeighborsMethod parse_neighbors_method(const char* str) -{ - if (!strcmp(str, "brute")) - return tapkee::Brute; - if (!strcmp(str, "vptree")) - return tapkee::VpTree; +static const std::map DIMENSION_REDUCTION_METHODS = { + {"local_tangent_space_alignment", tapkee::KernelLocalTangentSpaceAlignment}, + {"ltsa", tapkee::KernelLocalTangentSpaceAlignment}, + {"locally_linear_embedding", tapkee::KernelLocallyLinearEmbedding}, + {"lle", tapkee::KernelLocallyLinearEmbedding}, + {"hessian_locally_linear_embedding", tapkee::HessianLocallyLinearEmbedding}, + {"hlle", tapkee::HessianLocallyLinearEmbedding}, + {"multidimensional_scaling", tapkee::MultidimensionalScaling}, + {"mds", tapkee::MultidimensionalScaling}, + {"landmark_multidimensional_scaling", tapkee::LandmarkMultidimensionalScaling}, + {"l-mds", tapkee::LandmarkMultidimensionalScaling}, + {"isomap", tapkee::Isomap}, + {"landmark_isomap", tapkee::LandmarkIsomap}, + {"l-isomap", tapkee::LandmarkIsomap}, + {"diffusion_map", tapkee::DiffusionMap}, + {"dm", tapkee::DiffusionMap}, + {"kernel_pca", tapkee::KernelPrincipalComponentAnalysis}, + {"kpca", tapkee::KernelPrincipalComponentAnalysis}, + {"pca", tapkee::PrincipalComponentAnalysis}, + {"random_projection", tapkee::RandomProjection}, + {"ra", tapkee::RandomProjection}, + {"laplacian_eigenmaps", tapkee::LaplacianEigenmaps}, + {"la", tapkee::LaplacianEigenmaps}, + {"locality_preserving_projections", tapkee::LocalityPreservingProjections}, + {"lpp", tapkee::LocalityPreservingProjections}, + {"neighborhood_preserving_embedding", tapkee::NeighborhoodPreservingEmbedding}, + {"npe", tapkee::NeighborhoodPreservingEmbedding}, + {"linear_local_tangent_space_alignment", tapkee::LinearLocalTangentSpaceAlignment}, + {"lltsa", tapkee::LinearLocalTangentSpaceAlignment}, + {"stochastic_proximity_embedding", tapkee::StochasticProximityEmbedding}, + {"spe", tapkee::StochasticProximityEmbedding}, + {"passthru", tapkee::PassThru}, + {"factor_analysis", tapkee::FactorAnalysis}, + {"fa", tapkee::FactorAnalysis}, + {"t-stochastic_proximity_embedding", tapkee::tDistributedStochasticNeighborEmbedding}, + {"t-sne", tapkee::tDistributedStochasticNeighborEmbedding}, + {"manifold_sculpting", tapkee::ManifoldSculpting}, +}; + +static const std::map NEIGHBORS_METHODS = { + {"brute", tapkee::Brute}, + {"vptree", tapkee::VpTree}, #ifdef TAPKEE_USE_LGPL_COVERTREE - if (!strcmp(str, "covertree")) - return tapkee::CoverTree; + {"covertree", tapkee::CoverTree}, #endif +}; - throw std::exception(); - return tapkee::Brute; -} - -tapkee::EigenMethod parse_eigen_method(const char* str) -{ +static const std::map EIGEN_METHODS = { + {"dense", tapkee::Dense}, + {"randomized", tapkee::Randomized}, #ifdef TAPKEE_WITH_ARPACK - if (!strcmp(str, "arpack")) - return tapkee::Arpack; + {"arpack", tapkee::Arpack}, #endif - if (!strcmp(str, "randomized")) - return tapkee::Randomized; - if (!strcmp(str, "dense")) - return tapkee::Dense; - - throw std::exception(); - return tapkee::Dense; -} +}; -tapkee::ComputationStrategy parse_computation_strategy(const char* str) -{ - if (!strcmp(str, "cpu")) - return tapkee::HomogeneousCPUStrategy; +static const std::map COMPUTATION_STRATEGIES = { + {"cpu", tapkee::HomogeneousCPUStrategy}, #ifdef TAPKEE_WITH_VIENNACL - if (!strcmp(str, "opencl")) - return tapkee::HeterogeneousOpenCLStrategy; + {"opencl", tapkee::HeterogeneousOpenCLStrategy}, #endif - return tapkee::HomogeneousCPUStrategy; +}; + +template +typename Mapping::mapped_type parse_multiple(Mapping mapping, const std::string& str) +{ + auto it = mapping.find(str); + if (it != mapping.end()) + { + return it->second; + } + + auto closest = std::min_element(mapping.begin(), mapping.end(), + [&str] (const auto &a, const auto &b) { + return levenshtein_distance(str, a.first) < levenshtein_distance(str, b.first); + }); + if (closest != mapping.end()) + { + tapkee::Logging::instance().message_info(fmt::format("Unknown parameter value `{}`. Did you mean `{}`?", str, closest->first)); + } + + throw std::logic_error(str); } template