// include/model/CAnomalyDetectorModelConfig.h

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#ifndef INCLUDED_ml_model_CAnomalyDetectorModelConfig_h
#define INCLUDED_ml_model_CAnomalyDetectorModelConfig_h

#include <core/CoreTypes.h>

#include <model/CSearchKey.h>
#include <model/FunctionTypes.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>

#include <boost/property_tree/ptree_fwd.hpp>
#include <boost/unordered_map.hpp>

#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

namespace ml {
namespace model {
// Forward declarations: these types are only used by pointer, reference or
// as template arguments in this header.
class CDetectionRule;
class CInterimBucketCorrector;
class CSearchKey;
class CModelFactory;

//! \brief Responsible for configuring anomaly detection models.
//!
//! DESCRIPTION:\n
//! Responsible for configuring classes for performing anomaly detection.
//! It also defines all parameter defaults.
//!
//! IMPLEMENTATION DECISIONS:\n
//! This wraps up the configuration of anomaly detection to encapsulate
//! the details from calling code. It is anticipated that:
//!   -# Some of this information will be exposed to the user via a
//!      configuration file,
//!   -# Some may be calculated from data characteristics and so on.
class MODEL_EXPORT CAnomalyDetectorModelConfig {
public:
    //! The possible factory types.
    enum EFactoryType {
        E_EventRateFactory = 0,
        E_MetricFactory = 1,
        E_EventRatePopulationFactory = 2,
        E_MetricPopulationFactory = 3,
        E_CountingFactory = 4,
        E_UnknownFactory,
        E_BadFactory
    };

    using TStrSet = std::set<std::string>;
    using TSizeVec = std::vector<std::size_t>;
    using TTimeVec = std::vector<core_t::TTime>;
    using TTimeVecCItr = TTimeVec::const_iterator;
    using TDoubleDoublePr = std::pair<double, double>;
    using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
    using TFeatureVec = model_t::TFeatureVec;
    using TStrVec = std::vector<std::string>;
    using TStrVecCItr = TStrVec::const_iterator;
    using TInterimBucketCorrectorPtr = std::shared_ptr<CInterimBucketCorrector>;
    using TModelFactoryPtr = std::shared_ptr<CModelFactory>;
    using TModelFactoryCPtr = std::shared_ptr<const CModelFactory>;
    using TFactoryTypeFactoryPtrMap = std::map<EFactoryType, TModelFactoryPtr>;
    using TFactoryTypeFactoryPtrMapItr = TFactoryTypeFactoryPtrMap::iterator;
    using TFactoryTypeFactoryPtrMapCItr = TFactoryTypeFactoryPtrMap::const_iterator;
    using TSearchKeyFactoryCPtrMap = std::map<CSearchKey, TModelFactoryCPtr>;

    // Detection rule collections, keyed by an integer, together with
    // non-owning const reference wrappers to them.
    using TDetectionRuleVec = std::vector<CDetectionRule>;
    using TDetectionRuleVecCRef = std::reference_wrapper<const TDetectionRuleVec>;
    using TIntDetectionRuleVecUMap = boost::unordered_map<int, TDetectionRuleVec>;
    using TIntDetectionRuleVecUMapCRef = std::reference_wrapper<const TIntDetectionRuleVecUMap>;
    using TIntDetectionRuleVecUMapCItr = TIntDetectionRuleVecUMap::const_iterator;

    // Scheduled events: (string, detection rule) pairs and a non-owning
    // const reference wrapper to a vector of them.
    using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
    using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
    using TStrDetectionRulePrVecCRef = std::reference_wrapper<const TStrDetectionRulePrVec>;

public:
    //! \name Data Gathering
    //@{
    //! The default value used to separate components of a multivariate feature
    //! in its string value.
    static const std::string DEFAULT_MULTIVARIATE_COMPONENT_DELIMITER;

    //! Bucket length if none is specified on the command line.
    static const core_t::TTime DEFAULT_BUCKET_LENGTH;

    //! Default maximum number of buckets for receiving out of order records.
    static const std::size_t DEFAULT_LATENCY_BUCKETS;

    //! Bucket length corresponding to the default decay and learn rates.
    static const core_t::TTime STANDARD_BUCKET_LENGTH;
    //@}

    //! \name Modelling
    //@{
    //! The default rate at which the model priors decay to non-informative
    //! per standard bucket length.
    static const double DEFAULT_DECAY_RATE;

    //! The initial rate, as a multiple of the default decay rate, at which
    //! the model priors decay to non-informative per standard bucket length.
    static const double DEFAULT_INITIAL_DECAY_RATE_MULTIPLIER;

    //! The rate at which information accrues in the model per standard
    //! bucket length elapsed.
    static const double DEFAULT_LEARN_RATE;

    //! The default minimum permitted fraction of points in a distribution
    //! mode for individual modeling.
    static const double DEFAULT_INDIVIDUAL_MINIMUM_MODE_FRACTION;

    //! The default minimum permitted fraction of points in a distribution
    //! mode for population modeling.
    static const double DEFAULT_POPULATION_MINIMUM_MODE_FRACTION;

    //! The default minimum count we'll permit in a cluster.
    static const double DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT;

    //! The default proportion of initial count at which we'll delete a
    //! category from the sketch to cluster.
    static const double DEFAULT_CATEGORY_DELETE_FRACTION;

    //! The default size of the seasonal components we will model.
    static const std::size_t DEFAULT_COMPONENT_SIZE;

    //! The default minimum time to detect a change point in a time series.
    static const core_t::TTime DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE;

    //! The default maximum time to test for a change point in a time series.
    static const core_t::TTime DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE;

    //! The default number of time buckets used to generate multibucket features
    //! for anomaly detection.
    static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;

    //! The maximum value that the multi_bucket_impact can take.
    static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;

    //! The maximum number of times we'll update a model in a bucketing
    //! interval. This only applies to our metric statistics, which are
    //! computed on a fixed number of measurements rather than a fixed
    //! time interval. A value of zero implies no constraint.
    static const double DEFAULT_MAXIMUM_UPDATES_PER_BUCKET;

    //! The default minimum value for the influence for which an influencing
    //! field value is judged to have any influence on a feature value.
    static const double DEFAULT_INFLUENCE_CUTOFF;

    //! The default scale factor of the decayRate that determines the minimum
    //! size of the sliding prune window for purging older entries from the
    //! model.
    static const double DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM;

    //! The default scale factor of the decayRate that determines the maximum
    //! size of the sliding prune window for purging older entries from the
    //! model.
    static const double DEFAULT_PRUNE_WINDOW_SCALE_MAXIMUM;

    //! The default factor increase in priors used to model correlations.
    static const double DEFAULT_CORRELATION_MODELS_OVERHEAD;

    //! The default threshold for the Pearson correlation coefficient at
    //! which a correlate will be modeled.
    static const double DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! The default values for the aggregation styles' parameters, indexed
    //! first by aggregation style and then by aggregation parameter.
    static const double DEFAULT_AGGREGATION_STYLE_PARAMS[model_t::NUMBER_AGGREGATION_STYLES][model_t::NUMBER_AGGREGATION_PARAMS];

    //! The default maximum probability which is deemed to be anomalous.
    static const double DEFAULT_MAXIMUM_ANOMALOUS_PROBABILITY;
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! The default historic anomaly score percentile for which lower
    //! values are classified as noise.
    static const double DEFAULT_NOISE_PERCENTILE;

    //! The default multiplier applied to the noise level score in
    //! order to be classified as anomalous.
    static const double DEFAULT_NOISE_MULTIPLIER;

    //! We use a piecewise linear mapping between the raw anomaly score
    //! and the normalized anomaly score with these default knot points.
    //! In particular, if we define the percentile of a raw score \f$s\f$
    //! as \f$f_q(s)\f$ and \f$a = \max\{x \le f_q(s)\}\f$ and
    //! \f$b = \min\{x \ge f_q(s)\}\f$ where \f$x\f$ ranges over the knot point
    //! X- values then the normalized score would be:\n
    //! <pre class="fragment">
    //!   \f$\displaystyle \bar{s} = \frac{(y(b) - y(a))(f_q(s) - a)}{b - a}\f$
    //! </pre>
    //! Here, \f$y(.)\f$ denote the corresponding knot point Y- values.
    // NOTE(review): as written the formula evaluates to zero at f_q(s) = a,
    // whereas a piecewise linear curve through the knot points would include
    // a y(a) offset. Confirm against the implementation before relying on it.
    static const TDoubleDoublePr DEFAULT_NORMALIZED_SCORE_KNOT_POINTS[9];
    //@}

public:
    //! Create the default configuration.
    //!
    //! \param[in] bucketLength The bucketing interval length.
    //! \param[in] summaryMode Indicates whether the data being gathered
    //! are already summarized by an external aggregation process.
    //! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
    //! then this is the name of the field holding the summary count.
    //! \param[in] latency The amount of time records are buffered for, to
    //! allow out-of-order records to be seen by the models in order.
    //! \param[in] multivariateByFields Should multivariate analysis of
    //! correlated 'by' fields be performed?
    static CAnomalyDetectorModelConfig defaultConfig(core_t::TTime bucketLength,
                                                     model_t::ESummaryMode summaryMode,
                                                     const std::string& summaryCountFieldName,
                                                     core_t::TTime latency,
                                                     bool multivariateByFields);

    //! Overload using defaults: the latency is DEFAULT_LATENCY_BUCKETS bucket
    //! lengths and multivariate analysis of 'by' fields is disabled.
    //!
    //! \param[in] bucketLength The bucketing interval length.
    //! \param[in] summaryMode Indicates whether the data being gathered
    //! are already summarized by an external aggregation process.
    //! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
    //! then this is the name of the field holding the summary count.
    static CAnomalyDetectorModelConfig
    defaultConfig(core_t::TTime bucketLength = DEFAULT_BUCKET_LENGTH,
                  model_t::ESummaryMode summaryMode = model_t::E_None,
                  const std::string& summaryCountFieldName = "") {
        // DEFAULT_LATENCY_BUCKETS is a std::size_t and bucketLength is a
        // core_t::TTime, so the latency product below relies on implicit
        // integer conversions.
        return defaultConfig(bucketLength, summaryMode, summaryCountFieldName,
                             DEFAULT_LATENCY_BUCKETS * bucketLength, false);
    }

    //! Get the factor to normalize all bucket lengths to the default
    //! bucket length.
    //!
    //! \param[in] bucketLength The bucket length to normalize.
    static double bucketNormalizationFactor(core_t::TTime bucketLength);

    //! Get the decay rate to use for the time series decomposition given
    //! the model decay rate \p modelDecayRate and the bucket length
    //! \p bucketLength.
    static double trendDecayRate(double modelDecayRate, core_t::TTime bucketLength);

public:
    CAnomalyDetectorModelConfig();

    //! Set the data bucketing interval.
    void bucketLength(core_t::TTime length);
    //! Set the single interim bucket correction calculator.
    void interimBucketCorrector(const TInterimBucketCorrectorPtr& interimBucketCorrector);
    //! Set whether to model multibucket features.
    void useMultibucketFeatures(bool enabled);
    //! Set whether multivariate analysis of correlated 'by' fields should
    //! be performed.
    void multivariateByFields(bool enabled);
    //! Set the model factories.
    void factories(const TFactoryTypeFactoryPtrMap& factories);
    //! Set the style and parameter value for raw score aggregation.
    bool aggregationStyleParams(model_t::EAggregationStyle style,
                                model_t::EAggregationParam param,
                                double value);
    //! Set the maximum anomalous probability.
    void maximumAnomalousProbability(double probability);
    //! Set the noise level as a percentile of historic raw anomaly scores.
    bool noisePercentile(double percentile);
    //! Set the noise multiplier to use when derating normalized scores
    //! based on the noise score level.
    bool noiseMultiplier(double multiplier);
    //! Set the normalized score knot points for the piecewise linear curve
    //! between historic raw score percentiles and normalized scores.
    bool normalizedScoreKnotPoints(const TDoubleDoublePrVec& points);

    //! Populate the parameters from a configuration file.
    bool init(const std::string& configFile);

    //! Populate the parameters from a configuration file, also retrieving
    //! the raw property tree created from the config file.  (The raw
    //! property tree is only valid if the method returns true.)
    bool init(const std::string& configFile, boost::property_tree::ptree& propTree);

    //! Populate the parameters from a property tree.
    bool init(const boost::property_tree::ptree& propTree);

    //! Get the factory for new models.
    //!
    //! \param[in] key The key of the detector for which the factory will be
    //! used.
    TModelFactoryCPtr factory(const CSearchKey& key) const;

    //! Get the factory for new models.
    //!
    //! \param[in] identifier The identifier of the search for which to get a model
    //! factory.
    //! \param[in] function The function being invoked.
    //! \param[in] useNull If true then we will process missing fields as if their
    //! value is equal to the empty string where possible.
    //! \param[in] excludeFrequent Whether to discard frequent results.
    //! \param[in] partitionFieldName The name of the partition field.
    //! \param[in] personFieldName The name of the over field.
    //! \param[in] attributeFieldName The name of the by field.
    //! \param[in] valueFieldName The name of the field containing metric values.
    //! \param[in] influenceFieldNames The list of influence field names.
    TModelFactoryCPtr
    factory(int identifier,
            function_t::EFunction function,
            bool useNull = false,
            model_t::EExcludeFrequent excludeFrequent = model_t::E_XF_None,
            const std::string& partitionFieldName = std::string(),
            const std::string& personFieldName = std::string(),
            const std::string& attributeFieldName = std::string(),
            const std::string& valueFieldName = std::string(),
            const CSearchKey::TStrVec& influenceFieldNames = CSearchKey::TStrVec()) const;

    //! Set the rate at which the models lose information.
    void decayRate(double value);
    //! Get the rate at which the models lose information.
    double decayRate() const;

    //! Get the length of the baseline.
    core_t::TTime baselineLength() const;

    //! Get the bucket length.
    core_t::TTime bucketLength() const;

    //! Get the period of time at which to perform a potential prune of the models
    //! expressed in number of seconds.
    core_t::TTime modelPruneWindow() const;

    //! Set the period of time at which to perform a potential prune of the models
    //! expressed in number of seconds.
    void modelPruneWindow(core_t::TTime modelPruneWindow);

    //! Get the maximum latency in the arrival of out of order data.
    core_t::TTime latency() const;

    //! Get the maximum latency in the arrival of out of order data in
    //! numbers of buckets.
    std::size_t latencyBuckets() const;

    //! Get the single interim bucket correction calculator.
    const CInterimBucketCorrector& interimBucketCorrector() const;

    //! Should multivariate analysis of correlated 'by' fields be performed?
    bool multivariateByFields() const;

    //! \name Model Plot
    //@{
    //! Configure modelPlotConfig params from file.
    bool configureModelPlot(const std::string& modelPlotConfigFile);

    //! Configure modelPlotConfig params from a property tree
    //! expected to contain three properties: 'boundsPercentile', 'annotationsEnabled'
    //! and 'terms'.
    bool configureModelPlot(const boost::property_tree::ptree& propTree);

    //! Configure modelPlotConfig params directly, from the three properties
    //! 'modelPlotEnabled', 'annotationsEnabled' and 'terms'.
    //! This initialisation method does not allow setting the value of the
    //! 'boundsPercentile' property, instead a default value is used when 'modelPlotEnabled'
    //! is true and a value of -1.0 is used otherwise.
    void configureModelPlot(bool modelPlotEnabled,
                            bool annotationsEnabled,
                            const std::string& terms);

    //! Set the central confidence interval for the model debug plot
    //! to \p percentile.
    //!
    //! This controls upper and lower confidence interval error bars
    //! returned by the model debug plot.
    //! \note \p percentile should be in the range [0.0, 100.0).
    void modelPlotBoundsPercentile(double percentile);

    //! Get the central confidence interval for the model debug plot.
    double modelPlotBoundsPercentile() const;

    //! Is model plot enabled?
    bool modelPlotEnabled() const;

    //! Are annotations enabled for each of the models?
    bool modelPlotAnnotationsEnabled() const;

    //! Set terms (by, over, or partition field values) to filter
    //! model debug data. When empty, no filtering is applied.
    void modelPlotTerms(TStrSet terms);

    //! Get the terms (by, over, or partition field values)
    //! used to filter model debug data. Empty when no filtering applies.
    const TStrSet& modelPlotTerms() const;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! Get the value of the aggregation style parameter identified by
    //! \p style and \p param.
    double aggregationStyleParam(model_t::EAggregationStyle style,
                                 model_t::EAggregationParam param) const;

    //! Get the maximum anomalous probability.
    double maximumAnomalousProbability() const;
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! Get the historic anomaly score percentile for which lower
    //! values are classified as noise.
    double noisePercentile() const;

    //! Get the multiplier applied to the noise level score in order
    //! to be classified as anomalous.
    double noiseMultiplier() const;

    //! Get the normalized anomaly score knot points.
    const TDoubleDoublePrVec& normalizedScoreKnotPoints() const;
    //@}

    //! Sets the reference to the detection rules map.
    //!
    //! \note The argument is a reference wrapper, not a copy of the map, so
    //! the referenced map needs to stay valid while it is used through this
    //! object.
    void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules);

    //! Sets the reference to the scheduled events vector.
    //!
    //! \note The argument is a reference wrapper, not a copy of the vector,
    //! so the referenced vector needs to stay valid while it is used through
    //! this object.
    void scheduledEvents(TStrDetectionRulePrVecCRef scheduledEvents);

    //! Process the properties of a single configuration stanza.
    //!
    //! \param[in] propertyTree The properties of the stanza to process.
    bool processStanza(const boost::property_tree::ptree& propertyTree);

    //! Get the factor to normalize all bucket lengths to the default
    //! bucket length.
    double bucketNormalizationFactor() const;

private:
    //! Bucket length.
    core_t::TTime m_BucketLength{0};

    //! Prune window length (in seconds)
    core_t::TTime m_ModelPruneWindow{0};

    //! Should multivariate analysis of correlated 'by' fields be performed?
    bool m_MultivariateByFields{false};

    //! The single interim bucket correction calculator.
    TInterimBucketCorrectorPtr m_InterimBucketCorrector;

    //! The new model factories for each data type.
    TFactoryTypeFactoryPtrMap m_Factories;

    //! A cache of customized factories requested from this config.
    //! Declared mutable so that it can be updated from const member functions.
    mutable TSearchKeyFactoryCPtrMap m_FactoryCache;

    //! \name Model Plot
    //@{
    //! Is model plot enabled?
    bool m_ModelPlotEnabled{false};

    //! Are annotations enabled for each of the models?
    bool m_ModelPlotAnnotationsEnabled{false};

    //! The central confidence interval for the model debug plot.
    // NOTE(review): unlike the members above this has no in-class initializer;
    // presumably the constructor sets it - confirm in the implementation file.
    double m_ModelPlotBoundsPercentile;

    //! Terms (by, over, or partition field values) used to filter model
    //! debug data. Empty when no filtering applies.
    TStrSet m_ModelPlotTerms;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! The values for the aggregation styles' parameters, indexed first by
    //! aggregation style and then by aggregation parameter.
    double m_AggregationStyleParams[model_t::NUMBER_AGGREGATION_STYLES][model_t::NUMBER_AGGREGATION_PARAMS];

    //! The maximum probability which is deemed to be anomalous.
    double m_MaximumAnomalousProbability;
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! The historic anomaly score percentile for which lower values
    //! are classified as noise.
    double m_NoisePercentile;

    //! The multiplier applied to the noise level score in order to
    //! be classified as anomalous.
    double m_NoiseMultiplier;

    //! We use a piecewise linear mapping between the raw anomaly score
    //! and the normalized anomaly score with these knot points.
    //! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details.
    TDoubleDoublePrVec m_NormalizedScoreKnotPoints;
    //@}

    //! A reference to the map containing detection rules per
    //! detector key. Note that the owner of the map is CAnomalyJobConfig::CAnalysisConfig.
    TIntDetectionRuleVecUMapCRef m_DetectionRules;

    //! A reference to the vector of scheduled events.
    //! The owner of the vector is CAnomalyJobConfig::CAnalysisConfig.
    TStrDetectionRulePrVecCRef m_ScheduledEvents;
};
} // namespace model
} // namespace ml

#endif // INCLUDED_ml_model_CAnomalyDetectorModelConfig_h