-
Notifications
You must be signed in to change notification settings - Fork 64
/
Copy pathCAnomalyDetectorModelConfig.h
505 lines (412 loc) · 20.9 KB
/
CAnomalyDetectorModelConfig.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#ifndef INCLUDED_ml_model_CAnomalyDetectorModelConfig_h
#define INCLUDED_ml_model_CAnomalyDetectorModelConfig_h
#include <core/CoreTypes.h>
#include <model/CSearchKey.h>
#include <model/FunctionTypes.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>
#include <boost/property_tree/ptree_fwd.hpp>
#include <boost/unordered_map.hpp>
#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
namespace ml {
namespace model {
// Forward declarations of the model types this header refers to, listed
// alphabetically. CSearchKey is also fully defined by <model/CSearchKey.h>,
// which this header includes.
class CDetectionRule;
class CInterimBucketCorrector;
class CModelFactory;
class CSearchKey;
//! \brief Responsible for configuring anomaly detection models.
//!
//! DESCRIPTION:\n
//! Responsible for configuring classes for performing anomaly detection.
//! It also defines all parameter defaults.
//!
//! IMPLEMENTATION DECISIONS:\n
//! This wraps up the configuration of anomaly detection to encapsulate
//! the details from calling code. It is anticipated that:
//!   -# Some of this information will be exposed to the user via a
//!      configuration file,
//!   -# Some may be calculated from data characteristics and so on.
class MODEL_EXPORT CAnomalyDetectorModelConfig {
public:
    //! The possible factory types.
    enum EFactoryType {
        E_EventRateFactory = 0,
        E_MetricFactory = 1,
        E_EventRatePopulationFactory = 2,
        E_MetricPopulationFactory = 3,
        E_CountingFactory = 4,
        E_UnknownFactory,
        E_BadFactory
    };

    using TStrSet = std::set<std::string>;
    using TSizeVec = std::vector<std::size_t>;
    using TTimeVec = std::vector<core_t::TTime>;
    using TTimeVecCItr = TTimeVec::const_iterator;
    using TDoubleDoublePr = std::pair<double, double>;
    using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
    using TFeatureVec = model_t::TFeatureVec;
    using TStrVec = std::vector<std::string>;
    using TStrVecCItr = TStrVec::const_iterator;
    using TInterimBucketCorrectorPtr = std::shared_ptr<CInterimBucketCorrector>;
    using TModelFactoryPtr = std::shared_ptr<CModelFactory>;
    using TModelFactoryCPtr = std::shared_ptr<const CModelFactory>;
    using TFactoryTypeFactoryPtrMap = std::map<EFactoryType, TModelFactoryPtr>;
    using TFactoryTypeFactoryPtrMapItr = TFactoryTypeFactoryPtrMap::iterator;
    using TFactoryTypeFactoryPtrMapCItr = TFactoryTypeFactoryPtrMap::const_iterator;
    using TSearchKeyFactoryCPtrMap = std::map<CSearchKey, TModelFactoryCPtr>;

    // Const ref to detection rules map
    using TDetectionRuleVec = std::vector<CDetectionRule>;
    using TDetectionRuleVecCRef = std::reference_wrapper<const TDetectionRuleVec>;
    using TIntDetectionRuleVecUMap = boost::unordered_map<int, TDetectionRuleVec>;
    using TIntDetectionRuleVecUMapCRef = std::reference_wrapper<const TIntDetectionRuleVecUMap>;
    using TIntDetectionRuleVecUMapCItr = TIntDetectionRuleVecUMap::const_iterator;

    // Const ref to the scheduled events vector
    using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
    using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
    using TStrDetectionRulePrVecCRef = std::reference_wrapper<const TStrDetectionRulePrVec>;

public:
    //! \name Data Gathering
    //@{
    //! The default value used to separate components of a multivariate feature
    //! in its string value.
    static const std::string DEFAULT_MULTIVARIATE_COMPONENT_DELIMITER;

    //! Bucket length if none is specified on the command line.
    static const core_t::TTime DEFAULT_BUCKET_LENGTH;

    //! Default maximum number of buckets for receiving out of order records.
    static const std::size_t DEFAULT_LATENCY_BUCKETS;

    //! Bucket length corresponding to the default decay and learn rates.
    static const core_t::TTime STANDARD_BUCKET_LENGTH;
    //@}

    //! \name Modelling
    //@{
    //! The default rate at which the model priors decay to non-informative
    //! per standard bucket length.
    static const double DEFAULT_DECAY_RATE;

    //! The initial rate, as a multiple of the default decay rate, at which
    //! the model priors decay to non-informative per standard bucket length.
    static const double DEFAULT_INITIAL_DECAY_RATE_MULTIPLIER;

    //! The rate at which information accrues in the model per standard
    //! bucket length elapsed.
    static const double DEFAULT_LEARN_RATE;

    //! The default minimum permitted fraction of points in a distribution
    //! mode for individual modeling.
    static const double DEFAULT_INDIVIDUAL_MINIMUM_MODE_FRACTION;

    //! The default minimum permitted fraction of points in a distribution
    //! mode for population modeling.
    static const double DEFAULT_POPULATION_MINIMUM_MODE_FRACTION;

    //! The default minimum count we'll permit in a cluster.
    static const double DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT;

    //! The default proportion of initial count at which we'll delete a
    //! category from the sketch to cluster.
    static const double DEFAULT_CATEGORY_DELETE_FRACTION;

    //! The default size of the seasonal components we will model.
    static const std::size_t DEFAULT_COMPONENT_SIZE;

    //! The default minimum time to detect a change point in a time series.
    static const core_t::TTime DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE;

    //! The default maximum time to test for a change point in a time series.
    static const core_t::TTime DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE;

    //! The default number of time buckets used to generate multibucket features
    //! for anomaly detection.
    static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;

    //! The maximum value that the multi_bucket_impact can take.
    static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;

    //! The maximum number of times we'll update a model in a bucketing
    //! interval. This only applies to our metric statistics, which are
    //! computed on a fixed number of measurements rather than a fixed
    //! time interval. A value of zero implies no constraint.
    static const double DEFAULT_MAXIMUM_UPDATES_PER_BUCKET;

    //! The default minimum value for the influence for which an influencing
    //! field value is judged to have any influence on a feature value.
    static const double DEFAULT_INFLUENCE_CUTOFF;

    //! The default scale factor of the decayRate that determines the minimum
    //! size of the sliding prune window for purging older entries from the
    //! model.
    static const double DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM;

    //! The default scale factor of the decayRate that determines the maximum
    //! size of the sliding prune window for purging older entries from the
    //! model.
    static const double DEFAULT_PRUNE_WINDOW_SCALE_MAXIMUM;

    //! The default factor increase in priors used to model correlations.
    static const double DEFAULT_CORRELATION_MODELS_OVERHEAD;

    //! The default threshold for the Pearson correlation coefficient at
    //! which a correlate will be modeled.
    static const double DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! The default values for the aggregation styles' parameters.
    static const double DEFAULT_AGGREGATION_STYLE_PARAMS[model_t::NUMBER_AGGREGATION_STYLES]
                                                        [model_t::NUMBER_AGGREGATION_PARAMS];

    //! The default maximum probability which is deemed to be anomalous.
    static const double DEFAULT_MAXIMUM_ANOMALOUS_PROBABILITY;
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! The default historic anomaly score percentile for which lower
    //! values are classified as noise.
    static const double DEFAULT_NOISE_PERCENTILE;

    //! The default multiplier applied to the noise level score in
    //! order to be classified as anomalous.
    static const double DEFAULT_NOISE_MULTIPLIER;

    //! We use a piecewise linear mapping between the raw anomaly score
    //! and the normalized anomaly score with these default knot points.
    //! In particular, if we define the percentile of a raw score \f$s\f$
    //! as \f$f_q(s)\f$ and \f$a = \max\{x \le f_q(s)\}\f$ and
    //! \f$b = \min\{x \ge f_q(s)\}\f$ where \f$x\f$ ranges over the knot point
    //! X- values then the normalized score would be:\n
    //! <pre class="fragment">
    //!   \f$\displaystyle \bar{s} = y(a) + \frac{(y(b) - y(a))(f_q(s) - a)}{b - a}\f$
    //! </pre>
    //! Here, \f$y(.)\f$ denote the corresponding knot point Y- values, so
    //! the curve passes through \f$y(a)\f$ at \f$a\f$ and \f$y(b)\f$ at \f$b\f$.
    static const TDoubleDoublePr DEFAULT_NORMALIZED_SCORE_KNOT_POINTS[9];
    //@}

public:
    //! Create the default configuration.
    //!
    //! \param[in] bucketLength The bucketing interval length.
    //! \param[in] summaryMode Indicates whether the data being gathered
    //! are already summarized by an external aggregation process.
    //! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
    //! then this is the name of the field holding the summary count.
    //! \param[in] latency The amount of time records are buffered for, to
    //! allow out-of-order records to be seen by the models in order.
    //! \param[in] multivariateByFields Should multivariate analysis of
    //! correlated 'by' fields be performed?
    static CAnomalyDetectorModelConfig defaultConfig(core_t::TTime bucketLength,
                                                     model_t::ESummaryMode summaryMode,
                                                     const std::string& summaryCountFieldName,
                                                     core_t::TTime latency,
                                                     bool multivariateByFields);

    //! Overload using defaults.
    //!
    //! Forwards to the five argument overload with a latency of
    //! DEFAULT_LATENCY_BUCKETS * \p bucketLength and multivariate analysis
    //! of 'by' fields disabled.
    static CAnomalyDetectorModelConfig
    defaultConfig(core_t::TTime bucketLength = DEFAULT_BUCKET_LENGTH,
                  model_t::ESummaryMode summaryMode = model_t::E_None,
                  const std::string& summaryCountFieldName = "") {
        return defaultConfig(bucketLength, summaryMode, summaryCountFieldName,
                             DEFAULT_LATENCY_BUCKETS * bucketLength, false);
    }

    //! Get the factor to normalize all bucket lengths to the default
    //! bucket length.
    static double bucketNormalizationFactor(core_t::TTime bucketLength);

    //! Get the decay rate to use for the time series decomposition given
    //! the model decay rate \p modelDecayRate.
    static double trendDecayRate(double modelDecayRate, core_t::TTime bucketLength);

public:
    CAnomalyDetectorModelConfig();

    //! Set the data bucketing interval.
    void bucketLength(core_t::TTime length);

    //! Set the single interim bucket correction calculator.
    void interimBucketCorrector(const TInterimBucketCorrectorPtr& interimBucketCorrector);

    //! Set whether to model multibucket features.
    void useMultibucketFeatures(bool enabled);

    //! Set whether multivariate analysis of correlated 'by' fields should
    //! be performed.
    void multivariateByFields(bool enabled);

    //! Set the model factories.
    void factories(const TFactoryTypeFactoryPtrMap& factories);

    //! Set the style and parameter value for raw score aggregation.
    bool aggregationStyleParams(model_t::EAggregationStyle style,
                                model_t::EAggregationParam param,
                                double value);

    //! Set the maximum anomalous probability.
    void maximumAnomalousProbability(double probability);

    //! Set the noise level as a percentile of historic raw anomaly scores.
    bool noisePercentile(double percentile);

    //! Set the noise multiplier to use when derating normalized scores
    //! based on the noise score level.
    bool noiseMultiplier(double multiplier);

    //! Set the normalized score knot points for the piecewise linear curve
    //! between historic raw score percentiles and normalized scores.
    bool normalizedScoreKnotPoints(const TDoubleDoublePrVec& points);

    //! Populate the parameters from a configuration file.
    bool init(const std::string& configFile);

    //! Populate the parameters from a configuration file, also retrieving
    //! the raw property tree created from the config file. (The raw
    //! property tree is only valid if the method returns true.)
    bool init(const std::string& configFile, boost::property_tree::ptree& propTree);

    //! Populate the parameters from a property tree.
    bool init(const boost::property_tree::ptree& propTree);

    //! Get the factory for new models.
    //!
    //! \param[in] key The key of the detector for which the factory will be
    //! used.
    TModelFactoryCPtr factory(const CSearchKey& key) const;

    //! Get the factory for new models.
    //!
    //! \param[in] identifier The identifier of the search for which to get a model
    //! factory.
    //! \param[in] function The function being invoked.
    //! \param[in] useNull If true then we will process missing fields as if their
    //! value is equal to the empty string where possible.
    //! \param[in] excludeFrequent Whether to discard frequent results
    //! \param[in] partitionFieldName The name of the partition field.
    //! \param[in] personFieldName The name of the over field.
    //! \param[in] attributeFieldName The name of the by field.
    //! \param[in] valueFieldName The name of the field containing metric values.
    //! \param[in] influenceFieldNames The list of influence field names.
    TModelFactoryCPtr
    factory(int identifier,
            function_t::EFunction function,
            bool useNull = false,
            model_t::EExcludeFrequent excludeFrequent = model_t::E_XF_None,
            const std::string& partitionFieldName = std::string(),
            const std::string& personFieldName = std::string(),
            const std::string& attributeFieldName = std::string(),
            const std::string& valueFieldName = std::string(),
            const CSearchKey::TStrVec& influenceFieldNames = CSearchKey::TStrVec()) const;

    //! Set the rate at which the models lose information.
    void decayRate(double value);

    //! Get the rate at which the models lose information.
    double decayRate() const;

    //! Get the length of the baseline.
    core_t::TTime baselineLength() const;

    //! Get the bucket length.
    core_t::TTime bucketLength() const;

    //! Get the period of time at which to perform a potential prune of the models
    //! expressed in number of seconds.
    core_t::TTime modelPruneWindow() const;

    //! Set the period of time at which to perform a potential prune of the models
    //! expressed in number of seconds.
    void modelPruneWindow(core_t::TTime modelPruneWindow);

    //! Get the maximum latency in the arrival of out of order data.
    core_t::TTime latency() const;

    //! Get the maximum latency in the arrival of out of order data in
    //! numbers of buckets.
    std::size_t latencyBuckets() const;

    //! Get the single interim bucket correction calculator.
    const CInterimBucketCorrector& interimBucketCorrector() const;

    //! Should multivariate analysis of correlated 'by' fields be performed?
    bool multivariateByFields() const;

    //! \name Model Plot
    //@{
    //! Configure modelPlotConfig params from file
    bool configureModelPlot(const std::string& modelPlotConfigFile);

    //! Configure modelPlotConfig params from a property tree
    //! expected to contain three properties: 'boundsPercentile', 'annotationsEnabled'
    //! and 'terms'
    bool configureModelPlot(const boost::property_tree::ptree& propTree);

    //! Configure modelPlotConfig params directly, from the three arguments
    //! \p modelPlotEnabled, \p annotationsEnabled and \p terms.
    //! This initialisation method does not allow setting the value of the
    //! 'boundsPercentile' property, instead a default value is used when
    //! \p modelPlotEnabled is true and a value of -1.0 is used otherwise.
    void configureModelPlot(bool modelPlotEnabled,
                            bool annotationsEnabled,
                            const std::string& terms);

    //! Set the central confidence interval for the model debug plot
    //! to \p percentile.
    //!
    //! This controls upper and lower confidence interval error bars
    //! returned by the model debug plot.
    //! \note \p percentile should be in the range [0.0, 100.0).
    void modelPlotBoundsPercentile(double percentile);

    //! Get the central confidence interval for the model debug plot.
    double modelPlotBoundsPercentile() const;

    //! Is model plot enabled?
    bool modelPlotEnabled() const;

    //! Are annotations enabled for each of the models?
    bool modelPlotAnnotationsEnabled() const;

    //! Set terms (by, over, or partition field values) to filter
    //! model debug data. When empty, no filtering is applied.
    void modelPlotTerms(TStrSet terms);

    //! Get the terms (by, over, or partition field values)
    //! used to filter model debug data. Empty when no filtering applies.
    const TStrSet& modelPlotTerms() const;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! Get the value of the aggregation style parameter identified by
    //! \p style and \p param.
    double aggregationStyleParam(model_t::EAggregationStyle style,
                                 model_t::EAggregationParam param) const;

    //! Get the maximum anomalous probability.
    double maximumAnomalousProbability() const;
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! Get the historic anomaly score percentile for which lower
    //! values are classified as noise.
    double noisePercentile() const;

    //! Get the multiplier applied to the noise level score in order
    //! to be classified as anomalous.
    double noiseMultiplier() const;

    //! Get the normalized anomaly score knot points.
    const TDoubleDoublePrVec& normalizedScoreKnotPoints() const;
    //@}

    //! Sets the reference to the detection rules map.
    //!
    //! \note Only a std::reference_wrapper is stored, so the referenced map
    //! has to remain valid for as long as this object may use it.
    void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules);

    //! Sets the reference to the scheduled events vector.
    //!
    //! \note Only a std::reference_wrapper is stored, so the referenced vector
    //! has to remain valid for as long as this object may use it.
    void scheduledEvents(TStrDetectionRulePrVecCRef scheduledEvents);

    //! Process the properties of a single configuration stanza.
    //!
    //! \param[in] propertyTree The properties of the stanza.
    bool processStanza(const boost::property_tree::ptree& propertyTree);

    //! Get the factor to normalize all bucket lengths to the default
    //! bucket length.
    double bucketNormalizationFactor() const;

private:
    // Every scalar member below carries a default member initializer so that
    // none can be read while indeterminate. A mem-initializer supplied by the
    // constructor (defined outside this header) takes precedence over these.

    //! Bucket length.
    core_t::TTime m_BucketLength{0};

    //! Prune window length (in seconds)
    core_t::TTime m_ModelPruneWindow{0};

    //! Should multivariate analysis of correlated 'by' fields be performed?
    bool m_MultivariateByFields{false};

    //! The single interim bucket correction calculator.
    TInterimBucketCorrectorPtr m_InterimBucketCorrector;

    //! The new model factories for each data type.
    TFactoryTypeFactoryPtrMap m_Factories;

    //! A cache of customized factories requested from this config.
    mutable TSearchKeyFactoryCPtrMap m_FactoryCache;

    //! \name Model Plot
    //@{
    //! Is model plot enabled?
    bool m_ModelPlotEnabled{false};

    //! Are annotations enabled for each of the models?
    bool m_ModelPlotAnnotationsEnabled{false};

    //! The central confidence interval for the model debug plot.
    //! Starts at -1.0, the value documented on
    //! configureModelPlot(bool, bool, const std::string&) for the case where
    //! model plot is not enabled.
    double m_ModelPlotBoundsPercentile{-1.0};

    //! Terms (by, over, or partition field values) used to filter model
    //! debug data. Empty when no filtering applies.
    TStrSet m_ModelPlotTerms;
    //@}

    //! \name Anomaly Score Calculation
    //@{
    //! The values for the aggregation styles' parameters.
    //! Zero-filled here because an array member cannot be copy-initialised
    //! from DEFAULT_AGGREGATION_STYLE_PARAMS.
    //! NOTE(review): presumably the constructor copies the defaults in - confirm.
    double m_AggregationStyleParams[model_t::NUMBER_AGGREGATION_STYLES]
                                   [model_t::NUMBER_AGGREGATION_PARAMS]{};

    //! The maximum probability which is deemed to be anomalous.
    double m_MaximumAnomalousProbability{DEFAULT_MAXIMUM_ANOMALOUS_PROBABILITY};
    //@}

    //! \name Anomaly Score Normalization
    //@{
    //! The historic anomaly score percentile for which lower values
    //! are classified as noise.
    double m_NoisePercentile{DEFAULT_NOISE_PERCENTILE};

    //! The multiplier applied to the noise level score in order to
    //! be classified as anomalous.
    double m_NoiseMultiplier{DEFAULT_NOISE_MULTIPLIER};

    //! We use a piecewise linear mapping between the raw anomaly score
    //! and the normalized anomaly score with these knot points.
    //! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details.
    TDoubleDoublePrVec m_NormalizedScoreKnotPoints;
    //@}

    //! A reference to the map containing detection rules per
    //! detector key. Note that the owner of the map is CAnomalyJobConfig::CAnalysisConfig.
    TIntDetectionRuleVecUMapCRef m_DetectionRules;

    //! A reference to the vector of scheduled events.
    //! The owner of the vector is CAnomalyJobConfig::CAnalysisConfig.
    TStrDetectionRulePrVecCRef m_ScheduledEvents;
};
}
}
#endif // INCLUDED_ml_model_CAnomalyDetectorModelConfig_h