Skip to content

[ML] Report the "actual" memory usage of the autodetect process #2846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a7b1ce2
[ML] Report the "actual" memory usage of the autodetect process
edsavage Apr 4, 2025
cb957ca
Update changelog
edsavage Apr 4, 2025
8f73f02
Formatting
edsavage Apr 4, 2025
d3a39ae
Appease SonarQube
edsavage Apr 7, 2025
e2d1bf5
Tweak unit test for platform portability
edsavage Apr 7, 2025
1a9a99a
Attend to review comments
edsavage Apr 9, 2025
5ae22cb
Update bin/autodetect/Main.cc
edsavage Apr 10, 2025
fe6f1fa
Update include/model/CResourceMonitor.h
edsavage Apr 10, 2025
582430e
Update include/model/CResourceMonitor.h
edsavage Apr 10, 2025
9476ede
Attend to review comments
edsavage Apr 11, 2025
e29d8f7
Merge remote-tracking branch 'origin/ad_real_mem_usage' into ad_real_…
edsavage Apr 11, 2025
475fef1
Formatting
edsavage Apr 11, 2025
fc8888c
Merge branch 'main' of github.com:elastic/ml-cpp into ad_real_mem_usage
edsavage May 23, 2025
6945d3f
Attend to failing unit tests (hopefully)
edsavage May 27, 2025
3b69b72
Formatting.. grr
edsavage May 27, 2025
1bafa8c
On Linux only, use the value of the system memory usage (max resident…
edsavage Jun 4, 2025
efc311b
Fix copyright headers
edsavage Jun 4, 2025
6be6395
Nits in test code
edsavage Jun 4, 2025
134a494
Attend to code review comments
edsavage Jun 9, 2025
ed426ac
Formatting
edsavage Jun 9, 2025
4c8bf8b
Attend to code review comments
edsavage Jun 11, 2025
fa9c4fa
Small tidy up of CProcessMemoryUsage.cc
edsavage Jun 12, 2025
014be1c
Small tidy up
edsavage Jun 12, 2025
41c02de
On Linux, return system memory (max resident set size) from CResource…
edsavage Jun 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion bin/autodetect/Main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <core/CJsonOutputStreamWrapper.h>
#include <core/CLogger.h>
#include <core/CProcessPriority.h>
#include <core/CProcessStats.h>
#include <core/CProgramCounters.h>
#include <core/CStringUtils.h>
#include <core/CoreTypes.h>
Expand Down Expand Up @@ -83,7 +84,9 @@ int main(int argc, char** argv) {
ml::counter_t::E_TSADNumberMemoryLimitModelCreationFailures,
ml::counter_t::E_TSADNumberPrunedItems,
ml::counter_t::E_TSADAssignmentMemoryBasis,
ml::counter_t::E_TSADOutputMemoryAllocatorUsage};
ml::counter_t::E_TSADOutputMemoryAllocatorUsage,
ml::counter_t::E_TSADSystemMemoryUsage,
ml::counter_t::E_TSADMaxSystemMemoryUsage};

ml::core::CProgramCounters::registerProgramCounterTypes(counters);

Expand Down Expand Up @@ -151,6 +154,8 @@ int main(int argc, char** argv) {
}
cancellerThread.stop();

LOG_DEBUG(<< "Max Resident Set Size: " << ml::core::CProcessStats::maxResidentSetSize());
LOG_DEBUG(<< "Resident Set Size: " << ml::core::CProcessStats::residentSetSize());
// Log the program version immediately after reconfiguring the logger. This
// must be done from the program, and NOT a shared library, as each program
// statically links its own version library.
Expand Down
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
=== Enhancements

* Track memory used in the hierarchical results normalizer. (See {ml-pull}2831[#2831].)
* Report the actual memory usage of the autodetect process. (See {ml-pull}2846[#2846].)

=== Bug Fixes

Expand Down
12 changes: 11 additions & 1 deletion include/core/CProgramCounters.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ enum ECounterTypes {
//! The memory currently used by the allocators to output JSON documents, in bytes.
E_TSADOutputMemoryAllocatorUsage = 30,

//! The resident set size of the process, in bytes.
E_TSADSystemMemoryUsage = 31,

//! The maximum resident set size of the process, in bytes.
E_TSADMaxSystemMemoryUsage = 32,

// Data Frame Outlier Detection

//! The estimated peak memory usage for outlier detection in bytes
Expand Down Expand Up @@ -146,7 +152,7 @@ enum ECounterTypes {
// Add any new values here

//! This MUST be last, increment the value for every new enum added
E_LastEnumCounter = 31
E_LastEnumCounter = 33
};

static constexpr std::size_t NUM_COUNTERS = static_cast<std::size_t>(E_LastEnumCounter);
Expand Down Expand Up @@ -355,6 +361,10 @@ class CORE_EXPORT CProgramCounters {
"Which option is being used to get model memory for node assignment?"},
{counter_t::E_TSADOutputMemoryAllocatorUsage, "E_TSADOutputMemoryAllocatorUsage",
"The amount of memory used to output JSON documents, in bytes."},
{counter_t::E_TSADSystemMemoryUsage, "E_TSADSystemMemoryUsage",
"The amount of system memory used by the process, in bytes"},
{counter_t::E_TSADMaxSystemMemoryUsage, "E_TSADMaxSystemMemoryUsage",
"The maximum amount of system memory used by the process, in bytes"},
{counter_t::E_DFOEstimatedPeakMemoryUsage, "E_DFOEstimatedPeakMemoryUsage",
"The upfront estimate of the peak memory outlier detection would use"},
{counter_t::E_DFOPeakMemoryUsage, "E_DFOPeakMemoryUsage", "The peak memory outlier detection used"},
Expand Down
40 changes: 40 additions & 0 deletions include/model/CProcessMemoryUsage.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/

#ifndef INCLUDED_ml_model_CProcessMemoryUsage_h
#define INCLUDED_ml_model_CProcessMemoryUsage_h

#include <model/ImportExport.h>

#include <cstddef>

namespace ml {
namespace model {

//! \brief Determines how to calculate the memory used by the current process.
//!
//! DESCRIPTION:\n
//! Determines how to calculate the memory used by the current process based on the operating system.
//! On some OS's (Mac, Windows) we use the estimated memory usage of the models,
//! while on others (Linux) we use the actual memory of the process as provided by system calls.
//!
//! IMPLEMENTATION DECISIONS:\n
//! The class carries no state and cannot be instantiated; it only exposes
//! the strategy constant, whose value is supplied by a platform specific
//! source file (CProcessMemoryUsage.cc or CProcessMemoryUsage_Linux.cc).
class MODEL_EXPORT CProcessMemoryUsage {
public:
    //! The available ways of determining the memory used by the process.
    enum class EMemoryStrategy {
        //! Use the estimated memory usage of the models.
        E_Estimated,
        //! Use the memory usage of the process as reported by the system.
        E_System
    };

    //! The strategy in force on the platform this library was built for.
    static const EMemoryStrategy MEMORY_STRATEGY;

public:
    //! Not instantiable: this class is only a holder for MEMORY_STRATEGY.
    CProcessMemoryUsage() = delete;
};
}
}

#endif // INCLUDED_ml_model_CProcessMemoryUsage_h
9 changes: 9 additions & 0 deletions include/model/CResourceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Returns the current physical memory of the process (rss) as reported by the system
static std::size_t systemMemory();

//! Returns the maximum physical memory of the process (max rss) as reported by the system
static std::size_t maxSystemMemory();

private:
using TMonitoredResourcePtrSizeUMap =
boost::unordered_map<CMonitoredResource*, std::size_t>;
Expand Down Expand Up @@ -229,6 +235,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the amount by which reported memory usage is scaled depending on the type of persistence in use
std::size_t persistenceMemoryIncreaseFactor() const;

//! Modify the supplied usage value depending on a platform dependent strategy.
std::size_t applyMemoryStrategy(std::size_t usage) const;

private:
//! The registered collection of components
TMonitoredResourcePtrSizeUMap m_Resources;
Expand Down
4 changes: 4 additions & 0 deletions lib/api/CAnomalyJob.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ bool CAnomalyJob::handleRecord(const TStrStrUMap& dataRowFields, TOptionalTime t
}

++core::CProgramCounters::counter(counter_t::E_TSADNumberApiRecordsHandled);
core::CProgramCounters::counter(counter_t::E_TSADSystemMemoryUsage) =
model::CResourceMonitor::systemMemory();
core::CProgramCounters::counter(counter_t::E_TSADMaxSystemMemoryUsage) =
model::CResourceMonitor::maxSystemMemory();

++m_NumRecordsHandled;
m_LatestRecordTime = std::max(m_LatestRecordTime, *time);
Expand Down
2 changes: 2 additions & 0 deletions lib/api/CModelSizeStatsJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ const std::string JOB_ID{"job_id"};
const std::string MODEL_SIZE_STATS{"model_size_stats"};
const std::string MODEL_BYTES{"model_bytes"};
const std::string PEAK_MODEL_BYTES{"peak_model_bytes"};
const std::string SYSTEM_MEMORY_BYTES{"system_memory_bytes"};
const std::string MAX_SYSTEM_MEMORY_BYTES{"max_system_memory_bytes"};
Comment on lines +28 to +29
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICS, these keys are not used anymore.

const std::string MODEL_BYTES_EXCEEDED{"model_bytes_exceeded"};
const std::string MODEL_BYTES_MEMORY_LIMIT{"model_bytes_memory_limit"};
const std::string TOTAL_BY_FIELD_COUNT{"total_by_field_count"};
Expand Down
4 changes: 2 additions & 2 deletions lib/api/unittest/CAnomalyJobLimitTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* limitation.
*/
#include <core/CJsonOutputStreamWrapper.h>
#include <core/CProcessStats.h>
#include <core/CoreTypes.h>

#include <maths/common/CIntegerTools.h>
Expand Down Expand Up @@ -105,8 +106,6 @@ BOOST_AUTO_TEST_CASE(testAccuracy) {
core::CJsonOutputStreamWrapper wrappedOutputStream(outputStrm);

model::CLimits limits;
//limits.resourceMonitor().m_ByteLimitHigh = 100000;
//limits.resourceMonitor().m_ByteLimitLow = 90000;

{
LOG_TRACE(<< "Setting up job");
Expand All @@ -129,6 +128,7 @@ BOOST_AUTO_TEST_CASE(testAccuracy) {
nonLimitedUsage = limits.resourceMonitor().totalMemory();
}
}
LOG_DEBUG(<< "nonLimitedUsage: " << nonLimitedUsage);
{
// Now run the data with limiting
ml::api::CAnomalyJobConfig jobConfig = CTestAnomalyJob::makeSimpleJobConfig(
Expand Down
4 changes: 2 additions & 2 deletions lib/api/unittest/CJsonOutputWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1733,7 +1733,7 @@ BOOST_AUTO_TEST_CASE(testReportMemoryUsage) {
resourceUsage.s_OverFields = 7;
resourceUsage.s_AllocationFailures = 8;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_AssignmentMemoryBasis = ml::model_t::E_AssignmentBasisCurrentModelBytes;
resourceUsage.s_AssignmentMemoryBasis = ml::model_t::E_AssignmentBasisPeakModelBytes;
resourceUsage.s_BucketStartTime = 9;
resourceUsage.s_BytesExceeded = 10;
resourceUsage.s_BytesMemoryLimit = 11;
Expand Down Expand Up @@ -1785,7 +1785,7 @@ BOOST_AUTO_TEST_CASE(testReportMemoryUsage) {
BOOST_TEST_REQUIRE(sizeStats.contains("memory_status"));
BOOST_REQUIRE_EQUAL("hard_limit", sizeStats.at("memory_status").as_string());
BOOST_TEST_REQUIRE(sizeStats.contains("assignment_memory_basis"));
BOOST_REQUIRE_EQUAL("current_model_bytes",
BOOST_REQUIRE_EQUAL("peak_model_bytes",
sizeStats.at("assignment_memory_basis").as_string());
BOOST_TEST_REQUIRE(sizeStats.contains("log_time"));
std::int64_t nowMs{ml::core::CTimeUtils::nowMs()};
Expand Down
3 changes: 2 additions & 1 deletion lib/core/CProcessStats_MacOSX.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CLogger.h>
#include <core/CProcessStats.h>

#include <core/CLogger.h>

#include <errno.h>
#include <fcntl.h>
#include <sys/resource.h>
Expand Down
4 changes: 3 additions & 1 deletion lib/core/CProcessStats_Windows.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CLogger.h>
#include <core/CProcessStats.h>

#include <core/CLogger.h>
#include <core/CWindowsError.h>

#include <core/WindowsSafe.h>
Expand All @@ -36,6 +37,7 @@ std::size_t CProcessStats::maxResidentSetSize() {
LOG_DEBUG(<< "Failed to retrieve memory info " << CWindowsError());
return 0;
}

return static_cast<std::size_t>(stats.PeakWorkingSetSize);
}
}
Expand Down
2 changes: 1 addition & 1 deletion lib/core/unittest/CLoggerTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ std::function<void()> makeReader(std::ostringstream& loggedData) {
return;
}
}
BOOST_FAIL("Failed to connect to logging pipe within a reasonable time");
BOOST_TEST_CHECK(false, "Failed to connect to logging pipe within a reasonable time");
};
}

Expand Down
1 change: 1 addition & 0 deletions lib/core/unittest/CNamedPipeFactoryTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ BOOST_AUTO_TEST_CASE(testServerIsCWriter) {
ml::core::CNamedPipeFactory::openPipeFileWrite(TEST_PIPE_NAME, dummy)};
BOOST_TEST_REQUIRE(file);

sleep(1);
std::size_t charsLeft{TEST_SIZE};
std::size_t blockSize{7};
while (charsLeft > 0) {
Expand Down
1 change: 1 addition & 0 deletions lib/model/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ ml_add_library(MlModel SHARED
CSampleCounts.cc
CSearchKey.cc
CSimpleCountDetector.cc
CProcessMemoryUsage.cc
CTokenListCategory.cc
CTokenListDataCategorizerBase.cc
CTokenListReverseSearchCreator.cc
Expand Down
20 changes: 20 additions & 0 deletions lib/model/CProcessMemoryUsage.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/

#include <model/CProcessMemoryUsage.h>

namespace ml {
namespace model {
// Generic (non-Linux) definition of the strategy constant: the process
// memory usage is taken to be the estimated size of the models.
const CProcessMemoryUsage::EMemoryStrategy CProcessMemoryUsage::MEMORY_STRATEGY =
    CProcessMemoryUsage::EMemoryStrategy::E_Estimated;
} // model
} // ml
21 changes: 21 additions & 0 deletions lib/model/CProcessMemoryUsage_Linux.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/

#include <model/CProcessMemoryUsage.h>

namespace ml {
namespace model {

// Linux definition of the strategy constant: the process memory usage is
// the value reported by the operating system.
const CProcessMemoryUsage::EMemoryStrategy CProcessMemoryUsage::MEMORY_STRATEGY =
    CProcessMemoryUsage::EMemoryStrategy::E_System;
} // model
} // ml
33 changes: 29 additions & 4 deletions lib/model/CResourceMonitor.cc
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this change is missing an update to CResourceMonitor::totalMemory(). Please correct me if I'm wrong, but the way I understand this code, totalMemory() on Linux should now simply return systemMemoryUsage().

Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@

#include <core/CLogger.h>
#include <core/CMemoryDef.h>
#include <core/CProcessStats.h>
#include <core/CProgramCounters.h>
#include <core/Constants.h>

#include <maths/common/CMathsFuncs.h>
#include <maths/common/CTools.h>

#include <model/CMonitoredResource.h>
#include <model/CProcessMemoryUsage.h>

#include <algorithm>
#include <cmath>
Expand Down Expand Up @@ -382,7 +384,7 @@ CResourceMonitor::createMemoryUsageReport(core_t::TTime bucketStartTime) {
res.s_PeakUsage = static_cast<std::size_t>(
core::CProgramCounters::counter(counter_t::E_TSADPeakMemoryUsage));
res.s_AdjustedPeakUsage = this->adjustedUsage(res.s_PeakUsage);
res.s_BytesMemoryLimit = this->persistenceMemoryIncreaseFactor() * m_ByteLimitHigh;
res.s_BytesMemoryLimit = this->getBytesMemoryLimit();
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_MemoryStatus = m_MemoryStatus;
std::uint64_t assignmentMemoryBasis{
Expand All @@ -400,6 +402,22 @@ CResourceMonitor::createMemoryUsageReport(core_t::TTime bucketStartTime) {
return res;
}

//! Modify the supplied usage value depending on a platform dependent strategy.
//!
//! \param usage The estimated memory usage, in bytes.
//! \return \p usage unchanged when the strategy is E_Estimated; the maximum
//! resident set size of the process when the strategy is E_System. If the
//! system value is unavailable, or the strategy is not recognised, the
//! estimate is returned rather than zero so that callers never see a
//! spurious "no memory in use" reading.
std::size_t CResourceMonitor::applyMemoryStrategy(std::size_t usage) const {
    // Start from the estimate so every path that cannot do better falls
    // back to it instead of reporting zero.
    std::size_t modifiedUsage{usage};
    switch (CProcessMemoryUsage::MEMORY_STRATEGY) {
    case CProcessMemoryUsage::EMemoryStrategy::E_Estimated: {
        break;
    }
    case CProcessMemoryUsage::EMemoryStrategy::E_System: {
        // The Windows implementation of maxResidentSetSize() returns 0 when
        // the statistics cannot be retrieved; presumably the other platform
        // implementations do the same (TODO confirm). A running process
        // never has a genuine resident set size of zero, so treat 0 as
        // "unavailable" and keep the estimate.
        const std::size_t maxRss{core::CProcessStats::maxResidentSetSize()};
        if (maxRss > 0) {
            modifiedUsage = maxRss;
        }
        break;
    }
    default: {
        LOG_WARN(<< "Unknown memory strategy");
        break;
    }
    }
    return modifiedUsage;
}

std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
// We scale the reported memory usage by the inverse of the byte limit margin.
// This gives the user a fairer indication of how close the job is to hitting
Expand Down Expand Up @@ -486,10 +504,17 @@ std::size_t CResourceMonitor::lowLimit() const {
}

std::size_t CResourceMonitor::totalMemory() const {
return m_MonitoredResourceCurrentMemory + m_ExtraMemory +
static_cast<size_t>(core::CProgramCounters::counter(
counter_t::E_TSADOutputMemoryAllocatorUsage));
return this->applyMemoryStrategy(m_MonitoredResourceCurrentMemory + m_ExtraMemory +
static_cast<size_t>(core::CProgramCounters::counter(
counter_t::E_TSADOutputMemoryAllocatorUsage)));
}

//! Thin forwarder to core::CProcessStats::residentSetSize(): the current
//! resident set size of this process.
std::size_t CResourceMonitor::systemMemory() {
    const std::size_t currentRss{core::CProcessStats::residentSetSize()};
    return currentRss;
}

//! Thin forwarder to core::CProcessStats::maxResidentSetSize(): the maximum
//! resident set size of this process.
std::size_t CResourceMonitor::maxSystemMemory() {
    const std::size_t peakRss{core::CProcessStats::maxResidentSetSize()};
    return peakRss;
}
} // model
} // ml
2 changes: 1 addition & 1 deletion lib/model/unittest/CResourceMonitorTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ BOOST_FIXTURE_TEST_CASE(testExtraMemory, CTestFixture) {
}

BOOST_FIXTURE_TEST_CASE(testPeakUsage, CTestFixture) {
// Clear the counter so that other test cases do not interfere.
// Clear the counters so that other test cases do not interfere.
core::CProgramCounters::counter(counter_t::E_TSADPeakMemoryUsage) = 0;

CLimits limits;
Expand Down