Skip to content

Commit d8c1d07

Browse files
committed
Fix overdue state doesn't account for timeperiods in HA cluster
1 parent c2c9b6b commit d8c1d07

File tree

13 files changed

+80
-59
lines changed

13 files changed

+80
-59
lines changed

lib/checker/checkercomponent.cpp

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ void CheckerComponent::CheckThreadProc()
135135

136136
bool forced = checkable->GetForceNextCheck();
137137
bool check = true;
138-
bool notifyNextCheck = false;
138+
bool activeChecksDisabled = false;
139139
double nextCheck = -1;
140140

141141
if (!forced) {
@@ -144,19 +144,18 @@ void CheckerComponent::CheckThreadProc()
144144
<< "Skipping check for object '" << checkable->GetName() << "': Dependency failed.";
145145

146146
check = false;
147-
notifyNextCheck = true;
148147
}
149148

150149
Host::Ptr host;
151150
Service::Ptr service;
152151
tie(host, service) = GetHostService(checkable);
153152

154-
if (host && !service && (!checkable->GetEnableActiveChecks() || !icingaApp->GetEnableHostChecks())) {
153+
activeChecksDisabled = !checkable->GetEnableActiveChecks() || !icingaApp->GetEnableServiceChecks();
154+
if (activeChecksDisabled && host && !service) {
155155
Log(LogNotice, "CheckerComponent")
156156
<< "Skipping check for host '" << host->GetName() << "': active host checks are disabled";
157157
check = false;
158-
}
159-
if (host && service && (!checkable->GetEnableActiveChecks() || !icingaApp->GetEnableServiceChecks())) {
158+
} else if (activeChecksDisabled) {
160159
Log(LogNotice, "CheckerComponent")
161160
<< "Skipping check for service '" << service->GetName() << "': active service checks are disabled";
162161
check = false;
@@ -181,7 +180,6 @@ void CheckerComponent::CheckThreadProc()
181180
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck);
182181

183182
check = false;
184-
notifyNextCheck = true;
185183
}
186184
}
187185
}
@@ -192,17 +190,12 @@ void CheckerComponent::CheckThreadProc()
192190
lock.unlock();
193191

194192
if (nextCheck > 0) {
195-
checkable->SetNextCheck(nextCheck);
193+
checkable->SetNextCheck(nextCheck, false, Empty, !activeChecksDisabled);
196194
} else {
197195
Log(LogDebug, "CheckerComponent")
198196
<< "Checks for checkable '" << checkable->GetName() << "' are disabled. Rescheduling check.";
199197

200-
checkable->UpdateNextCheck();
201-
}
202-
203-
if (notifyNextCheck) {
204-
// Trigger update event for Icinga DB
205-
Checkable::OnNextCheckUpdated(checkable);
198+
checkable->UpdateNextCheck(Empty, !activeChecksDisabled);
206199
}
207200

208201
lock.lock();

lib/db_ido/dbevents.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ void DbEvents::StaticInitialize()
4040
DbEvents::RemoveAcknowledgement(checkable);
4141
});
4242

43-
Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable) { NextCheckUpdatedHandler(checkable); });
43+
Checkable::OnNextCheckUpdated.connect([](const Checkable::Ptr& checkable, const Value&, bool affectsDB) {
44+
NextCheckUpdatedHandler(checkable, affectsDB);
45+
});
4446
Checkable::OnFlappingChanged.connect([](const Checkable::Ptr& checkable, const Value&) { FlappingChangedHandler(checkable); });
4547
Checkable::OnNotificationSentToAllUsers.connect([](const Notification::Ptr& notification, const Checkable::Ptr& checkable,
4648
const std::set<User::Ptr>&, const NotificationType&, const CheckResult::Ptr&, const String&, const String&,
@@ -118,8 +120,12 @@ void DbEvents::StaticInitialize()
118120
}
119121

120122
/* check events */
121-
void DbEvents::NextCheckUpdatedHandler(const Checkable::Ptr& checkable)
123+
void DbEvents::NextCheckUpdatedHandler(const Checkable::Ptr& checkable, bool affectsDB)
122124
{
125+
if (!affectsDB) {
126+
return;
127+
}
128+
123129
Host::Ptr host;
124130
Service::Ptr service;
125131
tie(host, service) = GetHostService(checkable);

lib/db_ido/dbevents.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class DbEvents
5252
static void AddLogHistory(const Checkable::Ptr& checkable, const String& buffer, LogEntryType type);
5353

5454
/* Status */
55-
static void NextCheckUpdatedHandler(const Checkable::Ptr& checkable);
55+
static void NextCheckUpdatedHandler(const Checkable::Ptr& checkable, bool affectsDB);
5656
static void FlappingChangedHandler(const Checkable::Ptr& checkable);
5757
static void LastNotificationChangedHandler(const Notification::Ptr& notification, const Checkable::Ptr& checkable);
5858

lib/icinga/apiactions.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,6 @@ Dictionary::Ptr ApiActions::RescheduleCheck(const ConfigObject::Ptr& object,
159159

160160
checkable->SetNextCheck(nextCheck);
161161

162-
/* trigger update event for DB IDO */
163-
Checkable::OnNextCheckUpdated(checkable);
164-
165162
return ApiActions::CreateResult(200, "Successfully rescheduled check for object '" + checkable->GetName() + "'.");
166163
}
167164

lib/icinga/checkable-check.cpp

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, co
2222
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, StateType, const MessageOrigin::Ptr&)> Checkable::OnStateChange;
2323
boost::signals2::signal<void (const Checkable::Ptr&, const CheckResult::Ptr&, std::set<Checkable::Ptr>, const MessageOrigin::Ptr&)> Checkable::OnReachabilityChanged;
2424
boost::signals2::signal<void (const Checkable::Ptr&, NotificationType, const CheckResult::Ptr&, const String&, const String&, const MessageOrigin::Ptr&)> Checkable::OnNotificationsRequested;
25-
boost::signals2::signal<void (const Checkable::Ptr&)> Checkable::OnNextCheckUpdated;
25+
boost::signals2::signal<void (const Checkable::Ptr&, const Value&, bool)> Checkable::OnNextCheckUpdated;
2626

2727
Atomic<uint_fast64_t> Checkable::CurrentConcurrentChecks (0);
2828

@@ -50,7 +50,32 @@ long Checkable::GetSchedulingOffset()
5050
return m_SchedulingOffset;
5151
}
5252

53-
void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
53+
/**
54+
* Sets the next check time of the current checkable.
55+
*
56+
* This method is a wrapper around the internal @c ObjectImpl::SetNextCheck method that provides additional
57+
* functionality by triggering the @c Checkable::OnNextCheckUpdated event unless suppressed. This event is
58+
* the same as the auto generated event for the property change, but it also provides an additional parameter
59+
* to broadcast to subscribers whether this timestamp change should be persisted to the database. Both events
60+
* will be triggered only if `suppressEvents` isn't set to true. So, if you're interested on the additional argument,
61+
* you should subscribe to the @c Checkable::OnNextCheckUpdated event, otherwise you can just continue to use the
62+
* @c Checkable::OnNextCheckChanged event as before.
63+
*
64+
* @param value The new next check time as a Timestamp.
65+
* @param suppressEvents If true, the OnNextCheckUpdated event will not be triggered.
66+
* @param cookie An optional Value that can be passed to the event handler.
67+
* @param affectsDB If true, indicates that the change should be persisted to the database.
68+
*/
69+
void Checkable::SetNextCheck(const Timestamp& value, bool suppressEvents, const Value& cookie, bool affectsDB)
70+
{
71+
ObjectImpl::SetNextCheck(value, suppressEvents, cookie);
72+
73+
if (!suppressEvents) {
74+
OnNextCheckUpdated(Ptr(this), cookie, affectsDB);
75+
}
76+
}
77+
78+
void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin, bool affectsDB)
5479
{
5580
double interval;
5681

@@ -78,7 +103,7 @@ void Checkable::UpdateNextCheck(const MessageOrigin::Ptr& origin)
78103
<< " (" << lastCheck << ") to next check time at "
79104
<< Utility::FormatDateTime("%Y-%m-%d %H:%M:%S %z", nextCheck) << " (" << nextCheck << ").";
80105

81-
SetNextCheck(nextCheck, false, origin);
106+
SetNextCheck(nextCheck, false, origin, affectsDB);
82107
}
83108

84109
bool Checkable::HasBeenChecked() const
@@ -372,7 +397,9 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
372397
// is randomly initialised on each node.
373398
if (!origin) {
374399
if (cr->GetActive()) {
375-
UpdateNextCheck();
400+
// The OnCheckResult event triggered down below implicitly triggers DB next update,
401+
// so we don't spam the DB with the same information twice.
402+
UpdateNextCheck(Empty, false);
376403
} else {
377404
/* Reschedule the next check for external passive check results. The side effect of
378405
* this is that for as long as we receive results for a service we
@@ -385,7 +412,7 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
385412
else
386413
offset = GetCheckInterval();
387414

388-
SetNextCheck(Utility::GetTime() + offset);
415+
SetNextCheck(Utility::GetTime() + offset, false, Empty, false);
389416
}
390417
}
391418

@@ -518,7 +545,9 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
518545
ObjectLock oLock (child);
519546

520547
if (nextCheck < child->GetNextCheck()) {
521-
child->SetNextCheck(nextCheck);
548+
// This is purely a cheat to force a re-check of the child object ASAP for reachability,
549+
// but this shouldn't be relevant for Icinga DB or IDO, so we don't persist this change.
550+
child->SetNextCheck(nextCheck, false, Empty, false);
522551
}
523552
}
524553
}
@@ -535,7 +564,9 @@ Checkable::ProcessingResult Checkable::ProcessCheckResult(const CheckResult::Ptr
535564

536565
if (parent->GetNextCheck() >= now + parent->GetRetryInterval()) {
537566
ObjectLock olock(parent);
538-
parent->SetNextCheck(now);
567+
// This is same as above, we just want to force a re-check of the parent ASAP.
568+
// No need to persist this change to Icinga DB or IDO.
569+
parent->SetNextCheck(now, false, Empty, false);
539570
}
540571
}
541572
}
@@ -571,7 +602,7 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
571602
* queues and ensures that checks are not fired multiple times. ProcessCheckResult()
572603
* is called too late. See #6421.
573604
*/
574-
UpdateNextCheck();
605+
UpdateNextCheck(Empty, false);
575606

576607
bool reachable = IsReachable();
577608

@@ -644,7 +675,7 @@ void Checkable::ExecuteCheck(const WaitGroup::Ptr& producer)
644675
* a check result from the remote instance. The check will be re-scheduled
645676
* using the proper check interval once we've received a check result.
646677
*/
647-
SetNextCheck(Utility::GetTime() + checkTimeout + 30);
678+
SetNextCheck(Utility::GetTime() + checkTimeout + 30, false, Empty, false);
648679

649680
/*
650681
* Let the user know that there was a problem with the check if

lib/icinga/checkable.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,19 @@ void Checkable::Start(bool runtimeCreated)
8888
auto cr (GetLastCheckResult());
8989

9090
if (GetLastCheckStarted() > (cr ? cr->GetExecutionEnd() : 0.0)) {
91-
SetNextCheck(GetLastCheckStarted());
91+
// This means a check was running when we were restarted, so IcingaDB or IDO should
92+
// have received and persisted the correct next update time. So, they don't have to
93+
// know about this cheat we're doing here.
94+
SetNextCheck(GetLastCheckStarted(), false, Empty, false);
9295
}
9396
}
9497

9598
if (GetNextCheck() < now + 60) {
9699
double delta = std::min(GetCheckInterval(), 60.0);
97100
delta *= (double)std::rand() / RAND_MAX;
98-
SetNextCheck(now + delta);
101+
// This is the same as above, we just want to spread out the checks a bit after a restart.
102+
// No need to persist this change to Icinga DB or IDO.
103+
SetNextCheck(now + delta, false, Empty, false);
99104
}
100105

101106
ObjectImpl<Checkable>::Start(runtimeCreated);

lib/icinga/checkable.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ class Checkable : public ObjectImpl<Checkable>
104104
long GetSchedulingOffset();
105105
void SetSchedulingOffset(long offset);
106106

107-
void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr);
107+
void SetNextCheck(const Timestamp& value, bool suppressEvents = false, const Value& cookie = Empty, bool affectsDB = true);
108+
void UpdateNextCheck(const MessageOrigin::Ptr& origin = nullptr, bool affectsDB = true);
108109

109110
static String StateTypeToString(StateType type);
110111

@@ -146,7 +147,7 @@ class Checkable : public ObjectImpl<Checkable>
146147
bool, bool, double, double, const MessageOrigin::Ptr&)> OnAcknowledgementSet;
147148
static boost::signals2::signal<void (const Checkable::Ptr&, const String&, double, const MessageOrigin::Ptr&)> OnAcknowledgementCleared;
148149
static boost::signals2::signal<void (const Checkable::Ptr&, double)> OnFlappingChange;
149-
static boost::signals2::signal<void (const Checkable::Ptr&)> OnNextCheckUpdated;
150+
static boost::signals2::signal<void (const Checkable::Ptr&, const Value&, bool)> OnNextCheckUpdated;
150151
static boost::signals2::signal<void (const Checkable::Ptr&)> OnEventCommandExecuted;
151152

152153
static Atomic<uint_fast64_t> CurrentConcurrentChecks;

lib/icinga/checkable.ti

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ abstract class Checkable : CustomVarObject
9292
[config] String icon_image;
9393
[config] String icon_image_alt;
9494

95-
[state] Timestamp next_check;
95+
[state, set_protected] Timestamp next_check;
9696
[state, no_user_view, no_user_modify] Timestamp last_check_started;
9797

9898
[state] int check_attempt {

lib/icinga/clusterevents.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ REGISTER_APIFUNCTION(SetRemovalInfo, event, &ClusterEvents::SetRemovalInfoAPIHan
4646
void ClusterEvents::StaticInitialize()
4747
{
4848
Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler);
49-
Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler);
49+
Checkable::OnNextCheckUpdated.connect(&ClusterEvents::NextCheckChangedHandler);
5050
Checkable::OnLastCheckStartedChanged.connect(&ClusterEvents::LastCheckStartedChangedHandler);
5151
Checkable::OnStateBeforeSuppressionChanged.connect(&ClusterEvents::StateBeforeSuppressionChangedHandler);
5252
Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler);
@@ -183,7 +183,7 @@ Value ClusterEvents::CheckResultAPIHandler(const MessageOrigin::Ptr& origin, con
183183
return Empty;
184184
}
185185

186-
void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin)
186+
void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool affectsDB)
187187
{
188188
ApiListener::Ptr listener = ApiListener::GetInstance();
189189

@@ -199,6 +199,7 @@ void ClusterEvents::NextCheckChangedHandler(const Checkable::Ptr& checkable, con
199199
if (service)
200200
params->Set("service", service->GetShortName());
201201
params->Set("next_check", checkable->GetNextCheck());
202+
params->Set("affects_db", affectsDB);
202203

203204
Dictionary::Ptr message = new Dictionary();
204205
message->Set("jsonrpc", "2.0");
@@ -245,7 +246,7 @@ Value ClusterEvents::NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin
245246
if (nextCheck < Application::GetStartTime() + 60)
246247
return Empty;
247248

248-
checkable->SetNextCheck(params->Get("next_check"), false, origin);
249+
checkable->SetNextCheck(params->Get("next_check"), false, origin, params->Get("affects_db").ToBool());
249250

250251
return Empty;
251252
}

lib/icinga/clusterevents.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class ClusterEvents
2323
static void CheckResultHandler(const Checkable::Ptr& checkable, const CheckResult::Ptr& cr, const MessageOrigin::Ptr& origin);
2424
static Value CheckResultAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
2525

26-
static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
26+
static void NextCheckChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin, bool affectsDB);
2727
static Value NextCheckChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);
2828

2929
static void LastCheckStartedChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);

0 commit comments

Comments
 (0)