Skip to content

Commit

Permalink
Merge pull request #14996 from miodvallat/lua_finer_intervals
Browse files Browse the repository at this point in the history
Lua up checks: finer control
  • Loading branch information
miodvallat authored Jan 13, 2025
2 parents 057d3cb + 7219311 commit f53b2fd
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 14 deletions.
4 changes: 4 additions & 0 deletions docs/lua-records/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ Record creation functions
- ``backupSelector``: used to pick the address(es) from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random').
- ``source``: Source address to check from
- ``timeout``: Maximum time in seconds that you allow the check to take (default 2)
- ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified.
- ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check.


.. function:: ifurlup(url, addresses[, options])
Expand All @@ -86,9 +88,11 @@ Record creation functions
- ``backupSelector``: used to pick the address from all addresses if all addresses are down. Choices include 'pickclosest', 'random', 'hashed', 'all' (default 'random').
- ``source``: Source address to check from
- ``timeout``: Maximum time in seconds that you allow the check to take (default 2)
- ``interval``: Time interval between two checks, in seconds. Defaults to :ref:`setting-lua-health-checks-interval` if not specified.
- ``stringmatch``: check ``url`` for this string, only declare 'up' if found
- ``useragent``: Set the HTTP "User-Agent" header in the requests. By default it is set to "PowerDNS Authoritative Server"
- ``byteslimit``: Limit the maximum download size to ``byteslimit`` bytes (default 0 meaning no limit).
- ``minimumFailures``: The number of unsuccessful checks in a row required to mark the address as down. Defaults to 1 if not specified, i.e. report as down on the first unsuccessful check.

An example of a list of address sets:

Expand Down
65 changes: 55 additions & 10 deletions pdns/lua-record.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
ponder netmask tree from file for huge number of netmasks
unify ifurlup/ifportup
add attribute for certificate check
add attribute for certificate check in genericIfUp
add list of current monitors
expire them too?
Expand Down Expand Up @@ -73,10 +73,14 @@ class IsUpOracle
std::atomic<bool> status{false};
/* current weight */
std::atomic<int> weight{0};
/* first check ? */
/* first check? */
std::atomic<bool> first{true};
/* number of successive checks returning failure */
std::atomic<unsigned int> failures{0};
/* last time the status was accessed */
std::atomic<time_t> lastAccess{0};
/* last time the status was modified */
std::atomic<time_t> lastStatusUpdate{0};
};

public:
Expand All @@ -91,7 +95,7 @@ class IsUpOracle
int isUp(const CheckDesc& cd);

private:
void checkURL(const CheckDesc& cd, const bool status, const bool first = false)
void checkURL(const CheckDesc& cd, const bool status, const bool first) // NOLINT(readability-identifier-length)
{
setThreadName("pdns/lua-c-url");

Expand Down Expand Up @@ -154,7 +158,7 @@ class IsUpOracle
setDown(cd);
}
}
void checkTCP(const CheckDesc& cd, const bool status, const bool first = false) {
void checkTCP(const CheckDesc& cd, const bool status, const bool first) { // NOLINT(readability-identifier-length)
setThreadName("pdns/lua-c-tcp");
try {
int timeout = 2;
Expand Down Expand Up @@ -192,19 +196,46 @@ class IsUpOracle
std::chrono::system_clock::time_point checkStart = std::chrono::system_clock::now();
std::vector<std::future<void>> results;
std::vector<CheckDesc> toDelete;
time_t interval{g_luaHealthChecksInterval};
{
// make sure there's no insertion
auto statuses = d_statuses.read_lock();
for (auto& it: *statuses) {
auto& desc = it.first;
auto& state = it.second;
time_t checkInterval{0};
auto lastAccess = std::chrono::system_clock::from_time_t(state->lastAccess);

if (desc.opts.count("interval") != 0) {
checkInterval = std::atoi(desc.opts.at("interval").c_str());
if (checkInterval != 0) {
interval = std::gcd(interval, checkInterval);
}
}

if (not state->first) {
time_t nextCheckSecond = state->lastStatusUpdate;
if (checkInterval != 0) {
nextCheckSecond += checkInterval;
}
else {
nextCheckSecond += g_luaHealthChecksInterval;
}
if (checkStart < std::chrono::system_clock::from_time_t(nextCheckSecond)) {
continue; // too early
}
}

if (desc.url.empty()) { // TCP
results.push_back(std::async(std::launch::async, &IsUpOracle::checkTCP, this, desc, state->status.load(), state->first.load()));
} else { // URL
results.push_back(std::async(std::launch::async, &IsUpOracle::checkURL, this, desc, state->status.load(), state->first.load()));
}
if (std::chrono::system_clock::from_time_t(state->lastAccess) < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
// Give it a chance to run at least once.
// If minimumFailures * interval > lua-health-checks-expire-delay, then a down status will never get reported.
// This is unlikely to be a problem in practice due to the default value of the expire delay being one hour.
if (not state->first &&
lastAccess < (checkStart - std::chrono::seconds(g_luaHealthChecksExpireDelay))) {
toDelete.push_back(desc);
}
}
Expand All @@ -223,7 +254,7 @@ class IsUpOracle
// set thread name again, in case std::async surprised us by doing work in this thread
setThreadName("pdns/luaupcheck");

std::this_thread::sleep_until(checkStart + std::chrono::seconds(g_luaHealthChecksInterval));
std::this_thread::sleep_until(checkStart + std::chrono::seconds(interval));
}
}

Expand All @@ -237,9 +268,23 @@ class IsUpOracle
{
auto statuses = d_statuses.write_lock();
auto& state = (*statuses)[cd];
state->status = status;
if (state->first) {
state->first = false;
state->lastStatusUpdate = time(nullptr);
state->first = false;
if (status) {
state->failures = 0;
state->status = true;
} else {
unsigned int minimumFailures = 1;
if (cd.opts.count("minimumFailures") != 0) {
unsigned int value = std::atoi(cd.opts.at("minimumFailures").c_str());
if (value != 0) {
minimumFailures = std::max(minimumFailures, value);
}
}
// Since `status' was set to false at constructor time, we need to
// recompute its value unconditionally to expose "down, but not enough
// times yet" targets as up.
state->status = ++state->failures < minimumFailures;
}
}

Expand Down
23 changes: 23 additions & 0 deletions regression-tests.auth-py/authtests.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,29 @@ def assertAnyRRsetInAnswer(self, msg, rrsets):
raise AssertionError("RRset not found in answer\n%s" %
"\n".join(([ans.to_text() for ans in msg.answer])))

def assertNoneRRsetInAnswer(self, msg, rrsets):
"""Asserts that none of the supplied rrsets exist (without comparing TTL)
in the answer section of msg
@param msg: the dns.message.Message to check
@param rrsets: an array of dns.rrset.RRset object"""

if not isinstance(msg, dns.message.Message):
raise TypeError("msg is not a dns.message.Message")

found = False
for rrset in rrsets:
if not isinstance(rrset, dns.rrset.RRset):
raise TypeError("rrset is not a dns.rrset.RRset")
for ans in msg.answer:
if ans.match(rrset.name, rrset.rdclass, rrset.rdtype, 0, None):
if ans == rrset:
found = True

if found:
raise AssertionError("RRset incorrectly found in answer\n%s" %
"\n".join(([ans.to_text() for ans in msg.answer])))

def assertMatchingRRSIGInAnswer(self, msg, coveredRRset, keys=None):
"""Looks for coveredRRset in the answer section and if there is an RRSIG RRset
that covers that RRset. If keys is not None, this function will also try to
Expand Down
124 changes: 120 additions & 4 deletions regression-tests.auth-py/test_LuaRecords.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def log_message(self, format, *args):
def do_HEAD(self):
self._set_headers()

class TestLuaRecords(AuthTest):
class BaseLuaTest(AuthTest):
_config_template = """
geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
edns-subnet-processing=yes
Expand Down Expand Up @@ -102,6 +102,16 @@ class TestLuaRecords(AuthTest):
"return ifurlup('http://www.lua.org:8080/', "
"{{EUEips, USAips}}, settings) ")
usa-unreachable IN LUA A ( ";settings={{stringmatch='Programming in Lua', minimumFailures=2}} "
"USAips={{'{prefix}.103', '192.168.42.105'}}"
"return ifurlup('http://www.lua.org:8080/', "
"USAips, settings) ")
usa-slowcheck IN LUA A ( ";settings={{stringmatch='Programming in Lua', interval=8}} "
"USAips={{'{prefix}.103', '192.168.42.105'}}"
"return ifurlup('http://www.lua.org:8080/', "
"USAips, settings) ")
mix.ifurlup IN LUA A ("ifurlup('http://www.other.org:8080/ping.json', "
"{{ '192.168.42.101', '{prefix}.101' }}, "
"{{ stringmatch='pong' }}) ")
Expand Down Expand Up @@ -189,7 +199,7 @@ def HTTPResponder(cls, port):
@classmethod
def setUpClass(cls):

super(TestLuaRecords, cls).setUpClass()
super(BaseLuaTest, cls).setUpClass()

cls._web_rrsets = [dns.rrset.from_text('web1.example.org.', 0, dns.rdataclass.IN, 'A',
'{prefix}.101'.format(prefix=cls._PREFIX)),
Expand All @@ -199,6 +209,8 @@ def setUpClass(cls):
'{prefix}.103'.format(prefix=cls._PREFIX))
]

class TestLuaRecords(BaseLuaTest):

def testPickRandom(self):
"""
Basic pickrandom() test with a set of A records
Expand Down Expand Up @@ -440,7 +452,7 @@ def testIfurlup(self):
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
# the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
Expand Down Expand Up @@ -468,7 +480,7 @@ def testIfurlupMultiSet(self):
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# the timeout in the LUA health checker is 2 second, so we make sure to wait slightly longer here
# the timeout in the LUA health checker is 1 second, so we make sure to wait slightly longer here
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
Expand Down Expand Up @@ -1161,6 +1173,110 @@ class TestLuaRecordsNoWhiteSpace(TestLuaRecords):
def testWhitespace(self):
return TestLuaRecords.testWhitespace(self, False)

class TestLuaRecordsSlowTimeouts(BaseLuaTest):
# This configuration is similar to BaseLuaTest, but the health check
# interval is increased to 5 seconds.
_config_template = """
geoip-database-files=../modules/geoipbackend/regression-tests/GeoLiteCity.mmdb
edns-subnet-processing=yes
launch=bind geoip
any-to-tcp=no
enable-lua-records
lua-records-insert-whitespace=yes
lua-health-checks-interval=5
"""

def testIfurlupMinimumFailures(self):
"""
Simple ifurlup() test with minimumFailures option set.
"""
reachable = [
'{prefix}.103'.format(prefix=self._PREFIX)
]
unreachable = ['192.168.42.105']
ips = reachable + unreachable
all_rrs = []
reachable_rrs = []
unreachable_rrs = []
for ip in ips:
rr = dns.rrset.from_text('usa-unreachable.example.org.', 0, dns.rdataclass.IN, 'A', ip)
all_rrs.append(rr)
if ip in reachable:
reachable_rrs.append(rr)
else:
unreachable_rrs.append(rr)

query = dns.message.make_query('usa-unreachable.example.org', 'A')
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# The above request being sent at time T, the following events occur:
# T+00: results computed using backupSelector as no data available yet
# T+00: checker thread starts
# T+02: 192.168.42.105 found down, first time, still kept up
# T+05: checker thread wakes up, decides to skip 192.168.42.105 check,
# as its last update time was T+02, hence no check until T+07
# T+10: checker thread wakes up
# T+12: 192.168.42.105 found down, second time, finally marked down

# Due to minimumFailures set, there should be no error yet.
time.sleep(5)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# Wait for another check. At this point the checker thread should have
# reached the minimumFailures threshold and mark the unreachable IP
# as such.
time.sleep(8)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, reachable_rrs)
self.assertNoneRRsetInAnswer(res, unreachable_rrs)

def testIfurlupInterval(self):
"""
Simple ifurlup() test with interval option set.
"""
reachable = [
'{prefix}.103'.format(prefix=self._PREFIX)
]
unreachable = ['192.168.42.105']
ips = reachable + unreachable
all_rrs = []
reachable_rrs = []
unreachable_rrs = []
for ip in ips:
rr = dns.rrset.from_text('usa-slowcheck.example.org.', 0, dns.rdataclass.IN, 'A', ip)
all_rrs.append(rr)
if ip in reachable:
reachable_rrs.append(rr)
else:
unreachable_rrs.append(rr)

query = dns.message.make_query('usa-slowcheck.example.org', 'A')
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, all_rrs)

# the timeout in the LUA health checker is 5 second, but usa-slowcheck
# uses 8 seconds, which forces the thread to run every second (gcd
# of 5 and 8).
time.sleep(6)

res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
# due to minimumFailures set, there should be no error yet
self.assertAnyRRsetInAnswer(res, all_rrs)

# At this point the check should have fired.
time.sleep(3)
res = self.sendUDPQuery(query)
self.assertRcodeEqual(res, dns.rcode.NOERROR)
self.assertAnyRRsetInAnswer(res, reachable_rrs)
self.assertNoneRRsetInAnswer(res, unreachable_rrs)

if __name__ == '__main__':
unittest.main()
exit(0)

0 comments on commit f53b2fd

Please sign in to comment.