Skip to content

Commit 6910ce9

Browse files
authored
Merge pull request #440 from yungwine/stats
add new node metric stats
2 parents 6431ec3 + a642374 commit 6910ce9

File tree

8 files changed

+294
-14
lines changed

8 files changed

+294
-14
lines changed

modules/alert_bot.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ def init_alerts():
116116
"Found proposals with hashes `{hashes}` that have significant amount of votes, but current validator didn't vote for them. Please check @tonstatus for more details.",
117117
VALIDATION_PERIOD
118118
),
119+
"initial_sync_completed": Alert(
120+
"info",
121+
"Initial sync has been completed (info alert with no sound)",
122+
"Node initial sync has been completed",
123+
0
124+
)
119125
}
120126

121127

@@ -134,6 +140,7 @@ def __init__(self, ton, local, *args, **kwargs):
134140
self.token = None
135141
self.chat_id = None
136142
self.last_db_check = 0
143+
self.initial_sync = None
137144

138145
def send_message(self, text: str, silent: bool = False, disable_web_page_preview: bool = False):
139146
if self.token is None:
@@ -207,6 +214,7 @@ def init(self):
207214
self.wallet = self.ton.GetValidatorWallet().addrB64
208215
self.ip = self.ton.get_node_ip()
209216
self.set_global_vars()
217+
self.initial_sync = self.ton.in_initial_sync()
210218
init_alerts()
211219
self.inited = True
212220

@@ -334,12 +342,12 @@ def check_efficiency(self):
334342

335343
def check_validator_working(self):
336344
validator_status = self.ton.GetValidatorStatus()
337-
if not validator_status.is_working:
345+
if not self.initial_sync and not validator_status.is_working:
338346
self.send_alert("service_down")
339347

340348
def check_sync(self):
341349
validator_status = self.ton.GetValidatorStatus()
342-
if validator_status.is_working and validator_status.out_of_sync >= 20:
350+
if not self.initial_sync and validator_status.is_working and validator_status.out_of_sync >= 20:
343351
self.send_alert("out_of_sync", sync=validator_status.out_of_sync)
344352

345353
def check_zero_blocks_created(self):
@@ -435,6 +443,13 @@ def check_voting(self):
435443
if need_to_vote:
436444
self.send_alert("voting", hashes=' '.join(need_to_vote))
437445

446+
def check_initial_sync(self):
447+
if not self.initial_sync:
448+
return
449+
if not self.ton.in_initial_sync():
450+
self.initial_sync = False
451+
self.send_alert("initial_sync_completed")
452+
438453
def check_status(self):
439454
if not self.ton.using_alert_bot():
440455
return
@@ -453,6 +468,7 @@ def check_status(self):
453468
self.local.try_function(self.check_stake_sent)
454469
self.local.try_function(self.check_stake_returned)
455470
self.local.try_function(self.check_voting)
471+
self.local.try_function(self.check_initial_sync)
456472

457473
def add_console_commands(self, console):
458474
console.AddItem("enable_alert", self.enable_alert, self.local.translate("enable_alert_cmd"))

modules/prometheus.py

+36-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ def to_format(self, value):
2626
'stake': Metric('validator_stake', 'Validator stake', 'gauge'),
2727
'celldb_gc_block': Metric('validator_celldb_gc_block', 'Celldb GC block latency', 'gauge'),
2828
'celldb_gc_state': Metric('validator_celldb_gc_state', 'Celldb GC queue size', 'gauge'),
29+
'collated_master_ok': Metric('validator_blocks_collated_master_ok', 'Number of masterchain blocks successfully collated', 'gauge'),
30+
'collated_master_err': Metric('validator_blocks_collated_master_err', 'Number of masterchain blocks failed to collate', 'gauge'),
31+
'collated_shard_ok': Metric('validator_blocks_collated_shard_ok', 'Number of shardchain blocks successfully collated', 'gauge'),
32+
'collated_shard_err': Metric('validator_blocks_collated_shard_err', 'Number of shardchain blocks failed to collate', 'gauge'),
33+
'validated_master_ok': Metric('validator_blocks_validated_master_ok', 'Number of masterchain blocks successfully validated', 'gauge'),
34+
'validated_master_err': Metric('validator_blocks_validated_master_err', 'Number of masterchain blocks failed to validate', 'gauge'),
35+
'validated_shard_ok': Metric('validator_blocks_validated_shard_ok', 'Number of shardchain blocks successfully validated', 'gauge'),
36+
'validated_shard_err': Metric('validator_blocks_validated_shard_err', 'Number of shardchain blocks failed to validate', 'gauge'),
37+
'validator_groups_master': Metric('validator_active_groups_master', 'Number of masterchain validation groups validator participates in', 'gauge'),
38+
'validator_groups_shard': Metric('validator_active_groups_shard', 'Number of shardchain validation groups validator participates in', 'gauge'),
39+
'ls_queries_ok': Metric('validator_ls_queries_ok', 'Number of Liteserver successful queries', 'gauge'),
40+
'ls_queries_err': Metric('validator_ls_queries_err', 'Number of Liteserver failed queries', 'gauge'),
2941
}
3042

3143

@@ -44,7 +56,7 @@ def get_validator_status_metrics(self, result: list):
4456
result.append(METRICS['master_out_of_sync'].to_format(status.masterchain_out_of_sync))
4557
if status.shardchain_out_of_sync is not None:
4658
result.append(METRICS['shard_out_of_sync'].to_format(status.shardchain_out_of_sync))
47-
if status.masterchain_out_of_ser is not None and status.stateserializermasterchainseqno != 0:
59+
if status.stateserializerenabled and status.masterchain_out_of_ser is not None and status.stateserializermasterchainseqno != 0:
4860
result.append(METRICS['out_of_ser'].to_format(status.masterchain_out_of_ser))
4961
if status.masterchainblock is not None and status.gcmasterchainblock is not None:
5062
result.append(METRICS['celldb_gc_block'].to_format(status.masterchainblock - status.gcmasterchainblock))
@@ -53,6 +65,9 @@ def get_validator_status_metrics(self, result: list):
5365
result.append(METRICS['celldb_gc_state'].to_format(status.gcmasterchainblock - status.last_deleted_mc_state))
5466
else:
5567
result.append(METRICS['celldb_gc_state'].to_format(-1))
68+
if status.validator_groups_master is not None:
69+
result.append(METRICS['validator_groups_master'].to_format(status.validator_groups_master))
70+
result.append(METRICS['validator_groups_shard'].to_format(status.validator_groups_shard))
5671
result.append(METRICS['vc_up'].to_format(int(is_working)))
5772

5873
def get_validator_validation_metrics(self, result: list):
@@ -67,6 +82,25 @@ def get_validator_validation_metrics(self, result: list):
6782
if stake:
6883
result.append(METRICS['stake'].to_format(round(stake, 2)))
6984

85+
def get_node_stats_metrics(self, result: list):
86+
stats = self.ton.get_node_statistics()
87+
if stats and 'ls_queries' in stats:
88+
if stats['ls_queries']['time'] < 50:
89+
self.local.add_log(f'Liteserver queries time is too low: {stats}')
90+
return
91+
result.append(METRICS['ls_queries_ok'].to_format(stats['ls_queries']['ok']))
92+
result.append(METRICS['ls_queries_err'].to_format(stats['ls_queries']['error']))
93+
if stats and 'collated' in stats:
94+
result.append(METRICS['collated_master_ok'].to_format(stats['collated']['master']['ok']))
95+
result.append(METRICS['collated_master_err'].to_format(stats['collated']['master']['error']))
96+
result.append(METRICS['collated_shard_ok'].to_format(stats['collated']['shard']['ok']))
97+
result.append(METRICS['collated_shard_err'].to_format(stats['collated']['shard']['error']))
98+
if stats and 'validated' in stats:
99+
result.append(METRICS['validated_master_ok'].to_format(stats['validated']['master']['ok']))
100+
result.append(METRICS['validated_master_err'].to_format(stats['validated']['master']['error']))
101+
result.append(METRICS['validated_shard_ok'].to_format(stats['validated']['shard']['ok']))
102+
result.append(METRICS['validated_shard_err'].to_format(stats['validated']['shard']['error']))
103+
70104
def push_metrics(self):
71105
if not self.ton.using_prometheus():
72106
return
@@ -77,6 +111,7 @@ def push_metrics(self):
77111
metrics = []
78112
self.local.try_function(self.get_validator_status_metrics, args=[metrics])
79113
self.local.try_function(self.get_validator_validation_metrics, args=[metrics])
114+
self.local.try_function(self.get_node_stats_metrics, args=[metrics])
80115
requests.post(url, data='\n'.join(metrics).encode())
81116

82117
def add_console_commands(self, console):

mytoncore/functions.py

+74
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,64 @@ def CalculateNetworkStatistics(zerodata, data):
286286
# end define
287287

288288

289+
def save_node_statistics(local, ton):
290+
status = ton.GetValidatorStatus(no_cache=True)
291+
if status.unixtime is None:
292+
return
293+
data = {'timestamp': status.unixtime}
294+
295+
def get_ok_error(value: str):
296+
ok, error = value.split()
297+
return int(ok.split(':')[1]), int(error.split(':')[1])
298+
299+
if 'total.collated_blocks.master' in status:
300+
master_ok, master_error = get_ok_error(status['total.collated_blocks.master'])
301+
shard_ok, shard_error = get_ok_error(status['total.collated_blocks.shard'])
302+
data['collated_blocks'] = {
303+
'master': {'ok': master_ok, 'error': master_error},
304+
'shard': {'ok': shard_ok, 'error': shard_error},
305+
}
306+
if 'total.validated_blocks.master' in status:
307+
master_ok, master_error = get_ok_error(status['total.validated_blocks.master'])
308+
shard_ok, shard_error = get_ok_error(status['total.validated_blocks.shard'])
309+
data['validated_blocks'] = {
310+
'master': {'ok': master_ok, 'error': master_error},
311+
'shard': {'ok': shard_ok, 'error': shard_error},
312+
}
313+
if 'total.ext_msg_check' in status:
314+
ok, error = get_ok_error(status['total.ext_msg_check'])
315+
data['ext_msg_check'] = {'ok': ok, 'error': error}
316+
if 'total.ls_queries_ok' in status and 'total.ls_queries_error' in status:
317+
data['ls_queries'] = {}
318+
for k in status['total.ls_queries_ok'].split():
319+
if k.startswith('TOTAL'):
320+
data['ls_queries']['ok'] = int(k.split(':')[1])
321+
for k in status['total.ls_queries_error'].split():
322+
if k.startswith('TOTAL'):
323+
data['ls_queries']['error'] = int(k.split(':')[1])
324+
statistics = local.db.get("statistics", dict())
325+
326+
if time.time() - int(status.start_time) <= 60: # was node restart <60 sec ago, resetting node statistics
327+
statistics['node'] = []
328+
329+
# statistics['node'] = [stats_from_election_id, stats_from_prev_min, stats_now]
330+
331+
election_id = ton.GetConfig34()['startWorkTime']
332+
if 'node' not in statistics or len(statistics['node']) == 0:
333+
statistics['node'] = [None, data]
334+
elif len(statistics['node']) < 3:
335+
statistics['node'].append(data)
336+
if len(statistics['node']) == 3:
337+
if statistics['node'][0] is None:
338+
if 0 < data['timestamp'] - election_id < 90:
339+
statistics['node'][0] = data
340+
elif statistics['node'][0]['timestamp'] < election_id:
341+
statistics['node'][0] = data
342+
statistics['node'] = statistics.get('node', []) + [data]
343+
statistics['node'].pop(1)
344+
local.db["statistics"] = statistics
345+
346+
289347
def ReadTransData(local, scanner):
290348
transData = local.buffer.transData
291349
SetToTimeData(transData, scanner.transNum)
@@ -544,6 +602,17 @@ def ScanLiteServers(local, ton):
544602
# end define
545603

546604

605+
def check_initial_sync(local, ton):
606+
if not ton.in_initial_sync():
607+
return
608+
validator_status = ton.GetValidatorStatus()
609+
if validator_status.initial_sync:
610+
return
611+
if validator_status.out_of_sync < 20:
612+
ton.set_initial_sync_off()
613+
return
614+
615+
547616
def General(local):
548617
local.add_log("start General function", "debug")
549618
ton = MyTonCore(local)
@@ -570,6 +639,8 @@ def General(local):
570639

571640
local.start_cycle(ScanLiteServers, sec=60, args=(local, ton,))
572641

642+
local.start_cycle(save_node_statistics, sec=60, args=(local, ton, ))
643+
573644
from modules.custom_overlays import CustomOverlayModule
574645
local.start_cycle(CustomOverlayModule(ton, local).custom_overlays, sec=60, args=())
575646

@@ -579,6 +650,9 @@ def General(local):
579650
from modules.prometheus import PrometheusModule
580651
local.start_cycle(PrometheusModule(ton, local).push_metrics, sec=30, args=())
581652

653+
if ton.in_initial_sync():
654+
local.start_cycle(check_initial_sync, sec=120, args=(local, ton))
655+
582656
thr_sleep()
583657
# end define
584658

mytoncore/mytoncore.py

+68-2
Original file line numberDiff line numberDiff line change
@@ -776,16 +776,24 @@ def GetShardsNumber(self, block=None):
776776
return shardsNum
777777
#end define
778778

779-
def GetValidatorStatus(self):
779+
def parse_stats_from_vc(self, output: str, result: dict):
780+
for line in output.split('\n'):
781+
if len(line.split('\t\t\t')) == 2:
782+
name, value = line.split('\t\t\t') # https://github.com/ton-blockchain/ton/blob/master/validator-engine-console/validator-engine-console-query.cpp#L648
783+
if name not in result:
784+
result[name] = value
785+
786+
def GetValidatorStatus(self, no_cache=False):
780787
# Get buffer
781788
bname = "validator_status"
782789
buff = self.GetFunctionBuffer(bname)
783-
if buff:
790+
if buff and not no_cache:
784791
return buff
785792
#end if
786793

787794
self.local.add_log("start GetValidatorStatus function", "debug")
788795
status = Dict()
796+
result = None
789797
try:
790798
# Parse
791799
status.is_working = True
@@ -809,10 +817,19 @@ def GetValidatorStatus(self):
809817
status.out_of_sync = status.masterchain_out_of_sync if status.masterchain_out_of_sync > status.shardchain_out_of_sync else status.shardchain_out_of_sync
810818
status.out_of_ser = status.masterchain_out_of_ser
811819
status.last_deleted_mc_state = int(parse(result, "last_deleted_mc_state", '\n'))
820+
status.stateserializerenabled = parse(result, "stateserializerenabled", '\n') == "true"
821+
self.local.try_function(self.parse_stats_from_vc, args=[result, status])
822+
if 'active_validator_groups' in status:
823+
groups = status.active_validator_groups.split() # master:1 shard:2
824+
status.validator_groups_master = int(groups[0].split(':')[1])
825+
status.validator_groups_shard = int(groups[1].split(':')[1])
812826
except Exception as ex:
813827
self.local.add_log(f"GetValidatorStatus warning: {ex}", "warning")
814828
status.is_working = False
829+
if result is not None:
830+
self.local.try_function(self.parse_stats_from_vc, args=[result, status])
815831
#end try
832+
status.initial_sync = status.get("process.initial_sync")
816833

817834
# old vars
818835
status.outOfSync = status.out_of_sync
@@ -3037,6 +3054,48 @@ def GetStatistics(self, name, statistics=None):
30373054
return data
30383055
#end define
30393056

3057+
def get_node_statistics(self):
3058+
"""
3059+
:return: stats for collated/validated blocks since round beggining and stats for ls queries for the last minute
3060+
"""
3061+
stats = self.local.db.get('statistics', {}).get('node')
3062+
result = {}
3063+
if stats is not None and len(stats) == 3 and stats[0] is not None:
3064+
for k in ['master', 'shard']:
3065+
result = {
3066+
'collated': {
3067+
'ok': 0,
3068+
'error': 0,
3069+
},
3070+
'validated': {
3071+
'ok': 0,
3072+
'error': 0,
3073+
}
3074+
}
3075+
collated_ok = stats[2]['collated_blocks'][k]['ok'] - stats[0]['collated_blocks'][k]['ok']
3076+
collated_error = stats[2]['collated_blocks'][k]['error'] - stats[0]['collated_blocks'][k]['error']
3077+
validated_ok = stats[2]['validated_blocks'][k]['ok'] - stats[0]['validated_blocks'][k]['ok']
3078+
validated_error = stats[2]['validated_blocks'][k]['error'] - stats[0]['validated_blocks'][k]['error']
3079+
result['collated'][k] = {
3080+
'ok': collated_ok,
3081+
'error': collated_error,
3082+
}
3083+
result['validated'][k] = {
3084+
'ok': validated_ok,
3085+
'error': validated_error,
3086+
}
3087+
result['collated']['ok'] += collated_ok
3088+
result['collated']['error'] += collated_error
3089+
result['validated']['ok'] += validated_ok
3090+
result['validated']['error'] += validated_error
3091+
if stats is not None and len(stats) >= 2 and stats[0] is not None:
3092+
result['ls_queries'] = {
3093+
'ok': stats[-1]['ls_queries']['ok'] - stats[-2]['ls_queries']['ok'],
3094+
'error': stats[-1]['ls_queries']['error'] - stats[-2]['ls_queries']['error'],
3095+
'time': stats[-1].get('timestamp', 0) - stats[-2].get('timestamp', 0),
3096+
}
3097+
return result
3098+
30403099
def GetSettings(self, name):
30413100
# self.local.load_db()
30423101
result = self.local.db.get(name)
@@ -3140,6 +3199,13 @@ def using_alert_bot(self):
31403199
def using_prometheus(self):
31413200
return self.get_mode_value('prometheus')
31423201

3202+
def in_initial_sync(self):
3203+
return self.local.db.get('initialSync', False)
3204+
3205+
def set_initial_sync_off(self):
3206+
self.local.db.pop('initialSync', None)
3207+
self.local.save()
3208+
31433209
def Tlb2Json(self, text):
31443210
# Заменить скобки
31453211
start = 0

0 commit comments

Comments
 (0)