Skip to content

Commit

Permalink
implement the last missing docker services.
Browse files Browse the repository at this point in the history
  • Loading branch information
nicdumz authored and mergify[bot] committed Jan 24, 2025
1 parent 346d8c1 commit 572a1d7
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 5 deletions.
12 changes: 9 additions & 3 deletions nix/modules/nixos/services/docker/compose2nix.nix
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ lib.mkIf config.${namespace}.docker.enable {
names = [
"deadmansnitch_url"
"gandi_token_env"
"prometheus_password"
"prometheus_username"
"telegram_token"
"watchtower_env"
];
Expand All @@ -66,7 +68,7 @@ lib.mkIf config.${namespace}.docker.enable {
environment = env;
volumes = [
"/etc/localtime:/etc/localtime:ro"
"TODO/config/alertmanager:/etc/alertmanager:ro"
"${./config/alertmanager/alertmanager.yml}:/etc/alertmanager/alertmanager.yml:ro"
"${slow}/dockerstate/alertmanager:/alertmanager:rw"
# Those paths are expected in alertmanager.yml
"${secrets.telegram_token.path}:/run/secrets/telegram_token:ro"
Expand Down Expand Up @@ -98,7 +100,8 @@ lib.mkIf config.${namespace}.docker.enable {
environment = env;
volumes = [
"/etc/localtime:/etc/localtime:ro"
"TODO/config/prometheus:/config:ro"
# TODO: this could be a pkgs.writers.writeYAML
"${./config/blackbox/blackbox.yml}:/config/blackbox.yml:ro"
];
cmd = [ "--config.file=/config/blackbox.yml" ];
inherit user;
Expand Down Expand Up @@ -353,8 +356,11 @@ lib.mkIf config.${namespace}.docker.enable {
environment = env;
volumes = [
"/etc/localtime:/etc/localtime:ro"
"TODO/config/prometheus:/etc/prometheus:ro"
"${./config/prometheus/prometheus.yml}:/etc/prometheus/prometheus.yml:ro"
"${./config/prometheus/alerts.yml}:/etc/prometheus/alerts.yml:ro"
"${fast}/prometheus:/prometheus:rw"
"${secrets.prometheus_username.path}:/run/secrets/username:ro"
"${secrets.prometheus_password.path}:/run/secrets/password:ro"
];
cmd = [
"--config.file=/etc/prometheus/prometheus.yml"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Alertmanager routing and receivers.
# Every alert goes to the Telegram receiver by default; alerts matching
# `service = deadman` are instead forwarded to the dead-man-snitch webhook.
route:
  receiver: telegram
  repeat_interval: 4h
  group_by: [alertname]
  routes:
    - receiver: dead-man-snitch
      matchers:
        - service = deadman
      # Re-send every 10 minutes so the remote service keeps hearing from us.
      repeat_interval: 10m
receivers:
  # Default receiver sends a ping to a group chat.
  - name: telegram
    telegram_configs:
      - bot_token_file: /run/secrets/telegram_token
        chat_id: -797768186
        api_url: 'https://api.telegram.org'
        parse_mode: 'HTML'
  # We ping a URL every 10 minutes; if the service behind that URL stops
  # hearing from us, it emails us after 1h.
  - name: dead-man-snitch
    webhook_configs:
      # NOTE(review): the secret is declared as `deadmansnitch_url` in the Nix
      # module; confirm the container mount really targets this exact path
      # (note the spelling "deadmanssnitch").
      - url_file: /run/secrets/deadmanssnitch
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# blackbox_exporter modules.
# `http_2xx` is the module name requested by the Prometheus `blackbox-http`
# scrape job (`params.module`).
modules:
  http_2xx:
    prober: http
    http:
      # Probe over IPv4 by preference.
      preferred_ip_protocol: 'ip4'
80 changes: 80 additions & 0 deletions nix/modules/nixos/services/docker/config/prometheus/alerts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Prometheus alerting rules, loaded through `rule_files` in prometheus.yml.
# Many rules are adapted from:
# https://awesome-prometheus-alerts.grep.to/rules.html

groups:
  # Is each scraped service still there?
  - name: Presence
    rules:
      - alert: NodeExporterDown
        expr: up{job="node"} < 1
        for: 5m
      - alert: TraefikDown
        expr: up{job="traefik"} < 1
        for: 5m
      - alert: PrometheusAlertmanagerJobMissing
        expr: absent(up{job="alertmanager"})
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
          description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: CoreradDown
        expr: node_systemd_unit_state{name="corerad.service", state="active"} < 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Corerad daemon is not running, IPv6 is probably broken.
          description: "Corerad.service systemd is missing"
      - alert: CoreradScrapingBroken
        expr: up{job="corerad"} < 1
        for: 10m
  # Host-level resource alerts, based on node exporter metrics.
  - name: Host
    rules:
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 30 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 30% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > 75
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - name: Probing
    rules:
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Always-firing alert: `vector(1)` is never empty and `for: 0m` fires
      # immediately. The `service: deadman` label is what the Alertmanager
      # route matches on to forward it to the dead-man-snitch receiver.
      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: vector(1)
        for: 0m
        labels:
          severity: end2endtest
          service: deadman
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n LABELS = {{ $labels }}"
  - name: Misc
    rules:
      - alert: PrometheusTemplateTextExpansionFailures
        expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
77 changes: 77 additions & 0 deletions nix/modules/nixos/services/docker/config/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Prometheus server configuration: scrape jobs, alerting rules, the
# Alertmanager endpoint, and a filtered remote_write to Grafana Cloud.
global:
  scrape_interval: 15s  # By default, scrape targets every 15 seconds.
  external_labels:
    instance: jonsnow
# Scrape configurations: Prometheus itself plus the other monitored services.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']
  - job_name: alertmanager
    static_configs:
      - targets: ['alertmanager:9093']
  # blackbox job itself
  - job_name: blackbox
    static_configs:
      - targets:
          - blackbox:9115
  # To get metrics about the exporter’s targets
  - job_name: blackbox-http
    metrics_path: /probe
    params:
      module: [http_2xx]
      # NOTE(review): the relabel rules below overwrite `__param_target` for
      # every target, so this default looks unused; confirm before removing.
      target: [google.com]
    static_configs:
      - targets:
          - https://amazon.com
          - https://www.google.com
          - https://www.init7.net
    relabel_configs:
      # Pass the listed URL to the exporter as the `target` query parameter...
      - source_labels: [__address__]
        target_label: __param_target
      # ...keep that URL as the `instance` label...
      - source_labels: [__param_target]
        target_label: instance
      # ...and send the actual scrape to the blackbox exporter.
      - target_label: __address__
        replacement: blackbox:9115
  - job_name: blocky
    static_configs:
      - targets:
          - blocky:4000
  - job_name: corerad
    static_configs:
      - targets:
          - 'host.docker.internal:9430'
  # OK it works but it's a ton of data we don't use.
  # - job_name: 'docker'
  #   scrape_interval: 30s
  #   static_configs:
  #     - targets:
  #         - 'host.docker.internal:9323'
  - job_name: node
    static_configs:
      - targets:
          - 'node-exporter:9100'
  - job_name: traefik
    scrape_interval: 30s
    static_configs:
      - targets:
          - 'traefik:8080'
rule_files:
  - alerts.yml
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets: ['alertmanager:9093']
remote_write:
  - url: https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push
    basic_auth:
      username_file: /run/secrets/username
      password_file: /run/secrets/password
    write_relabel_configs:
      # Only forward the metric names listed below; everything else is dropped
      # before being sent.
      - source_labels: [__name__]
        action: keep
        # this monster comes from https://grafana.com/docs/grafana-cloud/billing-and-usage/control-prometheus-metrics-usage/usage-analysis-mimirtool/
        # Keep the list free of a trailing '|': an empty alternative adds nothing.
        regex: node_systemd_unit_state|corerad_advertiser_errors_total|corerad_advertiser_inconsistencies_total|corerad_advertiser_messages_received_total|corerad_advertiser_router_advertisements_total|corerad_build_info|corerad_monitor_default_route_expiration_timestamp_seconds|corerad_monitor_messages_received_total|corerad_monitor_prefix_preferred_expiration_timestamp_seconds|corerad_monitor_prefix_valid_expiration_timestamp_seconds|go_gc_duration_seconds_sum|go_memstats_alloc_bytes|go_memstats_alloc_bytes_total|go_memstats_buck_hash_sys_bytes|go_memstats_gc_sys_bytes|go_memstats_heap_alloc_bytes|go_memstats_heap_idle_bytes|go_memstats_heap_inuse_bytes|go_memstats_heap_released_bytes|go_memstats_heap_sys_bytes|go_memstats_mcache_inuse_bytes|go_memstats_mcache_sys_bytes|go_memstats_mspan_inuse_bytes|go_memstats_mspan_sys_bytes|go_memstats_next_gc_bytes|go_memstats_other_sys_bytes|go_memstats_stack_inuse_bytes|go_memstats_stack_sys_bytes|go_memstats_sys_bytes|http_request_duration_microseconds_count|net_conntrack_dialer_conn_failed_total|node_arp_entries|node_boot_time_seconds|node_context_switches_total|node_cooling_device_cur_state|node_cooling_device_max_state|node_cpu_seconds_total|node_disk_discard_time_seconds_total|node_disk_discards_completed_total|node_disk_discards_merged_total|node_disk_io_now|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_bytes_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_reads_merged_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_disk_writes_merged_total|node_disk_written_bytes_total|node_entropy_available_bits|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_device_error|node_filesystem_files|node_filesystem_files_free|node_filesystem_free_bytes|node_filesystem_readonly|node_filesystem_size_bytes|node_forks_total|node_hwmon_temp_celsius|node_hwmon_temp_crit_alarm_celsius|node_hwmon_temp_crit_celsius|node_hwmon_temp_crit_hyst_celsius|node_hwmon_temp_max_celsius|node_interrupts_total|node_intr_total|node_load1|node_load15|node_load5|node_memory_Active_anon_bytes|node_memory_Active_bytes|node_memory_Active_file_bytes|node_memory_AnonHugePages_bytes|node_memory_AnonPages_bytes|node_memory_Bounce_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_CommitLimit_bytes|node_memory_Committed_AS_bytes|node_memory_DirectMap1G_bytes|node_memory_DirectMap2M_bytes|node_memory_DirectMap4k_bytes|node_memory_Dirty_bytes|node_memory_HardwareCorrupted_bytes|node_memory_HugePages_Free|node_memory_HugePages_Rsvd|node_memory_HugePages_Surp|node_memory_HugePages_Total|node_memory_Hugepagesize_bytes|node_memory_Inactive_anon_bytes|node_memory_Inactive_bytes|node_memory_Inactive_file_bytes|node_memory_KernelStack_bytes|node_memory_Mapped_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_Mlocked_bytes|node_memory_NFS_Unstable_bytes|node_memory_PageTables_bytes|node_memory_Percpu_bytes|node_memory_SReclaimable_bytes|node_memory_SUnreclaim_bytes|node_memory_ShmemHugePages_bytes|node_memory_ShmemPmdMapped_bytes|node_memory_Shmem_bytes|node_memory_Slab_bytes|node_memory_SwapCached_bytes|node_memory_SwapFree_bytes|node_memory_SwapTotal_bytes|node_memory_Unevictable_bytes|node_memory_VmallocChunk_bytes|node_memory_VmallocTotal_bytes|node_memory_VmallocUsed_bytes|node_memory_WritebackTmp_bytes|node_memory_Writeback_bytes|node_netstat_Icmp_InErrors|node_netstat_Icmp_InMsgs|node_netstat_Icmp_OutMsgs|node_netstat_IpExt_InOctets|node_netstat_IpExt_OutOctets|node_netstat_Ip_Forwarding|node_netstat_TcpExt_ListenDrops|node_netstat_TcpExt_ListenOverflows|node_netstat_TcpExt_SyncookiesFailed|node_netstat_TcpExt_SyncookiesRecv|node_netstat_TcpExt_SyncookiesSent|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_ActiveOpens|node_netstat_Tcp_CurrEstab|node_netstat_Tcp_InErrs|node_netstat_Tcp_InSegs|node_netstat_Tcp_MaxConn|node_netstat_Tcp_OutRsts|node_netstat_Tcp_OutSegs|node_netstat_Tcp_PassiveOpens|node_netstat_Tcp_RetransSegs|node_netstat_UdpLite_InErrors|node_netstat_Udp_InDatagrams|node_netstat_Udp_InErrors|node_netstat_Udp_NoPorts|node_netstat_Udp_OutDatagrams|node_netstat_Udp_RcvbufErrors|node_netstat_Udp_SndbufErrors|node_network_carrier|node_network_mtu_bytes|node_network_receive_bytes_total|node_network_receive_compressed_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_fifo_total|node_network_receive_frame_total|node_network_receive_multicast_total|node_network_receive_packets_total|node_network_speed_bytes|node_network_transmit_bytes_total|node_network_transmit_carrier_total|node_network_transmit_colls_total|node_network_transmit_compressed_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_fifo_total|node_network_transmit_packets_total|node_network_transmit_queue_length|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_power_supply_online|node_processes_max_processes|node_processes_max_threads|node_processes_pids|node_processes_state|node_processes_threads|node_procs_blocked|node_procs_running|node_schedstat_running_seconds_total|node_schedstat_timeslices_total|node_schedstat_waiting_seconds_total|node_scrape_collector_duration_seconds|node_scrape_collector_success|node_sockstat_FRAG_inuse|node_sockstat_FRAG_memory|node_sockstat_RAW_inuse|node_sockstat_TCP_alloc|node_sockstat_TCP_inuse|node_sockstat_TCP_mem|node_sockstat_TCP_mem_bytes|node_sockstat_TCP_orphan|node_sockstat_TCP_tw|node_sockstat_UDPLITE_inuse|node_sockstat_UDP_inuse|node_sockstat_UDP_mem|node_sockstat_UDP_mem_bytes|node_sockstat_sockets_used|node_softnet_dropped_total|node_softnet_processed_total|node_softnet_times_squeezed_total|node_systemd_socket_accepted_connections_total|node_systemd_units|node_textfile_scrape_error|node_time_seconds|node_timex_estimated_error_seconds|node_timex_frequency_adjustment_ratio|node_timex_loop_time_constant|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_timex_tai_offset_seconds|node_timex_tick_seconds|node_uname_info|node_vmstat_oom_kill|node_vmstat_pgfault|node_vmstat_pgmajfault|node_vmstat_pgpgin|node_vmstat_pgpgout|node_vmstat_pswpin|node_vmstat_pswpout|probe_dns_lookup_time_seconds|probe_duration_seconds|probe_http_duration_seconds|probe_http_ssl|probe_http_status_code|probe_http_version|probe_ssl_earliest_cert_expiry|probe_success|process_cpu_seconds_total|process_max_fds|process_open_fds|process_resident_memory_max_bytes|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus_config_last_reload_success_timestamp_seconds|prometheus_config_last_reload_successful|prometheus_engine_query_duration_seconds_sum|prometheus_evaluator_duration_seconds_count|prometheus_evaluator_duration_seconds_sum|prometheus_evaluator_iterations_missed_total|prometheus_evaluator_iterations_skipped_total|prometheus_evaluator_iterations_total|prometheus_notifications_sent_total|prometheus_rule_evaluation_failures_total|prometheus_sd_azure_refresh_failures_total|prometheus_sd_consul_rpc_failures_total|prometheus_sd_dns_lookup_failures_total|prometheus_sd_ec2_refresh_failures_total|prometheus_sd_gce_refresh_failures_total|prometheus_sd_marathon_refresh_failures_total|prometheus_sd_openstack_refresh_failures_total|prometheus_sd_triton_refresh_failures_total|prometheus_target_interval_length_seconds|prometheus_target_interval_length_seconds_count|prometheus_target_scrape_pool_sync_total|prometheus_target_scrapes_exceeded_sample_limit_total|prometheus_target_scrapes_sample_duplicate_timestamp_total|prometheus_target_scrapes_sample_out_of_bounds_total|prometheus_target_scrapes_sample_out_of_order_total|prometheus_target_sync_length_seconds_sum|prometheus_treecache_zookeeper_failures_total|prometheus_tsdb_compactions_failed_total|prometheus_tsdb_head_chunks|prometheus_tsdb_head_samples_appended_total|prometheus_tsdb_head_series|prometheus_tsdb_head_series_created_total|prometheus_tsdb_head_series_not_found|prometheus_tsdb_head_series_removed_total|prometheus_tsdb_reloads_failures_total|scrape_duration_seconds|traefik_entrypoint_requests_total|traefik_service_request_duration_seconds_sum|traefik_service_requests_total|up
6 changes: 4 additions & 2 deletions secrets/jonsnow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ telegram_token: ENC[AES256_GCM,data:Wo7HEjZF51WqFluTYQ580MrTf/ZHYmuDBUersF5+kx+h
watchtower_env: ENC[AES256_GCM,data:mqfWFZe5Y3lC7WjR/BgwIISSobDy6CbP9TYk+sTMV/41JlEJZa05EkwcTjkz3qCq/MTVwMdhC5z/xRGGvC04pbKOS4mkFCZBGVBSXdmFtByRdxbMTiU9Be0tsWswl4nwN2PcacqmsOhIautad2RLBbVOByH75QVkFxiQcIIBk4nhxa8e,iv:BeQ84m+isBOgRjnnEKqTZe6ZQH8uygs6nkL4clBpRfM=,tag:TVtoc4vlIhbiroNVTPwBkw==,type:str]
gandi_token_env: ENC[AES256_GCM,data:oRLTQHUkdMwhwDH2esm4zrUL2wOy3zEQXyRpRW1I4Y5GovMbICaWLUmtS9hZ/CQ6nEH32zeqP99wdYtfcDNi8J+3iJWtOmiw/5Cb,iv:7XsyTK8nXPuoLkDuXZNCJakPSzRyvgFclZdQvwLNsx0=,tag:xpX0BwH1pmeWcaWF3lXdtg==,type:str]
deadmansnitch_url: ENC[AES256_GCM,data:VrsYT3wHyzlmO29dJouv9e54ebLaqB/kEsvfgA==,iv:0BqzmTGu4UhxC3Uvn+VozI5GjyYNVxbs1UdXCW1jPms=,tag:MySCfzN4y/lfOw3eTD3V9w==,type:str]
prometheus_username: ENC[AES256_GCM,data:gykidMjk,iv:ugdljNUN5hVa0T0qbHXqCvtccfYIojzbpbNRM5kqDuA=,tag:+xVs5OiNduHgAD5/IylukA==,type:int]
prometheus_password: ENC[AES256_GCM,data:juOjbB0RcO9/mFwiUCv3YWnjwvhMvky1YpXwelqHSokELF36G9VWenC8cvglbGCoE78fkeQGNH9G1ycPcBeFw07Xv2xb9U3T45XbLUcvPJFLrfEsw3+TXLY1prINVbAPdSm4vmIRWOI3Y8HeZ4euXWBlzBc=,iv:4RFBEruw1PI/KFYW7Ucy+1XqoiWe5c02I00uhAaaHe8=,tag:xRJbsbzP/A/HxDNKDU6Z4Q==,type:str]
sops:
kms: []
gcp_kms: []
Expand Down Expand Up @@ -53,8 +55,8 @@ sops:
bURRWmlmYmlOQm45WCt3eHN6OEpCMVUKileiXL5hErgoD+InkTZDA+7FkaG6NV03
cGGbi2u2eWbLA6ZsDD9B2cLH07yz447/6sSnPadZH4VRQQf4qVVM0A==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2025-01-22T18:43:51Z"
mac: ENC[AES256_GCM,data:ZkaOKa9aCZHripnRjvQgXQ3Rt3B2ULoAWCLmHMaOso66cJ4IPzeZD3wxFjCqhUVSKrg9HDJGMenwluO0+DHj/DrdggoKlfwwdGJb+qjxgciUitMrzAuE4l6RenDkBB4NraAR8KQTT4AOYk8nuR0MlbEbErm7Z1T7vAjYrxRWp8c=,iv:aDVJe68g76U06XctCOc/0Xa/rMNiOarDRA1FDWFBIwA=,tag:vWNTMbt4EfREPcGMz3Bhvg==,type:str]
lastmodified: "2025-01-24T19:41:57Z"
mac: ENC[AES256_GCM,data:68p3a/D4kskxK0mdJ++RW+DW7QZ0WbqJf4ibFIvYPbM4bv7Fn7rXsZOTcUVd3hLLnoE2VuHH2iZbocRC55U3sTCj84ismbGwmd0rmuFuYtrnahiKfPRi6+hXX8MK13d6pNoDZfNCyz2PdbadlgztyPWKD34WtLD8WNlkfcu0s6k=,iv:2lWyg/si/RZEZkwba3F5tYN1E7akvAJYWJ+1uNa5HKo=,tag:dt+Khv3VzK8L4q/7iwOXTg==,type:str]
pgp: []
unencrypted_suffix: _unencrypted
version: 3.9.2

0 comments on commit 572a1d7

Please sign in to comment.