From 67f8f1689ad9e2fa56c5300568d06e3c9d5d6084 Mon Sep 17 00:00:00 2001 From: SebastianStork Date: Tue, 10 Mar 2026 20:15:39 +0100 Subject: [PATCH 1/3] prometheus: Enhance alert rules --- modules/nixos/services/prometheus.nix | 108 +++++++++++++++----------- 1 file changed, 62 insertions(+), 46 deletions(-) diff --git a/modules/nixos/services/prometheus.nix b/modules/nixos/services/prometheus.nix index ef20489..22ee4cc 100644 --- a/modules/nixos/services/prometheus.nix +++ b/modules/nixos/services/prometheus.nix @@ -58,25 +58,25 @@ in scrapeConfigs = [ { job_name = "prometheus"; - static_configs = lib.singleton { - targets = - allHosts - |> lib.attrValues - |> lib.map (host: host.config.custom.services.prometheus) - |> lib.filter (prometheus: prometheus.enable) - |> lib.map (prometheus: prometheus.domain); - }; + static_configs = + allHosts + |> lib.attrValues + |> lib.filter (host: host.config.custom.services.prometheus.enable) + |> lib.map (host: { + targets = lib.singleton host.config.custom.services.prometheus.domain; + labels.instance = host.config.networking.hostName; + }); } { job_name = "alertmanager"; - static_configs = lib.singleton { - targets = - allHosts - |> lib.attrValues - |> lib.map (host: host.config.custom.services.alertmanager) - |> lib.filter (alertmanager: alertmanager.enable) - |> lib.map (alertmanager: alertmanager.domain); - }; + static_configs = + allHosts + |> lib.attrValues + |> lib.filter (host: host.config.custom.services.alertmanager.enable) + |> lib.map (host: { + targets = lib.singleton host.config.custom.services.alertmanager.domain; + labels.instance = host.config.networking.hostName; + }); } ]; @@ -84,39 +84,55 @@ in { groups = lib.singleton { name = "Rules"; - rules = [ - { - alert = "InstanceDown"; - expr = "up == 0"; - for = "2m"; - labels.severity = "critical"; - annotations = { - summary = "{{ $labels.instance }} is DOWN"; - description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."; - }; - } - { - alert = "CominDeploymentFailed"; - expr = ''comin_deployment_info{status!="done"}''; - annotations = { - summary = "{{ $labels.instance }} deployment failed"; - description = "The deployment of {{ $labels.instance }} with comin is failing."; - }; - } - { - alert = "CominDeploymentCommitMismatch"; - expr = "count(count by (commit_id) (comin_deployment_info)) > 1"; - for = "10m"; - annotations = { - summary = "Hosts are running different commits"; - description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations."; - }; - } - ]; + rules = + ( + allHosts + |> lib.attrValues + |> lib.filter (host: host.config.custom.services.alloy.enable) + |> lib.filter (host: host.config.custom.networking.overlay.role == "server") + |> lib.map (host: host.config.networking.hostName) + |> lib.map (hostName: { + alert = "InstanceDown"; + expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])''; + labels.severity = "critical"; + annotations = { + summary = "${hostName} is DOWN"; + description = "${hostName} has not reported any metrics for more than 2 minutes."; + }; + }) + ) + ++ [ + { + alert = "ServiceDown"; + expr = ''up{job=~"prometheus|alertmanager"} == 0''; + for = "2m"; + annotations = { + summary = "{{ $labels.job }} on {{ $labels.instance }} is DOWN"; + description = "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."; + }; + } + { + alert = "CominDeploymentFailed"; + expr = ''comin_deployment_info{status!="done"}''; + annotations = { + summary = "{{ $labels.instance }} deployment failed"; + description = "The deployment of {{ $labels.instance }} with comin is failing."; + }; + } + { + alert = "CominDeploymentCommitMismatch"; + expr = "count(count by (commit_id) (comin_deployment_info)) > 1"; + for = "10m"; + annotations = { + summary = "Hosts are running different commits"; + description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations."; + }; + } + ]; }; } |> lib.strings.toJSON - |> pkgs.writeText "prometheus-instance-down-rule" + |> pkgs.writeText "prometheus-rules" |> toString |> lib.singleton; }; From 2c8ecb9c7b0513e8735a16c59cb55b1c8925e6e2 Mon Sep 17 00:00:00 2001 From: SebastianStork Date: Tue, 10 Mar 2026 20:15:51 +0100 Subject: [PATCH 2/3] alertmanager: Add inhibit rules for InstanceDown alerts --- modules/nixos/services/alertmanager.nix | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modules/nixos/services/alertmanager.nix b/modules/nixos/services/alertmanager.nix index 88bb9b7..f483fae 100644 --- a/modules/nixos/services/alertmanager.nix +++ b/modules/nixos/services/alertmanager.nix @@ -63,6 +63,14 @@ in name = "ntfy"; webhook_configs = lib.singleton { url = "http://localhost:${toString cfg.ntfyBridgePort}/hook"; }; }; + inhibit_rules = lib.singleton { + source_matchers = [ + ''alertname="InstanceDown"'' + ''job="node"'' + ]; + target_matchers = lib.singleton ''alertname!="InstanceDown"''; + equal = [ "instance" ]; + }; }; }; From f2258ac79c1e1ca76640b1acdbe9a92cf3f3092b Mon Sep 17 00:00:00 2001 From: SebastianStork Date: Tue, 10 Mar 2026 20:43:06 +0100 Subject: [PATCH 3/3] nameservers/public: Fix by using the public hostname for the primary ns --- modules/nixos/services/nameservers/public.nix | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/nixos/services/nameservers/public.nix b/modules/nixos/services/nameservers/public.nix index fee45f1..0ad08a6 100644 --- a/modules/nixos/services/nameservers/public.nix +++ b/modules/nixos/services/nameservers/public.nix @@ -46,7 +46,12 @@ let in inputs.dns.lib.toString zone { SOA = { - nameServer = "${netCfg.hostName}.${zone}."; + nameServer = + nsRecords + |> lib.map (record: record.name) + |> lib.naturalSort + |> lib.head + |> (hostName: "${hostName}.${zone}."); adminEmail = "hostmaster@sstork.dev"; serial = 1; };