prometheus: Enhance alert rules

This commit is contained in:
SebastianStork 2026-03-10 20:15:39 +01:00
parent 8cf724fb97
commit 67f8f1689a
Signed by: SebastianStork
SSH key fingerprint: SHA256:tRrGdjYOwgHxpSc/wTOZQZEjxcb15P0tyXRsbAfd+2Q

View file

@ -58,25 +58,25 @@ in
scrapeConfigs = [ scrapeConfigs = [
{ {
job_name = "prometheus"; job_name = "prometheus";
static_configs = lib.singleton { static_configs =
targets = allHosts
allHosts |> lib.attrValues
|> lib.attrValues |> lib.filter (host: host.config.custom.services.prometheus.enable)
|> lib.map (host: host.config.custom.services.prometheus) |> lib.map (host: {
|> lib.filter (prometheus: prometheus.enable) targets = lib.singleton host.config.custom.services.prometheus.domain;
|> lib.map (prometheus: prometheus.domain); labels.instance = host.config.networking.hostName;
}; });
} }
{ {
job_name = "alertmanager"; job_name = "alertmanager";
static_configs = lib.singleton { static_configs =
targets = allHosts
allHosts |> lib.attrValues
|> lib.attrValues |> lib.filter (host: host.config.custom.services.alertmanager.enable)
|> lib.map (host: host.config.custom.services.alertmanager) |> lib.map (host: {
|> lib.filter (alertmanager: alertmanager.enable) targets = lib.singleton host.config.custom.services.alertmanager.domain;
|> lib.map (alertmanager: alertmanager.domain); labels.instance = host.config.networking.hostName;
}; });
} }
]; ];
@ -84,39 +84,55 @@ in
{ {
groups = lib.singleton { groups = lib.singleton {
name = "Rules"; name = "Rules";
rules = [ rules =
{ (
alert = "InstanceDown"; allHosts
expr = "up == 0"; |> lib.attrValues
for = "2m"; |> lib.filter (host: host.config.custom.services.alloy.enable)
labels.severity = "critical"; |> lib.filter (host: host.config.custom.networking.overlay.role == "server")
annotations = { |> lib.map (host: host.config.networking.hostName)
summary = "{{ $labels.instance }} is DOWN"; |> lib.map (hostName: {
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."; alert = "InstanceDown";
}; expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])'';
} labels.severity = "critical";
{ annotations = {
alert = "CominDeploymentFailed"; summary = "${hostName} is DOWN";
expr = ''comin_deployment_info{status!="done"}''; description = "${hostName} has not reported any metrics for more than 2 minutes.";
annotations = { };
summary = "{{ $labels.instance }} deployment failed"; })
description = "The deployment of {{ $labels.instance }} with comin is failing."; )
}; ++ [
} {
{ alert = "ServiceDown";
alert = "CominDeploymentCommitMismatch"; expr = ''up{job=~"prometheus|alertmanager"} == 0'';
expr = "count(count by (commit_id) (comin_deployment_info)) > 1"; for = "2m";
for = "10m"; annotations = {
annotations = { summary = "{{ $labels.job }} on {{ $labels.instance }} is DOWN";
summary = "Hosts are running different commits"; description = "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes.";
description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations."; };
}; }
} {
]; alert = "CominDeploymentFailed";
expr = ''comin_deployment_info{status!="done"}'';
annotations = {
summary = "{{ $labels.instance }} deployment failed";
description = "The deployment of {{ $labels.instance }} with comin is failing.";
};
}
{
alert = "CominDeploymentCommitMismatch";
expr = "count(count by (commit_id) (comin_deployment_info)) > 1";
for = "10m";
annotations = {
summary = "Hosts are running different commits";
description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations.";
};
}
];
}; };
} }
|> lib.strings.toJSON |> lib.strings.toJSON
|> pkgs.writeText "prometheus-instance-down-rule" |> pkgs.writeText "prometheus-rules"
|> toString |> toString
|> lib.singleton; |> lib.singleton;
}; };