prometheus: Enhance alert rules

This commit is contained in:
SebastianStork 2026-03-10 20:15:39 +01:00
parent 8cf724fb97
commit 67f8f1689a
Signed by: SebastianStork
SSH key fingerprint: SHA256:tRrGdjYOwgHxpSc/wTOZQZEjxcb15P0tyXRsbAfd+2Q

View file

@ -58,25 +58,25 @@ in
# Scrape jobs for the "prometheus" and "alertmanager" services, with targets
# derived from the per-host configuration found in `allHosts`.
# NOTE(review): this span is a rendered diff whose +/- markers were lost, so
# the pre-change and post-change forms of each job appear interleaved below.
# Confirm against the real file before relying on any single line.
scrapeConfigs = [
{
job_name = "prometheus";
# NOTE(review): apparently the pre-change form — a single static_config whose
# `targets` list holds the domain of every host with prometheus enabled.
static_configs = lib.singleton {
targets =
# NOTE(review): apparently the post-change form — `static_configs` is the list
# produced by the pipeline itself, one entry per host.
static_configs =
allHosts
|> lib.attrValues
# Pre-change pipeline (presumably removed): project each host to its prometheus
# service config, keep the enabled ones, and collect their domains.
|> lib.map (host: host.config.custom.services.prometheus)
|> lib.filter (prometheus: prometheus.enable)
|> lib.map (prometheus: prometheus.domain);
};
# Post-change pipeline (presumably added): keep hosts with prometheus enabled and
# emit one static_config each, so the `instance` label is the host name while
# the single target is that host's prometheus domain.
|> lib.filter (host: host.config.custom.services.prometheus.enable)
|> lib.map (host: {
targets = lib.singleton host.config.custom.services.prometheus.domain;
labels.instance = host.config.networking.hostName;
});
}
{
job_name = "alertmanager";
# Same interleaving as the "prometheus" job above, applied to alertmanager.
# NOTE(review): apparently the pre-change form — one static_config listing all
# alertmanager domains.
static_configs = lib.singleton {
targets =
# NOTE(review): apparently the post-change form — one static_config per host.
static_configs =
allHosts
|> lib.attrValues
# Pre-change pipeline (presumably removed): collect the domains of all enabled
# alertmanager service configs.
|> lib.map (host: host.config.custom.services.alertmanager)
|> lib.filter (alertmanager: alertmanager.enable)
|> lib.map (alertmanager: alertmanager.domain);
};
# Post-change pipeline (presumably added): one static_config per host with
# alertmanager enabled, labelled with the host name as `instance`.
|> lib.filter (host: host.config.custom.services.alertmanager.enable)
|> lib.map (host: {
targets = lib.singleton host.config.custom.services.alertmanager.domain;
labels.instance = host.config.networking.hostName;
});
}
];
@ -84,15 +84,31 @@ in
{
groups = lib.singleton {
name = "Rules";
rules = [
{
rules =
(
allHosts
|> lib.attrValues
|> lib.filter (host: host.config.custom.services.alloy.enable)
|> lib.filter (host: host.config.custom.networking.overlay.role == "server")
|> lib.map (host: host.config.networking.hostName)
|> lib.map (hostName: {
alert = "InstanceDown";
expr = "up == 0";
for = "2m";
expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])'';
labels.severity = "critical";
annotations = {
summary = "{{ $labels.instance }} is DOWN";
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
summary = "${hostName} is DOWN";
description = "${hostName} has not reported any metrics for more than 2 minutes.";
};
})
)
++ [
{
alert = "ServiceDown";
expr = ''up{job=~"prometheus|alertmanager"} == 0'';
for = "2m";
annotations = {
summary = "{{ $labels.job }} on {{ $labels.instance }} is DOWN";
description = "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes.";
};
}
{
@ -116,7 +132,7 @@ in
};
}
|> lib.strings.toJSON
|> pkgs.writeText "prometheus-instance-down-rule"
|> pkgs.writeText "prometheus-rules"
|> toString
|> lib.singleton;
};