Compare commits

..

No commits in common. "f2258ac79c1e1ca76640b1acdbe9a92cf3f3092b" and "8cf724fb97698c28ad90d28463db4188be2ceadb" have entirely different histories.

3 changed files with 47 additions and 76 deletions

View file

@ -63,14 +63,6 @@ in
name = "ntfy"; name = "ntfy";
webhook_configs = lib.singleton { url = "http://localhost:${toString cfg.ntfyBridgePort}/hook"; }; webhook_configs = lib.singleton { url = "http://localhost:${toString cfg.ntfyBridgePort}/hook"; };
}; };
inhibit_rules = lib.singleton {
source_matchers = [
''alertname="InstanceDown"''
''job="node"''
];
target_matchers = lib.singleton ''alertname!="InstanceDown"'';
equal = [ "instance" ];
};
}; };
}; };

View file

@ -46,12 +46,7 @@ let
in in
inputs.dns.lib.toString zone { inputs.dns.lib.toString zone {
SOA = { SOA = {
nameServer = nameServer = "${netCfg.hostName}.${zone}.";
nsRecords
|> lib.map (record: record.name)
|> lib.naturalSort
|> lib.head
|> (hostName: "${hostName}.${zone}.");
adminEmail = "hostmaster@sstork.dev"; adminEmail = "hostmaster@sstork.dev";
serial = 1; serial = 1;
}; };

View file

@ -58,25 +58,25 @@ in
scrapeConfigs = [ scrapeConfigs = [
{ {
job_name = "prometheus"; job_name = "prometheus";
static_configs = static_configs = lib.singleton {
allHosts targets =
|> lib.attrValues allHosts
|> lib.filter (host: host.config.custom.services.prometheus.enable) |> lib.attrValues
|> lib.map (host: { |> lib.map (host: host.config.custom.services.prometheus)
targets = lib.singleton host.config.custom.services.prometheus.domain; |> lib.filter (prometheus: prometheus.enable)
labels.instance = host.config.networking.hostName; |> lib.map (prometheus: prometheus.domain);
}); };
} }
{ {
job_name = "alertmanager"; job_name = "alertmanager";
static_configs = static_configs = lib.singleton {
allHosts targets =
|> lib.attrValues allHosts
|> lib.filter (host: host.config.custom.services.alertmanager.enable) |> lib.attrValues
|> lib.map (host: { |> lib.map (host: host.config.custom.services.alertmanager)
targets = lib.singleton host.config.custom.services.alertmanager.domain; |> lib.filter (alertmanager: alertmanager.enable)
labels.instance = host.config.networking.hostName; |> lib.map (alertmanager: alertmanager.domain);
}); };
} }
]; ];
@ -84,55 +84,39 @@ in
{ {
groups = lib.singleton { groups = lib.singleton {
name = "Rules"; name = "Rules";
rules = rules = [
( {
allHosts alert = "InstanceDown";
|> lib.attrValues expr = "up == 0";
|> lib.filter (host: host.config.custom.services.alloy.enable) for = "2m";
|> lib.filter (host: host.config.custom.networking.overlay.role == "server") labels.severity = "critical";
|> lib.map (host: host.config.networking.hostName) annotations = {
|> lib.map (hostName: { summary = "{{ $labels.instance }} is DOWN";
alert = "InstanceDown"; description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])''; };
labels.severity = "critical"; }
annotations = { {
summary = "${hostName} is DOWN"; alert = "CominDeploymentFailed";
description = "${hostName} has not reported any metrics for more than 2 minutes."; expr = ''comin_deployment_info{status!="done"}'';
}; annotations = {
}) summary = "{{ $labels.instance }} deployment failed";
) description = "The deployment of {{ $labels.instance }} with comin is failing.";
++ [ };
{ }
alert = "ServiceDown"; {
expr = ''up{job=~"prometheus|alertmanager"} == 0''; alert = "CominDeploymentCommitMismatch";
for = "2m"; expr = "count(count by (commit_id) (comin_deployment_info)) > 1";
annotations = { for = "10m";
summary = "{{ $labels.job }} on {{ $labels.instance }} is DOWN"; annotations = {
description = "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."; summary = "Hosts are running different commits";
}; description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations.";
} };
{ }
alert = "CominDeploymentFailed"; ];
expr = ''comin_deployment_info{status!="done"}'';
annotations = {
summary = "{{ $labels.instance }} deployment failed";
description = "The deployment of {{ $labels.instance }} with comin is failing.";
};
}
{
alert = "CominDeploymentCommitMismatch";
expr = "count(count by (commit_id) (comin_deployment_info)) > 1";
for = "10m";
annotations = {
summary = "Hosts are running different commits";
description = "Not all hosts are running the same git commit, which may indicate a failed deployment and could lead to incompatible configurations.";
};
}
];
}; };
} }
|> lib.strings.toJSON |> lib.strings.toJSON
|> pkgs.writeText "prometheus-rules" |> pkgs.writeText "prometheus-instance-down-rule"
|> toString |> toString
|> lib.singleton; |> lib.singleton;
}; };