nixos-config/modules/nixos/services/prometheus.nix

159 lines
5.4 KiB
Nix

{
config,
pkgs,
lib,
allHosts,
...
}:
let
cfg = config.custom.services.prometheus;
in
{
# Options for the Prometheus monitoring server wrapper module.
options.custom.services.prometheus = {
  enable = lib.mkEnableOption "the Prometheus monitoring server";

  # Deliberately has no default: the declared type rejects the empty string,
  # so a `default = ""` could never be a valid value. Leaving the option
  # without a default yields the module system's clear "option has no value
  # defined" error when the domain is forgotten, instead of a type error
  # that blames this module's own default.
  domain = lib.mkOption {
    type = lib.types.nonEmptyStr;
    description = ''
      Domain under which Prometheus is reachable. Used for the external URL,
      the Caddy virtual host, the site metadata entry, and as the scrape
      target for the `prometheus` job.
    '';
  };

  port = lib.mkOption {
    type = lib.types.port;
    default = 9090;
    description = "Port the Prometheus web server listens on (bound to localhost).";
  };

  storageRetentionSize = lib.mkOption {
    type = lib.types.nonEmptyStr;
    default = "2GB";
    description = ''
      Value passed to Prometheus' `--storage.tsdb.retention.size` flag.
    '';
  };
};
config = lib.mkIf cfg.enable {
# Upstream Prometheus service: listens on localhost, accepts remote-write
# pushes, scrapes every Prometheus/Alertmanager instance found in `allHosts`
# and ships a generated alerting rule file.
services.prometheus =
  let
    hosts = lib.attrValues allHosts;

    # Build the scrape job for one of the custom services ("prometheus" or
    # "alertmanager"): one static target per host that has the service
    # enabled, addressed by the service's domain and labelled with the
    # host name.
    #
    # The targets are bare domains, so the scheme decides the port. The
    # Alertmanager endpoints above and `webExternalUrl` already use HTTPS;
    # scrape over HTTPS as well instead of Prometheus' default plain HTTP.
    # NOTE(review): assumes the reverse proxy serves these domains over TLS.
    mkServiceScrapeJob = service: {
      job_name = service;
      scheme = "https";
      static_configs =
        hosts
        |> lib.filter (host: host.config.custom.services.${service}.enable)
        |> lib.map (host: {
          targets = lib.singleton host.config.custom.services.${service}.domain;
          labels.instance = host.config.networking.hostName;
        });
    };

    # One InstanceDown rule per host that runs Alloy and has the overlay
    # role "server". The rule fires when no `up{job="node"}` sample for the
    # host exists within the last 2 minutes.
    # NOTE(review): no scrape job in this file produces job="node";
    # presumably those series arrive via the remote-write receiver.
    instanceDownRules =
      hosts
      |> lib.filter (host: host.config.custom.services.alloy.enable)
      |> lib.filter (host: host.config.custom.networking.overlay.role == "server")
      |> lib.map (host: host.config.networking.hostName)
      |> lib.map (hostName: {
        alert = "InstanceDown";
        expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])'';
        labels.severity = "critical";
        annotations = {
          summary = "Host ${hostName} is down";
          summary_resolved = "Host ${hostName} is up again";
          description = "Prometheus has not received node metrics from ${hostName} for 2 minutes.";
          description_resolved = "Prometheus is receiving node metrics from ${hostName} again.";
        };
      });

    # Host-independent alerting rules.
    staticRules = [
      {
        alert = "ServiceDown";
        expr = ''up{job=~"prometheus|alertmanager"} == 0'';
        for = "2m";
        annotations = {
          summary = "Service {{ $labels.job | title }} on {{ $labels.instance }} is down";
          summary_resolved = "Service {{ $labels.job | title }} on {{ $labels.instance }} is up again";
          description = "Prometheus has not received scrape data for 2 minutes.";
          description_resolved = "Prometheus is receiving scrape data again.";
        };
      }
      {
        alert = "CominDeploymentFailed";
        expr = ''comin_deployment_info{status!="done"}'';
        annotations = {
          summary = "Deployment on {{ $labels.instance }} failed";
          summary_resolved = "Deployment on {{ $labels.instance }} succeeded again";
          description = "Comin reports a deployment status other than \"done\".";
          description_resolved = "Comin reports the deployment status as \"done\" again.";
        };
      }
      {
        alert = "CominDeploymentCommitMismatch";
        expr = "count(count by (commit_id) (comin_deployment_info)) > 1";
        for = "10m";
        annotations = {
          summary = "Deployment commits are out of sync";
          summary_resolved = "Deployment commits are in sync again";
          description = "Comin reports different deployed commits across hosts.";
          description_resolved = "Comin reports the same deployed commit across all hosts again.";
        };
      }
    ];
  in
  {
    enable = true;
    stateDir = "prometheus";
    listenAddress = "localhost";
    inherit (cfg) port;
    webExternalUrl = "https://${cfg.domain}";
    extraFlags = [
      "--web.enable-remote-write-receiver"
      "--storage.tsdb.retention.size=${cfg.storageRetentionSize}"
    ];
    globalConfig = {
      scrape_interval = "30s";
      external_labels.monitor = "global";
    };

    # Send alerts to every host that has the custom Alertmanager enabled.
    alertmanagers = lib.singleton {
      scheme = "https";
      static_configs = lib.singleton {
        targets =
          hosts
          |> lib.map (host: host.config.custom.services.alertmanager)
          |> lib.filter (alertmanager: alertmanager.enable)
          |> lib.map (alertmanager: alertmanager.domain);
      };
    };

    scrapeConfigs = [
      (mkServiceScrapeJob "prometheus")
      (mkServiceScrapeJob "alertmanager")
    ];

    # The rule group is serialised to JSON, written to a store file and
    # passed to Prometheus as its single rule file.
    ruleFiles =
      {
        groups = lib.singleton {
          name = "Rules";
          rules = instanceDownRules ++ staticRules;
        };
      }
      |> lib.strings.toJSON
      |> pkgs.writeText "prometheus-rules"
      |> toString
      |> lib.singleton;
  };
# Integration with the repository's own `custom.*` modules.
custom =
  let
    inherit (cfg) domain;
    dataDir = "/var/lib/${config.services.prometheus.stateDir}";
  in
  {
    # Register the domain as a Caddy virtual host pointing at the local port.
    # NOTE(review): presumably a reverse proxy entry; the caddy module is not
    # visible here.
    services.caddy.virtualHosts.${domain} = {
      inherit (cfg) port;
    };

    # Register the Prometheus state directory with the persistence module.
    persistence.directories = lib.singleton dataDir;

    # Site metadata entry for this domain.
    meta.sites.${domain} = {
      title = "Prometheus";
      icon = "sh:prometheus";
    };
  };
};
}