mirror of
https://github.com/SebastianStork/nixos-config.git
synced 2026-03-22 22:29:06 +01:00
159 lines
5.2 KiB
Nix
159 lines
5.2 KiB
Nix
# NixOS module exposing `custom.services.prometheus`.
#
# When enabled it runs a Prometheus server bound to localhost, publishes it
# through the custom Caddy virtual-host option under `cfg.domain`, accepts
# remote-write pushes, scrapes every Prometheus/Alertmanager instance found in
# `allHosts`, and installs a generated alerting rule file.
{
  config,
  pkgs,
  lib,
  allHosts,
  ...
}:
let
  cfg = config.custom.services.prometheus;

  # All hosts of the fleet as a list. Each element is expected to expose its
  # evaluated NixOS configuration under `.config` (that is how it is used below).
  hosts = lib.attrValues allHosts;

  # Build the scrape job for one of the `custom.services.<service>` modules:
  # one static target (the service's domain) per host that has it enabled,
  # labelled with that host's name.
  #
  # The job is scraped over HTTPS so that it matches `webExternalUrl` and the
  # scheme used for the Alertmanager endpoints; without an explicit scheme
  # Prometheus would talk plain HTTP to the very same domains.
  mkServiceScrapeJob = service: {
    job_name = service;
    scheme = "https";
    static_configs =
      hosts
      |> lib.filter (host: host.config.custom.services.${service}.enable)
      |> lib.map (host: {
        targets = lib.singleton host.config.custom.services.${service}.domain;
        labels.instance = host.config.networking.hostName;
      });
  };

  # One "InstanceDown" rule per host that runs Alloy and has the overlay role
  # "server". The rule fires when no `up{job="node"}` sample exists for that
  # instance within the last two minutes.
  # NOTE(review): presumably non-server hosts are excluded because they may be
  # offline legitimately — confirm.
  instanceDownRules =
    hosts
    |> lib.filter (host: host.config.custom.services.alloy.enable)
    |> lib.filter (host: host.config.custom.networking.overlay.role == "server")
    |> lib.map (host: host.config.networking.hostName)
    |> lib.map (hostName: {
      alert = "InstanceDown";
      expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])'';
      labels.severity = "critical";
      annotations = {
        summary = "${hostName} is DOWN";
        summary_resolved = "${hostName} is up again";
        description = "No metrics received for over 2 minutes.";
        description_resolved = "Metrics are being received again.";
      };
    });

  # Rules that are not tied to a single, statically known host.
  staticRules = [
    {
      alert = "ServiceDown";
      expr = ''up{job=~"prometheus|alertmanager"} == 0'';
      for = "2m";
      annotations = {
        summary = "{{ $labels.job | title }} on {{ $labels.instance }} is DOWN";
        summary_resolved = "{{ $labels.job | title }} on {{ $labels.instance }} is up again";
        description = "Unresponsive for over 2 minutes.";
        description_resolved = "Responding normally.";
      };
    }
    {
      alert = "CominDeploymentFailed";
      expr = ''comin_deployment_info{status!="done"}'';
      annotations = {
        summary = "{{ $labels.instance }} deployment failed";
        summary_resolved = "{{ $labels.instance }} deployment recovered";
        description = "Deployment is not reaching \"done\" status.";
        description_resolved = "Deployment completed successfully.";
      };
    }
    {
      # Fires when more than one distinct `commit_id` is reported for 10 minutes.
      alert = "CominDeploymentCommitMismatch";
      expr = "count(count by (commit_id) (comin_deployment_info)) > 1";
      for = "10m";
      annotations = {
        summary = "Hosts are running different commits";
        summary_resolved = "All hosts are running the same commit again";
        description = "Possibly a failed deployment or incompatible configurations.";
        description_resolved = "All hosts are in sync.";
      };
    }
  ];
in
{
  options.custom.services.prometheus = {
    enable = lib.mkEnableOption "the Prometheus monitoring server";

    domain = lib.mkOption {
      type = lib.types.nonEmptyStr;
      # NOTE(review): the empty default does not satisfy `nonEmptyStr`, so the
      # option effectively has to be set wherever it is read — presumably a
      # deliberate "required option" idiom; confirm before changing.
      default = "";
      description = "Domain under which Prometheus is reachable (used for the external URL and the Caddy virtual host).";
    };

    port = lib.mkOption {
      type = lib.types.port;
      default = 9090;
      description = "Local port Prometheus listens on; the Caddy virtual host points at it.";
    };

    storageRetentionSize = lib.mkOption {
      type = lib.types.nonEmptyStr;
      default = "2GB";
      description = "Value passed to Prometheus' `--storage.tsdb.retention.size` flag.";
    };
  };

  config = lib.mkIf cfg.enable {
    services.prometheus = {
      enable = true;
      stateDir = "prometheus";

      # Only listen locally; outside access goes through the Caddy virtual host
      # declared further down.
      listenAddress = "localhost";
      inherit (cfg) port;
      webExternalUrl = "https://${cfg.domain}";

      extraFlags = [
        "--web.enable-remote-write-receiver"
        "--storage.tsdb.retention.size=${cfg.storageRetentionSize}"
      ];

      globalConfig = {
        scrape_interval = "30s";
        external_labels.monitor = "global";
      };

      # Send alerts to every Alertmanager enabled anywhere in the fleet.
      alertmanagers = lib.singleton {
        scheme = "https";
        static_configs = lib.singleton {
          targets =
            hosts
            |> lib.map (host: host.config.custom.services.alertmanager)
            |> lib.filter (alertmanager: alertmanager.enable)
            |> lib.map (alertmanager: alertmanager.domain);
        };
      };

      scrapeConfigs = [
        (mkServiceScrapeJob "prometheus")
        (mkServiceScrapeJob "alertmanager")
      ];

      # Serialise the rule group to JSON, write it to the store and hand the
      # resulting path (as a string) to Prometheus as its only rule file.
      ruleFiles =
        {
          groups = lib.singleton {
            name = "Rules";
            rules = instanceDownRules ++ staticRules;
          };
        }
        |> lib.strings.toJSON
        |> pkgs.writeText "prometheus-rules"
        |> toString
        |> lib.singleton;
    };

    custom = {
      services.caddy.virtualHosts.${cfg.domain}.port = cfg.port;

      persistence.directories = [ "/var/lib/${config.services.prometheus.stateDir}" ];

      meta.sites.${cfg.domain} = {
        title = "Prometheus";
        icon = "sh:prometheus";
      };
    };
  };
}