mirror of
https://github.com/SebastianStork/nixos-config.git
synced 2026-03-22 23:29:08 +01:00
Compare commits
3 commits
8cf724fb97
...
f2258ac79c
| Author | SHA1 | Date | |
|---|---|---|---|
| | f2258ac79c | | |
| | 2c8ecb9c7b | | |
| | 67f8f1689a | | |
3 changed files with 76 additions and 47 deletions
|
|
@ -63,6 +63,14 @@ in
|
||||||
name = "ntfy";
|
name = "ntfy";
|
||||||
webhook_configs = lib.singleton { url = "http://localhost:${toString cfg.ntfyBridgePort}/hook"; };
|
webhook_configs = lib.singleton { url = "http://localhost:${toString cfg.ntfyBridgePort}/hook"; };
|
||||||
};
|
};
|
||||||
|
inhibit_rules = lib.singleton {
|
||||||
|
source_matchers = [
|
||||||
|
''alertname="InstanceDown"''
|
||||||
|
''job="node"''
|
||||||
|
];
|
||||||
|
target_matchers = lib.singleton ''alertname!="InstanceDown"'';
|
||||||
|
equal = [ "instance" ];
|
||||||
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,12 @@ let
|
||||||
in
|
in
|
||||||
inputs.dns.lib.toString zone {
|
inputs.dns.lib.toString zone {
|
||||||
SOA = {
|
SOA = {
|
||||||
nameServer = "${netCfg.hostName}.${zone}.";
|
nameServer =
|
||||||
|
nsRecords
|
||||||
|
|> lib.map (record: record.name)
|
||||||
|
|> lib.naturalSort
|
||||||
|
|> lib.head
|
||||||
|
|> (hostName: "${hostName}.${zone}.");
|
||||||
adminEmail = "hostmaster@sstork.dev";
|
adminEmail = "hostmaster@sstork.dev";
|
||||||
serial = 1;
|
serial = 1;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -58,25 +58,25 @@ in
|
||||||
scrapeConfigs = [
|
scrapeConfigs = [
|
||||||
{
|
{
|
||||||
job_name = "prometheus";
|
job_name = "prometheus";
|
||||||
static_configs = lib.singleton {
|
static_configs =
|
||||||
targets =
|
|
||||||
allHosts
|
allHosts
|
||||||
|> lib.attrValues
|
|> lib.attrValues
|
||||||
|> lib.map (host: host.config.custom.services.prometheus)
|
|> lib.filter (host: host.config.custom.services.prometheus.enable)
|
||||||
|> lib.filter (prometheus: prometheus.enable)
|
|> lib.map (host: {
|
||||||
|> lib.map (prometheus: prometheus.domain);
|
targets = lib.singleton host.config.custom.services.prometheus.domain;
|
||||||
};
|
labels.instance = host.config.networking.hostName;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
job_name = "alertmanager";
|
job_name = "alertmanager";
|
||||||
static_configs = lib.singleton {
|
static_configs =
|
||||||
targets =
|
|
||||||
allHosts
|
allHosts
|
||||||
|> lib.attrValues
|
|> lib.attrValues
|
||||||
|> lib.map (host: host.config.custom.services.alertmanager)
|
|> lib.filter (host: host.config.custom.services.alertmanager.enable)
|
||||||
|> lib.filter (alertmanager: alertmanager.enable)
|
|> lib.map (host: {
|
||||||
|> lib.map (alertmanager: alertmanager.domain);
|
targets = lib.singleton host.config.custom.services.alertmanager.domain;
|
||||||
};
|
labels.instance = host.config.networking.hostName;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
@ -84,15 +84,31 @@ in
|
||||||
{
|
{
|
||||||
groups = lib.singleton {
|
groups = lib.singleton {
|
||||||
name = "Rules";
|
name = "Rules";
|
||||||
rules = [
|
rules =
|
||||||
{
|
(
|
||||||
|
allHosts
|
||||||
|
|> lib.attrValues
|
||||||
|
|> lib.filter (host: host.config.custom.services.alloy.enable)
|
||||||
|
|> lib.filter (host: host.config.custom.networking.overlay.role == "server")
|
||||||
|
|> lib.map (host: host.config.networking.hostName)
|
||||||
|
|> lib.map (hostName: {
|
||||||
alert = "InstanceDown";
|
alert = "InstanceDown";
|
||||||
expr = "up == 0";
|
expr = ''absent_over_time(up{instance="${hostName}", job="node"}[2m])'';
|
||||||
for = "2m";
|
|
||||||
labels.severity = "critical";
|
labels.severity = "critical";
|
||||||
annotations = {
|
annotations = {
|
||||||
summary = "{{ $labels.instance }} is DOWN";
|
summary = "${hostName} is DOWN";
|
||||||
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
|
description = "${hostName} has not reported any metrics for more than 2 minutes.";
|
||||||
|
};
|
||||||
|
})
|
||||||
|
)
|
||||||
|
++ [
|
||||||
|
{
|
||||||
|
alert = "ServiceDown";
|
||||||
|
expr = ''up{job=~"prometheus|alertmanager"} == 0'';
|
||||||
|
for = "2m";
|
||||||
|
annotations = {
|
||||||
|
summary = "{{ $labels.job }} on {{ $labels.instance }} is DOWN";
|
||||||
|
description = "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes.";
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
|
|
@ -116,7 +132,7 @@ in
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|> lib.strings.toJSON
|
|> lib.strings.toJSON
|
||||||
|> pkgs.writeText "prometheus-instance-down-rule"
|
|> pkgs.writeText "prometheus-rules"
|
||||||
|> toString
|
|> toString
|
||||||
|> lib.singleton;
|
|> lib.singleton;
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue