2020-04-27 18:50:29 +00:00
|
|
|
{ pkgs, ... }:
|
2020-04-26 18:40:00 +00:00
|
|
|
|
2020-04-26 20:01:05 +00:00
|
|
|
let
|
|
|
|
# Scrape a list of targets through blackbox_exporter using the given module.
# The job name is derived from the module name; the scrape interval and the
# list of targets are taken as the remaining curried arguments.
blackboxScrape = module: blackboxScrapeJobName module module;
|
2020-04-26 20:01:05 +00:00
|
|
|
|
|
|
|
# Same as blackboxScrape, but allow customizing the job name.
#
# Curried arguments, in order:
#   job      - suffix appended to "blackbox_" to form the job name
#   module   - blackbox module passed as the "module" URL parameter
#   interval - scrape interval string
#   targets  - list of targets to probe
blackboxScrapeJobName = job: module: interval: targets: {
  job_name = "blackbox_${job}";
  scrape_interval = interval;
  metrics_path = "/probe";
  params.module = [ module ];
  # blackbox_exporter location is hardcoded.
  relabel_configs = relabelTarget "servnerr-3:9115";
  static_configs = [{ inherit targets; }];
};
|
|
|
|
|
|
|
|
# Produces a relabeling configuration that replaces the instance label with
# the HTTP target parameter.
#
# The rules are applied in order:
#   1. copy the configured address into the "__param_target" label,
#   2. copy "__param_target" into the "instance" label,
#   3. overwrite "__address__" with `target`, so the scrape itself is sent
#      to that address.
relabelTarget = target: [
  {
    source_labels = [ "__address__" ];
    target_label = "__param_target";
  }
  {
    source_labels = [ "__param_target" ];
    target_label = "instance";
  }
  {
    target_label = "__address__";
    replacement = target;
  }
];
|
|
|
|
|
|
|
|
in {
|
2020-04-26 18:40:00 +00:00
|
|
|
# Prometheus monitoring server and exporter configuration.
|
|
|
|
services.prometheus = {
|
|
|
|
# Turn the Prometheus server on and tell it the URL it is reachable under.
enable = true;
webExternalUrl = "https://prometheus.servnerr.com";

# Global scrape interval.
globalConfig = { scrape_interval = "15s"; };
|
|
|
|
|
2020-05-18 01:06:35 +00:00
|
|
|
# Keep 365 days of samples. `--storage.tsdb.retention` is the deprecated
# spelling of this flag; `--storage.tsdb.retention.time` is its replacement.
extraFlags = [ "--storage.tsdb.retention.time=365d" ];
|
2020-05-08 16:06:47 +00:00
|
|
|
|
2020-04-26 18:40:00 +00:00
|
|
|
# Alerts are delivered to the alertmanager on the monitoring machine.
alertmanagers = [{
  static_configs = [{
    targets = [ "monitnerr-1:9093" ];
  }];
}];
|
|
|
|
|
|
|
|
exporters = {
|
2020-04-26 18:42:41 +00:00
|
|
|
# Node exporter already enabled on all machines.
|
2020-04-26 20:01:05 +00:00
|
|
|
|
|
|
|
# Blackbox exporter with the probe modules used by the blackbox scrape jobs
# (http_2xx, http_401 and ssh_banner).
blackbox = {
  enable = true;
  # The nesting below is significant: YAML is indentation-sensitive, and Nix
  # only strips the indentation common to all lines of an indented string,
  # so the relative indentation is what ends up in blackbox.yml.
  configFile = pkgs.writeText "blackbox.yml" ''
    modules:
      http_2xx:
        prober: http
      http_401:
        prober: http
        http:
          valid_status_codes: [401]
      ssh_banner:
        prober: tcp
        tcp:
          query_response:
            - expect: "^SSH-2.0-"
  '';
};
|
|
|
|
|
2020-04-26 18:40:00 +00:00
|
|
|
# SNMP exporter; its data file is the snmp.yml from release 0.17.0, fetched
# at a pinned upstream revision and verified against a fixed hash.
snmp = let
  rev = "f0ad4551a5c2023e383bc8dde2222f47dc760b83";
in {
  enable = true;
  configurationPath = builtins.fetchurl {
    url =
      "https://raw.githubusercontent.com/prometheus/snmp_exporter/${rev}/snmp.yml";
    sha256 =
      "5c1febe100ce9140c8c59cf3c2a6346a1219dd0966d5cd2926498e88dcd69997";
  };
};
|
|
|
|
};
|
|
|
|
|
2020-04-26 20:01:05 +00:00
|
|
|
# TODO: template out hostnames or consider DNSSD.
scrapeConfigs = let
  # A job made of nothing but a name and a fixed list of targets; extra
  # settings are merged on top with `//` where a job needs them.
  staticJob = name: targets: {
    job_name = name;
    static_configs = [{ inherit targets; }];
  };
in [
  # The blackbox exporter itself, followed by the probes that go through it.
  (staticJob "blackbox" [ "servnerr-3:9115" ])
  (blackboxScrape "http_2xx" "15s" [ "https://grafana.servnerr.com" ])
  # Netlify can occasionally be flappy, so check it less often.
  (blackboxScrapeJobName "http_2xx_mdlayhercom" "http_2xx" "1m"
    [ "https://mdlayher.com" ])
  (blackboxScrape "http_401" "15s" [
    "https://alertmanager.servnerr.com"
    "https://plex.servnerr.com"
    "https://prometheus.servnerr.com"
  ])
  # The SSH banner check produces a fair amount of log spam, so only scrape
  # it once a minute.
  (blackboxScrape "ssh_banner" "1m" [
    "monitnerr-1:22"
    "nerr-3:22"
    "routnerr-2:22"
    "servnerr-3:22"
    "unifi.servnerr.com:22"
  ])

  (staticJob "coredns" [ "routnerr-2:9153" ])
  (staticJob "corerad" [ "routnerr-2:9430" ])
  (staticJob "keylight" [ "keylight" ] // {
    relabel_configs = relabelTarget "servnerr-3:9288";
  })
  (staticJob "node" [
    "monitnerr-1:9100"
    "nerr-3:9100"
    "routnerr-2:9100"
    "servnerr-3:9100"
  ])
  (staticJob "obs" [ "nerr-3:9407" ])

  # SNMP relabeling configuration required to properly replace the instance
  # names and query the correct devices.
  (staticJob "snmp" [
    "switch-livingroom01"
    "switch-office01"
    "switch-office02.ipv4"
    "ap-livingroom02.ipv4"
  ] // {
    metrics_path = "/snmp";
    params = { module = [ "if_mib" ]; };
    relabel_configs = relabelTarget "servnerr-3:9116";
  })

  (staticJob "wireguard" [ "routnerr-2:9586" ])
];
|
2020-05-19 22:35:41 +00:00
|
|
|
|
|
|
|
# Alerting rules, serialized to JSON as a single rule group named "default".
#
# Desktop PC is excluded from the InstanceDown and ServiceDown alerts (via
# the instance!~"nerr-3.*" matchers) as it isn't running 24/7.
rules = [
  (builtins.toJSON {
    groups = [{
      name = "default";
      rules = [
        {
          alert = "InstanceDown";
          expr = ''up{instance!~"nerr-3.*"} == 0'';
          for = "2m";
          annotations.summary =
            "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
        }
        {
          alert = "ServiceDown";
          expr = ''probe_success{instance!~"nerr-3.*"} == 0'';
          for = "2m";
          annotations.summary =
            "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
        }
        {
          alert = "TLSCertificateNearExpiration";
          # Threshold is two days, expressed in seconds.
          expr =
            "probe_ssl_earliest_cert_expiry - time() < 60 * 60 * 24 * 2";
          for = "1m";
          annotations.summary =
            "TLS certificate for {{ $labels.instance }} will expire in less than 2 days.";
        }
        {
          alert = "DiskUsageHigh";
          expr = ''
            (1 - node_filesystem_free_bytes{fstype=~"ext4|vfat"} / node_filesystem_size_bytes) > 0.75'';
          for = "1m";
          annotations.summary =
            "Disk usage on {{ $labels.instance }}:{{ $labels.mountpoint }} ({{ $labels.device }}) exceeds 75%.";
        }
        # All advertising interfaces should be forwarding IPv6 traffic, and
        # have IPv6 autoconfiguration disabled.
        {
          alert = "CoreRADAdvertisingInterfaceMisconfigured";
          expr =
            "(corerad_interface_advertising == 1) and ((corerad_interface_forwarding == 0) or (corerad_interface_autoconfiguration == 1))";
          for = "1m";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} is misconfigured for sending IPv6 router advertisements.";
        }
        # All monitoring interfaces should be forwarding IPv6 traffic.
        {
          alert = "CoreRADMonitoringInterfaceMisconfigured";
          expr =
            "(corerad_interface_monitoring == 1) and (corerad_interface_forwarding == 0)";
          for = "1m";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} is misconfigured for monitoring upstream IPv6 NDP traffic.";
        }
        # All CoreRAD interfaces should multicast IPv6 RAs on a regular basis
        # so hosts don't drop their default route.
        {
          alert = "CoreRADAdvertiserNotMulticasting";
          expr = ''
            rate(corerad_advertiser_router_advertisements_total{type="multicast"}[20m]) == 0'';
          for = "1m";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} has not sent a multicast router advertisement in more than 20 minutes.";
        }
        # Monitor for inconsistent advertisements from hosts on the LAN.
        {
          alert =
            "CoreRADAdvertiserReceivedInconsistentRouterAdvertisement";
          expr =
            "rate(corerad_advertiser_router_advertisement_inconsistencies_total[5m]) > 0";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} received an IPv6 router advertisement with inconsistent configuration compared to its own.";
        }
        # We are advertising 2 prefixes per interface out of GUA /56 and ULA /48.
        {
          alert = "CoreRADAdvertiserMissingPrefix";
          expr = ''
            count by (instance, interface) (corerad_advertiser_router_advertisement_prefix_autonomous{prefix=~"2600:6c4a:7880:32.*|fd9e:1a04:f01d:.*"} == 1) != 2'';
          for = "1m";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} is advertising an incorrect number of IPv6 prefixes for SLAAC.";
        }
        # All IPv6 prefixes are advertised with SLAAC.
        {
          alert = "CoreRADAdvertiserPrefixNotAutonomous";
          expr =
            "corerad_advertiser_router_advertisement_prefix_autonomous == 0";
          for = "1m";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) prefix {{ $labels.prefix }} on interface {{ $labels.interface }} is not configured for SLAAC.";
        }
        # Expect continuous upstream router advertisements.
        {
          alert = "CoreRADMonitorNoUpstreamRouterAdvertisements";
          expr = ''
            rate(corerad_monitor_messages_received_total{message="router advertisement"}[5m]) == 0'';
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} has not received a router advertisement from {{ $labels.host }} in more than 5 minutes.";
        }
        # Warn when the default route's expiration time is less than two
        # hours away.
        {
          alert = "CoreRADMonitorDefaultRouteExpiring";
          expr =
            "corerad_monitor_default_route_expiration_time - time() < 2*60*60";
          annotations.summary =
            "CoreRAD ({{ $labels.instance }}) interface {{ $labels.interface }} will drop its default route to {{ $labels.router }} in less than 2 hours.";
        }
      ];
    }];
  })
];
|
2020-04-26 18:40:00 +00:00
|
|
|
};
|
2020-05-28 16:29:19 +00:00
|
|
|
|
|
|
|
# Exporters whose service modules live outside this file's visible scope.
services.keylight_exporter = { enable = true; };
|
2020-04-26 18:40:00 +00:00
|
|
|
}
|