feat(monitoring): service status lookup

This commit is contained in:
arcnmx 2024-05-31 18:14:37 -07:00
parent f97ab24f47
commit ebfd1f5a9a
12 changed files with 273 additions and 103 deletions

View file

@ -1,86 +1,107 @@
{ config, ... }: {
{
config,
access,
gensokyo-zone,
lib,
...
}: let
inherit (gensokyo-zone) systems;
inherit (gensokyo-zone.lib) mkAlmostOptionDefault unmerged;
inherit (lib.modules) mkMerge mkOptionDefault;
inherit (lib.attrsets) attrValues nameValuePair listToAttrs;
inherit (lib.lists) filter length optional concatMap;
cfg = config.services.gatus;
statusSystems = filter (system: system.config.access.online.enable) (attrValues systems);
mapSystem = system: let
statusServices = map (serviceName: system.config.exports.services.${serviceName}) system.config.exports.status.services;
in concatMap (mkServiceEndpoint system) statusServices;
mkPortEndpoint = { system, service, port, unique }: let
inherit (port.status) gatus;
name = if unique
then service.displayName
else "${service.displayName}: ${port.name}";
conf = {
url = mkOptionDefault (access.proxyUrlFor {
inherit service port;
system = system.config;
scheme = gatus.protocol;
#network = port.listen;
});
};
in nameValuePair name ({ ... }: {
imports =
optional port.status.alert.enable alertingConfigAlerts
++ optional (gatus.protocol == "http" || gatus.protocol == "https") alertingConfigHttp;
config = mkMerge [
(unmerged.mergeAttrs gatus.settings)
conf
];
});
mkServiceEndpoint = system: service: let
statusPorts = map /*lib.attrsets.getAttr*/(portName: service.ports.${portName}) service.status.ports;
gatusPorts = filter (port: port.status.gatus.enable) statusPorts;
unique = length gatusPorts == 1;
in map (port: mkPortEndpoint {
inherit system service port unique;
}) gatusPorts;
alertingConfigAlerts = {
alerts = [
{
type = "discord";
send-on-resolved = true;
description = "Healthcheck failed.";
failure-threshold = 1;
success-threshold = 3;
}
];
};
alertingConfigHttp = {
# Common interval for refreshing all basic HTTP endpoints
interval = mkAlmostOptionDefault "30s";
};
in {
sops.secrets.gatus_environment_file = {
sopsFile = ../secrets/gatus.yaml;
};
services.gatus = {
enable = true;
environmentFile = config.sops.secrets.gatus_environment_file.path;
settings = let
# Common interval for refreshing all basic HTTP endpoints
gatusCommonHTTPInterval = "30s";
settings = {
# Environment variables are pulled in to be usable within the config.
alerting.discord = {
webhook-url = "\${DISCORD_WEBHOOK_URL}";
};
# Shared between all endpoints
commonAlertingConfig = {
alerts = [
{
type = "discord";
send-on-resolved = true;
description = "Healthcheck failed.";
failure-threshold = 1;
success-threshold = 3;
}
];
};
# Used wherever a basic HTTP 200 up-check is required.
basicHTTPCheck = url: {
inherit url;
interval = gatusCommonHTTPInterval;
conditions = [
"[STATUS] == 200"
];
};
in {
# Environment variables are pulled in to be usable within the config.
alerting.discord = {
webhook-url = "\${DISCORD_WEBHOOK_URL}";
};
# Endpoint configuration
endpoints = listToAttrs (concatMap mapSystem statusSystems);
# Endpoint configuration
endpoints = {
# Home Assistant uses the common alerting config, combined with a basic HTTP check for its domain.
"Home Assistant" = commonAlertingConfig // (basicHTTPCheck "https://home.local.gensokyo.zone");
};
# The actual status page configuration
ui = {
title = "Gensokyo Zone Status";
description = "The status of the various girls in Gensokyo!";
header = "Gensokyo Zone Status";
};
# The actual status page configuration
ui = {
title = "Gensokyo Zone Status";
description = "The status of the various girls in Gensokyo!";
header = "Gensokyo Zone Status";
};
# Prometheus metrics...!
metrics = true;
# Prometheus metrics...!
metrics = true;
# We could've used Postgres, but it seems like less moving parts if our status page
# doesn't depend upon another service, internal or external, other than what gets it to the internet.
storage = {
type = "sqlite";
path = "/var/lib/gatus/data.db";
};
# Bind on the local address for now, on the port after the last one allocated for the monitoring project.
web = {
address = "10.1.1.38";
port = 9095;
};
# We could've used Postgres, but it seems like less moving parts if our status page
# doesn't depend upon another service, internal or external, other than what gets it to the internet.
storage = {
type = "sqlite";
path = "/var/lib/gatus/data.db";
};
# Bind on the local address for now, on the port after the last one allocated for the monitoring project.
web = {
address = "[::]";
port = 9095;
};
};
};
/* services.nginx.virtualHosts."status.gensokyo.zone" = let
gatusWebCfg = config.services.gatus.settings.web;
upstream = "${gatusWebCfg.address}:${toString gatusWebCfg.port}";
in {
forceSSL = true;
useACMEHost = serverName;
kTLS = true;
locations."/" = {
proxyPass = "http://${upstream}";
proxyWebsockets = true;
};
}; */
networking.firewall.interfaces.local.allowedTCPPorts = [
config.services.gatus.settings.web.port
networking.firewall.interfaces.lan.allowedTCPPorts = [
cfg.settings.web.port
];
}