mirror of
https://github.com/gensokyo-zone/infrastructure.git
synced 2026-02-09 04:19:19 -08:00
feat(monitoring): gatus, grafana alerting to discord
This commit is contained in:
parent
cee397d774
commit
79ba879e6d
8 changed files with 769 additions and 0 deletions
86
nixos/monitoring/gatus.nix
Normal file
86
nixos/monitoring/gatus.nix
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
# NixOS module: Gatus status page with Discord alerting.
#
# Takes the standard module argument `config`; it is used to look up the
# sops secret path and to reuse the configured Gatus web port for the firewall.
{ config, ... }: {
  # sops-managed secret whose path is handed to Gatus as its environment file.
  # The settings below reference ${DISCORD_WEBHOOK_URL}, so that variable is
  # expected to be defined in this file.
  sops.secrets.gatus_environment_file = {
    sopsFile = ../secrets/gatus.yaml;
  };

  services.gatus = {
    enable = true;
    environmentFile = config.sops.secrets.gatus_environment_file.path;
    settings = let
      # Common interval for refreshing all basic HTTP endpoints
      gatusCommonHTTPInterval = "30s";

      # Shared between all endpoints
      commonAlertingConfig = {
        alerts = [
          {
            type = "discord";
            send-on-resolved = true;
            description = "Healthcheck failed.";
            failure-threshold = 1;
            success-threshold = 3;
          }
        ];
      };

      # Used wherever a basic HTTP 200 up-check is required.
      basicHTTPCheck = url: {
        inherit url;
        interval = gatusCommonHTTPInterval;
        conditions = [
          "[STATUS] == 200"
        ];
      };

      # Builds one entry of the `endpoints` list: the display name, the shared
      # alerting config and the endpoint-specific check merged into one set.
      # `check` comes last so an individual check may override shared keys.
      mkEndpoint = name: check: { inherit name; } // commonAlertingConfig // check;
    in {
      # Environment variables are pulled in to be usable within the config.
      # The `\$` escape keeps Nix from interpolating, so the literal
      # ${DISCORD_WEBHOOK_URL} placeholder ends up in the generated config.
      alerting.discord = {
        webhook-url = "\${DISCORD_WEBHOOK_URL}";
      };

      # Endpoint configuration.
      # This must be a list of endpoint objects that each carry a `name`;
      # an attribute set keyed by name would serialise to a YAML mapping,
      # which does not match the list that Gatus's config schema defines.
      endpoints = [
        # Home Assistant uses the common alerting config, combined with a basic HTTP check for its domain.
        (mkEndpoint "Home Assistant" (basicHTTPCheck "https://home.local.gensokyo.zone"))
      ];

      # The actual status page configuration
      ui = {
        title = "Gensokyo Zone Status";
        description = "The status of the various girls in Gensokyo!";
        header = "Gensokyo Zone Status";
      };

      # Prometheus metrics...!
      metrics = true;

      # We could've used Postgres, but it seems like less moving parts if our status page
      # doesn't depend upon another service, internal or external, other than what gets it to the internet.
      storage = {
        type = "sqlite";
        path = "/var/lib/gatus/data.db";
      };

      # Bind on the local address for now, on the port after the last one allocated for the monitoring project.
      web = {
        address = "10.1.1.38";
        port = 9095;
      };
    };
  };

  # Disabled reverse-proxy vhost, kept for later.
  # NOTE(review): `serverName` is not bound anywhere in this module; it has to
  # be defined (or replaced by the host name) before this block is enabled.
  /* services.nginx.virtualHosts."status.gensokyo.zone" = let
    gatusWebCfg = config.services.gatus.settings.web;
    upstream = "${gatusWebCfg.address}:${toString gatusWebCfg.port}";
  in {
    forceSSL = true;
    useACMEHost = serverName;
    kTLS = true;
    locations."/" = {
      proxyPass = "http://${upstream}";
      proxyWebsockets = true;
    };
  }; */

  # Open the Gatus web port on the `local` firewall interface; the port is
  # read back from the settings above so the two cannot drift apart.
  networking.firewall.interfaces.local.allowedTCPPorts = [
    config.services.gatus.settings.web.port
  ];
}
|
||||
26
nixos/monitoring/grafana-alerting.nix
Normal file
26
nixos/monitoring/grafana-alerting.nix
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# NixOS module: provisions a Discord contact point for Grafana alerting.
#
# Takes the standard module argument `config`, used to look up the path of
# the sops secret that stores the webhook URL.
{ config, ... }:
let
  # The sops secret entry declared below; only its `path` is used here.
  webhookSecret = config.sops.secrets.grafana_discord_webhook_url;

  # The single receiver behind the "Discord" contact point.
  discordReceiver = {
    uid = "discord_alerting";
    type = "discord";
    # Resolve messages stay enabled (the "disable" flag is false).
    disableResolveMessage = false;
    # The URL is written as a `$__file{<path>}` reference to the secret file,
    # not as the secret value, so the webhook itself is not embedded in this
    # configuration. An `avatar_url` setting is intentionally left unset.
    settings.url = "$__file{${webhookSecret.path}}";
  };

  discordContactPoint = {
    orgId = 1;
    name = "Discord";
    receivers = [ discordReceiver ];
  };
in {
  # Webhook URL secret, owned by the `grafana` user.
  sops.secrets.grafana_discord_webhook_url = {
    sopsFile = ../secrets/grafana.yaml;
    owner = "grafana";
  };

  services.grafana.provision.alerting.contactPoints.settings = {
    apiVersion = 1;
    contactPoints = [ discordContactPoint ];
  };
}
|
||||
36
nixos/monitoring/monitoring.nix
Normal file
36
nixos/monitoring/monitoring.nix
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# NixOS module: enables Grafana, Loki and Prometheus and opens their
# listening ports on the `lan` firewall interface.
#
# Takes the standard module arguments `config` (to read back the effective
# service settings) and `lib` (for mkIf / mkMerge).
{ config, lib, ... }:
let
  inherit (lib.modules) mkIf mkMerge;

  svc = config.services;
  # Loki's `server` section; only evaluated when Loki is enabled, because
  # every use sits behind `mkIf svc.loki.enable`.
  lokiServer = svc.loki.configuration.server;
in {
  services.grafana.enable = true;

  # Anonymous access is switched on (org_name is left unset).
  # NOTE(review): anonymous visitors receive the "Admin" role here;
  # "Viewer" or "Editor" were considered as alternatives - confirm intended.
  services.grafana.settings."auth.anonymous" = {
    enabled = true;
    org_role = "Admin";
  };

  services.grafana.settings.metrics = {
    enabled = true;
    disable_total_stats = true;
  };

  services.loki.enable = true;
  services.prometheus.enable = true;

  # Each service contributes its port(s) only while it is enabled.
  networking.firewall.interfaces.lan.allowedTCPPorts = mkMerge [
    (mkIf svc.grafana.enable [ svc.grafana.settings.server.http_port ])
    (mkIf svc.loki.enable [
      lokiServer.http_listen_port
      # The gRPC port is only added when it is not configured as 0.
      (mkIf (lokiServer.grpc_listen_port != 0) lokiServer.grpc_listen_port)
    ])
    (mkIf svc.prometheus.enable [ svc.prometheus.port ])
  ];
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue