feat(alerts): Health-Alarme via Webhook + Email-SMTP

Sidebar → System → Alarme.

Migration 0021: alert_channels (kind=webhook|email, target, settings,
active) + alert_events (kind, severity=info/warning/error/critical,
subject, message, sent_to JSONB).

internal/services/alerts/:
  - Fire(kind, severity, subject, message) — broadcastet an alle
    aktiven Channels + persistiert Event mit per-Channel-Result
    (ok/error) in sent_to.
  - Webhook-Sender: POST JSON {kind, severity, subject, message,
    content, text, fired_at}. Slack/Discord/Teams akzeptieren das
    out-of-the-box ohne Adapter (content + text-Felder gleichzeitig).
  - Email-Sender: net/smtp + STARTTLS optional. Settings (smtp_host,
    smtp_port, username/password, from, use_tls) liegen in
    channel.settings JSONB.

internal/handlers/alerts.go: CRUD + POST /alerts/test + GET
/alerts/events (history).

Scheduler-Trigger:
  - cert.expiring  — TLS-Cert <14 Tage Restzeit (12h-dedupe pro cert)
                     severity warning, <3 Tage → error
  - cert.renew_failed       — Renewer-Cycle hat fails
  - cert.renewer.run_failed — Renewer-Cycle abgebrochen
  - backup.failed  — Scheduled Backup error
  - license.invalid — License-Server liefert valid=false

In-process Dedupe (12h TTL, map[key]time.Time) verhindert dass
identische Alerts in Schleifen feuern.

UI (pages/Alerts): Tabs Channels (CRUD-Tabelle, Add-Modal mit
conditional-Email-Fields) + History (200 letzte Events mit
severity-Tag + per-Channel-Delivery-Status). Header-Button
„Test-Alert" feuert einen Test-Event in alle aktiven Channels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Debian
2026-05-13 15:57:05 +02:00
parent 4a34629023
commit 81a8217493
13 changed files with 1012 additions and 14 deletions

View File

@@ -30,6 +30,7 @@ import (
wgrender "git.netcell-it.de/projekte/edgeguard-native/internal/wireguard"
"git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/acme"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/alerts"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/audit"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backends"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backendservers"
@@ -52,7 +53,7 @@ import (
wgsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/wireguard"
)
var version = "1.0.73"
var version = "1.0.74"
func main() {
addr := os.Getenv("EDGEGUARD_API_ADDR")
@@ -205,6 +206,7 @@ func main() {
// Jobs laufen im edgeguard-scheduler.
handlers.NewBackupHandler(backup.New(pool), auditRepo, nodeID, version).Register(authed)
handlers.NewDiagnosticsHandler().Register(authed)
handlers.NewAlertsHandler(alerts.New(pool), auditRepo, nodeID).Register(authed)
handlers.NewTLSCertsHandler(tlsRepo, auditRepo, nodeID, acmeService).Register(authed)
// Firewall reload: nach jeder Mutation den Renderer neu fahren
// (writes ruleset.nft + sudo nft -f). Errors loggen, nicht failen.

View File

@@ -9,7 +9,7 @@ import (
"os"
)
var version = "1.0.73"
var version = "1.0.74"
const usage = `edgeguard-ctl — EdgeGuard CLI

View File

@@ -11,8 +11,10 @@ package main
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"strconv"
"time"
"github.com/jackc/pgx/v5/pgxpool"
@@ -21,6 +23,7 @@ import (
"git.netcell-it.de/projekte/edgeguard-native/internal/database"
"git.netcell-it.de/projekte/edgeguard-native/internal/license"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/acme"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/alerts"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backup"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/certrenewer"
licsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/license"
@@ -28,7 +31,7 @@ import (
"git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts"
)
var version = "1.0.73"
var version = "1.0.74"
const (
// renewTickInterval — how often we re-evaluate expiring certs.
@@ -92,10 +95,13 @@ func main() {
slog.Info("scheduler: daily backup enabled", "tick", backupTickInterval,
"dir", backupSvc.BackupDir, "keep_n", backup.DefaultKeepN)
alertSvc := alerts.New(pool)
alertDedupe := newDedupe(12 * time.Hour)
if renewer != nil {
runRenewer(ctx, renewer)
runRenewer(ctx, renewer, alertSvc, alertDedupe)
}
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe)
// Lokale Node-ID für config-hash-refresh. EnsureNodeID liefert
// dieselbe ID die die API hat (gleiches /var/lib/edgeguard/node-id).
@@ -118,18 +124,80 @@ func main() {
select {
case <-renewTick.C:
if renewer != nil {
runRenewer(ctx, renewer)
runRenewer(ctx, renewer, alertSvc, alertDedupe)
}
runCertExpiryCheck(ctx, tlsRepo, alertSvc, alertDedupe)
case <-licTick.C:
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe)
case <-backupTick.C:
runBackup(ctx, backupSvc, version)
runBackup(ctx, backupSvc, version, alertSvc)
case <-hashTick.C:
runConfigHash(ctx, pool, localID)
}
}
}
// dedupe rate-limits alerts per key: a given alert key (e.g.
// "cert.expiring:utm-1.netcell-it.de") fires at most once per ttl window.
// The state lives in memory only, so a scheduler restart resets it. That is
// acceptable: after a restart the operator should be able to see each
// still-active alert once more.
type dedupe struct {
	ttl  time.Duration
	last map[string]time.Time
}

// newDedupe returns an empty dedupe with the given suppression window.
func newDedupe(ttl time.Duration) *dedupe {
	return &dedupe{
		ttl:  ttl,
		last: make(map[string]time.Time),
	}
}

// shouldFire reports whether an alert for key may be sent now. It returns
// false while the previous accepted call for the same key is younger than
// ttl; otherwise it records the current time for key and returns true.
func (d *dedupe) shouldFire(key string) bool {
	now := time.Now()
	prev, seen := d.last[key]
	if seen && now.Sub(prev) < d.ttl {
		return false
	}
	d.last[key] = now
	return true
}
// runCertExpiryCheck scans the TLS certificates returned by repo.List and
// fires a "cert.expiring" alert for every cert that is close to or past its
// expiry.
//
// A cert is reported when its remaining lifetime is at most 14 days and it
// expired no more than 90 days ago; certs without a NotAfter are skipped.
// Severity is warning and escalates to error below 3 whole days of remaining
// lifetime, which also covers already expired certs. Alerts are deduplicated
// per domain through d so the periodic scheduler tick does not repeat the
// same warning.
//
// The call is a no-op when repo, a or d is nil.
func runCertExpiryCheck(ctx context.Context, repo *tlscerts.Repo,
	a *alerts.Service, d *dedupe) {
	// d is guarded as well: shouldFire dereferences it, and the other
	// scheduler jobs in this file treat a nil dedupe as "do not alert".
	if repo == nil || a == nil || d == nil {
		return
	}
	certs, err := repo.List(ctx)
	if err != nil {
		slog.Warn("scheduler: cert-expiry list failed", "error", err)
		return
	}
	const (
		warnWindow     = 14 * 24 * time.Hour // start alerting below this remaining lifetime
		staleAfter     = 90 * 24 * time.Hour // stop alerting this long after expiry
		errorBelowDays = 3                   // escalate warning -> error below this many days
	)
	now := time.Now()
	for _, c := range certs {
		if c.NotAfter == nil {
			continue
		}
		remain := c.NotAfter.Sub(now)
		if remain > warnWindow || remain < -staleAfter {
			continue
		}
		// NOTE: shouldFire records the key before Fire runs, so a failed
		// Fire still consumes the dedupe window for this domain.
		if !d.shouldFire("cert.expiring:" + c.Domain) {
			continue
		}
		days := int(remain.Hours() / 24) // truncates toward zero
		sev := alerts.SeverityWarning
		if days < errorBelowDays {
			sev = alerts.SeverityError
		}
		expiry := c.NotAfter.Format(time.RFC3339)
		subject := "TLS-Zertifikat läuft ab: " + c.Domain
		message := "Cert für " + c.Domain + " läuft in " + strconv.Itoa(days) + " Tagen ab (" + expiry + "). Renewer-Status: " + c.Status
		if remain < 0 {
			// Already expired: avoid the misleading "expires in -N days".
			subject = "TLS-Zertifikat abgelaufen: " + c.Domain
			message = "Cert für " + c.Domain + " ist seit " + strconv.Itoa(-days) + " Tagen abgelaufen (" + expiry + "). Renewer-Status: " + c.Status
		}
		if _, fireErr := a.Fire(ctx, "cert.expiring", sev, subject, message); fireErr != nil {
			slog.Warn("scheduler: alert fire failed", "error", fireErr)
		}
	}
}
// runConfigHash berechnet den Hash und schreibt ihn in ha_nodes.
// Pool kann nil sein (scheduler-pool-fail beim boot) — dann no-op.
func runConfigHash(ctx context.Context, pool *pgxpoolPool, localID string) {
@@ -149,11 +217,16 @@ func runConfigHash(ctx context.Context, pool *pgxpoolPool, localID string) {
type pgxpoolPool = pgxpool.Pool
// runBackup führt einen scheduled Backup aus + prunet alte. Failures
// loggen wir nur — der Tick läuft morgen wieder, kein Notfall.
func runBackup(ctx context.Context, svc *backup.Service, version string) {
// loggen wir + alarmieren — verlorene Backups sind kritisch.
func runBackup(ctx context.Context, svc *backup.Service, version string, a *alerts.Service) {
res, err := svc.Run(ctx, backup.KindScheduled, version)
if err != nil {
slog.Warn("scheduler: backup failed", "error", err, "file", res.File)
if a != nil {
_, _ = a.Fire(ctx, "backup.failed", alerts.SeverityError,
"Backup fehlgeschlagen",
"Scheduled Backup konnte nicht erstellt werden: "+err.Error())
}
return
}
slog.Info("scheduler: backup done",
@@ -167,8 +240,9 @@ func runBackup(ctx context.Context, svc *backup.Service, version string) {
// runLicenseVerify performs a single re-verify pass. Empty key = no-op
// (box stays in trial), so this is safe to call on every tick.
// Bei valid:false-Antwort + Stand >7d alt → Warnung an Alerts.
func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeyStore,
repo *licsvc.Repo, nodeID string) {
repo *licsvc.Repo, nodeID string, a *alerts.Service, d *dedupe) {
key := ks.Get()
if key == "" {
slog.Debug("scheduler: license verify skipped — no key")
@@ -194,15 +268,36 @@ func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeySto
}
slog.Info("scheduler: license verified",
"status", status, "valid", res.Valid, "expires_at", res.ExpiresAt)
// Alarm bei ungültiger Lizenz (revoked, expired) — dedupe 12h damit
// der Operator nicht alle 24h denselben Alert bekommt.
if a != nil && d != nil && !res.Valid {
if d.shouldFire("license.invalid") {
_, _ = a.Fire(ctx, "license.invalid", alerts.SeverityError,
"License "+status,
"License-Server liefert valid=false. Reason: "+res.Reason)
}
}
}
func runRenewer(ctx context.Context, r *certrenewer.Service) {
// runRenewer executes one certrenewer cycle and reports problems as alerts.
//
// If the cycle itself aborts, the error is logged and a
// "cert.renewer.run_failed" alert is fired. If the cycle completes but at
// least one renewal failed, a "cert.renew_failed" alert carrying the cycle
// counters is fired. Both alerts are gated by d and are skipped entirely when
// a or d is nil. Errors returned by Fire are logged instead of being
// silently dropped.
func runRenewer(ctx context.Context, r *certrenewer.Service, a *alerts.Service, d *dedupe) {
	canAlert := a != nil && d != nil

	res, err := r.Run(ctx)
	if err != nil {
		slog.Error("scheduler: renewer run failed", "error", err)
		if canAlert && d.shouldFire("cert.renewer.run_failed") {
			if _, fireErr := a.Fire(ctx, "cert.renewer.run_failed", alerts.SeverityError,
				"ACME-Renewer-Lauf fehlgeschlagen",
				"Certrenewer-Cycle abgebrochen: "+err.Error()); fireErr != nil {
				slog.Warn("scheduler: alert fire failed", "error", fireErr)
			}
		}
		return
	}

	slog.Info("scheduler: renewer pass complete",
		"checked", res.Checked, "renewed", res.Renewed,
		"failed", res.Failed, "skipped", res.Skipped)

	// shouldFire is only consulted when there is something to report, so a
	// clean cycle does not consume the dedupe window.
	if canAlert && res.Failed > 0 && d.shouldFire("cert.renew_failed") {
		if _, fireErr := a.Fire(ctx, "cert.renew_failed", alerts.SeverityError,
			"Cert-Renewal teilweise fehlgeschlagen",
			fmt.Sprintf("Renewer-Cycle: %d checked, %d renewed, %d failed, %d skipped",
				res.Checked, res.Renewed, res.Failed, res.Skipped)); fireErr != nil {
			slog.Warn("scheduler: alert fire failed", "error", fireErr)
		}
	}
}