Files
edgeguard-native/cmd/edgeguard-scheduler/main.go
Debian 571f51ba9a feat(backup): pg_dump + state-tarball + daily auto + UI
Production-Box braucht Backups — bisher keine. Jetzt komplette
Pipeline:

Backend (internal/services/backup/):
  - Output: /var/backups/edgeguard/eg-YYYYMMDD-HHMMSS.tar.gz
  - Inhalt: dump.sql (pg_dump --clean --if-exists --no-owner --no-acl),
    files/setup.json, files/license_key, files/license.cache,
    files/.jwt_fingerprint, files/node.conf, files/acme-account/* +
    manifest.json (Version, kind, hostname, sizes)
  - sha256 während-write via TeeWriter, Size + sha in backups-DB-Row
  - Failure-Path: row mit status=failed + error, kein orphan-tarball
  - Prune(keepN=14) löscht erfolgreiche Backups älter als die letzten N

Migration 0018: backups(id, file, size, sha256, db/files bytes, kind,
status, error, host, started/finished).

Scheduler (cmd/edgeguard-scheduler):
  - 24h-Tick → backup.Run(KindScheduled) + Prune. Beim Boot wird ein
    initialer Backup NICHT sofort gezogen (kein nervöses Spam),
    sondern erst beim nächsten 24h-Slot.

REST (internal/handlers/backup.go):
  GET    /api/v1/backups              — list (newest first)
  POST   /api/v1/backups              — trigger manual (sync, audit'ed)
  GET    /api/v1/backups/:id          — single
  GET    /api/v1/backups/:id/download — sendfile tar.gz
  DELETE /api/v1/backups/:id          — entferne file + row

UI (management-ui/src/pages/Backups):
  - Liste mit Time, File+sha (first 16), Kind-Tag, Status, Size (mit
    DB + Files Aufschlüsselung), Dauer
  - „Backup jetzt erstellen" Button, Refresh, Download, Delete
  - Auto-Refresh 30s
  - Sidebar-Eintrag „Backups" unter System

postinst:
  - /var/backups/edgeguard 0750 edgeguard:edgeguard (enthält sensitive
    pg_dump + license_key → NICHT world-readable)
  - sudoers-Whitelist `sudo -u postgres /usr/bin/pg_dump --clean
    --if-exists --no-owner --no-acl edgeguard` — exakte Form

Verifiziert auf der Box: backups-Tabelle existiert, scheduler logged
„backup enabled tick=24h dir=/var/backups/edgeguard keep_n=14",
pg_dump-via-sudoers liefert 2808 lines.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 23:08:18 +02:00

171 lines
5.5 KiB
Go

// edgeguard-scheduler runs background jobs that don't belong on the
// API request path:
//
// - ACME cert renewal (every 6h, re-issues anything < 30d to expiry)
//
// Future jobs (cluster heartbeat, backup, audit-log retention)
// hang off the same Tick loop. Stays single-process — no leader
// election yet (Phase 3).
package main
import (
"context"
"encoding/json"
"log/slog"
"os"
"time"
"git.netcell-it.de/projekte/edgeguard-native/internal/database"
"git.netcell-it.de/projekte/edgeguard-native/internal/license"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/acme"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backup"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/certrenewer"
licsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/license"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/setup"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts"
)
var version = "1.0.64"
const (
// renewTickInterval — how often we re-evaluate expiring certs.
// 6h is enough: LE renewal window is 30 days; missing one tick
// makes no difference. Hourly would log too much.
renewTickInterval = 6 * time.Hour
// certDir matches handlers.NewTLSCertsHandler default — HAProxy
// reads from this directory.
certDir = "/etc/edgeguard/tls"
// licenseTickInterval — daily re-verify against
// license.netcell-it.com. Result lands in the licenses table.
licenseTickInterval = 24 * time.Hour
// backupTickInterval — daily scheduled backup at ~03:00 (Tick
// alignment ist approximativ, weil time.Ticker bei Boot startet).
// Retention: 14 erfolgreiche Backups (default in backup.Service).
backupTickInterval = 24 * time.Hour
)
func main() {
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})))
slog.Info("edgeguard-scheduler starting", "version", version)
ctx := context.Background()
pool, err := database.Open(ctx, database.ConnStringFromEnv())
if err != nil {
slog.Error("scheduler: DB open failed — sleeping forever", "error", err)
select {}
}
defer pool.Close()
tlsRepo := tlscerts.New(pool)
setupStore := setup.NewStore(setup.DefaultDir)
st, _ := setupStore.Load()
var renewer *certrenewer.Service
if st != nil && st.ACMEEmail != "" {
issuer := acme.New(st.ACMEEmail)
renewer = certrenewer.New(tlsRepo, issuer, certDir, 30*24*time.Hour)
slog.Info("scheduler: ACME renewer enabled",
"email", st.ACMEEmail, "tick", renewTickInterval, "threshold", "30d")
} else {
slog.Warn("scheduler: setup.acme_email empty — ACME renewal disabled until setup wizard ran")
}
licRepo := licsvc.New(pool)
licClient := license.NewClient()
licKeyStore := license.NewKeyStore()
nodeID := os.Getenv("EDGEGUARD_NODE_ID")
slog.Info("scheduler: license re-verify enabled", "tick", licenseTickInterval)
backupSvc := backup.New(pool)
slog.Info("scheduler: daily backup enabled", "tick", backupTickInterval,
"dir", backupSvc.BackupDir, "keep_n", backup.DefaultKeepN)
if renewer != nil {
runRenewer(ctx, renewer)
}
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
renewTick := time.NewTicker(renewTickInterval)
defer renewTick.Stop()
licTick := time.NewTicker(licenseTickInterval)
defer licTick.Stop()
backupTick := time.NewTicker(backupTickInterval)
defer backupTick.Stop()
for {
select {
case <-renewTick.C:
if renewer != nil {
runRenewer(ctx, renewer)
}
case <-licTick.C:
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
case <-backupTick.C:
runBackup(ctx, backupSvc, version)
}
}
}
// runBackup führt einen scheduled Backup aus + prunet alte. Failures
// loggen wir nur — der Tick läuft morgen wieder, kein Notfall.
func runBackup(ctx context.Context, svc *backup.Service, version string) {
res, err := svc.Run(ctx, backup.KindScheduled, version)
if err != nil {
slog.Warn("scheduler: backup failed", "error", err, "file", res.File)
return
}
slog.Info("scheduler: backup done",
"file", res.File, "size", res.SizeBytes,
"db_bytes", res.DBDumpBytes, "files_bytes", res.FilesBytes,
"sha256", res.SHA256)
if err := svc.Prune(ctx, backup.DefaultKeepN); err != nil {
slog.Warn("scheduler: backup prune failed", "error", err)
}
}
// runLicenseVerify performs a single re-verify pass. Empty key = no-op
// (box stays in trial), so this is safe to call on every tick.
func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeyStore,
repo *licsvc.Repo, nodeID string) {
key := ks.Get()
if key == "" {
slog.Debug("scheduler: license verify skipped — no key")
return
}
res, err := c.Verify(key)
if err != nil {
_ = repo.MarkError(ctx, key, err.Error())
slog.Warn("scheduler: license verify failed", "error", err)
return
}
payload, _ := json.Marshal(res)
status := "active"
if !res.Valid {
status = "expired"
if res.Status == "revoked" {
status = "invalid"
}
}
if err := repo.Upsert(ctx, key, status, res.ExpiresAt, nodeID, 0, payload, ""); err != nil {
slog.Warn("scheduler: license db upsert failed", "error", err)
return
}
slog.Info("scheduler: license verified",
"status", status, "valid", res.Valid, "expires_at", res.ExpiresAt)
}
func runRenewer(ctx context.Context, r *certrenewer.Service) {
res, err := r.Run(ctx)
if err != nil {
slog.Error("scheduler: renewer run failed", "error", err)
return
}
slog.Info("scheduler: renewer pass complete",
"checked", res.Checked, "renewed", res.Renewed,
"failed", res.Failed, "skipped", res.Skipped)
}