Files
edgeguard-native/cmd/edgeguard-scheduler/main.go
Debian 27ac7b53fc feat(backup): Off-Site-Upload nach S3 + SFTP
Schutz gegen Box-Total-Loss — lokale Backups in /var/backups/edgeguard
helfen nicht, wenn die Disk stirbt oder die Box brennt. Nach jedem
erfolgreichen lokalen Backup wird die tar.gz an alle aktiven
Off-Site-Ziele hochgeladen.

Migration 0022: backup_remotes (kind=s3|sftp, target_url, settings
JSONB, active, last_upload_at, last_error) + backups.remote_uploads
JSONB (per-Target-Result).

internal/services/backup/remote/:
  - UploadAll() — pro aktivem Target ein Upload, Failures non-fatal
  - S3 via minio-go/v7 — funktioniert mit AWS, MinIO, Backblaze B2,
    Cloudflare R2, Hetzner Object Storage (alle S3-API-kompatibel)
  - SFTP via golang.org/x/crypto/ssh + pkg/sftp. Password + Private-
    Key (OpenSSH, base64-encoded) als Auth. Optional host_key_
    fingerprint-Pinning (SHA256:...); leer = TOFU (unsicher vs MitM,
    OK für initial setup).
  - Test() lädt eine 1KB-Probe + löscht sie wieder — Operator-UI hat
    einen „Verbindung testen"-Button.

backup.Service.RemoteUploader-Interface: nach erfolgreichem
recordSuccess() läuft UploadAll, Results landen in backups.remote_
uploads JSONB. last_upload_at/last_error in backup_remotes pro Target
gepflegt. API + Scheduler injizieren beide den Adapter.

internal/handlers/backup_remotes.go: CRUD + POST /:id/test. Sensitive
Felder (secret_key, password, private_key) werden in GET-Responses
durch ***SET*** maskiert; UpdateChannel merged das zurück damit der
Operator bei Edit ohne Re-Eingabe speichern kann.

UI: Backups-Page jetzt mit Tabs "Sicherungen" + "Off-Site-Ziele".
Tab 2 hat CRUD-Tabelle mit kind-konditionalem Form (S3-Felder oder
SFTP-Felder), Test-Button pro Row, last_upload-Status mit FAIL-Tag
bei Errors.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 18:49:02 +02:00

330 lines
11 KiB
Go

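// edgeguard-scheduler runs background jobs that don't belong on the
// API request path:
//
// - ACME cert renewal (every 6h, re-issues anything < 30d to expiry)
// - TLS cert expiry alerts (same 6h tick, fires below 14d remaining)
// - License re-verify (every 24h)
// - Scheduled backup incl. off-site upload (every 24h)
// - config_hash refresh for the local ha_nodes row (every 5 min)
//
// Future jobs (cluster heartbeat, audit-log retention) hang off the
// same tick loop. Stays single-process — no leader election yet
// (Phase 3).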
package main
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"strconv"
"time"
"github.com/jackc/pgx/v5/pgxpool"
"git.netcell-it.de/projekte/edgeguard-native/internal/cluster"
"git.netcell-it.de/projekte/edgeguard-native/internal/database"
"git.netcell-it.de/projekte/edgeguard-native/internal/license"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/acme"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/alerts"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backup"
backupremote "git.netcell-it.de/projekte/edgeguard-native/internal/services/backup/remote"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/certrenewer"
licsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/license"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/setup"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts"
)
// version is the scheduler version string; it is logged at startup and
// passed to every scheduled backup run. Declared as a var, presumably so
// the build can override it via -ldflags — TODO confirm.
var version = "1.0.75"
const (
// renewTickInterval — how often we re-evaluate expiring certs.
// 6h is enough: LE renewal window is 30 days; missing one tick
// makes no difference. Hourly would log too much.
renewTickInterval = 6 * time.Hour
// certDir matches handlers.NewTLSCertsHandler default — HAProxy
// reads from this directory.
certDir = "/etc/edgeguard/tls"
// licenseTickInterval — daily re-verify against
// license.netcell-it.com. Result lands in the licenses table.
licenseTickInterval = 24 * time.Hour
// backupTickInterval — one scheduled backup per day. The ticker is
// created at process start, so runs happen 24h after boot rather than
// at a fixed wall-clock time (the ~03:00 target is only approximate).
// Retention: backup.DefaultKeepN successful backups (14 according to
// the original note — TODO confirm against backup.Service).
backupTickInterval = 24 * time.Hour
// configHashTickInterval — recompute config_hash every 5 minutes and
// write it to this node's own row in ha_nodes. The cluster UI uses it
// for the drift banner; refreshing on every mutation would be costly.
configHashTickInterval = 5 * time.Minute
)
// main wires up all scheduler services and then blocks forever in the
// tick loop. It never returns on the happy path.
//
// Startup sequence:
//  1. open the database pool (on failure the process parks forever),
//  2. build the ACME renewer if the setup state carries an ACME e-mail,
//  3. build the license, backup and alert services,
//  4. run one immediate renewer pass, license verify and config-hash
//     refresh so the first results do not have to wait a full tick,
//  5. start one ticker per job and dispatch on whichever fires.
//
// All jobs run sequentially on the main goroutine, so a slow job delays
// the others.
func main() {
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})))
	slog.Info("edgeguard-scheduler starting", "version", version)

	ctx := context.Background()
	pool, err := database.Open(ctx, database.ConnStringFromEnv())
	if err != nil {
		// Park instead of exiting — presumably to avoid a restart loop
		// under the service manager (TODO confirm).
		slog.Error("scheduler: DB open failed — sleeping forever", "error", err)
		select {}
	}
	defer pool.Close()

	tlsRepo := tlscerts.New(pool)

	// The setup state is read once at startup. A load error used to be
	// dropped silently, which made "ACME disabled" indistinguishable from
	// "setup file unreadable"; log it so the operator can tell them apart.
	setupStore := setup.NewStore(setup.DefaultDir)
	st, loadErr := setupStore.Load()
	if loadErr != nil {
		slog.Warn("scheduler: setup state load failed", "error", loadErr)
	}

	var renewer *certrenewer.Service
	if st != nil && st.ACMEEmail != "" {
		issuer := acme.New(st.ACMEEmail)
		renewer = certrenewer.New(tlsRepo, issuer, certDir, 30*24*time.Hour)
		slog.Info("scheduler: ACME renewer enabled",
			"email", st.ACMEEmail, "tick", renewTickInterval, "threshold", "30d")
	} else {
		slog.Warn("scheduler: setup.acme_email empty — ACME renewal disabled until setup wizard ran")
	}

	licRepo := licsvc.New(pool)
	licClient := license.NewClient()
	licKeyStore := license.NewKeyStore()
	nodeID := os.Getenv("EDGEGUARD_NODE_ID")
	slog.Info("scheduler: license re-verify enabled", "tick", licenseTickInterval)

	backupSvc := backup.New(pool)
	backupSvc.RemoteUploader = newSchedRemoteAdapter(backupremote.New(pool))
	slog.Info("scheduler: daily backup enabled", "tick", backupTickInterval,
		"dir", backupSvc.BackupDir, "keep_n", backup.DefaultKeepN)

	alertSvc := alerts.New(pool)
	alertDedupe := newDedupe(12 * time.Hour)

	// Immediate first pass for the jobs whose result is needed right
	// after boot.
	if renewer != nil {
		runRenewer(ctx, renewer, alertSvc, alertDedupe)
	}
	runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe)

	// Local node ID for the config-hash refresh. EnsureNodeID is expected
	// to return the same ID the API uses (same
	// /var/lib/edgeguard/node-id file, per the original note). An empty ID
	// turns runConfigHash into a no-op, so surface the reason.
	localID, idErr := cluster.EnsureNodeID("")
	if idErr != nil {
		slog.Warn("scheduler: node-id lookup failed — config-hash refresh will be skipped", "error", idErr)
	}
	slog.Info("scheduler: config-hash refresh enabled", "tick", configHashTickInterval, "node_id", localID)

	// Initial refresh so /cluster/status does not wait 5 minutes for the
	// first value after API + scheduler boot.
	runConfigHash(ctx, pool, localID)

	renewTick := time.NewTicker(renewTickInterval)
	defer renewTick.Stop()
	licTick := time.NewTicker(licenseTickInterval)
	defer licTick.Stop()
	backupTick := time.NewTicker(backupTickInterval)
	defer backupTick.Stop()
	hashTick := time.NewTicker(configHashTickInterval)
	defer hashTick.Stop()

	for {
		select {
		case <-renewTick.C:
			if renewer != nil {
				runRenewer(ctx, renewer, alertSvc, alertDedupe)
			}
			// The expiry check runs even without a renewer so manually
			// managed certs still raise alerts.
			runCertExpiryCheck(ctx, tlsRepo, alertSvc, alertDedupe)
		case <-licTick.C:
			runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe)
		case <-backupTick.C:
			runBackup(ctx, backupSvc, version, alertSvc)
		case <-hashTick.C:
			runConfigHash(ctx, pool, localID)
		}
	}
}
// dedupe keeps the same alert key (e.g.
// "cert.expiring:utm-1.netcell-it.de") from firing more often than once
// per ttl. State lives in memory only, so a scheduler restart resets it;
// that is acceptable because the operator should be able to see the
// event again after a restart.
//
// It is not safe for concurrent use; the map is accessed without a lock.
type dedupe struct {
	ttl  time.Duration        // minimum gap between two fires of one key
	last map[string]time.Time // key -> time of the most recent fire
}

// newDedupe returns an empty dedupe with the given suppression window.
func newDedupe(ttl time.Duration) *dedupe {
	return &dedupe{
		ttl:  ttl,
		last: make(map[string]time.Time),
	}
}

// shouldFire reports whether an alert for key may be sent now. It returns
// true for an unseen key or when at least ttl has elapsed since the last
// recorded fire, and in that case records the current time for key.
func (d *dedupe) shouldFire(key string) bool {
	now := time.Now()
	if prev, seen := d.last[key]; !seen || now.Sub(prev) >= d.ttl {
		d.last[key] = now
		return true
	}
	return false
}
// runCertExpiryCheck lists all certs from repo and fires a
// "cert.expiring" alert for every cert that has at most 14 days left or
// that expired no longer than 90 days ago. Certs without a NotAfter are
// skipped.
//
// Severity is Warning, raised to Error once fewer than 3 whole days
// remain (which includes already expired certs). Alerts are deduplicated
// per domain through d so the 6h tick does not repeat the same warning.
//
// The function is a no-op when repo, a or d is nil. List and Fire
// failures are logged and otherwise ignored.
func runCertExpiryCheck(ctx context.Context, repo *tlscerts.Repo,
	a *alerts.Service, d *dedupe) {
	// d is dereferenced inside the loop; guard it like the sibling jobs do
	// so a missing dedupe cannot panic the scheduler.
	if repo == nil || a == nil || d == nil {
		return
	}
	certs, err := repo.List(ctx)
	if err != nil {
		slog.Warn("scheduler: cert-expiry list failed", "error", err)
		return
	}

	const (
		warnWindow = 14 * 24 * time.Hour // alert once this close to expiry
		staleAfter = 90 * 24 * time.Hour // stop alerting this long after expiry
	)
	now := time.Now()
	for _, c := range certs {
		if c.NotAfter == nil {
			continue
		}
		remain := c.NotAfter.Sub(now)
		if remain > warnWindow || remain < -staleAfter {
			continue
		}
		if !d.shouldFire("cert.expiring:" + c.Domain) {
			continue
		}

		// Whole days, truncated toward zero; negative once expired.
		days := int(remain.Hours() / 24)
		sev := alerts.SeverityWarning
		if days < 3 {
			sev = alerts.SeverityError
		}

		expiry := c.NotAfter.Format(time.RFC3339)
		msg := "Cert für " + c.Domain + " läuft in " + strconv.Itoa(days) + " Tagen ab (" + expiry + "). Renewer-Status: " + c.Status
		if remain < 0 {
			// Avoid the nonsensical "läuft in -N Tagen ab" for certs that
			// are already past their expiry.
			msg = "Cert für " + c.Domain + " ist seit " + strconv.Itoa(-days) + " Tagen abgelaufen (" + expiry + "). Renewer-Status: " + c.Status
		}

		if _, err := a.Fire(ctx, "cert.expiring", sev,
			"TLS-Zertifikat läuft ab: "+c.Domain, msg); err != nil {
			slog.Warn("scheduler: alert fire failed", "error", err)
		}
	}
}
// runConfigHash recomputes the local config hash via
// cluster.RefreshLocalHash, which per the original note writes it to
// ha_nodes. A nil pool or an empty localID makes the call a no-op.
// Failures are logged at Warn level; success is logged at Debug level.
func runConfigHash(ctx context.Context, pool *pgxpoolPool, localID string) {
	switch {
	case pool == nil, localID == "":
		// Nothing to refresh without a database or a node identity.
		return
	}

	if hash, err := cluster.RefreshLocalHash(ctx, pool, localID); err != nil {
		slog.Warn("scheduler: config-hash refresh failed", "error", err)
	} else {
		slog.Debug("scheduler: config-hash refreshed", "hash", hash)
	}
}
// pgxpoolPool is a local alias for pgxpool.Pool, kept so that function
// signatures stay stable if the pool is swapped out later (e.g. for a
// read-only replica).
type pgxpoolPool = pgxpool.Pool
// runBackup performs one scheduled backup and, on success, prunes old
// backups down to backup.DefaultKeepN.
//
// A failed backup is logged and raises a "backup.failed" alert with
// severity Error (when a is non-nil), because lost backups are critical.
// A failure to deliver that alert is logged as well instead of being
// dropped silently. Prune failures are logged only.
func runBackup(ctx context.Context, svc *backup.Service, version string, a *alerts.Service) {
	res, err := svc.Run(ctx, backup.KindScheduled, version)
	if err != nil {
		// NOTE(review): res is read on the error path; this assumes
		// svc.Run returns a usable result even when it fails — confirm.
		slog.Warn("scheduler: backup failed", "error", err, "file", res.File)
		if a != nil {
			if _, fireErr := a.Fire(ctx, "backup.failed", alerts.SeverityError,
				"Backup fehlgeschlagen",
				"Scheduled Backup konnte nicht erstellt werden: "+err.Error()); fireErr != nil {
				slog.Warn("scheduler: alert fire failed", "error", fireErr)
			}
		}
		return
	}

	slog.Info("scheduler: backup done",
		"file", res.File, "size", res.SizeBytes,
		"db_bytes", res.DBDumpBytes, "files_bytes", res.FilesBytes,
		"sha256", res.SHA256)

	if err := svc.Prune(ctx, backup.DefaultKeepN); err != nil {
		slog.Warn("scheduler: backup prune failed", "error", err)
	}
}
// runLicenseVerify performs a single license re-verify pass. An empty
// key is a no-op (per the original note the box then stays in trial), so
// the function is safe to call on every tick.
//
// Flow:
//   - Verify fails: the error is stored via repo.MarkError and logged.
//   - Verify succeeds: the response is mapped to a status ("active",
//     "expired", or "invalid" when the server reports "revoked") and
//     upserted together with the JSON-encoded response.
//   - The response says valid=false: a "license.invalid" alert with
//     severity Error is fired, deduplicated through d. No alert is sent
//     when a or d is nil.
//
// Errors from MarkError, json.Marshal and Fire are logged rather than
// discarded; none of them aborts the pass.
func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeyStore,
	repo *licsvc.Repo, nodeID string, a *alerts.Service, d *dedupe) {
	key := ks.Get()
	if key == "" {
		slog.Debug("scheduler: license verify skipped — no key")
		return
	}

	res, err := c.Verify(key)
	if err != nil {
		if markErr := repo.MarkError(ctx, key, err.Error()); markErr != nil {
			slog.Warn("scheduler: license mark-error failed", "error", markErr)
		}
		slog.Warn("scheduler: license verify failed", "error", err)
		return
	}

	// A marshal failure leaves payload nil; the upsert still runs so the
	// status itself is not lost.
	payload, err := json.Marshal(res)
	if err != nil {
		slog.Warn("scheduler: license payload marshal failed", "error", err)
	}

	status := "active"
	if !res.Valid {
		status = "expired"
		if res.Status == "revoked" {
			status = "invalid"
		}
	}

	if err := repo.Upsert(ctx, key, status, res.ExpiresAt, nodeID, 0, payload, ""); err != nil {
		slog.Warn("scheduler: license db upsert failed", "error", err)
		return
	}
	slog.Info("scheduler: license verified",
		"status", status, "valid", res.Valid, "expires_at", res.ExpiresAt)

	// Alert on an invalid license (revoked, expired). The dedupe only
	// suppresses repeats that fall inside its window, e.g. the startup
	// pass followed shortly by a tick.
	if a == nil || d == nil || res.Valid {
		return
	}
	if !d.shouldFire("license.invalid") {
		return
	}
	if _, err := a.Fire(ctx, "license.invalid", alerts.SeverityError,
		"License "+status,
		"License-Server liefert valid=false. Reason: "+res.Reason); err != nil {
		slog.Warn("scheduler: alert fire failed", "error", err)
	}
}
// runRenewer executes one pass of the ACME cert renewer and reports the
// outcome.
//
//   - The pass itself fails: the error is logged and a
//     "cert.renewer.run_failed" alert (severity Error) is fired.
//   - The pass completes with at least one failed cert: a
//     "cert.renew_failed" alert (severity Error) with the pass counters
//     is fired.
//
// Both alerts are deduplicated through d and are skipped entirely when a
// or d is nil. A failure to deliver an alert is logged instead of being
// dropped silently.
func runRenewer(ctx context.Context, r *certrenewer.Service, a *alerts.Service, d *dedupe) {
	res, err := r.Run(ctx)
	if err != nil {
		slog.Error("scheduler: renewer run failed", "error", err)
		if a != nil && d != nil && d.shouldFire("cert.renewer.run_failed") {
			if _, fireErr := a.Fire(ctx, "cert.renewer.run_failed", alerts.SeverityError,
				"ACME-Renewer-Lauf fehlgeschlagen",
				"Certrenewer-Cycle abgebrochen: "+err.Error()); fireErr != nil {
				slog.Warn("scheduler: alert fire failed", "error", fireErr)
			}
		}
		return
	}

	slog.Info("scheduler: renewer pass complete",
		"checked", res.Checked, "renewed", res.Renewed,
		"failed", res.Failed, "skipped", res.Skipped)

	if a != nil && res.Failed > 0 && d != nil && d.shouldFire("cert.renew_failed") {
		if _, fireErr := a.Fire(ctx, "cert.renew_failed", alerts.SeverityError,
			"Cert-Renewal teilweise fehlgeschlagen",
			fmt.Sprintf("Renewer-Cycle: %d checked, %d renewed, %d failed, %d skipped",
				res.Checked, res.Renewed, res.Failed, res.Skipped)); fireErr != nil {
			slog.Warn("scheduler: alert fire failed", "error", fireErr)
		}
	}
}
// schedRemoteAdapter is the scheduler-side copy of the adapter that,
// per the original note, also exists in edgeguard-api: same code, but a
// separate type so that no cross-binary import is needed. It adapts
// *backupremote.Service to the backup.RemoteUploader interface.
type schedRemoteAdapter struct {
	s *backupremote.Service
}

// newSchedRemoteAdapter wraps s so it can be assigned to
// backup.Service.RemoteUploader.
func newSchedRemoteAdapter(s *backupremote.Service) backup.RemoteUploader {
	adapter := schedRemoteAdapter{s: s}
	return adapter
}

// UploadAll delegates to the wrapped service and converts every result
// into a backup.RemoteUploadInfo, field by field. The returned slice is
// always non-nil and has one entry per upstream result; the upstream
// error is passed through unchanged alongside it.
func (a schedRemoteAdapter) UploadAll(ctx context.Context, localPath string) ([]backup.RemoteUploadInfo, error) {
	results, err := a.s.UploadAll(ctx, localPath)

	infos := make([]backup.RemoteUploadInfo, 0, len(results))
	for _, item := range results {
		var info backup.RemoteUploadInfo
		info.RemoteID = item.RemoteID
		info.RemoteName = item.RemoteName
		info.OK = item.OK
		info.SizeBytes = item.SizeBytes
		info.DurationMs = item.DurationMs
		info.Error = item.Error
		infos = append(infos, info)
	}
	return infos, err
}