// edgeguard-scheduler runs background jobs that don't belong on the // API request path: // // - ACME cert renewal (every 6h, re-issues anything < 30d to expiry) // // Future jobs (cluster heartbeat, backup, audit-log retention) // hang off the same Tick loop. Stays single-process — no leader // election yet (Phase 3). package main import ( "context" "encoding/json" "fmt" "log/slog" "os" "strconv" "time" "github.com/jackc/pgx/v5/pgxpool" "git.netcell-it.de/projekte/edgeguard-native/internal/cluster" "git.netcell-it.de/projekte/edgeguard-native/internal/database" "git.netcell-it.de/projekte/edgeguard-native/internal/license" "git.netcell-it.de/projekte/edgeguard-native/internal/services/acme" "git.netcell-it.de/projekte/edgeguard-native/internal/services/alerts" "git.netcell-it.de/projekte/edgeguard-native/internal/services/backup" backupremote "git.netcell-it.de/projekte/edgeguard-native/internal/services/backup/remote" "git.netcell-it.de/projekte/edgeguard-native/internal/services/certrenewer" licsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/license" "git.netcell-it.de/projekte/edgeguard-native/internal/services/setup" "git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts" ) var version = "1.0.76" const ( // renewTickInterval — how often we re-evaluate expiring certs. // 6h is enough: LE renewal window is 30 days; missing one tick // makes no difference. Hourly would log too much. renewTickInterval = 6 * time.Hour // certDir matches handlers.NewTLSCertsHandler default — HAProxy // reads from this directory. certDir = "/etc/edgeguard/tls" // licenseTickInterval — daily re-verify against // license.netcell-it.com. Result lands in the licenses table. licenseTickInterval = 24 * time.Hour // backupTickInterval — daily scheduled backup at ~03:00 (Tick // alignment ist approximativ, weil time.Ticker bei Boot startet). // Retention: 14 erfolgreiche Backups (default in backup.Service). backupTickInterval = 24 * time.Hour // configHashTickInterval — alle 5 min config_hash neu berechnen // und in ha_nodes der eigenen Row schreiben. Cluster-UI nutzt // das fürs Drift-Banner — pro-Mutation-Refresh wäre teuer. configHashTickInterval = 5 * time.Minute ) func main() { slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo}))) slog.Info("edgeguard-scheduler starting", "version", version) ctx := context.Background() pool, err := database.Open(ctx, database.ConnStringFromEnv()) if err != nil { slog.Error("scheduler: DB open failed — sleeping forever", "error", err) select {} } defer pool.Close() tlsRepo := tlscerts.New(pool) setupStore := setup.NewStore(setup.DefaultDir) st, _ := setupStore.Load() var renewer *certrenewer.Service if st != nil && st.ACMEEmail != "" { issuer := acme.New(st.ACMEEmail) renewer = certrenewer.New(tlsRepo, issuer, certDir, 30*24*time.Hour) slog.Info("scheduler: ACME renewer enabled", "email", st.ACMEEmail, "tick", renewTickInterval, "threshold", "30d") } else { slog.Warn("scheduler: setup.acme_email empty — ACME renewal disabled until setup wizard ran") } licRepo := licsvc.New(pool) licClient := license.NewClient() licKeyStore := license.NewKeyStore() nodeID := os.Getenv("EDGEGUARD_NODE_ID") slog.Info("scheduler: license re-verify enabled", "tick", licenseTickInterval) backupSvc := backup.New(pool) backupSvc.RemoteUploader = newSchedRemoteAdapter(backupremote.New(pool)) slog.Info("scheduler: daily backup enabled", "tick", backupTickInterval, "dir", backupSvc.BackupDir, "keep_n", backup.DefaultKeepN) alertSvc := alerts.New(pool) alertDedupe := newDedupe(12 * time.Hour) if renewer != nil { runRenewer(ctx, renewer, alertSvc, alertDedupe) } runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe) // Lokale Node-ID für config-hash-refresh. EnsureNodeID liefert // dieselbe ID die die API hat (gleiches /var/lib/edgeguard/node-id). localID, _ := cluster.EnsureNodeID("") slog.Info("scheduler: config-hash refresh enabled", "tick", configHashTickInterval, "node_id", localID) // Initial-Refresh damit /cluster/status nach API+Scheduler-Boot // nicht 5min auf den ersten Wert wartet. runConfigHash(ctx, pool, localID) renewTick := time.NewTicker(renewTickInterval) defer renewTick.Stop() licTick := time.NewTicker(licenseTickInterval) defer licTick.Stop() backupTick := time.NewTicker(backupTickInterval) defer backupTick.Stop() hashTick := time.NewTicker(configHashTickInterval) defer hashTick.Stop() for { select { case <-renewTick.C: if renewer != nil { runRenewer(ctx, renewer, alertSvc, alertDedupe) } runCertExpiryCheck(ctx, tlsRepo, alertSvc, alertDedupe) case <-licTick.C: runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID, alertSvc, alertDedupe) case <-backupTick.C: runBackup(ctx, backupSvc, version, alertSvc) case <-hashTick.C: runConfigHash(ctx, pool, localID) } } } // dedupe verhindert dass derselbe Alert-Key (z.B. "cert.expiring:utm-1.netcell-it.de") // öfter als alle 12h gefeuert wird. In-memory — Scheduler-Restart // resettet, was OK ist (Operator soll bei restart wieder einen kennen- // lernen-Event sehen können). type dedupe struct { ttl time.Duration last map[string]time.Time } func newDedupe(ttl time.Duration) *dedupe { return &dedupe{ttl: ttl, last: map[string]time.Time{}} } func (d *dedupe) shouldFire(key string) bool { now := time.Now() if last, ok := d.last[key]; ok && now.Sub(last) < d.ttl { return false } d.last[key] = now return true } // runCertExpiryCheck prüft tls_certs auf bevorstehende Expiry. Warning // bei <14d Restzeit. Dedupe pro Cert-Name 12h damit der scheduler // nicht alle 6h dieselbe Warnung feuert. func runCertExpiryCheck(ctx context.Context, repo *tlscerts.Repo, a *alerts.Service, d *dedupe) { if repo == nil || a == nil { return } certs, err := repo.List(ctx) if err != nil { slog.Warn("scheduler: cert-expiry list failed", "error", err) return } threshold := 14 * 24 * time.Hour now := time.Now() for _, c := range certs { if c.NotAfter == nil { continue } remain := c.NotAfter.Sub(now) if remain > threshold || remain < -90*24*time.Hour { continue } key := "cert.expiring:" + c.Domain if !d.shouldFire(key) { continue } days := int(remain.Hours() / 24) sev := alerts.SeverityWarning if days < 3 { sev = alerts.SeverityError } _, err := a.Fire(ctx, "cert.expiring", sev, "TLS-Zertifikat läuft ab: "+c.Domain, "Cert für "+c.Domain+" läuft in "+strconv.Itoa(days)+" Tagen ab ("+c.NotAfter.Format(time.RFC3339)+"). Renewer-Status: "+c.Status) if err != nil { slog.Warn("scheduler: alert fire failed", "error", err) } } } // runConfigHash berechnet den Hash und schreibt ihn in ha_nodes. // Pool kann nil sein (scheduler-pool-fail beim boot) — dann no-op. func runConfigHash(ctx context.Context, pool *pgxpoolPool, localID string) { if pool == nil || localID == "" { return } hash, err := cluster.RefreshLocalHash(ctx, pool, localID) if err != nil { slog.Warn("scheduler: config-hash refresh failed", "error", err) return } slog.Debug("scheduler: config-hash refreshed", "hash", hash) } // pgxpoolPool ist ein lokaler Alias damit die Signatur stabil bleibt // wenn wir später den pool austauschen wollen (z.B. read-only-replica). type pgxpoolPool = pgxpool.Pool // runBackup führt einen scheduled Backup aus + prunet alte. Failures // loggen wir + alarmieren — verlorene Backups sind kritisch. func runBackup(ctx context.Context, svc *backup.Service, version string, a *alerts.Service) { res, err := svc.Run(ctx, backup.KindScheduled, version) if err != nil { slog.Warn("scheduler: backup failed", "error", err, "file", res.File) if a != nil { _, _ = a.Fire(ctx, "backup.failed", alerts.SeverityError, "Backup fehlgeschlagen", "Scheduled Backup konnte nicht erstellt werden: "+err.Error()) } return } slog.Info("scheduler: backup done", "file", res.File, "size", res.SizeBytes, "db_bytes", res.DBDumpBytes, "files_bytes", res.FilesBytes, "sha256", res.SHA256) if err := svc.Prune(ctx, backup.DefaultKeepN); err != nil { slog.Warn("scheduler: backup prune failed", "error", err) } } // runLicenseVerify performs a single re-verify pass. Empty key = no-op // (box stays in trial), so this is safe to call on every tick. // Bei valid:false-Antwort + Stand >7d alt → Warnung an Alerts. func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeyStore, repo *licsvc.Repo, nodeID string, a *alerts.Service, d *dedupe) { key := ks.Get() if key == "" { slog.Debug("scheduler: license verify skipped — no key") return } res, err := c.Verify(key) if err != nil { _ = repo.MarkError(ctx, key, err.Error()) slog.Warn("scheduler: license verify failed", "error", err) return } payload, _ := json.Marshal(res) status := "active" if !res.Valid { status = "expired" if res.Status == "revoked" { status = "invalid" } } if err := repo.Upsert(ctx, key, status, res.ExpiresAt, nodeID, 0, payload, ""); err != nil { slog.Warn("scheduler: license db upsert failed", "error", err) return } slog.Info("scheduler: license verified", "status", status, "valid", res.Valid, "expires_at", res.ExpiresAt) // Alarm bei ungültiger Lizenz (revoked, expired) — dedupe 12h damit // der Operator nicht alle 24h denselben Alert bekommt. if a != nil && d != nil && !res.Valid { if d.shouldFire("license.invalid") { _, _ = a.Fire(ctx, "license.invalid", alerts.SeverityError, "License "+status, "License-Server liefert valid=false. Reason: "+res.Reason) } } } func runRenewer(ctx context.Context, r *certrenewer.Service, a *alerts.Service, d *dedupe) { res, err := r.Run(ctx) if err != nil { slog.Error("scheduler: renewer run failed", "error", err) if a != nil && d != nil && d.shouldFire("cert.renewer.run_failed") { _, _ = a.Fire(ctx, "cert.renewer.run_failed", alerts.SeverityError, "ACME-Renewer-Lauf fehlgeschlagen", "Certrenewer-Cycle abgebrochen: "+err.Error()) } return } slog.Info("scheduler: renewer pass complete", "checked", res.Checked, "renewed", res.Renewed, "failed", res.Failed, "skipped", res.Skipped) if a != nil && res.Failed > 0 && d != nil && d.shouldFire("cert.renew_failed") { _, _ = a.Fire(ctx, "cert.renew_failed", alerts.SeverityError, "Cert-Renewal teilweise fehlgeschlagen", fmt.Sprintf("Renewer-Cycle: %d checked, %d renewed, %d failed, %d skipped", res.Checked, res.Renewed, res.Failed, res.Skipped)) } } // schedRemoteAdapter ist die scheduler-seitige Kopie des Adapters // aus edgeguard-api — gleicher Code, separater Type damit kein // Cross-Binary-Import nötig wird. type schedRemoteAdapter struct{ s *backupremote.Service } func newSchedRemoteAdapter(s *backupremote.Service) backup.RemoteUploader { return schedRemoteAdapter{s: s} } func (a schedRemoteAdapter) UploadAll(ctx context.Context, localPath string) ([]backup.RemoteUploadInfo, error) { res, err := a.s.UploadAll(ctx, localPath) out := make([]backup.RemoteUploadInfo, len(res)) for i, r := range res { out[i] = backup.RemoteUploadInfo{ RemoteID: r.RemoteID, RemoteName: r.RemoteName, OK: r.OK, SizeBytes: r.SizeBytes, DurationMs: r.DurationMs, Error: r.Error, } } return out, err }