// Package backup implementiert Backup + Restore für EdgeGuard. // // Backup-Inhalt (eg-.tar.gz): // // dump.sql — pg_dump --clean --if-exists --no-owner --no-acl // der edgeguard-DB (Schema + Daten). Restore via psql. // files/ — Verbatim-Kopie node-lokaler State-Dateien: // - setup.json (Setup-Wizard-Ergebnis) // - license_key (node-lokale Lizenz) // - .jwt_fingerprint (Session-Signing-Secret) // - acme-account/ (LE-Account + Privkey) // manifest.json — Metadaten: version, ts, hostname, sizes. // // Node-local: ein Backup deckt nur diesen Node ab. In Phase 3-Cluster // machen alle Nodes ihre eigenen Backups; Konfig ist eh aus PG // reproduzierbar. package backup import ( "archive/tar" "compress/gzip" "context" "crypto/sha256" "encoding/hex" "encoding/json" "errors" "fmt" "io" "os" "os/exec" "path/filepath" "sort" "strings" "time" "github.com/jackc/pgx/v5/pgxpool" ) // DefaultDir ist der Speicherort der Backup-Archive auf der Box. const DefaultDir = "/var/backups/edgeguard" // DefaultStateDir ist /var/lib/edgeguard — alle node-lokalen State- // Files leben darunter. const DefaultStateDir = "/var/lib/edgeguard" // DefaultKeepN ist die Retention für scheduled-Backups. Operator kann // das in der UI noch nicht überschreiben — Konvention reicht für v1. const DefaultKeepN = 14 // Kind unterscheidet den Trigger — manual aus UI vs. scheduled aus // dem 24h-Tick im edgeguard-scheduler. type Kind string const ( KindManual Kind = "manual" KindScheduled Kind = "scheduled" ) // Result kommt von Run() zurück und landet auch (success oder failed) // als Row in der backups-Tabelle. type Result struct { ID int64 File string SizeBytes int64 SHA256 string DBDumpBytes int64 FilesBytes int64 StartedAt time.Time FinishedAt time.Time Error error } // Manifest ist der content von manifest.json im tarball. type Manifest struct { Version string `json:"version"` Kind Kind `json:"kind"` Hostname string `json:"hostname"` CreatedAt time.Time `json:"created_at"` DBDumpBytes int64 `json:"db_dump_bytes"` FilesBytes int64 `json:"files_bytes"` } // Service bündelt Backup + Restore + Retention. Stateless — alle // Konfig kommt als Konstruktor-Param + Methode-Param. type Service struct { Pool *pgxpool.Pool BackupDir string StateDir string // PGDumpCmd ist normalerweise "pg_dump" — postinst whitelisted // `sudo -n -u postgres pg_dump …`. Mit dem Override-Hook können // Tests einen fake-binary einschleusen. PGDumpCmd func(ctx context.Context, w io.Writer) (int64, error) NowFn func() time.Time } func New(pool *pgxpool.Pool) *Service { return &Service{ Pool: pool, BackupDir: DefaultDir, StateDir: DefaultStateDir, NowFn: time.Now, } } // Run führt ein Backup aus. Bei Erfolg: tarball auf Disk + Row in DB. // Bei Failure: tarball gelöscht, Row mit status=failed. func (s *Service) Run(ctx context.Context, kind Kind, version string) (*Result, error) { now := s.NowFn().UTC() hostname, _ := os.Hostname() res := &Result{ File: fmt.Sprintf("eg-%s.tar.gz", now.Format("20060102-150405")), StartedAt: now, } if err := os.MkdirAll(s.BackupDir, 0o755); err != nil { return res, fmt.Errorf("mkdir: %w", err) } outPath := filepath.Join(s.BackupDir, res.File) // SHA256 berechnen wir während-write parallel via TeeWriter. f, err := os.OpenFile(outPath, os.O_CREATE|os.O_WRONLY|os.O_EXCL, 0o640) if err != nil { return res, fmt.Errorf("create %s: %w", outPath, err) } hasher := sha256.New() mw := io.MultiWriter(f, hasher) gz := gzip.NewWriter(mw) tw := tar.NewWriter(gz) // 1) dump.sql — pg_dump streamt direkt rein. dumpSize, dumpErr := s.writeDump(ctx, tw) if dumpErr != nil { _ = tw.Close() _ = gz.Close() _ = f.Close() _ = os.Remove(outPath) res.Error = dumpErr res.FinishedAt = s.NowFn().UTC() s.recordFailure(ctx, res, hostname, kind) return res, dumpErr } res.DBDumpBytes = dumpSize // 2) files/ — alles aus /var/lib/edgeguard außer Cache/Tmp. filesSize, filesErr := s.writeFiles(tw) if filesErr != nil { _ = tw.Close() _ = gz.Close() _ = f.Close() _ = os.Remove(outPath) res.Error = filesErr res.FinishedAt = s.NowFn().UTC() s.recordFailure(ctx, res, hostname, kind) return res, filesErr } res.FilesBytes = filesSize // 3) manifest.json man := Manifest{ Version: version, Kind: kind, Hostname: hostname, CreatedAt: now, DBDumpBytes: dumpSize, FilesBytes: filesSize, } manBytes, _ := json.MarshalIndent(man, "", " ") if err := writeTarBlob(tw, "manifest.json", manBytes); err != nil { _ = tw.Close() _ = gz.Close() _ = f.Close() _ = os.Remove(outPath) res.Error = err res.FinishedAt = s.NowFn().UTC() s.recordFailure(ctx, res, hostname, kind) return res, err } if err := tw.Close(); err != nil { _ = gz.Close() _ = f.Close() _ = os.Remove(outPath) res.Error = err s.recordFailure(ctx, res, hostname, kind) return res, err } if err := gz.Close(); err != nil { _ = f.Close() _ = os.Remove(outPath) res.Error = err s.recordFailure(ctx, res, hostname, kind) return res, err } if err := f.Sync(); err != nil { // Nicht fatal — fsync-failure kann passieren bei tmpfs in // Dev, aber der File ist da. } if err := f.Close(); err != nil { s.recordFailure(ctx, res, hostname, kind) return res, err } stat, _ := os.Stat(outPath) if stat != nil { res.SizeBytes = stat.Size() } res.SHA256 = hex.EncodeToString(hasher.Sum(nil)) res.FinishedAt = s.NowFn().UTC() if err := s.recordSuccess(ctx, res, hostname, kind); err != nil { // DB-Insert failed — File haben wir, aber Operator sieht // das Backup nicht in der UI. Lassen wir's stehen mit Log- // Warnung; nächster Scheduled-Run räumt es nicht ab (Retention // arbeitet nur über DB-Rows). return res, fmt.Errorf("db record: %w", err) } return res, nil } // writeDump pipet pg_dump direkt in den tar-Stream als file "dump.sql". // pg_dump läuft via `sudo -n -u postgres pg_dump --clean --if-exists // --no-owner --no-acl edgeguard`. Sudoers-Whitelist in postinst. func (s *Service) writeDump(ctx context.Context, tw *tar.Writer) (int64, error) { if s.PGDumpCmd != nil { var buf bytes size, err := s.PGDumpCmd(ctx, &buf) if err != nil { return 0, err } if err := writeTarBlob(tw, "dump.sql", buf.b); err != nil { return 0, err } return size, nil } cmd := exec.CommandContext(ctx, "sudo", "-n", "-u", "postgres", "/usr/bin/pg_dump", "--clean", "--if-exists", "--no-owner", "--no-acl", "edgeguard") stdout, err := cmd.StdoutPipe() if err != nil { return 0, err } if err := cmd.Start(); err != nil { return 0, fmt.Errorf("pg_dump start: %w", err) } dump, err := io.ReadAll(stdout) if err != nil { _ = cmd.Wait() return 0, fmt.Errorf("pg_dump read: %w", err) } if err := cmd.Wait(); err != nil { return 0, fmt.Errorf("pg_dump: %w", err) } if err := writeTarBlob(tw, "dump.sql", dump); err != nil { return 0, err } return int64(len(dump)), nil } // writeFiles bringt alle relevanten /var/lib/edgeguard-Files unter // files/ in den tar. Bewusste Liste statt rekursiv-everything, // damit wir nicht aus Versehen den fs.cache oder Lockfiles backupen. func (s *Service) writeFiles(tw *tar.Writer) (int64, error) { var total int64 candidates := []string{ "setup.json", "license_key", "license.cache", "trial.json", ".jwt_fingerprint", "node.conf", } for _, rel := range candidates { path := filepath.Join(s.StateDir, rel) data, err := os.ReadFile(path) if err != nil { if errors.Is(err, os.ErrNotExist) { continue // optional-file, skip } return total, fmt.Errorf("read %s: %w", path, err) } if err := writeTarBlob(tw, "files/"+rel, data); err != nil { return total, err } total += int64(len(data)) } // acme-account/ rekursiv (Multi-File-Dir mit LE-Privkey). acmeDir := filepath.Join(s.StateDir, "acme-account") if _, err := os.Stat(acmeDir); err == nil { err := filepath.Walk(acmeDir, func(p string, info os.FileInfo, werr error) error { if werr != nil { return werr } if info.IsDir() { return nil } rel, _ := filepath.Rel(s.StateDir, p) data, rerr := os.ReadFile(p) if rerr != nil { return rerr } if werr := writeTarBlob(tw, "files/"+rel, data); werr != nil { return werr } total += int64(len(data)) return nil }) if err != nil { return total, err } } return total, nil } func writeTarBlob(tw *tar.Writer, name string, data []byte) error { hdr := &tar.Header{ Name: name, Mode: 0o600, Size: int64(len(data)), ModTime: time.Now(), } if err := tw.WriteHeader(hdr); err != nil { return err } _, err := tw.Write(data) return err } // recordSuccess + recordFailure schreiben einen Eintrag in backups. func (s *Service) recordSuccess(ctx context.Context, r *Result, host string, kind Kind) error { if s.Pool == nil { return nil } row := s.Pool.QueryRow(ctx, ` INSERT INTO backups (file, size_bytes, sha256, db_dump_bytes, files_bytes, kind, status, host, started_at, finished_at) VALUES ($1, $2, $3, $4, $5, $6, 'success', $7, $8, $9) RETURNING id`, r.File, r.SizeBytes, r.SHA256, r.DBDumpBytes, r.FilesBytes, string(kind), host, r.StartedAt, r.FinishedAt) return row.Scan(&r.ID) } func (s *Service) recordFailure(ctx context.Context, r *Result, host string, kind Kind) { if s.Pool == nil { return } errStr := "" if r.Error != nil { errStr = r.Error.Error() } _, _ = s.Pool.Exec(ctx, ` INSERT INTO backups (file, size_bytes, sha256, db_dump_bytes, files_bytes, kind, status, error, host, started_at, finished_at) VALUES ($1, 0, '', 0, 0, $2, 'failed', $3, $4, $5, $6) ON CONFLICT (file) DO NOTHING`, r.File, string(kind), errStr, host, r.StartedAt, r.FinishedAt) } // Prune löscht erfolgreiche Backups älter als die letzten keepN. Wird // nach jedem scheduled-Run aufgerufen. Failed-Rows bleiben für die // History. func (s *Service) Prune(ctx context.Context, keepN int) error { if keepN <= 0 { keepN = DefaultKeepN } if s.Pool == nil { return nil } rows, err := s.Pool.Query(ctx, ` SELECT id, file FROM backups WHERE status = 'success' ORDER BY started_at DESC`) if err != nil { return err } type row struct { id int64 file string } var all []row for rows.Next() { var r row if err := rows.Scan(&r.id, &r.file); err != nil { rows.Close() return err } all = append(all, r) } rows.Close() if len(all) <= keepN { return nil } expired := all[keepN:] for _, e := range expired { _ = os.Remove(filepath.Join(s.BackupDir, e.file)) _, _ = s.Pool.Exec(ctx, `DELETE FROM backups WHERE id = $1`, e.id) } return nil } // List gibt alle Backup-Einträge zurück (newest first). func (s *Service) List(ctx context.Context) ([]Entry, error) { if s.Pool == nil { return nil, nil } rows, err := s.Pool.Query(ctx, ` SELECT id, file, size_bytes, sha256, db_dump_bytes, files_bytes, kind, status, COALESCE(error, ''), COALESCE(host, ''), started_at, finished_at FROM backups ORDER BY started_at DESC LIMIT 200`) if err != nil { return nil, err } defer rows.Close() out := []Entry{} for rows.Next() { var e Entry if err := rows.Scan(&e.ID, &e.File, &e.SizeBytes, &e.SHA256, &e.DBDumpBytes, &e.FilesBytes, &e.Kind, &e.Status, &e.Error, &e.Host, &e.StartedAt, &e.FinishedAt); err != nil { return nil, err } out = append(out, e) } return out, rows.Err() } // Entry mirrort einen DB-Row. type Entry struct { ID int64 `json:"id"` File string `json:"file"` SizeBytes int64 `json:"size_bytes"` SHA256 string `json:"sha256"` DBDumpBytes int64 `json:"db_dump_bytes"` FilesBytes int64 `json:"files_bytes"` Kind string `json:"kind"` Status string `json:"status"` Error string `json:"error,omitempty"` Host string `json:"host,omitempty"` StartedAt time.Time `json:"started_at"` FinishedAt time.Time `json:"finished_at"` } // Get gibt einen einzelnen Eintrag + den File-Pfad zurück. func (s *Service) Get(ctx context.Context, id int64) (*Entry, string, error) { if s.Pool == nil { return nil, "", errors.New("no pool") } var e Entry err := s.Pool.QueryRow(ctx, ` SELECT id, file, size_bytes, sha256, db_dump_bytes, files_bytes, kind, status, COALESCE(error, ''), COALESCE(host, ''), started_at, finished_at FROM backups WHERE id = $1`, id).Scan( &e.ID, &e.File, &e.SizeBytes, &e.SHA256, &e.DBDumpBytes, &e.FilesBytes, &e.Kind, &e.Status, &e.Error, &e.Host, &e.StartedAt, &e.FinishedAt) if err != nil { return nil, "", err } return &e, filepath.Join(s.BackupDir, e.File), nil } // Restore startet einen full-system-restore aus einem vorhandenen // Backup-Tarball. Läuft analog `/system/upgrade`-Pattern: wir // schreiben /var/lib/edgeguard/restore.sh und dispatchen es per // `sudo systemd-run --unit=edgeguard-restore.service`. Das Skript // stoppt edgeguard-api+scheduler, kopiert die files/, restored den // DB-Dump als postgres, re-rendert die Configs und startet die // Services neu. // // Returnt sofort nach dem Dispatch (asynchron) — der eigentliche // Restore läuft im Hintergrund. UI pollt /healthz für die // Version-Flip-Detection (analog Upgrade). func (s *Service) Restore(ctx context.Context, id int64) (*Entry, error) { e, path, err := s.Get(ctx, id) if err != nil { return nil, fmt.Errorf("backup not found: %w", err) } if e.Status != "success" { return nil, fmt.Errorf("backup is in status %q — cannot restore", e.Status) } if _, err := os.Stat(path); err != nil { return nil, fmt.Errorf("backup file missing on disk: %w", err) } const scriptPath = "/var/lib/edgeguard/restore.sh" script := fmt.Sprintf(`#!/bin/bash # Generated by edgeguard-api — restore from %s set -e sleep 2 # let API return 202 first TARBALL=%q TMP=/var/lib/edgeguard/restore-tmp echo "[restore] extract $TARBALL → $TMP" rm -rf "$TMP" mkdir -p "$TMP" tar -xzf "$TARBALL" -C "$TMP" # 1) Restore node-local state files BEFORE the DB swap so a crash # mid-restore leaves the box in a state where the next API-start # sees the new keys/setup. DB will be partial but recoverable. echo "[restore] state files" for f in setup.json license_key license.cache trial.json .jwt_fingerprint node.conf; do if [ -f "$TMP/files/$f" ]; then cp -a "$TMP/files/$f" /var/lib/edgeguard/ fi done if [ -d "$TMP/files/acme-account" ]; then mkdir -p /var/lib/edgeguard/acme-account cp -a "$TMP/files/acme-account/." /var/lib/edgeguard/acme-account/ fi chown -R edgeguard:edgeguard /var/lib/edgeguard/setup.json \ /var/lib/edgeguard/license_key /var/lib/edgeguard/license.cache \ /var/lib/edgeguard/trial.json /var/lib/edgeguard/.jwt_fingerprint \ /var/lib/edgeguard/node.conf /var/lib/edgeguard/acme-account 2>/dev/null || true # 2) Stop API+scheduler so psql can DROP/CREATE tables without active # connections fighting the dump-restore. echo "[restore] stop services" systemctl stop edgeguard-api edgeguard-scheduler # 3) Apply DB dump. pg_dump --clean emits DROP TABLE IF EXISTS so # we don't need to wipe the schema manually. echo "[restore] psql -f dump.sql" sudo -u postgres /usr/bin/psql --quiet -d edgeguard -f "$TMP/dump.sql" # 4) Re-render configs from the freshly restored DB. Each renderer # triggers its own service reload — haproxy, nft, etc. so the # user-visible state matches DB-state immediately. echo "[restore] render-config" sudo -u edgeguard /usr/bin/edgeguard-ctl render-config || true # 5) Restart edgeguard-api so the UI's /healthz poll sees version- # flip / fresh connection. Scheduler comes back automatically. echo "[restore] start services" systemctl start edgeguard-api edgeguard-scheduler rm -rf "$TMP" "$0" echo "[restore] complete" `, e.File, path) if err := os.WriteFile(scriptPath, []byte(script), 0o755); err != nil { return nil, fmt.Errorf("write %s: %w", scriptPath, err) } const unitName = "edgeguard-restore.service" _ = exec.Command("sudo", "-n", "/usr/bin/systemctl", "reset-failed", unitName).Run() cmd := exec.Command("sudo", "-n", "/usr/bin/systemd-run", "--unit="+unitName, "--description=EdgeGuard self-restore", "--collect", "bash", scriptPath) if err := cmd.Run(); err != nil { return nil, fmt.Errorf("systemd-run: %w", err) } return e, nil } // Delete entfernt File + DB-Row. func (s *Service) Delete(ctx context.Context, id int64) error { _, path, err := s.Get(ctx, id) if err != nil { return err } if err := os.Remove(path); err != nil && !errors.Is(err, os.ErrNotExist) { return err } _, err = s.Pool.Exec(ctx, `DELETE FROM backups WHERE id = $1`, id) return err } // bytes ist ein tiny io.Writer-Stand-in für den PGDumpCmd-Override // (Tests). Stdlib bytes.Buffer hätte's auch getan, aber das Package // hat einen anderen import-graph. type bytes struct{ b []byte } func (b *bytes) Write(p []byte) (int, error) { b.b = append(b.b, p...) return len(p), nil } // strFold ist ein utility nur zum Defensiv-Check, dass kind ein // erlaubter Wert ist (für die DB-Constraint). func strFold(s string) string { return strings.ToLower(strings.TrimSpace(s)) } // SortByDate sortiert Entries newest-first. Wird nicht direkt benutzt // (DB-Query macht's), aber praktisch wenn der Caller eine eigene // Liste hat. func SortByDate(es []Entry) { sort.Slice(es, func(i, j int) bool { return es[i].StartedAt.After(es[j].StartedAt) }) }