Code-Vorbereitung für Multi-Node, ohne dass eine zweite Box nötig ist.
Single-Node-Mode bleibt der Default; alles Neue ist bereits vorhanden und
wird sichtbar, sobald ein zweiter Knoten dem Cluster beitritt (Join folgt
in Phase 3.2).
Migration 0020:
  ha_nodes += version     (edgeguard-api-Version)
              config_hash (Drift-Detection-Hash)
              mgmt_ip     (Management-IP, niemals VIP)
              status      (online|offline|joining|leaving|unknown)
internal/cluster/local_config.go:
/etc/edgeguard/node.conf — INI-style, node-lokale Identität:
NODE_ID, HOSTNAME, MGMT_IP, ROLE, PEER_HOSTS. NIEMALS zwischen
Cluster-Peers replizieren. LoadLocalConfig / SaveLocalConfig /
EnsureLocalConfig (auto-Generierung beim ersten Boot).
MgmtIP-Default = firstNonLoopbackIPv4(); Operator kann
überschreiben (mehrere Interfaces).
internal/cluster/store.go:
- HANode-Model um die 4 neuen Felder erweitert
- UpsertSelf nimmt jetzt mgmt_ip/version/config_hash/status; COALESCE
  erhält die bestehenden Werte, wenn der Caller sie nicht setzt
- EnsureSelfRegistered-Signatur: + role + version-Argument
internal/handlers/cluster.go:
GET /api/v1/cluster/status — strukturierter Endpoint:
{local_id, local_node, peers[], mode, health, drift_found, updated_at}
GET /api/v1/cluster/nodes bleibt für Tools.
UI (pages/Cluster):
- Header zeigt Mode-Tag (Single-Node / Cluster) + Health-Tag (OK /
degraded / split-brain)
- Self-Card: Descriptions mit FQDN, Node-ID, Status, Role, Version,
MGMT-IP, API-URL, Config-Hash
- Peers-Tabelle nur wenn vorhanden, mit "drift"-Marker pro Row
- Drift-Alert-Banner wenn ein Peer einen anderen config_hash hat
- Single-Node-Mode Hinweis-Alert ("cluster-join kommt in 3.2")
postinst: Ein leeres /etc/edgeguard/node.conf wird angelegt (chown
edgeguard); die API befüllt es beim ersten Boot automatisch.
main.go ruft EnsureLocalConfig + EnsureSelfRegistered mit version.
Verifiziert auf der Box (1.0.70):
- /etc/edgeguard/node.conf hat NODE_ID, HOSTNAME, MGMT_IP=89.163.205.6,
ROLE=primary
- ha_nodes-Row: status=online, version=1.0.70, mgmt_ip=89.163.205.6
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
171 lines
5.5 KiB
Go
171 lines
5.5 KiB
Go
// edgeguard-scheduler runs background jobs that don't belong on the
// API request path:
//
//   - ACME cert renewal (every 6h, re-issues anything < 30d to expiry)
//   - license re-verification (every 24h)
//   - scheduled backup plus pruning of old backups (every 24h)
//
// Future jobs (cluster heartbeat, audit-log retention) hang off the
// same Tick loop. Stays single-process — no leader election yet
// (Phase 3).
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"log/slog"
|
|
"os"
|
|
"time"
|
|
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/database"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/license"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/services/acme"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/services/backup"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/services/certrenewer"
|
|
licsvc "git.netcell-it.de/projekte/edgeguard-native/internal/services/license"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/services/setup"
|
|
"git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts"
|
|
)
|
|
|
|
// version is the scheduler's version string. It is logged at startup and
// passed to every scheduled backup run.
// NOTE(review): declared as a var rather than a const — presumably so the
// build can override it (e.g. via -ldflags -X); confirm against the build
// scripts.
var version = "1.0.70"
|
|
|
|
// Scheduler tuning knobs. Every interval drives its own time.Ticker in
// main, so ticks are measured from process start, not from wall-clock time.
const (
	// renewTickInterval is how often expiring certificates are re-evaluated.
	// Six hours is plenty: the LE renewal window is 30 days, so missing a
	// single tick makes no difference, and an hourly pass would log too much.
	renewTickInterval time.Duration = 6 * time.Hour

	// licenseTickInterval is the period of the daily license re-verify
	// against license.netcell-it.com. The result lands in the licenses table.
	licenseTickInterval time.Duration = 24 * time.Hour

	// backupTickInterval is the period of the daily scheduled backup,
	// intended for ~03:00. The alignment is only approximate because the
	// time.Ticker starts counting at boot. Retention: 14 successful backups
	// (the default in backup.Service).
	backupTickInterval time.Duration = 24 * time.Hour

	// certDir matches the handlers.NewTLSCertsHandler default — HAProxy
	// reads certificates from this directory.
	certDir = "/etc/edgeguard/tls"
)
|
|
|
|
func main() {
|
|
slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})))
|
|
slog.Info("edgeguard-scheduler starting", "version", version)
|
|
|
|
ctx := context.Background()
|
|
|
|
pool, err := database.Open(ctx, database.ConnStringFromEnv())
|
|
if err != nil {
|
|
slog.Error("scheduler: DB open failed — sleeping forever", "error", err)
|
|
select {}
|
|
}
|
|
defer pool.Close()
|
|
|
|
tlsRepo := tlscerts.New(pool)
|
|
setupStore := setup.NewStore(setup.DefaultDir)
|
|
st, _ := setupStore.Load()
|
|
|
|
var renewer *certrenewer.Service
|
|
if st != nil && st.ACMEEmail != "" {
|
|
issuer := acme.New(st.ACMEEmail)
|
|
renewer = certrenewer.New(tlsRepo, issuer, certDir, 30*24*time.Hour)
|
|
slog.Info("scheduler: ACME renewer enabled",
|
|
"email", st.ACMEEmail, "tick", renewTickInterval, "threshold", "30d")
|
|
} else {
|
|
slog.Warn("scheduler: setup.acme_email empty — ACME renewal disabled until setup wizard ran")
|
|
}
|
|
|
|
licRepo := licsvc.New(pool)
|
|
licClient := license.NewClient()
|
|
licKeyStore := license.NewKeyStore()
|
|
nodeID := os.Getenv("EDGEGUARD_NODE_ID")
|
|
slog.Info("scheduler: license re-verify enabled", "tick", licenseTickInterval)
|
|
|
|
backupSvc := backup.New(pool)
|
|
slog.Info("scheduler: daily backup enabled", "tick", backupTickInterval,
|
|
"dir", backupSvc.BackupDir, "keep_n", backup.DefaultKeepN)
|
|
|
|
if renewer != nil {
|
|
runRenewer(ctx, renewer)
|
|
}
|
|
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
|
|
|
|
renewTick := time.NewTicker(renewTickInterval)
|
|
defer renewTick.Stop()
|
|
licTick := time.NewTicker(licenseTickInterval)
|
|
defer licTick.Stop()
|
|
backupTick := time.NewTicker(backupTickInterval)
|
|
defer backupTick.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-renewTick.C:
|
|
if renewer != nil {
|
|
runRenewer(ctx, renewer)
|
|
}
|
|
case <-licTick.C:
|
|
runLicenseVerify(ctx, licClient, licKeyStore, licRepo, nodeID)
|
|
case <-backupTick.C:
|
|
runBackup(ctx, backupSvc, version)
|
|
}
|
|
}
|
|
}
|
|
|
|
// runBackup führt einen scheduled Backup aus + prunet alte. Failures
|
|
// loggen wir nur — der Tick läuft morgen wieder, kein Notfall.
|
|
func runBackup(ctx context.Context, svc *backup.Service, version string) {
|
|
res, err := svc.Run(ctx, backup.KindScheduled, version)
|
|
if err != nil {
|
|
slog.Warn("scheduler: backup failed", "error", err, "file", res.File)
|
|
return
|
|
}
|
|
slog.Info("scheduler: backup done",
|
|
"file", res.File, "size", res.SizeBytes,
|
|
"db_bytes", res.DBDumpBytes, "files_bytes", res.FilesBytes,
|
|
"sha256", res.SHA256)
|
|
if err := svc.Prune(ctx, backup.DefaultKeepN); err != nil {
|
|
slog.Warn("scheduler: backup prune failed", "error", err)
|
|
}
|
|
}
|
|
|
|
// runLicenseVerify performs a single re-verify pass. Empty key = no-op
|
|
// (box stays in trial), so this is safe to call on every tick.
|
|
func runLicenseVerify(ctx context.Context, c *license.Client, ks *license.KeyStore,
|
|
repo *licsvc.Repo, nodeID string) {
|
|
key := ks.Get()
|
|
if key == "" {
|
|
slog.Debug("scheduler: license verify skipped — no key")
|
|
return
|
|
}
|
|
res, err := c.Verify(key)
|
|
if err != nil {
|
|
_ = repo.MarkError(ctx, key, err.Error())
|
|
slog.Warn("scheduler: license verify failed", "error", err)
|
|
return
|
|
}
|
|
payload, _ := json.Marshal(res)
|
|
status := "active"
|
|
if !res.Valid {
|
|
status = "expired"
|
|
if res.Status == "revoked" {
|
|
status = "invalid"
|
|
}
|
|
}
|
|
if err := repo.Upsert(ctx, key, status, res.ExpiresAt, nodeID, 0, payload, ""); err != nil {
|
|
slog.Warn("scheduler: license db upsert failed", "error", err)
|
|
return
|
|
}
|
|
slog.Info("scheduler: license verified",
|
|
"status", status, "valid", res.Valid, "expires_at", res.ExpiresAt)
|
|
}
|
|
|
|
func runRenewer(ctx context.Context, r *certrenewer.Service) {
|
|
res, err := r.Run(ctx)
|
|
if err != nil {
|
|
slog.Error("scheduler: renewer run failed", "error", err)
|
|
return
|
|
}
|
|
slog.Info("scheduler: renewer pass complete",
|
|
"checked", res.Checked, "renewed", res.Renewed,
|
|
"failed", res.Failed, "skipped", res.Skipped)
|
|
}
|