feat(cluster): Config-Hash-Compute für Drift-Detection
Setzt die Foundation aus 1.0.70 fort — bisher war ha_nodes.config_hash
noch NULL und das UI konnte keinen Drift erkennen.
internal/cluster/confighash.go:
- ComputeConfigHash() berechnet SHA-256 (truncated auf 16 hex chars)
über alle replizierbaren Tabellen. Pattern 1:1 aus mail-gateway/
internal/handlers/cluster_status.go (driftHashSpec).
- Pro Tabelle: md5((to_jsonb(t) - id - updated_at - created_at -
excludes)::text) per row, dann string_agg ORDER BY rh.
- Singleton-Tabellen (dns_settings, ntp_settings, mail_config-Stil)
hashen direkt ohne agg.
- 23 Tabellen: domains, backends, backend_servers, routing_rules,
network_interfaces, ip_addresses, tls_certs (mit ExtraExclude
last_renewed_at + last_error damit cert-renewal keinen drift
erzeugt), firewall_zones+address_objects+address_groups+services+
service_groups+rules+nat_rules, wireguard_interfaces+peers,
forward_proxy_acls, dns_zones+records+settings, ntp_pools+settings,
static_routes.
- RefreshLocalHash() schreibt den Hash in die eigene ha_nodes-Row.
Scheduler:
- 5-min-Tick ruft RefreshLocalHash. Pro-Mutation-Refresh wäre zu
teuer (jede UI-Action triggert sonst 23 jsonb-Queries).
- Initial-Refresh beim Scheduler-Boot damit /cluster/status nicht
5 min auf den ersten Wert wartet.
handlers/cluster.go:
- Status() ruft RefreshLocalHash mit 2s-Timeout on-demand. Damit
sieht das UI auch zwischen den Scheduler-Ticks immer frische
Werte; bei Timeout fallback auf den DB-Wert (eventuell stale).
Verifiziert auf 1.0.71: ha_nodes-Row hat config_hash=728834dce5ca4e48,
scheduler-log "config-hash refresh enabled tick=5m0s".
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
137
internal/cluster/confighash.go
Normal file
137
internal/cluster/confighash.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package cluster
|
||||
|
||||
// Config-Hash berechnet einen deterministischen SHA-256-Hash über alle
|
||||
// replizierbaren Tabellen — die Grundlage des "Config-Drift"-Banners
|
||||
// im Cluster-UI. Pattern 1:1 aus mail-gateway/internal/handlers/
|
||||
// cluster_status.go (driftHashSpec).
|
||||
//
|
||||
// Hash-Bildung pro Tabelle:
|
||||
// - SELECT md5((to_jsonb(t) - 'id' - 'updated_at' - 'created_at' -
|
||||
// <extra-excludes>)::text) AS rh
|
||||
// FROM <table> t
|
||||
// - SELECT md5(string_agg(rh, '|' ORDER BY rh)) → table-hash
|
||||
// - Singleton-Tabellen (mail_config-Stil) hashen die row direkt
|
||||
// ohne string_agg.
|
||||
//
|
||||
// Outer-Hash: SHA-256 über concat(table-name, 0x00, table-hash, 0x00)
|
||||
// für jede spec-Tabelle in stabiler Reihenfolge, dann Hex + truncate
|
||||
// auf 16 chars. 16 Hex-Zeichen = 64 Bit reichen für die Drift-Detection;
// längere Strings bringen im UI keinen Mehrwert.
|
||||
//
|
||||
// Replizierbar = wird per goose-Migration auf beiden Cluster-Nodes
|
||||
// gleich angelegt UND vom Operator über die UI mutiert. NICHT drin:
|
||||
// audit_log (transient log), ha_nodes (cluster-state selbst), licenses
|
||||
// (per-node), backups (per-node).
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
)
|
||||
|
||||
// hashTable describes one table that contributes to the config hash.
type hashTable struct {
	// Name is the SQL table name; hashSQL splices it verbatim into the
	// query text.
	Name string

	// Singleton marks single-row tables (e.g. dns_settings, ntp_settings)
	// that are read via "WHERE id = 1" instead of being aggregated.
	Singleton bool

	// ExtraExclude lists additional columns removed from to_jsonb(t)
	// before hashing, on top of the default exclusions.
	ExtraExclude []string

	// SkipUpdatedAt, when true, skips the default exclusion of updated_at,
	// i.e. the column stays part of the hashed row. Set it when updated_at
	// is semantically relevant for the table.
	SkipUpdatedAt bool
}
|
||||
|
||||
// hashSpec ist die Reihenfolge-stabile Liste. NEUE Tabellen hier
|
||||
// ergänzen wenn sie repliziert werden sollen — sonst flackert das
|
||||
// Drift-Banner.
|
||||
var hashSpec = []hashTable{
|
||||
{Name: "domains"},
|
||||
{Name: "backends"},
|
||||
{Name: "backend_servers"},
|
||||
{Name: "routing_rules"},
|
||||
{Name: "network_interfaces"},
|
||||
{Name: "ip_addresses"},
|
||||
{Name: "tls_certs", ExtraExclude: []string{"last_renewed_at", "last_error"}},
|
||||
|
||||
{Name: "firewall_zones"},
|
||||
{Name: "firewall_address_objects"},
|
||||
{Name: "firewall_address_groups"},
|
||||
{Name: "firewall_services"},
|
||||
{Name: "firewall_service_groups"},
|
||||
{Name: "firewall_rules"},
|
||||
{Name: "firewall_nat_rules"},
|
||||
|
||||
{Name: "wireguard_interfaces"},
|
||||
{Name: "wireguard_peers"},
|
||||
{Name: "forward_proxy_acls"},
|
||||
|
||||
{Name: "dns_zones"},
|
||||
{Name: "dns_records"},
|
||||
{Name: "dns_settings", Singleton: true},
|
||||
|
||||
{Name: "ntp_pools"},
|
||||
{Name: "ntp_settings", Singleton: true},
|
||||
|
||||
{Name: "static_routes"},
|
||||
}
|
||||
|
||||
// hashSQL rendert die SHA-Input-SQL für eine Tabelle.
|
||||
func hashSQL(t hashTable) string {
|
||||
excl := []string{"'id'"}
|
||||
if !t.SkipUpdatedAt {
|
||||
excl = append(excl, "'updated_at'")
|
||||
}
|
||||
excl = append(excl, "'created_at'")
|
||||
for _, c := range t.ExtraExclude {
|
||||
excl = append(excl, "'"+c+"'")
|
||||
}
|
||||
subtract := strings.Join(excl, " - ")
|
||||
if t.Singleton {
|
||||
return `SELECT COALESCE(md5((to_jsonb(t) - ` + subtract + `)::text), '')
|
||||
FROM ` + t.Name + ` t WHERE id = 1`
|
||||
}
|
||||
return `SELECT COALESCE(md5(string_agg(rh, '|' ORDER BY rh)), '')
|
||||
FROM (
|
||||
SELECT md5((to_jsonb(t) - ` + subtract + `)::text) AS rh
|
||||
FROM ` + t.Name + ` t
|
||||
) sub`
|
||||
}
|
||||
|
||||
// ComputeConfigHash gibt den 16-hex-char-Hash über alle Spec-Tabellen
|
||||
// zurück. Fehlende Tabellen (transienter schema-flux) werden als
|
||||
// leerer Per-Table-Hash behandelt — kein Abbruch.
|
||||
func ComputeConfigHash(ctx context.Context, pool *pgxpool.Pool) (string, error) {
|
||||
if pool == nil {
|
||||
return "", fmt.Errorf("nil pool")
|
||||
}
|
||||
h := sha256.New()
|
||||
for _, t := range hashSpec {
|
||||
var s string
|
||||
if err := pool.QueryRow(ctx, hashSQL(t)).Scan(&s); err != nil {
|
||||
// Migration fehlt o.ä. → leeren string nehmen, weiter.
|
||||
s = ""
|
||||
}
|
||||
h.Write([]byte(t.Name))
|
||||
h.Write([]byte{0})
|
||||
h.Write([]byte(s))
|
||||
h.Write([]byte{0})
|
||||
}
|
||||
return hex.EncodeToString(h.Sum(nil))[:16], nil
|
||||
}
|
||||
|
||||
// RefreshLocalHash berechnet den Hash und schreibt ihn in die eigene
|
||||
// ha_nodes-Row. Idempotent. Verwendet vom Scheduler + Cluster-Status-
|
||||
// Handler (on-demand).
|
||||
func RefreshLocalHash(ctx context.Context, pool *pgxpool.Pool, localID string) (string, error) {
|
||||
if pool == nil || localID == "" {
|
||||
return "", nil
|
||||
}
|
||||
hash, err := ComputeConfigHash(ctx, pool)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
_, err = pool.Exec(ctx, `
|
||||
UPDATE ha_nodes SET config_hash = $1, updated_at = NOW() WHERE id = $2`,
|
||||
hash, localID)
|
||||
return hash, err
|
||||
}
|
||||
Reference in New Issue
Block a user