feat(cluster): (c) Phase-3 MVP — stable node-id + self-register + Cluster-Page
Minimal-Slice für Phase-3-Cluster: * internal/cluster/node_id.go — stable UUID 'n-<16hex>' in /var/lib/edgeguard/node-id, idempotent über reboots. * internal/cluster/store.go — ha_nodes-Repo (List/Get/UpsertSelf) via pgxpool. EnsureSelfRegistered upsertet die lokale Row beim Boot mit FQDN aus setup.json. * internal/handlers/cluster.go — GET /api/v1/cluster/nodes liefert alle ha_nodes plus local_id (für UI-Highlighting). * main.go: nach DB-Pool-Open wird EnsureSelfRegistered (nur wenn setup.completed) ausgeführt, ClusterHandler registriert. * management-ui/src/pages/Cluster/index.tsx — Tabelle mit Node-ID, FQDN, Rolle, Beitrittszeit; eigene Node mit "diese Node"-Tag markiert. Sidebar-Eintrag + i18n de/en. Bewusst NICHT in dieser Runde: cluster-init/cluster-join CLIs, KeyDB Active-Active config-gen, PG streaming replication, mTLS zwischen Peers, License-Leader-Election. Diese kommen mit dem ersten echten Multi-Node-Test (Phase 3.1) — sonst Code ohne Smoke-Möglichkeit. End-to-end-Smoke: setup → restart → ha_nodes hat 1 Row mit fqdn=eg.example.com, /cluster/nodes liefert sie korrekt mit local_id-Markierung. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
81
internal/cluster/node_id.go
Normal file
81
internal/cluster/node_id.go
Normal file
@@ -0,0 +1,81 @@
|
||||
// Package cluster owns the local cluster identity (node ID + role)
|
||||
// and self-registration into ha_nodes on boot.
|
||||
//
|
||||
// v1 is single-node only — we register the local node so the UI's
|
||||
// Cluster page has something to show and so multi-node Phase 3.1
|
||||
// can build on a stable identity. Real cluster-join + KeyDB AA +
|
||||
// PG streaming replication come later.
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DefaultNodeIDPath is the file in which the node identifier is kept
// across restarts. It sits in the EdgeGuard data dir so that
// /etc/machine-id collisions (cloned VMs) don't matter — only this
// file determines identity.
const DefaultNodeIDPath = "/var/lib/edgeguard/node-id"

// nodeIDPrefix is the fixed leading part of every node identifier.
const nodeIDPrefix = "n-"
|
||||
|
||||
// EnsureNodeID returns the stable cluster node identifier, generating
|
||||
// and persisting one on first call. The format is `n-<16 hex chars>`.
|
||||
//
|
||||
// On read errors (missing dir, permission denied) the function returns
|
||||
// the freshly-minted in-memory ID and the persistence error so the
|
||||
// caller can decide whether to abort or proceed with an ephemeral ID
|
||||
// (development boxes typically don't have /var/lib/edgeguard/ writable).
|
||||
func EnsureNodeID(path string) (string, error) {
|
||||
if path == "" {
|
||||
path = DefaultNodeIDPath
|
||||
}
|
||||
if b, err := os.ReadFile(path); err == nil {
|
||||
s := strings.TrimSpace(string(b))
|
||||
if validNodeID(s) {
|
||||
return s, nil
|
||||
}
|
||||
}
|
||||
|
||||
id, err := mintNodeID()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
|
||||
return id, fmt.Errorf("ensure node-id dir: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(id+"\n"), 0o640); err != nil {
|
||||
return id, fmt.Errorf("write node-id: %w", err)
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
|
||||
func mintNodeID() (string, error) {
|
||||
buf := make([]byte, 8)
|
||||
if _, err := rand.Read(buf); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return nodeIDPrefix + hex.EncodeToString(buf), nil
|
||||
}
|
||||
|
||||
func validNodeID(s string) bool {
|
||||
if !strings.HasPrefix(s, nodeIDPrefix) {
|
||||
return false
|
||||
}
|
||||
rest := s[len(nodeIDPrefix):]
|
||||
if len(rest) != 16 {
|
||||
return false
|
||||
}
|
||||
for _, r := range rest {
|
||||
ok := (r >= '0' && r <= '9') || (r >= 'a' && r <= 'f')
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
48
internal/cluster/node_id_test.go
Normal file
48
internal/cluster/node_id_test.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnsureNodeID_GeneratesAndPersists(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "node-id")
|
||||
|
||||
id1, err := EnsureNodeID(path)
|
||||
if err != nil {
|
||||
t.Fatalf("first call: %v", err)
|
||||
}
|
||||
if !validNodeID(id1) {
|
||||
t.Fatalf("invalid node id minted: %q", id1)
|
||||
}
|
||||
|
||||
id2, err := EnsureNodeID(path)
|
||||
if err != nil {
|
||||
t.Fatalf("second call: %v", err)
|
||||
}
|
||||
if id1 != id2 {
|
||||
t.Errorf("node id should be stable: %q vs %q", id1, id2)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureNodeID_RejectsCorruptFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "node-id")
|
||||
if err := os.WriteFile(path, []byte("not a real id\n"), 0o640); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
id, err := EnsureNodeID(path)
|
||||
if err != nil {
|
||||
t.Fatalf("EnsureNodeID: %v", err)
|
||||
}
|
||||
if !validNodeID(id) {
|
||||
t.Errorf("expected fresh id when file was junk, got %q", id)
|
||||
}
|
||||
// Re-read should now match the regenerated id.
|
||||
id2, _ := EnsureNodeID(path)
|
||||
if id != id2 {
|
||||
t.Errorf("regenerated id not persisted: %q vs %q", id, id2)
|
||||
}
|
||||
}
|
||||
138
internal/cluster/store.go
Normal file
138
internal/cluster/store.go
Normal file
@@ -0,0 +1,138 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"git.netcell-it.de/projekte/edgeguard-native/internal/models"
|
||||
)
|
||||
|
||||
// ErrNotFound is returned by Store.Get when no ha_nodes row matches
// the requested id.
var ErrNotFound = errors.New("ha_node not found")
|
||||
|
||||
// Store wraps the ha_nodes table — used by the cluster handler and by
|
||||
// EnsureSelfRegistered. v1 only ever has one row (the local node);
|
||||
// the table is in place so Phase 3.1 multi-node lands without
|
||||
// schema churn.
|
||||
type Store struct {
|
||||
Pool *pgxpool.Pool
|
||||
}
|
||||
|
||||
// NewStore returns a Store whose queries run on pool.
func NewStore(pool *pgxpool.Pool) *Store { return &Store{Pool: pool} }
|
||||
|
||||
// baseSelect is the SELECT prefix shared by List and Get. The column
// order must stay in step with the Scan targets in scanNode.
const baseSelect = `
SELECT id, name, fqdn, api_url, public_ip, internal_ip, role,
       last_seen, joined_at, created_at, updated_at
FROM ha_nodes
`
|
||||
|
||||
func (s *Store) List(ctx context.Context) ([]models.HANode, error) {
|
||||
rows, err := s.Pool.Query(ctx, baseSelect+" ORDER BY joined_at ASC")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
out := make([]models.HANode, 0, 4)
|
||||
for rows.Next() {
|
||||
n, err := scanNode(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, *n)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func (s *Store) Get(ctx context.Context, id string) (*models.HANode, error) {
|
||||
row := s.Pool.QueryRow(ctx, baseSelect+" WHERE id = $1", id)
|
||||
n, err := scanNode(row)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, ErrNotFound
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// UpsertSelf writes the local node's row using the database-side
|
||||
// ON CONFLICT DO UPDATE so the call is safe to make on every boot.
|
||||
// last_seen is also bumped — handy for the heartbeat-by-restart
|
||||
// pattern even before periodic heartbeats land.
|
||||
func (s *Store) UpsertSelf(ctx context.Context, n models.HANode) (*models.HANode, error) {
|
||||
now := time.Now().UTC()
|
||||
row := s.Pool.QueryRow(ctx, `
|
||||
INSERT INTO ha_nodes (id, name, fqdn, api_url, public_ip, internal_ip, role, last_seen, joined_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
fqdn = EXCLUDED.fqdn,
|
||||
api_url = EXCLUDED.api_url,
|
||||
public_ip = COALESCE(EXCLUDED.public_ip, ha_nodes.public_ip),
|
||||
internal_ip = COALESCE(EXCLUDED.internal_ip, ha_nodes.internal_ip),
|
||||
role = EXCLUDED.role,
|
||||
last_seen = EXCLUDED.last_seen,
|
||||
updated_at = NOW()
|
||||
RETURNING id, name, fqdn, api_url, public_ip, internal_ip, role,
|
||||
last_seen, joined_at, created_at, updated_at`,
|
||||
n.ID, n.Name, n.FQDN, n.APIURL,
|
||||
n.PublicIP, n.InternalIP, n.Role,
|
||||
now, now,
|
||||
)
|
||||
return scanNode(row)
|
||||
}
|
||||
|
||||
// EnsureSelfRegistered mints the node-id if needed, builds the row
|
||||
// from setup.json + os.Hostname, and upserts it. Called on edgeguard-
|
||||
// api boot AFTER the DB pool is reachable.
|
||||
//
|
||||
// fqdn = setup-store fqdn (preferred) or hostname.
|
||||
// apiURL = "https://<fqdn>" (HAProxy-fronted; v1 doesn't yet know if
|
||||
// the operator runs on a non-default port).
|
||||
func EnsureSelfRegistered(ctx context.Context, store *Store, fqdn string, role string) (*models.HANode, error) {
|
||||
id, err := EnsureNodeID("")
|
||||
if err != nil {
|
||||
// Even when persistence failed (read-only /var/lib in dev),
|
||||
// EnsureNodeID returns the in-memory id alongside the error
|
||||
// — so we can still register, but the id will rotate on
|
||||
// every boot. Surface as warning to the caller; here we
|
||||
// just keep going so the dev box doesn't stay un-registered.
|
||||
_ = err
|
||||
}
|
||||
if id == "" {
|
||||
return nil, fmt.Errorf("could not derive node id")
|
||||
}
|
||||
host, hostErr := os.Hostname()
|
||||
if hostErr != nil {
|
||||
host = "unknown"
|
||||
}
|
||||
if fqdn == "" {
|
||||
fqdn = host
|
||||
}
|
||||
n := models.HANode{
|
||||
ID: id,
|
||||
Name: host,
|
||||
FQDN: fqdn,
|
||||
APIURL: "https://" + fqdn,
|
||||
Role: role,
|
||||
}
|
||||
return store.UpsertSelf(ctx, n)
|
||||
}
|
||||
|
||||
func scanNode(row interface{ Scan(...any) error }) (*models.HANode, error) {
|
||||
var n models.HANode
|
||||
if err := row.Scan(
|
||||
&n.ID, &n.Name, &n.FQDN, &n.APIURL,
|
||||
&n.PublicIP, &n.InternalIP, &n.Role,
|
||||
&n.LastSeen, &n.JoinedAt,
|
||||
&n.CreatedAt, &n.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &n, nil
|
||||
}
|
||||
Reference in New Issue
Block a user