feat(cluster): (c) Phase-3 MVP — stable node-id + self-register + Cluster-Page

Minimal-Slice für Phase-3-Cluster:
* internal/cluster/node_id.go — stabile, zufällig erzeugte Node-ID
  'n-<16hex>' (8 Zufallsbytes, kein UUID-Format) in
  /var/lib/edgeguard/node-id, bleibt über Reboots hinweg gleich.
* internal/cluster/store.go — ha_nodes-Repo (List/Get/UpsertSelf)
  via pgxpool. EnsureSelfRegistered upsertet die lokale Row beim
  Boot mit FQDN aus setup.json.
* internal/handlers/cluster.go — GET /api/v1/cluster/nodes liefert
  alle ha_nodes plus local_id (für UI-Highlighting).
* main.go: nach DB-Pool-Open wird EnsureSelfRegistered (nur wenn
  setup.completed) ausgeführt, ClusterHandler registriert.
* management-ui/src/pages/Cluster/index.tsx — Tabelle mit Node-ID,
  FQDN, Rolle, Beitrittszeit; eigene Node mit "diese Node"-Tag
  markiert. Sidebar-Eintrag + i18n de/en.

Bewusst NICHT in dieser Runde: cluster-init/cluster-join CLIs, KeyDB
Active-Active config-gen, PG streaming replication, mTLS zwischen
Peers, License-Leader-Election. Diese kommen mit dem ersten echten
Multi-Node-Test (Phase 3.1) — sonst Code ohne Smoke-Möglichkeit.

End-to-end-Smoke: setup → restart → ha_nodes hat 1 Row mit
fqdn=eg.example.com, /cluster/nodes liefert sie korrekt mit
local_id-Markierung.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Debian
2026-05-09 11:52:54 +02:00
parent 6525cb1a41
commit cb5691cf3c
10 changed files with 421 additions and 2 deletions

View File

@@ -17,6 +17,7 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/jackc/pgx/v5/pgxpool" "github.com/jackc/pgx/v5/pgxpool"
"git.netcell-it.de/projekte/edgeguard-native/internal/cluster"
"git.netcell-it.de/projekte/edgeguard-native/internal/database" "git.netcell-it.de/projekte/edgeguard-native/internal/database"
"git.netcell-it.de/projekte/edgeguard-native/internal/handlers" "git.netcell-it.de/projekte/edgeguard-native/internal/handlers"
"git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response" "git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response"
@@ -93,7 +94,25 @@ func main() {
"error", err) "error", err)
} else { } else {
slog.Info("DB pool open, registering CRUD handlers") slog.Info("DB pool open, registering CRUD handlers")
nodeID := nodeIDOrHostname()
nodeID, nodeErr := cluster.EnsureNodeID("")
if nodeErr != nil {
slog.Warn("node-id not persisted, using ephemeral",
"id", nodeID, "error", nodeErr)
}
clusterStore := cluster.NewStore(pool)
// Self-register in ha_nodes — only if setup is complete
// (we want the operator-defined FQDN, not the OS hostname,
// to land in api_url). Failures are logged but non-fatal.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
st, _ := setupStore.Load()
if st != nil && st.Completed {
if _, err := cluster.EnsureSelfRegistered(ctx, clusterStore, st.FQDN, "primary"); err != nil {
slog.Warn("self-register in ha_nodes failed", "error", err)
}
}
cancel()
auditRepo := audit.New(pool) auditRepo := audit.New(pool)
domainsRepo := domains.New(pool) domainsRepo := domains.New(pool)
@@ -105,6 +124,7 @@ func main() {
handlers.NewDomainsHandler(domainsRepo, routingRepo, auditRepo, nodeID).Register(authed) handlers.NewDomainsHandler(domainsRepo, routingRepo, auditRepo, nodeID).Register(authed)
handlers.NewBackendsHandler(backendsRepo, auditRepo, nodeID).Register(authed) handlers.NewBackendsHandler(backendsRepo, auditRepo, nodeID).Register(authed)
handlers.NewRoutingRulesHandler(routingRepo, auditRepo, nodeID).Register(authed) handlers.NewRoutingRulesHandler(routingRepo, auditRepo, nodeID).Register(authed)
handlers.NewClusterHandler(clusterStore, nodeID).Register(authed)
} }
mountUI(r) mountUI(r)

View File

@@ -0,0 +1,81 @@
// Package cluster owns the local cluster identity (node ID + role)
// and self-registration into ha_nodes on boot.
//
// v1 is single-node only — we register the local node so the UI's
// Cluster page has something to show and so multi-node Phase 3.1
// can build on a stable identity. Real cluster-join + KeyDB AA +
// PG streaming replication come later.
package cluster
import (
"crypto/rand"
"encoding/hex"
"fmt"
"os"
"path/filepath"
"strings"
)
const (
// DefaultNodeIDPath is the file that persists the node identifier across
// restarts; EnsureNodeID falls back to it when called with an empty path.
// Lives in the EdgeGuard data dir so /etc/machine-id collisions
// (cloned VMs) don't matter — only this file determines identity.
DefaultNodeIDPath = "/var/lib/edgeguard/node-id"
// nodeIDPrefix is the literal prefix of every node ID; mintNodeID
// prepends it and validNodeID requires it.
nodeIDPrefix = "n-"
)
// EnsureNodeID returns the stable cluster node identifier, generating
// and persisting one on first call. The format is `n-<16 hex chars>`.
//
// path is the file holding the identifier; an empty path selects
// DefaultNodeIDPath. A file whose trimmed content is a valid ID is
// returned as-is. A missing file, or one whose content does not
// validate, is replaced by a freshly minted ID.
//
// When the ID cannot be read back or persisted (missing permissions,
// unwritable directory, ...) the function returns the freshly-minted
// in-memory ID together with the error, so the caller can decide whether
// to abort or proceed with an ephemeral ID (development boxes typically
// don't have /var/lib/edgeguard/ writable). Such an ephemeral ID is
// minted anew on every call. Only a failure of the random source yields
// an empty ID.
//
// A file that exists but cannot be read is never overwritten, and new
// content is written via a temp file + rename so an interrupted write
// cannot leave a truncated file that would change the node's identity
// on the next boot.
func EnsureNodeID(path string) (string, error) {
	if path == "" {
		path = DefaultNodeIDPath
	}

	raw, readErr := os.ReadFile(path)
	if readErr == nil {
		if s := strings.TrimSpace(string(raw)); validNodeID(s) {
			return s, nil
		}
	}

	id, err := mintNodeID()
	if err != nil {
		return "", err
	}

	// The file is there but unreadable: hand out an ephemeral ID rather
	// than clobbering what may be a perfectly valid persisted identity.
	if readErr != nil && !os.IsNotExist(readErr) {
		return id, fmt.Errorf("read node-id: %w", readErr)
	}

	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
		return id, fmt.Errorf("ensure node-id dir: %w", err)
	}
	if err := writeNodeIDFile(path, id); err != nil {
		return id, fmt.Errorf("write node-id: %w", err)
	}
	return id, nil
}

// writeNodeIDFile atomically replaces path with id plus a trailing
// newline: the content goes to a temp file in the same directory, is
// flushed to disk, and is then renamed over path. On any failure the
// temp file is removed and path is left untouched.
func writeNodeIDFile(path, id string) (err error) {
	tmp, err := os.CreateTemp(filepath.Dir(path), ".node-id-*")
	if err != nil {
		return err
	}
	defer func() {
		if err != nil {
			// Best-effort cleanup; the original error is what matters.
			_ = os.Remove(tmp.Name())
		}
	}()

	if _, err = tmp.WriteString(id + "\n"); err != nil {
		_ = tmp.Close()
		return err
	}
	// CreateTemp creates the file 0600; restore the intended mode.
	if err = tmp.Chmod(0o640); err != nil {
		_ = tmp.Close()
		return err
	}
	if err = tmp.Sync(); err != nil {
		_ = tmp.Close()
		return err
	}
	if err = tmp.Close(); err != nil {
		return err
	}
	return os.Rename(tmp.Name(), path)
}
// mintNodeID draws 8 bytes from crypto/rand and returns them hex-encoded
// behind nodeIDPrefix. An error from the random source is passed through
// unchanged together with an empty ID.
func mintNodeID() (string, error) {
	var raw [8]byte
	_, err := rand.Read(raw[:])
	if err != nil {
		return "", err
	}
	return nodeIDPrefix + hex.EncodeToString(raw[:]), nil
}
// validNodeID reports whether s is nodeIDPrefix followed by exactly
// 16 lowercase hexadecimal digits.
func validNodeID(s string) bool {
	const hexDigits = 16
	if !strings.HasPrefix(s, nodeIDPrefix) || len(s) != len(nodeIDPrefix)+hexDigits {
		return false
	}
	notLowerHex := func(r rune) bool {
		isDigit := '0' <= r && r <= '9'
		isLetter := 'a' <= r && r <= 'f'
		return !isDigit && !isLetter
	}
	return strings.IndexFunc(s[len(nodeIDPrefix):], notLowerHex) < 0
}

View File

@@ -0,0 +1,48 @@
package cluster
import (
"os"
"path/filepath"
"testing"
)
// TestEnsureNodeID_GeneratesAndPersists checks that the first call mints
// a well-formed ID and that a second call on the same path returns the
// very same ID.
func TestEnsureNodeID_GeneratesAndPersists(t *testing.T) {
	idFile := filepath.Join(t.TempDir(), "node-id")

	first, err := EnsureNodeID(idFile)
	if err != nil {
		t.Fatalf("first call: %v", err)
	}
	if !validNodeID(first) {
		t.Fatalf("invalid node id minted: %q", first)
	}

	second, err := EnsureNodeID(idFile)
	if err != nil {
		t.Fatalf("second call: %v", err)
	}
	if second != first {
		t.Errorf("node id should be stable: %q vs %q", first, second)
	}
}
// TestEnsureNodeID_RejectsCorruptFile checks that a file with junk
// content is replaced by a freshly minted, valid ID and that the
// replacement is what a subsequent call reads back.
func TestEnsureNodeID_RejectsCorruptFile(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "node-id")
	if err := os.WriteFile(path, []byte("not a real id\n"), 0o640); err != nil {
		t.Fatal(err)
	}
	id, err := EnsureNodeID(path)
	if err != nil {
		t.Fatalf("EnsureNodeID: %v", err)
	}
	if !validNodeID(id) {
		t.Errorf("expected fresh id when file was junk, got %q", id)
	}
	// Re-read should now match the regenerated id. The error is checked
	// so a persistence failure is reported as such instead of showing up
	// as a confusing id mismatch.
	id2, err := EnsureNodeID(path)
	if err != nil {
		t.Fatalf("EnsureNodeID (re-read): %v", err)
	}
	if id != id2 {
		t.Errorf("regenerated id not persisted: %q vs %q", id, id2)
	}
}

138
internal/cluster/store.go Normal file
View File

@@ -0,0 +1,138 @@
package cluster
import (
"context"
"errors"
"fmt"
"os"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"git.netcell-it.de/projekte/edgeguard-native/internal/models"
)
// ErrNotFound is returned by Store.Get when no ha_nodes row matches the
// requested id.
var ErrNotFound = errors.New("ha_node not found")
// Store wraps the ha_nodes table — used by the cluster handler and by
// EnsureSelfRegistered. v1 only ever has one row (the local node);
// the table is in place so Phase 3.1 multi-node lands without
// schema churn.
type Store struct {
// Pool is the pgx connection pool every query of this Store runs on.
Pool *pgxpool.Pool
}
// NewStore returns a Store that runs its queries on pool.
func NewStore(pool *pgxpool.Pool) *Store {
	s := new(Store)
	s.Pool = pool
	return s
}
// baseSelect is the SELECT prefix shared by List and Get. The column
// order matches the Scan call in scanNode.
const baseSelect = `
SELECT id, name, fqdn, api_url, public_ip, internal_ip, role,
last_seen, joined_at, created_at, updated_at
FROM ha_nodes
`
// List returns all ha_nodes rows ordered by joined_at ascending. The
// returned slice is never nil when the query itself succeeds; a scan
// failure aborts the listing and returns that error.
func (s *Store) List(ctx context.Context) ([]models.HANode, error) {
	const query = baseSelect + " ORDER BY joined_at ASC"

	rows, err := s.Pool.Query(ctx, query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	nodes := make([]models.HANode, 0, 4)
	for rows.Next() {
		node, scanErr := scanNode(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		nodes = append(nodes, *node)
	}
	// Iteration errors are reported alongside whatever was collected.
	return nodes, rows.Err()
}
// Get loads the ha_nodes row with the given id. It returns ErrNotFound
// when the query yields no row; any other scan error is passed through.
func (s *Store) Get(ctx context.Context, id string) (*models.HANode, error) {
	node, err := scanNode(s.Pool.QueryRow(ctx, baseSelect+" WHERE id = $1", id))
	switch {
	case err == nil:
		return node, nil
	case errors.Is(err, pgx.ErrNoRows):
		return nil, ErrNotFound
	default:
		return nil, err
	}
}
// UpsertSelf inserts the row for n or, when a row with the same id
// already exists, updates it in place (ON CONFLICT DO UPDATE), so the
// call is safe to repeat on every boot. The current UTC time is sent as
// both last_seen and joined_at; on conflict only last_seen is taken
// over, and public_ip / internal_ip keep their stored value when the
// new value is NULL. The row as stored after the statement is returned.
func (s *Store) UpsertSelf(ctx context.Context, n models.HANode) (*models.HANode, error) {
	const upsertSQL = `
INSERT INTO ha_nodes (id, name, fqdn, api_url, public_ip, internal_ip, role, last_seen, joined_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
ON CONFLICT (id) DO UPDATE SET
name = EXCLUDED.name,
fqdn = EXCLUDED.fqdn,
api_url = EXCLUDED.api_url,
public_ip = COALESCE(EXCLUDED.public_ip, ha_nodes.public_ip),
internal_ip = COALESCE(EXCLUDED.internal_ip, ha_nodes.internal_ip),
role = EXCLUDED.role,
last_seen = EXCLUDED.last_seen,
updated_at = NOW()
RETURNING id, name, fqdn, api_url, public_ip, internal_ip, role,
last_seen, joined_at, created_at, updated_at`

	stamp := time.Now().UTC()
	args := []any{
		n.ID, n.Name, n.FQDN, n.APIURL,
		n.PublicIP, n.InternalIP, n.Role,
		stamp, stamp,
	}
	return scanNode(s.Pool.QueryRow(ctx, upsertSQL, args...))
}
// EnsureSelfRegistered obtains the node id via EnsureNodeID(""), builds
// the local node's row from the given fqdn/role plus os.Hostname, and
// upserts it through store. Called on edgeguard-api boot AFTER the DB
// pool is reachable.
//
// fqdn   = the value passed in (setup-store fqdn); falls back to the OS
// hostname when empty.
// apiURL = "https://<fqdn>" (HAProxy-fronted; v1 doesn't yet know if
// the operator runs on a non-default port).
// name   = OS hostname, or "unknown" when it cannot be determined.
//
// A node-id persistence failure is deliberately NOT returned: as long
// as EnsureNodeID still hands back an in-memory id (read-only /var/lib
// on a dev box) the node is registered anyway. In that situation the id
// is minted anew on every EnsureNodeID call, so it rotates on each boot
// and may differ from an id obtained by a separate EnsureNodeID call in
// the same process. Only when no id could be derived at all does the
// function fail, wrapping the underlying cause.
func EnsureSelfRegistered(ctx context.Context, store *Store, fqdn string, role string) (*models.HANode, error) {
	id, err := EnsureNodeID("")
	if id == "" {
		if err != nil {
			// Keep the root cause (e.g. a failing random source) visible.
			return nil, fmt.Errorf("could not derive node id: %w", err)
		}
		return nil, errors.New("could not derive node id")
	}

	host, hostErr := os.Hostname()
	if hostErr != nil {
		host = "unknown"
	}
	if fqdn == "" {
		fqdn = host
	}

	n := models.HANode{
		ID:     id,
		Name:   host,
		FQDN:   fqdn,
		APIURL: "https://" + fqdn,
		Role:   role,
	}
	return store.UpsertSelf(ctx, n)
}
// scanNode reads one ha_nodes row, in baseSelect column order, into a
// fresh models.HANode. It accepts anything with a Scan method, which is
// how both List (rows) and Get/UpsertSelf (single row) use it. A Scan
// error is returned as-is with a nil node.
func scanNode(row interface{ Scan(...any) error }) (*models.HANode, error) {
	node := new(models.HANode)
	err := row.Scan(
		&node.ID, &node.Name, &node.FQDN, &node.APIURL,
		&node.PublicIP, &node.InternalIP, &node.Role,
		&node.LastSeen, &node.JoinedAt,
		&node.CreatedAt, &node.UpdatedAt,
	)
	if err != nil {
		return nil, err
	}
	return node, nil
}

View File

@@ -0,0 +1,37 @@
package handlers
import (
"github.com/gin-gonic/gin"
"git.netcell-it.de/projekte/edgeguard-native/internal/cluster"
"git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response"
)
// ClusterHandler exposes cluster-state endpoints. v1 is read-only:
// the UI shows the list of registered nodes but cluster-join + write
// operations land in Phase 3.1.
type ClusterHandler struct {
// Store is the ha_nodes repository that ListNodes queries.
Store *cluster.Store
// LocalID is returned as "local_id" in the ListNodes response.
LocalID string
}
// NewClusterHandler builds a ClusterHandler that lists nodes from store
// and reports localID as the local node's id.
func NewClusterHandler(store *cluster.Store, localID string) *ClusterHandler {
	h := ClusterHandler{LocalID: localID, Store: store}
	return &h
}
// Register mounts the handler's routes under rg:
// GET <rg>/cluster/nodes -> ListNodes.
func (h *ClusterHandler) Register(rg *gin.RouterGroup) {
	rg.Group("/cluster").GET("/nodes", h.ListNodes)
}
// ListNodes responds with every node known to the Store under "nodes"
// plus the handler's LocalID under "local_id". A Store failure is
// reported through response.Internal.
func (h *ClusterHandler) ListNodes(c *gin.Context) {
	list, err := h.Store.List(c.Request.Context())
	if err != nil {
		response.Internal(c, err)
		return
	}

	body := gin.H{}
	body["local_id"] = h.LocalID
	body["nodes"] = list
	response.OK(c, body)
}

View File

@@ -16,6 +16,7 @@ const DashboardPage = lazy(() => import('./pages/Dashboard'))
const DomainsPage = lazy(() => import('./pages/Domains')) const DomainsPage = lazy(() => import('./pages/Domains'))
const BackendsPage = lazy(() => import('./pages/Backends')) const BackendsPage = lazy(() => import('./pages/Backends'))
const RoutingRulesPage = lazy(() => import('./pages/RoutingRules')) const RoutingRulesPage = lazy(() => import('./pages/RoutingRules'))
const ClusterPage = lazy(() => import('./pages/Cluster'))
const SettingsPage = lazy(() => import('./pages/Settings')) const SettingsPage = lazy(() => import('./pages/Settings'))
const queryClient = new QueryClient({ const queryClient = new QueryClient({
@@ -79,6 +80,7 @@ export default function App() {
<Route path="/domains" element={<DomainsPage />} /> <Route path="/domains" element={<DomainsPage />} />
<Route path="/backends" element={<BackendsPage />} /> <Route path="/backends" element={<BackendsPage />} />
<Route path="/routing-rules" element={<RoutingRulesPage />} /> <Route path="/routing-rules" element={<RoutingRulesPage />} />
<Route path="/cluster" element={<ClusterPage />} />
<Route path="/settings" element={<SettingsPage />} /> <Route path="/settings" element={<SettingsPage />} />
</Route> </Route>

View File

@@ -1,4 +1,4 @@
import { BranchesOutlined, DashboardOutlined, DatabaseOutlined, GlobalOutlined, SettingOutlined } from '@ant-design/icons' import { ApartmentOutlined, BranchesOutlined, DashboardOutlined, DatabaseOutlined, GlobalOutlined, SettingOutlined } from '@ant-design/icons'
import { Menu, Typography } from 'antd' import { Menu, Typography } from 'antd'
import { useNavigate, useLocation } from 'react-router-dom' import { useNavigate, useLocation } from 'react-router-dom'
import { useTranslation } from 'react-i18next' import { useTranslation } from 'react-i18next'
@@ -13,6 +13,7 @@ export default function Sidebar() {
{ key: '/domains', icon: <GlobalOutlined />, label: t('nav.domains') }, { key: '/domains', icon: <GlobalOutlined />, label: t('nav.domains') },
{ key: '/backends', icon: <DatabaseOutlined />, label: t('nav.backends') }, { key: '/backends', icon: <DatabaseOutlined />, label: t('nav.backends') },
{ key: '/routing-rules', icon: <BranchesOutlined />, label: t('nav.routing') }, { key: '/routing-rules', icon: <BranchesOutlined />, label: t('nav.routing') },
{ key: '/cluster', icon: <ApartmentOutlined />, label: t('nav.cluster') },
{ key: '/settings', icon: <SettingOutlined />, label: t('nav.settings') }, { key: '/settings', icon: <SettingOutlined />, label: t('nav.settings') },
] ]

View File

@@ -83,6 +83,15 @@
"selectBackend": "Backend wählen", "selectBackend": "Backend wählen",
"deleteConfirm": "Diese Routing-Regel wirklich löschen?" "deleteConfirm": "Diese Routing-Regel wirklich löschen?"
}, },
"cluster": {
"title": "Cluster",
"intro": "{{count}} Node(s) registriert. Multi-Node-Cluster (KeyDB Active-Active + PG Streaming Replication) folgt in einem späteren Release.",
"id": "Node-ID",
"fqdn": "FQDN",
"role": "Rolle",
"joinedAt": "Beigetreten",
"self": "diese Node"
},
"settings": { "settings": {
"title": "Einstellungen", "title": "Einstellungen",
"intro": "System-Information und Setup-Status. Bearbeitbare Werte folgen in einem späteren Release.", "intro": "System-Information und Setup-Status. Bearbeitbare Werte folgen in einem späteren Release.",

View File

@@ -83,6 +83,15 @@
"selectBackend": "Select backend", "selectBackend": "Select backend",
"deleteConfirm": "Really delete this routing rule?" "deleteConfirm": "Really delete this routing rule?"
}, },
"cluster": {
"title": "Cluster",
"intro": "{{count}} node(s) registered. Multi-node cluster (KeyDB Active-Active + PG streaming replication) coming in a later release.",
"id": "Node ID",
"fqdn": "FQDN",
"role": "Role",
"joinedAt": "Joined",
"self": "this node"
},
"settings": { "settings": {
"title": "Settings", "title": "Settings",
"intro": "System information and setup status. Editable values come in a later release.", "intro": "System information and setup status. Editable values come in a later release.",

View File

@@ -0,0 +1,74 @@
import { Card, Spin, Table, Tag, Typography } from 'antd'
import type { ColumnsType } from 'antd/es/table'
import { useQuery } from '@tanstack/react-query'
import { useTranslation } from 'react-i18next'
import apiClient, { isEnvelope } from '../../api/client'
// One entry of the `nodes` array in the /cluster/nodes payload.
// The table below renders id, fqdn, role and joined_at; the remaining
// fields are declared but not displayed by this page.
interface HANode {
id: string
name: string
fqdn: string
api_url: string
public_ip?: string | null
internal_ip?: string | null
role: string
last_seen?: string | null
// Passed to `new Date(...)` for display.
joined_at: string
created_at: string
updated_at: string
}
// Envelope `data` of GET /cluster/nodes as consumed by ClusterPage.
interface ClusterPayload {
// All registered nodes; used as the table's data source.
nodes: HANode[]
// Compared against each row's id to tag the local node.
local_id: string
}
/**
 * Cluster overview page.
 *
 * Fetches /cluster/nodes (refetched every 30 s), shows a spinner while the
 * query is loading, and then renders a non-paginated table of the nodes.
 * The row whose id equals the payload's `local_id` gets a blue tag.
 * A response that is not an envelope is treated as "no data".
 */
export default function ClusterPage() {
  const { t } = useTranslation()

  const query = useQuery({
    queryKey: ['cluster', 'nodes'],
    refetchInterval: 30_000,
    queryFn: async () => {
      const res = await apiClient.get('/cluster/nodes')
      return isEnvelope(res.data) ? (res.data.data as ClusterPayload) : null
    },
  })

  if (query.isLoading) {
    return <Spin />
  }

  const payload = query.data
  const localId = payload?.local_id

  // Renders the node id and, for the local node, the "self" tag.
  const renderId = (id: string) => (
    <span>
      <code>{id}</code>{' '}
      {id === localId && <Tag color="blue">{t('cluster.self')}</Tag>}
    </span>
  )

  const renderJoined = (s: string) => new Date(s).toLocaleString()

  const columns: ColumnsType<HANode> = [
    { title: t('cluster.id'), dataIndex: 'id', key: 'id', render: renderId },
    { title: t('cluster.fqdn'), dataIndex: 'fqdn', key: 'fqdn' },
    { title: t('cluster.role'), dataIndex: 'role', key: 'role' },
    {
      title: t('cluster.joinedAt'),
      dataIndex: 'joined_at',
      key: 'joined_at',
      render: renderJoined,
    },
  ]

  const nodeCount = payload?.nodes.length ?? 0

  return (
    <div>
      <Typography.Title level={3}>{t('cluster.title')}</Typography.Title>
      <Typography.Paragraph type="secondary">
        {t('cluster.intro', { count: nodeCount })}
      </Typography.Paragraph>
      <Card>
        <Table
          rowKey="id"
          columns={columns}
          dataSource={payload?.nodes ?? []}
          pagination={false}
        />
      </Card>
    </div>
  )
}