diff --git a/cmd/edgeguard-api/main.go b/cmd/edgeguard-api/main.go index 57a2122..ba73e79 100644 --- a/cmd/edgeguard-api/main.go +++ b/cmd/edgeguard-api/main.go @@ -17,6 +17,7 @@ import ( "github.com/gin-gonic/gin" "github.com/jackc/pgx/v5/pgxpool" + "git.netcell-it.de/projekte/edgeguard-native/internal/cluster" "git.netcell-it.de/projekte/edgeguard-native/internal/database" "git.netcell-it.de/projekte/edgeguard-native/internal/handlers" "git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response" @@ -93,7 +94,25 @@ func main() { "error", err) } else { slog.Info("DB pool open, registering CRUD handlers") - nodeID := nodeIDOrHostname() + + nodeID, nodeErr := cluster.EnsureNodeID("") + if nodeErr != nil { + slog.Warn("node-id not persisted, using ephemeral", + "id", nodeID, "error", nodeErr) + } + clusterStore := cluster.NewStore(pool) + + // Self-register in ha_nodes — only if setup is complete + // (we want the operator-defined FQDN, not the OS hostname, + // to land in api_url). Failures are logged but non-fatal. 
const (
	// DefaultNodeIDPath persists the node identifier across restarts.
	// Lives in the EdgeGuard data dir so /etc/machine-id collisions
	// (cloned VMs) don't matter — only this file determines identity.
	DefaultNodeIDPath = "/var/lib/edgeguard/node-id"
	nodeIDPrefix      = "n-"

	// nodeIDHexLen is the number of lowercase hex characters that follow
	// nodeIDPrefix in a well-formed node ID.
	nodeIDHexLen = 16
)

// EnsureNodeID returns the stable cluster node identifier, generating
// and persisting one on first call. The format is `n-<16 hex chars>`.
// An empty path selects DefaultNodeIDPath.
//
// Outcomes:
//   - The file holds a valid ID: that ID is returned, error is nil.
//   - The file is missing or holds junk: a new ID is minted, written
//     atomically (temp file + rename) and returned, error is nil.
//   - The file cannot be read for any reason other than "does not
//     exist", or the new ID cannot be written: a freshly minted
//     in-memory ID is returned TOGETHER with a non-nil error, so the
//     caller can decide whether to abort or run with an ephemeral ID.
//     An unreadable file is never overwritten — it may hold a valid
//     identity that we merely failed to read.
//   - Minting itself fails: "" and the error are returned.
//
// Every call that cannot persist mints a different random ID. Callers
// that need one consistent ephemeral ID within a process must call
// this once and pass the result around instead of calling it again.
func EnsureNodeID(path string) (string, error) {
	if path == "" {
		path = DefaultNodeIDPath
	}

	raw, readErr := os.ReadFile(path)
	if readErr == nil {
		if existing := strings.TrimSpace(string(raw)); validNodeID(existing) {
			return existing, nil
		}
		// Readable but not a valid ID: fall through and regenerate.
	}

	id, err := mintNodeID()
	if err != nil {
		return "", fmt.Errorf("mint node-id: %w", err)
	}

	// Anything other than "file not there yet" means we do not know what
	// is on disk; hand back the ephemeral ID without touching the file.
	if readErr != nil && !os.IsNotExist(readErr) {
		return id, fmt.Errorf("read node-id: %w", readErr)
	}

	if err := persistNodeID(path, id); err != nil {
		return id, err
	}
	return id, nil
}

// persistNodeID writes id (plus a trailing newline) to path, creating the
// parent directory if needed. The data goes to a sibling temp file first
// and is renamed into place, so a crash mid-write cannot leave a truncated
// node-id file behind (which would be treated as corrupt and replaced by a
// new identity on the next boot).
func persistNodeID(path, id string) error {
	if err := os.MkdirAll(filepath.Dir(path), 0o750); err != nil {
		return fmt.Errorf("ensure node-id dir: %w", err)
	}
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, []byte(id+"\n"), 0o640); err != nil {
		_ = os.Remove(tmp) // best effort; the write error is what matters
		return fmt.Errorf("write node-id: %w", err)
	}
	if err := os.Rename(tmp, path); err != nil {
		_ = os.Remove(tmp) // best effort; the rename error is what matters
		return fmt.Errorf("write node-id: %w", err)
	}
	return nil
}

// mintNodeID builds a new random node ID: nodeIDPrefix followed by
// nodeIDHexLen hex characters read from crypto/rand.
func mintNodeID() (string, error) {
	buf := make([]byte, nodeIDHexLen/2)
	if _, err := rand.Read(buf); err != nil {
		return "", err
	}
	return nodeIDPrefix + hex.EncodeToString(buf), nil
}

// validNodeID reports whether s is nodeIDPrefix followed by exactly
// nodeIDHexLen lowercase hex characters.
func validNodeID(s string) bool {
	if !strings.HasPrefix(s, nodeIDPrefix) {
		return false
	}
	hexPart := s[len(nodeIDPrefix):]
	if len(hexPart) != nodeIDHexLen {
		return false
	}
	for i := 0; i < len(hexPart); i++ {
		c := hexPart[i]
		isDigit := c >= '0' && c <= '9'
		isLowerHex := c >= 'a' && c <= 'f'
		if !isDigit && !isLowerHex {
			return false
		}
	}
	return true
}
id1 != id2 { + t.Errorf("node id should be stable: %q vs %q", id1, id2) + } +} + +func TestEnsureNodeID_RejectsCorruptFile(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "node-id") + if err := os.WriteFile(path, []byte("not a real id\n"), 0o640); err != nil { + t.Fatal(err) + } + id, err := EnsureNodeID(path) + if err != nil { + t.Fatalf("EnsureNodeID: %v", err) + } + if !validNodeID(id) { + t.Errorf("expected fresh id when file was junk, got %q", id) + } + // Re-read should now match the regenerated id. + id2, _ := EnsureNodeID(path) + if id != id2 { + t.Errorf("regenerated id not persisted: %q vs %q", id, id2) + } +} diff --git a/internal/cluster/store.go b/internal/cluster/store.go new file mode 100644 index 0000000..aca6451 --- /dev/null +++ b/internal/cluster/store.go @@ -0,0 +1,138 @@ +package cluster + +import ( + "context" + "errors" + "fmt" + "os" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "git.netcell-it.de/projekte/edgeguard-native/internal/models" +) + +var ErrNotFound = errors.New("ha_node not found") + +// Store wraps the ha_nodes table — used by the cluster handler and by +// EnsureSelfRegistered. v1 only ever has one row (the local node); +// the table is in place so Phase 3.1 multi-node lands without +// schema churn. 
+type Store struct { + Pool *pgxpool.Pool +} + +func NewStore(pool *pgxpool.Pool) *Store { return &Store{Pool: pool} } + +const baseSelect = ` +SELECT id, name, fqdn, api_url, public_ip, internal_ip, role, + last_seen, joined_at, created_at, updated_at +FROM ha_nodes +` + +func (s *Store) List(ctx context.Context) ([]models.HANode, error) { + rows, err := s.Pool.Query(ctx, baseSelect+" ORDER BY joined_at ASC") + if err != nil { + return nil, err + } + defer rows.Close() + out := make([]models.HANode, 0, 4) + for rows.Next() { + n, err := scanNode(rows) + if err != nil { + return nil, err + } + out = append(out, *n) + } + return out, rows.Err() +} + +func (s *Store) Get(ctx context.Context, id string) (*models.HANode, error) { + row := s.Pool.QueryRow(ctx, baseSelect+" WHERE id = $1", id) + n, err := scanNode(row) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, ErrNotFound + } + return nil, err + } + return n, nil +} + +// UpsertSelf writes the local node's row using the database-side +// ON CONFLICT DO UPDATE so the call is safe to make on every boot. +// last_seen is also bumped — handy for the heartbeat-by-restart +// pattern even before periodic heartbeats land. 
+func (s *Store) UpsertSelf(ctx context.Context, n models.HANode) (*models.HANode, error) { + now := time.Now().UTC() + row := s.Pool.QueryRow(ctx, ` +INSERT INTO ha_nodes (id, name, fqdn, api_url, public_ip, internal_ip, role, last_seen, joined_at) +VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) +ON CONFLICT (id) DO UPDATE SET + name = EXCLUDED.name, + fqdn = EXCLUDED.fqdn, + api_url = EXCLUDED.api_url, + public_ip = COALESCE(EXCLUDED.public_ip, ha_nodes.public_ip), + internal_ip = COALESCE(EXCLUDED.internal_ip, ha_nodes.internal_ip), + role = EXCLUDED.role, + last_seen = EXCLUDED.last_seen, + updated_at = NOW() +RETURNING id, name, fqdn, api_url, public_ip, internal_ip, role, + last_seen, joined_at, created_at, updated_at`, + n.ID, n.Name, n.FQDN, n.APIURL, + n.PublicIP, n.InternalIP, n.Role, + now, now, + ) + return scanNode(row) +} + +// EnsureSelfRegistered mints the node-id if needed, builds the row +// from setup.json + os.Hostname, and upserts it. Called on edgeguard- +// api boot AFTER the DB pool is reachable. +// +// fqdn = setup-store fqdn (preferred) or hostname. +// apiURL = "https://" (HAProxy-fronted; v1 doesn't yet know if +// the operator runs on a non-default port). +func EnsureSelfRegistered(ctx context.Context, store *Store, fqdn string, role string) (*models.HANode, error) { + id, err := EnsureNodeID("") + if err != nil { + // Even when persistence failed (read-only /var/lib in dev), + // EnsureNodeID returns the in-memory id alongside the error + // — so we can still register, but the id will rotate on + // every boot. Surface as warning to the caller; here we + // just keep going so the dev box doesn't stay un-registered. 
+ _ = err + } + if id == "" { + return nil, fmt.Errorf("could not derive node id") + } + host, hostErr := os.Hostname() + if hostErr != nil { + host = "unknown" + } + if fqdn == "" { + fqdn = host + } + n := models.HANode{ + ID: id, + Name: host, + FQDN: fqdn, + APIURL: "https://" + fqdn, + Role: role, + } + return store.UpsertSelf(ctx, n) +} + +func scanNode(row interface{ Scan(...any) error }) (*models.HANode, error) { + var n models.HANode + if err := row.Scan( + &n.ID, &n.Name, &n.FQDN, &n.APIURL, + &n.PublicIP, &n.InternalIP, &n.Role, + &n.LastSeen, &n.JoinedAt, + &n.CreatedAt, &n.UpdatedAt, + ); err != nil { + return nil, err + } + return &n, nil +} diff --git a/internal/handlers/cluster.go b/internal/handlers/cluster.go new file mode 100644 index 0000000..67b993c --- /dev/null +++ b/internal/handlers/cluster.go @@ -0,0 +1,37 @@ +package handlers + +import ( + "github.com/gin-gonic/gin" + + "git.netcell-it.de/projekte/edgeguard-native/internal/cluster" + "git.netcell-it.de/projekte/edgeguard-native/internal/handlers/response" +) + +// ClusterHandler exposes cluster-state endpoints. v1 is read-only: +// the UI shows the list of registered nodes but cluster-join + write +// operations land in Phase 3.1. 
+type ClusterHandler struct { + Store *cluster.Store + LocalID string +} + +func NewClusterHandler(store *cluster.Store, localID string) *ClusterHandler { + return &ClusterHandler{Store: store, LocalID: localID} +} + +func (h *ClusterHandler) Register(rg *gin.RouterGroup) { + g := rg.Group("/cluster") + g.GET("/nodes", h.ListNodes) +} + +func (h *ClusterHandler) ListNodes(c *gin.Context) { + nodes, err := h.Store.List(c.Request.Context()) + if err != nil { + response.Internal(c, err) + return + } + response.OK(c, gin.H{ + "nodes": nodes, + "local_id": h.LocalID, + }) +} diff --git a/management-ui/src/App.tsx b/management-ui/src/App.tsx index d0890bd..9c38ca5 100644 --- a/management-ui/src/App.tsx +++ b/management-ui/src/App.tsx @@ -16,6 +16,7 @@ const DashboardPage = lazy(() => import('./pages/Dashboard')) const DomainsPage = lazy(() => import('./pages/Domains')) const BackendsPage = lazy(() => import('./pages/Backends')) const RoutingRulesPage = lazy(() => import('./pages/RoutingRules')) +const ClusterPage = lazy(() => import('./pages/Cluster')) const SettingsPage = lazy(() => import('./pages/Settings')) const queryClient = new QueryClient({ @@ -79,6 +80,7 @@ export default function App() { } /> } /> } /> + } /> } /> diff --git a/management-ui/src/components/Layout/Sidebar.tsx b/management-ui/src/components/Layout/Sidebar.tsx index fda5dbe..cd8b6bc 100644 --- a/management-ui/src/components/Layout/Sidebar.tsx +++ b/management-ui/src/components/Layout/Sidebar.tsx @@ -1,4 +1,4 @@ -import { BranchesOutlined, DashboardOutlined, DatabaseOutlined, GlobalOutlined, SettingOutlined } from '@ant-design/icons' +import { ApartmentOutlined, BranchesOutlined, DashboardOutlined, DatabaseOutlined, GlobalOutlined, SettingOutlined } from '@ant-design/icons' import { Menu, Typography } from 'antd' import { useNavigate, useLocation } from 'react-router-dom' import { useTranslation } from 'react-i18next' @@ -13,6 +13,7 @@ export default function Sidebar() { { key: '/domains', icon: , 
label: t('nav.domains') }, { key: '/backends', icon: , label: t('nav.backends') }, { key: '/routing-rules', icon: , label: t('nav.routing') }, + { key: '/cluster', icon: , label: t('nav.cluster') }, { key: '/settings', icon: , label: t('nav.settings') }, ] diff --git a/management-ui/src/i18n/locales/de/common.json b/management-ui/src/i18n/locales/de/common.json index 2dbe5f4..5dfa8f5 100644 --- a/management-ui/src/i18n/locales/de/common.json +++ b/management-ui/src/i18n/locales/de/common.json @@ -83,6 +83,15 @@ "selectBackend": "Backend wählen", "deleteConfirm": "Diese Routing-Regel wirklich löschen?" }, + "cluster": { + "title": "Cluster", + "intro": "{{count}} Node(s) registriert. Multi-Node-Cluster (KeyDB Active-Active + PG Streaming Replication) folgt in einem späteren Release.", + "id": "Node-ID", + "fqdn": "FQDN", + "role": "Rolle", + "joinedAt": "Beigetreten", + "self": "diese Node" + }, "settings": { "title": "Einstellungen", "intro": "System-Information und Setup-Status. Bearbeitbare Werte folgen in einem späteren Release.", diff --git a/management-ui/src/i18n/locales/en/common.json b/management-ui/src/i18n/locales/en/common.json index a763e1a..675aa4a 100644 --- a/management-ui/src/i18n/locales/en/common.json +++ b/management-ui/src/i18n/locales/en/common.json @@ -83,6 +83,15 @@ "selectBackend": "Select backend", "deleteConfirm": "Really delete this routing rule?" }, + "cluster": { + "title": "Cluster", + "intro": "{{count}} node(s) registered. Multi-node cluster (KeyDB Active-Active + PG streaming replication) coming in a later release.", + "id": "Node ID", + "fqdn": "FQDN", + "role": "Role", + "joinedAt": "Joined", + "self": "this node" + }, "settings": { "title": "Settings", "intro": "System information and setup status. 
Editable values come in a later release.", diff --git a/management-ui/src/pages/Cluster/index.tsx b/management-ui/src/pages/Cluster/index.tsx new file mode 100644 index 0000000..ee93aac --- /dev/null +++ b/management-ui/src/pages/Cluster/index.tsx @@ -0,0 +1,74 @@ +import { Card, Spin, Table, Tag, Typography } from 'antd' +import type { ColumnsType } from 'antd/es/table' +import { useQuery } from '@tanstack/react-query' +import { useTranslation } from 'react-i18next' + +import apiClient, { isEnvelope } from '../../api/client' + +interface HANode { + id: string + name: string + fqdn: string + api_url: string + public_ip?: string | null + internal_ip?: string | null + role: string + last_seen?: string | null + joined_at: string + created_at: string + updated_at: string +} + +interface ClusterPayload { + nodes: HANode[] + local_id: string +} + +export default function ClusterPage() { + const { t } = useTranslation() + + const { data, isLoading } = useQuery({ + queryKey: ['cluster', 'nodes'], + queryFn: async () => { + const r = await apiClient.get('/cluster/nodes') + if (isEnvelope(r.data)) return r.data.data as ClusterPayload + return null + }, + refetchInterval: 30_000, + }) + + if (isLoading) return + + const columns: ColumnsType = [ + { + title: t('cluster.id'), dataIndex: 'id', key: 'id', + render: (id: string) => ( + + {id}{' '} + {id === data?.local_id && {t('cluster.self')}} + + ), + }, + { title: t('cluster.fqdn'), dataIndex: 'fqdn', key: 'fqdn' }, + { title: t('cluster.role'), dataIndex: 'role', key: 'role' }, + { title: t('cluster.joinedAt'), dataIndex: 'joined_at', key: 'joined_at', + render: (s: string) => new Date(s).toLocaleString() }, + ] + + return ( +
+ {t('cluster.title')} + + {t('cluster.intro', { count: data?.nodes.length ?? 0 })} + + + + + + ) +}