Files
Debian e8334cd276 feat(scheduler): Auto-Renewal für Let's Encrypt Certs
Vorher: edgeguard-scheduler war 60s-sleep-Stub. LE-Certs liefen nach
90 Tagen ab und mussten manuell re-issued werden.

Jetzt:
* internal/services/certrenewer — Pipeline (find expiring → ACME-Issue
  → certstore.WriteCombined → Repo.Upsert → haproxy reload). Kapselt
  was der /tls-certs/issue-Handler macht, nur DB-driven für N Certs.
* edgeguard-scheduler nutzt acme.Service + tlscerts.Repo + certrenewer.
  Tick alle 6h, Threshold 30 Tage Restlaufzeit. Sofort-Run bei
  Startup damit eine frisch eingespielte Box auch ohne 6h-Wartezeit
  prüft.
* Issuer == "letsencrypt" als Filter — manuell hochgeladene PEMs
  bleiben unangetastet (Operator owns lifecycle).
* Errors landen in tls_certs.last_error, retry beim nächsten Tick
  (transiente ACME-Failures self-heal).
* systemd-Unit edgeguard-scheduler.service: ReadWritePaths um
  /etc/edgeguard erweitert (für Cert-PEM-Writes), NoNewPrivileges
  auf false (sudo systemctl reload haproxy braucht setuid). Spiegelt
  edgeguard-api-Unit.

Version 1.0.16.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 22:50:00 +02:00

145 lines
4.2 KiB
Go

// Package certrenewer encapsulates the "find expiring certs +
// re-issue them via Let's Encrypt + reload haproxy" pipeline so the
// scheduler and the handler share the exact same write path.
//
// Scope deliberately narrow:
//   - Only Let's Encrypt-issued rows are renewed (manually-uploaded
//     PEMs stay untouched — operator owns the lifecycle).
//   - Re-issue happens when not_after - now < threshold (default
//     30 days). LE recommends 30; certbot defaults to 30.
//   - On error, the row's last_error is set; the scheduler retries
//     on the next tick, so a transient ACME failure self-heals.
package certrenewer
import (
"context"
"errors"
"fmt"
"log/slog"
"os/exec"
"time"
"git.netcell-it.de/projekte/edgeguard-native/internal/models"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/certstore"
"git.netcell-it.de/projekte/edgeguard-native/internal/services/tlscerts"
)
// Issuer is the certificate source the renewer depends on;
// acme.Service implements it. The same call serves first issuance
// and renewal: Let's Encrypt does not distinguish the two, and a
// fresh certificate for the same Common-Name supersedes the old one.
type Issuer interface {
	// Issue obtains a certificate for domain and returns the leaf
	// certificate, the issuer chain and the private key as strings
	// (the renewer treats them as PEM), or a non-nil err on failure.
	Issue(domain string) (certPEM, chainPEM, keyPEM string, err error)
}
// Service renews expiring Let's Encrypt certificates. Construct it
// with New, which also applies the default Threshold.
type Service struct {
// Repo is the tls_certs store: Run lists expiring rows from it and
// writes renewal results and per-row errors back to it.
Repo *tlscerts.Repo
// Issuer obtains the replacement certificate. Run returns an error
// immediately when it is nil.
Issuer Issuer
// CertDir is the directory handed to certstore.WriteCombined when
// the renewed PEM is written.
CertDir string
// Threshold is the window passed to Repo.ListExpiringSoon. New
// substitutes 30 days when it is given 0.
Threshold time.Duration
// Logger lets the scheduler attach its slog handler. If nil,
// the package uses slog.Default.
Logger *slog.Logger
}
// New builds a Service from its dependencies. A zero threshold is
// replaced by the 30-day default; any other value is used as given.
// Logger is left nil, so the Service logs through slog.Default
// until the caller sets one.
func New(repo *tlscerts.Repo, issuer Issuer, certDir string, threshold time.Duration) *Service {
	const defaultThreshold = 30 * 24 * time.Hour

	svc := &Service{
		Repo:      repo,
		Issuer:    issuer,
		CertDir:   certDir,
		Threshold: threshold,
	}
	if svc.Threshold == 0 {
		svc.Threshold = defaultThreshold
	}
	return svc
}
// log returns the logger to use: the configured Logger when one is
// set, otherwise the process-wide slog default.
func (s *Service) log() *slog.Logger {
	if s.Logger == nil {
		return slog.Default()
	}
	return s.Logger
}
// Result is the tally of a single Run pass, returned so the
// scheduler can log or surface it.
//
//   - Checked: rows returned by the expiring-soon query.
//   - Renewed: certificates re-issued, written and stored.
//   - Failed:  certificates for which any renewal step errored.
//   - Skipped: rows whose issuer is not Let's Encrypt.
type Result struct {
	Checked, Renewed, Failed, Skipped int
}
// Run performs one renewal pass: it lists the tls_certs rows that
// expire within s.Threshold, re-issues every Let's Encrypt row via
// the Issuer, writes the combined PEM into s.CertDir, stores the new
// validity window in the repo and finally reloads HAProxy if any PEM
// on disk changed.
//
// A failure for one certificate (issue, parse, write or upsert) is
// logged, counted in Result.Failed and recorded in
// tls_certs.last_error; it never aborts the loop, so the next tick
// retries it.
//
// Run returns a non-nil error only when it cannot work at all (nil
// Issuer, listing failed) or when ctx is cancelled between two
// certificates. In the cancellation case the Result still reflects
// the work done so far, and HAProxy is still reloaded for PEMs that
// were already written.
func (s *Service) Run(ctx context.Context) (Result, error) {
	if s.Issuer == nil {
		return Result{}, errors.New("certrenewer: Issuer is nil — ACME-Email noch nicht im Setup gesetzt?")
	}
	rows, err := s.Repo.ListExpiringSoon(ctx, s.Threshold)
	if err != nil {
		return Result{}, fmt.Errorf("list expiring: %w", err)
	}

	const issuerLE = "letsencrypt"
	logger := s.log()
	res := Result{Checked: len(rows)}
	// needReload tracks "a PEM on disk changed", not "a row was fully
	// renewed": if the write succeeds but the DB upsert fails, HAProxy
	// must still pick up the new file.
	needReload := false
	var runErr error

	for _, row := range rows {
		// An ACME issuance can take a long time; stop between
		// certificates when the caller is shutting down.
		if ctxErr := ctx.Err(); ctxErr != nil {
			runErr = fmt.Errorf("certrenewer: pass interrupted: %w", ctxErr)
			break
		}

		if row.Issuer != issuerLE {
			res.Skipped++
			logger.Debug("certrenewer: skip non-LE cert", "domain", row.Domain, "issuer", row.Issuer)
			continue
		}

		logger.Info("certrenewer: renewing", "domain", row.Domain, "expires", row.NotAfter)
		certPEM, chainPEM, keyPEM, err := s.Issuer.Issue(row.Domain)
		if err != nil {
			res.Failed++
			logger.Error("certrenewer: issue failed", "domain", row.Domain, "error", err)
			s.markError(ctx, row.Domain, err.Error())
			continue
		}

		info, err := certstore.Parse(certPEM)
		if err != nil {
			res.Failed++
			logger.Error("certrenewer: parse failed", "domain", row.Domain, "error", err)
			s.markError(ctx, row.Domain, "parse: "+err.Error())
			continue
		}

		path, err := certstore.WriteCombined(s.CertDir, row.Domain, certPEM, chainPEM, keyPEM)
		if err != nil {
			res.Failed++
			logger.Error("certrenewer: write failed", "domain", row.Domain, "error", err)
			s.markError(ctx, row.Domain, "write: "+err.Error())
			continue
		}
		needReload = true

		now := time.Now()
		// The combined PEM holds certificate and key, so both paths
		// point at the same file.
		if _, err := s.Repo.Upsert(ctx, models.TLSCert{
			Domain:        row.Domain,
			Issuer:        issuerLE,
			Status:        "active",
			CertPath:      &path,
			KeyPath:       &path,
			NotBefore:     &info.NotBefore,
			NotAfter:      &info.NotAfter,
			LastRenewedAt: &now,
		}); err != nil {
			res.Failed++
			logger.Error("certrenewer: upsert failed", "domain", row.Domain, "error", err)
			s.markError(ctx, row.Domain, "upsert: "+err.Error())
			continue
		}
		res.Renewed++
	}

	if needReload {
		// A failed reload is only a warning: the new PEMs are on disk
		// and will be served after the next successful reload.
		if err := reloadHAProxy(); err != nil {
			logger.Warn("certrenewer: haproxy reload failed", "error", err)
		} else {
			logger.Info("certrenewer: haproxy reloaded")
		}
	}
	return res, runErr
}

// markError stores msg in tls_certs.last_error for domain. Recording
// is best-effort: a failure is logged rather than returned so that
// one unwritable row cannot abort the whole pass.
func (s *Service) markError(ctx context.Context, domain, msg string) {
	if err := s.Repo.MarkError(ctx, domain, msg); err != nil {
		s.log().Warn("certrenewer: recording last_error failed", "domain", domain, "error", err)
	}
}
// reloadHAProxy asks systemd to reload haproxy.service through
// non-interactive sudo ("-n": fail instead of prompting for a
// password). The call is bounded by a timeout so a hung sudo or
// systemctl cannot block the caller forever, and the command's
// combined output is attached to the returned error so the cause
// (e.g. a sudoers or unit problem) is visible in the logs.
func reloadHAProxy() error {
	const timeout = 30 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, "sudo", "-n", "/usr/bin/systemctl", "reload", "haproxy.service")
	out, err := cmd.CombinedOutput()
	switch {
	case err == nil:
		return nil
	case ctx.Err() != nil:
		// The process was killed by the deadline; say so explicitly
		// instead of surfacing a bare "signal: killed".
		return fmt.Errorf("reload haproxy.service: timed out after %s: %w", timeout, err)
	case len(out) == 0:
		return fmt.Errorf("reload haproxy.service: %w", err)
	default:
		return fmt.Errorf("reload haproxy.service: %w (output: %q)", err, out)
	}
}