Allow acquiring certificates in a multi-node cluster

Ingo Oppermann
2023-06-29 21:15:04 +02:00
parent 2b58c11bb1
commit 6e156d0f3a
13 changed files with 899 additions and 224 deletions


@@ -44,7 +44,9 @@ type Cluster interface {
CoreConfig() *config.Config
About() (ClusterAbout, error)
IsReady(origin string) error
IsClusterDegraded() (bool, error)
IsDegraded() (bool, error)
GetBarrier(name string) bool
Join(origin, id, raftAddress, peerAddress string) error
Leave(origin, id string) error // gracefully remove a node from the cluster
@@ -76,7 +78,7 @@ type Cluster interface {
SetKV(origin, key, value string) error
UnsetKV(origin, key string) error
GetKV(key string) (string, time.Time, error)
GetKV(origin, key string) (string, time.Time, error)
ListKV(prefix string) map[string]store.Value
ProxyReader() proxy.ProxyReader
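
Aside from the rename, the interface now threads an origin through GetKV, matching Join and Leave. One plausible reading: a follower forwards the call to the leader, with origin recording which node asked first so a misrouted forward can fail instead of looping. A minimal sketch of that forward-to-leader pattern; the node type and loop guard below are illustrative assumptions, not the actual cluster API:

// Sketch of the forward-to-leader pattern suggested by GetKV(origin, key).
// Names are illustrative, not the actual cluster implementation.
package main

import (
	"errors"
	"fmt"
)

var errNotLeader = errors.New("not the leader")

type node struct {
	id     string
	leader *node // nil on the leader itself
	kv     map[string]string
}

// GetKV serves reads on the leader and forwards on followers. The origin
// records who asked first, so a misrouted forward fails instead of looping.
func (n *node) GetKV(origin, key string) (string, error) {
	if n.leader != nil { // follower: forward at most once
		if origin != "" {
			return "", fmt.Errorf("%s: forwarded request landed on a follower: %w", n.id, errNotLeader)
		}
		return n.leader.GetKV(n.id, key)
	}
	v, ok := n.kv[key]
	if !ok {
		return "", fmt.Errorf("unknown key %q", key)
	}
	return v, nil
}

func main() {
	leader := &node{id: "n1", kv: map[string]string{"greeting": "hello"}}
	follower := &node{id: "n2", leader: leader}
	v, err := follower.GetKV("", "greeting") // forwarded to n1
	fmt.Println(v, err)
}
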
@@ -155,7 +157,8 @@ type cluster struct {
nodes map[string]*clusterNode
nodesLock sync.RWMutex
ready bool
barrier map[string]bool
barrierLock sync.RWMutex
limiter net.IPLimiter
}
@@ -187,6 +190,8 @@ func New(ctx context.Context, config Config) (Cluster, error) {
config: config.CoreConfig,
nodes: map[string]*clusterNode{},
barrier: map[string]bool{},
limiter: config.IPLimiter,
}
@@ -352,6 +357,16 @@ func New(ctx context.Context, config Config) (Cluster, error) {
go c.monitorLeadership()
go c.sentinel()
err = c.setup(ctx)
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("failed to setup cluster: %w", err)
}
return c, nil
}
func (c *cluster) setup(ctx context.Context) error {
// Wait for a leader to be elected
c.logger.Info().Log("Waiting for a leader to be elected ...")
@@ -364,8 +379,7 @@ func New(ctx context.Context, config Config) (Cluster, error) {
select {
case <-ctx.Done():
c.Shutdown()
return nil, fmt.Errorf("starting cluster has been aborted: %w", ctx.Err())
return fmt.Errorf("starting cluster has been aborted: %w", ctx.Err())
default:
}
@@ -374,7 +388,7 @@ func New(ctx context.Context, config Config) (Cluster, error) {
c.logger.Info().Log("Leader has been elected")
// Wait for cluster to leave degraded mode
// Wait for all cluster nodes to leave degraded mode
c.logger.Info().Log("Waiting for cluster to become operational ...")
@@ -384,125 +398,128 @@ func New(ctx context.Context, config Config) (Cluster, error) {
break
}
c.logger.Warn().WithError(err).Log("Cluster is degraded")
c.logger.Warn().WithError(err).Log("Cluster is in degraded state")
select {
case <-ctx.Done():
c.Shutdown()
return nil, fmt.Errorf("starting cluster has been aborted: %w: %w", ctx.Err(), err)
return fmt.Errorf("starting cluster has been aborted: %w: %w", ctx.Err(), err)
default:
}
time.Sleep(time.Second)
}
err := c.Barrier(ctx, "operational")
if err != nil {
return fmt.Errorf("failed on barrier: %w", err)
}
c.logger.Info().Log("Cluster is operational")
if c.isTLSRequired {
c.logger.Info().Log("Waiting for TLS certificates ...")
// Create certificate manager
names, err := c.getClusterHostnames()
hostnames, err := c.getClusterHostnames()
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("tls: failed to assemble list of all configured hostnames: %w", err)
return fmt.Errorf("tls: failed to assemble list of all configured hostnames: %w", err)
}
if len(names) == 0 {
c.Shutdown()
return nil, fmt.Errorf("tls: no hostnames are configured")
if len(hostnames) == 0 {
return fmt.Errorf("no hostnames are configured")
}
kvs, err := NewClusterKVS(c, c.logger.WithComponent("KVS"))
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("tls: cluster KVS: %w", err)
return fmt.Errorf("tls: cluster KVS: %w", err)
}
storage, err := NewClusterStorage(kvs, "core-cluster-certificates")
storage, err := NewClusterStorage(kvs, "core-cluster-certificates", c.logger.WithComponent("KVS"))
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("tls: certificate store: %w", err)
return fmt.Errorf("tls: certificate store: %w", err)
}
manager, err := autocert.New(autocert.Config{
Storage: storage,
DefaultHostname: names[0],
DefaultHostname: hostnames[0],
EmailAddress: c.config.TLS.Email,
IsProduction: !c.config.TLS.Staging,
Logger: c.logger.WithComponent("Let's Encrypt"),
})
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("tls: certificate manager: %w", err)
return fmt.Errorf("tls: certificate manager: %w", err)
}
c.certManager = manager
if c.IsRaftLeader() {
// Acquire certificates
err = manager.AcquireCertificates(ctx, c.config.Address, names)
if err != nil {
c.Shutdown()
return nil, fmt.Errorf("tls: failed to acquire certificates: %w", err)
}
resctx, rescancel := context.WithCancel(ctx)
defer rescancel()
err = manager.HTTPChallengeResolver(resctx, c.config.Address)
if err != nil {
return fmt.Errorf("tls: failed to start the HTTP challenge resolver: %w", err)
}
// We have to wait until all nodes have started the HTTP challenge resolver
err = c.Barrier(ctx, "acme")
if err != nil {
return fmt.Errorf("tls: failed on barrier: %w", err)
}
// Acquire certificates. All nodes can do this at the same time because everything
// is synced via the storage.
err = manager.AcquireCertificates(ctx, hostnames)
if err != nil {
return fmt.Errorf("tls: failed to acquire certificates: %w", err)
}
rescancel()
c.logger.Info().Log("TLS certificates acquired")
}
if !c.IsRaftLeader() {
tempctx, cancel := context.WithCancel(context.Background())
c.logger.Info().Log("Waiting for cluster to become ready ...")
if c.isTLSRequired {
// All followers forward any HTTP requests to the leader so that it can respond to the HTTP challenge
leaderAddress, _ := c.raft.Leader()
leader, err := c.CoreAPIAddress(leaderAddress)
if err != nil {
cancel()
c.Shutdown()
return nil, fmt.Errorf("unable to find leader address: %w", err)
}
url, err := url.Parse(leader)
if err != nil {
cancel()
return nil, fmt.Errorf("unable to parse leader address: %w", err)
}
url.Scheme = "http"
url.Path = "/"
url.User = nil
url.RawQuery = ""
go func() {
c.logger.Info().WithField("leader", url.String()).Log("Forwarding ACME challenges to leader")
autocert.ProxyHTTPChallenge(tempctx, c.config.Address, url)
c.logger.Info().WithField("leader", url.String()).Log("Stopped forwarding ACME challenges to leader")
}()
}
for {
// Ask leader if it is ready
err := c.IsReady("")
if err == nil {
cancel()
break
}
select {
case <-ctx.Done():
cancel()
c.Shutdown()
return nil, fmt.Errorf("starting cluster has been aborted: %w", ctx.Err())
default:
}
time.Sleep(time.Second)
}
err = c.Barrier(ctx, "ready")
if err != nil {
return fmt.Errorf("failed on barrier: %w", err)
}
c.ready = true
c.logger.Info().Log("Cluster is ready")
return c, nil
return nil
}
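
The reworked flow above is worth spelling out: every node starts an HTTP challenge resolver, the "acme" barrier guarantees they are all listening before any order is placed, and then all nodes may call AcquireCertificates concurrently because challenge data and issued certificates are synced through the cluster storage, so whichever node Let's Encrypt contacts can answer. A minimal sketch of such a shared-storage HTTP-01 responder, assuming a hypothetical challengeStore in place of the cluster KVS (illustrative only, not the autocert package used by this commit):

// Minimal HTTP-01 responder backed by shared storage. Illustrative only.
package main

import (
	"net/http"
	"strings"
	"sync"
)

// challengeStore stands in for the cluster KVS: every node sees the same
// token -> keyAuth mapping, regardless of which node stored it.
type challengeStore struct {
	mu     sync.RWMutex
	tokens map[string]string
}

func (s *challengeStore) Put(token, keyAuth string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.tokens[token] = keyAuth
}

func (s *challengeStore) Get(token string) (string, bool) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	keyAuth, ok := s.tokens[token]
	return keyAuth, ok
}

func main() {
	store := &challengeStore{tokens: map[string]string{}}

	// Normally written by whichever node initiated the ACME order, then
	// replicated to all nodes via the shared storage.
	store.Put("token123", "token123.keyauth")

	// Each node runs this handler. Because the store is shared, it can
	// answer challenges initiated by any other node.
	http.HandleFunc("/.well-known/acme-challenge/", func(w http.ResponseWriter, r *http.Request) {
		token := strings.TrimPrefix(r.URL.Path, "/.well-known/acme-challenge/")
		keyAuth, ok := store.Get(token)
		if !ok {
			http.NotFound(w, r)
			return
		}
		w.Write([]byte(keyAuth))
	})

	// Real HTTP-01 validation must be reachable on port 80.
	http.ListenAndServe(":8080", nil)
}
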
func (c *cluster) GetBarrier(name string) bool {
c.barrierLock.RLock()
defer c.barrierLock.RUnlock()
return c.barrier[name]
}
func (c *cluster) Barrier(ctx context.Context, name string) error {
c.barrierLock.Lock()
c.barrier[name] = true
c.barrierLock.Unlock()
for {
ok, err := c.getClusterBarrier(name)
if ok {
break
}
c.logger.Warn().WithField("name", name).WithError(err).Log("Waiting for barrier")
select {
case <-ctx.Done():
return fmt.Errorf("barrier %s: starting cluster has been aborted: %w: %w", name, ctx.Err(), err)
default:
}
time.Sleep(time.Second)
}
return nil
}
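
The barrier is a polling rendezvous: a node marks the barrier locally first (so peers polling it via getClusterBarrier see true), then loops until every node reports the same. A self-contained toy of that rendezvous, with goroutines standing in for nodes:

// Toy version of the polling barrier: each "node" sets its own flag, then
// waits until it can observe the flag on every peer.
package main

import (
	"fmt"
	"sync"
	"time"
)

type toyNode struct {
	mu      sync.RWMutex
	barrier map[string]bool
}

func (n *toyNode) set(name string) {
	n.mu.Lock()
	defer n.mu.Unlock()
	n.barrier[name] = true
}

func (n *toyNode) get(name string) bool {
	n.mu.RLock()
	defer n.mu.RUnlock()
	return n.barrier[name]
}

func await(self *toyNode, peers []*toyNode, name string) {
	self.set(name) // mark first, so peers polling us see it
	for {
		ready := true
		for _, p := range peers {
			if !p.get(name) {
				ready = false
				break
			}
		}
		if ready {
			return
		}
		time.Sleep(10 * time.Millisecond) // poll, as the real code does once per second
	}
}

func main() {
	nodes := []*toyNode{
		{barrier: map[string]bool{}},
		{barrier: map[string]bool{}},
		{barrier: map[string]bool{}},
	}
	var wg sync.WaitGroup
	for i, n := range nodes {
		wg.Add(1)
		go func(i int, n *toyNode) {
			defer wg.Done()
			time.Sleep(time.Duration(i*50) * time.Millisecond) // stagger arrival
			await(n, nodes, "operational")
			fmt.Printf("node %d passed the barrier\n", i)
		}(i, n)
	}
	wg.Wait()
}
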
func (c *cluster) Address() string {
@@ -924,7 +941,7 @@ func (c *cluster) trackNodeChanges() {
c.nodesLock.Unlock()
// Put the cluster in "degraded" mode in case there's a mismatch in expected values
err = c.checkClusterNodes()
_, err = c.checkClusterNodes()
c.stateLock.Lock()
if err != nil {
@@ -948,36 +965,62 @@ func (c *cluster) trackNodeChanges() {
c.isCoreDegradedErr = nil
}
c.stateLock.Unlock()
/*
if c.isTLSRequired {
// Update list of managed hostnames
if c.certManager != nil {
c.certManager.ManageCertificates(context.Background(), hostnames)
}
}
*/
case <-c.shutdownCh:
return
}
}
}
func (c *cluster) checkClusterNodes() error {
// checkClusterNodes returns a list of all hostnames configured on all nodes. The
// returned list will not contain any duplicates. An error is returned if any
// node is not compatible.
func (c *cluster) checkClusterNodes() ([]string, error) {
hostnames := map[string]struct{}{}
c.nodesLock.RLock()
defer c.nodesLock.RUnlock()
for id, node := range c.nodes {
if status, err := node.Status(); status == "offline" {
return fmt.Errorf("node %s is offline: %w", id, err)
return nil, fmt.Errorf("node %s is offline: %w", id, err)
}
version := node.Version()
if err := verifyClusterVersion(version); err != nil {
return fmt.Errorf("node %s has a different version: %s: %w", id, version, err)
return nil, fmt.Errorf("node %s has a different version: %s: %w", id, version, err)
}
config, err := node.CoreConfig()
if err != nil {
return fmt.Errorf("node %s has no configuration available: %w", id, err)
return nil, fmt.Errorf("node %s has no configuration available: %w", id, err)
}
if err := verifyClusterConfig(c.config, config); err != nil {
return fmt.Errorf("node %s has a different configuration: %w", id, err)
return nil, fmt.Errorf("node %s has a different configuration: %w", id, err)
}
for _, name := range config.Host.Name {
hostnames[name] = struct{}{}
}
}
return nil
names := []string{}
for key := range hostnames {
names = append(names, key)
}
sort.Strings(names)
return names, nil
}
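
checkClusterNodes now doubles as the hostname collector, using the usual map[string]struct{} set flattened into a sorted slice so the result is deterministic across nodes. Distilled into a small standalone helper (illustrative, not part of the commit):

package main

import (
	"fmt"
	"sort"
)

// dedupeSorted collapses duplicates into a deterministic, sorted list,
// the same idiom checkClusterNodes applies to the per-node hostnames.
func dedupeSorted(items []string) []string {
	set := map[string]struct{}{}
	for _, it := range items {
		set[it] = struct{}{}
	}
	out := make([]string, 0, len(set))
	for it := range set {
		out = append(out, it)
	}
	sort.Strings(out)
	return out
}

func main() {
	fmt.Println(dedupeSorted([]string{"b.example.com", "a.example.com", "b.example.com"}))
	// [a.example.com b.example.com]
}
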
func (c *cluster) checkClusterCoreNodes() error {
@@ -993,6 +1036,8 @@ func (c *cluster) checkClusterCoreNodes() error {
return nil
}
// getClusterHostnames returns a list of all hostnames configured on all nodes. The
// returned list will not contain any duplicates.
func (c *cluster) getClusterHostnames() ([]string, error) {
hostnames := map[string]struct{}{}
@@ -1021,6 +1066,21 @@ func (c *cluster) getClusterHostnames() ([]string, error) {
return names, nil
}
// getClusterBarrier returns whether all nodes have reached the given barrier.
func (c *cluster) getClusterBarrier(name string) (bool, error) {
c.nodesLock.RLock()
defer c.nodesLock.RUnlock()
for _, node := range c.nodes {
ok, err := node.Barrier(name)
if !ok {
return false, err
}
}
return true, nil
}
func verifyClusterVersion(v string) error {
version, err := ParseClusterVersion(v)
if err != nil {
@@ -1500,7 +1560,15 @@ func (c *cluster) UnsetKV(origin, key string) error {
return c.applyCommand(cmd)
}
func (c *cluster) GetKV(key string) (string, time.Time, error) {
func (c *cluster) GetKV(origin, key string) (string, time.Time, error) {
if ok, _ := c.IsClusterDegraded(); ok {
return "", time.Time{}, ErrDegraded
}
if !c.IsRaftLeader() {
return c.forwarder.GetKV(origin, key)
}
value, err := c.store.GetFromKVS(key)
if err != nil {
return "", time.Time{}, err
@@ -1515,22 +1583,6 @@ func (c *cluster) ListKV(prefix string) map[string]store.Value {
return storeValues
}
func (c *cluster) IsReady(origin string) error {
if ok, _ := c.IsClusterDegraded(); ok {
return ErrDegraded
}
if !c.IsRaftLeader() {
return c.forwarder.IsReady(origin)
}
if !c.ready {
return fmt.Errorf("no ready yet")
}
return nil
}
func (c *cluster) applyCommand(cmd *store.Command) error {
b, err := json.Marshal(cmd)
if err != nil {