diff --git a/handshake.go b/handshake.go index b32e96f5..25515211 100644 --- a/handshake.go +++ b/handshake.go @@ -228,6 +228,9 @@ func DefaultCertificateSelector(hello *tls.ClientHelloInfo, choices []Certificat func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNecessary, obtainIfNecessary bool) (Certificate, error) { log := loggerNamed(cfg.Logger, "handshake") + // TODO: get a proper context... somehow...? + ctx := context.Background() + // First check our in-memory cache to see if we've already loaded it cert, matched, defaulted := cfg.getCertificate(hello) if matched { @@ -239,12 +242,10 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece zap.String("hash", cert.hash)) } if cert.managed && cfg.OnDemand != nil && obtainIfNecessary { - // It's been reported before that if the machine goes to sleep (or - // suspends the process) that certs which are already loaded into - // memory won't get renewed in the background, so we need to check - // expiry on each handshake too, sigh: - // https://caddy.community/t/local-certificates-not-renewing-on-demand/9482 - return cfg.optionalMaintenance(loggerNamed(cfg.Logger, "on_demand"), cert, hello) + // On-demand certificates are maintained in the background, but + // maintenance is triggered by handshakes instead of by a timer + // as in maintain.go. + return cfg.optionalMaintenance(ctx, loggerNamed(cfg.Logger, "on_demand"), cert, hello) } return cert, nil } @@ -290,7 +291,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece zap.Time("expiration", loadedCert.Leaf.NotAfter), zap.String("hash", loadedCert.hash)) } - loadedCert, err = cfg.handshakeMaintenance(hello, loadedCert) + loadedCert, err = cfg.handshakeMaintenance(ctx, hello, loadedCert) if err != nil { if log != nil { log.Error("maintining newly-loaded certificate", @@ -302,7 +303,7 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece } if cfg.OnDemand != nil && obtainIfNecessary { // By this point, we need to ask the CA for a certificate - return cfg.obtainOnDemandCertificate(hello) + return cfg.obtainOnDemandCertificate(ctx, hello) } } @@ -336,8 +337,8 @@ func (cfg *Config) getCertDuringHandshake(hello *tls.ClientHelloInfo, loadIfNece // optionalMaintenance will perform maintenance on the certificate (if necessary) and // will return the resulting certificate. This should only be done if the certificate // is managed, OnDemand is enabled, and the scope is allowed to obtain certificates. -func (cfg *Config) optionalMaintenance(log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) { - newCert, err := cfg.handshakeMaintenance(hello, cert) +func (cfg *Config) optionalMaintenance(ctx context.Context, log *zap.Logger, cert Certificate, hello *tls.ClientHelloInfo) (Certificate, error) { + newCert, err := cfg.handshakeMaintenance(ctx, hello, cert) if err == nil { return newCert, nil } @@ -382,7 +383,7 @@ func (cfg *Config) checkIfCertShouldBeObtained(name string) error { // hello, it will wait and use what the other goroutine obtained. // // This function is safe for use by multiple concurrent goroutines. -func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certificate, error) { +func (cfg *Config) obtainOnDemandCertificate(ctx context.Context, hello *tls.ClientHelloInfo) (Certificate, error) { log := loggerNamed(cfg.Logger, "on_demand") name := cfg.getNameFromClientHello(hello) @@ -436,9 +437,10 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif log.Info("obtaining new certificate", zap.String("server_name", name)) } - // TODO: use a proper context; we use one with timeout because retries are enabled because interactive is false + // TODO: we are only adding a timeout because we don't know if the context passed in is actually cancelable... // (timeout duration is based on https://caddy.community/t/zerossl-dns-challenge-failing-often-route53-plugin/13822/24?u=matt) - ctx, cancel := context.WithTimeout(context.TODO(), 180*time.Second) + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, 180*time.Second) defer cancel() // Obtain the certificate @@ -464,32 +466,35 @@ func (cfg *Config) obtainOnDemandCertificate(hello *tls.ClientHelloInfo) (Certif // OCSP stapling errors are not returned, only logged. // // This function is safe for use by multiple concurrent goroutines. -func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) { +func (cfg *Config) handshakeMaintenance(ctx context.Context, hello *tls.ClientHelloInfo, cert Certificate) (Certificate, error) { log := loggerNamed(cfg.Logger, "on_demand") // Check cert expiration if currentlyInRenewalWindow(cert.Leaf.NotBefore, cert.Leaf.NotAfter, cfg.RenewalWindowRatio) { - return cfg.renewDynamicCertificate(hello, cert) + return cfg.renewDynamicCertificate(ctx, hello, cert) } // Check OCSP staple validity - if cert.ocsp != nil { - refreshTime := cert.ocsp.ThisUpdate.Add(cert.ocsp.NextUpdate.Sub(cert.ocsp.ThisUpdate) / 2) - if time.Now().After(refreshTime) { - _, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil) - if err != nil { - // An error with OCSP stapling is not the end of the world, and in fact, is - // quite common considering not all certs have issuer URLs that support it. - if log != nil { - log.Warn("stapling OCSP", - zap.String("server_name", hello.ServerName), - zap.Error(err)) - } + if cert.ocsp != nil && !freshOCSP(cert.ocsp) { + ocspResp, err := stapleOCSP(cfg.OCSP, cfg.Storage, &cert, nil) + if err != nil { + // An error with OCSP stapling is not the end of the world, and in fact, is + // quite common considering not all certs have issuer URLs that support it. + if log != nil { + log.Warn("stapling OCSP", + zap.String("server_name", hello.ServerName), + zap.Error(err)) } - cfg.certCache.mu.Lock() - cfg.certCache.cache[cert.hash] = cert - cfg.certCache.mu.Unlock() } + + // our copy of cert has the new OCSP staple, so replace it in the cache + cfg.certCache.mu.Lock() + cfg.certCache.cache[cert.hash] = cert + cfg.certCache.mu.Unlock() + + // We attempt to replace any certificates that were revoked. + // Crucially, this happens OUTSIDE a lock on the certCache. + cfg.forceRenewIfCertRevoked(ctx, log, cert, ocspResp) } return cert, nil @@ -503,7 +508,7 @@ func (cfg *Config) handshakeMaintenance(hello *tls.ClientHelloInfo, cert Certifi // certificate has been renewed, and returns the renewed certificate. // // This function is safe for use by multiple concurrent goroutines. -func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) { +func (cfg *Config) renewDynamicCertificate(ctx context.Context, hello *tls.ClientHelloInfo, currentCert Certificate) (Certificate, error) { log := loggerNamed(cfg.Logger, "on_demand") name := cfg.getNameFromClientHello(hello) @@ -587,6 +592,7 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe // Renew and reload the certificate renewAndReload := func(ctx context.Context, cancel context.CancelFunc) (Certificate, error) { defer cancel() + err = cfg.RenewCertAsync(ctx, name, false) if err == nil { // even though the recursive nature of the dynamic cert loading @@ -617,14 +623,13 @@ func (cfg *Config) renewDynamicCertificate(hello *tls.ClientHelloInfo, currentCe // if the certificate hasn't expired, we can serve what we have and renew in the background if timeLeft > 0 { - // TODO: get a proper context; we use one with timeout because retries are enabled because interactive is false - ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Minute) + ctx, cancel := context.WithTimeout(ctx, 5*time.Minute) go renewAndReload(ctx, cancel) return currentCert, nil } // otherwise, we have to block while we renew an expired certificate - ctx, cancel := context.WithTimeout(context.TODO(), 90*time.Second) + ctx, cancel := context.WithTimeout(ctx, 90*time.Second) return renewAndReload(ctx, cancel) } diff --git a/maintain.go b/maintain.go index 63d475ce..beb59f15 100644 --- a/maintain.go +++ b/maintain.go @@ -394,64 +394,8 @@ func (certCache *Cache) updateOCSPStaples(ctx context.Context) { // We attempt to replace any certificates that were revoked. // Crucially, this happens OUTSIDE a lock on the certCache. for _, renew := range renewQueue { - if logger != nil { - logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate", - zap.Strings("identifiers", renew.oldCert.Names), - zap.Time("expiration", renew.oldCert.Leaf.NotAfter)) - } - - renewName := renew.oldCert.Names[0] - cfg := configs[renewName] - - // if revoked for key compromise, we can't be sure whether the storage of - // the key is still safe; however, we KNOW the old key is not safe, and we - // can only hope by the time of revocation that storage has been secured; - // key management is not something we want to get into, but in this case - // it seems prudent to replace the key - and since renewal requires reuse - // of a prior key, we can't do a "renew" to replace the cert if we need a - // new key, so we'll have to do an obtain instead - var obtainInsteadOfRenew bool - if renew.ocspResp.RevocationReason == acme.ReasonKeyCompromise { - err := cfg.moveCompromisedPrivateKey(renew.oldCert, logger) - if err != nil && logger != nil { - logger.Error("could not remove compromised private key from use", - zap.Strings("identifiers", renew.oldCert.Names), - zap.String("issuer", renew.oldCert.issuerKey), - zap.Error(err)) - } - obtainInsteadOfRenew = true - } - - var err error - if obtainInsteadOfRenew { - err = cfg.ObtainCertAsync(ctx, renewName) - } else { - // notice that we force renewal; otherwise, it might see that the - // certificate isn't close to expiring and return, but we really - // need a replacement certificate! see issue #4191 - err = cfg.RenewCertAsync(ctx, renewName, true) - } - if err != nil { - // probably better to not serve a revoked certificate at all - if logger != nil { - logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache", - zap.Strings("identifiers", renew.oldCert.Names), - zap.Error(err)) - } - certCache.mu.Lock() - certCache.removeCertificate(renew.oldCert) - certCache.mu.Unlock() - continue - } - err = cfg.reloadManagedCertificate(renew.oldCert) - if err != nil { - if logger != nil { - logger.Error("after obtaining new certificate due to OCSP status of REVOKED", - zap.Strings("identifiers", renew.oldCert.Names), - zap.Error(err)) - } - continue - } + cfg := configs[renew.oldCert.Names[0]] + cfg.forceRenewIfCertRevoked(ctx, logger, renew.oldCert, renew.ocspResp) } } @@ -599,6 +543,73 @@ func deleteExpiredCerts(ctx context.Context, storage Storage, gracePeriod time.D return nil } +// forceRenewIfCertRevoked forcefully renews cert if ocspResp has a status of Revoked. (It is a no-op otherwise.) +// Due to nuances with error handling, any errors are logged with the provided logger rather than returned. +// This MUST NOT be called within a lock on cfg.certCacheMu. +func (cfg *Config) forceRenewIfCertRevoked(ctx context.Context, logger *zap.Logger, cert Certificate, ocspResp *ocsp.Response) { + if !cert.managed || ocspResp.Status != ocsp.Revoked || len(cert.Names) == 0 { + return + } + + if logger != nil { + logger.Warn("OCSP status for managed certificate is REVOKED; attempting to replace with new certificate", + zap.Strings("identifiers", cert.Names), + zap.Time("expiration", cert.Leaf.NotAfter)) + } + + renewName := cert.Names[0] + + // if revoked for key compromise, we can't be sure whether the storage of + // the key is still safe; however, we KNOW the old key is not safe, and we + // can only hope by the time of revocation that storage has been secured; + // key management is not something we want to get into, but in this case + // it seems prudent to replace the key - and since renewal requires reuse + // of a prior key, we can't do a "renew" to replace the cert if we need a + // new key, so we'll have to do an obtain instead + var obtainInsteadOfRenew bool + if ocspResp.RevocationReason == acme.ReasonKeyCompromise { + err := cfg.moveCompromisedPrivateKey(cert, logger) + if err != nil && logger != nil { + logger.Error("could not remove compromised private key from use", + zap.Strings("identifiers", cert.Names), + zap.String("issuer", cert.issuerKey), + zap.Error(err)) + } + obtainInsteadOfRenew = true + } + + var err error + if obtainInsteadOfRenew { + err = cfg.ObtainCertAsync(ctx, renewName) + } else { + // notice that we force renewal; otherwise, it might see that the + // certificate isn't close to expiring and return, but we really + // need a replacement certificate! see issue #4191 + err = cfg.RenewCertAsync(ctx, renewName, true) + } + if err != nil { + // probably better to not serve a revoked certificate at all + if logger != nil { + logger.Error("unable to obtain new to certificate after OCSP status of REVOKED; removing from cache", + zap.Strings("identifiers", cert.Names), + zap.Error(err)) + } + cfg.certCache.mu.Lock() + cfg.certCache.removeCertificate(cert) + cfg.certCache.mu.Unlock() + return + } + err = cfg.reloadManagedCertificate(cert) + if err != nil { + if logger != nil { + logger.Error("after obtaining new certificate due to OCSP status of REVOKED", + zap.Strings("identifiers", cert.Names), + zap.Error(err)) + } + return + } +} + // moveCompromisedPrivateKey moves the private key for cert to a ".compromised" file // by copying the data to the new file, then deleting the old one. func (cfg *Config) moveCompromisedPrivateKey(cert Certificate, logger *zap.Logger) error {