From c5e7755afc8a2063215f4bc2c3179e73e9c86af0 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 16 Aug 2022 14:27:26 -0400 Subject: [PATCH 1/9] Allow automatic rebuilding of CRLs When enabled, periodic rebuilding of CRLs will improve PKI mounts in two way: 1. Reduced load during periods of high (new) revocations, as the CRL isn't rebuilt after each revocation but instead on a fixed schedule. 2. Ensuring the CRL is never stale as long as the cluster remains up, by checking for next CRL expiry and regenerating CRLs before that happens. This may increase cluster load when operators have large CRLs that they'd prefer to let go stale, rather than regenerating fresh copies. In particular, we set a grace period before expiration of CRLs where, when the periodic function triggers (about once a minute), we check upcoming CRL expirations and check if we need to rebuild the CRLs. Signed-off-by: Alexander Scheel --- builtin/logical/pki/backend.go | 45 +++--- builtin/logical/pki/crl_util.go | 197 +++++++++++++++++++++---- builtin/logical/pki/ocsp.go | 2 +- builtin/logical/pki/path_config_crl.go | 73 +++++++-- builtin/logical/pki/path_tidy.go | 14 +- builtin/logical/pki/storage.go | 9 +- 6 files changed, 271 insertions(+), 69 deletions(-) diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 49ff9f67a1c5e..cfa11bd3def75 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -8,8 +8,6 @@ import ( "sync/atomic" "time" - atomic2 "go.uber.org/atomic" - "github.com/hashicorp/vault/sdk/helper/consts" "github.com/armon/go-metrics" @@ -178,7 +176,6 @@ func Backend(conf *logical.BackendConfig) *backend { PeriodicFunc: b.periodicFunc, } - b.crlLifetime = time.Hour * 72 b.tidyCASGuard = new(uint32) b.tidyStatus = &tidyStatus{state: tidyStatusInactive} b.storage = conf.StorageView @@ -186,8 +183,8 @@ func Backend(conf *logical.BackendConfig) *backend { b.pkiStorageVersion.Store(0) - b.crlBuilder = &crlBuilder{} - b.isOcspDisabled = atomic2.NewBool(false) + b.crlBuilder = newCRLBuilder() + return &b } @@ -208,9 +205,6 @@ type backend struct { // Write lock around issuers and keys. issuersLock sync.RWMutex - - // Optimization to not read the CRL config on every OCSP request. - isOcspDisabled *atomic2.Bool } type ( @@ -307,8 +301,10 @@ func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation // initialize is used to perform a possible PKI storage migration if needed func (b *backend) initialize(ctx context.Context, _ *logical.InitializationRequest) error { - // load ocsp enabled status - setOcspStatus(b, ctx) + sc := b.makeStorageContext(ctx, b.storage) + if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil { + return err + } // Grab the lock prior to the updating of the storage lock preventing us flipping // the storage flag midway through the request stream of other requests. @@ -335,14 +331,6 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque return nil } -func setOcspStatus(b *backend, ctx context.Context) { - sc := b.makeStorageContext(ctx, b.storage) - config, err := sc.getRevocationConfig() - if config != nil && err == nil { - b.isOcspDisabled.Store(config.OcspDisable) - } -} - func (b *backend) useLegacyBundleCaStorage() bool { // This helper function is here to choose whether or not we use the newer // issuer/key storage format or the older legacy ca bundle format. @@ -398,10 +386,27 @@ func (b *backend) invalidate(ctx context.Context, key string) { } case key == "config/crl": // We may need to reload our OCSP status flag - setOcspStatus(b, ctx) + b.crlBuilder.markConfigDirty() } } func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error { - return b.crlBuilder.rebuildIfForced(ctx, b, request) + // First attempt to reload the CRL configuration. + sc := b.makeStorageContext(ctx, request.Storage) + if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil { + return err + } + + // Check if we're set to auto rebuild and a CRL is set to expire. + if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil { + return err + } + + // Then attempt to rebuild the CRLs if required. + if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil { + return err + } + + // All good! + return nil } diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index bd9028c911a0c..d21a49f86e2d1 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -13,6 +13,8 @@ import ( "sync/atomic" "time" + atomic2 "go.uber.org/atomic" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/errutil" @@ -33,9 +35,15 @@ type revocationInfo struct { // without the actual API calls. During the storage invalidation process, we do not have the required state // to actually rebuild the CRLs, so we need to schedule it in a deferred fashion. This allows either // read or write calls to perform the operation if required, or have the flag reset upon a write operation +// +// The CRL builder also tracks the revocation configuration. type crlBuilder struct { m sync.Mutex forceRebuild uint32 + + c sync.RWMutex + dirty *atomic2.Bool + config crlConfig } const ( @@ -43,6 +51,121 @@ const ( _enforceForceFlag = false ) +func newCRLBuilder() *crlBuilder { + return &crlBuilder{ + dirty: atomic2.NewBool(true), + config: defaultCrlConfig, + } +} + +func (cb *crlBuilder) markConfigDirty() { + cb.dirty.Store(true) +} + +func (cb *crlBuilder) reloadConfigIfRequired(sc *storageContext) error { + if cb.dirty.Load() { + // Acquire a write lock. + cb.c.Lock() + defer cb.c.Unlock() + + config, err := sc.getRevocationConfig() + if err != nil { + return err + } + + // Set the default config if none was returned to us. + if config != nil { + cb.config = *config + } else { + cb.config = defaultCrlConfig + } + + // Updated the config; unset dirty. + cb.dirty.Store(false) + } + + return nil +} + +func (cb *crlBuilder) getConfig() *crlConfig { + // Config may mutate immediately after accessing! + cb.c.RLock() + defer cb.c.RUnlock() + + configCopy := cb.config + return &configCopy +} + +func (cb *crlBuilder) getConfigWithUpdate(sc *storageContext) (*crlConfig, error) { + // Config may mutate immediately after accessing, but will be freshly + // fetched if necessary. + if err := cb.reloadConfigIfRequired(sc); err != nil { + return nil, err + } + + return cb.getConfig(), nil +} + +func (cb *crlBuilder) checkForAutoRebuild(sc *storageContext) error { + cfg := cb.getConfig() + if cfg.Disable || !cfg.AutoRebuild || atomic.LoadUint32(&cb.forceRebuild) == 1 { + // Not enabled, not on auto-rebuilder, or we're already scheduled to + // rebuild so there's no point to interrogate CRL values... + return nil + } + + // Auto-Rebuild is enabled. We need to check each issuer's CRL and see + // if its about to expire. If it is, we've gotta rebuild it (and well, + // every other CRL since we don't have a fine-toothed rebuilder). + // + // We store a list of all (unique) CRLs in the cluster-local CRL + // configuration along with their expiration dates. + crlConfig, err := sc.getLocalCRLConfig() + if err != nil { + return fmt.Errorf("error checking for auto-rebuild status: unable to fetch cluster-local CRL configuration: %v", err) + } + + // If there's no config, assume we've gotta rebuild it to get this + // information. + if crlConfig == nil { + atomic.CompareAndSwapUint32(&cb.forceRebuild, 0, 1) + return nil + } + + // If the map is empty, assume we need to upgrade and schedule a + // rebuild. + if len(crlConfig.CRLExpirationMap) == 0 { + atomic.CompareAndSwapUint32(&cb.forceRebuild, 0, 1) + return nil + } + + // Otherwise, check CRL's expirations and see if its zero or within + // the grace period and act accordingly. + now := time.Now() + + period, err := time.ParseDuration(cfg.AutoRebuildGracePeriod) + if err != nil { + // This may occur if the duration is empty; in that case + // assume the default. The default should be valid and shouldn't + // error. + defaultPeriod, defaultErr := time.ParseDuration(defaultCrlConfig.AutoRebuildGracePeriod) + if defaultErr != nil { + return fmt.Errorf("error checking for auto-rebuild status: unable to parse duration from both config's grace period (%v) and default grace period (%v):\n- config: %v\n- default: %v\n", cfg.AutoRebuildGracePeriod, defaultCrlConfig.AutoRebuildGracePeriod, err, defaultErr) + } + + period = defaultPeriod + } + + for _, value := range crlConfig.CRLExpirationMap { + if value.IsZero() || now.After(value.Add(-1*period)) { + atomic.CompareAndSwapUint32(&cb.forceRebuild, 0, 1) + return nil + } + } + + return nil +} + // rebuildIfForced is to be called by readers or periodic functions that might need to trigger // a refresh of the CRL before the read occurs. func (cb *crlBuilder) rebuildIfForced(ctx context.Context, b *backend, request *logical.Request) error { @@ -235,13 +358,23 @@ func revokeCert(ctx context.Context, b *backend, req *logical.Request, serial st } } - crlErr := b.crlBuilder.rebuild(ctx, b, req, false) - if crlErr != nil { - switch crlErr.(type) { - case errutil.UserError: - return logical.ErrorResponse(fmt.Sprintf("Error during CRL building: %s", crlErr)), nil - default: - return nil, fmt.Errorf("error encountered during CRL building: %w", crlErr) + // Fetch the config and see if we need to rebuild the CRL. If we have + // auto building enabled, we will wait for the next rebuild period to + // actually rebuild it. + config, err := b.crlBuilder.getConfigWithUpdate(sc) + if err != nil { + return nil, fmt.Errorf("error building CRL: while updating config: %v", err) + } + + if !config.AutoRebuild { + crlErr := b.crlBuilder.rebuild(ctx, b, req, false) + if crlErr != nil { + switch crlErr.(type) { + case errutil.UserError: + return logical.ErrorResponse(fmt.Sprintf("Error during CRL building: %s", crlErr)), nil + default: + return nil, fmt.Errorf("error encountered during CRL building: %w", crlErr) + } } } @@ -288,6 +421,13 @@ func buildCRLs(ctx context.Context, b *backend, req *logical.Request, forceNew b var wasLegacy bool sc := b.makeStorageContext(ctx, req.Storage) + // First, fetch an updated copy of the CRL config. We'll pass this into + // buildCRL. + globalCRLConfig, err := b.crlBuilder.getConfigWithUpdate(sc) + if err != nil { + return fmt.Errorf("error building CRL: while updating config: %v", err) + } + if !b.useLegacyBundleCaStorage() { issuers, err = sc.listIssuers() if err != nil { @@ -420,9 +560,12 @@ func buildCRLs(ctx context.Context, b *backend, req *logical.Request, forceNew b crlConfig.CRLNumberMap[crlIdentifier] += 1 // Lastly, build the CRL. - if err := buildCRL(sc, forceNew, representative, revokedCerts, crlIdentifier, crlNumber); err != nil { + nextUpdate, err := buildCRL(sc, globalCRLConfig, forceNew, representative, revokedCerts, crlIdentifier, crlNumber) + if err != nil { return fmt.Errorf("error building CRLs: unable to build CRL for issuer (%v): %v", representative, err) } + + crlConfig.CRLExpirationMap[crlIdentifier] = *nextUpdate } } @@ -610,26 +753,19 @@ func augmentWithRevokedIssuers(issuerIDEntryMap map[issuerID]*issuerEntry, issue // Builds a CRL by going through the list of revoked certificates and building // a new CRL with the stored revocation times and serial numbers. -func buildCRL(sc *storageContext, forceNew bool, thisIssuerId issuerID, revoked []pkix.RevokedCertificate, identifier crlID, crlNumber int64) error { - crlInfo, err := sc.getRevocationConfig() - if err != nil { - return errutil.InternalError{Err: fmt.Sprintf("error fetching CRL config information: %s", err)} - } - - crlLifetime := sc.Backend.crlLifetime +func buildCRL(sc *storageContext, crlInfo *crlConfig, forceNew bool, thisIssuerId issuerID, revoked []pkix.RevokedCertificate, identifier crlID, crlNumber int64) (*time.Time, error) { var revokedCerts []pkix.RevokedCertificate - if crlInfo.Expiry != "" { - crlDur, err := time.ParseDuration(crlInfo.Expiry) - if err != nil { - return errutil.InternalError{Err: fmt.Sprintf("error parsing CRL duration of %s", crlInfo.Expiry)} - } - crlLifetime = crlDur + crlLifetime, err := time.ParseDuration(crlInfo.Expiry) + if err != nil { + return nil, errutil.InternalError{Err: fmt.Sprintf("error parsing CRL duration of %s", crlInfo.Expiry)} } if crlInfo.Disable { if !forceNew { - return nil + // In the event of a disabled CRL, we'll have the next time set + // to the zero time as a sentinel in case we get re-enabled. + return &time.Time{}, nil } // NOTE: in this case, the passed argument (revoked) is not added @@ -648,23 +784,26 @@ WRITE: if caErr != nil { switch caErr.(type) { case errutil.UserError: - return errutil.UserError{Err: fmt.Sprintf("could not fetch the CA certificate: %s", caErr)} + return nil, errutil.UserError{Err: fmt.Sprintf("could not fetch the CA certificate: %s", caErr)} default: - return errutil.InternalError{Err: fmt.Sprintf("error fetching CA certificate: %s", caErr)} + return nil, errutil.InternalError{Err: fmt.Sprintf("error fetching CA certificate: %s", caErr)} } } + now := time.Now() + nextUpdate := now.Add(crlLifetime) + revocationListTemplate := &x509.RevocationList{ RevokedCertificates: revokedCerts, Number: big.NewInt(crlNumber), - ThisUpdate: time.Now(), - NextUpdate: time.Now().Add(crlLifetime), + ThisUpdate: now, + NextUpdate: nextUpdate, SignatureAlgorithm: signingBundle.RevocationSigAlg, } crlBytes, err := x509.CreateRevocationList(rand.Reader, revocationListTemplate, signingBundle.Certificate, signingBundle.PrivateKey) if err != nil { - return errutil.InternalError{Err: fmt.Sprintf("error creating new CRL: %s", err)} + return nil, errutil.InternalError{Err: fmt.Sprintf("error creating new CRL: %s", err)} } writePath := "crls/" + identifier.String() @@ -679,8 +818,8 @@ WRITE: Value: crlBytes, }) if err != nil { - return errutil.InternalError{Err: fmt.Sprintf("error storing CRL: %s", err)} + return nil, errutil.InternalError{Err: fmt.Sprintf("error storing CRL: %s", err)} } - return nil + return &nextUpdate, nil } diff --git a/builtin/logical/pki/ocsp.go b/builtin/logical/pki/ocsp.go index 67302e6c3f1a6..631458a89ad1d 100644 --- a/builtin/logical/pki/ocsp.go +++ b/builtin/logical/pki/ocsp.go @@ -100,7 +100,7 @@ func buildPathOcspPost(b *backend) *framework.Path { } func (b *backend) ocspHandler(ctx context.Context, request *logical.Request, data *framework.FieldData) (*logical.Response, error) { - if b.isOcspDisabled.Load() { + if b.crlBuilder.getConfig().OcspDisable { return OcspUnauthorizedResponse, nil } diff --git a/builtin/logical/pki/path_config_crl.go b/builtin/logical/pki/path_config_crl.go index d1d806339e7f2..78515521bc809 100644 --- a/builtin/logical/pki/path_config_crl.go +++ b/builtin/logical/pki/path_config_crl.go @@ -12,9 +12,20 @@ import ( // CRLConfig holds basic CRL configuration information type crlConfig struct { - Expiry string `json:"expiry"` - Disable bool `json:"disable"` - OcspDisable bool `json:"ocsp_disable"` + Expiry string `json:"expiry"` + Disable bool `json:"disable"` + OcspDisable bool `json:"ocsp_disable"` + AutoRebuild bool `json:"auto_rebuild"` + AutoRebuildGracePeriod string `json:"auto_rebuild_grace_period"` +} + +// Implicit default values for the config if it does not exist. +var defaultCrlConfig = crlConfig{ + Expiry: "72h", + Disable: false, + OcspDisable: false, + AutoRebuild: false, + AutoRebuildGracePeriod: "12h", } func pathConfigCRL(b *backend) *framework.Path { @@ -35,6 +46,15 @@ valid; defaults to 72 hours`, Type: framework.TypeBool, Description: `If set to true, ocsp unauthorized responses will be returned.`, }, + "auto_rebuild": { + Type: framework.TypeBool, + Description: `If set to true, enables automatic rebuilding of the CRL`, + }, + "auto_rebuild_grace_period": { + Type: framework.TypeString, + Description: `The time before the CRL expires to automatically rebuild it, when enabled. Must be shorter than the CRL expiry. Defaults to 12h.`, + Default: "12h", + }, }, Operations: map[logical.Operation]framework.OperationHandler{ @@ -63,9 +83,11 @@ func (b *backend) pathCRLRead(ctx context.Context, req *logical.Request, _ *fram return &logical.Response{ Data: map[string]interface{}{ - "expiry": config.Expiry, - "disable": config.Disable, - "ocsp_disable": config.OcspDisable, + "expiry": config.Expiry, + "disable": config.Disable, + "ocsp_disable": config.OcspDisable, + "auto_rebuild": config.AutoRebuild, + "auto_rebuild_grace_period": config.AutoRebuildGracePeriod, }, }, nil } @@ -86,18 +108,37 @@ func (b *backend) pathCRLWrite(ctx context.Context, req *logical.Request, d *fra config.Expiry = expiry } - var oldDisable bool + oldDisable := config.Disable if disableRaw, ok := d.GetOk("disable"); ok { - oldDisable = config.Disable config.Disable = disableRaw.(bool) } - var oldOcspDisable bool if ocspDisableRaw, ok := d.GetOk("ocsp_disable"); ok { - oldOcspDisable = config.OcspDisable config.OcspDisable = ocspDisableRaw.(bool) } + oldAutoRebuild := config.AutoRebuild + if autoRebuildRaw, ok := d.GetOk("auto_rebuild"); ok { + config.AutoRebuild = autoRebuildRaw.(bool) + } + + if autoRebuildGracePeriodRaw, ok := d.GetOk("auto_rebuild_grace_period"); ok { + autoRebuildGracePeriod := autoRebuildGracePeriodRaw.(string) + if _, err := time.ParseDuration(autoRebuildGracePeriod); err != nil { + return logical.ErrorResponse(fmt.Sprintf("given auto_rebuild_grace_period could not be decoded: %s", err)), nil + } + config.AutoRebuildGracePeriod = autoRebuildGracePeriod + } + + if config.AutoRebuild { + expiry, _ := time.ParseDuration(config.Expiry) + gracePeriod, _ := time.ParseDuration(config.AutoRebuildGracePeriod) + + if gracePeriod >= expiry { + return logical.ErrorResponse(fmt.Sprintf("CRL auto-rebuilding grace period (%v) must be strictly shorter than CRL expiry (%v) value when auto-rebuilding of CRLs is enabled", config.AutoRebuildGracePeriod, config.Expiry)), nil + } + } + entry, err := logical.StorageEntryJSON("config/crl", config) if err != nil { return nil, err @@ -107,8 +148,12 @@ func (b *backend) pathCRLWrite(ctx context.Context, req *logical.Request, d *fra return nil, err } - if oldDisable != config.Disable { - // It wasn't disabled but now it is, rotate + b.crlBuilder.markConfigDirty() + b.crlBuilder.reloadConfigIfRequired(sc) + + if oldDisable != config.Disable || (oldAutoRebuild && !config.AutoRebuild) { + // It wasn't disabled but now it is (or equivalently, we were set to + // auto-rebuild and we aren't now), so rotate the CRL. crlErr := b.crlBuilder.rebuild(ctx, b, req, true) if crlErr != nil { switch crlErr.(type) { @@ -120,10 +165,6 @@ func (b *backend) pathCRLWrite(ctx context.Context, req *logical.Request, d *fra } } - if oldOcspDisable != config.OcspDisable { - setOcspStatus(b, ctx) - } - return nil, nil } diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 556e4c3488dd0..02f9b4157fada 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -225,9 +225,21 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr } if rebuildCRL { - if err := b.crlBuilder.rebuild(ctx, b, req, false); err != nil { + // Expired certificates isn't generally an important + // reason to trigger a CRL rebuild for. Check if + // automatic CRL rebuilds have been enabled and defer + // the rebuild if so. + sc := b.makeStorageContext(ctx, req.Storage) + config, err := sc.getRevocationConfig() + if err != nil { return err } + + if !config.AutoRebuild { + if err := b.crlBuilder.rebuild(ctx, b, req, false); err != nil { + return err + } + } } } diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index 0145b2a37e411..1aea6ac7b7b86 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -159,8 +159,9 @@ type issuerEntry struct { } type localCRLConfigEntry struct { - IssuerIDCRLMap map[issuerID]crlID `json:"issuer_id_crl_map"` - CRLNumberMap map[crlID]int64 `json:"crl_number_map"` + IssuerIDCRLMap map[issuerID]crlID `json:"issuer_id_crl_map"` + CRLNumberMap map[crlID]int64 `json:"crl_number_map"` + CRLExpirationMap map[crlID]time.Time `json:"crl_expiration_map"` } type keyConfigEntry struct { @@ -803,6 +804,10 @@ func (sc *storageContext) getLocalCRLConfig() (*localCRLConfigEntry, error) { mapping.CRLNumberMap = make(map[crlID]int64) } + if len(mapping.CRLExpirationMap) == 0 { + mapping.CRLExpirationMap = make(map[crlID]time.Time) + } + return mapping, nil } From dfab956ac8373528cb7b15ae21e0c7425f3ca334 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Mon, 22 Aug 2022 08:21:00 -0400 Subject: [PATCH 2/9] Add changelog entry Signed-off-by: Alexander Scheel --- changelog/16762.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changelog/16762.txt diff --git a/changelog/16762.txt b/changelog/16762.txt new file mode 100644 index 0000000000000..ade57bd4dc7c2 --- /dev/null +++ b/changelog/16762.txt @@ -0,0 +1,3 @@ +```release-note:improvement +secrets/pki: Add ability to periodically rebuild CRL before expiry +``` From 2105846a6fcff47b39878a28de65a3f2edaab2a7 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Mon, 22 Aug 2022 08:35:56 -0400 Subject: [PATCH 3/9] Add documentation on periodic rebuilding Signed-off-by: Alexander Scheel --- website/content/api-docs/secret/pki.mdx | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/website/content/api-docs/secret/pki.mdx b/website/content/api-docs/secret/pki.mdx index 0aa5ad357305d..6beddec139000 100644 --- a/website/content/api-docs/secret/pki.mdx +++ b/website/content/api-docs/secret/pki.mdx @@ -2989,7 +2989,7 @@ $ curl \ } ``` -### Set CRL Configuration +### Set Revocation Configuration This endpoint allows setting the duration for which the generated CRL should be marked valid. If the CRL is disabled, it will return a signed but zero-length @@ -3009,6 +3009,13 @@ revoked until `tidy` has been run with the desired safety buffer. Re-enabling CRL generation will then result in all such certificates becoming a part of the CRL. +~> Note: Enabling automatic rebuilding of CRLs disabled immediate regeneration + on revocation. This is in line with the behavior of other CAs which only + rebuild CRLs periodically. We suggest manually hitting the rotate if a + fresh CRL is necessary after a revocation. For the most part though, CRLs + should not be relied upon for the latest certificate status information, + and OCSP should be used instead. + | Method | Path | | :----- | :---------------- | | `POST` | `/pki/config/crl` | @@ -3018,6 +3025,10 @@ the CRL. - `expiry` `(string: "72h")` - The amount of time the generated CRL should be valid. - `disable` `(bool: false)` - Disables or enables CRL building. - `ocsp_disable` `(bool: false)` - Disables or enables the OCSP responder in Vault. +- `auto_rebuild` `(bool: false)` - Enables or disables periodic rebuilding of + the CRL upon expiry. +- `auto_rebuild_grace_period` `(string: "12h")` - Grace period before CRL expiry + to attempt rebuild of CRL. Must be shorter than the CRL expiry period. #### Sample Payload @@ -3025,7 +3036,9 @@ the CRL. { "expiry": "48h", "disable": "false", - "ocsp_disable": "false" + "ocsp_disable": "false", + "auto_rebuild": "true", + "auto_rebuild_grace_period": "8h" } ``` @@ -3055,6 +3068,14 @@ and **must** be called on every cluster. to delete their issuer and is instead necessary to **remove** the mount completely. +~> **Note**: As of Vault 1.12, it is suggested to switch to enabling the CRL's + `auto_rebuild` functionality to avoid having to manually hit the Rotate + endpoint when the CRL expires. This ensures a valid CRL is always + maintained, at the expense of potentially not being up-to-date. If a + revocation occurs that must be immediately propagated, this endpoint can + be used to regenerate the CRL, though distribution must still occur outside + of Vault (either manually or via AIA where supported). + | Method | Path | | :----- | :---------------- | | `GET` | `/pki/crl/rotate` | From 70cbdbcb7c1e1f5ea1b892010c2bd117994d2377 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Mon, 22 Aug 2022 12:21:13 -0400 Subject: [PATCH 4/9] Allow modification of rollback period for testing When testing backends that use the periodic func, and specifically, testing the behavior of that periodic func, waiting for the usual 1m interval can lead to excessively long test execution. By switching to a shorter period--strictly for testing--we can make these tests execute faster. Signed-off-by: Alexander Scheel --- vault/rollback.go | 9 +++++---- vault/testing.go | 11 +++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/vault/rollback.go b/vault/rollback.go index c2a3687706a59..ed37f94ef9119 100644 --- a/vault/rollback.go +++ b/vault/rollback.go @@ -14,10 +14,11 @@ import ( "github.com/hashicorp/vault/sdk/logical" ) -const ( - // rollbackPeriod is how often we attempt rollbacks for all the backends - rollbackPeriod = time.Minute -) +// rollbackPeriod is how often we attempt rollbacks for all the backends. +// +// This is turned into a variable to allow test to check behavior without +// waiting the full minute. See SetRollbackPeriodForTesting(...). +var rollbackPeriod = time.Minute // RollbackManager is responsible for performing rollbacks of partial // secrets within logical backends. diff --git a/vault/testing.go b/vault/testing.go index 7a9246090a5c4..f5978ce144dd0 100644 --- a/vault/testing.go +++ b/vault/testing.go @@ -2347,3 +2347,14 @@ func RetryUntil(t testing.T, timeout time.Duration, f func() error) { } t.Fatalf("did not complete before deadline, err: %v", err) } + +// SetRollbackPeriodForTesting lets us modify the periodic func invocation +// time period to some other value. Best practice is to set this, spin up +// a test cluster and immediately reset the value back to the default, to +// avoid impacting other tests too much. To that end, we return the original +// value of that period. +func SetRollbackPeriodForTesting(newPeriod time.Duration) time.Duration { + oldPeriod := rollbackPeriod + rollbackPeriod = newPeriod + return oldPeriod +} From e7fe9ea1e644d7e9957e2d14634fe71fa5c1fcb1 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Mon, 22 Aug 2022 11:51:29 -0400 Subject: [PATCH 5/9] Add tests for auto-rebuilding of CRLs Signed-off-by: Alexander Scheel --- builtin/logical/pki/crl_test.go | 171 ++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) diff --git a/builtin/logical/pki/crl_test.go b/builtin/logical/pki/crl_test.go index 8f9c6b3d57abe..c780c22da9a47 100644 --- a/builtin/logical/pki/crl_test.go +++ b/builtin/logical/pki/crl_test.go @@ -8,7 +8,11 @@ import ( "testing" "time" + "github.com/hashicorp/vault/api" + vaulthttp "github.com/hashicorp/vault/http" "github.com/hashicorp/vault/sdk/logical" + "github.com/hashicorp/vault/vault" + "github.com/stretchr/testify/require" ) @@ -363,6 +367,8 @@ func TestCrlRebuilder(t *testing.T) { } func TestBYOC(t *testing.T) { + t.Parallel() + b, s := createBackendWithStorage(t) // Create a root CA. @@ -480,6 +486,8 @@ func TestBYOC(t *testing.T) { } func TestPoP(t *testing.T) { + t.Parallel() + b, s := createBackendWithStorage(t) // Create a root CA. @@ -638,6 +646,8 @@ func TestPoP(t *testing.T) { } func TestIssuerRevocation(t *testing.T) { + t.Parallel() + b, s := createBackendWithStorage(t) // Create a root CA. @@ -778,6 +788,167 @@ func TestIssuerRevocation(t *testing.T) { require.NotEmpty(t, resp.Data["revocation_time"]) } +func TestAutoRebuild(t *testing.T) { + t.Parallel() + + // While we'd like to reduce this duration, we need to wait until + // the rollback manager timer ticks. With the new helper, we can + // modify the rollback manager timer period directly, allowing us + // to shorten the total test time significantly. + newPeriod := 6 * time.Second + gracePeriod := (newPeriod + 1*time.Second).String() + crlTime := (newPeriod + 2*time.Second).String() + delta := 2 * newPeriod + + // This test requires the periodicFunc to trigger, which requires we stand + // up a full test cluster. + coreConfig := &vault.CoreConfig{ + LogicalBackends: map[string]logical.Factory{ + "pki": Factory, + }, + } + oldPeriod := vault.SetRollbackPeriodForTesting(newPeriod) + cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{ + HandlerFunc: vaulthttp.Handler, + }) + cluster.Start() + defer cluster.Cleanup() + client := cluster.Cores[0].Client + vault.SetRollbackPeriodForTesting(oldPeriod) + + // Mount PKI + err := client.Sys().Mount("pki", &api.MountInput{ + Type: "pki", + Config: api.MountConfigInput{ + DefaultLeaseTTL: "16h", + MaxLeaseTTL: "60h", + }, + }) + require.NoError(t, err) + + // Generate root. + _, err = client.Logical().Write("pki/root/generate/internal", map[string]interface{}{ + "ttl": "40h", + "common_name": "Root X1", + "key_type": "ec", + }) + require.NoError(t, err) + + // Setup a testing role. + _, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{ + "allow_any_name": true, + "enforce_hostnames": false, + "key_type": "ec", + }) + require.NoError(t, err) + + // Safety guard: we play with rebuild timing below. We don't expect + // this entire test to take longer than 80s. + _, err = client.Logical().Write("pki/config/crl", map[string]interface{}{ + "expiry": crlTime, + }) + require.NoError(t, err) + + // Issue a cert and revoke it. It should appear on the CRL right away. + resp, err := client.Logical().Write("pki/issue/local-testing", map[string]interface{}{ + "common_name": "example.com", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["serial_number"]) + leafSerial := resp.Data["serial_number"].(string) + + _, err = client.Logical().Write("pki/revoke", map[string]interface{}{ + "serial_number": leafSerial, + }) + require.NoError(t, err) + + crl := getCrlCertificateList(t, client, "pki") + lastCRLNumber := crl.Version + lastCRLExpiry := crl.NextUpdate + requireSerialNumberInCRL(t, crl, leafSerial) + + // Enable periodic rebuild of the CRL. + _, err = client.Logical().Write("pki/config/crl", map[string]interface{}{ + "expiry": crlTime, + "auto_rebuild": true, + "auto_rebuild_grace_period": gracePeriod, + }) + require.NoError(t, err) + + // Issue a cert and revoke it. + resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{ + "common_name": "example.com", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["serial_number"]) + newLeafSerial := resp.Data["serial_number"].(string) + + _, err = client.Logical().Write("pki/revoke", map[string]interface{}{ + "serial_number": newLeafSerial, + }) + require.NoError(t, err) + + // Serial should not appear on CRL. + crl = getCrlCertificateList(t, client, "pki") + thisCRLNumber := crl.Version + requireSerialNumberInCRL(t, crl, leafSerial) + + now := time.Now() + graceInterval, _ := time.ParseDuration(gracePeriod) + expectedUpdate := lastCRLExpiry.Add(-1 * graceInterval) + if requireSerialNumberInCRL(nil, crl, newLeafSerial) { + // If we somehow lagged and we ended up needing to rebuild + // the CRL, we should avoid throwing an error. + + if thisCRLNumber == lastCRLNumber { + t.Fatalf("unexpected failure: last (%v) and current (%v) leaf certificate might have the same serial number?", leafSerial, newLeafSerial) + } + + if !now.After(expectedUpdate) { + t.Fatalf("expected newly generated certificate with serial %v not to appear on this CRL but it did, prematurely: %v", newLeafSerial, crl) + } + + t.Fatalf("shouldn't be here") + } + + // Now, wait until we're within the grace period... Then start prompting + // for regeneration. + if expectedUpdate.After(now) { + time.Sleep(expectedUpdate.Sub(now)) + } + + // Otherwise, the absolute latest we're willing to wait is some delta + // after CRL expiry (to let stuff regenerate &c). + interruptChan := time.After(lastCRLExpiry.Sub(now) + delta) + for { + select { + case <-interruptChan: + t.Fatalf("expected CRL to regenerate prior to CRL expiry (plus %v grace period)", delta) + default: + crl = getCrlCertificateList(t, client, "pki") + if crl.NextUpdate.Equal(lastCRLExpiry) { + // Hack to ensure we got a net-new CRL. If we didn't, we can + // exit this default conditional and wait for the next + // go-round. When the timer fires, it'll populate the channel + // and we'll exit correctly. + time.Sleep(1 * time.Second) + break + } + + now := time.Now() + require.True(t, crl.ThisUpdate.Before(now)) + require.True(t, crl.NextUpdate.After(now)) + requireSerialNumberInCRL(t, crl, leafSerial) + requireSerialNumberInCRL(t, crl, newLeafSerial) + return + } + } +} + func requestCrlFromBackend(t *testing.T, s logical.Storage, b *backend) *logical.Response { crlReq := &logical.Request{ Operation: logical.ReadOperation, From 2d5057a8aadbb85a1b239d4fd97de054c72216a7 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 23 Aug 2022 10:25:55 -0400 Subject: [PATCH 6/9] Remove non-updating getConfig variant Signed-off-by: Alexander Scheel --- builtin/logical/pki/crl_util.go | 21 ++++++++++----------- builtin/logical/pki/ocsp.go | 6 +++--- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index d21a49f86e2d1..9a2aa8c597616 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -87,15 +87,6 @@ func (cb *crlBuilder) reloadConfigIfRequired(sc *storageContext) error { return nil } -func (cb *crlBuilder) getConfig() *crlConfig { - // Config may mutate immediately after accessing! - cb.c.RLock() - defer cb.c.RUnlock() - - configCopy := cb.config - return &configCopy -} - func (cb *crlBuilder) getConfigWithUpdate(sc *storageContext) (*crlConfig, error) { // Config may mutate immediately after accessing, but will be freshly // fetched if necessary. @@ -103,11 +94,19 @@ func (cb *crlBuilder) getConfigWithUpdate(sc *storageContext) (*crlConfig, error return nil, err } - return cb.getConfig(), nil + cb.c.RLock() + defer cb.c.RUnlock() + + configCopy := cb.config + return &configCopy, nil } func (cb *crlBuilder) checkForAutoRebuild(sc *storageContext) error { - cfg := cb.getConfig() + cfg, err := cb.getConfigWithUpdate(sc) + if err != nil { + return err + } + if cfg.Disable || !cfg.AutoRebuild || atomic.LoadUint32(&cb.forceRebuild) == 1 { // Not enabled, not on auto-rebuilder, or we're already scheduled to // rebuild so there's no point to interrogate CRL values... diff --git a/builtin/logical/pki/ocsp.go b/builtin/logical/pki/ocsp.go index 631458a89ad1d..658eaa5720be5 100644 --- a/builtin/logical/pki/ocsp.go +++ b/builtin/logical/pki/ocsp.go @@ -100,7 +100,9 @@ func buildPathOcspPost(b *backend) *framework.Path { } func (b *backend) ocspHandler(ctx context.Context, request *logical.Request, data *framework.FieldData) (*logical.Response, error) { - if b.crlBuilder.getConfig().OcspDisable { + sc := b.makeStorageContext(ctx, request.Storage) + cfg, err := b.crlBuilder.getConfigWithUpdate(sc) + if err != nil || cfg.OcspDisable { return OcspUnauthorizedResponse, nil } @@ -114,8 +116,6 @@ func (b *backend) ocspHandler(ctx context.Context, request *logical.Request, dat return OcspMalformedResponse, nil } - sc := b.makeStorageContext(ctx, request.Storage) - ocspStatus, err := getOcspStatus(sc, request, ocspReq) if err != nil { return logAndReturnInternalError(b, err), nil From 32f4182cf59b262c53b8dec8f6dbddb8d80f1013 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 23 Aug 2022 10:26:24 -0400 Subject: [PATCH 7/9] Fix typo in docs Signed-off-by: Alexander Scheel --- website/content/api-docs/secret/pki.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/content/api-docs/secret/pki.mdx b/website/content/api-docs/secret/pki.mdx index 6beddec139000..fefea8056dbf4 100644 --- a/website/content/api-docs/secret/pki.mdx +++ b/website/content/api-docs/secret/pki.mdx @@ -3009,7 +3009,7 @@ revoked until `tidy` has been run with the desired safety buffer. Re-enabling CRL generation will then result in all such certificates becoming a part of the CRL. -~> Note: Enabling automatic rebuilding of CRLs disabled immediate regeneration +~> Note: Enabling automatic rebuilding of CRLs disables immediate regeneration on revocation. This is in line with the behavior of other CAs which only rebuild CRLs periodically. We suggest manually hitting the rotate if a fresh CRL is necessary after a revocation. For the most part though, CRLs From 2232940bd4b48e09e19d7218f5708ee02ebb786d Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 23 Aug 2022 10:30:09 -0400 Subject: [PATCH 8/9] Avoid double reload of config Signed-off-by: Alexander Scheel --- builtin/logical/pki/crl_util.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 9a2aa8c597616..305f1a3b960cd 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -68,6 +68,12 @@ func (cb *crlBuilder) reloadConfigIfRequired(sc *storageContext) error { cb.c.Lock() defer cb.c.Unlock() + if !cb.dirty.Load() { + // Someone else might've been reloading the config; no need + // to do it twice. + return nil + } + config, err := sc.getRevocationConfig() if err != nil { return err From 99b086ef5fcca9056fd13bb671550ae4606ed061 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Tue, 23 Aug 2022 13:13:29 -0400 Subject: [PATCH 9/9] Fix extra space Signed-off-by: Alexander Scheel --- website/content/api-docs/secret/pki.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/content/api-docs/secret/pki.mdx b/website/content/api-docs/secret/pki.mdx index fefea8056dbf4..746837b930f01 100644 --- a/website/content/api-docs/secret/pki.mdx +++ b/website/content/api-docs/secret/pki.mdx @@ -3038,7 +3038,7 @@ the CRL. "disable": "false", "ocsp_disable": "false", "auto_rebuild": "true", - "auto_rebuild_grace_period": "8h" + "auto_rebuild_grace_period": "8h" } ```