diff --git a/api/sys_stepdown.go b/api/sys_stepdown.go new file mode 100644 index 0000000000000..421e5f19fb960 --- /dev/null +++ b/api/sys_stepdown.go @@ -0,0 +1,10 @@ +package api + +func (c *Sys) StepDown() error { + r := c.c.NewRequest("PUT", "/v1/sys/step-down") + resp, err := c.c.RawRequest(r) + if err == nil { + defer resp.Body.Close() + } + return err +} diff --git a/cli/commands.go b/cli/commands.go index 05f5c7479567e..1f5b89f911faf 100644 --- a/cli/commands.go +++ b/cli/commands.go @@ -224,6 +224,12 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory { }, nil }, + "step-down": func() (cli.Command, error) { + return &command.StepDownCommand{ + Meta: meta, + }, nil + }, + "mount": func() (cli.Command, error) { return &command.MountCommand{ Meta: meta, diff --git a/command/step-down.go b/command/step-down.go new file mode 100644 index 0000000000000..1f2448e560bc5 --- /dev/null +++ b/command/step-down.go @@ -0,0 +1,54 @@ +package command + +import ( + "fmt" + "strings" +) + +// StepDownCommand is a Command that forces the active Vault node to step down. +type StepDownCommand struct { + Meta +} + +func (c *StepDownCommand) Run(args []string) int { + flags := c.Meta.FlagSet("step-down", FlagSetDefault) + flags.Usage = func() { c.Ui.Error(c.Help()) } + if err := flags.Parse(args); err != nil { + return 1 + } + + client, err := c.Client() + if err != nil { + c.Ui.Error(fmt.Sprintf( + "Error initializing client: %s", err)) + return 2 + } + + if err := client.Sys().StepDown(); err != nil { + c.Ui.Error(fmt.Sprintf("Error stepping down: %s", err)) + return 1 + } + + return 0 +} + +func (c *StepDownCommand) Synopsis() string { + return "Force the Vault node to give up active duty" +} + +func (c *StepDownCommand) Help() string { + helpText := ` +Usage: vault step-down [options] + + Force the Vault node to step down from active duty. + + This causes the indicated node to give up active status. 
Note that while the + affected node will have a short delay before attempting to grab the lock + again, if no other node grabs the lock beforehand, it is possible for the + same node to re-grab the lock and become active again. + +General Options: + + ` + generalOptionsUsage() + return strings.TrimSpace(helpText) +} diff --git a/http/handler.go b/http/handler.go index bd2f2dafc7cf2..5508a9539a403 100644 --- a/http/handler.go +++ b/http/handler.go @@ -23,6 +23,7 @@ func Handler(core *vault.Core) http.Handler { mux.Handle("/v1/sys/init", handleSysInit(core)) mux.Handle("/v1/sys/seal-status", handleSysSealStatus(core)) mux.Handle("/v1/sys/seal", handleSysSeal(core)) + mux.Handle("/v1/sys/step-down", handleSysStepDown(core)) mux.Handle("/v1/sys/unseal", handleSysUnseal(core)) mux.Handle("/v1/sys/mounts", proxySysRequest(core)) mux.Handle("/v1/sys/mounts/", proxySysRequest(core)) diff --git a/http/sys_seal.go b/http/sys_seal.go index d5ac76624f490..a11a2078b779f 100644 --- a/http/sys_seal.go +++ b/http/sys_seal.go @@ -34,6 +34,29 @@ func handleSysSeal(core *vault.Core) http.Handler { }) } +func handleSysStepDown(core *vault.Core) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.Method { + case "PUT": + case "POST": + default: + respondError(w, http.StatusMethodNotAllowed, nil) + return + } + + // Get the auth for the request so we can access the token directly + req := requestAuth(r, &logical.Request{}) + + // Step down with the token above + if err := core.StepDown(req.ClientToken); err != nil { + respondError(w, http.StatusInternalServerError, err) + return + } + + respondOk(w, nil) + }) +} + func handleSysUnseal(core *vault.Core) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.Method { diff --git a/http/sys_seal_test.go b/http/sys_seal_test.go index 4b3008276040c..e1cca89a6ff65 100644 --- a/http/sys_seal_test.go +++ b/http/sys_seal_test.go @@ -304,3 +304,13 @@ func 
TestSysSeal_Permissions(t *testing.T) { httpResp = testHttpPut(t, "child", addr+"/v1/sys/seal", nil) testResponseStatus(t, httpResp, 204) } + +func TestSysStepDown(t *testing.T) { + core, _, token := vault.TestCoreUnsealed(t) + ln, addr := TestServer(t, core) + defer ln.Close() + TestServerAuth(t, addr, token) + + resp := testHttpPut(t, token, addr+"/v1/sys/step-down", nil) + testResponseStatus(t, resp, 204) +} diff --git a/vault/core.go b/vault/core.go index 75df726016f2c..ff2a79339e488 100644 --- a/vault/core.go +++ b/vault/core.go @@ -64,6 +64,10 @@ const ( // leaderPrefixCleanDelay is how long to wait between deletions // of orphaned leader keys, to prevent slamming the backend. leaderPrefixCleanDelay = 200 * time.Millisecond + + // manualStepDownSleepPeriod is how long to sleep after a user-initiated + // step down of the active node, to prevent instantly regrabbing the lock + manualStepDownSleepPeriod = 10 * time.Second ) var ( @@ -206,9 +210,10 @@ type Core struct { stateLock sync.RWMutex sealed bool - standby bool - standbyDoneCh chan struct{} - standbyStopCh chan struct{} + standby bool + standbyDoneCh chan struct{} + standbyStopCh chan struct{} + manualStepDownCh chan struct{} // unlockParts has the keys provided to Unseal until // the threshold number of parts is available. @@ -1149,7 +1154,8 @@ func (c *Core) Unseal(key []byte) (bool, error) { // Go to standby mode, wait until we are active to unseal c.standbyDoneCh = make(chan struct{}) c.standbyStopCh = make(chan struct{}) - go c.runStandby(c.standbyDoneCh, c.standbyStopCh) + c.manualStepDownCh = make(chan struct{}) + go c.runStandby(c.standbyDoneCh, c.standbyStopCh, c.manualStepDownCh) } // Success! @@ -1161,6 +1167,7 @@ func (c *Core) Unseal(key []byte) (bool, error) { // be unsealed again to perform any further operations. 
func (c *Core) Seal(token string) (retErr error) { defer metrics.MeasureSince([]string{"core", "seal"}, time.Now()) + c.stateLock.Lock() defer c.stateLock.Unlock() if c.sealed { @@ -1173,15 +1180,8 @@ func (c *Core) Seal(token string) (retErr error) { Path: "sys/seal", ClientToken: token, } - acl, te, err := c.fetchACLandTokenEntry(req) - // Attempt to use the token (decrement num_uses) - if te != nil { - if err := c.tokenStore.UseToken(te); err != nil { - c.logger.Printf("[ERR] core: failed to use token: %v", err) - retErr = ErrInternalError - } - } + acl, te, err := c.fetchACLandTokenEntry(req) if err != nil { // Since there is no token store in standby nodes, sealing cannot // be done. Ideally, the request has to be forwarded to leader node @@ -1189,11 +1189,20 @@ func (c *Core) Seal(token string) (retErr error) { // just returning with an error and recommending a vault restart, which // essentially does the same thing. if c.standby { - c.logger.Printf("[ERR] core: vault cannot be sealed when in standby mode; please restart instead") - return errors.New("vault cannot be sealed when in standby mode; please restart instead") + c.logger.Printf("[ERR] core: vault cannot seal when in standby mode; please restart instead") + return errors.New("vault cannot seal when in standby mode; please restart instead") } return err } + // Attempt to use the token (decrement num_uses) + // If we can't, we still continue attempting the seal, so long as the token + // has appropriate permissions + if te != nil { + if err := c.tokenStore.UseToken(te); err != nil { + c.logger.Printf("[ERR] core: failed to use token: %v", err) + retErr = ErrInternalError + } + } // Verify that this operation is allowed allowed, rootPrivs := acl.AllowOperation(req.Operation, req.Path) @@ -1206,7 +1215,7 @@ func (c *Core) Seal(token string) (retErr error) { return logical.ErrPermissionDenied } - // Seal the Vault + //Seal the Vault err = c.sealInternal() if err == nil && retErr == ErrInternalError { 
c.logger.Printf("[ERR] core: core is successfully sealed but another error occurred during the operation") @@ -1217,9 +1226,60 @@ func (c *Core) Seal(token string) (retErr error) { return } -// sealInternal is an internal method used to seal the vault. -// It does not do any authorization checking. The stateLock must -// be held prior to calling. +// StepDown asks an active HA node to give up leadership; it is a no-op when sealed, non-HA or standby. +func (c *Core) StepDown(token string) error { + defer metrics.MeasureSince([]string{"core", "step_down"}, time.Now()) + + c.stateLock.Lock() + defer c.stateLock.Unlock() + if c.sealed { + return nil + } + if c.ha == nil || c.standby { + return nil + } + + // Build the request used to authorize the token; root privileges are required below + req := &logical.Request{ + Operation: logical.UpdateOperation, + Path: "sys/step-down", + ClientToken: token, + } + + acl, te, err := c.fetchACLandTokenEntry(req) + if err != nil { + return err + } + // Attempt to use the token (decrement num_uses); on failure log the cause and return the generic ErrInternalError, as Seal does + if te != nil { + if err := c.tokenStore.UseToken(te); err != nil { + c.logger.Printf("[ERR] core: failed to use token: %v", err) + return ErrInternalError + } + } + + // Verify that this operation is allowed + allowed, rootPrivs := acl.AllowOperation(req.Operation, req.Path) + if !allowed { + return logical.ErrPermissionDenied + } + + // We always require root privileges for this operation + if !rootPrivs { + return logical.ErrPermissionDenied + } + // Non-blocking send: if runStandby is not ready to receive, only a warning is logged + select { + case c.manualStepDownCh <- struct{}{}: + default: + c.logger.Printf("[WARN] core: manual step-down operation already queued") + } + + return nil +} + +// sealInternal is an internal method used to seal the vault. It does not do +// any authorization checking. The stateLock must be held prior to calling. 
func (c *Core) sealInternal() error { // Enable that we are sealed to prevent furthur transactions c.sealed = true @@ -1244,6 +1304,7 @@ func (c *Core) sealInternal() error { return err } c.logger.Printf("[INFO] core: vault is sealed") + return nil } @@ -1353,8 +1414,9 @@ func (c *Core) preSeal() error { // runStandby is a long running routine that is used when an HA backend // is enabled. It waits until we are leader and switches this Vault to // active. -func (c *Core) runStandby(doneCh, stopCh chan struct{}) { +func (c *Core) runStandby(doneCh, stopCh, manualStepDownCh chan struct{}) { defer close(doneCh) + defer close(manualStepDownCh) c.logger.Printf("[INFO] core: entering standby mode") // Monitor for key rotation @@ -1418,11 +1480,15 @@ func (c *Core) runStandby(doneCh, stopCh chan struct{}) { } // Monitor a loss of leadership + var manualStepDown bool select { case <-leaderLostCh: c.logger.Printf("[WARN] core: leadership lost, stopping active operation") case <-stopCh: c.logger.Printf("[WARN] core: stopping active operation") + case <-manualStepDownCh: + c.logger.Printf("[WARN] core: stepping down from active operation to standby") + manualStepDown = true } // Clear ourself as leader @@ -1443,6 +1509,12 @@ func (c *Core) runStandby(doneCh, stopCh chan struct{}) { if preSealErr != nil { c.logger.Printf("[ERR] core: pre-seal teardown failed: %v", err) } + + // If we've merely stepped down, we could instantly grab the lock + // again. Give the other nodes a chance. 
+ if manualStepDown { + time.Sleep(manualStepDownSleepPeriod) + } } } diff --git a/vault/core_test.go b/vault/core_test.go index 1f9f80bd0b908..ec545857368e3 100644 --- a/vault/core_test.go +++ b/vault/core_test.go @@ -1106,9 +1106,6 @@ func TestCore_Standby_Seal(t *testing.T) { // Wait for core to become active testWaitActive(t, core) - // Ensure that the original clean function has stopped running - time.Sleep(2 * time.Second) - // Check the leader is local isLeader, advertise, err := core.Leader() if err != nil { @@ -1183,6 +1180,180 @@ func TestCore_Standby_Seal(t *testing.T) { } } +func TestCore_StepDown(t *testing.T) { + // Create the first core and initialize it + inm := physical.NewInmem() + inmha := physical.NewInmemHA() + advertiseOriginal := "http://127.0.0.1:8200" + core, err := NewCore(&CoreConfig{ + Physical: inm, + HAPhysical: inmha, + AdvertiseAddr: advertiseOriginal, + DisableMlock: true, + }) + if err != nil { + t.Fatalf("err: %v", err) + } + key, root := TestCoreInit(t, core) + if _, err := core.Unseal(TestKeyCopy(key)); err != nil { + t.Fatalf("unseal err: %s", err) + } + + // Verify unsealed + sealed, err := core.Sealed() + if err != nil { + t.Fatalf("err checking seal status: %s", err) + } + if sealed { + t.Fatal("should not be sealed") + } + + // Wait for core to become active + testWaitActive(t, core) + + // Check the leader is local + isLeader, advertise, err := core.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if !isLeader { + t.Fatalf("should be leader") + } + if advertise != advertiseOriginal { + t.Fatalf("Bad advertise: %v", advertise) + } + + // Create the second core and initialize it + advertiseOriginal2 := "http://127.0.0.1:8500" + core2, err := NewCore(&CoreConfig{ + Physical: inm, + HAPhysical: inmha, + AdvertiseAddr: advertiseOriginal2, + DisableMlock: true, + }) + if err != nil { + t.Fatalf("err: %v", err) + } + if _, err := core2.Unseal(TestKeyCopy(key)); err != nil { + t.Fatalf("unseal err: %s", err) + } + + // 
Verify unsealed + sealed, err = core2.Sealed() + if err != nil { + t.Fatalf("err checking seal status: %s", err) + } + if sealed { + t.Fatal("should not be sealed") + } + + // Core2 should be in standby + standby, err := core2.Standby() + if err != nil { + t.Fatalf("err: %v", err) + } + if !standby { + t.Fatalf("should be standby") + } + + // Check the leader is not local + isLeader, advertise, err = core2.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if isLeader { + t.Fatalf("should not be leader") + } + if advertise != advertiseOriginal { + t.Fatalf("Bad advertise: %v", advertise) + } + + // Step down core + err = core.StepDown(root) + if err != nil { + t.Fatalf("error stepping down core 1: %v", err) + } + + // Give time to switch leaders + time.Sleep(2 * time.Second) + + // Core1 should be in standby + standby, err = core.Standby() + if err != nil { + t.Fatalf("err: %v", err) + } + if !standby { + t.Fatalf("should be standby") + } + + // Check the leader is core2 + isLeader, advertise, err = core2.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if !isLeader { + t.Fatalf("should be leader") + } + if advertise != advertiseOriginal2 { + t.Fatalf("Bad advertise: %v", advertise) + } + + // Check the leader is not local + isLeader, advertise, err = core.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if isLeader { + t.Fatalf("should not be leader") + } + if advertise != advertiseOriginal2 { + t.Fatalf("Bad advertise: %v", advertise) + } + + // Step down core2 + err = core2.StepDown(root) + if err != nil { + t.Fatalf("error stepping down core 2: %v", err) + } + + // Give time to switch leaders -- core 1 will still be waiting on its + // cooling off period so give it a full 10 seconds to recover + time.Sleep(10 * time.Second) + + // Core2 should be in standby + standby, err = core2.Standby() + if err != nil { + t.Fatalf("err: %v", err) + } + if !standby { + t.Fatalf("should be standby") + } + + // Check the leader is core1 + isLeader, advertise, err = 
core.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if !isLeader { + t.Fatalf("should be leader") + } + if advertise != advertiseOriginal { + t.Fatalf("Bad advertise: %v", advertise) + } + + // Check the leader is not local + isLeader, advertise, err = core2.Leader() + if err != nil { + t.Fatalf("err: %v", err) + } + if isLeader { + t.Fatalf("should not be leader") + } + if advertise != advertiseOriginal { + t.Fatalf("Bad advertise: %v", advertise) + } +} + func TestCore_CleanLeaderPrefix(t *testing.T) { // Create the first core and initialize it inm := physical.NewInmem() diff --git a/website/source/docs/http/sys-seal.html.md b/website/source/docs/http/sys-seal.html.md index 55d5a81a9ccce..d82b9af38671b 100644 --- a/website/source/docs/http/sys-seal.html.md +++ b/website/source/docs/http/sys-seal.html.md @@ -11,7 +11,9 @@ description: |-
Description
- Seals the Vault. In HA mode, only an active node can be sealed. Standby nodes should be restarted to get the same effect. + Seals the Vault. In HA mode, only an active node can be sealed. Standby + nodes should be restarted to get the same effect. Requires a token with + `root` policy or `sudo` capability on the path.
Method
diff --git a/website/source/docs/http/sys-step-down.html.md b/website/source/docs/http/sys-step-down.html.md new file mode 100644 index 0000000000000..ee6b1d82fd04f --- /dev/null +++ b/website/source/docs/http/sys-step-down.html.md @@ -0,0 +1,33 @@ +--- +layout: "http" +page_title: "HTTP API: /sys/step-down" +sidebar_current: "docs-http-ha-step-down" +description: |- + The '/sys/step-down' endpoint causes the node to give up active status. +--- + +# /sys/step-down + +
+
Description
+
+ Forces the node to give up active status. If the node does not have active + status, this endpoint does nothing. Note that the node will sleep for ten + seconds before attempting to grab the active lock again, but if no standby + nodes grab the active lock in the interim, the same node may become the + active node again. Requires a token with `root` policy or `sudo` capability + on the path. +
+ +
Method
+
PUT
+ +
Parameters
+
+ None +
+ +
Returns
+
A `204` response code. +
+
diff --git a/website/source/layouts/http.erb b/website/source/layouts/http.erb index ac69d5651ed9f..ee75b37f9cbc8 100644 --- a/website/source/layouts/http.erb +++ b/website/source/layouts/http.erb @@ -107,6 +107,9 @@ > /sys/leader + > + /sys/step-down +