Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide 'sys/step-down' and 'vault step-down' #1146

Merged
merged 6 commits into from Mar 3, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/sys_stepdown.go
@@ -0,0 +1,10 @@
package api

// StepDown sends a PUT request to /v1/sys/step-down and returns any error
// reported by the request. On success the response body is closed before
// returning.
func (c *Sys) StepDown() error {
	req := c.c.NewRequest("PUT", "/v1/sys/step-down")

	resp, err := c.c.RawRequest(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	return nil
}
6 changes: 6 additions & 0 deletions cli/commands.go
Expand Up @@ -224,6 +224,12 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory {
}, nil
},

"step-down": func() (cli.Command, error) {
return &command.StepDownCommand{
Meta: meta,
}, nil
},

"mount": func() (cli.Command, error) {
return &command.MountCommand{
Meta: meta,
Expand Down
54 changes: 54 additions & 0 deletions command/step-down.go
@@ -0,0 +1,54 @@
package command

import (
"fmt"
"strings"
)

// StepDownCommand is a Command that forces the Vault node to step down from
// active duty.
type StepDownCommand struct {
Meta
}

// Run parses the command-line flags, builds an API client and issues the
// step-down request. It returns 0 on success, 1 when flag parsing or the
// step-down request fails, and 2 when the client cannot be initialized.
func (c *StepDownCommand) Run(args []string) int {
	flagSet := c.Meta.FlagSet("step-down", FlagSetDefault)
	flagSet.Usage = func() {
		c.Ui.Error(c.Help())
	}
	if parseErr := flagSet.Parse(args); parseErr != nil {
		return 1
	}

	client, err := c.Client()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
		return 2
	}

	err = client.Sys().StepDown()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error stepping down: %s", err))
		return 1
	}
	return 0
}

// Synopsis returns the one-line description of the step-down command.
func (c *StepDownCommand) Synopsis() string {
	const synopsis = "Force the Vault node to give up active duty"
	return synopsis
}

// Help returns the usage text for the step-down command followed by the
// general options usage, with leading and trailing whitespace trimmed.
func (c *StepDownCommand) Help() string {
	const usage = `
Usage: vault step-down [options]

Force the Vault node to step down from active duty.

This causes the indicated node to give up active status. Note that while the
affected node will have a short delay before attempting to grab the lock
again, if no other node grabs the lock beforehand, it is possible for the
same node to re-grab the lock and become active again.

General Options:

`
	return strings.TrimSpace(usage + generalOptionsUsage())
}
1 change: 1 addition & 0 deletions http/handler.go
Expand Up @@ -23,6 +23,7 @@ func Handler(core *vault.Core) http.Handler {
mux.Handle("/v1/sys/init", handleSysInit(core))
mux.Handle("/v1/sys/seal-status", handleSysSealStatus(core))
mux.Handle("/v1/sys/seal", handleSysSeal(core))
mux.Handle("/v1/sys/step-down", handleSysStepDown(core))
mux.Handle("/v1/sys/unseal", handleSysUnseal(core))
mux.Handle("/v1/sys/mounts", proxySysRequest(core))
mux.Handle("/v1/sys/mounts/", proxySysRequest(core))
Expand Down
23 changes: 23 additions & 0 deletions http/sys_seal.go
Expand Up @@ -34,6 +34,29 @@ func handleSysSeal(core *vault.Core) http.Handler {
})
}

// handleSysStepDown returns the HTTP handler for the step-down endpoint. It
// accepts PUT and POST and answers any other method with 405. For accepted
// methods it passes the request's client token to core.StepDown, responding
// with a 500 if that call returns an error.
func handleSysStepDown(core *vault.Core) http.Handler {
	handler := func(w http.ResponseWriter, r *http.Request) {
		switch r.Method {
		case "PUT", "POST":
			// Allowed methods; continue below the switch.
		default:
			respondError(w, http.StatusMethodNotAllowed, nil)
			return
		}

		// Get the auth for the request so we can access the token directly
		authed := requestAuth(r, &logical.Request{})

		// Step down using the token extracted above
		err := core.StepDown(authed.ClientToken)
		if err != nil {
			respondError(w, http.StatusInternalServerError, err)
			return
		}

		respondOk(w, nil)
	}
	return http.HandlerFunc(handler)
}

func handleSysUnseal(core *vault.Core) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.Method {
Expand Down
10 changes: 10 additions & 0 deletions http/sys_seal_test.go
Expand Up @@ -304,3 +304,13 @@ func TestSysSeal_Permissions(t *testing.T) {
httpResp = testHttpPut(t, "child", addr+"/v1/sys/seal", nil)
testResponseStatus(t, httpResp, 204)
}

// TestSysStepDown checks that a PUT to /v1/sys/step-down against an unsealed
// test core, using the token returned by TestCoreUnsealed, yields a 204.
func TestSysStepDown(t *testing.T) {
	core, _, tok := vault.TestCoreUnsealed(t)
	listener, addr := TestServer(t, core)
	defer listener.Close()
	TestServerAuth(t, addr, tok)

	stepDownURL := addr + "/v1/sys/step-down"
	httpResp := testHttpPut(t, tok, stepDownURL, nil)
	testResponseStatus(t, httpResp, 204)
}
69 changes: 55 additions & 14 deletions vault/core.go
Expand Up @@ -1157,22 +1157,45 @@ func (c *Core) Unseal(key []byte) (bool, error) {
return true, nil
}

// Seal is used to re-seal the Vault. This requires the Vault to
// be unsealed again to perform any further operations.
func (c *Core) Seal(token string) (retErr error) {
defer metrics.MeasureSince([]string{"core", "seal"}, time.Now())
// Seal is used to seal the vault. It delegates to stepDownAndSeal with the
// seal flag set to true, passing the caller's token through unchanged.
func (c *Core) Seal(token string) error {
return c.stepDownAndSeal(token, true)
}

// StepDown is used to step down from leadership without sealing. It
// delegates to stepDownAndSeal with the seal flag set to false, passing the
// caller's token through unchanged.
func (c *Core) StepDown(token string) error {
return c.stepDownAndSeal(token, false)
}

// stepDownAndSeal is used to step down from leadership and, optionally,
// re-seal the Vault. If sealed, this requires the Vault to be unsealed again
// to perform any further operations.
func (c *Core) stepDownAndSeal(token string, seal bool) (retErr error) {
if seal {
defer metrics.MeasureSince([]string{"core", "seal"}, time.Now())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just move this into the appropriate methods above to make it cleaner

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about that; I was attempting to avoid copypasta since much of the logic is the same. But there's enough different that it can go either way.

} else {
defer metrics.MeasureSince([]string{"core", "step_down"}, time.Now())
}

c.stateLock.Lock()
defer c.stateLock.Unlock()
if c.sealed {
return nil
}
if !seal && (c.ha == nil || c.standby) {
return nil
}

// Validate the token is a root token
req := &logical.Request{
Operation: logical.UpdateOperation,
Path: "sys/seal",
ClientToken: token,
}
if seal {
req.Path = "sys/seal"
} else {
req.Path = "sys/step-down"
}
acl, te, err := c.fetchACLandTokenEntry(req)

// Attempt to use the token (decrement num_uses)
Expand All @@ -1189,8 +1212,8 @@ func (c *Core) Seal(token string) (retErr error) {
// just returning with an error and recommending a vault restart, which
// essentially does the same thing.
if c.standby {
c.logger.Printf("[ERR] core: vault cannot be sealed when in standby mode; please restart instead")
return errors.New("vault cannot be sealed when in standby mode; please restart instead")
c.logger.Printf("[ERR] core: vault cannot step down or be sealed when in standby mode; please restart instead")
return errors.New("vault cannot step down or be sealed when in standby mode; please restart instead")
}
return err
}
Expand All @@ -1207,19 +1230,22 @@ func (c *Core) Seal(token string) (retErr error) {
}

// Seal the Vault
err = c.sealInternal()
if err == nil && retErr == ErrInternalError {
c.logger.Printf("[ERR] core: core is successfully sealed but another error occurred during the operation")
if seal {
err = c.sealInternal()
if err == nil && retErr == ErrInternalError {
c.logger.Printf("[ERR] core: core is successfully sealed but another error occurred during the operation")
} else {
retErr = err
}
} else {
retErr = err
c.stepDownInternal()
}

return
}

// sealInternal is an internal method used to seal the vault.
// It does not do any authorization checking. The stateLock must
// be held prior to calling.
// sealInternal is an internal method used to seal the vault. It does not do
// any authorization checking. The stateLock must be held prior to calling.
func (c *Core) sealInternal() error {
// Enable that we are sealed to prevent further transactions
c.sealed = true
Expand All @@ -1244,9 +1270,20 @@ func (c *Core) sealInternal() error {
return err
}
c.logger.Printf("[INFO] core: vault is sealed")

return nil
}

// stepDownInternal is an internal method used to step down from active duty.
// It does not do any authorization checking.
func (c *Core) stepDownInternal() {
// Merely trigger the loop to re-run. This value will cause the
// loop to run through giving up leadership, but without triggering
// the return at the end of the next loop run, since it's not
// closed
c.standbyStopCh <- struct{}{}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is runStandby will check the top of the loop for any value and exit the loop, so depending on timing this can cause the standby mode to suddenly break. I would instead change the channel to take a bool, where the default false value indicates the loop should break, while true indicates a user initiated step down. We can then modify the logic in runStandby to not return on the true value in a way that is race free.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought about this too; it'd have to be pretty unlucky, as in, you'd have to be losing leadership just as you're manually requesting a step down, but it's certainly true that it could happen. I like your approach better though.

}

// postUnseal is invoked after the barrier is unsealed, but before
// allowing any user operations. This allows us to setup any state that
// requires the Vault to be unsealed such as mount tables, logical backends,
Expand Down Expand Up @@ -1443,6 +1480,10 @@ func (c *Core) runStandby(doneCh, stopCh chan struct{}) {
if preSealErr != nil {
c.logger.Printf("[ERR] core: pre-seal teardown failed: %v", err)
}

// If we've merely stepped down, we could instantly grab the lock
// again. Give the other nodes a chance.
time.Sleep(time.Second)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can add logic to only do this on a user-initiated step-down, using the bool channel, to avoid adding a failover delay. Also, let's move the time value into a constant instead of hard-coding a second. (Probably at least 5-15 seconds is reasonable for a user-initiated step-down.)

}
}

Expand Down