Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finish up cluster peering failover #14396

Merged
merged 1 commit into from
Aug 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/14396.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:feature
peering: Add support to failover to services running on cluster peers.
```
34 changes: 23 additions & 11 deletions agent/proxycfg/connect_proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,16 +280,6 @@ func (s *handlerConnectProxy) handleUpdate(ctx context.Context, u UpdateEvent, s
}
snap.Roots = roots

case strings.HasPrefix(u.CorrelationID, peerTrustBundleIDPrefix):
resp, ok := u.Result.(*pbpeering.TrustBundleReadResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
peer := strings.TrimPrefix(u.CorrelationID, peerTrustBundleIDPrefix)
if resp.Bundle != nil {
snap.ConnectProxy.UpstreamPeerTrustBundles.Set(peer, resp.Bundle)
}

case u.CorrelationID == peeringTrustBundlesWatchID:
resp, ok := u.Result.(*pbpeering.TrustBundleListByServiceResponse)
if !ok {
Expand Down Expand Up @@ -369,6 +359,17 @@ func (s *handlerConnectProxy) handleUpdate(ctx context.Context, u UpdateEvent, s
// Clean up data
//

peeredChainTargets := make(map[UpstreamID]struct{})
for _, discoChain := range snap.ConnectProxy.DiscoveryChain {
for _, target := range discoChain.Targets {
if target.Peer == "" {
continue
}
uid := NewUpstreamIDFromTargetID(target.ID)
peeredChainTargets[uid] = struct{}{}
}
}

validPeerNames := make(map[string]struct{})

// Iterate through all known endpoints and remove references to upstream IDs that weren't in the update
Expand All @@ -383,6 +384,11 @@ func (s *handlerConnectProxy) handleUpdate(ctx context.Context, u UpdateEvent, s
validPeerNames[uid.Peer] = struct{}{}
return true
}
// Peered upstream came from a discovery chain target
if _, ok := peeredChainTargets[uid]; ok {
validPeerNames[uid.Peer] = struct{}{}
return true
}
snap.ConnectProxy.PeerUpstreamEndpoints.CancelWatch(uid)
return true
})
Expand Down Expand Up @@ -463,8 +469,14 @@ func (s *handlerConnectProxy) handleUpdate(ctx context.Context, u UpdateEvent, s
continue
}
if _, ok := seenUpstreams[uid]; !ok {
for _, cancelFn := range targets {
for targetID, cancelFn := range targets {
cancelFn()

targetUID := NewUpstreamIDFromTargetID(targetID)
if targetUID.Peer != "" {
snap.ConnectProxy.PeerUpstreamEndpoints.CancelWatch(targetUID)
snap.ConnectProxy.UpstreamPeerTrustBundles.CancelWatch(targetUID.Peer)
}
}
delete(snap.ConnectProxy.WatchedUpstreams, uid)
}
Expand Down
11 changes: 11 additions & 0 deletions agent/proxycfg/ingress_gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ import (
"fmt"

cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/proxycfg/internal/watch"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/proto/pbpeering"
)

type handlerIngressGateway struct {
Expand Down Expand Up @@ -66,6 +68,9 @@ func (s *handlerIngressGateway) initialize(ctx context.Context) (ConfigSnapshot,
snap.IngressGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.IngressGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.Listeners = make(map[IngressListenerKey]structs.IngressListener)
snap.IngressGateway.UpstreamPeerTrustBundles = watch.NewMap[string, *pbpeering.PeeringTrustBundle]()
snap.IngressGateway.PeerUpstreamEndpoints = watch.NewMap[UpstreamID, structs.CheckServiceNodes]()
snap.IngressGateway.PeerUpstreamEndpointsUseHostnames = make(map[UpstreamID]struct{})
return snap, nil
}

Expand Down Expand Up @@ -152,6 +157,12 @@ func (s *handlerIngressGateway) handleUpdate(ctx context.Context, u UpdateEvent,
delete(snap.IngressGateway.WatchedUpstreams[uid], targetID)
delete(snap.IngressGateway.WatchedUpstreamEndpoints[uid], targetID)
cancelUpstreamFn()

targetUID := NewUpstreamIDFromTargetID(targetID)
if targetUID.Peer != "" {
snap.IngressGateway.PeerUpstreamEndpoints.CancelWatch(targetUID)
snap.IngressGateway.UpstreamPeerTrustBundles.CancelWatch(targetUID.Peer)
}
}

cancelFn()
Expand Down
12 changes: 12 additions & 0 deletions agent/proxycfg/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,18 @@ func (s *ConfigSnapshot) MeshConfigTLSOutgoing() *structs.MeshDirectionalTLSConf
return mesh.TLS.Outgoing
}

func (s *ConfigSnapshot) ToConfigSnapshotUpstreams() (*ConfigSnapshotUpstreams, error) {
switch s.Kind {
case structs.ServiceKindConnectProxy:
return &s.ConnectProxy.ConfigSnapshotUpstreams, nil
case structs.ServiceKindIngressGateway:
return &s.IngressGateway.ConfigSnapshotUpstreams, nil
default:
// This is a coherence check and should never fail
return nil, fmt.Errorf("No upstream snapshot for gateway mode %q", s.Kind)
}
}

func (u *ConfigSnapshotUpstreams) UpstreamPeerMeta(uid UpstreamID) structs.PeeringServiceMeta {
nodes, _ := u.PeerUpstreamEndpoints.Get(uid)
if len(nodes) == 0 {
Expand Down
62 changes: 52 additions & 10 deletions agent/proxycfg/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,11 @@ func TestState_WatchesAndUpdates(t *testing.T) {
Mode: structs.MeshGatewayModeNone,
},
},
structs.Upstream{
DestinationType: structs.UpstreamDestTypeService,
DestinationName: "api-failover-to-peer",
LocalBindPort: 10007,
},
structs.Upstream{
DestinationType: structs.UpstreamDestTypeService,
DestinationName: "api-dc2",
Expand Down Expand Up @@ -552,6 +557,16 @@ func TestState_WatchesAndUpdates(t *testing.T) {
Mode: structs.MeshGatewayModeNone,
},
}),
fmt.Sprintf("discovery-chain:%s-failover-to-peer", apiUID.String()): genVerifyDiscoveryChainWatch(&structs.DiscoveryChainRequest{
Name: "api-failover-to-peer",
EvaluateInDatacenter: "dc1",
EvaluateInNamespace: "default",
EvaluateInPartition: "default",
Datacenter: "dc1",
OverrideMeshGateway: structs.MeshGatewayConfig{
Mode: meshGatewayProxyConfigValue,
},
}),
fmt.Sprintf("discovery-chain:%s-dc2", apiUID.String()): genVerifyDiscoveryChainWatch(&structs.DiscoveryChainRequest{
Name: "api-dc2",
EvaluateInDatacenter: "dc1",
Expand Down Expand Up @@ -639,22 +654,45 @@ func TestState_WatchesAndUpdates(t *testing.T) {
},
Err: nil,
},
{
CorrelationID: fmt.Sprintf("discovery-chain:%s-failover-to-peer", apiUID.String()),
Result: &structs.DiscoveryChainResponse{
Chain: discoverychain.TestCompileConfigEntries(t, "api-failover-to-peer", "default", "default", "dc1", "trustdomain.consul",
func(req *discoverychain.CompileRequest) {
req.OverrideMeshGateway.Mode = meshGatewayProxyConfigValue
}, &structs.ServiceResolverConfigEntry{
Kind: structs.ServiceResolver,
Name: "api-failover-to-peer",
Failover: map[string]structs.ServiceResolverFailover{
"*": {
Targets: []structs.ServiceResolverFailoverTarget{
{Peer: "cluster-01"},
},
},
},
}),
},
Err: nil,
},
},
verifySnapshot: func(t testing.TB, snap *ConfigSnapshot) {
require.True(t, snap.Valid())
require.True(t, snap.MeshGateway.isEmpty())
require.Equal(t, indexedRoots, snap.Roots)

require.Equal(t, issuedCert, snap.ConnectProxy.Leaf)
require.Len(t, snap.ConnectProxy.DiscoveryChain, 5, "%+v", snap.ConnectProxy.DiscoveryChain)
require.Len(t, snap.ConnectProxy.WatchedUpstreams, 5, "%+v", snap.ConnectProxy.WatchedUpstreams)
require.Len(t, snap.ConnectProxy.WatchedUpstreamEndpoints, 5, "%+v", snap.ConnectProxy.WatchedUpstreamEndpoints)
require.Len(t, snap.ConnectProxy.WatchedGateways, 5, "%+v", snap.ConnectProxy.WatchedGateways)
require.Len(t, snap.ConnectProxy.WatchedGatewayEndpoints, 5, "%+v", snap.ConnectProxy.WatchedGatewayEndpoints)
require.Len(t, snap.ConnectProxy.DiscoveryChain, 6, "%+v", snap.ConnectProxy.DiscoveryChain)
require.Len(t, snap.ConnectProxy.WatchedUpstreams, 6, "%+v", snap.ConnectProxy.WatchedUpstreams)
require.Len(t, snap.ConnectProxy.WatchedUpstreamEndpoints, 6, "%+v", snap.ConnectProxy.WatchedUpstreamEndpoints)
require.Len(t, snap.ConnectProxy.WatchedGateways, 6, "%+v", snap.ConnectProxy.WatchedGateways)
require.Len(t, snap.ConnectProxy.WatchedGatewayEndpoints, 6, "%+v", snap.ConnectProxy.WatchedGatewayEndpoints)

require.Len(t, snap.ConnectProxy.WatchedServiceChecks, 0, "%+v", snap.ConnectProxy.WatchedServiceChecks)
require.Len(t, snap.ConnectProxy.PreparedQueryEndpoints, 0, "%+v", snap.ConnectProxy.PreparedQueryEndpoints)

require.Equal(t, 1, snap.ConnectProxy.ConfigSnapshotUpstreams.PeerUpstreamEndpoints.Len())
require.Equal(t, 1, snap.ConnectProxy.ConfigSnapshotUpstreams.UpstreamPeerTrustBundles.Len())

require.True(t, snap.ConnectProxy.IntentionsSet)
require.Equal(t, ixnMatch, snap.ConnectProxy.Intentions)
require.True(t, snap.ConnectProxy.MeshConfigSet)
Expand All @@ -667,6 +705,7 @@ func TestState_WatchesAndUpdates(t *testing.T) {
fmt.Sprintf("upstream-target:api-failover-remote.default.default.dc2:%s-failover-remote?dc=dc2", apiUID.String()): genVerifyServiceSpecificRequest("api-failover-remote", "", "dc2", true),
fmt.Sprintf("upstream-target:api-failover-local.default.default.dc2:%s-failover-local?dc=dc2", apiUID.String()): genVerifyServiceSpecificRequest("api-failover-local", "", "dc2", true),
fmt.Sprintf("upstream-target:api-failover-direct.default.default.dc2:%s-failover-direct?dc=dc2", apiUID.String()): genVerifyServiceSpecificRequest("api-failover-direct", "", "dc2", true),
upstreamPeerWatchIDPrefix + fmt.Sprintf("%s-failover-to-peer?peer=cluster-01", apiUID.String()): genVerifyServiceSpecificPeeredRequest("api-failover-to-peer", "", "", "cluster-01", true),
fmt.Sprintf("mesh-gateway:dc2:%s-failover-remote?dc=dc2", apiUID.String()): genVerifyGatewayWatch("dc2"),
fmt.Sprintf("mesh-gateway:dc1:%s-failover-local?dc=dc2", apiUID.String()): genVerifyGatewayWatch("dc1"),
},
Expand All @@ -676,15 +715,18 @@ func TestState_WatchesAndUpdates(t *testing.T) {
require.Equal(t, indexedRoots, snap.Roots)

require.Equal(t, issuedCert, snap.ConnectProxy.Leaf)
require.Len(t, snap.ConnectProxy.DiscoveryChain, 5, "%+v", snap.ConnectProxy.DiscoveryChain)
require.Len(t, snap.ConnectProxy.WatchedUpstreams, 5, "%+v", snap.ConnectProxy.WatchedUpstreams)
require.Len(t, snap.ConnectProxy.WatchedUpstreamEndpoints, 5, "%+v", snap.ConnectProxy.WatchedUpstreamEndpoints)
require.Len(t, snap.ConnectProxy.WatchedGateways, 5, "%+v", snap.ConnectProxy.WatchedGateways)
require.Len(t, snap.ConnectProxy.WatchedGatewayEndpoints, 5, "%+v", snap.ConnectProxy.WatchedGatewayEndpoints)
require.Len(t, snap.ConnectProxy.DiscoveryChain, 6, "%+v", snap.ConnectProxy.DiscoveryChain)
require.Len(t, snap.ConnectProxy.WatchedUpstreams, 6, "%+v", snap.ConnectProxy.WatchedUpstreams)
require.Len(t, snap.ConnectProxy.WatchedUpstreamEndpoints, 6, "%+v", snap.ConnectProxy.WatchedUpstreamEndpoints)
require.Len(t, snap.ConnectProxy.WatchedGateways, 6, "%+v", snap.ConnectProxy.WatchedGateways)
require.Len(t, snap.ConnectProxy.WatchedGatewayEndpoints, 6, "%+v", snap.ConnectProxy.WatchedGatewayEndpoints)

require.Len(t, snap.ConnectProxy.WatchedServiceChecks, 0, "%+v", snap.ConnectProxy.WatchedServiceChecks)
require.Len(t, snap.ConnectProxy.PreparedQueryEndpoints, 0, "%+v", snap.ConnectProxy.PreparedQueryEndpoints)

require.Equal(t, 1, snap.ConnectProxy.ConfigSnapshotUpstreams.PeerUpstreamEndpoints.Len())
require.Equal(t, 1, snap.ConnectProxy.ConfigSnapshotUpstreams.UpstreamPeerTrustBundles.Len())

require.True(t, snap.ConnectProxy.IntentionsSet)
require.Equal(t, ixnMatch, snap.ConnectProxy.Intentions)
},
Expand Down
25 changes: 25 additions & 0 deletions agent/proxycfg/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,31 @@ func TestUpstreamNodesDC2(t testing.T) structs.CheckServiceNodes {
}
}

func TestUpstreamNodesPeerCluster01(t testing.T) structs.CheckServiceNodes {
peer := "cluster-01"
service := structs.TestNodeServiceWithNameInPeer(t, "web", peer)
return structs.CheckServiceNodes{
structs.CheckServiceNode{
Node: &structs.Node{
ID: "test1",
Node: "test1",
Address: "10.40.1.1",
PeerName: peer,
},
Service: service,
},
structs.CheckServiceNode{
Node: &structs.Node{
ID: "test2",
Node: "test2",
Address: "10.40.1.2",
PeerName: peer,
},
Service: service,
},
}
}

func TestUpstreamNodesInStatusDC2(t testing.T, status string) structs.CheckServiceNodes {
return structs.CheckServiceNodes{
structs.CheckServiceNode{
Expand Down
34 changes: 34 additions & 0 deletions agent/proxycfg/testing_upstreams.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul/discoverychain"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/proto/pbpeering"
)

func setupTestVariationConfigEntriesAndSnapshot(
Expand Down Expand Up @@ -72,6 +73,24 @@ func setupTestVariationConfigEntriesAndSnapshot(
Nodes: TestGatewayNodesDC2(t),
},
})
case "failover-to-cluster-peer":
events = append(events, UpdateEvent{
CorrelationID: "peer-trust-bundle:cluster-01",
Result: &pbpeering.TrustBundleReadResponse{
Bundle: &pbpeering.PeeringTrustBundle{
PeerName: "peer1",
TrustDomain: "peer1.domain",
ExportedPartition: "peer1ap",
RootPEMs: []string{"peer1-root-1"},
},
},
})
events = append(events, UpdateEvent{
CorrelationID: "upstream-peer:db?peer=cluster-01",
Result: &structs.IndexedCheckServiceNodes{
Nodes: TestUpstreamNodesPeerCluster01(t),
},
})
case "failover-through-double-remote-gateway-triggered":
events = append(events, UpdateEvent{
CorrelationID: "upstream-target:db.default.default.dc1:" + dbUID.String(),
Expand Down Expand Up @@ -255,6 +274,21 @@ func setupTestVariationDiscoveryChain(
},
},
)
case "failover-to-cluster-peer":
entries = append(entries,
&structs.ServiceResolverConfigEntry{
Kind: structs.ServiceResolver,
Name: "db",
ConnectTimeout: 33 * time.Second,
Failover: map[string]structs.ServiceResolverFailover{
"*": {
Targets: []structs.ServiceResolverFailoverTarget{
{Peer: "cluster-01"},
},
},
},
},
)
case "failover-through-double-remote-gateway-triggered":
fallthrough
case "failover-through-double-remote-gateway":
Expand Down