Skip to content

Commit

Permalink
grpclb: keep RR Subchannel state in TRANSIENT_FAILURE until becoming …
Browse files Browse the repository at this point in the history
…READY (#7816)

If all RR servers are unhealthy, it's possible that at least one
connection is CONNECTING at every moment which causes RR to stay in
CONNECTING. It's better to keep the TRANSIENT_FAILURE state in that
case so that fail-fast RPCs can fail fast.

The same changes have been made for RoundRobinLoadBalancer in #6657
  • Loading branch information
zhangkun83 committed Jan 15, 2021
1 parent 9016cf5 commit 23d2796
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 3 deletions.
17 changes: 14 additions & 3 deletions grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java
Expand Up @@ -203,9 +203,20 @@ void handleSubchannelState(Subchannel subchannel, ConnectivityStateInfo newState
if (config.getMode() == Mode.ROUND_ROBIN && newState.getState() == IDLE) {
subchannel.requestConnection();
}
subchannel.getAttributes().get(STATE_INFO).set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
AtomicReference<ConnectivityStateInfo> stateInfoRef =
subchannel.getAttributes().get(STATE_INFO);
// If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at
// every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE
// state in that case so that fail-fast RPCs can fail fast.
boolean keepState =
config.getMode() == Mode.ROUND_ROBIN
&& stateInfoRef.get().getState() == TRANSIENT_FAILURE
&& (newState.getState() == CONNECTING || newState.getState() == IDLE);
if (!keepState) {
stateInfoRef.set(newState);
maybeUseFallbackBackends();
maybeUpdatePicker();
}
}

/**
Expand Down
43 changes: 43 additions & 0 deletions grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java
Expand Up @@ -1104,6 +1104,49 @@ public void grpclbWorking() {
verify(subchannelPool).clear();
}

@Test
public void roundRobinMode_subchannelStayTransientFailureUntilReady() {
InOrder inOrder = inOrder(helper);
List<EquivalentAddressGroup> grpclbBalancerList = createResolvedBalancerAddresses(1);
deliverResolvedAddresses(Collections.<EquivalentAddressGroup>emptyList(), grpclbBalancerList);
verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture());
StreamObserver<LoadBalanceResponse> lbResponseObserver = lbResponseObserverCaptor.getValue();

// Simulate receiving LB response
List<ServerEntry> backends1 = Arrays.asList(
new ServerEntry("127.0.0.1", 2000, "token0001"),
new ServerEntry("127.0.0.1", 2010, "token0002"));
lbResponseObserver.onNext(buildInitialResponse());
lbResponseObserver.onNext(buildLbResponse(backends1));
assertEquals(2, mockSubchannels.size());
Subchannel subchannel1 = mockSubchannels.poll();
Subchannel subchannel2 = mockSubchannels.poll();

deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class));

// Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too.
Status error = Status.UNAVAILABLE.withDescription("error1");
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error));
inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new ErrorEntry(error));

// Switch subchannel1 to IDLE, then to CONNECTING, which are ignored since the previous
// subchannel state is TRANSIENT_FAILURE. General state is unchanged.
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(IDLE));
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING));
inOrder.verifyNoMoreInteractions();

// Switch subchannel1 to READY, which will affect the general state
deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(READY));
inOrder.verify(helper).updateBalancingState(eq(READY), pickerCaptor.capture());
assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList)
.containsExactly(new BackendEntry(subchannel1, getLoadRecorder(), "token0001"));
inOrder.verifyNoMoreInteractions();
}

@Test
public void grpclbFallback_initialTimeout_serverListReceivedBeforeTimerExpires() {
subtestGrpclbFallbackInitialTimeout(false);
Expand Down

0 comments on commit 23d2796

Please sign in to comment.