diff --git a/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java b/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java index 2ee6befb8e2..dc53be584cf 100644 --- a/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java +++ b/grpclb/src/main/java/io/grpc/grpclb/GrpclbState.java @@ -203,9 +203,20 @@ void handleSubchannelState(Subchannel subchannel, ConnectivityStateInfo newState if (config.getMode() == Mode.ROUND_ROBIN && newState.getState() == IDLE) { subchannel.requestConnection(); } - subchannel.getAttributes().get(STATE_INFO).set(newState); - maybeUseFallbackBackends(); - maybeUpdatePicker(); + AtomicReference stateInfoRef = + subchannel.getAttributes().get(STATE_INFO); + // If all RR servers are unhealthy, it's possible that at least one connection is CONNECTING at + // every moment which causes RR to stay in CONNECTING. It's better to keep the TRANSIENT_FAILURE + // state in that case so that fail-fast RPCs can fail fast. + boolean keepState = + config.getMode() == Mode.ROUND_ROBIN + && stateInfoRef.get().getState() == TRANSIENT_FAILURE + && (newState.getState() == CONNECTING || newState.getState() == IDLE); + if (!keepState) { + stateInfoRef.set(newState); + maybeUseFallbackBackends(); + maybeUpdatePicker(); + } } /** diff --git a/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java b/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java index dbbf9f8dab2..abc90743383 100644 --- a/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java +++ b/grpclb/src/test/java/io/grpc/grpclb/GrpclbLoadBalancerTest.java @@ -1104,6 +1104,49 @@ public void grpclbWorking() { verify(subchannelPool).clear(); } + @Test + public void roundRobinMode_subchannelStayTransientFailureUntilReady() { + InOrder inOrder = inOrder(helper); + List grpclbBalancerList = createResolvedBalancerAddresses(1); + deliverResolvedAddresses(Collections.emptyList(), grpclbBalancerList); + verify(mockLbService).balanceLoad(lbResponseObserverCaptor.capture()); + StreamObserver lbResponseObserver = lbResponseObserverCaptor.getValue(); + + // Simulate receiving LB response + List backends1 = Arrays.asList( + new ServerEntry("127.0.0.1", 2000, "token0001"), + new ServerEntry("127.0.0.1", 2010, "token0002")); + lbResponseObserver.onNext(buildInitialResponse()); + lbResponseObserver.onNext(buildLbResponse(backends1)); + assertEquals(2, mockSubchannels.size()); + Subchannel subchannel1 = mockSubchannels.poll(); + Subchannel subchannel2 = mockSubchannels.poll(); + + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING)); + deliverSubchannelState(subchannel2, ConnectivityStateInfo.forNonError(CONNECTING)); + inOrder.verify(helper).updateBalancingState(eq(CONNECTING), any(SubchannelPicker.class)); + + // Switch subchannel1 to TRANSIENT_FAILURE, making the general state TRANSIENT_FAILURE too. + Status error = Status.UNAVAILABLE.withDescription("error1"); + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forTransientFailure(error)); + inOrder.verify(helper).updateBalancingState(eq(TRANSIENT_FAILURE), pickerCaptor.capture()); + assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList) + .containsExactly(new ErrorEntry(error)); + + // Switch subchannel1 to IDLE, then to CONNECTING, which are ignored since the previous + // subchannel state is TRANSIENT_FAILURE. General state is unchanged. + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(IDLE)); + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(CONNECTING)); + inOrder.verifyNoMoreInteractions(); + + // Switch subchannel1 to READY, which will affect the general state + deliverSubchannelState(subchannel1, ConnectivityStateInfo.forNonError(READY)); + inOrder.verify(helper).updateBalancingState(eq(READY), pickerCaptor.capture()); + assertThat(((RoundRobinPicker) pickerCaptor.getValue()).pickList) + .containsExactly(new BackendEntry(subchannel1, getLoadRecorder(), "token0001")); + inOrder.verifyNoMoreInteractions(); + } + @Test public void grpclbFallback_initialTimeout_serverListReceivedBeforeTimerExpires() { subtestGrpclbFallbackInitialTimeout(false);