forked from grpc/grpc-java
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RingHashLoadBalancer.java
516 lines (478 loc) · 21.1 KB
/
RingHashLoadBalancer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
/*
* Copyright 2021 The gRPC Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.grpc.xds;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static io.grpc.ConnectivityState.CONNECTING;
import static io.grpc.ConnectivityState.IDLE;
import static io.grpc.ConnectivityState.READY;
import static io.grpc.ConnectivityState.SHUTDOWN;
import static io.grpc.ConnectivityState.TRANSIENT_FAILURE;
import com.google.common.base.MoreObjects;
import com.google.common.collect.Sets;
import io.grpc.Attributes;
import io.grpc.ConnectivityState;
import io.grpc.ConnectivityStateInfo;
import io.grpc.EquivalentAddressGroup;
import io.grpc.InternalLogId;
import io.grpc.LoadBalancer;
import io.grpc.Status;
import io.grpc.SynchronizationContext;
import io.grpc.xds.XdsLogger.XdsLogLevel;
import io.grpc.xds.XdsSubchannelPickers.ErrorPicker;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
/**
* A {@link LoadBalancer} that provides consistent hashing based load balancing to upstream hosts.
* It implements the "Ketama" hashing that maps hosts onto a circle (the "ring") by hashing its
* addresses. Each request is routed to a host by hashing some property of the request and finding
* the nearest corresponding host clockwise around the ring. Each host is placed on the ring some
* number of times proportional to its weight. With the ring partitioned appropriately, the
* addition or removal of one host from a set of N hosts will affect only 1/N requests.
*/
final class RingHashLoadBalancer extends LoadBalancer {
private static final Attributes.Key<Ref<ConnectivityStateInfo>> STATE_INFO =
Attributes.Key.create("state-info");
private static final Status RPC_HASH_NOT_FOUND =
Status.INTERNAL.withDescription("RPC hash not found. Probably a bug because xds resolver"
+ " config selector always generates a hash.");
private static final XxHash64 hashFunc = XxHash64.INSTANCE;
private final XdsLogger logger;
private final SynchronizationContext syncContext;
private final Map<EquivalentAddressGroup, Subchannel> subchannels = new HashMap<>();
private final Helper helper;
private List<RingEntry> ring;
private ConnectivityState currentState;
private Iterator<Subchannel> connectionAttemptIterator = subchannels.values().iterator();
private final Random random = new Random();
RingHashLoadBalancer(Helper helper) {
this.helper = checkNotNull(helper, "helper");
syncContext = checkNotNull(helper.getSynchronizationContext(), "syncContext");
logger = XdsLogger.withLogId(InternalLogId.allocate("ring_hash_lb", helper.getAuthority()));
logger.log(XdsLogLevel.INFO, "Created");
}
@Override
public void handleResolvedAddresses(ResolvedAddresses resolvedAddresses) {
logger.log(XdsLogLevel.DEBUG, "Received resolution result: {0}", resolvedAddresses);
List<EquivalentAddressGroup> addrList = resolvedAddresses.getAddresses();
if (addrList.isEmpty()) {
handleNameResolutionError(Status.UNAVAILABLE.withDescription("Ring hash lb error: EDS "
+ "resolution was successful, but returned server addresses are empty."));
return;
}
Map<EquivalentAddressGroup, EquivalentAddressGroup> latestAddrs = stripAttrs(addrList);
Set<EquivalentAddressGroup> removedAddrs =
Sets.newHashSet(Sets.difference(subchannels.keySet(), latestAddrs.keySet()));
RingHashConfig config = (RingHashConfig) resolvedAddresses.getLoadBalancingPolicyConfig();
Map<EquivalentAddressGroup, Long> serverWeights = new HashMap<>();
long totalWeight = 0L;
for (EquivalentAddressGroup eag : addrList) {
Long weight = eag.getAttributes().get(InternalXdsAttributes.ATTR_SERVER_WEIGHT);
// Support two ways of server weighing: either multiple instances of the same address
// or each address contains a per-address weight attribute. If a weight is not provided,
// each occurrence of the address will be counted a weight value of one.
if (weight == null) {
weight = 1L;
}
totalWeight += weight;
EquivalentAddressGroup addrKey = stripAttrs(eag);
if (serverWeights.containsKey(addrKey)) {
serverWeights.put(addrKey, serverWeights.get(addrKey) + weight);
} else {
serverWeights.put(addrKey, weight);
}
Subchannel existingSubchannel = subchannels.get(addrKey);
if (existingSubchannel != null) {
existingSubchannel.updateAddresses(Collections.singletonList(eag));
continue;
}
Attributes attr = Attributes.newBuilder().set(
STATE_INFO, new Ref<>(ConnectivityStateInfo.forNonError(IDLE))).build();
final Subchannel subchannel = helper.createSubchannel(
CreateSubchannelArgs.newBuilder().setAddresses(eag).setAttributes(attr).build());
subchannel.start(new SubchannelStateListener() {
@Override
public void onSubchannelState(ConnectivityStateInfo newState) {
processSubchannelState(subchannel, newState);
}
});
subchannels.put(addrKey, subchannel);
}
long minWeight = Collections.min(serverWeights.values());
double normalizedMinWeight = (double) minWeight / totalWeight;
// Scale up the number of hashes per host such that the least-weighted host gets a whole
// number of hashes on the the ring. Other hosts might not end up with whole numbers, and
// that's fine (the ring-building algorithm can handle this). This preserves the original
// implementation's behavior: when weights aren't provided, all hosts should get an equal
// number of hashes. In the case where this number exceeds the max_ring_size, it's scaled
// back down to fit.
double scale = Math.min(
Math.ceil(normalizedMinWeight * config.minRingSize) / normalizedMinWeight,
(double) config.maxRingSize);
ring = buildRing(serverWeights, totalWeight, scale);
// Shut down subchannels for delisted addresses.
List<Subchannel> removedSubchannels = new ArrayList<>();
for (EquivalentAddressGroup addr : removedAddrs) {
removedSubchannels.add(subchannels.remove(addr));
}
// If we need to proactively start connecting, iterate through all the subchannels, starting
// at a random position.
// Alternatively, we should better start at the same position.
connectionAttemptIterator = subchannels.values().iterator();
int randomAdvance = random.nextInt(subchannels.size());
while (randomAdvance-- > 0) {
connectionAttemptIterator.next();
}
// Update the picker before shutting down the subchannels, to reduce the chance of race
// between picking a subchannel and shutting it down.
updateBalancingState();
for (Subchannel subchann : removedSubchannels) {
shutdownSubchannel(subchann);
}
}
private static List<RingEntry> buildRing(
Map<EquivalentAddressGroup, Long> serverWeights, long totalWeight, double scale) {
List<RingEntry> ring = new ArrayList<>();
double currentHashes = 0.0;
double targetHashes = 0.0;
for (Map.Entry<EquivalentAddressGroup, Long> entry : serverWeights.entrySet()) {
EquivalentAddressGroup addrKey = entry.getKey();
double normalizedWeight = (double) entry.getValue() / totalWeight;
// TODO(chengyuanzhang): is using the list of socket address correct?
StringBuilder sb = new StringBuilder(addrKey.getAddresses().toString());
sb.append('_');
targetHashes += scale * normalizedWeight;
long i = 0L;
while (currentHashes < targetHashes) {
sb.append(i);
long hash = hashFunc.hashAsciiString(sb.toString());
ring.add(new RingEntry(hash, addrKey));
i++;
currentHashes++;
sb.setLength(sb.length() - 1);
}
}
Collections.sort(ring);
return Collections.unmodifiableList(ring);
}
@Override
public void handleNameResolutionError(Status error) {
if (currentState != READY) {
helper.updateBalancingState(TRANSIENT_FAILURE, new ErrorPicker(error));
}
}
@Override
public void shutdown() {
logger.log(XdsLogLevel.INFO, "Shutdown");
for (Subchannel subchannel : subchannels.values()) {
shutdownSubchannel(subchannel);
}
subchannels.clear();
}
/**
* Updates the overall balancing state by aggregating the connectivity states of all subchannels.
*
* <p>Aggregation rules (in order of dominance):
* <ol>
* <li>If there is at least one subchannel in READY state, overall state is READY</li>
* <li>If there are <em>2 or more</em> subchannels in TRANSIENT_FAILURE, overall state is
* TRANSIENT_FAILURE</li>
* <li>If there is at least one subchannel in CONNECTING state, overall state is
* CONNECTING</li>
* <li> If there is one subchannel in TRANSIENT_FAILURE state and there is
* more than one subchannel, report CONNECTING </li>
* <li>If there is at least one subchannel in IDLE state, overall state is IDLE</li>
* <li>Otherwise, overall state is TRANSIENT_FAILURE</li>
* </ol>
*/
private void updateBalancingState() {
checkState(!subchannels.isEmpty(), "no subchannel has been created");
boolean start_connection_attempt = false;
int num_idle_ = 0;
int num_ready_ = 0;
int num_connecting_ = 0;
int num_transient_failure_ = 0;
for (Subchannel subchannel : subchannels.values()) {
ConnectivityState state = getSubchannelStateInfoRef(subchannel).value.getState();
if (state == READY) {
num_ready_++;
break;
} else if (state == TRANSIENT_FAILURE) {
num_transient_failure_++;
} else if (state == CONNECTING ) {
num_connecting_++;
} else if (state == IDLE) {
num_idle_++;
}
}
ConnectivityState overallState;
if (num_ready_ > 0) {
overallState = READY;
} else if (num_transient_failure_ >= 2) {
overallState = TRANSIENT_FAILURE;
start_connection_attempt = true;
} else if (num_connecting_ > 0) {
overallState = CONNECTING;
} else if (num_transient_failure_ == 1 && subchannels.size() > 1) {
overallState = CONNECTING;
start_connection_attempt = true;
} else if (num_idle_ > 0) {
overallState = IDLE;
} else {
overallState = TRANSIENT_FAILURE;
start_connection_attempt = true;
}
RingHashPicker picker = new RingHashPicker(syncContext, ring, subchannels);
// TODO(chengyuanzhang): avoid unnecessary reprocess caused by duplicated server addr updates
helper.updateBalancingState(overallState, picker);
currentState = overallState;
// While the ring_hash policy is reporting TRANSIENT_FAILURE, it will
// not be getting any pick requests from the priority policy.
// However, because the ring_hash policy does not attempt to
// reconnect to subchannels unless it is getting pick requests,
// it will need special handling to ensure that it will eventually
// recover from TRANSIENT_FAILURE state once the problem is resolved.
// Specifically, it will make sure that it is attempting to connect to
// at least one subchannel at any given time. After a given subchannel
// fails a connection attempt, it will move on to the next subchannel
// in the ring. It will keep doing this until one of the subchannels
// successfully connects, at which point it will report READY and stop
// proactively trying to connect. The policy will remain in
// TRANSIENT_FAILURE until at least one subchannel becomes connected,
// even if subchannels are in state CONNECTING during that time.
//
// Note that we do the same thing when the policy is in state
// CONNECTING, just to ensure that we don't remain in CONNECTING state
// indefinitely if there are no new picks coming in.
if (start_connection_attempt) {
if (!connectionAttemptIterator.hasNext()) {
connectionAttemptIterator = subchannels.values().iterator();
}
connectionAttemptIterator.next().requestConnection();
}
}
private void processSubchannelState(Subchannel subchannel, ConnectivityStateInfo stateInfo) {
if (subchannels.get(stripAttrs(subchannel.getAddresses())) != subchannel) {
return;
}
if (stateInfo.getState() == TRANSIENT_FAILURE || stateInfo.getState() == IDLE) {
helper.refreshNameResolution();
}
updateConnectivityState(subchannel, stateInfo);
updateBalancingState();
}
private void updateConnectivityState(Subchannel subchannel, ConnectivityStateInfo stateInfo) {
Ref<ConnectivityStateInfo> subchannelStateRef = getSubchannelStateInfoRef(subchannel);
ConnectivityState previousConnectivityState = subchannelStateRef.value.getState();
// Don't proactively reconnect if the subchannel enters IDLE, even if previously was connected.
// If the subchannel was previously in TRANSIENT_FAILURE, it is considered to stay in
// TRANSIENT_FAILURE until it becomes READY.
if (previousConnectivityState == TRANSIENT_FAILURE) {
if (stateInfo.getState() == CONNECTING || stateInfo.getState() == IDLE) {
return;
}
}
subchannelStateRef.value = stateInfo;
}
private static void shutdownSubchannel(Subchannel subchannel) {
subchannel.shutdown();
getSubchannelStateInfoRef(subchannel).value = ConnectivityStateInfo.forNonError(SHUTDOWN);
}
/**
* Converts list of {@link EquivalentAddressGroup} to {@link EquivalentAddressGroup} set and
* remove all attributes. The values are the original EAGs.
*/
private static Map<EquivalentAddressGroup, EquivalentAddressGroup> stripAttrs(
List<EquivalentAddressGroup> groupList) {
Map<EquivalentAddressGroup, EquivalentAddressGroup> addrs =
new HashMap<>(groupList.size() * 2);
for (EquivalentAddressGroup group : groupList) {
addrs.put(stripAttrs(group), group);
}
return addrs;
}
private static EquivalentAddressGroup stripAttrs(EquivalentAddressGroup eag) {
return new EquivalentAddressGroup(eag.getAddresses());
}
private static Ref<ConnectivityStateInfo> getSubchannelStateInfoRef(
Subchannel subchannel) {
return checkNotNull(subchannel.getAttributes().get(STATE_INFO), "STATE_INFO");
}
private static final class RingHashPicker extends SubchannelPicker {
private final SynchronizationContext syncContext;
private final List<RingEntry> ring;
// Avoid synchronization between pickSubchannel and subchannel's connectivity state change,
// freeze picker's view of subchannel's connectivity state.
// TODO(chengyuanzhang): can be more performance-friendly with
// IdentityHashMap<Subchannel, ConnectivityStateInfo> and RingEntry contains Subchannel.
private final Map<EquivalentAddressGroup, SubchannelView> pickableSubchannels; // read-only
private RingHashPicker(
SynchronizationContext syncContext, List<RingEntry> ring,
Map<EquivalentAddressGroup, Subchannel> subchannels) {
this.syncContext = syncContext;
this.ring = ring;
pickableSubchannels = new HashMap<>(subchannels.size());
for (Map.Entry<EquivalentAddressGroup, Subchannel> entry : subchannels.entrySet()) {
Subchannel subchannel = entry.getValue();
ConnectivityStateInfo stateInfo = subchannel.getAttributes().get(STATE_INFO).value;
pickableSubchannels.put(entry.getKey(), new SubchannelView(subchannel, stateInfo));
}
}
@Override
public PickResult pickSubchannel(PickSubchannelArgs args) {
Long requestHash = args.getCallOptions().getOption(XdsNameResolver.RPC_HASH_KEY);
if (requestHash == null) {
return PickResult.withError(RPC_HASH_NOT_FOUND);
}
// Find the ring entry with hash next to (clockwise) the RPC's hash.
int low = 0;
int high = ring.size();
int mid;
while (true) {
mid = (low + high) / 2;
if (mid == ring.size()) {
mid = 0;
break;
}
long midVal = ring.get(mid).hash;
long midValL = mid == 0 ? 0 : ring.get(mid - 1).hash;
if (requestHash <= midVal && requestHash > midValL) {
break;
}
if (midVal < requestHash) {
low = mid + 1;
} else {
high = mid - 1;
}
if (low > high) {
mid = 0;
break;
}
}
// Try finding a READY subchannel. Starting from the ring entry next to the RPC's hash.
// If the one of the first two subchannels is not in TRANSIENT_FAILURE, return result
// based on that subchannel. Otherwise, fail the pick unless a READY subchannel is found.
// Meanwhile, trigger connection for the first subchannel that is in IDLE if no subchannel
// before it is in CONNECTING or READY.
boolean hasPending = false; // true if having subchannel(s) in CONNECTING or IDLE
boolean canBuffer = true; // true if RPCs can be buffered with a pending subchannel
Subchannel firstSubchannel = null;
Subchannel secondSubchannel = null;
for (int i = 0; i < ring.size(); i++) {
int index = (mid + i) % ring.size();
EquivalentAddressGroup addrKey = ring.get(index).addrKey;
SubchannelView subchannel = pickableSubchannels.get(addrKey);
if (subchannel.stateInfo.getState() == READY) {
return PickResult.withSubchannel(subchannel.subchannel);
}
// RPCs can be buffered if any of the first two subchannels is pending. Otherwise, RPCs
// are failed unless there is a READY connection.
if (firstSubchannel == null) {
firstSubchannel = subchannel.subchannel;
} else if (subchannel.subchannel != firstSubchannel) {
if (secondSubchannel == null) {
secondSubchannel = subchannel.subchannel;
} else if (subchannel.subchannel != secondSubchannel) {
canBuffer = false;
}
}
if (subchannel.stateInfo.getState() == TRANSIENT_FAILURE) {
continue;
}
if (!hasPending) { // first non-failing subchannel
if (subchannel.stateInfo.getState() == IDLE) {
final Subchannel finalSubchannel = subchannel.subchannel;
syncContext.execute(new Runnable() {
@Override
public void run() {
finalSubchannel.requestConnection();
}
});
}
if (canBuffer) { // done if this is the first or second two subchannel
return PickResult.withNoResult(); // queue the pick and re-process later
}
hasPending = true;
}
}
// Fail the pick with error status of the original subchannel hit by hash.
SubchannelView originalSubchannel = pickableSubchannels.get(ring.get(mid).addrKey);
return PickResult.withError(originalSubchannel.stateInfo.getStatus());
}
}
/**
* An unmodifiable view of a subchannel with state not subject to its real connectivity
* state changes.
*/
private static final class SubchannelView {
private final Subchannel subchannel;
private final ConnectivityStateInfo stateInfo;
private SubchannelView(Subchannel subchannel, ConnectivityStateInfo stateInfo) {
this.subchannel = subchannel;
this.stateInfo = stateInfo;
}
}
private static final class RingEntry implements Comparable<RingEntry> {
private final long hash;
private final EquivalentAddressGroup addrKey;
private RingEntry(long hash, EquivalentAddressGroup addrKey) {
this.hash = hash;
this.addrKey = addrKey;
}
@Override
public int compareTo(RingEntry entry) {
return Long.compare(hash, entry.hash);
}
}
/**
* A lighter weight Reference than AtomicReference.
*/
private static final class Ref<T> {
T value;
Ref(T value) {
this.value = value;
}
}
/**
* Configures the ring property. The larger the ring is (that is, the more hashes there are
* for each provided host) the better the request distribution will reflect the desired weights.
*/
static final class RingHashConfig {
final long minRingSize;
final long maxRingSize;
RingHashConfig(long minRingSize, long maxRingSize) {
checkArgument(minRingSize > 0, "minRingSize <= 0");
checkArgument(maxRingSize > 0, "maxRingSize <= 0");
checkArgument(minRingSize <= maxRingSize, "minRingSize > maxRingSize");
this.minRingSize = minRingSize;
this.maxRingSize = maxRingSize;
}
@Override
public String toString() {
return MoreObjects.toStringHelper(this)
.add("minRingSize", minRingSize)
.add("maxRingSize", maxRingSize)
.toString();
}
}
}