Adds exponential backoff to re-spawning new streams for supposedly dead peers #483

Merged
20 commits, merged on May 30, 2022
Changes from 3 commits
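For orientation before the file diffs: the change gives each supposedly-dead-but-still-connected peer an exponentially growing retry delay with jitter, capped at a maximum, and stops respawning the writer entirely once a fixed number of attempts has been exceeded. Below is a minimal, self-contained sketch of that policy, not the PR's code; the constant values and the standalone structure are assumptions for illustration (the real implementation in backoff.go is keyed by peer.ID and guarded by a mutex).

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Illustrative values only; the real constants live in backoff.go.
const (
	minBackoffDelay   = 100 * time.Millisecond
	maxBackoffDelay   = 10 * time.Second
	backoffMultiplier = 2
	maxJitterMs       = 100
	maxAttempts       = 4
)

type history struct {
	duration time.Duration
	attempts int
}

// nextDelay mirrors the shape of updateAndGet: the first respawn goes out
// immediately, the second waits the minimum delay, and later ones grow
// exponentially with jitter, capped at the maximum delay.
func nextDelay(h *history) time.Duration {
	switch {
	case h.attempts == 0:
		h.duration = 0
	case h.duration < minBackoffDelay:
		h.duration = minBackoffDelay
	case h.duration < maxBackoffDelay:
		jitter := time.Duration(rand.Intn(maxJitterMs)) * time.Millisecond
		h.duration = backoffMultiplier*h.duration + jitter
		if h.duration > maxBackoffDelay || h.duration < 0 {
			h.duration = maxBackoffDelay
		}
	}
	h.attempts++
	return h.duration
}

// exceeded mirrors peerExceededBackoffThreshold: once a peer has been retried
// more than maxAttempts times, the caller stops respawning its writer.
func exceeded(h *history) bool {
	return h.attempts > maxAttempts
}

func main() {
	h := &history{}
	for !exceeded(h) {
		fmt.Println("respawn writer after", nextDelay(h))
	}
	fmt.Println("giving up on peer")
}
```

With the assumed constants, the sketch prints five delays (0, 100ms, then roughly 200-300ms, 400-700ms, and 800ms-1.5s) before giving up.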
28 changes: 19 additions & 9 deletions backoff.go
@@ -48,20 +48,23 @@ func newBackoff(ctx context.Context, sizeThreshold int, cleanupInterval time.Dur
return b
}

func (b *backoff) updateAndGet(id peer.ID) (time.Duration, bool) {
func (b *backoff) updateAndGet(id peer.ID) time.Duration {
b.mu.Lock()
defer b.mu.Unlock()

h, ok := b.info[id]
if !ok || time.Since(h.lastTried) > TimeToLive {
switch {
case !ok || time.Since(h.lastTried) > TimeToLive:
// first request goes immediately.
h = &backoffHistory{
duration: time.Duration(0),
attempts: 0,
}
} else if h.duration < MinBackoffDelay {

case h.duration < MinBackoffDelay:
h.duration = MinBackoffDelay
} else if h.duration < MaxBackoffDelay {

case h.duration < MaxBackoffDelay:
jitter := rand.Intn(MaxBackoffJitterCoff)
h.duration = (BackoffMultiplier * h.duration) + time.Duration(jitter)*time.Millisecond
if h.duration > MaxBackoffDelay || h.duration < 0 {
@@ -71,13 +74,20 @@ func (b *backoff) updateAndGet(id peer.ID) (time.Duration, bool) {

h.lastTried = time.Now()
Collaborator:

let's get the time after checking the max attempts; that will avoid the gettimeofday call in that case.

Contributor Author:

Please read my reply to the comment below, as this part has since changed.

h.attempts += 1
if h.attempts > b.maxAttempts {
delete(b.info, id)
return 0, false
}

b.info[id] = h
return h.duration, true
return h.duration
}

func (b *backoff) peerExceededBackoffThreshold(id peer.ID) bool {
b.mu.Lock()
defer b.mu.Unlock()

h, ok := b.info[id]
if !ok {
return false // no record of this peer is still there, hence fine.
}
return h.attempts > b.maxAttempts
}

func (b *backoff) cleanup() {
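The backoff.go hunks above reference several identifiers whose definitions fall outside the visible diff (backoffHistory, the backoff struct's maxAttempts field, and the package-level delay constants). As a reading aid, here is a sketch of what those declarations plausibly look like; the field layout, import path, and especially the constant values are assumptions, not taken from this diff.

```go
package pubsub

import (
	"sync"
	"time"

	"github.com/libp2p/go-libp2p-core/peer"
)

// Assumed values; the authoritative definitions live elsewhere in backoff.go.
const (
	MinBackoffDelay      = 100 * time.Millisecond
	MaxBackoffDelay      = 10 * time.Second
	TimeToLive           = 10 * time.Minute
	BackoffMultiplier    = 2
	MaxBackoffJitterCoff = 100 // jitter upper bound, in milliseconds
)

type backoffHistory struct {
	duration  time.Duration
	lastTried time.Time
	attempts  int
}

type backoff struct {
	mu          sync.Mutex
	info        map[peer.ID]*backoffHistory
	maxAttempts int
	// cleanup bookkeeping (size threshold, interval) omitted here
}
```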
53 changes: 23 additions & 30 deletions backoff_test.go
@@ -27,19 +27,15 @@ func TestBackoff_Update(t *testing.T) {
t.Fatal("non-empty info map for backoff")
}

if d, valid := b.updateAndGet(id1); d != time.Duration(0) || !valid {
if d := b.updateAndGet(id1); d != time.Duration(0) {
t.Fatalf("invalid initialization: %v", d)
}
if d, valid := b.updateAndGet(id2); d != time.Duration(0) || !valid {
if d := b.updateAndGet(id2); d != time.Duration(0) {
t.Fatalf("invalid initialization: %v", d)
}

for i := 0; i < maxBackoffAttempts-1; i++ {
got, valid := b.updateAndGet(id1)

if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(id1)

expected := time.Duration(math.Pow(BackoffMultiplier, float64(i)) *
float64(MinBackoffDelay+MaxBackoffJitterCoff*time.Millisecond))
@@ -50,40 +46,40 @@
if expected < got { // considering jitter, expected backoff must always be greater than or equal to actual.
t.Fatalf("invalid backoff result, expected: %v, got: %v", expected, got)
}
}

if len(b.info) != 2 {
t.Fatalf("pre-invalidation attempt, info map size mismatch, expected: %d, got: %d", 2, len(b.info))
}

// trying once more beyond the threshold, hence expecting an invalidation
if _, valid := b.updateAndGet(id1); valid {
t.Fatal("update beyond max attempts did not invalidate")
// update attempts on id1 are below threshold, hence peer should never go beyond backoff attempt threshold
if b.peerExceededBackoffThreshold(id1) {
t.Fatalf("invalid exceeding threshold status")
}
}

// invalidated entry must be removed
if len(b.info) != 1 {
t.Fatalf("post-invalidation attempt, info map size mismatch, expected: %d, got: %d", 1, len(b.info))
// trying once more beyond the threshold, hence expecting exceeding threshold
b.updateAndGet(id1)
if !b.peerExceededBackoffThreshold(id1) {
t.Fatal("update beyond max attempts does not reflect threshold")
}

got, valid := b.updateAndGet(id2)
if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(id2)
if got != MinBackoffDelay {
t.Fatalf("invalid backoff result, expected: %v, got: %v", MinBackoffDelay, got)
}

// sets last tried of id2 to long ago that it resets back upon next try.
b.info[id2].lastTried = time.Now().Add(-TimeToLive)
got, valid = b.updateAndGet(id2)
if !valid {
t.Fatalf("update attempt invalidated")
}
got = b.updateAndGet(id2)
if got != time.Duration(0) {
t.Fatalf("invalid ttl expiration, expected: %v, got: %v", time.Duration(0), got)
}

// update attempts on id2 are below threshold, hence peer should never go beyond backoff attempt threshold
if b.peerExceededBackoffThreshold(id2) {
t.Fatalf("invalid exceeding threshold status")
}

if len(b.info) != 2 {
t.Fatalf("pre-invalidation attempt, info map size mismatch, expected: %d, got: %d", 2, len(b.info))
}

}

func TestBackoff_Clean(t *testing.T) {
@@ -109,10 +105,7 @@ func TestBackoff_Clean(t *testing.T) {
time.Sleep(2 * cleanupInterval)

// next update should trigger cleanup
got, valid := b.updateAndGet(peer.ID("some-new-peer"))
if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(peer.ID("some-new-peer"))
if got != time.Duration(0) {
t.Fatalf("invalid backoff result, expected: %v, got: %v", time.Duration(0), got)
}
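The loop in TestBackoff_Update above compares each observed delay against expected = BackoffMultiplier^i * (MinBackoffDelay + MaxBackoffJitterCoff*time.Millisecond), a deliberately loose upper bound: with the assumed multiplier of 2, each backoff step at most doubles the previous delay and adds strictly less than MaxBackoffJitterCoff milliseconds of jitter, so the bound always stays ahead of the worst case. A standalone sketch of that arithmetic with assumed constant values (100 ms minimum delay, multiplier 2, jitter coefficient 100):

```go
package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	// Assumed values, for illustration only.
	minDelay := 100 * time.Millisecond
	jitterCap := 100 * time.Millisecond
	multiplier := 2.0

	worst := minDelay // worst-case delay for the first backed-off attempt
	for i := 0; i < 4; i++ {
		bound := time.Duration(math.Pow(multiplier, float64(i)) * float64(minDelay+jitterCap))
		fmt.Printf("i=%d  bound=%v  worst-case actual=%v\n", i, bound, worst)
		// next worst case: double the delay and add the jitter cap
		// (real jitter is strictly below the cap)
		worst = 2*worst + jitterCap
	}
	// i=0: bound 200ms, actual <= 100ms
	// i=1: bound 400ms, actual <  300ms
	// i=2: bound 800ms, actual <  700ms
	// i=3: bound 1.6s,  actual <  1.5s
}
```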
12 changes: 3 additions & 9 deletions comm.go
@@ -3,7 +3,6 @@ package pubsub
import (
"bufio"
"context"
"fmt"
"io"
"time"

@@ -123,20 +122,15 @@ func (p *PubSub) handleNewPeer(ctx context.Context, pid peer.ID, outgoing <-chan
}
}

func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) error {
delay, valid := p.deadPeerBackoff.updateAndGet(pid)
if !valid {
return fmt.Errorf("backoff attempts to %s expired after reaching maximum allowed", pid)
}
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) {
delay := p.deadPeerBackoff.updateAndGet(pid)
Collaborator:

I think we need to add a failure mode where, if we have backed off too much, we simply give up; say we try up to 10 times, then updateAndGet returns an error and we close the channel and forget the peer.

How does that sound?

Collaborator:

Maybe 10 is even too much; 3-4 attempts should be enough.

Contributor Author:

Done

select {
case <-time.After(delay):
p.handleNewPeer(ctx, pid, outgoing)
case <-ctx.Done():
return fmt.Errorf("context cancelled")
return
}

return nil
}

func (p *PubSub) handlePeerEOF(ctx context.Context, s network.Stream) {
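A small design note on the select in handleNewPeerWithBackoff above: time.After leaves its underlying timer alive until it fires, so if the context is cancelled early during a long delay the timer lingers until expiry. That is harmless at these delay scales, but a variant with an explicitly stopped timer would look roughly like this (a sketch reusing the same receiver and types as comm.go, not part of the PR):

```go
// Variant sketch, not the PR's code: the same flow as handleNewPeerWithBackoff
// above, but with an explicit timer that is stopped if the context is
// cancelled before the backoff delay elapses.
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) {
	delay := p.deadPeerBackoff.updateAndGet(pid)

	t := time.NewTimer(delay)
	defer t.Stop()

	select {
	case <-t.C:
		p.handleNewPeer(ctx, pid, outgoing)
	case <-ctx.Done():
	}
}
```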
12 changes: 4 additions & 8 deletions pubsub.go
@@ -683,19 +683,15 @@ func (p *PubSub) handleDeadPeers() {

close(ch)

if p.host.Network().Connectedness(pid) == network.Connected {
if p.host.Network().Connectedness(pid) == network.Connected &&
!p.deadPeerBackoff.peerExceededBackoffThreshold(pid) {
Collaborator:

we might want to (debug) log this.

// still connected, must be a duplicate connection being closed.
// we respawn the writer as we need to ensure there is a stream active
log.Debugf("peer declared dead but still connected; respawning writer: %s", pid)
messages := make(chan *RPC, p.peerOutboundQueueSize)
messages <- p.getHelloPacket()
go func() {
err := p.handleNewPeerWithBackoff(p.ctx, pid, messages)
if err != nil {
log.Warnf("could not handle backoff to new peer %s", err)
close(messages)
}
}()
go p.handleNewPeerWithBackoff(p.ctx, pid, messages)
p.peers[pid] = messages
continue
}