Adds exponential backoff to re-spawning new streams for supposedly dead peers #483

Merged
20 commits, merged on May 30, 2022
Changes from 3 commits
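For orientation before the file diffs: the change gives each supposedly-dead-but-still-connected peer an exponentially growing retry delay with jitter, capped at a maximum, and stops respawning the writer entirely once a fixed number of attempts has been exceeded. Below is a minimal, self-contained sketch of that policy, not the PR's code; the constant values and the standalone structure are assumptions for illustration (the real implementation in backoff.go is keyed by peer.ID and guarded by a mutex).

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// Illustrative values only; the real constants live in backoff.go.
const (
	minBackoffDelay   = 100 * time.Millisecond
	maxBackoffDelay   = 10 * time.Second
	backoffMultiplier = 2
	maxJitterMs       = 100
	maxAttempts       = 4
)

type history struct {
	duration time.Duration
	attempts int
}

// nextDelay mirrors the shape of updateAndGet: the first respawn goes out
// immediately, the second waits the minimum delay, and later ones grow
// exponentially with jitter, capped at the maximum delay.
func nextDelay(h *history) time.Duration {
	switch {
	case h.attempts == 0:
		h.duration = 0
	case h.duration < minBackoffDelay:
		h.duration = minBackoffDelay
	case h.duration < maxBackoffDelay:
		jitter := time.Duration(rand.Intn(maxJitterMs)) * time.Millisecond
		h.duration = backoffMultiplier*h.duration + jitter
		if h.duration > maxBackoffDelay || h.duration < 0 {
			h.duration = maxBackoffDelay
		}
	}
	h.attempts++
	return h.duration
}

// exceeded mirrors peerExceededBackoffThreshold: once a peer has been retried
// more than maxAttempts times, the caller stops respawning its writer.
func exceeded(h *history) bool {
	return h.attempts > maxAttempts
}

func main() {
	h := &history{}
	for !exceeded(h) {
		fmt.Println("respawn writer after", nextDelay(h))
	}
	fmt.Println("giving up on peer")
}
```

With the assumed constants, the sketch prints five delays (0, 100ms, then roughly 200-300ms, 400-700ms, and 800ms-1.5s) before giving up.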
28 changes: 19 additions & 9 deletions backoff.go
@@ -48,20 +48,23 @@ func newBackoff(ctx context.Context, sizeThreshold int, cleanupInterval time.Dur
return b
}

func (b *backoff) updateAndGet(id peer.ID) (time.Duration, bool) {
func (b *backoff) updateAndGet(id peer.ID) time.Duration {
b.mu.Lock()
defer b.mu.Unlock()

h, ok := b.info[id]
if !ok || time.Since(h.lastTried) > TimeToLive {
switch {
case !ok || time.Since(h.lastTried) > TimeToLive:
// first request goes immediately.
h = &backoffHistory{
duration: time.Duration(0),
attempts: 0,
}
} else if h.duration < MinBackoffDelay {

case h.duration < MinBackoffDelay:
h.duration = MinBackoffDelay
} else if h.duration < MaxBackoffDelay {

case h.duration < MaxBackoffDelay:
jitter := rand.Intn(MaxBackoffJitterCoff)
h.duration = (BackoffMultiplier * h.duration) + time.Duration(jitter)*time.Millisecond
if h.duration > MaxBackoffDelay || h.duration < 0 {
@@ -71,13 +74,20 @@ func (b *backoff) updateAndGet(id peer.ID) (time.Duration, bool) {

h.lastTried = time.Now()
Collaborator:

let's get the time after checking the max attempts; that will avoid the gettimeofday call in that case.

Contributor Author:

Please read my reply to the comment below, as this part has since changed.

h.attempts += 1
if h.attempts > b.maxAttempts {
delete(b.info, id)
return 0, false
}

b.info[id] = h
return h.duration, true
return h.duration
}

func (b *backoff) peerExceededBackoffThreshold(id peer.ID) bool {
b.mu.Lock()
defer b.mu.Unlock()

h, ok := b.info[id]
if !ok {
return false // no record of this peer is still there, hence fine.
}
return h.attempts > b.maxAttempts
}

func (b *backoff) cleanup() {
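The backoff.go hunks above reference several identifiers whose definitions fall outside the visible diff (backoffHistory, the backoff struct's maxAttempts field, and the package-level delay constants). As a reading aid, here is a sketch of what those declarations plausibly look like; the field layout, import path, and especially the constant values are assumptions, not taken from this diff.

```go
package pubsub

import (
	"sync"
	"time"

	"github.com/libp2p/go-libp2p-core/peer"
)

// Assumed values; the authoritative definitions live elsewhere in backoff.go.
const (
	MinBackoffDelay      = 100 * time.Millisecond
	MaxBackoffDelay      = 10 * time.Second
	TimeToLive           = 10 * time.Minute
	BackoffMultiplier    = 2
	MaxBackoffJitterCoff = 100 // jitter upper bound, in milliseconds
)

type backoffHistory struct {
	duration  time.Duration
	lastTried time.Time
	attempts  int
}

type backoff struct {
	mu          sync.Mutex
	info        map[peer.ID]*backoffHistory
	maxAttempts int
	// cleanup bookkeeping (size threshold, interval) omitted here
}
```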
53 changes: 23 additions & 30 deletions backoff_test.go
@@ -27,19 +27,15 @@ func TestBackoff_Update(t *testing.T) {
t.Fatal("non-empty info map for backoff")
}

if d, valid := b.updateAndGet(id1); d != time.Duration(0) || !valid {
if d := b.updateAndGet(id1); d != time.Duration(0) {
t.Fatalf("invalid initialization: %v", d)
}
if d, valid := b.updateAndGet(id2); d != time.Duration(0) || !valid {
if d := b.updateAndGet(id2); d != time.Duration(0) {
t.Fatalf("invalid initialization: %v", d)
}

for i := 0; i < maxBackoffAttempts-1; i++ {
got, valid := b.updateAndGet(id1)

if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(id1)

expected := time.Duration(math.Pow(BackoffMultiplier, float64(i)) *
float64(MinBackoffDelay+MaxBackoffJitterCoff*time.Millisecond))
@@ -50,40 +46,40 @@
if expected < got { // considering jitter, expected backoff must always be greater than or equal to actual.
t.Fatalf("invalid backoff result, expected: %v, got: %v", expected, got)
}
}

if len(b.info) != 2 {
t.Fatalf("pre-invalidation attempt, info map size mismatch, expected: %d, got: %d", 2, len(b.info))
}

// trying once more beyond the threshold, hence expecting an invalidation
if _, valid := b.updateAndGet(id1); valid {
t.Fatal("update beyond max attempts did not invalidate")
// update attempts on id1 are below threshold, hence peer should never go beyond backoff attempt threshold
if b.peerExceededBackoffThreshold(id1) {
t.Fatalf("invalid exceeding threshold status")
}
}

// invalidated entry must be removed
if len(b.info) != 1 {
t.Fatalf("post-invalidation attempt, info map size mismatch, expected: %d, got: %d", 1, len(b.info))
// trying once more beyond the threshold, hence expecting exceeding threshold
b.updateAndGet(id1)
if !b.peerExceededBackoffThreshold(id1) {
t.Fatal("update beyond max attempts does not reflect threshold")
}

got, valid := b.updateAndGet(id2)
if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(id2)
if got != MinBackoffDelay {
t.Fatalf("invalid backoff result, expected: %v, got: %v", MinBackoffDelay, got)
}

// sets last tried of id2 to long ago that it resets back upon next try.
b.info[id2].lastTried = time.Now().Add(-TimeToLive)
got, valid = b.updateAndGet(id2)
if !valid {
t.Fatalf("update attempt invalidated")
}
got = b.updateAndGet(id2)
if got != time.Duration(0) {
t.Fatalf("invalid ttl expiration, expected: %v, got: %v", time.Duration(0), got)
}

// update attempts on id2 are below threshold, hence peer should never go beyond backoff attempt threshold
if b.peerExceededBackoffThreshold(id2) {
t.Fatalf("invalid exceeding threshold status")
}

if len(b.info) != 2 {
t.Fatalf("pre-invalidation attempt, info map size mismatch, expected: %d, got: %d", 2, len(b.info))
}

}

func TestBackoff_Clean(t *testing.T) {
@@ -109,10 +105,7 @@ func TestBackoff_Clean(t *testing.T) {
time.Sleep(2 * cleanupInterval)

// next update should trigger cleanup
got, valid := b.updateAndGet(peer.ID("some-new-peer"))
if !valid {
t.Fatalf("update attempt invalidated")
}
got := b.updateAndGet(peer.ID("some-new-peer"))
if got != time.Duration(0) {
t.Fatalf("invalid backoff result, expected: %v, got: %v", time.Duration(0), got)
}
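The loop in TestBackoff_Update above compares each observed delay against expected = BackoffMultiplier^i * (MinBackoffDelay + MaxBackoffJitterCoff*time.Millisecond), a deliberately loose upper bound: with the assumed multiplier of 2, each backoff step at most doubles the previous delay and adds strictly less than MaxBackoffJitterCoff milliseconds of jitter, so the bound always stays ahead of the worst case. A standalone sketch of that arithmetic with assumed constant values (100 ms minimum delay, multiplier 2, jitter coefficient 100):

```go
package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	// Assumed values, for illustration only.
	minDelay := 100 * time.Millisecond
	jitterCap := 100 * time.Millisecond
	multiplier := 2.0

	worst := minDelay // worst-case delay for the first backed-off attempt
	for i := 0; i < 4; i++ {
		bound := time.Duration(math.Pow(multiplier, float64(i)) * float64(minDelay+jitterCap))
		fmt.Printf("i=%d  bound=%v  worst-case actual=%v\n", i, bound, worst)
		// next worst case: double the delay and add the jitter cap
		// (real jitter is strictly below the cap)
		worst = 2*worst + jitterCap
	}
	// i=0: bound 200ms, actual <= 100ms
	// i=1: bound 400ms, actual <  300ms
	// i=2: bound 800ms, actual <  700ms
	// i=3: bound 1.6s,  actual <  1.5s
}
```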
12 changes: 3 additions & 9 deletions comm.go
@@ -3,7 +3,6 @@ package pubsub
import (
"bufio"
"context"
"fmt"
"io"
"time"

@@ -123,20 +122,15 @@ func (p *PubSub) handleNewPeer(ctx context.Context, pid peer.ID, outgoing <-chan
}
}

func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) error {
delay, valid := p.deadPeerBackoff.updateAndGet(pid)
if !valid {
return fmt.Errorf("backoff attempts to %s expired after reaching maximum allowed", pid)
}
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) {
delay := p.deadPeerBackoff.updateAndGet(pid)
Collaborator:

I think we need to add a failure mode where, if we have backed off too much, we simply give up; say we try up to 10 times, then updateAndGet returns an error and we close the channel and forget the peer.

How does that sound?

Collaborator:

Maybe 10 is even too much; 3-4 attempts should be enough.

Contributor Author:

Done

select {
case <-time.After(delay):
p.handleNewPeer(ctx, pid, outgoing)
case <-ctx.Done():
return fmt.Errorf("context cancelled")
return
}

return nil
}

func (p *PubSub) handlePeerEOF(ctx context.Context, s network.Stream) {
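A small design note on the select in handleNewPeerWithBackoff above: time.After leaves its underlying timer alive until it fires, so if the context is cancelled early during a long delay the timer lingers until expiry. That is harmless at these delay scales, but a variant with an explicitly stopped timer would look roughly like this (a sketch reusing the same receiver and types as comm.go, not part of the PR):

```go
// Variant sketch, not the PR's code: the same flow as handleNewPeerWithBackoff
// above, but with an explicit timer that is stopped if the context is
// cancelled before the backoff delay elapses.
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) {
	delay := p.deadPeerBackoff.updateAndGet(pid)

	t := time.NewTimer(delay)
	defer t.Stop()

	select {
	case <-t.C:
		p.handleNewPeer(ctx, pid, outgoing)
	case <-ctx.Done():
	}
}
```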
12 changes: 4 additions & 8 deletions pubsub.go
@@ -683,19 +683,15 @@ func (p *PubSub) handleDeadPeers() {

close(ch)

if p.host.Network().Connectedness(pid) == network.Connected {
if p.host.Network().Connectedness(pid) == network.Connected &&
!p.deadPeerBackoff.peerExceededBackoffThreshold(pid) {
Collaborator:

we might want to (debug) log this.

// still connected, must be a duplicate connection being closed.
// we respawn the writer as we need to ensure there is a stream active
log.Debugf("peer declared dead but still connected; respawning writer: %s", pid)
messages := make(chan *RPC, p.peerOutboundQueueSize)
messages <- p.getHelloPacket()
go func() {
err := p.handleNewPeerWithBackoff(p.ctx, pid, messages)
if err != nil {
log.Warnf("could not handle backoff to new peer %s", err)
close(messages)
}
}()
go p.handleNewPeerWithBackoff(p.ctx, pid, messages)
p.peers[pid] = messages
continue
}