From f62939d910b8948bae79b65e9f2aa744971963d6 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 23 Mar 2021 11:49:07 -0400 Subject: [PATCH 1/5] ci: enable -race and use go 1.15 --- .circleci/config.yml | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cbdeb1293..78ac11fb7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,7 +4,7 @@ version: 2.1 executors: go: docker: - - image: docker.mirror.hashicorp.services/circleci/golang:1.12.8 + - image: docker.mirror.hashicorp.services/circleci/golang:1.15 environment: - TEST_RESULTS: /tmp/test-results # path to where test results are saved @@ -13,20 +13,8 @@ jobs: executor: go steps: - checkout - - # Restore go module cache if there is one - - restore_cache: - keys: - - memberlist-modcache-v1-{{ checksum "go.mod" }} - - run: go mod download - # Save go module cache if the go.mod file has changed - - save_cache: - key: memberlist-modcache-v1-{{ checksum "go.mod" }} - paths: - - "/go/pkg/mod" - # check go fmt output because it does not report non-zero when there are fmt changes - run: name: check go fmt @@ -52,8 +40,7 @@ jobs: # run go tests with gotestsum - run: | - PACKAGE_NAMES=$(go list ./...) - gotestsum --format=short-verbose --junitfile $TEST_RESULTS/gotestsum-report.xml -- $PACKAGE_NAMES + gotestsum --format=short-verbose --junitfile $TEST_RESULTS/gotestsum-report.xml -- -race - store_test_results: path: /tmp/test-results - store_artifacts: From e3ce9856949d13a0394221c532da0027b96fdadb Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 23 Mar 2021 12:59:51 -0400 Subject: [PATCH 2/5] Fix data races in gossip() The `gossip` function obtains a read lock for the nodes map, but releases it once it gets the random sample of nodes. Since kNodes is a slice of pointers to nodeState, reading any fields on nodeState could race with any other goroutine that writes to the fields in nodeState. The go -race detector found a few places that were correctly using the nodeLock, but were racing because `gossip` was reading a pointer value. See hashicorp/memberlist/issues/134. There were two other callers to `kRandomNodes` that likely have the same problem because they released the read lock in the same way. This commit fixes the problem by changing `kRandomNodes` to return a slice of `Node` instead of `*nodeState`. This allows the callers to read fields of the Node without having to hold the read lock, and without racing with any writers. Once side effect of this change is that `kRandomNodes` can no longer compare Nodes using pointer address. It was changed to compare Nodes using the Node.Name, which as I understand it, should be unique. The name is used as a key in the nodeMap. --- state.go | 4 ++-- util.go | 18 +++++++++--------- util_test.go | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/state.go b/state.go index 7044f293c..04d3e44dc 100644 --- a/state.go +++ b/state.go @@ -602,13 +602,13 @@ func (m *Memberlist) gossip() { addr := node.Address() if len(msgs) == 1 { // Send single message as is - if err := m.rawSendMsgPacket(node.FullAddress(), &node.Node, msgs[0]); err != nil { + if err := m.rawSendMsgPacket(node.FullAddress(), &node, msgs[0]); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } else { // Otherwise create and send a compound message compound := makeCompoundMessage(msgs) - if err := m.rawSendMsgPacket(node.FullAddress(), &node.Node, compound.Bytes()); err != nil { + if err := m.rawSendMsgPacket(node.FullAddress(), &node, compound.Bytes()); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } diff --git a/util.go b/util.go index 22bf6b440..7ca52c4cd 100644 --- a/util.go +++ b/util.go @@ -119,35 +119,35 @@ func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { return n - numDead } -// kRandomNodes is used to select up to k random nodes, excluding any nodes where -// the filter function returns true. It is possible that less than k nodes are +// kRandomNodes is used to select up to k random Nodes, excluding any nodes where +// the exclude function returns true. It is possible that less than k nodes are // returned. -func kRandomNodes(k int, nodes []*nodeState, filterFn func(*nodeState) bool) []*nodeState { +func kRandomNodes(k int, nodes []*nodeState, exclude func(*nodeState) bool) []Node { n := len(nodes) - kNodes := make([]*nodeState, 0, k) + kNodes := make([]Node, 0, k) OUTER: // Probe up to 3*n times, with large n this is not necessary // since k << n, but with small n we want search to be // exhaustive for i := 0; i < 3*n && len(kNodes) < k; i++ { - // Get random node + // Get random nodeState idx := randomOffset(n) - node := nodes[idx] + state := nodes[idx] // Give the filter a shot at it. - if filterFn != nil && filterFn(node) { + if exclude != nil && exclude(state) { continue OUTER } // Check if we have this node already for j := 0; j < len(kNodes); j++ { - if node == kNodes[j] { + if state.Node.Name == kNodes[j].Name { continue OUTER } } // Append the node - kNodes = append(kNodes, node) + kNodes = append(kNodes, state.Node) } return kNodes } diff --git a/util_test.go b/util_test.go index 44d3b86f7..516ed6ce1 100644 --- a/util_test.go +++ b/util_test.go @@ -257,7 +257,7 @@ func TestKRandomNodes(t *testing.T) { t.Fatalf("unexpected equal") } - for _, s := range [][]*nodeState{s1, s2, s3} { + for _, s := range [][]Node{s1, s2, s3} { if len(s) != 3 { t.Fatalf("bad len") } From eb33b03311ad10602a44ac3ef330d3c9115c4a40 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 23 Mar 2021 17:15:09 -0400 Subject: [PATCH 3/5] Fix a data race in state.go A pointer value was being read outside of the lock. This meant it could race with any writer. By moving the read of fields inside the lock, it can no longer race --- state.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/state.go b/state.go index 04d3e44dc..5e4f7fdd7 100644 --- a/state.go +++ b/state.go @@ -1198,9 +1198,14 @@ func (m *Memberlist) suspectNode(s *suspect) { min := suspicionTimeout(m.config.SuspicionMult, n, m.config.ProbeInterval) max := time.Duration(m.config.SuspicionMaxTimeoutMult) * min fn := func(numConfirmations int) { + var d *dead + m.nodeLock.Lock() state, ok := m.nodeMap[s.Node] timeout := ok && state.State == StateSuspect && state.StateChange == changeTime + if timeout { + d = &dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} + } m.nodeLock.Unlock() if timeout { @@ -1210,8 +1215,8 @@ func (m *Memberlist) suspectNode(s *suspect) { m.logger.Printf("[INFO] memberlist: Marking %s as failed, suspect timeout reached (%d peer confirmations)", state.Name, numConfirmations) - d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} - m.deadNode(&d) + + m.deadNode(d) } } m.nodeTimers[s.Node] = newSuspicion(s.From, k, min, max, fn) From e5c30045a09b235d515c0926b99473019c4e2791 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 23 Mar 2021 17:18:08 -0400 Subject: [PATCH 4/5] Fix a data race in EventDelegate EventDelegate is passed a pointer to a node that can be modified by other goroutines. This causes a data race because the EventDelegate could be implemented and passed in by the caller. This commit fixes the problem in tests, but the race still exists for all other callers. It seems a more involved fix will be necessary, but I am fixing it this way for now to address other data races. The test had to be updated because the pointers no longer match. --- event_delegate.go | 9 ++++++--- state_test.go | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/event_delegate.go b/event_delegate.go index 35e2a56fd..352f98b43 100644 --- a/event_delegate.go +++ b/event_delegate.go @@ -49,13 +49,16 @@ type NodeEvent struct { } func (c *ChannelEventDelegate) NotifyJoin(n *Node) { - c.Ch <- NodeEvent{NodeJoin, n} + node := *n + c.Ch <- NodeEvent{NodeJoin, &node} } func (c *ChannelEventDelegate) NotifyLeave(n *Node) { - c.Ch <- NodeEvent{NodeLeave, n} + node := *n + c.Ch <- NodeEvent{NodeLeave, &node} } func (c *ChannelEventDelegate) NotifyUpdate(n *Node) { - c.Ch <- NodeEvent{NodeUpdate, n} + node := *n + c.Ch <- NodeEvent{NodeUpdate, &node} } diff --git a/state_test.go b/state_test.go index 9c15af848..5d88b1e03 100644 --- a/state_test.go +++ b/state_test.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "net" + "reflect" "strconv" "sync" "testing" @@ -1420,8 +1421,8 @@ func TestMemberList_AliveNode_ChangeMeta(t *testing.T) { if e.Event != NodeUpdate { t.Fatalf("bad event: %v", e) } - if e.Node != &state.Node { - t.Fatalf("bad event: %v", e) + if !reflect.DeepEqual(*e.Node, state.Node) { + t.Fatalf("expected %v, got %v", *e.Node, state.Node) } if bytes.Compare(e.Node.Meta, a.Meta) != 0 { t.Fatalf("meta did not update") From 7628fe3adef1b207e5bd52d0a77ff2fffb5005e1 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Tue, 23 Mar 2021 17:19:48 -0400 Subject: [PATCH 5/5] Fix three data races in tests Trying to use a logger that sends to t.Log is a problem when goroutines can outlive the main test goroutine. It can cause the test to panic if a write happens after the test exits. In general there is no reason to send output to t.Log because 'go test' writes output to stdout. Fix the data race by removing the logger in the integration test. Output will now go to stdout. Also fixes access to a field that should be read atomicly. Finally the third test requires a lock to access the nodes. --- integ_test.go | 3 ++- memberlist_test.go | 20 ++++++-------------- net_test.go | 1 - state_test.go | 12 +++++++++--- transport_test.go | 4 ---- z_test.go | 25 ------------------------- 6 files changed, 17 insertions(+), 48 deletions(-) delete mode 100644 z_test.go diff --git a/integ_test.go b/integ_test.go index 849081b47..c23d04f09 100644 --- a/integ_test.go +++ b/integ_test.go @@ -2,6 +2,7 @@ package memberlist import ( "fmt" + "log" "os" "testing" "time" @@ -41,7 +42,7 @@ func TestMemberlist_Integ(t *testing.T) { c.GossipInterval = 20 * time.Millisecond c.PushPullInterval = 200 * time.Millisecond c.SecretKey = secret - c.Logger = testLoggerWithName(t, c.Name) + c.Logger = log.New(os.Stderr, c.Name, log.LstdFlags) if i == 0 { c.Events = &ChannelEventDelegate{eventCh} diff --git a/memberlist_test.go b/memberlist_test.go index 38e07f77c..eac3dcae0 100644 --- a/memberlist_test.go +++ b/memberlist_test.go @@ -46,10 +46,8 @@ func testConfigNet(tb testing.TB, network byte) *Config { config.BindAddr = getBindAddrNet(network).String() config.Name = config.BindAddr config.BindPort = 0 // choose free port - if tb != nil { - config.Logger = testLoggerWithName(tb, config.Name) - } config.RequireNodeNames = true + config.Logger = log.New(os.Stderr, config.Name, log.LstdFlags) return config } @@ -186,7 +184,6 @@ func TestCreate_protocolVersion(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.ProtocolVersion = tc.version - c.Logger = testLogger(t) m, err := Create(c) if err == nil { @@ -219,7 +216,6 @@ func TestCreate_secretKey(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.SecretKey = tc.key - c.Logger = testLogger(t) m, err := Create(c) if err == nil { @@ -239,7 +235,6 @@ func TestCreate_secretKeyEmpty(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() c.SecretKey = make([]byte, 0) - c.Logger = testLogger(t) m, err := Create(c) require.NoError(t, err) @@ -253,7 +248,6 @@ func TestCreate_secretKeyEmpty(t *testing.T) { func TestCreate_keyringOnly(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() - c.Logger = testLogger(t) keyring, err := NewKeyring(nil, make([]byte, 16)) require.NoError(t, err) @@ -271,7 +265,6 @@ func TestCreate_keyringOnly(t *testing.T) { func TestCreate_keyringAndSecretKey(t *testing.T) { c := DefaultLANConfig() c.BindAddr = getBindAddr().String() - c.Logger = testLogger(t) keyring, err := NewKeyring(nil, make([]byte, 16)) require.NoError(t, err) @@ -1322,14 +1315,14 @@ func TestMemberlist_SendTo(t *testing.T) { // Ensure we got the messages if len(msgs1) != 1 { - t.Fatalf("should have 1 messages!") + t.Fatalf("expected 1 message, got %d", len(msgs1)) } if !reflect.DeepEqual(msgs1[0], []byte("pong")) { t.Fatalf("bad msg %v", msgs1[0]) } if len(msgs2) != 1 { - t.Fatalf("should have 1 messages!") + t.Fatalf("should have 1 message, got %d", len(msgs2)) } if !reflect.DeepEqual(msgs2[0], []byte("ping")) { t.Fatalf("bad msg %v", msgs2[0]) @@ -1483,7 +1476,7 @@ func TestMemberlist_Join_IPv6(t *testing.T) { c1.Name = "A" c1.BindAddr = "[::1]" c1.BindPort = 0 // choose free - c1.Logger = testLoggerWithName(t, c1.Name) + c1.Logger = log.New(os.Stderr, c1.Name, log.LstdFlags) m1, err := Create(c1) require.NoError(t, err) @@ -1494,7 +1487,7 @@ func TestMemberlist_Join_IPv6(t *testing.T) { c2.Name = "B" c2.BindAddr = "[::1]" c2.BindPort = 0 // choose free - c2.Logger = testLoggerWithName(t, c2.Name) + c2.Logger = log.New(os.Stderr, c2.Name, log.LstdFlags) m2, err := Create(c2) require.NoError(t, err) @@ -1558,7 +1551,6 @@ func TestAdvertiseAddr(t *testing.T) { c.BindAddr = bindAddr.String() c.BindPort = bindPort c.Name = c.BindAddr - c.Logger = testLoggerWithName(t, c.Name) c.AdvertiseAddr = advertiseAddr.String() c.AdvertisePort = advertisePort @@ -1783,7 +1775,7 @@ func TestMemberlist_EncryptedGossipTransition(t *testing.T) { // Set the gossip interval fast enough to get a reasonable test, // but slow enough to avoid "sendto: operation not permitted" conf.GossipInterval = 100 * time.Millisecond - conf.Logger = testLoggerWithName(t, shortName) + conf.Logger = log.New(os.Stderr, shortName, log.LstdFlags) pretty[conf.Name] = shortName return conf diff --git a/net_test.go b/net_test.go index f5326f4f6..3c15f2579 100644 --- a/net_test.go +++ b/net_test.go @@ -660,7 +660,6 @@ func TestEncryptDecryptState(t *testing.T) { SecretKey: []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, ProtocolVersion: ProtocolVersionMax, } - config.Logger = testLogger(t) m, err := Create(config) if err != nil { diff --git a/state_test.go b/state_test.go index 5d88b1e03..059918af8 100644 --- a/state_test.go +++ b/state_test.go @@ -3,10 +3,13 @@ package memberlist import ( "bytes" "fmt" + "log" "net" + "os" "reflect" "strconv" "sync" + "sync/atomic" "testing" "time" @@ -21,7 +24,7 @@ func HostMemberlist(host string, t *testing.T, f func(*Config)) *Memberlist { c.Name = host c.BindAddr = host c.BindPort = 0 // choose a free port - c.Logger = testLoggerWithName(t, host) + c.Logger = log.New(os.Stderr, host, log.LstdFlags) if f != nil { f(c) } @@ -127,8 +130,8 @@ func TestMemberList_ProbeNode_Suspect(t *testing.T) { time.Sleep(10 * time.Millisecond) // One of the peers should have attempted an indirect probe. - if m2.sequenceNum != 1 && m3.sequenceNum != 1 { - t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + if s2, s3 := atomic.LoadUint32(&m2.sequenceNum), atomic.LoadUint32(&m3.sequenceNum); s2 != 1 && s3 != 1 { + t.Fatalf("bad seqnos, expected both to be 1: %v, %v", s2, s3) } } @@ -785,9 +788,12 @@ func TestMemberList_ProbeNode_Awareness_MissedNack(t *testing.T) { probeTime := time.Now().Sub(startProbe) // Node should be reported suspect. + + m1.nodeLock.Lock() if n.State != StateSuspect { t.Fatalf("expect node to be suspect") } + m1.nodeLock.Unlock() // Make sure we timed out approximately on time. if probeTime > probeTimeMax { diff --git a/transport_test.go b/transport_test.go index 89176b04e..49745763f 100644 --- a/transport_test.go +++ b/transport_test.go @@ -19,7 +19,6 @@ func TestTransport_Join(t *testing.T) { c1 := DefaultLANConfig() c1.Name = "node1" c1.Transport = t1 - c1.Logger = testLogger(t) m1, err := Create(c1) if err != nil { t.Fatalf("err: %v", err) @@ -31,7 +30,6 @@ func TestTransport_Join(t *testing.T) { c2 := DefaultLANConfig() c2.Name = "node2" c2.Transport = net.NewTransport("node2") - c2.Logger = testLogger(t) m2, err := Create(c2) if err != nil { t.Fatalf("err: %v", err) @@ -67,7 +65,6 @@ func TestTransport_Send(t *testing.T) { c1.Name = "node1" c1.Transport = t1 c1.Delegate = d1 - c1.Logger = testLogger(t) m1, err := Create(c1) if err != nil { t.Fatalf("err: %v", err) @@ -79,7 +76,6 @@ func TestTransport_Send(t *testing.T) { c2 := DefaultLANConfig() c2.Name = "node2" c2.Transport = net.NewTransport("node2") - c2.Logger = testLogger(t) m2, err := Create(c2) if err != nil { t.Fatalf("err: %v", err) diff --git a/z_test.go b/z_test.go deleted file mode 100644 index 400a75bfc..000000000 --- a/z_test.go +++ /dev/null @@ -1,25 +0,0 @@ -package memberlist - -import ( - "log" - "strings" - "testing" -) - -func testLogger(t testing.TB) *log.Logger { - return log.New(&testWriter{t}, "test: ", log.LstdFlags) -} - -func testLoggerWithName(t testing.TB, name string) *log.Logger { - return log.New(&testWriter{t}, "test["+name+"]: ", log.LstdFlags) -} - -type testWriter struct { - t testing.TB -} - -func (tw *testWriter) Write(p []byte) (n int, err error) { - tw.t.Helper() - tw.t.Log(strings.TrimSpace(string(p))) - return len(p), nil -}