Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallel envtests getting stuck in flock #1599

Closed
Porges opened this issue Jul 21, 2021 · 1 comment · Fixed by #1605
Closed

Parallel envtests getting stuck in flock #1599

Porges opened this issue Jul 21, 2021 · 1 comment · Fixed by #1605

Comments

@Porges
Copy link

Porges commented Jul 21, 2021

Per this comment (cc @vincepri). We recently updated to v0.9.2 and encountered test failures due to timeout where it looks like a goroutine is getting stuck trying to flock a file.

It seems that #1563 would be the culprit, and downgrading to v0.9.1 did allow our test run to pass.

At the moment we don’t limit the number of parallel envtests (we use t.Parallel()), so we are potentially running up to 19 instances (the number of tests we have). We are running our tests in a container based on a VS Code devcontainer image (uname -a shows Linux ca4e9d7b3b89 5.10.43.3-microsoft-standard-WSL2 #1 SMP Wed Jun 16 23:47:55 UTC 2021 x86_64 GNU/Linux). The host is GitHub’s ubuntu-latest.

An example failure is here. The goroutine-dump output from the test run is:

panic: test timed out after 10m0s

goroutine 28671 [running]:
testing.(*M).startAlarm.func1()
	/usr/local/go/src/testing/testing.go:1700 +0xe5
created by time.goFunc
	/usr/local/go/src/time/sleep.go:180 +0x45

goroutine 1 [chan receive, 2 minutes]:
testing.tRunner.func1(0xc0003fe480)
	/usr/local/go/src/testing/testing.go:1159 +0x2bc
testing.tRunner(0xc0003fe480, 0xc00018fca8)
	/usr/local/go/src/testing/testing.go:1197 +0x125
testing.runTests(0xc00047cc18, 0x2e1ce40, 0x14, 0x14, 0xc035f59d34ae68b2, 0x8bb467a93b, 0x2e4a840, 0xc0003e5a80)
	/usr/local/go/src/testing/testing.go:1509 +0x2fe
testing.(*M).Run(0xc000463480, 0x0)
	/usr/local/go/src/testing/testing.go:1417 +0x1eb
github.com/Azure/azure-service-operator/hack/generated/pkg/testcommon.SetupTeardownTestMain(0xc000463480, 0xc00018fe01, 0xc00018fe80, 0x1eec7c0, 0x0)
	/workspace/hack/generated/pkg/testcommon/wait.go:68 +0x185
github.com/Azure/azure-service-operator/hack/generated/controllers_test.TestMain(0xc000463480)
	/workspace/hack/generated/controllers/suite_test.go:47 +0x7c
main.main()
	_testmain.go:527 +0x20e

goroutine 19 [chan receive]:
k8s.io/klog/v2.(*loggingT).flushDaemon(0x2e4ab60)
	/go/pkg/mod/k8s.io/klog/v2@v2.8.0/klog.go:1164 +0x8b
created by k8s.io/klog/v2.init.0
	/go/pkg/mod/k8s.io/klog/v2@v2.8.0/klog.go:418 +0xdf

goroutine 22206 [syscall, 5 minutes]:
syscall.Syscall6(0xf7, 0x1, 0x4af4, 0xc0069e8dd8, 0x1000004, 0x0, 0x0, 0xc0069e8f32, 0x2, 0x3)
	/usr/local/go/src/syscall/asm_linux_amd64.s:43 +0x5
os.(*Process).blockUntilWaitable(0xc0088c2210, 0xc0069e8f32, 0xc0069a5f90, 0xc0069e8f00)
	/usr/local/go/src/os/wait_waitid.go:32 +0x9e
os.(*Process).wait(0xc0088c2210, 0x1eefd30, 0xc0069e8f60, 0x71c085)
	/usr/local/go/src/os/exec_unix.go:22 +0x39
os.(*Process).Wait(...)
	/usr/local/go/src/os/exec.go:129
os/exec.(*Cmd).Wait(0xc0088a9e40, 0xc006afff61, 0x1)
	/usr/local/go/src/os/exec/exec.go:507 +0x65
sigs.k8s.io/controller-runtime/pkg/internal/testing/process.(*State).Start.func1(0xc0079bc480)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/process/process.go:175 +0x65
created by sigs.k8s.io/controller-runtime/pkg/internal/testing/process.(*State).Start
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/process/process.go:173 +0x2a5

goroutine 60 [syscall, 5 minutes]:
syscall.Syscall(0x49, 0x48, 0x2, 0x0, 0x1da2c20, 0x0, 0xc0089cc150)
	/usr/local/go/src/syscall/asm_linux_amd64.s:20 +0x5
golang.org/x/sys/unix.Flock(0x48, 0x2, 0x2b, 0x80042)
	/go/pkg/mod/golang.org/x/sys@v0.0.0-20210603081109-ebe580a85c40/unix/zsyscall_linux.go:731 +0x4b
sigs.k8s.io/controller-runtime/pkg/internal/flock.Acquire(0xc0089cc120, 0x2b, 0xc0056f6358, 0x3)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/flock/flock_unix.go:34 +0x90
sigs.k8s.io/controller-runtime/pkg/internal/testing/addr.(*portCache).add(0x2e79f40, 0x9da7, 0x9da7, 0xc00897c2d0, 0x9)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/addr/manager.go:79 +0x168
sigs.k8s.io/controller-runtime/pkg/internal/testing/addr.Suggest(0x0, 0x0, 0x1e0a302, 0x4, 0xc0002952a0, 0x1c, 0xc0056f6490)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/addr/manager.go:119 +0xa5
sigs.k8s.io/controller-runtime/pkg/internal/testing/controlplane.(*APIServer).configurePorts(0xc007232e00, 0x1e1777e, 0xe)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/controlplane/apiserver.go:215 +0x232
sigs.k8s.io/controller-runtime/pkg/internal/testing/controlplane.(*APIServer).setProcessState(0xc007232e00, 0x0, 0x0)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/controlplane/apiserver.go:248 +0x126
sigs.k8s.io/controller-runtime/pkg/internal/testing/controlplane.(*APIServer).prepare(0xc007232e00, 0x0, 0x0)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/controlplane/apiserver.go:170 +0x2b
sigs.k8s.io/controller-runtime/pkg/internal/testing/controlplane.(*APIServer).Start(0xc007232e00, 0x0, 0x0)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/controlplane/apiserver.go:163 +0x2f
sigs.k8s.io/controller-runtime/pkg/internal/testing/controlplane.(*ControlPlane).Start(0xc00799d180, 0x0, 0xc008816780)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/internal/testing/controlplane/plane.go:62 +0xa7
sigs.k8s.io/controller-runtime/pkg/envtest.(*Environment).startControlPlane(0xc00799d180, 0x1e22e26, 0x16)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/envtest/server.go:326 +0xc5
sigs.k8s.io/controller-runtime/pkg/envtest.(*Environment).Start(0xc00799d180, 0xc0056f6d48, 0x1, 0x1)
	/go/pkg/mod/sigs.k8s.io/controller-runtime@v0.9.2/pkg/envtest/server.go:260 +0x3b7
github.com/Azure/azure-service-operator/hack/generated/pkg/testcommon.createEnvtestContext(0x1e0ec33, 0x7, 0xc0003e5a80, 0x1, 0xc0002ce000, 0x2082800, 0xc0078ed0c0, 0xc00887ad40, 0x208c958, 0xc0088b0180, ...)
	/workspace/hack/generated/pkg/testcommon/kube_test_context_envtest.go:43 +0x2c5
github.com/Azure/azure-service-operator/hack/generated/pkg/testcommon.KubeGlobalContext.ForTest(0x1e0ec33, 0x7, 0xc0003e5a80, 0x1, 0x1, 0x1e136c0, 0xb, 0x1e2dac2, 0x1e, 0x1e2daa4, ...)
	/workspace/hack/generated/pkg/testcommon/kube_per_test_context.go:124 +0x279
github.com/Azure/azure-service-operator/hack/generated/controllers_test.storageAccountAndResourceGroupProvisionedOutOfOrderHelper(0xc0002ce000, 0x1eec868)
	/workspace/hack/generated/controllers/edge_case_test.go:37 +0x85
github.com/Azure/azure-service-operator/hack/generated/controllers_test.Test_StorageAccount_CreatedInParallelWithResourceGroup(0xc0002ce000)
	/workspace/hack/generated/controllers/edge_case_test.go:114 +0x37
testing.tRunner(0xc0002ce000, 0x1eec840)
	/usr/local/go/src/testing/testing.go:1193 +0xef
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1238 +0x2b3

goroutine 61 [chan send, 5 minutes]:
testing.tRunner.func1.1(0xc009545ed5, 0x0, 0x0, 0xc0002ce300, 0xc009545e01)
	/usr/local/go/src/testing/testing.go:1123 +0x59
testing.tRunner.func1(0xc0002ce300)
	/usr/local/go/src/testing/testing.go:1184 +0x405
testing.tRunner(0xc0002ce300, 0x1eec848)
	/usr/local/go/src/testing/testing.go:1197 +0x125
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1238 +0x2b3

goroutine 62 [chan send, 5 minutes]:
testing.tRunner.func1.1(0xc007f9bed5, 0x0, 0x0, 0xc0002ce600, 0xc007f9be01)
	/usr/local/go/src/testing/testing.go:1123 +0x59
testing.tRunner.func1(0xc0002ce600)
	/usr/local/go/src/testing/testing.go:1184 +0x405
testing.tRunner(0xc0002ce600, 0x1eec850)
	/usr/local/go/src/testing/testing.go:1197 +0x125
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1238 +0x2b3

goroutine 63 [chan send, 8 minutes]:
testing.tRunner.func1.1(0xc001c5fed5, 0x0, 0x0, 0xc0002ce900, 0xc001c5fe01)
	/usr/local/go/src/testing/testing.go:1123 +0x59
testing.tRunner.func1(0xc0002ce900)
	/usr/local/go/src/testing/testing.go:1184 +0x405
testing.tRunner(0xc0002ce900, 0x1eec7d0)
	/usr/local/go/src/testing/testing.go:1197 +0x125
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1238 +0x2b3

goroutine 64 [chan send, 8 minutes]:
testing.tRunner.func1.1(0xc002169ed5, 0x0, 0x0, 0xc0002cec00, 0xc002169e01)
	/usr/local/go/src/testing/testing.go:1123 +0x59
testing.tRunner.func1(0xc0002cec00)
	/usr/local/go/src/testing/testing.go:1184 +0x405
testing.tRunner(0xc0002cec00, 0x1eec7d8)
	/usr/local/go/src/testing/testing.go:1197 +0x125
created by testing.(*T).Run
	/usr/local/go/src/testing/testing.go:1238 +0x2b3

goroutine 65 [chan receive, 10 minutes]:
testing.runTests.func1.1(0xc0003fe480)
	/usr/local/go/src/testing/testing.go:1516 +0x3b
created by testing.runTests.func1
	/usr/local/go/src/testing/testing.go:1516 +0xac
@vincepri
Copy link
Member

vincepri commented Jul 21, 2021

@Porges Does it work if you carry this patch with the non-blocking flag?

diff --git a/go.mod b/go.mod
index 0ce3b6c1..8e8e3f0d 100644
--- a/go.mod
+++ b/go.mod
@@ -12,6 +12,7 @@ require (
 	github.com/imdario/mergo v0.3.12 // indirect
 	github.com/onsi/ginkgo v1.16.4
 	github.com/onsi/gomega v1.13.0
+	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.11.0
 	github.com/prometheus/client_model v0.2.0
 	go.uber.org/goleak v1.1.10
diff --git a/pkg/internal/flock/flock_unix.go b/pkg/internal/flock/flock_unix.go
index 3dae621b..503de43b 100644
--- a/pkg/internal/flock/flock_unix.go
+++ b/pkg/internal/flock/flock_unix.go
@@ -18,7 +18,14 @@ limitations under the License.
 
 package flock
 
-import "golang.org/x/sys/unix"
+import (
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+var (
+	ErrAlreadyLocked = errors.New("the file is already locked")
+)
 
 // Acquire acquires a lock on a file for the duration of the process. This method
 // is reentrant.
@@ -30,6 +37,10 @@ func Acquire(path string) error {
 
 	// We don't need to close the fd since we should hold
 	// it until the process exits.
+	err = unix.Flock(fd, unix.LOCK_NB|unix.LOCK_EX)
+	if errors.Is(err, unix.EWOULDBLOCK) { // This condition requires LOCK_NB.
+		return errors.Wrapf(ErrAlreadyLocked, "cannot lock file %q", path)
+	}
+	return err
 
-	return unix.Flock(fd, unix.LOCK_EX)
 }
diff --git a/pkg/internal/testing/addr/manager.go b/pkg/internal/testing/addr/manager.go
index 2326af15..9dcbaa43 100644
--- a/pkg/internal/testing/addr/manager.go
+++ b/pkg/internal/testing/addr/manager.go
@@ -17,6 +17,7 @@ limitations under the License.
 package addr
 
 import (
+	"errors"
 	"fmt"
 	"io/fs"
 	"net"
@@ -31,7 +32,7 @@ import (
 // TODO(directxman12): interface / release functionality for external port managers
 
 const (
-	portReserveTime   = 10 * time.Minute
+	portReserveTime   = 2 * time.Minute
 	portConflictRetry = 100
 	portFilePrefix    = "port-"
 )
@@ -76,7 +77,8 @@ func (c *portCache) add(port int) (bool, error) {
 		return false, err
 	}
 	// Try allocating new port, by acquiring a file.
-	if err := flock.Acquire(fmt.Sprintf("%s/%s%d", cacheDir, portFilePrefix, port)); os.IsExist(err) {
+	path := fmt.Sprintf("%s/%s%d", cacheDir, portFilePrefix, port)
+	if err := flock.Acquire(path); errors.Is(err, os.ErrExist) || errors.Is(err, flock.ErrAlreadyLocked) {
 		return false, nil
 	} else if err != nil {
 		return false, err
@@ -86,22 +88,19 @@ func (c *portCache) add(port int) (bool, error) {
 
 var cache = &portCache{}
 
-func suggest(listenHost string) (int, string, error) {
+func suggest(listenHost string) (*net.TCPListener, int, string, error) {
 	if listenHost == "" {
 		listenHost = "localhost"
 	}
 	addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(listenHost, "0"))
 	if err != nil {
-		return -1, "", err
+		return nil, -1, "", err
 	}
 	l, err := net.ListenTCP("tcp", addr)
 	if err != nil {
-		return -1, "", err
+		return nil, -1, "", err
 	}
-	if err := l.Close(); err != nil {
-		return -1, "", err
-	}
-	return l.Addr().(*net.TCPAddr).Port,
+	return l, l.Addr().(*net.TCPAddr).Port,
 		addr.IP.String(),
 		nil
 }
@@ -112,10 +111,11 @@ func suggest(listenHost string) (int, string, error) {
 // allocated within 1 minute.
 func Suggest(listenHost string) (int, string, error) {
 	for i := 0; i < portConflictRetry; i++ {
-		port, resolvedHost, err := suggest(listenHost)
+		listener, port, resolvedHost, err := suggest(listenHost)
 		if err != nil {
 			return -1, "", err
 		}
+		defer listener.Close()
 		if ok, err := cache.add(port); ok {
 			return port, resolvedHost, nil
 		} else if err != nil {

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants