From 8648239afac793f28452fd67a338ee50135674cb Mon Sep 17 00:00:00 2001
From: Evan Wallace
Date: Thu, 8 Dec 2022 14:48:15 -0500
Subject: [PATCH] fix #2727: avoid `sync.WaitGroup` to fix hang

---
 CHANGELOG.md                  |  6 ++++
 cmd/esbuild/service.go        | 56 +++++------
 internal/helpers/waitgroup.go | 37 +++++++++++++++++++++++
 scripts/js-api-tests.js       | 48 ++++++++++++++++++++++++++++++
 scripts/test-yarnpnp.js       |  3 +-
 5 files changed, 100 insertions(+), 50 deletions(-)
 create mode 100644 internal/helpers/waitgroup.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b56afa50c65..3f537bc4d20 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## Unreleased
+
+* Fix a hang with the JS API in certain cases ([#2727](https://github.com/evanw/esbuild/issues/2727))
+
+  A change that was made in version 0.15.13 accidentally introduced a case where using esbuild's JS API could cause the node process to fail to exit. The change broke esbuild's watchdog timer, which detects whether the parent process still exists and automatically exits esbuild when it doesn't. The hang happened when you ran node as a child process with the `stderr` stream set to `pipe` instead of `inherit`, called esbuild's JS API in that child process with `incremental: true` without calling `dispose()` on the returned `rebuild` object, and then called `process.exit()`. In that case the parent node process kept waiting for the esbuild process created by the child node process to exit. The change in version 0.15.13 was an attempt to avoid using Go's `sync.WaitGroup` API incorrectly, since that API is not thread-safe. I have now reverted that change and implemented a thread-safe version of the `sync.WaitGroup` API for esbuild to use instead.
+
 ## 0.16.2
 
 * Fix `process.env.NODE_ENV` substitution when transforming ([#2718](https://github.com/evanw/esbuild/issues/2718))
diff --git a/cmd/esbuild/service.go b/cmd/esbuild/service.go
index 1c133f7f01e..09dfe0fbe16 100644
--- a/cmd/esbuild/service.go
+++ b/cmd/esbuild/service.go
@@ -43,10 +43,9 @@ type serviceType struct {
 	callbacks          map[uint32]responseCallback
 	activeBuilds       map[int]*activeBuild
 	outgoingPackets    chan outgoingPacket
-	keepAliveWaitGroup sync.WaitGroup
+	keepAliveWaitGroup *helpers.ThreadSafeWaitGroup
 	mutex              sync.Mutex
 	nextRequestID      uint32
-	disableSendRequest bool
 }
 
 func (service *serviceType) getActiveBuild(key int) *activeBuild {
@@ -98,9 +97,10 @@ func runService(sendPings bool) {
 	logger.API = logger.JSAPI
 
 	service := serviceType{
-		callbacks:       make(map[uint32]responseCallback),
-		activeBuilds:    make(map[int]*activeBuild),
-		outgoingPackets: make(chan outgoingPacket),
+		callbacks:          make(map[uint32]responseCallback),
+		activeBuilds:       make(map[int]*activeBuild),
+		outgoingPackets:    make(chan outgoingPacket),
+		keepAliveWaitGroup: helpers.MakeThreadSafeWaitGroup(),
 	}
 	buffer := make([]byte, 16*1024)
 	stream := []byte{}
@@ -119,34 +119,11 @@ func runService(sendPings bool) {
 	// The protocol always starts with the version
 	os.Stdout.Write(append(writeUint32(nil, uint32(len(esbuildVersion))), esbuildVersion...))
 
-	// IMPORTANT: To avoid a data race, we must ensure that calling "Add()" on a
-	// "WaitGroup" with a counter of zero cannot possibly happen concurrently
-	// with calling "Wait()" on another goroutine. Thus we must ensure that the
-	// counter starts off positive before "Wait()" is called and that it only
-	// ever reaches zero exactly once (and that the counter never goes back above
-	// zero once it reaches zero). See this for discussion and more information:
-	// https://github.com/evanw/esbuild/issues/2485#issuecomment-1299318498
+	// Wait for the last response to be written to stdout before returning from
+	// the enclosing function, which will return from "main()" and exit.
 	service.keepAliveWaitGroup.Add(1)
 	defer func() {
-		// Stop all future calls to "sendRequest()" from calling "Add()" on our
-		// "WaitGroup" while we're calling "Wait()", since it may have a counter of
-		// zero at that point. This is a mutex so calls to "sendRequest()" must
-		// fall into one of two cases:
-		//
-		// a) The critical section in "sendRequest()" comes before this critical
-		//    section and "Add()" is called while the counter is non-zero, which
-		//    is fine.
-		//
-		// b) The critical section in "sendRequest()" comes after this critical
-		//    section and it does not call "Add()", which is also fine.
-		//
-		service.mutex.Lock()
-		service.disableSendRequest = true
 		service.keepAliveWaitGroup.Done()
-		service.mutex.Unlock()
-
-		// Wait for the last response to be written to stdout before returning from
-		// the enclosing function, which will return from "main()" and exit.
 		service.keepAliveWaitGroup.Wait()
 	}()
@@ -222,26 +199,7 @@ func (service *serviceType) sendRequest(request interface{}) interface{} {
 		return id
 	}()
 
-	// This function can be called from any thread. For example, it might be called
-	// by the implementation of watch or serve mode, or by esbuild's keep-alive
-	// timer goroutine. To avoid data races, we must ensure that it's not possible
-	// for "Add()" to be called on a "WaitGroup" with a counter of zero at the
-	// same time as "Wait()" is called on another goroutine.
-	//
-	// There's a potential data race when the stdin thread has finished reading
-	// from stdin because stdin has been closed but while it's calling "Wait()"
-	// on the "WaitGroup" with a counter of zero, some other thread calls
-	// "sendRequest()" which calls "Add()".
-	//
-	// This data race is prevented by not sending any more requests once the
-	// stdin thread has finished. This is ok because we are about to exit.
-	service.mutex.Lock()
-	if service.disableSendRequest {
-		service.mutex.Unlock()
-		return nil
-	}
 	service.keepAliveWaitGroup.Add(1) // The writer thread will call "Done()"
-	service.mutex.Unlock()
 	service.outgoingPackets <- outgoingPacket{
 		bytes: encodePacket(packet{
 			id:       id,
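The comments deleted above describe a constraint documented on Go's `sync.WaitGroup`: a call to `Add` with a positive delta that starts while the counter is zero must happen before `Wait`. Here is a minimal sketch of the keep-alive pattern that runs into that constraint; the goroutines and channel are illustrative stand-ins, not esbuild's actual code:

```go
package main

import "sync"

func main() {
	var wg sync.WaitGroup
	wg.Add(1) // the counter starts positive while the service is running

	requests := make(chan int)

	// Writer goroutine: pairs a "Done()" with each "Add(1)" below.
	go func() {
		for range requests {
			wg.Done()
		}
	}()

	// Keep-alive timer goroutine: may call "Add(1)" at any moment, with no
	// way to guarantee it happens before the "Wait()" below once the counter
	// has dropped to zero. This is the disallowed usage.
	go func() {
		wg.Add(1)
		requests <- 0
	}()

	// Shutdown path (stdin was closed): release the initial count, then wait
	// for the writer to finish. If the counter is zero here while the
	// goroutine above calls "Add(1)", the two calls race.
	wg.Done()
	wg.Wait()
}
```

The old code worked around this with a mutex and the `disableSendRequest` flag; the patch instead replaces `sync.WaitGroup` with the type below, which allows `Add` to run concurrently with `Wait`.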
diff --git a/internal/helpers/waitgroup.go b/internal/helpers/waitgroup.go
new file mode 100644
index 00000000000..850e088280e
--- /dev/null
+++ b/internal/helpers/waitgroup.go
@@ -0,0 +1,37 @@
+package helpers
+
+import "sync/atomic"
+
+// Go's "sync.WaitGroup" is not thread-safe. Specifically it's not safe to call
+// "Add" concurrently with "Wait", which is problematic because we have a case
+// where we would like to do that.
+//
+// This is a simple alternative implementation of "sync.WaitGroup" that is
+// thread-safe and that works for our purposes. We don't need to worry about
+// multiple waiters so the implementation can be very simple.
+type ThreadSafeWaitGroup struct {
+	counter int32
+	channel chan struct{}
+}
+
+func MakeThreadSafeWaitGroup() *ThreadSafeWaitGroup {
+	return &ThreadSafeWaitGroup{
+		channel: make(chan struct{}, 1),
+	}
+}
+
+func (wg *ThreadSafeWaitGroup) Add(delta int32) {
+	if counter := atomic.AddInt32(&wg.counter, delta); counter == 0 {
+		wg.channel <- struct{}{}
+	} else if counter < 0 {
+		panic("sync: negative WaitGroup counter")
+	}
+}
+
+func (wg *ThreadSafeWaitGroup) Done() {
+	wg.Add(-1)
+}
+
+func (wg *ThreadSafeWaitGroup) Wait() {
+	<-wg.channel
+}
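As a usage illustration, here is how the new type slots into the keep-alive pattern from service.go above. Only the `helpers` API comes from this patch; the surrounding program is hypothetical, and the ordering at the end is forced only to keep the sketch deterministic:

```go
package main

import (
	"fmt"

	"github.com/evanw/esbuild/internal/helpers"
)

func main() {
	wg := helpers.MakeThreadSafeWaitGroup()

	// Keep the counter positive for the lifetime of the service so that it
	// only reaches zero once, during shutdown.
	wg.Add(1)

	done := make(chan struct{})
	go func() {
		// Unlike "sync.WaitGroup", it is safe for this "Add(1)" to run even
		// while another goroutine is already blocked in "Wait()".
		wg.Add(1)
		fmt.Println("write one last packet to stdout")
		wg.Done() // in esbuild this is done by the writer thread
		close(done)
	}()

	<-done    // ordering forced only to keep this sketch deterministic
	wg.Done() // release the initial count
	wg.Wait() // returns once the counter reaches zero
}
```

Because there is exactly one waiter and the counter reaches zero during shutdown, a buffered channel of size one is all the signaling the implementation needs.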
diff --git a/scripts/js-api-tests.js b/scripts/js-api-tests.js
index 3225c414259..09404f6d44c 100644
--- a/scripts/js-api-tests.js
+++ b/scripts/js-api-tests.js
@@ -1,4 +1,5 @@
 const { installForTests, removeRecursiveSync, writeFileAtomic } = require('./esbuild')
+const child_process = require('child_process')
 const assert = require('assert')
 const path = require('path')
 const http = require('http')
@@ -5797,6 +5798,52 @@ ${path.relative(process.cwd(), input).replace(/\\/g, '/')}:1:2: ERROR: Unexpecte
   },
 }
 
+let childProcessTests = {
+  // More info about this test case: https://github.com/evanw/esbuild/issues/2727
+  async testIncrementalChildProcessExit({ testDir, esbuild }) {
+    const file = path.join(testDir, 'build.js')
+
+    await writeFileAsync(file, `
+      const esbuild = require(${JSON.stringify(esbuild.ESBUILD_PACKAGE_PATH)})
+      esbuild.build({
+        entryPoints: [],
+        incremental: true,
+      }).then(() => {
+        console.log('success')
+        process.exit(0)
+      })
+    `)
+
+    let timeout
+    const detectHangPromise = new Promise((_, reject) => {
+      timeout = setTimeout(() => {
+        reject(new Error('Timed out waiting for keep-alive check to terminate'))
+      }, 5 * 60 * 1000)
+    })
+
+    const testKeepAlivePingPromise = new Promise((resolve, reject) => {
+      child_process.execFile('node', [file], {
+        stdio: [
+          'inherit',
+          'inherit',
+          'pipe', // This is important for the test to check for the hang
+        ],
+      }, (error, stdout, stderr) => {
+        clearTimeout(timeout)
+        if (error) reject(error)
+        else if (stdout !== 'success\n') reject(new Error('Unexpected stdout: ' + JSON.stringify(stdout)))
+        else if (stderr !== '') reject(new Error('Unexpected stderr: ' + JSON.stringify(stderr)))
+        else resolve()
+      })
+    })
+
+    await Promise.race([
+      detectHangPromise,
+      testKeepAlivePingPromise,
+    ])
+  },
+}
+
 async function assertSourceMap(jsSourceMap, source) {
   jsSourceMap = JSON.parse(jsSourceMap)
   assert.deepStrictEqual(jsSourceMap.version, 3)
@@ -5840,6 +5887,7 @@ async function main() {
     ...Object.entries(formatTests),
     ...Object.entries(analyzeTests),
     ...Object.entries(syncTests),
+    ...Object.entries(childProcessTests),
   ]
 
   let allTestsPassed = (await Promise.all(tests.map(runTest))).every(success => success)
diff --git a/scripts/test-yarnpnp.js b/scripts/test-yarnpnp.js
index 0395dafe9c7..b085b722cd2 100644
--- a/scripts/test-yarnpnp.js
+++ b/scripts/test-yarnpnp.js
@@ -78,10 +78,11 @@ function runTests() {
 }
 
 const minutes = 10
-setTimeout(() => {
+const timeout = setTimeout(() => {
   console.error(`❌ Yarn PnP tests timed out after ${minutes} minutes`)
   process.exit(1)
 }, minutes * 60 * 1000)
 
 reinstallYarnIfNeeded()
 runTests()
+clearTimeout(timeout)
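For background on the watchdog mentioned in the changelog entry: the idea is that a keep-alive goroutine periodically pings the parent process and exits esbuild once the parent is gone. The sketch below is a hypothetical simplification of that pattern, not esbuild's actual implementation; it matters to this patch because each such ping flows through `sendRequest`, which is exactly why `Add` must be safe to call concurrently with the final `Wait`:

```go
package main

import (
	"os"
	"time"
)

// startWatchdog spawns a goroutine that pings the parent process once per
// second. When the parent is gone, writing to the inherited pipe fails and
// the child exits instead of hanging around forever.
func startWatchdog(sendPing func() error) {
	go func() {
		for range time.Tick(time.Second) {
			if err := sendPing(); err != nil {
				os.Exit(1)
			}
		}
	}()
}

func main() {
	startWatchdog(func() error {
		// Stand-in for encoding and writing a real "ping" packet.
		_, err := os.Stdout.Write([]byte("ping\n"))
		return err
	})
	select {} // placeholder for the real request-serving loop
}
```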