Skip to content

Commit

Permalink
fix #2727: avoid sync.WaitGroup to fix hang
Browse files Browse the repository at this point in the history
  • Loading branch information
evanw committed Dec 8, 2022
1 parent 2ec1f5b commit 8648239
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 50 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog

## Unreleased

* Fix a hang with the JS API in certain cases ([#2727](https://github.com/evanw/esbuild/issues/2727))

A change that was made in version 0.15.13 accidentally introduced a case where using esbuild's JS API could cause the node process to fail to exit. The change broke esbuild's watchdog timer, which detects whether the parent process still exists and automatically exits esbuild when it does not. The hang happened when you ran node as a child process with its `stderr` stream set to `pipe` instead of `inherit`, called esbuild's JS API in that child process with `incremental: true` but did not call `dispose()` on the returned `rebuild` object, and then called `process.exit()`. In that case the parent node process was still waiting for the esbuild process that was created by the child node process to exit. The change made in version 0.15.13 was an attempt to avoid a usage pattern of Go's `sync.WaitGroup` API that is not safe (calling `Add` concurrently with `Wait` while the counter may be zero). I have now reverted that change and instead implemented a `sync.WaitGroup`-like API for esbuild that permits this pattern.

## 0.16.2

* Fix `process.env.NODE_ENV` substitution when transforming ([#2718](https://github.com/evanw/esbuild/issues/2718))
Expand Down
56 changes: 7 additions & 49 deletions cmd/esbuild/service.go
Expand Up @@ -43,10 +43,9 @@ type serviceType struct {
callbacks map[uint32]responseCallback
activeBuilds map[int]*activeBuild
outgoingPackets chan outgoingPacket
keepAliveWaitGroup sync.WaitGroup
keepAliveWaitGroup *helpers.ThreadSafeWaitGroup
mutex sync.Mutex
nextRequestID uint32
disableSendRequest bool
}

func (service *serviceType) getActiveBuild(key int) *activeBuild {
Expand Down Expand Up @@ -98,9 +97,10 @@ func runService(sendPings bool) {
logger.API = logger.JSAPI

service := serviceType{
callbacks: make(map[uint32]responseCallback),
activeBuilds: make(map[int]*activeBuild),
outgoingPackets: make(chan outgoingPacket),
callbacks: make(map[uint32]responseCallback),
activeBuilds: make(map[int]*activeBuild),
outgoingPackets: make(chan outgoingPacket),
keepAliveWaitGroup: helpers.MakeThreadSafeWaitGroup(),
}
buffer := make([]byte, 16*1024)
stream := []byte{}
Expand All @@ -119,34 +119,11 @@ func runService(sendPings bool) {
// The protocol always starts with the version
os.Stdout.Write(append(writeUint32(nil, uint32(len(esbuildVersion))), esbuildVersion...))

// IMPORTANT: To avoid a data race, we must ensure that calling "Add()" on a
// "WaitGroup" with a counter of zero cannot possibly happen concurrently
// with calling "Wait()" on another goroutine. Thus we must ensure that the
// counter starts off positive before "Wait()" is called and that it only
// ever reaches zero exactly once (and that the counter never goes back above
// zero once it reaches zero). See this for discussion and more information:
// https://github.com/evanw/esbuild/issues/2485#issuecomment-1299318498
// Wait for the last response to be written to stdout before returning from
// the enclosing function, which will return from "main()" and exit.
service.keepAliveWaitGroup.Add(1)
defer func() {
// Stop all future calls to "sendRequest()" from calling "Add()" on our
// "WaitGroup" while we're calling "Wait()", since it may have a counter of
// zero at that point. This is a mutex so calls to "sendRequest()" must
// fall into one of two cases:
//
// a) The critical section in "sendRequest()" comes before this critical
// section and "Add()" is called while the counter is non-zero, which
// is fine.
//
// b) The critical section in "sendRequest()" comes after this critical
// section and it does not call "Add()", which is also fine.
//
service.mutex.Lock()
service.disableSendRequest = true
service.keepAliveWaitGroup.Done()
service.mutex.Unlock()

// Wait for the last response to be written to stdout before returning from
// the enclosing function, which will return from "main()" and exit.
service.keepAliveWaitGroup.Wait()
}()

Expand Down Expand Up @@ -222,26 +199,7 @@ func (service *serviceType) sendRequest(request interface{}) interface{} {
return id
}()

// This function can be called from any thread. For example, it might be called
// by the implementation of watch or serve mode, or by esbuild's keep-alive
// timer goroutine. To avoid data races, we must ensure that it's not possible
// for "Add()" to be called on a "WaitGroup" with a counter of zero at the
// same time as "Wait()" is called on another goroutine.
//
// There's a potential data race when the stdin thread has finished reading
// from stdin because stdin has been closed but while it's calling "Wait()"
// on the "WaitGroup" with a counter of zero, some other thread calls
// "sendRequest()" which calls "Add()".
//
// This data race is prevented by not sending any more requests once the
// stdin thread has finished. This is ok because we are about to exit.
service.mutex.Lock()
if service.disableSendRequest {
service.mutex.Unlock()
return nil
}
service.keepAliveWaitGroup.Add(1) // The writer thread will call "Done()"
service.mutex.Unlock()
service.outgoingPackets <- outgoingPacket{
bytes: encodePacket(packet{
id: id,
Expand Down
37 changes: 37 additions & 0 deletions internal/helpers/waitgroup.go
@@ -0,0 +1,37 @@
package helpers

import "sync/atomic"

// ThreadSafeWaitGroup is a variant of "sync.WaitGroup" that allows "Add" to
// be called concurrently with "Wait". The standard "sync.WaitGroup" is safe
// for concurrent use in general, but it has one documented restriction: a
// call to "Add" with a positive delta that starts while the counter is zero
// must not happen concurrently with "Wait". We have a case where we would
// like to do exactly that, so this type implements the same API without that
// restriction.
//
// This implementation is deliberately simple: it assumes the counter reaches
// zero exactly once over the lifetime of the wait group and that there is at
// most one waiter, which is all we need.
type ThreadSafeWaitGroup struct {
	// Only modified via "sync/atomic" operations
	counter int32

	// Receives a single value once the counter reaches zero
	channel chan struct{}
}

// MakeThreadSafeWaitGroup returns a new wait group with a counter of zero.
func MakeThreadSafeWaitGroup() *ThreadSafeWaitGroup {
	return &ThreadSafeWaitGroup{
		// The one-element buffer lets the goroutine that brings the counter
		// to zero signal completion without blocking, even if no goroutine
		// is waiting yet (or ever)
		channel: make(chan struct{}, 1),
	}
}

// Add adds "delta" (which may be negative) to the counter. When the counter
// reaches zero, a pending or future call to "Wait" is released. If the
// counter goes negative, "Add" panics just like "sync.WaitGroup" does.
func (wg *ThreadSafeWaitGroup) Add(delta int32) {
	counter := atomic.AddInt32(&wg.counter, delta)
	switch {
	case counter == 0:
		// Use a non-blocking send so that reaching zero more than once
		// (which is outside this type's contract) cannot permanently block
		// the caller. The first signal is buffered, so "Wait" still sees it.
		select {
		case wg.channel <- struct{}{}:
		default:
		}
	case counter < 0:
		panic("sync: negative WaitGroup counter")
	}
}

// Done decrements the counter by one.
func (wg *ThreadSafeWaitGroup) Done() {
	wg.Add(-1)
}

// Wait blocks until the counter reaches zero. At most one goroutine may
// call "Wait".
func (wg *ThreadSafeWaitGroup) Wait() {
	<-wg.channel
}
48 changes: 48 additions & 0 deletions scripts/js-api-tests.js
@@ -1,4 +1,5 @@
const { installForTests, removeRecursiveSync, writeFileAtomic } = require('./esbuild')
const child_process = require('child_process')
const assert = require('assert')
const path = require('path')
const http = require('http')
Expand Down Expand Up @@ -5797,6 +5798,52 @@ ${path.relative(process.cwd(), input).replace(/\\/g, '/')}:1:2: ERROR: Unexpecte
},
}

// Tests that exercise esbuild's JS API from inside a child node process.
// These exist to catch hangs that only reproduce with particular stdio
// configurations of the parent process.
let childProcessTests = {
  // More info about this test case: https://github.com/evanw/esbuild/issues/2727
  async testIncrementalChildProcessExit({ testDir, esbuild }) {
    const file = path.join(testDir, 'build.js')

    // Write a script that starts an incremental build (which keeps the
    // esbuild child process alive for future rebuilds) and then calls
    // "process.exit()" WITHOUT calling "dispose()" on the rebuild object.
    // Before the fix, this combination could leave the esbuild process
    // running and hang the enclosing process tree.
    await writeFileAsync(file, `
const esbuild = require(${JSON.stringify(esbuild.ESBUILD_PACKAGE_PATH)})
esbuild.build({
entryPoints: [],
incremental: true,
}).then(() => {
console.log('success')
process.exit(0)
})
`)

    // Fail the test after 5 minutes if the child process never exits. The
    // timer handle is kept so it can be cleared on the success path below.
    let timeout
    const detectHangPromise = new Promise((_, reject) => {
      timeout = setTimeout(() => {
        reject(new Error('Timed out waiting for keep-alive check to terminate'))
      }, 5 * 60 * 1000)
    })

    // Run the script in a child node process and verify it exits cleanly
    // with the expected output and nothing on stderr.
    const testKeepAlivePingPromise = new Promise((resolve, reject) => {
      // NOTE(review): "execFile" forwards only a whitelist of options to
      // "spawn" and does not document "stdio", so this array may be ignored;
      // "execFile" always pipes stdout/stderr itself, which is the condition
      // the original hang required — confirm this option has any effect.
      child_process.execFile('node', [file], {
        stdio: [
          'inherit',
          'inherit',
          'pipe', // This is important for the test to check for the hang
        ],
      }, (error, stdout, stderr) => {
        // Reaching this callback at all means the child exited (no hang)
        clearTimeout(timeout)
        if (error) reject(error)
        else if (stdout !== 'success\n') reject(new Error('Unexpected stdout: ' + JSON.stringify(stdout)))
        else if (stderr !== '') reject(new Error('Unexpected stderr: ' + JSON.stringify(stderr)))
        else resolve()
      })
    })

    // Whichever settles first wins: a clean child exit or the hang timeout
    await Promise.race([
      detectHangPromise,
      testKeepAlivePingPromise,
    ])
  },
}

async function assertSourceMap(jsSourceMap, source) {
jsSourceMap = JSON.parse(jsSourceMap)
assert.deepStrictEqual(jsSourceMap.version, 3)
Expand Down Expand Up @@ -5840,6 +5887,7 @@ async function main() {
...Object.entries(formatTests),
...Object.entries(analyzeTests),
...Object.entries(syncTests),
...Object.entries(childProcessTests),
]
let allTestsPassed = (await Promise.all(tests.map(runTest))).every(success => success)

Expand Down
3 changes: 2 additions & 1 deletion scripts/test-yarnpnp.js
Expand Up @@ -78,10 +78,11 @@ function runTests() {
}

const minutes = 10
setTimeout(() => {
const timeout = setTimeout(() => {
console.error(`❌ Yarn PnP tests timed out after ${minutes} minutes`)
process.exit(1)
}, minutes * 60 * 1000)

reinstallYarnIfNeeded()
runTests()
clearTimeout(timeout)

0 comments on commit 8648239

Please sign in to comment.