Skip to content

Commit

Permalink
attempted fix for #2485: Add and Wait safety
Browse files Browse the repository at this point in the history
  • Loading branch information
evanw committed Nov 3, 2022
1 parent 4f9694b commit 78d6194
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 10 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Expand Up @@ -28,6 +28,12 @@

This discrepancy between esbuild's path resolution layer and its watch mode was causing watch mode to rebuild continuously on Android. With this release, esbuild's watch mode instead checks for an error status change in the `readdir` file system call, so watch mode should no longer rebuild continuously on Android.

* Apply a fix for a rare deadlock with the JavaScript API ([#1842](https://github.com/evanw/esbuild/issues/1842), [#2485](https://github.com/evanw/esbuild/issues/2485))

There have been reports of esbuild sometimes exiting with an "all goroutines are asleep" deadlock message from the Go language runtime. This issue hasn't made much progress until recently, where a possible cause was discovered (thanks to [@jfirebaugh](https://github.com/jfirebaugh) for the investigation). This release contains a possible fix for that possible cause, so this deadlock may have been fixed. The fix cannot be easily verified because the deadlock is non-deterministic and rare. If this was indeed the cause, then this issue only affected the JavaScript API in situations where esbuild was already in the process of exiting.

In detail: The underlying cause is that Go's [`sync.WaitGroup`](https://pkg.go.dev/sync#WaitGroup) API for waiting for a set of goroutines to finish is not fully thread-safe. Specifically it's not safe to call `Add()` concurrently with `Wait()` when the wait group counter is zero due to a data race. This situation could come up with esbuild's JavaScript API when the host JavaScript process closes the child process's stdin and the child process (with no active tasks) calls `Wait()` to check that there are no active tasks, at the same time as esbuild's watchdog timer calls `Add()` to add an active task (that pings the host to see if it's still there). The fix in this release is to avoid calling `Add()` once we learn that stdin has been closed but before we call `Wait()`.

## 0.15.12

* Fix minifier correctness bug with single-use substitutions ([#2619](https://github.com/evanw/esbuild/issues/2619))
Expand Down
78 changes: 68 additions & 10 deletions cmd/esbuild/service.go
Expand Up @@ -46,6 +46,7 @@ type serviceType struct {
keepAliveWaitGroup sync.WaitGroup
mutex sync.Mutex
nextRequestID uint32
disableSendRequest bool
}

func (service *serviceType) getActiveBuild(key int) *activeBuild {
Expand Down Expand Up @@ -107,10 +108,7 @@ func runService(sendPings bool) {
// Write packets on a single goroutine so they aren't interleaved
go func() {
for {
packet, ok := <-service.outgoingPackets
if !ok {
break // No more packets
}
packet := <-service.outgoingPackets
if _, err := os.Stdout.Write(packet.bytes); err != nil {
os.Exit(1) // I/O error
}
Expand All @@ -121,6 +119,37 @@ func runService(sendPings bool) {
// The protocol always starts with the version
os.Stdout.Write(append(writeUint32(nil, uint32(len(esbuildVersion))), esbuildVersion...))

// IMPORTANT: To avoid a data race, we must ensure that calling "Add()" on a
// "WaitGroup" with a counter of zero cannot possibly happen concurrently
// with calling "Wait()" on another goroutine. Thus we must ensure that the
// counter starts off positive before "Wait()" is called and that it only
// ever reaches zero exactly once (and that the counter never goes back above
// zero once it reaches zero). See this for discussion and more information:
// https://github.com/evanw/esbuild/issues/2485#issuecomment-1299318498
service.keepAliveWaitGroup.Add(1)
defer func() {
// Stop all future calls to "sendRequest()" from calling "Add()" on our
// "WaitGroup" while we're calling "Wait()", since it may have a counter of
// zero at that point. This is a mutex so calls to "sendRequest()" must
// fall into one of two cases:
//
// a) The critical section in "sendRequest()" comes before this critical
// section and "Add()" is called while the counter is non-zero, which
// is fine.
//
// b) The critical section in "sendRequest()" comes after this critical
// section and it does not call "Add()", which is also fine.
//
service.mutex.Lock()
service.disableSendRequest = true
service.keepAliveWaitGroup.Done()
service.mutex.Unlock()

// Wait for the last response to be written to stdout before returning from
// the enclosing function, which will return from "main()" and exit.
service.keepAliveWaitGroup.Wait()
}()

// Periodically ping the host even when we're idle. This will catch cases
// where the host has disappeared and will never send us anything else but
// we incorrectly think we are still needed. In that case we will now try
Expand Down Expand Up @@ -172,11 +201,11 @@ func runService(sendPings bool) {
// Move the remaining partial packet to the end to avoid reallocating
stream = append(stream[:0], bytes...)
}

// Wait for the last response to be written to stdout
service.keepAliveWaitGroup.Wait()
}

// This will either block until the request has been sent and a response has
// been received, or it will return nil to indicate failure to send due to
// stdin being closed.
func (service *serviceType) sendRequest(request interface{}) interface{} {
result := make(chan interface{})
var id uint32
Expand All @@ -192,7 +221,27 @@ func (service *serviceType) sendRequest(request interface{}) interface{} {
service.callbacks[id] = callback
return id
}()

// This function can be called from any thread. For example, it might be called
// by the implementation of watch or serve mode, or by esbuild's keep-alive
// timer goroutine. To avoid data races, we must ensure that it's not possible
// for "Add()" to be called on a "WaitGroup" with a counter of zero at the
// same time as "Wait()" is called on another goroutine.
//
// There's a potential data race when the stdin thread has finished reading
// from stdin because stdin has been closed but while it's calling "Wait()"
// on the "WaitGroup" with a counter of zero, some other thread calls
// "sendRequest()" which calls "Add()".
//
// This data race is prevented by not sending any more requests once the
// stdin thread has finished. This is ok because we are about to exit.
service.mutex.Lock()
if service.disableSendRequest {
service.mutex.Unlock()
return nil
}
service.keepAliveWaitGroup.Add(1) // The writer thread will call "Done()"
service.mutex.Unlock()
service.outgoingPackets <- outgoingPacket{
bytes: encodePacket(packet{
id: id,
Expand Down Expand Up @@ -749,10 +798,13 @@ func (service *serviceType) convertPlugins(key int, jsPlugins interface{}, activ
build.OnStart(func() (api.OnStartResult, error) {
result := api.OnStartResult{}

response := service.sendRequest(map[string]interface{}{
response, ok := service.sendRequest(map[string]interface{}{
"command": "on-start",
"key": key,
}).(map[string]interface{})
if !ok {
return result, errors.New("The service was stopped")
}

if value, ok := response["errors"]; ok {
result.Errors = decodeMessages(value.([]interface{}))
Expand All @@ -778,7 +830,7 @@ func (service *serviceType) convertPlugins(key int, jsPlugins interface{}, activ
return result, nil
}

response := service.sendRequest(map[string]interface{}{
response, ok := service.sendRequest(map[string]interface{}{
"command": "on-resolve",
"key": key,
"ids": ids,
Expand All @@ -789,6 +841,9 @@ func (service *serviceType) convertPlugins(key int, jsPlugins interface{}, activ
"kind": resolveKindToString(args.Kind),
"pluginData": args.PluginData,
}).(map[string]interface{})
if !ok {
return result, errors.New("The service was stopped")
}

if value, ok := response["id"]; ok {
id := value.(int)
Expand Down Expand Up @@ -857,7 +912,7 @@ func (service *serviceType) convertPlugins(key int, jsPlugins interface{}, activ
return result, nil
}

response := service.sendRequest(map[string]interface{}{
response, ok := service.sendRequest(map[string]interface{}{
"command": "on-load",
"key": key,
"ids": ids,
Expand All @@ -866,6 +921,9 @@ func (service *serviceType) convertPlugins(key int, jsPlugins interface{}, activ
"suffix": args.Suffix,
"pluginData": args.PluginData,
}).(map[string]interface{})
if !ok {
return result, errors.New("The service was stopped")
}

if value, ok := response["id"]; ok {
id := value.(int)
Expand Down

0 comments on commit 78d6194

Please sign in to comment.