Optimize AddCaller+AddStacktrace by sharing stacks (#1052)
(Supersedes #808.)

If we use the `AddCaller()` option for a logger with
`AddStacktrace(level)`, which captures a stack trace for entries at the
specified level or higher, Zap currently inspects the stack twice:

- `getCallerFrame` retrieves information about the immediate caller of
  the log entry
- `StackSkip` calls `takeStacktrace` to capture the entire stack trace
  and build a string from it

When both the caller and a stack trace are requested,
we can avoid redundant work by sharing information about the call stack
between the two code paths.
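
For reference, a minimal setup that hits this path (the core and sink here
are illustrative; `AddCaller` and `AddStacktrace` are the real options):

```go
package main

import (
	"os"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

func main() {
	core := zapcore.NewCore(
		zapcore.NewJSONEncoder(zap.NewProductionEncoderConfig()),
		zapcore.Lock(os.Stderr),
		zapcore.InfoLevel,
	)
	// Every entry records its caller; entries at Warn or above also
	// record a full stack trace.
	logger := zap.New(core, zap.AddCaller(), zap.AddStacktrace(zapcore.WarnLevel))
	defer logger.Sync()

	// Before this change, this call walked the stack twice: once for
	// the caller and once more for the stack trace.
	logger.Warn("caller and stacktrace are both captured")
}
```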

To accomplish this, the following high-level pieces were added:

    type stacktrace
      func captureStacktrace(skip int, depth stacktraceDepth) *stacktrace
      func (*stacktrace) Next() (runtime.Frame, bool)

    type stackFormatter
      func newStackFormatter(*buffer.Buffer)
      func (*stackFormatter) FormatStack(*stacktrace)
      func (*stackFormatter) FormatFrame(runtime.Frame)

`captureStacktrace` captures a stack trace (either just the first frame or
the entire stack) that can then be inspected manually (to extract caller
information) and formatted into a string representation, either one frame
at a time or wholesale, with `stackFormatter`.

This allows us to reorganize the logic that applies the AddCaller and
AddStacktrace options so that it can (see the sketch after this list):

- capture as much of the stack as it needs
- extract caller information from the first frame if we need it
- format that frame and the remaining frames into a stack trace
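
A rough sketch of that flow with the identifiers above (package-internal
code paraphrased from the `logger.go` diff below, not a public API):

```go
// Inside (*Logger).check, once we know a caller and/or stack is wanted:
stack := captureStacktrace(log.callerSkip+callerSkipOffset, stackDepth)
defer stack.Free()

frame, more := stack.Next() // the first frame doubles as the caller

if log.addCaller {
	// ... populate ce.Caller from frame.PC/File/Line/Function ...
}

if addStack {
	buffer := bufferpool.Get()
	defer buffer.Free()

	stackfmt := newStackFormatter(buffer)
	stackfmt.FormatFrame(frame) // the frame we already consumed
	if more {
		stackfmt.FormatStack(stack) // then the rest, wholesale
	}
	ce.Stack = buffer.String()
}
```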

---

Benchmarking:
I ran the new benchmark included in this PR, as well as the existing
stack trace and caller benchmarks, 10 times for 10 seconds each.
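
(The tables below are in benchstat's comparison format; presumably each run
was saved to a file and the two compared with benchstat from
golang.org/x/perf, roughly:)

```
% go test -run '^$' -bench 'Caller|Stack' -benchmem -benchtime 10s -count 10 | tee old.txt  # on the base commit
% go test -run '^$' -bench 'Caller|Stack' -benchmem -benchtime 10s -count 10 | tee new.txt  # with this change
% benchstat old.txt new.txt
```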

```
% go test -run '^$' -bench 'Caller|Stack' -benchmem -benchtime 10s -count 10
[...]
name                       old time/op    new time/op    delta
StackField-96                 834ns ± 8%     819ns ±14%     ~     (p=0.971 n=10+10)
AddCallerHook-96              466ns ± 7%     480ns ± 1%     ~     (p=0.118 n=10+6)
AddCallerAndStacktrace-96    1.16µs ± 4%    0.78µs ± 8%  -32.60%  (p=0.000 n=9+10)
TakeStacktrace-96            2.66µs ± 2%    2.74µs ± 2%   +3.01%  (p=0.000 n=9+10)

name                       old alloc/op   new alloc/op   delta
StackField-96                  542B ± 0%      544B ± 0%   +0.33%  (p=0.000 n=10+10)
AddCallerHook-96               254B ± 0%      248B ± 0%   -2.36%  (p=0.000 n=10+10)
AddCallerAndStacktrace-96      652B ± 0%      429B ± 0%  -34.20%  (p=0.000 n=9+6)
TakeStacktrace-96              420B ± 0%      421B ± 0%   +0.24%  (p=0.000 n=10+10)

name                       old allocs/op  new allocs/op  delta
StackField-96                  3.00 ± 0%      3.00 ± 0%     ~     (all equal)
AddCallerHook-96               3.00 ± 0%      2.00 ± 0%  -33.33%  (p=0.000 n=10+10)
AddCallerAndStacktrace-96      5.00 ± 0%      3.00 ± 0%  -40.00%  (p=0.000 n=10+10)
TakeStacktrace-96              2.00 ± 0%      2.00 ± 0%     ~     (all equal)
```

Summary:

- AddCaller+AddStacktrace takes 2 fewer allocations and is ~30% faster
- AddCaller costs one allocation less
- takeStacktrace costs 3% more CPU

I think that the overall improvement is worth the slightly more expensive stack traces.

Refs GO-1181
abhinav committed Jan 27, 2022
1 parent 3f20106 commit a1155d9
Showing 4 changed files with 224 additions and 68 deletions.
67 changes: 41 additions & 26 deletions logger.go
@@ -24,9 +24,9 @@ import (
"fmt"
"io/ioutil"
"os"
"runtime"
"strings"

"go.uber.org/zap/internal/bufferpool"
"go.uber.org/zap/zapcore"
)

@@ -259,8 +259,10 @@ func (log *Logger) clone() *Logger {
}

func (log *Logger) check(lvl zapcore.Level, msg string) *zapcore.CheckedEntry {
// check must always be called directly by a method in the Logger interface
// (e.g., Check, Info, Fatal).
// Logger.check must always be called directly by a method in the
// Logger interface (e.g., Check, Info, Fatal).
// This skips Logger.check and the Info/Fatal/Check/etc. method that
// called it.
const callerSkipOffset = 2

// Check the level first to reduce the cost of disabled log calls.
@@ -307,42 +309,55 @@ func (log *Logger) check(lvl zapcore.Level, msg string) *zapcore.CheckedEntry {

// Thread the error output through to the CheckedEntry.
ce.ErrorOutput = log.errorOutput
if log.addCaller {
frame, defined := getCallerFrame(log.callerSkip + callerSkipOffset)
if !defined {

addStack := log.addStack.Enabled(ce.Level)
if !log.addCaller && !addStack {
return ce
}

// Adding the caller or stack trace requires capturing the callers of
// this function. We'll share information between these two.
stackDepth := stacktraceFirst
if addStack {
stackDepth = stacktraceFull
}
stack := captureStacktrace(log.callerSkip+callerSkipOffset, stackDepth)
defer stack.Free()

if stack.Count() == 0 {
if log.addCaller {
fmt.Fprintf(log.errorOutput, "%v Logger.check error: failed to get caller\n", ent.Time.UTC())
log.errorOutput.Sync()
}
return ce
}

frame, more := stack.Next()

if log.addCaller {
ce.Caller = zapcore.EntryCaller{
Defined: defined,
Defined: frame.PC != 0,
PC: frame.PC,
File: frame.File,
Line: frame.Line,
Function: frame.Function,
}
}
if log.addStack.Enabled(ce.Level) {
ce.Stack = StackSkip("", log.callerSkip+callerSkipOffset).String
}

return ce
}
if addStack {
buffer := bufferpool.Get()
defer buffer.Free()

// getCallerFrame gets caller frame. The argument skip is the number of stack
// frames to ascend, with 0 identifying the caller of getCallerFrame. The
// boolean ok is false if it was not possible to recover the information.
//
// Note: This implementation is similar to runtime.Caller, but it returns the whole frame.
func getCallerFrame(skip int) (frame runtime.Frame, ok bool) {
const skipOffset = 2 // skip getCallerFrame and Callers

pc := make([]uintptr, 1)
numFrames := runtime.Callers(skip+skipOffset, pc)
if numFrames < 1 {
return
stackfmt := newStackFormatter(buffer)

// We've already extracted the first frame, so format that
// separately and defer to stackfmt for the rest.
stackfmt.FormatFrame(frame)
if more {
stackfmt.FormatStack(stack)
}
ce.Stack = buffer.String()
}

frame, _ = runtime.CallersFrames(pc).Next()
return frame, frame.PC != 0
return ce
}
18 changes: 18 additions & 0 deletions logger_bench_test.go
@@ -179,6 +179,24 @@ func BenchmarkAddCallerHook(b *testing.B) {
})
}

func BenchmarkAddCallerAndStacktrace(b *testing.B) {
logger := New(
zapcore.NewCore(
zapcore.NewJSONEncoder(NewProductionConfig().EncoderConfig),
&ztest.Discarder{},
InfoLevel,
),
AddCaller(),
AddStacktrace(WarnLevel),
)
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
logger.Warn("Caller and stacktrace.")
}
})
}

func Benchmark10Fields(b *testing.B) {
withBenchedLogger(b, func(log *Logger) {
log.Info("Ten fields, passed at the log site.",
175 changes: 133 additions & 42 deletions stacktrace.go
@@ -24,62 +24,153 @@ import (
"runtime"
"sync"

"go.uber.org/zap/buffer"
"go.uber.org/zap/internal/bufferpool"
)

var (
_stacktracePool = sync.Pool{
New: func() interface{} {
return newProgramCounters(64)
},
}
var _stacktracePool = sync.Pool{
New: func() interface{} {
return &stacktrace{
storage: make([]uintptr, 64),
}
},
}

type stacktrace struct {
pcs []uintptr // program counters; always a subslice of storage
frames *runtime.Frames

// The size of pcs varies depending on requirements:
// it will be one if only the first frame was requested,
// and otherwise it will reflect the depth of the call stack.
//
// storage decouples the slice we need (pcs) from the slice we pool.
// We will always allocate a reasonably large storage, but we'll use
// only as much of it as we need.
storage []uintptr
}

// stacktraceDepth specifies how deep of a stack trace should be captured.
type stacktraceDepth int

const (
// stacktraceFirst captures only the first frame.
stacktraceFirst stacktraceDepth = iota

// stacktraceFull captures the entire call stack, allocating more
// storage for it if needed.
stacktraceFull
)

func takeStacktrace(skip int) string {
buffer := bufferpool.Get()
defer buffer.Free()
programCounters := _stacktracePool.Get().(*programCounters)
defer _stacktracePool.Put(programCounters)

var numFrames int
for {
// Skip the call to runtime.Callers and takeStacktrace so that the
// program counters start at the caller of takeStacktrace.
numFrames = runtime.Callers(skip+2, programCounters.pcs)
if numFrames < len(programCounters.pcs) {
break
}
// Don't put the too-short counter slice back into the pool; this lets
// the pool adjust if we consistently take deep stacktraces.
programCounters = newProgramCounters(len(programCounters.pcs) * 2)
// captureStacktrace captures a stack trace of the specified depth, skipping
// the provided number of frames. skip=0 identifies the caller of
// captureStacktrace.
//
// The caller must call Free on the returned stacktrace after using it.
func captureStacktrace(skip int, depth stacktraceDepth) *stacktrace {
stack := _stacktracePool.Get().(*stacktrace)

switch depth {
case stacktraceFirst:
stack.pcs = stack.storage[:1]
case stacktraceFull:
stack.pcs = stack.storage
}

i := 0
frames := runtime.CallersFrames(programCounters.pcs[:numFrames])
// Unlike other "skip"-based APIs, skip=0 identifies runtime.Callers
// itself. +2 to skip captureStacktrace and runtime.Callers.
numFrames := runtime.Callers(
skip+2,
stack.pcs,
)

// Note: On the last iteration, frames.Next() returns false, with a valid
// frame, but we ignore this frame. The last frame is a runtime frame which
// adds noise, since it's only either runtime.main or runtime.goexit.
for frame, more := frames.Next(); more; frame, more = frames.Next() {
if i != 0 {
buffer.AppendByte('\n')
// runtime.Callers truncates the recorded stacktrace if there is no
// room in the provided slice. For the full stack trace, keep expanding
// storage until there are fewer frames than there is room.
if depth == stacktraceFull {
pcs := stack.pcs
for numFrames == len(pcs) {
pcs = make([]uintptr, len(pcs)*2)
numFrames = runtime.Callers(skip+2, pcs)
}
i++
buffer.AppendString(frame.Function)
buffer.AppendByte('\n')
buffer.AppendByte('\t')
buffer.AppendString(frame.File)
buffer.AppendByte(':')
buffer.AppendInt(int64(frame.Line))

// Discard old storage instead of returning it to the pool.
// This will adjust the pool size over time if stack traces are
// consistently very deep.
stack.storage = pcs
stack.pcs = pcs[:numFrames]
} else {
stack.pcs = stack.pcs[:numFrames]
}

stack.frames = runtime.CallersFrames(stack.pcs)
return stack
}

// Free releases resources associated with this stacktrace
// and returns it to the pool.
func (st *stacktrace) Free() {
st.frames = nil
st.pcs = nil
_stacktracePool.Put(st)
}

// Count reports the total number of frames in this stacktrace.
// Count DOES NOT change as Next is called.
func (st *stacktrace) Count() int {
return len(st.pcs)
}

// Next returns the next frame in the stack trace,
// and a boolean indicating whether there are more after it.
func (st *stacktrace) Next() (_ runtime.Frame, more bool) {
return st.frames.Next()
}

func takeStacktrace(skip int) string {
stack := captureStacktrace(skip+1, stacktraceFull)
defer stack.Free()

buffer := bufferpool.Get()
defer buffer.Free()

stackfmt := newStackFormatter(buffer)
stackfmt.FormatStack(stack)
return buffer.String()
}

type programCounters struct {
pcs []uintptr
// stackFormatter formats a stack trace into a readable string representation.
type stackFormatter struct {
b *buffer.Buffer
nonEmpty bool // whether we've written at least one frame already
}

// newStackFormatter builds a new stackFormatter.
func newStackFormatter(b *buffer.Buffer) stackFormatter {
return stackFormatter{b: b}
}

// FormatStack formats all remaining frames in the provided stacktrace -- minus
// the final runtime.main/runtime.goexit frame.
func (sf *stackFormatter) FormatStack(stack *stacktrace) {
// Note: On the last iteration, frames.Next() returns false, with a valid
// frame, but we ignore this frame. The last frame is a runtime frame which
// adds noise, since it's only either runtime.main or runtime.goexit.
for frame, more := stack.Next(); more; frame, more = stack.Next() {
sf.FormatFrame(frame)
}
}

func newProgramCounters(size int) *programCounters {
return &programCounters{make([]uintptr, size)}
// FormatFrame formats the given frame.
func (sf *stackFormatter) FormatFrame(frame runtime.Frame) {
if sf.nonEmpty {
sf.b.AppendByte('\n')
}
sf.nonEmpty = true
sf.b.AppendString(frame.Function)
sf.b.AppendByte('\n')
sf.b.AppendByte('\t')
sf.b.AppendString(frame.File)
sf.b.AppendByte(':')
sf.b.AppendInt(int64(frame.Line))
}
32 changes: 32 additions & 0 deletions stacktrace_test.go
@@ -21,6 +21,7 @@
package zap

import (
"bytes"
"strings"
"testing"

@@ -67,8 +68,39 @@ func TestTakeStacktraceWithSkipInnerFunc(t *testing.T) {
)
}

func TestTakeStacktraceDeepStack(t *testing.T) {
const (
N = 500
withStackDepthName = "go.uber.org/zap.withStackDepth"
)
withStackDepth(N, func() {
trace := takeStacktrace(0)
for found := 0; found < N; found++ {
i := strings.Index(trace, withStackDepthName)
if i < 0 {
t.Fatalf(`expected %v occurrences of %q, found %d`,
N, withStackDepthName, found)
}
trace = trace[i+len(withStackDepthName):]
}
})
}

func BenchmarkTakeStacktrace(b *testing.B) {
for i := 0; i < b.N; i++ {
takeStacktrace(0)
}
}

// withStackDepth calls f after growing the call stack by (at least) depth
// frames, recursing indirectly through bytes.Map so the recursion cannot
// be inlined away.
func withStackDepth(depth int, f func()) {
var recurse func(rune) rune
recurse = func(r rune) rune {
if r > 0 {
bytes.Map(recurse, []byte(string([]rune{r - 1})))
} else {
f()
}
return 0
}
recurse(rune(depth))
}
