From 05262c8778b068d5a6be0044ad83d6c775e1f8c9 Mon Sep 17 00:00:00 2001
From: Gustavo Niemeyer <gustavo@niemeyer.net>
Date: Thu, 10 Oct 2019 10:08:21 +0100
Subject: [PATCH] Improve heuristics preventing CPU/memory abuse (#515)

This is a forward port of v2 commit f221b84 by Jordan Liggitt.
---
 decode.go     |  31 +++++++++++++-
 limit_test.go | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++
 scannerc.go   |  16 +++++++
 3 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 limit_test.go

diff --git a/decode.go b/decode.go
index 7c15d987..be63169b 100644
--- a/decode.go
+++ b/decode.go
@@ -439,12 +439,41 @@ func (d *decoder) fieldByIndex(n *Node, v reflect.Value, index []int) (field ref
 	return v
 }
 
+const (
+	// At or below 400,000 decode operations (~500kb of dense object declarations, or
+	// ~5kb with 10000% alias expansion) allowedAliasRatio returns its maximum, 0.99.
+	alias_ratio_range_low = 400000
+
+	// At or above 4,000,000 decode operations (~5MB of dense object declarations, or
+	// ~4.5MB with 10% alias expansion) allowedAliasRatio returns its minimum, 0.10.
+	alias_ratio_range_high = 4000000
+
+	// alias_ratio_range is the width of that interval as a float64; allowedAliasRatio divides by it when scaling
+	alias_ratio_range = float64(alias_ratio_range_high - alias_ratio_range_low)
+)
+
+// allowedAliasRatio returns the highest aliasCount/decodeCount ratio the decoder tolerates after decodeCount
+// decode operations; unmarshal fails with "document contains excessive aliasing" when the ratio is exceeded.
+func allowedAliasRatio(decodeCount int) float64 {
+	switch {
+	case decodeCount <= alias_ratio_range_low:
+		// allow 99% to come from alias expansion for small-to-medium documents
+		return 0.99
+	case decodeCount >= alias_ratio_range_high:
+		// allow 10% to come from alias expansion for very large documents
+		return 0.10
+	default:
+		// linear from 0.99 to 0.10; the ends allow 0.99*400,000 = 396,000 and 0.10*4,000,000 = 400,000 alias decodes (original note: 400,000 decodes is ~100MB worst case, single-item maps).
+		return 0.99 - 0.89*(float64(decodeCount-alias_ratio_range_low)/alias_ratio_range)
+	}
+}
+
 func (d *decoder) unmarshal(n *Node, out reflect.Value) (good bool) {
 	d.decodeCount++
 	if d.aliasDepth > 0 {
 		d.aliasCount++
 	}
-	if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > 0.99 {
+	if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > allowedAliasRatio(d.decodeCount) {
 		failf("document contains excessive aliasing")
 	}
 	if out.Type() == nodeType {
diff --git a/limit_test.go b/limit_test.go
new file mode 100644
index 00000000..ba1c0800
--- /dev/null
+++ b/limit_test.go
@@ -0,0 +1,113 @@
+package yaml_test
+
+import (
+	"strings"
+	"testing"
+
+	. "gopkg.in/check.v1"
+	"gopkg.in/yaml.v2"
+)
+
+var limitTests = []struct {
+	name  string // label reported on failure; benchmark selects a case by it
+	data  []byte // YAML document handed to yaml.Unmarshal
+	error string // expected error text; empty means decoding must succeed
+}{
+	{
+		name:  "1000kb of maps with 100 aliases",
+		data:  []byte(`{a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-100) + `], b: &b [*a` + strings.Repeat(`,*a`, 99) + `]}`),
+		error: "yaml: document contains excessive aliasing",
+	}, {
+		name:  "1000kb of deeply nested slices",
+		data:  []byte(strings.Repeat(`[`, 1000*1024)),
+		error: "yaml: exceeded max depth of 10000",
+	}, {
+		name:  "1000kb of deeply nested maps",
+		data:  []byte("x: " + strings.Repeat(`{`, 1000*1024)),
+		error: "yaml: exceeded max depth of 10000",
+	}, {
+		name:  "1000kb of deeply nested indents",
+		data:  []byte(strings.Repeat(`- `, 1000*1024)),
+		error: "yaml: exceeded max depth of 10000",
+	}, {
+		name: "1000kb of 1000-indent lines",
+		data: []byte(strings.Repeat(strings.Repeat(`- `, 1000)+"\n", 1024/2)),
+	},
+	{name: "1kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1*1024/4-1) + `]`)},
+	{name: "10kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 10*1024/4-1) + `]`)},
+	{name: "100kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 100*1024/4-1) + `]`)},
+	{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
+}
+
+func (s *S) TestLimits(c *C) {
+	if testing.Short() {
+		return // the limit cases are skipped entirely under -short
+	}
+	for _, tc := range limitTests {
+		var v interface{} // decode target; only the returned error is checked
+		err := yaml.Unmarshal(tc.data, &v)
+		if len(tc.error) > 0 { // a non-empty error field means the document must be rejected
+			c.Assert(err, ErrorMatches, tc.error, Commentf("testcase: %s", tc.name))
+		} else {
+			c.Assert(err, IsNil, Commentf("testcase: %s", tc.name))
+		}
+	}
+}
+
+func Benchmark1000KB100Aliases(b *testing.B) {
+	benchmark(b, "1000kb of maps with 100 aliases") // case expects the excessive-aliasing error
+}
+func Benchmark1000KBDeeplyNestedSlices(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested slices") // case expects the max-depth error
+}
+func Benchmark1000KBDeeplyNestedMaps(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested maps") // case expects the max-depth error
+}
+func Benchmark1000KBDeeplyNestedIndents(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested indents") // case expects the max-depth error
+}
+func Benchmark1000KB1000IndentLines(b *testing.B) {
+	benchmark(b, "1000kb of 1000-indent lines") // case expects a successful decode
+}
+func Benchmark1KBMaps(b *testing.B) {
+	benchmark(b, "1kb of maps") // case expects a successful decode
+}
+func Benchmark10KBMaps(b *testing.B) {
+	benchmark(b, "10kb of maps") // case expects a successful decode
+}
+func Benchmark100KBMaps(b *testing.B) {
+	benchmark(b, "100kb of maps") // case expects a successful decode
+}
+func Benchmark1000KBMaps(b *testing.B) {
+	benchmark(b, "1000kb of maps") // case expects a successful decode
+}
+
+func benchmark(b *testing.B, name string) {
+	for _, t := range limitTests { // look up the limitTests entry called name
+		if t.name != name {
+			continue
+		}
+
+		b.ResetTimer() // keep the lookup above out of the measurement
+
+		for i := 0; i < b.N; i++ {
+			var v interface{} // decode target; only the returned error is checked
+			err := yaml.Unmarshal(t.data, &v)
+			if len(t.error) > 0 {
+				if err == nil {
+					b.Errorf("expected error, got none")
+				} else if err.Error() != t.error { // exact string comparison against the expected text
+					b.Errorf("expected error '%s', got '%s'", t.error, err.Error())
+				}
+			} else {
+				if err != nil {
+					b.Errorf("unexpected error: %v", err)
+				}
+			}
+		}
+
+		return // the matching case has been run; skip the not-found report below
+	}
+
+	b.Errorf("testcase %q not found", name)
+}
diff --git a/scannerc.go b/scannerc.go
index 4a9c6564..e33f4959 100644
--- a/scannerc.go
+++ b/scannerc.go
@@ -961,6 +961,9 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
 	return true
 }
 
+// max_flow_level is the largest parser.flow_level allowed; yaml_parser_increase_flow_level reports a scanner error beyond it
+const max_flow_level = 10000
+
 // Increase the flow level and resize the simple key list if needed.
 func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 	// Reset the simple key on the next level.
@@ -968,6 +971,11 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 
 	// Increase the flow level.
 	parser.flow_level++
+	if parser.flow_level > max_flow_level {
+		return yaml_parser_set_scanner_error(parser,
+			"while increasing flow level", parser.simple_keys[len(parser.simple_keys)-1].mark,
+			fmt.Sprintf("exceeded max depth of %d", max_flow_level))
+	}
 	return true
 }
 
@@ -980,6 +988,9 @@ func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
 	return true
 }
 
+// max_indents is the largest len(parser.indents) allowed; yaml_parser_roll_indent reports a scanner error beyond it
+const max_indents = 10000
+
 // Push the current indentation level to the stack and set the new level
 // the current column is greater than the indentation level.  In this case,
 // append or insert the specified token into the token queue.
@@ -994,6 +1005,11 @@ func yaml_parser_roll_indent(parser *yaml_parser_t, column, number int, typ yaml
 		// indentation level.
 		parser.indents = append(parser.indents, parser.indent)
 		parser.indent = column
+		if len(parser.indents) > max_indents {
+			return yaml_parser_set_scanner_error(parser,
+				"while increasing indent level", parser.simple_keys[len(parser.simple_keys)-1].mark,
+				fmt.Sprintf("exceeded max depth of %d", max_indents))
+		}
 
 		// Create a token and insert it into the queue.
 		token := yaml_token_t{