diff --git a/e2e/newenemy/newenemy_test.go b/e2e/newenemy/newenemy_test.go index 958072f227..440dd2bb7e 100644 --- a/e2e/newenemy/newenemy_test.go +++ b/e2e/newenemy/newenemy_test.go @@ -42,6 +42,17 @@ definition {{.}}/resource { {{ end }} ` +const schemaAllowAllText = ` +{{ range .Prefixes }} +definition {{.}}/user {} +definition {{.}}/resource { + relation direct: {{.}}/user + relation excluded: {{.}}/user + permission allowed = direct +} +{{ end }} +` + const ( objIDRegex = "[a-zA-Z0-9_][a-zA-Z0-9/_-]{0,127}" namespacePrefixRegex = "[a-z][a-z0-9_]{2,62}[a-z0-9]" @@ -50,8 +61,10 @@ const ( var ( maxIterations = flag.Int("max-iterations", 1000, "iteration cap for statistic-based tests (0 for no limit)") - schemaTpl = template.Must(template.New("schema").Parse(schemaText)) - testCtx context.Context + schemaTpl = template.Must(template.New("schema").Parse(schemaText)) + schemaAllowTpl = template.Must(template.New("schema_allow").Parse(schemaAllowAllText)) + + testCtx context.Context ) func TestMain(m *testing.M) { @@ -126,39 +139,142 @@ func TestNoNewEnemy(t *testing.T) { fmt.Sprintf(setSmallRanges, dbName), )) - t.Log("fill with schemas to span multiple ranges") - // 4000 is larger than we need to span all three nodes, but a higher number - // seems to make the test converge faster - schemaData := generateSchemaData(4000, 500) - require.NoError(fillSchema(t, schemaData, vulnerableSpiceDb[1].Client().V1Alpha1().Schema())) + // This subtest ensures protection from a "data-based" newenemy problem: + // + // 1. Use this schema: + // definition user {} + // definition resource { + // relation direct: user + // relation excluded: user + // permission allowed = direct - excluded + // } + // 2. Write resource:1#excluded@user:A + // 3. Write resource:2#direct:@user:A + // 4. If the timestamp from (3) is before the timestamp for (2), then: + // check(resource:1#allowed@user:A) + // may succeed, when it should fail. The timestamps can be reversed + // if the tx overlap protections are disabled, because cockroach only + // ensures overlapping transactions are linearizable. + t.Run("protected from data newenemy", func(t *testing.T) { + t.Log("fill with schemas to span multiple ranges") + // 4000 is larger than we need to span all three nodes, but a higher number + // seems to make the test converge faster + schemaData := generateSchemaData(4000, 500) + fillSchema(t, schemaTpl, schemaData, vulnerableSpiceDb[1].Client().V1Alpha1().Schema()) + + t.Log("determining a prefix with a leader on the slow node") + slowNodeId, err := crdb[1].NodeID(testCtx) + require.NoError(err) + slowPrefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) + + t.Logf("using prefix %s for slow node %d", slowPrefix, slowNodeId) - t.Log("determining a prefix with a leader on the slow node") - slowNodeId, err := crdb[1].NodeID(testCtx) - require.NoError(err) - slowPrefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) + t.Log("filling with data to span multiple ranges") + fill(t, vulnerableSpiceDb[0].Client().V0().ACL(), slowPrefix, 4000, 1000) - t.Logf("using prefix %s for slow node %d", slowPrefix, slowNodeId) + t.Log("modifying time") + require.NoError(crdb.TimeDelay(ctx, e2e.MustFile(ctx, t, "timeattack-"+t.Name()+".log"), 1, -200*time.Millisecond)) - t.Log("filling with data to span multiple ranges") - fill(t, vulnerableSpiceDb[0].Client().V0().ACL(), slowPrefix, 4000, 1000) + vulnerableFn := func(t *testing.T) int { + checkCtx, checkCancel := context.WithTimeout(ctx, 30*time.Minute) + protected, attempts := checkDataNoNewEnemy(checkCtx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, -1) + require.NotNil(protected, "unable to determine if spicedb displays newenemy when mitigations are disabled within the time limit") + require.False(*protected) + checkCancel() + t.Logf("determined spicedb vulnerable in %d attempts", attempts) + return attempts + } + protectedFn := func(t *testing.T, count int) { + t.Logf("check spicedb is protected after %d attempts", count) + protected, _ := checkDataNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, count) + require.NotNil(protected, "unable to determine if spicedb is protected within the time limit") + require.True(*protected, "protection is enabled, but newenemy detected") + } + statTest(t, 5, vulnerableFn, protectedFn) + }) - t.Log("modifying time") - require.NoError(crdb.TimeDelay(ctx, e2e.MustFile(ctx, t, "timeattack.log"), 1, -200*time.Millisecond)) + // This subtest ensures protection from a "schema and data" newenemy, i.e. + // the new enemy conditions require linearizable changes to schema and data + // 1. Start with this schema: + // definition user {} + // definition resource { + // relation direct: user + // relation excluded: user + // permission allowed = direct + // } + // 2. Write resource:2#direct:@user:A + // 3. Update to this schema: + // definition user {} + // definition resource { + // relation direct: user + // relation excluded: user + // permission allowed = direct - excluded + // } + // 4. Write resource:1#excluded@user:A + // 5. If the revision from (4) is before the timestamp for (3), then: + // check(fully consistent, resource:1#allowed@user:A) + // may succeed, when it should fail. The timestamps can be reversed + // if the tx overlap protections are disabled, because cockroach only + // ensures overlapping transactions are linearizable. + // In this case, we don't get an explicit revision back from the + // WriteSchema call, but the Schema write and the resource write are + // fully consistent. + t.Run("protected from schema newenemy", func(t *testing.T) { + t.Log("fill with schemas to span multiple ranges") + // 4000 is larger than we need to span all three nodes, but a higher number + // seems to make the test converge faster + schemaData := generateSchemaData(4000, 500) + fillSchema(t, schemaAllowTpl, schemaData, vulnerableSpiceDb[1].Client().V1Alpha1().Schema()) + + t.Log("determining a prefix with a leader on the slow node") + slowNodeId, err := crdb[1].NodeID(testCtx) + require.NoError(err) + slowPrefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) - const sampleSize = 5 - samples := make([]int, sampleSize) + t.Logf("using prefix %s for slow node %d", slowPrefix, slowNodeId) + + t.Log("filling with data to span multiple ranges") + fill(t, vulnerableSpiceDb[0].Client().V0().ACL(), slowPrefix, 4000, 1000) + + t.Log("modifying time") + require.NoError(crdb.TimeDelay(ctx, e2e.MustFile(ctx, t, "timeattack-"+t.Name()+".log"), 1, -200*time.Millisecond)) + + vulnerableFn := func(t *testing.T) int { + checkCtx, checkCancel := context.WithTimeout(ctx, 30*time.Minute) + protected, attempts := checkSchemaNoNewEnemy(checkCtx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, -1) + require.NotNil(protected, "unable to determine if spicedb displays newenemy when mitigations are disabled within the time limit") + require.False(*protected) + checkCancel() + t.Logf("determined spicedb vulnerable in %d attempts", attempts) + return attempts + } + protectedFn := func(t *testing.T, count int) { + t.Logf("check spicedb is protected after %d attempts", count) + protected, _ := checkSchemaNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, count) + require.NotNil(protected, "unable to determine if spicedb is protected within the time limit") + require.True(*protected, "protection is enabled, but newenemy detected") + } + statTest(t, 5, vulnerableFn, protectedFn) + }) +} +// statTest takes a testFn that is expected to fail the test, returning a sample +// and a protectedTestFn that is expected to succeed even after a given number +// of runs. +func statTest(t *testing.T, sampleSize int, testFn func(t *testing.T) int, protectedTestFn func(t *testing.T, count int)) { + samples := make([]int, sampleSize) for i := 0; i < sampleSize; i++ { - t.Log(i, "check vulnerability with mitigations disabled") - checkCtx, checkCancel := context.WithTimeout(ctx, 30*time.Minute) - protected, attempts := checkNoNewEnemy(checkCtx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, -1) - require.NotNil(protected, "unable to determine if spicedb displays newenemy when mitigations are disabled within the time limit") - require.False(*protected) - checkCancel() - t.Logf("%d - determined spicedb vulnerable in %d attempts", i, attempts) - samples[i] = attempts + t.Logf("collecting sample %d", i) + samples[i] = testFn(t) } + protectedTestFn(t, iterationsForHighConfidence(samples)) +} +// iterationsForHighConfidence returns how many iterations we need to get +// > 3sigma from the mean of the samples. +// Caps at maxIterations to control test runtime. Set maxIterations to 0 for no +// cap. +func iterationsForHighConfidence(samples []int) (iterations int) { // from https://cs.opensource.google/go/x/perf/+/40a54f11:internal/stats/sample.go;l=196 // calculates mean and stddev at the same time mean, M2 := 0.0, 0.0 @@ -167,25 +283,21 @@ func TestNoNewEnemy(t *testing.T) { mean += delta / float64(n+1) M2 += delta * (float64(x) - mean) } - variance := M2 / float64(sampleSize-1) + variance := M2 / float64(len(samples)-1) stddev := math.Sqrt(variance) - samplestddev := stddev / math.Sqrt(float64(sampleSize)) + samplestddev := stddev / math.Sqrt(float64(len(samples))) // how many iterations do we need to get > 3sigma from the mean? // cap maxIterations to control test runtime. - iterations := int(math.Ceil(3*stddev*samplestddev + mean)) + iterations = int(math.Ceil(3*stddev*samplestddev + mean)) if *maxIterations != 0 && *maxIterations < iterations { iterations = *maxIterations } - - t.Logf("check spicedb is protected after %d attempts", iterations) - protected, _ := checkNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, iterations) - require.NotNil(protected, "unable to determine if spicedb is protected within the time limit") - require.True(*protected, "protection is enabled, but newenemy detected") + return } -// checkNoNewEnemy returns true if the service is protected, false if it is vulnerable, and nil if we couldn't determine -func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, candidateCount int) (*bool, int) { +// checkDataNoNewEnemy returns true if the service is protected, false if it is vulnerable, and nil if we couldn't determine +func checkDataNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, candidateCount int) (*bool, int) { var attempts int prefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) @@ -231,9 +343,6 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, t.Log(err) continue } - if canHas.IsMember { - t.Log("service is subject to the new enemy problem") - } r1leader, r2leader := getLeaderNode(testCtx, crdb[1].Conn(), excludes[0].Tuple), getLeaderNode(testCtx, crdb[1].Conn(), directs[0].Tuple) ns1Leader := getLeaderNodeForNamespace(testCtx, crdb[1].Conn(), excludes[0].Tuple.ObjectAndRelation.Namespace) @@ -242,6 +351,12 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, z2, _ := zookie.DecodeRevision(r2.GetRevision()) t.Log(z1, z2, z1.Sub(z2).String(), r1leader, r2leader, ns1Leader, ns2Leader) + if canHas.IsMember { + t.Log("service is subject to the new enemy problem") + protected := false + return &protected, attempts + } + if ns1Leader != slowNodeId || ns2Leader != slowNodeId { t.Log("namespace leader changed, pick new prefix and fill") prefix = prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) @@ -251,13 +366,7 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, continue } - if canHas.IsMember { - t.Log("service is subject to the new enemy problem") - protected := false - return &protected, attempts - } - - if !canHas.IsMember && z1.Sub(z2).IsPositive() { + if z1.Sub(z2).IsPositive() { t.Log("error in test, object id has been re-used.") continue } @@ -269,7 +378,7 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, return &protected, attempts } - if attempts > 1000 { + if attempts > *maxIterations { t.Log("trying with a new prefix") attempts = 0 prefix = prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) @@ -282,11 +391,78 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, case <-ctx.Done(): return nil, attempts default: + // allow clocks to desync + time.Sleep(100 * time.Millisecond) continue } + } +} - // allow clocks to desync - time.Sleep(100 * time.Millisecond) +// checkSchemaNoNewEnemy returns true if the service is protected, false if it is vulnerable, and nil if we couldn't determine +func checkSchemaNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, candidateCount int) (*bool, int) { + var attempts int + + prefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId) + objIdGenerator := generator.NewUniqueGenerator(objIDRegex) + + var b strings.Builder + for { + attempts++ + + directs, excludes := generateTuples(prefix, 1, objIdGenerator) + + // write the "allow" schema + b.Reset() + require.NoError(t, schemaAllowTpl.Execute(&b, SchemaData{Prefixes: []string{prefix}})) + _, err := spicedb[0].Client().V1Alpha1().Schema().WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{ + Schema: b.String(), + }) + require.NoError(t, err) + + // write the "direct" tuple + _, err = spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{ + Updates: []*v0.RelationTupleUpdate{directs[0]}, + }) + require.NoError(t, err) + + // write the "exclude" schema + b.Reset() + require.NoError(t, schemaTpl.Execute(&b, SchemaData{Prefixes: []string{prefix}})) + _, err = spicedb[1].Client().V1Alpha1().Schema().WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{ + Schema: b.String(), + }) + require.NoError(t, err) + + // write the "excludes" tuple + _, err = spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{ + Updates: []*v0.RelationTupleUpdate{excludes[0]}, + }) + require.NoError(t, err) + + canHas, err := spicedb[1].Client().V0().ACL().Check(context.Background(), &v0.CheckRequest{ + TestUserset: &v0.ObjectAndRelation{ + Namespace: directs[0].Tuple.ObjectAndRelation.Namespace, + ObjectId: directs[0].Tuple.ObjectAndRelation.ObjectId, + Relation: "allowed", + }, + User: directs[0].Tuple.GetUser(), + }) + require.NoError(t, err) + + if canHas.IsMember { + t.Log("service is subject to the new enemy problem") + protected := false + return &protected, attempts + } + + select { + case <-ctx.Done(): + return nil, attempts + default: + // allow clocks to desync + time.Sleep(100 * time.Millisecond) + continue + } } } @@ -329,7 +505,7 @@ func BenchmarkConflictingTupleWrites(b *testing.B) { b.ResetTimer() - checkNoNewEnemy(ctx, b, generateSchemaData(1, 1), 1, crdb, spicedb, b.N) + checkDataNoNewEnemy(ctx, b, generateSchemaData(1, 1), 1, crdb, spicedb, b.N) } func fill(t testing.TB, client v0.ACLServiceClient, prefix string, fillerCount, batchSize int) { @@ -349,24 +525,19 @@ func fill(t testing.TB, client v0.ACLServiceClient, prefix string, fillerCount, } } -func fillSchema(t testing.TB, data []SchemaData, schemaClient v1alpha1.SchemaServiceClient) error { +// fillSchema generates the schema text for given SchemaData and applies it +func fillSchema(t testing.TB, template *template.Template, data []SchemaData, schemaClient v1alpha1.SchemaServiceClient) { var b strings.Builder batchSize := len(data[0].Prefixes) for i, d := range data { t.Logf("filling %d to %d", i*batchSize, (i+1)*batchSize) b.Reset() - err := schemaTpl.Execute(&b, d) - if err != nil { - return err - } - _, err = schemaClient.WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{ + require.NoError(t, template.Execute(&b, d)) + _, err := schemaClient.WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{ Schema: b.String(), }) - if err != nil { - return err - } + require.NoError(t, err) } - return nil } // prefixForNode finds a prefix with namespace leaders on the node id