diff --git a/e2e/newenemy/newenemy_test.go b/e2e/newenemy/newenemy_test.go
index 958072f227..b3dd5a079c 100644
--- a/e2e/newenemy/newenemy_test.go
+++ b/e2e/newenemy/newenemy_test.go
@@ -42,6 +42,17 @@ definition {{.}}/resource {
 {{ end }}
 `
 
+const schemaAllowAllText = `
+{{ range .Prefixes }}
+definition {{.}}/user {}
+definition {{.}}/resource {
+	relation direct: {{.}}/user
+	relation excluded: {{.}}/user
+	permission allowed = direct
+}
+{{ end }}
+`
+
 const (
 	objIDRegex           = "[a-zA-Z0-9_][a-zA-Z0-9/_-]{0,127}"
 	namespacePrefixRegex = "[a-z][a-z0-9_]{2,62}[a-z0-9]"
@@ -50,8 +61,12 @@ const (
 )
 
 var (
 	maxIterations = flag.Int("max-iterations", 1000, "iteration cap for statistic-based tests (0 for no limit)")
 
-	schemaTpl = template.Must(template.New("schema").Parse(schemaText))
-	testCtx   context.Context
+	schemaTpl       = template.Must(template.New("schema").Parse(schemaText))
+	schemaAllowTpl  = template.Must(template.New("schema_allow").Parse(schemaAllowAllText))
+	objIdGenerator  = generator.NewUniqueGenerator(objIDRegex)
+	prefixGenerator = generator.NewUniqueGenerator(namespacePrefixRegex)
+
+	testCtx context.Context
 )
 
 func TestMain(m *testing.M) {
@@ -96,7 +111,6 @@ func initializeTestCRDBCluster(ctx context.Context, t testing.TB) cockroach.Clus
 }
 
 func TestNoNewEnemy(t *testing.T) {
-	require := require.New(t)
 	rand.Seed(time.Now().UnixNano())
 	ctx, cancel := context.WithCancel(testCtx)
 	defer cancel()
@@ -106,9 +120,9 @@ func TestNoNewEnemy(t *testing.T) {
 
 	t.Log("starting vulnerable spicedb...")
 	vulnerableSpiceDb := spice.NewClusterFromCockroachCluster(crdb, spice.WithDbName(dbName))
-	require.NoError(vulnerableSpiceDb.Start(ctx, tlog, "vulnerable",
+	require.NoError(t, vulnerableSpiceDb.Start(ctx, tlog, "vulnerable",
 		"--datastore-tx-overlap-strategy=insecure"))
-	require.NoError(vulnerableSpiceDb.Connect(ctx, tlog))
+	require.NoError(t, vulnerableSpiceDb.Connect(ctx, tlog))
 
 	t.Log("start protected spicedb cluster")
 	protectedSpiceDb := spice.NewClusterFromCockroachCluster(crdb,
@@ -118,11 +132,11 @@ func TestNoNewEnemy(t *testing.T) {
 		spice.WithMetricsPort(9100),
 		spice.WithDashboardPort(8100),
 		spice.WithDbName(dbName))
-	require.NoError(protectedSpiceDb.Start(ctx, tlog, "protected"))
-	require.NoError(protectedSpiceDb.Connect(ctx, tlog))
+	require.NoError(t, protectedSpiceDb.Start(ctx, tlog, "protected"))
+	require.NoError(t, protectedSpiceDb.Connect(ctx, tlog))
 
 	t.Log("configure small ranges, single replicas, short ttl")
-	require.NoError(crdb.SQL(ctx, tlog, tlog,
+	require.NoError(t, crdb.SQL(ctx, tlog, tlog,
 		fmt.Sprintf(setSmallRanges, dbName),
 	))
 
@@ -130,35 +144,74 @@ func TestNoNewEnemy(t *testing.T) {
 	// 4000 is larger than we need to span all three nodes, but a higher number
 	// seems to make the test converge faster
 	schemaData := generateSchemaData(4000, 500)
-	require.NoError(fillSchema(t, schemaData, vulnerableSpiceDb[1].Client().V1Alpha1().Schema()))
-
-	t.Log("determining a prefix with a leader on the slow node")
+	fillSchema(t, schemaTpl, schemaData, vulnerableSpiceDb[1].Client().V1Alpha1().Schema())
 
 	slowNodeId, err := crdb[1].NodeID(testCtx)
-	require.NoError(err)
-	slowPrefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId)
+	require.NoError(t, err)
 
-	t.Logf("using prefix %s for slow node %d", slowPrefix, slowNodeId)
+	t.Log("modifying time")
+	require.NoError(t, crdb.TimeDelay(ctx, e2e.MustFile(ctx, t, "timeattack-1.log"), 1, -150*time.Millisecond))
 
-	t.Log("filling with data to span multiple ranges")
-	fill(t, vulnerableSpiceDb[0].Client().V0().ACL(), slowPrefix, 4000, 1000)
 
+	t.Run("protected from schema newenemy", func(t *testing.T) {
+		vulnerableFn := func(t *testing.T) int {
+			protected := true
+			var attempts int
+			for protected {
+				// cap attempts at 100, retry with a different prefix
+				protected, attempts = checkSchemaNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, 100)
+			}
+			require.False(t, protected)
+			t.Logf("determined spicedb vulnerable in %d attempts", attempts+1)
+			return attempts
+		}
+		protectedFn := func(t *testing.T, count int) {
+			t.Logf("check spicedb is protected after %d attempts", count)
+			protected, attempts := checkSchemaNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, count)
+			require.Equal(t, count, attempts)
+			require.True(t, protected, "protection is enabled, but newenemy detected")
+		}
+		statTest(t, 5, vulnerableFn, protectedFn)
+	})
 
-	t.Log("modifying time")
-	require.NoError(crdb.TimeDelay(ctx, e2e.MustFile(ctx, t, "timeattack.log"), 1, -200*time.Millisecond))
+	t.Run("protected from data newenemy", func(t *testing.T) {
+		vulnerableFn := func(t *testing.T) int {
+			protected := true
+			var attempts int
+			for protected {
+				// cap attempts at 100, retry with a different prefix
+				protected, attempts = checkDataNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, 100)
+			}
+			require.False(t, protected)
+			t.Logf("determined spicedb vulnerable in %d attempts", attempts+1)
+			return attempts
+		}
+		protectedFn := func(t *testing.T, count int) {
+			t.Logf("check spicedb is protected after %d attempts", count)
+			protected, attempts := checkDataNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, count)
+			require.Equal(t, count, attempts)
+			require.True(t, protected, "protection is enabled, but newenemy detected")
+			t.Logf("spicedb is protected after %d attempts", count)
+		}
+		statTest(t, 5, vulnerableFn, protectedFn)
+	})
+}
 
-	const sampleSize = 5
+// statTest takes a testFn that is expected to detect the vulnerability,
+// returning the number of attempts it took, and a protectedTestFn that is
+// expected to remain protected even when run for an attempt count at which
+// the vulnerable configuration consistently failed.
+func statTest(t *testing.T, sampleSize int, testFn func(t *testing.T) int, protectedTestFn func(t *testing.T, count int)) {
 	samples := make([]int, sampleSize)
 	for i := 0; i < sampleSize; i++ {
-		t.Log(i, "check vulnerability with mitigations disabled")
-		checkCtx, checkCancel := context.WithTimeout(ctx, 30*time.Minute)
-		protected, attempts := checkNoNewEnemy(checkCtx, t, schemaData, slowNodeId, crdb, vulnerableSpiceDb, -1)
-		require.NotNil(protected, "unable to determine if spicedb displays newenemy when mitigations are disabled within the time limit")
-		require.False(*protected)
-		checkCancel()
-		t.Logf("%d - determined spicedb vulnerable in %d attempts", i, attempts)
-		samples[i] = attempts
+		t.Logf("collecting sample %d", i)
+		samples[i] = testFn(t)
 	}
+	protectedTestFn(t, iterationsForHighConfidence(samples))
+}
 
+// iterationsForHighConfidence returns how many iterations we need to get
+// > 3sigma from the mean of the samples.
+// Caps at maxIterations to control test runtime. Set maxIterations to 0 for no
+// cap.
+func iterationsForHighConfidence(samples []int) (iterations int) {
 	// from https://cs.opensource.google/go/x/perf/+/40a54f11:internal/stats/sample.go;l=196
 	// calculates mean and stddev at the same time
 	mean, M2 := 0.0, 0.0
@@ -167,42 +220,51 @@ func TestNoNewEnemy(t *testing.T) {
 		mean += delta / float64(n+1)
 		M2 += delta * (float64(x) - mean)
 	}
-	variance := M2 / float64(sampleSize-1)
+	variance := M2 / float64(len(samples)-1)
 	stddev := math.Sqrt(variance)
-	samplestddev := stddev / math.Sqrt(float64(sampleSize))
+	samplestddev := stddev / math.Sqrt(float64(len(samples)))
 	// how many iterations do we need to get > 3sigma from the mean?
 	// cap maxIterations to control test runtime.
-	iterations := int(math.Ceil(3*stddev*samplestddev + mean))
+	iterations = int(math.Ceil(3*stddev*samplestddev + mean))
 	if *maxIterations != 0 && *maxIterations < iterations {
 		iterations = *maxIterations
 	}
-
-	t.Logf("check spicedb is protected after %d attempts", iterations)
-	protected, _ := checkNoNewEnemy(ctx, t, schemaData, slowNodeId, crdb, protectedSpiceDb, iterations)
-	require.NotNil(protected, "unable to determine if spicedb is protected within the time limit")
-	require.True(*protected, "protection is enabled, but newenemy detected")
+	return
 }
 
-// checkNoNewEnemy returns true if the service is protected, false if it is vulnerable, and nil if we couldn't determine
-func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, candidateCount int) (*bool, int) {
-	var attempts int
-
+// checkDataNoNewEnemy returns true if the service is protected and false if it
+// is vulnerable.
+//
+// This check ensures protection from a "data-based" newenemy problem:
+// 1. Use this schema:
+//    definition user {}
+//    definition resource {
+//      relation direct: user
+//      relation excluded: user
+//      permission allowed = direct - excluded
+//    }
+// 2. Write resource:1#excluded@user:A
+// 3. Write resource:2#direct@user:A
+// 4. If the timestamp from (3) is before the timestamp for (2), then:
+//      check(resource:1#allowed@user:A)
+//    may succeed, when it should fail. The timestamps can be reversed
+//    if the tx overlap protections are disabled, because cockroach only
+//    ensures overlapping transactions are linearizable.
+func checkDataNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, maxAttempts int) (bool, int) {
 	prefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId)
-	objIdGenerator := generator.NewUniqueGenerator(objIDRegex)
+	t.Log("filling with data to span multiple ranges for prefix ", prefix)
+	fill(t, spicedb[0].Client().V0().ACL(), prefix, 4000, 1000)
 
-	for {
-		attempts++
-		directs, excludes := generateTuples(prefix, 1, objIdGenerator)
+	for attempts := 0; attempts < maxAttempts; attempts++ {
+		direct, exclude := generateTuple(prefix, objIdGenerator)
 
 		// write to node 1
 		r1, err := spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{
-			Updates: []*v0.RelationTupleUpdate{excludes[0]},
+			Updates: []*v0.RelationTupleUpdate{exclude},
 		})
-		if err != nil {
-			t.Log(err)
-			continue
-		}
+		require.NoError(t, err)
 
 		// the first write has to read the namespaces from the second node,
 		// which will resync the timestamps. sleeping allows the clocks to get
@@ -211,83 +273,156 @@ func checkNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData,
 
 		// write to node 2 (clock is behind)
 		r2, err := spicedb[1].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{
-			Updates: []*v0.RelationTupleUpdate{directs[0]},
+			Updates: []*v0.RelationTupleUpdate{direct},
 		})
-		if err != nil {
-			t.Log(err)
-			continue
-		}
+		require.NoError(t, err)
 
 		canHas, err := spicedb[1].Client().V0().ACL().Check(context.Background(), &v0.CheckRequest{
 			TestUserset: &v0.ObjectAndRelation{
-				Namespace: directs[0].Tuple.ObjectAndRelation.Namespace,
-				ObjectId:  directs[0].Tuple.ObjectAndRelation.ObjectId,
+				Namespace: direct.Tuple.ObjectAndRelation.Namespace,
+				ObjectId:  direct.Tuple.ObjectAndRelation.ObjectId,
 				Relation:  "allowed",
 			},
-			User:       directs[0].Tuple.GetUser(),
+			User:       direct.Tuple.GetUser(),
 			AtRevision: r2.GetRevision(),
 		})
-		if err != nil {
-			t.Log(err)
-			continue
-		}
-		if canHas.IsMember {
-			t.Log("service is subject to the new enemy problem")
-		}
+		require.NoError(t, err)
 
-		r1leader, r2leader := getLeaderNode(testCtx, crdb[1].Conn(), excludes[0].Tuple), getLeaderNode(testCtx, crdb[1].Conn(), directs[0].Tuple)
-		ns1Leader := getLeaderNodeForNamespace(testCtx, crdb[1].Conn(), excludes[0].Tuple.ObjectAndRelation.Namespace)
-		ns2Leader := getLeaderNodeForNamespace(testCtx, crdb[1].Conn(), excludes[0].Tuple.User.GetUserset().Namespace)
+		r1leader, r2leader := getLeaderNode(testCtx, crdb[1].Conn(), exclude.Tuple), getLeaderNode(testCtx, crdb[1].Conn(), direct.Tuple)
+		ns1Leader := getLeaderNodeForNamespace(testCtx, crdb[1].Conn(), exclude.Tuple.ObjectAndRelation.Namespace)
+		ns2Leader := getLeaderNodeForNamespace(testCtx, crdb[1].Conn(), exclude.Tuple.User.GetUserset().Namespace)
 
 		z1, _ := zookie.DecodeRevision(r1.GetRevision())
 		z2, _ := zookie.DecodeRevision(r2.GetRevision())
 
 		t.Log(z1, z2, z1.Sub(z2).String(), r1leader, r2leader, ns1Leader, ns2Leader)
 
+		if canHas.IsMember {
+			t.Log("service is subject to the new enemy problem")
+			return false, attempts
+		}
+
 		if ns1Leader != slowNodeId || ns2Leader != slowNodeId {
 			t.Log("namespace leader changed, pick new prefix and fill")
-			prefix = prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId)
-			// need to fill new prefix
-			t.Log("filling with data to span multiple ranges for prefix ", prefix)
-			fill(t, spicedb[0].Client().V0().ACL(), prefix, 4000, 1000)
+			// returning true will re-run with a new prefix
+			return true, attempts
+		}
+
+		if z1.Sub(z2).IsPositive() {
+			t.Log("error in test, object id has been re-used.")
 			continue
 		}
 
-		if canHas.IsMember {
-			t.Log("service is subject to the new enemy problem")
-			protected := false
-			return &protected, attempts
+		select {
+		case <-ctx.Done():
+			return false, attempts
+		default:
+			time.Sleep(100 * time.Millisecond)
+			continue
 		}
+	}
+	return true, maxAttempts
+}
 
-		if !canHas.IsMember && z1.Sub(z2).IsPositive() {
-			t.Log("error in test, object id has been re-used.")
+// checkSchemaNoNewEnemy returns true if the service is protected, false if it
+// is vulnerable.
+//
+// This check ensures protection from a "schema and data" newenemy, i.e. one
+// where the new-enemy conditions require linearizable changes to both schema
+// and data:
+// 1. Start with this schema:
+//    definition user {}
+//    definition resource {
+//      relation direct: user
+//      relation excluded: user
+//      permission allowed = direct
+//    }
+// 2. Write resource:2#direct@user:A
+// 3. Update to this schema:
+//    definition user {}
+//    definition resource {
+//      relation direct: user
+//      relation excluded: user
+//      permission allowed = direct - excluded
+//    }
+// 4. Write resource:1#excluded@user:A
+// 5. If the revision from (4) is before the timestamp for (3), then:
+//      check(fully consistent, resource:1#allowed@user:A)
+//    may succeed, when it should fail. The timestamps can be reversed
+//    if the tx overlap protections are disabled, because cockroach only
+//    ensures overlapping transactions are linearizable.
+//
+// In this case, we don't get an explicit revision back from the
+// WriteSchema call, but the Schema write and the resource write are
+// fully consistent.
+func checkSchemaNoNewEnemy(ctx context.Context, t testing.TB, schemaData []SchemaData, slowNodeId int, crdb cockroach.Cluster, spicedb spice.Cluster, maxAttempts int) (bool, int) {
+	prefix := prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId)
+
+	var b strings.Builder
+	for attempts := 0; attempts < maxAttempts; attempts++ {
+		direct, exclude := generateTuple(prefix, objIdGenerator)
+
+		// write the "allow" schema
+		b.Reset()
+		require.NoError(t, schemaAllowTpl.Execute(&b, SchemaData{Prefixes: []string{prefix}}))
+		_, err := spicedb[0].Client().V1Alpha1().Schema().WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{
+			Schema: b.String(),
+		})
+		if err != nil {
+			t.Log(err.Error())
 			continue
 		}
 
-		// if we find causal reversals, but no newenemy, assume we're protected
-		if candidateCount > 0 && attempts >= candidateCount {
-			t.Log(candidateCount, "(would be) causal reversals with no new enemy detected")
-			protected := true
-			return &protected, attempts
+		// write the "direct" tuple
+		_, err = spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{
+			Updates: []*v0.RelationTupleUpdate{direct},
+		})
+		require.NoError(t, err)
+
+		// sleep to give the clocks a chance to desync again
+		time.Sleep(100 * time.Millisecond)
+
+		// write the "exclude" tuple
+		r2, err := spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{
+			Updates: []*v0.RelationTupleUpdate{exclude},
+		})
+		require.NoError(t, err)
+
+		// write the "exclude" schema
+		b.Reset()
+		require.NoError(t, schemaTpl.Execute(&b, SchemaData{Prefixes: []string{prefix}}))
+		_, err = spicedb[1].Client().V1Alpha1().Schema().WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{
+			Schema: b.String(),
+		})
+		if err != nil {
+			t.Log(err.Error())
+			continue
 		}
 
-		if attempts > 1000 {
-			t.Log("trying with a new prefix")
-			attempts = 0
-			prefix = prefixForNode(ctx, crdb[1].Conn(), schemaData, slowNodeId)
-			t.Log("filling with data to span multiple ranges for prefix ", prefix)
-			fill(t, spicedb[0].Client().V0().ACL(), prefix, 4000, 1000)
+		canHas, err := spicedb[1].Client().V0().ACL().Check(context.Background(), &v0.CheckRequest{
+			TestUserset: &v0.ObjectAndRelation{
+				Namespace: direct.Tuple.ObjectAndRelation.Namespace,
+				ObjectId:  direct.Tuple.ObjectAndRelation.ObjectId,
+				Relation:  "allowed",
+			},
+			User:       direct.Tuple.GetUser(),
+			AtRevision: r2.GetRevision(),
+		})
+		if err != nil {
+			t.Log(err.Error())
+			if strings.Contains(err.Error(), "cannot specify timestamp in the future") {
+				return false, attempts
+			}
 			continue
 		}
 
+		if canHas.IsMember {
+			t.Log("service is subject to the new enemy problem")
+			return false, attempts
+		}
+
 		select {
 		case <-ctx.Done():
-			return nil, attempts
+			return false, attempts
 		default:
 			continue
 		}
-
-		// allow clocks to desync
-		time.Sleep(100 * time.Millisecond)
 	}
+	return true, maxAttempts
 }
 
 func BenchmarkBatchWrites(b *testing.B) {
@@ -299,7 +434,7 @@ func BenchmarkBatchWrites(b *testing.B) {
 	require.NoError(b, spicedb.Start(ctx, tlog, ""))
 	require.NoError(b, spicedb.Connect(ctx, tlog))
 
-	directs, excludes := generateTuples("", b.N*20, generator.NewUniqueGenerator(objIDRegex))
+	directs, excludes := generateTuples("", b.N*20, objIdGenerator)
 	b.ResetTimer()
 	_, err := spicedb[0].Client().V0().ACL().Write(testCtx, &v0.WriteRequest{
 		Updates: excludes,
@@ -329,13 +464,13 @@ func BenchmarkConflictingTupleWrites(b *testing.B) {
 
 	b.ResetTimer()
 
-	checkNoNewEnemy(ctx, b, generateSchemaData(1, 1), 1, crdb, spicedb, b.N)
+	checkDataNoNewEnemy(ctx, b, generateSchemaData(1, 1), 1, crdb, spicedb, b.N)
 }
 
 func fill(t testing.TB, client v0.ACLServiceClient, prefix string, fillerCount, batchSize int) {
 	t.Log("filling prefix", prefix)
 	require := require.New(t)
-	directs, excludes := generateTuples(prefix, fillerCount, generator.NewUniqueGenerator(objIDRegex))
+	directs, excludes := generateTuples(prefix, fillerCount, objIdGenerator)
 	for i := 0; i < fillerCount/batchSize; i++ {
 		t.Log("filling", i*batchSize, "to", (i+1)*batchSize)
 		_, err := client.Write(testCtx, &v0.WriteRequest{
@@ -349,24 +484,19 @@ func fill(t testing.TB, client v0.ACLServiceClient, prefix string, fillerCount,
 	}
 }
 
-func fillSchema(t testing.TB, data []SchemaData, schemaClient v1alpha1.SchemaServiceClient) error {
+// fillSchema generates the schema text for the given SchemaData and applies it
+func fillSchema(t testing.TB, template *template.Template, data []SchemaData, schemaClient v1alpha1.SchemaServiceClient) {
 	var b strings.Builder
 	batchSize := len(data[0].Prefixes)
 	for i, d := range data {
 		t.Logf("filling %d to %d", i*batchSize, (i+1)*batchSize)
 		b.Reset()
-		err := schemaTpl.Execute(&b, d)
-		if err != nil {
-			return err
-		}
-		_, err = schemaClient.WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{
+		require.NoError(t, template.Execute(&b, d))
+		_, err := schemaClient.WriteSchema(context.Background(), &v1alpha1.WriteSchemaRequest{
 			Schema: b.String(),
 		})
-		if err != nil {
-			return err
-		}
+		require.NoError(t, err)
 	}
-	return nil
 }
 
 // prefixForNode finds a prefix with namespace leaders on the node id
@@ -393,7 +523,6 @@ func prefixForNode(ctx context.Context, conn *pgx.Conn, data []SchemaData, node
 
 func generateSchemaData(n int, batchSize int) (data []SchemaData) {
 	data = make([]SchemaData, 0, n/batchSize)
-	prefixGenerator := generator.NewUniqueGenerator(namespacePrefixRegex)
 	for i := 0; i < n/batchSize; i++ {
 		schema := SchemaData{Prefixes: make([]string, 0, batchSize)}
 		for j := i * batchSize; j < (i+1)*batchSize; j++ {
@@ -404,6 +533,11 @@ func generateSchemaData(n int, batchSize int) (data []SchemaData) {
 	return
 }
 
+func generateTuple(prefix string, objIdGenerator *generator.UniqueGenerator) (direct *v0.RelationTupleUpdate, exclude *v0.RelationTupleUpdate) {
+	directs, excludes := generateTuples(prefix, 1, objIdGenerator)
+	return directs[0], excludes[0]
+}
+
 func generateTuples(prefix string, n int, objIdGenerator *generator.UniqueGenerator) (directs []*v0.RelationTupleUpdate, excludes []*v0.RelationTupleUpdate) {
 	directs = make([]*v0.RelationTupleUpdate, 0, n)
 	excludes = make([]*v0.RelationTupleUpdate, 0, n)
@@ -475,6 +609,8 @@ func getLeaderNodeForNamespace(ctx context.Context, conn *pgx.Conn, namespace st
 	return leaderFromRangeRow(rows)
 }
 
+// leaderFromRangeRow parses the rows from a `SHOW RANGE` query and returns the
+// leader node id for the range
 func leaderFromRangeRow(rows pgx.Rows) int {
 	var (
 		startKey   sql.NullString
diff --git a/internal/datastore/crdb/namespace.go b/internal/datastore/crdb/namespace.go
index 5edab10af2..07a543295c 100644
--- a/internal/datastore/crdb/namespace.go
+++ b/internal/datastore/crdb/namespace.go
@@ -44,8 +44,6 @@ var (
 func (cds *crdbDatastore) WriteNamespace(ctx context.Context, newConfig *v0.NamespaceDefinition) (datastore.Revision, error) {
 	var hlcNow decimal.Decimal
 	if err := cds.execute(ctx, cds.conn, pgx.TxOptions{}, func(tx pgx.Tx) error {
-		keySet := newKeySet()
-		cds.AddOverlapKey(keySet, newConfig.Name)
 		serialized, err := proto.Marshal(newConfig)
 		if err != nil {
 			return err
@@ -56,12 +54,7 @@ func (cds *crdbDatastore) WriteNamespace(ctx context.Context, newConfig *v0.Name
 		if err != nil {
 			return err
 		}
-		for k := range keySet {
-			if _, err := tx.Exec(ctx, queryTouchTransaction, k); err != nil {
-				return err
-			}
-		}
-		return tx.QueryRow(
+		return cds.conn.QueryRow(
 			datastore.SeparateContextWithTracing(ctx), writeSQL, writeArgs...,
 		).Scan(&hlcNow)
 	}); err != nil {
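
For context on the namespace.go hunk: it drops the overlap-key touch from WriteNamespace. A minimal sketch of what that removed code was doing follows; the table name and UPSERT text here are assumptions for illustration — only the shape (one extra write per overlap key inside the transaction) comes from the removed lines.

```go
package overlap

import (
	"context"

	"github.com/jackc/pgx/v4"
)

// touchOverlapKeys writes one row per overlap key inside the transaction.
// Two transactions that touch the same key row are forced to conflict, so
// CockroachDB serializes them and their commit timestamps cannot be
// reordered -- which is what rules out the newenemy anomaly.
func touchOverlapKeys(ctx context.Context, tx pgx.Tx, keys map[string]struct{}) error {
	// hypothetical stand-in for the real queryTouchTransaction SQL
	const touchSQL = `UPSERT INTO relation_tuple_transaction (key) VALUES ($1)`
	for k := range keys {
		if _, err := tx.Exec(ctx, touchSQL, k); err != nil {
			return err
		}
	}
	return nil
}
```

This forced overlap is also what the vulnerable cluster's `--datastore-tx-overlap-strategy=insecure` flag switches off in the test above.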
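
And a worked example of the sample math in `iterationsForHighConfidence`, with made-up sample values, to make the single-pass (Welford) mean/stddev computation and the 3-sigma cutoff concrete:

```go
package main

import "fmt"

func main() {
	// Hypothetical samples: the attempt count at which each of five
	// vulnerable runs first observed the newenemy problem.
	samples := []int{3, 5, 7, 4, 6}

	// Same single-pass mean/M2 computation as iterationsForHighConfidence.
	mean, m2 := 0.0, 0.0
	for n, x := range samples {
		delta := float64(x) - mean
		mean += delta / float64(n+1)
		m2 += delta * (float64(x) - mean)
	}
	fmt.Println(mean, m2) // mean = 5, M2 = 10
	// variance = 10/4 = 2.5, stddev ≈ 1.581, samplestddev ≈ 1.581/√5 ≈ 0.707
	// iterations = ceil(3*1.581*0.707 + 5) = ceil(8.35) = 9 protected attempts
}
```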