diff --git a/pkg/api/metrics.go b/pkg/api/metrics.go index 02a89d71a..b22731f1b 100644 --- a/pkg/api/metrics.go +++ b/pkg/api/metrics.go @@ -16,6 +16,8 @@ package api import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -35,4 +37,18 @@ var ( Name: "rekor_api_latency_summary", Help: "Api Latency on calls", }, []string{"path", "code"}) + + MetricRequestLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "rekor_latency_by_api", + Help: "Api Latency (in ns) by path and method", + Buckets: prometheus.ExponentialBucketsRange( + float64(time.Millisecond), + float64(4*time.Second), + 10), + }, []string{"path", "method"}) + + MetricRequestCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "rekor_qps_by_api", + Help: "Api QPS by path, method, and response code", + }, []string{"path", "method", "code"}) ) diff --git a/pkg/generated/restapi/configure_rekor_server.go b/pkg/generated/restapi/configure_rekor_server.go index e19527ca8..6df6070d4 100644 --- a/pkg/generated/restapi/configure_rekor_server.go +++ b/pkg/generated/restapi/configure_rekor_server.go @@ -18,6 +18,7 @@ limitations under the License. package restapi import ( + "context" "crypto/tls" "net/http" "strconv" @@ -47,6 +48,18 @@ import ( //go:generate swagger generate server --target ../../generated --name RekorServer --spec ../../../openapi.yaml --principal interface{} --exclude-main +type contextKey string + +var ( + ctxKeyAPIToRecord = contextKey("apiToRecord") +) + +// Context payload for recording metrics. +type apiToRecord struct { + method *string // Method to record in metrics, if any. + path *string // Path to record in metrics, if any. +} + func configureFlags(api *operations.RekorServerAPI) { // api.CommandLineOptionsGroups = []swag.CommandLineOptionsGroup{ ... } } @@ -104,6 +117,16 @@ func configureAPI(api *operations.RekorServerAPI) http.Handler { api.AddMiddlewareFor("GET", "/api/v1/log/publicKey", cacheForever) api.AddMiddlewareFor("GET", "/api/v1/log/timestamp/certchain", cacheForever) + // add metrics for explicitly handled endpoints + recordMetricsForAPI(api, "POST", "/api/v1/index/retrieve") + recordMetricsForAPI(api, "GET", "/api/v1/log") + recordMetricsForAPI(api, "GET", "/api/v1/publicKey") + recordMetricsForAPI(api, "GET", "/api/v1/log/proof") + recordMetricsForAPI(api, "GET", "/api/v1/log/entries") + recordMetricsForAPI(api, "POST", "/api/v1/log/entries") + recordMetricsForAPI(api, "GET", "/api/v1/log/entries/{entryUUID}") + recordMetricsForAPI(api, "GET", "/api/v1/log/entries/retrieve") + return setupGlobalMiddleware(api.Serve(setupMiddlewares)) } @@ -159,18 +182,59 @@ func setupGlobalMiddleware(handler http.Handler) http.Handler { })) } +// Populates the the apiToRecord for this method/path so metrics are emitted. +func recordMetricsForAPI(api *operations.RekorServerAPI, method string, path string) { + metricsHandler := func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + if apiInfo, ok := ctx.Value(ctxKeyAPIToRecord).(*apiToRecord); ok { + apiInfo.method = &method + apiInfo.path = &path + } else { + log.ContextLogger(ctx).Warn("Could not attach api info - endpoint may not be monitored.") + } + handler.ServeHTTP(w, r) + }) + } + + api.AddMiddlewareFor(method, path, metricsHandler) +} + func wrapMetrics(handler http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + apiInfo := apiToRecord{} + ctx = context.WithValue(ctx, ctxKeyAPIToRecord, &apiInfo) + r = r.WithContext(ctx) + start := time.Now() ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor) defer func() { - labels := map[string]string{ - "path": r.URL.Path, - "code": strconv.Itoa(ww.Status()), + // Only record metrics for APIs that need instrumentation. + if apiInfo.path != nil && apiInfo.method != nil { + code := strconv.Itoa(ww.Status()) + labels := map[string]string{ + "path": *apiInfo.path, + "code": code, + } + // This logs latency broken down by URL path and response code + // TODO(var-sdk): delete these metrics once the new metrics are safely rolled out. + pkgapi.MetricLatency.With(labels).Observe(float64(time.Since(start))) + pkgapi.MetricLatencySummary.With(labels).Observe(float64(time.Since(start))) + + pkgapi.MetricRequestLatency.With( + map[string]string{ + "path": *apiInfo.path, + "method": *apiInfo.method, + }).Observe(float64(time.Since(start))) + + pkgapi.MetricRequestCount.With( + map[string]string{ + "path": *apiInfo.path, + "method": *apiInfo.method, + "code": code, + }).Inc() } - // This logs latency broken down by URL path and response code - pkgapi.MetricLatency.With(labels).Observe(float64(time.Since(start))) - pkgapi.MetricLatencySummary.With(labels).Observe(float64(time.Since(start))) }() handler.ServeHTTP(ww, r) diff --git a/tests/e2e_test.go b/tests/e2e_test.go index d03b80e81..135965467 100644 --- a/tests/e2e_test.go +++ b/tests/e2e_test.go @@ -19,6 +19,7 @@ package e2e import ( + "bufio" "bytes" "context" "crypto" @@ -37,6 +38,7 @@ import ( "os/exec" "path/filepath" "reflect" + "regexp" "strconv" "strings" "testing" @@ -1347,3 +1349,75 @@ func TestSearchValidateTreeID(t *testing.T) { t.Fatalf("expected 404 status code but got %d", resp.StatusCode) } } + +func getRekorMetricCount(metricLine string, t *testing.T) (int, error) { + re, err := regexp.Compile(fmt.Sprintf("^%s.*([0-9]+)$", regexp.QuoteMeta(metricLine))) + if err != nil { + return 0, err + } + + resp, err := http.Get("http://localhost:2112/metrics") + if err != nil { + return 0, err + } + defer resp.Body.Close() + + scanner := bufio.NewScanner(resp.Body) + for scanner.Scan() { + match := re.FindStringSubmatch(scanner.Text()) + if len(match) != 2 { + continue + } + + result, err := strconv.Atoi(match[1]) + if err != nil { + return 0, nil + } + t.Log("Matched metric line: " + scanner.Text()) + return result, nil + } + return 0, nil +} + +// Smoke test to ensure we're publishing and recording metrics when an API is +// called. +// TODO: use a more robust test approach here e.g. prometheus client-based? +// TODO: cover all endpoints to make sure none are dropped. +func TestMetricsCounts(t *testing.T) { + latencyMetric := "rekor_latency_by_api_count{method=\"GET\",path=\"/api/v1/log\"}" + qpsMetric := "rekor_qps_by_api{code=\"200\",method=\"GET\",path=\"/api/v1/log\"}" + + latencyCount, err := getRekorMetricCount(latencyMetric, t) + if err != nil { + t.Fatal(err) + } + + qpsCount, err := getRekorMetricCount(qpsMetric, t) + if err != nil { + t.Fatal(err) + } + + resp, err := http.Get("http://localhost:3000/api/v1/log") + if err != nil { + t.Fatal(err) + } + resp.Body.Close() + + latencyCount2, err := getRekorMetricCount(latencyMetric, t) + if err != nil { + t.Fatal(err) + } + + qpsCount2, err := getRekorMetricCount(qpsMetric, t) + if err != nil { + t.Fatal(err) + } + + if latencyCount2-latencyCount != 1 { + t.Error("rekor_latency_by_api_count did not increment") + } + + if qpsCount2-qpsCount != 1 { + t.Error("rekor_qps_by_api did not increment") + } +}