Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

benchmarking utilities #2687

Merged
merged 6 commits into from
Apr 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
147 changes: 147 additions & 0 deletions .buildkite/benchmarks.pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Baseline Docker plugin settings reused by the steps below.
# Copied from pipeline.yml.
docker_plugin_default_config: &docker_plugin_default_config
  image: 'oasislabs/testing:0.3.0'
  workdir: /workdir
  always_pull: true
  propagate-environment: true
  unconfined: true
  volumes:
  - /var/lib/buildkite-agent/.coveralls:/root/.coveralls
  - /var/lib/buildkite-agent/.codecov:/root/.codecov
  # Rust incremental compile caches (shared).
  - /var/tmp/cargo_ic/debug:/var/tmp/artifacts/debug/incremental
  - /var/tmp/cargo_ic/debug_sgx:/var/tmp/artifacts/x86_64-unknown-linux-sgx/debug/incremental
  # Rust package checkouts directory (shared).
  - /var/tmp/cargo_pkg/git:/root/.cargo/git
  - /var/tmp/cargo_pkg/registry:/root/.cargo/registry
  # Rust SGX standard library artifacts cache (shared).
  - /var/tmp/xargo_cache:/root/.xargo
  # Go package checkouts directory (shared).
  - /var/tmp/go_pkg:/root/go/pkg
  # Socket of the Intel SGX Application Enclave Services Manager (AESM)
  # daemon running on the Buildkite host.
  - /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket
  - /var/tmp/benchmarks:/var/tmp/benchmarks
  environment:
  - 'LC_ALL=C.UTF-8'
  - 'LANG=C.UTF-8'
  - 'CARGO_TARGET_DIR=/var/tmp/artifacts'
  - 'CARGO_INSTALL_ROOT=/root/.cargo'
  - 'GOPROXY=https://proxy.golang.org/'
  # Entries without a value: presumably forwarded from the agent's
  # environment into the container -- confirm against the Docker plugin docs.
  - 'BUILDKITE_PIPELINE_NAME'
  - 'BUILDKITE_BUILD_NUMBER'
  - 'BUILDKITE_BUILD_URL'
  - 'TESTS'
  - 'NUM_RUNS'
  - 'SLACK_WEBHOOK_URL'
  - 'METRICS_PUSH_ADDR'
  - 'METRICS_QUERY_ADDR'
  - 'METRICS_SOURCE_GIT_BRANCH'
  - 'METRICS_TARGET_GIT_BRANCH'
  - 'METRICS_THRESHOLDS'

# Pins the Docker plugin version and applies the shared default settings.
docker_plugin: &docker_plugin
  'oasislabs/docker#v3.0.1-oasis1':
    <<: *docker_plugin_default_config

steps:
###############################################################
# The following three steps are copied from code.pipeline.yml #
###############################################################
############
# Build jobs
############
- label: Build Go node
  plugins:
    <<: *docker_plugin
  command:
  - .buildkite/go/build.sh
  # Publish the build outputs as Buildkite artifacts, one directory at a
  # time.
  - cd /workdir/go/oasis-node
  - buildkite-agent artifact upload oasis-node
  - buildkite-agent artifact upload oasis-node.test
  - cd /workdir/go/oasis-test-runner
  - buildkite-agent artifact upload oasis-test-runner
  - buildkite-agent artifact upload oasis-test-runner.test
  - cd /workdir/go/oasis-net-runner
  - buildkite-agent artifact upload oasis-net-runner
  - cd /workdir/go/oasis-remote-signer
  - buildkite-agent artifact upload oasis-remote-signer

- label: Build Rust runtime loader
  agents:
    buildkite_agent_size: large
  plugins:
    <<: *docker_plugin
  command:
  - .buildkite/rust/build_generic.sh /workdir -p oasis-core-runtime-loader
  - .buildkite/rust/build_generic.sh /workdir -p test-long-term-client
  - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-client
  - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-enc-client
  - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-ops-client
  # Publish the build outputs as Buildkite artifacts.
  - cd /var/tmp/artifacts/default/debug
  - buildkite-agent artifact upload oasis-core-runtime-loader
  # Client binaries used by the E2E tests.
  - buildkite-agent artifact upload test-long-term-client
  - buildkite-agent artifact upload simple-keyvalue-client
  - buildkite-agent artifact upload simple-keyvalue-enc-client
  - buildkite-agent artifact upload simple-keyvalue-ops-client

####################
# Runtime build jobs
####################
- label: Build key manager runtime
  agents:
    buildkite_agent_size: large
  plugins:
    <<: *docker_plugin
  command:
  # Builds both the key manager and the key-value test runtimes.
  - .buildkite/rust/build_runtime.sh tests/runtimes/simple-keymanager
  - .buildkite/rust/build_runtime.sh tests/runtimes/simple-keyvalue
  # Publish the SGX (.sgxs) build outputs as Buildkite artifacts.
  - cd /var/tmp/artifacts/sgx/x86_64-fortanix-unknown-sgx/debug
  - buildkite-agent artifact upload simple-keymanager.sgxs
  - buildkite-agent artifact upload simple-keyvalue.sgxs
  # Publish the build outputs from the default target directory.
  - cd /var/tmp/artifacts/default/debug
  - buildkite-agent artifact upload simple-keymanager
  - buildkite-agent artifact upload simple-keyvalue

# Wait for all jobs defined before this point
# to finish running in parallel before continuing.
- wait

#########################################
# E2E test jobs with enabled benchmarking
#########################################
- label: E2E tests
  timeout_in_minutes: 30
  parallelism: 7
  agents:
    buildkite_agent_size: large
    buildkite_agent_class: stable
  env:
    TEST_BASE_DIR: /var/tmp/benchmarks
  plugins:
    <<: *docker_plugin
  command:
  - .buildkite/scripts/download_e2e_test_artifacts.sh
  # Start from an empty benchmarks directory.
  # NOTE(review): /var/tmp/benchmarks is bind-mounted from the host and this
  # step runs with parallelism > 1 -- confirm that parallel jobs sharing a
  # host cannot wipe each other's data.
  - rm -rf /var/tmp/benchmarks/*
  # Folded into a single command line. The `\$` variables are escaped,
  # presumably so that they are expanded when the job runs rather than when
  # the pipeline is uploaded -- confirm against the Buildkite docs.
  - >-
    .buildkite/scripts/test_e2e.sh
    --metrics.address $METRICS_PUSH_ADDR
    --metrics.interval 5s
    --metrics.labels instance=\$BUILDKITE_PIPELINE_NAME-\$BUILDKITE_BUILD_NUMBER
    --num_runs $NUM_RUNS
    -t $TESTS

# Wait for all jobs defined before this point
# to finish running in parallel before continuing.
- wait

###########################
# Compare benchmark results
###########################
- label: Benchmark analysis
  plugins:
    <<: *docker_plugin
  command:
  # Fetch the previously uploaded artifacts, then run the comparison script.
  - .buildkite/scripts/download_e2e_test_artifacts.sh
  - .buildkite/scripts/daily_benchmark_analysis.sh
6 changes: 0 additions & 6 deletions .buildkite/code.pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@ steps:
############
# Build jobs
############
# This label needs to be synced with runtime-ethereum's
# .buildkite/scripts/download_utils.sh.
- label: Build Go node
command:
- .buildkite/go/build.sh
Expand All @@ -110,8 +108,6 @@ steps:
plugins:
<<: *docker_plugin

# This label needs to be synced with runtime-ethereum's
# .buildkite/scripts/download_utils.sh.
- label: Build Rust runtime loader
command:
- .buildkite/rust/build_generic.sh /workdir -p oasis-core-runtime-loader
Expand All @@ -136,8 +132,6 @@ steps:
####################
# Runtime build jobs
####################
# This label needs to be synced with runtime-ethereum's
# .buildkite/rust/test_runtime_and_gateway.sh and .buildkite/scripts/download_utils.sh.
- label: Build key manager runtime
command:
- .buildkite/rust/build_runtime.sh tests/runtimes/simple-keymanager
Expand Down
47 changes: 47 additions & 0 deletions .buildkite/scripts/daily_benchmark_analysis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#! /bin/bash

# This script compares all metrics of the last benchmark batch from the feature
# branch to the last batch of the master branch. If any thresholds are
# exceeded, the issue is reported to the Slack channel and a non-zero exit code
# is returned.
#
# Script should be invoked from .buildkite/benchmarks.pipeline.yml. Required
# env variables:
# BUILDKITE_BUILD_URL - URL for seeing detailed testing and comparison log (e.g. https://buildkite.com/oasislabs/oasis-core-daily-benchmarks/builds/xx)
# BUILDKITE_PIPELINE_NAME - name of the pipeline, shown in the Slack message
# METRICS_QUERY_ADDR - address of Prometheus server (e.g. http://localhost:9090)
# METRICS_SOURCE_GIT_BRANCH - name of feature branch on git (e.g. jsmith/feature/abc)
# METRICS_TARGET_GIT_BRANCH - name of master branch on git (e.g. master)
# METRICS_THRESHOLDS - max or min thresholds flags (e.g. --max_threshold.cpu.avg_ratio 1.05)
# TESTS - names of test(s) to compare (e.g. e2e/runtime/runtime)
# SLACK_WEBHOOK_URL - Slack webhook for reporting (e.g. https://hooks.slack.com/services/xxxxxx)

# No `-e` on purpose: the exit code of the comparison is inspected manually
# below so that the failure can be reported before exiting.
set -ux

# METRICS_THRESHOLDS is intentionally left unquoted: it holds several flags
# and values which must be split into separate arguments.
# shellcheck disable=SC2086
./go/oasis-test-runner/oasis-test-runner cmp \
    --metrics.address "$METRICS_QUERY_ADDR" \
    --metrics.source.git_branch "$METRICS_SOURCE_GIT_BRANCH" \
    --metrics.target.git_branch "$METRICS_TARGET_GIT_BRANCH" \
    -t "$TESTS" \
    --log.level INFO \
    --log.format JSON \
    $METRICS_THRESHOLDS \
    >out.txt 2>&1
CMP_RETURN_CODE=$?

# Show stdout and stderr in logs for debugging.
cat out.txt

if [ "$CMP_RETURN_CODE" -ne 0 ]; then
    # Collect the error lines and make them safe to embed in a JSON string:
    # escape backslashes first, then double quotes and tabs, and finally join
    # the lines with a literal "\n" (raw newlines are not allowed inside JSON
    # strings).
    CMP_ERROR_LINES=$(
        grep error out.txt |
            sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/\t/\\t/g' |
            awk 'BEGIN { ORS = "\\n" } { print }'
    )

    # Stop tracing commands so that the secret webhook URL is not echoed into
    # the build log.
    set +x

    # Post error to Slack channel.
    curl -H "Content-Type: application/json" \
        -X POST \
        --data "{\"text\": \"$BUILDKITE_PIPELINE_NAME for branch \`$METRICS_SOURCE_GIT_BRANCH\` failed. Visit $BUILDKITE_BUILD_URL for details.\", \"attachments\":[{\"title\":\"Relevant error lines\",\"text\":\"$CMP_ERROR_LINES\"}]}" \
        "$SLACK_WEBHOOK_URL"

    # Exit with non-zero exit code, so that the buildkite build will be
    # marked as failed.
    exit 1
fi
10 changes: 5 additions & 5 deletions .buildkite/scripts/test_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ ${test_runner_binary} \
${BUILDKITE:+--basedir ${TEST_BASE_DIR:-$PWD}/e2e} \
--basedir.no_cleanup \
--e2e.node.binary ${node_binary} \
--e2e.client.binary_dir ${WORKDIR}/target/default/debug \
--e2e.runtime.binary_dir ${WORKDIR}/target/${runtime_target}/debug \
--e2e.runtime.loader ${WORKDIR}/target/default/debug/oasis-core-runtime-loader \
--e2e.tee_hardware ${OASIS_TEE_HARDWARE:-""} \
--remote_signer.binary ${WORKDIR}/go/oasis-remote-signer/oasis-remote-signer \
--e2e/runtime.client.binary_dir ${WORKDIR}/target/default/debug \
--e2e/runtime.runtime.binary_dir ${WORKDIR}/target/${runtime_target}/debug \
--e2e/runtime.runtime.loader ${WORKDIR}/target/default/debug/oasis-core-runtime-loader \
--e2e/runtime.tee_hardware ${OASIS_TEE_HARDWARE:-""} \
--remote-signer.binary ${WORKDIR}/go/oasis-remote-signer/oasis-remote-signer \
matevz marked this conversation as resolved.
Show resolved Hide resolved
--log.level info \
${BUILDKITE_PARALLEL_JOB_COUNT:+--parallel.job_count ${BUILDKITE_PARALLEL_JOB_COUNT}} \
${BUILDKITE_PARALLEL_JOB:+--parallel.job_index ${BUILDKITE_PARALLEL_JOB}} \
Expand Down
10 changes: 10 additions & 0 deletions .changelog/2687.breaking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
`oasis-node`: Refactor `metrics` parameters

- `--metrics.push.job_name` renamed to `--metrics.job_name`.
- `--metrics.push.interval` renamed to `--metrics.interval`.
- `--metrics.push.instance_label` replaced with the more general
`--metrics.labels` map parameter, where `instance` is a required key if
metrics are enabled. For example, `--metrics.push.instance_label abc` now
becomes `--metrics.labels instance=abc`. Users can also set other
arbitrary Prometheus labels, for example
`--metrics.labels instance=abc,cpu=intel_i7-8750`.
11 changes: 11 additions & 0 deletions .changelog/2687.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
`oasis-node`: Add benchmarking utilities

- New Prometheus metrics for:
- datadir space usage,
- I/O (read/written bytes),
- memory usage (VMSize, RssAnon, RssFile, RssShmem),
- CPU (utime and stime),
- network interfaces (rx/tx bytes/packets).
- Bumps `prometheus/client_golang` to the latest version, which fixes sending
label values containing non-URL characters.
- Bumps `spf13/viper` which fixes `IsSet()` behavior.
40 changes: 40 additions & 0 deletions .changelog/2687.internal.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
`oasis-test-runner`: Add benchmarking utilities

- `oasis-test-runner` now accepts `--metrics.address` and `--metrics.interval`
parameters which are forwarded to `oasis-node` workers.
- `oasis-test-runner` now signals `oasis_up` metric to Prometheus when a test
starts and when it finishes.
- `--num_runs` parameter added which specifies how many times each test should
be run.
- `basic` E2E test was renamed to `runtime`.
- Scenario names now use the corresponding namespace, e.g. `halt-restore` is
now `e2e/runtime/halt-restore`.
- Scenario parameters are now exposed and settable via CLI by reimplementing
`scenario.Parameters()` and setting it with `--<test_name>.<param>=<val>`.
- Scenario parameters can also be generally set, for example
`--e2e.node.binary` will set `node.binary` parameter for all E2E tests and
`--e2e/runtime.node.binary` will set it for tests which inherit `runtime`.
- Multiple parameter values can be provided in the form
`--<test_name>.<param>=<val1>,<val2>,...`. In this case, `oasis-test-runner`
combines them with other parameters and generates unique parameter sets for
each test.
- Each scenario is run in a unique datadir per parameter set, of the form
`oasis-test-runnerXXXXXX/<test_name>/<run_id>`.
- Due to very long datadir paths for some E2E tests, custom internal gRPC
socket names are provided to `oasis-node`.
- If metrics are enabled, new labels are passed to oasis-nodes and pushed to
Prometheus for each test:
- `instance`,
- `run`,
- `test`,
- `software_version`,
- `git_branch`,
- whole test-specific parameter set.
- New `version.GitBranch` variable determined and set during compilation.
- Current parameter set, run number, and test name dumped to `test_info.json`
in corresponding datadir. This is useful when packing whole datadir for
external debugging.
- New `cmp` command for analyzing benchmark results has been added which
fetches the last two batches of benchmark results from Prometheus and
compares them. For more information, see `README.md` in
`go/oasis-test-runner` folder.
10 changes: 10 additions & 0 deletions .changelog/2687.internal.2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
`oasis-node`: Add custom internal socket path flag (for E2E tests only!)

`--debug.grpc.internal.socket_name` flag was added which forces `oasis-node`
to use the given path for the internal gRPC socket. This was necessary,
because some E2E test names became very lengthy and original datadir exceeded
the maximum unix socket path length. `oasis-test-runner` now generates
shorter socket names in `/tmp/oasis-test-runnerXXXXXX` directory and provides
them to `oasis-node`. **Due to security risks never ever use this flag in
production-like environments. Internal gRPC sockets should always reside in
node datadir!**
4 changes: 4 additions & 0 deletions .changelog/2687.internal.3.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ci: New benchmarks pipeline has been added

`benchmarks.pipeline.yml` runs all E2E tests and compares the benchmark
results with those from the previous batch using the new
`oasis-test-runner cmp` command.
4 changes: 3 additions & 1 deletion common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ ifeq ($(and $(LATEST_TAG),$(IS_TAG)),NO)
endif
export VERSION

GIT_BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
matevz marked this conversation as resolved.
Show resolved Hide resolved

# Try to compute the next version based on the latest tag of the origin remote
# using the Punch tool.
# First, all tags from the origin remote are fetched. Next, the latest tag on
Expand Down Expand Up @@ -90,7 +92,7 @@ GOFLAGS ?= -trimpath -v

# Add Oasis Core's version as a linker string value definition.
ifneq ($(VERSION),)
export GOLDFLAGS ?= "-X github.com/oasislabs/oasis-core/go/common/version.SoftwareVersion=$(VERSION)"
export GOLDFLAGS ?= "-X github.com/oasislabs/oasis-core/go/common/version.SoftwareVersion=$(VERSION) -X github.com/oasislabs/oasis-core/go/common/version.GitBranch=$(GIT_BRANCH)"
endif

# Go build command to use by default.
Expand Down
2 changes: 2 additions & 0 deletions go/common/grpc/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,8 @@ func (s *Server) Start() error {
)
return err
}
s.Logger.Info("gRPC server started", "network", cfg.network, "address", cfg.address)

s.startedListeners = append(s.startedListeners, ln)

go func() {
Expand Down
5 changes: 5 additions & 0 deletions go/common/version/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ var (
// by the linker.
SoftwareVersion = "0.0-unset"

// GitBranch is the name of the git branch of Oasis Core.
//
// This is mostly used for reporting and metrics.
GitBranch = ""

// RuntimeProtocol versions the protocol between the Oasis node(s) and
// the runtime.
//
Expand Down