# config.toml

[general]
# The socket address (IP and port) to listen on. An IP of `0.0.0.0` binds to
# all IPv4 interfaces.
listen = "0.0.0.0:4242"

# Controls whether gzip compression will be used for the HTTP endpoints. This
# can significantly reduce the payload size at the cost of some additional CPU
# time. This is highly recommended when exposing the histograms on the
# Prometheus endpoint.
compression = false

[log]
# Controls the log level. One of: "error", "warn", "info", "debug", "trace".
level = "info"

[prometheus]
# Controls whether the full distribution for each histogram is exposed via the
# Prometheus endpoint (`/metrics`). This adds a considerable number of time
# series depending on the downsampling factor, as each histogram bucket is
# represented as its own time series.
#
# NOTE: it's recommended to enable compression when enabling this.
histograms = false

# The histogram can be downsampled for exposition to reduce the number of
# buckets, and therefore reduce the number of time series needed to store the
# distribution.
#
# The grouping power must be in the range 2..=7 (2 through 7, inclusive). The
# native histograms are recorded with a grouping power of 7. Any reduction in
# the grouping power will increase the relative error, as the buckets are wider
# with lower grouping powers.
#
# By default, we reduce the grouping power to 4 to greatly reduce the number of
# time series but maintain an acceptable relative error for most uses.
#
# See https://docs.rs/histogram/ for more information about the grouping power.
#
# Power:        Error:      Buckets:
# 7             0.781%      7424
# 6             1.56%       3776
# 5             3.13%       1920
# 4             6.25%       976
# 3             12.5%       496
# 2             25.0%       252
histogram_grouping_power = 4

# The defaults are used for each sampler unless there's a sampler-level
# configuration present.

[defaults]
# Controls whether the samplers are enabled or not. Setting the default to
# true means that individual sampler configs can be used to opt out of
# collection for that sampler. Setting the default to false requires that
# individual sampler configs are used to opt in to collection.
enabled = true
# Controls whether the BPF implementation of a sampler will be used. When this
# is set to false, a sampler whose metrics can be collected without BPF will
# use that non-BPF implementation instead; a sampler with no non-BPF
# implementation will effectively be disabled.
bpf = true
# The collection interval for counter and gauge based metrics. Shorter intervals
# allow for more accurately capturing bursts in the related percentile metrics.
interval = "10ms"
# The collection interval for metrics that sample a distribution. Shorter
# intervals reduce the uncertainty of the exact period corresponding to the
# related percentile metrics.
distribution_interval = "50ms"

# Each sampler can then be individually configured to override the defaults. All
# of the configuration options in the `[defaults]` section are allowed.

# BPF sampler that instruments the block_io request queue to measure the
# latency and size distributions.
[samplers.block_io_latency]
enabled = true

# Instruments CPU frequency, instructions, and cycles using perf events, with
# a fallback to instrumenting frequency only via /proc/cpuinfo.
[samplers.cpu_perf]
enabled = true

# Instruments CPU usage by state, either with BPF or by reading /proc/stat on
# Linux. On macOS, host_processor_info() is used.
[samplers.cpu_usage]
enabled = true
# Explicitly sets the same value as the `bpf` key in `[defaults]`.
bpf = true

# Produces various NVIDIA-specific GPU metrics using NVML.
[samplers.gpu_nvidia]
enabled = true

# Produces memory utilization metrics from /proc/meminfo.
[samplers.memory_meminfo]
enabled = true

# Produces memory NUMA metrics from /proc/vmstat.
[samplers.memory_vmstat]
enabled = true

# Produces network interface statistics for TX/RX errors by reading
# /sys/class/net.
[samplers.network_interfaces]
enabled = true

# Produces network traffic statistics using BPF.
[samplers.network_traffic]
enabled = true

# Samples resource utilization for Rezolus itself.
[samplers.rezolus_rusage]
enabled = true

# BPF sampler that instruments scheduler events to measure runqueue latency,
# process running time, and context switch information.
[samplers.scheduler_runqueue]
enabled = true

# BPF sampler that instruments syscall entry and exit to gather syscall counts
# and latencies.
[samplers.syscall_latency]
enabled = true

# Instruments TCP connection states by reading /proc/net/tcp.
#
# Note: this sampler causes higher CPU utilization than our other samplers when
# running with short intervals. To reduce that cost, we override the interval
# so that this sampler runs once per second.
[samplers.tcp_connection_state]
enabled = true
# Overrides the `interval` set in `[defaults]` for this sampler only.
interval = "1s"

# BPF sampler that probes the TCP receive path to measure the latency from a
# packet being received until the application reads from the socket.
[samplers.tcp_packet_latency]
enabled = true

# BPF sampler that probes the TCP receive path to measure jitter and smoothed
# round trip time.
[samplers.tcp_receive]
enabled = true

# BPF sampler that probes the TCP retransmit path to measure retransmits.
[samplers.tcp_retransmit]
enabled = true

# Samples TCP traffic using either a BPF sampler or /proc/net/snmp to provide
# metrics for TX/RX bytes and packets.
[samplers.tcp_traffic]
enabled = true
# Explicitly sets the same value as the `bpf` key in `[defaults]`.
bpf = true