-
Notifications
You must be signed in to change notification settings - Fork 8
/
config.toml
147 lines (124 loc) · 4.96 KB
/
config.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
[general]
# Socket address (`ip:port`) to listen on. `0.0.0.0` is the IPv4 wildcard
# address, so this listens on all interfaces.
listen = "0.0.0.0:4242"
# Controls whether gzip compression will be used for the HTTP endpoints. This
# can significantly reduce the payload size at the cost of some additional CPU
# time. This is highly recommended when exposing the histograms on the
# prometheus endpoint (see `histograms` in the `[prometheus]` section).
compression = false
[log]
# Controls the log level. One of: "error", "warn", "info", "debug", "trace".
level = "info"
[prometheus]
# Controls whether the full distribution for each histogram is exposed via the
# prometheus endpoint (`/metrics`). This adds a considerable number of time
# series depending on the downsampling factor, as each histogram bucket is
# represented as its own time series.
#
# NOTE: it's recommended to enable `compression` in the `[general]` section
# when enabling this.
histograms = false
# The histogram can be downsampled for exposition to reduce the number of
# buckets, and therefore reduce the number of time series needed to store the
# distribution.
#
# The grouping power must be in the range 2..=7. The native histograms are
# recorded with a grouping power of 7. Any reduction in the grouping power will
# increase the relative error, as the buckets are wider with lower grouping
# powers.
#
# By default, we reduce the grouping power to 4 to greatly reduce the number of
# time series but maintain an acceptable relative error for most uses.
#
# See https://docs.rs/histogram/ for more information about the grouping power.
#
# Power:  Error:  Buckets:
# 7       0.781%  7424
# 6       1.56%   3776
# 5       3.13%   1920
# 4       6.25%   976
# 3       12.5%   496
# 2       25.0%   252
histogram_grouping_power = 4
# The defaults are used for each sampler unless a sampler-level configuration
# is present.
[defaults]
# Controls whether the samplers are enabled or not. Setting the default to
# true means that individual sampler configs can be used to opt out of
# collection for that sampler. Setting the default to false requires that
# individual sampler configs are used to opt in to collection.
enabled = true
# Controls whether BPF samplers will be used. When this is false: if a metric
# can be collected without BPF, the non-BPF sampler will be used instead;
# otherwise, the sampler will effectively be disabled.
bpf = true
# The collection interval for counter and gauge based metrics. Shorter intervals
# allow for more accurately capturing bursts in the related percentile metrics.
# Durations are written as strings with a unit suffix, e.g. "10ms" or "1s".
interval = "10ms"
# The collection interval for metrics that sample a distribution. Shorter
# intervals reduce the uncertainty of the exact period corresponding to the
# related percentile metrics.
distribution_interval = "50ms"
# Each sampler can be individually configured in its own `[samplers.<name>]`
# table to override the defaults. All of the configuration options in the
# `[defaults]` section are allowed.
# BPF sampler that instruments the block I/O request queue to measure the
# latency and size distributions.
[samplers.block_io_latency]
enabled = true
# Instruments CPU frequency, instructions, and cycles using perf events, with a
# fallback to instrumenting frequency only via /proc/cpuinfo.
[samplers.cpu_perf]
enabled = true
# Instruments CPU usage by state. On Linux this uses BPF or reads /proc/stat;
# on macOS, host_processor_info() is used.
[samplers.cpu_usage]
enabled = true
# Use the BPF implementation (same value as `[defaults]`). Per the description
# above, setting this to false should select the /proc/stat implementation.
bpf = true
# Produces various NVIDIA-specific GPU metrics using NVML.
[samplers.gpu_nvidia]
enabled = true
# Produces memory utilization metrics from /proc/meminfo.
[samplers.memory_meminfo]
enabled = true
# Produces memory NUMA metrics from /proc/vmstat.
[samplers.memory_vmstat]
enabled = true
# Produces network interface statistics for TX/RX errors from /sys/class/net.
[samplers.network_interfaces]
enabled = true
# Produces network traffic statistics using BPF.
[samplers.network_traffic]
enabled = true
# Samples resource utilization for Rezolus itself.
[samplers.rezolus_rusage]
enabled = true
# BPF sampler that instruments scheduler events to measure runqueue latency,
# process running time, and context switch information.
[samplers.scheduler_runqueue]
enabled = true
# BPF sampler that instruments syscall entry and exit to gather syscall counts
# and latencies.
[samplers.syscall_latency]
enabled = true
# Instruments TCP connection states by reading /proc/net/tcp.
#
# Note: this sampler causes higher CPU utilization than our other samplers when
# running with short intervals. To reduce that cost, we override the `interval`
# from `[defaults]` so that this sampler runs once per second.
[samplers.tcp_connection_state]
enabled = true
interval = "1s"
# BPF sampler that probes the TCP receive path to measure the latency from a
# packet being received until the application reads from the socket.
[samplers.tcp_packet_latency]
enabled = true
# BPF sampler that probes the TCP receive path to measure jitter and smoothed
# round-trip time.
[samplers.tcp_receive]
enabled = true
# BPF sampler that probes the TCP retransmit path to measure retransmits.
[samplers.tcp_retransmit]
enabled = true
# Samples TCP traffic using either a BPF sampler or /proc/net/snmp to provide
# metrics for TX/RX bytes and packets.
[samplers.tcp_traffic]
enabled = true
# Use the BPF implementation (same value as `[defaults]`). Per the description
# above, setting this to false should select the /proc/net/snmp implementation.
bpf = true