-
Notifications
You must be signed in to change notification settings - Fork 619
/
stats.py
285 lines (244 loc) 路 10.5 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import json
import platform
import subprocess
import threading
import time
from typing import Dict, List, Optional, Union
import psutil
import wandb
from wandb import util
from wandb.vendor.pynvml import pynvml
from . import ipu
from . import tpu
from .settings_static import SettingsStatic
from ..interface.interface_queue import InterfaceQueue
from ..lib import telemetry
# Opaque alias for the handle type returned by pynvml's nvmlDeviceGetHandleByIndex;
# we never inspect it, only pass it back into pynvml calls.
GPUHandle = object
# Maps a stat name to the list of raw samples collected since the last flush.
SamplerDict = Dict[str, List[float]]
# Payload shape published to the interface: scalar stats plus nested dicts
# (e.g. "network" -> {"sent": ..., "recv": ...}).
StatsDict = Dict[str, Union[float, Dict[str, float]]]
# TODO: hard coded max watts as 16.5, found this number in the SMC list.
# Eventually we can have the apple_gpu_stats binary query for this.
M1_MAX_POWER_WATTS = 16.5
def gpu_in_use_by_this_process(gpu_handle: GPUHandle, pid: int) -> bool:
    """Return True if the process tree rooted at ``pid`` is using this GPU.

    A GPU counts as "in use" when any PID from the monitored process (or one
    of its descendants) currently holds a compute or graphics context on the
    device identified by ``gpu_handle``.
    """
    if not psutil:
        # Without psutil we cannot enumerate the process tree.
        return False

    try:
        root = psutil.Process(pid=pid)
    except psutil.NoSuchProcess:
        # do not report any gpu metrics if the base process cant be found
        return False

    # PIDs of the monitored process and every descendant.
    tracked_pids = {child.pid for child in root.children(recursive=True)}
    tracked_pids.add(root.pid)

    # PIDs that currently hold a compute or graphics context on this device.
    device_pids = set()
    for query in (
        pynvml.nvmlDeviceGetComputeRunningProcesses,
        pynvml.nvmlDeviceGetGraphicsRunningProcesses,
    ):
        device_pids.update(proc.pid for proc in query(gpu_handle))

    return bool(device_pids & tracked_pids)
class SystemStats:
    """Collects system metrics on a background thread and publishes them.

    Every ``sample_rate_seconds`` a snapshot of CPU, memory, disk, network,
    GPU (NVIDIA via NVML, or Apple M1 via an external binary), TPU, and IPU
    stats is taken; after ``samples_to_average`` snapshots the scalar values
    are averaged and pushed through the interface queue.
    """

    # pid of the process whose resource usage is attributed (from settings).
    _pid: int
    # Queue used to publish averaged stats and telemetry records.
    _interface: InterfaceQueue
    # Raw per-stat samples accumulated since the last flush.
    sampler: SamplerDict
    # Number of snapshots taken since the last flush.
    samples: int
    _settings: SettingsStatic
    # Background sampling thread; None until start() creates it.
    _thread: Optional[threading.Thread]
    # Number of NVIDIA GPUs visible to NVML (0 if NVML init failed).
    gpu_count: int

    def __init__(self, settings: SettingsStatic, interface: InterfaceQueue) -> None:
        """Probe available accelerators and record baseline network counters.

        Sampling does not begin until ``start()`` is called.
        """
        try:
            pynvml.nvmlInit()
            self.gpu_count = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            # No NVIDIA driver / no GPUs present: report zero rather than fail.
            self.gpu_count = 0
        # self.run = run
        self._settings = settings
        self._pid = settings._stats_pid
        self._interface = interface
        self.sampler = {}
        self.samples = 0
        self._shutdown: bool = False
        self._telem = telemetry.TelemetryRecord()
        if psutil:
            net = psutil.net_io_counters()
            # Baseline counters so reported traffic is relative to run start.
            self.network_init = {"sent": net.bytes_sent, "recv": net.bytes_recv}
        else:
            wandb.termlog(
                "psutil not installed, only GPU stats will be reported. Install with pip install psutil"
            )
        self._thread = None
        # TPU/IPU profilers are optional; failures to initialize are reported
        # to the user but never abort stats collection.
        self._tpu_profiler = None
        if tpu.is_tpu_available():
            try:
                self._tpu_profiler = tpu.get_profiler()
            except Exception as e:
                wandb.termlog("Error initializing TPUProfiler: " + str(e))
        self._ipu_profiler = None
        if ipu.is_ipu_available():
            try:
                self._ipu_profiler = ipu.IPUProfiler(self._pid)
            except Exception as e:
                wandb.termlog("Error initializing IPUProfiler: " + str(e))

    def start(self) -> None:
        """Start the daemon sampling thread (idempotent) and the TPU profiler."""
        if self._thread is None:
            self._shutdown = False
            self._thread = threading.Thread(target=self._thread_body)
            self._thread.name = "StatsThr"
            self._thread.daemon = True
        if not self._thread.is_alive():
            self._thread.start()
        if self._tpu_profiler:
            self._tpu_profiler.start()

    @property
    def proc(self) -> psutil.Process:
        # Fresh handle each access; the monitored process may have restarted.
        return psutil.Process(pid=self._pid)

    @property
    def sample_rate_seconds(self) -> float:
        """Sample system stats every this many seconds, defaults to 2, min is 0.5"""
        sample_rate = self._settings._stats_sample_rate_seconds
        # TODO: handle self._api.dynamic_settings["system_sample_seconds"]
        return max(0.5, sample_rate)

    @property
    def samples_to_average(self) -> int:
        """The number of samples to average before pushing, defaults to 15 valid range (2:30)"""
        samples = self._settings._stats_samples_to_average
        # TODO: handle self._api.dynamic_settings["system_samples"]
        return min(30, max(2, samples))

    def _thread_body(self) -> None:
        """Sampling loop: snapshot, accumulate, flush, sleep, repeat.

        Runs until ``_shutdown`` is set; the sleep is chunked into 0.1s steps
        so shutdown is noticed promptly rather than after a full sample period.
        """
        while True:
            stats = self.stats()
            # Only scalar stats are accumulated for averaging; nested dicts
            # (e.g. "network") are handled at flush time.
            for stat, value in stats.items():
                if isinstance(value, (int, float)):
                    self.sampler[stat] = self.sampler.get(stat, [])
                    self.sampler[stat].append(value)
            self.samples += 1
            if self._shutdown or self.samples >= self.samples_to_average:
                self.flush()
                if self._shutdown:
                    break
            seconds = 0.0
            while seconds < self.sample_rate_seconds:
                time.sleep(0.1)
                seconds += 0.1
                if self._shutdown:
                    # Push whatever was accumulated before exiting.
                    self.flush()  # type: ignore
                    return

    def shutdown(self) -> None:
        """Signal the sampling thread to stop and wait for it to exit."""
        self._shutdown = True
        try:
            if self._thread is not None:
                self._thread.join()
        # Incase we never start it
        finally:
            self._thread = None
        if self._tpu_profiler:
            self._tpu_profiler.stop()

    def flush(self) -> None:
        """Average accumulated scalar samples and publish one stats record."""
        stats = self.stats()
        for stat, value in stats.items():
            # TODO: a bit hacky, we assume all numbers should be averaged. If you want
            # max for a stat, you must put it in a sub key, like ["network"]["sent"]
            if isinstance(value, (float, int)):
                # samples = list(self.sampler.get(stat, [stats[stat]]))
                samples = list(self.sampler.get(stat, [value]))
                stats[stat] = round(sum(samples) / len(samples), 2)
        # self.run.events.track("system", stats, _wandb=True)
        if self._interface:
            self._interface.publish_stats(stats)
        # Reset the accumulation window.
        self.samples = 0
        self.sampler = {}

    def stats(self) -> StatsDict:
        """Take one snapshot of all available system metrics.

        Returns a flat dict of scalars (plus a nested "network" dict). GPU
        stats are keyed "gpu.<idx>.<name>", with a parallel
        "gpu.process.<idx>.<name>" set when the monitored process tree is
        actively using that GPU. Metrics that cannot be read are silently
        omitted rather than raising.
        """
        stats: StatsDict = {}
        for i in range(0, self.gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                utilz = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU
                )
                in_use_by_us = gpu_in_use_by_this_process(handle, pid=self._pid)

                stats["gpu.{}.{}".format(i, "gpu")] = utilz.gpu
                stats["gpu.{}.{}".format(i, "memory")] = utilz.memory
                stats["gpu.{}.{}".format(i, "memoryAllocated")] = (
                    memory.used / float(memory.total)
                ) * 100
                stats["gpu.{}.{}".format(i, "temp")] = temp

                if in_use_by_us:
                    # Duplicate under "gpu.process.*" to attribute usage to us.
                    stats["gpu.process.{}.{}".format(i, "gpu")] = utilz.gpu
                    stats["gpu.process.{}.{}".format(i, "memory")] = utilz.memory
                    stats["gpu.process.{}.{}".format(i, "memoryAllocated")] = (
                        memory.used / float(memory.total)
                    ) * 100
                    stats["gpu.process.{}.{}".format(i, "temp")] = temp

                # Some GPUs don't provide information about power usage
                try:
                    # NVML reports milliwatts; convert to watts.
                    power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
                    power_capacity_watts = (
                        pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0
                    )
                    power_usage = (power_watts / power_capacity_watts) * 100

                    stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts
                    stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage

                    if in_use_by_us:
                        stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts
                        stats[
                            "gpu.process.{}.{}".format(i, "powerPercent")
                        ] = power_usage
                except pynvml.NVMLError:
                    pass
            except pynvml.NVMLError:
                pass

        # On Apple M1 systems let's look for the gpu
        if (
            platform.system() == "Darwin"
            and platform.processor() == "arm"
            and self.gpu_count == 0
        ):
            try:
                # External helper binary prints one JSON line of GPU stats.
                out = subprocess.check_output([util.apple_gpu_stats_binary(), "--json"])
                m1_stats = json.loads(out.split(b"\n")[0])
                stats["gpu.0.gpu"] = m1_stats["utilization"]
                stats["gpu.0.memoryAllocated"] = m1_stats["mem_used"]
                stats["gpu.0.temp"] = m1_stats["temperature"]
                stats["gpu.0.powerWatts"] = m1_stats["power"]
                stats["gpu.0.powerPercent"] = (
                    m1_stats["power"] / M1_MAX_POWER_WATTS
                ) * 100
                # TODO: this stat could be useful eventually, it was consistently
                # 0 in my experimentation and requires a frontend change
                # so leaving it out for now.
                # stats["gpu.0.cpuWaitMs"] = m1_stats["cpu_wait_ms"]

                # Record M1-GPU telemetry once per run.
                if self._interface and not self._telem.env.m1_gpu:
                    self._telem.env.m1_gpu = True
                    self._interface._publish_telemetry(self._telem)

            except (OSError, ValueError, TypeError, subprocess.CalledProcessError) as e:
                wandb.termwarn(f"GPU stats error {e}")
                pass

        if psutil:
            net = psutil.net_io_counters()
            sysmem = psutil.virtual_memory()
            stats["cpu"] = psutil.cpu_percent()
            stats["memory"] = sysmem.percent
            # Traffic since __init__, not absolute OS counters.
            stats["network"] = {
                "sent": net.bytes_sent - self.network_init["sent"],
                "recv": net.bytes_recv - self.network_init["recv"],
            }
            # TODO: maybe show other partitions, will likely need user to configure
            stats["disk"] = psutil.disk_usage("/").percent
            stats["proc.memory.availableMB"] = sysmem.available / 1048576.0
            try:
                stats["proc.memory.rssMB"] = self.proc.memory_info().rss / 1048576.0
                stats["proc.memory.percent"] = self.proc.memory_percent()
                stats["proc.cpu.threads"] = self.proc.num_threads()
            except psutil.NoSuchProcess:
                # Monitored process exited; skip per-process stats this round.
                pass
        if self._tpu_profiler:
            tpu_utilization = self._tpu_profiler.get_tpu_utilization()
            if tpu_utilization is not None:
                stats["tpu"] = tpu_utilization
        if self._ipu_profiler:
            stats.update(self._ipu_profiler.get_metrics())
        return stats