Skip to content

Commit 33fdffa

Browse files
committed
fix(gpu): fix accelerator registry map
1 parent b0f6cdc commit 33fdffa

File tree

4 files changed

+36
-7
lines changed

4 files changed

+36
-7
lines changed

pkg/config/config.go

+3
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,9 @@ func logBoolConfigs() {
287287
klog.V(5).Infof("EXPOSE_ESTIMATED_IDLE_POWER_METRICS: %t. This only impacts when the power is estimated using pre-prained models. Estimated idle power is meaningful only when Kepler is running on bare-metal or with a single virtual machine (VM) on the node.", instance.Kepler.ExposeIdlePowerMetrics)
288288
klog.V(5).Infof("EXPERIMENTAL_BPF_SAMPLE_RATE: %d", instance.Kepler.BPFSampleRate)
289289
klog.V(5).Infof("EXCLUDE_SWAPPER_PROCESS: %t", instance.Kepler.ExcludeSwapperProcess)
290+
if instance.Kepler.EnabledGPU {
291+
klog.V(5).Infof("DCGMHostEngineEndpoint %s", instance.DCGMHostEngineEndpoint)
292+
}
290293
}
291294
}
292295

pkg/sensors/accelerator/accelerator.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package accelerator
1515

1616
//nolint:gci // The supported device imports are kept separate.
1717
import (
18+
"encoding/json"
1819
"slices"
1920
"sync"
2021
"time"
@@ -130,7 +131,10 @@ func New(atype string, sleep bool) (Accelerator, error) {
130131

131132
// Init the available devices.
132133

133-
devs := devices.GetRegistry().GetAllDeviceTypes()
134+
r := devices.GetRegistry()
135+
j, _ := json.Marshal(r.GetAllDevices())
136+
klog.V(5).Infof("Accelerator Registry AllDevices: %s", string(j))
137+
devs := r.GetAllDeviceTypes()
134138
numDevs := len(devs)
135139
if numDevs == 0 || !slices.Contains(devs, atype) {
136140
return nil, errors.New("no devices found")

pkg/sensors/accelerator/devices/dcgm.go

+9-4
Original file line numberDiff line numberDiff line change
@@ -76,21 +76,22 @@ func dcgmCheck(r *Registry) {
7676
}
7777

7878
func dcgmDeviceStartup() Device {
79-
a := dcgmAccImpl
79+
klog.V(3).Infof("Attempting to startup DCGM")
80+
d := dcgmAccImpl
8081

81-
if err := a.InitLib(); err != nil {
82+
if err := d.InitLib(); err != nil {
8283
klog.Errorf("Error initializing %s: %v", dcgmType.String(), err)
8384
return nil
8485
}
8586

86-
if err := a.Init(); err != nil {
87+
if err := d.Init(); err != nil {
8788
klog.Errorf("failed to StartupDevice: %v", err)
8889
return nil
8990
}
9091

9192
klog.Infof("Using %s to obtain gpu power", dcgmType.String())
9293

93-
return &a
94+
return &d
9495
}
9596

9697
func (d *gpuDcgm) Init() error {
@@ -138,6 +139,7 @@ func (d *gpuDcgm) InitLib() (err error) {
138139
if err != nil {
139140
klog.Infof("There is no DCGM daemon running in the host: %s", err)
140141
// embedded mode is not recommended for production per https://github.com/NVIDIA/dcgm-exporter/issues/22#issuecomment-1321521995
142+
klog.Info("Attempting to inilialize dcgm in Embedded mode.")
141143
cleanup, err = dcgm.Init(dcgm.Embedded)
142144
if err != nil {
143145
klog.Errorf("Could not start DCGM. Error: %s", err)
@@ -147,6 +149,8 @@ func (d *gpuDcgm) InitLib() (err error) {
147149
return fmt.Errorf("not able to connect to DCGM: %s", err)
148150
}
149151
klog.Info("Started DCGM in the Embedded mode ")
152+
} else {
153+
klog.Info("Started DCGM in the Standalone mode ")
150154
}
151155
d.nvmlInited = false
152156
d.devs = make(map[int]GPUDevice)
@@ -172,6 +176,7 @@ func (d *gpuDcgm) InitLib() (err error) {
172176
}
173177

174178
func (d *gpuDcgm) loadDevices() error {
179+
klog.V(5).Infof("Attempting to load dcgm devices.")
175180
d.devs = map[int]GPUDevice{}
176181
count, err := nvml.DeviceGetCount()
177182
if err != nvml.SUCCESS {

pkg/sensors/accelerator/devices/device.go

+19-2
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,13 @@ func (r *Registry) MustRegister(a string, d DeviceType, deviceStartup deviceStar
121121
return
122122
}
123123
klog.V(5).Infof("Adding the device to the registry [%s][%s]", a, d.String())
124-
r.Registry[a] = map[DeviceType]deviceStartupFunc{
125-
d: deviceStartup,
124+
m, ok := r.Registry[a]
125+
if !ok {
126+
r.Registry[a] = map[DeviceType]deviceStartupFunc{
127+
d: deviceStartup,
128+
}
129+
} else {
130+
m[d] = deviceStartup
126131
}
127132
}
128133

@@ -143,6 +148,18 @@ func (r *Registry) GetAllDeviceTypes() []string {
143148
return devices
144149
}
145150

151+
func (r *Registry) GetAllDevices() map[string]map[string]interface{} {
152+
all := map[string]map[string]interface{}{}
153+
for t, m := range r.Registry {
154+
devices := map[string]interface{}{}
155+
for d := range m {
156+
devices[d.String()] = struct{}{}
157+
}
158+
all[t] = devices
159+
}
160+
return all
161+
}
162+
146163
func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, deviceStartup deviceStartupFunc) error {
147164
switch accType {
148165
case config.GPU:

0 commit comments

Comments
 (0)