Skip to content

Commit 7489f4f

Browse files
k0ste and Informatic authored
Add support for device types and predictable device paths (rebased) (#257)
* Add better error logging on smartctl exec failure

  We will now log a warning if smartctl path passed via command line is invalid.

  Signed-off-by: Piotr Dobrowolski <[email protected]>
  (cherry picked from commit 1c9c694)

* Add support for autoscan device types and predictable device paths

  This adds a new command line option allowing for customization of autodetected device types and enables use of special "by-id" device type that forces use of predictable device paths (/dev/disk/by-id/...)

  Relevant change to device name parsing regular expression is included now, so predictable device paths are now also usable when directly specified.

  Signed-off-by: Piotr Dobrowolski <[email protected]>
  (cherry picked from commit 4c5f721)

  Conflicts:
  - file: 'readjson.go'
    comment: 'manually resolve new logger issues'

* Rework device label, fix SATA discovery, per-device type specification

  Signed-off-by: Piotr Dobrowolski <[email protected]>
  (cherry picked from commit 319184c)

  Conflicts:
  - file: 'main.go'
    comment: 'manually resolve new logger issues'
  - file: 'readjson.go'
    comment: 'manually resolve new logger issues'

---------

Co-authored-by: Piotr Dobrowolski <[email protected]>
1 parent 703f9c8 commit 7489f4f

File tree

4 files changed

+130
-59
lines changed

4 files changed

+130
-59
lines changed

main.go

+54-25
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@ import (
3434

3535
// Device
3636
type Device struct {
37-
Name string `json:"name"`
38-
Info_Name string `json:"info_name"`
39-
Type string `json:"type"`
37+
Name string
38+
Type string
39+
Label string
40+
}
41+
42+
func (d Device) String() string {
43+
return d.Name + ";" + d.Type + " (" + d.Label + ")"
4044
}
4145

4246
// SMARTctlManagerCollector implements the Collector interface.
@@ -80,6 +84,7 @@ func (i *SMARTctlManagerCollector) RescanForDevices() {
8084
time.Sleep(*smartctlRescanInterval)
8185
i.logger.Info("Rescanning for devices")
8286
devices := scanDevices(i.logger)
87+
devices = buildDevicesFromFlag(devices)
8388
i.mutex.Lock()
8489
i.Devices = devices
8590
i.mutex.Unlock()
@@ -96,8 +101,9 @@ var (
96101
smartctlRescanInterval = kingpin.Flag("smartctl.rescan",
97102
"The interval between rescanning for new/disappeared devices. If the interval is smaller than 1s no rescanning takes place. If any devices are configured with smartctl.device also no rescanning takes place.",
98103
).Default("10m").Duration()
104+
smartctlScan = kingpin.Flag("smartctl.scan", "Enable scanning. This is a default if no devices are specified").Default("false").Bool()
99105
smartctlDevices = kingpin.Flag("smartctl.device",
100-
"The device to monitor (repeatable)",
106+
"The device to monitor. Device type can be specified after a semicolon, eg. '/dev/bus/0;megaraid,1' (repeatable)",
101107
).Strings()
102108
smartctlDeviceExclude = kingpin.Flag(
103109
"smartctl.device-exclude",
@@ -107,6 +113,10 @@ var (
107113
"smartctl.device-include",
108114
"Regexp of devices to exclude from automatic scanning. (mutually exclusive to device-exclude)",
109115
).Default("").String()
116+
smartctlScanDeviceTypes = kingpin.Flag(
117+
"smartctl.scan-device-type",
118+
"Device type to use during automatic scan. Special by-id value forces predictable device names. (repeatable)",
119+
).Strings()
110120
smartctlFakeData = kingpin.Flag("smartctl.fake-data",
111121
"The device to monitor (repeatable)",
112122
).Default("false").Hidden().Bool()
@@ -120,34 +130,45 @@ func scanDevices(logger *slog.Logger) []Device {
120130
scanDevices := json.Get("devices").Array()
121131
var scanDeviceResult []Device
122132
for _, d := range scanDevices {
123-
deviceName := extractDiskName(strings.TrimSpace(d.Get("info_name").String()))
124-
if filter.ignored(deviceName) {
125-
logger.Info("Ignoring device", "name", deviceName)
133+
deviceName := d.Get("name").String()
134+
deviceType := d.Get("type").String()
135+
136+
// SATA devices are reported as SCSI during scan - fallback to auto scraping
137+
if deviceType == "scsi" {
138+
deviceType = "auto"
139+
}
140+
141+
deviceLabel := buildDeviceLabel(deviceName, deviceType)
142+
if filter.ignored(deviceLabel) {
143+
logger.Info("Ignoring device", "name", deviceLabel)
126144
} else {
127-
logger.Info("Found device", "name", deviceName)
145+
logger.Info("Found device", "name", deviceLabel)
128146
device := Device{
129-
Name: d.Get("name").String(),
130-
Info_Name: deviceName,
131-
Type: d.Get("type").String(),
147+
Name: deviceName,
148+
Type: deviceType,
149+
Label: deviceLabel,
132150
}
133151
scanDeviceResult = append(scanDeviceResult, device)
134152
}
135153
}
136154
return scanDeviceResult
137155
}
138156

139-
func filterDevices(logger *slog.Logger, devices []Device, filters []string) []Device {
140-
var filtered []Device
141-
for _, d := range devices {
142-
for _, filter := range filters {
143-
logger.Debug("filterDevices", "device", d.Info_Name, "filter", filter)
144-
if strings.Contains(d.Info_Name, filter) {
145-
filtered = append(filtered, d)
146-
break
147-
}
157+
func buildDevicesFromFlag(devices []Device) []Device {
158+
// TODO: deduplication?
159+
for _, device := range *smartctlDevices {
160+
deviceName, deviceType, _ := strings.Cut(device, ";")
161+
if deviceType == "" {
162+
deviceType = "auto"
148163
}
164+
165+
devices = append(devices, Device{
166+
Name: deviceName,
167+
Type: deviceType,
168+
Label: buildDeviceLabel(deviceName, deviceType),
169+
})
149170
}
150-
return filtered
171+
return devices
151172
}
152173

153174
func main() {
@@ -167,11 +188,19 @@ func main() {
167188
logger.Info("Build context", "build_context", version.BuildContext())
168189

169190
var devices []Device
170-
devices = scanDevices(logger)
171-
logger.Info("Number of devices found", "count", len(devices))
191+
192+
if len(*smartctlDevices) == 0 {
193+
*smartctlScan = true
194+
}
195+
196+
if *smartctlScan {
197+
devices = scanDevices(logger)
198+
logger.Info("Number of devices found", "count", len(devices))
199+
}
200+
172201
if len(*smartctlDevices) > 0 {
173202
logger.Info("Devices specified", "devices", strings.Join(*smartctlDevices, ", "))
174-
devices = filterDevices(logger, devices, *smartctlDevices)
203+
devices = buildDevicesFromFlag(devices)
175204
logger.Info("Devices filtered", "count", len(devices))
176205
}
177206

@@ -180,7 +209,7 @@ func main() {
180209
logger: logger,
181210
}
182211

183-
if *smartctlRescanInterval >= 1*time.Second {
212+
if *smartctlScan && *smartctlRescanInterval >= 1*time.Second {
184213
logger.Info("Start background scan process")
185214
logger.Info("Rescanning for devices every", "rescanInterval", *smartctlRescanInterval)
186215
go collector.RescanForDevices()

readjson.go

+23-13
Original file line numberDiff line numberDiff line change
@@ -63,29 +63,39 @@ func readFakeSMARTctl(logger *slog.Logger, device Device) gjson.Result {
6363
// Get json from smartctl and parse it
6464
func readSMARTctl(logger *slog.Logger, device Device) (gjson.Result, bool) {
6565
start := time.Now()
66-
out, err := exec.Command(*smartctlPath, "--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device="+device.Type, device.Name).Output()
66+
var smartctlArgs = []string{"--json", "--info", "--health", "--attributes", "--tolerance=verypermissive", "--nocheck=standby", "--format=brief", "--log=error", "--device=" + device.Type, device.Name}
67+
68+
logger.Debug("Calling smartctl with args", "args", strings.Join(smartctlArgs, " "))
69+
out, err := exec.Command(*smartctlPath, smartctlArgs...).Output()
6770
if err != nil {
68-
logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device.Info_Name)
71+
logger.Warn("S.M.A.R.T. output reading", "err", err, "device", device)
6972
}
7073
// Accommodate a smartmontools pre-7.3 bug
7174
cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:")
7275
json := parseJSON(cleaned_out)
7376
rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int())
7477
jsonOk := jsonIsOk(logger, json)
75-
logger.Debug("Collected S.M.A.R.T. json data", "device", device.Info_Name, "duration", time.Since(start))
78+
logger.Debug("Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start))
7679
return json, rcOk && jsonOk
7780
}
7881

7982
func readSMARTctlDevices(logger *slog.Logger) gjson.Result {
8083
logger.Debug("Scanning for devices")
81-
out, err := exec.Command(*smartctlPath, "--json", "--scan").Output()
84+
var scanArgs []string = []string{"--json", "--scan"}
85+
for _, d := range *smartctlScanDeviceTypes {
86+
scanArgs = append(scanArgs, "--device", d)
87+
}
88+
out, err := exec.Command(*smartctlPath, scanArgs...).Output()
8289
if exiterr, ok := err.(*exec.ExitError); ok {
8390
logger.Debug("Exit Status", "exit_code", exiterr.ExitCode())
8491
// The smartctl command returns 2 if devices are sleeping, ignore this error.
8592
if exiterr.ExitCode() != 2 {
8693
logger.Warn("S.M.A.R.T. output reading error", "err", err)
8794
return gjson.Result{}
8895
}
96+
} else if err != nil {
97+
logger.Warn("S.M.A.R.T. output reading error", "err", err)
98+
return gjson.Result{}
8999
}
90100
return parseJSON(string(out))
91101
}
@@ -103,7 +113,7 @@ func readData(logger *slog.Logger, device Device) gjson.Result {
103113
jsonCache.Store(device, JSONCache{JSON: json, LastCollect: time.Now()})
104114
j, found := jsonCache.Load(device)
105115
if !found {
106-
logger.Warn("device not found", "device", device.Info_Name)
116+
logger.Warn("device not found", "device", device)
107117
}
108118
return j.(JSONCache).JSON
109119
}
@@ -118,30 +128,30 @@ func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bo
118128
if SMARTCtlResult > 0 {
119129
b := SMARTCtlResult
120130
if (b & 1) != 0 {
121-
logger.Error("Command line did not parse", "device", device.Info_Name)
131+
logger.Error("Command line did not parse", "device", device)
122132
result = false
123133
}
124134
if (b & (1 << 1)) != 0 {
125-
logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device.Info_Name)
135+
logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device)
126136
result = false
127137
}
128138
if (b & (1 << 2)) != 0 {
129-
logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device.Info_Name)
139+
logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device)
130140
}
131141
if (b & (1 << 3)) != 0 {
132-
logger.Warn("SMART status check returned 'DISK FAILING'", "device", device.Info_Name)
142+
logger.Warn("SMART status check returned 'DISK FAILING'", "device", device)
133143
}
134144
if (b & (1 << 4)) != 0 {
135-
logger.Warn("We found prefail Attributes <= threshold", "device", device.Info_Name)
145+
logger.Warn("We found prefail Attributes <= threshold", "device", device)
136146
}
137147
if (b & (1 << 5)) != 0 {
138-
logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device.Info_Name)
148+
logger.Warn("SMART status check returned 'DISK OK' but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past", "device", device)
139149
}
140150
if (b & (1 << 6)) != 0 {
141-
logger.Warn("The device error log contains records of errors", "device", device.Info_Name)
151+
logger.Warn("The device error log contains records of errors", "device", device)
142152
}
143153
if (b & (1 << 7)) != 0 {
144-
logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device.Info_Name)
154+
logger.Warn("The device self-test log contains records of errors. [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored", "device", device)
145155
}
146156
}
147157
return result

smartctl.go

+9-21
Original file line numberDiff line numberDiff line change
@@ -42,28 +42,16 @@ type SMARTctl struct {
4242
device SMARTDevice
4343
}
4444

45-
func extractDiskName(input string) string {
46-
re := regexp.MustCompile(`^(?:/dev/(?P<bus_name>\S+)/(?P<bus_num>\S+)\s\[|/dev/|\[)(?:\s\[|)(?P<disk>[a-z0-9_]+)(?:\].*|)$`)
47-
match := re.FindStringSubmatch(input)
48-
49-
if len(match) > 0 {
50-
busNameIndex := re.SubexpIndex("bus_name")
51-
busNumIndex := re.SubexpIndex("bus_num")
52-
diskIndex := re.SubexpIndex("disk")
53-
var name []string
54-
if busNameIndex != -1 && match[busNameIndex] != "" {
55-
name = append(name, match[busNameIndex])
56-
}
57-
if busNumIndex != -1 && match[busNumIndex] != "" {
58-
name = append(name, match[busNumIndex])
59-
}
60-
if diskIndex != -1 && match[diskIndex] != "" {
61-
name = append(name, match[diskIndex])
62-
}
45+
func buildDeviceLabel(inputName string, inputType string) string {
46+
// Strip /dev prefix and replace / with _ (/dev/bus/0 becomes bus_0, /dev/disk/by-id/abcd becomes abcd)
47+
devReg := regexp.MustCompile(`^/dev/(?:disk/by-id/|disk/by-path/|)`)
48+
deviceName := strings.ReplaceAll(devReg.ReplaceAllString(inputName, ""), "/", "_")
6349

64-
return strings.Join(name, "_")
50+
if strings.Contains(inputType, ",") {
51+
return deviceName + "_" + strings.ReplaceAll(inputType, ",", "_")
6552
}
66-
return ""
53+
54+
return deviceName
6755
}
6856

6957
// NewSMARTctl is smartctl constructor
@@ -84,7 +72,7 @@ func NewSMARTctl(logger *slog.Logger, json gjson.Result, ch chan<- prometheus.Me
8472
json: json,
8573
logger: logger,
8674
device: SMARTDevice{
87-
device: extractDiskName(strings.TrimSpace(json.Get("device.info_name").String())),
75+
device: buildDeviceLabel(json.Get("device.name").String(), json.Get("device.type").String()),
8876
serial: strings.TrimSpace(json.Get("serial_number").String()),
8977
family: strings.TrimSpace(GetStringIfExists(json, "model_family", "unknown")),
9078
model: strings.TrimSpace(model_name),

smartctl_test.go

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright 2024 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package main
15+
16+
import (
17+
"testing"
18+
)
19+
20+
func TestBuildDeviceLabel(t *testing.T) {
21+
tests := []struct {
22+
deviceName string
23+
deviceType string
24+
expectedLabel string
25+
}{
26+
{"/dev/bus/0", "megaraid,1", "bus_0_megaraid_1"},
27+
{"/dev/sda", "auto", "sda"},
28+
{"/dev/disk/by-id/ata-CT500MX500SSD1_ABCDEFGHIJ", "auto", "ata-CT500MX500SSD1_ABCDEFGHIJ"},
29+
// Some cases extracted from smartctl docs. Are these the prettiest?
30+
// Probably not. Are they unique enough. Definitely.
31+
{"/dev/sg1", "cciss,1", "sg1_cciss_1"},
32+
{"/dev/bsg/sssraid0", "sssraid,0,1", "bsg_sssraid0_sssraid_0_1"},
33+
{"/dev/cciss/c0d0", "cciss,0", "cciss_c0d0_cciss_0"},
34+
{"/dev/sdb", "aacraid,1,0,4", "sdb_aacraid_1_0_4"},
35+
{"/dev/twl0", "3ware,1", "twl0_3ware_1"},
36+
}
37+
38+
for _, test := range tests {
39+
result := buildDeviceLabel(test.deviceName, test.deviceType)
40+
if result != test.expectedLabel {
41+
t.Errorf("deviceName=%v deviceType=%v expected=%v result=%v", test.deviceName, test.deviceType, test.expectedLabel, result)
42+
}
43+
}
44+
}

0 commit comments

Comments
 (0)