Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,49 @@ Enabling this option will cause increased network traffic in order to update pac

`linux2mqtt --name Server1 -vvvvv --packages=7200` will search for available updates every 2 hours

### Hard Drives

`linux2mqtt` can publish the status of all hard drives using the `harddrives` option. Each hard drive is presented as a separate sensor in Home Assistant. The sensor state reports the hard drive status, which is derived from a score calculated from the drive's smartctl report. Details of the scoring methodology can be found below. Additional data is accessible as state attributes on each sensor.

`linux2mqtt --name Server1 -vvvvv --interval 60 --harddrives`

#### Scoring Methodology

The score to status conversion is:

| Status | Score |
| ------- | ----- |
| HEALTHY | <= 10 |
| GOOD | <= 20 |
| WARNING | <= 50 |
| FAILING | > 50 |

##### ATA Scoring

| SMART Attribute | Penalty | Notes |
| ----------------------------- | ------- | ------------------------------ |
| Reallocated Sector Count | ×2 | Indicates remapped bad sectors |
| Current Pending Sector | ×3 | Sectors waiting reallocation |
| Pending Sector > 10 | +30 | Additional penalty |
| Offline Uncorrectable | ×3 | Unrecoverable errors |
| Reported Uncorrectable Errors | ×2 | Read/write failures |
| Command Timeout | ×1.5 | Communication delays |
| UDMA CRC Error Count | max +10 | Usually cable/interface issue |

##### NVME Scoring

| SMART Attribute                 | Penalty | Notes                                                          |
| ------------------------------- | ------- | -------------------------------------------------------------- |
| critical_warning ≠ 0            | +100    | Any critical SMART flag triggers high risk                     |
| percentage_used > 70%           | +10     | NAND wear indicator                                            |
| percentage_used > 80%           | +20     | Increased wear                                                 |
| percentage_used > 90%           | +50     | Near end-of-life                                               |
| media_errors                    | ×5      | Data integrity errors                                          |
| num_err_log_entries             | max +50 | Error events (capped)                                          |
| warning_temp_time > 0           | +10     | Drive exceeded warning temp (only if critical_comp_time is 0)  |
| critical_comp_time > 0          | +30     | Drive exceeded critical temp                                   |
| available_spare below threshold | +30     | Spare blocks depleted                                          |

## Logging

`linux2mqtt` can log to a directory in addition to the console using the `--logdir` parameter. The specified directory can be absolute or relative and is created if it doesn't exist. The verbosity parameter applies to file logging and the log file size is limited to 1M bytes and 5 previous files are kept.
Expand Down
3 changes: 3 additions & 0 deletions linux2mqtt/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ class NoPackageManagerFound(Linux2MqttException):

class PackageManagerException(Linux2MqttException):
"""Generic package manager exception occurred."""

class HardDriveException(Linux2MqttException):
    """Generic Hard Drive exception occurred."""
230 changes: 230 additions & 0 deletions linux2mqtt/harddrive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
"""Hard drives."""

import json
import re
import shlex
from subprocess import DEVNULL, PIPE, Popen, TimeoutExpired

from .exceptions import HardDriveException, Linux2MqttException


class HardDrive:
"""Base class for all harddrives to implement."""

# parameters
_attributes: dict | None
device_id: str
attributes: dict
score: int
status: str

def __init__(self, device_id: str):
"""Initialize the hard drive metric.

Parameters
----------
device_id
The device id from /dev/disk/by-id/

"""
self.device_id = device_id
self._attributes = None

def _get_attributes(self) -> None:
command = shlex.split(
f"/usr/sbin/smartctl --info --all --json --nocheck standby /dev/disk/by-id/{self.device_id}"
)
with Popen(
command,
stdout=PIPE,
stderr=DEVNULL,
text=True,
) as proc:
stdout, stderr = proc.communicate(timeout=30)

if (proc.returncode & 3) != 0:
raise HardDriveException(
f"Something went wrong with smartctl: {proc.returncode}: '{stderr}'"
)

raw_json_data = json.loads(stdout)

self._attributes = raw_json_data

def parse_attributes(self) -> None:
"""Hard Drive specific parse function depending on results from smartctl."""
raise Linux2MqttException from NotImplementedError

def get_score(self) -> None:
"""Hard Drive specific score function depending on results from smartctl."""
raise Linux2MqttException from NotImplementedError

def get_status(self) -> None:
"""Convert the score to an Arbitrary Classification Set by developer."""
if self.score <= 10:
self.status = "HEALTHY"
elif self.score <= 20:
self.status = "GOOD"
elif self.score <= 50:
self.status = "WARNING"
else:
self.status = "FAILING"


class SataDrive(HardDrive):
    """For ATA Drives."""

    def parse_attributes(self) -> None:
        """Parse out attributes from smartctl where available.

        Runs smartctl, copies the general drive details and the scored SMART
        attributes into ``self.attributes``, then adds the computed ``score``
        and ``status``.

        Raises
        ------
        HardDriveException
            If smartctl fails or no report is available afterwards.
        KeyError
            If the report lacks one of the general drive detail fields or
            the ATA SMART attribute table.

        """
        self.attributes = {}
        self._get_attributes()

        # Narrow the Optional once so the lookups below need no type ignores.
        report = self._attributes
        if report is None:
            raise HardDriveException(
                f"No smartctl data available for {self.device_id}"
            )

        # (published name, SMART attribute id) pairs that feed the score.
        ata_smart_attributes = [
            ("Reallocated Sector Count", 5),
            ("Command Timeout", 188),  # 188 (0xBC) is the Command Timeout id
            ("Reported Uncorrectable Errors", 187),
            ("Current Pending Sector", 197),
            ("Offline Uncorrectable", 198),
            ("UDMA CRC Error Count", 199),
        ]

        self.attributes["Model Name"] = report["model_name"]
        self.attributes["Device"] = report["device"]["name"]
        self.attributes["Size TB"] = report["user_capacity"]["bytes"] / 1000000000000
        self.attributes["Temperature"] = report["temperature"]["current"]
        self.attributes["Smart status"] = (
            "Healthy" if report["smart_status"]["passed"] else "Failed"
        )
        self.attributes["Power On Time"] = report["power_on_time"]["hours"]
        self.attributes["Power Cycle Count"] = report["power_cycle_count"]

        # Index the SMART table by attribute id; attributes the drive does
        # not report are left out and later score as 0.
        table_by_id = {
            row["id"]: row for row in report["ata_smart_attributes"]["table"]
        }
        for name, attribute_id in ata_smart_attributes:
            row = table_by_id.get(attribute_id)
            if not row:
                continue
            raw_value = row["raw"]["value"]
            if raw_value is not None:
                self.attributes[name] = raw_value

        self.get_score()
        self.get_status()
        self.attributes["score"] = self.score
        self.attributes["status"] = self.status

    def get_score(self) -> None:
        """ATA Drive specific score function depending on results from smartctl.

        Sums the weighted raw SMART values held in ``self.attributes`` and
        stores the total in ``self.score``. The weights are the ones listed
        in the README's "ATA Scoring" table; missing attributes count as 0.
        """
        attrs = self.attributes
        pending_sectors = attrs.get("Current Pending Sector", 0)

        score = 0
        score += attrs.get("Reallocated Sector Count", 0) * 2
        score += pending_sectors * 3
        if pending_sectors > 10:
            # Extra flat penalty once more than 10 sectors are pending.
            score += 30

        score += attrs.get("Offline Uncorrectable", 0) * 3
        score += attrs.get("Reported Uncorrectable Errors", 0) * 2
        score += attrs.get("Command Timeout", 0) * 1.5
        # CRC errors contribute at most 10 points.
        score += min(attrs.get("UDMA CRC Error Count", 0), 10)

        self.score = score


class NVME(HardDrive):
    """For NVME Drives."""

    def parse_attributes(self) -> None:
        """Parse NVME Smartctl attributes.

        Runs smartctl, copies the general drive details and the selected
        health log entries into ``self.attributes``, then adds the computed
        ``score`` and ``status``.

        Raises
        ------
        HardDriveException
            If smartctl fails or no report is available afterwards.
        KeyError
            If the report lacks one of the general drive detail fields or
            the NVMe health information log.

        """
        self.attributes = {}
        self._get_attributes()

        # Narrow the Optional once so the lookups below need no type ignores.
        report = self._attributes
        if report is None:
            raise HardDriveException(
                f"No smartctl data available for {self.device_id}"
            )

        # Health log keys that are published and used by get_score().
        nvme_smart_attributes = [
            "critical_warning",
            "percentage_used",
            "power_on_hours",
            "power_cycles",
            "media_errors",
            "num_err_log_entries",
            "critical_comp_time",
            "warning_temp_time",
            "available_spare",
            "available_spare_threshold",
        ]

        self.attributes["Model Name"] = report["model_name"]
        self.attributes["Device"] = report["device"]["name"]
        self.attributes["Size TB"] = report["user_capacity"]["bytes"] / 1000000000000
        self.attributes["Temperature"] = report["temperature"]["current"]
        self.attributes["Smart status"] = (
            "Healthy" if report["smart_status"]["passed"] else "Failed"
        )

        health_log = report["nvme_smart_health_information_log"]
        for key in nvme_smart_attributes:
            value = health_log.get(key)
            if value is not None:
                self.attributes[key] = value

        self.get_score()
        self.get_status()
        self.attributes["score"] = self.score
        self.attributes["status"] = self.status

    def get_score(self) -> None:
        """Score specific for NVME Drives.

        Stores the total penalty in ``self.score``. The weights are an
        arbitrary heuristic chosen by the author, not an official baseline;
        they are listed in the README's "NVME Scoring" table. The keys read
        here are the ones ``parse_attributes`` stores; missing keys count
        as 0.
        """
        attrs = self.attributes
        score = 0

        # Critical warnings (bitmask)
        if attrs.get("critical_warning", 0) != 0:
            score += 100  # Any critical flag = high risk

        # NAND wear
        percentage_used = attrs.get("percentage_used", 0)
        if percentage_used > 90:
            score += 50
        elif percentage_used > 80:
            score += 20
        elif percentage_used > 70:
            score += 10

        # Media/data errors
        score += attrs.get("media_errors", 0) * 5

        # Error log entries
        score += min(attrs.get("num_err_log_entries", 0), 50)  # cap at 50

        # Temperature issues: only the more severe of the two penalties applies
        if attrs.get("critical_comp_time", 0) > 0:
            score += 30
        elif attrs.get("warning_temp_time", 0) > 0:
            score += 10

        # Available spare
        if attrs.get("available_spare", 0) < attrs.get(
            "available_spare_threshold", 0
        ):
            score += 30

        self.score = score


def get_hard_drive(device_name: str) -> HardDrive:
    """Determine the hard drive type.

    Parameters
    ----------
    device_name
        The device id from /dev/disk/by-id/

    Returns
    -------
    HardDrive
        The specific hard drive type for drive id

    Raises
    ------
    HardDriveException
        If the id is a partition entry or does not start with a
        supported prefix.

    """
    # Partition entries end in "part<N>". Matching one or more digits also
    # rejects two-digit partitions such as "-part10", which a fixed-width
    # single-digit lookbehind lets through as if they were whole drives.
    is_partition = re.search(r"part\d+$", device_name) is not None

    if not is_partition:
        if device_name.startswith("ata"):
            return SataDrive(device_name)
        if device_name.startswith("nvme-eui"):
            return NVME(device_name)

    raise HardDriveException("Harddrive ID not supported")
24 changes: 22 additions & 2 deletions linux2mqtt/linux2mqtt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import json
import logging
from logging.handlers import RotatingFileHandler
from os import geteuid, path
from os import geteuid, listdir, path
from pathlib import Path
import platform
from queue import Empty, Queue
Expand Down Expand Up @@ -47,13 +47,18 @@
MQTT_QOS_DEFAULT,
MQTT_TIMEOUT_DEFAULT,
)
from .exceptions import Linux2MqttConfigException, Linux2MqttConnectionException
from .exceptions import (
HardDriveException,
Linux2MqttConfigException,
Linux2MqttConnectionException,
)
from .helpers import clean_for_discovery, sanitize
from .metrics import (
BaseMetric,
CPUMetrics,
DiskUsageMetrics,
FanSpeedMetrics,
HardDriveMetrics,
NetConnectionMetrics,
NetworkMetrics,
PackageUpdateMetrics,
Expand Down Expand Up @@ -747,6 +752,11 @@ def main() -> None:
metavar="INTERVAL",
choices=range(MIN_PACKAGE_INTERVAL, MAX_PACKAGE_INTERVAL),
)
parser.add_argument(
"--harddrives",
help="Publish hard drive stats if available",
action="store_true",
)
parser.add_argument(
"--discovery",
default=None,
Expand Down Expand Up @@ -848,6 +858,15 @@ def main() -> None:
)
stats.add_metric(package_updates)

if args.harddrives:
for drive in listdir("/dev/disk/by-id/"):
try:
harddrive = HardDriveMetrics(drive)
if harddrive:
stats.add_metric(harddrive)
except HardDriveException:
pass

if not (
args.vm
or args.connections
Expand All @@ -857,6 +876,7 @@ def main() -> None:
or args.temp
or args.fan
or args.packages
or args.harddrives
):
main_logger.warning("No metrics specified. Nothing will be published.")

Expand Down
Loading
Loading