diff --git a/README.md b/README.md index 43c6a04..d2d3adf 100755 --- a/README.md +++ b/README.md @@ -118,6 +118,49 @@ Enabling this option will cause increased network traffic in order to update pac `linux2mqtt --name Server1 -vvvvv --packages=7200` will search for available updates every 2 hours +### Hard Drives + +`linux2mqtt` can publish the status of all hard drives using the `harddrives` option. Each hard drive will present as a separate sensor in Home Assistant. The sensor state reports the hard drive status, which is derived from a score computed from the smartctl report. The details on the scoring methodology can be found below. Additional data is accessible as state attributes on each sensor. + +`linux2mqtt --name Server1 -vvvvv --interval 60 --harddrives` + +#### Scoring Methodology + +The score to status conversion is: + +| Status | Score | +| ------- | ----- | +| HEALTHY | <= 10 | +| GOOD | <= 20 | +| WARNING | <= 50 | +| FAILING | > 50 | + +##### ATA Scoring + +| SMART Attribute | Penalty | Notes | +| ----------------------------- | ------- | ------------------------------ | +| Reallocated Sector Count | ×2 | Indicates remapped bad sectors | +| Current Pending Sector | ×3 | Sectors waiting reallocation | +| Pending Sector > 10 | +30 | Additional penalty | +| Offline Uncorrectable | ×3 | Unrecoverable errors | +| Reported Uncorrectable Errors | ×2 | Read/write failures | +| Command Timeout | ×1.5 | Communication delays | +| UDMA CRC Error Count | max +10 | Usually cable/interface issue | + +##### NVME Scoring + +| SMART Attribute | Penalty | Notes | +| ------------------------------- | ------- | ------------------------------------------ | +| critical_warning ≠ 0 | +100 | Any critical SMART flag triggers high risk | +| percent_used > 70% | +10 | NAND wear indicator | +| percent_used > 80% | +20 | Increased wear | +| percent_used > 90% | +50 | Near end-of-life | +| media_errors | ×5 | Data integrity errors | +| num_error_log_entries | max +50 | Error events 
(capped) | +| warning_temp_time > 0 | +10 | Drive exceeded warning temp | +| critical_temp_time > 0 | +30 | Drive exceeded critical temp | +| available_spare below threshold | +30 | Spare blocks depleted | + ## Logging `linux2mqtt` can log to a directory in addition to the console using the `--logdir` parameter. The specified directory can be absolute or relative and is created if it doesn't exist. The verbosity parameter applies to file logging and the log file size is limited to 1M bytes and 5 previous files are kept. diff --git a/linux2mqtt/exceptions.py b/linux2mqtt/exceptions.py index ae91300..96b7288 100644 --- a/linux2mqtt/exceptions.py +++ b/linux2mqtt/exceptions.py @@ -23,3 +23,6 @@ class NoPackageManagerFound(Linux2MqttException): class PackageManagerException(Linux2MqttException): """Generic package manager exception occurred.""" + +class HardDriveException(Linux2MqttException): + """Generic Hard Drive exception occurred.""" diff --git a/linux2mqtt/harddrive.py b/linux2mqtt/harddrive.py new file mode 100644 index 0000000..01ba8c5 --- /dev/null +++ b/linux2mqtt/harddrive.py @@ -0,0 +1,230 @@ +"""Hard drives.""" + +import json +import re +import shlex +from subprocess import DEVNULL, PIPE, Popen + +from .exceptions import HardDriveException, Linux2MqttException + + +class HardDrive: + """Base class for all harddrives to implement.""" + + # parameters + _attributes: dict | None + device_id: str + attributes: dict + score: int + status: str + + def __init__(self, device_id: str): + """Initialize the hard drive metric. 
+ + Parameters + ---------- + device_id + The device id from /dev/disk/by-id/ + + """ + self.device_id = device_id + self._attributes = None + + def _get_attributes(self) -> None: # run smartctl and cache its parsed JSON report in self._attributes + command = shlex.split( + f"/usr/sbin/smartctl --info --all --json --nocheck standby /dev/disk/by-id/{self.device_id}" + ) + with Popen( + command, + stdout=PIPE, + stderr=DEVNULL, + text=True, + ) as proc: + stdout, _ = proc.communicate(timeout=30) # stderr goes to DEVNULL, so only stdout carries data + + if (proc.returncode & 3) != 0: # smartctl exit bits 0-1: command line not parsed / device open failed or in low-power mode + raise HardDriveException( + f"smartctl failed for '{self.device_id}' with exit status {proc.returncode}" + ) + + raw_json_data = json.loads(stdout) + + self._attributes = raw_json_data + + def parse_attributes(self) -> None: + """Hard Drive specific parse function depending on results from smartctl.""" + raise Linux2MqttException from NotImplementedError + + def get_score(self) -> None: + """Hard Drive specific score function depending on results from smartctl.""" + raise Linux2MqttException from NotImplementedError + + def get_status(self) -> None: + """Convert the score to an Arbitrary Classification Set by developer.""" + if self.score <= 10: + self.status = "HEALTHY" + elif self.score <= 20: + self.status = "GOOD" + elif self.score <= 50: + self.status = "WARNING" + else: + self.status = "FAILING" + + +class SataDrive(HardDrive): + """For ATA Drives.""" + + def parse_attributes(self) -> None: + """Parse out attributes from smartctl where available.""" + self.attributes = {} + self._get_attributes() + ata_smart_attributes = [ + ("Reallocated Sector Count", 5), + ("Command Timeout", 188), + ("Reported Uncorrectable Errors", 187), + ("Current Pending Sector", 197), + ("Offline Uncorrectable", 198), + ("UDMA CRC Error Count", 199), + ] + + self.attributes["Model Name"] = self._attributes["model_name"] # type: ignore[index] + self.attributes["Device"] = self._attributes["device"]["name"] # type: ignore[index] + self.attributes["Size TB"] = ( + self._attributes["user_capacity"]["bytes"] / 1000000000000 # type: 
ignore[index] + ) # type: ignore[index] + self.attributes["Temperature"] = self._attributes["temperature"]["current"] # type: ignore[index] + self.attributes["Smart status"] = ( + "Healthy" if self._attributes["smart_status"]["passed"] else "Failed" # type: ignore[index] + ) # type: ignore[index] + self.attributes["Power On Time"] = self._attributes["power_on_time"]["hours"] # type: ignore[index] + self.attributes["Power Cycle Count"] = self._attributes["power_cycle_count"] # type: ignore[index] + + new_data = { # index the SMART table rows by attribute id + item["id"]: item + for item in self._attributes["ata_smart_attributes"]["table"] # type: ignore[index] + } # type: ignore[index] + for name, key in ata_smart_attributes: + tmp = new_data[key]["raw"]["value"] if new_data.get(key) else None + if tmp is not None: + self.attributes[name] = tmp + + self.get_score() + self.get_status() + self.attributes["score"] = self.score + self.attributes["status"] = self.status + + def get_score(self) -> None: + """Compute the ATA score from the parsed SMART raw values; attributes that were not parsed count as 0.""" + score = 0 + score += self.attributes.get("Reallocated Sector Count", 0) * 2 + score += self.attributes.get("Current Pending Sector", 0) * 3 + if self.attributes.get("Current Pending Sector", 0) > 10: + score += 30 + + score += self.attributes.get("Offline Uncorrectable", 0) * 3 + score += self.attributes.get("Reported Uncorrectable Errors", 0) * 2 + score += self.attributes.get("Command Timeout", 0) * 1.5 # the 1.5 weight turns the total into a float + score += min(self.attributes.get("UDMA CRC Error Count", 0), 10) # contribution capped at 10 + + self.score = score + + +class NVME(HardDrive): + """For NVME Drives.""" + + def parse_attributes(self) -> None: + """Parse NVME Smartctl attributes.""" + self.attributes = {} + self._get_attributes() + nvme_smart_attributes = [ + "critical_warning", + "percentage_used", + "power_on_hours", + "power_cycles", + "media_errors", + "num_err_log_entries", + "critical_comp_time", + "warning_temp_time", + "available_spare", + "available_spare_threshold", + ] + + 
self.attributes["Model Name"] = self._attributes["model_name"] # type: ignore[index] + self.attributes["Device"] = self._attributes["device"]["name"] # type: ignore[index] + self.attributes["Size TB"] = ( + self._attributes["user_capacity"]["bytes"] / 1000000000000 # type: ignore[index] + ) # type: ignore[index] + self.attributes["Temperature"] = self._attributes["temperature"]["current"] # type: ignore[index] + self.attributes["Smart status"] = ( + "Healthy" if self._attributes["smart_status"]["passed"] else "Failed" # type: ignore[index] + ) # type: ignore[index] + + for key in nvme_smart_attributes: + tmp = self._attributes["nvme_smart_health_information_log"].get(key) # type: ignore[index] + if tmp is not None: + self.attributes[key] = tmp + + self.get_score() + self.get_status() + self.attributes["score"] = self.score + self.attributes["status"] = self.status + + def get_score(self) -> None: + """Compute the NVME score from the health-log fields stored by parse_attributes; missing fields count as 0.""" + score = 0 + + # Critical warnings (bitmask) + if self.attributes.get("critical_warning", 0) != 0: + score += 100 # Any critical flag = high risk + + # NAND wear + if self.attributes.get("percentage_used", 0) > 90: + score += 50 + elif self.attributes.get("percentage_used", 0) > 80: + score += 20 + elif self.attributes.get("percentage_used", 0) > 70: + score += 10 + + # Media/data errors + score += self.attributes.get("media_errors", 0) * 5 + + # Error log entries + score += min(self.attributes.get("num_err_log_entries", 0), 50) # cap at 50 + + # Temperature issues + if self.attributes.get("critical_comp_time", 0) > 0: + score += 30 + elif self.attributes.get("warning_temp_time", 0) > 0: + score += 10 + + # Available spare + if self.attributes.get("available_spare", 0) < self.attributes.get( + "available_spare_threshold", 0 + ): + score += 30 + + self.score = score + + +def get_hard_drive(device_name: str) -> HardDrive: + """Determine the hard drive type. 
+ + Returns + ------- + HardDrive + The specific hard drive type for drive id + + """ + + ata_regex = r"^ata.*(? None: metavar="INTERVAL", choices=range(MIN_PACKAGE_INTERVAL, MAX_PACKAGE_INTERVAL), ) + parser.add_argument( + "--harddrives", + help="Publish hard drive stats if available", + action="store_true", + ) parser.add_argument( "--discovery", default=None, @@ -848,6 +858,15 @@ def main() -> None: ) stats.add_metric(package_updates) + if args.harddrives: + for drive in listdir("/dev/disk/by-id/"): + try: + harddrive = HardDriveMetrics(drive) + if harddrive: + stats.add_metric(harddrive) + except HardDriveException: + pass + if not ( args.vm or args.connections @@ -857,6 +876,7 @@ def main() -> None: or args.temp or args.fan or args.packages + or args.harddrives ): main_logger.warning("No metrics specified. Nothing will be published.") diff --git a/linux2mqtt/metrics.py b/linux2mqtt/metrics.py old mode 100644 new mode 100755 index 2fdc4c0..4dbe2b6 --- a/linux2mqtt/metrics.py +++ b/linux2mqtt/metrics.py @@ -17,11 +17,13 @@ MIN_NET_INTERVAL, ) from .exceptions import ( + HardDriveException, Linux2MqttConfigException, Linux2MqttException, Linux2MqttMetricsException, NoPackageManagerFound, ) +from .harddrive import HardDrive, get_hard_drive from .helpers import addr_ip, addr_port, is_addr, sanitize from .package_manager import PackageManager, get_package_manager from .type_definitions import LinuxDeviceEntry, LinuxEntry, MetricEntities, SensorType @@ -1175,3 +1177,116 @@ def poll(self, result_queue: Queue[BaseMetric]) -> bool: th.daemon = True th.start() return True # Expect a deferred result + + +class HardDriveMetricThread(BaseMetricThread): + """Hard Drive metric thread.""" + + def __init__( + self, result_queue: Queue[BaseMetric], metric: BaseMetric, harddrive: HardDrive + ): + """Initialize the HardDrive thread. 
+ + Parameters + ---------- + result_queue + The queue to put the metric into once the data is gathered + metric + The hard drive metric to gather data for + harddrive + The type of hard drive to gather data over + + """ + threading.Thread.__init__(self) + self.result_queue = result_queue + self.metric = metric + self.harddrive = harddrive + + def run(self) -> None: + """Run the hard drive thread. Once data is gathered, it is put into the queue and the thread exits. + + Raises + ------ + Linux2MqttMetricsException + hard drive information could not be gathered or prepared for publishing + + """ + try: + self.harddrive.parse_attributes() + self.metric.polled_result = { + **self.harddrive.attributes, # type: ignore[unused-ignore] + } + self.result_queue.put(self.metric) + except Exception as ex: + raise Linux2MqttMetricsException( + f"Could not gather and publish hard drive data {self.metric._name}" + ) from ex + + +class HardDriveMetrics(BaseMetric): + """Hard Drive metric.""" + + icon = "mdi:harddisk" + unit_of_measurement = "" + state_field = "status" + + _name_template = "Hard Drive (ID:{})" + _device: str + _thermal_zone: str + + def __init__(self, device: str): + """Initialize the hard drive metric. + + Parameters + ---------- + device + The device id from /dev/disk/by-id/ + + Raises + ------ + HardDriveException + No supported hard drive type matches the device (subclass of Linux2MqttException) + + """ + super().__init__() + + try: + self.harddrive = get_hard_drive(device_name=device) + self._name = self._name_template.format(device) + except HardDriveException as ex: + raise HardDriveException( + "Failed to find a suitable hard drive type. Currently supported are: Hard Disk and NVME" + ) from ex + + def poll(self, result_queue: Queue[BaseMetric]) -> bool: + """Poll new data for the hard drive metric. 
+ + Parameters + ---------- + result_queue + The queue where to post new data once gathered + + Returns + ------- + bool + True as the data is gathered lazily + + Raises + ------ + Linux2MqttException + The result_queue is missing + + """ + try: + assert result_queue + except (AssertionError, ReferenceError) as e: + raise Linux2MqttException( + "Cannot start hard drive metric due to missing result_queue" + ) from e + self.result_queue = result_queue + th = HardDriveMetricThread( + result_queue=result_queue, metric=self, harddrive=self.harddrive + ) + th.daemon = True + th.start() + return True # Expect a deferred result