diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ac94fd..9c55890 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Support for renaming samples while copying (#6)
 - Support for symbolic linking instead of copying (#9)
+- SHA256 checksums to ensure integrity of copied files (#10)
 
 ## [0.1.3] 2025-12-03
 
diff --git a/ezfastq/fastq.py b/ezfastq/fastq.py
index f155a0d..27faa79 100644
--- a/ezfastq/fastq.py
+++ b/ezfastq/fastq.py
@@ -8,6 +8,7 @@
 # -------------------------------------------------------------------------------------------------
 
 from dataclasses import dataclass
+from hashlib import sha256
 from pathlib import Path
 from shutil import copy
 from subprocess import run
@@ -42,6 +43,8 @@ def copy(self, destination):
         destination.mkdir(parents=True, exist_ok=True)
         file_copy = destination / self._working_name
         copy(self.source_path, file_copy)
+        if file_sha256(self.source_path) != file_sha256(file_copy):  # pragma: no cover
+            raise CopyError(f"checksum failed for {self.source_path}")
         if self.extension == "fastq":
             run(["gzip", str(file_copy)])
 
@@ -71,5 +74,17 @@ def _working_name(self):
         return f"{self.stem}.{self.extension}"
 
 
+def file_sha256(path, block_size=65536):
+    sha = sha256()
+    with open(path, "rb") as fh:
+        for block in iter(lambda: fh.read(block_size), b""):
+            sha.update(block)
+    return sha.hexdigest()
+
+
+class CopyError(RuntimeError):
+    pass
+
+
 class LinkError(ValueError):
     pass