Skip to content

Commit d017798

Browse files
committed
Review comments
1 parent cd1f807 commit d017798

5 files changed

Lines changed: 206 additions & 32 deletions

File tree

dfetch/project/archivesubproject.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,19 +109,28 @@ def _latest_revision_on_branch(self, branch: str) -> str:
109109
del branch
110110
return self.remote
111111

112-
def _download_and_compute_hash(self, algorithm: str = "sha256") -> IntegrityHash:
112+
def _download_and_compute_hash(
113+
self, algorithm: str = "sha256", url: str | None = None
114+
) -> IntegrityHash:
113115
"""Download the archive to a temporary file and return its :class:`IntegrityHash`.
114116
115117
The hash is computed during the download stream — no extra file read.
116118
The temporary file is always cleaned up, even on error.
117119
120+
Args:
121+
algorithm: Hash algorithm to use (``sha256``, ``sha384``, ``sha512``).
122+
url: If given, download from this URL instead of ``self._remote_repo``.
123+
Use this to pin to the exact URL stored in the on-disk revision.
124+
118125
Raises:
119126
RuntimeError: On download failure or unsupported algorithm.
120127
"""
121-
fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(self.remote))
128+
effective_url = url if url is not None else self.remote
129+
remote = ArchiveRemote(effective_url) if url is not None else self._remote_repo
130+
fd, tmp_path = tempfile.mkstemp(suffix=_suffix_for_url(effective_url))
122131
os.close(fd)
123132
try:
124-
hex_digest = self._remote_repo.download(tmp_path, algorithm=algorithm)
133+
hex_digest = remote.download(tmp_path, algorithm=algorithm)
125134
return IntegrityHash(algorithm, hex_digest)
126135
finally:
127136
try:
@@ -231,8 +240,9 @@ def freeze_project(self, project: ProjectEntry) -> str | None:
231240
revision = on_disk.revision
232241

233242
# Already hash-pinned — use the on-disk revision directly.
243+
# Otherwise download from the revision URL (not the possibly-updated manifest URL).
234244
pinned = IntegrityHash.parse(revision) or self._download_and_compute_hash(
235-
"sha256"
245+
"sha256", url=revision
236246
)
237247
new_hash = str(pinned)
238248
if project.hash == new_hash:

dfetch/util/util.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,21 @@ def prune_files_by_pattern(directory: str, patterns: Sequence[str]) -> None:
8787
License files are never removed even when they match a pattern.
8888
"""
8989
seen: set[str] = set()
90+
paths = []
9091
for file_or_dir in find_matching_files(directory, patterns):
9192
resolved = str(file_or_dir.resolve())
9293
if resolved in seen:
9394
continue
9495
seen.add(resolved)
95-
if not (file_or_dir.is_file() and is_license_file(file_or_dir.name)):
96+
paths.append(file_or_dir)
97+
98+
# Remove children before parents to avoid FileNotFoundError on already-deleted paths.
99+
paths.sort(key=lambda p: len(str(p.resolve())), reverse=True)
100+
101+
for file_or_dir in paths:
102+
if file_or_dir.exists() and not (
103+
file_or_dir.is_file() and is_license_file(file_or_dir.name)
104+
):
96105
safe_rm(file_or_dir)
97106

98107

features/steps/archive_steps.py

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,9 @@
1313
from behave import given # pylint: disable=no-name-in-module
1414

1515

16-
def _sha256(path: str) -> str:
17-
"""Return the SHA-256 hex digest of a file."""
18-
h = hashlib.sha256()
19-
with open(path, "rb") as f:
20-
for chunk in iter(lambda: f.read(8192), b""):
21-
h.update(chunk)
22-
return h.hexdigest()
23-
24-
25-
def _sha384(path: str) -> str:
26-
"""Return the SHA-384 hex digest of a file."""
27-
h = hashlib.sha384()
28-
with open(path, "rb") as f:
29-
for chunk in iter(lambda: f.read(8192), b""):
30-
h.update(chunk)
31-
return h.hexdigest()
32-
33-
34-
def _sha512(path: str) -> str:
35-
"""Return the SHA-512 hex digest of a file."""
36-
h = hashlib.sha512()
16+
def _file_digest(path: str, constructor) -> str:
17+
"""Return the hex digest of *path* using the given hashlib *constructor*."""
18+
h = constructor()
3719
with open(path, "rb") as f:
3820
for chunk in iter(lambda: f.read(8192), b""):
3921
h.update(chunk)
@@ -85,12 +67,14 @@ def _create_archive(context, name: str, extension: str) -> None:
8567

8668
if extension == ".tar.gz":
8769
create_tar_gz(archive_path, name, files)
88-
else:
70+
elif extension == ".zip":
8971
create_zip(archive_path, name, files)
72+
else:
73+
raise ValueError(f"Unsupported archive extension: {extension!r}")
9074

91-
context.archive_sha256 = _sha256(archive_path)
92-
context.archive_sha384 = _sha384(archive_path)
93-
context.archive_sha512 = _sha512(archive_path)
75+
context.archive_sha256 = _file_digest(archive_path, hashlib.sha256)
76+
context.archive_sha384 = _file_digest(archive_path, hashlib.sha384)
77+
context.archive_sha512 = _file_digest(archive_path, hashlib.sha512)
9478
context.archive_url = _archive_url(context, filename)
9579

9680

tests/test_archive.py

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
"""Unit tests for dfetch.vcs.archive and dfetch.project.archivesubproject."""
22

3+
import hashlib
34
import io
45
import os
56
import tarfile
67
import tempfile
78
import zipfile
9+
from unittest.mock import patch
810

911
import pytest
1012

11-
from dfetch.project.archivesubproject import _suffix_for_url
13+
from dfetch.manifest.project import ProjectEntry
14+
from dfetch.manifest.version import Version
15+
from dfetch.project.archivesubproject import ArchiveSubProject, _suffix_for_url
1216
from dfetch.vcs.archive import (
1317
ARCHIVE_EXTENSIONS,
1418
ArchiveLocalRepo,
@@ -249,3 +253,115 @@ def test_all_archive_extensions_covered():
249253
assert len(ARCHIVE_EXTENSIONS) > 0
250254
for ext in ARCHIVE_EXTENSIONS:
251255
assert ext.startswith(".")
256+
257+
258+
# ---------------------------------------------------------------------------
259+
# Helpers shared by ArchiveSubProject tests
260+
# ---------------------------------------------------------------------------
261+
262+
263+
def _make_tar_gz(path: str, content: bytes = b"hello") -> None:
264+
"""Write a minimal .tar.gz archive containing one file to *path*."""
265+
with tarfile.open(path, "w:gz") as tf:
266+
info = tarfile.TarInfo(name="pkg/README.md")
267+
info.size = len(content)
268+
tf.addfile(info, io.BytesIO(content))
269+
270+
271+
def _sha256_file(path: str) -> str:
272+
h = hashlib.sha256()
273+
with open(path, "rb") as f:
274+
for chunk in iter(lambda: f.read(8192), b""):
275+
h.update(chunk)
276+
return h.hexdigest()
277+
278+
279+
def _file_url(path: str) -> str:
280+
return "file:///" + path.lstrip("/")
281+
282+
283+
def _make_subproject(url: str) -> ArchiveSubProject:
284+
return ArchiveSubProject(
285+
ProjectEntry({"name": "pkg", "url": url, "vcs": "archive"})
286+
)
287+
288+
289+
# ---------------------------------------------------------------------------
290+
# ArchiveSubProject._download_and_compute_hash – explicit url parameter
291+
# ---------------------------------------------------------------------------
292+
293+
294+
def test_download_and_compute_hash_default_uses_remote_repo():
295+
"""Without an explicit url the hash is computed from self._remote_repo."""
296+
with tempfile.TemporaryDirectory() as tmp:
297+
archive = os.path.join(tmp, "pkg.tar.gz")
298+
_make_tar_gz(archive)
299+
url = _file_url(archive)
300+
sp = _make_subproject(url)
301+
302+
result = sp._download_and_compute_hash("sha256")
303+
304+
assert result.algorithm == "sha256"
305+
assert result.hex_digest == _sha256_file(archive)
306+
307+
308+
def test_download_and_compute_hash_explicit_url_overrides_remote_repo():
309+
"""When *url* is supplied a fresh ArchiveRemote for that URL is used.
310+
311+
This is the regression guard for the fix: if the manifest URL was changed
312+
after fetching, freeze must still hash the *original* archive (the one
313+
recorded in the on-disk revision), not the current manifest URL.
314+
"""
315+
with tempfile.TemporaryDirectory() as tmp:
316+
archive_a = os.path.join(tmp, "pkg_a.tar.gz")
317+
archive_b = os.path.join(tmp, "pkg_b.tar.gz")
318+
_make_tar_gz(archive_a, content=b"version A")
319+
_make_tar_gz(archive_b, content=b"version B")
320+
url_a = _file_url(archive_a)
321+
url_b = _file_url(archive_b)
322+
323+
# SubProject points to archive_b (current manifest URL).
324+
sp = _make_subproject(url_b)
325+
326+
# Passing url=url_a must use archive_a's content.
327+
result = sp._download_and_compute_hash("sha256", url=url_a)
328+
329+
assert result.hex_digest == _sha256_file(archive_a)
330+
assert result.hex_digest != _sha256_file(archive_b)
331+
332+
333+
# ---------------------------------------------------------------------------
334+
# ArchiveSubProject.freeze_project – uses on-disk revision URL
335+
# ---------------------------------------------------------------------------
336+
337+
338+
def test_freeze_project_uses_on_disk_url_not_manifest_url():
339+
"""freeze_project must hash the archive at the on-disk revision URL.
340+
341+
Scenario: the manifest URL was updated after the last fetch. Without the
342+
fix, freeze would download from the new (current) manifest URL and produce
343+
a hash that doesn't match the fetched archive. With the fix it uses the
344+
URL stored in the on-disk revision.
345+
"""
346+
with tempfile.TemporaryDirectory() as tmp:
347+
archive_a = os.path.join(tmp, "pkg_a.tar.gz")
348+
archive_b = os.path.join(tmp, "pkg_b.tar.gz")
349+
_make_tar_gz(archive_a, content=b"original fetch")
350+
_make_tar_gz(archive_b, content=b"updated manifest url")
351+
url_a = _file_url(archive_a)
352+
url_b = _file_url(archive_b)
353+
354+
# SubProject now points to archive_b (manifest was updated after fetch).
355+
sp = _make_subproject(url_b)
356+
357+
# Simulate on-disk state: was fetched from url_a (no hash-pin at the time).
358+
on_disk = Version(revision=url_a)
359+
with patch.object(sp, "on_disk_version", return_value=on_disk):
360+
project_entry = ProjectEntry(
361+
{"name": "pkg", "url": url_b, "vcs": "archive"}
362+
)
363+
sp.freeze_project(project_entry)
364+
365+
expected_hash = f"sha256:{_sha256_file(archive_a)}"
366+
assert project_entry.hash == expected_hash
367+
assert _sha256_file(archive_b) not in project_entry.hash

tests/test_util.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pytest
77

8-
from dfetch.util.util import copy_src_subset, hash_directory
8+
from dfetch.util.util import copy_src_subset, hash_directory, prune_files_by_pattern
99

1010
# ---------------------------------------------------------------------------
1111
# copy_src_subset – path-traversal protection
@@ -100,3 +100,58 @@ def test_hash_directory_skiplist_excludes_file(tmp_path):
100100
h_with_skip2 = hash_directory(str(d), ["ignored.txt"])
101101

102102
assert h_with_skip == h_with_skip2
103+
104+
105+
# ---------------------------------------------------------------------------
106+
# prune_files_by_pattern – delete-order safety
107+
# ---------------------------------------------------------------------------
108+
109+
110+
def test_prune_removes_matched_file(tmp_path):
111+
(tmp_path / "remove_me.txt").write_text("gone")
112+
prune_files_by_pattern(str(tmp_path), ["remove_me.txt"])
113+
assert not (tmp_path / "remove_me.txt").exists()
114+
115+
116+
def test_prune_parent_and_child_both_matched_no_error(tmp_path):
117+
"""When a dir and a file inside it both match, removal must not raise.
118+
119+
Before the fix, removing the parent first left the child path pointing at a
120+
non-existent location; the subsequent safe_rm call then raised
121+
FileNotFoundError.
122+
"""
123+
src = tmp_path / "src"
124+
src.mkdir()
125+
(src / "main.c").write_text("int main(){}")
126+
127+
# "src" matches the directory; "main.c" matches the child inside it.
128+
prune_files_by_pattern(str(tmp_path), ["src", "main.c"])
129+
130+
assert not src.exists()
131+
132+
133+
def test_prune_preserves_license_file(tmp_path):
134+
"""License files must survive even when they match a removal pattern."""
135+
(tmp_path / "LICENSE").write_text("MIT")
136+
(tmp_path / "delete_me.txt").write_text("gone")
137+
138+
prune_files_by_pattern(str(tmp_path), ["LICENSE", "delete_me.txt"])
139+
140+
assert (tmp_path / "LICENSE").exists()
141+
assert not (tmp_path / "delete_me.txt").exists()
142+
143+
144+
def test_prune_skips_already_removed_paths(tmp_path):
145+
"""Paths that no longer exist after a parent removal are silently skipped."""
146+
parent = tmp_path / "libs"
147+
parent.mkdir()
148+
child = parent / "lib.a"
149+
child.write_text("binary")
150+
unrelated = tmp_path / "readme.txt"
151+
unrelated.write_text("keep")
152+
153+
# Both "libs" (directory) and "libs/lib.a" (child) match; no exception expected.
154+
prune_files_by_pattern(str(tmp_path), ["libs", "lib.a"])
155+
156+
assert not parent.exists()
157+
assert unrelated.exists()

0 commit comments

Comments (0)