3131import tarfile
3232import tempfile
3333import urllib .error
34+ import urllib .parse
3435import urllib .request
3536import zipfile
3637from collections .abc import Sequence
3738
39+ from dfetch .log import get_logger
40+ from dfetch .project .subproject import SubProject
41+ from dfetch .util .util import find_matching_files , safe_rm
42+
3843#: Archive file extensions recognised by DFetch.
39- #: Defined before any intra-package imports to avoid partial-initialisation
40- #: issues when other modules (e.g. dfetch.util.purl) import this symbol while
41- #: the module is still being initialised.
4244ARCHIVE_EXTENSIONS = (".tar.gz" , ".tgz" , ".tar.bz2" , ".tar.xz" , ".zip" )
4345
4446#: Hash algorithms supported by the ``integrity.hash`` manifest field.
4547SUPPORTED_HASH_ALGORITHMS = ("sha256" ,)
4648
47- from dfetch .log import get_logger # noqa: E402
48- from dfetch .project .subproject import SubProject # noqa: E402
49- from dfetch .util .util import find_matching_files , safe_rm # noqa: E402
50-
5149logger = get_logger (__name__ )
5250
5351# Safety limits applied during extraction to prevent decompression bombs.
@@ -112,12 +110,15 @@ def is_accessible(self) -> bool:
112110 * ``http``/``https`` URLs first try a ``HEAD`` request. If the server
113111 rejects it (405/501) a partial ``GET`` (``Range: bytes=0-0``) is
114112 attempted instead. Returns *False* on any final failure.
113+ * Any other URL scheme returns *False*.
115114 """
116- from urllib .parse import urlparse as _urlparse # noqa: PLC0415
115+ parsed = urllib .parse .urlparse (self .url )
116+
117+ if parsed .scheme == "file" :
118+ return os .path .exists (parsed .path )
117119
118- if _urlparse (self .url ).scheme == "file" :
119- path = _urlparse (self .url ).path
120- return os .path .exists (path )
120+ if parsed .scheme not in ("http" , "https" ):
121+ return False
121122
122123 for method , headers in [
123124 ("HEAD" , {}),
@@ -142,14 +143,27 @@ def download(self, dest_path: str) -> None:
142143 dest_path: Local file path to write the archive to.
143144
144145 Raises:
145- RuntimeError: On download failure.
146+ RuntimeError: On download failure or unsupported URL scheme.
146147 """
147- try :
148- urllib .request .urlretrieve (self .url , dest_path )
149- except (urllib .error .URLError , OSError ) as exc :
148+ parsed = urllib .parse .urlparse (self .url )
149+ if parsed .scheme == "file" :
150+ try :
151+ shutil .copy (parsed .path , dest_path )
152+ except OSError as exc :
153+ raise RuntimeError (
154+ f"'{ self .url } ' is not a valid URL or unreachable: { exc } "
155+ ) from exc
156+ elif parsed .scheme in ("http" , "https" ):
157+ try :
158+ urllib .request .urlretrieve (self .url , dest_path )
159+ except (urllib .error .URLError , OSError ) as exc :
160+ raise RuntimeError (
161+ f"'{ self .url } ' is not a valid URL or unreachable: { exc } "
162+ ) from exc
163+ else :
150164 raise RuntimeError (
151- f"'{ self .url } ' is not a valid URL or unreachable: { exc } "
152- ) from exc
165+ f"'{ self .url } ' uses unsupported scheme ' { parsed . scheme } '. "
166+ )
153167
154168
155169class ArchiveLocalRepo :
@@ -223,9 +237,13 @@ def _check_archive_limits(member_count: int, total_bytes: int) -> None:
223237 )
224238
225239 @staticmethod
226- def _check_zip_members (zf : zipfile .ZipFile ) -> None :
240+ def check_zip_members (zf : zipfile .ZipFile ) -> list [ zipfile . ZipInfo ] :
227241 """Validate all ZIP member paths against path-traversal attacks.
228242
243+ Returns:
244+ The validated list of members, safe to pass to
245+ :meth:`zipfile.ZipFile.extract`.
246+
229247 Raises:
230248 RuntimeError: When any member contains an absolute path, a ``..``
231249 component, or when the archive exceeds the size/count limits.
@@ -242,6 +260,7 @@ def _check_zip_members(zf: zipfile.ZipFile) -> None:
242260 raise RuntimeError (
243261 f"Archive contains an unsafe member path: { info .filename !r} "
244262 )
263+ return members
245264
246265 @staticmethod
247266 def _check_tar_members (tf : tarfile .TarFile ) -> None :
@@ -289,13 +308,13 @@ def _extract_raw(archive_path: str, dest_dir: str) -> None:
289308 with tarfile .open (archive_path , "r:*" ) as tf :
290309 ArchiveLocalRepo ._check_tar_members (tf )
291310 if sys .version_info >= (3 , 11 , 4 ):
292- tf .extractall (dest_dir , filter = "tar" )
311+ tf .extractall (dest_dir , filter = "tar" ) # nosec B202
293312 else :
294- tf .extractall (dest_dir ) # noqa: S202
313+ tf .extractall (dest_dir ) # nosec B202
295314 elif lower .endswith (".zip" ) or zipfile .is_zipfile (archive_path ):
296315 with zipfile .ZipFile (archive_path ) as zf :
297316 ArchiveLocalRepo ._check_zip_members (zf )
298- zf .extractall (dest_dir )
317+ zf .extractall (dest_dir ) # nosec B202
299318 else :
300319 raise RuntimeError (
301320 f"Unsupported archive format: '{ archive_path } '. "
@@ -319,6 +338,8 @@ def _copy_with_src(
319338 shutil .copy2 (s , d )
320339 elif os .path .isfile (src_path ):
321340 shutil .copy2 (src_path , os .path .join (dest_dir , os .path .basename (src_path )))
341+ else :
342+ raise RuntimeError (f"src { src !r} was not found in archive" )
322343
323344 if keep_licenses :
324345 for item in os .listdir (extract_root ):
0 commit comments