diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py index 62c4d76..6545bb3 100644 --- a/src/buildstream_plugins/sources/docker.py +++ b/src/buildstream_plugins/sources/docker.py @@ -107,6 +107,8 @@ import tarfile import urllib.parse +from collections import OrderedDict + import requests from buildstream import Source, SourceError @@ -115,6 +117,7 @@ sha256sum, link_files, move_atomic, + get_umask, ) # @@ -363,23 +366,26 @@ def blob(self, image_path, blob_digest, download_to): class ReadableTarInfo(tarfile.TarInfo): """ - The goal is to override`TarFile`'s `extractall` semantics by ensuring that on extraction, the - files are readable by the owner of the file. This is done by over-riding the accessor for the - mode` attribute in `TarInfo`, class that encapsulates the internal meta-data of the tarball, + The goal is to override `TarFile`'s `extractall` semantics by ensuring that on extraction, the + files are readable by the owner of the file. This is done by overriding the accessor for the + `mode` attribute in `TarInfo`, the class that encapsulates the internal meta-data of the tarball, so that the owner-read bit is always set. """ - # The mode attribute is not declared as a property and so - # this trips up the static type checker, mark this as "ignore" - # + # https://github.com/python/mypy/issues/4125 @property # type: ignore def mode(self): - # ensure file is readable by owner - return self.__permission | 0o400 + # Respect umask instead of the file mode stored in the archive. + # The only bit used from the embedded mode is the executable bit for files. + umask = get_umask() + if self.isdir() or bool(self.__permission & 0o100): + return 0o777 & ~umask + else: + return 0o666 & ~umask @mode.setter def mode(self, permission): - self.__permission = permission + self.__permission = permission # pylint: disable=attribute-defined-outside-init class DockerSource(Source): @@ -605,31 +611,54 @@ def stage(self, directory): raise SourceError("Unable to load manifest: {}".format(e)) from e try: + layer_members = [] + + # Create a list of members to extract from each layer for layer in manifest["layers"]: layer_digest = layer["digest"] blob_path = os.path.join(mirror_dir, layer_digest + ".tar.gz") self._verify_blob(blob_path, expected_digest=layer_digest) - ( - extract_fileset, - white_out_fileset, - ) = self._get_extract_and_remove_files(blob_path) - # remove files associated with whiteouts - for white_out_file in white_out_fileset: - white_out_file = os.path.join(directory, white_out_file) - os.remove(white_out_file) - - # extract files for the current layer + members_dict, whiteout_paths, opaque_whiteout_dirs = self._get_members_and_whiteout_paths(blob_path) + + # Process opaque whiteouts: remove members under the corresponding directories from previous layers + for _, prev_members_dict in layer_members: + members_to_remove = [] + for member_path in prev_members_dict: + # Check if this member should be removed by any opaque whiteout + for opaque_whiteout_dir in opaque_whiteout_dirs: + if member_path.startswith(opaque_whiteout_dir): + members_to_remove.append(member_path) + break + for member_path in members_to_remove: + del prev_members_dict[member_path] + + # Process whiteouts: remove corresponding members from previous layers + for _, prev_members_dict in layer_members: + for whiteout_path in whiteout_paths: + prev_members_dict.pop(whiteout_path, None) + + layer_members.append((blob_path, members_dict)) + + # Extract files for each layer + for blob_path, members_dict in layer_members: + if not members_dict: + # No files to extract from this layer + continue + + # Extract files for the current layer with tarfile.open(blob_path, tarinfo=ReadableTarInfo) as tar: with self.tempdir() as td: + members = list(members_dict.values()) + if hasattr(tarfile, "tar_filter"): # Python 3.12+ (and older versions with backports) tar.extraction_filter = tarfile.tar_filter else: - for member in extract_fileset: + for member in members: self._assert_tarinfo_safe(member, td) - tar.extractall(path=td, members=extract_fileset) + tar.extractall(path=td, members=members) link_files(td, directory) except (OSError, SourceError, tarfile.TarError) as e: @@ -658,14 +687,14 @@ def collect_source_info(self): ] @staticmethod - def _get_extract_and_remove_files(layer_tar_path): + def _get_members_and_whiteout_paths(layer_tar_path): """Return the set of files to remove and extract for a given layer - :param layer_tar_path: The path where a layer has been extracted - :return: Tuple of filesets - - extract_fileset: files to extract into staging directory - - delete_fileset: files to remove from staging directory as the current layer - contains a whiteout corresponding to a staged file in the previous layers + :param layer_tar_path: The path to the layer tar.gz file + :return: Tuple of (members_dict, whiteout_paths, opaque_whiteout_dirs) + - members_dict: OrderedDict of TarInfo members to extract into staging directory + - whiteout_paths: list of file paths that should be removed from previous layers + - opaque_whiteout_dirs: list of directory paths whose contents should be removed from previous layers """ @@ -677,7 +706,7 @@ def strip_wh(white_out_file): """ # whiteout files have the syntax of `*/.wh.*` file_name = os.path.basename(white_out_file) - path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.")[1]) + path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.", maxsplit=1)[1]) return path def is_regular_file(info): @@ -688,16 +717,27 @@ def is_regular_file(info): """ return not (info.name.startswith("dev/") or info.isdev()) - with tarfile.open(layer_tar_path) as tar: - extract_fileset = [] - delete_fileset = [] + with tarfile.open(layer_tar_path, tarinfo=ReadableTarInfo) as tar: + members_dict = OrderedDict() + whiteout_paths = [] + opaque_whiteout_dirs = [] + for member in tar.getmembers(): - if os.path.basename(member.name).startswith(".wh."): - delete_fileset.append(strip_wh(member.name)) + member_basename = os.path.basename(member.name) + if member_basename == ".wh..wh..opq": + # Opaque whiteout file + opaque_whiteout_dir = os.path.dirname(member.name) + if opaque_whiteout_dir != "": + # Add "/" to facilitate path prefix matching + opaque_whiteout_dir += "/" + opaque_whiteout_dirs.append(opaque_whiteout_dir) + elif member_basename.startswith(".wh."): + # Whiteout file + whiteout_paths.append(strip_wh(member.name)) elif is_regular_file(member): - extract_fileset.append(member) + members_dict[member.name] = member - return extract_fileset, delete_fileset + return members_dict, whiteout_paths, opaque_whiteout_dirs # Assert that a tarfile is safe to extract; specifically, make # sure that we don't do anything outside of the target