From dfbd0e325026363c9fd1d460fc1ec57050cc8e0c Mon Sep 17 00:00:00 2001 From: Dor Askayo Date: Wed, 12 Nov 2025 21:40:10 +0000 Subject: [PATCH 1/5] sources/docker.py: Split whiteout file names only once This prevents a failure to delete whiteout files that have ".wh." more than once in their name. --- src/buildstream_plugins/sources/docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py index 62c4d76..38c27f4 100644 --- a/src/buildstream_plugins/sources/docker.py +++ b/src/buildstream_plugins/sources/docker.py @@ -677,7 +677,7 @@ def strip_wh(white_out_file): """ # whiteout files have the syntax of `*/.wh.*` file_name = os.path.basename(white_out_file) - path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.")[1]) + path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.", maxsplit=1)[1]) return path def is_regular_file(info): From 1735f7be78bcd4f75899a077ceb3e6a42759dfae Mon Sep 17 00:00:00 2001 From: Dor Askayo Date: Wed, 12 Nov 2025 21:40:21 +0000 Subject: [PATCH 2/5] sources/docker.py: Use ReadableTarInfo when creating member file sets The list of TarInfo received from tar.getmembers() is returned from _get_extract_and_remove_files() to the caller and used to specify the members to be extracted from the tar file. If the "tarinfo" parameter is not set to ReadableTarInfo when the member list is created, its file mode modifications would not apply to those instances of TarInfo. As a result, the logic in ReadableTarInfo did not actually apply to the file extraction process and file modes remained the same. 
This results in errors such as the following when staged files cannot be read by the running user: [00:10:04] FAILURE [0e4ec5fd] images/redhat-ubi8-x86_64.bst: Failed to capture tree /home/user/.cache/buildstream/tmp/staging-temppou9rkp9: code: 13 message: "System error in `merklize()` for path \"/home/user/.cache/buildstream/tmp/staging-temppou9rkp9\": std::system_error exception thrown at [buildboxcommon_merklize.cpp:1125] [system:13], errMsg = \"Failed to open path \"\"etc/gshadow\"\"\", errno : Permission denied" Setting tarinfo=ReadableTarInfo in _get_extract_and_remove_files() solves this issue. --- src/buildstream_plugins/sources/docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py index 38c27f4..638f882 100644 --- a/src/buildstream_plugins/sources/docker.py +++ b/src/buildstream_plugins/sources/docker.py @@ -688,7 +688,7 @@ def is_regular_file(info): """ return not (info.name.startswith("dev/") or info.isdev()) - with tarfile.open(layer_tar_path) as tar: + with tarfile.open(layer_tar_path, tarinfo=ReadableTarInfo) as tar: extract_fileset = [] delete_fileset = [] for member in tar.getmembers(): From ccb62885e6bc6c748f63655355d99fa4a8e0d88f Mon Sep 17 00:00:00 2001 From: Dor Askayo Date: Wed, 12 Nov 2025 21:40:26 +0000 Subject: [PATCH 3/5] sources/docker.py: Sync ReadableTarInfo with the version from BuildStream This version respects the umask, allowing access by buildbox-casd running as a different user. 
The implementation was copied from BuildStream version 2.6.0.[0] [0] https://github.com/apache/buildstream/blob/4c68517662e17cd68f579db0b2d5c5e5d4451335/src/buildstream/plugins/sources/tar.py#L75-L92 --- src/buildstream_plugins/sources/docker.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py index 638f882..d1c590d 100644 --- a/src/buildstream_plugins/sources/docker.py +++ b/src/buildstream_plugins/sources/docker.py @@ -115,6 +115,7 @@ sha256sum, link_files, move_atomic, + get_umask, ) # @@ -363,23 +364,26 @@ def blob(self, image_path, blob_digest, download_to): class ReadableTarInfo(tarfile.TarInfo): """ - The goal is to override`TarFile`'s `extractall` semantics by ensuring that on extraction, the - files are readable by the owner of the file. This is done by over-riding the accessor for the - mode` attribute in `TarInfo`, class that encapsulates the internal meta-data of the tarball, + The goal is to override `TarFile`'s `extractall` semantics by ensuring that on extraction, the + files are readable by the owner of the file. This is done by overriding the accessor for the + `mode` attribute in `TarInfo`, the class that encapsulates the internal meta-data of the tarball, so that the owner-read bit is always set. """ - # The mode attribute is not declared as a property and so - # this trips up the static type checker, mark this as "ignore" - # + # https://github.com/python/mypy/issues/4125 @property # type: ignore def mode(self): - # ensure file is readable by owner - return self.__permission | 0o400 + # Respect umask instead of the file mode stored in the archive. + # The only bit used from the embedded mode is the executable bit for files. 
+            umask = get_umask()
+            if self.isdir() or bool(self.__permission & 0o100):
+                return 0o777 & ~umask
+            else:
+                return 0o666 & ~umask

     @mode.setter
     def mode(self, permission):
-        self.__permission = permission
+        self.__permission = permission  # pylint: disable=attribute-defined-outside-init


 class DockerSource(Source):

From 197d934f2a79b2042be4013e4a1851cee9405483 Mon Sep 17 00:00:00 2001
From: Dor Askayo
Date: Wed, 12 Nov 2025 21:40:41 +0000
Subject: [PATCH 4/5] sources/docker.py: Process whiteout files before
 extracting

The previous approach relied on removing files which were already
extracted in a previous step, which was susceptible to permission and
ownership issues.

For example, the following error is seen when the permissions of the
parent don't allow removing a file:

[00:00:06] FAILURE [0e4ec5fd] images/redhat-ubi8-x86_64.bst: docker source at images/redhat-ubi8-x86_64.bst [line 4 column 2]: Error staging source: [Errno 2] No such file or directory: '/home/coder/.cache/buildstream/tmp/staging-tempi542ewmq/root/buildinfo/'

Processing layer members before extracting them should avoid such
issues by never having to remove any files. It should also result in
significantly reduced disk IO when many whiteout files are present.

The downside is slightly higher memory usage to store the member list
of all layers before extracting them, but the difference should not be
significant.
--- src/buildstream_plugins/sources/docker.py | 58 ++++++++++++++--------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py index d1c590d..c22a081 100644 --- a/src/buildstream_plugins/sources/docker.py +++ b/src/buildstream_plugins/sources/docker.py @@ -107,6 +107,8 @@ import tarfile import urllib.parse +from collections import OrderedDict + import requests from buildstream import Source, SourceError @@ -609,31 +611,42 @@ def stage(self, directory): raise SourceError("Unable to load manifest: {}".format(e)) from e try: + layer_members = [] + + # Create a list of members to extract from each layer for layer in manifest["layers"]: layer_digest = layer["digest"] blob_path = os.path.join(mirror_dir, layer_digest + ".tar.gz") self._verify_blob(blob_path, expected_digest=layer_digest) - ( - extract_fileset, - white_out_fileset, - ) = self._get_extract_and_remove_files(blob_path) - # remove files associated with whiteouts - for white_out_file in white_out_fileset: - white_out_file = os.path.join(directory, white_out_file) - os.remove(white_out_file) + members_dict, whiteout_paths = self._get_members_and_whiteout_paths(blob_path) + + # Process whiteouts: remove corresponding members from previous layers + for _, prev_members_dict in layer_members: + for whiteout_path in whiteout_paths: + prev_members_dict.pop(whiteout_path, None) + + layer_members.append((blob_path, members_dict)) - # extract files for the current layer + # Extract files for each layer + for blob_path, members_dict in layer_members: + if not members_dict: + # No files to extract from this layer + continue + + # Extract files for the current layer with tarfile.open(blob_path, tarinfo=ReadableTarInfo) as tar: with self.tempdir() as td: + members = list(members_dict.values()) + if hasattr(tarfile, "tar_filter"): # Python 3.12+ (and older versions with backports) tar.extraction_filter = tarfile.tar_filter else: 
- for member in extract_fileset: + for member in members: self._assert_tarinfo_safe(member, td) - tar.extractall(path=td, members=extract_fileset) + tar.extractall(path=td, members=members) link_files(td, directory) except (OSError, SourceError, tarfile.TarError) as e: @@ -662,14 +675,13 @@ def collect_source_info(self): ] @staticmethod - def _get_extract_and_remove_files(layer_tar_path): + def _get_members_and_whiteout_paths(layer_tar_path): """Return the set of files to remove and extract for a given layer - :param layer_tar_path: The path where a layer has been extracted - :return: Tuple of filesets - - extract_fileset: files to extract into staging directory - - delete_fileset: files to remove from staging directory as the current layer - contains a whiteout corresponding to a staged file in the previous layers + :param layer_tar_path: The path to the layer tar.gz file + :return: Tuple of (members_dict, whiteout_paths) + - members_dict: OrderedDict of TarInfo members to extract into staging directory + - whiteout_paths: list of file paths that should be removed from previous layers """ @@ -693,15 +705,17 @@ def is_regular_file(info): return not (info.name.startswith("dev/") or info.isdev()) with tarfile.open(layer_tar_path, tarinfo=ReadableTarInfo) as tar: - extract_fileset = [] - delete_fileset = [] + members_dict = OrderedDict() + whiteout_paths = [] + for member in tar.getmembers(): if os.path.basename(member.name).startswith(".wh."): - delete_fileset.append(strip_wh(member.name)) + # Whiteout file + whiteout_paths.append(strip_wh(member.name)) elif is_regular_file(member): - extract_fileset.append(member) + members_dict[member.name] = member - return extract_fileset, delete_fileset + return members_dict, whiteout_paths # Assert that a tarfile is safe to extract; specifically, make # sure that we don't do anything outside of the target From c2f01cfa45e9e2dbfb99846172b899d6c85ad2fc Mon Sep 17 00:00:00 2001 From: Dor Askayo Date: Wed, 12 Nov 2025 21:40:49 
+0000
Subject: [PATCH 5/5] sources/docker.py: Handle opaque whiteout files

Opaque whiteouts hide the contents of their corresponding directories
in lower layers, but not the directories themselves.[0]

The implementation could likely be optimized by implementing a data
structure that keeps a nested list of files for each directory, but
the focus of this change was correctness. The current implementation
only adds a few seconds of processing overhead to images that take
more than 10 minutes to be extracted, so it should be good enough for
now.

[0] https://specs.opencontainers.org/image-spec/layer/#opaque-whiteout
---
 src/buildstream_plugins/sources/docker.py | 30 ++++++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/buildstream_plugins/sources/docker.py b/src/buildstream_plugins/sources/docker.py
index c22a081..6545bb3 100644
--- a/src/buildstream_plugins/sources/docker.py
+++ b/src/buildstream_plugins/sources/docker.py
@@ -620,7 +620,19 @@ def stage(self, directory):
 
                     self._verify_blob(blob_path, expected_digest=layer_digest)
 
-                    members_dict, whiteout_paths = self._get_members_and_whiteout_paths(blob_path)
+                    members_dict, whiteout_paths, opaque_whiteout_dirs = self._get_members_and_whiteout_paths(blob_path)
+
+                    # Process opaque whiteouts: remove members under the corresponding directories from previous layers
+                    for _, prev_members_dict in layer_members:
+                        members_to_remove = []
+                        for member_path in prev_members_dict:
+                            # Check if this member should be removed by any opaque whiteout
+                            for opaque_whiteout_dir in opaque_whiteout_dirs:
+                                if member_path.startswith(opaque_whiteout_dir):
+                                    members_to_remove.append(member_path)
+                                    break
+                        for member_path in members_to_remove:
+                            del prev_members_dict[member_path]
 
                     # Process whiteouts: remove corresponding members from previous layers
                     for _, prev_members_dict in layer_members:
@@ -679,9 +691,10 @@ def _get_members_and_whiteout_paths(layer_tar_path):
         """Return the set of files to remove and
extract for a given layer :param layer_tar_path: The path to the layer tar.gz file - :return: Tuple of (members_dict, whiteout_paths) + :return: Tuple of (members_dict, whiteout_paths, opaque_whiteout_dirs) - members_dict: OrderedDict of TarInfo members to extract into staging directory - whiteout_paths: list of file paths that should be removed from previous layers + - opaque_whiteout_dirs: list of directory paths whose contents should be removed from previous layers """ @@ -707,15 +720,24 @@ def is_regular_file(info): with tarfile.open(layer_tar_path, tarinfo=ReadableTarInfo) as tar: members_dict = OrderedDict() whiteout_paths = [] + opaque_whiteout_dirs = [] for member in tar.getmembers(): - if os.path.basename(member.name).startswith(".wh."): + member_basename = os.path.basename(member.name) + if member_basename == ".wh..wh..opq": + # Opaque whiteout file + opaque_whiteout_dir = os.path.dirname(member.name) + if opaque_whiteout_dir != "": + # Add "/" to facilitate path prefix matching + opaque_whiteout_dir += "/" + opaque_whiteout_dirs.append(opaque_whiteout_dir) + elif member_basename.startswith(".wh."): # Whiteout file whiteout_paths.append(strip_wh(member.name)) elif is_regular_file(member): members_dict[member.name] = member - return members_dict, whiteout_paths + return members_dict, whiteout_paths, opaque_whiteout_dirs # Assert that a tarfile is safe to extract; specifically, make # sure that we don't do anything outside of the target