Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 75 additions & 35 deletions src/buildstream_plugins/sources/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@
import tarfile
import urllib.parse

from collections import OrderedDict

import requests

from buildstream import Source, SourceError
Expand All @@ -115,6 +117,7 @@
sha256sum,
link_files,
move_atomic,
get_umask,
)

#
Expand Down Expand Up @@ -363,23 +366,26 @@ def blob(self, image_path, blob_digest, download_to):

class ReadableTarInfo(tarfile.TarInfo):
    """
    `TarInfo` subclass that overrides `TarFile`'s `extractall` semantics by replacing the
    accessor for the `mode` attribute of `TarInfo`, the class that encapsulates the internal
    meta-data of each member of the tarball.

    The mode stored in the archive is not reported verbatim. It is only remembered so that
    its owner-execute bit can be consulted; the mode that is reported is derived from the
    umask returned by `get_umask()`:

    - directories, and members whose archived mode has the owner-execute bit (`0o100`) set,
      report `0o777 & ~umask`;
    - every other member reports `0o666 & ~umask`.

    As long as the umask does not mask the owner-read bit, members therefore report a mode
    that is readable by the owner, regardless of the mode stored in the archive.
    """

    # The mode attribute is not declared as a property and so
    # this trips up the static type checker, mark this as "ignore"
    #
    # https://github.com/python/mypy/issues/4125
    @property  # type: ignore
    def mode(self):
        # Respect umask instead of the file mode stored in the archive.
        # The only bit used from the embedded mode is the executable bit for files.
        umask = get_umask()
        if self.isdir() or self.__permission & 0o100:
            return 0o777 & ~umask
        return 0o666 & ~umask

    @mode.setter
    def mode(self, permission):
        # Keep the archived mode so the getter can inspect its executable bit.
        self.__permission = permission  # pylint: disable=attribute-defined-outside-init


class DockerSource(Source):
Expand Down Expand Up @@ -605,31 +611,54 @@ def stage(self, directory):
raise SourceError("Unable to load manifest: {}".format(e)) from e

try:
layer_members = []

# Create a list of members to extract from each layer
for layer in manifest["layers"]:
layer_digest = layer["digest"]
blob_path = os.path.join(mirror_dir, layer_digest + ".tar.gz")

self._verify_blob(blob_path, expected_digest=layer_digest)
(
extract_fileset,
white_out_fileset,
) = self._get_extract_and_remove_files(blob_path)

# remove files associated with whiteouts
for white_out_file in white_out_fileset:
white_out_file = os.path.join(directory, white_out_file)
os.remove(white_out_file)

# extract files for the current layer
members_dict, whiteout_paths, opaque_whiteout_dirs = self._get_members_and_whiteout_paths(blob_path)

# Process opaque whiteouts: remove members under the corresponding directories from previous layers
for _, prev_members_dict in layer_members:
members_to_remove = []
for member_path in prev_members_dict:
# Check if this member should be removed by any opaque whiteout
for opaque_whiteout_dir in opaque_whiteout_dirs:
if member_path.startswith(opaque_whiteout_dir):
members_to_remove.append(member_path)
break
for member_path in members_to_remove:
del prev_members_dict[member_path]

# Process whiteouts: remove corresponding members from previous layers
for _, prev_members_dict in layer_members:
for whiteout_path in whiteout_paths:
prev_members_dict.pop(whiteout_path, None)

layer_members.append((blob_path, members_dict))

# Extract files for each layer
for blob_path, members_dict in layer_members:
if not members_dict:
# No files to extract from this layer
continue

# Extract files for the current layer
with tarfile.open(blob_path, tarinfo=ReadableTarInfo) as tar:
with self.tempdir() as td:
members = list(members_dict.values())

if hasattr(tarfile, "tar_filter"):
# Python 3.12+ (and older versions with backports)
tar.extraction_filter = tarfile.tar_filter
else:
for member in extract_fileset:
for member in members:
self._assert_tarinfo_safe(member, td)
tar.extractall(path=td, members=extract_fileset)
tar.extractall(path=td, members=members)
link_files(td, directory)

except (OSError, SourceError, tarfile.TarError) as e:
Expand Down Expand Up @@ -658,14 +687,14 @@ def collect_source_info(self):
]

@staticmethod
def _get_extract_and_remove_files(layer_tar_path):
def _get_members_and_whiteout_paths(layer_tar_path):
"""Return the set of files to remove and extract for a given layer

:param layer_tar_path: The path where a layer has been extracted
:return: Tuple of filesets
- extract_fileset: files to extract into staging directory
- delete_fileset: files to remove from staging directory as the current layer
contains a whiteout corresponding to a staged file in the previous layers
:param layer_tar_path: The path to the layer tar.gz file
:return: Tuple of (members_dict, whiteout_paths, opaque_whiteout_dirs)
- members_dict: OrderedDict of TarInfo members to extract into staging directory
- whiteout_paths: list of file paths that should be removed from previous layers
- opaque_whiteout_dirs: list of directory paths whose contents should be removed from previous layers

"""

Expand All @@ -677,7 +706,7 @@ def strip_wh(white_out_file):
"""
# whiteout files have the syntax of `*/.wh.*`
file_name = os.path.basename(white_out_file)
path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.")[1])
path = os.path.join(os.path.dirname(white_out_file), file_name.split(".wh.", maxsplit=1)[1])
return path

def is_regular_file(info):
Expand All @@ -688,16 +717,27 @@ def is_regular_file(info):
"""
return not (info.name.startswith("dev/") or info.isdev())

with tarfile.open(layer_tar_path) as tar:
extract_fileset = []
delete_fileset = []
with tarfile.open(layer_tar_path, tarinfo=ReadableTarInfo) as tar:
members_dict = OrderedDict()
whiteout_paths = []
opaque_whiteout_dirs = []

for member in tar.getmembers():
if os.path.basename(member.name).startswith(".wh."):
delete_fileset.append(strip_wh(member.name))
member_basename = os.path.basename(member.name)
if member_basename == ".wh..wh..opq":
# Opaque whiteout file
opaque_whiteout_dir = os.path.dirname(member.name)
if opaque_whiteout_dir != "":
# Add "/" to facilitate path prefix matching
opaque_whiteout_dir += "/"
opaque_whiteout_dirs.append(opaque_whiteout_dir)
elif member_basename.startswith(".wh."):
# Whiteout file
whiteout_paths.append(strip_wh(member.name))
elif is_regular_file(member):
extract_fileset.append(member)
members_dict[member.name] = member

return extract_fileset, delete_fileset
return members_dict, whiteout_paths, opaque_whiteout_dirs

# Assert that a tarfile is safe to extract; specifically, make
# sure that we don't do anything outside of the target
Expand Down