Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 106 additions & 93 deletions python/private/venv_runfiles.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
load("@bazel_skylib//lib:paths.bzl", "paths")
load(
":common.bzl",
"PYTHON_FILE_EXTENSIONS",
"is_file",
"relative_path",
"runfiles_root_path",
Expand Down Expand Up @@ -58,7 +57,6 @@ def create_venv_app_files(ctx, deps, venv_dir_map):
if is_file(link_to):
symlink_from = "{}/{}".format(ctx.label.package, bin_venv_path)
runfiles_symlinks[symlink_from] = link_to

else:
venv_link = ctx.actions.declare_symlink(bin_venv_path)
venv_link_rf_path = runfiles_root_path(ctx, venv_link.short_path)
Expand All @@ -77,14 +75,16 @@ def create_venv_app_files(ctx, deps, venv_dir_map):
)

# Visible for testing
def build_link_map(ctx, entries):
def build_link_map(ctx, entries, return_conflicts = False):
"""Compute the mapping of venv paths to their backing objects.


Args:
ctx: {type}`ctx` current ctx.
entries: {type}`list[VenvSymlinkEntry]` the entries that describe the
venv-relative
return_conflicts: {type}`bool`. Only present for testing. If True,
also return a list of the groups that had overlapping paths and had
to be resolved and merged.

Returns:
{type}`dict[str, dict[str, str|File]]` Mappings of venv paths to their
Expand Down Expand Up @@ -114,6 +114,7 @@ def build_link_map(ctx, entries):

# final paths to keep, grouped by kind
keep_link_map = {} # dict[str kind, dict[path, str|File]]
conflicts = [] if return_conflicts else None
for kind, entries in entries_by_kind.items():
# dict[str kind-relative path, str|File link_to]
keep_kind_link_map = {}
Expand All @@ -129,12 +130,17 @@ def build_link_map(ctx, entries):
else:
keep_kind_link_map[entry.venv_path] = entry.link_to_path
else:
if return_conflicts:
conflicts.append(group)

# Merge a group of overlapping prefixes
_merge_venv_path_group(ctx, group, keep_kind_link_map)

keep_link_map[kind] = keep_kind_link_map

return keep_link_map
if return_conflicts:
return keep_link_map, conflicts
else:
return keep_link_map

def _group_venv_path_entries(entries):
"""Group entries by VenvSymlinkEntry.venv_path overlap.
Expand Down Expand Up @@ -235,109 +241,115 @@ def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
# Append slash to prevent incorrect prefix-string matches
site_packages_root += "/"

# We have to build a list of (runfiles path, site-packages path) pairs of the files to
# create in the consuming binary's venv site-packages directory. To minimize the number of
# files to create, we just return the paths to the directories containing the code of
# interest.
#
# However, namespace packages complicate matters: multiple distributions install in the
# same directory in site-packages. This works out because they don't overlap in their
# files. Typically, they install to different directories within the namespace package
# directory. We also need to ensure that we can handle a case where the main package (e.g.
# airflow) has directories only containing data files and then namespace packages coming
# along and being next to it.
#
# Lastly we have to assume python modules just being `.py` files (e.g. typing-extensions)
# is just a single Python file.
all_files = sorted(files, key = lambda f: f.short_path)

dir_symlinks = {} # dirname -> runfile path
venv_symlinks = []
# venv paths that cannot be directly linked. Dict acting as set.
cannot_be_linked_directly = {}

# Sort so order is top-down
all_files = sorted(files, key = lambda f: f.short_path)
# dict[str path, VenvSymlinkEntry]
# Where path is the venv path (i.e. relative to site_packages_prefix)
venv_symlinks = {}

# List of (File, str venv_path) tuples
files_left_to_link = []

# We want to minimize the number of files symlinked. Ideally, only the
# top-level directories are symlinked. Unfortunately, shared libraries
# complicate matters: if a shared library's directory is linked, then the
# dynamic linker computes the wrong search path.
#
# To fix, we have to directly link shared libraries. This then means that
# all the parent directories of the shared library can't be linked
# directly.
for src in all_files:
path = _repo_relative_short_path(src.short_path)
if not path.startswith(site_packages_root):
rf_root_path = runfiles_root_path(ctx, src.short_path)
_, _, repo_rel_path = rf_root_path.partition("/")
head, found_sp_root, venv_path = repo_rel_path.partition(site_packages_root)
if head or not found_sp_root:
# If head is set, then the path didn't start with site_packages_root
# if found_sp_root is empty, then it means it wasn't found at all.
continue
path = path.removeprefix(site_packages_root)
dir_name, _, filename = path.rpartition("/")
runfiles_dir_name, _, _ = runfiles_root_path(ctx, src.short_path).partition("/")

filename = paths.basename(venv_path)
if _is_linker_loaded_library(filename):
entry = VenvSymlinkEntry(
venv_symlinks[venv_path] = VenvSymlinkEntry(
kind = VenvSymlinkKind.LIB,
link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename),
link_to_path = rf_root_path,
link_to_file = src,
package = package,
version = version_str,
venv_path = path,
files = depset([src]),
venv_path = venv_path,
)
venv_symlinks.append(entry)
continue

if dir_name in dir_symlinks:
# we already have this dir, this allows us to short-circuit since most of the
# ctx.files.data might share the same directories as ctx.files.srcs
parent = paths.dirname(venv_path)
for _ in range(len(venv_path) + 1): # Iterate enough times to traverse up
if not parent:
break
if cannot_be_linked_directly.get(parent, False):
# Already seen
break
cannot_be_linked_directly[parent] = True
parent = paths.dirname(parent)
else:
files_left_to_link.append((src, venv_path))

# At this point, venv_symlinks has entries for the shared libraries
# and cannot_be_linked_directly has the directories that cannot be
# directly linked. Next, we loop over the remaining files and group
# them into the highest level directory that can be linked.

# dict[str venv_path, list[File]]
optimized_groups = {}

for src, venv_path in files_left_to_link:
parent = paths.dirname(venv_path)
if not parent:
# File in root, must be linked directly
optimized_groups.setdefault(venv_path, [])
optimized_groups[venv_path].append(src)
continue

if dir_name:
# This can be either:
# * a directory with libs (e.g. numpy.libs, created by auditwheel)
# * a directory with `__init__.py` file that potentially also needs to be
# symlinked.
# * `.dist-info` directory
#
# This could be also regular files, that just need to be symlinked, so we will
# add the directory here.
dir_symlinks[dir_name] = runfiles_dir_name
elif src.extension in PYTHON_FILE_EXTENSIONS:
# This would be files that do not have directories and we just need to add
# direct symlinks to them as is, we only allow Python files in here
entry = VenvSymlinkEntry(
kind = VenvSymlinkKind.LIB,
link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename),
link_to_file = src,
package = package,
version = version_str,
venv_path = path,
files = depset([src]),
)
venv_symlinks.append(entry)

# Sort so that we encounter `foo` before `foo/bar`. This ensures we
# see the top-most explicit package first.
dirnames = sorted(dir_symlinks.keys())
first_level_explicit_packages = []
for d in dirnames:
is_sub_package = False
for existing in first_level_explicit_packages:
# Suffix with / to prevent foo matching foobar
if d.startswith(existing + "/"):
is_sub_package = True
break
if not is_sub_package:
first_level_explicit_packages.append(d)

for dirname in first_level_explicit_packages:
prefix = dir_symlinks[dirname]
link_to_path = paths.join(prefix, site_packages_root, dirname)
entry = VenvSymlinkEntry(
if parent in cannot_be_linked_directly:
# File in a directory that cannot be directly linked,
# so link the file directly
optimized_groups.setdefault(venv_path, [])
optimized_groups[venv_path].append(src)
else:
# This path can be grouped. Find the highest-level directory to link.
venv_path = parent
next_parent = paths.dirname(parent)
for _ in range(len(venv_path) + 1): # Iterate enough times
if next_parent:
if next_parent not in cannot_be_linked_directly:
venv_path = next_parent
next_parent = paths.dirname(next_parent)
else:
break
else:
break

optimized_groups.setdefault(venv_path, [])
optimized_groups[venv_path].append(src)

# Finally, for each group, we create the VenvSymlinkEntry objects
for venv_path, files in optimized_groups.items():
link_to_path = (
_get_label_runfiles_repo(ctx, files[0].owner) +
"/" +
site_packages_root +
venv_path
)
venv_symlinks[venv_path] = VenvSymlinkEntry(
kind = VenvSymlinkKind.LIB,
link_to_path = link_to_path,
link_to_file = None,
package = package,
version = version_str,
venv_path = dirname,
files = depset([
f
for f in all_files
if runfiles_root_path(ctx, f.short_path).startswith(link_to_path + "/")
]),
venv_path = venv_path,
files = depset(files),
)
venv_symlinks.append(entry)

return venv_symlinks
return venv_symlinks.values()

def _is_linker_loaded_library(filename):
"""Tells if a filename is one that `dlopen()` or the runtime linker handles.
Expand All @@ -357,9 +369,10 @@ def _is_linker_loaded_library(filename):
return True
return False

def _repo_relative_short_path(short_path):
# Convert `../+pypi+foo/some/file.py` to `some/file.py`
if short_path.startswith("../"):
return short_path[3:].partition("/")[2]
def _get_label_runfiles_repo(ctx, label):
repo = label.repo_name
if repo:
return repo
else:
return short_path
# For files, empty repo means the main repo
return ctx.workspace_name
Loading