diff --git a/python/private/venv_runfiles.bzl b/python/private/venv_runfiles.bzl index eeedda4555..43fcab6192 100644 --- a/python/private/venv_runfiles.bzl +++ b/python/private/venv_runfiles.bzl @@ -3,7 +3,6 @@ load("@bazel_skylib//lib:paths.bzl", "paths") load( ":common.bzl", - "PYTHON_FILE_EXTENSIONS", "is_file", "relative_path", "runfiles_root_path", @@ -58,7 +57,6 @@ def create_venv_app_files(ctx, deps, venv_dir_map): if is_file(link_to): symlink_from = "{}/{}".format(ctx.label.package, bin_venv_path) runfiles_symlinks[symlink_from] = link_to - else: venv_link = ctx.actions.declare_symlink(bin_venv_path) venv_link_rf_path = runfiles_root_path(ctx, venv_link.short_path) @@ -77,14 +75,16 @@ def create_venv_app_files(ctx, deps, venv_dir_map): ) # Visible for testing -def build_link_map(ctx, entries): +def build_link_map(ctx, entries, return_conflicts = False): """Compute the mapping of venv paths to their backing objects. - Args: ctx: {type}`ctx` current ctx. entries: {type}`list[VenvSymlinkEntry]` the entries that describe the venv-relative + return_conflicts: {type}`bool`. Only present for testing. If True, + also return a list of the groups that had overlapping paths and had + to be resolved and merged. Returns: {type}`dict[str, dict[str, str|File]]` Mappings of venv paths to their @@ -114,6 +114,7 @@ def build_link_map(ctx, entries): # final paths to keep, grouped by kind keep_link_map = {} # dict[str kind, dict[path, str|File]] + conflicts = [] if return_conflicts else None for kind, entries in entries_by_kind.items(): # dict[str kind-relative path, str|File link_to] keep_kind_link_map = {} @@ -129,12 +130,17 @@ def build_link_map(ctx, entries): else: keep_kind_link_map[entry.venv_path] = entry.link_to_path else: + if return_conflicts: + conflicts.append(group) + # Merge a group of overlapping prefixes _merge_venv_path_group(ctx, group, keep_kind_link_map) keep_link_map[kind] = keep_kind_link_map - - return keep_link_map + if return_conflicts: + return keep_link_map, conflicts + else: + return keep_link_map def _group_venv_path_entries(entries): """Group entries by VenvSymlinkEntry.venv_path overlap. @@ -235,109 +241,115 @@ def get_venv_symlinks(ctx, files, package, version_str, site_packages_root): # Append slash to prevent incorrect prefix-string matches site_packages_root += "/" - # We have to build a list of (runfiles path, site-packages path) pairs of the files to - # create in the consuming binary's venv site-packages directory. To minimize the number of - # files to create, we just return the paths to the directories containing the code of - # interest. - # - # However, namespace packages complicate matters: multiple distributions install in the - # same directory in site-packages. This works out because they don't overlap in their - # files. Typically, they install to different directories within the namespace package - # directory. We also need to ensure that we can handle a case where the main package (e.g. - # airflow) has directories only containing data files and then namespace packages coming - # along and being next to it. - # - # Lastly we have to assume python modules just being `.py` files (e.g. typing-extensions) - # is just a single Python file. + all_files = sorted(files, key = lambda f: f.short_path) - dir_symlinks = {} # dirname -> runfile path - venv_symlinks = [] + # venv paths that cannot be directly linked. Dict acting as set. + cannot_be_linked_directly = {} - # Sort so order is top-down - all_files = sorted(files, key = lambda f: f.short_path) + # dict[str path, VenvSymlinkEntry] + # Where path is the venv path (i.e. relative to site_packages_prefix) + venv_symlinks = {} + # List of (File, str venv_path) tuples + files_left_to_link = [] + + # We want to minimize the number of files symlinked. Ideally, only the + # top-level directories are symlinked. Unfortunately, shared libraries + # complicate matters: if a shared library's directory is linked, then the + # dynamic linker computes the wrong search path. + # + # To fix, we have to directly link shared libraries. This then means that + # all the parent directories of the shared library can't be linked + # directly. for src in all_files: - path = _repo_relative_short_path(src.short_path) - if not path.startswith(site_packages_root): + rf_root_path = runfiles_root_path(ctx, src.short_path) + _, _, repo_rel_path = rf_root_path.partition("/") + head, found_sp_root, venv_path = repo_rel_path.partition(site_packages_root) + if head or not found_sp_root: + # If head is set, then the path didn't start with site_packages_root + # if found_sp_root is empty, then it means it wasn't found at all. continue - path = path.removeprefix(site_packages_root) - dir_name, _, filename = path.rpartition("/") - runfiles_dir_name, _, _ = runfiles_root_path(ctx, src.short_path).partition("/") + filename = paths.basename(venv_path) if _is_linker_loaded_library(filename): - entry = VenvSymlinkEntry( + venv_symlinks[venv_path] = VenvSymlinkEntry( kind = VenvSymlinkKind.LIB, - link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename), + link_to_path = rf_root_path, link_to_file = src, package = package, version = version_str, - venv_path = path, files = depset([src]), + venv_path = venv_path, ) - venv_symlinks.append(entry) - continue - - if dir_name in dir_symlinks: - # we already have this dir, this allows us to short-circuit since most of the - # ctx.files.data might share the same directories as ctx.files.srcs + parent = paths.dirname(venv_path) + for _ in range(len(venv_path) + 1): # Iterate enough times to traverse up + if not parent: + break + if cannot_be_linked_directly.get(parent, False): + # Already seen + break + cannot_be_linked_directly[parent] = True + parent = paths.dirname(parent) + else: + files_left_to_link.append((src, venv_path)) + + # At this point, venv_symlinks has entries for the shared libraries + # and cannot_be_linked_directly has the directories that cannot be + # directly linked. Next, we loop over the remaining files and group + # them into the highest level directory that can be linked. + + # dict[str venv_path, list[File]] + optimized_groups = {} + + for src, venv_path in files_left_to_link: + parent = paths.dirname(venv_path) + if not parent: + # File in root, must be linked directly + optimized_groups.setdefault(venv_path, []) + optimized_groups[venv_path].append(src) continue - if dir_name: - # This can be either: - # * a directory with libs (e.g. numpy.libs, created by auditwheel) - # * a directory with `__init__.py` file that potentially also needs to be - # symlinked. - # * `.dist-info` directory - # - # This could be also regular files, that just need to be symlinked, so we will - # add the directory here. - dir_symlinks[dir_name] = runfiles_dir_name - elif src.extension in PYTHON_FILE_EXTENSIONS: - # This would be files that do not have directories and we just need to add - # direct symlinks to them as is, we only allow Python files in here - entry = VenvSymlinkEntry( - kind = VenvSymlinkKind.LIB, - link_to_path = paths.join(runfiles_dir_name, site_packages_root, filename), - link_to_file = src, - package = package, - version = version_str, - venv_path = path, - files = depset([src]), - ) - venv_symlinks.append(entry) - - # Sort so that we encounter `foo` before `foo/bar`. This ensures we - # see the top-most explicit package first. - dirnames = sorted(dir_symlinks.keys()) - first_level_explicit_packages = [] - for d in dirnames: - is_sub_package = False - for existing in first_level_explicit_packages: - # Suffix with / to prevent foo matching foobar - if d.startswith(existing + "/"): - is_sub_package = True - break - if not is_sub_package: - first_level_explicit_packages.append(d) - - for dirname in first_level_explicit_packages: - prefix = dir_symlinks[dirname] - link_to_path = paths.join(prefix, site_packages_root, dirname) - entry = VenvSymlinkEntry( + if parent in cannot_be_linked_directly: + # File in a directory that cannot be directly linked, + # so link the file directly + optimized_groups.setdefault(venv_path, []) + optimized_groups[venv_path].append(src) + else: + # This path can be grouped. Find the highest-level directory to link. + venv_path = parent + next_parent = paths.dirname(parent) + for _ in range(len(venv_path) + 1): # Iterate enough times + if next_parent: + if next_parent not in cannot_be_linked_directly: + venv_path = next_parent + next_parent = paths.dirname(next_parent) + else: + break + else: + break + + optimized_groups.setdefault(venv_path, []) + optimized_groups[venv_path].append(src) + + # Finally, for each group, we create the VenvSymlinkEntry objects + for venv_path, files in optimized_groups.items(): + link_to_path = ( + _get_label_runfiles_repo(ctx, files[0].owner) + + "/" + + site_packages_root + + venv_path + ) + venv_symlinks[venv_path] = VenvSymlinkEntry( kind = VenvSymlinkKind.LIB, link_to_path = link_to_path, + link_to_file = None, package = package, version = version_str, - venv_path = dirname, - files = depset([ - f - for f in all_files - if runfiles_root_path(ctx, f.short_path).startswith(link_to_path + "/") - ]), + venv_path = venv_path, + files = depset(files), ) - venv_symlinks.append(entry) - return venv_symlinks + return venv_symlinks.values() def _is_linker_loaded_library(filename): """Tells if a filename is one that `dlopen()` or the runtime linker handles. @@ -357,9 +369,10 @@ def _is_linker_loaded_library(filename): return True return False -def _repo_relative_short_path(short_path): - # Convert `../+pypi+foo/some/file.py` to `some/file.py` - if short_path.startswith("../"): - return short_path[3:].partition("/")[2] +def _get_label_runfiles_repo(ctx, label): + repo = label.repo_name + if repo: + return repo else: - return short_path + # For files, empty repo means the main repo + return ctx.workspace_name diff --git a/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl b/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl index db2f21c7e7..fc0b5d0bf3 100644 --- a/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl +++ b/tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl @@ -26,6 +26,11 @@ empty_files = rule( _tests = [] +# NOTE: In bzlmod, the workspace name is always "_main". +# Under workspace, the workspace name is the name configured in WORKSPACE, +# or "__main__" if was unspecified. +# NOTE: ctx.workspace_name is always the root workspace, not the workspace +# of the target being processed (ctx.label). def _ctx(workspace_name = "_main"): return struct( workspace_name = workspace_name, @@ -36,6 +41,23 @@ def _file(short_path): short_path = short_path, ) +def _venv_symlink(venv_path, *, link_to_path = None, files = []): + return struct( + link_to_path = link_to_path, + venv_path = venv_path, + files = files, + ) + +def _venv_symlinks_from_entries(entries): + result = [] + for symlink_entry in entries: + result.append(struct( + venv_path = symlink_entry.venv_path, + link_to_path = symlink_entry.link_to_path, + files = [f.short_path for f in symlink_entry.files.to_list()], + )) + return sorted(result, key = lambda e: (e.link_to_path, e.venv_path)) + def _entry(venv_path, link_to_path, files = [], **kwargs): kwargs.setdefault("kind", VenvSymlinkKind.LIB) kwargs.setdefault("package", None) @@ -89,7 +111,7 @@ def _test_conflict_merging_impl(env, _): _entry("duplicate", "+dupe_b/site-packages/duplicate", ["d.py"]), ] - actual = build_link_map(_ctx(), entries) + actual, conflicts = build_link_map(_ctx(), entries, return_conflicts = True) expected_libs = { "a-1.0.dist-info": "+pypi_a/site-packages/a-1.0.dist-info", "a/a.txt": _file("../+pypi_a/site-packages/a/a.txt"), @@ -101,6 +123,146 @@ def _test_conflict_merging_impl(env, _): env.expect.that_dict(actual[VenvSymlinkKind.LIB]).contains_exactly(expected_libs) env.expect.that_dict(actual).keys().contains_exactly([VenvSymlinkKind.LIB]) + env.expect.that_int(len(conflicts)).is_greater_than(0) + +def _test_optimized_grouping_complex(name): + empty_files( + name = name + "_files", + paths = [ + "site-packages/pkg1/a.txt", + "site-packages/pkg1/b/b_mod.so", + "site-packages/pkg1/c/c1.txt", + "site-packages/pkg1/c/c2.txt", + "site-packages/pkg1/d/d1.txt", + "site-packages/pkg1/dd/dd1.txt", + "site-packages/pkg1/q1/q1.txt", + "site-packages/pkg1/q1/q2a/libq.so", + "site-packages/pkg1/q1/q2a/q2.txt", + "site-packages/pkg1/q1/q2a/q3/q3a.txt", + "site-packages/pkg1/q1/q2a/q3/q3b.txt", + "site-packages/pkg1/q1/q2b/q2b.txt", + ], + ) + analysis_test( + name = name, + impl = _test_optimized_grouping_complex_impl, + target = name + "_files", + ) + +_tests.append(_test_optimized_grouping_complex) + +def _test_optimized_grouping_complex_impl(env, target): + test_ctx = _ctx(workspace_name = env.ctx.workspace_name) + entries = get_venv_symlinks( + test_ctx, + target.files.to_list(), + package = "pkg1", + version_str = "1.0", + site_packages_root = env.ctx.label.package + "/site-packages", + ) + actual = _venv_symlinks_from_entries(entries) + + rr = "{}/{}/site-packages/".format(test_ctx.workspace_name, env.ctx.label.package) + expected = [ + _venv_symlink( + "pkg1/a.txt", + link_to_path = rr + "pkg1/a.txt", + files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/a.txt", + ], + ), + _venv_symlink( + "pkg1/b", + link_to_path = rr + "pkg1/b", + files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/b/b_mod.so", + ], + ), + _venv_symlink("pkg1/c", link_to_path = rr + "pkg1/c", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/c/c1.txt", + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/c/c2.txt", + ]), + _venv_symlink("pkg1/d", link_to_path = rr + "pkg1/d", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/d/d1.txt", + ]), + _venv_symlink("pkg1/dd", link_to_path = rr + "pkg1/dd", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/dd/dd1.txt", + ]), + _venv_symlink("pkg1/q1/q1.txt", link_to_path = rr + "pkg1/q1/q1.txt", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q1.txt", + ]), + _venv_symlink("pkg1/q1/q2a/libq.so", link_to_path = rr + "pkg1/q1/q2a/libq.so", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q2a/libq.so", + ]), + _venv_symlink("pkg1/q1/q2a/q2.txt", link_to_path = rr + "pkg1/q1/q2a/q2.txt", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q2a/q2.txt", + ]), + _venv_symlink("pkg1/q1/q2a/q3", link_to_path = rr + "pkg1/q1/q2a/q3", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q2a/q3/q3a.txt", + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q2a/q3/q3b.txt", + ]), + _venv_symlink("pkg1/q1/q2b", link_to_path = rr + "pkg1/q1/q2b", files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg1/q1/q2b/q2b.txt", + ]), + ] + expected = sorted(expected, key = lambda e: (e.link_to_path, e.venv_path)) + env.expect.that_collection( + actual, + ).contains_exactly(expected) + _, conflicts = build_link_map(test_ctx, entries, return_conflicts = True) + + # The point of the optimization is to avoid having to merge conflicts. + env.expect.that_collection(conflicts).contains_exactly([]) + +def _test_optimized_grouping_single_toplevel(name): + empty_files( + name = name + "_files", + paths = [ + "site-packages/pkg2/a.txt", + "site-packages/pkg2/b_mod.so", + ], + ) + analysis_test( + name = name, + impl = _test_optimized_grouping_single_toplevel_impl, + target = name + "_files", + ) + +_tests.append(_test_optimized_grouping_single_toplevel) + +def _test_optimized_grouping_single_toplevel_impl(env, target): + test_ctx = _ctx(workspace_name = env.ctx.workspace_name) + entries = get_venv_symlinks( + test_ctx, + target.files.to_list(), + package = "pkg2", + version_str = "1.0", + site_packages_root = env.ctx.label.package + "/site-packages", + ) + actual = _venv_symlinks_from_entries(entries) + + rr = "{}/{}/site-packages/".format(test_ctx.workspace_name, env.ctx.label.package) + expected = [ + _venv_symlink( + "pkg2", + link_to_path = rr + "pkg2", + files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/a.txt", + "tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/b_mod.so", + ], + ), + ] + expected = sorted(expected, key = lambda e: (e.link_to_path, e.venv_path)) + + env.expect.that_collection( + actual, + ).contains_exactly(expected) + + _, conflicts = build_link_map(test_ctx, entries, return_conflicts = True) + + # The point of the optimization is to avoid having to merge conflicts. + env.expect.that_collection(conflicts).contains_exactly([]) + def _test_package_version_filtering(name): analysis_test( name = name, @@ -219,8 +381,7 @@ def _test_multiple_venv_symlink_kinds_impl(env, _): "libfile", "+pypi_lib/site-packages/libfile", ["lib.txt"], - kind = - VenvSymlinkKind.LIB, + kind = VenvSymlinkKind.LIB, ), _entry( "binfile", @@ -294,34 +455,33 @@ def _test_shared_library_symlinking_impl(env, target): site_packages_root = env.ctx.label.package + "/site-packages", ) - actual = [e for e in actual_entries if e.venv_path == "foo.libs/libx.so"] - if not actual: - fail("Did not find VenvSymlinkEntry with venv_path equal to foo.libs/libx.so. " + - "Found: {}".format(actual_entries)) - elif len(actual) > 1: - fail("Found multiple entries with venv_path=foo.libs/libx.so. " + - "Found: {}".format(actual_entries)) - actual = actual[0] + actual = _venv_symlinks_from_entries(actual_entries) - actual_files = actual.files.to_list() - expected_lib_dso = [f for f in srcs if f.basename == "libx.so"] - env.expect.that_collection(actual_files).contains_exactly(expected_lib_dso) + env.expect.that_collection(actual).contains_at_least([ + _venv_symlink( + "bar/libs/liby.so", + link_to_path = "_main/tests/venv_site_packages_libs/app_files_building/site-packages/bar/libs/liby.so", + files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/bar/libs/liby.so", + ], + ), + _venv_symlink( + "foo.libs/libx.so", + link_to_path = "_main/tests/venv_site_packages_libs/app_files_building/site-packages/foo.libs/libx.so", + files = [ + "tests/venv_site_packages_libs/app_files_building/site-packages/foo.libs/libx.so", + ], + ), + ]) - entries = actual_entries - actual = build_link_map(_ctx(), entries) + actual = build_link_map(_ctx(), actual_entries) # The important condition is that each lib*.so file is linked directly. expected_libs = { "bar/libs/liby.so": srcs[0], - "bar/x.py": srcs[1], - "bar/y.so": srcs[2], - "foo": "_main/tests/venv_site_packages_libs/app_files_building/site-packages/foo", "foo.libs/libx.so": srcs[3], - "root.pth": srcs[-3], - "root.py": srcs[-2], - "root.so": srcs[-1], } - env.expect.that_dict(actual[VenvSymlinkKind.LIB]).contains_exactly(expected_libs) + env.expect.that_dict(actual[VenvSymlinkKind.LIB]).contains_at_least(expected_libs) def app_files_building_test_suite(name): test_suite(