diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..d294a4270 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +.coverage +dependency_graph.dot +dependency_graph.svg +dev_scripts_helpers/thin_client/build.py.log +dev_scripts_helpers/thin_client/thin_client_utils.py.log +dev_scripts_helpers/thin_client/tmp.requirements.txt +helpers.egg-info/ +helpers/__pycache__/ +i +import_check/__pycache__/ +import_check/test/__pycache__/ +report.txt +report_cycles.txt +report_max_level.txt +__pycache__/ +dependency_report.txt +devops/compose/local.docker-compose.yml +tmp.precommit_output.txt +tmp.pytest.log +tmp.amp_normalize_import.txt +source +devops/compose/tmp.docker-compose.yml diff --git a/dev_scripts_helpers/thin_client/requirements.txt b/dev_scripts_helpers/thin_client/requirements.txt index b3c2273d9..b9028c942 100644 --- a/dev_scripts_helpers/thin_client/requirements.txt +++ b/dev_scripts_helpers/thin_client/requirements.txt @@ -8,6 +8,8 @@ docker < 7 docker-compose >= 1.29.0 invoke >= 1.5.0 poetry +networkx >= 2.6.3 +pydot >= 1.4.1 pytest >= 6.0.0 s3fs # For tools like `publish_notebook.py`. tqdm diff --git a/dev_scripts_helpers/thin_client/thin_client_utils.sh b/dev_scripts_helpers/thin_client/thin_client_utils.sh index df3cbd127..7685557e9 100644 --- a/dev_scripts_helpers/thin_client/thin_client_utils.sh +++ b/dev_scripts_helpers/thin_client/thin_client_utils.sh @@ -198,13 +198,32 @@ set_csfy_env_vars() { set_path() { - echo "# set_path()" + # Process interface. + dassert_eq_num_args $# 1 "set_path" local dev_script_dir=$1 + # dassert_dir_exists $dev_script_dir + dtrace "dev_script_dir=$dev_script_dir" + # TODO(gp): Unify this as part of CmTask12257. + if [[ -n "$GIT_ROOT_DIR" ]]; then + # `GIT_ROOT_DIR` is available outside the container. + GIT_ROOT=$GIT_ROOT_DIR + elif [[ -n "$CSFY_GIT_ROOT_PATH" ]]; then + # `CSFY_GIT_ROOT_PATH` is available inside the container. + GIT_ROOT=$CSFY_GIT_ROOT_PATH + fi export PATH=$(pwd):$PATH + dtrace "GIT_ROOT=$GIT_ROOT" + dassert_var_defined "GIT_ROOT" export PATH=$GIT_ROOT_DIR:$PATH + # Avoid ./.mypy_cache/3.12/app/dev_scripts_helpers + DEV_SCRIPT_HELPER_DIR=$(find ${GIT_ROOT} -name dev_scripts_helpers -type d -not -path "*.mypy_cache*") + echo "DEV_SCRIPT_HELPER_DIR=$DEV_SCRIPT_HELPER_DIR" + dassert_dir_exists $DEV_SCRIPT_HELPER_DIR + dtrace "DEV_SCRIPT_HELPER_DIR=$DEV_SCRIPT_HELPER_DIR" # Add to the PATH all the first level directory under `dev_scripts`. - export PATH_TMP="$(find $dev_script_dir -maxdepth 1 -type d -not -path "$(pwd)" | tr '\n' ':' | sed 's/:$//')" + export PATH_TMP="$(find $DEV_SCRIPT_HELPER_DIR -maxdepth 1 -type d -not -path "$(pwd)" | tr '\n' ':' | sed 's/:$//')" + dtrace "PATH_TMP=$PATH_TMP" export PATH=$PATH_TMP:$PATH # Remove duplicates. export PATH=$(remove_dups $PATH) @@ -213,6 +232,7 @@ set_path() { } + set_pythonpath() { local helpers_root_dir="$1" echo "# set_pythonpath()" diff --git a/docs/tools/all.import_check.reference.md b/docs/tools/all.import_check.reference.md index 8ed8651f8..bb9c5530e 100644 --- a/docs/tools/all.import_check.reference.md +++ b/docs/tools/all.import_check.reference.md @@ -1,20 +1,30 @@ - - * [show_imports](#show_imports) - * [Basic usage](#basic-usage) - * [Visualize dependencies at a directory level](#visualize-dependencies-at-a-directory-level) - * [Visualize external dependencies](#visualize-external-dependencies) - * [Visualize level X dependencies](#visualize-level-x-dependencies) - * [Visualize cyclic dependencies](#visualize-cyclic-dependencies) - * [Pydeps-dependent limitations](#pydeps-dependent-limitations) - * [NotModuleError](#notmoduleerror) - * [Modules above the target directory](#modules-above-the-target-directory) - * [Run the tool on our codebase -- pre-docker procedure](#run-the-tool-on-our-codebase----pre-docker-procedure) - * [detect_import_cycles](#detect_import_cycles) - * [Basic usage](#basic-usage-1) - - - - + + +- [show_imports](#show_imports) + * [Basic usage](#basic-usage) + * [Visualize dependencies at a directory level](#visualize-dependencies-at-a-directory-level) + * [Visualize external dependencies](#visualize-external-dependencies) + * [Visualize level X dependencies](#visualize-level-x-dependencies) + * [Visualize cyclic dependencies](#visualize-cyclic-dependencies) + * [Pydeps-dependent limitations](#pydeps-dependent-limitations) + + [NotModuleError](#notmoduleerror) + + [Modules above the target directory](#modules-above-the-target-directory) + * [Run the tool on our codebase -- pre-docker procedure](#run-the-tool-on-our-codebase----pre-docker-procedure) +- [detect_import_cycles](#detect_import_cycles) + * [Basic usage](#basic-usage-1) +- [show_deps](#show_deps) + * [Overview](#overview) + * [Command usage](#command-usage) + * [Examples](#examples) + + [Generate a text report](#generate-a-text-report) + + [Generate a DOT file for visualization](#generate-a-dot-file-for-visualization) + + [Limit analysis to a specific directory depth](#limit-analysis-to-a-specific-directory-depth) + + [Focus on cyclic dependencies](#focus-on-cyclic-dependencies) + * [Options](#options) + * [Limitations](#limitations) + * [Running the tool on our codebase](#running-the-tool-on-our-codebase) + + # show_imports @@ -23,7 +33,7 @@ A tool for visualizing dependencies among files and packages. ## Basic usage ```bash ->./show_imports.py [flags] +> show_imports.py [flags] ``` The script will produce by default an output `.png` file named @@ -61,7 +71,7 @@ example Basic usage example: ```bash ->./show_imports.py --out_filename example/output/basic.png example/input +> show_imports.py --out_filename example/output/basic.png example/input ``` Will produce the following output: @@ -75,7 +85,7 @@ To visualize dependencies at a directory level, specify `--dir` option. Example: ```bash ->./show_imports.py --dir --out_filename example/output/directory_deps.png example/input +> show_imports.py --dir --out_filename example/output/directory_deps.png example/input ``` Output: @@ -90,7 +100,7 @@ specifying the `--ext` option. Example: ```bash ->./show_imports.py --ext --out_filename example/output/external_deps.png example/input +> show_imports.py --ext --out_filename example/output/external_deps.png example/input ``` Output: @@ -105,7 +115,7 @@ can set the `--max_level` option. Example: ```bash ->./show_imports.py --max_level 2 --out_filename example/output/max_level_deps.png example/input +> show_imports.py --max_level 2 --out_filename example/output/max_level_deps.png example/input ``` Output: @@ -120,7 +130,7 @@ When you want to visualize cyclic dependencies only, you can set the Example: ```bash ->./show_imports.py --show_cycles --out_filename example/output/cyclic_deps.png example/input +> show_imports.py --show_cycles --out_filename example/output/cyclic_deps.png example/input ``` Output: @@ -139,7 +149,7 @@ its limitations: considered (e.g., if a module is missing or not installed, it will not be included regardless of whether it is being imported) - All the imports inside submodules should be absolute -- There are certain requirements related to the presence of _modules_ in and +- There are certain requirements related to the presence of modules in and above the target directory, which are described in detail below - Here, a module is a directory that contains an `__init__.py` file @@ -304,7 +314,7 @@ A tool for detecting circular dependencies among files and packages. ## Basic usage ```bash ->./detect_import_cycles.py +> detect_import_cycles.py ``` The script will either exit with an error, logging the groups of files with @@ -318,10 +328,218 @@ For the `import_check/example/input` directory, the script will produce the following output, detecting two import cycles: ```bash ->./detect_import_cycles.py example/input +> detect_import_cycles.py example/input ``` ```bash ERROR detect_import_cycles.py _main:73 Cyclic imports detected: (input.subdir2.subdir3.file1, input.subdir2.subdir3.file2) ERROR detect_import_cycles.py _main:73 Cyclic imports detected: (input.subdir4.file1, input.subdir4.file2, input.subdir4.file3) ``` + +# show_deps + +## Overview + +- Analyzes Python files in a directory for intra-directory import dependencies +- Generates: + - A text report, or + - A DOT file for visualization +- Useful for understanding module relationships within a project +- Supports options to: + - Limit analysis depth + - Focus on cyclic dependencies + +## Command usage + +```bash +> i show_deps [--directory ] [--format ] [--output_file ] [--max_level ] [--show_cycles] +``` + +- **Default behavior**: Produces a text report, printed to stdout. +- **Options**: + - `--directory`: Specifies the directory to analyze (default: current + directory). + - `--format`: Sets the output format (`text` or `dot`, default: `text`). + - `--output_file`: Saves the report to a file (default: stdout for text, + `dependency_graph.dot` for DOT). + - `--max_level`: Limits the directory depth for analysis (e.g., `2` for two + levels). + - `--show_cycles`: Filters the report to show only cyclic dependencies + (default: false). + +## Examples + +The examples below analyze the `helpers` directory, which contains +subdirectories like `notebooks/`. + +### Generate a text report + +Create a text report of all intra-directory dependencies: + +```bash +> i show_deps --directory helpers --format text > report.txt +``` + +Output in `report.txt`: + +```text +helpers/notebooks/gallery_parquet.py imports helpers/hdbg.py, helpers/hio.py +helpers/hdbg.py has no dependencies +helpers/hio.py has no dependencies +... +``` + +### Generate a DOT file for visualization + +Create a DOT file for visualization: + +```bash +> i show_deps --directory helpers --format dot --output_file dependency_graph.dot +>dot -Tsvg dependency_graph.dot -o dependency_graph.svg +>open dependency_graph.svg +``` + +For large graphs, use the `neato` layout engine: + +```bash +>neato -Tsvg dependency_graph.dot -o dependency_graph.svg -Goverlap=scale -Gsep=+0.5 -Gepsilon=0.01 +>open dependency_graph.svg +``` + +### Limit analysis to a specific directory depth + +Restrict analysis to a certain depth with `--max_level` (e.g., `--max_level 2` +includes `helpers/notebooks/`, excludes deeper subdirectories): + +```bash +> i show_deps --directory helpers --format text --max_level 2 > report_max_level.txt +``` + +Output in `report_max_level.txt`: + +```text +helpers/notebooks/gallery_parquet.py imports helpers/hdbg.py, helpers/hio.py +helpers/hdbg.py has no dependencies +helpers/hio.py has no dependencies +... +``` + +Visualize the limited graph: + +```bash +> i show_deps --directory helpers --format dot --output_file dependency_graph.dot --max_level 2 +>neato -Tsvg dependency_graph.dot -o dependency_graph.svg -Goverlap=scale -Gsep=+0.5 -Gepsilon=0.01 +>open dependency_graph.svg +``` + +### Focus on cyclic dependencies + +Show only cyclic dependencies with `--show_cycles`: + +```bash +> i show_deps --directory helpers --format text --show_cycles > report_cycles.txt +``` + +Output in `report_cycles.txt` (if cycles exist): + +```text +helpers/module_d.py imports helpers/module_e.py +helpers/module_e.py imports helpers/module_d.py +... +``` + +Visualize the cyclic dependencies: + +```bash +> i show_deps --directory helpers --format dot --output_file dependency_graph.dot --show_cycles +> neato -Tsvg dependency_graph.dot -o dependency_graph.svg -Goverlap=scale -Gsep=+0.5 -Gepsilon=0.01 +> open dependency_graph.svg +``` + +## Options + +- `--directory `: Directory to analyze (default: `.`). +- `--format `: Output format (default: `text`). +- `--output_file `: File to save the report (default: stdout for text, + `dependency_graph.dot` for DOT). +- `--max_level `: Maximum directory depth to analyze (e.g., `2`). +- `--show_cycles`: Show only cyclic dependencies (e.g., `module_d` importing + `module_e` and vice versa). + +## Limitations + +- Analyzes only intra-directory imports; external imports (e.g., `numpy`) are + ignored. +- Imports must resolve within the directory (e.g., `helpers.hdbg` to + [`/helpers/hdbg.py)`](/helpers/hdbg.py)). +- Directories with Python files must be modules (contain `__init__.py`), or a + `NotModuleError` is raised. + +Example of a valid structure: + +```text +helpers +├── __init__.py +├── notebooks +│ ├── gallery_parquet.py +│ └── __init__.py +└── hdbg.py +``` + +Example causing `NotModuleError`: + +```text +helpers +├── __init__.py +├── notebooks +│ └── gallery_parquet.py +└── hdbg.py +``` + +```bash +NotModuleError: The following dirs have to be modules (add `__init__.py`): ['helpers/notebooks'] +``` + +## Running the tool on our codebase + +1. **Activate the `helpers` environment**: From the `helpers` root directory: + + ```bash + > poetry shell; export PYTHONPATH=$PYTHONPATH:$(pwd) + ``` + +2. **Generate a dependency report**: Create a text report: + + ```bash + > i show_deps --directory helpers --format text > report.txt + ``` + + Or create a DOT file for visualization: + + ```bash + > i show_deps --directory helpers --format dot --output_file dependency_graph.dot + ``` + +3. **Visualize the graph** (optional): Convert the DOT file to SVG and view: + ```bash + > neato -Tsvg dependency_graph.dot -o dependency_graph.svg -Goverlap=scale -Gsep=+0.5 -Gepsilon=0.01 + > open dependency_graph.svg + ``` + +**Troubleshooting**: If `invoke` fails (e.g., +`No idea what '--output_file' is!`), use the fallback script: + +```bash +> generate_deps.py +> neato -Tsvg dependency_graph.dot -o dependency_graph.svg -Goverlap=scale -Gsep=+0.5 -Gepsilon=0.01 +> open dependency_graph.svg +``` + +**Tips**: The `generate_deps.py` script applies customizations like filtering +nodes with no dependencies and shortening labels (e.g., removing `helpers/` +prefix). Adjust Graphviz attributes (`ranksep=2.0`, `nodesep=1.0`, +`splines=spline`, `overlap=false`, `fontsize=10`) for better visualization. + +**Last review**: +- 2025-05-14, GP Saggese +- 2025-05-01, Ehaab Basil diff --git a/generate_deps.py b/generate_deps.py new file mode 100644 index 000000000..d0dc629ff --- /dev/null +++ b/generate_deps.py @@ -0,0 +1,41 @@ +import networkx as nx + +from import_check.dependency_graph import DependencyGraph + +# Generate dependency graph for the helpers directory +graph = DependencyGraph("helpers") +graph.build_graph() + +# Print the text report of dependencies - uncomment if needed to see the text report of dependencies (alt to i show deps) +# print("Text Report:") +# print(graph.get_text_report()) + +# Convert to a NetworkX graph for customization +nx_graph = graph.graph + +# Filter out nodes with no dependencies (in-degree and out-degree both 0) +nodes_to_remove = [ + node + for node in nx_graph.nodes + if nx_graph.in_degree(node) == 0 and nx_graph.out_degree(node) == 0 +] +nx_graph.remove_nodes_from(nodes_to_remove) +print(f"Removed {len(nodes_to_remove)} nodes with no dependencies") + +# Shorten node labels by removing the "helpers/" prefix +for node in nx_graph.nodes: + new_label = node.replace("helpers/", "") + nx_graph.nodes[node]["label"] = new_label + +# Add Graphviz attributes for better layout +nx_graph.graph["graph"] = { + "ranksep": "2.0", # Increase spacing between ranks + "nodesep": "1.0", # Increase spacing between nodes + "splines": "spline", # Use smooth curves for edges + "overlap": "false", # Avoid overlapping nodes + "fontsize": "10", # Smaller font size for labels +} + +# Write the DOT file with the customized graph +nx.drawing.nx_pydot.write_dot(nx_graph, "dependency_graph.dot") +print("Dependency graph written to dependency_graph.dot") diff --git a/helpers/lib_tasks_lint.py b/helpers/lib_tasks_lint.py index 153700b9e..f4f330015 100644 --- a/helpers/lib_tasks_lint.py +++ b/helpers/lib_tasks_lint.py @@ -176,6 +176,69 @@ def lint_detect_cycles( # type: ignore hlitauti.run(ctx, cmd) +# TODO(ehaabbasil): c -> ctx +# TODO(ehaabbasil): Use REST for docstrings. +# TODO(ehaabbasil): Make this invoke target like the others. +# TODO + +@task +def lint_show_deps( + ctx, + dir_name=".", + stage="prod", + version="", + out_file_name="dependency_graph.dot", + max_level=-1, + show_cycles=False, + log_level="INFO", +): + """ + Generate a dependency report for a specified directory. + + :param dir_name: The name of dir to generate dependecy report for. + :param out_file_name: Path to the output DOT file. + :param max_level: Maximum directory depth to analyze (-1 for no limit) + """ + hlitauti.report_task() + cmd = f"python import_check/show_deps.py {dir_name} --out_file {out_file_name} --max_level {max_level}" + if show_cycles: + cmd += " --show_cycles" + cmd += f" -v {log_level}" + docker_cmd = _get_lint_docker_cmd("", cmd, stage=stage, version=version) + hlitauti.run(ctx, docker_cmd) + +@task +def lint_generate_deps( + ctx, + dir_name=".", + stage="prod", + version="", + out_file_name="dependency_graph.dot", + max_level=-1, + show_cycles=False, + log_level="INFO", +): + """ + Generate a dependency graph for intra-directory imports. + + For param descriptions, see `lint()`. + + :param ctx: Invoke context + :param dir_name: Path to the directory to analyze + :param out_file_name: Path to the output DOT file + :param max_level: Maximum directory depth to analyze (-1 for no limit) + :param show_cycles: Show only cyclic dependencies + :param log_level: Logging verbosity level (e.g., DEBUG, INFO) + """ + hlitauti.report_task() + cmd = f"python import_check/generate_deps.py {dir_name} --out_file {out_file_name} --max_level {max_level}" + if show_cycles: + cmd += " --show_cycles" + cmd += f" --log_level {log_level}" + docker_cmd = _get_lint_docker_cmd("", cmd, stage=stage, version=version) + hlitauti.run(ctx, docker_cmd) + + # pylint: disable=line-too-long @task def lint( # type: ignore diff --git a/import_check/dependency_graph.py b/import_check/dependency_graph.py new file mode 100644 index 000000000..53bd346d2 --- /dev/null +++ b/import_check/dependency_graph.py @@ -0,0 +1,321 @@ +""" +Build graph for dependencies. + +Examples: +# Show file dependencies. +> show_deps.py + +# Show directory-level dependencies. +> show_deps.py --max_level 2 + +# Detect cyclic dependencies. +> show_deps.py --show_cycles +""" + +import ast +import logging +import pathlib as path +from typing import Optional + +import networkx as nx + +import helpers.hio as hio +import helpers.hprint as hprint + +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# DependencyGraph +# ############################################################################# + + +class DependencyGraph: + """ + Generate a dependency graph for intra-directory imports. + + :param directory: Path to the directory to analyze. + :param max_level: Max directory depth to analyze (-1 for no limit). + :param show_cycles: If True, include only cyclic dependencies in the + graph. + """ + + def __init__( + self, + directory: str, + *, + max_level: Optional[ + int + ] = -1, # TODO: Use -1 instead of None to simplify typing. + show_cycles: bool = False, + ) -> None: + """ + Initialize the DependencyGraph with directory and analysis parameters. + """ + # Following caused ValueError: Unable to determine caller function. + # _LOG.debug(hprint.func_signature_to_str()) + # Initialize directory path + print(f"Type of Path: {type(path.Path)}") + self.directory = path.Path(directory).resolve() + # Create a directed graph for dependencies. + self.graph: nx.DiGraph = nx.DiGraph() + # Set maximum directory depth. + self.max_level = max_level if max_level is not None else -1 # Handle None + # Configure cyclic dependency filtering. + self.show_cycles = show_cycles + + def build_graph(self, abort_on_error: bool = False) -> None: + """ + Build a directed graph of intra-directory dependencies. + + :param abort_on_error: If True, raise SyntaxError on parsing + failures; if False, skip invalid files. + """ + # _LOG.debug(hprint.func_signature_to_str()) + # Prepare directory analysis. + _LOG.info("Building dependency graph for %s", self.directory) + base_depth = len(self.directory.parts) + _LOG.debug(hprint.to_str("base_depth")) + # Collect Python files within `max_level` depth. + py_files = [ + path + for path in self.directory.rglob("*.py") + if self.max_level == -1 + or (len(path.parent.parts) - base_depth) <= self.max_level + ] + _LOG.info("Found Python files: %s", py_files) + _LOG.debug(hprint.to_str("py_files")) + # Process each Python file to build the dependency graph. + for py_file in py_files: + relative_path = py_file.relative_to(self.directory.parent).as_posix() + _LOG.info( + "Processing file %s, relative path: %s", py_file, relative_path + ) + _LOG.debug(hprint.to_str("relative_path")) + self.graph.add_node(relative_path) + # Parse the file as an Abstract Syntax Tree (AST). + try: + file_content = hio.from_file(str(py_file)) + tree = ast.parse(file_content, filename=str(py_file)) + except SyntaxError as e: + if abort_on_error: + _LOG.error("Syntax error in %s: %s", py_file, e) + raise e + _LOG.warning("Skipping %s due to syntax error: %s", py_file, e) + continue + # Extract imports from AST. + for node in ast.walk(tree): + if isinstance(node, (ast.Import, ast.ImportFrom)): + imports = ( + [name.name for name in node.names] + if isinstance(node, ast.Import) + else [node.module] if node.module is not None else [] + ) + _LOG.debug(hprint.to_str("imports")) + for imp in imports: + if imp is None: + _LOG.warning( + "Skipping None import in file %s", py_file + ) + continue + _LOG.info("Found import: %s", imp) + _LOG.debug(hprint.to_str("imp")) + imp_path = self._resolve_import(imp, py_file) + if imp_path: + _LOG.info( + "Adding edge: %s -> %s", relative_path, imp_path + ) + self.graph.add_edge(relative_path, imp_path) + _LOG.debug(hprint.to_str("self.graph")) + else: + _LOG.info("No edge added for import %s", imp) + # Filter for cyclic dependencies if enabled. + if self.show_cycles: + self._filter_cycles() + + def get_text_report(self) -> str: + """ + Generate a text report listing each module's dependencies. + + :return: Text report of dependencies, one per line. + """ + # _LOG.debug(hprint.func_signature_to_str()) + # Initialize report. + report = [] + # Generate report lines for each node. + for node in self.graph.nodes: + _LOG.debug(hprint.to_str("node")) + dependencies = list(self.graph.successors(node)) + _LOG.debug(hprint.to_str("dependencies")) + # Create report line based on dependencies. + # TODO: Let's use a if-then-else for clarity. + if dependencies: + line = f"{node} imports {', '.join(dependencies)}" + else: + line = f"{node} has no dependencies" + _LOG.debug(hprint.to_str("line")) + report.append(line) + return "\n".join(report) + + def get_dot_file(self, output_file: str) -> None: + """ + Write the dependency graph to a DOT file. + + :param output_file: Path to the output DOT file. + """ + # _LOG.debug(hprint.func_signature_to_str()) + # Write the graph to a DOT file. + nx.drawing.nx_pydot.write_dot(self.graph, output_file) + _LOG.info("DOT file written to %s", output_file) + + def _filter_cycles(self) -> None: + """ + Filter the graph to show only nodes and edges in cyclic dependencies. + """ + # _LOG.debug(hprint.func_signature_to_str()) + # Find strongly connected components. + cycles = list(nx.strongly_connected_components(self.graph)) + # Accumulate cyclic nodes. + _LOG.debug(hprint.to_str("cycles")) + cyclic_nodes = set() + for component in cycles: + if len(component) > 1: + cyclic_nodes.update(component) + _LOG.debug(hprint.to_str("cyclic_nodes")) + # Create a new graph with cyclic nodes and edges. + new_graph = nx.DiGraph() + for node in cyclic_nodes: + new_graph.add_node(node) + for u, v in self.graph.edges(): + if u in cyclic_nodes and v in cyclic_nodes: + new_graph.add_edge(u, v) + # Update the graph to include only cyclic dependencies. + self.graph = new_graph + _LOG.info( + "Graph filtered to %d nodes and %d edges in cycles", + len(self.graph.nodes), + len(self.graph.edges), + ) + + # TODO: -> Optional[str] + def _resolve_import(self, imp: str, py_file: path.Path) -> Optional[str]: + """ + Resolve an import to a file path within the directory. + + :param imp: Import statement (e.g., `module.submodule`). + :param py_file: File path where the import is found. + :return: Relative path to the resolved file, or None if unresolved. + """ + # _LOG.debug(hprint.func_signature_to_str()) + _LOG.info("Resolving import '%s' for file %s", imp, py_file) + _LOG.debug(hprint.to_str("imp, py_file")) + # Initialize base directory. + base_dir = self.directory + _LOG.info("Base directory: %s", base_dir) + parts = imp.split(".") + _LOG.debug(hprint.to_str("parts")) + current_dir = base_dir + # Check: direct match for directory package + dir_name_parts = base_dir.name.split(".") + if parts == dir_name_parts: + init_path = base_dir / "__init__.py" + _LOG.info("Checking base directory __init__ at %s", init_path) + if init_path.exists(): + resolved_path = init_path.relative_to(base_dir.parent).as_posix() + _LOG.info("Resolved directory self-import to: %s", resolved_path) + return resolved_path + _LOG.error("Base directory __init__.py missing at %s", init_path) + return None + # Handle directory name imports. + # Current directory. + dir_name = self.directory.name + # Parent directory. + parent_name = self.directory.parent.name + # Grandparent directory, if exists. + parent_parent_name = ( + self.directory.parent.parent.name + if len(self.directory.parent.parts) > 1 + else "" + ) + # Collect all parent directory names into a list for validation. + valid_names = [dir_name, parent_name, parent_parent_name] + parent = self.directory.parent + while len(parent.parts) > 2: # Stop near root + valid_names.append(parent.parent.name) + parent = parent.parent + _LOG.info( + "Directory name: %s, Parent name: %s, Parent parent name: %s, " + "Valid names: %s, Import first part: %s", + dir_name, + parent_name, + parent_parent_name, + valid_names, + parts[0] if parts else "", + ) + result = None + if parts and parts[0] in valid_names: + parts = parts[1:] + _LOG.debug(hprint.to_str("parts after dir_name handling")) + if not parts: + # Only if the `dir` name is given (e.g., "helpers"), check for + # `__init__.py`. + init_path = base_dir / "__init__.py" + _LOG.info( + "Checking __init__.py at %s, exists: %s", + init_path, + init_path.exists(), + ) + if init_path.exists(): + resolved_path = init_path.relative_to( + self.directory.parent + ).as_posix() + _LOG.info("Resolved to: %s", resolved_path) + result = resolved_path + else: + _LOG.error("No __init__.py found at %s", init_path) + else: + for i, module_name in enumerate(parts): + _LOG.debug(hprint.to_str("i, module_name")) + package_path = current_dir / module_name / "__init__.py" + _LOG.info("Checking package path: %s", package_path) + _LOG.debug(hprint.to_str("package_path")) + if package_path.exists(): + if i == len(parts) - 1: + resolved_path = package_path.relative_to( + self.directory.parent + ).as_posix() + _LOG.info("Resolved to: %s", resolved_path) + result = resolved_path + break + current_dir = current_dir / module_name + _LOG.debug(hprint.to_str("current_dir")) + continue + # Check for a .py file. + module_path = current_dir / f"{module_name}.py" + _LOG.info("Checking module path: %s", module_path) + _LOG.debug(hprint.to_str("module_path")) + if module_path.exists(): + # If last part, return the `.py` path. + if i == len(parts) - 1: + resolved_path = module_path.relative_to( + self.directory.parent + ).as_posix() + _LOG.info("Resolved to: %s", resolved_path) + result = resolved_path + break + # If not last part, but is a module, it can't lead further. + _LOG.info( + "Could not resolve full import '%s' beyond %s", + imp, + module_path, + ) + break + # If neither exists, the import cannot be resolved. + _LOG.info( + "Could not resolve import '%s' at part '%s'", imp, module_name + ) + break + # Return None if module resolution was unsuccessful. + if result is None: + _LOG.info("Could not resolve import '%s'", imp) + return result diff --git a/import_check/generate_deps.py b/import_check/generate_deps.py new file mode 100644 index 000000000..0c99da71c --- /dev/null +++ b/import_check/generate_deps.py @@ -0,0 +1,40 @@ +import networkx as nx +from import_check.dependency_graph import DependencyGraph + +# Generate dependency graph for the helpers directory +graph = DependencyGraph("helpers") +graph.build_graph() + +# Print the text report of dependencies - uncomment if needed to see the text report of dependencies (alt to i show deps) +# print("Text Report:") +# print(graph.get_text_report()) + +# Convert to a NetworkX graph for customization +nx_graph = graph.graph + +# Filter out nodes with no dependencies (in-degree and out-degree both 0) +nodes_to_remove = [ + node + for node in nx_graph.nodes + if nx_graph.in_degree(node) == 0 and nx_graph.out_degree(node) == 0 +] +nx_graph.remove_nodes_from(nodes_to_remove) +print(f"Removed {len(nodes_to_remove)} nodes with no dependencies") + +# Shorten node labels by removing the "helpers/" prefix +for node in nx_graph.nodes: + new_label = node.replace("helpers/", "") + nx_graph.nodes[node]["label"] = new_label + +# Add Graphviz attributes for better layout +nx_graph.graph["graph"] = { + "ranksep": "2.0", # Increase spacing between ranks + "nodesep": "1.0", # Increase spacing between nodes + "splines": "spline", # Use smooth curves for edges + "overlap": "false", # Avoid overlapping nodes + "fontsize": "10", # Smaller font size for labels +} + +# Write the DOT file with the customized graph +nx.drawing.nx_pydot.write_dot(nx_graph, "dependency_graph.dot") +print("Dependency graph written to dependency_graph.dot") diff --git a/import_check/show_deps.py b/import_check/show_deps.py new file mode 100644 index 000000000..1be7024f2 --- /dev/null +++ b/import_check/show_deps.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python + +""" +Generate a dependency report for a specified directory. + +E.g., +# Generate report for current directory +> show_deps.py . + +# Generate report with max depth 2 +> show_deps.py --max_level 2 . + +# Show only cyclic dependencies +> show_deps.py --show_cycles . +""" + +import argparse +import logging +import sys + +import helpers.hdbg as hdbg +import helpers.hparser as hparser +try: + from import_check.dependency_graph import DependencyGraph +except ImportError as e: + logging.error("Failed to import DependencyGraph: %s", str(e)) + logging.error("Ensure you are running as a module (e.g., python -m import_check.show_deps) or check package structure") + sys.exit(1) + +_LOG = logging.getLogger(__name__) + +# ############################################################################# + +def _parse() -> argparse.ArgumentParser: + """ + Parse command-line arguments. + + :return: ArgumentParser object configured with command-line options. + """ + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "directory", + type=str, + help="Path to the directory to analyze" + ) + parser.add_argument( + "--out_file", + type=str, + default="dependency_graph.dot", + help="Path to the output DOT file" + ) + parser.add_argument( + "--max_level", + type=int, + default=-1, + help="Maximum directory depth to analyze (-1 for no limit)" + ) + parser.add_argument( + "--show_cycles", + action="store_true", + help="Show only cyclic dependencies" + ) + hparser.add_verbosity_arg(parser) + return parser + +# ############################################################################# + +def _main(parser: argparse.ArgumentParser) -> None: + """ + Main function to generate dependency report. + + :param parser: ArgumentParser object to parse command-line arguments. + """ + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + hdbg.dassert_dir_exists(args.directory, f"{args.directory} is not a valid directory") + _LOG.info("Starting dependency analysis for %s", args.directory) + try: + graph = DependencyGraph(args.directory, max_level=args.max_level, show_cycles=args.show_cycles) + graph.build_graph() + if not graph.graph.nodes: + _LOG.info("No Python files found or no dependencies to report in %s", args.directory) + else: + report = graph.get_text_report() + print(report) + graph.get_dot_file(args.out_file) + _LOG.info("DOT file written to %s", args.out_file) + except Exception as e: + _LOG.error("Failed to generate dependency report: %s", str(e)) + sys.exit(1) + +# ############################################################################# + +if __name__ == "__main__": + _main(_parse()) \ No newline at end of file diff --git a/import_check/test/outcomes/TestDependencyGraph.test_dot_output/output/test.txt b/import_check/test/outcomes/TestDependencyGraph.test_dot_output/output/test.txt new file mode 100644 index 000000000..b059464eb --- /dev/null +++ b/import_check/test/outcomes/TestDependencyGraph.test_dot_output/output/test.txt @@ -0,0 +1,11 @@ +strict digraph { +"tmp.scratch/module_e.py"; +"tmp.scratch/module_d.py"; +"tmp.scratch/module_a.py"; +"tmp.scratch/module_c.py"; +"tmp.scratch/module_b.py"; +"tmp.scratch/module_e.py" -> "tmp.scratch/module_d.py"; +"tmp.scratch/module_d.py" -> "tmp.scratch/module_e.py"; +"tmp.scratch/module_c.py" -> "tmp.scratch/module_b.py"; +"tmp.scratch/module_b.py" -> "tmp.scratch/module_a.py"; +} diff --git a/import_check/test/outcomes/Test_show_imports.test1/output/output.txt b/import_check/test/outcomes/Test_show_imports.test1/output/output.txt index 0977b1d2a..8c2bcfc54 100644 --- a/import_check/test/outcomes/Test_show_imports.test1/output/output.txt +++ b/import_check/test/outcomes/Test_show_imports.test1/output/output.txt @@ -19,7 +19,7 @@ "input.file2" ], "imports": null, - "path": "/app/helpers_root/import_check/test/outcomes/Test_show_imports.test1/input/__init__.py", + "path": "/app/import_check/test/outcomes/Test_show_imports.test1/input/__init__.py", "truncated": false, "is_external": false, "is_file": false @@ -31,7 +31,7 @@ "input.file2" ], "imports": null, - "path": "/app/helpers_root/import_check/test/outcomes/Test_show_imports.test1/input/file1.py", + "path": "/app/import_check/test/outcomes/Test_show_imports.test1/input/file1.py", "truncated": false, "is_external": false, "is_file": true @@ -45,7 +45,7 @@ "input", "input.file1" ], - "path": "/app/helpers_root/import_check/test/outcomes/Test_show_imports.test1/input/file2.py", + "path": "/app/import_check/test/outcomes/Test_show_imports.test1/input/file2.py", "truncated": false, "is_external": false, "is_file": true diff --git a/import_check/test/test_dependency_graph.py b/import_check/test/test_dependency_graph.py new file mode 100644 index 000000000..c77f6e413 --- /dev/null +++ b/import_check/test/test_dependency_graph.py @@ -0,0 +1,246 @@ +import pathlib as path # Updated import style + +import helpers.hio as hio +import helpers.hunit_test as hunitest +import import_check.dependency_graph as ichdegra + + +# ############################################################################# +# TestDependencyGraph +# ############################################################################# + + +# TODO: class TestDependencyGraph(hunitest.TestCase): +class TestDependencyGraph(hunitest.TestCase): + # TODO: use self.get_scratch_dir() and make this a function that is called + def get_test_dir(self) -> path.Path: + """ + Create a temporary directory with test files. + + :return: Path to the temporary directory. + """ + # Prepare directory. + dir_path = path.Path(self.get_scratch_space()) + # Create test files. + hio.create_dir(dir_path, incremental=True) + hio.to_file(str(dir_path / "module_a.py"), "# No imports\n") + hio.to_file(str(dir_path / "module_b.py"), "import module_a\n") + hio.to_file(str(dir_path / "module_c.py"), "import module_b\n") + hio.to_file(str(dir_path / "module_d.py"), "import module_e\n") + hio.to_file(str(dir_path / "module_e.py"), "import module_d\n") + return dir_path + + def test_no_dependencies(self) -> None: + """ + Verify a module with no imports has no dependencies. + """ + # Prepare inputs + test_dir = self.get_test_dir() + graph = ichdegra.DependencyGraph(str(test_dir)) + graph.build_graph() + # Run. + report = graph.get_text_report() + # TODO: Use self.assert_in + # Check. + self.assertIn("tmp.scratch/module_a.py has no dependencies", report) + + def test_multiple_dependencies(self) -> None: + """ + Verify modules with chained dependencies are reported correctly. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + graph = ichdegra.DependencyGraph(str(test_dir)) + # Run. + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn( + "tmp.scratch/module_c.py imports tmp.scratch/module_b.py", report + ) + self.assertIn( + "tmp.scratch/module_b.py imports tmp.scratch/module_a.py", report + ) + + def test_circular_dependencies(self) -> None: + """ + Verify cyclic dependencies are identified correctly. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + graph = ichdegra.DependencyGraph(str(test_dir)) + # Run. + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn( + "tmp.scratch/module_d.py imports tmp.scratch/module_e.py", report + ) + self.assertIn( + "tmp.scratch/module_e.py imports tmp.scratch/module_d.py", report + ) + + def test_dot_output(self) -> None: + """ + Verify the DOT file is generated with correct format. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + graph = ichdegra.DependencyGraph(str(test_dir)) + graph.build_graph() + # Run. + scratch_dir = path.Path(self.get_scratch_space()) + output_file = scratch_dir / "dependency_graph.dot" + graph.get_dot_file(str(output_file)) + # TODO: use self.check_string + # Check. + # Verify the DOT file content matches the expected golden outcome. + content = hio.from_file(str(output_file), encoding="utf-8") + self.check_string(content) + + def test_syntax_error_handling(self) -> None: + """ + Verify syntax errors in files are handled without crashing. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + hio.to_file( + str(test_dir / "module_invalid.py"), + "def invalid_syntax() # Missing colon\n", + ) + graph = ichdegra.DependencyGraph(str(test_dir)) + # Run. + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn("tmp.scratch/module_a.py has no dependencies", report) + + def test_import_directory_only(self) -> None: + """ + Verify importing only the directory name resolves to __init__.py. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + init_path = test_dir / "__init__.py" + hio.to_file(str(init_path), "") + hio.to_file(str(test_dir / "module_f.py"), f"import {test_dir.name}") + graph = ichdegra.DependencyGraph(str(test_dir)) + # Run. + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn( + "tmp.scratch/module_f.py imports tmp.scratch/__init__.py", report + ) + + def test_package_only_import(self) -> None: + """ + Verify importing a package with only __init__.py adds a dependency. + """ + # TODO: use self.get_scratch_space and hio.to_file + # TODO: use hio.create_dir + # TODO: add a descrition of how the dir and files look like + # Prepare inputs. + package_dir = path.Path(self.get_scratch_space()) + subdir = package_dir / "subpackage" + hio.create_dir(subdir, incremental=True) + hio.to_file(str(subdir / "__init__.py"), "") + hio.to_file(str(package_dir / "module_b.py"), "import subpackage") + # Directory structure: + # tmp.scratch/ + # subpackage/ + # __init__.py + # module_b.py + # Run. + graph = ichdegra.DependencyGraph(str(package_dir)) + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn( + "tmp.scratch/module_b.py imports tmp.scratch/subpackage/__init__.py", + report, + ) + + def test_package_import(self) -> None: + """ + Verify nested package imports resolve to __init__.py. + """ + # Prepare inputs. + package_dir = path.Path(self.get_scratch_space()) + subdir = package_dir / "subpackage" + subsubdir = subdir / "subsubpackage" + module_dir = subsubdir / "module_a" + hio.create_dir(subdir, incremental=True) + hio.create_dir(subsubdir, incremental=True) + hio.create_dir(module_dir, incremental=True) + hio.to_file(str(subdir / "__init__.py"), "") + hio.to_file(str(subsubdir / "__init__.py"), "") + hio.to_file(str(module_dir / "__init__.py"), "") + hio.to_file( + str(package_dir / "module_b.py"), + "import subpackage.subsubpackage.module_a", + ) + # Directory structure: + # tmp.scratch/ + # subpackage/ + # __init__.py + # subsubpackage/ + # __init__.py + # module_a/ + # __init__.py + # module_b.py + # Run. + graph = ichdegra.DependencyGraph(str(package_dir)) + graph.build_graph() + # Check. + report = graph.get_text_report() + self.assertIn( + "tmp.scratch/module_b.py imports " + "tmp.scratch/subpackage/subsubpackage/module_a/__init__.py", + report, + ) + + def test_unresolved_nested_import(self) -> None: + """ + Verify unresolved nested imports result in no dependencies. + """ + # Prepare inputs. + package_dir = path.Path(self.get_scratch_space()) + subdir = package_dir / "subpackage" + hio.create_dir(subdir, incremental=True) + hio.to_file(str(subdir / "__init__.py"), "") + hio.to_file( + str(package_dir / "module_b.py"), + "import subpackage.subsubpackage.module_a", + ) + # Directory structure: + # tmp.scratch/ + # subpackage/ + # __init__.py + # module_b.py + # Run. + graph = ichdegra.DependencyGraph(str(package_dir)) + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn("tmp.scratch/module_b.py has no dependencies", report) + + def test_show_cycles_filters_cyclic_dependencies(self) -> None: + """ + Verify show_cycles=True filters the graph to only cyclic dependencies. + """ + # Prepare inputs. + test_dir = self.get_test_dir() + hio.to_file(str(test_dir / "module_f.py"), "# No imports") + graph = ichdegra.DependencyGraph(str(test_dir), show_cycles=True) + # Run. + graph.build_graph() + report = graph.get_text_report() + # Check. + self.assertIn( + "tmp.scratch/module_d.py imports tmp.scratch/module_e.py", report + ) + self.assertIn( + "tmp.scratch/module_e.py imports tmp.scratch/module_d.py", report + ) + self.assertFalse("tmp.scratch/module_f.py" in report) diff --git a/tasks.py b/tasks.py index 72f35fb6c..5372410c7 100644 --- a/tasks.py +++ b/tasks.py @@ -2,6 +2,8 @@ import os from typing import Any +from invoke import task + import helpers.repo_config_utils as hrecouti # Expose the pytest targets. @@ -77,6 +79,9 @@ lint_check_python_files_in_docker, lint_create_branch, lint_detect_cycles, + lint_generate_deps, + lint_show_deps, + lint_sync_code, print_env, print_setup, print_tasks, @@ -133,6 +138,8 @@ ) except ImportError: pass +from import_check.dependency_graph import DependencyGraph + # # TODO(gp): This is due to the coupling between code in linter container and # # the code being linted. # try: @@ -173,7 +180,6 @@ def _run_qa_tests(ctx: Any, stage: str, version: str) -> bool: ctx.run(cmd) return True - default_params = { # TODO(Nikola): Remove prefix after everything is cleaned. # Currently there are a lot dependencies on prefix.