diff --git a/langgraph/.codespellignore b/langgraph/.codespellignore new file mode 100644 index 000000000..e69de29bb diff --git a/langgraph/.env.example b/langgraph/.env.example new file mode 100644 index 000000000..ec66fc0ad --- /dev/null +++ b/langgraph/.env.example @@ -0,0 +1,4 @@ +# To separate your traces from other applications +LANGSMITH_PROJECT=new-agent + +# Add API keys for connecting to LLM providers, data sources, and other integrations here diff --git a/langgraph/.github/workflows/integration-tests.yml b/langgraph/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..259852d82 --- /dev/null +++ b/langgraph/.github/workflows/integration-tests.yml @@ -0,0 +1,42 @@ +# This workflow will run integration tests for the current project once per day + +name: Integration Tests + +on: + schedule: + - cron: "37 14 * * *" # Run at 7:37 AM Pacific Time (14:37 UTC) every day + workflow_dispatch: # Allows triggering the workflow manually in GitHub UI + +# If another scheduled run starts while this workflow is still running, +# cancel the earlier run in favor of the next run. 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + integration-tests: + name: Integration Tests + strategy: + matrix: + os: [ubuntu-latest] + python-version: ["3.11", "3.12"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + uv venv + uv pip install -r pyproject.toml + uv pip install -U pytest-asyncio + - name: Run integration tests + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }} + LANGSMITH_TRACING: true + run: | + uv run pytest tests/integration_tests diff --git a/langgraph/.github/workflows/unit-tests.yml b/langgraph/.github/workflows/unit-tests.yml new file mode 100644 index 000000000..055407c23 --- /dev/null +++ b/langgraph/.github/workflows/unit-tests.yml @@ -0,0 +1,57 @@ +# This workflow will run unit tests for the current project + +name: CI + +on: + push: + branches: ["main"] + pull_request: + workflow_dispatch: # Allows triggering the workflow manually in GitHub UI + +# If another push to the same PR or branch happens while this workflow is still running, +# cancel the earlier run in favor of the next run. 
+concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unit-tests: + name: Unit Tests + strategy: + matrix: + os: [ubuntu-latest] + python-version: ["3.11", "3.12"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + uv venv + uv pip install -r pyproject.toml + - name: Lint with ruff + run: | + uv pip install ruff + uv run ruff check . + - name: Lint with mypy + run: | + uv pip install mypy + uv run mypy --strict src/ + - name: Check README spelling + uses: codespell-project/actions-codespell@v2 + with: + ignore_words_file: .codespellignore + path: README.md + - name: Check code spelling + uses: codespell-project/actions-codespell@v2 + with: + ignore_words_file: .codespellignore + path: src/ + - name: Run tests with pytest + run: | + uv pip install pytest + uv run pytest tests/unit_tests diff --git a/langgraph/.gitignore b/langgraph/.gitignore new file mode 100644 index 000000000..eb170e827 --- /dev/null +++ b/langgraph/.gitignore @@ -0,0 +1,165 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +venv.langgraph/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +uv.lock +.langgraph_api/ diff --git a/langgraph/LICENSE b/langgraph/LICENSE new file mode 100644 index 000000000..57d0481d4 --- /dev/null +++ b/langgraph/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 LangChain + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/langgraph/Makefile b/langgraph/Makefile new file mode 100644 index 000000000..4bfd87862 --- /dev/null +++ b/langgraph/Makefile @@ -0,0 +1,67 @@ +.PHONY: all format lint test tests test_watch integration_tests docker_tests help extended_tests + +# Default target executed when no arguments are given to make. +all: help + +# Define a variable for the test file path. +TEST_FILE ?= tests/unit_tests/ + +test: + python -m pytest $(TEST_FILE) + +integration_tests: + python -m pytest tests/integration_tests + +test_watch: + python -m ptw --snapshot-update --now . -- -vv tests/unit_tests + +test_profile: + python -m pytest -vv tests/unit_tests/ --profile-svg + +extended_tests: + python -m pytest --only-extended $(TEST_FILE) + + +###################### +# LINTING AND FORMATTING +###################### + +# Define a variable for Python and notebook files. +PYTHON_FILES=src/ +MYPY_CACHE=.mypy_cache +lint format: PYTHON_FILES=. +lint_diff format_diff: PYTHON_FILES=$(shell git diff --name-only --diff-filter=d main | grep -E '\.py$$|\.ipynb$$') +lint_package: PYTHON_FILES=src +lint_tests: PYTHON_FILES=tests +lint_tests: MYPY_CACHE=.mypy_cache_test + +lint lint_diff lint_package lint_tests: + python -m ruff check . 
+ [ "$(PYTHON_FILES)" = "" ] || python -m ruff format $(PYTHON_FILES) --diff + [ "$(PYTHON_FILES)" = "" ] || python -m ruff check --select I $(PYTHON_FILES) + [ "$(PYTHON_FILES)" = "" ] || python -m mypy --strict $(PYTHON_FILES) + [ "$(PYTHON_FILES)" = "" ] || mkdir -p $(MYPY_CACHE) && python -m mypy --strict $(PYTHON_FILES) --cache-dir $(MYPY_CACHE) + +format format_diff: + ruff format $(PYTHON_FILES) + ruff check --select I --fix $(PYTHON_FILES) + +spell_check: + codespell --toml pyproject.toml + +spell_fix: + codespell --toml pyproject.toml -w + +###################### +# HELP +###################### + +help: + @echo '----' + @echo 'format - run code formatters' + @echo 'lint - run linters' + @echo 'test - run unit tests' + @echo 'tests - run unit tests' + @echo 'test TEST_FILE= - run all tests in file' + @echo 'test_watch - run unit tests in watch mode' + diff --git a/langgraph/README.md b/langgraph/README.md new file mode 100644 index 000000000..18ebe0a9d --- /dev/null +++ b/langgraph/README.md @@ -0,0 +1,80 @@ +# New LangGraph Project + +[![CI](https://github.com/langchain-ai/new-langgraph-project/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/langchain-ai/new-langgraph-project/actions/workflows/unit-tests.yml) +[![Integration Tests](https://github.com/langchain-ai/new-langgraph-project/actions/workflows/integration-tests.yml/badge.svg)](https://github.com/langchain-ai/new-langgraph-project/actions/workflows/integration-tests.yml) + +This template demonstrates a simple application implemented using [LangGraph](https://github.com/langchain-ai/langgraph), designed for showing how to get started with [LangGraph Server](https://langchain-ai.github.io/langgraph/concepts/langgraph_server/#langgraph-server) and using [LangGraph Studio](https://langchain-ai.github.io/langgraph/concepts/langgraph_studio/), a visual debugging IDE. + +
+ Graph view in LangGraph studio UI +
+ +The core logic defined in `src/agent/graph.py`, showcases an single-step application that responds with a fixed string and the configuration provided. + +You can extend this graph to orchestrate more complex agentic workflows that can be visualized and debugged in LangGraph Studio. + +## Getting Started + + + + + +1. Install dependencies, along with the [LangGraph CLI](https://langchain-ai.github.io/langgraph/concepts/langgraph_cli/), which will be used to run the server. + +```bash +cd path/to/your/app +pip install -e . "langgraph-cli[inmem]" +``` + +2. (Optional) Customize the code and project as needed. Create a `.env` file if you need to use secrets. + +```bash +cp .env.example .env +``` + +If you want to enable LangSmith tracing, add your LangSmith API key to the `.env` file. + +```text +# .env +LANGSMITH_API_KEY=lsv2... +``` + +3. Start the LangGraph Server. + +```shell +langgraph dev +``` + +For more information on getting started with LangGraph Server, [see here](https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/). + +## How to customize + +1. **Define configurable parameters**: Modify the `Configuration` class in the `graph.py` file to expose the arguments you want to configure. For example, in a chatbot application you may want to define a dynamic system prompt or LLM to use. For more information on configurations in LangGraph, [see here](https://langchain-ai.github.io/langgraph/concepts/low_level/?h=configuration#configuration). + +2. **Extend the graph**: The core logic of the application is defined in [graph.py](./src/agent/graph.py). You can modify this file to add new nodes, edges, or change the flow of information. + +## Development + +While iterating on your graph in LangGraph Studio, you can edit past state and rerun your app from previous states to debug specific nodes. Local changes will be automatically applied via hot reload. + +Follow-up requests extend the same thread. 
You can create an entirely new thread, clearing previous history, using the `+` button in the top right. + +For more advanced features and examples, refer to the [LangGraph documentation](https://langchain-ai.github.io/langgraph/). These resources can help you adapt this template for your specific use case and build more sophisticated conversational agents. + +LangGraph Studio also integrates with [LangSmith](https://smith.langchain.com/) for more in-depth tracing and collaboration with teammates, allowing you to analyze and optimize your chatbot's performance. + + diff --git a/langgraph/__init__.py b/langgraph/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/langgraph/langgraph.json b/langgraph/langgraph.json new file mode 100644 index 000000000..9c4966ecc --- /dev/null +++ b/langgraph/langgraph.json @@ -0,0 +1,8 @@ +{ + "dependencies": ["."], + "graphs": { + "agent": "./src/agent/graph.py:graph" + }, + "env": ".env", + "image_distro": "wolfi" +} diff --git a/langgraph/pyproject.toml b/langgraph/pyproject.toml new file mode 100644 index 000000000..a6237d4f9 --- /dev/null +++ b/langgraph/pyproject.toml @@ -0,0 +1,65 @@ +[project] +name = "agent" +version = "0.0.1" +description = "Starter template for making a new agent LangGraph." 
+authors = [ + { name = "William Fu-Hinthorn", email = "13333726+hinthornw@users.noreply.github.com" }, +] +readme = "README.md" +license = { text = "MIT" } +requires-python = ">=3.9" +dependencies = [ + "langgraph>=0.2.6", + "python-dotenv>=1.0.1", +] + + +[project.optional-dependencies] +dev = ["mypy>=1.11.1", "ruff>=0.6.1"] + +[build-system] +requires = ["setuptools>=73.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["langgraph.templates.agent", "agent"] +[tool.setuptools.package-dir] +"langgraph.templates.agent" = "src/agent" +"agent" = "src/agent" + + +[tool.setuptools.package-data] +"*" = ["py.typed"] + +[tool.ruff] +lint.select = [ + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "D", # pydocstyle + "D401", # First line should be in imperative mood + "T201", + "UP", +] +lint.ignore = [ + "UP006", + "UP007", + # We actually do want to import from typing_extensions + "UP035", + # Relax the convention by _not_ requiring documentation for every function parameter. + "D417", + "E501", +] +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["D", "UP"] +[tool.ruff.lint.pydocstyle] +convention = "google" + +[dependency-groups] +dev = [ + "anyio>=4.7.0", + "langgraph-cli[inmem]>=0.2.8", + "mypy>=1.13.0", + "pytest>=8.3.5", + "ruff>=0.8.2", +] diff --git a/langgraph/src/agent/__init__.py b/langgraph/src/agent/__init__.py new file mode 100644 index 000000000..743c2a0fe --- /dev/null +++ b/langgraph/src/agent/__init__.py @@ -0,0 +1,8 @@ +"""New LangGraph Agent. + +This module defines a custom graph. +""" + +from agent.graph import graph + +__all__ = ["graph"] diff --git a/langgraph/src/agent/graph.py b/langgraph/src/agent/graph.py new file mode 100644 index 000000000..69512c1e9 --- /dev/null +++ b/langgraph/src/agent/graph.py @@ -0,0 +1,198 @@ +""" +LangGraph single-node graph template, with OpenAI API integration and .env +support. 
+ +Import as: + +import langgraph.src.agent.graph as lsraggra +""" + +import dataclasses +import os +from typing import Any, Dict, Optional, TypedDict + +import dotenv +import langchain_core.runnables as lcru +import openai + +import langgraph.graph as lgg +import langgraph.src.agent.raw_data_analyzer as lsardaan +import langgraph.src.agent.schema_parser as lsagscpa + +# Load environment variables from .env. +dotenv.load_dotenv() +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY) + + +# ############################################################################# +# Configuration +# ############################################################################# + + +class Configuration(TypedDict): + """ + Configurable parameters for the agent. + """ + + my_configurable_param: str + openai_model: str + + +# ############################################################################# +# State +# ############################################################################# + + +@dataclasses.dataclass +class State: + """ + Input state for the agent. + """ + + file_path: str = "" + raw_data_result: Optional[Dict[str, Any]] = None + schema_content: str = "" + schema_result: Optional[Dict[str, Any]] = None + changeme: str = "example" + + +async def call_model(state: State, config: lcru.RunnableConfig) -> Dict[str, Any]: + """ + Process input and returns output using OpenAI API. 
+ """ + configuration = config["configurable"] + model = configuration.get("openai_model", "gpt-3.5-turbo") + + # Build context from schema parser results + schema_context = "" + if state.schema_result and "error" not in state.schema_result: + schema_context = f""" +Schema Analysis Results: +- Total columns: {state.schema_result.get('total_columns', 0)} +- Required columns: {state.schema_result.get('required_columns', 0)} +- Optional columns: {state.schema_result.get('optional_columns', 0)} +- Schema type: {state.schema_result.get('schema_type', 'unknown')} + +Column Details: +""" + for col in state.schema_result.get("columns", []): + schema_context += ( + f"- {col['name']}: {col['data_type']} " + f"(required: {col['required']}, nullable: {col['nullable']})\n" + ) + if col.get("description"): + schema_context += f" Description: {col['description']}\n" + + # Build context from raw data analysis + raw_data_context = "" + if state.raw_data_result and "error" not in state.raw_data_result: + raw_data_context = f""" +Raw Data Analysis Results: +- File: {state.raw_data_result.get('file_path', 'unknown')} +- Total rows: {state.raw_data_result.get('total_rows', 0)} +- Total columns: {state.raw_data_result.get('total_columns', 0)} +""" + + prompt = f"""You are an AutoEDA (Automated Exploratory Data Analysis) assistant. +Based on the data analysis and schema information below, provide insights and recommendations for exploratory data analysis. + +{raw_data_context} + +{schema_context} + +Configured parameter: {configuration.get('my_configurable_param')} +Additional context: {state.changeme} + +Please provide: +1. Key insights about the dataset structure +2. Recommended exploratory data analysis steps +3. Potential data quality issues to investigate +4. 
Suggested visualizations or analysis techniques +""" + + response = await client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0.7, + max_tokens=512, + ) + + output_text = response.choices[0].message.content + + return {"changeme": output_text} + + +async def analyze_raw_data( + state: State, _config: lcru.RunnableConfig +) -> Dict[str, Any]: + """ + Analyze raw data file and generate schema. + """ + if not state.file_path: + return {"raw_data_result": {"error": "No file path provided"}} + + analyzer = lsardaan.RawDataAnalyzer() + result = analyzer.analyze_file(state.file_path) + + if result.error_message: + return {"raw_data_result": {"error": result.error_message}} + + # Convert result to dict for state + raw_data_result = { + "file_path": result.file_path, + "total_rows": result.total_rows, + "total_columns": result.total_columns, + "suggested_schema": result.suggested_schema, + "analysis_metadata": result.analysis_metadata, + } + + return {"raw_data_result": raw_data_result} + + +async def parse_schema( + state: State, _config: lcru.RunnableConfig +) -> Dict[str, Any]: + """ + Parse schema content and extract column information. 
+ """ + if not state.schema_content: + return {"schema_result": {"error": "No schema content provided"}} + + result = lsagscpa.parse_schema_content(state.schema_content) + + if result.error_message: + return {"schema_result": {"error": result.error_message}} + + # Convert result to dict for state + schema_result = { + "total_columns": result.total_columns, + "required_columns": result.required_columns, + "optional_columns": result.optional_columns, + "schema_type": result.schema_type, + "columns": [ + { + "name": col.name, + "data_type": col.data_type, + "required": col.required, + "description": col.description, + "nullable": col.nullable, + } + for col in result.columns + ], + } + + return {"schema_result": schema_result} + + +# Define the graph +graph = ( + lgg.StateGraph(State, config_schema=Configuration) + .add_node("analyze_raw_data", analyze_raw_data) + .add_node("parse_schema", parse_schema) + .add_node("call_model", call_model) + .add_edge("__start__", "analyze_raw_data") + .add_edge("analyze_raw_data", "parse_schema") + .add_edge("parse_schema", "call_model") + .compile(name="AutoEDA Agent Graph") +) diff --git a/langgraph/src/agent/raw_data_analyzer.py b/langgraph/src/agent/raw_data_analyzer.py new file mode 100644 index 000000000..ccce1cac2 --- /dev/null +++ b/langgraph/src/agent/raw_data_analyzer.py @@ -0,0 +1,631 @@ +#!/usr/bin/env python + +""" +Raw Data Analyzer for AutoEDA Agent. + +Analyzes raw data files (CSV, JSON, Parquet, Excel) and generates schema.json files +for use with the schema parser module. + +Import as: + +import raw_data_analyzer as rdanal +""" + +import argparse +import dataclasses +import json +import logging +import pathlib +import warnings +from typing import Any, Dict, List, Optional + +import pandas as pd + +import helpers.hio as hio + +# Configure logging. 
+logging.basicConfig(level=logging.INFO) +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# ColumnAnalysis +# ############################################################################# + + +@dataclasses.dataclass +class ColumnAnalysis: + """ + Represent analysis results for a single column. + """ + + name: str + data_type: str + nullable: bool + null_count: int + unique_count: int + sample_values: List[Any] + min_value: Optional[Any] = None + max_value: Optional[Any] = None + mean_value: Optional[float] = None + std_value: Optional[float] = None + pattern: Optional[str] = None + format_hint: Optional[str] = None + + +# ############################################################################# +# DataAnalysisResult +# ############################################################################# + + +@dataclasses.dataclass +class DataAnalysisResult: + """ + Represent the complete data analysis result. + """ + + file_path: str + total_rows: int + total_columns: int + columns: List[ColumnAnalysis] + suggested_schema: Dict[str, Any] + analysis_metadata: Dict[str, Any] + error_message: Optional[str] = None + + +# ############################################################################# +# RawDataAnalyzer +# ############################################################################# + + +class RawDataAnalyzer: + """ + Analyze raw data files and generate schema definitions. + + Supported formats: + - CSV files + - JSON files + - Parquet files + - Excel files + """ + + def __init__( + self, + *, + sample_size: int = 10000, + max_unique_values: int = 100, + datetime_formats: Optional[List[str]] = None, + ) -> None: + """ + Initialize the raw data analyzer. 
+ + :param sample_size: maximum number of rows to sample for + analysis + :param max_unique_values: maximum unique values to consider for + enum types + :param datetime_formats: list of datetime formats to try for + detection + """ + self.sample_size = sample_size + self.max_unique_values = max_unique_values + self.datetime_threshold = 0.8 # 80% success rate for datetime detection + self.datetime_formats = datetime_formats or [ + "%Y-%m-%d", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%SZ", + "%m/%d/%Y", + "%d/%m/%Y", + "%Y%m%d", + ] + self.supported_formats = [".csv", ".json", ".parquet", ".xlsx", ".xls"] + + def analyze_file(self, file_path: str) -> DataAnalysisResult: + """ + Analyze a data file and generate schema information. + + :param file_path: path to the data file to analyze + :return: analysis results with suggested schema + """ + try: + path = pathlib.Path(file_path) + if not path.exists(): + error_msg = f"Data file not found: {file_path}" + return self._create_error_result(file_path, error_msg) + if not self._is_supported_format(path.suffix): + error_msg = f"Unsupported file format: {path.suffix}" + return self._create_error_result(file_path, error_msg) + # Load data based on file format. + df = self._load_data(file_path) + # Sample data if it's too large. + if len(df) > self.sample_size: + _LOG.info( + "Sampling %d rows from %d total rows", + self.sample_size, + len(df), + ) + df = df.sample(n=self.sample_size, random_state=42) + # Analyze each column. + columns_analysis = self._analyze_columns(df) + # Generate suggested schema. + suggested_schema = self._generate_schema(columns_analysis, df) + # Create analysis metadata. 
+ analysis_metadata = self._create_analysis_metadata(file_path, df) + _LOG.info( + "Successfully analyzed %d columns from %s", + len(columns_analysis), + file_path, + ) + return DataAnalysisResult( + file_path=file_path, + total_rows=len(df), + total_columns=len(df.columns), + columns=columns_analysis, + suggested_schema=suggested_schema, + analysis_metadata=analysis_metadata, + ) + except (IOError, ValueError, pd.errors.ParserError) as e: + _LOG.error("Error analyzing file %s: %s", file_path, e) + error_msg = f"Failed to analyze file: {str(e)}" + return self._create_error_result(file_path, error_msg) + + def save_schema( + self, + analysis_result: DataAnalysisResult, + output_path: str, + *, + include_analysis_metadata: bool = True, + ) -> None: + """ + Save the generated schema to a JSON file. + + :param analysis_result: analysis result containing the schema + :param output_path: path where to save the schema.json file + :param include_analysis_metadata: whether to include analysis + metadata + """ + try: + schema_data = analysis_result.suggested_schema.copy() + + if include_analysis_metadata: + schema_data["_analysis_metadata"] = ( + analysis_result.analysis_metadata + ) + + # Ensure output directory exists. + output_dir = pathlib.Path(output_path).parent + hio.create_dir(str(output_dir), incremental=True) + + # Save schema to file. + with open(output_path, "w", encoding="utf-8") as f: + json.dump(schema_data, f, indent=2, default=str) + + _LOG.info("Schema saved to %s", output_path) + + except Exception as e: + _LOG.error("Error saving schema to %s: %s", output_path, e) + raise + + def _load_data(self, file_path: str) -> pd.DataFrame: + """ + Load data from file based on format. + + :param file_path: path to the data file + :return: loaded DataFrame + """ + path = pathlib.Path(file_path) + file_extension = path.suffix.lower() + if file_extension == ".csv": + # Try different encodings and separators. 
+ for encoding in ["utf-8", "latin-1", "cp1252"]: + for sep in [",", ";", "\t"]: + try: + df = pd.read_csv( + file_path, + encoding=encoding, + sep=sep, + nrows=self.sample_size, + ) + # Good indicator of correct separator. + if len(df.columns) > 1: + _LOG.debug( + "Successfully loaded CSV with encoding=%s, sep='%s'", + encoding, + sep, + ) + return df + except (ValueError, TypeError, pd.errors.ParserError): + continue + # Fallback to default pandas behavior. + return pd.read_csv(file_path, nrows=self.sample_size) + if file_extension == ".json": + return pd.read_json(file_path, lines=True, nrows=self.sample_size) + if file_extension == ".parquet": + return pd.read_parquet(file_path) + if file_extension in [".xlsx", ".xls"]: + return pd.read_excel(file_path, nrows=self.sample_size) + raise ValueError(f"Unsupported file format: {file_extension}") + + def _analyze_columns(self, df: pd.DataFrame) -> List[ColumnAnalysis]: + """ + Analyze each column in the DataFrame. + + :param df: DataFrame to analyze + :return: list of column analysis results + """ + columns_analysis = [] + for col_name in df.columns: + col_data = df[col_name] + analysis = self._analyze_single_column(col_name, col_data) + columns_analysis.append(analysis) + return columns_analysis + + def _analyze_single_column( + self, + col_name: str, + col_data: pd.Series, + ) -> ColumnAnalysis: + """ + Analyze a single column. + + :param col_name: name of the column + :param col_data: column data as pandas Series + :return: column analysis result + """ + # Basic statistics. + null_count = col_data.isnull().sum() + unique_count = col_data.nunique() + nullable = null_count > 0 + + # Sample non-null values. + non_null_data = col_data.dropna() + sample_values = ( + non_null_data.head(5).tolist() if len(non_null_data) > 0 else [] + ) + + # Infer data type. + data_type, format_hint = self._infer_data_type(non_null_data) + + # Calculate statistics for numeric columns. 
+ min_value = None + max_value = None + mean_value = None + std_value = None + + if data_type in ["integer", "float"] and len(non_null_data) > 0: + try: + numeric_data = pd.to_numeric(non_null_data, errors="coerce") + min_value = float(numeric_data.min()) + max_value = float(numeric_data.max()) + mean_value = float(numeric_data.mean()) + std_value = float(numeric_data.std()) + except (ValueError, TypeError): + pass + + # Detect patterns for string columns. + pattern = None + if data_type == "string" and len(non_null_data) > 0: + pattern = self._detect_pattern(non_null_data) + + return ColumnAnalysis( + name=col_name, + data_type=data_type, + nullable=nullable, + null_count=int(null_count), + unique_count=int(unique_count), + sample_values=sample_values, + min_value=min_value, + max_value=max_value, + mean_value=mean_value, + std_value=std_value, + pattern=pattern, + format_hint=format_hint, + ) + + def _infer_data_type(self, data: pd.Series) -> tuple[str, Optional[str]]: + """ + Infer the data type of a column. + + :param data: column data (non-null values) + :return: tuple of (data_type, format_hint) + """ + result_type = "string" + format_hint = None + if len(data) == 0: + return result_type, format_hint + # Check for boolean. + if data.dtype == bool or set(data.astype(str).str.lower()) <= { + "true", + "false", + "t", + "f", + "yes", + "no", + "y", + "n", + "1", + "0", + }: + result_type = "boolean" + # Check for integer. + elif data.dtype in ["int64", "int32", "int16", "int8"]: + result_type = "integer" + # Check for float. + elif data.dtype in ["float64", "float32"]: + result_type = "float" + else: + # Try to convert to numeric. + try: + numeric_data = pd.to_numeric(data, errors="coerce") + if not numeric_data.isnull().all(): + # Check if all values are integers. + if ( + numeric_data.fillna(0) + .apply(lambda x: x.is_integer()) + .all() + ): + result_type = "integer" + else: + result_type = "float" + else: + # Check for datetime. 
+ datetime_type, datetime_format = self._check_datetime(data) + if datetime_type: + result_type = datetime_type + format_hint = datetime_format + except (ValueError, TypeError): + # Check for datetime as fallback. + datetime_type, datetime_format = self._check_datetime(data) + if datetime_type: + result_type = datetime_type + format_hint = datetime_format + return result_type, format_hint + + def _check_datetime( + self, data: pd.Series + ) -> tuple[Optional[str], Optional[str]]: + """ + Check if data represents datetime values with improved detection. + + :param data: column data to check + :return: tuple of (datetime_type, format_hint) + """ + # Skip if data looks purely numeric. + try: + pd.to_numeric(data.head(10), errors="raise") + return None, None + except (ValueError, TypeError): + pass + # Sample a subset for testing. + sample_data = data.head(min(100, len(data))) + # Try specific datetime formats first. + for fmt in self.datetime_formats: + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + parsed = pd.to_datetime( + sample_data, format=fmt, errors="coerce" + ) + # Check success rate. + success_rate = 1 - parsed.isnull().sum() / len(sample_data) + if success_rate >= self.datetime_threshold: + # Determine if it's date or datetime. + non_null_parsed = parsed.dropna() + if len(non_null_parsed) > 0: + # Check if all parsed values share the same time-of-day (treated as date-only). + if all( + t.time() == non_null_parsed.iloc[0].time() + for t in non_null_parsed.head(10) + ): + return "date", fmt + return "datetime", fmt + except (ValueError, TypeError, AttributeError): + continue + # Try pandas automatic parsing as last resort. 
+ try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + parsed = pd.to_datetime( + sample_data, errors="coerce" + ) + success_rate = 1 - parsed.isnull().sum() / len(sample_data) + if success_rate >= self.datetime_threshold: + return "datetime", "auto" + except (ValueError, TypeError, AttributeError): + pass + return None, None + + def _detect_pattern(self, data: pd.Series) -> Optional[str]: + """ + Detect common patterns in string data. + + :param data: string column data + :return: detected pattern or None + """ + str_data = data.astype(str) + # Check for common patterns. + patterns = { + "email": r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", + "phone": r"^[\+]?[1-9]?[0-9]{7,15}$", + "url": r"^https?://[^\s/$.?#].[^\s]*$", + "uuid": r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", + "ip_address": r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$", + } + for pattern_name, pattern_regex in patterns.items(): + if str_data.str.match(pattern_regex, case=False).mean() > 0.8: + return pattern_name + return None + + def _generate_schema( + self, + columns_analysis: List[ColumnAnalysis], + df: pd.DataFrame, + ) -> Dict[str, Any]: + """ + Generate JSON Schema from column analysis. + + :param columns_analysis: list of analyzed columns + :param df: original DataFrame + :return: JSON Schema dictionary + """ + properties = {} + required_fields = [] + + for col in columns_analysis: + col_schema = { + "type": self._map_to_json_schema_type(col.data_type), + "description": f"Column '{col.name}' with {col.unique_count} unique values", + } + + # Add format hint if available. + if col.format_hint and col.format_hint != "auto": + col_schema["format"] = col.format_hint + + # Add constraints for numeric fields. 
+ if col.data_type in ["integer", "float"]: + if col.min_value is not None: + col_schema["minimum"] = col.min_value + if col.max_value is not None: + col_schema["maximum"] = col.max_value + + # Add enum for low-cardinality categorical fields. + if ( + col.data_type == "string" + and col.unique_count <= self.max_unique_values + and col.unique_count > 1 + ): + unique_values = df[col.name].dropna().unique().tolist() + col_schema["enum"] = unique_values[:50] # Limit enum size. + + # Add pattern if detected. + if col.pattern: + col_schema["pattern"] = col.pattern + + properties[col.name] = col_schema + + # Mark as required if no null values. + if not col.nullable: + required_fields.append(col.name) + + schema = { + "type": "object", + "title": "Generated Schema", + "description": f"Auto-generated schema for data with {len(columns_analysis)} columns", + "properties": properties, + } + + if required_fields: + schema["required"] = required_fields + + return schema + + def _map_to_json_schema_type(self, data_type: str) -> str: + """ + Map internal data types to JSON Schema types. + + :param data_type: internal data type + :return: JSON Schema type + """ + type_mapping = { + "integer": "integer", + "float": "number", + "boolean": "boolean", + "string": "string", + "datetime": "string", + "date": "string", + "object": "object", + "array": "array", + } + return type_mapping.get(data_type, "string") + + def _create_analysis_metadata( + self, + file_path: str, + df: pd.DataFrame, + ) -> Dict[str, Any]: + """ + Create metadata about the analysis process. 
+ + :param file_path: path to analyzed file + :param df: analyzed DataFrame + :return: analysis metadata + """ + return { + "analysis_timestamp": pd.Timestamp.now().isoformat(), + "analyzer_version": "1.0.0", + "source_file": file_path, + "sample_size": min(len(df), self.sample_size), + "total_rows_analyzed": len(df), + "total_columns_analyzed": len(df.columns), + "file_size_bytes": pathlib.Path(file_path).stat().st_size, + "analysis_settings": { + "max_unique_values": self.max_unique_values, + "datetime_formats": self.datetime_formats, + }, + } + + def _is_supported_format(self, file_extension: str) -> bool: + """ + Check if file format is supported. + + :param file_extension: file extension to check + :return: whether the format is supported + """ + return file_extension.lower() in self.supported_formats + + def _create_error_result( + self, + file_path: str, + error_message: str, + ) -> DataAnalysisResult: + """ + Create a DataAnalysisResult with error information. + + :param file_path: path to the file that caused the error + :param error_message: error message to include + :return: DataAnalysisResult object with error details + """ + return DataAnalysisResult( + file_path=file_path, + total_rows=0, + total_columns=0, + columns=[], + suggested_schema={}, + analysis_metadata={}, + error_message=error_message, + ) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Analyze raw data files and generate schema information." 
+ ) + parser.add_argument("file", type=str, help="Path to the data file to analyze") + args = parser.parse_args() + analyzer = RawDataAnalyzer() + result = analyzer.analyze_file(args.file) + if result.error_message: + _LOG.error("Error: %s", result.error_message) + else: + _LOG.info("File: %s", result.file_path) + _LOG.info("Total rows: %d", result.total_rows) + _LOG.info("Total columns: %d", result.total_columns) + _LOG.info("Columns:") + for col in result.columns: + _LOG.info( + " - %s: %s, nullable=%s", col.name, col.data_type, col.nullable + ) + _LOG.info("Suggested schema: %s", result.suggested_schema) + _LOG.info("Metadata: %s", result.analysis_metadata) + # Save suggested schema to JSON file. + schema_path = args.file + ".schema.json" + try: + with open(schema_path, "w", encoding="utf-8") as f: + json.dump(result.suggested_schema, f, indent=2) + _LOG.info("Saved schema to %s", schema_path) + except (IOError, TypeError, ValueError) as e: + _LOG.error("Failed to save schema to %s: %s", schema_path, e) + + +if __name__ == "__main__": + main() diff --git a/langgraph/src/agent/schema_parser.py b/langgraph/src/agent/schema_parser.py new file mode 100755 index 000000000..109885926 --- /dev/null +++ b/langgraph/src/agent/schema_parser.py @@ -0,0 +1,695 @@ +#!/usr/bin/env python + +""" +Schema Parser Module for AutoEDA Agent. + +A standalone module to parse JSON/YAML schema files and extract column information +for automated exploratory data analysis. + +Import as: + +import schema_parser as schpar +""" + +import argparse +import dataclasses +import json +import logging +import pathlib +from typing import Any, Dict, List, Optional + +import yaml # type: ignore[import-untyped] + +import helpers.hdbg as hdbg +import helpers.hio as hio + +# Configure logging. 
+logging.basicConfig(level=logging.INFO) +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# ColumnInfo +# ############################################################################# + + +@dataclasses.dataclass +class ColumnInfo: + """ + Represent information about a single column. + """ + + name: str + data_type: str + required: bool = False + description: str = "" + nullable: bool = True + default: Any = None + constraints: Optional[Dict[str, Any]] = None + metadata: Optional[Dict[str, Any]] = None + + def __post_init__(self) -> None: + """ + Initialize default values for optional fields. + """ + if self.constraints is None: + self.constraints = {} + if self.metadata is None: + self.metadata = {} + + +# ############################################################################# +# ParsedSchema +# ############################################################################# + + +@dataclasses.dataclass +class ParsedSchema: + """ + Represent the complete parsed schema result. + """ + + columns: List[ColumnInfo] + raw_schema: Dict[str, Any] + schema_type: str + total_columns: int + required_columns: int + optional_columns: int + error_message: Optional[str] = None + + def __post_init__(self) -> None: + """ + Calculate derived fields from columns. + """ + self.total_columns = len(self.columns) + self.required_columns = sum(1 for col in self.columns if col.required) + self.optional_columns = self.total_columns - self.required_columns + + +# ############################################################################# +# SchemaParser +# ############################################################################# + + +class SchemaParser: + """ + Parse various schema formats and extract column information. 
+ + Supported formats: + - JSON Schema + - Custom column definitions + - YAML schema files + - Generic key-value structures + """ + + def __init__( + self, + *, + include_metadata: bool = True, + output_format: str = "detailed", + ) -> None: + """ + Initialize the schema parser. + + :param include_metadata: whether to include metadata in parsed + results + :param output_format: output format, either "detailed" or + "simple" + """ + hdbg.dassert_in(output_format, ["detailed", "simple"]) + self.include_metadata = include_metadata + self.output_format = output_format + self.supported_formats = ["json", "yaml", "yml"] + + def parse_file(self, file_path: str) -> ParsedSchema: + """ + Parse schema from a file. + + :param file_path: path to the schema file + :return: parsed schema object with results + """ + try: + path = pathlib.Path(file_path) + if not path.exists(): + error_msg = f"Schema file not found: {file_path}" + return self._create_error_result(error_msg) + if not self._is_supported_format(path.suffix): + error_msg = f"Unsupported file format: {path.suffix}" + return self._create_error_result(error_msg) + # Read file content. + content = hio.from_file(str(path)) + return self.parse_content(content, source=str(path)) + except (FileNotFoundError, IOError, OSError, ValueError, TypeError) as e: + _LOG.error("Error parsing file %s: %s", file_path, e) + error_msg = f"Failed to parse file: {str(e)}" + return self._create_error_result(error_msg) + + def parse_content( + self, + content: str, + *, + source: str = "content", + ) -> ParsedSchema: + """ + Parse schema from string content. 
+ + :param content: schema content as string + :param source: source identifier for logging + :return: parsed schema object with results + """ + try: + schema_data = self._parse_schema_content(content) + schema_type = self._detect_schema_type(schema_data) + columns = self._extract_columns(schema_data, schema_type) + + _LOG.info( + "Successfully parsed %d columns from %s", + len(columns), + source, + ) + + return ParsedSchema( + columns=columns, + raw_schema=schema_data, + schema_type=schema_type, + total_columns=len(columns), + required_columns=0, # Will be calculated in __post_init__. + optional_columns=0, # Will be calculated in __post_init__. + ) + + except ( + json.JSONDecodeError, + yaml.YAMLError, + ValueError, + KeyError, + TypeError, + ) as e: + _LOG.error("Error parsing content from %s: %s", source, e) + error_msg = f"Failed to parse content: {str(e)}" + return self._create_error_result(error_msg) + + def to_dict(self, parsed_schema: ParsedSchema) -> Dict[str, Any]: + """ + Convert ParsedSchema to dictionary format. + + :param parsed_schema: parsed schema object to convert + :return: dictionary representation of the schema + """ + return { + "columns": [ + { + "name": col.name, + "data_type": col.data_type, + "required": col.required, + "description": col.description, + "nullable": col.nullable, + "default": col.default, + "constraints": col.constraints, + "metadata": col.metadata if self.include_metadata else {}, + } + for col in parsed_schema.columns + ], + "schema_info": { + "schema_type": parsed_schema.schema_type, + "total_columns": parsed_schema.total_columns, + "required_columns": parsed_schema.required_columns, + "optional_columns": parsed_schema.optional_columns, + }, + "raw_schema": parsed_schema.raw_schema, + "error_message": parsed_schema.error_message, + } + + def to_dataframe_schema(self, parsed_schema: ParsedSchema) -> Dict[str, str]: + """ + Convert to simple column_name: data_type mapping for pandas. 
+ + :param parsed_schema: parsed schema object to convert + :return: simple mapping of column names to data types + """ + return {col.name: col.data_type for col in parsed_schema.columns} + + def _parse_schema_content(self, content: str) -> Dict[str, Any]: + """ + Parse content as JSON or YAML. + + :param content: raw content string to parse + :return: parsed data as dictionary + """ + content = content.strip() + # Try JSON first. + try: + parsed_data = json.loads(content) + if not isinstance(parsed_data, dict): + raise ValueError("Schema must be a JSON object/dictionary") + return parsed_data + except json.JSONDecodeError: + pass + # Try YAML. + try: + parsed_data = yaml.safe_load(content) + if not isinstance(parsed_data, dict): + raise ValueError("Schema must be a YAML object/dictionary") + return parsed_data + except yaml.YAMLError as e: + raise ValueError(f"Invalid JSON/YAML format: {e}") from e + + def _detect_schema_type(self, schema_data: Dict[str, Any]) -> str: + """ + Detect the type of schema format. + + :param schema_data: parsed schema data + :return: detected schema type + """ + if "properties" in schema_data and "type" in schema_data: + return "json_schema" + if "columns" in schema_data: + return "custom_columns" + if "fields" in schema_data: + return "fields_format" + if any( + isinstance(v, dict) and ("type" in v or "dataType" in v) + for v in schema_data.values() + ): + return "generic_properties" + return "unknown" + + def _extract_columns( + self, + schema_data: Dict[str, Any], + schema_type: str, + ) -> List[ColumnInfo]: + """ + Extract column information based on schema type. 
+ + :param schema_data: parsed schema data + :param schema_type: detected schema type + :return: list of column information objects + """ + extractors = { + "json_schema": self._extract_from_json_schema, + "custom_columns": self._extract_from_custom_format, + "fields_format": self._extract_from_fields_format, + "generic_properties": self._extract_from_generic_format, + "unknown": self._extract_from_unknown_format, + } + + extractor = extractors.get(schema_type, self._extract_from_unknown_format) + return extractor(schema_data) + + def _extract_from_json_schema( + self, + schema_data: Dict[str, Any], + ) -> List[ColumnInfo]: + """ + Extract columns from JSON Schema format. + + :param schema_data: JSON schema data + :return: list of column information objects + """ + columns = [] + properties = schema_data.get("properties", {}) + required_fields = set(schema_data.get("required", [])) + + for column_name, column_info in properties.items(): + constraints = {} + metadata = {} + + # Extract constraints. + for key in [ + "minimum", + "maximum", + "minLength", + "maxLength", + "pattern", + "enum", + ]: + if key in column_info: + constraints[key] = column_info[key] + + # Extract metadata if enabled. 
+ if self.include_metadata: + excluded_keys = [ + "type", + "description", + "minimum", + "maximum", + "minLength", + "maxLength", + "pattern", + "enum", + ] + metadata = { + k: v for k, v in column_info.items() if k not in excluded_keys + } + + column = ColumnInfo( + name=column_name, + data_type=self._map_json_schema_type( + column_info.get("type", "string") + ), + required=column_name in required_fields, + description=column_info.get("description", ""), + nullable=not (column_name in required_fields), + default=column_info.get("default"), + constraints=constraints, + metadata=metadata, + ) + + columns.append(column) + + return columns + + def _extract_from_custom_format( + self, + schema_data: Dict[str, Any], + ) -> List[ColumnInfo]: + """ + Extract columns from custom columns format. + + :param schema_data: custom format schema data + :return: list of column information objects + """ + columns = [] + columns_data = schema_data.get("columns", []) + + for column_info in columns_data: + if not isinstance(column_info, dict): + continue + + constraints = column_info.get("constraints", {}) + metadata = {} + + if self.include_metadata: + excluded_keys = [ + "name", + "type", + "required", + "description", + "nullable", + "default", + "constraints", + ] + metadata = { + k: v for k, v in column_info.items() if k not in excluded_keys + } + + column = ColumnInfo( + name=column_info.get("name", ""), + data_type=self._normalize_data_type( + column_info.get("type", "string") + ), + required=column_info.get("required", False), + description=column_info.get("description", ""), + nullable=column_info.get("nullable", True), + default=column_info.get("default"), + constraints=constraints, + metadata=metadata, + ) + + columns.append(column) + + return columns + + def _extract_from_fields_format( + self, + schema_data: Dict[str, Any], + ) -> List[ColumnInfo]: + """ + Extract columns from fields format. 
+ + :param schema_data: fields format schema data + :return: list of column information objects + """ + columns = [] + fields_data = schema_data.get("fields", []) + + for field_info in fields_data: + if not isinstance(field_info, dict): + continue + + constraints = field_info.get("constraints", {}) + metadata = {} + + if self.include_metadata: + excluded_keys = [ + "name", + "type", + "optional", + "description", + "constraints", + ] + metadata = { + k: v for k, v in field_info.items() if k not in excluded_keys + } + + column = ColumnInfo( + name=field_info.get("name", ""), + data_type=self._normalize_data_type( + field_info.get("type", "string") + ), + required=not field_info.get("optional", True), + description=field_info.get("description", ""), + nullable=field_info.get("optional", True), + constraints=constraints, + metadata=metadata, + ) + + columns.append(column) + + return columns + + def _extract_from_generic_format( + self, + schema_data: Dict[str, Any], + ) -> List[ColumnInfo]: + """ + Extract columns from generic format. 
+ + :param schema_data: generic format schema data + :return: list of column information objects + """ + columns = [] + + for key, value in schema_data.items(): + if not isinstance(value, dict): + continue + + if "type" not in value and "dataType" not in value: + continue + + metadata = {} + if self.include_metadata: + excluded_keys = [ + "type", + "dataType", + "required", + "description", + "nullable", + ] + metadata = { + k: v for k, v in value.items() if k not in excluded_keys + } + + column = ColumnInfo( + name=key, + data_type=self._normalize_data_type( + value.get("type", value.get("dataType", "string")) + ), + required=value.get("required", False), + description=value.get("description", ""), + nullable=value.get("nullable", True), + metadata=metadata, + ) + + columns.append(column) + + return columns + + def _extract_from_unknown_format( + self, + schema_data: Dict[str, Any], + ) -> List[ColumnInfo]: + """ + Extract columns from unknown format by making best guesses. + + :param schema_data: unknown format schema data + :return: list of column information objects + """ + columns = [] + + # Try to extract any key-value pairs as potential columns. + for key, value in schema_data.items(): + if isinstance(value, str): + # Assume string values are data types. + column = ColumnInfo( + name=key, + data_type=self._normalize_data_type(value), + description="Inferred from key-value pair", + ) + columns.append(column) + + return columns + + def _map_json_schema_type(self, json_type: str) -> str: + """ + Map JSON Schema types to standard data types. + + :param json_type: JSON schema type string + :return: normalized data type + """ + type_mapping = { + "string": "string", + "integer": "integer", + "number": "float", + "boolean": "boolean", + "array": "array", + "object": "object", + "null": "null", + } + return type_mapping.get(json_type.lower(), "string") + + def _normalize_data_type(self, data_type: str) -> str: + """ + Normalize various data type representations. 
+ + :param data_type: raw data type string + :return: normalized data type + """ + if not isinstance(data_type, str): + return "string" + data_type = data_type.lower().strip() + type_mapping = { + # String types. + "str": "string", + "text": "string", + "varchar": "string", + "char": "string", + "nvarchar": "string", + # Integer types. + "int": "integer", + "int32": "integer", + "int64": "integer", + "bigint": "integer", + "smallint": "integer", + # Float types. + "float": "float", + "float32": "float", + "float64": "float", + "double": "float", + "decimal": "float", + "numeric": "float", + "real": "float", + # Boolean types. + "bool": "boolean", + "bit": "boolean", + # Date/Time types. + "datetime": "datetime", + "datetime64": "datetime", + "timestamp": "datetime", + "date": "date", + "time": "time", + # Other types. + "json": "object", + "jsonb": "object", + "uuid": "string", + "blob": "binary", + "binary": "binary", + } + return type_mapping.get(data_type, data_type) + + def _is_supported_format(self, file_extension: str) -> bool: + """ + Check if file format is supported. + + :param file_extension: file extension to check + :return: whether the format is supported + """ + return file_extension.lower().lstrip(".") in self.supported_formats + + def _create_error_result(self, error_message: str) -> ParsedSchema: + """ + Create a ParsedSchema with error information. + + :param error_message: error message to include + :return: ParsedSchema object with error details + """ + return ParsedSchema( + columns=[], + raw_schema={}, + schema_type="error", + total_columns=0, + required_columns=0, + optional_columns=0, + error_message=error_message, + ) + + +# ############################################################################# +# Convenience functions. 
+# ############################################################################# + + +def parse_schema_file( + file_path: str, + *, + include_metadata: bool = True, +) -> ParsedSchema: + """ + Parse a schema file using default settings. + + :param file_path: path to schema file + :param include_metadata: whether to include metadata + :return: parsed schema object + """ + parser = SchemaParser(include_metadata=include_metadata) + return parser.parse_file(file_path) + + +def parse_schema_content( + content: str, + *, + include_metadata: bool = True, +) -> ParsedSchema: + """ + Parse schema content using default settings. + + :param content: schema content as string + :param include_metadata: whether to include metadata + :return: parsed schema object + """ + parser = SchemaParser(include_metadata=include_metadata) + return parser.parse_content(content) + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Parse a schema file and log the results." + ) + parser.add_argument( + "schema_file", type=str, help="Path to the schema file (JSON)" + ) + args = parser.parse_args() + # Read schema content from file + try: + with open(args.schema_file, "r", encoding="utf-8") as f: + schema_content = f.read() + except (FileNotFoundError, IOError, OSError) as e: + _LOG.error("Failed to read schema file %s: %s", args.schema_file, e) + return + result = parse_schema_content(schema_content) + if result.error_message: + _LOG.error("Error: %s", result.error_message) + else: + _LOG.info("Successfully parsed %d columns", result.total_columns) + _LOG.info("Required columns: %d", result.required_columns) + _LOG.info("Optional columns: %d", result.optional_columns) + _LOG.info("Schema type: %s", result.schema_type) + _LOG.info("Columns:") + for col in result.columns: + _LOG.info( + " - %s: %s (required: %s)", col.name, col.data_type, col.required + ) + + +if __name__ == "__main__": + main() diff --git a/langgraph/static/studio_ui.png b/langgraph/static/studio_ui.png new 
file mode 100644 index 000000000..9ccba2b20 Binary files /dev/null and b/langgraph/static/studio_ui.png differ diff --git a/langgraph/tests/conftest.py b/langgraph/tests/conftest.py new file mode 100644 index 000000000..26262e498 --- /dev/null +++ b/langgraph/tests/conftest.py @@ -0,0 +1,6 @@ +import pytest + + +@pytest.fixture(scope="session") +def anyio_backend(): + return "asyncio" diff --git a/langgraph/tests/integration_tests/__init__.py b/langgraph/tests/integration_tests/__init__.py new file mode 100644 index 000000000..d02981b81 --- /dev/null +++ b/langgraph/tests/integration_tests/__init__.py @@ -0,0 +1 @@ +"""Define any integration tests you want in this directory.""" diff --git a/langgraph/tests/integration_tests/test_graph.py b/langgraph/tests/integration_tests/test_graph.py new file mode 100644 index 000000000..68169be7c --- /dev/null +++ b/langgraph/tests/integration_tests/test_graph.py @@ -0,0 +1,12 @@ +import pytest + +from agent import graph + +pytestmark = pytest.mark.anyio + + +@pytest.mark.langsmith +async def test_agent_simple_passthrough() -> None: + inputs = {"changeme": "some_val"} + res = await graph.ainvoke(inputs) + assert res is not None diff --git a/langgraph/tests/unit_tests/__init__.py b/langgraph/tests/unit_tests/__init__.py new file mode 100644 index 000000000..f2900f275 --- /dev/null +++ b/langgraph/tests/unit_tests/__init__.py @@ -0,0 +1 @@ +"""Define any unit tests you may want in this directory.""" diff --git a/langgraph/tests/unit_tests/test_configuration.py b/langgraph/tests/unit_tests/test_configuration.py new file mode 100644 index 000000000..e06f9e38f --- /dev/null +++ b/langgraph/tests/unit_tests/test_configuration.py @@ -0,0 +1,9 @@ +from langgraph.pregel import Pregel + +from agent.graph import graph + + +def test_placeholder() -> None: + # TODO: You can add actual unit tests + # for your graph and other logic here. + assert isinstance(graph, Pregel)