diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1f723c0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +__pycache__ +*.pyc +.git +.github +data/ +*.egg-info +dist/ +build/ +.env +node_modules +.mypy_cache +.ruff_cache +.pytest_cache +QUALITY_REPORT.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..5aa7f2d --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + branches: [main, develop] + pull_request: + branches: [main] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Install ruff + run: pip install ruff + - name: Lint + run: ruff check langent/ tests/ + + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -e ".[dev]" + - name: Run tests + run: pytest tests/ -v --tb=short diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e19c65a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY pyproject.toml README.md ./ +COPY langent/ langent/ +RUN pip install --no-cache-dir -e . 
+ +# Copy config and visualizer +COPY config/ config/ +COPY langent/visualizer/ langent/visualizer/ + +# Create data directory +RUN mkdir -p /app/data + +EXPOSE 8000 + +ENV LANGENT_WORKSPACE=/app/workspace +ENV CHROMA_DB_PATH=/app/data/chroma_db + +CMD ["langent", "serve", "--host", "0.0.0.0", "--port", "8000"] diff --git a/QUALITY_REPORT.md b/QUALITY_REPORT.md new file mode 100644 index 0000000..1c7a284 --- /dev/null +++ b/QUALITY_REPORT.md @@ -0,0 +1,390 @@ +# Langent v2.0.0 Repository Quality Analysis + +**Date**: 2026-02-24 +**Scope**: Full codebase review (~1,900 lines Python + ~560 lines JavaScript) + +--- + +## Executive Summary + +| Category | Score | Grade | +|---|---|---| +| **Architecture & Design** | 8.5/10 | A- | +| **Code Quality** | 7.5/10 | B+ | +| **Type Safety** | 7.0/10 | B | +| **Error Handling** | 5.5/10 | C+ | +| **Testing** | 3.0/10 | D | +| **Security** | 4.0/10 | D+ | +| **Documentation** | 7.5/10 | B+ | +| **CI/CD & DevOps** | 1.0/10 | F | +| **Dependency Management** | 7.0/10 | B | +| **Overall** | 5.7/10 | C+ | + +--- + +## 1. Architecture & Design (8.5/10) + +### Strengths + +- **Clear modular structure**: Well-separated concerns across `brain`, `rag`, `store`, `server`, `agents`, `skills` packages +- **Adapter pattern**: `VectorStore` and `GraphStore` are cleanly abstracted, enabling provider swaps +- **Pipeline pattern**: RAG workflow (ingest -> chunk -> embed -> store -> retrieve) is well-defined +- **LangGraph integration**: State machine-based workflows (`AgentState` TypedDict) in `langent/agents/workflows.py` +- **Lazy initialization**: Heavy objects (embedding models, Neo4j driver, Langent instance) are loaded on demand +- **Dual-layer caching** in `VectorStore.get_3d_positions()`: in-memory + disk cache with cache invalidation + +### Issues + +- **God Class**: `Langent` class (`brain.py`, 330 lines) acts as the central orchestrator for 7+ subsystems. 
Should be decomposed: + - Workspace management + - Vector/Graph store lifecycle + - RAG orchestration + - Agent workflow execution + - Skill management + +- **Global state in servers**: Both `api.py:31` and `mcp_server.py:32` use module-level `_langent = None` with mutable global state. Not safe for concurrent access or testing. + +```python +# api.py:31-40 — global singleton pattern +_langent = None +def get_langent(): + global _langent + if _langent is None: + _langent = Langent(verbose=True) + return _langent +``` + +- **`sys.path` manipulation**: `api.py:13` and `mcp_server.py:22` use `sys.path.insert()` — indicates packaging issues: +```python +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +``` + +--- + +## 2. Code Quality (7.5/10) + +### Strengths + +- Consistent snake_case naming throughout +- Descriptive method names: `get_nebula_data()`, `search_with_positions()`, `auto_link()` +- Clean Korean docstrings provide context alongside English code +- Good DRY principle adherence — `chunk_documents()` reuses `chunk_document()` +- Visual separators (`# ─── Section ───`) improve readability + +### Issues + +- **String interpolation in Cypher queries** (`graph.py:76,85-88`): Dynamic label/key injection via f-strings creates Cypher injection risk: +```python +# graph.py:76 +query = f"CREATE (n:{label} {{{props_str}}}) RETURN n" +``` + +- **Workflow prompt uses `${}` instead of Python f-strings** (`workflows.py:95-101`): +```python +prompt = f"""... +[사용자 질문] +${query} # Bug: should be {query} +[벡터 컨텍스트] +${context} # Bug: should be {context} +""" +``` +This is a **functional bug** — the variables won't be interpolated. + +- **Unused import**: `json` imported twice in `api.py` (lines 8 and 94) + +- **Hardcoded fallback path** in `mcp_server.py:40-41`: +```python +workspace = os.environ.get( + "LANGENT_WORKSPACE", + r"c:\Users\daewooenc\workspace\Ontology" # Windows-specific personal path +) +``` + +--- + +## 3. 
Type Safety (7.0/10) + +### Strengths + +- Consistent use of `typing` module: `Optional`, `List`, `Dict`, `Any`, `TypedDict` +- `AgentState` is properly defined as a `TypedDict` for LangGraph +- Pydantic models for config (`LangentConfig`) and sub-agents (`SubAgent`) + +### Issues + +- Heavy use of `Dict[str, Any]` as a catch-all return type — loses type information at boundaries +- No `mypy` configuration or strict type checking in the project +- `workspace: str = None` in `brain.py:44` should be `Optional[str] = None` +- No runtime type validation on API request parameters (FastAPI route handlers accept raw query params without Pydantic models) + +--- + +## 4. Error Handling (5.5/10) — Major Weakness + +### Pattern: Silent exception swallowing + +Multiple locations catch `Exception` and silently discard it: + +| Location | Code | Impact | +|---|---|---| +| `brain.py:80-81` | `except Exception: self.graph = None` | Neo4j init failures silently ignored | +| `brain.py:217-218` | `except Exception: pass` | Graph visualization failures hidden | +| `vector.py:83-84` | `except Exception: pass` | Duplicate check failures hidden | +| `vector.py:186-187` | `except Exception: pass` | Cache read failures hidden | +| `vector.py:227-228` | `except Exception: pass` | Cache write failures hidden | +| `retriever.py:76-77` | `except Exception: continue` | Graph query failures per keyword hidden | +| `api.py:139-140` | `except Exception: pass` | WebSocket broadcast failures hidden | +| `ingest.py:109-110` | `except Exception: pass` | CSV read failures hidden | + +### No logging framework + +The entire codebase uses `print()` instead of Python's `logging` module: +- `brain.py:317-319`: Custom `_log()` wraps `print()` +- `graph.py:43`: `print(f"Neo4j connection failed: {e}")` +- `ingest.py:52`: `print(f" [Warning] Error extracting...")` + +This makes debugging in production extremely difficult — no log levels, no log rotation, no structured logging. + +--- + +## 5. 
Testing (3.0/10) — Critical Weakness + +### Current state + +- **1 test file** with meaningful content: `tests/test_smoke.py` (93 lines) +- Test structure is a **single monolithic function** `test_basic()` — not proper pytest test cases +- No use of `pytest` fixtures, parametrize, or assertions — uses `print()` for verification +- No separation between unit tests and integration tests + +```python +# test_smoke.py — This is a script, not proper tests +def test_basic(): + print("1️⃣ Import test...") + from langent.config import LangentConfig + print(" ✓ All modules imported") # No assertions! +``` + +### Missing test coverage + +- No tests for: `api.py`, `mcp_server.py`, `cli.py`, `workflows.py`, `delegation.py`, `skills/loader.py` +- No error path testing +- No mock/stub usage for external dependencies (Neo4j, ChromaDB) +- No async test coverage (despite `pytest-asyncio` being in dev deps) +- No test configuration (`pytest.ini`, `conftest.py`, or `pyproject.toml [tool.pytest]`) + +### Estimated coverage: **~10-15%** + +--- + +## 6. Security (4.0/10) — Critical Weakness + +### Hardcoded credentials + +```python +# config.py:23 — Real password exposed in source code +class GraphStoreConfig(BaseModel): + password: str = "yw02280228" + +# graph.py:21 — Duplicated hardcoded password +class GraphStore: + def __init__(self, ..., password: str = "yw02280228"): +``` + +### Cypher injection vulnerability + +`graph.py:75-76,85-88,107-109`: Label and key names are interpolated directly into Cypher queries via f-strings without sanitization: +```python +def create_entity(self, label: str, properties: Dict[str, Any]) -> Dict: + props_str = ", ".join(f"{k}: ${k}" for k in properties) + query = f"CREATE (n:{label} {{{props_str}}}) RETURN n" +``` + +If `label` or key names come from user input, this allows arbitrary Cypher execution. 
+ +### CORS wide open + +```python +# api.py:23-28 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) +``` + +### No input validation on graph queries + +```python +# api.py:101-103 — Raw Cypher from user passed directly to Neo4j +@app.post("/api/graph") +async def graph_query(cypher: str): + return get_langent().graph_query(cypher) +``` + +### No authentication/authorization on any API endpoint + +All endpoints are publicly accessible without any auth mechanism. + +--- + +## 7. Documentation (7.5/10) + +### Strengths + +- Comprehensive `README.md` with installation, usage, architecture, and MCP integration docs +- YouTube demo link included +- Visual screenshots of the 3D nebula +- Korean + English bilingual documentation +- All modules have descriptive docstrings +- `.env.example` provides clear configuration template + +### Issues + +- No API documentation (no OpenAPI/Swagger customization beyond FastAPI defaults) +- No `CONTRIBUTING.md` or development setup guide +- No `CHANGELOG.md` for version tracking +- README references `python server/api.py` instead of `langent serve` +- `README.md:54`: Typo "raw raw files" + +--- + +## 8. CI/CD & DevOps (1.0/10) — Critical Gap + +### Completely absent + +- No `.github/workflows/` or any CI configuration +- No `Dockerfile` or container setup +- No `Makefile` or task runner +- No pre-commit hooks (`.pre-commit-config.yaml`) +- No linting configuration (no `ruff.toml`, `.flake8`, `mypy.ini`, etc.) +- No code formatting tool configured (no `black`, `isort`, `ruff format`) +- No dependency lockfile (`requirements.txt` or `poetry.lock`) + +--- + +## 9. 
Dependency Management (7.0/10) + +### Strengths + +- Well-organized `pyproject.toml` with categorized comments +- Reasonable version pinning (minimum versions specified) +- Dev dependencies separated via `[project.optional-dependencies]` +- Hatchling build system is modern and well-suited + +### Issues + +- No upper bounds on dependencies — major version bumps could break the project +- `requests` is used in `llm_proxy.py:74` but not declared in dependencies +- No lockfile for reproducible builds +- Heavy dependency chain: `sentence-transformers` + `torch` adds ~2GB to install size but there's no mention of this in README + +--- + +## 10. Specific Bugs Found + +| # | File | Line | Severity | Description | +|---|---|---|---|---| +| 1 | `workflows.py` | 95-101 | **High** | `${query}` / `${context}` / `${graph_str}` uses shell-style interpolation inside f-string — variables not substituted | +| 2 | `config.py` | 23 | **High** | Hardcoded password `"yw02280228"` committed to repo | +| 3 | `graph.py` | 21 | **High** | Same hardcoded password duplicated | +| 4 | `mcp_server.py` | 40-41 | **Medium** | Windows personal path as fallback | +| 5 | `mcp_server.py` | 137 | **Medium** | `agent.graph.query()` should be `agent.graph_query()` — will crash at runtime | +| 6 | `api.py` | 94 | **Low** | Duplicate `import json` | +| 7 | `api.py` | 13 | **Low** | `sys.path.insert()` hack | + +--- + +## 11. Recommendations (Priority Order) + +### P0 — Immediate + +1. **Remove hardcoded credentials** from `config.py:23` and `graph.py:21` — use environment variables only +2. **Fix the `${}` interpolation bug** in `workflows.py:95-101` +3. **Fix `mcp_server.py:137`** method call: `agent.graph.query()` -> `agent.graph_query()` + +### P1 — Short-term + +4. **Add logging framework**: Replace all `print()` with `logging.getLogger(__name__)` +5. **Add proper test suite**: Convert smoke test to proper pytest cases with fixtures and assertions +6. 
**Add input validation**: Pydantic request models for FastAPI endpoints +7. **Sanitize Cypher queries**: Validate label/key names against allowlists in `GraphStore` +8. **Add API authentication**: At minimum, API key middleware for production use + +### P2 — Medium-term + +9. **Set up CI/CD**: GitHub Actions for lint + test + build +10. **Add linting**: `ruff` for linting and formatting +11. **Add type checking**: `mypy` with strict mode +12. **Refactor `Langent` class**: Extract subsystem managers +13. **Remove `sys.path` manipulation**: Fix package structure +14. **Add `Dockerfile`** for reproducible deployments +15. **Replace global singletons** with FastAPI dependency injection + +### P3 — Long-term + +16. **Add structured logging** with correlation IDs +17. **Add integration test suite** with Docker Compose (ChromaDB + Neo4j) +18. **Add API rate limiting** and request validation +19. **Implement proper error types** instead of returning `{"error": "..."}` dicts +20. **Add health check endpoints** and graceful shutdown handling + +--- + +## Architecture Diagram + +``` +┌─────────────────────────────────────────────────┐ +│ langent serve (CLI) │ +│ langent ingest │ +│ langent query │ +├─────────────┬───────────────┬───────────────────┤ +│ FastAPI │ MCP Server │ CLI (Click) │ +│ + WebSocket │ (stdio) │ │ +├─────────────┴───────────────┴───────────────────┤ +│ Langent Brain │ +│ (Central Orchestrator — God Class) │ +├──────────┬──────────┬──────────┬────────────────┤ +│ RAG │ Stores │ Agents │ Skills │ +│ Pipeline │ │ │ │ +│ ┌──────┐ │ ┌──────┐ │ ┌──────┐ │ ┌────────────┐│ +│ │Ingest│ │ │Vector│ │ │Work- │ │ │SkillLoader ││ +│ │or │ │ │Store │ │ │flows │ │ │(SKILL.md) ││ +│ ├──────┤ │ │Chroma│ │ │Lang- │ │ └────────────┘│ +│ │Chunk-│ │ │DB │ │ │Graph │ │ │ +│ │er │ │ ├──────┤ │ ├──────┤ │ │ +│ ├──────┤ │ │Graph │ │ │Sub- │ │ │ +│ │Retri-│ │ │Store │ │ │Agent │ │ │ +│ │ever │ │ │Neo4j │ │ │Mgr │ │ │ +│ └──────┘ │ └──────┘ │ └──────┘ │ │ 
+├──────────┴──────────┴──────────┴────────────────┤ +│ LLM Proxy Layer │ +│ fake | ollama | mcp | proxy modes │ +└─────────────────────────────────────────────────┘ +``` + +--- + +## File-by-File Quality Scores + +| File | Lines | Score | Key Concern | +|---|---|---|---| +| `brain.py` | 331 | 7/10 | God class, silent exceptions | +| `config.py` | 91 | 6/10 | Hardcoded password | +| `store/vector.py` | 273 | 8/10 | Good caching, silent exceptions | +| `store/graph.py` | 168 | 6/10 | Cypher injection, hardcoded password | +| `rag/ingest.py` | 112 | 9/10 | Multi-encoding, robust extraction | +| `rag/chunker.py` | 104 | 8/10 | Good algorithm, clean code | +| `rag/retriever.py` | 128 | 8/10 | Clean RRF implementation | +| `agents/workflows.py` | 110 | 6/10 | Variable interpolation bug | +| `agents/delegation.py` | 51 | 7/10 | Simplistic keyword matching | +| `skills/loader.py` | 77 | 8/10 | Clean, extensible design | +| `server/api.py` | 155 | 6/10 | No auth, CORS *, global state | +| `server/mcp_server.py` | 175 | 6/10 | Method call bug, hardcoded path | +| `server/cli.py` | 56 | 8/10 | Clean Click usage | +| `llm_proxy.py` | 94 | 7/10 | Good LangChain integration | +| `tests/test_smoke.py` | 93 | 4/10 | Not proper tests, no assertions | + +--- + +*Analysis performed by reviewing all 15 source files and configuration in the Langent v2.0.0 repository.* diff --git a/README.md b/README.md index e94a8a8..f597457 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,49 @@ -# 🌌 Langent — Personal Ontology & 3D Knowledge Nebula +# Langent v3 — Personal Ontology & 3D Knowledge Nebula [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) +[![CI](https://github.com/AlexAI-MCP/langent/actions/workflows/ci.yml/badge.svg)](https://github.com/AlexAI-MCP/langent/actions) **Langent** is a RAG (Retrieval-Augmented 
Generation) framework that transforms your local workspace into a 3D cosmic nebula of knowledge. It combines vector embeddings (ChromaDB) with knowledge graphs (Neo4j) to provide a deeply connected AI experience. ![Langent Nebula Showcase](docs/IMG_6158.jpeg) -### 📺 Watch the Demo +### Watch the Demo [![Watch the video](https://img.youtube.com/vi/8HLQO52VO5g/0.jpg)](https://youtu.be/8HLQO52VO5g) ![Langent Nebula Showcase](docs/nebula_full.png) + --- -## ✨ Key Features +## What's New in v3 -- **📂 Auto-Ingestion**: Automatically scans and indexes MD, PDF, TXT, CSV, JSON, and YAML files. -- **🧠 Hybrid RAG**: Merges semantic vector search with graph-based relationship traversal for superior context. -- **🌌 3D Nebula Visualizer**: Explore your knowledge base in an interactive Three.js 3D environment. -- **🔗 Knowledge Linking**: Automatically discovers and creates relationships between your documents and entities. -- **🤖 MCP Integration**: Built-in support for Model Context Protocol (MCP) to plug into Claude Desktop, Antigravity, and more. +- **Security**: Removed all hardcoded credentials, added Cypher injection prevention, optional API key auth +- **Logging**: Replaced all `print()` with structured `logging` module +- **Testing**: Comprehensive pytest suite with 40+ test cases +- **CI/CD**: GitHub Actions for lint + test across Python 3.10/3.11/3.12 +- **Tooling**: ruff (lint), mypy (type check), Dockerfile for containerized deployment +- **API**: Pydantic request models, FastAPI dependency injection, health check endpoint +- **Bug fixes**: Fixed `${}` interpolation bug in workflows, fixed MCP server method call error --- -## 🚀 Installation & Quick Start +## Key Features + +- **Auto-Ingestion**: Automatically scans and indexes MD, PDF, TXT, CSV, JSON, and YAML files. +- **Hybrid RAG**: Merges semantic vector search with graph-based relationship traversal for superior context. 
+- **3D Nebula Visualizer**: Explore your knowledge base in an interactive Three.js 3D environment. +- **Knowledge Linking**: Automatically discovers and creates relationships between your documents and entities. +- **MCP Integration**: Built-in support for Model Context Protocol (MCP) to plug into Claude Desktop, Antigravity, and more. +- **LangGraph Workflows**: Agent state machines with RAG + Graph reasoning pipeline. + +--- + +## Installation & Quick Start ### 1. Install via Pip ```bash -git clone https://github.com/yourusername/langent.git +git clone https://github.com/AlexAI-MCP/langent.git cd langent pip install -e . ``` @@ -43,33 +58,58 @@ Edit `.env` to set your `LANGENT_WORKSPACE` (the folder containing your document ### 3. Ingest & Serve ```bash -langent ingest --path ./samples # Start with our sample data! -langent serve # Open http://localhost:8000 +langent ingest --workspace ./samples # Start with sample data +langent serve # Open http://localhost:8000 +``` + +### 4. Docker (Alternative) + +```bash +docker build -t langent . +docker run -p 8000:8000 -v ./workspace:/app/workspace langent +``` + +--- + +## Development + +```bash +# Install with dev dependencies +pip install -e ".[dev]" + +# Run tests +pytest tests/ -v + +# Lint +ruff check langent/ tests/ + +# Type check +mypy langent/ ``` --- -## � Data Workflow: From Workspace to 3D Nebula +## Data Workflow: From Workspace to 3D Nebula -Langent automates the complex journey from raw raw files to an interactive 3D knowledge universe. +Langent automates the journey from raw files to an interactive 3D knowledge universe. -1. **Gather Data (Workspace)**: Drop your "data lumps" (PDFs, Markdown notes, CSV spreadsheets, Research papers) into your connected `LANGENT_WORKSPACE` folder. +1. **Gather Data (Workspace)**: Drop your PDFs, Markdown notes, CSV spreadsheets, and research papers into your `LANGENT_WORKSPACE` folder. 2. 
**Chunking**: Langent automatically breaks these large files into smaller, semantically meaningful chunks (300-500 tokens). -3. **Vectorization (ChromaDB)**: +3. **Vectorization (ChromaDB)**: - Using local embedding models (e.g., `all-MiniLM-L6-v2`), each chunk is transformed into a high-dimensional vector. - These vectors are stored in **ChromaDB** for lightning-fast semantic retrieval. -4. **3D Projection**: - - Langent uses advanced dimensionality reduction (UMAP) to project these high-dimensional vectors into a **3D Point Cloud**. - - Points that are semantically similar "cluster" together in space, forming the constellations of your knowledge base. +4. **3D Projection**: + - Langent uses UMAP to project high-dimensional vectors into a **3D Point Cloud**. + - Semantically similar points cluster together, forming knowledge constellations. > **Result**: Your messy folder becomes a beautiful, searchable, and navigable 3D cosmic map. --- -## �🛠️ Advanced Setup +## Advanced Setup ### 1. Connecting to Neo4j (Graph Store) -To enable the **Knowledge Graph** features (searching relationships, linking entities), you need a Neo4j instance. +To enable **Knowledge Graph** features, you need a Neo4j instance. - **Option A: Docker (Recommended)** ```bash @@ -87,60 +127,58 @@ To enable the **Knowledge Graph** features (searching relationships, linking ent NEO4J_PASSWORD="your_password" ``` -### 2. MCP Integration (Claude & Antigravity) -Langent acts as an **MCP (Model Context Protocol)** server, allowing AI agents like Claude or Antigravity to use your workspace as their long-term memory. - -- **For Antigravity / Claude Desktop**: - Add the following to your `mcp_config.json`: - ```json - "langent": { - "command": "python", - "args": ["-m", "langent.server.mcp_server"], - "env": { - "LANGENT_WORKSPACE": "/path/to/your/workspace" - } +### 2. 
API Authentication + +Set an API key in your `.env` to protect write endpoints: +```env +API_KEY="your-secret-api-key" +``` +Then pass it via the `X-API-Key` header for protected endpoints (`/api/ingest`, `/api/link`, `/api/graph`). + +### 3. MCP Integration (Claude & Antigravity) +Langent acts as an **MCP server**, allowing AI agents like Claude to use your workspace as long-term memory. + +Add the following to your `mcp_config.json`: +```json +"langent": { + "command": "python", + "args": ["-m", "langent.server.mcp_server"], + "env": { + "LANGENT_WORKSPACE": "/path/to/your/workspace" } - ``` +} +``` -### 🤖 Using Langent with AI Agents (Antigravity, Claude Code) +### Using Langent with AI Agents (Antigravity, Claude Code) Once connected via MCP, you can talk to your workspace as if it's an intelligent entity. -- **Data Ingestion**: +- **Data Ingestion**: > "Langent의 mcp 도구를 사용해서 내 워크스페이스에 있는 새로운 문서들을 인덱싱해줘." -- **Semantic Search**: +- **Semantic Search**: > "내 워크스페이스에서 'AI 미래 전략'과 관련된 내용을 네뷸라에서 검색해서 요약해줘." - **Graph Insight**: > "내 연구 주제인 'AI 에이전트'와 가장 많이 연결된 핵심 키워드들을 그래프로 분석해서 보고서로 만들어줘." -Langent provides the AI with specific tools (`ingest_workspace`, `search_nebula`, `query_graph`) allowing it to act as a 3D Knowledge Librarian. - --- -## 🌌 How to use Nebula 3D Visualizer +## Nebula 3D Visualizer Once you run `langent serve`, navigate to `http://localhost:8000`. -- **Points (Star Dust)**: Each point represents a chunk of your documents. - - 🎨 **Lavender**: Regular markdown/text files. - - 💖 **Hot Pink**: Important entities like "Alex AI" or yourself. -- **Nodes (Planets)**: These are entities extracted into the Neo4j Graph. -- **Lines (Cosmic Strings)**: - - **Weak Lines**: Relationships between graph entities. - - **Dashed Lines**: Semantic links between vector points and graph nodes. +- **Points (Star Dust)**: Each point represents a chunk of your documents. +- **Nodes (Planets)**: Entities extracted into the Neo4j Graph. 
+- **Lines (Cosmic Strings)**: Relationships between graph entities and semantic links. - **Controls**: - **Left Click**: Select a point to see its original content. - **Right Click / Drag**: Rotate the universe. - **Scroll**: Zoom in/out of the knowledge cluster. - - **Search Bar**: Type a keyword (e.g., "Suseo") to highlight matching stars in white. + - **Search Bar**: Type a keyword to highlight matching stars. --- -## 📄 License +## License This project is licensed under the Apache License 2.0. --- -*Created with ❤️ by Alex AI* - - - +*Created by Alex AI* diff --git a/langent/__init__.py b/langent/__init__.py index e686288..1cffc53 100644 --- a/langent/__init__.py +++ b/langent/__init__.py @@ -1,16 +1,15 @@ """ -Langent v2 — RAG Agentic Framework +Langent v3 — RAG Agentic Framework ==================================== -Workspace → ChromaDB → 3D Nebula → MCP/API +Workspace -> ChromaDB -> 3D Nebula -> MCP/API Usage: from langent import Langent agent = Langent(workspace="./my_data") agent.ingest() agent.query("Find related documents about AI") - agent.visualize() """ -__version__ = "2.0.0" +__version__ = "3.0.0" from langent.brain import Langent diff --git a/langent/agents/workflows.py b/langent/agents/workflows.py index d17e4b4..3562669 100644 --- a/langent/agents/workflows.py +++ b/langent/agents/workflows.py @@ -1,25 +1,29 @@ """ -Langent Workflows — LangGraph based agentic logic -=================================================== +Langent Workflows v3 — LangGraph based agentic logic +====================================================== Defines the standard RAG + Graph reasoning state machine. 
""" +import logging from typing import TypedDict, List, Dict, Any, Optional + from langgraph.graph import StateGraph, END from langent.llm_proxy import HostLLMProxy from langent.rag.retriever import HybridRetriever from langent.store.graph import GraphStore +logger = logging.getLogger(__name__) + class AgentState(TypedDict): """LangGraph 워크플로우 상태 정의""" - query: str # 사용자 질문 - context: str # RAG 검색 결과 텍스트 - graph_results: List[Dict] # 그래프 검색 결과 - cypher_query: Optional[str] # 생성된 Cypher 쿼리 - answer: str # 최종 답변 - steps: List[str] # 실행된 단계 기록 - metadata: Dict[str, Any] # 추가 메타데이터 + query: str + context: str + graph_results: List[Dict] + cypher_query: Optional[str] + answer: str + steps: List[str] + metadata: Dict[str, Any] class LangentWorkflows: @@ -42,12 +46,10 @@ def build_rag_graph(self) -> StateGraph: """기본 RAG + Graph 하이브리드 워크플로우 생성""" workflow = StateGraph(AgentState) - # 1. 노드 추가 workflow.add_node("retrieve", self.node_retrieve) workflow.add_node("graph_search", self.node_graph_search) workflow.add_node("generate", self.node_generate) - # 2. 엣지 연결 workflow.set_entry_point("retrieve") workflow.add_edge("retrieve", "graph_search") workflow.add_edge("graph_search", "generate") @@ -72,9 +74,8 @@ def node_graph_search(self, state: AgentState) -> Dict: return {"steps": state.get("steps", []) + ["Skipped graph search (not connected)"]} query = state["query"] - # 하이브리드 리트리버의 로직 재사용 graph_results = self.retriever._graph_search(query) - + return { "graph_results": graph_results, "steps": state.get("steps", []) + [f"Found {len(graph_results)} graph relations"] @@ -85,23 +86,18 @@ def node_generate(self, state: AgentState) -> Dict: query = state["query"] context = state.get("context", "") graph = state.get("graph_results", []) - - graph_str = "\n".join([f"- {g['entity']} -[{g['relation']}]-> {g['related']}" for g in graph]) - - prompt = f"""당신은 우수한 AI 비서 Langent입니다. -제공된 컨텍스트와 지식 그래프 정보를 바탕으로 사용자의 질문에 답변하세요. 
-[사용자 질문] -${query} - -[벡터 컨텍스트] -${context} - -[지식 그래프 정보] -${graph_str} + graph_str = "\n".join([f"- {g['entity']} -[{g['relation']}]-> {g['related']}" for g in graph]) -지식에 근거하여 상세하고 친절하게 답변하세요. 만약 모르는 내용이라면 확실하지 않다고 솔직하게 답변하세요. -""" + prompt = ( + "당신은 우수한 AI 비서 Langent입니다. " + "제공된 컨텍스트와 지식 그래프 정보를 바탕으로 사용자의 질문에 답변하세요.\n\n" + f"[사용자 질문]\n{query}\n\n" + f"[벡터 컨텍스트]\n{context}\n\n" + f"[지식 그래프 정보]\n{graph_str}\n\n" + "지식에 근거하여 상세하고 친절하게 답변하세요. " + "만약 모르는 내용이라면 확실하지 않다고 솔직하게 답변하세요." + ) response = self.llm.invoke(prompt) return { "answer": response.content, diff --git a/langent/brain.py b/langent/brain.py index cec8976..2eda46f 100644 --- a/langent/brain.py +++ b/langent/brain.py @@ -1,7 +1,7 @@ """ -Langent Brain v2 — Main Orchestrator +Langent Brain v3 — Main Orchestrator ====================================== -Workspace → ChromaDB → 3D Nebula → MCP/API +Workspace -> ChromaDB -> 3D Nebula -> MCP/API The central hub connecting all subsystems: - Workspace file watching & ingestion @@ -10,6 +10,7 @@ - RAG pipeline for intelligent retrieval - MCP/API for Antigravity/Claude Code integration """ +import logging from typing import Optional, List, Dict, Any from pathlib import Path @@ -19,19 +20,19 @@ from langent.rag.ingest import DocumentIngestor from langent.rag.chunker import SmartChunker from langent.rag.retriever import HybridRetriever - - from langent.llm_proxy import get_llm from langent.agents.workflows import LangentWorkflows from langent.agents.delegation import SubAgentManager from langent.skills.loader import SkillLoader +logger = logging.getLogger(__name__) + class Langent: """ - Langent v2 — 워크스페이스 기반 RAG Agentic Framework + Langent v3 — 워크스페이스 기반 RAG Agentic Framework - 1. 워크스페이스 파일들을 스캔·수집 + 1. 워크스페이스 파일들을 스캔/수집 2. ChromaDB에 벡터화하여 저장 3. 시맨틱 검색 + 그래프 검색 하이브리드 RAG 4. 
3D 성운 시각화 데이터 제공 @@ -42,8 +43,8 @@ class Langent: def __init__( self, - workspace: str = None, - config_path: str = None, + workspace: Optional[str] = None, + config_path: Optional[str] = None, neo4j: bool = True, llm_mode: str = "fake", verbose: bool = True, @@ -51,6 +52,9 @@ def __init__( self.config = LangentConfig(config_path) self.verbose = verbose + if verbose: + logging.basicConfig(level=logging.INFO, format="%(name)s | %(message)s") + # Workspace ws = workspace or self.config.workspace.path self.workspace = Path(ws) @@ -66,19 +70,7 @@ def __init__( ) # 3. Graph Store (optional) - self.graph = None - if neo4j: - try: - self.graph = GraphStore( - uri=self.config.graph.uri, - user=self.config.graph.user, - password=self.config.graph.password, - ) - if not self.graph.test_connection(): - self._log("⚠ Neo4j unavailable, graph features disabled") - self.graph = None - except Exception: - self.graph = None + self.graph = self._init_graph(neo4j) # 4. RAG Pipeline self.ingestor = DocumentIngestor( @@ -100,7 +92,7 @@ def __init__( self.workflows = LangentWorkflows( llm=self.llm, retriever=self.retriever, - graph_store=self.graph + graph_store=self.graph, ) self._rag_graph = self.workflows.build_rag_graph() @@ -111,29 +103,46 @@ def __init__( ) self.skills.load_all() - self._log(f"[Brain] Langent v2.0 initialized") - self._log(f" Workspace: {self.workspace}") - self._log(f" LLM Mode: {self.llm.mode}") - self._log(f" Skills loaded: {len(self.skills.skills)}") + logger.info("Langent v3.0 initialized") + logger.info(" Workspace: %s", self.workspace) + logger.info(" LLM Mode: %s", self.llm.mode) + logger.info(" Skills loaded: %d", len(self.skills.skills)) + + def _init_graph(self, neo4j: bool) -> Optional[GraphStore]: + """Initialize graph store, returning None if unavailable.""" + if not neo4j: + return None + try: + graph = GraphStore( + uri=self.config.graph.uri, + user=self.config.graph.user, + password=self.config.graph.password, + ) + if not graph.test_connection(): 
+ logger.warning("Neo4j unavailable, graph features disabled") + return None + return graph + except Exception as e: + logger.warning("Neo4j init failed: %s", e) + return None # ─── Agent Chat ─────────────────────────────────── def chat(self, question: str) -> Dict[str, Any]: """LangGraph RAG 워크플로우를 실행합니다.""" - self._log(f"[Chat] Chat request: {question}") + logger.info("Chat request: %s", question[:80]) initial_state = { "query": question, "steps": ["Initialized chat"], - "metadata": {} + "metadata": {}, } - result = self._rag_graph.invoke(initial_state) - return result + return self._rag_graph.invoke(initial_state) # ─── Ingest ─────────────────────────────────────── - def ingest(self, path: str = None) -> Dict[str, int]: + def ingest(self, path: Optional[str] = None) -> Dict[str, int]: """ - 워크스페이스 파일들을 수집 → 청킹 → 벡터 DB 저장 + 워크스페이스 파일들을 수집 -> 청킹 -> 벡터 DB 저장 Returns: {"files_scanned", "chunks_created", "vectors_added"} @@ -146,24 +155,21 @@ def ingest(self, path: str = None) -> Dict[str, int]: ignore=self.config.workspace.ignore, ) - self._log("📥 Ingesting workspace files...") + logger.info("Ingesting workspace files...") - # 1. Scan & extract documents = ingestor.ingest_all() - self._log(f" Found {len(documents)} files") + logger.info("Found %d files", len(documents)) if not documents: return {"files_scanned": 0, "chunks_created": 0, "vectors_added": 0} - # 2. Chunk chunks = self.chunker.chunk_documents(documents) - self._log(f" Created {len(chunks)} chunks") + logger.info("Created %d chunks", len(chunks)) - # 3. 
Store in vector DB texts = [c["text"] for c in chunks] metas = [c["metadata"] for c in chunks] added = self.vector.add_documents(texts, metas) - self._log(f" Added {added} new vectors (total: {self.vector.count()})") + logger.info("Added %d new vectors (total: %d)", added, self.vector.count()) return { "files_scanned": len(documents), @@ -195,17 +201,16 @@ def get_context(self, question: str, top_k: int = 5) -> str: def get_nebula_data(self, limit: int = 50000) -> Dict[str, Any]: """ 3D 성운 시각화용 데이터를 반환합니다. - Returns: {points: [...], graph: {nodes, edges}, cross_links, stats} + Returns: {points, graph, cross_links, stats} """ points = self.vector.get_3d_positions(limit=limit) - graph_data = {"nodes": [], "edges": []} - cross_links = [] - + graph_data: Dict[str, Any] = {"nodes": [], "edges": []} + cross_links: List[Dict] = [] + if self.graph: try: graph_data = self.graph.export_for_viz(limit=1000) - # Fetch cross-links (Chunk to Entity) links_raw = self.graph.run_cypher( "MATCH (c:Chunk)-[:MENTIONS]->(e) " "RETURN c.id AS chunk_id, id(e) AS node_id LIMIT 2000" @@ -214,8 +219,8 @@ def get_nebula_data(self, limit: int = 50000) -> Dict[str, Any]: {"source": str(l["chunk_id"]), "target": str(l["node_id"])} for l in links_raw ] - except Exception: - pass + except Exception as e: + logger.warning("Graph viz export failed: %s", e) return { "points": points, @@ -235,7 +240,7 @@ def search_nebula(self, query: str, top_k: int = 10) -> Dict: # ─── Graph Operations ───────────────────────────── - def graph_query(self, cypher: str, params: Dict = None) -> List[Dict]: + def graph_query(self, cypher: str, params: Optional[Dict] = None) -> List[Dict]: """Neo4j Cypher 쿼리 실행""" if not self.graph: return [{"error": "Neo4j not connected"}] @@ -248,44 +253,37 @@ def graph_schema(self) -> Dict: return self.graph.get_schema() def auto_link(self) -> Dict[str, int]: - """ - 벡터 청크와 그래프 노드를 자동으로 연결합니다. - 1. 그래프에서 이름(name) 속성이 있는 모든 노드 추출 - 2. 벡터 청크 텍스트에서 해당 이름이 포함되어 있는지 검색 - 3. 
일치할 경우 Neo4j에 (:Chunk)-[:MENTIONS]->(Node) 관계 생성 - """ + """벡터 청크와 그래프 노드를 자동으로 연결합니다.""" if not self.graph: return {"error": "Neo4j not connected"} - self._log("🔗 Auto-linking vector chunks to graph entities...") - - # 1. Get entities from graph + logger.info("Auto-linking vector chunks to graph entities...") + entities = self.graph.run_cypher( - "MATCH (n) WHERE n.name IS NOT NULL RETURN id(n) AS id, n.name AS name, labels(n)[0] AS label" + "MATCH (n) WHERE n.name IS NOT NULL " + "RETURN id(n) AS id, n.name AS name, labels(n)[0] AS label" ) entity_map = {e["name"].lower(): e for e in entities if e["name"]} - self._log(f" Found {len(entity_map)} unique entities in graph") + logger.info("Found %d unique entities in graph", len(entity_map)) - # 2. Get chunks from vector store data = self.vector.get_all_embeddings(limit=10000) chunks = [] for i in range(len(data["ids"])): chunks.append({ "id": data["ids"][i], - "text": data["documents"][i].lower() + "text": data["documents"][i].lower(), }) - # 3. 
Match and link linked_count = 0 for chunk in chunks: mentions = [] for name, entity in entity_map.items(): - if len(name) < 2: continue + if len(name) < 2: + continue if name in chunk["text"]: mentions.append(entity["id"]) - + if mentions: - # Create Chunk node and relationships self.graph.run_cypher( "MERGE (c:Chunk {id: $cid})", {"cid": chunk["id"]} ) @@ -293,13 +291,13 @@ def auto_link(self) -> Dict[str, int]: self.graph.run_cypher( "MATCH (c:Chunk {id: $cid}), (e) WHERE id(e) = $eid " "MERGE (c)-[:MENTIONS]->(e)", - {"cid": chunk["id"], "eid": eid} + {"cid": chunk["id"], "eid": eid}, ) linked_count += 1 if linked_count % 100 == 0: - self._log(f" Linked {linked_count} chunks...") + logger.info("Linked %d chunks...", linked_count) - self._log(f"✅ Linking complete: {linked_count} chunks linked") + logger.info("Linking complete: %d chunks linked", linked_count) return {"chunks_linked": linked_count} # ─── Status ─────────────────────────────────────── @@ -307,6 +305,7 @@ def auto_link(self) -> Dict[str, int]: def status(self) -> Dict[str, Any]: """시스템 상태를 반환합니다.""" return { + "version": "3.0.0", "workspace": str(self.workspace), "vector_count": self.vector.count(), "graph_connected": self.graph is not None, @@ -314,17 +313,10 @@ def status(self) -> Dict[str, Any]: "collections": self.vector.list_collections(), } - def _log(self, msg: str): - if self.verbose: - print(msg) - def close(self): if self.graph: self.graph.close() def __repr__(self): - return ( - f"" - ) + g = "connected" if self.graph else "disabled" + return f"" diff --git a/langent/config.py b/langent/config.py index f3ba19b..c215897 100644 --- a/langent/config.py +++ b/langent/config.py @@ -1,13 +1,16 @@ """ -Langent Config — Settings from .env + YAML +Langent Config v3 — Settings from .env + YAML """ -import os +import logging from pathlib import Path from typing import Optional, List -from pydantic import BaseModel, Field + +from pydantic import BaseModel from pydantic_settings import BaseSettings 
import yaml +logger = logging.getLogger(__name__) + class VectorStoreConfig(BaseModel): provider: str = "chromadb" @@ -20,7 +23,7 @@ class GraphStoreConfig(BaseModel): provider: str = "neo4j" uri: str = "bolt://localhost:7687" user: str = "neo4j" - password: str = "yw02280228" + password: str = "password" class RAGConfig(BaseModel): @@ -54,7 +57,8 @@ class LangentSettings(BaseSettings): embedding_model: str = "all-MiniLM-L6-v2" api_host: str = "0.0.0.0" api_port: int = 8000 - llm_mode: str = "fake" # Default to fake for safety + api_key: str = "" + llm_mode: str = "fake" model_config = {"env_file": ".env", "env_file_encoding": "utf-8", "extra": "ignore"} @@ -64,7 +68,7 @@ class LangentConfig: def __init__(self, config_path: Optional[str] = None): self.env = LangentSettings() - self._yaml = {} + self._yaml: dict = {} if config_path: self._load_yaml(config_path) else: @@ -86,5 +90,8 @@ def __init__(self, config_path: Optional[str] = None): self.visualizer = VisualizerConfig(**self._yaml.get("visualizer", {})) def _load_yaml(self, path: str): - with open(path, "r", encoding="utf-8") as f: - self._yaml = yaml.safe_load(f) or {} + try: + with open(path, "r", encoding="utf-8") as f: + self._yaml = yaml.safe_load(f) or {} + except Exception as e: + logger.warning("Failed to load config YAML %s: %s", path, e) diff --git a/langent/llm_proxy.py b/langent/llm_proxy.py index ca2c122..3930323 100644 --- a/langent/llm_proxy.py +++ b/langent/llm_proxy.py @@ -1,6 +1,6 @@ """ -HostLLMProxy — LLM access without direct API keys -=================================================== +HostLLMProxy v3 — LLM access without direct API keys +====================================================== Leverages host AI (Antigravity/Claude Code) via MCP or CLI, or uses local models/proxies. 
@@ -10,23 +10,23 @@
 - "ollama": Local Ollama API (localhost:11434)
 - "proxy": LiteLLM or OpenClaw-style OAuth proxy
 """
-import os
-import json
-from typing import List, Dict, Any, Optional, Union
+import logging
+from typing import List, Any, Optional
+
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import BaseMessage, AIMessage, HumanMessage, SystemMessage
 from langchain_core.outputs import ChatResult, ChatGeneration
 
+logger = logging.getLogger(__name__)
+
 
 class HostLLMProxy(BaseChatModel):
-    """
-    LangChain compatible ChatModel that proxies calls to host AI.
-    """
-
-    mode: str = "fake"  # fake | ollama | mcp | proxy
+    """LangChain compatible ChatModel that proxies calls to host AI."""
+
+    mode: str = "fake"
     model_name: str = "host-llm"
     temperature: float = 0.7
-    
+
     def _generate(
         self,
         messages: List[BaseMessage],
@@ -35,19 +35,23 @@
         **kwargs: Any,
     ) -> ChatResult:
         """메시지를 처리하고 응답을 생성합니다."""
         last_msg = messages[-1].content if messages else ""
-        
+
         if self.mode == "fake":
             response_text = self._fake_response(last_msg)
         elif self.mode == "ollama":
             response_text = self._ollama_response(messages)
         elif self.mode == "mcp":
-            # 이 모드는 프레임워크가 MCP 도구로 실행될 때,
-            # 호스트 AI에게 "네가 이 답변을 생성해줘"라고 요청하는 마커를 반환합니다.
-            response_text = f"[HOST_AI_REQUIRED: Please process this request based on the following context context: {last_msg}]"
+            response_text = (
+                f"[HOST_AI_REQUIRED: Please process this request "
+                f"based on the following context: {last_msg}]"
+            )
         else:
-            response_text = f"Mode '{self.mode}' is not yet implemented. Using fake response: Hello! I am Langent v2."
+            response_text = (
+                f"Mode '{self.mode}' is not yet implemented. "
+                f"Using fake response: Hello! I am Langent v3."
+ ) message = AIMessage(content=response_text) generation = ChatGeneration(message=message) @@ -61,33 +64,49 @@ def _fake_response(self, prompt: str) -> str: """시뮬레이션 응답 (테스트용)""" p_lower = prompt.lower() if "안녕" in p_lower or "hello" in p_lower: - return "안녕하세요! Langent v2 프레임워크의 에이전트입니다. 무엇을 도와드릴까요?" + return "안녕하세요! Langent v3 프레임워크의 에이전트입니다. 무엇을 도와드릴까요?" if "치킨" in p_lower: - return "서울에 위치한 60계치킨, 처갓집양념치킨 등 다양한 치킨집 정보가 벡터 DB에 저장되어 있습니다. 특정 지역을 말씀하시면 상세히 찾아드릴게요." + return ( + "서울에 위치한 60계치킨, 처갓집양념치킨 등 다양한 치킨집 정보가 " + "벡터 DB에 저장되어 있습니다. 특정 지역을 말씀하시면 상세히 찾아드릴게요." + ) if "cypher" in p_lower or "match" in p_lower: return "MATCH (n:Person) RETURN n LIMIT 5" - - return f"Langent 프레임워크가 '{prompt}'에 대한 처리를 준비 중입니다. 현재는 테스트 모드(fake)입니다." + + return ( + f"Langent 프레임워크가 요청을 처리 중입니다. " + f"현재는 테스트 모드(fake)입니다." + ) def _ollama_response(self, messages: List[BaseMessage]) -> str: """로컬 Ollama를 통한 응답""" import requests + url = "http://localhost:11434/api/chat" payload = { "model": "llama3", "messages": [ - {"role": "system" if isinstance(m, SystemMessage) else "user" if isinstance(m, HumanMessage) else "assistant", - "content": m.content} + { + "role": ( + "system" if isinstance(m, SystemMessage) + else "user" if isinstance(m, HumanMessage) + else "assistant" + ), + "content": m.content, + } for m in messages ], - "stream": False + "stream": False, } try: - resp = requests.post(url, json=payload) + resp = requests.post(url, json=payload, timeout=60) + resp.raise_for_status() return resp.json()["message"]["content"] except Exception as e: + logger.error("Ollama connection error: %s", e) return f"Ollama connection error: {e}" + def get_llm(mode: str = "fake", **kwargs): """설정에 따른 LLM 인스턴스 반환""" return HostLLMProxy(mode=mode, **kwargs) diff --git a/langent/rag/ingest.py b/langent/rag/ingest.py index 611eefb..2f17100 100644 --- a/langent/rag/ingest.py +++ b/langent/rag/ingest.py @@ -1,13 +1,16 @@ """ -DocumentIngestor — Multi-format file ingestion 
-================================================ +DocumentIngestor v3 — Multi-format file ingestion +=================================================== Watches workspace folder, extracts text from PDF/MD/TXT/CSV, -feeds into chunker → vector store. +feeds into chunker -> vector store. """ +import logging import os from pathlib import Path from typing import List, Dict, Any, Optional +logger = logging.getLogger(__name__) + class DocumentIngestor: """ @@ -17,8 +20,8 @@ class DocumentIngestor: SUPPORTED = {".md", ".txt", ".pdf", ".csv", ".json", ".yaml", ".yml"} - def __init__(self, workspace_path: str, extensions: List[str] = None, - ignore: List[str] = None): + def __init__(self, workspace_path: str, extensions: Optional[List[str]] = None, + ignore: Optional[List[str]] = None): self.workspace = Path(workspace_path) self.extensions = set(extensions) if extensions else self.SUPPORTED self.ignore = set(ignore) if ignore else { @@ -29,7 +32,6 @@ def scan_files(self) -> List[Path]: """워크스페이스에서 지원 파일 목록을 스캔합니다.""" files = [] for root, dirs, filenames in os.walk(self.workspace): - # Skip ignored directories dirs[:] = [d for d in dirs if d not in self.ignore] for fname in filenames: fpath = Path(root) / fname @@ -49,7 +51,7 @@ def extract_text(self, file_path: Path) -> Optional[str]: elif suffix == ".csv": return self._read_csv(file_path) except Exception as e: - print(f" [Warning] Error extracting {file_path.name}: {e}") + logger.warning("Error extracting %s: %s", file_path.name, e) return None def ingest_all(self) -> List[Dict[str, Any]]: @@ -93,7 +95,7 @@ def _read_pdf(self, path: Path) -> str: pages = [page.extract_text() or "" for page in reader.pages] return "\n\n".join(pages) except ImportError: - print(" ⚠ pypdf not installed. Run: pip install pypdf") + logger.warning("pypdf not installed. 
Run: pip install pypdf") return "" def _read_csv(self, path: Path) -> str: @@ -106,6 +108,6 @@ def _read_csv(self, path: Path) -> str: if i > 200: break rows.append(" | ".join(row)) - except Exception: - pass + except Exception as e: + logger.warning("CSV read failed for %s: %s", path.name, e) return "\n".join(rows) diff --git a/langent/rag/retriever.py b/langent/rag/retriever.py index 3deba5a..a094eb1 100644 --- a/langent/rag/retriever.py +++ b/langent/rag/retriever.py @@ -1,12 +1,16 @@ """ -HybridRetriever — Vector + Graph combined search -=================================================== +HybridRetriever v3 — Vector + Graph combined search +===================================================== Merges ChromaDB semantic search with Neo4j graph traversal. """ +import logging from typing import List, Dict, Any, Optional + from langent.store.vector import VectorStore from langent.store.graph import GraphStore +logger = logging.getLogger(__name__) + class HybridRetriever: """ @@ -36,16 +40,12 @@ def search( 2. (옵션) Graph DB에서 관련 엔티티 검색 3. RRF로 결과 병합 """ - # 1. Vector search vector_results = self.vector.search(query, top_k=top_k * 2) if not use_graph or not self.graph: return vector_results[:top_k] - # 2. Graph search (keyword extraction → graph lookup) graph_context = self._graph_search(query) - - # 3. 
Merge combined = self._rrf_merge(vector_results, graph_context, top_k) return combined @@ -54,7 +54,6 @@ def _graph_search(self, query: str) -> List[Dict[str, str]]: if not self.graph: return [] - # Simple keyword extraction (split by space, filter short words) keywords = [w for w in query.split() if len(w) >= 2] results = [] @@ -73,7 +72,8 @@ def _graph_search(self, query: str) -> List[Dict[str, str]]: "relation": str(n.get("relation", "")), "related": str(n.get("related", "")), }) - except Exception: + except Exception as e: + logger.debug("Graph search for '%s' failed: %s", kw, e) continue return results @@ -86,8 +86,7 @@ def _rrf_merge( """Reciprocal Rank Fusion으로 결과 병합""" k = 60 # RRF constant - scored = {} - # Vector results + scored: Dict[str, Dict] = {} for rank, vr in enumerate(vector_results): doc_id = vr["id"] rrf_score = self.vector_weight / (k + rank + 1) @@ -97,12 +96,7 @@ def _rrf_merge( "graph_context": [], } - # Graph context boost: if a vector result mentions a graph entity, boost it if graph_context: - context_str = " ".join( - f"{g['entity']} {g['relation']} {g['related']}" - for g in graph_context - ) for doc_id, entry in scored.items(): doc_text = entry.get("document", "").lower() for g in graph_context: @@ -111,7 +105,6 @@ def _rrf_merge( entry["rrf_score"] += self.graph_weight / k entry["graph_context"].append(g) - # Sort by RRF score ranked = sorted(scored.values(), key=lambda x: x["rrf_score"], reverse=True) return ranked[:top_k] diff --git a/langent/server/api.py b/langent/server/api.py index 9d7567f..e67c9cf 100644 --- a/langent/server/api.py +++ b/langent/server/api.py @@ -1,24 +1,22 @@ """ -Langent API Server — FastAPI + WebSocket for 3D Nebula -======================================================= +Langent API Server v3 — FastAPI + WebSocket for 3D Nebula +========================================================== Serves REST API + 3D visualizer + real-time WebSocket updates. 
+Features: Dependency injection, optional API key auth, Pydantic models. """ -import os -import sys import json -import asyncio +import logging from pathlib import Path -from typing import Optional +from typing import Optional, List -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from fastapi import FastAPI, WebSocket, WebSocketDisconnect -from fastapi.staticfiles import StaticFiles +from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Depends, Header, HTTPException, Query from fastapi.responses import FileResponse, JSONResponse from fastapi.middleware.cors import CORSMiddleware -import uvicorn +from pydantic import BaseModel + +logger = logging.getLogger(__name__) -app = FastAPI(title="Langent Nebula", version="2.0.0") +app = FastAPI(title="Langent Nebula", version="3.0.0") app.add_middleware( CORSMiddleware, @@ -27,48 +25,88 @@ allow_headers=["*"], ) -# Lazy Langent instance +# ─── Dependency injection ──────────────────────────── + _langent = None + def get_langent(): + """FastAPI dependency that provides the Langent instance.""" global _langent if _langent is None: from langent.brain import Langent - # We don't pass workspace here because Langent() will - # automatically look at LANGENT_WORKSPACE env var or .env file _langent = Langent(verbose=True) return _langent + +def verify_api_key(x_api_key: Optional[str] = Header(None)): + """Optional API key verification. 
Skips if API_KEY is not configured.""" + from langent.config import LangentSettings + settings = LangentSettings() + if settings.api_key and x_api_key != settings.api_key: + raise HTTPException(status_code=401, detail="Invalid or missing API key") + + +# ─── Request/Response Models ───────────────────────── + +class IngestRequest(BaseModel): + path: Optional[str] = None + + +class GraphQueryRequest(BaseModel): + cypher: str + + +class QueryResult(BaseModel): + score: float + source: str + preview: str + id: str + + # ─── Static files (visualizer) ─────────────────────── VIZ_DIR = Path(__file__).parent.parent / "visualizer" + @app.get("/") async def index(): return FileResponse(str(VIZ_DIR / "index.html")) + @app.get("/app.js") async def app_js(): return FileResponse(str(VIZ_DIR / "app.js"), media_type="application/javascript") + @app.get("/style.css") async def style_css(): return FileResponse(str(VIZ_DIR / "style.css"), media_type="text/css") + # ─── REST API ───────────────────────────────────────── @app.get("/api/status") -async def status(): - return get_langent().status() +async def status(langent=Depends(get_langent)): + return langent.status() + @app.post("/api/ingest") -async def ingest(path: Optional[str] = None): - result = get_langent().ingest(path=path) - return result +async def ingest( + request: IngestRequest, + langent=Depends(get_langent), + _auth=Depends(verify_api_key), +): + return langent.ingest(path=request.path) + @app.get("/api/query") -async def query(q: str, top_k: int = 5): - results = get_langent().query(q, top_k=top_k) +async def query( + q: str, + top_k: int = Query(default=5, ge=1, le=100), + langent=Depends(get_langent), +): + results = langent.query(q, top_k=top_k) output = [] for r in results: output.append({ @@ -79,76 +117,94 @@ async def query(q: str, top_k: int = 5): }) return output + @app.get("/api/context") -async def context(q: str, top_k: int = 5): - return {"context": get_langent().get_context(q, top_k=top_k)} +async def 
context( + q: str, + top_k: int = Query(default=5, ge=1, le=100), + langent=Depends(get_langent), +): + return {"context": langent.get_context(q, top_k=top_k)} + @app.post("/api/link") -async def link_knowledge(): - """벡터 청크와 그래프 노드 연결 트리거""" - return get_langent().auto_link() +async def link_knowledge( + langent=Depends(get_langent), + _auth=Depends(verify_api_key), +): + return langent.auto_link() + @app.get("/api/shops/suseo") async def get_suseo_shops(): """수서동 상가 지리 정보 반환""" - import json - path = os.path.join(os.path.dirname(__file__), "..", "visualizer", "suseo_shops.json") - if os.path.exists(path): - with open(path, "r", encoding="utf-8") as f: + shops_path = Path(__file__).parent.parent / "visualizer" / "suseo_shops.json" + if shops_path.exists(): + with open(shops_path, "r", encoding="utf-8") as f: return json.load(f) return [] + @app.post("/api/graph") -async def graph_query(cypher: str): - return get_langent().graph_query(cypher) +async def graph_query( + request: GraphQueryRequest, + langent=Depends(get_langent), + _auth=Depends(verify_api_key), +): + return langent.graph_query(request.cypher) + # ─── 3D Nebula Data ────────────────────────────────── @app.get("/api/nebula") -async def nebula_data(limit: int = 50000): - """3D 시각화용 전체 데이터""" - return get_langent().get_nebula_data(limit=limit) +async def nebula_data( + limit: int = Query(default=50000, ge=1, le=100000), + langent=Depends(get_langent), +): + return langent.get_nebula_data(limit=limit) + @app.get("/api/nebula/search") -async def nebula_search(q: str, top_k: int = 10): - """검색 결과 + 3D 좌표""" - return get_langent().search_nebula(q, top_k=top_k) +async def nebula_search( + q: str, + top_k: int = Query(default=10, ge=1, le=100), + langent=Depends(get_langent), +): + return langent.search_nebula(q, top_k=top_k) + # ─── WebSocket (real-time updates) ──────────────────── -clients = set() +clients: set = set() + @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): await 
websocket.accept()
     clients.add(websocket)
+    langent = get_langent()
     try:
         while True:
             data = await websocket.receive_text()
             msg = json.loads(data)
 
             if msg.get("type") == "search":
-                result = get_langent().search_nebula(msg["query"], top_k=10)
+                result = langent.search_nebula(msg["query"], top_k=10)
                 await websocket.send_json({"type": "search_result", "data": result})
 
             elif msg.get("type") == "ingest":
-                result = get_langent().ingest(path=msg.get("path"))
-                # Broadcast new data to all clients
-                nebula = get_langent().get_nebula_data()
-                for client in clients:
+                result = langent.ingest(path=msg.get("path"))
+                nebula = langent.get_nebula_data()
+                for client in list(clients):
                     try:
                         await client.send_json({"type": "nebula_update", "data": nebula})
                     except Exception:
-                        pass
+                        clients.discard(client)
                 await websocket.send_json({"type": "ingest_result", "data": result})
     except WebSocketDisconnect:
         clients.discard(websocket)
 
 
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--host", type=str, default="0.0.0.0")
-    args = parser.parse_args()
-
-    print(f"\n[Nebula] Langent Nebula Server starting on http://localhost:{args.port}")
-    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
+# ─── Health check ─────────────────────────────────────
+
+@app.get("/health")
+async def health():
+    return {"status": "ok", "version": "3.0.0"}
diff --git a/langent/server/cli.py b/langent/server/cli.py
index a73ab43..7dfea7f 100644
--- a/langent/server/cli.py
+++ b/langent/server/cli.py
@@ -1,13 +1,16 @@
+"""Langent v3 CLI — RAG Agentic Framework"""
+import os
+
 import click
 import uvicorn
-import os
-from langent.brain import Langent
+
 
 @click.group()
 def main():
-    """Langent v2 CLI - RAG Agentic Framework"""
+    """Langent v3 CLI - RAG Agentic Framework"""
     pass
 
+
 @main.command()
 @click.option("--port", default=8000, help="Port to run the server on")
 @click.option("--host", default="0.0.0.0", help="Host to run
the server on") @@ -16,40 +19,62 @@ def serve(port, host, workspace): """Start the Langent Nebula server""" if workspace: os.environ["LANGENT_WORKSPACE"] = workspace - - # We import app here to avoid loading everything just for --help + from langent.server.api import app - click.echo(f"Starting Langent Nebula server on http://{host}:{port}") + click.echo(f"Starting Langent Nebula v3 on http://{host}:{port}") uvicorn.run(app, host=host, port=port) + @main.command() @click.option("--workspace", default=None, help="Path to the workspace to ingest") def ingest(workspace): """Ingest workspace files into vector store""" + from langent.brain import Langent agent = Langent(workspace=workspace) - click.echo("📥 Starting workspace ingestion...") + click.echo("Starting workspace ingestion...") result = agent.ingest() - click.echo(f"✅ Ingestion complete: {result['files_scanned']} files, {result['vectors_added']} vectors added.") + click.echo( + f"Ingestion complete: {result['files_scanned']} files, " + f"{result['vectors_added']} vectors added." 
+ ) + @main.command() def link(): """Link vector chunks and graph entities""" + from langent.brain import Langent agent = Langent() - click.echo("🔗 Starting knowledge linking...") + click.echo("Starting knowledge linking...") result = agent.auto_link() - click.echo(f"✅ Linking complete: {result.get('chunks_linked', 0)} chunks linked.") + click.echo(f"Linking complete: {result.get('chunks_linked', 0)} chunks linked.") + @main.command() @click.argument("question") def query(question): """Query the Langent knowledge base""" + from langent.brain import Langent agent = Langent() - click.echo(f"🔍 Searching for: {question}") + click.echo(f"Searching for: {question}") results = agent.query(question) for i, r in enumerate(results): click.echo(f"\n[{i+1}] Score: {r.get('score', 0):.4f}") click.echo(f"Source: {r.get('metadata', {}).get('source', 'unknown')}") click.echo(f"Content: {r.get('document', '')[:200]}...") + +@main.command() +def status(): + """Show Langent system status""" + from langent.brain import Langent + agent = Langent() + s = agent.status() + click.echo(f"Langent v{s.get('version', '3.0.0')}") + click.echo(f" Workspace: {s['workspace']}") + click.echo(f" Vectors: {s['vector_count']}") + click.echo(f" Graph: {'connected' if s['graph_connected'] else 'disabled'}") + click.echo(f" Collections: {s['collections']}") + + if __name__ == "__main__": main() diff --git a/langent/server/mcp_server.py b/langent/server/mcp_server.py index 0cfd4e2..9c25f92 100644 --- a/langent/server/mcp_server.py +++ b/langent/server/mcp_server.py @@ -1,6 +1,6 @@ """ -Langent MCP Server — Tool Provider for Antigravity/Claude Code -================================================================ +Langent MCP Server v3 — Tool Provider for Antigravity/Claude Code +=================================================================== Exposes Langent as MCP tools. No API key needed — Antigravity's built-in LLM handles reasoning. 
@@ -8,37 +8,35 @@ { "langent": { "command": "python", - "args": ["-m", "server.mcp_server"], - "cwd": "c:\\Users\\daewooenc\\workspace\\Ontology\\Langent" + "args": ["-m", "langent.server.mcp_server"], + "env": { + "LANGENT_WORKSPACE": "/path/to/your/workspace" + } } } """ import json -import sys import os import asyncio - -# Add parent to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import logging from mcp.server.models import InitializationOptions import mcp.types as types from mcp.server import NotificationOptions, Server from mcp.server.stdio import stdio_server +logger = logging.getLogger(__name__) + server = Server("langent") -# Lazy-init Langent instance _langent = None + def get_langent(): global _langent if _langent is None: from langent.brain import Langent - workspace = os.environ.get( - "LANGENT_WORKSPACE", - r"c:\Users\daewooenc\workspace\Ontology" - ) + workspace = os.environ.get("LANGENT_WORKSPACE", ".") _langent = Langent(workspace=workspace, verbose=False) return _langent @@ -54,8 +52,8 @@ async def handle_list_tools() -> list[types.Tool]: "type": "object", "properties": { "path": {"type": "string", "description": "수집할 경로 (선택)"} - } - } + }, + }, ), types.Tool( name="langent_query", @@ -64,10 +62,10 @@ async def handle_list_tools() -> list[types.Tool]: "type": "object", "properties": { "query": {"type": "string", "description": "검색 쿼리"}, - "top_k": {"type": "number", "default": 5} + "top_k": {"type": "number", "default": 5}, }, - "required": ["query"] - } + "required": ["query"], + }, ), types.Tool( name="langent_chat", @@ -77,8 +75,8 @@ async def handle_list_tools() -> list[types.Tool]: "properties": { "message": {"type": "string", "description": "사용자 메시지"} }, - "required": ["message"] - } + "required": ["message"], + }, ), types.Tool( name="langent_graph", @@ -88,13 +86,13 @@ async def handle_list_tools() -> list[types.Tool]: "properties": { "cypher": {"type": "string", "description": "Cypher 쿼리"} }, - 
"required": ["cypher"] - } + "required": ["cypher"], + }, ), types.Tool( name="langent_status", description="Langent 프레임워크의 현재 상태(벡터 수, 그래프 연결 등)를 확인합니다.", - inputSchema={"type": "object", "properties": {}} + inputSchema={"type": "object", "properties": {}}, ), types.Tool( name="langent_nebula", @@ -103,8 +101,8 @@ async def handle_list_tools() -> list[types.Tool]: "type": "object", "properties": { "query": {"type": "string", "description": "하이라이트할 검색어 (선택)"} - } - } + }, + }, ), ] @@ -115,26 +113,27 @@ async def handle_call_tool( ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: """도구 호출을 처리합니다.""" agent = get_langent() - + arguments = arguments or {} + if name == "langent_ingest": path = arguments.get("path") result = agent.ingest(path=path) return [types.TextContent(type="text", text=json.dumps(result, indent=2, ensure_ascii=False))] elif name == "langent_query": - query = arguments.get("query") + query = arguments.get("query", "") top_k = int(arguments.get("top_k", 5)) result = agent.query(query, top_k=top_k) return [types.TextContent(type="text", text=json.dumps(result, indent=2, ensure_ascii=False))] elif name == "langent_chat": - message = arguments.get("message") + message = arguments.get("message", "") result = agent.chat(message) return [types.TextContent(type="text", text=json.dumps(result, indent=2, ensure_ascii=False))] elif name == "langent_graph": - cypher = arguments.get("cypher") - result = agent.graph.query(cypher) if agent.graph else "Neo4j not connected" + cypher = arguments.get("cypher", "") + result = agent.graph_query(cypher) return [types.TextContent(type="text", text=json.dumps(result, indent=2, ensure_ascii=False))] elif name == "langent_status": @@ -149,7 +148,7 @@ async def handle_call_tool( result = agent.get_nebula_data() result["url"] = "http://localhost:8000" return [types.TextContent(type="text", text=json.dumps(result, indent=2, ensure_ascii=False))] - + else: raise ValueError(f"Unknown tool: {name}") @@ 
-161,7 +160,7 @@ async def main(): write, InitializationOptions( server_name="langent", - server_version="2.0.0", + server_version="3.0.0", capabilities=server.get_capabilities( notification_options=NotificationOptions(), experimental_capabilities={}, diff --git a/langent/store/graph.py b/langent/store/graph.py index 93c5231..932a3f0 100644 --- a/langent/store/graph.py +++ b/langent/store/graph.py @@ -1,10 +1,28 @@ """ -GraphStore — Neo4j Adapter for Langent -======================================== +GraphStore v3 — Neo4j Adapter for Langent +=========================================== Knowledge graph storage, Cypher execution, and 3D viz export. +Includes label/key sanitization to prevent Cypher injection. """ +import logging +import re from typing import List, Dict, Any, Optional +logger = logging.getLogger(__name__) + +# Allowed pattern for Neo4j labels and property keys +_SAFE_IDENTIFIER = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + +def _validate_identifier(value: str, kind: str = "identifier") -> str: + """Validate a Cypher identifier (label or property key) against injection.""" + if not _SAFE_IDENTIFIER.match(value): + raise ValueError( + f"Invalid Cypher {kind}: {value!r}. " + f"Only alphanumeric characters and underscores are allowed." 
+ ) + return value + class GraphStore: """ @@ -17,7 +35,7 @@ def __init__( self, uri: str = "bolt://localhost:7687", user: str = "neo4j", - password: str = "yw02280228", + password: str = "password", ): self.uri = uri self.user = user @@ -40,12 +58,12 @@ def test_connection(self) -> bool: session.run("RETURN 1") return True except Exception as e: - print(f"Neo4j connection failed: {e}") + logger.warning("Neo4j connection failed: %s", e) return False # ─── Cypher ─────────────────────────────────────── - def run_cypher(self, query: str, params: Dict = None) -> List[Dict]: + def run_cypher(self, query: str, params: Optional[Dict] = None) -> List[Dict]: """Cypher 쿼리를 실행합니다.""" with self.driver.session() as session: result = session.run(query, parameters=params or {}) @@ -72,6 +90,9 @@ def create_entity( self, label: str, properties: Dict[str, Any] ) -> Dict: """엔티티(노드)를 생성합니다.""" + label = _validate_identifier(label, "label") + for k in properties: + _validate_identifier(k, "property key") props_str = ", ".join(f"{k}: ${k}" for k in properties) query = f"CREATE (n:{label} {{{props_str}}}) RETURN n" result = self.run_cypher(query, properties) @@ -81,6 +102,10 @@ def merge_entity( self, label: str, key: str, properties: Dict[str, Any] ) -> Dict: """엔티티를 MERGE (있으면 업데이트, 없으면 생성)""" + label = _validate_identifier(label, "label") + key = _validate_identifier(key, "property key") + for k in properties: + _validate_identifier(k, "property key") set_parts = ", ".join(f"n.{k} = ${k}" for k in properties if k != key) query = f"MERGE (n:{label} {{{key}: ${key}}})" if set_parts: @@ -94,12 +119,20 @@ def create_relation( from_label: str, from_key: str, from_value: str, rel_type: str, to_label: str, to_key: str, to_value: str, - properties: Dict = None, + properties: Optional[Dict] = None, ) -> Dict: """두 노드 간 관계를 생성합니다.""" + from_label = _validate_identifier(from_label, "label") + to_label = _validate_identifier(to_label, "label") + from_key = _validate_identifier(from_key, 
"property key") + to_key = _validate_identifier(to_key, "property key") + rel_type = _validate_identifier(rel_type, "relationship type") + props = "" - params = {"from_val": from_value, "to_val": to_value} + params: Dict[str, Any] = {"from_val": from_value, "to_val": to_value} if properties: + for k in properties: + _validate_identifier(k, "property key") props = " {" + ", ".join(f"{k}: ${k}" for k in properties) + "}" params.update(properties) @@ -112,13 +145,16 @@ def create_relation( return result[0] if result else {} def search_nodes( - self, label: str = None, where: str = "", params: Dict = None, limit: int = 50 + self, label: Optional[str] = None, where: str = "", params: Optional[Dict] = None, limit: int = 50 ) -> List[Dict]: """노드를 검색합니다.""" - label_str = f":{label}" if label else "" + label_str = "" + if label: + label = _validate_identifier(label, "label") + label_str = f":{label}" where_str = f" WHERE {where}" if where else "" - query = f"MATCH (n{label_str}){where_str} RETURN n LIMIT {limit}" - return self.run_cypher(query, params) + query = f"MATCH (n{label_str}){where_str} RETURN n LIMIT $limit" + return self.run_cypher(query, {**(params or {}), "limit": limit}) # ─── 3D Visualization Export ────────────────────── @@ -127,7 +163,6 @@ def export_for_viz(self, limit: int = 500) -> Dict[str, Any]: 3D 시각화용 노드/엣지 데이터를 내보냅니다. 
Returns: {nodes: [...], edges: [...]} """ - # Nodes nodes_raw = self.run_cypher( "MATCH (n) RETURN id(n) AS id, labels(n) AS labels, " "properties(n) AS props LIMIT $limit", @@ -144,7 +179,6 @@ def export_for_viz(self, limit: int = 500) -> Dict[str, Any]: "properties": {k: str(v) for k, v in n["props"].items()}, }) - # Edges edges_raw = self.run_cypher( "MATCH (a)-[r]->(b) RETURN id(a) AS src, id(b) AS dst, " "type(r) AS type LIMIT $limit", diff --git a/langent/store/vector.py b/langent/store/vector.py index b5504d4..13577ac 100644 --- a/langent/store/vector.py +++ b/langent/store/vector.py @@ -1,16 +1,17 @@ """ -VectorStore — ChromaDB Adapter for Langent -============================================ -Workspace files → embeddings → ChromaDB → 3D visualization data +VectorStore v3 — ChromaDB Adapter for Langent +=============================================== +Workspace files -> embeddings -> ChromaDB -> 3D visualization data """ -import os import hashlib import json +import logging from typing import List, Dict, Any, Optional from pathlib import Path import chromadb -from chromadb.config import Settings + +logger = logging.getLogger(__name__) class VectorStore: @@ -35,7 +36,7 @@ def __init__( self._embedding_model_name = embedding_model self._ef = None self._cache_count = -1 - self._coords_cache = [] + self._coords_cache: List[Dict] = [] self._cache_file = self.db_path.parent / "nebula_cache.json" self.collection = self.client.get_or_create_collection( @@ -57,8 +58,8 @@ def _embedding_function(self): def add_documents( self, documents: List[str], - metadatas: List[Dict[str, Any]] = None, - ids: List[str] = None, + metadatas: Optional[List[Dict[str, Any]]] = None, + ids: Optional[List[str]] = None, ) -> int: """문서 청크들을 벡터화하여 저장합니다.""" if not documents: @@ -76,12 +77,12 @@ def add_documents( metadatas = [{}] * len(documents) # Remove duplicates - existing = set() + existing: set = set() try: result = self.collection.get(ids=ids) existing = set(result["ids"]) if 
result["ids"] else set() - except Exception: - pass + except Exception as e: + logger.debug("Duplicate check skipped: %s", e) new_docs, new_metas, new_ids = [], [], [] for doc, meta, doc_id in zip(documents, metadatas, ids): @@ -102,7 +103,6 @@ def add_documents( self.collection.add(documents=b_docs, metadatas=b_metas, ids=b_ids) added += len(b_docs) - # Clear cache on new data self._cache_count = -1 return added @@ -110,10 +110,10 @@ def search( self, query: str, top_k: int = 5, - where: Dict = None, + where: Optional[Dict] = None, ) -> List[Dict[str, Any]]: """시맨틱 유사도 검색을 수행합니다.""" - kwargs = { + kwargs: Dict[str, Any] = { "query_texts": [query], "n_results": top_k, "include": ["documents", "metadatas", "distances"], @@ -134,7 +134,7 @@ def search( }) return output - def delete(self, ids: List[str] = None, where: Dict = None): + def delete(self, ids: Optional[List[str]] = None, where: Optional[Dict] = None): """문서를 삭제합니다.""" if ids: self.collection.delete(ids=ids) @@ -169,7 +169,7 @@ def get_3d_positions(self, limit: int = 50000) -> List[Dict]: Each point: {id, x, y, z, metadata, document_preview} """ current_count = self.count() - + # 1. Try In-memory cache if self._cache_count == current_count and self._coords_cache: return self._coords_cache @@ -183,8 +183,8 @@ def get_3d_positions(self, limit: int = 50000) -> List[Dict]: self._coords_cache = cached.get("points", []) self._cache_count = current_count return self._coords_cache - except Exception: - pass + except Exception as e: + logger.debug("Cache read failed: %s", e) # 3. 
Recalculate data = self.get_all_embeddings(limit=limit) @@ -195,7 +195,6 @@ def get_3d_positions(self, limit: int = 50000) -> List[Dict]: try: import umap except ImportError: - # Fallback: random projection embeddings = np.array(data["embeddings"]) rng = np.random.RandomState(42) proj = rng.randn(embeddings.shape[1], 3) @@ -213,20 +212,19 @@ def get_3d_positions(self, limit: int = 50000) -> List[Dict]: random_state=42, ) coords_3d = reducer.fit_transform(embeddings) - # Normalize to [-50, 50] range for Three.js scene coords_3d = (coords_3d - coords_3d.mean(axis=0)) / (coords_3d.std(axis=0) + 1e-8) * 30 self._coords_cache = self._build_points(coords_3d, data) self._cache_count = current_count - + # 4. Save to Disk try: self._cache_file.parent.mkdir(parents=True, exist_ok=True) with open(self._cache_file, "w") as f: json.dump({"count": current_count, "points": self._coords_cache}, f) - except Exception: - pass - + except Exception as e: + logger.debug("Cache write failed: %s", e) + return self._coords_cache def _build_points(self, coords, data) -> List[Dict]: diff --git a/pyproject.toml b/pyproject.toml index b3c4ff6..cf8e8b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "hatchling.build" [project] name = "langent" -version = "2.0.0" -description = "RAG Agentic Framework — Workspace → Vector DB → 3D Nebula Visualization → MCP/API" +version = "3.0.0" +description = "RAG Agentic Framework — Workspace -> Vector DB -> 3D Nebula Visualization -> MCP/API" readme = "README.md" license = "Apache-2.0" requires-python = ">=3.10" @@ -42,13 +42,39 @@ dependencies = [ # Utilities "numpy>=1.24.0", "watchdog>=3.0.0", + # HTTP client (for ollama proxy) + "requests>=2.28.0", ] [project.optional-dependencies] -dev = ["pytest>=7.0.0", "pytest-asyncio>=0.21.0"] +dev = [ + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", + "httpx>=0.24.0", + "ruff>=0.4.0", + "mypy>=1.10.0", +] [project.scripts] langent = "langent.server.cli:main" 
[tool.hatch.build.targets.wheel] packages = ["langent"] + +[tool.ruff] +target-version = "py310" +line-length = 120 + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "N", "UP", "B", "SIM"] +ignore = ["E501"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +asyncio_mode = "auto" + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c831ff5 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,65 @@ +""" +Langent v3 — Test configuration and shared fixtures +""" +import os +import tempfile +import pytest + +# Ensure test mode +os.environ.setdefault("LANGENT_WORKSPACE", ".") +os.environ.setdefault("CHROMA_DB_PATH", "./data/test_chroma") + + +@pytest.fixture +def tmp_workspace(tmp_path): + """Create a temporary workspace with sample files.""" + # Create sample markdown file + md_file = tmp_path / "sample.md" + md_file.write_text( + "# AI Research Notes\n\n" + "Artificial intelligence is transforming the landscape of technology.\n\n" + "## Vector Databases\n\n" + "ChromaDB provides fast semantic search using embeddings.\n\n" + "## Knowledge Graphs\n\n" + "Neo4j enables graph-based relationship traversal.", + encoding="utf-8", + ) + + # Create sample text file + txt_file = tmp_path / "notes.txt" + txt_file.write_text( + "LangChain and LangGraph provide powerful agent workflows.\n" + "Three.js enables 3D visualization in the browser.\n" + "UMAP reduces high-dimensional embeddings to 3D coordinates.", + encoding="utf-8", + ) + + # Create sample CSV + csv_file = tmp_path / "data.csv" + csv_file.write_text( + "name,category,score\n" + "ChromaDB,vector_db,95\n" + "Neo4j,graph_db,90\n" + "LangChain,framework,88\n", + encoding="utf-8", + ) + + return tmp_path + + +@pytest.fixture +def vector_store(tmp_path): + """Create a temporary VectorStore instance.""" + from langent.store.vector import VectorStore + db_path = 
str(tmp_path / "test_chroma") + return VectorStore( + db_path=db_path, + collection_name="test_collection", + ) + + +@pytest.fixture +def config(): + """Create a default LangentConfig.""" + from langent.config import LangentConfig + return LangentConfig() diff --git a/tests/test_chunker.py b/tests/test_chunker.py new file mode 100644 index 0000000..a65da05 --- /dev/null +++ b/tests/test_chunker.py @@ -0,0 +1,58 @@ +"""Tests for langent.rag.chunker""" +import pytest +from langent.rag.chunker import SmartChunker + + +class TestSmartChunker: + def test_empty_text_returns_empty(self): + chunker = SmartChunker(chunk_size=100, min_chunk_size=10) + assert chunker.chunk_document("", {}) == [] + assert chunker.chunk_document(" ", {}) == [] + + def test_short_text_below_min_returns_empty(self): + chunker = SmartChunker(chunk_size=100, min_chunk_size=50) + result = chunker.chunk_document("Short text", {"source": "test"}) + assert result == [] + + def test_single_chunk_within_size(self): + chunker = SmartChunker(chunk_size=500, chunk_overlap=50, min_chunk_size=10) + text = "This is a paragraph with enough content to pass the minimum size threshold for chunking." 
+ result = chunker.chunk_document(text, {"source": "test.md"}) + assert len(result) == 1 + assert result[0]["text"] == text + assert result[0]["metadata"]["source"] == "test.md" + assert result[0]["metadata"]["chunk_index"] == 0 + + def test_multiple_chunks_created(self): + chunker = SmartChunker(chunk_size=100, chunk_overlap=20, min_chunk_size=30) + paragraphs = ["Paragraph one with some content here."] * 10 + text = "\n\n".join(paragraphs) + result = chunker.chunk_document(text, {"source": "long.md"}) + assert len(result) > 1 + for chunk in result: + assert chunk["metadata"]["total_chunks"] == len(result) + + def test_chunk_documents_batch(self): + chunker = SmartChunker(chunk_size=200, min_chunk_size=20) + docs = [ + {"text": "First document with sufficient content for testing purposes.", "metadata": {"source": "a.md"}}, + {"text": "Second document also has enough content for the chunking test.", "metadata": {"source": "b.md"}}, + ] + result = chunker.chunk_documents(docs) + assert len(result) >= 2 + sources = {c["metadata"]["source"] for c in result} + assert "a.md" in sources + assert "b.md" in sources + + def test_hard_split_long_paragraph(self): + chunker = SmartChunker(chunk_size=50, chunk_overlap=0, min_chunk_size=10) + text = "A" * 200 + result = chunker.chunk_document(text, {"source": "long"}) + assert len(result) >= 2 + + def test_metadata_preserved(self): + chunker = SmartChunker(chunk_size=500, min_chunk_size=10) + text = "Enough content to create a valid chunk for testing metadata preservation." 
+ meta = {"source": "doc.md", "custom_field": "value"} + result = chunker.chunk_document(text, meta) + assert result[0]["metadata"]["custom_field"] == "value" diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..5542397 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,49 @@ +"""Tests for langent.config""" +from langent.config import ( + LangentConfig, + LangentSettings, + VectorStoreConfig, + GraphStoreConfig, + RAGConfig, + WorkspaceConfig, +) + + +class TestLangentConfig: + def test_default_config_loads(self, config): + assert config.workspace is not None + assert config.vector is not None + assert config.graph is not None + assert config.rag is not None + assert config.visualizer is not None + + def test_vector_config_defaults(self): + vc = VectorStoreConfig() + assert vc.provider == "chromadb" + assert vc.embedding_model == "all-MiniLM-L6-v2" + assert vc.collection == "langent_knowledge" + + def test_graph_config_no_hardcoded_password(self): + gc = GraphStoreConfig() + assert gc.password == "password" + assert "yw" not in gc.password + + def test_rag_config_defaults(self): + rc = RAGConfig() + assert rc.chunk_size == 500 + assert rc.chunk_overlap == 50 + assert rc.min_chunk_size == 50 + + def test_workspace_config_defaults(self): + wc = WorkspaceConfig() + assert ".md" in wc.extensions + assert ".git" in wc.ignore + + def test_settings_has_api_key_field(self): + settings = LangentSettings() + assert hasattr(settings, "api_key") + assert settings.api_key == "" + + def test_config_with_nonexistent_yaml(self): + config = LangentConfig(config_path="/nonexistent/config.yaml") + assert config.rag.chunk_size == 500 # Falls back to defaults diff --git a/tests/test_graph_store.py b/tests/test_graph_store.py new file mode 100644 index 0000000..954e789 --- /dev/null +++ b/tests/test_graph_store.py @@ -0,0 +1,65 @@ +"""Tests for langent.store.graph — Cypher injection prevention""" +import pytest +from langent.store.graph import 
GraphStore, _validate_identifier + + +class TestCypherInjectionPrevention: + def test_valid_identifiers(self): + assert _validate_identifier("Person", "label") == "Person" + assert _validate_identifier("name", "key") == "name" + assert _validate_identifier("_private", "key") == "_private" + assert _validate_identifier("Node123", "label") == "Node123" + + def test_invalid_identifiers_raise(self): + with pytest.raises(ValueError, match="Invalid Cypher"): + _validate_identifier("Person; DROP", "label") + + with pytest.raises(ValueError, match="Invalid Cypher"): + _validate_identifier("key with spaces", "key") + + with pytest.raises(ValueError, match="Invalid Cypher"): + _validate_identifier("123start", "label") + + with pytest.raises(ValueError, match="Invalid Cypher"): + _validate_identifier("a.b.c", "key") + + with pytest.raises(ValueError, match="Invalid Cypher"): + _validate_identifier("label})-[:HACK]->(", "label") + + def test_graph_store_default_password_is_safe(self): + gs = GraphStore() + assert gs.password == "password" + assert "yw" not in gs.password + + def test_create_entity_validates_label(self): + gs = GraphStore() + with pytest.raises(ValueError, match="Invalid Cypher label"): + gs.create_entity("Bad Label!", {"name": "test"}) + + def test_create_entity_validates_property_keys(self): + gs = GraphStore() + with pytest.raises(ValueError, match="Invalid Cypher property key"): + gs.create_entity("Person", {"bad key!": "test"}) + + def test_merge_entity_validates_identifiers(self): + gs = GraphStore() + with pytest.raises(ValueError): + gs.merge_entity("Person; DROP", "name", {"name": "test"}) + with pytest.raises(ValueError): + gs.merge_entity("Person", "bad key!", {"bad key!": "test"}) + + def test_create_relation_validates_all(self): + gs = GraphStore() + with pytest.raises(ValueError): + gs.create_relation( + "Bad!", "name", "a", "REL", "Person", "name", "b" + ) + with pytest.raises(ValueError): + gs.create_relation( + "Person", "name", "a", "BAD REL", 
"Person", "name", "b" + ) + + def test_search_nodes_validates_label(self): + gs = GraphStore() + with pytest.raises(ValueError): + gs.search_nodes(label="Bad Label!") diff --git a/tests/test_ingest.py b/tests/test_ingest.py new file mode 100644 index 0000000..6634f42 --- /dev/null +++ b/tests/test_ingest.py @@ -0,0 +1,61 @@ +"""Tests for langent.rag.ingest""" +import pytest +from langent.rag.ingest import DocumentIngestor + + +class TestDocumentIngestor: + def test_scan_files_finds_supported_types(self, tmp_workspace): + ingestor = DocumentIngestor(workspace_path=str(tmp_workspace)) + files = ingestor.scan_files() + extensions = {f.suffix for f in files} + assert ".md" in extensions + assert ".txt" in extensions + assert ".csv" in extensions + + def test_scan_files_ignores_directories(self, tmp_workspace): + # Create an ignored directory + git_dir = tmp_workspace / ".git" + git_dir.mkdir() + (git_dir / "config").write_text("git config") + + ingestor = DocumentIngestor(workspace_path=str(tmp_workspace)) + files = ingestor.scan_files() + for f in files: + assert ".git" not in str(f) + + def test_extract_text_markdown(self, tmp_workspace): + ingestor = DocumentIngestor(workspace_path=str(tmp_workspace)) + md_file = tmp_workspace / "sample.md" + text = ingestor.extract_text(md_file) + assert text is not None + assert "AI Research" in text + + def test_extract_text_csv(self, tmp_workspace): + ingestor = DocumentIngestor(workspace_path=str(tmp_workspace)) + csv_file = tmp_workspace / "data.csv" + text = ingestor.extract_text(csv_file) + assert text is not None + assert "ChromaDB" in text + + def test_ingest_all(self, tmp_workspace): + ingestor = DocumentIngestor(workspace_path=str(tmp_workspace)) + results = ingestor.ingest_all() + assert len(results) >= 3 # md, txt, csv + for doc in results: + assert "text" in doc + assert "metadata" in doc + assert "source" in doc["metadata"] + assert "filename" in doc["metadata"] + + def test_custom_extensions(self, tmp_workspace): + 
ingestor = DocumentIngestor( + workspace_path=str(tmp_workspace), + extensions=[".md"], + ) + files = ingestor.scan_files() + assert all(f.suffix == ".md" for f in files) + + def test_empty_workspace(self, tmp_path): + ingestor = DocumentIngestor(workspace_path=str(tmp_path)) + results = ingestor.ingest_all() + assert results == [] diff --git a/tests/test_llm_proxy.py b/tests/test_llm_proxy.py new file mode 100644 index 0000000..1be9c7c --- /dev/null +++ b/tests/test_llm_proxy.py @@ -0,0 +1,39 @@ +"""Tests for langent.llm_proxy""" +from langent.llm_proxy import HostLLMProxy, get_llm + + +class TestHostLLMProxy: + def test_get_llm_returns_proxy(self): + llm = get_llm(mode="fake") + assert isinstance(llm, HostLLMProxy) + assert llm.mode == "fake" + + def test_fake_mode_responds(self): + llm = get_llm(mode="fake") + result = llm.invoke("Hello") + assert result.content + assert "Langent" in result.content + + def test_fake_mode_korean(self): + llm = get_llm(mode="fake") + result = llm.invoke("안녕하세요") + assert "Langent" in result.content + + def test_mcp_mode_returns_marker(self): + llm = get_llm(mode="mcp") + result = llm.invoke("test question") + assert "HOST_AI_REQUIRED" in result.content + + def test_unknown_mode_fallback(self): + llm = get_llm(mode="nonexistent") + result = llm.invoke("test") + assert "not yet implemented" in result.content + + def test_llm_type_property(self): + llm = get_llm() + assert llm._llm_type == "host-llm-proxy" + + def test_v3_in_fake_response(self): + llm = get_llm(mode="fake") + result = llm.invoke("안녕") + assert "v3" in result.content diff --git a/tests/test_vector_store.py b/tests/test_vector_store.py new file mode 100644 index 0000000..9e5ff28 --- /dev/null +++ b/tests/test_vector_store.py @@ -0,0 +1,66 @@ +"""Tests for langent.store.vector""" +import pytest +from langent.store.vector import VectorStore + + +class TestVectorStore: + def test_init_creates_collection(self, vector_store): + assert vector_store.count() == 0 + assert 
vector_store.collection_name == "test_collection" + + def test_add_documents(self, vector_store): + docs = ["Hello world", "AI is amazing", "Vector databases are fast"] + metas = [{"source": "a"}, {"source": "b"}, {"source": "c"}] + added = vector_store.add_documents(docs, metas) + assert added == 3 + assert vector_store.count() == 3 + + def test_add_documents_deduplication(self, vector_store): + docs = ["Hello world"] + metas = [{"source": "a"}] + vector_store.add_documents(docs, metas) + added_again = vector_store.add_documents(docs, metas) + assert added_again == 0 + assert vector_store.count() == 1 + + def test_add_empty_documents(self, vector_store): + assert vector_store.add_documents([]) == 0 + + def test_search_returns_results(self, vector_store): + docs = [ + "Machine learning algorithms", + "Natural language processing with transformers", + "Computer vision using CNNs", + ] + metas = [{"source": f"doc{i}"} for i in range(3)] + vector_store.add_documents(docs, metas) + + results = vector_store.search("NLP and language models", top_k=2) + assert len(results) == 2 + assert all("id" in r for r in results) + assert all("score" in r for r in results) + assert all("document" in r for r in results) + + def test_delete_by_ids(self, vector_store): + docs = ["Doc A", "Doc B"] + metas = [{"source": "a"}, {"source": "b"}] + vector_store.add_documents(docs, metas, ids=["id_a", "id_b"]) + assert vector_store.count() == 2 + + vector_store.delete(ids=["id_a"]) + assert vector_store.count() == 1 + + def test_list_collections(self, vector_store): + collections = vector_store.list_collections() + assert "test_collection" in collections + + def test_repr(self, vector_store): + r = repr(vector_store) + assert "VectorStore" in r + assert "test_collection" in r + + def test_make_id_deterministic(self, vector_store): + id1 = vector_store._make_id("test_string") + id2 = vector_store._make_id("test_string") + assert id1 == id2 + assert len(id1) == 16 diff --git 
a/tests/test_workflows.py b/tests/test_workflows.py new file mode 100644 index 0000000..54c7d97 --- /dev/null +++ b/tests/test_workflows.py @@ -0,0 +1,23 @@ +"""Tests for langent.agents.workflows — verifies ${} interpolation bug is fixed""" +import inspect +from langent.agents.workflows import LangentWorkflows, AgentState + + +class TestWorkflows: + def test_agent_state_has_required_fields(self): + hints = AgentState.__annotations__ + assert "query" in hints + assert "context" in hints + assert "answer" in hints + assert "steps" in hints + + def test_node_generate_uses_python_fstrings(self): + """Verify that ${query} interpolation bug (v2) is fixed.""" + source = inspect.getsource(LangentWorkflows.node_generate) + # Must NOT contain ${query}, ${context}, or ${graph_str} + assert "${query}" not in source, "Bug: ${query} found — should be {query}" + assert "${context}" not in source, "Bug: ${context} found — should be {context}" + assert "${graph_str}" not in source, "Bug: ${graph_str} found — should be {graph_str}" + # Must contain proper Python f-string interpolation + assert "{query}" in source + assert "{context}" in source