diff --git a/codebase_rag/graph_updater.py b/codebase_rag/graph_updater.py index c29d6cd62..241dbb73b 100644 --- a/codebase_rag/graph_updater.py +++ b/codebase_rag/graph_updater.py @@ -268,7 +268,7 @@ def __init__( self.repo_path = repo_path self.parsers = parsers self.queries = self._prepare_queries_with_parsers(queries, parsers) - self.project_name = repo_path.name + self.project_name = repo_path.resolve().name self.function_registry = FunctionRegistryTrie() self.simple_name_lookup: dict[str, set[str]] = defaultdict(set) self.ast_cache = BoundedASTCache(max_entries=1000, max_memory_mb=500) diff --git a/codebase_rag/mcp/tools.py b/codebase_rag/mcp/tools.py index 483f725e6..80209086a 100644 --- a/codebase_rag/mcp/tools.py +++ b/codebase_rag/mcp/tools.py @@ -81,10 +81,58 @@ def __init__( # Build tool registry - single source of truth for all tool metadata self._tools: dict[str, ToolMetadata] = { + "list_projects": ToolMetadata( + name="list_projects", + description="List all indexed projects in the knowledge graph database. " + "Returns a list of project names that have been indexed.", + input_schema={ + "type": "object", + "properties": {}, + "required": [], + }, + handler=self.list_projects, + returns_json=True, + ), + "delete_project": ToolMetadata( + name="delete_project", + description="Delete a specific project from the knowledge graph database. " + "This removes all nodes associated with the project while preserving other projects. " + "Use list_projects first to see available projects.", + input_schema={ + "type": "object", + "properties": { + "project_name": { + "type": "string", + "description": "Name of the project to delete (e.g., 'my-project')", + } + }, + "required": ["project_name"], + }, + handler=self.delete_project, + returns_json=True, + ), + "wipe_database": ToolMetadata( + name="wipe_database", + description="WARNING: Completely wipe the entire database, removing ALL indexed projects. " + "This cannot be undone. 
async def list_projects(self) -> dict[str, Any]:
    """List all indexed projects in the knowledge graph database.

    Returns:
        ``{"projects": [...], "count": N}`` on success. If anything in the
        lookup raises, the error is logged and
        ``{"error": <message>, "projects": [], "count": 0}`` is returned
        instead of propagating the exception.
    """
    logger.info("[MCP] Listing all projects...")
    try:
        projects = self.ingestor.list_projects()
        return {
            "projects": projects,
            "count": len(projects),
        }
    except Exception as e:
        # Tool boundary: report the failure in the payload rather than raising.
        logger.error(f"[MCP] Error listing projects: {e}")
        return {"error": str(e), "projects": [], "count": 0}


async def delete_project(self, project_name: str) -> dict[str, Any]:
    """Delete a specific project from the knowledge graph database.

    The project must appear in the ingestor's project list; otherwise nothing
    is deleted and the available project names are reported back.

    Args:
        project_name: Name of the project to delete.

    Returns:
        ``{"success": True, "project": ..., "message": ...}`` after a
        successful delete, or ``{"success": False, "error": ...}`` when the
        project is unknown or the ingestor raises.
    """
    logger.info(f"[MCP] Deleting project: {project_name}")
    try:
        # Verify the project exists so the caller gets actionable feedback
        # instead of a silent no-op delete.
        projects = self.ingestor.list_projects()
        if project_name not in projects:
            return {
                "success": False,
                "error": f"Project '{project_name}' not found. Available projects: {projects}",
            }

        self.ingestor.delete_project(project_name)
        return {
            "success": True,
            "project": project_name,
            "message": f"Successfully deleted project '{project_name}'.",
        }
    except Exception as e:
        logger.error(f"[MCP] Error deleting project: {e}")
        return {"success": False, "error": str(e)}


async def wipe_database(self, confirm: bool) -> str:
    """Completely wipe the entire database.

    Args:
        confirm: Must be exactly ``True`` to proceed with the wipe. Any other
            value, including truthy non-booleans such as the string
            ``"false"``, cancels the operation.

    Returns:
        Status message describing whether the wipe ran, was cancelled, or
        failed.
    """
    # Destructive and irreversible: accept only the literal boolean True so a
    # loosely typed client value (e.g. "false", 1) can never trigger a wipe.
    if confirm is not True:
        return "Database wipe cancelled. Set confirm=true to proceed."

    logger.warning("[MCP] Wiping entire database!")
    try:
        self.ingestor.clean_database()
        return "Database completely wiped. All projects have been removed."
    except Exception as e:
        logger.error(f"[MCP] Error wiping database: {e}")
        return f"Error wiping database: {e}"
Starting fresh indexing...") + # Delete only the current project's data (preserves other projects) + logger.info(f"[MCP] Clearing existing data for project '{project_name}'...") + self.ingestor.delete_project(project_name) updater = GraphUpdater( ingestor=self.ingestor, @@ -245,7 +358,7 @@ async def index_repository(self) -> str: ) updater.run() - return f"Successfully indexed repository at {self.project_root}. Knowledge graph has been updated (previous data cleared)." + return f"Successfully indexed repository at {self.project_root}. Project '{project_name}' has been updated." except Exception as e: logger.error(f"[MCP] Error indexing repository: {e}") return f"Error indexing repository: {str(e)}" diff --git a/codebase_rag/prompts.py b/codebase_rag/prompts.py index dc655a450..65b515201 100644 --- a/codebase_rag/prompts.py +++ b/codebase_rag/prompts.py @@ -125,8 +125,20 @@ {GRAPH_SCHEMA_AND_RULES} -**3. Query Patterns & Examples** -Your goal is to return the `name`, `path`, and `qualified_name` of the found nodes. +**3. Query Optimization Rules** + +- **LIMIT Results**: ALWAYS add `LIMIT 50` to queries that list items. This prevents overwhelming responses. +- **Aggregation Queries**: When asked "how many", "count", or "total", return ONLY the count, not all items: + - CORRECT: `MATCH (c:Class) RETURN count(c) AS total` + - WRONG: `MATCH (c:Class) RETURN c.name, c.path, count(c) AS total` (returns all items!) +- **List vs Count**: If asked to "list" or "show", return items with LIMIT. If asked to "count" or "how many", return only the count. + +**4. Query Patterns & Examples** +When listing items, return the `name`, `path`, and `qualified_name` with a LIMIT. + +**Pattern: Counting Items** +cypher// "How many classes are there?" or "Count all functions" +MATCH (c:Class) RETURN count(c) AS total **Pattern: Finding Decorated Functions/Methods (e.g., Workflows, Tasks)** cypher// "Find all prefect flows" or "what are the workflows?" 
or "show me the tasks" @@ -134,6 +146,7 @@ MATCH (n:Function|Method) WHERE ANY(d IN n.decorators WHERE toLower(d) IN ['flow', 'task']) RETURN n.name AS name, n.qualified_name AS qualified_name, labels(n) AS type +LIMIT 50 **Pattern: Finding Content by Path (Robustly)** cypher// "what is in the 'workflows/src' directory?" or "list files in workflows" @@ -141,12 +154,14 @@ MATCH (n) WHERE n.path IS NOT NULL AND n.path STARTS WITH 'workflows' RETURN n.name AS name, n.path AS path, labels(n) AS type +LIMIT 50 **Pattern: Keyword & Concept Search (Fallback for general terms)** cypher// "find things related to 'database'" MATCH (n) WHERE toLower(n.name) CONTAINS 'database' OR (n.qualified_name IS NOT NULL AND toLower(n.qualified_name) CONTAINS 'database') RETURN n.name AS name, n.qualified_name AS qualified_name, labels(n) AS type +LIMIT 50 **Pattern: Finding a Specific File** cypher// "Find the main README.md" @@ -173,31 +188,41 @@ - For code nodes (`Class`, `Function`, etc.), return `n.qualified_name AS qualified_name`. 4. **KEEP IT SIMPLE**: Do not try to be clever. A simple query that returns a few relevant nodes is better than a complex one that fails. 5. **CLAUSE ORDER**: You MUST follow the standard Cypher clause order: `MATCH`, `WHERE`, `RETURN`, `LIMIT`. +6. **ALWAYS ADD LIMIT**: For queries that list items, ALWAYS add `LIMIT 50` to prevent overwhelming responses. +7. **AGGREGATION QUERIES**: When asked "how many" or "count", return ONLY the count: + - CORRECT: `MATCH (c:Class) RETURN count(c) AS total` + - WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!) **Examples:** +* **Natural Language:** "How many classes are there?" 
+* **Cypher Query:** + ```cypher + MATCH (c:Class) RETURN count(c) AS total + ``` + * **Natural Language:** "Find the main README file" * **Cypher Query:** ```cypher - MATCH (f:File) WHERE toLower(f.name) CONTAINS 'readme' RETURN f.path AS path, f.name AS name, labels(f) AS type + MATCH (f:File) WHERE toLower(f.name) CONTAINS 'readme' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50 ``` * **Natural Language:** "Find all python files" * **Cypher Query (Note the '.' in extension):** ```cypher - MATCH (f:File) WHERE f.extension = '.py' RETURN f.path AS path, f.name AS name, labels(f) AS type + MATCH (f:File) WHERE f.extension = '.py' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50 ``` * **Natural Language:** "show me the tasks" * **Cypher Query:** ```cypher - MATCH (n:Function|Method) WHERE 'task' IN n.decorators RETURN n.qualified_name AS qualified_name, n.name AS name, labels(n) AS type + MATCH (n:Function|Method) WHERE 'task' IN n.decorators RETURN n.qualified_name AS qualified_name, n.name AS name, labels(n) AS type LIMIT 50 ``` * **Natural Language:** "list files in the services folder" * **Cypher Query:** ```cypher - MATCH (f:File) WHERE f.path STARTS WITH 'services' RETURN f.path AS path, f.name AS name, labels(f) AS type + MATCH (f:File) WHERE f.path STARTS WITH 'services' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50 ``` * **Natural Language:** "Find just one file to test" diff --git a/codebase_rag/services/graph_service.py b/codebase_rag/services/graph_service.py index ad4c69e76..5c93c692f 100644 --- a/codebase_rag/services/graph_service.py +++ b/codebase_rag/services/graph_service.py @@ -124,10 +124,44 @@ def _execute_batch_with_return( cursor.close() def clean_database(self) -> None: + """Wipe the entire database. Use with caution.""" logger.info("--- Cleaning database... ---") self._execute_query("MATCH (n) DETACH DELETE n;") logger.info("--- Database cleaned. 
def list_projects(self) -> list[str]:
    """Return the names of all projects stored in the database.

    Issues one Cypher query over ``Project`` nodes, ordered by name, and
    extracts the ``name`` column from each returned row.

    Returns:
        Project names in the order the query returns them.
    """
    rows = self.fetch_all(
        "MATCH (p:Project) RETURN p.name AS name ORDER BY p.name"
    )
    return [row["name"] for row in rows]


def delete_project(self, project_name: str) -> None:
    """Remove one project's nodes while leaving other projects in place.

    A single query detaches and deletes the ``Project`` node whose ``name``
    equals ``project_name`` together with every node whose
    ``qualified_name`` starts with ``"<project_name>."``.

    Args:
        project_name: Name of the project to delete.
    """
    logger.info(f"--- Deleting project: {project_name} ---")

    # The trailing dot stops project "app" from also matching "app_v2.*".
    query_params = {"prefix": f"{project_name}.", "project_name": project_name}

    # NOTE(review): apart from the Project node itself, nodes without a
    # qualified_name property are not matched by this WHERE clause -- confirm
    # against the graph schema whether any such nodes belong to a project.
    cypher = """
        MATCH (n)
        WHERE n.qualified_name STARTS WITH $prefix
           OR (n:Project AND n.name = $project_name)
        DETACH DELETE n
        """
    self._execute_query(cypher, query_params)

    logger.info(f"--- Project {project_name} deleted. ---")
repository result = await mcp_registry.index_repository() - # Verify clean_database was called - mcp_registry.ingestor.clean_database.assert_called_once() # type: ignore[attr-defined] + # Verify delete_project was called with correct project name + project_name = temp_project_root.name + mcp_registry.ingestor.delete_project.assert_called_once_with(project_name) # type: ignore[attr-defined] assert "Error:" not in result - # Verify message indicates data was cleared - assert "cleared" in result.lower() or "previous data" in result.lower() - async def test_index_repository_clears_before_updater_runs( + async def test_index_repository_deletes_project_before_updater_runs( self, mcp_registry: MCPToolsRegistry, temp_project_root: Path ) -> None: - """Test that database clearing happens before GraphUpdater runs.""" + """Test that project deletion happens before GraphUpdater runs.""" call_order: list[str] = [] - def mock_clean() -> None: - call_order.append("clean") + def mock_delete(project_name: str) -> None: + call_order.append("delete") def mock_run() -> None: call_order.append("run") - mcp_registry.ingestor.clean_database = MagicMock(side_effect=mock_clean) # type: ignore[method-assign] + mcp_registry.ingestor.delete_project = MagicMock(side_effect=mock_delete) # type: ignore[method-assign] with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_class: mock_updater = MagicMock() @@ -336,13 +335,13 @@ def mock_run() -> None: await mcp_registry.index_repository() - # Verify clean was called before run - assert call_order == ["clean", "run"] + # Verify delete was called before run + assert call_order == ["delete", "run"] - async def test_sequential_index_clears_previous_repo_data( + async def test_sequential_index_only_clears_own_project_data( self, tmp_path: Path ) -> None: - """Test that indexing a second repository clears the first repository's data.""" + """Test that indexing repositories only clears their own project data.""" # Create two mock registries 
for different projects mock_ingestor = MagicMock() mock_cypher = MagicMock() @@ -370,11 +369,14 @@ async def test_sequential_index_clears_previous_repo_data( # Index first repository await registry1.index_repository() - assert mock_ingestor.clean_database.call_count == 1 + mock_ingestor.delete_project.assert_called_with("project1") - # Index second repository - should clear database again + # Index second repository - should only delete project2, not project1 await registry2.index_repository() - assert mock_ingestor.clean_database.call_count == 2 + mock_ingestor.delete_project.assert_called_with("project2") + + # Total of 2 delete_project calls (one per project) + assert mock_ingestor.delete_project.call_count == 2 class TestQueryAndIndexIntegration: