Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion codebase_rag/graph_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def __init__(
self.repo_path = repo_path
self.parsers = parsers
self.queries = self._prepare_queries_with_parsers(queries, parsers)
self.project_name = repo_path.name
self.project_name = repo_path.resolve().name
self.function_registry = FunctionRegistryTrie()
self.simple_name_lookup: dict[str, set[str]] = defaultdict(set)
self.ast_cache = BoundedASTCache(max_entries=1000, max_memory_mb=500)
Expand Down
129 changes: 121 additions & 8 deletions codebase_rag/mcp/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,58 @@ def __init__(

# Build tool registry - single source of truth for all tool metadata
self._tools: dict[str, ToolMetadata] = {
"list_projects": ToolMetadata(
name="list_projects",
description="List all indexed projects in the knowledge graph database. "
"Returns a list of project names that have been indexed.",
input_schema={
"type": "object",
"properties": {},
"required": [],
},
handler=self.list_projects,
returns_json=True,
),
"delete_project": ToolMetadata(
name="delete_project",
description="Delete a specific project from the knowledge graph database. "
"This removes all nodes associated with the project while preserving other projects. "
"Use list_projects first to see available projects.",
input_schema={
"type": "object",
"properties": {
"project_name": {
"type": "string",
"description": "Name of the project to delete (e.g., 'my-project')",
}
},
"required": ["project_name"],
},
handler=self.delete_project,
returns_json=True,
),
"wipe_database": ToolMetadata(
name="wipe_database",
description="WARNING: Completely wipe the entire database, removing ALL indexed projects. "
"This cannot be undone. Use delete_project for removing individual projects.",
input_schema={
"type": "object",
"properties": {
"confirm": {
"type": "boolean",
"description": "Must be true to confirm the wipe operation",
}
},
"required": ["confirm"],
},
handler=self.wipe_database,
returns_json=False,
),
"index_repository": ToolMetadata(
name="index_repository",
description="Parse and ingest the repository into the Memgraph knowledge graph. "
"This builds a comprehensive graph of functions, classes, dependencies, and relationships.",
"This builds a comprehensive graph of functions, classes, dependencies, and relationships. "
"Note: This now preserves other projects - only the current project is re-indexed.",
input_schema={
"type": "object",
"properties": {},
Expand Down Expand Up @@ -216,26 +264,91 @@ def __init__(
),
}

async def list_projects(self) -> dict[str, Any]:
    """List all indexed projects in the knowledge graph database.

    Delegates to the ingestor and wraps the result in a JSON-friendly
    payload; on failure the error text is reported alongside an empty list.

    Returns:
        Dictionary with the list of project names and their count.
    """
    logger.info("[MCP] Listing all projects...")
    try:
        names = self.ingestor.list_projects()
    except Exception as e:
        logger.error(f"[MCP] Error listing projects: {e}")
        return {"error": str(e), "projects": [], "count": 0}
    return {"projects": names, "count": len(names)}

async def delete_project(self, project_name: str) -> dict[str, Any]:
    """Delete a specific project from the knowledge graph database.

    Args:
        project_name: Name of the project to delete.

    Returns:
        Dictionary describing whether the deletion succeeded.
    """
    logger.info(f"[MCP] Deleting project: {project_name}")
    try:
        # Verify project exists before attempting deletion.
        known = self.ingestor.list_projects()
        if project_name in known:
            self.ingestor.delete_project(project_name)
            return {
                "success": True,
                "project": project_name,
                "message": f"Successfully deleted project '{project_name}'.",
            }
        return {
            "success": False,
            "error": f"Project '{project_name}' not found. Available projects: {known}",
        }
    except Exception as e:
        logger.error(f"[MCP] Error deleting project: {e}")
        return {"success": False, "error": str(e)}

async def wipe_database(self, confirm: bool) -> str:
    """Completely wipe the entire database.

    Args:
        confirm: Must be True to proceed with the wipe.

    Returns:
        Human-readable status message.
    """
    if not confirm:
        # Refuse to do anything destructive without explicit confirmation.
        return "Database wipe cancelled. Set confirm=true to proceed."

    logger.warning("[MCP] Wiping entire database!")
    try:
        self.ingestor.clean_database()
    except Exception as e:
        logger.error(f"[MCP] Error wiping database: {e}")
        return f"Error wiping database: {str(e)}"
    return "Database completely wiped. All projects have been removed."

async def index_repository(self) -> str:
"""Parse and ingest the repository into the Memgraph knowledge graph.

This tool analyzes the codebase using Tree-sitter parsers and builds
a comprehensive knowledge graph with functions, classes, dependencies,
and relationships.

Note: This clears all existing data in the database before indexing.
Only one repository can be indexed at a time.
Note: This now only clears data for the current project, preserving other projects.

Returns:
Success message with indexing statistics
"""
logger.info(f"[MCP] Indexing repository at: {self.project_root}")
project_name = Path(self.project_root).resolve().name

try:
# Clear existing data to ensure clean state for the new repository
logger.info("[MCP] Clearing existing database to avoid conflicts...")
self.ingestor.clean_database()
logger.info("[MCP] Database cleared. Starting fresh indexing...")
# Delete only the current project's data (preserves other projects)
logger.info(f"[MCP] Clearing existing data for project '{project_name}'...")
self.ingestor.delete_project(project_name)

updater = GraphUpdater(
ingestor=self.ingestor,
Expand All @@ -245,7 +358,7 @@ async def index_repository(self) -> str:
)
updater.run()

return f"Successfully indexed repository at {self.project_root}. Knowledge graph has been updated (previous data cleared)."
return f"Successfully indexed repository at {self.project_root}. Project '{project_name}' has been updated."
except Exception as e:
logger.error(f"[MCP] Error indexing repository: {e}")
return f"Error indexing repository: {str(e)}"
Expand Down
37 changes: 31 additions & 6 deletions codebase_rag/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,28 +125,43 @@

{GRAPH_SCHEMA_AND_RULES}

**3. Query Patterns & Examples**
Your goal is to return the `name`, `path`, and `qualified_name` of the found nodes.
**3. Query Optimization Rules**

- **LIMIT Results**: ALWAYS add `LIMIT 50` to queries that list items. This prevents overwhelming responses.
- **Aggregation Queries**: When asked "how many", "count", or "total", return ONLY the count, not all items:
- CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
- WRONG: `MATCH (c:Class) RETURN c.name, c.path, count(c) AS total` (returns all items!)
- **List vs Count**: If asked to "list" or "show", return items with LIMIT. If asked to "count" or "how many", return only the count.

**4. Query Patterns & Examples**
When listing items, return the `name`, `path`, and `qualified_name` with a LIMIT.

**Pattern: Counting Items**
cypher// "How many classes are there?" or "Count all functions"
MATCH (c:Class) RETURN count(c) AS total

**Pattern: Finding Decorated Functions/Methods (e.g., Workflows, Tasks)**
cypher// "Find all prefect flows" or "what are the workflows?" or "show me the tasks"
// Use the 'IN' operator to check the 'decorators' list property.
MATCH (n:Function|Method)
WHERE ANY(d IN n.decorators WHERE toLower(d) IN ['flow', 'task'])
RETURN n.name AS name, n.qualified_name AS qualified_name, labels(n) AS type
LIMIT 50

**Pattern: Finding Content by Path (Robustly)**
cypher// "what is in the 'workflows/src' directory?" or "list files in workflows"
// Use `STARTS WITH` for path matching.
MATCH (n)
WHERE n.path IS NOT NULL AND n.path STARTS WITH 'workflows'
RETURN n.name AS name, n.path AS path, labels(n) AS type
LIMIT 50

**Pattern: Keyword & Concept Search (Fallback for general terms)**
cypher// "find things related to 'database'"
MATCH (n)
WHERE toLower(n.name) CONTAINS 'database' OR (n.qualified_name IS NOT NULL AND toLower(n.qualified_name) CONTAINS 'database')
RETURN n.name AS name, n.qualified_name AS qualified_name, labels(n) AS type
LIMIT 50

**Pattern: Finding a Specific File**
cypher// "Find the main README.md"
Expand All @@ -173,31 +188,41 @@
- For code nodes (`Class`, `Function`, etc.), return `n.qualified_name AS qualified_name`.
4. **KEEP IT SIMPLE**: Do not try to be clever. A simple query that returns a few relevant nodes is better than a complex one that fails.
5. **CLAUSE ORDER**: You MUST follow the standard Cypher clause order: `MATCH`, `WHERE`, `RETURN`, `LIMIT`.
6. **ALWAYS ADD LIMIT**: For queries that list items, ALWAYS add `LIMIT 50` to prevent overwhelming responses.
7. **AGGREGATION QUERIES**: When asked "how many" or "count", return ONLY the count:
- CORRECT: `MATCH (c:Class) RETURN count(c) AS total`
- WRONG: `MATCH (c:Class) RETURN c.name, count(c) AS total` (returns all items!)

**Examples:**

* **Natural Language:** "How many classes are there?"
* **Cypher Query:**
```cypher
MATCH (c:Class) RETURN count(c) AS total
```

* **Natural Language:** "Find the main README file"
* **Cypher Query:**
```cypher
MATCH (f:File) WHERE toLower(f.name) CONTAINS 'readme' RETURN f.path AS path, f.name AS name, labels(f) AS type
MATCH (f:File) WHERE toLower(f.name) CONTAINS 'readme' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50
```

* **Natural Language:** "Find all python files"
* **Cypher Query (Note the '.' in extension):**
```cypher
MATCH (f:File) WHERE f.extension = '.py' RETURN f.path AS path, f.name AS name, labels(f) AS type
MATCH (f:File) WHERE f.extension = '.py' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50
```

* **Natural Language:** "show me the tasks"
* **Cypher Query:**
```cypher
MATCH (n:Function|Method) WHERE 'task' IN n.decorators RETURN n.qualified_name AS qualified_name, n.name AS name, labels(n) AS type
MATCH (n:Function|Method) WHERE 'task' IN n.decorators RETURN n.qualified_name AS qualified_name, n.name AS name, labels(n) AS type LIMIT 50
```

* **Natural Language:** "list files in the services folder"
* **Cypher Query:**
```cypher
MATCH (f:File) WHERE f.path STARTS WITH 'services' RETURN f.path AS path, f.name AS name, labels(f) AS type
MATCH (f:File) WHERE f.path STARTS WITH 'services' RETURN f.path AS path, f.name AS name, labels(f) AS type LIMIT 50
```

* **Natural Language:** "Find just one file to test"
Expand Down
34 changes: 34 additions & 0 deletions codebase_rag/services/graph_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,44 @@ def _execute_batch_with_return(
cursor.close()

def clean_database(self) -> None:
    """Wipe the entire database. Use with caution.

    Removes every node and relationship; prefer delete_project() when only
    a single project's data should be removed.
    """
    # Detach-delete drops each node together with all of its relationships.
    wipe_query = "MATCH (n) DETACH DELETE n;"
    logger.info("--- Cleaning database... ---")
    self._execute_query(wipe_query)
    logger.info("--- Database cleaned. ---")

def list_projects(self) -> list[str]:
    """List all indexed projects in the database.

    Returns:
        Project names, sorted alphabetically (ordering is done by the query).
    """
    return [
        record["name"]
        for record in self.fetch_all(
            "MATCH (p:Project) RETURN p.name AS name ORDER BY p.name"
        )
    ]

def delete_project(self, project_name: str) -> None:
    """Delete all nodes associated with a specific project.

    A single detach-delete removes the Project node itself plus every node
    whose qualified_name carries the project's dotted prefix, leaving all
    other projects untouched.

    NOTE(review): nodes keyed only by `path` (without a qualified_name)
    would not match this predicate — confirm no per-project nodes of that
    shape exist.

    Args:
        project_name: Name of the project to delete
    """
    logger.info(f"--- Deleting project: {project_name} ---")

    parameters = {"prefix": f"{project_name}.", "project_name": project_name}
    self._execute_query(
        """
        MATCH (n)
        WHERE n.qualified_name STARTS WITH $prefix
        OR (n:Project AND n.name = $project_name)
        DETACH DELETE n
        """,
        parameters,
    )

    logger.info(f"--- Project {project_name} deleted. ---")

def ensure_constraints(self) -> None:
logger.info("Ensuring constraints...")
for label, prop in self.unique_constraints.items():
Expand Down
38 changes: 20 additions & 18 deletions codebase_rag/tests/test_mcp_query_and_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,10 +297,10 @@ async def test_index_repository_multiple_times(
# Should have been called twice
assert mock_updater.run.call_count == 2

async def test_index_repository_clears_database_first(
async def test_index_repository_clears_project_data_first(
self, mcp_registry: MCPToolsRegistry, temp_project_root: Path
) -> None:
"""Test that database is cleared before indexing."""
"""Test that project data is cleared before indexing."""
with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_class:
mock_updater = MagicMock()
mock_updater.run.return_value = None
Expand All @@ -309,25 +309,24 @@ async def test_index_repository_clears_database_first(
# Index repository
result = await mcp_registry.index_repository()

# Verify clean_database was called
mcp_registry.ingestor.clean_database.assert_called_once() # type: ignore[attr-defined]
# Verify delete_project was called with correct project name
project_name = temp_project_root.name
mcp_registry.ingestor.delete_project.assert_called_once_with(project_name) # type: ignore[attr-defined]
assert "Error:" not in result
# Verify message indicates data was cleared
assert "cleared" in result.lower() or "previous data" in result.lower()

async def test_index_repository_clears_before_updater_runs(
async def test_index_repository_deletes_project_before_updater_runs(
self, mcp_registry: MCPToolsRegistry, temp_project_root: Path
) -> None:
"""Test that database clearing happens before GraphUpdater runs."""
"""Test that project deletion happens before GraphUpdater runs."""
call_order: list[str] = []

def mock_clean() -> None:
call_order.append("clean")
def mock_delete(project_name: str) -> None:
call_order.append("delete")

def mock_run() -> None:
call_order.append("run")

mcp_registry.ingestor.clean_database = MagicMock(side_effect=mock_clean) # type: ignore[method-assign]
mcp_registry.ingestor.delete_project = MagicMock(side_effect=mock_delete) # type: ignore[method-assign]

with patch("codebase_rag.mcp.tools.GraphUpdater") as mock_updater_class:
mock_updater = MagicMock()
Expand All @@ -336,13 +335,13 @@ def mock_run() -> None:

await mcp_registry.index_repository()

# Verify clean was called before run
assert call_order == ["clean", "run"]
# Verify delete was called before run
assert call_order == ["delete", "run"]

async def test_sequential_index_clears_previous_repo_data(
async def test_sequential_index_only_clears_own_project_data(
self, tmp_path: Path
) -> None:
"""Test that indexing a second repository clears the first repository's data."""
"""Test that indexing repositories only clears their own project data."""
# Create two mock registries for different projects
mock_ingestor = MagicMock()
mock_cypher = MagicMock()
Expand Down Expand Up @@ -370,11 +369,14 @@ async def test_sequential_index_clears_previous_repo_data(

# Index first repository
await registry1.index_repository()
assert mock_ingestor.clean_database.call_count == 1
mock_ingestor.delete_project.assert_called_with("project1")

# Index second repository - should clear database again
# Index second repository - should only delete project2, not project1
await registry2.index_repository()
assert mock_ingestor.clean_database.call_count == 2
mock_ingestor.delete_project.assert_called_with("project2")

# Total of 2 delete_project calls (one per project)
assert mock_ingestor.delete_project.call_count == 2


class TestQueryAndIndexIntegration:
Expand Down