diff --git a/src/services/indexer.ts b/src/services/indexer.ts index 617c89f..2b2e932 100644 --- a/src/services/indexer.ts +++ b/src/services/indexer.ts @@ -134,7 +134,7 @@ function isCancellationRequested(resolvedPath: string): boolean { return cancellationRequested.get(resolvedPath) === true; } -async function getProjectHashes(projectId: string, collection: string): Promise<Map<string, string>> { +async function getProjectHashes(projectId: string, collection: string, resolvedProjectPath?: string): Promise<Map<string, string>> { if (!projectHashes.has(projectId)) { // Try to load from Qdrant (persistent storage). // loadProjectHashes now throws on transient errors (instead of returning null), @@ -144,9 +144,12 @@ async function getProjectHashes(projectId: string, collection: string): Promise< projectHashesLoaded.add(projectId); const stored = await loadProjectHashes(collection); if (stored) { - logger.info("Loaded file hashes from Qdrant", { projectId, count: stored.size }); - projectHashes.set(projectId, stored); - return stored; + // Migrate absolute-path keys to relative paths (one-time, transparent). + // Indexes built before the relative-path fix stored absolute paths as hash keys. + const migrated = migrateAbsolutePathKeys(stored, resolvedProjectPath); + logger.info("Loaded file hashes from Qdrant", { projectId, count: migrated.size, wasMigrated: migrated !== stored }); + projectHashes.set(projectId, migrated); + return migrated; } } projectHashes.set(projectId, new Map()); @@ -154,14 +157,69 @@ async function getProjectHashes(projectId: string, collection: string): Promise< return projectHashes.get(projectId) as Map<string, string>; } +/** + * Migrate hash map keys from absolute paths to relative paths. + * Returns a new map if migration was needed, or the original map if keys are already relative. 
+ */ +function migrateAbsolutePathKeys(hashes: Map<string, string>, resolvedProjectPath?: string): Map<string, string> { + if (hashes.size === 0) return hashes; + + // Check if keys look like absolute paths + const firstKey = hashes.keys().next().value as string; + if (!firstKey.startsWith("/") && !firstKey.startsWith("\\")) return hashes; + + // Try to strip the stored project path prefix, or detect the common prefix + const prefix = resolvedProjectPath + ? resolvedProjectPath + "/" + : detectCommonPrefix(hashes); + + if (!prefix) { + logger.warn("Hash keys appear absolute but could not determine prefix to strip — skipping migration"); + return hashes; + } + + const migrated = new Map<string, string>(); + for (const [absPath, hash] of hashes) { + const relative = absPath.startsWith(prefix) ? absPath.slice(prefix.length) : absPath; + migrated.set(relative, hash); + } + + logger.info("Migrated hash keys from absolute to relative paths", { count: migrated.size, prefix }); + return migrated; +} + +/** Detect the longest common directory prefix across all hash keys */ +function detectCommonPrefix(hashes: Map<string, string>): string | null { + const keys = Array.from(hashes.keys()); + if (keys.length === 0) return null; + + let prefix = keys[0]; + for (let i = 1; i < keys.length; i++) { + while (!keys[i].startsWith(prefix)) { + const lastSlash = prefix.lastIndexOf("/"); + if (lastSlash <= 0) return null; + prefix = prefix.slice(0, lastSlash + 1); + } + } + + // Ensure prefix ends with / + if (!prefix.endsWith("/")) { + const lastSlash = prefix.lastIndexOf("/"); + if (lastSlash <= 0) return null; + prefix = prefix.slice(0, lastSlash + 1); + } + + return prefix; +} + /** Hash file content for change detection */ export function hashContent(content: string): string { return createHash("sha256").update(content).digest("hex").slice(0, 16); } /** Generate a stable chunk ID as a valid UUID (required by Qdrant) */ -export function chunkId(filePath: string, startLine: number): string { - const hash = 
createHash("sha256").update(`${filePath}:${startLine}`).digest("hex").slice(0, 32); +export function chunkId(relativePath: string, startLine: number): string { + const hash = createHash("sha256").update(`${relativePath}:${startLine}`).digest("hex").slice(0, 32); // Format as UUID: 8-4-4-4-12 return `${hash.slice(0, 8)}-${hash.slice(8, 12)}-${hash.slice(12, 16)}-${hash.slice(16, 20)}-${hash.slice(20, 32)}`; } @@ -323,7 +381,7 @@ function chunkByCharacters( const endLine = startLine + newlineCount; chunks.push({ - id: chunkId(filePath, offset), // byte offset → unique ID even for 1-line files + id: chunkId(relativePath, offset), // byte offset → unique ID even for 1-line files filePath, relativePath, content: chunkContent, @@ -373,7 +431,7 @@ export function chunkFileContent( // Small files: single chunk regardless of language if (lines.length <= CHUNK_SIZE) { return applyCharCap([{ - id: chunkId(filePath, 1), + id: chunkId(relativePath, 1), filePath, relativePath, content, @@ -414,7 +472,7 @@ function chunkByAstRegions( const preambleLines = lines.slice(0, regions[0].startLine); if (preambleLines.length > 0) { chunks.push({ - id: chunkId(filePath, 1), + id: chunkId(relativePath, 1), filePath, relativePath, content: preambleLines.join("\n"), @@ -437,7 +495,7 @@ function chunkByAstRegions( if (regionLength <= MAX_CHUNK_LINES) { chunks.push({ - id: chunkId(filePath, pendingStart + 1), + id: chunkId(relativePath, pendingStart + 1), filePath, relativePath, content: regionLines.join("\n"), @@ -451,7 +509,7 @@ function chunkByAstRegions( for (let start = 0; start < regionLength; start += CHUNK_SIZE - CHUNK_OVERLAP) { const end = Math.min(start + CHUNK_SIZE, regionLength); chunks.push({ - id: chunkId(filePath, pendingStart + start + 1), + id: chunkId(relativePath, pendingStart + start + 1), filePath, relativePath, content: regionLines.slice(start, end).join("\n"), @@ -500,7 +558,7 @@ function chunkByAstRegions( const epilogueLines = lines.slice(lastEnd); if 
(epilogueLines.length > 0) { chunks.push({ - id: chunkId(filePath, lastEnd + 1), + id: chunkId(relativePath, lastEnd + 1), filePath, relativePath, content: epilogueLines.join("\n"), @@ -531,7 +589,7 @@ function chunkByLines( const chunkContent = lines.slice(start, end).join("\n"); chunks.push({ - id: chunkId(filePath, start + 1), + id: chunkId(relativePath, start + 1), filePath, relativePath, content: chunkContent, @@ -601,7 +659,7 @@ export async function indexProject( try { const projectId = projectIdFromPath(resolvedPath); const collection = collectionName(projectId); - const hashes = await getProjectHashes(projectId, collection); + const hashes = await getProjectHashes(projectId, collection, resolvedPath); // Smart re-index: check if collection already has data. // getCollectionInfo now throws on transient errors (instead of returning null), @@ -676,7 +734,7 @@ export async function indexProject( const contentHash = hashContent(content); // Skip unchanged files during re-index - if (hasExistingData && hashes.get(absolutePath) === contentHash) { + if (hasExistingData && hashes.get(relativePath) === contentHash) { return null; } @@ -701,17 +759,17 @@ export async function indexProject( // Delete old chunks for changed files progress.phase = "cleaning stale chunks"; for (const file of chunkedFiles) { - if (hashes.has(file.absolutePath)) { - await deleteFileChunks(collection, file.absolutePath); + if (hashes.has(file.relativePath)) { + await deleteFileChunks(collection, file.relativePath); } } // Handle deleted files - const currentFileSet = new Set(files.map((f) => path.join(resolvedPath, f))); - for (const [absolutePath] of hashes) { - if (!currentFileSet.has(absolutePath)) { - await deleteFileChunks(collection, absolutePath); - hashes.delete(absolutePath); + const currentFileSet = new Set(files); + for (const [filePath] of hashes) { + if (!currentFileSet.has(filePath)) { + await deleteFileChunks(collection, filePath); + hashes.delete(filePath); } } } @@ -812,7 
+870,7 @@ export async function indexProject( // Update hashes for this batch's files for (const file of fileBatch) { - hashes.set(file.absolutePath, file.contentHash); + hashes.set(file.relativePath, file.contentHash); } totalChunksCreated += batchChunkData.length; @@ -920,7 +978,7 @@ export async function updateProjectIndex( try { const projectId = projectIdFromPath(resolvedPath); const collection = collectionName(projectId); - const hashes = await getProjectHashes(projectId, collection); + const hashes = await getProjectHashes(projectId, collection, resolvedPath); // Ensure collection exists — getCollectionInfo now throws on transient errors, // so a network blip will abort rather than cascade into a destructive fallback. @@ -961,7 +1019,7 @@ export async function updateProjectIndex( const currentFiles = await getIndexableFiles(resolvedPath, extraExtensions); progress.filesTotal = currentFiles.length; onProgress?.(`Found ${currentFiles.length} indexable files, scanning for changes...`); - const currentFileSet = new Set(currentFiles.map((f) => path.join(resolvedPath, f))); + const currentFileSet = new Set(currentFiles); interface ChangedFile { relativePath: string; @@ -986,7 +1044,7 @@ export async function updateProjectIndex( } const content = await fsp.readFile(absolutePath, "utf-8"); const contentHash = hashContent(content); - const existingHash = hashes.get(absolutePath); + const existingHash = hashes.get(relativePath); if (existingHash === contentHash) return null; @@ -1015,7 +1073,7 @@ export async function updateProjectIndex( progress.phase = "cleaning stale chunks"; for (const file of changedFiles) { if (!file.isNew) { - await deleteFileChunks(collection, file.absolutePath); + await deleteFileChunks(collection, file.relativePath); } } @@ -1103,7 +1161,7 @@ export async function updateProjectIndex( // Update hashes and counts for this batch's files for (const file of fileBatch) { - hashes.set(file.absolutePath, file.contentHash); + 
hashes.set(file.relativePath, file.contentHash); if (file.isNew) added++; else updated++; } @@ -1119,10 +1177,10 @@ export async function updateProjectIndex( // Check for deleted files progress.phase = "removing deleted files"; - for (const [absolutePath] of hashes) { - if (!currentFileSet.has(absolutePath)) { - await deleteFileChunks(collection, absolutePath); - hashes.delete(absolutePath); + for (const [filePath] of hashes) { + if (!currentFileSet.has(filePath)) { + await deleteFileChunks(collection, filePath); + hashes.delete(filePath); removed++; } } diff --git a/src/services/qdrant.ts b/src/services/qdrant.ts index 1f19071..0934213 100644 --- a/src/services/qdrant.ts +++ b/src/services/qdrant.ts @@ -285,14 +285,14 @@ export async function upsertPreEmbeddedChunks( return { pointsSkipped: totalSkipped }; } -/** Delete all chunks for a specific file */ -export async function deleteFileChunks(collectionName: string, filePath: string): Promise<void> { +/** Delete all chunks for a specific file (matched by relativePath) */ +export async function deleteFileChunks(collectionName: string, relativePath: string): Promise<void> { const qdrant = getClient(); - logger.info("Deleting file chunks", { collection: collectionName, filePath }); + logger.info("Deleting file chunks", { collection: collectionName, relativePath }); await withRetry( () => qdrant.delete(collectionName, { filter: { - must: [{ key: "filePath", match: { value: filePath } }], + must: [{ key: "relativePath", match: { value: relativePath } }], }, }), "Qdrant delete chunks", diff --git a/tests/integration/qdrant.test.ts b/tests/integration/qdrant.test.ts index 0eab509..6befd45 100644 --- a/tests/integration/qdrant.test.ts +++ b/tests/integration/qdrant.test.ts @@ -207,7 +207,7 @@ describe.skipIf(!dockerAvailable)("qdrant service", () => { describe("delete file chunks", () => { it("deletes chunks for a specific file", async () => { - await deleteFileChunks(TEST_COLLECTION, "/project/lib/data.py"); + await 
deleteFileChunks(TEST_COLLECTION, "lib/data.py"); const info = await getCollectionInfo(TEST_COLLECTION); expect(info?.pointsCount).toBe(2); // 2 remaining diff --git a/tests/unit/indexer.test.ts b/tests/unit/indexer.test.ts index cae1ff8..55c9623 100644 --- a/tests/unit/indexer.test.ts +++ b/tests/unit/indexer.test.ts @@ -69,6 +69,18 @@ describe("indexer utilities", () => { const b = chunkId("/path/bar.ts", 1); expect(a).not.toBe(b); }); + + it("produces the same ID for the same relative path regardless of absolute prefix", () => { + // With relative paths as the canonical key, worktrees at different + // absolute locations produce identical chunk IDs. + const relPath = "src/index.ts"; + const a = chunkId(relPath, 1); + const b = chunkId(relPath, 1); + expect(a).toBe(b); + // And it differs from an absolute-looking path + const c = chunkId("/home/user/project/src/index.ts", 1); + expect(a).not.toBe(c); + }); }); // ── isIndexableFile ──────────────────────────────────────────