diff --git a/src/service/github_embedding/service.py b/src/service/github_embedding/service.py index 1f67690..88a18bc 100644 --- a/src/service/github_embedding/service.py +++ b/src/service/github_embedding/service.py @@ -16,6 +16,9 @@ TextEmbedderPort, run_github_embedding_pipeline, ) +from src.service.user.asset_hierarchy_sync import ( + sync_folder_project_rows_from_code_document_ids, +) class _TokenGitHubContentAdapter: @@ -121,7 +124,7 @@ async def run_github_repo_embedding_job( chroma = GitHubEmbeddingChromaAdapter( persist_dir=persist if persist is not None else None, ) - return await run_github_embedding_pipeline( + result = await run_github_embedding_pipeline( user_id=user_id, repo_full_name=repo_full_name, code_document_ids=code_document_ids, @@ -132,3 +135,9 @@ async def run_github_repo_embedding_job( chroma=chroma, include_summaries=include_summaries, ) + await sync_folder_project_rows_from_code_document_ids( + user_id=user_id, + repo_full_name=repo_full_name, + code_document_ids=code_document_ids, + ) + return result diff --git a/src/service/user/asset_hierarchy_sync.py b/src/service/user/asset_hierarchy_sync.py index aa784c9..df56199 100644 --- a/src/service/user/asset_hierarchy_sync.py +++ b/src/service/user/asset_hierarchy_sync.py @@ -1,5 +1,7 @@ """ selected_repo_assets의 code 항목을 asset_hierarchy(code)에 반영 (데모·임베딩 SSoT). + +folder/project 행은 (임베딩 결과가 아니라) code 문서 id 경로로부터 결정적으로 파생해 동기화한다. """ from __future__ import annotations @@ -8,6 +10,10 @@ from typing import Any from src.db.sqlite import connect +from src.service.github_embedding.paths import ( + collect_parent_directories, + split_chroma_document_id, +) from src.service.user.selected_assets import get_selected_repo_assets @@ -27,6 +33,8 @@ async def sync_code_rows_from_selected_assets( 해당 ``selected_repo_id``의 ``type=code`` ``asset_hierarchy`` 행을 지우고, ``selected_repo_assets`` 중 ``asset_type=code``만 다시 넣는다. + 또한 code 경로로부터 결정되는 ``type IN ('folder','project')`` 행도 함께 갱신한다. 
+ Returns: ``{"inserted": int, "ids": list[str]}`` """ @@ -48,6 +56,7 @@ async def sync_code_rows_from_selected_assets( full_name = str(row["repo_full_name"]) assets = await get_selected_repo_assets(selected_repo_id, db_path=db_path) + inserted_rel_paths: set[str] = set() await conn.execute( """ DELETE FROM asset_hierarchy @@ -64,6 +73,10 @@ async def sync_code_rows_from_selected_assets( if not isinstance(rp, str) or not rp.strip(): continue doc_id = _chroma_doc_id(full_name, rp) + # doc_id := f"{repo_full_name}/{rel}" + prefix = f"{full_name}/" + rel = doc_id[len(prefix) :] + inserted_rel_paths.add(rel) await conn.execute( """ INSERT INTO asset_hierarchy (id, selected_repo_id, type) @@ -73,8 +86,169 @@ async def sync_code_rows_from_selected_assets( ) inserted_ids.append(doc_id) + # folder/project는 code 경로로부터 결정되므로 함께 재생성한다. + await conn.execute( + """ + DELETE FROM asset_hierarchy + WHERE selected_repo_id = ? AND type IN ('folder', 'project') + """, + (selected_repo_id,), + ) + + if inserted_rel_paths: + folder_paths = collect_parent_directories(sorted(inserted_rel_paths)) + for folder in sorted(folder_paths): + folder_id = f"{full_name}/{folder}" + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES (?, ?, 'folder') + """, + (folder_id, selected_repo_id), + ) + + project_id = f"{full_name}/" + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES (?, ?, 'project') + """, + (project_id, selected_repo_id), + ) + await conn.commit() finally: await conn.close() return {"inserted": len(inserted_ids), "ids": inserted_ids} + + +def _folder_project_rows_from_code_document_ids( + repo_full_name: str, + code_document_ids: list[str], +) -> list[tuple[str, str]]: + """ + code 문서 id(`owner/repo/`)들로부터 folder/project id를 파생한다. 
+ """ + rel_paths: set[str] = set() + for doc_id in code_document_ids: + r, rel = split_chroma_document_id(doc_id) + if r != repo_full_name: + continue + if rel == "/": + # code는 project root가 될 수 없다. + continue + rel_paths.add(rel) + + if not rel_paths: + return [] + + folder_paths = collect_parent_directories(sorted(rel_paths)) + rows: list[tuple[str, str]] = [] + for folder in sorted(folder_paths): + rows.append((f"{repo_full_name}/{folder}", "folder")) + rows.append((f"{repo_full_name}/", "project")) + return rows + + +def _folder_project_rows_from_embedding_ids( + repo_full_name: str, + code_document_ids: list[str], + result_ids: list[str], +) -> list[tuple[str, str]]: + """ + ``result_ids``(Chroma id 목록)에서 folder·project만 골라 ``(id, type)`` 로 반환한다. + + code 경로는 ``code_document_ids``에서 분해한 상대 경로 집합으로 구분한다. + """ + code_rels: set[str] = set() + for doc_id in code_document_ids: + r, rel = split_chroma_document_id(doc_id) + if r == repo_full_name: + code_rels.add(rel) + + rows: list[tuple[str, str]] = [] + for did in result_ids: + r, rel = split_chroma_document_id(did) + if r != repo_full_name: + continue + if rel == "/": + rows.append((did, "project")) + elif rel in code_rels: + continue + else: + rows.append((did, "folder")) + return rows + + +async def sync_folder_project_rows_from_embedding_result( + *, + user_id: str, + repo_full_name: str, + code_document_ids: list[str], + result_ids: list[str], + db_path: str | Path | None = None, +) -> None: + """ + (호환용) folder/project를 sync할 때 Chroma `result_ids`에 의존하지 않고, + `code_document_ids` 경로로부터 결정적으로 재생성한다. 
+ """ + _ = result_ids + await sync_folder_project_rows_from_code_document_ids( + user_id=user_id, + repo_full_name=repo_full_name, + code_document_ids=code_document_ids, + db_path=db_path, + ) + + +async def sync_folder_project_rows_from_code_document_ids( + *, + user_id: str, + repo_full_name: str, + code_document_ids: list[str], + db_path: str | Path | None = None, +) -> None: + """ + `code_document_ids`로부터 folder/project id를 파생해, + 해당 ``selected_repo``의 ``type IN ('folder','project')`` 행을 DELETE 후 INSERT 한다. + """ + conn = await connect(db_path) + try: + cur = await conn.execute( + """ + SELECT id FROM selected_repos + WHERE user_id = ? AND repo_full_name = ? + """, + (user_id, repo_full_name), + ) + row = await cur.fetchone() + await cur.close() + if not row: + raise ValueError("SELECTED_REPO_NOT_FOUND") + + selected_repo_id = int(row["id"]) + + await conn.execute( + """ + DELETE FROM asset_hierarchy + WHERE selected_repo_id = ? AND type IN ('folder', 'project') + """, + (selected_repo_id,), + ) + + for doc_id, htype in _folder_project_rows_from_code_document_ids( + repo_full_name=repo_full_name, + code_document_ids=code_document_ids, + ): + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES (?, ?, ?) + """, + (doc_id, selected_repo_id, htype), + ) + + await conn.commit() + finally: + await conn.close() diff --git a/tests/api/test_selected_repo_assets_api.py b/tests/api/test_selected_repo_assets_api.py index f835d27..4ee3308 100644 --- a/tests/api/test_selected_repo_assets_api.py +++ b/tests/api/test_selected_repo_assets_api.py @@ -243,22 +243,26 @@ def test_asset_hierarchy_sync_inserts_code_rows( assert body["inserted"] == 1 assert body["ids"] == ["owner/repo-a/src/a.py"] - async def check_db() -> list[str]: + async def check_db() -> list[tuple[str, str]]: conn = await connect(db_with_selected_repos["db_path"]) try: cur = await conn.execute( """ - SELECT id FROM asset_hierarchy - WHERE selected_repo_id = ? 
AND type = 'code' - ORDER BY id + SELECT id, type FROM asset_hierarchy + WHERE selected_repo_id = ? + ORDER BY type ASC, id ASC """, (sid,), ) rows = await cur.fetchall() await cur.close() - return [r["id"] for r in rows] + return [(r["id"], r["type"]) for r in rows] finally: await conn.close() - assert _run(check_db()) == ["owner/repo-a/src/a.py"] + assert _run(check_db()) == [ + ("owner/repo-a/src/a.py", "code"), + ("owner/repo-a/src", "folder"), + ("owner/repo-a/", "project"), + ] diff --git a/tests/service/test_asset_hierarchy_sync_folder_project.py b/tests/service/test_asset_hierarchy_sync_folder_project.py new file mode 100644 index 0000000..0740e9b --- /dev/null +++ b/tests/service/test_asset_hierarchy_sync_folder_project.py @@ -0,0 +1,168 @@ +"""asset_hierarchy folder/project 동기화 (code document ids).""" + +from __future__ import annotations + +import asyncio + +from src.db.sqlite import connect, create_all_tables_async +from src.service.user.asset_hierarchy_sync import sync_folder_project_rows_from_code_document_ids + + +def _run(coro): + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +def test_sync_folder_project_replaces_rows(tmp_path) -> None: + db = tmp_path / "t.db" + + async def setup() -> int: + conn = await connect(db) + await create_all_tables_async(conn) + await conn.execute( + """ + INSERT INTO users (id, github_username, access_token, created_at, updated_at) + VALUES ('u1', 'x', 'tok', '2026-01-01', '2026-01-01') + """, + ) + await conn.execute( + """ + INSERT INTO selected_repos (user_id, repo_full_name, created_at) + VALUES ('u1', 'o/r', '2026-01-01') + """, + ) + cur = await conn.execute( + "SELECT id FROM selected_repos WHERE user_id = ? 
AND repo_full_name = ?", + ("u1", "o/r"), + ) + sr_id = int((await cur.fetchone())["id"]) + await cur.close() + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES ('o/r/src/a.py', ?, 'code') + """, + (sr_id,), + ) + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES ('o/r/old', ?, 'folder') + """, + (sr_id,), + ) + await conn.commit() + await conn.close() + return sr_id + + sr_id = _run(setup()) + + async def sync() -> None: + await sync_folder_project_rows_from_code_document_ids( + user_id="u1", + repo_full_name="o/r", + code_document_ids=["o/r/src/a.py"], + db_path=db, + ) + + _run(sync()) + + async def verify() -> list[tuple[str, str]]: + conn = await connect(db) + try: + cur = await conn.execute( + """ + SELECT id, type FROM asset_hierarchy + WHERE selected_repo_id = ? + ORDER BY type ASC, id ASC + """, + (sr_id,), + ) + rows = await cur.fetchall() + await cur.close() + finally: + await conn.close() + return [(str(r["id"]), str(r["type"])) for r in rows] + + assert _run(verify()) == [ + ("o/r/src/a.py", "code"), + ("o/r/src", "folder"), + ("o/r/", "project"), + ] + + +def test_sync_folder_project_empty_result_deletes_only_folder_project(tmp_path) -> None: + db = tmp_path / "e.db" + + async def setup() -> int: + conn = await connect(db) + await create_all_tables_async(conn) + await conn.execute( + """ + INSERT INTO users (id, github_username, access_token, created_at, updated_at) + VALUES ('u1', 'x', 'tok', '2026-01-01', '2026-01-01') + """, + ) + await conn.execute( + """ + INSERT INTO selected_repos (user_id, repo_full_name, created_at) + VALUES ('u1', 'o/r', '2026-01-01') + """, + ) + cur = await conn.execute( + "SELECT id FROM selected_repos WHERE user_id = ? 
AND repo_full_name = ?", + ("u1", "o/r"), + ) + sr_id = int((await cur.fetchone())["id"]) + await cur.close() + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES ('o/r/x.py', ?, 'code') + """, + (sr_id,), + ) + await conn.execute( + """ + INSERT INTO asset_hierarchy (id, selected_repo_id, type) + VALUES ('o/r/legacy', ?, 'folder') + """, + (sr_id,), + ) + await conn.commit() + await conn.close() + return sr_id + + sr_id = _run(setup()) + + async def sync() -> None: + await sync_folder_project_rows_from_code_document_ids( + user_id="u1", + repo_full_name="o/r", + code_document_ids=[], + db_path=db, + ) + + _run(sync()) + + async def verify() -> list[tuple[str, str]]: + conn = await connect(db) + try: + cur = await conn.execute( + """ + SELECT id, type FROM asset_hierarchy + WHERE selected_repo_id = ? + ORDER BY id ASC + """, + (sr_id,), + ) + rows = await cur.fetchall() + await cur.close() + finally: + await conn.close() + return [(str(r["id"]), str(r["type"])) for r in rows] + + assert _run(verify()) == [("o/r/x.py", "code")] diff --git a/week-issues/week4-asset-hierarchy-embedding.md b/week-issues/week4-asset-hierarchy-embedding.md new file mode 100644 index 0000000..20ba297 --- /dev/null +++ b/week-issues/week4-asset-hierarchy-embedding.md @@ -0,0 +1,38 @@ +# asset_hierarchy · GitHub 임베딩 연동 (설계 메모) + +`asset_hierarchy` 테이블(`id`, `selected_repo_id`, `type`)과 임베딩 파이프라인이 어떻게 맞물리는지 정리한다. + +## 스키마 (`type`: `code` | `folder` | `project`) + +| `type` | 채우는 경로 | +|--------|-------------| +| **code** | `sync_code_rows_from_selected_assets(selected_repo_id)` — `selected_repo_assets`에 저장된 **파일(code)** 경로를 Chroma 문서 id 형식 `owner/repo/상대경로`로 INSERT | +| **folder** / **project** | 임베딩 잡 완료 후 `sync_folder_project_rows_from_code_document_ids` — `code_document_ids`(코드 경로)로부터 부모 디렉터리들(**folder**)과 프로젝트 루트(**project**)를 파생해 DB 반영. | + +- **code**는 “에셋 저장·동기화” 흐름에서 먼저 채워진다. 
+- **folder** / **project**는 임베딩이 끝난 뒤, `code_document_ids` 경로로부터 재생성되어 동기화된다. + +## 임베딩 파이프라인 + +- 입력: `code_document_ids` — 보통 `asset_hierarchy`의 `type=code` id 목록(`fetch_code_document_ids_for_repo`) 또는 요청 body. +- 파일 원문: `run_github_repo_embedding_job` → `_TokenGitHubContentAdapter` → `get_repo_content` (**GitHub Contents API**). +- 순서: 선택된 파일들 임베딩 → 경로 `/` 기준 **부모 디렉터리 수집** → **bottom-up** 폴더 요약·임베딩 → **프로젝트(루트)** 요약·임베딩. +- 저장: Chroma(유저·레포 메타데이터), 잡 종료 시 `asset_hierarchy`의 folder/project 행 갱신. + +## `selected_repo_id`와의 대응 + +- 잡 API는 `user_id` + `repo_full_name`으로 동작한다. +- DB 기록 시 `selected_repos`에서 `user_id` + `repo_full_name`으로 `selected_repo_id`를 조회해 `asset_hierarchy`에 넣는다. +- 한 유저·레포당 `selected_repos` 행이 하나면 **레포 단위 ≈ selected_repo_id 단위**로 보면 된다. + +## 운영 시 전제 + +1. 임베딩을 돌리려면 먼저 **`type=code` 행**이 있어야 한다 (`NO_CODE_ASSETS_IN_HIERARCHY` 방지). 보통 **selected assets 저장 후** `sync_code_rows_from_selected_assets`로 맞춘다. +2. 파일 원문은 매번 GitHub API로 가져온다. **로컬 캐시/선저장**은 추후 성능 이슈 시 검토. + +## 관련 코드 + +- `src/service/user/asset_hierarchy_sync.py` — code 동기화, folder/project 동기화 +- `src/service/github_embedding/service.py` — 임베딩 잡 + 잡 종료 시 folder/project DB 동기화 +- `src/service/github_embedding/pipeline.py` — 파일 → 폴더 → 프로젝트 순 파이프라인 +- `src/service/github_embedding/hierarchy.py` — `fetch_code_document_ids_for_repo` diff --git a/week-issues/week4-pr-asset-hierarchy.md b/week-issues/week4-pr-asset-hierarchy.md new file mode 100644 index 0000000..8328afc --- /dev/null +++ b/week-issues/week4-pr-asset-hierarchy.md @@ -0,0 +1,40 @@ +# PR 초안: asset_hierarchy ↔ 임베딩 동기화 + +## Compare & PR 생성 URL + +**Base: `main` ← Head: `week4/asset-hierarchy-embedding-sync`** + +https://github.com/kocory1/AutoPolio/compare/main...week4/asset-hierarchy-embedding-sync?expand=1 + +--- + +## 제목 + +``` +feat: asset_hierarchy에 folder/project 동기화 (code path 기반) +``` + +--- + +## 본문 (복사용) + +```markdown +## Summary +- 임베딩 잡(`run_github_repo_embedding_job`) 완료 후 **`code_document_ids`(코드 경로)**로부터 결정적으로 
**folder·project** id를 재생성해 `asset_hierarchy`를 갱신한다 (기존 `folder`/`project` 행 DELETE 후 INSERT). +- **`code`** 행은 기존과 같이 `sync_code_rows_from_selected_assets`로만 채운다. +- **`folder/project`**는 Chroma `result["ids"]`가 아니라 **code 경로(부모 디렉터리 + project root)**로 파생한다. +- 설계·운영 전제는 `week-issues/week4-asset-hierarchy-embedding.md` 참고. + +## Changes +- `src/service/user/asset_hierarchy_sync.py`: `sync_folder_project_rows_from_code_document_ids` +- `src/service/github_embedding/service.py`: 파이프라인 직후 위 동기화 호출 +- `tests/service/test_asset_hierarchy_sync_folder_project.py`: 동기화 단위 테스트 + +## How to test +~~~bash +poetry run pytest -q +~~~ + +## Related doc +- `week-issues/week4-asset-hierarchy-embedding.md` +```