Skip to content

Commit 5c4682d

Browse files
committed
fix: derive folder/project hierarchy from code paths
Stop syncing asset_hierarchy folder/project rows from Chroma result ids and derive them deterministically from code document paths so tree-stage DB state and embedding flow stay consistent. Made-with: Cursor
1 parent 4d0559d commit 5c4682d

6 files changed

Lines changed: 125 additions & 40 deletions

File tree

src/service/github_embedding/service.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
run_github_embedding_pipeline,
1818
)
1919
from src.service.user.asset_hierarchy_sync import (
20-
sync_folder_project_rows_from_embedding_result,
20+
sync_folder_project_rows_from_code_document_ids,
2121
)
2222

2323

@@ -135,12 +135,9 @@ async def run_github_repo_embedding_job(
135135
chroma=chroma,
136136
include_summaries=include_summaries,
137137
)
138-
ids_raw = result.get("ids")
139-
if isinstance(ids_raw, list):
140-
await sync_folder_project_rows_from_embedding_result(
141-
user_id=user_id,
142-
repo_full_name=repo_full_name,
143-
code_document_ids=code_document_ids,
144-
result_ids=[str(x) for x in ids_raw],
145-
)
138+
await sync_folder_project_rows_from_code_document_ids(
139+
user_id=user_id,
140+
repo_full_name=repo_full_name,
141+
code_document_ids=code_document_ids,
142+
)
146143
return result

src/service/user/asset_hierarchy_sync.py

Lines changed: 95 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
selected_repo_assets의 code 항목을 asset_hierarchy(code)에 반영 (데모·임베딩 SSoT).
33
4-
임베딩 잡 완료 후 folder/project 행을 Chroma ``ids``와 맞추는 동기화도 제공한다.
4+
folder/project 행은 (임베딩 결과가 아니라) code 문서 id 경로로부터 결정적으로 파생해 동기화한다.
55
"""
66

77
from __future__ import annotations
@@ -10,7 +10,10 @@
1010
from typing import Any
1111

1212
from src.db.sqlite import connect
13-
from src.service.github_embedding.paths import split_chroma_document_id
13+
from src.service.github_embedding.paths import (
14+
collect_parent_directories,
15+
split_chroma_document_id,
16+
)
1417
from src.service.user.selected_assets import get_selected_repo_assets
1518

1619

@@ -30,6 +33,8 @@ async def sync_code_rows_from_selected_assets(
3033
해당 ``selected_repo_id``의 ``type=code`` ``asset_hierarchy`` 행을 지우고,
3134
``selected_repo_assets`` 중 ``asset_type=code``만 다시 넣는다.
3235
36+
또한 code 경로로부터 결정되는 ``type IN ('folder','project')`` 행도 함께 갱신한다.
37+
3338
Returns:
3439
``{"inserted": int, "ids": list[str]}``
3540
"""
@@ -51,6 +56,7 @@ async def sync_code_rows_from_selected_assets(
5156
full_name = str(row["repo_full_name"])
5257
assets = await get_selected_repo_assets(selected_repo_id, db_path=db_path)
5358

59+
inserted_rel_paths: set[str] = set()
5460
await conn.execute(
5561
"""
5662
DELETE FROM asset_hierarchy
@@ -67,6 +73,10 @@ async def sync_code_rows_from_selected_assets(
6773
if not isinstance(rp, str) or not rp.strip():
6874
continue
6975
doc_id = _chroma_doc_id(full_name, rp)
76+
# doc_id := f"{repo_full_name}/{rel}"
77+
prefix = f"{full_name}/"
78+
rel = doc_id[len(prefix) :]
79+
inserted_rel_paths.add(rel)
7080
await conn.execute(
7181
"""
7282
INSERT INTO asset_hierarchy (id, selected_repo_id, type)
@@ -76,13 +86,71 @@ async def sync_code_rows_from_selected_assets(
7686
)
7787
inserted_ids.append(doc_id)
7888

89+
# folder/project는 code 경로로부터 결정되므로 함께 재생성한다.
90+
await conn.execute(
91+
"""
92+
DELETE FROM asset_hierarchy
93+
WHERE selected_repo_id = ? AND type IN ('folder', 'project')
94+
""",
95+
(selected_repo_id,),
96+
)
97+
98+
if inserted_rel_paths:
99+
folder_paths = collect_parent_directories(sorted(inserted_rel_paths))
100+
for folder in sorted(folder_paths):
101+
folder_id = f"{full_name}/{folder}"
102+
await conn.execute(
103+
"""
104+
INSERT INTO asset_hierarchy (id, selected_repo_id, type)
105+
VALUES (?, ?, 'folder')
106+
""",
107+
(folder_id, selected_repo_id),
108+
)
109+
110+
project_id = f"{full_name}/"
111+
await conn.execute(
112+
"""
113+
INSERT INTO asset_hierarchy (id, selected_repo_id, type)
114+
VALUES (?, ?, 'project')
115+
""",
116+
(project_id, selected_repo_id),
117+
)
118+
79119
await conn.commit()
80120
finally:
81121
await conn.close()
82122

83123
return {"inserted": len(inserted_ids), "ids": inserted_ids}
84124

85125

126+
def _folder_project_rows_from_code_document_ids(
127+
repo_full_name: str,
128+
code_document_ids: list[str],
129+
) -> list[tuple[str, str]]:
130+
"""
131+
code 문서 id(`owner/repo/<rel_path>`)들로부터 folder/project id를 파생한다.
132+
"""
133+
rel_paths: set[str] = set()
134+
for doc_id in code_document_ids:
135+
r, rel = split_chroma_document_id(doc_id)
136+
if r != repo_full_name:
137+
continue
138+
if rel == "/":
139+
# code는 project root가 될 수 없다.
140+
continue
141+
rel_paths.add(rel)
142+
143+
if not rel_paths:
144+
return []
145+
146+
folder_paths = collect_parent_directories(sorted(rel_paths))
147+
rows: list[tuple[str, str]] = []
148+
for folder in sorted(folder_paths):
149+
rows.append((f"{repo_full_name}/{folder}", "folder"))
150+
rows.append((f"{repo_full_name}/", "project"))
151+
return rows
152+
153+
86154
def _folder_project_rows_from_embedding_ids(
87155
repo_full_name: str,
88156
code_document_ids: list[str],
@@ -122,8 +190,28 @@ async def sync_folder_project_rows_from_embedding_result(
122190
db_path: str | Path | None = None,
123191
) -> None:
124192
"""
125-
``result_ids``에서 folder/project id만 추출한 뒤,
126-
해당 ``selected_repo``의 ``type IN ('folder','project')`` 행을 삭제하고 다시 INSERT 한다.
193+
(호환용) folder/project를 sync할 때 Chroma `result_ids`에 의존하지 않고,
194+
`code_document_ids` 경로로부터 결정적으로 재생성한다.
195+
"""
196+
_ = result_ids
197+
await sync_folder_project_rows_from_code_document_ids(
198+
user_id=user_id,
199+
repo_full_name=repo_full_name,
200+
code_document_ids=code_document_ids,
201+
db_path=db_path,
202+
)
203+
204+
205+
async def sync_folder_project_rows_from_code_document_ids(
206+
*,
207+
user_id: str,
208+
repo_full_name: str,
209+
code_document_ids: list[str],
210+
db_path: str | Path | None = None,
211+
) -> None:
212+
"""
213+
`code_document_ids`로부터 folder/project id를 파생해,
214+
해당 ``selected_repo``의 ``type IN ('folder','project')`` 행을 DELETE 후 INSERT 한다.
127215
"""
128216
conn = await connect(db_path)
129217
try:
@@ -149,10 +237,9 @@ async def sync_folder_project_rows_from_embedding_result(
149237
(selected_repo_id,),
150238
)
151239

152-
for doc_id, htype in _folder_project_rows_from_embedding_ids(
153-
repo_full_name,
154-
code_document_ids,
155-
result_ids,
240+
for doc_id, htype in _folder_project_rows_from_code_document_ids(
241+
repo_full_name=repo_full_name,
242+
code_document_ids=code_document_ids,
156243
):
157244
await conn.execute(
158245
"""

tests/api/test_selected_repo_assets_api.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,22 +243,26 @@ def test_asset_hierarchy_sync_inserts_code_rows(
243243
assert body["inserted"] == 1
244244
assert body["ids"] == ["owner/repo-a/src/a.py"]
245245

246-
async def check_db() -> list[str]:
246+
async def check_db() -> list[tuple[str, str]]:
247247
conn = await connect(db_with_selected_repos["db_path"])
248248
try:
249249
cur = await conn.execute(
250250
"""
251-
SELECT id FROM asset_hierarchy
252-
WHERE selected_repo_id = ? AND type = 'code'
253-
ORDER BY id
251+
SELECT id, type FROM asset_hierarchy
252+
WHERE selected_repo_id = ?
253+
ORDER BY type ASC, id ASC
254254
""",
255255
(sid,),
256256
)
257257
rows = await cur.fetchall()
258258
await cur.close()
259-
return [r["id"] for r in rows]
259+
return [(r["id"], r["type"]) for r in rows]
260260
finally:
261261
await conn.close()
262262

263-
assert _run(check_db()) == ["owner/repo-a/src/a.py"]
263+
assert _run(check_db()) == [
264+
("owner/repo-a/src/a.py", "code"),
265+
("owner/repo-a/src", "folder"),
266+
("owner/repo-a/", "project"),
267+
]
264268

tests/service/test_asset_hierarchy_sync_folder_project.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
"""asset_hierarchy folder/project 동기화 (임베딩 result ids)."""
1+
"""asset_hierarchy folder/project 동기화 (code document ids)."""
22

33
from __future__ import annotations
44

55
import asyncio
66

77
from src.db.sqlite import connect, create_all_tables_async
8-
from src.service.user.asset_hierarchy_sync import sync_folder_project_rows_from_embedding_result
8+
from src.service.user.asset_hierarchy_sync import sync_folder_project_rows_from_code_document_ids
99

1010

1111
def _run(coro):
@@ -43,7 +43,7 @@ async def setup() -> int:
4343
await conn.execute(
4444
"""
4545
INSERT INTO asset_hierarchy (id, selected_repo_id, type)
46-
VALUES ('o/r/a.py', ?, 'code')
46+
VALUES ('o/r/src/a.py', ?, 'code')
4747
""",
4848
(sr_id,),
4949
)
@@ -60,14 +60,11 @@ async def setup() -> int:
6060

6161
sr_id = _run(setup())
6262

63-
result_ids = ["o/r/a.py", "o/r/src", "o/r/"]
64-
6563
async def sync() -> None:
66-
await sync_folder_project_rows_from_embedding_result(
64+
await sync_folder_project_rows_from_code_document_ids(
6765
user_id="u1",
6866
repo_full_name="o/r",
69-
code_document_ids=["o/r/a.py"],
70-
result_ids=result_ids,
67+
code_document_ids=["o/r/src/a.py"],
7168
db_path=db,
7269
)
7370

@@ -91,7 +88,7 @@ async def verify() -> list[tuple[str, str]]:
9188
return [(str(r["id"]), str(r["type"])) for r in rows]
9289

9390
assert _run(verify()) == [
94-
("o/r/a.py", "code"),
91+
("o/r/src/a.py", "code"),
9592
("o/r/src", "folder"),
9693
("o/r/", "project"),
9794
]
@@ -142,11 +139,10 @@ async def setup() -> int:
142139
sr_id = _run(setup())
143140

144141
async def sync() -> None:
145-
await sync_folder_project_rows_from_embedding_result(
142+
await sync_folder_project_rows_from_code_document_ids(
146143
user_id="u1",
147144
repo_full_name="o/r",
148145
code_document_ids=[],
149-
result_ids=[],
150146
db_path=db,
151147
)
152148

week-issues/week4-asset-hierarchy-embedding.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
| `type` | 채우는 경로 |
88
|--------|-------------|
99
| **code** | `sync_code_rows_from_selected_assets(selected_repo_id)` — `selected_repo_assets`에 저장된 **파일(code)** 경로를 Chroma 문서 id 형식 `owner/repo/상대경로`로 INSERT |
10-
| **folder** / **project** | 임베딩 잡 완료 후 `sync_folder_project_rows_from_embedding_result` — 파이프라인이 Chroma에 넣은 `result["ids"]`에서 code가 아닌 id만 골라 DB 반영. `rel == '/'` → **project**, 그 외 상위 폴더 경로 → **folder** |
10+
| **folder** / **project** | 임베딩 잡 완료 후 `sync_folder_project_rows_from_code_document_ids` — `code_document_ids`(코드 경로)로부터 부모 디렉터리들(**folder**)과 프로젝트 루트(**project**)를 파생해 DB 반영. |
1111

1212
- **code**는 “에셋 저장·동기화” 흐름에서 먼저 채워진다.
13-
- **folder** / **project**는 임베딩이 끝난 뒤, Chroma에 생성된 id 목록과 동기화된다.
13+
- **folder** / **project**는 임베딩이 끝난 뒤, `code_document_ids` 경로로부터 재생성되어 동기화된다.
1414

1515
## 임베딩 파이프라인
1616

week-issues/week4-pr-asset-hierarchy.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ https://github.com/kocory1/AutoPolio/compare/main...week4/asset-hierarchy-embedd
1111
## 제목
1212

1313
```
14-
feat: asset_hierarchy에 folder/project 동기화 (임베딩 result ids)
14+
feat: asset_hierarchy에 folder/project 동기화 (code path 기반)
1515
```
1616

1717
---
@@ -20,12 +20,13 @@ feat: asset_hierarchy에 folder/project 동기화 (임베딩 result ids)
2020

2121
```markdown
2222
## Summary
23-
- 임베딩 잡(`run_github_repo_embedding_job`) 완료 후 Chroma `result["ids"]`에서 **folder·project** id만 추출해 `asset_hierarchy`를 갱신한다 (기존 `folder`/`project` 행 DELETE 후 INSERT).
24-
- **`code`** 행은 기존과 같이 `sync_code_rows_from_selected_assets`로만 채운다. **folder/project**는 파이프라인이 만든 Chroma id와 DB를 맞춘다.
23+
- 임베딩 잡(`run_github_repo_embedding_job`) 완료 후 **`code_document_ids`(코드 경로)**로부터 결정적으로 **folder·project** id를 재생성해 `asset_hierarchy`를 갱신한다 (기존 `folder`/`project` 행 DELETE 후 INSERT).
24+
- **`code`** 행은 기존과 같이 `sync_code_rows_from_selected_assets`로만 채운다.
25+
- **`folder/project`**는 Chroma `result["ids"]`가 아니라 **code 경로(부모 디렉터리 + project root)**로 파생한다.
2526
- 설계·운영 전제는 `week-issues/week4-asset-hierarchy-embedding.md` 참고.
2627

2728
## Changes
28-
- `src/service/user/asset_hierarchy_sync.py`: `sync_folder_project_rows_from_embedding_result`, `_folder_project_rows_from_embedding_ids`
29+
- `src/service/user/asset_hierarchy_sync.py`: `sync_folder_project_rows_from_code_document_ids`
2930
- `src/service/github_embedding/service.py`: 파이프라인 직후 위 동기화 호출
3031
- `tests/service/test_asset_hierarchy_sync_folder_project.py`: 동기화 단위 테스트
3132

0 commit comments

Comments
 (0)