Skip to content

Commit edbdbbe

Browse files
committed
feat: add China Lens persistence adapters
1 parent 4f0bb28 commit edbdbbe

11 files changed

Lines changed: 1082 additions & 64 deletions

File tree

.beads/issues.jsonl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,12 @@
4747
{"id":"edgarpack-lb1.1","title":"China Lens frontend shell: Next workspace, command palette, pack + evidence UI","description":"Create web/ Next.js shell with workspace routes, commandable navigation, citation pills, and Evidence Explorer layout with keyboard shortcuts.","status":"closed","priority":1,"issue_type":"feature","owner":"samay58@gmail.com","created_at":"2026-02-22T01:16:53.249494-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:34:49.039992-08:00","closed_at":"2026-02-22T01:34:49.039992-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.1","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:16:53.250388-08:00","created_by":"Samay Dhawan"}]}
4848
{"id":"edgarpack-lb1.10","title":"Enable CORS for local China Lens frontend-to-API workflow","description":"Add CORSMiddleware for localhost origins so Next.js dev app can execute POST/OPTIONS calls to FastAPI without preflight failures.","status":"closed","priority":1,"issue_type":"bug","owner":"samay58@gmail.com","created_at":"2026-02-22T01:52:46.792977-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:53:17.645284-08:00","closed_at":"2026-02-22T01:53:17.645284-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.10","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:52:46.793802-08:00","created_by":"Samay Dhawan"}]}
4949
{"id":"edgarpack-lb1.11","title":"China Lens CNINFO live fetch + page image rendering","description":"Add network-backed CNINFO listing/download adapter (behind retries and deterministic fixtures) and produce stable page image URLs/thumbnails for document viewer rendering; preserve current manifest-path fallback for offline/test runs.","status":"open","priority":2,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-03-11T00:24:51.158891-07:00","created_by":"Samay Dhawan","updated_at":"2026-03-11T00:24:51.158891-07:00","dependencies":[{"issue_id":"edgarpack-lb1.11","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-03-11T00:24:51.159513-07:00","created_by":"Samay Dhawan"}]}
50+
{"id":"edgarpack-lb1.12","title":"China Lens pgvector retrieval and DB-native evidence search","description":"Use the PostgreSQL persistence adapter as the retrieval backend instead of Python-side lexical ranking. Add pgvector similarity search, hybrid lexical+vector ranking, and parity tests so evidence search can scale without loading all chunks into process memory.","status":"open","priority":2,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-03-11T00:42:56.477371-07:00","created_by":"Samay Dhawan","updated_at":"2026-03-11T00:42:56.477371-07:00","dependencies":[{"issue_id":"edgarpack-lb1.12","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-03-11T00:42:56.477876-07:00","created_by":"Samay Dhawan"}]}
5051
{"id":"edgarpack-lb1.2","title":"China Lens contracts + QA: validators, tests, and implementation spec doc","description":"Add citation gating validators, unit tests, API smoke tests, and docs/China-Lens implementation tracking spec.","status":"closed","priority":1,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-02-22T01:16:58.339951-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:34:49.094526-08:00","closed_at":"2026-02-22T01:34:49.094526-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.2","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:16:58.340772-08:00","created_by":"Samay Dhawan"}]}
5152
{"id":"edgarpack-lb1.3","title":"China Lens backend skeleton: models, job service, and FastAPI routes","description":"Implement initial China Lens backend surface in edgarpack: domain models, in-memory services, and FastAPI endpoints for packs, documents, evidence, ask, citations, and CNINFO sync.","status":"closed","priority":1,"issue_type":"feature","owner":"samay58@gmail.com","created_at":"2026-02-22T01:16:58.340052-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:34:49.01209-08:00","closed_at":"2026-02-22T01:34:49.01209-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.3","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:16:58.341406-08:00","created_by":"Samay Dhawan"}]}
5253
{"id":"edgarpack-lb1.4","title":"China Lens production hardening: auth, async jobs, and observability","description":"Add single-tenant auth guard, durable background job runner, cancellation guarantees, metrics, and failure-mode dashboards for pack jobs and evidence search.","status":"open","priority":2,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-02-22T01:35:01.576653-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:35:01.576653-08:00","dependencies":[{"issue_id":"edgarpack-lb1.4","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:35:01.578063-08:00","created_by":"Samay Dhawan"}]}
5354
{"id":"edgarpack-lb1.5","title":"China Lens real ingestion: CNINFO acquisition + PDF extraction/OCR pipeline","description":"Implement production connector and extraction pipeline: real CNINFO retrieval, page rendering, embedded text extraction with OCR fallback, and chunk indexing with provenance.","status":"closed","priority":1,"issue_type":"feature","owner":"samay58@gmail.com","created_at":"2026-02-22T01:35:01.57662-08:00","created_by":"Samay Dhawan","updated_at":"2026-03-11T00:24:30.232433-07:00","closed_at":"2026-03-11T00:24:30.232433-07:00","close_reason":"Implemented manifest-driven CNINFO ingestion, date-window filtering, local PDF embedded-text extraction with OCR fallback markers, provenance-preserving chunk indexing, and API validation/error handling with tests.","dependencies":[{"issue_id":"edgarpack-lb1.5","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:35:01.577556-08:00","created_by":"Samay Dhawan"}]}
54-
{"id":"edgarpack-lb1.6","title":"China Lens persistence layer: Postgres/pgvector + object storage adapters","description":"Replace in-memory ChinaLensService storage with repository adapters backed by Postgres and object storage while preserving existing API contracts and citation invariants.","status":"open","priority":1,"issue_type":"feature","owner":"samay58@gmail.com","created_at":"2026-02-22T01:35:06.610187-08:00","created_by":"Samay Dhawan","updated_at":"2026-03-11T00:24:39.344358-07:00","dependencies":[{"issue_id":"edgarpack-lb1.6","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:35:06.61162-08:00","created_by":"Samay Dhawan"}]}
55+
{"id":"edgarpack-lb1.6","title":"China Lens persistence layer: Postgres/pgvector + object storage adapters","description":"Replace in-memory ChinaLensService storage with repository adapters backed by Postgres and object storage while preserving existing API contracts and citation invariants.","status":"closed","priority":1,"issue_type":"feature","owner":"samay58@gmail.com","created_at":"2026-02-22T01:35:06.610187-08:00","created_by":"Samay Dhawan","updated_at":"2026-03-11T00:42:56.44773-07:00","closed_at":"2026-03-11T00:42:56.44773-07:00","close_reason":"Moved China Lens state behind repository/object-store adapters, added durable JSON/local object-store backend, wired environment-based backend selection, and added PostgreSQL repository support while preserving API contracts and tests.","dependencies":[{"issue_id":"edgarpack-lb1.6","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:35:06.61162-08:00","created_by":"Samay Dhawan"}]}
5556
{"id":"edgarpack-lb1.7","title":"China Lens web dependency hardening: address npm audit vulnerabilities","description":"Upgrade or pin web dependencies to resolve high/critical advisories reported by npm audit while preserving Next.js compatibility and build stability.","status":"open","priority":2,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-02-22T01:36:35.409413-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:36:35.409413-08:00","dependencies":[{"issue_id":"edgarpack-lb1.7","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:36:35.410331-08:00","created_by":"Samay Dhawan"}]}
5657
{"id":"edgarpack-lb1.8","title":"Fix China Lens FastAPI dependency injection to prevent 422 responses","description":"Import fastapi.Request at runtime in API dependency helper so FastAPI resolves request injection correctly instead of treating it as a missing query parameter.","status":"closed","priority":1,"issue_type":"bug","owner":"samay58@gmail.com","created_at":"2026-02-22T01:42:06.173404-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:42:22.833527-08:00","closed_at":"2026-02-22T01:42:22.833527-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.8","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:42:06.174189-08:00","created_by":"Samay Dhawan"}]}
5758
{"id":"edgarpack-lb1.9","title":"China Lens UX/perf polish: EN-first evidence mode and faster citation interactions","description":"Improve frontend usefulness and speed: default English-first evidence reading, citation resolve caching, section filtering, cleaner controls, and denser layout without clutter.","status":"closed","priority":1,"issue_type":"task","owner":"samay58@gmail.com","created_at":"2026-02-22T01:49:18.851827-08:00","created_by":"Samay Dhawan","updated_at":"2026-02-22T01:49:52.452675-08:00","closed_at":"2026-02-22T01:49:52.452675-08:00","close_reason":"Closed","dependencies":[{"issue_id":"edgarpack-lb1.9","depends_on_id":"edgarpack-lb1","type":"parent-child","created_at":"2026-02-22T01:49:18.852439-08:00","created_by":"Samay Dhawan"}]}

README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ edgarpack site --packs ./packs --out ./site
116116
uv pip install -e ".[china]"
117117
edgarpack api --host 127.0.0.1 --port 8000
118118

119+
# Durable local China Lens backend (JSON repo + local object store)
120+
export EDGARPACK_CHINA_STORAGE_BACKEND=json
121+
export EDGARPACK_CHINA_STORAGE_DIR="$PWD/.local/china-repo"
122+
export EDGARPACK_CHINA_OBJECT_STORE_DIR="$PWD/.local/china-objects"
123+
edgarpack api --host 127.0.0.1 --port 8000
124+
119125
# CNINFO manifest sync (local deterministic ingestion)
120126
curl -X POST http://127.0.0.1:8000/api/v1/connectors/cninfo/sync \
121127
-H "content-type: application/json" \
@@ -153,6 +159,53 @@ Useful query parameters:
153159
For full behavior and field semantics, see `docs/OBSERVATORY.md`.
154160
For onboarding and module-system mapping, see `docs/OBSERVATORY-EXPLAINER.md`.
155161

162+
## China Lens Practical Use
163+
164+
The current China Lens stack is useful today for three workflows:
165+
166+
1. Deterministic citation-backed workspace dev: run the API, use the seeded Tencent fixtures, and exercise `/companies`, `/documents`, `/evidence/search`, `/citations/resolve`, `/packs`, and `/ask`.
167+
2. Local ingestion from a manifest: point `POST /api/v1/connectors/cninfo/sync` at a JSON manifest containing document metadata plus either page snippets or a local PDF path.
168+
3. Durable local review loop: set `EDGARPACK_CHINA_STORAGE_BACKEND=json`, `EDGARPACK_CHINA_STORAGE_DIR`, and `EDGARPACK_CHINA_OBJECT_STORE_DIR` so ingested docs, chunks, packs, and jobs survive process restarts.
169+
170+
Minimal manifest example:
171+
172+
```json
173+
{
174+
"company_id": "cmp_tencent_0700",
175+
"documents": [
176+
{
177+
"doc_id": "doc_tencent_2025_board",
178+
"title": "Tencent 2025 Board Update",
179+
"filing_date": "2025-04-01",
180+
"source_url": "https://www.cninfo.com.cn/mock/tencent-2025-board.pdf",
181+
"pages": 12,
182+
"local_pdf_path": "./fixtures/tencent-2025-board.pdf",
183+
"snippets": [
184+
{
185+
"page": 3,
186+
"text_zh": "董事会成员调整,新增两名独立董事。",
187+
"text_en": "Board composition changed with two new independent directors."
188+
}
189+
]
190+
}
191+
]
192+
}
193+
```
194+
195+
Useful inspection calls after sync:
196+
197+
```bash
198+
curl "http://127.0.0.1:8000/api/v1/documents?company_id=cmp_tencent_0700"
199+
curl -X POST http://127.0.0.1:8000/api/v1/evidence/search \
200+
-H "content-type: application/json" \
201+
-d '{"company_id":"cmp_tencent_0700","query":"independent directors"}'
202+
curl -X POST http://127.0.0.1:8000/api/v1/packs \
203+
-H "content-type: application/json" \
204+
-d '{"company_id":"cmp_tencent_0700"}'
205+
```
206+
207+
If you want a production database instead of local JSON files, set `EDGARPACK_CHINA_STORAGE_BACKEND=postgres` and `EDGARPACK_CHINA_POSTGRES_DSN`. The repository adapter is wired for PostgreSQL JSONB persistence; retrieval still uses the existing lexical ranking path until vector search is added.
208+
156209
## Development
157210

158211
```bash

docs/china-lens/IMPLEMENTATION_TRACKER.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,24 @@ Build a high-trust research workspace that produces investor-grade Packs where e
4646
- local PDF extraction (`local_pdf_path`) using embedded text with OCR-placeholder fallback.
4747
- Added date-window filtering (`start_date`, `end_date`) and optional `clear_existing` reset for deterministic reruns.
4848
- Added input validation path: invalid manifests now return HTTP 400 (instead of silent success).
49+
- China Lens state now sits behind repository/object-store adapters instead of raw in-memory dicts.
50+
- Added durable local backend:
51+
- JSON-file repository for companies, documents, chunks, packs, jobs, and acquisition events
52+
- local object-store adapter for source PDFs
53+
- Added environment-based backend selection:
54+
- `memory` (default)
55+
- `json` via `EDGARPACK_CHINA_STORAGE_DIR`
56+
- `postgres` via `EDGARPACK_CHINA_POSTGRES_DSN`
4957
- Added regression coverage:
5058
- manifest ingestion populates evidence search corpus
5159
- date-window filtering behavior
5260
- invalid manifest API handling
61+
- repository persistence across service restarts
62+
- local object-store persistence for synced PDFs
5363

5464
## Next Implementation Steps
55-
1. Wire persistent storage (PostgreSQL + object store) behind service interfaces.
65+
1. Add live CNINFO fetch + page-image rendering on top of the new persistence/object-store boundary.
5666
2. Add production OCR provider behind the existing extraction fallback path.
57-
3. Integrate frontend `web/` with API and implement Evidence Explorer interactions.
58-
4. Add end-to-end tests for generate-pack, verify-citation, and bounded ask workflows.
67+
3. Add vector retrieval and database-native search on top of the PostgreSQL adapter.
68+
4. Integrate frontend `web/` with API and implement Evidence Explorer interactions.
69+
5. Add end-to-end tests for generate-pack, verify-citation, and bounded ask workflows.

edgarpack/china/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@
3131
SearchEvidenceRequest,
3232
SearchEvidenceResponse,
3333
)
34+
from .storage import (
35+
InMemoryChinaLensRepository,
36+
JsonFileChinaLensRepository,
37+
LocalObjectStore,
38+
MemoryObjectStore,
39+
PostgresChinaLensRepository,
40+
)
3441

3542
__all__ = [
3643
"AcquisitionEvent",
@@ -55,11 +62,16 @@
5562
"PackSection",
5663
"PackStatus",
5764
"PackStatusResponse",
65+
"PostgresChinaLensRepository",
5866
"PipelineStage",
5967
"QAReport",
6068
"QAIssue",
6169
"ResolvedCitation",
6270
"SearchEvidenceHit",
6371
"SearchEvidenceRequest",
6472
"SearchEvidenceResponse",
73+
"InMemoryChinaLensRepository",
74+
"JsonFileChinaLensRepository",
75+
"LocalObjectStore",
76+
"MemoryObjectStore",
6577
]

edgarpack/china/acquire/cninfo.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def document_from_cninfo(
8080
pages: int,
8181
acquisition_log_id: str,
8282
file_hash: str | None = None,
83+
object_key: str = "",
84+
storage_url: str = "",
8385
) -> Document:
8486
"""Build a normalized Document model from CNINFO metadata."""
8587
return Document(
@@ -95,6 +97,8 @@ def document_from_cninfo(
9597
language="zh",
9698
acquired_at=utc_now(),
9799
acquisition_log_id=acquisition_log_id,
100+
object_key=object_key,
101+
storage_url=storage_url,
98102
)
99103

100104

edgarpack/china/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ class Document(BaseModel):
9090
language: str
9191
acquired_at: datetime
9292
acquisition_log_id: str
93+
object_key: str = ""
94+
storage_url: str = ""
9395

9496

9597
class EvidenceChunk(BaseModel):

0 commit comments

Comments
 (0)