From fb7c0cea84b579e224e27bf24077438e40ddd226 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Sun, 2 Nov 2025 03:32:56 +0000 Subject: [PATCH 1/5] docs: refresh workflows and roadmap (#2) [docs] refresh workflows and roadmap --- CONTRIBUTING.md | 117 +++++++++++++++++++++++----------------- README.md | 141 +++++++++++++++++++++++++++++------------------- ROADMAP.md | 43 +++++++++++++++ 3 files changed, 198 insertions(+), 103 deletions(-) create mode 100644 ROADMAP.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07e9549..75da5aa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,62 +1,81 @@ # πŸ‘Ύ Contributing to PatchVec -We welcome PRs, issues, and feedback! +Patchvec accepts code and docs from people who ship patches. Follow the steps below and keep PRs focused. ---- +## Environment setup -## πŸ›  Dev Setup ```bash -python -m venv .venv -source .venv/bin/activate -python -m pip install --upgrade pip -pip install -r requirements-test.txt # includes runtime + test deps -``` +# clone and enter the repo first +git clone https://github.com/patchvec/patchvec.git +cd patchvec -## πŸ§ͺ Testing -```bash -pytest -q +# GPU deps by default; add USE_CPU=1 if you do not have a GPU +make install-dev + +# copy local config overrides if you need to tweak behaviour +cp config.yml.example config.yml +cp tenants.yml.example tenants.yml + +# optional: run the service right away +USE_CPU=1 make serve ``` -CI/CD blocks releases if tests fail. -## ▢️ Dev Server +Run the test suite before pushing (`USE_CPU=1` if you installed CPU wheels): + ```bash -# CPU-only deps by default -make serve -# or explicitly -HOST=0.0.0.0 PORT=8080 RELOAD=1 WORKERS=1 LOG_LEVEL=info ./pavesrv.sh +# USE_CPU=1 if you installed CPU deps +make test ``` +Need to inspect behaviour without reloads? After tweaking `config.yml` / `tenants.yml`, run `AUTH_MODE=static PATCHVEC_AUTH__GLOBAL_KEY= DEV=0 make serve` for an almost production-like stack, or call the wrapper script directly: `PATCHVEC_AUTH__GLOBAL_KEY= ./pavesrv.sh`. + +## Workflow + +1. Fork and clone the repository. +2. Create a branch named after the task (`feature/tenant-search`, `fix/csv-metadata`, etc.). +3. Make the change, keep commits scoped, and include tests when possible. +4. Run `make test` and `make check` if you touched deployment or packaging paths. +5. Open a pull request referencing the issue you claimed. + +Use imperative, lowercase commit messages (`docs: clarify docker quickstart`). + +## Issues and task claims + +- `ROADMAP.md` lists chores that need owners. +- To claim a task, open an issue titled `claim: ` and describe the approach. +- Good first issues live under the `good-first-issue` label. Submit a draft PR within a few days of claiming. + +## Code style + +- Prefer direct, readable Python. Keep imports sorted and avoid wildcard imports. +- Follow PEP 8 defaults, keep line length ≀ 88 characters, and run `ruff` locally if you have it installed. +- Do not add framework abstractions unless they solve a concrete problem. +- Avoid adding dependencies without discussing them in an issue first. + +## Pull request checklist + +- [ ] Tests pass locally (`make test`, add `USE_CPU=1` if you installed CPU wheels). +- [ ] Packaged stack still works (`make check` on a clean checkout). +- [ ] Docs updated when behavior changes. +- [ ] PR description states what changed and why. + +Ship code, not questions. If you need help, post logs and the failing command instead of asking for permission. 
+ +## Architecture + +- Stores live under `pave/stores/*` (default txtai/FAISS today, Qdrant stub ready). +- Embedding adapters reside in `pave/embedders/*` (txtai, sentence-transformers, OpenAI, etc.). +- `pave/service.py` wires the FastAPI application and injects the store into `app.state`. +- CLI entrypoints are defined in `pave/cli.py`; shell shims `pavecli.sh`/`pavesrv.sh` wrap the same commands for repo contributors. + ## 🧰 Makefile Targets -- `make install` β€” install runtime deps (CPU default; `USE_GPU=1` for GPU) -- `make install-dev` β€” runtime + test deps -- `make serve` β€” start FastAPI app (uvicorn) with autoreload -- `make test` β€” run tests -- `make build` β€” build sdist/wheel (includes ABOUT.md) -- `make package` β€” create .zip and .tar.gz in ./artifacts -- `make release VERSION=x.y.z` β€” update versions (setup.py, main.py, Dockerfile, compose, README tags), prepend CHANGELOG with sorted commits since last tag, run tests/build (must pass), tag & push -- `make clean` / `make clean-dist` β€” cleanup - -## 🚒 Release Notes -- Version bumps also update Docker-related version strings where applicable. -- The release target **will not push** if tests/build fail. -- Ensure `PYPI_API_TOKEN` is set in CI to publish on tag. - -## πŸ” Secrets & Config -- Don’t commit secrets. Use `.env` (ignored) or env vars in CI. -- For per-tenant keys, use an untracked `tenants.yml` and reference it from `config.yml` (`auth.tenants_file`). -- Config precedence: code defaults < `config.yml` < `tenants.yml` < `PATCHVEC__*` env. - -## πŸ“ Commit Style -Keep commit messages short and scoped: -``` -search: fix filter parsing -docs: add curl examples -build: include ABOUT.md in sdist -arch: refactored StoreFactory -``` -## 🧩 Architecture (brief) -- Stores: `pave/stores/*` (default txtai/FAISS, qdrant stub) -- Embedders: `pave/embedders/*` (default/txtai, sbert, openai) -- Pure-ish orchestration: `pave/service.py` -- Dependency injection: `build_app()` wires store via `app.state` +- `make install` β€” install runtime deps (CPU wheels by default; `USE_GPU=1` for GPU builds). +- `make install-dev` β€” runtime + test deps for contributors. +- `make serve` β€” start the FastAPI app (uvicorn) with autoreload (`USE_CPU=1` for CPU-only setups). +- `make test` β€” run the pytest suite. +- `make check` β€” build and smoke-test the container image with the demo corpus. +- `make build` β€” build sdist/wheel (includes ABOUT.md). +- `make package` β€” create `.zip`/`.tar.gz` artifacts under `./artifacts`. +- `make release VERSION=x.y.z` β€” sync version strings, regenerate the changelog, run tests/build, tag & push. +- `make clean` / `make clean-dist` β€” remove caches and build outputs. diff --git a/README.md b/README.md index cedf5d6..688c631 100644 --- a/README.md +++ b/README.md @@ -1,87 +1,120 @@ # 🍰 PatchVec β€” Lightweight, Pluggable Vector Search Microservice -Upload β†’ chunk β†’ index (with metadata) β†’ search via REST and CLI. +Patchvec is a compact vector store built for people who want provenance and fast iteration on RAG plumbing. No black boxes, no hidden pipelines: every chunk records document id, page, and byte offsets, and you can swap embeddings or storage backends per collection. ---- +## βš™οΈ Core capabilities -## πŸš€ Quickstart +- **Docker images** β€” prebuilt CPU/GPU images published to the GitLab Container Registry. +- **Tenants and collections** β€” isolation by tenant with per-collection configuration. 
+- **Pluggable embeddings** β€” choose the embedding adapter per collection; wire in local or hosted models.
+- **REST and CLI** β€” production use over HTTP, quick experiments with the bundled CLI.
+- **Deterministic provenance** β€” every hit returns doc id, page, offset, and snippet for traceability.
 
-### πŸ–₯️ CPU-only Dev (default)
-```bash
-python -m venv .venv
-source .venv/bin/activate
-python -m pip install --upgrade pip
-pip install -r requirements-cpu.txt
-./pavesrv.sh
-```
+## 🧭 Workflows
+
+### 🐳 Docker workflow (prebuilt images)
+
+Pull the image that fits your hardware from the [Flowlexi](https://gitlab.com/flowlexi) Container Registry on GitLab (CUDA builds publish as `latest`, CPU-only as `latest-cpu`).
 
-### πŸ“¦ Install from PyPI
 ```bash
-python -m venv .venv
-source .venv/bin/activate
-python -m pip install --upgrade pip
-pip install patchvec[cpu] # or patchvec[gpu] for CUDA
+docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest
+docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest-cpu
 ```
 
-### ▢️ Run the Server
-From source:
+Run the service by choosing the tag you need and mapping the API port locally:
+
 ```bash
-./pavesrv.sh
+docker run -d --name patchvec \
+  -p 8086:8086 \
+  registry.gitlab.com/flowlexi/patchvec/patchvec:latest-cpu
 ```
-From PyPI:
+
+Use the bundled CLI inside the container to create a tenant/collection, ingest a demo document, and query it:
+
 ```bash
-pavesrv
+docker exec patchvec pavecli create-collection demo books
+docker exec patchvec pavecli upload demo books /app/demo/20k_leagues.txt \
+  --docid=verne-20k --metadata='{"lang":"en"}'
+docker exec patchvec pavecli search demo books "captain nemo" -k 3
 ```
 
-### βš™οΈ Minimal Config
-For production (static auth), set env vars (do not commit secrets):
-```env
-PATCHVEC_AUTH__MODE=static
-PATCHVEC_AUTH__GLOBAL_KEY=sekret-passwod
-```
-(Optional: copy `config.yml.example` to an untracked `config.yml` and tweak as needed)
-(Tip: use an untracked `tenants.yml` and point `auth.tenants_file` to it in `config.yml`.)
+See the REST API and Web UI section below for the matching curl examples.
 
----
+Stop the container when you are done:
 
-## πŸ”§ Overriding Server Settings (uvicorn)
-You can override a few server knobs via environment variables:
 ```bash
-HOST=127.0.0.1 PORT=9000 RELOAD=1 WORKERS=4 LOG_LEVEL=debug pavesrv
+docker rm -f patchvec
 ```
-> Note: Full configuration uses the `PATCHVEC_...` env scheme (e.g., `PATCHVEC_SERVER__PORT=9000`).
---- +### 🐍 PyPI workflow -## 🌐 REST API Examples +Install Patchvec from PyPI inside an isolated virtual environment and point it at a local configuration directory: -**Create a collection** ```bash -curl -X POST "http://localhost:8086/collections/acme/invoices" -``` +mkdir -p ~/pv && cd ~/pv #or wherever +python -m venv .venv-pv +source .venv-pv/bin/activate +python -m pip install --upgrade pip +python -m pip install "patchvec[cpu]" # or "patchvec[gpu]" for CUDA -**Upload a TXT/PDF/CSV document** -```bash -curl -X POST "http://localhost:8086/collections/acme/invoices/documents" -F "file=@sample.txt" -F "docid=DOC-1" -F 'metadata={"lang":"pt"}' -``` +# grab the default configs +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/config.yml.example +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/tenants.yml.example +cp config.yml.example config.yml +cp tenants.yml.example tenants.yml -**Search (GET, no filters)** -```bash -curl "http://localhost:8086/collections/acme/invoices/search?q=garantia&k=5" -``` +# sample demo corpus +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/demo/20k_leagues.txt -**Search (POST with filters)** -```bash -curl -X POST "http://localhost:8086/collections/acme/invoices/search" -H "Content-Type: application/json" -d '{"q": "garantia", "k": 5, "filters": {"docid": "DOC-1"}}' +# point Patchvec at the config directory and set a local admin key +export PATCHVEC_CONFIG="$HOME/pv/config.yml" +export PATCHVEC_GLOBAL_KEY=super-sekret + +# option A: run the service (stays up until you stop it) +pavesrv + +# option B: operate entirely via the CLI (no server needed) +pavecli create-collection demo books +pavecli upload demo books 20k_leagues.txt --docid=verne-20k --metadata='{"lang":"en"}' +pavecli search demo books "captain nemo" -k 3 ``` -**Health / Metrics** +Deactivate the virtual environment with `deactivate` when finished. + +### 🌐 REST API and Web UI usage + +When the server is running (either via Docker or `pavesrv`), the API listens on `http://localhost:8086`. The following `curl` commands mirror the CLI sequence aboveβ€”adjust the file path to wherever you stored the corpus (`/app/demo/20k_leagues.txt` in Docker, `~/pv/20k_leagues.txt` for PyPI installs) and reuse the bearer token exported earlier: + ```bash -curl "http://localhost:8086/health" -curl "http://localhost:8086/metrics" +# create collection +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + -X POST http://localhost:8086/collections/demo/books + +# ingest document +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + -X POST http://localhost:8086/collections/demo/books/documents \ + -F "file=@20k_leagues.txt" \ + -F 'metadata={"lang":"en"}' + +# run search +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + "http://localhost:8086/collections/demo/books/search?q=captain+nemo&k=3" ``` ---- +There is a simple Swagger UI available at the root of the server. Just point your browser to `http://localhost:8086/` + +Health and metrics endpoints are available at `/health` and `/metrics`. + +Configuration files copied in either workflow can be customised. Runtime options are also accepted via the `PATCHVEC_*` environment variable scheme (`PATCHVEC_SERVER__PORT`, `PATCHVEC_AUTH__MODE`, etc.), which precedes conf files. + +### πŸ› οΈ Developer workflow + +Building from source relies on the `Makefile` shortcuts (`make install-dev`, `USE_CPU=1 make serve`, `make test`, etc.). 
The full contributor workflow, target reference, and task claiming rules live in [CONTRIBUTING.md](CONTRIBUTING.md). + +## πŸ—ΊοΈ Roadmap + +Short & mid-term chores are tracked in [`ROADMAP.md`](ROADMAP.md). Pick one, open an issue titled `claim: `, and ship a patch. ## πŸ“œ License + GPL-3.0-or-later β€” (C) 2025 Rodrigo Rodrigues da Silva diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..19d741a --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,43 @@ +# Roadmap + +Immediate chores worth tackling now. Claim a task by opening an issue titled `claim: ` and link your branch or PR when ready. + +## 0.5.x Series β€” Adoption & Stability + +### v0.5.7 β€” Search, Filters, Traceability +- Expand partial filter support (`*` prefix/fuzzy matching). +- Return a `match_reason` field alongside every hit. +- Persist configurable logging per tenant and collection. +- Provide REST/CLI endpoints to delete a document by id. +- Expose latency histograms (p50/p95/p99) via `/metrics` for search and ingest requests. + +### v0.5.8 β€” Persistence, Infrastructure +- Ship an internal metadata/content store (SQLite or TinyDB) with migrations. +- Serve `/metrics` and `/collections` using the internal store as the source of truth. +- Emit structured logs with `request_id`, tenant, and latency, and allow rolling retention per tenant/collection. +- Support renaming collections through the API and CLI. + +### v0.5.9 β€” Ranking, Model Quality +- Add hybrid reranking (vector similarity + BM25/token matching). +- Honor `meta.priority` boosts during scoring. +- Improve multilingual relevance and evaluation fixtures. + +## πŸš€ Towards 1.0 + +### 0.6 β€” Per-Collection Embeddings +- Configure embeddings per collection via `config.yml`. +- Maintain per-collection hot caches with isolation guarantees. +- List tenants and collections via the API (CLI parity included). + +### 0.7 β€” API Utilities & Observability +- Default tenant/collection selectors in Swagger UI. +- Export indexes, enforce collection-level logging toggles, add rate limiting, and finalize per-collection embedding configuration flows. + +### 0.8 β€” Reliability & Governance +- Deliver the internal DB for persistence, document versioning, rebuild tooling, persistent metrics, surfacing metrics in the UI, JWT auth, per-tenant quotas, and transactional rollback with safe retries. + +### 0.9 β€” Scale & Multi-Tenant Search +- Async ingest, parallel purge, horizontal scalability, tenant groups, and shared/group search with sub-index routing. + +### 1.0 β€” API Freeze +- Lock routes, publish the final OpenAPI spec, and ship an SDK client ready for long-term support. 
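Editor's note on the v0.5.9 reranking item above (vector similarity blended with BM25/token matching, plus `meta.priority` boosts): the sketch below shows one possible shape for that scoring step. It is illustrative only; the function name `rerank`, the 0.7/0.3 weights, and the log-dampened boost are assumptions for the roadmap item, not existing Patchvec APIs.

```python
# Illustrative sketch only: `rerank`, the weights, and the priority boost are
# assumptions for the v0.5.9 roadmap item, not Patchvec code.
from __future__ import annotations

import math
import re
from typing import Any, Dict, List


def _token_overlap(query: str, text: str) -> float:
    """Crude BM25 stand-in: fraction of query tokens that appear in the chunk."""
    q_tokens = set(re.findall(r"\w+", query.lower()))
    if not q_tokens:
        return 0.0
    t_tokens = set(re.findall(r"\w+", text.lower()))
    return len(q_tokens & t_tokens) / len(q_tokens)


def rerank(query: str, hits: List[Dict[str, Any]],
           vector_weight: float = 0.7, lexical_weight: float = 0.3) -> List[Dict[str, Any]]:
    """Blend vector similarity with token matching, then apply meta.priority boosts."""
    for hit in hits:
        lexical = _token_overlap(query, hit.get("text") or "")
        priority = float((hit.get("meta") or {}).get("priority", 0) or 0)
        # Dampen the boost so priority reorders near-ties without drowning relevance.
        boost = 1.0 + 0.1 * math.log1p(max(priority, 0.0))
        hit["score"] = (vector_weight * hit.get("score", 0.0)
                        + lexical_weight * lexical) * boost
    return sorted(hits, key=lambda h: h["score"], reverse=True)


if __name__ == "__main__":
    hits = [
        {"id": "doc::0", "score": 0.82, "text": "Captain Nemo commands the Nautilus",
         "meta": {"priority": 0}},
        {"id": "doc::1", "score": 0.80, "text": "a captain and a nautilus shell",
         "meta": {"priority": 2}},
    ]
    for h in rerank("captain nemo", hits):
        print(h["id"], round(h["score"], 3))
```

Keeping the fusion in a single pure function like this would let it be unit-tested with plain hit dictionaries, independently of whichever store backend produced them.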
From 74789ffa734cb7504c3be8443379cecf1ace8b05 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Sun, 2 Nov 2025 01:35:56 -0300 Subject: [PATCH 2/5] [store] ensure CRLF-rich documents round-trip intact --- pave/stores/txtai_store.py | 9 +++++---- tests/test_txtai_store.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pave/stores/txtai_store.py b/pave/stores/txtai_store.py index 0f44ba8..31280b2 100644 --- a/pave/stores/txtai_store.py +++ b/pave/stores/txtai_store.py @@ -179,16 +179,17 @@ def _save_chunk_text(self, tenant: str, collection: str, p = os.path.join(self._chunks_dir(tenant, collection), self._urid_to_fname(urid)) os.makedirs(os.path.dirname(p), exist_ok=True) - with open(p, "w", encoding="utf-8") as f: - f.write(t or "") + data = (t or "").encode("utf-8") + with open(p, "wb") as f: + f.write(data) f.flush() def _load_chunk_text(self, tenant: str, collection: str, urid: str) -> str | None: p = os.path.join(self._chunks_dir(tenant, collection), self._urid_to_fname(urid)) if os.path.isfile(p): - with open(p, "r", encoding="utf-8") as f: - return f.read() + with open(p, "rb") as f: + return f.read().decode("utf-8") return None def index_records(self, tenant: str, collection: str, docid: str, diff --git a/tests/test_txtai_store.py b/tests/test_txtai_store.py index a007877..27c0925 100644 --- a/tests/test_txtai_store.py +++ b/tests/test_txtai_store.py @@ -69,6 +69,18 @@ def test_index_adds_docid_prefix(store): assert hits[0]["text"] is not None and "verde" in hits[0]["text"].lower() assert hits[0]["id"] is not None and "docbike::0" == hits[0]["id"] +def test_chunk_sidecar_preserves_crlf(store): + text = "First line\r\nSecond line\r\n" + recs = [ + {"id": "0", "content": text, "metadata": {"lang": "en"}}, + ] + + n = store.index_records("acme", "crlf", "doccrlf", recs) + assert n == 1 + + stored = store.impl._load_chunk_text("acme", "crlf", "doccrlf::0") + assert stored == text + def test_meta_json_and_filters(store): recs = [ {"id": "docx::0", "content": "OlΓ‘ mundo", "metadata": {"lang": "pt"}}, From d42463236ad03a0a6ad22d32947bea51ad12f011 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Wed, 5 Nov 2025 00:58:41 +0000 Subject: [PATCH 3/5] Sanitize txtai metadata persistence --- pave/stores/txtai_store.py | 113 ++++++++++++++++++++++---- tests/conftest.py | 16 ++++ tests/test_txtai_store_sql_safety.py | 116 +++++++++++++++++++++++++++ tests/utils.py | 56 +++++++++++-- 4 files changed, 278 insertions(+), 23 deletions(-) create mode 100644 tests/test_txtai_store_sql_safety.py diff --git a/pave/stores/txtai_store.py b/pave/stores/txtai_store.py index 31280b2..be2c3bb 100644 --- a/pave/stores/txtai_store.py +++ b/pave/stores/txtai_store.py @@ -4,7 +4,7 @@ from __future__ import annotations import os, json, operator from datetime import datetime -from typing import Dict, Iterable, List, Any +from typing import Any, Dict, Iterable, List, Optional from threading import Lock from contextlib import contextmanager from txtai.embeddings import Embeddings @@ -12,6 +12,14 @@ from pave.config import CFG as c, LOG as log _LOCKS : dict[str, Lock] = {} +_SQL_TRANS = str.maketrans({ + ";": " ", + '"': " ", + "`": " ", + "\\": " ", + "\x00": "", +}) + def get_lock(key: str) -> Lock: if key not in _LOCKS: _LOCKS[key] = Lock() @@ -236,10 +244,10 @@ def index_records(self, tenant: str, collection: str, docid: str, md["docid"] = docid try: - meta_json = json.dumps(md, ensure_ascii=False) - md = json.loads(meta_json) - except: - md = {} + 
safe_meta = self._sanit_meta_dict(md) + meta_json = json.dumps(safe_meta, ensure_ascii=False) + except Exception: + safe_meta = {} meta_json = "" rid = str(rid) @@ -247,9 +255,11 @@ def index_records(self, tenant: str, collection: str, docid: str, if not rid.startswith(f"{docid}::"): rid = f"{docid}::{rid}" - meta_side[rid] = md + md_for_index = {k: v for k, v in safe_meta.items() if k != "text"} + + meta_side[rid] = safe_meta record_ids.append(rid) - prepared.append((rid, {"text":txt, **md}, meta_json)) + prepared.append((rid, {"text": txt, **md_for_index}, meta_json)) self._save_chunk_text(tenant, collection, rid, txt) assert txt == (self._load_chunk_text(tenant, collection, rid) or "") @@ -280,10 +290,15 @@ def _matches_filters(m: Dict[str, Any], if not filters: return True - def match(have: Any, cond: str) -> bool: + def match(have: Any, cond: Any) -> bool: if have is None: return False - s = str(cond) + if isinstance(have, (list, tuple, set)): + return any(match(item, cond) for item in have) + if isinstance(cond, str): + s = TxtaiStore._sanit_sql(cond) + else: + s = str(cond) hv = str(have) # Numeric/date ops for op in (">=", "<=", "!=", ">", "<"): @@ -313,7 +328,7 @@ def match(have: Any, cond: str) -> bool: return hv == s for k, vals in filters.items(): - if not any(match(m.get(k), v) for v in vals): + if not any(match(TxtaiStore._lookup_meta(m, k), v) for v in vals): return False return True @@ -325,6 +340,9 @@ def _split_filters(filters: dict[str, Any] | None) -> tuple[dict, dict]: pre_f, pos_f = {}, {} for key, vals in (filters or {}).items(): + safe_key = TxtaiStore._sanit_field(key) + if not safe_key: + continue if not isinstance(vals, list): vals = [vals] exacts, extended = [], [] @@ -338,12 +356,68 @@ def _split_filters(filters: dict[str, Any] | None) -> tuple[dict, dict]: else: exacts.append(v) if exacts: - pre_f[key] = exacts + pre_f[safe_key] = exacts if extended: - pos_f[key] = extended + pos_f[safe_key] = extended log.debug(f"after split: PRE {pre_f} POS {pos_f}") return pre_f, pos_f + @staticmethod + def _lookup_meta(meta: Dict[str, Any] | None, key: str) -> Any: + if not meta: + return None + if key in meta: + return meta.get(key) + for raw_key, value in meta.items(): + if TxtaiStore._sanit_field(raw_key) == key: + return value + return None + + @staticmethod + def _sanit_meta_value(value: Any) -> Any: + if isinstance(value, dict): + return TxtaiStore._sanit_meta_dict(value) + if isinstance(value, (list, tuple, set)): + return [TxtaiStore._sanit_meta_value(v) for v in value] + if isinstance(value, (int, float, bool)) or value is None: + return value + return TxtaiStore._sanit_sql(value) + + @staticmethod + def _sanit_meta_dict(meta: Dict[str, Any] | None) -> Dict[str, Any]: + safe: Dict[str, Any] = {} + if not isinstance(meta, dict): + return safe + for raw_key, raw_value in meta.items(): + safe_key = TxtaiStore._sanit_field(raw_key) + if not safe_key or safe_key == "text": + continue + safe[safe_key] = TxtaiStore._sanit_meta_value(raw_value) + return safe + + @staticmethod + def _sanit_sql(value: Any, *, max_len: Optional[int] = None) -> str: + if value is None: + return "" + text = str(value).translate(_SQL_TRANS) + for token in ("--", "/*", "*/"): + if token in text: + text = text.split(token, 1)[0] + text = text.strip() + if max_len is not None and max_len > 0 and len(text) > max_len: + text = text[:max_len] + return text.replace("'", "''") + + @staticmethod + def _sanit_field(name: Any) -> str: + if not isinstance(name, str): + name = str(name) + safe = [] + for ch 
in name: + if ch.isalnum() or ch in {"_", "-"}: + safe.append(ch) + return "".join(safe) + @staticmethod def _build_sql(query: str, k: int, filters: dict[str, Any], columns: list[str], with_similarity: bool = True, avoid_duplicates = True) -> str: @@ -356,14 +430,23 @@ def _build_sql(query: str, k: int, filters: dict[str, Any], columns: list[str], wheres = [] if with_similarity and query: - q_safe = query.replace("'", "''") + max_len_cfg = c.get("vector_store.txtai.max_query_chars", 512) + try: + max_len = int(max_len_cfg) + except (TypeError, ValueError): + max_len = 512 + limit = max_len if max_len > 0 else None + q_safe = TxtaiStore._sanit_sql(query, max_len=limit) wheres.append(f"similar('{q_safe}')") for key, vals in filters.items(): + safe_key = TxtaiStore._sanit_field(key) + if not safe_key: + continue ors = [] for v in vals: - safe_v = str(v).replace("'", "''") - ors.append(f"[{key}] = '{safe_v}'") + safe_v = TxtaiStore._sanit_sql(v) + ors.append(f"[{safe_key}] = '{safe_v}'") or_safe = " OR ".join(ors) wheres.append(f"({or_safe})") diff --git a/tests/conftest.py b/tests/conftest.py index bf4e8b0..8fd621b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,22 @@ # (C) 2025 Rodrigo Rodrigues da Silva # SPDX-License-Identifier: GPL-3.0-or-later +import sys +import types + +if "txtai.embeddings" not in sys.modules: + txtai_stub = types.ModuleType("txtai") + embeddings_stub = types.ModuleType("txtai.embeddings") + + class _StubEmbeddings: # pragma: no cover - stub for optional dependency + def __init__(self, *args, **kwargs): + pass + + embeddings_stub.Embeddings = _StubEmbeddings + txtai_stub.embeddings = embeddings_stub + sys.modules.setdefault("txtai", txtai_stub) + sys.modules.setdefault("txtai.embeddings", embeddings_stub) + import pytest from fastapi.testclient import TestClient from pave.config import get_cfg, reload_cfg diff --git a/tests/test_txtai_store_sql_safety.py b/tests/test_txtai_store_sql_safety.py new file mode 100644 index 0000000..e230929 --- /dev/null +++ b/tests/test_txtai_store_sql_safety.py @@ -0,0 +1,116 @@ +# (C) 2025 Rodrigo Rodrigues da Silva +# SPDX-License-Identifier: GPL-3.0-or-later + +import json + +import pytest + +from pave.stores import txtai_store as store_mod +from pave.stores.txtai_store import TxtaiStore +from pave.config import get_cfg +from utils import FakeEmbeddings + + +@pytest.fixture(autouse=True) +def _fake_embeddings(monkeypatch): + monkeypatch.setattr(store_mod, "Embeddings", FakeEmbeddings, raising=True) + + +@pytest.fixture() +def store(): + return TxtaiStore() + + +def _extract_similarity_term(sql: str) -> str: + marker = "similar('" + if marker not in sql: + raise AssertionError(f"similar() clause missing in SQL: {sql!r}") + rest = sql.split(marker, 1)[1] + return rest.split("')", 1)[0] + + +def test_build_sql_sanitizes_similarity_term(store): + raw_query = "foo'; DROP TABLE users; -- comment" + sql = store._build_sql(raw_query, 5, {}, ["id", "text"]) + term = _extract_similarity_term(sql) + + # injection primitives are stripped or neutralised + assert ";" not in term + assert "--" not in term + # original alpha characters remain so search still works + assert "foo" in term + + +def test_build_sql_sanitizes_filter_values(store): + filters = {"lang": ["en'; DELETE FROM x;"], "tags": ['alpha"beta']} + sql = store._build_sql("foo", 5, filters, ["id", "text"]) + + # filter clause should not leak dangerous characters + assert ";" not in sql + assert '"' not in sql + assert "--" not in sql + + +def 
test_build_sql_normalises_filter_keys(store): + filters = {"lang]; DROP": ["en"], 123: ["x"]} + sql = store._build_sql("foo", 5, filters, ["id"]) + assert "[langDROP]" in sql + assert "[123]" in sql + + +def test_build_sql_applies_query_length_limit(store): + cfg = get_cfg() + snapshot = cfg.snapshot() + try: + cfg.set("vector_store.txtai.max_query_chars", 8) + sql = store._build_sql("abcdefghijklmno", 5, {}, ["id"]) + term = _extract_similarity_term(sql) + + # collapse the doubled quotes to measure the original payload length + collapsed = term.replace("''", "'") + assert len(collapsed) == 8 + finally: + cfg.replace(data=snapshot) + + +def test_search_handles_special_characters(store): + tenant, collection = "tenant", "coll" + store.load_or_init(tenant, collection) + + records = [("r1", "hello world", {"lang": "en"})] + store.index_records(tenant, collection, "doc", records) + + hits = store.search(tenant, collection, "world; -- comment", k=5) + assert hits + assert hits[0]["id"].endswith("::r1") + + +def test_round_trip_with_weird_metadata_field(store): + tenant, collection = "tenant", "coll" + store.load_or_init(tenant, collection) + + weird_key = "meta;`DROP" + weird_value = "val'u" + records = [("r2", "strange world", {weird_key: weird_value})] + store.index_records(tenant, collection, "doc2", records) + + filters = {weird_key: weird_value} + hits = store.search(tenant, collection, "strange", k=5, filters=filters) + + assert hits + assert hits[0]["id"].endswith("::r2") + + emb = store._emb[(tenant, collection)] + safe_key = TxtaiStore._sanit_field(weird_key) + assert emb.last_sql and f"[{safe_key}]" in emb.last_sql + + rid = hits[0]["id"] + stored_meta = store._load_meta(tenant, collection).get(rid) or {} + assert safe_key in stored_meta + assert stored_meta[safe_key] == TxtaiStore._sanit_sql(weird_value) + + doc = emb._docs[rid] + assert doc["meta"].get(safe_key) == TxtaiStore._sanit_sql(weird_value) + serialized = json.loads(doc["meta_json"]) if doc.get("meta_json") else {} + assert serialized.get(safe_key) == TxtaiStore._sanit_sql(weird_value) + assert hits[0]["meta"].get(safe_key) == TxtaiStore._sanit_sql(weird_value) diff --git a/tests/utils.py b/tests/utils.py index 4acbb3c..4b77c35 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,18 +10,26 @@ class FakeEmbeddings: """Tiny in-memory index. 
Keeps interface you use in tests.""" def __init__(self, config): # config unused - self._docs = {} # rid -> (text, meta_json) + self._docs = {} # rid -> {"text": str, "meta_json": str, "meta": dict} + self.last_sql = None def index(self, docs): - for rid, text, meta_json in docs: + for rid, payload, meta_json in docs: assert isinstance(meta_json, str) - self._docs[rid] = (text, meta_json) + if isinstance(payload, dict): + text = payload.get("text") + meta = {k: v for k, v in payload.items() if k != "text"} + else: + text = payload + meta = {} + self._docs[rid] = {"text": text, "meta_json": meta_json, "meta": meta} def upsert(self, docs): return self.index(docs) def search(self, sql, k=5): import re + self.last_sql = sql term = None m = re.search(r"similar\('([^']+)'", sql) if m: @@ -32,10 +40,42 @@ def search(self, sql, k=5): if not term: return [] - hits = [ - {"id": rid, "score": 1.0, "text": txt, "tags": {"docid": "DUMMY"}} - for rid, (txt, _) in self._docs.items() if term in str(txt).lower() - ] + filter_pairs = re.findall(r"\[([^\]]+)\]\s*=\s*'((?:''|[^'])*)'", sql) + + hits = [] + for rid, entry in self._docs.items(): + text = entry.get("text") + if text is None: + continue + if term not in str(text).lower(): + continue + + metadata = entry.get("meta") or {} + include = True + for field, raw_val in filter_pairs: + stored = metadata.get(field) + if stored is None: + include = False + break + expected = raw_val + if isinstance(stored, (list, tuple, set)): + options = {str(v) for v in stored} + if expected not in options: + include = False + break + else: + if str(stored) != expected: + include = False + break + if not include: + continue + + hits.append({ + "id": rid, + "score": 1.0, + "text": text, + "docid": metadata.get("docid"), + }) return hits[:10] """ q = (query or "").lower() @@ -47,7 +87,7 @@ def search(self, sql, k=5): """ def lookup(self, ids): - return {rid: self._docs.get(rid, ("", ""))[0] for rid in ids} + return {rid: (self._docs.get(rid) or {}).get("text") for rid in ids} def delete(self, ids): for rid in ids: From c964f1eb9002f29179316d6229c48ee7e08890f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:05:11 +0000 Subject: [PATCH 4/5] Initial plan From 54e10bc39446e01eedc7190e1c098618577a87b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:16:11 +0000 Subject: [PATCH 5/5] Fix escaped single quotes handling in FakeEmbeddings filters Co-authored-by: rodrigopitanga <1755608+rodrigopitanga@users.noreply.github.com> --- tests/test_fake_embeddings_escape.py | 69 ++++++++++++++++++++++++++++ tests/utils.py | 2 +- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 tests/test_fake_embeddings_escape.py diff --git a/tests/test_fake_embeddings_escape.py b/tests/test_fake_embeddings_escape.py new file mode 100644 index 0000000..6007f5e --- /dev/null +++ b/tests/test_fake_embeddings_escape.py @@ -0,0 +1,69 @@ +# (C) 2025 Rodrigo Rodrigues da Silva +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Test that FakeEmbeddings correctly handles escaped single quotes in filter values.""" + +from utils import FakeEmbeddings + + +def test_fake_embeddings_escaped_single_quote(): + """Test that filter values with escaped single quotes are correctly unescaped.""" + fake = FakeEmbeddings({}) + + # Index a document with a single quote in metadata + # The metadata is stored in the payload dict + docs = [ + ("doc1", {"text": 
"test content", "author": "O'Brien"}, '{"author": "O\'Brien"}'), + ("doc2", {"text": "another test", "author": "Smith"}, '{"author": "Smith"}'), + ] + fake.index(docs) + + # Search with a filter that includes an escaped single quote (as it would appear in SQL) + # The SQL sanitizer in txtai_store.py escapes ' as '' + sql = "SELECT * FROM txtai WHERE similar('test') AND [author] = 'O''Brien'" + results = fake.search(sql) + + # Should find doc1 but not doc2 + assert len(results) == 1 + assert results[0]["id"] == "doc1" + assert results[0]["text"] == "test content" + + +def test_fake_embeddings_no_escape_needed(): + """Test that filter values without single quotes still work correctly.""" + fake = FakeEmbeddings({}) + + # Index documents without single quotes + docs = [ + ("doc1", {"text": "test content", "author": "Smith"}, '{"author": "Smith"}'), + ("doc2", {"text": "another test", "author": "Jones"}, '{"author": "Jones"}'), + ] + fake.index(docs) + + # Search with a normal filter + sql = "SELECT * FROM txtai WHERE similar('test') AND [author] = 'Smith'" + results = fake.search(sql) + + # Should find doc1 + assert len(results) == 1 + assert results[0]["id"] == "doc1" + + +def test_fake_embeddings_multiple_escaped_quotes(): + """Test that multiple escaped single quotes in a value are handled correctly.""" + fake = FakeEmbeddings({}) + + # Index a document with multiple single quotes in metadata + docs = [ + ("doc1", {"text": "test content", "title": "It's a boy's life"}, '{"title": "It\'s a boy\'s life"}'), + ("doc2", {"text": "another test", "title": "Simple title"}, '{"title": "Simple title"}'), + ] + fake.index(docs) + + # Search with a filter that has multiple escaped single quotes + sql = "SELECT * FROM txtai WHERE similar('test') AND [title] = 'It''s a boy''s life'" + results = fake.search(sql) + + # Should find doc1 + assert len(results) == 1 + assert results[0]["id"] == "doc1" diff --git a/tests/utils.py b/tests/utils.py index 4b77c35..74ba374 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -57,7 +57,7 @@ def search(self, sql, k=5): if stored is None: include = False break - expected = raw_val + expected = raw_val.replace("''", "'") if isinstance(stored, (list, tuple, set)): options = {str(v) for v in stored} if expected not in options: