From fb7c0cea84b579e224e27bf24077438e40ddd226 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Sun, 2 Nov 2025 03:32:56 +0000 Subject: [PATCH 1/5] docs: refresh workflows and roadmap (#2) [docs] refresh workflows and roadmap --- CONTRIBUTING.md | 117 +++++++++++++++++++++++----------------- README.md | 141 +++++++++++++++++++++++++++++------------------- ROADMAP.md | 43 +++++++++++++++ 3 files changed, 198 insertions(+), 103 deletions(-) create mode 100644 ROADMAP.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07e9549..75da5aa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,62 +1,81 @@ # πŸ‘Ύ Contributing to PatchVec -We welcome PRs, issues, and feedback! +Patchvec accepts code and docs from people who ship patches. Follow the steps below and keep PRs focused. ---- +## Environment setup -## πŸ›  Dev Setup ```bash -python -m venv .venv -source .venv/bin/activate -python -m pip install --upgrade pip -pip install -r requirements-test.txt # includes runtime + test deps -``` +# clone and enter the repo first +git clone https://github.com/patchvec/patchvec.git +cd patchvec -## πŸ§ͺ Testing -```bash -pytest -q +# GPU deps by default; add USE_CPU=1 if you do not have a GPU +make install-dev + +# copy local config overrides if you need to tweak behaviour +cp config.yml.example config.yml +cp tenants.yml.example tenants.yml + +# optional: run the service right away +USE_CPU=1 make serve ``` -CI/CD blocks releases if tests fail. -## ▢️ Dev Server +Run the test suite before pushing (`USE_CPU=1` if you installed CPU wheels): + ```bash -# CPU-only deps by default -make serve -# or explicitly -HOST=0.0.0.0 PORT=8080 RELOAD=1 WORKERS=1 LOG_LEVEL=info ./pavesrv.sh +# USE_CPU=1 if you installed CPU deps +make test ``` +Need to inspect behaviour without reloads? After tweaking `config.yml` / `tenants.yml`, run `AUTH_MODE=static PATCHVEC_AUTH__GLOBAL_KEY= DEV=0 make serve` for an almost production-like stack, or call the wrapper script directly: `PATCHVEC_AUTH__GLOBAL_KEY= ./pavesrv.sh`. + +## Workflow + +1. Fork and clone the repository. +2. Create a branch named after the task (`feature/tenant-search`, `fix/csv-metadata`, etc.). +3. Make the change, keep commits scoped, and include tests when possible. +4. Run `make test` and `make check` if you touched deployment or packaging paths. +5. Open a pull request referencing the issue you claimed. + +Use imperative, lowercase commit messages (`docs: clarify docker quickstart`). + +## Issues and task claims + +- `ROADMAP.md` lists chores that need owners. +- To claim a task, open an issue titled `claim: ` and describe the approach. +- Good first issues live under the `good-first-issue` label. Submit a draft PR within a few days of claiming. + +## Code style + +- Prefer direct, readable Python. Keep imports sorted and avoid wildcard imports. +- Follow PEP 8 defaults, keep line length ≀ 88 characters, and run `ruff` locally if you have it installed. +- Do not add framework abstractions unless they solve a concrete problem. +- Avoid adding dependencies without discussing them in an issue first. + +## Pull request checklist + +- [ ] Tests pass locally (`make test`, add `USE_CPU=1` if you installed CPU wheels). +- [ ] Packaged stack still works (`make check` on a clean checkout). +- [ ] Docs updated when behavior changes. +- [ ] PR description states what changed and why. + +Ship code, not questions. If you need help, post logs and the failing command instead of asking for permission. 
+ +## Architecture + +- Stores live under `pave/stores/*` (default txtai/FAISS today, Qdrant stub ready). +- Embedding adapters reside in `pave/embedders/*` (txtai, sentence-transformers, OpenAI, etc.). +- `pave/service.py` wires the FastAPI application and injects the store into `app.state`. +- CLI entrypoints are defined in `pave/cli.py`; shell shims `pavecli.sh`/`pavesrv.sh` wrap the same commands for repo contributors. + ## 🧰 Makefile Targets -- `make install` β€” install runtime deps (CPU default; `USE_GPU=1` for GPU) -- `make install-dev` β€” runtime + test deps -- `make serve` β€” start FastAPI app (uvicorn) with autoreload -- `make test` β€” run tests -- `make build` β€” build sdist/wheel (includes ABOUT.md) -- `make package` β€” create .zip and .tar.gz in ./artifacts -- `make release VERSION=x.y.z` β€” update versions (setup.py, main.py, Dockerfile, compose, README tags), prepend CHANGELOG with sorted commits since last tag, run tests/build (must pass), tag & push -- `make clean` / `make clean-dist` β€” cleanup - -## 🚒 Release Notes -- Version bumps also update Docker-related version strings where applicable. -- The release target **will not push** if tests/build fail. -- Ensure `PYPI_API_TOKEN` is set in CI to publish on tag. - -## πŸ” Secrets & Config -- Don’t commit secrets. Use `.env` (ignored) or env vars in CI. -- For per-tenant keys, use an untracked `tenants.yml` and reference it from `config.yml` (`auth.tenants_file`). -- Config precedence: code defaults < `config.yml` < `tenants.yml` < `PATCHVEC__*` env. - -## πŸ“ Commit Style -Keep commit messages short and scoped: -``` -search: fix filter parsing -docs: add curl examples -build: include ABOUT.md in sdist -arch: refactored StoreFactory -``` -## 🧩 Architecture (brief) -- Stores: `pave/stores/*` (default txtai/FAISS, qdrant stub) -- Embedders: `pave/embedders/*` (default/txtai, sbert, openai) -- Pure-ish orchestration: `pave/service.py` -- Dependency injection: `build_app()` wires store via `app.state` +- `make install` β€” install runtime deps (CPU wheels by default; `USE_GPU=1` for GPU builds). +- `make install-dev` β€” runtime + test deps for contributors. +- `make serve` β€” start the FastAPI app (uvicorn) with autoreload (`USE_CPU=1` for CPU-only setups). +- `make test` β€” run the pytest suite. +- `make check` β€” build and smoke-test the container image with the demo corpus. +- `make build` β€” build sdist/wheel (includes ABOUT.md). +- `make package` β€” create `.zip`/`.tar.gz` artifacts under `./artifacts`. +- `make release VERSION=x.y.z` β€” sync version strings, regenerate the changelog, run tests/build, tag & push. +- `make clean` / `make clean-dist` β€” remove caches and build outputs. diff --git a/README.md b/README.md index cedf5d6..688c631 100644 --- a/README.md +++ b/README.md @@ -1,87 +1,120 @@ # 🍰 PatchVec β€” Lightweight, Pluggable Vector Search Microservice -Upload β†’ chunk β†’ index (with metadata) β†’ search via REST and CLI. +Patchvec is a compact vector store built for people who want provenance and fast iteration on RAG plumbing. No black boxes, no hidden pipelines: every chunk records document id, page, and byte offsets, and you can swap embeddings or storage backends per collection. ---- +## βš™οΈ Core capabilities -## πŸš€ Quickstart +- **Docker images** β€” prebuilt CPU/GPU images published to the GitLab Container Registry. +- **Tenants and collections** β€” isolation by tenant with per-collection configuration. 
+- **Pluggable embeddings** β€” choose the embedding adapter per collection; wire in local or hosted models.
+- **REST and CLI** β€” production use over HTTP, quick experiments with the bundled CLI.
+- **Deterministic provenance** β€” every hit returns doc id, page, offset, and snippet for traceability.
 
-### πŸ–₯️ CPU-only Dev (default)
-```bash
-python -m venv .venv
-source .venv/bin/activate
-python -m pip install --upgrade pip
-pip install -r requirements-cpu.txt
-./pavesrv.sh
-```
+## 🧭 Workflows
+
+### 🐳 Docker workflow (prebuilt images)
+
+Pull the image that fits your hardware from the [Flowlexi](https://gitlab.com/flowlexi) Container Registry on GitLab (CUDA builds publish as `latest`, CPU-only as `latest-cpu`).
 
-### πŸ“¦ Install from PyPI
 ```bash
-python -m venv .venv
-source .venv/bin/activate
-python -m pip install --upgrade pip
-pip install patchvec[cpu] # or patchvec[gpu] for CUDA
+docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest
+docker pull registry.gitlab.com/flowlexi/patchvec/patchvec:latest-cpu
 ```
 
-### ▢️ Run the Server
-From source:
+Run the service by choosing the tag you need and mapping the API port locally:
+
 ```bash
-./pavesrv.sh
+docker run -d --name patchvec \
+  -p 8086:8086 \
+  registry.gitlab.com/flowlexi/patchvec/patchvec:latest-cpu
 ```
-From PyPI:
+
+Use the bundled CLI inside the container to create a tenant/collection, ingest a demo document, and query it:
+
 ```bash
-pavesrv
+docker exec patchvec pavecli create-collection demo books
+docker exec patchvec pavecli upload demo books /app/demo/20k_leagues.txt \
+  --docid=verne-20k --metadata='{"lang":"en"}'
+docker exec patchvec pavecli search demo books "captain nemo" -k 3
 ```
 
-### βš™οΈ Minimal Config
-For production (static auth), set env vars (do not commit secrets):
-```env
-PATCHVEC_AUTH__MODE=static
-PATCHVEC_AUTH__GLOBAL_KEY=sekret-passwod
-```
-(Optional: copy `config.yml.example` to an untracked `config.yml` and tweak as needed)
-(Tip: use an untracked `tenants.yml` and point `auth.tenants_file` to it in `config.yml`.)
+See the REST API and Web UI section below for the matching curl examples.
 
----
+Stop the container when you are done:
 
-## πŸ”§ Overriding Server Settings (uvicorn)
-You can override a few server knobs via environment variables:
 ```bash
-HOST=127.0.0.1 PORT=9000 RELOAD=1 WORKERS=4 LOG_LEVEL=debug pavesrv
+docker rm -f patchvec
 ```
-> Note: Full configuration uses the `PATCHVEC_...` env scheme (e.g., `PATCHVEC_SERVER__PORT=9000`).
---- +### 🐍 PyPI workflow -## 🌐 REST API Examples +Install Patchvec from PyPI inside an isolated virtual environment and point it at a local configuration directory: -**Create a collection** ```bash -curl -X POST "http://localhost:8086/collections/acme/invoices" -``` +mkdir -p ~/pv && cd ~/pv #or wherever +python -m venv .venv-pv +source .venv-pv/bin/activate +python -m pip install --upgrade pip +python -m pip install "patchvec[cpu]" # or "patchvec[gpu]" for CUDA -**Upload a TXT/PDF/CSV document** -```bash -curl -X POST "http://localhost:8086/collections/acme/invoices/documents" -F "file=@sample.txt" -F "docid=DOC-1" -F 'metadata={"lang":"pt"}' -``` +# grab the default configs +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/config.yml.example +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/tenants.yml.example +cp config.yml.example config.yml +cp tenants.yml.example tenants.yml -**Search (GET, no filters)** -```bash -curl "http://localhost:8086/collections/acme/invoices/search?q=garantia&k=5" -``` +# sample demo corpus +curl -LO https://raw.githubusercontent.com/patchvec/patchvec/main/demo/20k_leagues.txt -**Search (POST with filters)** -```bash -curl -X POST "http://localhost:8086/collections/acme/invoices/search" -H "Content-Type: application/json" -d '{"q": "garantia", "k": 5, "filters": {"docid": "DOC-1"}}' +# point Patchvec at the config directory and set a local admin key +export PATCHVEC_CONFIG="$HOME/pv/config.yml" +export PATCHVEC_GLOBAL_KEY=super-sekret + +# option A: run the service (stays up until you stop it) +pavesrv + +# option B: operate entirely via the CLI (no server needed) +pavecli create-collection demo books +pavecli upload demo books 20k_leagues.txt --docid=verne-20k --metadata='{"lang":"en"}' +pavecli search demo books "captain nemo" -k 3 ``` -**Health / Metrics** +Deactivate the virtual environment with `deactivate` when finished. + +### 🌐 REST API and Web UI usage + +When the server is running (either via Docker or `pavesrv`), the API listens on `http://localhost:8086`. The following `curl` commands mirror the CLI sequence aboveβ€”adjust the file path to wherever you stored the corpus (`/app/demo/20k_leagues.txt` in Docker, `~/pv/20k_leagues.txt` for PyPI installs) and reuse the bearer token exported earlier: + ```bash -curl "http://localhost:8086/health" -curl "http://localhost:8086/metrics" +# create collection +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + -X POST http://localhost:8086/collections/demo/books + +# ingest document +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + -X POST http://localhost:8086/collections/demo/books/documents \ + -F "file=@20k_leagues.txt" \ + -F 'metadata={"lang":"en"}' + +# run search +curl -H "Authorization: Bearer $PATCHVEC_GLOBAL_KEY" \ + "http://localhost:8086/collections/demo/books/search?q=captain+nemo&k=3" ``` ---- +There is a simple Swagger UI available at the root of the server. Just point your browser to `http://localhost:8086/` + +Health and metrics endpoints are available at `/health` and `/metrics`. + +Configuration files copied in either workflow can be customised. Runtime options are also accepted via the `PATCHVEC_*` environment variable scheme (`PATCHVEC_SERVER__PORT`, `PATCHVEC_AUTH__MODE`, etc.), which precedes conf files. + +### πŸ› οΈ Developer workflow + +Building from source relies on the `Makefile` shortcuts (`make install-dev`, `USE_CPU=1 make serve`, `make test`, etc.). 
The full contributor workflow, target reference, and task claiming rules live in [CONTRIBUTING.md](CONTRIBUTING.md). + +## πŸ—ΊοΈ Roadmap + +Short & mid-term chores are tracked in [`ROADMAP.md`](ROADMAP.md). Pick one, open an issue titled `claim: `, and ship a patch. ## πŸ“œ License + GPL-3.0-or-later β€” (C) 2025 Rodrigo Rodrigues da Silva diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..19d741a --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,43 @@ +# Roadmap + +Immediate chores worth tackling now. Claim a task by opening an issue titled `claim: ` and link your branch or PR when ready. + +## 0.5.x Series β€” Adoption & Stability + +### v0.5.7 β€” Search, Filters, Traceability +- Expand partial filter support (`*` prefix/fuzzy matching). +- Return a `match_reason` field alongside every hit. +- Persist configurable logging per tenant and collection. +- Provide REST/CLI endpoints to delete a document by id. +- Expose latency histograms (p50/p95/p99) via `/metrics` for search and ingest requests. + +### v0.5.8 β€” Persistence, Infrastructure +- Ship an internal metadata/content store (SQLite or TinyDB) with migrations. +- Serve `/metrics` and `/collections` using the internal store as the source of truth. +- Emit structured logs with `request_id`, tenant, and latency, and allow rolling retention per tenant/collection. +- Support renaming collections through the API and CLI. + +### v0.5.9 β€” Ranking, Model Quality +- Add hybrid reranking (vector similarity + BM25/token matching). +- Honor `meta.priority` boosts during scoring. +- Improve multilingual relevance and evaluation fixtures. + +## πŸš€ Towards 1.0 + +### 0.6 β€” Per-Collection Embeddings +- Configure embeddings per collection via `config.yml`. +- Maintain per-collection hot caches with isolation guarantees. +- List tenants and collections via the API (CLI parity included). + +### 0.7 β€” API Utilities & Observability +- Default tenant/collection selectors in Swagger UI. +- Export indexes, enforce collection-level logging toggles, add rate limiting, and finalize per-collection embedding configuration flows. + +### 0.8 β€” Reliability & Governance +- Deliver the internal DB for persistence, document versioning, rebuild tooling, persistent metrics, surfacing metrics in the UI, JWT auth, per-tenant quotas, and transactional rollback with safe retries. + +### 0.9 β€” Scale & Multi-Tenant Search +- Async ingest, parallel purge, horizontal scalability, tenant groups, and shared/group search with sub-index routing. + +### 1.0 β€” API Freeze +- Lock routes, publish the final OpenAPI spec, and ship an SDK client ready for long-term support. 
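Editor's note on the v0.5.9 reranking item above (vector similarity blended with BM25/token matching, plus `meta.priority` boosts): the sketch below shows one possible shape for that scoring step. It is illustrative only; the function name `rerank`, the 0.7/0.3 weights, and the log-dampened boost are assumptions for the roadmap item, not existing Patchvec APIs.

```python
# Illustrative sketch only: `rerank`, the weights, and the priority boost are
# assumptions for the v0.5.9 roadmap item, not Patchvec code.
from __future__ import annotations

import math
import re
from typing import Any, Dict, List


def _token_overlap(query: str, text: str) -> float:
    """Crude BM25 stand-in: fraction of query tokens that appear in the chunk."""
    q_tokens = set(re.findall(r"\w+", query.lower()))
    if not q_tokens:
        return 0.0
    t_tokens = set(re.findall(r"\w+", text.lower()))
    return len(q_tokens & t_tokens) / len(q_tokens)


def rerank(query: str, hits: List[Dict[str, Any]],
           vector_weight: float = 0.7, lexical_weight: float = 0.3) -> List[Dict[str, Any]]:
    """Blend vector similarity with token matching, then apply meta.priority boosts."""
    for hit in hits:
        lexical = _token_overlap(query, hit.get("text") or "")
        priority = float((hit.get("meta") or {}).get("priority", 0) or 0)
        # Dampen the boost so priority reorders near-ties without drowning relevance.
        boost = 1.0 + 0.1 * math.log1p(max(priority, 0.0))
        hit["score"] = (vector_weight * hit.get("score", 0.0)
                        + lexical_weight * lexical) * boost
    return sorted(hits, key=lambda h: h["score"], reverse=True)


if __name__ == "__main__":
    hits = [
        {"id": "doc::0", "score": 0.82, "text": "Captain Nemo commands the Nautilus",
         "meta": {"priority": 0}},
        {"id": "doc::1", "score": 0.80, "text": "a captain and a nautilus shell",
         "meta": {"priority": 2}},
    ]
    for h in rerank("captain nemo", hits):
        print(h["id"], round(h["score"], 3))
```

Keeping the fusion in a single pure function like this would let it be unit-tested with plain hit dictionaries, independently of whichever store backend produced them.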
From 74789ffa734cb7504c3be8443379cecf1ace8b05 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Sun, 2 Nov 2025 01:35:56 -0300 Subject: [PATCH 2/5] [store] ensure CRLF-rich documents round-trip intact --- pave/stores/txtai_store.py | 9 +++++---- tests/test_txtai_store.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pave/stores/txtai_store.py b/pave/stores/txtai_store.py index 0f44ba8..31280b2 100644 --- a/pave/stores/txtai_store.py +++ b/pave/stores/txtai_store.py @@ -179,16 +179,17 @@ def _save_chunk_text(self, tenant: str, collection: str, p = os.path.join(self._chunks_dir(tenant, collection), self._urid_to_fname(urid)) os.makedirs(os.path.dirname(p), exist_ok=True) - with open(p, "w", encoding="utf-8") as f: - f.write(t or "") + data = (t or "").encode("utf-8") + with open(p, "wb") as f: + f.write(data) f.flush() def _load_chunk_text(self, tenant: str, collection: str, urid: str) -> str | None: p = os.path.join(self._chunks_dir(tenant, collection), self._urid_to_fname(urid)) if os.path.isfile(p): - with open(p, "r", encoding="utf-8") as f: - return f.read() + with open(p, "rb") as f: + return f.read().decode("utf-8") return None def index_records(self, tenant: str, collection: str, docid: str, diff --git a/tests/test_txtai_store.py b/tests/test_txtai_store.py index a007877..27c0925 100644 --- a/tests/test_txtai_store.py +++ b/tests/test_txtai_store.py @@ -69,6 +69,18 @@ def test_index_adds_docid_prefix(store): assert hits[0]["text"] is not None and "verde" in hits[0]["text"].lower() assert hits[0]["id"] is not None and "docbike::0" == hits[0]["id"] +def test_chunk_sidecar_preserves_crlf(store): + text = "First line\r\nSecond line\r\n" + recs = [ + {"id": "0", "content": text, "metadata": {"lang": "en"}}, + ] + + n = store.index_records("acme", "crlf", "doccrlf", recs) + assert n == 1 + + stored = store.impl._load_chunk_text("acme", "crlf", "doccrlf::0") + assert stored == text + def test_meta_json_and_filters(store): recs = [ {"id": "docx::0", "content": "OlΓ‘ mundo", "metadata": {"lang": "pt"}}, From d42463236ad03a0a6ad22d32947bea51ad12f011 Mon Sep 17 00:00:00 2001 From: Rodrigo Rodrigues da Silva Date: Wed, 5 Nov 2025 00:58:41 +0000 Subject: [PATCH 3/5] Sanitize txtai metadata persistence --- pave/stores/txtai_store.py | 113 ++++++++++++++++++++++---- tests/conftest.py | 16 ++++ tests/test_txtai_store_sql_safety.py | 116 +++++++++++++++++++++++++++ tests/utils.py | 56 +++++++++++-- 4 files changed, 278 insertions(+), 23 deletions(-) create mode 100644 tests/test_txtai_store_sql_safety.py diff --git a/pave/stores/txtai_store.py b/pave/stores/txtai_store.py index 31280b2..be2c3bb 100644 --- a/pave/stores/txtai_store.py +++ b/pave/stores/txtai_store.py @@ -4,7 +4,7 @@ from __future__ import annotations import os, json, operator from datetime import datetime -from typing import Dict, Iterable, List, Any +from typing import Any, Dict, Iterable, List, Optional from threading import Lock from contextlib import contextmanager from txtai.embeddings import Embeddings @@ -12,6 +12,14 @@ from pave.config import CFG as c, LOG as log _LOCKS : dict[str, Lock] = {} +_SQL_TRANS = str.maketrans({ + ";": " ", + '"': " ", + "`": " ", + "\\": " ", + "\x00": "", +}) + def get_lock(key: str) -> Lock: if key not in _LOCKS: _LOCKS[key] = Lock() @@ -236,10 +244,10 @@ def index_records(self, tenant: str, collection: str, docid: str, md["docid"] = docid try: - meta_json = json.dumps(md, ensure_ascii=False) - md = json.loads(meta_json) - except: - md = {} + 
safe_meta = self._sanit_meta_dict(md) + meta_json = json.dumps(safe_meta, ensure_ascii=False) + except Exception: + safe_meta = {} meta_json = "" rid = str(rid) @@ -247,9 +255,11 @@ def index_records(self, tenant: str, collection: str, docid: str, if not rid.startswith(f"{docid}::"): rid = f"{docid}::{rid}" - meta_side[rid] = md + md_for_index = {k: v for k, v in safe_meta.items() if k != "text"} + + meta_side[rid] = safe_meta record_ids.append(rid) - prepared.append((rid, {"text":txt, **md}, meta_json)) + prepared.append((rid, {"text": txt, **md_for_index}, meta_json)) self._save_chunk_text(tenant, collection, rid, txt) assert txt == (self._load_chunk_text(tenant, collection, rid) or "") @@ -280,10 +290,15 @@ def _matches_filters(m: Dict[str, Any], if not filters: return True - def match(have: Any, cond: str) -> bool: + def match(have: Any, cond: Any) -> bool: if have is None: return False - s = str(cond) + if isinstance(have, (list, tuple, set)): + return any(match(item, cond) for item in have) + if isinstance(cond, str): + s = TxtaiStore._sanit_sql(cond) + else: + s = str(cond) hv = str(have) # Numeric/date ops for op in (">=", "<=", "!=", ">", "<"): @@ -313,7 +328,7 @@ def match(have: Any, cond: str) -> bool: return hv == s for k, vals in filters.items(): - if not any(match(m.get(k), v) for v in vals): + if not any(match(TxtaiStore._lookup_meta(m, k), v) for v in vals): return False return True @@ -325,6 +340,9 @@ def _split_filters(filters: dict[str, Any] | None) -> tuple[dict, dict]: pre_f, pos_f = {}, {} for key, vals in (filters or {}).items(): + safe_key = TxtaiStore._sanit_field(key) + if not safe_key: + continue if not isinstance(vals, list): vals = [vals] exacts, extended = [], [] @@ -338,12 +356,68 @@ def _split_filters(filters: dict[str, Any] | None) -> tuple[dict, dict]: else: exacts.append(v) if exacts: - pre_f[key] = exacts + pre_f[safe_key] = exacts if extended: - pos_f[key] = extended + pos_f[safe_key] = extended log.debug(f"after split: PRE {pre_f} POS {pos_f}") return pre_f, pos_f + @staticmethod + def _lookup_meta(meta: Dict[str, Any] | None, key: str) -> Any: + if not meta: + return None + if key in meta: + return meta.get(key) + for raw_key, value in meta.items(): + if TxtaiStore._sanit_field(raw_key) == key: + return value + return None + + @staticmethod + def _sanit_meta_value(value: Any) -> Any: + if isinstance(value, dict): + return TxtaiStore._sanit_meta_dict(value) + if isinstance(value, (list, tuple, set)): + return [TxtaiStore._sanit_meta_value(v) for v in value] + if isinstance(value, (int, float, bool)) or value is None: + return value + return TxtaiStore._sanit_sql(value) + + @staticmethod + def _sanit_meta_dict(meta: Dict[str, Any] | None) -> Dict[str, Any]: + safe: Dict[str, Any] = {} + if not isinstance(meta, dict): + return safe + for raw_key, raw_value in meta.items(): + safe_key = TxtaiStore._sanit_field(raw_key) + if not safe_key or safe_key == "text": + continue + safe[safe_key] = TxtaiStore._sanit_meta_value(raw_value) + return safe + + @staticmethod + def _sanit_sql(value: Any, *, max_len: Optional[int] = None) -> str: + if value is None: + return "" + text = str(value).translate(_SQL_TRANS) + for token in ("--", "/*", "*/"): + if token in text: + text = text.split(token, 1)[0] + text = text.strip() + if max_len is not None and max_len > 0 and len(text) > max_len: + text = text[:max_len] + return text.replace("'", "''") + + @staticmethod + def _sanit_field(name: Any) -> str: + if not isinstance(name, str): + name = str(name) + safe = [] + for ch 
in name: + if ch.isalnum() or ch in {"_", "-"}: + safe.append(ch) + return "".join(safe) + @staticmethod def _build_sql(query: str, k: int, filters: dict[str, Any], columns: list[str], with_similarity: bool = True, avoid_duplicates = True) -> str: @@ -356,14 +430,23 @@ def _build_sql(query: str, k: int, filters: dict[str, Any], columns: list[str], wheres = [] if with_similarity and query: - q_safe = query.replace("'", "''") + max_len_cfg = c.get("vector_store.txtai.max_query_chars", 512) + try: + max_len = int(max_len_cfg) + except (TypeError, ValueError): + max_len = 512 + limit = max_len if max_len > 0 else None + q_safe = TxtaiStore._sanit_sql(query, max_len=limit) wheres.append(f"similar('{q_safe}')") for key, vals in filters.items(): + safe_key = TxtaiStore._sanit_field(key) + if not safe_key: + continue ors = [] for v in vals: - safe_v = str(v).replace("'", "''") - ors.append(f"[{key}] = '{safe_v}'") + safe_v = TxtaiStore._sanit_sql(v) + ors.append(f"[{safe_key}] = '{safe_v}'") or_safe = " OR ".join(ors) wheres.append(f"({or_safe})") diff --git a/tests/conftest.py b/tests/conftest.py index bf4e8b0..8fd621b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,22 @@ # (C) 2025 Rodrigo Rodrigues da Silva # SPDX-License-Identifier: GPL-3.0-or-later +import sys +import types + +if "txtai.embeddings" not in sys.modules: + txtai_stub = types.ModuleType("txtai") + embeddings_stub = types.ModuleType("txtai.embeddings") + + class _StubEmbeddings: # pragma: no cover - stub for optional dependency + def __init__(self, *args, **kwargs): + pass + + embeddings_stub.Embeddings = _StubEmbeddings + txtai_stub.embeddings = embeddings_stub + sys.modules.setdefault("txtai", txtai_stub) + sys.modules.setdefault("txtai.embeddings", embeddings_stub) + import pytest from fastapi.testclient import TestClient from pave.config import get_cfg, reload_cfg diff --git a/tests/test_txtai_store_sql_safety.py b/tests/test_txtai_store_sql_safety.py new file mode 100644 index 0000000..e230929 --- /dev/null +++ b/tests/test_txtai_store_sql_safety.py @@ -0,0 +1,116 @@ +# (C) 2025 Rodrigo Rodrigues da Silva +# SPDX-License-Identifier: GPL-3.0-or-later + +import json + +import pytest + +from pave.stores import txtai_store as store_mod +from pave.stores.txtai_store import TxtaiStore +from pave.config import get_cfg +from utils import FakeEmbeddings + + +@pytest.fixture(autouse=True) +def _fake_embeddings(monkeypatch): + monkeypatch.setattr(store_mod, "Embeddings", FakeEmbeddings, raising=True) + + +@pytest.fixture() +def store(): + return TxtaiStore() + + +def _extract_similarity_term(sql: str) -> str: + marker = "similar('" + if marker not in sql: + raise AssertionError(f"similar() clause missing in SQL: {sql!r}") + rest = sql.split(marker, 1)[1] + return rest.split("')", 1)[0] + + +def test_build_sql_sanitizes_similarity_term(store): + raw_query = "foo'; DROP TABLE users; -- comment" + sql = store._build_sql(raw_query, 5, {}, ["id", "text"]) + term = _extract_similarity_term(sql) + + # injection primitives are stripped or neutralised + assert ";" not in term + assert "--" not in term + # original alpha characters remain so search still works + assert "foo" in term + + +def test_build_sql_sanitizes_filter_values(store): + filters = {"lang": ["en'; DELETE FROM x;"], "tags": ['alpha"beta']} + sql = store._build_sql("foo", 5, filters, ["id", "text"]) + + # filter clause should not leak dangerous characters + assert ";" not in sql + assert '"' not in sql + assert "--" not in sql + + +def 
test_build_sql_normalises_filter_keys(store): + filters = {"lang]; DROP": ["en"], 123: ["x"]} + sql = store._build_sql("foo", 5, filters, ["id"]) + assert "[langDROP]" in sql + assert "[123]" in sql + + +def test_build_sql_applies_query_length_limit(store): + cfg = get_cfg() + snapshot = cfg.snapshot() + try: + cfg.set("vector_store.txtai.max_query_chars", 8) + sql = store._build_sql("abcdefghijklmno", 5, {}, ["id"]) + term = _extract_similarity_term(sql) + + # collapse the doubled quotes to measure the original payload length + collapsed = term.replace("''", "'") + assert len(collapsed) == 8 + finally: + cfg.replace(data=snapshot) + + +def test_search_handles_special_characters(store): + tenant, collection = "tenant", "coll" + store.load_or_init(tenant, collection) + + records = [("r1", "hello world", {"lang": "en"})] + store.index_records(tenant, collection, "doc", records) + + hits = store.search(tenant, collection, "world; -- comment", k=5) + assert hits + assert hits[0]["id"].endswith("::r1") + + +def test_round_trip_with_weird_metadata_field(store): + tenant, collection = "tenant", "coll" + store.load_or_init(tenant, collection) + + weird_key = "meta;`DROP" + weird_value = "val'u" + records = [("r2", "strange world", {weird_key: weird_value})] + store.index_records(tenant, collection, "doc2", records) + + filters = {weird_key: weird_value} + hits = store.search(tenant, collection, "strange", k=5, filters=filters) + + assert hits + assert hits[0]["id"].endswith("::r2") + + emb = store._emb[(tenant, collection)] + safe_key = TxtaiStore._sanit_field(weird_key) + assert emb.last_sql and f"[{safe_key}]" in emb.last_sql + + rid = hits[0]["id"] + stored_meta = store._load_meta(tenant, collection).get(rid) or {} + assert safe_key in stored_meta + assert stored_meta[safe_key] == TxtaiStore._sanit_sql(weird_value) + + doc = emb._docs[rid] + assert doc["meta"].get(safe_key) == TxtaiStore._sanit_sql(weird_value) + serialized = json.loads(doc["meta_json"]) if doc.get("meta_json") else {} + assert serialized.get(safe_key) == TxtaiStore._sanit_sql(weird_value) + assert hits[0]["meta"].get(safe_key) == TxtaiStore._sanit_sql(weird_value) diff --git a/tests/utils.py b/tests/utils.py index 4acbb3c..4b77c35 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -10,18 +10,26 @@ class FakeEmbeddings: """Tiny in-memory index. 
Keeps interface you use in tests.""" def __init__(self, config): # config unused - self._docs = {} # rid -> (text, meta_json) + self._docs = {} # rid -> {"text": str, "meta_json": str, "meta": dict} + self.last_sql = None def index(self, docs): - for rid, text, meta_json in docs: + for rid, payload, meta_json in docs: assert isinstance(meta_json, str) - self._docs[rid] = (text, meta_json) + if isinstance(payload, dict): + text = payload.get("text") + meta = {k: v for k, v in payload.items() if k != "text"} + else: + text = payload + meta = {} + self._docs[rid] = {"text": text, "meta_json": meta_json, "meta": meta} def upsert(self, docs): return self.index(docs) def search(self, sql, k=5): import re + self.last_sql = sql term = None m = re.search(r"similar\('([^']+)'", sql) if m: @@ -32,10 +40,42 @@ def search(self, sql, k=5): if not term: return [] - hits = [ - {"id": rid, "score": 1.0, "text": txt, "tags": {"docid": "DUMMY"}} - for rid, (txt, _) in self._docs.items() if term in str(txt).lower() - ] + filter_pairs = re.findall(r"\[([^\]]+)\]\s*=\s*'((?:''|[^'])*)'", sql) + + hits = [] + for rid, entry in self._docs.items(): + text = entry.get("text") + if text is None: + continue + if term not in str(text).lower(): + continue + + metadata = entry.get("meta") or {} + include = True + for field, raw_val in filter_pairs: + stored = metadata.get(field) + if stored is None: + include = False + break + expected = raw_val + if isinstance(stored, (list, tuple, set)): + options = {str(v) for v in stored} + if expected not in options: + include = False + break + else: + if str(stored) != expected: + include = False + break + if not include: + continue + + hits.append({ + "id": rid, + "score": 1.0, + "text": text, + "docid": metadata.get("docid"), + }) return hits[:10] """ q = (query or "").lower() @@ -47,7 +87,7 @@ def search(self, sql, k=5): """ def lookup(self, ids): - return {rid: self._docs.get(rid, ("", ""))[0] for rid in ids} + return {rid: (self._docs.get(rid) or {}).get("text") for rid in ids} def delete(self, ids): for rid in ids: From c964f1eb9002f29179316d6229c48ee7e08890f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:05:11 +0000 Subject: [PATCH 4/5] Initial plan From 54e10bc39446e01eedc7190e1c098618577a87b7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:16:11 +0000 Subject: [PATCH 5/5] Fix escaped single quotes handling in FakeEmbeddings filters Co-authored-by: rodrigopitanga <1755608+rodrigopitanga@users.noreply.github.com> --- tests/test_fake_embeddings_escape.py | 69 ++++++++++++++++++++++++++++ tests/utils.py | 2 +- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 tests/test_fake_embeddings_escape.py diff --git a/tests/test_fake_embeddings_escape.py b/tests/test_fake_embeddings_escape.py new file mode 100644 index 0000000..6007f5e --- /dev/null +++ b/tests/test_fake_embeddings_escape.py @@ -0,0 +1,69 @@ +# (C) 2025 Rodrigo Rodrigues da Silva +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Test that FakeEmbeddings correctly handles escaped single quotes in filter values.""" + +from utils import FakeEmbeddings + + +def test_fake_embeddings_escaped_single_quote(): + """Test that filter values with escaped single quotes are correctly unescaped.""" + fake = FakeEmbeddings({}) + + # Index a document with a single quote in metadata + # The metadata is stored in the payload dict + docs = [ + ("doc1", {"text": 
"test content", "author": "O'Brien"}, '{"author": "O\'Brien"}'), + ("doc2", {"text": "another test", "author": "Smith"}, '{"author": "Smith"}'), + ] + fake.index(docs) + + # Search with a filter that includes an escaped single quote (as it would appear in SQL) + # The SQL sanitizer in txtai_store.py escapes ' as '' + sql = "SELECT * FROM txtai WHERE similar('test') AND [author] = 'O''Brien'" + results = fake.search(sql) + + # Should find doc1 but not doc2 + assert len(results) == 1 + assert results[0]["id"] == "doc1" + assert results[0]["text"] == "test content" + + +def test_fake_embeddings_no_escape_needed(): + """Test that filter values without single quotes still work correctly.""" + fake = FakeEmbeddings({}) + + # Index documents without single quotes + docs = [ + ("doc1", {"text": "test content", "author": "Smith"}, '{"author": "Smith"}'), + ("doc2", {"text": "another test", "author": "Jones"}, '{"author": "Jones"}'), + ] + fake.index(docs) + + # Search with a normal filter + sql = "SELECT * FROM txtai WHERE similar('test') AND [author] = 'Smith'" + results = fake.search(sql) + + # Should find doc1 + assert len(results) == 1 + assert results[0]["id"] == "doc1" + + +def test_fake_embeddings_multiple_escaped_quotes(): + """Test that multiple escaped single quotes in a value are handled correctly.""" + fake = FakeEmbeddings({}) + + # Index a document with multiple single quotes in metadata + docs = [ + ("doc1", {"text": "test content", "title": "It's a boy's life"}, '{"title": "It\'s a boy\'s life"}'), + ("doc2", {"text": "another test", "title": "Simple title"}, '{"title": "Simple title"}'), + ] + fake.index(docs) + + # Search with a filter that has multiple escaped single quotes + sql = "SELECT * FROM txtai WHERE similar('test') AND [title] = 'It''s a boy''s life'" + results = fake.search(sql) + + # Should find doc1 + assert len(results) == 1 + assert results[0]["id"] == "doc1" diff --git a/tests/utils.py b/tests/utils.py index 4b77c35..74ba374 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -57,7 +57,7 @@ def search(self, sql, k=5): if stored is None: include = False break - expected = raw_val + expected = raw_val.replace("''", "'") if isinstance(stored, (list, tuple, set)): options = {str(v) for v in stored} if expected not in options: