From 53b8b9dc668b9371c1829745a31179b973eb3ec4 Mon Sep 17 00:00:00 2001 From: Dima Mikielewicz Date: Wed, 4 Feb 2026 12:14:41 +0100 Subject: [PATCH 1/4] Add BM25 search --- .github/workflows/ci.yml | 9 +- CHANGELOG.md | 29 ++ README.md | 52 ++- lib/torus.ex | 151 ++++++ lib/torus/search/bm25.ex | 122 +++++ mix.exs | 4 +- .../20260203120000_add_pg_textsearch.exs | 23 + test/test_helper.exs | 2 +- test/torus/bm25_compile_test.exs | 40 ++ test/torus/bm25_test.exs | 432 ++++++++++++++++++ 10 files changed, 841 insertions(+), 23 deletions(-) create mode 100644 lib/torus/search/bm25.ex create mode 100644 test/support/migrations/20260203120000_add_pg_textsearch.exs create mode 100644 test/torus/bm25_compile_test.exs create mode 100644 test/torus/bm25_test.exs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2880915..6daa9e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: # Additional services can be defined here if required. services: db: - image: postgres:15 + image: postgres:17 ports: - 5432/tcp env: @@ -40,8 +40,8 @@ jobs: name: Test on OTP ${{matrix.otp}} / Elixir ${{matrix.elixir}} strategy: matrix: - otp: ['27.2'] - elixir: ['1.18.0'] + otp: ['28.3'] + elixir: ['1.19.5'] steps: # Step: Setup Elixir + Erlang image as the base. - name: Set up Elixir @@ -86,6 +86,9 @@ jobs: - name: Install dependencies run: mix deps.get --check-locked + - name: Install pg_textsearch extension + run: | + docker exec ${{ job.services.db.id }} bash -lc "apt-get update && apt-get install -y wget && wget -q -O /tmp/pg_textsearch.deb https://github.com/timescale/pg_textsearch/releases/download/v0.5.0/pg-textsearch-postgresql-17_0.5.0-1_amd64.deb && dpkg -i /tmp/pg_textsearch.deb" # TODO These steps can be moved to `mix.exs` # Step: Compile the project treating any warnings as errors. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 251b216..1e7886f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,32 @@ +# v0.6.0 + +## New 🔥 + +**BM25 Full-Text Search** is now available via the new `Torus.bm25/5` macro! + +[BM25](https://en.wikipedia.org/wiki/Okapi_BM25) is a modern ranking algorithm that generally provides superior relevance scoring compared to traditional TF-IDF (used by `full_text/5`). This integration uses the [pg_textsearch](https://github.com/timescale/pg_textsearch) extension by Timescale. + +Key features: + +- State-of-the-art BM25 ranking with configurable index parameters (k1, b) +- Blazingly fast top-k queries via Block-Max WAND optimization (`Torus.bm25/5` + `limit`) +- Simple syntax: `Post |> Torus.bm25([p], p.body, "search term") |> limit(10)` +- Score selection with `:score_key` and post-filtering with `:score_threshold` +- Language/stemming configured at index creation via `text_config` + +Requirements: + +- PostgreSQL 17+ +- pg_textsearch extension installed +- BM25 index on the search column (with `text_config` for language) + +See the [BM25 Search Guide](https://dimamik.com/posts/bm25_search) for detailed setup instructions and examples. + +**When to use BM25 vs full_text:** + +- Use `bm25/5` for fast single-column search with modern relevance ranking +- Use `full_text/5` for multi-column search with weights or when using stored tsvector columns + # v0.5.3 ## Fixes diff --git a/README.md b/README.md index 6f770f1..ae79460 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Post See [`full_text/5`](https://hexdocs.pm/torus/Torus.html#full_text/5) for more details. -## 6 types of search: +## 7 types of search: 1. **Pattern matching**: Searches for a specific pattern in a string. @@ -58,12 +58,13 @@ See [`full_text/5`](https://hexdocs.pm/torus/Torus.html#full_text/5) for more de 1. **Similarity:** Searches for records that closely match the input text using trigram distance. 
```elixir - iex> insert_posts!(["Hogwarts Secrets", "Quidditch Fever", "Hogwart’s Secret"]) - ...> Post - ...> |> Torus.similarity([p], [p.title], "hoggwarrds") - ...> |> limit(2) - ...> |> select([p], p.title) - ...> |> Repo.all() + insert_posts!(["Hogwarts Secrets", "Quidditch Fever", "Hogwart’s Secret"]) + + Post + |> Torus.similarity([p], [p.title], "hoggwarrds") + |> limit(2) + |> select([p], p.title) + |> Repo.all() ["Hogwarts Secrets", "Hogwart’s Secret"] ``` @@ -74,20 +75,39 @@ See [`full_text/5`](https://hexdocs.pm/torus/Torus.html#full_text/5) for more de 1. **Full text**: Uses term-document matrix vectors for, enabling efficient querying and ranking based on term frequency. Supports prefix search and is great for large datasets to quickly return relevant results. See [PostgreSQL Full Text Search](https://www.postgresql.org/docs/current/textsearch.html) for internal implementation details. ```elixir - iex> insert_post!(title: "Hogwarts Shocker", body: "A spell disrupts the Quidditch Cup.") - ...> insert_post!(title: "Diagon Bombshell", body: "Secrets uncovered in the heart of Hogwarts.") - ...> insert_post!(title: "Completely unrelated", body: "No magic here!") - ...> Post - ...> |> Torus.full_text([p], [p.title, p.body], "uncov hogwar") - ...> |> select([p], p.title) - ...> |> Repo.all() + insert_post!(title: "Hogwarts Shocker", body: "A spell disrupts the Quidditch Cup.") + insert_post!(title: "Diagon Bombshell", body: "Secrets uncovered in the heart of Hogwarts.") + insert_post!(title: "Completely unrelated", body: "No magic here!") + + Post + |> Torus.full_text([p], [p.title, p.body], "uncov hogwar") + |> select([p], p.title) + |> Repo.all() ["Diagon Bombshell"] ``` - Use it when you don’t care about spelling, the documents are long, or if you need to order the results by rank. + Use it when you don't care about spelling, the documents are long, you need multi-column search with weights, or if you need to order the results by rank. 
See [`full_text/5`](https://hexdocs.pm/torus/Torus.html#full_text/5) for more details. +1. **BM25 full text**: Modern BM25 ranking algorithm for superior relevance scoring using the [pg_textsearch](https://github.com/timescale/pg_textsearch) extension. BM25 generally provides better ranking than traditional built-in TF-IDF full text search and is optimized for top-k queries. + + ```elixir + insert_post!(title: "Hogwarts Shocker", body: "A spell disrupts the Quidditch Cup.") + insert_post!(title: "Diagon Bombshell", body: "Secrets uncovered in the heart of Hogwarts.") + insert_post!(title: "Completely unrelated", body: "No magic here!") + + Post + |> Torus.bm25([p], p.body, "secrets hogwarts") + |> select([p], p.title) + |> Repo.all() + ["Diagon Bombshell"] + ``` + + Use it when you need state-of-the-art relevance ranking for single-column search, especially with LIMIT clauses. Requires PostgreSQL 17+. + + See [`bm25/5`](https://hexdocs.pm/torus/Torus.html#bm25/5) and the [BM25 Search Guide](https://dimamik.com/posts/bm25_search) for detailed setup instructions and examples. + 1. **Semantic Search**: Understands the contextual meaning of queries to match and retrieve related content utilizing natural language processing. Read more about semantic search in [Semantic search with Torus guide](/guides/semantic_search.md). ```elixir @@ -131,7 +151,7 @@ Torus offers a few helpers to debug, explain, and analyze your queries before us ## Torus support -For now, Torus supports pattern match, similarity, full-text, and semantic search, with plans to expand support further. These docs will be updated with more examples on which search type to choose and how to make them more performant (by adding indexes or using specific functions). +For now, Torus supports pattern match, similarity, full-text (TF-IDF and BM25), and semantic search, with plans to expand support further. 
These docs will be updated with more examples on which search type to choose and how to make them more performant (by adding indexes or using specific functions). diff --git a/lib/torus.ex b/lib/torus.ex index 4d6b990..2a75af8 100644 --- a/lib/torus.ex +++ b/lib/torus.ex @@ -366,6 +366,157 @@ defmodule Torus do Torus.Search.FullText.to_tsquery(column, query_text, opts) end + @doc group: "Full text" + @doc """ + BM25 ranked full-text search using the [pg_textsearch](https://github.com/timescale/pg_textsearch) extension. + + BM25 is a modern ranking function that generally provides better relevance than traditional + TF-IDF (used by `full_text/5`). It's particularly effective for top-k queries with LIMIT clauses + due to Block-Max WAND optimization. + + For detailed usage examples, performance tips, and migration guide, see the [BM25 Search guide](https://dimamik.com/posts/bm25_search). + + > #### Requirements {: .warning} + > + > - Requires the `pg_textsearch` extension to be installed + > - PostgreSQL 17+ only + > - Requires a BM25 index on the search column + > - **Single column only** - unlike `full_text/5`, BM25 indexes work on one column at a time + > - **Language is set at index creation** - use `text_config` in the index `WITH` clause + > + > ```elixir + > defmodule YourApp.Repo.Migrations.CreatePgTextsearchExtension do + > use Ecto.Migration + > + > def change do + > execute "CREATE EXTENSION IF NOT EXISTS pg_textsearch", "DROP EXTENSION IF EXISTS pg_textsearch" + > + > # Create BM25 index with language configuration + > execute \"\"\" + > CREATE INDEX posts_body_bm25_idx ON posts + > USING bm25(body) WITH (text_config='english') + > \"\"\", "DROP INDEX posts_body_bm25_idx" + > end + > end + > ``` + + ## Options + + * `:order` - Ordering of results. 
Note that BM25 returns **negative scores** (lower is better): + - `:asc` (default) - orders by score ascending (best matches first) + - `:desc` - orders by score descending (worst matches first) + - `:none` - no ordering applied + * `:index_name` - Explicit index name. Required when using `score_threshold`. + * `:score_key` - Atom key to select the BM25 score into the result map. + - `:none` (default) - score is not selected + - `atom` - merges score under this key via `select_merge/3` (call `select/3` before `bm25/5`) + * `:score_threshold` - Post-filter results by BM25 score (applied after ORDER BY). + Since scores are negative and lower is better, use negative thresholds (e.g., `-3.0` + keeps only results with score < -3.0, i.e., scores like -4.0, -5.0 which are better matches). + May return fewer results than LIMIT. + * `:pre_filter` - Whether to exclude non-matching rows. + - `false` (default) - no pre-filtering + - `true` - adds a `WHERE score < 0` clause to exclude non-matches + + ## Examples + + Basic search - returns top 10 most relevant posts: + + Post + |> Torus.bm25([p], p.body, "database search") + |> limit(10) + |> select([p], p.body) + |> Repo.all() + + With score selection: + + Post + |> select([p], %{body: p.body}) + |> Torus.bm25([p], p.body, "database", score_key: :relevance) + |> limit(5) + |> Repo.all() + # => [%{body: "...", relevance: -2.5}, ...] 
+ + With WHERE clause pre-filtering: + + Post + |> where([p], p.category_id == 123) + |> Torus.bm25([p], p.body, "database") + |> limit(10) + |> Repo.all() + + With score threshold (post-filtering, may return fewer than LIMIT, `index_name` is required): + + Post + |> Torus.bm25([p], p.body, "database", score_threshold: -5.0, index_name: "posts_body_bm25_idx") + |> limit(10) + |> Repo.all() + + ## When to use `bm25/5` vs `full_text/5` + + **Use `bm25/5` when:** + - You need better relevance ranking than TF-IDF + - You need faster search with large datasets + - You have large result sets with LIMIT (top-k queries) + - Single column search is sufficient + - You're on PostgreSQL 17+ + + **Use `full_text/5` when:** + - You need multi-column search with different weights per column + - You want to use stored tsvector columns + - You're on PostgreSQL < 17 + - You need the `concat` filter type + + ## Multi-column search workaround + + Since BM25 indexes work on single columns, you can create a generated column: + + ```sql + ALTER TABLE posts + ADD COLUMN searchable_text TEXT + GENERATED ALWAYS AS (title || ' ' || body) STORED; + + CREATE INDEX posts_searchable_bm25_idx + ON posts USING bm25(searchable_text) + WITH (text_config='english'); + ``` + + Then search the generated column: + + ```elixir + Post + |> Torus.bm25([p], p.searchable_text, "search term") + |> limit(10) + |> Repo.all() + ``` + + ## Index options + + BM25 indexes support these parameters in the `WITH` clause: + + - `text_config` - PostgreSQL text search configuration (required). This determines + the language/stemming rules. Available configs: `'english'`, `'french'`, `'german'`, + `'simple'` (no stemming), etc. Run `SELECT cfgname FROM pg_ts_config;` to list all. 
+ - `k1` - Term frequency saturation (default: 1.2, range: 0.1-10.0) + - `b` - Length normalization (default: 0.75, range: 0.0-1.0) + + ```sql + CREATE INDEX custom_idx ON documents + USING bm25(content) + WITH (text_config='english', k1=1.5, b=0.8); + ``` + + ## Performance tips + + - BM25 is most efficient with `ORDER BY + LIMIT` (enables Block-Max WAND optimization) + - For filtered searches, create a separate B-tree index on the filter column + - Pre-filtering works best when the filter is selective (<10% of rows) + - Post-filtering with `score_threshold` may return fewer results than LIMIT + """ + defmacro bm25(query, bindings, qualifier, term, opts \\ []) do + Torus.Search.BM25.bm25(query, bindings, qualifier, term, opts) + end + @doc group: "Pattern matching" @doc """ The substring function with three parameters provides extraction of a substring diff --git a/lib/torus/search/bm25.ex b/lib/torus/search/bm25.ex new file mode 100644 index 0000000..530d28a --- /dev/null +++ b/lib/torus/search/bm25.ex @@ -0,0 +1,122 @@ +defmodule Torus.Search.BM25 do + @moduledoc false + import Torus.Search.Common + import Ecto.Query, warn: false + + @order_types ~w[asc desc none]a + + def bm25(query, bindings, qualifier, term, opts \\ []) do + order = get_arg!(opts, :order, :asc, @order_types) + index_name = Keyword.get(opts, :index_name, nil) + pre_filter = get_arg!(opts, :pre_filter, false, [true, false]) + score_key = Keyword.get(opts, :score_key, :none) + score_threshold = Keyword.get(opts, :score_threshold, nil) + + raise_if( + score_key != :none and not is_atom(score_key), + "The `score_key` option must be an atom or :none." + ) + + raise_if( + score_threshold != nil and index_name == nil, + "The `index_name` option is required when using `score_threshold`." 
+ ) + + # Build the BM25 query fragments + # When index_name is provided, use 2-arg form for explicit index specification + # Otherwise use 1-arg form and let PostgreSQL auto-detect the index + {bm25query_fragment, bm25query_params} = + if index_name do + { + "to_bm25query(?, ?)", + [term, index_name] + } + else + { + "to_bm25query(?)", + [term] + } + end + + # Score fragment for ordering and selection + score_fragment_string = "? <@> #{bm25query_fragment}" + + # Build score fragment AST + score_fragment = + quote do + fragment( + unquote(score_fragment_string), + unquote(qualifier), + unquote_splicing( + Enum.map(bm25query_params, fn param -> + quote do: ^unquote(param) + end) + ) + ) + end + + # Build order fragment if needed + order_fragment = + if order != :none do + asc_desc = if order == :desc, do: :desc, else: :asc + + quote do + [{unquote(asc_desc), unquote(score_fragment)}] + end + end + + # BM25 scores are negative (lower = better), so "better than threshold" means score < threshold + # (e.g., -5.0 is better than -2.0, so threshold -3.0 keeps scores < -3.0 like -4.0, -5.0) + threshold_fragment_string = "? <@> #{bm25query_fragment} < ?" + + # Pre-filtering by match (excludes non-matches) + # Non-matches have score = 0, matches have score < 0 + pre_filter_fragment_string = "? 
<@> #{bm25query_fragment} < 0" + + # Build the query + quote do + unquote(query) + |> apply_if(unquote(pre_filter), fn q -> + where( + q, + [unquote_splicing(bindings)], + fragment( + unquote(pre_filter_fragment_string), + unquote(qualifier), + unquote_splicing( + Enum.map(bm25query_params, fn param -> + quote do: ^unquote(param) + end) + ) + ) + ) + end) + |> apply_if(unquote(score_threshold) != nil, fn q -> + where( + q, + [unquote_splicing(bindings)], + fragment( + unquote(threshold_fragment_string), + unquote(qualifier), + unquote_splicing( + Enum.map(bm25query_params, fn param -> + quote do: ^unquote(param) + end) + ), + ^unquote(score_threshold) + ) + ) + end) + |> apply_if(unquote(order) != :none, fn q -> + order_by(q, [unquote_splicing(bindings)], unquote(order_fragment)) + end) + |> apply_if(unquote(score_key) != :none, fn q -> + select_merge( + q, + [unquote_splicing(bindings)], + %{unquote(score_key) => unquote(score_fragment)} + ) + end) + end + end +end diff --git a/mix.exs b/mix.exs index caf63c6..e754f17 100644 --- a/mix.exs +++ b/mix.exs @@ -1,7 +1,7 @@ defmodule Torus.MixProject do use Mix.Project - @version "0.5.3" + @version "0.6.0" @source_url "https://github.com/dimamik/torus" def project do @@ -115,8 +115,6 @@ defmodule Torus.MixProject do defp extras do [ "guides/semantic_search.md", - - # TODO Add more guides "CHANGELOG.md": [title: "Changelog"] ] end diff --git a/test/support/migrations/20260203120000_add_pg_textsearch.exs b/test/support/migrations/20260203120000_add_pg_textsearch.exs new file mode 100644 index 0000000..fade5fc --- /dev/null +++ b/test/support/migrations/20260203120000_add_pg_textsearch.exs @@ -0,0 +1,23 @@ +defmodule Torus.Test.Repo.Migrations.AddPgTextsearch do + use Ecto.Migration + + def up do + execute "CREATE EXTENSION IF NOT EXISTS pg_textsearch" + + execute """ + CREATE INDEX posts_body_bm25_idx ON posts + USING bm25(body) WITH (text_config='english') + """ + + execute """ + CREATE INDEX posts_title_bm25_idx ON 
posts + USING bm25(title) WITH (text_config='english') + """ + end + + def down do + execute "DROP INDEX IF EXISTS posts_body_bm25_idx" + execute "DROP INDEX IF EXISTS posts_title_bm25_idx" + execute "DROP EXTENSION IF EXISTS pg_textsearch CASCADE" + end +end diff --git a/test/test_helper.exs b/test/test_helper.exs index 1729333..c87d96d 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -1,5 +1,5 @@ Application.ensure_all_started(:postgrex) Torus.Test.Repo.start_link() -ExUnit.start(assert_receive_timeout: 500, refute_receive_timeout: 50, exclude: [:skip]) +ExUnit.start(assert_receive_timeout: 500, refute_receive_timeout: 50, exclude: [:skip, :skip_ci]) Ecto.Adapters.SQL.Sandbox.mode(Torus.Test.Repo, :manual) diff --git a/test/torus/bm25_compile_test.exs b/test/torus/bm25_compile_test.exs new file mode 100644 index 0000000..824446e --- /dev/null +++ b/test/torus/bm25_compile_test.exs @@ -0,0 +1,40 @@ +defmodule Torus.BM25CompileTest do + @moduledoc false + use ExUnit.Case, async: true + + describe "bm25/5 - compile-time validations" do + test "score_threshold without index_name raises at compile time" do + # This test verifies that using score_threshold without index_name + # results in a compile-time error by attempting to compile code that does so + + code = """ + import Ecto.Query + import Torus + alias TorusTest.Post + + Post + |> Torus.bm25([p], p.body, "search", score_threshold: -5.0) + |> limit(10) + """ + + assert_raise RuntimeError, ~r/index_name.*required.*score_threshold/i, fn -> + Code.eval_string(code, [], __ENV__) + end + end + + test "score_threshold with index_name compiles successfully" do + # This should compile without errors + code = """ + import Ecto.Query + import Torus + alias TorusTest.Post + + Post + |> Torus.bm25([p], p.body, "search", score_threshold: -5.0, index_name: "posts_body_idx") + |> limit(10) + """ + + assert {_query, _bindings} = Code.eval_string(code, [], __ENV__) + end + end +end diff --git 
a/test/torus/bm25_test.exs b/test/torus/bm25_test.exs new file mode 100644 index 0000000..0eece23 --- /dev/null +++ b/test/torus/bm25_test.exs @@ -0,0 +1,432 @@ +defmodule Torus.BM25Test do + @moduledoc false + use Torus.Case, async: false + + @moduletag :skip_ci + + import Ecto.Query + import Torus + + alias Torus.Test.Repo + alias TorusTest.Post + + # NOTE: pg_textsearch has a known limitation where uncommitted rows from + # rolled-back transactions remain in the BM25 memtable. This can cause + # ORDER BY + LIMIT queries to return stale TIDs that fail visibility checks. + # The standalone operator (without ORDER BY) works correctly. + + defp flush_bm25!(index_name \\ "posts_body_bm25_idx") do + Repo.query!("SELECT bm25_spill_index($1)", [index_name]) + :ok + end + + defp reset_bm25_state! do + Repo.query!("TRUNCATE TABLE posts RESTART IDENTITY CASCADE") + flush_bm25!() + flush_bm25!("posts_title_bm25_idx") + :ok + end + + setup do + reset_bm25_state!() + :ok + end + + describe "bm25/5 - basic functionality" do + test "basic search returns matching results" do + insert_post!(body: "Hogwarts is a powerful magical school system") + insert_post!(body: "Quidditch is a ranking sport used in wizarding games") + insert_post!(body: "Spell search enables finding relevant incantations") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magical school system", + pre_filter: true, + index_name: "posts_body_bm25_idx" + ) + |> limit(10) + |> select([p], p.body) + |> Repo.all() + + assert length(results) == 1 + assert "Hogwarts is a powerful magical school system" in results + end + + test "returns empty list when no matches" do + insert_post!(body: "Hogwarts magical") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "nonexistent muggle xyz", + pre_filter: true, + index_name: "posts_body_bm25_idx" + ) + |> limit(10) + |> select([p], p.body) + |> Repo.all() + + assert results == [] + end + end + + describe "bm25/5 - ordering" do + test 
":order defaults to :asc (best matches first)" do + post1 = insert_post!(body: "magic magic magic") + _post2 = insert_post!(body: "magic wands") + _post3 = insert_post!(body: "unrelated content with magic") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", index_name: "posts_body_bm25_idx") + |> select([p], p.id) + |> Repo.all() + + # First result should be post with most relevance (post1) + assert hd(results) == post1.id + end + + test ":order :asc returns best matches first" do + post1 = insert_post!(body: "Hogwarts magical school system architecture") + post2 = insert_post!(body: "magical") + post3 = insert_post!(body: "unrelated text") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magical", order: :asc, index_name: "posts_body_bm25_idx") + |> select([p], p.id) + |> Repo.all() + + # Best matches should be first + assert hd(results) in [post1.id, post2.id] + refute hd(results) == post3.id + end + + test ":order :desc returns worst matches first" do + insert_post!(body: "magic magic magic") + _post_bad = insert_post!(body: "unrelated content with magic mention") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", order: :desc, index_name: "posts_body_bm25_idx") + |> limit(1) + |> select([p], p.id) + |> Repo.all() + + # Worst match should be first with :desc + assert length(results) == 1 + end + + test ":order :none doesn't apply ordering" do + insert_post!(body: "magical wands") + insert_post!(body: "magic") + + flush_bm25!() + + # Query should work without ordering + results = + Post + |> Torus.bm25([p], p.body, "magic", order: :none) + |> select([p], p.body) + |> Repo.all() + + assert length(results) == 2 + end + end + + describe "bm25/5 - score selection" do + test ":score_key :none doesn't select score" do + insert_post!(body: "magical wands") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", score_key: :none, order: :none) + |> select([p], %{body: p.body}) + 
|> Repo.all() + + result = hd(results) + refute Map.has_key?(result, :score) + refute Map.has_key?(result, :relevance) + end + + test ":score_key selects BM25 score into result map" do + insert_post!(body: "magical school architecture") + + flush_bm25!() + + results = + Post + |> select([p], %{id: p.id, body: p.body}) + |> Torus.bm25([p], p.body, "magic", + score_key: :relevance, + index_name: "posts_body_bm25_idx" + ) + |> Repo.all() + + result = hd(results) + assert Map.has_key?(result, :relevance) + assert is_number(result.relevance) + # BM25 scores are negative + assert result.relevance < 0 + end + + test "score_key works with multiple results" do + insert_post!(body: "magic magic") + insert_post!(body: "magic") + + flush_bm25!() + + results = + Post + |> select([p], %{id: p.id, body: p.body}) + |> Torus.bm25([p], p.body, "magic", + score_key: :score, + index_name: "posts_body_bm25_idx" + ) + |> Repo.all() + + assert length(results) == 2 + + for result <- results do + assert Map.has_key?(result, :score) + assert is_number(result.score) + end + end + end + + describe "bm25/5 - index_name option" do + test "explicit index_name works" do + insert_post!(body: "magical wands") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", index_name: "posts_body_bm25_idx") + |> limit(10) + |> select([p], p.body) + |> Repo.all() + + assert "magical wands" in results + end + + test "works without index_name when order: :none" do + insert_post!(body: "magical wands") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", order: :none) + |> limit(10) + |> select([p], p.body) + |> Repo.all() + + assert "magical wands" in results + end + end + + describe "bm25/5 - pre_filter option" do + test "pre_filter true filters non-matching rows" do + insert_post!(body: "magical wands") + insert_post!(body: "completely unrelated muggle stuff") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", + pre_filter: true, + 
index_name: "posts_body_bm25_idx" + ) + |> select([p], p.body) + |> Repo.all() + + assert results == ["magical wands"] + end + end + + describe "bm25/5 - score_threshold option" do + test "score_threshold keeps only results with score better than threshold" do + # Post with high relevance (many occurrences) will have a more negative score + insert_post!(body: "magic magic magic magic") + # Post with low relevance will have a less negative score (closer to 0) + insert_post!(body: "unrelated text with magic") + + flush_bm25!() + + # Get scores to understand what we're filtering + all_results = + Post + |> select([p], %{body: p.body}) + |> Torus.bm25([p], p.body, "magic", + score_key: :score, + index_name: "posts_body_bm25_idx" + ) + |> Repo.all() + + # All results should have negative scores + assert Enum.all?(all_results, fn r -> r.score < 0 end) + + # With a restrictive threshold (score must be < -3.0, meaning better matches only) + # This should filter out worse matches (less negative scores) + results_with_threshold = + Post + |> Torus.bm25([p], p.body, "magic", + score_threshold: -3.0, + index_name: "posts_body_bm25_idx" + ) + |> select([p], p.body) + |> Repo.all() + + # Threshold should filter some results (only keep scores < -3.0) + assert length(results_with_threshold) <= length(all_results) + end + + test "score_threshold nil doesn't filter" do + insert_post!(body: "magical wands") + insert_post!(body: "magic") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", + score_threshold: nil, + index_name: "posts_body_bm25_idx" + ) + |> select([p], p.body) + |> Repo.all() + + assert length(results) == 2 + end + end + + describe "bm25/5 - language via index" do + test "english index uses stemming (wizards matches wizard)" do + insert_post!(body: "wizards are powerful beings") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "wizard", index_name: "posts_body_bm25_idx") + |> select([p], p.body) + |> Repo.all() + + # English 
stemming should match "wizards" with "wizard" + assert "wizards are powerful beings" in results + end + + test "simple index does not use stemming" do + Repo.query!("DROP INDEX IF EXISTS posts_body_bm25_simple_idx") + + Repo.query!(""" + CREATE INDEX posts_body_bm25_simple_idx ON posts + USING bm25(body) WITH (text_config='simple') + """) + + insert_post!(body: "simple spell casting example") + + flush_bm25!("posts_body_bm25_simple_idx") + + results = + Post + |> Torus.bm25([p], p.body, "spell", index_name: "posts_body_bm25_simple_idx") + |> select([p], p.body) + |> Repo.all() + + assert "simple spell casting example" in results + + Repo.query!("DROP INDEX IF EXISTS posts_body_bm25_simple_idx") + end + end + + describe "bm25/5 - integration with where clauses" do + test "works with pre-filtering where clauses" do + post1 = insert_post!(title: "Gryffindor", body: "magical wands") + post2 = insert_post!(title: "Slytherin", body: "magical potions") + + flush_bm25!() + + results = + Post + |> where([p], p.title == "Gryffindor") + |> Torus.bm25([p], p.body, "magic", index_name: "posts_body_bm25_idx") + |> select([p], p.id) + |> Repo.all() + + assert results == [post1.id] + refute post2.id in results + end + + test "combines with other Ecto query functions" do + insert_post!(title: "Gryffindor", body: "magical wands") + insert_post!(title: "Slytherin", body: "magical potions") + insert_post!(title: "Ravenclaw", body: "magical knowledge") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "magic", index_name: "posts_body_bm25_idx") + |> limit(2) + |> select([p], p.title) + |> Repo.all() + + assert length(results) == 2 + end + end + + describe "bm25/5 - complex queries" do + test "multi-word search queries" do + insert_post!(body: "Hogwarts is a powerful ancient magical school") + insert_post!(body: "Durmstrang schools are also famous") + + flush_bm25!() + + results = + Post + |> Torus.bm25([p], p.body, "ancient magical school", index_name: 
"posts_body_bm25_idx") + |> limit(1) + |> select([p], p.body) + |> Repo.all() + + assert "Hogwarts is a powerful ancient magical school" in results + end + + test "works with different columns (title vs body)" do + insert_post!(title: "Magical Schools", body: "Content about potions") + insert_post!(title: "Potions Guide", body: "Content about magic") + + flush_bm25!() + flush_bm25!("posts_title_bm25_idx") + + title_results = + Post + |> Torus.bm25([p], p.title, "magic", index_name: "posts_title_bm25_idx") + |> select([p], p.title) + |> Repo.all() + + body_results = + Post + |> Torus.bm25([p], p.body, "magic", index_name: "posts_body_bm25_idx") + |> select([p], p.body) + |> Repo.all() + + assert "Magical Schools" in title_results + assert "Content about magic" in body_results + end + end +end From b4fa02a984fa31be95b31e771d66f00a27f8a10b Mon Sep 17 00:00:00 2001 From: Dima Mikielewicz Date: Wed, 4 Feb 2026 12:56:22 +0100 Subject: [PATCH 2/4] Fix order by without an explicit index --- lib/torus/search/bm25.ex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/torus/search/bm25.ex b/lib/torus/search/bm25.ex index 530d28a..25b8f3e 100644 --- a/lib/torus/search/bm25.ex +++ b/lib/torus/search/bm25.ex @@ -23,8 +23,8 @@ defmodule Torus.Search.BM25 do ) # Build the BM25 query fragments - # When index_name is provided, use 2-arg form for explicit index specification - # Otherwise use 1-arg form and let PostgreSQL auto-detect the index + # When index_name is provided, use to_bm25query(?, ?) for explicit index specification + # Otherwise use bare string literal (?) 
to let PostgreSQL auto-detect the index {bm25query_fragment, bm25query_params} = if index_name do { @@ -33,7 +33,7 @@ defmodule Torus.Search.BM25 do } else { - "to_bm25query(?)", + "?", [term] } end From 0ef3bb63f200d6f96abb6608d5cbf6c8c8347496 Mon Sep 17 00:00:00 2001 From: Dima Mikielewicz Date: Wed, 4 Feb 2026 21:58:54 +0100 Subject: [PATCH 3/4] Include skipped tests --- test/test_helper.exs | 2 +- test/torus/bm25_test.exs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_helper.exs b/test/test_helper.exs index c87d96d..1729333 100644 --- a/test/test_helper.exs +++ b/test/test_helper.exs @@ -1,5 +1,5 @@ Application.ensure_all_started(:postgrex) Torus.Test.Repo.start_link() -ExUnit.start(assert_receive_timeout: 500, refute_receive_timeout: 50, exclude: [:skip, :skip_ci]) +ExUnit.start(assert_receive_timeout: 500, refute_receive_timeout: 50, exclude: [:skip]) Ecto.Adapters.SQL.Sandbox.mode(Torus.Test.Repo, :manual) diff --git a/test/torus/bm25_test.exs b/test/torus/bm25_test.exs index 0eece23..efd51d8 100644 --- a/test/torus/bm25_test.exs +++ b/test/torus/bm25_test.exs @@ -2,10 +2,9 @@ defmodule Torus.BM25Test do @moduledoc false use Torus.Case, async: false - @moduletag :skip_ci + # @moduletag :skip import Ecto.Query - import Torus alias Torus.Test.Repo alias TorusTest.Post From b2d116c61bdfb1069e63cc549307b7e3ae7be750 Mon Sep 17 00:00:00 2001 From: Dima Mikielewicz Date: Thu, 5 Feb 2026 17:41:30 +0100 Subject: [PATCH 4/4] Exclude bm25 test from CI suite --- .github/workflows/ci.yml | 9 ++--- pg_textsearch | 1 + .../20260203120000_add_pg_textsearch.exs | 23 ----------- test/torus/bm25_test.exs | 39 +++++++++++++++---- 4 files changed, 36 insertions(+), 36 deletions(-) create mode 160000 pg_textsearch delete mode 100644 test/support/migrations/20260203120000_add_pg_textsearch.exs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6daa9e8..2880915 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml 
@@ -23,7 +23,7 @@ jobs: # Additional services can be defined here if required. services: db: - image: postgres:17 + image: postgres:15 ports: - 5432/tcp env: @@ -40,8 +40,8 @@ jobs: name: Test on OTP ${{matrix.otp}} / Elixir ${{matrix.elixir}} strategy: matrix: - otp: ['28.3'] - elixir: ['1.19.5'] + otp: ['27.2'] + elixir: ['1.18.0'] steps: # Step: Setup Elixir + Erlang image as the base. - name: Set up Elixir @@ -86,9 +86,6 @@ jobs: - name: Install dependencies run: mix deps.get --check-locked - - name: Install pg_textsearch extension - run: | - docker exec ${{ job.services.db.id }} bash -lc "apt-get update && apt-get install -y wget && wget -q -O /tmp/pg_textsearch.deb https://github.com/timescale/pg_textsearch/releases/download/v0.5.0/pg-textsearch-postgresql-17_0.5.0-1_amd64.deb && dpkg -i /tmp/pg_textsearch.deb" # TODO These steps can be moved to `mix.exs` # Step: Compile the project treating any warnings as errors. diff --git a/pg_textsearch b/pg_textsearch new file mode 160000 index 0000000..1a23bc6 --- /dev/null +++ b/pg_textsearch @@ -0,0 +1 @@ +Subproject commit 1a23bc6bbb06dcec0c7e6efa50b2a51c08882926 diff --git a/test/support/migrations/20260203120000_add_pg_textsearch.exs b/test/support/migrations/20260203120000_add_pg_textsearch.exs deleted file mode 100644 index fade5fc..0000000 --- a/test/support/migrations/20260203120000_add_pg_textsearch.exs +++ /dev/null @@ -1,23 +0,0 @@ -defmodule Torus.Test.Repo.Migrations.AddPgTextsearch do - use Ecto.Migration - - def up do - execute "CREATE EXTENSION IF NOT EXISTS pg_textsearch" - - execute """ - CREATE INDEX posts_body_bm25_idx ON posts - USING bm25(body) WITH (text_config='english') - """ - - execute """ - CREATE INDEX posts_title_bm25_idx ON posts - USING bm25(title) WITH (text_config='english') - """ - end - - def down do - execute "DROP INDEX IF EXISTS posts_body_bm25_idx" - execute "DROP INDEX IF EXISTS posts_title_bm25_idx" - execute "DROP EXTENSION IF EXISTS pg_textsearch CASCADE" - end -end diff 
--git a/test/torus/bm25_test.exs b/test/torus/bm25_test.exs index efd51d8..522cbd5 100644 --- a/test/torus/bm25_test.exs +++ b/test/torus/bm25_test.exs @@ -2,25 +2,50 @@ defmodule Torus.BM25Test do @moduledoc false use Torus.Case, async: false - # @moduletag :skip + @moduletag :skip + + # NOTE: pg_textsearch has a known limitation where uncommitted rows from + # rolled-back transactions remain in the BM25 memtable. This can cause + # ORDER BY + LIMIT queries to return stale TIDs that fail visibility checks. + # This is why this test is skipped by default on CI and needs to be run manually. import Ecto.Query + alias Ecto.Adapters.SQL.Sandbox alias Torus.Test.Repo alias TorusTest.Post - # NOTE: pg_textsearch has a known limitation where uncommitted rows from - # rolled-back transactions remain in the BM25 memtable. This can cause - # ORDER BY + LIMIT queries to return stale TIDs that fail visibility checks. - # The standalone operator (without ORDER BY) works correctly. + setup_all do + Sandbox.unboxed_run(Repo, fn -> + Repo.query!("CREATE EXTENSION IF NOT EXISTS pg_textsearch") + + Repo.query!(""" + CREATE INDEX IF NOT EXISTS posts_body_bm25_idx ON posts + USING bm25(body) WITH (text_config='english') + """) + + Repo.query!(""" + CREATE INDEX IF NOT EXISTS posts_title_bm25_idx ON posts + USING bm25(title) WITH (text_config='english') + """) + end) + + :ok + end defp flush_bm25!(index_name \\ "posts_body_bm25_idx") do - Repo.query!("SELECT bm25_spill_index($1)", [index_name]) + Sandbox.unboxed_run(Repo, fn -> + Repo.query!("SELECT bm25_spill_index($1)", [index_name]) + end) + :ok end defp reset_bm25_state! do - Repo.query!("TRUNCATE TABLE posts RESTART IDENTITY CASCADE") + Sandbox.unboxed_run(Repo, fn -> + Repo.query!("TRUNCATE TABLE posts RESTART IDENTITY CASCADE") + end) + flush_bm25!() flush_bm25!("posts_title_bm25_idx") :ok