Skip to content

Commit ef195c1

Browse files
committed
Initial implementation of fuzzy matching
1 parent 3cecc98 commit ef195c1

4 files changed

Lines changed: 119 additions & 2 deletions

File tree

app/models/name.rb

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,57 @@ def all_public
183183
where(status: public_status, redirect: nil)
184184
end
185185

186+
##
# Performs a fuzzy search for names similar to +query+. The search
# parameters are:
# - method: One of +:similarity+ (default) or +:levenshtein+
# - threshold: Limit to find matches,
#   by default 0.7 (similarity) or 3 (levenshtein). Similarity keeps rows
#   scoring strictly above the threshold; levenshtein keeps rows with a
#   distance less than or equal to it
# - limit: Maximum number of results to return (default: 10)
# - selection: Pre-selection of names included in the search. One of:
#   - all_valid: (default) All validly published names
#   - all_public: All publicly visible names
#   - valid_genera: All validly published genus names
#   - public_genera: All publicly visible genus names
#   - An ActiveRecord query on the +names+ table
#
# Returns +nil+ when the database adapter is not PostgreSQL. Otherwise
# returns the query built on top of the selection, exposing +name+ and the
# computed +score+ for each row, ordered from best to worst match.
#
# Raises ArgumentError if +method+ is not supported or if +selection+ is a
# Symbol other than the ones listed above.
def fuzzy_match(
  query, method: :similarity, threshold: nil, limit: 10,
  selection: :all_valid
)
  # Both SQL functions used below are PostgreSQL-specific
  return unless ActiveRecord::Base.connection.adapter_name == 'PostgreSQL'

  selection =
    case selection
    when :all_valid     then all_valid
    when :all_public    then all_public
    when :valid_genera  then all_valid.where(rank: :genus)
    when :public_genera then all_public.where(rank: :genus)
    when Symbol
      # Fail early and explicitly instead of a NoMethodError on Symbol below
      raise ArgumentError, "Unsupported fuzzy match selection: #{selection}"
    else
      selection # Anything else is used as the base query as-is
    end

  # The SELECT clause takes no bind parameters here, so the query string is
  # quoted explicitly before being interpolated
  clean_query = ActiveRecord::Base.connection.quote(query)

  # SQL function, comparison against the threshold, and sort direction
  function, comparison, direction =
    case method.to_sym
    when :similarity
      threshold ||= 0.7
      %w[similarity > DESC]
    when :levenshtein
      threshold ||= 3
      %w[levenshtein <= ASC]
    else
      raise ArgumentError, "Unsupported fuzzy match method: #{method}"
    end

  # NOTE(review): per the pg_trgm documentation, only the operator forms
  # (e.g., `%`) can be assisted by a trigram index; the function form used
  # here likely cannot -- confirm whether index use matters for this query
  selection
    .select("name, #{function}(name, #{clean_query}) AS score")
    .where("#{function}(name, ?) #{comparison} ?", query, threshold)
    .order("score #{direction}")
    .limit(limit)
end
236+
186237
# ============ --- CLASS > ETYMOLOGY --- ============
187238

188239
def etymology_particles
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Enables the PostgreSQL extensions +fuzzystrmatch+ and +pg_trgm+ and creates
# GIN trigram indexes on +names.name+ and +pseudonyms.pseudonym+. Both
# directions do nothing when the current adapter is not PostgreSQL.
class AddTrigramIndexesToNamesAndPseudonyms < ActiveRecord::Migration[6.1]
  def up
    return unless postgresql?

    # Enable extensions
    execute 'CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;'
    execute 'CREATE EXTENSION IF NOT EXISTS pg_trgm;'

    # Add trigram index to names.name
    execute <<~SQL
      CREATE INDEX index_names_on_name_trgm ON names
      USING gin (name gin_trgm_ops);
    SQL

    # Add trigram index to pseudonyms.pseudonym
    execute <<~SQL
      CREATE INDEX index_pseudonyms_on_pseudonym_trgm ON pseudonyms
      USING gin (pseudonym gin_trgm_ops);
    SQL
  end

  def down
    return unless postgresql?

    execute 'DROP INDEX IF EXISTS index_names_on_name_trgm;'
    execute 'DROP INDEX IF EXISTS index_pseudonyms_on_pseudonym_trgm;'
    # The extensions are deliberately left in place: it is safer to keep
    # them, and there is no need to remove them
    # execute 'DROP EXTENSION IF EXISTS fuzzystrmatch;'
    # execute 'DROP EXTENSION IF EXISTS pg_trgm;'
  end

  private

  # Is the current database connection using the PostgreSQL adapter?
  def postgresql?
    ActiveRecord::Base.connection.adapter_name == 'PostgreSQL'
  end
end
32+

db/schema.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#
1111
# It's strongly recommended that you check this file into your version control system.
1212

13-
ActiveRecord::Schema.define(version: 2025_08_03_153338) do
13+
ActiveRecord::Schema.define(version: 2025_08_21_140809) do
1414

1515
create_table "action_text_rich_texts", force: :cascade do |t|
1616
t.string "name", null: false
@@ -45,7 +45,7 @@
4545
end
4646

4747
create_table "active_storage_variant_records", force: :cascade do |t|
48-
t.bigint "blob_id", null: false
48+
t.integer "blob_id", null: false
4949
t.string "variation_digest", null: false
5050
t.index ["blob_id", "variation_digest"], name: "index_active_storage_variant_records_uniqueness", unique: true
5151
end

lib/tasks/fuzzy_match.rake

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
namespace :name do
  # Usage: rake name:fuzzy_match['<query>']
  #
  # Prints each name returned by Name.fuzzy_match (called with its default
  # parameters) together with its score. Exits early if no query is given,
  # and exits with status 1 if the database adapter is not PostgreSQL.
  desc 'Fuzzy match a name against the Name model'
  task :fuzzy_match, [:query] => :environment do |_t, args|
    query = args[:query]

    if query.blank?
      puts <<~HELP
        Please provide a query string
        Example: rake name:fuzzy_match['Escherichia coli']
      HELP
      exit
    end

    if ActiveRecord::Base.connection.adapter_name != 'PostgreSQL'
      puts <<~ERROR
        Fuzzy matching requires PostgreSQL
        Current adapter: #{ActiveRecord::Base.connection.adapter_name}
      ERROR
      exit 1
    end

    puts "Searching for fuzzy matches to: '#{query}'\n\n"

    # Load the results once so the emptiness check and the loop below work
    # on the same records instead of querying the database separately
    matches = Name.fuzzy_match(query).to_a
    if matches.any?
      matches.each do |match|
        # The computed column is aliased as +score+ by Name.fuzzy_match
        puts "Match: #{match.name} (Similarity: #{match.score})"
      end
    else
      puts 'No close matches found.'
    end
  end
end
34+

0 commit comments

Comments
 (0)