From 9be9f8e4820f927ae382a703914dd4b5da2a0645 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Oct 2025 11:14:20 -0400 Subject: [PATCH 01/12] Replace log boost with stepped boost for clique_identifier_count. Closes #205. --- api/server.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/api/server.py b/api/server.py index 3c3ec88a..e5e42ecd 100755 --- a/api/server.py +++ b/api/server.py @@ -448,7 +448,14 @@ async def lookup(string: str, "boost": [ # The boost is multiplied with score -- calculating the log() reduces how quickly this increases # the score for increasing clique identifier counts. - "log(sum(clique_identifier_count, 1))" + # "log(sum(clique_identifier_count, 1))" + # + # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) + # to be returned when we don't have an otherwise good match. So instead we make it stepwise: + # - If clique_identifier_count > 1, we boost by 2x + "if(gt(clique_identifier_count,1),2,1)", + # - If clique_identifier_count > 5, we boost by a further 2x + "if(gt(clique_identifier_count,5),2,1)", ], }, }, From b24cd4d2fd60250ab252a1062c2ac14734ae3535 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 9 Oct 2025 22:53:17 -0400 Subject: [PATCH 02/12] Increased stepping. --- api/server.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/api/server.py b/api/server.py index 5915b425..9e427ddb 100755 --- a/api/server.py +++ b/api/server.py @@ -452,10 +452,12 @@ async def lookup(string: str, # # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) # to be returned when we don't have an otherwise good match. So instead we make it stepwise: - # - If clique_identifier_count > 1, we boost by 2x - "if(gt(clique_identifier_count,1),2,1)", - # - If clique_identifier_count > 5, we boost by a further 2x - "if(gt(clique_identifier_count,5),2,1)", + # - If clique_identifier_count == 1, we reduce the boost by 0.5x + "if(eq(clique_identifier_count, 1), 1, 0.5)", + # - If clique_identifier_count > 10, we boost by a further 2x + "if(gt(clique_identifier_count, 10), 2, 1)", + # - If clique_identifier_count > 20, we boost by a further 2x + "if(gt(clique_identifier_count, 20), 2, 1)", ], }, }, From 953ffb0ef8c0b325a61069f2360bf808c637e93e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 10 Nov 2025 16:53:13 -0500 Subject: [PATCH 03/12] Tried to boost model organisms using bq. --- api/server.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/api/server.py b/api/server.py index 9e427ddb..b275a669 100755 --- a/api/server.py +++ b/api/server.py @@ -443,10 +443,18 @@ async def lookup(string: str, # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", - # Boosts - "bq": [], + "bq": [ + # Boost queries are run on the matching documents, and provide an ADDITIVE score to matching documents. + # We'll use this to slightly boost model organisms. This shouldn't change their relative order except + # against each other. + "taxa:'NCBITaxon:9606'^5", # Humans (Homo sapiens): this will +10 to any document that relates to humans. + "taxa:'NCBITaxon:10090'^4", # Mouse (Mus musculus) + "taxa:'NCBITaxon:10116'^3", # Rat (Rattus norvegicus) + "taxa:'NCBITaxon:7955'^2", # Zebrafish (Danio rerio) + "taxa:'NCBITaxon:6239'^1", # C. elegans + ], "boost": [ - # The boost is multiplied with score -- calculating the log() reduces how quickly this increases + # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases # the score for increasing clique identifier counts. # "log(sum(clique_identifier_count, 1))" # From cd848edc3e05d2ffb349d836e0135053a8b0abb6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 10 Nov 2025 17:11:25 -0500 Subject: [PATCH 04/12] Increase bqs so they have a real effect on the results. --- api/server.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/api/server.py b/api/server.py index b275a669..7abf4590 100755 --- a/api/server.py +++ b/api/server.py @@ -445,13 +445,12 @@ async def lookup(string: str, "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", "bq": [ # Boost queries are run on the matching documents, and provide an ADDITIVE score to matching documents. - # We'll use this to slightly boost model organisms. This shouldn't change their relative order except - # against each other. - "taxa:'NCBITaxon:9606'^5", # Humans (Homo sapiens): this will +10 to any document that relates to humans. - "taxa:'NCBITaxon:10090'^4", # Mouse (Mus musculus) - "taxa:'NCBITaxon:10116'^3", # Rat (Rattus norvegicus) - "taxa:'NCBITaxon:7955'^2", # Zebrafish (Danio rerio) - "taxa:'NCBITaxon:6239'^1", # C. elegans + # We'll use this to slightly boost model organisms. + 'taxa:"NCBITaxon:9606"^200', # Humans (Homo sapiens): this will +10 to any document that relates to humans. + 'taxa:"NCBITaxon:10090"^100', # Mouse (Mus musculus) + 'taxa:"NCBITaxon:10116"^80', # Rat (Rattus norvegicus) + 'taxa:"NCBITaxon:7955"^60', # Zebrafish (Danio rerio) + 'taxa:"NCBITaxon:6239"^40', # C. elegans ], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases From 092cd4ed61da21ff4e88ea70ebfaea52791e64bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 10 Nov 2025 17:22:50 -0500 Subject: [PATCH 05/12] Changed bqs into boosts so they grow in proportion with the score. Also increased boost for single identifier cliques. --- api/server.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/api/server.py b/api/server.py index 7abf4590..3c50cdea 100755 --- a/api/server.py +++ b/api/server.py @@ -443,15 +443,7 @@ async def lookup(string: str, # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", - "bq": [ - # Boost queries are run on the matching documents, and provide an ADDITIVE score to matching documents. - # We'll use this to slightly boost model organisms. - 'taxa:"NCBITaxon:9606"^200', # Humans (Homo sapiens): this will +10 to any document that relates to humans. - 'taxa:"NCBITaxon:10090"^100', # Mouse (Mus musculus) - 'taxa:"NCBITaxon:10116"^80', # Rat (Rattus norvegicus) - 'taxa:"NCBITaxon:7955"^60', # Zebrafish (Danio rerio) - 'taxa:"NCBITaxon:6239"^40', # C. elegans - ], + "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases # the score for increasing clique identifier counts. @@ -459,12 +451,20 @@ async def lookup(string: str, # # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) # to be returned when we don't have an otherwise good match. So instead we make it stepwise: - # - If clique_identifier_count == 1, we reduce the boost by 0.5x - "if(eq(clique_identifier_count, 1), 1, 0.5)", + # - If clique_identifier_count == 1, we reduce the boost by 0.7x + "if(eq(clique_identifier_count, 1), 1, 0.7)", # - If clique_identifier_count > 10, we boost by a further 2x "if(gt(clique_identifier_count, 10), 2, 1)", # - If clique_identifier_count > 20, we boost by a further 2x "if(gt(clique_identifier_count, 20), 2, 1)", + # Slightly boost model organisms: humans, mice, rats, zebrafish and C. elegans + '''sum(1, + product(termfreq(taxa,"NCBITaxon:9606"),2), + product(termfreq(taxa,"NCBITaxon:10090"),1.5), + product(termfreq(taxa,"NCBITaxon:10116"),1.4), + product(termfreq(taxa,"NCBITaxon:7955"),1.3), + product(termfreq(taxa,"NCBITaxon:6239"),1.2) + )''' ], }, }, From 71a96927e9dbebe4dbda48d795d8b9904480db79 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 10 Nov 2025 17:30:46 -0500 Subject: [PATCH 06/12] Removed boosts on pf, tweaked single identifier clique boosts. --- api/server.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/server.py b/api/server.py index 3c50cdea..261809aa 100755 --- a/api/server.py +++ b/api/server.py @@ -437,12 +437,12 @@ async def lookup(string: str, "query": { "edismax": { "query": query, - # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input. + # qf = query fields, i.e. how should we boost these fields if they contain the query terms. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", - # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. + # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", + "pf": "preferred_name_exactish names_exactish preferred_name names", "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases @@ -452,7 +452,7 @@ async def lookup(string: str, # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) # to be returned when we don't have an otherwise good match. So instead we make it stepwise: # - If clique_identifier_count == 1, we reduce the boost by 0.7x - "if(eq(clique_identifier_count, 1), 1, 0.7)", + "if(eq(clique_identifier_count, 1), 0.7, 1)", # - If clique_identifier_count > 10, we boost by a further 2x "if(gt(clique_identifier_count, 10), 2, 1)", # - If clique_identifier_count > 20, we boost by a further 2x From c224b6809451bbeff9ffbca60d9074df52a5a290 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 8 Dec 2025 15:32:24 -0500 Subject: [PATCH 07/12] Got rid of pf entirely, tweaked boosts. --- api/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/server.py b/api/server.py index d32bff8f..0d95cf50 100755 --- a/api/server.py +++ b/api/server.py @@ -450,7 +450,7 @@ async def lookup(string: str, "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish names_exactish preferred_name names", + # "pf": "preferred_name_exactish names_exactish preferred_name names", "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases @@ -463,8 +463,8 @@ async def lookup(string: str, "if(eq(clique_identifier_count, 1), 0.7, 1)", # - If clique_identifier_count > 10, we boost by a further 2x "if(gt(clique_identifier_count, 10), 2, 1)", - # - If clique_identifier_count > 20, we boost by a further 2x - "if(gt(clique_identifier_count, 20), 2, 1)", + # - If clique_identifier_count > 20, we boost by a further 3x + "if(gt(clique_identifier_count, 20), 3, 1)", # Slightly boost model organisms: humans, mice, rats, zebrafish and C. elegans '''sum(1, product(termfreq(taxa,"NCBITaxon:9606"),2), From 6ea0e9b7144f659e26e53faddf56b6c841a00b13 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 8 Dec 2025 16:09:31 -0500 Subject: [PATCH 08/12] Improved boosts. --- api/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/server.py b/api/server.py index 0d95cf50..907b31d3 100755 --- a/api/server.py +++ b/api/server.py @@ -447,15 +447,15 @@ async def lookup(string: str, "query": query, # qf = query fields, i.e. how should we boost these fields if they contain the query terms. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter - "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", + "qf": "preferred_name_exactish^300 names_exactish^200 preferred_name^10 names^5", # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - # "pf": "preferred_name_exactish names_exactish preferred_name names", + "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^10 names^5", "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases # the score for increasing clique identifier counts. - # "log(sum(clique_identifier_count, 1))" + # "max(log(sum(clique_identifier_count, 1)), 4)", # # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) # to be returned when we don't have an otherwise good match. So instead we make it stepwise: From 8b4998a8f508da89fd10814a892ca8e0c45616b1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 8 Dec 2025 16:11:30 -0500 Subject: [PATCH 09/12] Reduced phrase boosts. --- api/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/server.py b/api/server.py index 907b31d3..5115daf5 100755 --- a/api/server.py +++ b/api/server.py @@ -450,7 +450,7 @@ async def lookup(string: str, "qf": "preferred_name_exactish^300 names_exactish^200 preferred_name^10 names^5", # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^10 names^5", + "pf": "preferred_name_exactish^30 names_exactish^20 preferred_name^10 names^5", "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases From 0e0760112e44cfb4b70bc73ccbda661940c44225 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 12 Dec 2025 17:35:34 -0500 Subject: [PATCH 10/12] I like these outputs. --- api/server.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/api/server.py b/api/server.py index 5115daf5..e7fbe90c 100755 --- a/api/server.py +++ b/api/server.py @@ -447,10 +447,10 @@ async def lookup(string: str, "query": query, # qf = query fields, i.e. how should we boost these fields if they contain the query terms. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter - "qf": "preferred_name_exactish^300 names_exactish^200 preferred_name^10 names^5", + "qf": "preferred_name_exactish^400 names_exactish^300 preferred_name^4 names^2", # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish^30 names_exactish^20 preferred_name^10 names^5", + "pf": "preferred_name_exactish^20 names_exactish^10 preferred_name^4 names^2", "bq": [], "boost": [ # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases @@ -467,11 +467,11 @@ async def lookup(string: str, "if(gt(clique_identifier_count, 20), 3, 1)", # Slightly boost model organisms: humans, mice, rats, zebrafish and C. elegans '''sum(1, - product(termfreq(taxa,"NCBITaxon:9606"),2), - product(termfreq(taxa,"NCBITaxon:10090"),1.5), - product(termfreq(taxa,"NCBITaxon:10116"),1.4), - product(termfreq(taxa,"NCBITaxon:7955"),1.3), - product(termfreq(taxa,"NCBITaxon:6239"),1.2) + product(termfreq(taxa,"NCBITaxon:9606"),10), + product(termfreq(taxa,"NCBITaxon:10090"),5), + product(termfreq(taxa,"NCBITaxon:10116"),4), + product(termfreq(taxa,"NCBITaxon:7955"),3), + product(termfreq(taxa,"NCBITaxon:6239"),2) )''' ], }, From 90e65d2ae88a3d9dff249de2845b78d4230a0025 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 12 Dec 2025 17:57:36 -0500 Subject: [PATCH 11/12] Even better. --- api/server.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/api/server.py b/api/server.py index e7fbe90c..c6b12b88 100755 --- a/api/server.py +++ b/api/server.py @@ -447,7 +447,7 @@ async def lookup(string: str, "query": query, # qf = query fields, i.e. how should we boost these fields if they contain the query terms. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter - "qf": "preferred_name_exactish^400 names_exactish^300 preferred_name^4 names^2", + "qf": "preferred_name_exactish^500 names_exactish^400 preferred_name^4 names^2", # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter "pf": "preferred_name_exactish^20 names_exactish^10 preferred_name^4 names^2", @@ -460,18 +460,18 @@ async def lookup(string: str, # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) # to be returned when we don't have an otherwise good match. So instead we make it stepwise: # - If clique_identifier_count == 1, we reduce the boost by 0.7x - "if(eq(clique_identifier_count, 1), 0.7, 1)", + # "if(eq(clique_identifier_count, 1), 0.7, 1)", # - If clique_identifier_count > 10, we boost by a further 2x - "if(gt(clique_identifier_count, 10), 2, 1)", + "if(gt(clique_identifier_count, 20), 5, if(gt(clique_identifier_count, 10), 3, 1))", # - If clique_identifier_count > 20, we boost by a further 3x - "if(gt(clique_identifier_count, 20), 3, 1)", # Slightly boost model organisms: humans, mice, rats, zebrafish and C. elegans '''sum(1, - product(termfreq(taxa,"NCBITaxon:9606"),10), - product(termfreq(taxa,"NCBITaxon:10090"),5), - product(termfreq(taxa,"NCBITaxon:10116"),4), - product(termfreq(taxa,"NCBITaxon:7955"),3), - product(termfreq(taxa,"NCBITaxon:6239"),2) + if(not(taxon_specific), 100, 0), + product(termfreq(taxa,"NCBITaxon:9606"),100), + product(termfreq(taxa,"NCBITaxon:10090"),40), + product(termfreq(taxa,"NCBITaxon:10116"),30), + product(termfreq(taxa,"NCBITaxon:7955"),20), + product(termfreq(taxa,"NCBITaxon:6239"),10) )''' ], }, From 70e683657534a50304a79af6b6278b8c06445956 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 18 Dec 2025 17:20:43 -0500 Subject: [PATCH 12/12] Added on:push triggers for testing. --- .github/workflows/release-name-resolution.yml | 1 + .github/workflows/release-nameres-loading.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/release-name-resolution.yml b/.github/workflows/release-name-resolution.yml index e5773778..1b349b27 100644 --- a/.github/workflows/release-name-resolution.yml +++ b/.github/workflows/release-name-resolution.yml @@ -1,6 +1,7 @@ name: 'Release a new version of NameResolution to Github Packages' on: + push: release: types: [published] diff --git a/.github/workflows/release-nameres-loading.yml b/.github/workflows/release-nameres-loading.yml index cc7e3753..0c94e7be 100644 --- a/.github/workflows/release-nameres-loading.yml +++ b/.github/workflows/release-nameres-loading.yml @@ -1,6 +1,7 @@ name: 'Release a new version of NameResolution Data Loading to Github Packages' on: + push: release: types: [published]