diff --git a/.github/workflows/release-name-resolution.yml b/.github/workflows/release-name-resolution.yml index e5773778..1b349b27 100644 --- a/.github/workflows/release-name-resolution.yml +++ b/.github/workflows/release-name-resolution.yml @@ -1,6 +1,7 @@ name: 'Release a new version of NameResolution to Github Packages' on: + push: release: types: [published] diff --git a/.github/workflows/release-nameres-loading.yml b/.github/workflows/release-nameres-loading.yml index cc7e3753..0c94e7be 100644 --- a/.github/workflows/release-nameres-loading.yml +++ b/.github/workflows/release-nameres-loading.yml @@ -1,6 +1,7 @@ name: 'Release a new version of NameResolution Data Loading to Github Packages' on: + push: release: types: [published] diff --git a/api/server.py b/api/server.py index 1a2965c4..73d68890 100755 --- a/api/server.py +++ b/api/server.py @@ -455,18 +455,34 @@ async def lookup(string: str, "query": { "edismax": { "query": query, - # qf = query fields, i.e. how should we boost these fields if they contain the same fields as the input. + # qf = query fields, i.e. how should we boost these fields if they contain the query terms. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter - "qf": "preferred_name_exactish^250 names_exactish^100 preferred_name^25 names^10", - # pf = phrase fields, i.e. how should we boost these fields if they contain the entire search phrase. + "qf": "preferred_name_exactish^500 names_exactish^400 preferred_name^4 names^2", + # pf = phrase fields, i.e. how should we boost these fields if they contain query terms close together. # https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#pf-phrase-fields-parameter - "pf": "preferred_name_exactish^300 names_exactish^200 preferred_name^30 names^20", - # Boosts + "pf": "preferred_name_exactish^20 names_exactish^10 preferred_name^4 names^2", "bq": [], "boost": [ - # The boost is multiplied with score -- calculating the log() reduces how quickly this increases + # Boosts are MULTIPLIED with score -- calculating the log() reduces how quickly this increases # the score for increasing clique identifier counts. - "log(sum(clique_identifier_count, 1))" + # "max(log(sum(clique_identifier_count, 1)), 4)", + # + # However, this approach causes very large clique_identifier_count entries (like diphenhydramine, cic=1332) + # to be returned when we don't have an otherwise good match. So instead we make it stepwise: + # - If clique_identifier_count == 1, we reduce the boost by 0.7x + # "if(eq(clique_identifier_count, 1), 0.7, 1)", + # - If clique_identifier_count > 10, we boost by a further 2x + "if(gt(clique_identifier_count, 20), 5, if(gt(clique_identifier_count, 10), 3, 1))", + # - If clique_identifier_count > 20, we boost by a further 3x + # Slightly boost model organisms: humans, mice, rats, zebrafish and C. elegans + '''sum(1, + if(not(taxon_specific), 100, 0), + product(termfreq(taxa,"NCBITaxon:9606"),100), + product(termfreq(taxa,"NCBITaxon:10090"),40), + product(termfreq(taxa,"NCBITaxon:10116"),30), + product(termfreq(taxa,"NCBITaxon:7955"),20), + product(termfreq(taxa,"NCBITaxon:6239"),10) + )''' ], }, },