diff --git a/api/resources/openapi.yml b/api/resources/openapi.yml index 8522980c..3d6bab30 100644 --- a/api/resources/openapi.yml +++ b/api/resources/openapi.yml @@ -10,7 +10,8 @@ info: (CURIEs) from a vocabulary or ontology. The lookup is not exact, but includes partial matches.

Multiple results may be returned representing possible conceptual matches, but all of the identifiers have been correctly normalized using the - Node Normalization service.' + Node Normalization service. You can read more + about this API on the NameResolution GitHub repository.' license: name: MIT url: https://opensource.org/licenses/MIT diff --git a/api/server.py b/api/server.py index 9e746d9f..fc33a109 100755 --- a/api/server.py +++ b/api/server.py @@ -1,16 +1,14 @@ -"""Biomedical entity name resolution service. - -1) split the input into fragments at spaces - * The order does not matter -2) search for names including all fragments, case insensitive -3) sort by length, ascending - * The curie with the shortest match is first, etc. - * Matching names are returned first, followed by non-matching names +""" +NameResolver (NameRes) API Endpoints + +Queries are mostly sent to the underlying NameRes Solr instance. """ import json -import logging, warnings +import logging +import warnings import os import re +from enum import Enum from typing import Dict, List, Union, Annotated, Optional from fastapi import Body, FastAPI, Query @@ -102,6 +100,14 @@ async def status() -> Dict: # ENDPOINT /reverse_lookup +class DebugOptions(str, Enum): + # A list of possible Solr debug options from https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter + none = "none" + query = "query" + timing = "timing" + results = "results" + all = "all" + class Request(BaseModel): """Reverse-lookup request body.""" curies: List[str] @@ -210,6 +216,8 @@ class LookupResult(BaseModel): types: List[str] score: float clique_identifier_count: int + explain: Optional[str] # Explanation for this specific result + debug: Optional[dict] # The debug information for the entire query @app.get("/lookup", @@ -263,17 +271,22 @@ async def lookup_curies_get( "e.g. 
`NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.", # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" - )] = None + )] = None, + debug: Annotated[Union[DebugOptions, None], Query( + description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter" + )] = 'none' ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. """ - return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa) + return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug) @app.post("/lookup", summary="Look up cliques for a fragment of a name or synonym.", - description="Returns cliques with a name or synonym that contains a specified string.", + description="Returns cliques with a name or synonym that contains a specified string. " + "You can find out more about this endpoint in the NameRes documentation." + "Note that the cliques we search through are conflated ", response_model=List[LookupResult], tags=["lookup"] ) @@ -322,12 +335,15 @@ async def lookup_curies_post( "e.g. `NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955`.", # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" - )] = None + )] = None, + debug: Annotated[Union[DebugOptions, None], Query( + description="Provide debugging information on the Solr query at https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter" + )] = 'none' ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. 
""" - return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa) + return await lookup(string, autocomplete, highlighting, offset, limit, biolink_type, only_prefixes, exclude_prefixes, only_taxa, debug) async def lookup(string: str, @@ -338,7 +354,8 @@ async def lookup(string: str, biolink_types: List[str] = None, only_prefixes: str = "", exclude_prefixes: str = "", - only_taxa: str = "" + only_taxa: str = "", + debug: DebugOptions = 'none', ) -> List[LookupResult]: """ Returns cliques with a name or synonym that contains a specified string. @@ -433,6 +450,9 @@ async def lookup(string: str, # "hl.highlightMultiTerm": "true", }) + if debug and debug != 'none': + inner_params['debug'] = debug + params = { "query": { "edismax": { @@ -459,7 +479,8 @@ async def lookup(string: str, "fields": "*, score", "params": inner_params, } - logging.debug(f"Query: {json.dumps(params, indent=2)}") + + print(f"Query: {json.dumps(params, indent=2)}") query_url = f"http://{SOLR_HOST}:{SOLR_PORT}/solr/name_lookup/select" async with httpx.AsyncClient(timeout=None) as client: @@ -468,7 +489,12 @@ async def lookup(string: str, LOGGER.error("Solr REST error: %s", response.text) response.raise_for_status() response = response.json() - logging.debug(f"Solr response: {json.dumps(response, indent=2)}") + print(f"Solr response: {json.dumps(response, indent=2)}") + + # Do we have any debug.explain information? + explain_info = {} + if 'debug' in response and 'explain' in response['debug']: + explain_info = response['debug']['explain'] # Associate highlighting information with search results. highlighting_response = response.get("highlighting", {}) @@ -499,6 +525,17 @@ async def lookup(string: str, # Solr sometimes returns duplicates or a blank string here? synonym_matches = list(filter(lambda s: s, set(synonym_matches))) + # Prepare debugging and explain information for this request. 
+ debug_for_this_request = response.get('debug', None) + explain_for_this_doc = None + if debug == 'explain' or debug == 'all': + if doc['id'] in explain_info: + explain_for_this_doc = explain_info[doc['id']] + + # If we have explain information, we don't need to also include it in the debugging information. + debug_for_this_request['explain'] = {"_comment": "Removed to avoid data duplication"} + + outputs.append(LookupResult(curie=doc.get("curie", ""), label=doc.get("preferred_name", ""), highlighting={ @@ -509,7 +546,9 @@ async def lookup(string: str, score=doc.get("score", ""), taxa=doc.get("taxa", []), clique_identifier_count=doc.get("clique_identifier_count", 0), - types=[f"biolink:{d}" for d in doc.get("types", [])])) + types=[f"biolink:{d}" for d in doc.get("types", [])], + explain=explain_for_this_doc, + debug=debug_for_this_request)) return outputs @@ -570,6 +609,10 @@ class NameResQuery(BaseModel): # We can't use `example` here because otherwise it gets filled in when filling this in. # example="NCBITaxon:9606|NCBITaxon:10090|NCBITaxon:10116|NCBITaxon:7955" ) + debug: Optional[DebugOptions] = Field( + 'none', + description="Provide debugging information on the Solr query as per https://solr.apache.org/guide/solr/latest/query-guide/common-query-parameters.html#debug-parameter" + ) @app.post("/bulk-lookup", @@ -590,7 +633,8 @@ async def bulk_lookup(query: NameResQuery) -> Dict[str, List[LookupResult]]: query.biolink_types, query.only_prefixes, query.exclude_prefixes, - query.only_taxa) + query.only_taxa, + query.debug) return result diff --git a/documentation/API.md b/documentation/API.md new file mode 100644 index 00000000..990d70df --- /dev/null +++ b/documentation/API.md @@ -0,0 +1,153 @@ +# Name Resolver API + +The Name Resolver API is intended to provide an [Apache Solr](https://solr.apache.org/)-based interface to the +[Babel](https://github.com/NCATSTranslator/Babel) cliques of equivalent identifiers. 
Apache Solr is a document-based search engine: +the documents in this case are descriptions of cliques as generated by the +[Babel](https://github.com/NCATSTranslator/Babel) pipeline in its +[Synonyms format](https://github.com/NCATSTranslator/Babel/blob/master/docs/DataFormats.md#synonym-files), including lists of all known synonyms. +Here is an example document for [NCBIGene:1756](https://name-resolution-sri.renci.org/synonyms?preferred_curies=NCBIGene%3A1756) +(compared with the same CURIE [on NodeNorm](https://nodenormalization-sri.renci.org/get_normalized_nodes?curie=NCBIGene:1756)): + +```json +{ + "curie": "NCBIGene:1756", + "preferred_name": "DMD", + "names": [ + "BMD", + "DMD", + "MRX85", + "CMD3B", + "DXS164", + "DXS270", + "DXS142", + "DXS268", + "DXS206", + "DXS272", + "DXS269", + "DXS239", + "DXS230", + "DMD Gene", + "DMD gene", + "DYSTROPHIN", + "dystrophin", + "APO-DYSTROPHIN 1", + "mutant dystrophin", + "mental retardation, X-linked 85", + "muscular dystrophy, Duchenne and Becker types", + "Dystrophin (Muscular Dystrophy, Duchenne And Becker Types) Gene", + "dystrophin (muscular dystrophy, Duchenne and Becker types), includes DXS142, DXS164, DXS206, DXS230, DXS239, DXS268, DXS269, DXS270, DXS272", + "A0A087WV90_HUMAN Dystrophin (trembl)", + "A0A0S2Z3B5_HUMAN Dystrophin isoform 2 (trembl)", + "A0A0S2Z3J7_HUMAN Dystrophin isoform 1 (Fragment) (trembl)", + "A0A5H1ZRP9_HUMAN Dystrophin (trembl)", + "A0A5H1ZRQ1_HUMAN Dystrophin (trembl)", + "A0A5H1ZRQ8_HUMAN Dystrophin (trembl)", + "A0A5H1ZRR9_HUMAN Dystrophin (trembl)", + "A0A804HKY9_HUMAN Dystrophin (trembl)", + "A7E212_HUMAN Dystrophin (trembl)", + "hDMD", + "Dystrophin", + "DMD protein, human", + "dystrophin (human)", + "Dp116 protein, human", + "DMD_HUMAN Dystrophin (sprot)", + "dystrophin (muscular dystrophy, Duchenne and Becker types) protein, human", + "Q16484_HUMAN DMD protein (Fragment) (trembl)", + "Q4G0X0_HUMAN DMD protein (trembl)" + ], + "types": [ + "Gene", + "GeneOrGeneProduct", + 
"GenomicEntity", + "ChemicalEntityOrGeneOrGeneProduct", + "PhysicalEssence", + "OntologyClass", + "BiologicalEntity", + "ThingWithTaxon", + "NamedThing", + "Entity", + "PhysicalEssenceOrOccurrent", + "MacromolecularMachineMixin", + "Protein", + "GeneProductMixin", + "Polypeptide", + "ChemicalEntityOrProteinOrPolypeptide" + ], + "shortest_name_length": 3, + "clique_identifier_count": 22, + "taxa": [ + "NCBITaxon:9606" + ], + "curie_suffix": 1756, + "id": "fd3cbf13-1aa7-4538-9df4-11cb80493295", + "_version_": 1842436833304117200 + } +``` + +The Name Resolver largely consists of two [search endpoints](#search-endpoints): `/lookup` (to search for normalized concepts) and +`/bulk-lookup` (to search for multiple normalized concepts), and one [lookup endpoint](#lookup-endpoints): +`/synonyms` (to look up the synonyms for a normalized CURIE). + +## Conflation + +Unlike the Node Normalizer, the Name Resolution Service does not currently support on-the-fly conflation. Instead, +all the [Babel conflations](https://github.com/NCATSTranslator/Babel/blob/master/docs/Conflation.md) are turned on +when the Solr database is built. This means that -- for example -- protein-encoding genes will include the synonyms found +for the protein they encode, and that no separate entry will be available for those proteins. + +## Scoring + +Every `/lookup` or `/bulk-lookup` search result returns a search score. This score value is calculated by Apache Solr +and does not have an upper bound. For every term in the query and every document in the result, Solr will calculate a +[TF*IDF score](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) by multiplying: +* The term frequency: the relative frequency of the term in the document. Solr uses the equation `freq / (freq + k1 * (1 - b + b * dl / avgdl))`, + where freq = number of occurrences of terms within this document, k1 = term saturation parameter, b = length normalization parameter, + dl = length of field and avgdl = average length of field. 
+* The inverse document frequency: a measure of how rare this term is among all documents. Solr uses the equation + `log(1 + (N - n + 0.5) / (n + 0.5))`, where N = total number of documents with this field, and n = number of documents + containing the term. + +If multiple terms are matched in the same document, the sum of the score for each term will be used. + +The TF*IDF score will be multiplied by [several boosts](https://github.com/NCATSTranslator/NameResolution/blob/56e2151bb9e6fd120644cebdf4ff45b3bc47da05/api/server.py#L436-L461) +that depend on four factors: +* We index two fields: the "preferred name" of every clique and the "synonyms" of every clique. The [preferred name + is chosen by Babel](https://github.com/NCATSTranslator/Babel?tab=readme-ov-file#how-does-babel-choose-a-preferred-label-for-a-clique), + while the synonyms are collected from all the different Babel sources. +* We set up two indexes: a [StandardTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#standard-tokenizer) + that splits the field into tokens at whitespace and punctuation characters, and a + [KeywordTokenizer](https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer) that + treats the entire field as a single token. +* We use the [Query Fields (qf)](https://solr.apache.org/guide/solr/latest/query-guide/dismax-query-parser.html#qf-query-fields-parameter) + field to search for the tokens in the index, but we also use the [Phrase Fields (pf)](https://solr.apache.org/guide/solr/latest/query-guide/edismax-query-parser.html#extended-dismax-parameters) + field to additionally boost search results where all the tokens are found in close proximity. + (NOTE: this might be removed soon.) +* We use the number of identifiers in the clique as a measure of how widely used a clique is. Since some cliques + share the same preferred name or label, we can use this to promote the clique most likely to be useful. 
+ +We combine these factors as follows for standard query matches: + +| | Preferred name match | Synonym match | +|--------------------------|----------------------|---------------| +| KeywordTokenizer index | 250x | 100x | +| StandardTokenizer index | 25x | 10x | + +We also provide additional boosts for phrase matches, boosting synonym matches more than preferred name matches: + +| | Preferred name match | Synonym match | +|--------------------------|----------------------|---------------| +| KeywordTokenizer index | 300x | 200x | +| StandardTokenizer index | 30x | 20x | + +Finally, we multiply the total score by the (base 10) logarithm of the number of identifiers in the clique plus one. +This boost ranges from log(2) = 0.3 for a clique that only has a single identifier to over log(1000) = 3. + +## Search endpoints + +### `/lookup` + +### `/bulk-lookup` + +## Lookup endpoints + +### `/synonyms`