From f1dca6bd5d4f56bbc917a9688966e19118d4eb89 Mon Sep 17 00:00:00 2001
From: caseyta
Date: Tue, 5 Nov 2024 23:37:43 +0000
Subject: [PATCH] Normalize MCQ (set-input query) score to range [0-1]

---
 cohd/cohd_trapi.py       |  1 +
 cohd/cohd_trapi_15.py    | 48 ++++++++++++++++++++++------------------
 cohd/query_cohd_mysql.py | 26 +++++++++++++++++-----
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/cohd/cohd_trapi.py b/cohd/cohd_trapi.py
index 80279ea..3a5acbd 100644
--- a/cohd/cohd_trapi.py
+++ b/cohd/cohd_trapi.py
@@ -69,6 +69,7 @@ def operate(self):
     batch_size_limit = 100 # max length of any IDs list
     limit_max_results = 500
     json_inf_replacement = 999 # value to replace +/-Infinity with in JSON
+    mcq_score_scaling = 0.75 # scaling factor applied to the MCQ score prior to logistic normalization
     supported_query_methods = ['relativeFrequency', 'obsExpRatio', 'chiSquare']
     supported_operation = 'lookup_and_score'
 
diff --git a/cohd/cohd_trapi_15.py b/cohd/cohd_trapi_15.py
index a367a87..9695847 100644
--- a/cohd/cohd_trapi_15.py
+++ b/cohd/cohd_trapi_15.py
@@ -42,7 +42,7 @@ class CohdTrapi150(CohdTrapi):
     edge_types_negative = ['biolink:negatively_correlated_with']
     default_negative_predicate = edge_types_negative[0]
 
-    tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.3'
+    tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.4'
     schema_version = '1.5.0'
     biolink_version = bm_version
 
@@ -600,15 +600,15 @@ def _interpret_query(self):
                 if self._concept_1_set_interpretation == 'BATCH':
                     ids = list(set(concept_1_qnode['ids'])) # remove duplicate CURIEs
                 elif self._concept_1_set_interpretation == 'MANY':
-                    member_ids = concept_1_qnode.get('member_ids')
-                    if not member_ids:
+                    self._mcq_member_ids = concept_1_qnode.get('member_ids')
+                    if not self._mcq_member_ids:
                         # Missing required member_ids for MCQ
                         self._valid_query = False
                         description = 'set_interpretation: MANY but no member_ids'
                         response = self._trapi_mini_response(TrapiStatusCode.MISSING_MEMBER_IDS, description)
                         self._invalid_query_response = response, 200
                         return self._valid_query, self._invalid_query_response
-                    ids = list(set(concept_1_qnode['member_ids'])) # remove duplicate CURIEs
+                    ids = list(set(self._mcq_member_ids)) # remove duplicate CURIEs
 
                     # Get the MCQ set ID
                     self._mcq_set_id = concept_1_qnode['ids'][0]
@@ -999,12 +999,14 @@ def operate_mcq(self):
            # categories (domains)
            for domain_id, concept_class_id in self._domain_class_pairs:
                new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
-                                                              dataset_id=self._dataset_id,
-                                                              domain_id=domain_id,
-                                                              concept_class_id=concept_class_id,
-                                                              ln_ratio_sign=self._association_direction,
-                                                              confidence=self._confidence_interval,
-                                                              bypass=self._bypass_cache)
+                                                              n_member_ids=len(self._mcq_member_ids),
+                                                              score_scaling=CohdTrapi.mcq_score_scaling,
+                                                              dataset_id=self._dataset_id,
+                                                              domain_id=domain_id,
+                                                              concept_class_id=concept_class_id,
+                                                              ln_ratio_sign=self._association_direction,
+                                                              confidence=self._confidence_interval,
+                                                              bypass=self._bypass_cache)
                new_set_results, new_single_results = new_results
                if new_set_results:
                    set_results.extend(new_set_results)
@@ -1013,18 +1015,21 @@ def operate_mcq(self):
        else:
            # No category (domain) was specified for Node 2. Query the associations between Node 1 and all
            # domains
-           new_results = query_cohd_mysql.query_trapi_mcq(concept_id_1=self._concept_1_omop_ids,
-                                                          dataset_id=self._dataset_id, domain_id=None,
-                                                          ln_ratio_sign=self._association_direction,
-                                                          confidence=self._confidence_interval,
-                                                          bypass=self._bypass_cache)
+           new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
+                                                          n_member_ids=len(self._mcq_member_ids),
+                                                          score_scaling=CohdTrapi.mcq_score_scaling,
+                                                          dataset_id=self._dataset_id,
+                                                          domain_id=None,
+                                                          ln_ratio_sign=self._association_direction,
+                                                          confidence=self._confidence_interval,
+                                                          bypass=self._bypass_cache)
            new_set_results, new_single_results = new_results
            if new_set_results:
                set_results.extend(new_set_results)
                single_results.update(new_single_results)
 
        # Results within each query call should be sorted, but still need to be sorted across query calls
-       new_set_results = sort_cohd_results(new_set_results, sort_field='ln_ratio_score')
+       set_results = sort_cohd_results(set_results, sort_field='mcq_score')
 
        # Convert results from COHD format to Translator Reasoner standard
        self._add_mcq_results_to_trapi(set_results, single_results)
@@ -1169,8 +1174,8 @@ def _add_mcq_result(self, set_result, single_results, criteria):
        kg_node_2, kg_set_edge, kg_set_edge_id = self._add_kg_set_edge(node_2, is_subject, set_result)
 
        # Add to results
-       score = set_result['ln_ratio_score']
-       self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score)
+       score = set_result['mcq_score']
+       self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score, mcq=True)
 
        # Add single result edges and auxiliary graphs
        support_graphs = list()
@@ -1196,7 +1201,7 @@
                "value": support_graphs
            })
 
-    def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score):
+    def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score, mcq=False):
        """
        Adds a knowledge graph edge to the results list
 
        Parameters
        ----------
@@ -1205,6 +1210,7 @@
        kg_node_2_id: Object node ID
        kg_edge_id: edge ID
        score: result score
+       mcq: True if this result is from an MCQ (set-input) analysis
 
        Returns
        -------
@@ -1231,7 +1237,7 @@
                    }]
                },
                'score': score,
-               'scoring_method': 'Lower bound of biolink:ln_ratio_confidence_interval',
+               'scoring_method': 'COHD set-input query scoring, range: [0,1]' if mcq else 'Lower bound of biolink:ln_ratio_confidence_interval',
            }
        ]
    }
@@ -1913,7 +1919,7 @@ def _add_kg_set_edge(self, node_2, is_subject, set_result):
                    'value_type_id': 'EDAM:data_1772', # Score
                    'attribute_source': CohdTrapi._INFORES_ID,
                    'description': 'Observed-expected frequency ratio.'
-                },
+                }, {
+                    'attribute_type_id': 'biolink:supporting_data_set', # Database ID
+                    'original_attribute_name': 'dataset_id',
diff --git a/cohd/query_cohd_mysql.py b/cohd/query_cohd_mysql.py
index 1ae263d..cc8a4e8 100644
--- a/cohd/query_cohd_mysql.py
+++ b/cohd/query_cohd_mysql.py
@@ -1,6 +1,7 @@
 import pymysql
 from flask import jsonify
 from scipy.stats import chisquare
+import numpy as np
 from numpy import argsort
 import logging
 import pandas as pd
@@ -15,6 +16,7 @@
 DATASET_ID_DEFAULT = 1
 DATASET_ID_DEFAULT_HIER = 3
 DEFAULT_CONFIDENCE = 0.99
+DEFAULT_MCQ_SCORE_SCALING = 0.75
 
 # OXO API configuration
 URL_OXO_SEARCH = 'https://www.ebi.ac.uk/spot/oxo/api/search'
@@ -1132,7 +1134,11 @@ def query_db(service, method, args):
        elif type(concept_ids) is not list:
            concept_ids = [concept_ids]
 
-       set_results, single_results = query_trapi_mcq(concept_ids, dataset_id, domain_id, bypass=True)
+       set_results, single_results = query_trapi_mcq(concept_ids=concept_ids,
+                                                     n_member_ids=len(concept_ids),
+                                                     dataset_id=dataset_id,
+                                                     domain_id=domain_id,
+                                                     bypass=True)
        json_return = {
            'set_results': set_results,
            'single_results': single_results
@@ -1837,7 +1843,7 @@ def _get_weighted_statistics(cur=None,dataset_id=None,domain_id = None,concept_i
    concept_list_1_w_df= pd.DataFrame({'concept_id_1':concept_id_1})
    concept_list_1_w_df['w'] = 1
 
-    # Calculate the weights based on Jaccard index between input concep
+    # Calculate the weights based on Jaccard index between input concepts
    pair_count_q1 = pd.DataFrame(get_pair_concept_count(cur=cur,dataset_id=dataset_id,domain_id=domain_id,
                                                        concept_id_list_1=concept_id_1,concept_id_list_2=concept_id_1))
    if pair_count_q1.shape[0] > 0: # Sum of Jaccard index
@@ -1849,6 +1855,7 @@
        # Weight = 1/(1 + sum(Jaccards))
        concept_list_1_w_df['w'] = 1/concept_list_1_w_df['w']
    concept_list_1_w_df = concept_list_1_w_df[['concept_id_1','w']]
+    total_weights = concept_list_1_w_df.w.sum()
 
    # Multiply the scores by the weights
    pair_count_df = pair_count_df.merge(concept_list_1_w_df)
@@ -1858,7 +1865,7 @@
    # Group by concept_id_2. Sum the scores and combine concept_id_1 into a list
    gb = pair_count_df.groupby('concept_id_2')
    weighted_stats = gb[json_key].agg('sum')
-    return weighted_stats.reset_index()
+    return weighted_stats.reset_index(), total_weights
 
 
 def _get_ci_scores(r, score_col):
@@ -1871,7 +1878,8 @@
 
 
 @cache.memoize(timeout=86400, unless=_bypass_cache)
-def query_trapi_mcq(concept_ids, dataset_id=None, domain_id=None, concept_class_id=None,
+def query_trapi_mcq(concept_ids, n_member_ids, score_scaling=DEFAULT_MCQ_SCORE_SCALING,
+                    dataset_id=None, domain_id=None, concept_class_id=None,
                    ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE, bypass=False):
    """
    Query for TRAPI Multicurie Query.
    Calculates weighted scores using methods similar to linkage disequilibrium to downweight contributions from input concepts that are similar to each other
@@ -1879,6 +1887,8 @@
    Parameters
    ----------
    concept_ids: list of OMOP concept IDs
+    n_member_ids: number of member IDs in the query's input set node
+    score_scaling: linear scaling factor applied to ln_ratio_score prior to logistic normalization
    dataset_id: (optional) String - COHD dataset ID
    domain_id: (optional) String - OMOP domain ID
    concept_class_id: (optional) String - OMOP concept class ID
@@ -1912,13 +1922,19 @@
 
    # Adjust the scores by weights
    concept_list_1 = list(set(associations['concept_id_1'].tolist()))
-    weighted_ln_ratio = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
+    weighted_ln_ratio, total_weights = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
                                                 concept_id_1=concept_list_1, pair_count_df=associations,
                                                 json_key = 'ln_ratio_score')
    # weighted_log_odds = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
    #                                              concept_id_1=concept_list_1, pair_count_df=associations,
    #                                              json_key = 'log_odds_score')
 
+    # For the TRAPI result score, normalize the score relative to the number of input CURIEs and
+    # scale the score range to [0-1] using a scaled logistic function
+    n_mapped_ids = len(concept_list_1)
+    weighted_ln_ratio['mcq_score'] = weighted_ln_ratio['ln_ratio_score'] / total_weights * n_mapped_ids / n_member_ids
+    weighted_ln_ratio['mcq_score'] = (1/(1+np.exp(-np.abs(weighted_ln_ratio['mcq_score']*score_scaling)))-0.5) * 2
+
    # Add list of single associations
    single_associations = dict()
    for i, row in weighted_ln_ratio.iterrows():
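
Worked example of the new score normalization (a standalone sketch for review, not part of
the patch; the helper name and the input values are illustrative, and the 0.75 default mirrors
CohdTrapi.mcq_score_scaling / DEFAULT_MCQ_SCORE_SCALING):

import numpy as np

def normalize_mcq_score(weighted_ln_ratio_score, total_weights, n_mapped_ids,
                        n_member_ids, score_scaling=0.75):
    # Average the summed, weighted ln_ratio score over the total concept weights,
    # then penalize sets where some member CURIEs did not map to OMOP concepts
    raw = weighted_ln_ratio_score / total_weights * n_mapped_ids / n_member_ids
    # Scaled logistic: maps |raw| from [0, inf) onto [0, 1), with raw = 0 -> 0
    return (1 / (1 + np.exp(-np.abs(raw * score_scaling))) - 0.5) * 2

# e.g., summed weighted score 2.0 over total weight 1.6, with 3 of 4 member IDs mapped
print(normalize_mcq_score(2.0, 1.6, 3, 4))   # ~0.34
print(normalize_mcq_score(10.0, 2.0, 4, 4))  # ~0.95, saturating toward 1

Because the logistic transform is applied to the absolute value of the scaled score, strong
negative associations also map toward 1; the direction of the association remains available
from the underlying ln_ratio_score, and the n_mapped_ids / n_member_ids factor lowers the
score of input sets whose member CURIEs only partially mapped to OMOP concepts.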