From f1dca6bd5d4f56bbc917a9688966e19118d4eb89 Mon Sep 17 00:00:00 2001
From: caseyta
Date: Tue, 5 Nov 2024 23:37:43 +0000
Subject: [PATCH] Normalize MCQ (set-input query) score to range [0-1]

---
 cohd/cohd_trapi.py       |  1 +
 cohd/cohd_trapi_15.py    | 48 ++++++++++++++++++++++------------------
 cohd/query_cohd_mysql.py | 26 +++++++++++++++++-----
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/cohd/cohd_trapi.py b/cohd/cohd_trapi.py
index 80279ea..3a5acbd 100644
--- a/cohd/cohd_trapi.py
+++ b/cohd/cohd_trapi.py
@@ -69,6 +69,7 @@ def operate(self):
     batch_size_limit = 100 # max length of any IDs list
     limit_max_results = 500
     json_inf_replacement = 999 # value to replace +/-Infinity with in JSON
+    mcq_score_scaling = 0.75 # scaling factor applied to the MCQ score prior to logistic normalization
     supported_query_methods = ['relativeFrequency', 'obsExpRatio', 'chiSquare']
     supported_operation = 'lookup_and_score'
 
diff --git a/cohd/cohd_trapi_15.py b/cohd/cohd_trapi_15.py
index a367a87..9695847 100644
--- a/cohd/cohd_trapi_15.py
+++ b/cohd/cohd_trapi_15.py
@@ -42,7 +42,7 @@ class CohdTrapi150(CohdTrapi):
     edge_types_negative = ['biolink:negatively_correlated_with']
     default_negative_predicate = edge_types_negative[0]
 
-    tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.3'
+    tool_version = f'{CohdTrapi._SERVICE_NAME} 6.5.4'
     schema_version = '1.5.0'
     biolink_version = bm_version
 
@@ -600,15 +600,15 @@ def _interpret_query(self):
                 if self._concept_1_set_interpretation == 'BATCH':
                     ids = list(set(concept_1_qnode['ids'])) # remove duplicate CURIEs
                 elif self._concept_1_set_interpretation == 'MANY':
-                    member_ids = concept_1_qnode.get('member_ids')
-                    if not member_ids:
+                    self._mcq_member_ids = concept_1_qnode.get('member_ids')
+                    if not self._mcq_member_ids:
                         # Missing required member_ids for MCQ
                         self._valid_query = False
                         description = 'set_interpretation: MANY but no member_ids'
                         response = self._trapi_mini_response(TrapiStatusCode.MISSING_MEMBER_IDS, description)
                         self._invalid_query_response = response, 200
                         return self._valid_query, self._invalid_query_response
-                    ids = list(set(concept_1_qnode['member_ids'])) # remove duplicate CURIEs
+                    ids = list(set(self._mcq_member_ids)) # remove duplicate CURIEs
 
                     # Get the MCQ set ID
                     self._mcq_set_id = concept_1_qnode['ids'][0]
@@ -999,12 +999,14 @@ def operate_mcq(self):
            # categories (domains)
            for domain_id, concept_class_id in self._domain_class_pairs:
                new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
-                                                              dataset_id=self._dataset_id,
-                                                              domain_id=domain_id,
-                                                              concept_class_id=concept_class_id,
-                                                              ln_ratio_sign=self._association_direction,
-                                                              confidence=self._confidence_interval,
-                                                              bypass=self._bypass_cache)
+                                                              n_member_ids=len(self._mcq_member_ids),
+                                                              score_scaling=CohdTrapi.mcq_score_scaling,
+                                                              dataset_id=self._dataset_id,
+                                                              domain_id=domain_id,
+                                                              concept_class_id=concept_class_id,
+                                                              ln_ratio_sign=self._association_direction,
+                                                              confidence=self._confidence_interval,
+                                                              bypass=self._bypass_cache)
                new_set_results, new_single_results = new_results
                if new_set_results:
                    set_results.extend(new_set_results)
@@ -1013,18 +1015,21 @@ def operate_mcq(self):
        else:
            # No category (domain) was specified for Node 2. Query the associations between Node 1 and all
            # domains
-           new_results = query_cohd_mysql.query_trapi_mcq(concept_id_1=self._concept_1_omop_ids,
-                                                          dataset_id=self._dataset_id, domain_id=None,
-                                                          ln_ratio_sign=self._association_direction,
-                                                          confidence=self._confidence_interval,
-                                                          bypass=self._bypass_cache)
+           new_results = query_cohd_mysql.query_trapi_mcq(concept_ids=self._concept_1_omop_ids,
+                                                          n_member_ids=len(self._mcq_member_ids),
+                                                          score_scaling=CohdTrapi.mcq_score_scaling,
+                                                          dataset_id=self._dataset_id,
+                                                          domain_id=None,
+                                                          ln_ratio_sign=self._association_direction,
+                                                          confidence=self._confidence_interval,
+                                                          bypass=self._bypass_cache)
            new_set_results, new_single_results = new_results
            if new_set_results:
                set_results.extend(new_set_results)
                single_results.update(new_single_results)
 
        # Results within each query call should be sorted, but still need to be sorted across query calls
-       new_set_results = sort_cohd_results(new_set_results, sort_field='ln_ratio_score')
+       set_results = sort_cohd_results(set_results, sort_field='mcq_score')
 
        # Convert results from COHD format to Translator Reasoner standard
        self._add_mcq_results_to_trapi(set_results, single_results)
@@ -1169,8 +1174,8 @@ def _add_mcq_result(self, set_result, single_results, criteria):
        kg_node_2, kg_set_edge, kg_set_edge_id = self._add_kg_set_edge(node_2, is_subject, set_result)
 
        # Add to results
-       score = set_result['ln_ratio_score']
-       self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score)
+       score = set_result['mcq_score']
+       self._add_result(self._mcq_set_id, concept_2_curie, kg_set_edge_id, score, mcq=True)
 
        # Add single result edges and auxiliary graphs
        support_graphs = list()
@@ -1196,7 +1201,7 @@
                "value": support_graphs
            })
 
-    def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score):
+    def _add_result(self, kg_node_1_id, kg_node_2_id, kg_edge_id, score, mcq=False):
        """
        Adds a knowledge graph edge to the results list
 
        Parameters
        ----------
@@ -1205,6 +1210,7 @@
        kg_node_2_id: Object node ID
        kg_edge_id: edge ID
        score: result score
+       mcq: True if this result is from an MCQ (set-input) analysis
 
        Returns
        -------
@@ -1231,7 +1237,7 @@
                    }]
                },
                'score': score,
-               'scoring_method': 'Lower bound of biolink:ln_ratio_confidence_interval',
+               'scoring_method': 'COHD set-input query scoring, range: [0,1]' if mcq else 'Lower bound of biolink:ln_ratio_confidence_interval',
            }
        ]
    }
@@ -1913,7 +1919,7 @@ def _add_kg_set_edge(self, node_2, is_subject, set_result):
                    'value_type_id': 'EDAM:data_1772', # Score
                    'attribute_source': CohdTrapi._INFORES_ID,
                    'description': 'Observed-expected frequency ratio.'
-                },
+                }, {
+                    'attribute_type_id': 'biolink:supporting_data_set', # Database ID
+                    'original_attribute_name': 'dataset_id',
diff --git a/cohd/query_cohd_mysql.py b/cohd/query_cohd_mysql.py
index 1ae263d..cc8a4e8 100644
--- a/cohd/query_cohd_mysql.py
+++ b/cohd/query_cohd_mysql.py
@@ -1,6 +1,7 @@
 import pymysql
 from flask import jsonify
 from scipy.stats import chisquare
+import numpy as np
 from numpy import argsort
 import logging
 import pandas as pd
@@ -15,6 +16,7 @@
 DATASET_ID_DEFAULT = 1
 DATASET_ID_DEFAULT_HIER = 3
 DEFAULT_CONFIDENCE = 0.99
+DEFAULT_MCQ_SCORE_SCALING = 0.75
 
 # OXO API configuration
 URL_OXO_SEARCH = 'https://www.ebi.ac.uk/spot/oxo/api/search'
@@ -1132,7 +1134,11 @@ def query_db(service, method, args):
        elif type(concept_ids) is not list:
            concept_ids = [concept_ids]
 
-       set_results, single_results = query_trapi_mcq(concept_ids, dataset_id, domain_id, bypass=True)
+       set_results, single_results = query_trapi_mcq(concept_ids=concept_ids,
+                                                     n_member_ids=len(concept_ids),
+                                                     dataset_id=dataset_id,
+                                                     domain_id=domain_id,
+                                                     bypass=True)
        json_return = {
            'set_results': set_results,
            'single_results': single_results
@@ -1837,7 +1843,7 @@ def _get_weighted_statistics(cur=None,dataset_id=None,domain_id = None,concept_i
    concept_list_1_w_df= pd.DataFrame({'concept_id_1':concept_id_1})
    concept_list_1_w_df['w'] = 1
 
-    # Calculate the weights based on Jaccard index between input concep
+    # Calculate the weights based on Jaccard index between input concepts
    pair_count_q1 = pd.DataFrame(get_pair_concept_count(cur=cur,dataset_id=dataset_id,domain_id=domain_id,
                                                        concept_id_list_1=concept_id_1,concept_id_list_2=concept_id_1))
    if pair_count_q1.shape[0] > 0: # Sum of Jaccard index
@@ -1849,6 +1855,7 @@
        # Weight = 1/(1 + sum(Jaccards))
        concept_list_1_w_df['w'] = 1/concept_list_1_w_df['w']
    concept_list_1_w_df = concept_list_1_w_df[['concept_id_1','w']]
+    total_weights = concept_list_1_w_df.w.sum()
 
    # Multiply the scores by the weights
    pair_count_df = pair_count_df.merge(concept_list_1_w_df)
@@ -1858,7 +1865,7 @@
    # Group by concept_id_2. Sum the scores and combine concept_id_1 into a list
    gb = pair_count_df.groupby('concept_id_2')
    weighted_stats = gb[json_key].agg('sum')
-    return weighted_stats.reset_index()
+    return weighted_stats.reset_index(), total_weights
 
 
 def _get_ci_scores(r, score_col):
@@ -1871,7 +1878,8 @@
 
 
 @cache.memoize(timeout=86400, unless=_bypass_cache)
-def query_trapi_mcq(concept_ids, dataset_id=None, domain_id=None, concept_class_id=None,
+def query_trapi_mcq(concept_ids, n_member_ids, score_scaling=DEFAULT_MCQ_SCORE_SCALING,
+                    dataset_id=None, domain_id=None, concept_class_id=None,
                    ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE, bypass=False):
    """
    Query for TRAPI Multicurie Query.
    Calculates weighted scores using methods similar to linkage disequilibrium to downweight contributions from input concepts that are similar to each other
@@ -1879,6 +1887,8 @@
    Parameters
    ----------
    concept_ids: list of OMOP concept IDs
+    n_member_ids: number of member IDs in the query's input set node
+    score_scaling: linear scaling factor applied to ln_ratio_score prior to logistic normalization
    dataset_id: (optional) String - COHD dataset ID
    domain_id: (optional) String - OMOP domain ID
    concept_class_id: (optional) String - OMOP concept class ID
@@ -1912,13 +1922,19 @@
 
    # Adjust the scores by weights
    concept_list_1 = list(set(associations['concept_id_1'].tolist()))
-    weighted_ln_ratio = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
+    weighted_ln_ratio, total_weights = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
                                                 concept_id_1=concept_list_1, pair_count_df=associations,
                                                 json_key = 'ln_ratio_score')
    # weighted_log_odds = _get_weighted_statistics(cur=cur, dataset_id=dataset_id, domain_id=domain_id,
    #                                              concept_id_1=concept_list_1, pair_count_df=associations,
    #                                              json_key = 'log_odds_score')
 
+    # For the TRAPI result score, normalize the score relative to the number of input CURIEs and
+    # scale the score range to [0-1] using a scaled logistic function
+    n_mapped_ids = len(concept_list_1)
+    weighted_ln_ratio['mcq_score'] = weighted_ln_ratio['ln_ratio_score'] / total_weights * n_mapped_ids / n_member_ids
+    weighted_ln_ratio['mcq_score'] = (1/(1+np.exp(-np.abs(weighted_ln_ratio['mcq_score']*score_scaling)))-0.5) * 2
+
    # Add list of single associations
    single_associations = dict()
    for i, row in weighted_ln_ratio.iterrows():
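
Worked example of the new score normalization (a standalone sketch for review, not part of
the patch; the helper name and the input values are illustrative, and the 0.75 default mirrors
CohdTrapi.mcq_score_scaling / DEFAULT_MCQ_SCORE_SCALING):

import numpy as np

def normalize_mcq_score(weighted_ln_ratio_score, total_weights, n_mapped_ids,
                        n_member_ids, score_scaling=0.75):
    # Average the summed, weighted ln_ratio score over the total concept weights,
    # then penalize sets where some member CURIEs did not map to OMOP concepts
    raw = weighted_ln_ratio_score / total_weights * n_mapped_ids / n_member_ids
    # Scaled logistic: maps |raw| from [0, inf) onto [0, 1), with raw = 0 -> 0
    return (1 / (1 + np.exp(-np.abs(raw * score_scaling))) - 0.5) * 2

# e.g., summed weighted score 2.0 over total weight 1.6, with 3 of 4 member IDs mapped
print(normalize_mcq_score(2.0, 1.6, 3, 4))   # ~0.34
print(normalize_mcq_score(10.0, 2.0, 4, 4))  # ~0.95, saturating toward 1

Because the logistic transform is applied to the absolute value of the scaled score, strong
negative associations also map toward 1; the direction of the association remains available
from the underlying ln_ratio_score, and the n_mapped_ids / n_member_ids factor lowers the
score of input sets whose member CURIEs only partially mapped to OMOP concepts.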