From f1f0667bc13dde346cc617b8123dd8bafa440b96 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 29 Nov 2024 22:36:28 +0100
Subject: [PATCH 1/8] passed correct_col to prepare_name_pairs and replaced
 hardcoded column names

---
 emm/data/prepare_name_pairs.py         | 11 ++++++-----
 emm/pipeline/pandas_entity_matching.py |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py
index 0c0b29f..9aa4aa7 100644
--- a/emm/data/prepare_name_pairs.py
+++ b/emm/data/prepare_name_pairs.py
@@ -39,6 +39,7 @@ def prepare_name_pairs_pd(
     entity_id_col="entity_id",
     gt_entity_id_col="gt_entity_id",
     positive_set_col="positive_set",
+    correct_col="correct",
     uid_col="uid",
     random_seed=42,
 ):
@@ -84,7 +85,7 @@ def prepare_name_pairs_pd(
     assert entity_id_col in candidates_pd.columns
     assert gt_entity_id_col in candidates_pd.columns
 
-    candidates_pd["correct"] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col]
+    candidates_pd[correct_col] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col]
 
     # negative sample creation?
     # if so, add positive_set_col column for negative sample creation
@@ -110,14 +111,14 @@ def prepare_name_pairs_pd(
     # - happens with one correct/positive case, we just pick the correct one
     if drop_duplicate_candidates:
         candidates_pd = candidates_pd.sort_values(
-            ["uid", "gt_preprocessed", "correct"], ascending=False
+            ["uid", "gt_preprocessed", correct_col], ascending=False
         ).drop_duplicates(subset=["uid", "gt_preprocessed"], keep="first")
     # Similar, for a training set remove all equal names that are not considered a match.
     # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name.
     # It's a true effect in data, but this screws up our intuitive notion that identical names should be related.
     if drop_samename_nomatch:
         samename_nomatch = (candidates_pd["preprocessed"] == candidates_pd["gt_preprocessed"]) & ~candidates_pd[
-            "correct"
+            correct_col
         ]
         candidates_pd = candidates_pd[~samename_nomatch]
 
@@ -133,7 +134,7 @@ def prepare_name_pairs_pd(
         # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt
         # create negative sample and rerank negative candidates
         # this drops, in part, the negative correct candidates
-        candidates_pd = create_positive_negative_samples(candidates_pd)
+        candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col)
 
     # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows
     names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates()
@@ -142,7 +143,7 @@ def prepare_name_pairs_pd(
     )
     names_to_match_missing = names_to_match_missing[names_to_match_missing["_merge"] == "left_only"]
     names_to_match_missing = names_to_match_missing.drop(columns=["_merge"])
-    names_to_match_missing["correct"] = False
+    names_to_match_missing[correct_col] = False
     # Since this column is used to calculate benchmark metrics
     names_to_match_missing["score_0_rank"] = 1
 
diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py
index 190cbdf..0fbd094 100644
--- a/emm/pipeline/pandas_entity_matching.py
+++ b/emm/pipeline/pandas_entity_matching.py
@@ -384,6 +384,7 @@ def create_training_name_pairs(
             else drop_duplicate_candidates,
             create_negative_sample_fraction=create_negative_sample_fraction,
             positive_set_col=self.parameters.get("positive_set_col", "positive_set"),
+            correct_col=self.parameters.get("correct_col", "correct"),
             random_seed=random_seed,
             **kwargs,
         )

From c44bfe1a0ec25c0c50867705005be7f77d43a5f1 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 29 Nov 2024 22:40:04 +0100
Subject: [PATCH 2/8] passed uid_col to prepare_name_pairs and changed
 hardcoded mentions of it

---
 emm/data/prepare_name_pairs.py         | 6 +++---
 emm/pipeline/pandas_entity_matching.py | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py
index 9aa4aa7..7689e47 100644
--- a/emm/data/prepare_name_pairs.py
+++ b/emm/data/prepare_name_pairs.py
@@ -111,8 +111,8 @@ def prepare_name_pairs_pd(
     # - happens with one correct/positive case, we just pick the correct one
     if drop_duplicate_candidates:
         candidates_pd = candidates_pd.sort_values(
-            ["uid", "gt_preprocessed", correct_col], ascending=False
-        ).drop_duplicates(subset=["uid", "gt_preprocessed"], keep="first")
+            [uid_col, "gt_preprocessed", correct_col], ascending=False
+        ).drop_duplicates(subset=[uid_col, "gt_preprocessed"], keep="first")
     # Similar, for a training set remove all equal names that are not considered a match.
     # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name.
     # It's a true effect in data, but this screws up our intuitive notion that identical names should be related.
@@ -134,7 +134,7 @@ def prepare_name_pairs_pd(
         # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt
         # create negative sample and rerank negative candidates
         # this drops, in part, the negative correct candidates
-        candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col)
+        candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col, uid_col=uid_col)
 
     # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows
     names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates()
diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py
index 0fbd094..05c808c 100644
--- a/emm/pipeline/pandas_entity_matching.py
+++ b/emm/pipeline/pandas_entity_matching.py
@@ -385,6 +385,7 @@ def create_training_name_pairs(
             create_negative_sample_fraction=create_negative_sample_fraction,
             positive_set_col=self.parameters.get("positive_set_col", "positive_set"),
             correct_col=self.parameters.get("correct_col", "correct"),
+            uid_col=self.parameters.get("uid_col", "uid"),
             random_seed=random_seed,
             **kwargs,
         )

From 8ff151cf2c75973f577cc43f09cae22f62683903 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 29 Nov 2024 22:46:25 +0100
Subject: [PATCH 3/8] added docstring for correct_col in prepare_name_pairs

---
 emm/data/prepare_name_pairs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py
index 7689e47..4ce3276 100644
--- a/emm/data/prepare_name_pairs.py
+++ b/emm/data/prepare_name_pairs.py
@@ -71,6 +71,8 @@ def prepare_name_pairs_pd(
                         For matching name-pairs entity_id == gt_entity_id.
         positive_set_col: column that specifies which candidates remain positive and which become negative,
                         default is "positive_set".
+        correct_col: column that indicates a correct match, default is "correct".
+                        For entity_id == gt_entity_id the column value is "correct".
         uid_col: uid column for names to match, default is "uid".
         random_seed: random seed for selection of negative names, default is 42.
     """

From dd127414e8975784e78acbb9d7620be73dbc2df6 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 29 Nov 2024 22:59:06 +0100
Subject: [PATCH 4/8] passed more columns to prepare_name_pairs and replaced
 their corresponding hardcoded values

---
 emm/data/prepare_name_pairs.py         | 18 ++++++++++++------
 emm/pipeline/pandas_entity_matching.py |  3 +++
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py
index 4ce3276..3a4d614 100644
--- a/emm/data/prepare_name_pairs.py
+++ b/emm/data/prepare_name_pairs.py
@@ -41,6 +41,9 @@ def prepare_name_pairs_pd(
     positive_set_col="positive_set",
     correct_col="correct",
     uid_col="uid",
+    gt_uid_col="gt_uid",
+    preprocessed_col="preprocessed",
+    gt_preprocessed_col="gt_preprocessed",
     random_seed=42,
 ):
     """Prepare dataset of name-pair candidates for training of supervised model.
@@ -74,6 +77,9 @@ def prepare_name_pairs_pd(
         correct_col: column that indicates a correct match, default is "correct".
                         For entity_id == gt_entity_id the column value is "correct".
         uid_col: uid column for names to match, default is "uid".
+        gt_uid_col: uid column of ground-truth names, default is "gt_uid".
+        preprocessed_col: name of the preprocessed names column, default is "preprocessed".
+        gt_preprocessed_col: name of the preprocessed ground-truth names column, default is "gt_preprocessed".
         random_seed: random seed for selection of negative names, default is 42.
     """
     """We can have the following dataset.columns, or much more like 'count', 'counterparty_account_count_distinct', 'type1_sum':
@@ -113,13 +119,13 @@ def prepare_name_pairs_pd(
     # - happens with one correct/positive case, we just pick the correct one
     if drop_duplicate_candidates:
         candidates_pd = candidates_pd.sort_values(
-            [uid_col, "gt_preprocessed", correct_col], ascending=False
-        ).drop_duplicates(subset=[uid_col, "gt_preprocessed"], keep="first")
+            [uid_col, gt_preprocessed_col, correct_col], ascending=False
+        ).drop_duplicates(subset=[uid_col, gt_preprocessed_col], keep="first")
     # Similar, for a training set remove all equal names that are not considered a match.
     # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name.
     # It's a true effect in data, but this screws up our intuitive notion that identical names should be related.
     if drop_samename_nomatch:
-        samename_nomatch = (candidates_pd["preprocessed"] == candidates_pd["gt_preprocessed"]) & ~candidates_pd[
+        samename_nomatch = (candidates_pd[preprocessed_col] == candidates_pd[gt_preprocessed_col]) & ~candidates_pd[
             correct_col
         ]
         candidates_pd = candidates_pd[~samename_nomatch]
@@ -136,7 +142,7 @@ def prepare_name_pairs_pd(
         # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt
         # create negative sample and rerank negative candidates
         # this drops, in part, the negative correct candidates
-        candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col, uid_col=uid_col)
+        candidates_pd = create_positive_negative_samples(candidates_pd, uid_col=uid_col, correct_col=correct_col)
 
     # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows
     names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates()
@@ -150,7 +156,7 @@ def prepare_name_pairs_pd(
     names_to_match_missing["score_0_rank"] = 1
 
     candidates_pd = pd.concat([candidates_pd, names_to_match_missing], ignore_index=True)
-    candidates_pd["gt_preprocessed"] = candidates_pd["gt_preprocessed"].fillna("")
-    candidates_pd["no_candidate"] = candidates_pd["gt_uid"].isnull()
+    candidates_pd[gt_preprocessed_col] = candidates_pd[gt_preprocessed_col].fillna("")
+    candidates_pd["no_candidate"] = candidates_pd[gt_uid_col].isnull()
 
     return candidates_pd
diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py
index 05c808c..1d4c591 100644
--- a/emm/pipeline/pandas_entity_matching.py
+++ b/emm/pipeline/pandas_entity_matching.py
@@ -386,6 +386,9 @@ def create_training_name_pairs(
             positive_set_col=self.parameters.get("positive_set_col", "positive_set"),
             correct_col=self.parameters.get("correct_col", "correct"),
             uid_col=self.parameters.get("uid_col", "uid"),
+            gt_uid_col=self.parameters.get("gt_uid_col", "gt_uid"),
+            preprocessed_col=self.parameters.get("preprocessed_col", "preprocessed"),
+            gt_preprocessed_col=self.parameters.get("gt_preprocessed_col", "gt_preprocessed"),
             random_seed=random_seed,
             **kwargs,
         )

From bbda7df0f456fdfe95ff97eb786bdb25c73159d4 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Fri, 29 Nov 2024 23:00:53 +0100
Subject: [PATCH 5/8] passed the new columns also to spark version of training
 name pairs

---
 emm/pipeline/spark_entity_matching.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/emm/pipeline/spark_entity_matching.py b/emm/pipeline/spark_entity_matching.py
index 0df6565..fce8a3d 100644
--- a/emm/pipeline/spark_entity_matching.py
+++ b/emm/pipeline/spark_entity_matching.py
@@ -412,6 +412,11 @@ def create_training_name_pairs(
             else drop_duplicate_candidates,
             create_negative_sample_fraction=create_negative_sample_fraction,
             positive_set_col=self.parameters.get("positive_set_col", "positive_set"),
+            correct_col=self.parameters.get("correct_col", "correct"),
+            uid_col=self.parameters.get("uid_col", "uid"),
+            gt_uid_col=self.parameters.get("gt_uid_col", "gt_uid"),
+            preprocessed_col=self.parameters.get("preprocessed_col", "preprocessed"),
+            gt_preprocessed_col=self.parameters.get("gt_preprocessed_col", "gt_preprocessed"),
             random_seed=random_seed,
             **kwargs,
         )

From be2f01d0680235e4a08f24c4973f27befa551c07 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sat, 30 Nov 2024 09:58:57 +0100
Subject: [PATCH 6/8] added branch to test.yml

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e5247e7..778d4f5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [ main, refactor_prepare_name_pairs ]
   pull_request:
 
 jobs:

From b62256924c7a5f401403dd331767446bb08d2547 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sat, 30 Nov 2024 10:23:23 +0100
Subject: [PATCH 7/8] passed positive_set_col to create_positive_negative_samples

---
 emm/data/prepare_name_pairs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py
index 3a4d614..f953295 100644
--- a/emm/data/prepare_name_pairs.py
+++ b/emm/data/prepare_name_pairs.py
@@ -142,7 +142,9 @@ def prepare_name_pairs_pd(
         # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt
         # create negative sample and rerank negative candidates
         # this drops, in part, the negative correct candidates
-        candidates_pd = create_positive_negative_samples(candidates_pd, uid_col=uid_col, correct_col=correct_col)
+        candidates_pd = create_positive_negative_samples(
+            candidates_pd, uid_col=uid_col, correct_col=correct_col, positive_set_col=positive_set_col
+        )
 
     # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows
     names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates()

From 57d177ee7dcec97a9efddec305094454527753c1 Mon Sep 17 00:00:00 2001
From: chrispyl <pylianidis@gmail.com>
Date: Sat, 30 Nov 2024 12:47:44 +0100
Subject: [PATCH 8/8] removed branch from test.yml

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 778d4f5..e5247e7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Tests
 
 on:
   push:
-    branches: [ main, refactor_prepare_name_pairs ]
+    branches: [ main ]
   pull_request:
 
 jobs: