From f1f0667bc13dde346cc617b8123dd8bafa440b96 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 29 Nov 2024 22:36:28 +0100 Subject: [PATCH 1/8] passed correct_col to prepare_name pairs and replaced hardcoded column names --- emm/data/prepare_name_pairs.py | 11 ++++++----- emm/pipeline/pandas_entity_matching.py | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py index 0c0b29f..9aa4aa7 100644 --- a/emm/data/prepare_name_pairs.py +++ b/emm/data/prepare_name_pairs.py @@ -39,6 +39,7 @@ def prepare_name_pairs_pd( entity_id_col="entity_id", gt_entity_id_col="gt_entity_id", positive_set_col="positive_set", + correct_col="correct", uid_col="uid", random_seed=42, ): @@ -84,7 +85,7 @@ def prepare_name_pairs_pd( assert entity_id_col in candidates_pd.columns assert gt_entity_id_col in candidates_pd.columns - candidates_pd["correct"] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col] + candidates_pd[correct_col] = candidates_pd[entity_id_col] == candidates_pd[gt_entity_id_col] # negative sample creation? # if so, add positive_set_col column for negative sample creation @@ -110,14 +111,14 @@ def prepare_name_pairs_pd( # - happens with one correct/positive case, we just pick the correct one if drop_duplicate_candidates: candidates_pd = candidates_pd.sort_values( - ["uid", "gt_preprocessed", "correct"], ascending=False + ["uid", "gt_preprocessed", correct_col], ascending=False ).drop_duplicates(subset=["uid", "gt_preprocessed"], keep="first") # Similar, for a training set remove all equal names that are not considered a match. # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name. # It's a true effect in data, but this screws up our intuitive notion that identical names should be related. 
if drop_samename_nomatch: samename_nomatch = (candidates_pd["preprocessed"] == candidates_pd["gt_preprocessed"]) & ~candidates_pd[ - "correct" + correct_col ] candidates_pd = candidates_pd[~samename_nomatch] @@ -133,7 +134,7 @@ def prepare_name_pairs_pd( # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt # create negative sample and rerank negative candidates # this drops, in part, the negative correct candidates - candidates_pd = create_positive_negative_samples(candidates_pd) + candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col) # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates() @@ -142,7 +143,7 @@ def prepare_name_pairs_pd( ) names_to_match_missing = names_to_match_missing[names_to_match_missing["_merge"] == "left_only"] names_to_match_missing = names_to_match_missing.drop(columns=["_merge"]) - names_to_match_missing["correct"] = False + names_to_match_missing[correct_col] = False # Since this column is used to calculate benchmark metrics names_to_match_missing["score_0_rank"] = 1 diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py index 190cbdf..0fbd094 100644 --- a/emm/pipeline/pandas_entity_matching.py +++ b/emm/pipeline/pandas_entity_matching.py @@ -384,6 +384,7 @@ def create_training_name_pairs( else drop_duplicate_candidates, create_negative_sample_fraction=create_negative_sample_fraction, positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + correct_col=self.parameters.get("correct_col", "correct"), random_seed=random_seed, **kwargs, ) From c44bfe1a0ec25c0c50867705005be7f77d43a5f1 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 29 Nov 2024 22:40:04 +0100 Subject: [PATCH 2/8] passed uid_col to prepare_name_pairs and changed hardcoded mentions of it --- emm/data/prepare_name_pairs.py | 6 +++--- 
emm/pipeline/pandas_entity_matching.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py index 9aa4aa7..7689e47 100644 --- a/emm/data/prepare_name_pairs.py +++ b/emm/data/prepare_name_pairs.py @@ -111,8 +111,8 @@ def prepare_name_pairs_pd( # - happens with one correct/positive case, we just pick the correct one if drop_duplicate_candidates: candidates_pd = candidates_pd.sort_values( - ["uid", "gt_preprocessed", correct_col], ascending=False - ).drop_duplicates(subset=["uid", "gt_preprocessed"], keep="first") + [uid_col, "gt_preprocessed", correct_col], ascending=False + ).drop_duplicates(subset=[uid_col, "gt_preprocessed"], keep="first") # Similar, for a training set remove all equal names that are not considered a match. # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name. # It's a true effect in data, but this screws up our intuitive notion that identical names should be related. 
@@ -134,7 +134,7 @@ def prepare_name_pairs_pd( # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt # create negative sample and rerank negative candidates # this drops, in part, the negative correct candidates - candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col) + candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col, uid_col=uid_col) # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates() diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py index 0fbd094..05c808c 100644 --- a/emm/pipeline/pandas_entity_matching.py +++ b/emm/pipeline/pandas_entity_matching.py @@ -385,6 +385,7 @@ def create_training_name_pairs( create_negative_sample_fraction=create_negative_sample_fraction, positive_set_col=self.parameters.get("positive_set_col", "positive_set"), correct_col=self.parameters.get("correct_col", "correct"), + uid_col=self.parameters.get("uid_col", "uid"), random_seed=random_seed, **kwargs, ) From 8ff151cf2c75973f577cc43f09cae22f62683903 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 29 Nov 2024 22:46:25 +0100 Subject: [PATCH 3/8] added docstring for correct_col in prepare name pairs --- emm/data/prepare_name_pairs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py index 7689e47..4ce3276 100644 --- a/emm/data/prepare_name_pairs.py +++ b/emm/data/prepare_name_pairs.py @@ -71,6 +71,8 @@ def prepare_name_pairs_pd( For matching name-pairs entity_id == gt_entity_id. positive_set_col: column that specifies which candidates remain positive and which become negative, default is "positive_set". + correct_col: column that indicates a correct match, default is "correct". + The column value is True where entity_id == gt_entity_id, else False. 
uid_col: uid column for names to match, default is "uid". random_seed: random seed for selection of negative names, default is 42. """ From dd127414e8975784e78acbb9d7620be73dbc2df6 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 29 Nov 2024 22:59:06 +0100 Subject: [PATCH 4/8] passed more columns to prepare name pairs and replaced their corresponding hardcoded values --- emm/data/prepare_name_pairs.py | 18 ++++++++++++------ emm/pipeline/pandas_entity_matching.py | 3 +++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py index 4ce3276..3a4d614 100644 --- a/emm/data/prepare_name_pairs.py +++ b/emm/data/prepare_name_pairs.py @@ -41,6 +41,9 @@ def prepare_name_pairs_pd( positive_set_col="positive_set", correct_col="correct", uid_col="uid", + gt_uid_col="gt_uid", + preprocessed_col="preprocessed", + gt_preprocessed_col="gt_preprocessed", random_seed=42, ): """Prepare dataset of name-pair candidates for training of supervised model. @@ -74,6 +77,9 @@ def prepare_name_pairs_pd( correct_col: column that indicates a correct match, default is "correct". The column value is True where entity_id == gt_entity_id, else False. uid_col: uid column for names to match, default is "uid". + gt_uid_col: uid column of ground-truth names, default is "gt_uid". + preprocessed_col: name of the preprocessed names column, default is "preprocessed". + gt_preprocessed_col: name of the preprocessed ground-truth names column, default is "gt_preprocessed". random_seed: random seed for selection of negative names, default is 42. 
""" """We can have the following dataset.columns, or much more like 'count', 'counterparty_account_count_distinct', 'type1_sum': @@ -113,13 +119,13 @@ def prepare_name_pairs_pd( # - happens with one correct/positive case, we just pick the correct one if drop_duplicate_candidates: candidates_pd = candidates_pd.sort_values( - [uid_col, "gt_preprocessed", correct_col], ascending=False - ).drop_duplicates(subset=[uid_col, "gt_preprocessed"], keep="first") + [uid_col, gt_preprocessed_col, correct_col], ascending=False + ).drop_duplicates(subset=[uid_col, gt_preprocessed_col], keep="first") # Similar, for a training set remove all equal names that are not considered a match. # This can happen a lot in actual data, e.g. with franchises that are independent but have the same name. # It's a true effect in data, but this screws up our intuitive notion that identical names should be related. if drop_samename_nomatch: - samename_nomatch = (candidates_pd["preprocessed"] == candidates_pd["gt_preprocessed"]) & ~candidates_pd[ + samename_nomatch = (candidates_pd[preprocessed_col] == candidates_pd[gt_preprocessed_col]) & ~candidates_pd[ correct_col ] candidates_pd = candidates_pd[~samename_nomatch] @@ -136,7 +142,7 @@ def prepare_name_pairs_pd( # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt # create negative sample and rerank negative candidates # this drops, in part, the negative correct candidates - candidates_pd = create_positive_negative_samples(candidates_pd, correct_col=correct_col, uid_col=uid_col) + candidates_pd = create_positive_negative_samples(candidates_pd, uid_col=uid_col, correct_col=correct_col) # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates() @@ -150,7 +156,7 @@ def prepare_name_pairs_pd( names_to_match_missing["score_0_rank"] = 1 candidates_pd = pd.concat([candidates_pd, names_to_match_missing], 
ignore_index=True) - candidates_pd["gt_preprocessed"] = candidates_pd["gt_preprocessed"].fillna("") - candidates_pd["no_candidate"] = candidates_pd["gt_uid"].isnull() + candidates_pd[gt_preprocessed_col] = candidates_pd[gt_preprocessed_col].fillna("") + candidates_pd["no_candidate"] = candidates_pd[gt_uid_col].isnull() return candidates_pd diff --git a/emm/pipeline/pandas_entity_matching.py b/emm/pipeline/pandas_entity_matching.py index 05c808c..1d4c591 100644 --- a/emm/pipeline/pandas_entity_matching.py +++ b/emm/pipeline/pandas_entity_matching.py @@ -386,6 +386,9 @@ def create_training_name_pairs( positive_set_col=self.parameters.get("positive_set_col", "positive_set"), correct_col=self.parameters.get("correct_col", "correct"), uid_col=self.parameters.get("uid_col", "uid"), + gt_uid_col=self.parameters.get("gt_uid_col", "gt_uid"), + preprocessed_col=self.parameters.get("preprocessed_col", "preprocessed"), + gt_preprocessed_col=self.parameters.get("gt_preprocessed_col", "gt_preprocessed"), random_seed=random_seed, **kwargs, ) From bbda7df0f456fdfe95ff97eb786bdb25c73159d4 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Fri, 29 Nov 2024 23:00:53 +0100 Subject: [PATCH 5/8] passed the new columns also to spark version of training name pairs --- emm/pipeline/spark_entity_matching.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/emm/pipeline/spark_entity_matching.py b/emm/pipeline/spark_entity_matching.py index 0df6565..fce8a3d 100644 --- a/emm/pipeline/spark_entity_matching.py +++ b/emm/pipeline/spark_entity_matching.py @@ -412,6 +412,11 @@ def create_training_name_pairs( else drop_duplicate_candidates, create_negative_sample_fraction=create_negative_sample_fraction, positive_set_col=self.parameters.get("positive_set_col", "positive_set"), + correct_col=self.parameters.get("correct_col", "correct"), + uid_col=self.parameters.get("uid_col", "uid"), + gt_uid_col=self.parameters.get("gt_uid_col", "gt_uid"), + preprocessed_col=self.parameters.get("preprocessed_col", 
"preprocessed"), + gt_preprocessed_col=self.parameters.get("gt_preprocessed_col", "gt_preprocessed"), random_seed=random_seed, **kwargs, ) From be2f01d0680235e4a08f24c4973f27befa551c07 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sat, 30 Nov 2024 09:58:57 +0100 Subject: [PATCH 6/8] added branch to test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e5247e7..778d4f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main ] + branches: [ main, refactor_prepare_name_pairs ] pull_request: jobs: From b62256924c7a5f401403dd331767446bb08d2547 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sat, 30 Nov 2024 10:23:23 +0100 Subject: [PATCH 7/8] passed positive_set_col to create_negative_name_pairs --- emm/data/prepare_name_pairs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/emm/data/prepare_name_pairs.py b/emm/data/prepare_name_pairs.py index 3a4d614..f953295 100644 --- a/emm/data/prepare_name_pairs.py +++ b/emm/data/prepare_name_pairs.py @@ -142,7 +142,9 @@ def prepare_name_pairs_pd( # is referred to in: resources/data/howto_create_unittest_sample_namepairs.txt # create negative sample and rerank negative candidates # this drops, in part, the negative correct candidates - candidates_pd = create_positive_negative_samples(candidates_pd, uid_col=uid_col, correct_col=correct_col) + candidates_pd = create_positive_negative_samples( + candidates_pd, uid_col=uid_col, correct_col=correct_col, positive_set_col=positive_set_col + ) # It could be that we dropped all candidates, so we need to re-introduce the no-candidate rows names_to_match_after = candidates_pd[names_to_match_cols].drop_duplicates() From 57d177ee7dcec97a9efddec305094454527753c1 Mon Sep 17 00:00:00 2001 From: chrispyl Date: Sat, 30 Nov 2024 12:47:44 +0100 Subject: [PATCH 8/8] removed branch from test.yml 
--- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 778d4f5..e5247e7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Tests on: push: - branches: [ main, refactor_prepare_name_pairs ] + branches: [ main ] pull_request: jobs: