From fc91fbbbba329ffedce9cf628922dab9f876767f Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 2 Mar 2026 18:13:43 -0500 Subject: [PATCH 1/8] added tests --- sdv/multi_table/base.py | 2 +- tests/integration/metadata/test_metadata.py | 106 ++++++++++++++++++++ tests/unit/metadata/test_metadata.py | 28 ++++++ 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/sdv/multi_table/base.py b/sdv/multi_table/base.py index 8742e5f99..8fde63e8e 100644 --- a/sdv/multi_table/base.py +++ b/sdv/multi_table/base.py @@ -439,7 +439,7 @@ def _validate_all_tables(self, data): def validate(self, data): """Validate the data. - Validate that the metadata matches the data and thta every table's constraints are valid. + Validate that the metadata matches the data and that every table's constraints are valid. Args: data (dict[str, pd.DataFrame]): diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index 17ce74083..8d49a1e2f 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -879,6 +879,112 @@ def test_detect_from_dataframes_invalid_format(): Metadata.detect_from_dataframes(data) +def test_detect_from_dataframes__primary_to_primary(): + """Test metadata auto-detection works for primary to primary relationships.""" + # Setup + data = { + 'tableA': pd.DataFrame({ + 'table_A_id': range(5), + 'column_1': ['A', 'B', 'B', 'C', 'C'], + }), + 'tableB': pd.DataFrame({ + 'table_A_id': range(5), + 'column_2': ['A', 'B', 'B', 'C', 'C'], + }), + } + + # Run + detected_metadata = Metadata().detect_from_dataframes( + data, foreign_key_inference_algorithm='column_name_match' + ) + + # Assert + assert detected_metadata.to_dict() == { + 'tables': { + 'tableA': { + 'columns': { + 'table_A_id': {'sdtype': 'id'}, + 'column_1': {'sdtype': 'categorical'}, + }, + 'primary_key': 'table_A_id', + }, + 'tableB': { + 'columns': { + 'table_A_id': {'sdtype': 'id'}, + 'column_2': {'sdtype': 'categorical'}, + }, + 'primary_key': 'table_A_id', + }, + }, + 'relationships': [ + { + 'parent_table_name': 'tableA', + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } + ], + 'METADATA_SPEC_VERSION': 'V1', + } + + +def test_detect_from_dataframes__primary_to_primary_no_cycles(): + """Test metadata auto-detection does not create cycles with PK to PK.""" + # Setup + data = { + 'tableA': pd.DataFrame({ + 'table_A_id': range(5), + 'column_1': ['A', 'B', 'B', 'C', 'C'], + }), + 'tableB': pd.DataFrame({ + 'table_A_id': range(5), + 'column_2': ['A', 'B', 'B', 'C', 'C'], + }), + 'tableC': pd.DataFrame({ + 'table_A_id': range(5), + 'column_2': ['A', 'B', 'B', 'C', 'C'], + }), + } + + # Run + detected_metadata = Metadata().detect_from_dataframes( + data, foreign_key_inference_algorithm='column_name_match' + ) + + # Assert + assert detected_metadata.to_dict() == { + 'tables': { + 'tableA': { + 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_1': {'sdtype': 'categorical'}}, + 'primary_key': 'table_A_id', + }, + 'tableB': { + 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_2': {'sdtype': 'categorical'}}, + 'primary_key': 'table_A_id', + }, + 'tableC': { + 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_2': {'sdtype': 'categorical'}}, + 'primary_key': 'table_A_id', + }, + }, + 'relationships': [ + { + 'parent_table_name': 'tableA', + 'child_table_name': 'tableC', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + }, + { + 'parent_table_name': 'tableA', + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + }, + ], + 'METADATA_SPEC_VERSION': 'V1', + } + + def test_validate_metadata_with_reused_foreign_keys(): # Setup metadata_dict = { diff --git a/tests/unit/metadata/test_metadata.py b/tests/unit/metadata/test_metadata.py index 95ab8ca6a..17d400801 100644 --- a/tests/unit/metadata/test_metadata.py +++ b/tests/unit/metadata/test_metadata.py @@ -727,6 +727,34 @@ def test_detect_from_dataframes_bad_input_infer_keys(self): with pytest.raises(ValueError, match=expected_message): Metadata.detect_from_dataframes(data, infer_keys=infer_keys) + def test_detect_from_dataframe_primary_key_to_primary_key(self): + """Test primary to primary key relationship is detected if column name match.""" + # Setup + data = { + 'table1': pd.DataFrame({ + 'id': [1, 2, 3], + }), + 'table2': pd.DataFrame({ + 'id': [1, 2, 3], + }), + } + instance = Metadata() + instance.detect_table_from_dataframe('table1', data['table1']) + instance.detect_table_from_dataframe('table2', data['table2']) + + # Run + instance._detect_foreign_keys_by_column_name(data) + + # Assert + assert instance.to_dict()['relationships'] == [ + { + 'parent_table_name': 'table1', + 'child_table_name': 'table2', + 'parent_primary_key': 'id', + 'child_foreign_key': 'id', + } + ] + @patch('sdv.metadata.metadata.Metadata') def test_detect_from_dataframe(self, mock_metadata): """Test that the method calls the detection method and returns the metadata. From e8d14488423049fab47fc1d761578c63572d1874 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 2 Mar 2026 18:17:52 -0500 Subject: [PATCH 2/8] add cycle unit test --- tests/unit/metadata/test_metadata.py | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/unit/metadata/test_metadata.py b/tests/unit/metadata/test_metadata.py index 17d400801..5d8a59a7f 100644 --- a/tests/unit/metadata/test_metadata.py +++ b/tests/unit/metadata/test_metadata.py @@ -755,6 +755,44 @@ def test_detect_from_dataframe_primary_key_to_primary_key(self): } ] + def test_detect_from_dataframe_primary_key_to_primary_key_no_cycles(self): + """Test no cycles are created with primary to primary key relationship.""" + # Setup + data = { + 'table1': pd.DataFrame({ + 'id': [1, 2, 3], + }), + 'table2': pd.DataFrame({ + 'id': [1, 2, 3], + }), + 'table3': pd.DataFrame({ + 'id': [1, 2, 3], + }), + } + instance = Metadata() + instance.detect_table_from_dataframe('table1', data['table1']) + instance.detect_table_from_dataframe('table2', data['table2']) + instance.detect_table_from_dataframe('table3', data['table3']) + + # Run + instance._detect_foreign_keys_by_column_name(data) + + # Assert + assert instance.to_dict()['relationships'] == [ + { + 'parent_table_name': 'table1', + 'child_table_name': 'table2', + 'parent_primary_key': 'id', + 'child_foreign_key': 'id', + }, + { + 'parent_table_name': 'table1', + 'child_table_name': 'table3', + 'parent_primary_key': 'id', + 'child_foreign_key': 'id', + }, + ] + @patch('sdv.metadata.metadata.Metadata') def test_detect_from_dataframe(self, mock_metadata): """Test that the method calls the detection method and returns the metadata. From 907c1093d2dcb38f95710abce1da5fbdecf57abb Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 2 Mar 2026 19:11:46 -0500 Subject: [PATCH 3/8] fix test --- tests/integration/metadata/test_metadata.py | 91 ++++++++------------- 1 file changed, 33 insertions(+), 58 deletions(-) diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index 8d49a1e2f..d2c643cb0 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -899,33 +899,16 @@ def test_detect_from_dataframes__primary_to_primary(): ) # Assert - assert detected_metadata.to_dict() == { - 'tables': { - 'tableA': { - 'columns': { - 'table_A_id': {'sdtype': 'id'}, - 'column_1': {'sdtype': 'categorical'}, - }, - 'primary_key': 'table_A_id', - }, - 'tableB': { - 'columns': { - 'table_A_id': {'sdtype': 'id'}, - 'column_2': {'sdtype': 'categorical'}, - }, - 'primary_key': 'table_A_id', - }, - }, - 'relationships': [ - { - 'parent_table_name': 'tableA', - 'child_table_name': 'tableB', - 'parent_primary_key': 'table_A_id', - 'child_foreign_key': 'table_A_id', - } - ], - 'METADATA_SPEC_VERSION': 'V1', - } + assert detected_metadata.tables['tableA'].primary_key == 'table_A_id' + assert detected_metadata.tables['tableB'].primary_key == 'table_A_id' + assert detected_metadata.relationships == [ + { + 'parent_table_name': 'tableA', + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } + ] def test_detect_from_dataframes__primary_to_primary_no_cycles(): @@ -952,37 +935,29 @@ def test_detect_from_dataframes__primary_to_primary_no_cycles(): ) # Assert - assert detected_metadata.to_dict() == { - 'tables': { - 'tableA': { - 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_1': {'sdtype': 'categorical'}}, - 'primary_key': 'table_A_id', - }, - 'tableB': { - 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_2': {'sdtype': 'categorical'}}, - 'primary_key': 'table_A_id', - }, - 'tableC': { - 'columns': {'table_A_id': {'sdtype': 'id'}, 'column_2': {'sdtype': 'categorical'}}, - 'primary_key': 'table_A_id', - }, - }, - 'relationships': [ - { - 'parent_table_name': 'tableA', - 'child_table_name': 'tableC', - 'parent_primary_key': 'table_A_id', - 'child_foreign_key': 'table_A_id', - }, - { - 'parent_table_name': 'tableA', - 'child_table_name': 'tableB', - 'parent_primary_key': 'table_A_id', - 'child_foreign_key': 'table_A_id', - }, - ], - 'METADATA_SPEC_VERSION': 'V1', - } + assert detected_metadata.tables['tableA'].primary_key == 'table_A_id' + assert detected_metadata.tables['tableB'].primary_key == 'table_A_id' + assert detected_metadata.tables['tableC'].primary_key == 'table_A_id' + if len(detected_metadata.relationships) == 1: + assert { + 'child_foreign_key': 'table_A_id', + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'parent_table_name': 'tableA', + } in detected_metadata.relationships + else: + assert { + 'parent_table_name': 'tableA', # PK to PK + 'child_table_name': 'tableC', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } in detected_metadata.relationships + assert { + 'parent_table_name': 'tableA', # PK to PK + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } in detected_metadata.relationships def test_validate_metadata_with_reused_foreign_keys(): From c20528703bd09da1520d130b5151e55a341f3d16 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 2 Mar 2026 19:18:27 -0500 Subject: [PATCH 4/8] fix unit test --- tests/unit/metadata/test_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/metadata/test_metadata.py b/tests/unit/metadata/test_metadata.py index 5d8a59a7f..54475c3c6 100644 --- a/tests/unit/metadata/test_metadata.py +++ b/tests/unit/metadata/test_metadata.py @@ -781,13 +781,13 @@ def test_detect_from_dataframe_primary_key_to_primary_key_no_cycles(self): assert instance.to_dict()['relationships'] == [ { 'parent_table_name': 'table1', - 'child_table_name': 'table2', + 'child_table_name': 'table3', 'parent_primary_key': 'id', 'child_foreign_key': 'id', }, { 'parent_table_name': 'table1', - 'child_table_name': 'table3', + 'child_table_name': 'table2', 'parent_primary_key': 'id', 'child_foreign_key': 'id', }, From d2a82a72789ac2eedee8fc4510f888f281e3bcfd Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 2 Mar 2026 19:22:10 -0500 Subject: [PATCH 5/8] fix flaky test --- tests/unit/metadata/test_metadata.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/metadata/test_metadata.py b/tests/unit/metadata/test_metadata.py index 54475c3c6..dd5a33708 100644 --- a/tests/unit/metadata/test_metadata.py +++ b/tests/unit/metadata/test_metadata.py @@ -778,20 +778,23 @@ def test_detect_from_dataframe_primary_key_to_primary_key_no_cycles(self): instance._detect_foreign_keys_by_column_name(data) # Assert - assert instance.to_dict()['relationships'] == [ + expected = [ { 'parent_table_name': 'table1', - 'child_table_name': 'table3', + 'child_table_name': 'table2', 'parent_primary_key': 'id', 'child_foreign_key': 'id', }, { 'parent_table_name': 'table1', - 'child_table_name': 'table2', + 'child_table_name': 'table3', 'parent_primary_key': 'id', 'child_foreign_key': 'id', }, ] + for rel in expected: + assert rel in instance.to_dict()['relationships'] + assert len(instance.relationships) == 2 @patch('sdv.metadata.metadata.Metadata') def test_detect_from_dataframe(self, mock_metadata): From 6c23f5dc93b4d5db16a4db7151d3baa92e543356 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 3 Mar 2026 10:00:44 -0500 Subject: [PATCH 6/8] update docstring --- sdv/metadata/multi_table.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdv/metadata/multi_table.py b/sdv/metadata/multi_table.py index 5ea9562c3..2a5fdff00 100644 --- a/sdv/metadata/multi_table.py +++ b/sdv/metadata/multi_table.py @@ -536,6 +536,9 @@ def _validate_all_tables_connected(self, parent_map, child_map): def _detect_foreign_keys_by_column_name(self, data): """Detect the foreign keys based on if a column name matches a primary key. + If a column name (a child table) is a primary key, it will also be considered + to be a valid candidate for a foreign key. + Args: data (dict): Dictionary of table names to dataframes. @@ -567,6 +570,7 @@ def _detect_foreign_keys_by_column_name(self, data): ) except InvalidMetadataError: + # circular relationship if pk_sdtype == 'id' and original_fk_sdtype != 'id': self.update_column( table_name=child_candidate, From 669411f572fbbf32a55c86d356ae4e865c0105c0 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 3 Mar 2026 18:54:45 -0500 Subject: [PATCH 7/8] Update tests/integration/metadata/test_metadata.py Co-authored-by: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> --- tests/integration/metadata/test_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index d2c643cb0..8aec56a79 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -940,10 +940,10 @@ def test_detect_from_dataframes__primary_to_primary_no_cycles(): assert detected_metadata.tables['tableC'].primary_key == 'table_A_id' if len(detected_metadata.relationships) == 1: assert { - 'child_foreign_key': 'table_A_id', + 'parent_table_name': 'tableA', 'child_table_name': 'tableB', + 'child_foreign_key': 'table_A_id', 'parent_primary_key': 'table_A_id', - 'parent_table_name': 'tableA', } in detected_metadata.relationships else: assert { From afe04ac99c1cf16a3d41d5a8277df9e4ff0cbed7 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 3 Mar 2026 18:59:33 -0500 Subject: [PATCH 8/8] fix test with 2 relationships --- tests/integration/metadata/test_metadata.py | 33 ++++++++------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/tests/integration/metadata/test_metadata.py b/tests/integration/metadata/test_metadata.py index 8aec56a79..2d1e1ed89 100644 --- a/tests/integration/metadata/test_metadata.py +++ b/tests/integration/metadata/test_metadata.py @@ -938,26 +938,19 @@ def test_detect_from_dataframes__primary_to_primary_no_cycles(): assert detected_metadata.tables['tableA'].primary_key == 'table_A_id' assert detected_metadata.tables['tableB'].primary_key == 'table_A_id' assert detected_metadata.tables['tableC'].primary_key == 'table_A_id' - if len(detected_metadata.relationships) == 1: - assert { - 'parent_table_name': 'tableA', - 'child_table_name': 'tableB', - 'child_foreign_key': 'table_A_id', - 'parent_primary_key': 'table_A_id', - } in detected_metadata.relationships - else: - assert { - 'parent_table_name': 'tableA', # PK to PK - 'child_table_name': 'tableC', - 'parent_primary_key': 'table_A_id', - 'child_foreign_key': 'table_A_id', - } in detected_metadata.relationships - assert { - 'parent_table_name': 'tableA', # PK to PK - 'child_table_name': 'tableB', - 'parent_primary_key': 'table_A_id', - 'child_foreign_key': 'table_A_id', - } in detected_metadata.relationships + assert len(detected_metadata.relationships) == 2 + assert { + 'parent_table_name': 'tableA', # PK to PK + 'child_table_name': 'tableC', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } in detected_metadata.relationships + assert { + 'parent_table_name': 'tableA', # PK to PK + 'child_table_name': 'tableB', + 'parent_primary_key': 'table_A_id', + 'child_foreign_key': 'table_A_id', + } in detected_metadata.relationships def test_validate_metadata_with_reused_foreign_keys():