@@ -980,3 +980,69 @@ def test_load_pq_to_sqlite_basic(
980980 ), "Some json object have underscores in their 'keys'"
981981
982982 cnx .close ()
983+
984+
def test_multi_bucket_load_facts_no_fact_loss(tmp_path):
    """
    Exercises the multi-bucket path in load_facts end-to-end and asserts that
    every unique fact in the input appears exactly once in fact.parquet.

    The multi-bucket path is forced by:
    - creating one small parquet file per fact so group_parquet_files produces
      many batch files (one per source file with target_mb=0.001)
    - overriding parquet_dir_details to simulate a large dataset relative to
      available memory, pushing n_buckets above 1

    If any facts are silently dropped during bucket assignment or the final merge
    this test will fail.
    """
    fact_count = 50
    # 64-char hex-like hashes: 63 'a's followed by a distinct suffix per fact.
    fact_hashes = [f"{'a' * 63}{i}" for i in range(fact_count)]

    transformed_dir = tmp_path / "transformed"
    transformed_dir.mkdir()

    # Write one single-row parquet file per fact so each source file becomes
    # its own batch under the tiny target_mb below.
    for idx, fact_hash in enumerate(fact_hashes):
        row = {
            "end_date": [""],
            "entity": [idx + 1],
            "entry_date": ["2023-01-01"],
            "entry_number": ["1"],
            "fact": [fact_hash],
            "field": ["name"],
            "priority": ["1"],
            "reference_entity": [""],
            "resource": ["resource_abc"],
            "start_date": [""],
            "value": [f"value_{idx}"],
        }
        frame = pd.DataFrame(row)
        frame.to_parquet(transformed_dir / f"resource_{idx}.parquet", index=False)

    package = DatasetParquetPackage(
        dataset="conservation-area",
        path=tmp_path / "output",
        specification_dir=None,
        transformed_parquet_dir=transformed_dir,
    )

    # One source file per batch, giving fact_count batch files
    package.group_parquet_files(transformed_dir, target_mb=0.001)

    # Simulate large-dataset conditions so n_buckets > 1 is calculated
    package.parquet_dir_details["total_size_mb"] = 100.0
    package.parquet_dir_details["memory_available"] = 1.0

    package.load_facts(transformed_dir)

    result_path = (
        tmp_path / "output" / "fact" / "dataset=conservation-area" / "fact.parquet"
    )
    assert result_path.exists(), "fact.parquet was not created"

    result_df = pd.read_parquet(result_path)

    assert len(result_df) == fact_count, (
        f"Expected {fact_count} facts but got {len(result_df)}. "
        "The multi-bucket path silently dropped some facts."
    )
0 commit comments