@@ -258,6 +258,56 @@ def group_parquet_files(
258258 f"row count mismatch after grouping: { source_count } source rows vs { batch_count } batch rows"
259259 )
260260
261+ def _assign_to_buckets (self , batch_dir , bucket_dir , n_buckets ):
262+ """
263+ Distribute rows from all batch files into n_buckets bucket files based
264+ on HASH(fact) % n_buckets. Each unique fact hash always maps to the same
265+ bucket, so per-bucket deduplication is equivalent to global deduplication.
266+
267+ Returns the list of bucket file paths.
268+ """
269+ digits = max (2 , len (str (n_buckets - 1 )))
270+ bucket_paths = [
271+ bucket_dir / f"bucket_{ i :0{digits }} .parquet" for i in range (n_buckets )
272+ ]
273+ for i in range (n_buckets ):
274+ self .conn .execute (
275+ f"""
276+ COPY (
277+ SELECT *
278+ FROM read_parquet('{ str (batch_dir )} /batch_*.parquet')
279+ WHERE CAST(HASH(fact) AS UBIGINT) % { n_buckets } = { i }
280+ ) TO '{ bucket_paths [i ]} ' (FORMAT PARQUET);
281+ """
282+ )
283+ return bucket_paths
284+
285+ def _deduplicate_buckets (self , bucket_paths , result_dir ):
286+ """
287+ Apply per-fact deduplication to each bucket file, keeping the highest
288+ priority and most recent entry for each unique fact hash.
289+
290+ Returns the list of result file paths.
291+ """
292+ digits = max (2 , len (str (len (bucket_paths ) - 1 )))
293+ result_paths = [
294+ result_dir / f"result_{ i :0{digits }} .parquet"
295+ for i in range (len (bucket_paths ))
296+ ]
297+ for i , bucket_path in enumerate (bucket_paths ):
298+ self .conn .execute (
299+ f"""
300+ COPY (
301+ SELECT *
302+ FROM read_parquet('{ bucket_path } ')
303+ QUALIFY ROW_NUMBER() OVER (
304+ PARTITION BY fact ORDER BY priority, entry_date DESC, entry_number DESC
305+ ) = 1
306+ ) TO '{ result_paths [i ]} ' (FORMAT PARQUET);
307+ """
308+ )
309+ return result_paths
310+
261311 def load_facts (self , transformed_parquet_dir ):
262312 """
263313 This method loads facts into a fact table from a directory containing all transformed files as parquet files
@@ -336,43 +386,23 @@ def load_facts(self, transformed_parquet_dir):
336386 logger .info (
337387 "Need to use multiple buckets in windowed function for facts"
338388 )
339- bucket_dir = transformed_parquet_dir / "bucket"
340- bucket_dir .mkdir (parents = True , exist_ok = True )
341- digits = max (2 , len (str (n_buckets - 1 )))
342- bucket_paths = [
343- bucket_dir / f"bucket_{ i :0{digits }} .parquet"
344- for i in range (n_buckets )
345- ]
389+ batch_dir = transformed_parquet_dir / "batch"
346390 logger .info (
347- f"Have { len (list (transformed_parquet_dir .glob ('batch/ batch_*.parquet' )))} batch files"
391+ f"Have { len (list (batch_dir .glob ('batch_*.parquet' )))} batch files"
348392 )
349393
350- # Loop over each batch file and assign to a bucket file
394+ bucket_dir = transformed_parquet_dir / "bucket"
395+ bucket_dir .mkdir (parents = True , exist_ok = True )
351396 logger .info (f"Assigning to { n_buckets } buckets" )
352- for f in transformed_parquet_dir .glob ("batch/batch_*.parquet" ):
353- for i in range (n_buckets ):
354- self .conn .execute (
355- f"""
356- COPY (
357- SELECT *
358- FROM read_parquet('{ f } ')
359- WHERE CAST(HASH(fact) AS UBIGINT) % { n_buckets } = { i }
360- ) TO '{ bucket_paths [i ]} ' (FORMAT PARQUET, APPEND TRUE);
361- """
362- )
363-
364- logger .info (
365- f"Have { len (list (bucket_dir .glob ('bucket_*.parquet' )))} bucket files"
366- )
397+ bucket_paths = self ._assign_to_buckets (batch_dir , bucket_dir , n_buckets )
398+ logger .info (f"Have { len (bucket_paths )} bucket files" )
367399
368- # temporary diagnostic: total rows across all buckets should equal
369- # total rows across all batch files (deduplication happens later)
400+ batch_total = self .conn .execute (
401+ f"SELECT COUNT(*) FROM read_parquet('{ str (batch_dir )} /batch_*.parquet')"
402+ ).fetchone ()[0 ]
370403 bucket_total = self .conn .execute (
371404 f"SELECT COUNT(*) FROM read_parquet('{ str (bucket_dir )} /bucket_*.parquet')"
372405 ).fetchone ()[0 ]
373- batch_total = self .conn .execute (
374- f"SELECT COUNT(*) FROM read_parquet('{ str (transformed_parquet_dir )} /batch/batch_*.parquet')"
375- ).fetchone ()[0 ]
376406 logger .info (f"row count in batch files: { batch_total } " )
377407 logger .info (
378408 f"row count in bucket files after assignment: { bucket_total } "
@@ -383,51 +413,27 @@ def load_facts(self, transformed_parquet_dir):
383413 )
384414
385415 process = psutil .Process (os .getpid ())
386- mem = process .memory_info ().rss / 1024 ** 2 # Memory in MB
416+ mem = process .memory_info ().rss / 1024 ** 2
387417 logger .info (f"[Memory usage] After 'bucketing': { mem :.2f} MB" )
388418
389419 result_dir = transformed_parquet_dir / "result"
390420 result_dir .mkdir (parents = True , exist_ok = True )
391- result_paths = [
392- result_dir / f"result_{ i :0{digits }} .parquet"
393- for i in range (n_buckets )
394- ]
395- for i in range (n_buckets ):
396- bucket_path = bucket_dir / f"bucket_{ i :0{digits }} .parquet"
397- self .conn .execute (
398- f"""
399- COPY (
400- SELECT *
401- FROM read_parquet('{ bucket_path } ')
402- QUALIFY ROW_NUMBER() OVER (
403- PARTITION BY fact ORDER BY priority, entry_date DESC, entry_number DESC
404- ) = 1
405- ) TO '{ result_paths [i ]} ' (FORMAT PARQUET);
406- """
407- )
408- logger .info (
409- f"Have { len (list (transformed_parquet_dir .glob ('result_*.parquet' )))} result files"
410- )
411-
412- # for path in bucket_paths:
413- # path.unlink(missing_ok=True)
421+ result_paths = self ._deduplicate_buckets (bucket_paths , result_dir )
422+ logger .info (f"Have { len (result_paths )} result files" )
414423
415424 process = psutil .Process (os .getpid ())
416- mem = process .memory_info ().rss / 1024 ** 2 # Memory in MB
425+ mem = process .memory_info ().rss / 1024 ** 2
417426 logger .info (f"[Memory usage] After 'result': { mem :.2f} MB" )
418427
419428 self .conn .execute (
420429 f"""
421- COPY(
422- SELECT * FROM
423- read_parquet('{ str (result_dir )} /result_*.parquet')
424- ) TO '{ str (output_path )} ' (FORMAT PARQUET);
425- """
430+ COPY (
431+ SELECT * FROM read_parquet('{ str (result_dir )} /result_*.parquet')
432+ ) TO '{ str (output_path )} ' (FORMAT PARQUET);
433+ """
426434 )
427435 logger .info (f"Facts saved to output_path: { output_path } " )
428436 logger .info (f"output_path exists: { os .path .exists (output_path )} " )
429- # for path in result_paths:
430- # path.unlink(missing_ok=True)
431437
432438 process = psutil .Process (os .getpid ())
433439 mem = process .memory_info ().rss / 1024 ** 2 # Memory in MB
0 commit comments