From b2d19a4ed07b9f60b0293f676a6e7cbedcf4ad57 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 30 Dec 2025 13:59:21 +0100 Subject: [PATCH 01/12] #677 Add batch id to bookkeeping records and make it accessible from metastore reader interfaces. --- .../co/absa/pramen/api/MetaTableRunInfo.scala | 1 + .../co/absa/pramen/api/MetastoreReader.scala | 3 +- .../pramen/core/bookkeeper/Bookkeeper.scala | 10 +++--- .../core/bookkeeper/BookkeeperBase.scala | 33 ++++++++++++++---- .../core/bookkeeper/BookkeeperDeltaBase.scala | 22 +++++++++--- .../core/bookkeeper/BookkeeperDeltaPath.scala | 2 +- .../bookkeeper/BookkeeperDeltaTable.scala | 5 +-- .../core/bookkeeper/BookkeeperHadoop.scala | 12 ++++--- .../core/bookkeeper/BookkeeperJdbc.scala | 32 +++++++++++++---- .../core/bookkeeper/BookkeeperMongoDb.scala | 34 ++++++++++++++----- .../core/bookkeeper/BookkeeperNull.scala | 4 ++- .../core/bookkeeper/BookkeeperText.scala | 16 ++++++--- .../bookkeeper/model/BookkeepingRecord.scala | 3 +- .../bookkeeper/model/BookkeepingRecords.scala | 3 +- .../core/metastore/MetastoreReaderBase.scala | 14 ++++++-- .../co/absa/pramen/core/model/DataChunk.scala | 3 +- .../za/co/absa/pramen/core/rdb/PramenDb.scala | 7 +++- .../core/metastore/MetastoreSuite.scala | 14 ++++---- .../pramen/core/mocks/DataChunkFactory.scala | 6 ++-- .../mocks/bookkeeper/SyncBookkeeperMock.scala | 24 ++++++++----- .../mocks/metastore/MetastoreReaderMock.scala | 2 +- .../core/mocks/metastore/MetastoreSpy.scala | 2 +- .../pramen/core/pipeline/SinkJobSuite.scala | 4 +-- .../core/pipeline/TransferJobSuite.scala | 4 +-- .../BookkeeperDeltaPathLongSuite.scala | 2 +- .../BookkeeperDeltaTableLongSuite.scala | 2 +- .../tests/bookkeeper/BookkeeperMemSuite.scala | 2 +- .../BookkeeperMongoDbLongSuite.scala | 2 +- .../bookkeeper/BookkeeperTextLongSuite.scala | 28 +++++++++++---- .../BookkeeperTransientSuiteEager.scala | 2 +- .../TaskRunnerMultithreadedSuite.scala | 4 +-- .../runner/task/TaskRunnerBaseSuite.scala | 4 +-- 32 files changed, 215 insertions(+), 91 deletions(-) diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala index addd969f2..36267477f 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala @@ -20,6 +20,7 @@ import java.time.{Instant, LocalDate} case class MetaTableRunInfo(tableName: String, infoDate: LocalDate, + batchId: Long, inputRecordCount: Long, outputRecordCount: Long, jobStarted: Instant, diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetastoreReader.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetastoreReader.scala index 1216bdeab..6957ad2c9 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetastoreReader.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetastoreReader.scala @@ -128,9 +128,10 @@ trait MetastoreReader { * * @param tableName The name of the table in the metastore. * @param infoDate The information date of the data. + * @param batchId An optional batch ID to filter by. * @return The run info of the table if available. */ - def getTableRunInfo(tableName: String, infoDate: LocalDate): Option[MetaTableRunInfo] + def getTableRunInfo(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[MetaTableRunInfo] /** * Returns the reason of running the task. 
This helps transformers and sinks to determine logic based on whether diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index b5dfee689..3f3af4240 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -42,6 +42,8 @@ trait Bookkeeper { def getLatestDataChunk(table: String, infoDate: LocalDate): Option[DataChunk] + def getDataChunks(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] + def getDataChunksCount(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long def getLatestSchema(table: String, until: LocalDate): Option[(StructType, LocalDate)] @@ -115,23 +117,23 @@ object Bookkeeper { mongoDbConnection match { case Some(connection) => log.info(s"Using MongoDB for bookkeeping.") - new BookkeeperMongoDb(connection) + new BookkeeperMongoDb(connection, batchId) case None => bookkeepingConfig.bookkeepingHadoopFormat match { case HadoopFormat.Text => val path = bookkeepingConfig.bookkeepingLocation.get log.info(s"Using Hadoop (CSV for records, JSON for schemas) for bookkeeping at $path") - new BookkeeperText(path) + new BookkeeperText(path, batchId) case HadoopFormat.Delta => bookkeepingConfig.deltaTablePrefix match { case Some(tablePrefix) => val fullTableName = BookkeeperDeltaTable.getFullTableName(bookkeepingConfig.deltaDatabase, tablePrefix, "*") log.info(s"Using Delta Lake managed table '$fullTableName' for bookkeeping.") - new BookkeeperDeltaTable(bookkeepingConfig.deltaDatabase, tablePrefix) + new BookkeeperDeltaTable(bookkeepingConfig.deltaDatabase, tablePrefix, batchId) case None => val path = bookkeepingConfig.bookkeepingLocation.get log.info(s"Using Delta Lake for bookkeeping at $path") - new BookkeeperDeltaPath(path) + new BookkeeperDeltaPath(path, batchId) } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala index c1afed024..4bfd69412 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala @@ -21,13 +21,15 @@ import za.co.absa.pramen.core.model.DataChunk import java.time.LocalDate import scala.collection.mutable -abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper { +abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) extends Bookkeeper { private val transientDataChunks = new mutable.HashMap[String, Array[DataChunk]]() def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate] = None): Option[LocalDate] def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] + def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] + def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long private[pramen] def saveRecordCountToStorage(table: String, @@ -46,7 +48,7 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper isTableTransient: Boolean): Unit = { if (isTableTransient || !isBookkeepingEnabled) { val tableLowerCase = table.toLowerCase - val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, 
infoDate.toString, inputRecordCount, outputRecordCount, jobStarted, jobFinished) + val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) this.synchronized { val dataChunks = transientDataChunks.getOrElse(tableLowerCase, Array.empty[DataChunk]) val newDataChunks = (dataChunks :+ dataChunk).sortBy(_.jobFinished) @@ -82,13 +84,25 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper } } + final def getDataChunks(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + val isTransient = this.synchronized { + transientDataChunks.contains(table.toLowerCase) + } + + if (isTransient || !isBookkeepingEnabled) { + getTransientDataChunks(table, Option(infoDate), Option(infoDate), batchId) + } else { + getDataChunksFromStorage(table, infoDate, batchId) + } + } + final def getDataChunksCount(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { val isTransient = this.synchronized { transientDataChunks.contains(table.toLowerCase) } if (isTransient || !isBookkeepingEnabled) { - getTransientDataChunks(table, dateBeginOpt, dateEndOpt).length + getTransientDataChunks(table, dateBeginOpt, dateEndOpt, None).length } else { getDataChunksCountFromStorage(table, dateBeginOpt, dateEndOpt) } @@ -100,7 +114,7 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper } private def getLatestTransientDate(table: String, from: Option[LocalDate], until: Option[LocalDate]): Option[LocalDate] = { - val chunks = getTransientDataChunks(table, from, until) + val chunks = getTransientDataChunks(table, from, until, None) if (chunks.isEmpty) { None @@ -112,12 +126,12 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper } private def getLatestTransientChunk(table: String, from: Option[LocalDate], until: Option[LocalDate]): Option[DataChunk] = { - getTransientDataChunks(table, from, until) + getTransientDataChunks(table, from, until, None) .lastOption } - private[core] def getTransientDataChunks(table: String, from: Option[LocalDate], until: Option[LocalDate]): Array[DataChunk] = { + private[core] def getTransientDataChunks(table: String, from: Option[LocalDate], until: Option[LocalDate], batchId: Option[Long]): Array[DataChunk] = { val minDate = from.map(_.toString).getOrElse("0000-00-00") val maxDate = until.map(_.toString).getOrElse("9999-99-99") val tableLowerCase = table.toLowerCase @@ -125,7 +139,12 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean) extends Bookkeeper transientDataChunks.getOrElse(tableLowerCase, Array.empty[DataChunk]) } - allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate) + batchId match { + case Some(id) => + allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate && chunk.batchId == id) + case None => + allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate) + } } protected def getDateStr(date: LocalDate): String = DataChunk.dateFormatter.format(date) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala index 2638e97da..6651eb2e7 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala @@ -26,7 
+26,7 @@ import java.time.LocalDate import scala.reflect.ClassTag import scala.reflect.runtime.universe -abstract class BookkeeperDeltaBase extends BookkeeperHadoop { +abstract class BookkeeperDeltaBase(batchId: Long) extends BookkeeperHadoop(batchId) { def getBkDf(filter: Column): Dataset[DataChunk] @@ -60,13 +60,19 @@ abstract class BookkeeperDeltaBase extends BookkeeperHadoop { } final override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { - val infoDateFilter = getFilter(table, Option(infoDate), Option(infoDate)) + val infoDateFilter = getFilter(table, Option(infoDate), Option(infoDate), None) getBkData(infoDateFilter).lastOption } + final override def getDataChunksFromStorage(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + val infoDateFilter = getFilter(tableName, Option(infoDate), Option(infoDate), batchId) + + getBkAllData(infoDateFilter) + } + final def getDataChunksCountFromStorage(table: String, dateBegin: Option[LocalDate], dateEnd: Option[LocalDate]): Long = { - getBkDf(getFilter(table, dateBegin, dateEnd)).count() + getBkDf(getFilter(table, dateBegin, dateEnd, None)).count() } final private[pramen] override def saveRecordCountToStorage(table: String, @@ -77,13 +83,13 @@ abstract class BookkeeperDeltaBase extends BookkeeperHadoop { jobFinished: Long): Unit = { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) saveRecordCountDelta(chunk) } final override def getLatestSchema(table: String, until: LocalDate): Option[(StructType, LocalDate)] = { - val filter = getFilter(table, None, Option(until)) + val filter = getFilter(table, None, Option(until), None) val df = getSchemasDeltaDf @@ -103,6 +109,12 @@ abstract class BookkeeperDeltaBase extends BookkeeperHadoop { saveSchemaDelta(tableSchema) } + private[core] def getBkAllData(filter: Column): Seq[DataChunk] = { + getBkDf(filter) + .collect() + .sortBy(_.infoDate) + } + private[core] def getBkData(filter: Column): Seq[DataChunk] = { getBkDf(filter) .collect() diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala index 952418f93..e6126bfa4 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala @@ -34,7 +34,7 @@ object BookkeeperDeltaPath { val locksDirName = "locks" } -class BookkeeperDeltaPath(bookkeepingPath: String)(implicit spark: SparkSession) extends BookkeeperDeltaBase { +class BookkeeperDeltaPath(bookkeepingPath: String, batchId: Long)(implicit spark: SparkSession) extends BookkeeperDeltaBase(batchId) { import BookkeeperDeltaPath._ import spark.implicits._ diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index b93289600..b4f05a462 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -38,8 +38,9 @@ object BookkeeperDeltaTable { } class BookkeeperDeltaTable(database: 
Option[String], - tablePrefix: String) - (implicit spark: SparkSession) extends BookkeeperDeltaBase { + tablePrefix: String, + batchId: Long) + (implicit spark: SparkSession) extends BookkeeperDeltaBase(batchId) { import BookkeeperDeltaTable._ import spark.implicits._ diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperHadoop.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperHadoop.scala index 9fa577215..a971f7123 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperHadoop.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperHadoop.scala @@ -18,14 +18,13 @@ package za.co.absa.pramen.core.bookkeeper import org.apache.spark.sql.Column import org.apache.spark.sql.functions._ -import za.co.absa.pramen.core.model.DataChunk import java.time.LocalDate -abstract class BookkeeperHadoop extends BookkeeperBase(true) { - private[core] def getFilter(tableName: String, infoDateBegin: Option[LocalDate], infoDateEnd: Option[LocalDate]): Column = { - (infoDateBegin, infoDateEnd) match { +abstract class BookkeeperHadoop(batchId: Long) extends BookkeeperBase(true, batchId) { + private[core] def getFilter(tableName: String, infoDateBegin: Option[LocalDate], infoDateEnd: Option[LocalDate], batchId: Option[Long]): Column = { + val baseFilter = (infoDateBegin, infoDateEnd) match { case (Some(begin), Some(end)) => val beginStr = getDateStr(begin) val endStr = getDateStr(end) @@ -39,5 +38,10 @@ abstract class BookkeeperHadoop extends BookkeeperBase(true) { case (None, None) => col("tableName") === tableName } + + batchId match { + case Some(id) => baseFilter && col("batchId") === lit(id) + case None => baseFilter + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index 4d077dc5f..639cb36b1 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -29,7 +29,7 @@ import za.co.absa.pramen.core.utils.SlickUtils import java.time.LocalDate import scala.util.control.NonFatal -class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { +class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, batchId) { import za.co.absa.pramen.core.utils.FutureImplicits._ private val log = LoggerFactory.getLogger(this.getClass) @@ -67,8 +67,21 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { } } + override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + val query = getFilter(table, Option(infoDate), Option(infoDate), batchId) + + try { + SlickUtils.executeQuery[BookkeepingRecords, BookkeepingRecord](db, query) + .map(toChunk) + .toArray[DataChunk] + .sortBy(_.jobFinished) + } catch { + case NonFatal(ex) => throw new RuntimeException(s"Unable to read from the bookkeeping table.", ex) + } + } + override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { - val query = getFilter(table, Option(infoDate), Option(infoDate)) + val query = getFilter(table, Option(infoDate), Option(infoDate), None) .sortBy(r => r.jobFinished.desc) .take(1) @@ -86,7 +99,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { } def getDataChunksCountFromStorage(table: String, 
dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { - val query = getFilter(table, dateBeginOpt, dateEndOpt) + val query = getFilter(table, dateBeginOpt, dateEndOpt, None) .length val count = try { @@ -106,7 +119,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished) + val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) try { db.run( @@ -123,11 +136,11 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { private def toChunk(r: BookkeepingRecord): DataChunk = { DataChunk( - r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished) + r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished, r.batchId) } - private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate]): Query[BookkeepingRecords, BookkeepingRecord, Seq] = { - (infoDateBeginOpt, infoDateEndOpt) match { + private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Query[BookkeepingRecords, BookkeepingRecord, Seq] = { + val baseFilter = (infoDateBeginOpt, infoDateEndOpt) match { case (Some(infoDateBegin), Some(infoDateEnd)) => val date0Str = DataChunk.dateFormatter.format(infoDateBegin) val date1Str = DataChunk.dateFormatter.format(infoDateEnd) @@ -153,6 +166,11 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true) { BookkeepingRecords.records .filter(r => r.pramenTableName === tableName) } + + batchId match { + case Some(id) => baseFilter.filter(r => r.batchId === id) + case None => baseFilter + } } override def getLatestSchema(table: String, infoDate: LocalDate): Option[(StructType, LocalDate)] = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index f9678d8f8..d18145dac 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -37,10 +37,10 @@ object BookkeeperMongoDb { val collectionName = "bookkeeping" val schemaCollectionName = "schemas" - val MODEL_VERSION = 2 + val MODEL_VERSION = 3 } -class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) extends BookkeeperBase(true) { +class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) extends BookkeeperBase(true, batchId) { import BookkeeperMongoDb._ import za.co.absa.pramen.core.dao.ScalaMongoImplicits._ @@ -79,7 +79,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) extends Bookkeeper } override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { - val infoDateFilter = getFilter(table, Option(infoDate), Option(infoDate)) + val infoDateFilter = getFilter(table, Option(infoDate), Option(infoDate), None) collection.find(infoDateFilter) .sort(Sorts.descending("jobFinished")) @@ -89,7 +89,14 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) 
extends Bookkeeper } def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { - collection.countDocuments(getFilter(table, dateBeginOpt, dateEndOpt)).execute() + collection.countDocuments(getFilter(table, dateBeginOpt, dateEndOpt, None)).execute() + } + + override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + val chunks = collection.find(getFilter(table, Option(infoDate), Option(infoDate), batchId)).execute() + .sortBy(_.jobFinished) + log.info(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") + chunks } private[pramen] override def saveRecordCountToStorage(table: String, @@ -100,14 +107,13 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) extends Bookkeeper jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) - val opts = (new ReplaceOptions).upsert(true) - collection.replaceOne(getFilter(table, Option(infoDate), Option(infoDate)), chunk, opts).execute() + collection.insertOne(chunk).execute() } - private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate]): Bson = { - (infoDateBeginOpt, infoDateEndOpt) match { + private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Bson = { + val baseFilter = (infoDateBeginOpt, infoDateEndOpt) match { case (Some(infoDateBegin), Some(infoDateEnd)) => val date0Str = DataChunk.dateFormatter.format(infoDateBegin) @@ -139,6 +145,10 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) extends Bookkeeper Filters.eq("tableName", tableName) } + batchId match { + case Some(id) => Filters.and(baseFilter, Filters.eq("batchId", id)) + case None => baseFilter + } } private def getSchemaGetFilter(tableName: String, until: LocalDate): Bson = { @@ -165,6 +175,12 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection) extends Bookkeeper d.createCollection(schemaCollectionName) d.createIndex(schemaCollectionName, IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil, unique = true) } + if (dbVersion < 3) { + val keys = IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil + // Make the bookkeeping index non-unique + d.dropIndex(collectionName, keys) + d.createIndex(collectionName, keys) + } if (dbVersion < MODEL_VERSION) { d.setVersion(MODEL_VERSION) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala index b5c9836f4..1ed0a7119 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala @@ -26,13 +26,15 @@ import java.time.LocalDate * this bookkeeper implementation is used. It always returns the state as if every table is new and no information is * available. 
*/ -class BookkeeperNull() extends BookkeeperBase(false) { +class BookkeeperNull() extends BookkeeperBase(false, 0L) { override val bookkeepingEnabled: Boolean = false override def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate]): Option[LocalDate] = None override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = None + override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = Nil + override def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = 0 private[pramen] override def saveRecordCountToStorage(table: String, diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index 203859d92..4b8292ac1 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -38,7 +38,7 @@ object BookkeeperText { val locksDirName = "locks" } -class BookkeeperText(bookkeepingPath: String)(implicit spark: SparkSession) extends BookkeeperHadoop { +class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: SparkSession) extends BookkeeperHadoop(batchId) { import BookkeeperText._ import spark.implicits._ @@ -77,12 +77,18 @@ class BookkeeperText(bookkeepingPath: String)(implicit spark: SparkSession) exte } } + override def getDataChunksFromStorage(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + val infoDateFilter = getFilter(tableName, Option(infoDate), Option(infoDate), batchId) + + getData(infoDateFilter) + } + override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { - getData(getFilter(table, Option(infoDate), Option(infoDate))).lastOption + getData(getFilter(table, Option(infoDate), Option(infoDate), None)).lastOption } def getDataChunksCountFromStorage(table: String, dateBegin: Option[LocalDate], dateEnd: Option[LocalDate]): Long = { - getDf(getFilter(table, dateBegin, dateEnd)).count() + getDf(getFilter(table, dateBegin, dateEnd, None)).count() } private[pramen] override def saveRecordCountToStorage(table: String, @@ -96,7 +102,7 @@ class BookkeeperText(bookkeepingPath: String)(implicit spark: SparkSession) exte try { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) val csv = CsvUtils.getRecord(chunk, '|') fsUtils.appendFile(bkFilePath, csv) @@ -160,7 +166,7 @@ class BookkeeperText(bookkeepingPath: String)(implicit spark: SparkSession) exte } override def getLatestSchema(table: String, until: LocalDate): Option[(StructType, LocalDate)] = { - val filter = getFilter(table, None, Option(until)) + val filter = getFilter(table, None, Option(until), None) val df = spark .read diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala index 97db34902..50fe46e43 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala +++ 
b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala @@ -23,7 +23,8 @@ case class BookkeepingRecord(pramenTableName: String, inputRecordCount: Long, outputRecordCount: Long, jobStarted: Long, - jobFinished: Long) + jobFinished: Long, + batchId: Long) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala index be9aac585..7eb7eff74 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala @@ -28,9 +28,10 @@ class BookkeepingRecords(tag: Tag) extends Table[BookkeepingRecord](tag, "bookke def outputRecordCount = column[Long]("output_record_count") def jobStarted = column[Long]("job_started") def jobFinished = column[Long]("job_finished") + def batchId = column[Long]("batch_id") def * = (pramenTableName, infoDate, infoDateBegin, infoDateEnd, inputRecordCount, outputRecordCount, - jobStarted, jobFinished) <> (BookkeepingRecord.tupled, BookkeepingRecord.unapply) + jobStarted, jobFinished, batchId) <> (BookkeepingRecord.tupled, BookkeepingRecord.unapply) def idx1 = index("bk_idx_1", (pramenTableName, infoDate), unique = false) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala index 4673984e4..91c30cdd5 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala @@ -70,10 +70,18 @@ abstract class MetastoreReaderBase(metastore: Metastore, MetaTable.getMetaTableDef(metastore.getTableDef(tableName)) } - override def getTableRunInfo(tableName: String, infoDate: LocalDate): Option[MetaTableRunInfo] = { - bookkeeper.getLatestDataChunk(tableName, infoDate) + override def getTableRunInfo(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[MetaTableRunInfo] = { + bookkeeper.getDataChunks(tableName, infoDate, batchId) .map(chunk => - MetaTableRunInfo(tableName, LocalDate.parse(chunk.infoDate), chunk.inputRecordCount, chunk.outputRecordCount, Instant.ofEpochSecond(chunk.jobStarted), Instant.ofEpochSecond(chunk.jobFinished)) + MetaTableRunInfo( + tableName, + LocalDate.parse(chunk.infoDate), + chunk.batchId, + chunk.inputRecordCount, + chunk.outputRecordCount, + Instant.ofEpochSecond(chunk.jobStarted), + Instant.ofEpochSecond(chunk.jobFinished) + ) ) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala index 98a8a0d32..25d472001 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala @@ -25,7 +25,8 @@ case class DataChunk(tableName: String, inputRecordCount: Long, outputRecordCount: Long, jobStarted: Long, - jobFinished: Long) + jobFinished: Long, + batchId: Long) object DataChunk { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala index 9ca535138..24d082303 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala +++ 
b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala @@ -89,9 +89,14 @@ class PramenDb(val jdbcConfig: JdbcConfig, if (0 < dbVersion && dbVersion < 7) { addColumn(LockTickets.lockTickets.baseTableRow.tableName, "created_at", "bigint") } + if (0 < dbVersion && dbVersion < 8) { addColumn(JournalTasks.journalTasks.baseTableRow.tableName, "country", "varchar(50)") } + + if (0 < dbVersion && dbVersion < 9) { + addColumn(BookkeepingRecords.records.baseTableRow.tableName, "batch_id", "bigint") + } } def initTable(schema: H2Profile.SchemaDescription): Unit = { @@ -127,7 +132,7 @@ } object PramenDb { - val MODEL_VERSION = 8 + val MODEL_VERSION = 9 val DEFAULT_RETRIES = 3 def apply(jdbcConfig: JdbcConfig): PramenDb = { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/metastore/MetastoreSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/metastore/MetastoreSuite.scala index 3e4865ed5..2c6d13a86 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/metastore/MetastoreSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/metastore/MetastoreSuite.scala @@ -249,7 +249,7 @@ class MetastoreSuite extends AnyWordSpec with SparkTestBase with TextComparisonF val df1 = m.getTable("table1", Some(infoDate), Some(infoDate)) assert(df1.count() == 3) - assert(b.getDataChunks("table1", infoDate, infoDate).nonEmpty) + assert(b.getDataChunks("table1", infoDate, None).nonEmpty) } } @@ -262,7 +262,7 @@ class MetastoreSuite extends AnyWordSpec with SparkTestBase with TextComparisonF val df1 = m.getTable("table1", Some(infoDate), Some(infoDate)) assert(df1.count() == 3) - assert(b.getDataChunks("table1", infoDate, infoDate).isEmpty) + assert(b.getDataChunks("table1", infoDate, None).isEmpty) } } } @@ -422,14 +422,14 @@ class MetastoreSuite extends AnyWordSpec with SparkTestBase with TextComparisonF m.saveTable("table1", infoDate, getDf) val reader = m.getMetastoreReader("table1" :: Nil, "output_table", infoDate, TaskRunReason.New, ReaderMode.Batch) - val runInfo1 = reader.getTableRunInfo("table1", infoDate) - val runInfo2 = reader.getTableRunInfo("table1", infoDate.plusDays(1)) + val runInfo1 = reader.getTableRunInfo("table1", infoDate, None) + val runInfo2 = reader.getTableRunInfo("table1", infoDate.plusDays(1), None) - assert(runInfo1.isDefined) + assert(runInfo1.nonEmpty) assert(runInfo2.isEmpty) - assert(runInfo1.get.tableName == "table1") - assert(runInfo1.get.infoDate == infoDate) + assert(runInfo1.head.tableName == "table1") + assert(runInfo1.head.infoDate == infoDate) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala index 1619a7662..76d33a794 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala @@ -24,7 +24,8 @@ object DataChunkFactory { inputRecordCount: Long = 1000, outputRecordCount: Long = 1000, jobStarted: Long = 10000, - jobFinished: Long = 20000): DataChunk = { + jobFinished: Long = 20000, + batchId: Long = 123L): DataChunk = { DataChunk(tableName = tableName, infoDate = infoDate, infoDateBegin = infoDate, @@ -32,6 +33,7 @@ object DataChunkFactory { inputRecordCount = inputRecordCount, outputRecordCount = outputRecordCount, jobStarted = jobStarted, - jobFinished = jobFinished) + jobFinished = jobFinished, + batchId = batchId) } } diff --git 
a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index d18557a7e..105a96a14 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -24,7 +24,7 @@ import java.time.LocalDate import scala.collection.mutable import scala.util.Try -class SyncBookkeeperMock extends Bookkeeper { +class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { private val chunks = new mutable.HashMap[(String, LocalDate), DataChunk]() private val schemas = new mutable.ListBuffer[(String, (LocalDate, TableSchema))]() @@ -53,15 +53,22 @@ class SyncBookkeeperMock extends Bookkeeper { } override def getLatestDataChunk(table: String, infoDate: LocalDate): Option[DataChunk] = { - getDataChunks(table, infoDate, infoDate).lastOption + getDataChunks(table, infoDate, None).lastOption } - def getDataChunks(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataChunk] = { - chunks.toList.flatMap { case ((tblName, infoDate), chunk) => - val isInsidePeriod = tblName == table && (infoDate.isAfter(dateBegin) || infoDate.equals(dateBegin)) && - (infoDate.isBefore(dateEnd) || infoDate.equals(dateEnd)) + override def getDataChunks(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { + chunks.toList.flatMap { case ((tblName, date), chunk) => + val isInsidePeriod = tblName == table && date.equals(infoDate) if (isInsidePeriod) { - Some(chunk) + batchId match { + case Some(id) => + if (chunk.batchId == id) { + Some(chunk) + } else { + None + } + case None => Some(chunk) + } } else { None } @@ -99,7 +106,8 @@ class SyncBookkeeperMock extends Bookkeeper { inputRecordCount, outputRecordCount, jobStarted, - jobFinished) + jobFinished, + batchId) chunks += (table, infoDate) -> chunk } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreReaderMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreReaderMock.scala index d162134e1..06db156c6 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreReaderMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreReaderMock.scala @@ -65,7 +65,7 @@ class MetastoreReaderMock(tables: Seq[(String, DataFrame)], infoDate: LocalDate) } } - override def getTableRunInfo(tableName: String, infoDate: LocalDate): Option[MetaTableRunInfo] = None + override def getTableRunInfo(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[MetaTableRunInfo] = Seq.empty override def metadataManager: MetadataManager = metadata diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreSpy.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreSpy.scala index ef392c011..6f1d6c533 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreSpy.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metastore/MetastoreSpy.scala @@ -164,7 +164,7 @@ class MetastoreSpy(registeredTables: Seq[String] = Seq("table1", "table2"), table.writeOptions) } - override def getTableRunInfo(tableName: String, infoDate: LocalDate): Option[MetaTableRunInfo] = None + override def getTableRunInfo(tableName: String, infoDate: LocalDate, batchId: Option[Long]): Seq[MetaTableRunInfo] 
= Seq.empty override def getRunReason: TaskRunReason = TaskRunReason.New diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/SinkJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/SinkJobSuite.scala index 1192ffa9f..fe1bc91c3 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/SinkJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/SinkJobSuite.scala @@ -193,9 +193,9 @@ class SinkJobSuite extends AnyWordSpec with SparkTestBase with TextComparisonFix val outputTable = "table1->mysink" - assert(bk.getDataChunks(outputTable, infoDate, infoDate).nonEmpty) + assert(bk.getDataChunks(outputTable, infoDate, None).nonEmpty) - val chunk = bk.getDataChunks(outputTable, infoDate, infoDate).head + val chunk = bk.getDataChunks(outputTable, infoDate, None).head assert(chunk.inputRecordCount == 10) assert(chunk.outputRecordCount == 3) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala index f120d444a..e54f0de0c 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala @@ -236,9 +236,9 @@ class TransferJobSuite extends AnyWordSpec with SparkTestBase with TextCompariso val outputTable = "table1->sink" - assert(bk.getDataChunks(outputTable, infoDate, infoDate).nonEmpty) + assert(bk.getDataChunks(outputTable, infoDate, None).nonEmpty) - val chunk = bk.getDataChunks(outputTable, infoDate, infoDate).head + val chunk = bk.getDataChunks(outputTable, infoDate, None).head assert(chunk.inputRecordCount == 10) assert(chunk.outputRecordCount == 3) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala index f7cb07561..3eab1bdcd 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala @@ -39,7 +39,7 @@ class BookkeeperDeltaPathLongSuite extends BookkeeperCommonSuite with SparkTestB } def getBookkeeper: BookkeeperDeltaPath = { - new BookkeeperDeltaPath(tmpDir) + new BookkeeperDeltaPath(tmpDir, 123L) } "BookkeeperHadoopDeltaPath" when { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala index 66de740be..bad7cf09d 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala @@ -35,7 +35,7 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest } def getBookkeeper(prefix: String): BookkeeperDeltaTable = { - new BookkeeperDeltaTable(None, prefix) + new BookkeeperDeltaTable(None, prefix, 123L) } "BookkeeperHadoopDeltaTable" when { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala index 2a88264b7..e57af94d6 100644 --- 
a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala @@ -93,7 +93,7 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false) bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false) - val chunks = bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table", infoDate1, infoDate1) + val chunks = bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table", infoDate1, None) assert(chunks.size == 1) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala index 8561ce1e0..83a81666e 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala @@ -37,7 +37,7 @@ class BookkeeperMongoDbLongSuite extends BookkeeperCommonSuite with MongoDbFixtu } def getBookkeeper: Bookkeeper = { - new BookkeeperMongoDb(connection) + new BookkeeperMongoDb(connection, 123L) } if (db != null) { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala index fa9991fe7..6f82f87c9 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala @@ -19,7 +19,7 @@ package za.co.absa.pramen.core.tests.bookkeeper import org.apache.hadoop.fs.Path import org.scalatest.BeforeAndAfter import za.co.absa.pramen.core.base.SparkTestBase -import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperText} +import za.co.absa.pramen.core.bookkeeper.BookkeeperText import za.co.absa.pramen.core.fixtures.TempDirFixture import za.co.absa.pramen.core.utils.FsUtils @@ -42,7 +42,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } def getBookkeeper: BookkeeperText = { - new BookkeeperText(tmpDir) + new BookkeeperText(tmpDir, 123L) } "BookkeeperHadoopText" when { @@ -62,15 +62,23 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w "get a ranged filter" in { val bk = getBookkeeper - val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), Some(LocalDate.of(2021, 1, 2))).toString() + val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), Some(LocalDate.of(2021, 1, 2)), None).toString() assert(actual == "(((tableName = table1) AND (infoDate >= 2021-01-01)) AND (infoDate <= 2021-01-02))") } + "get a ranged filter with batch id" in { + val bk = getBookkeeper + + val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), Some(LocalDate.of(2021, 1, 2)), Some(123L)).toString() + + assert(actual == "((((tableName = table1) AND (infoDate >= 2021-01-01)) AND (infoDate <= 2021-01-02)) AND (batchId = 123))") + } + "get a from filter" in { val bk = getBookkeeper - val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), None).toString() + val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), None, None).toString() assert(actual == "((tableName = 
table1) AND (infoDate >= 2021-01-01))") } @@ -78,15 +86,23 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w "get a to filter" in { val bk = getBookkeeper - val actual = bk.getFilter("table1", None, Some(LocalDate.of(2021, 1, 2))).toString() + val actual = bk.getFilter("table1", None, Some(LocalDate.of(2021, 1, 2)), None).toString() assert(actual == "((tableName = table1) AND (infoDate <= 2021-01-02))") } + "get a batchid filter" in { + val bk = getBookkeeper + + val actual = bk.getFilter("table1", None, None, Some(123L)).toString() + + assert(actual == "((tableName = table1) AND (batchId = 123))") + } + "get a table filter" in { val bk = getBookkeeper - val actual = bk.getFilter("table1", None, None).toString() + val actual = bk.getFilter("table1", None, None, None).toString() assert(actual == "(tableName = table1)") } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala index 441e4bfb8..ea8c5a9e9 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala @@ -71,7 +71,7 @@ class BookkeeperTransientSuiteEager extends AnyWordSpec { val bk = getBookkeeper bk.setRecordCount("table1", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true) - val chunks = bk.getTransientDataChunks("table1", Option(infoDate2), Option(infoDate2)) + val chunks = bk.getTransientDataChunks("table1", Option(infoDate2), Option(infoDate2), None) assert(chunks.length == 1) assert(chunks.head.infoDate == infoDate2.toString) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala index d7c520828..545b70180 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala @@ -83,7 +83,7 @@ class TaskRunnerMultithreadedSuite extends AnyWordSpec with SparkTestBase { assert(results.head.runStatus.isInstanceOf[Failed]) assert(results.head.runInfo.get.infoDate == runDate) - assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", runDate, runDate).isEmpty) + assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", runDate, None).isEmpty) } "handle a successful multiple task job sequential execution" in { @@ -109,7 +109,7 @@ class TaskRunnerMultithreadedSuite extends AnyWordSpec with SparkTestBase { assert(results.head.runStatus.isInstanceOf[Failed]) assert(results.head.runInfo.get.infoDate == runDate) - assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", runDate, runDate).isEmpty) + assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", runDate, None).isEmpty) } "run job even if it is asking for more resources than maximum available" in { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/task/TaskRunnerBaseSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/task/TaskRunnerBaseSuite.scala index 4aaaab653..b867778bd 100644 --- 
a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/task/TaskRunnerBaseSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/task/TaskRunnerBaseSuite.scala @@ -498,7 +498,7 @@ class TaskRunnerBaseSuite extends AnyWordSpec with SparkTestBase with TextCompar val failure = result.runStatus.asInstanceOf[Failed] assert(failure.ex.getMessage == "TestException") - assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", infoDate, infoDate).isEmpty) + assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", infoDate, None).isEmpty) } "handle a dry run" in { @@ -514,7 +514,7 @@ class TaskRunnerBaseSuite extends AnyWordSpec with SparkTestBase with TextCompar assert(success.recordCount.contains(2)) assert(success.sizeBytes.isEmpty) - assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", infoDate, infoDate).isEmpty) + assert(bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table_out", infoDate, None).isEmpty) } "expose Hive table" in { From 5e42a39bc4d626f6906c67db5e39bd84229827e1 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 31 Dec 2025 09:16:54 +0100 Subject: [PATCH 02/12] Bump up the minor version number because of interface changes. --- pramen/api/pom.xml | 2 +- pramen/core/pom.xml | 2 +- pramen/extras/pom.xml | 2 +- pramen/pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pramen/api/pom.xml b/pramen/api/pom.xml index 3d6dd1efc..0c9d76deb 100644 --- a/pramen/api/pom.xml +++ b/pramen/api/pom.xml @@ -27,7 +27,7 @@ za.co.absa.pramen pramen - 1.12.14-SNAPSHOT + 1.13.0-SNAPSHOT diff --git a/pramen/core/pom.xml b/pramen/core/pom.xml index c70260f79..1552b62e7 100644 --- a/pramen/core/pom.xml +++ b/pramen/core/pom.xml @@ -27,7 +27,7 @@ za.co.absa.pramen pramen - 1.12.14-SNAPSHOT + 1.13.0-SNAPSHOT diff --git a/pramen/extras/pom.xml b/pramen/extras/pom.xml index a2b26c464..c5508f36f 100644 --- a/pramen/extras/pom.xml +++ b/pramen/extras/pom.xml @@ -27,7 +27,7 @@ za.co.absa.pramen pramen - 1.12.14-SNAPSHOT + 1.13.0-SNAPSHOT diff --git a/pramen/pom.xml b/pramen/pom.xml index 7af348b32..ede50c623 100644 --- a/pramen/pom.xml +++ b/pramen/pom.xml @@ -22,7 +22,7 @@ za.co.absa.pramen pramen - 1.12.14-SNAPSHOT + 1.13.0-SNAPSHOT pom Pramen From a6e88e1224886bcd66736a1d805df40a19af1293 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 31 Dec 2025 10:31:30 +0100 Subject: [PATCH 03/12] #677 Add batch id to the journal table. 
--- .../src/main/scala/za/co/absa/pramen/api/PipelineInfo.scala | 3 ++- .../za/co/absa/pramen/core/journal/JournalHadoopCsv.scala | 4 +++- .../scala/za/co/absa/pramen/core/journal/JournalJdbc.scala | 6 ++++-- .../za/co/absa/pramen/core/journal/model/JournalTask.scala | 3 ++- .../za/co/absa/pramen/core/journal/model/JournalTasks.scala | 3 ++- .../co/absa/pramen/core/journal/model/TaskCompleted.scala | 6 ++++-- .../absa/pramen/core/journal/model/TaskCompletedCsv.scala | 3 ++- .../main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala | 1 + .../za/co/absa/pramen/core/state/PipelineStateImpl.scala | 3 ++- .../za/co/absa/pramen/core/mocks/PipelineInfoFactory.scala | 5 +++-- .../za/co/absa/pramen/core/mocks/TaskCompletedFactory.scala | 6 ++++-- .../za/co/absa/pramen/core/tests/journal/TestCases.scala | 6 +++--- .../notification/EcsPipelineNotificationTargetSuite.scala | 2 +- 13 files changed, 33 insertions(+), 18 deletions(-) diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/PipelineInfo.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/PipelineInfo.scala index 0bd40f305..c98d9e1cd 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/PipelineInfo.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/PipelineInfo.scala @@ -33,5 +33,6 @@ case class PipelineInfo( pipelineNotificationFailures: Seq[PipelineNotificationFailure], pipelineId: String, tenant: Option[String], - country: Option[String] + country: Option[String], + batchId: Long ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala index fc22caf01..1ec05b92e 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala @@ -88,7 +88,8 @@ class JournalHadoopCsv(journalPath: String) pipelineName = v.pipelineName, environmentName = v.environmentName, tenant = v.tenant, - country = v.country + country = v.country, + batchId = v.batchId )) } @@ -125,6 +126,7 @@ class JournalHadoopCsv(journalPath: String) t.environmentName.getOrElse("") :: t.tenant.getOrElse("") :: t.country.getOrElse("") :: + t.batchId :: Nil record.mkString("", s"$separator", "\n") } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala index ea3c69ced..90dfd521f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala @@ -56,7 +56,8 @@ class JournalJdbc(db: Database) extends Journal { entry.pipelineName, entry.environmentName, entry.tenant, - entry.country) + entry.country, + entry.batchId) try { db.run( @@ -97,7 +98,8 @@ class JournalJdbc(db: Database) extends Journal { pipelineName = v.pipelineName, environmentName = v.environmentName, tenant = v.tenant, - country = v.country + country = v.country, + batchId = v.batchId ) }).toList } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala index b1aca8b38..8a6ac2d4b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala @@ -37,5 +37,6 @@ case class JournalTask( pipelineName: Option[String], 
environmentName: Option[String], tenant: Option[String], - country: Option[String] + country: Option[String], + batchId: Long ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala index 246fcf1fc..770fd61ed 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala @@ -41,11 +41,12 @@ class JournalTasks(tag: Tag) extends Table[JournalTask](tag, "journal") { def environmentName = column[Option[String]]("environmentName", O.Length(128)) def tenant = column[Option[String]]("tenant", O.Length(200)) def country = column[Option[String]]("country", O.Length(50)) + def batchId = column[Long]("batch_id") def * = (jobName, pramenTableName, periodBegin, periodEnd, informationDate, inputRecordCount, inputRecordCountOld, outputRecordCount, outputRecordCountOld, appendedRecordCount, outputSize, startedAt, finishedAt, status, failureReason, sparkApplicationId, pipelineId, pipelineName, environmentName, - tenant, country) <> (JournalTask.tupled, JournalTask.unapply) + tenant, country, batchId) <> (JournalTask.tupled, JournalTask.unapply) def idx1 = index("idx_started_at", startedAt, unique = false) def idx2 = index("idx_finished_at", finishedAt, unique = false) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompleted.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompleted.scala index ae005e966..1fc2fb969 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompleted.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompleted.scala @@ -44,7 +44,8 @@ case class TaskCompleted( pipelineName: Option[String], environmentName: Option[String], tenant: Option[String], - country: Option[String] + country: Option[String], + batchId: Long ) object TaskCompleted { @@ -86,7 +87,8 @@ object TaskCompleted { Option(pipelineInfo.pipelineName), Option(pipelineInfo.environment), pipelineInfo.tenant, - pipelineInfo.country + pipelineInfo.country, + pipelineInfo.batchId ) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala index b0a2989ad..8340f3e3a 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala @@ -37,5 +37,6 @@ case class TaskCompletedCsv( pipelineName: Option[String], environmentName: Option[String], tenant: Option[String], - country: Option[String] + country: Option[String], + batchId: Long ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala index 24d082303..b966903a6 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala @@ -96,6 +96,7 @@ class PramenDb(val jdbcConfig: JdbcConfig, if (0 < dbVersion && dbVersion < 9) { addColumn(BookkeepingRecords.records.baseTableRow.tableName, "batch_id", "bigint") + addColumn(JournalTasks.journalTasks.baseTableRow.tableName, "batch_id", "bigint") } } diff --git 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/state/PipelineStateImpl.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/state/PipelineStateImpl.scala index fc65e1db9..592ffd477 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/state/PipelineStateImpl.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/state/PipelineStateImpl.scala @@ -139,7 +139,8 @@ class PipelineStateImpl(implicit conf: Config, notificationBuilder: Notification pipelineNotificationFailures.toSeq, pipelineId, tenant, - country + country, + batchId ) } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/PipelineInfoFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/PipelineInfoFactory.scala index 4712b22e6..206e2f723 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/PipelineInfoFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/PipelineInfoFactory.scala @@ -34,7 +34,8 @@ object PipelineInfoFactory { pipelineNotificationFailures: Seq[PipelineNotificationFailure] = Seq.empty, pipelineId: String = "dummy_pipeline_id", tenant: Option[String] = Some("Dummy tenant"), - country: Option[String] = Some("noname")): PipelineInfo = { - PipelineInfo(pipelineName, environment, runtimeInfo, startedAt, finishedAt, warningFlag, sparkApplicationId, pipelineStatus, failureException, pipelineNotificationFailures, pipelineId, tenant, country) + country: Option[String] = Some("noname"), + batchId: Long = 123L): PipelineInfo = { + PipelineInfo(pipelineName, environment, runtimeInfo, startedAt, finishedAt, warningFlag, sparkApplicationId, pipelineStatus, failureException, pipelineNotificationFailures, pipelineId, tenant, country, batchId) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/TaskCompletedFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/TaskCompletedFactory.scala index 35b83cbea..cfbb8ba61 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/TaskCompletedFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/TaskCompletedFactory.scala @@ -43,7 +43,8 @@ object TaskCompletedFactory { pipelineName: Option[String] = Some("test"), environmentName: Option[String] = Some("DEV"), tenant: Option[String] = Some("Dummy tenant"), - country: Option[String] = Some("noname")): TaskCompleted = { + country: Option[String] = Some("noname"), + batchId: Long = 123L): TaskCompleted = { model.TaskCompleted( jobName, tableName, @@ -65,7 +66,8 @@ object TaskCompletedFactory { pipelineName, environmentName, tenant, - country + country, + batchId ) } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/TestCases.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/TestCases.scala index 3dbc562c3..1871b6292 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/TestCases.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/TestCases.scala @@ -30,7 +30,7 @@ object TestCases { val instant2: Instant = Instant.ofEpochSecond(1597318835) val instant3: Instant = Instant.ofEpochSecond(1597318839) - val task1: TaskCompleted = model.TaskCompleted("job1", "table1", infoDate1, infoDate1, infoDate1, Some(100), Some(0), Some(100), None, None, None, 597318830, 1597318830, "New", Some("Test1"), Some("abc123"), Some("p_id_1"), Some("p_1"), Some("TEST"), Some("T1"), Some("C1")) - val task2: TaskCompleted = model.TaskCompleted("job1", "table1", infoDate2, infoDate2, infoDate2, Some(100), 
Some(0), Some(100), None, None, None, 1597318835, 1597318835, "Late", Some("Test2"), Some("abc123"), Some("p_id_2"), Some("p_2"), Some("TEST"), Some("T2"), Some("C2")) - val task3: TaskCompleted = model.TaskCompleted("job2", "table2", infoDate3, infoDate3, infoDate3, Some(100), Some(0), Some(100), None, None, None, 1597318839, 1597318839, "Fail", Some("Test3"), Some("abc123"), Some("p_id_3"), Some("p_2"), Some("TEST"), Some("T2"), Some("C3")) + val task1: TaskCompleted = model.TaskCompleted("job1", "table1", infoDate1, infoDate1, infoDate1, Some(100), Some(0), Some(100), None, None, None, 597318830, 1597318830, "New", Some("Test1"), Some("abc123"), Some("p_id_1"), Some("p_1"), Some("TEST"), Some("T1"), Some("C1"), 123L) + val task2: TaskCompleted = model.TaskCompleted("job1", "table1", infoDate2, infoDate2, infoDate2, Some(100), Some(0), Some(100), None, None, None, 1597318835, 1597318835, "Late", Some("Test2"), Some("abc123"), Some("p_id_2"), Some("p_2"), Some("TEST"), Some("T2"), Some("C2"), 123L) + val task3: TaskCompleted = model.TaskCompleted("job2", "table2", infoDate3, infoDate3, infoDate3, Some(100), Some(0), Some(100), None, None, None, 1597318839, 1597318839, "Fail", Some("Test3"), Some("abc123"), Some("p_id_3"), Some("p_2"), Some("TEST"), Some("T2"), Some("C3"), 123L) } diff --git a/pramen/extras/src/test/scala/za/co/absa/pramen/extras/notification/EcsPipelineNotificationTargetSuite.scala b/pramen/extras/src/test/scala/za/co/absa/pramen/extras/notification/EcsPipelineNotificationTargetSuite.scala index f9aadea83..1809bf209 100644 --- a/pramen/extras/src/test/scala/za/co/absa/pramen/extras/notification/EcsPipelineNotificationTargetSuite.scala +++ b/pramen/extras/src/test/scala/za/co/absa/pramen/extras/notification/EcsPipelineNotificationTargetSuite.scala @@ -62,7 +62,7 @@ class EcsPipelineNotificationTargetSuite extends AnyWordSpec { val task3 = TestPrototypes.taskNotification.copy(taskDef = taskDef3) notificationTarget.sendNotification( - PipelineInfo("Dummy", "DEV", RuntimeInfo(LocalDate.parse("2022-02-18")), Instant.now, None, warningFlag = false, None, PipelineStatus.Success, None, Seq.empty, "pid_123", None, None), + PipelineInfo("Dummy", "DEV", RuntimeInfo(LocalDate.parse("2022-02-18")), Instant.now, None, warningFlag = false, None, PipelineStatus.Success, None, Seq.empty, "pid_123", None, None, 123L), Seq(task1, task2, task3), CustomNotification(Seq.empty, Seq.empty) ) From 31256535e6b76fcdba6b037c0da47a14a7735edd Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 31 Dec 2025 12:39:32 +0100 Subject: [PATCH 04/12] #677 Add batchId to journal tables, fix bookkeeper compatibility issues in JDBC and MongoDB implementations. 
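The batch_id column is stored as nullable (Option[Long]) in the journal and bookkeeping Slick tables so that rows written before this change can still be read; when reading, a missing batch id is surfaced as 0L. A minimal REPL-style sketch of that convention (values are made up for illustration):

    // Illustration only: a row written before the upgrade has no batch_id,
    // while a new row carries the id of the batch that wrote it.
    val legacyBatchId: Option[Long]  = None
    val currentBatchId: Option[Long] = Some(42L)

    legacyBatchId.getOrElse(0L)   // 0L  - legacy rows are reported with batch id 0
    currentBatchId.getOrElse(0L)  // 42L - new rows keep their batch id
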
--- .../za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala | 4 ++-- .../za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala | 4 ++-- .../absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala | 2 +- .../pramen/core/bookkeeper/model/BookkeepingRecords.scala | 2 +- .../scala/za/co/absa/pramen/core/journal/JournalJdbc.scala | 4 ++-- .../za/co/absa/pramen/core/journal/model/JournalTask.scala | 2 +- .../za/co/absa/pramen/core/journal/model/JournalTasks.scala | 2 +- .../za/co/absa/pramen/core/fixtures/MongoDbFixture.scala | 2 +- .../pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala | 4 +++- .../core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala | 2 +- 10 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index 639cb36b1..69b76bfdc 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -119,7 +119,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) + val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId)) try { db.run( @@ -136,7 +136,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b private def toChunk(r: BookkeepingRecord): DataChunk = { DataChunk( - r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished, r.batchId) + r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) } private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Query[BookkeepingRecords, BookkeepingRecord, Seq] = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index d18145dac..a484191c7 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -169,13 +169,13 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext val dbVersion = d.getVersion() if (!d.doesCollectionExists(collectionName)) { d.createCollection(collectionName) - d.createIndex(collectionName, IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil, unique = true) + d.createIndex(collectionName, IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil) } if (dbVersion < 2) { d.createCollection(schemaCollectionName) d.createIndex(schemaCollectionName, IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil, unique = true) } - if (dbVersion < 3) { + if (dbVersion < 3 && dbVersion > 0) { val keys = IndexField("tableName", ASC) :: IndexField("infoDate", ASC) :: Nil // Make the bookkeeping index non-unique d.dropIndex(collectionName, keys) diff --git 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala index 50fe46e43..5c0bee4c4 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala @@ -24,7 +24,7 @@ case class BookkeepingRecord(pramenTableName: String, outputRecordCount: Long, jobStarted: Long, jobFinished: Long, - batchId: Long) + batchId: Option[Long]) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala index 7eb7eff74..d05ffd7c6 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala @@ -28,7 +28,7 @@ class BookkeepingRecords(tag: Tag) extends Table[BookkeepingRecord](tag, "bookke def outputRecordCount = column[Long]("output_record_count") def jobStarted = column[Long]("job_started") def jobFinished = column[Long]("job_finished") - def batchId = column[Long]("batch_id") + def batchId = column[Option[Long]]("batch_id") def * = (pramenTableName, infoDate, infoDateBegin, infoDateEnd, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) <> (BookkeepingRecord.tupled, BookkeepingRecord.unapply) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala index 90dfd521f..d2e2cbaf2 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalJdbc.scala @@ -57,7 +57,7 @@ class JournalJdbc(db: Database) extends Journal { entry.environmentName, entry.tenant, entry.country, - entry.batchId) + Option(entry.batchId)) try { db.run( @@ -99,7 +99,7 @@ class JournalJdbc(db: Database) extends Journal { environmentName = v.environmentName, tenant = v.tenant, country = v.country, - batchId = v.batchId + batchId = v.batchId.getOrElse(0L) ) }).toList } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala index 8a6ac2d4b..74e5d2cd0 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTask.scala @@ -38,5 +38,5 @@ case class JournalTask( environmentName: Option[String], tenant: Option[String], country: Option[String], - batchId: Long + batchId: Option[Long] ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala index 770fd61ed..35d4637d7 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/JournalTasks.scala @@ -41,7 +41,7 @@ class JournalTasks(tag: Tag) extends Table[JournalTask](tag, "journal") { def environmentName = column[Option[String]]("environmentName", O.Length(128)) def tenant = column[Option[String]]("tenant", O.Length(200)) def country = column[Option[String]]("country", O.Length(50)) - 
def batchId = column[Long]("batch_id") + def batchId = column[Option[Long]]("batch_id") def * = (jobName, pramenTableName, periodBegin, periodEnd, informationDate, inputRecordCount, inputRecordCountOld, outputRecordCount, outputRecordCountOld, appendedRecordCount, outputSize, startedAt, finishedAt, diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/fixtures/MongoDbFixture.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/fixtures/MongoDbFixture.scala index 9298c2b03..4c8199d59 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/fixtures/MongoDbFixture.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/fixtures/MongoDbFixture.scala @@ -28,7 +28,7 @@ trait MongoDbFixture extends BeforeAndAfterAll { import za.co.absa.pramen.core.dao.ScalaMongoImplicits._ - private val (mongoDbExecutable, mongoPort) = MongoDbSingleton.embeddedMongoDb + protected val (mongoDbExecutable, mongoPort) = MongoDbSingleton.embeddedMongoDb def connectionString: String = s"mongodb://localhost:$mongoPort" diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala index eaf6f502c..0af207b56 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala @@ -143,15 +143,17 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "setRecordCount()" should { - "overwrite the previous entry" in { + "add a new entry" in { val bk = getBookkeeper() bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false) bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false) val latestChunkOpt = bk.getLatestDataChunk("table", infoDate1) + val chunks = bk.getDataChunks("table", infoDate1, None) assert(latestChunkOpt.isDefined) + assert(chunks.length == 2) assert(latestChunkOpt.get.infoDate == "2020-08-11") assert(latestChunkOpt.get.jobFinished == 1597318839) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala index 83a81666e..7142df4a0 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala @@ -40,7 +40,7 @@ class BookkeeperMongoDbLongSuite extends BookkeeperCommonSuite with MongoDbFixtu new BookkeeperMongoDb(connection, 123L) } - if (db != null) { + if (mongoDbExecutable.nonEmpty) { "BookkeeperMongoDb" when { "initialized" should { "Initialize an empty database" in { From 174211b017ad0244789c94f080bb2cba7b5b0df9 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 31 Dec 2025 13:28:35 +0100 Subject: [PATCH 05/12] #677 Update embedded MongoDB for testing to the latest version. 
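The new flapdoodle API returns the running mongod process as an Option, and MongoDbFixture now exposes it as mongoDbExecutable, so suites check that Option instead of checking db for null when deciding whether to register MongoDB-dependent tests. An illustrative guard, with hypothetical suite and test names:

    import org.scalatest.wordspec.AnyWordSpec
    import za.co.absa.pramen.core.fixtures.MongoDbFixture

    class ExampleMongoSuite extends AnyWordSpec with MongoDbFixture {
      // Register the tests only if the embedded MongoDB server actually started.
      if (mongoDbExecutable.nonEmpty) {
        "a MongoDB-backed component" should {
          "connect to the embedded server" in {
            assert(connectionString.startsWith("mongodb://"))
          }
        }
      }
    }
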
--- .../core/src/test/resources/log4j.properties | 1 + .../core/src/test/resources/log4j2.properties | 3 + .../core/mocks/dao/MongoDbSingleton.scala | 66 +++++++++++-------- .../tests/bookkeeper/BookkeeperSuite.scala | 2 +- .../tests/journal/JournalMongoDbSuite.scala | 2 +- .../tests/lock/TokenLockMongoDbSuite.scala | 2 +- pramen/pom.xml | 2 +- pramen/project/Versions.scala | 2 +- 8 files changed, 49 insertions(+), 31 deletions(-) diff --git a/pramen/core/src/test/resources/log4j.properties b/pramen/core/src/test/resources/log4j.properties index b682b4759..64f1a173b 100644 --- a/pramen/core/src/test/resources/log4j.properties +++ b/pramen/core/src/test/resources/log4j.properties @@ -35,3 +35,4 @@ log4j.logger.za.co.absa.pramen.core.utils.ConfigUtils$=OFF log4j.logger.za.co.absa.pramen.core.utils.JdbcNativeUtils$=OFF log4j.logger.za.co.absa.pramen.core.utils.SparkUtils$=OFF log4j.logger.za.co.absa.pramen.core.utils.ThreadUtils$=OFF +log4j.logger.za.co.absa.pramen.core.mocks.dao.MongoDbSingleton$=WARN diff --git a/pramen/core/src/test/resources/log4j2.properties b/pramen/core/src/test/resources/log4j2.properties index 2686a2678..315056667 100644 --- a/pramen/core/src/test/resources/log4j2.properties +++ b/pramen/core/src/test/resources/log4j2.properties @@ -65,3 +65,6 @@ logger.sparkutils.level = OFF logger.threadutils.name = za.co.absa.pramen.core.utils.ThreadUtils$ logger.threadutils.level = OFF + +logger.mongodbsingleton.name = za.co.absa.pramen.core.mocks.dao.MongoDbSingleton$ +logger.mongodbsingleton.level = WARN diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/dao/MongoDbSingleton.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/dao/MongoDbSingleton.scala index 920730875..7507afaff 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/dao/MongoDbSingleton.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/dao/MongoDbSingleton.scala @@ -16,17 +16,27 @@ package za.co.absa.pramen.core.mocks.dao -import de.flapdoodle.embed.mongo.config.{MongodConfigBuilder, Net, RuntimeConfigBuilder} + +import de.flapdoodle.embed.mongo.commands.ServerAddress import de.flapdoodle.embed.mongo.distribution.Version -import de.flapdoodle.embed.mongo.{Command, MongodExecutable, MongodStarter} -import de.flapdoodle.embed.process.config.io.ProcessOutput -import de.flapdoodle.embed.process.runtime.Network -import org.slf4j.LoggerFactory +import de.flapdoodle.embed.mongo.transitions.{Mongod, RunningMongodProcess} +import de.flapdoodle.embed.process.io.{ProcessOutput, StreamProcessor} +import de.flapdoodle.reverse.transitions.Start +import de.flapdoodle.reverse.{StateID, TransitionWalker} +import org.slf4j.{Logger, LoggerFactory} object MongoDbSingleton { private val log = LoggerFactory.getLogger(this.getClass) - lazy val embeddedMongoDb: (Option[MongodExecutable], Int) = startEmbeddedMongoDb() + lazy val embeddedMongoDb: (Option[RunningMongodProcess], Int) = startEmbeddedMongoDb() + + final class Slf4jProcessor(logger: Logger, prefix: String) extends StreamProcessor { + override def process(block: String): Unit = { + if (block != null && block.nonEmpty) logger.info(s"$prefix$block") + } + + override def onProcessed(): Unit = () // no-op + } /** * Create and run a MongoDb instance. @@ -35,31 +45,35 @@ object MongoDbSingleton { * * @return A pair: a MongoDb executable object to be used to stop it and the port number the embedded MongoDB listens to. 
*/ - private def startEmbeddedMongoDb(): (Option[MongodExecutable], Int) = { - val mongoPort: Int = Network.getFreeServerPort() - - // Do not print Embedded MongoDB logs - val runtimeConfig = new RuntimeConfigBuilder() - .defaultsWithLogger(Command.MongoD, log) - .processOutput(ProcessOutput.getDefaultInstanceSilent) - .build() + private def startEmbeddedMongoDb(): (Option[RunningMongodProcess], Int) = { + try { + val version: Version = Version.V8_2_2 + val mongod = Mongod.builder() + .processOutput( + Start.to(classOf[ProcessOutput]).initializedWith( + ProcessOutput.builder() + .output(new Slf4jProcessor(log, "[mongod-out] ")) + .error(new Slf4jProcessor(log, "[mongod-err] ")) + .commands(new Slf4jProcessor(log, "[mongod-cmd] ")) + .build() + ) + ) + .build() - val starter = MongodStarter.getInstance(runtimeConfig) + val executable: TransitionWalker.ReachedState[RunningMongodProcess] = + mongod.transitions(version) + .walker() + .initState(StateID.of(classOf[RunningMongodProcess])) - val mongodConfig = new MongodConfigBuilder() - .version(Version.Main.V4_0) - .net(new Net("localhost", mongoPort, Network.localhostIsIPv6())) - .build() + val addr: ServerAddress = executable.current().getServerAddress + val mongoPort: Int = addr.getPort - val executable = try { - val exec = starter.prepare(mongodConfig) - exec.start() - Some(exec) + (Option(executable.current()), mongoPort) } catch { - case _: Throwable => None + case ex: Throwable => + log.warn("Couldn't start embedded Mongodb. MongoDB tests will be skipped", ex) + (None, 0) } - - (executable, mongoPort) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperSuite.scala index a28293f2c..41adb3486 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperSuite.scala @@ -80,7 +80,7 @@ class BookkeeperSuite extends AnyWordSpec closable.close() } - if (db != null) { + if (mongoDbExecutable.nonEmpty) { "build bookkeeper, token lock, journal, and closable object for MongoDB" in { val bookkeepingConfig = BookkeepingConfigFactory.getDummyBookkeepingConfig( bookkeepingEnabled = true, diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalMongoDbSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalMongoDbSuite.scala index 378bcb37e..d3e0825b1 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalMongoDbSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalMongoDbSuite.scala @@ -37,7 +37,7 @@ class JournalMongoDbSuite extends AnyWordSpec with MongoDbFixture with BeforeAnd } } - if (db != null) { + if (mongoDbExecutable.nonEmpty) { "Journal/MongoDB" should { "Initialize an empty database" in { db.doesCollectionExists("collectionName") diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockMongoDbSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockMongoDbSuite.scala index 579802b65..c974eb6d0 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockMongoDbSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockMongoDbSuite.scala @@ -22,8 +22,8 @@ import org.scalatest.wordspec.AnyWordSpec import za.co.absa.pramen.core.fixtures.MongoDbFixture import 
za.co.absa.pramen.core.lock.TokenLockMongoDb import za.co.absa.pramen.core.lock.TokenLockMongoDb.collectionName + import scala.concurrent.duration._ -import org.scalatest.concurrent.Eventually._ class TokenLockMongoDbSuite extends AnyWordSpec with MongoDbFixture with BeforeAndAfter { before { diff --git a/pramen/pom.xml b/pramen/pom.xml index ede50c623..9ddf16640 100644 --- a/pramen/pom.xml +++ b/pramen/pom.xml @@ -383,7 +383,7 @@ de.flapdoodle.embed de.flapdoodle.embed.mongo - 2.2.0 + 4.22.0 test diff --git a/pramen/project/Versions.scala b/pramen/project/Versions.scala index 1083aa9cb..4cac97ac3 100644 --- a/pramen/project/Versions.scala +++ b/pramen/project/Versions.scala @@ -31,7 +31,7 @@ object Versions { val channelVersion = "0.2.1" val requestsVersion = "0.8.0" val javaXMailVersion = "1.6.2" - val embeddedMongoDbVersion = "2.2.0" + val embeddedMongoDbVersion = "4.22.0" val scalaCompatColsVersion = "2.12.0" val scalatestVersion = "3.2.14" val mockitoVersion = "2.28.2" From 41ec06fa12702ecb4b9befa0968aeec432f9fe0d Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 31 Dec 2025 14:58:30 +0100 Subject: [PATCH 06/12] #677 Fix the build for Scala 2.11 with the new embedded MongoDB. --- pramen/build.sbt | 3 ++- .../za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pramen/build.sbt b/pramen/build.sbt index a87fed719..2c962f953 100644 --- a/pramen/build.sbt +++ b/pramen/build.sbt @@ -28,7 +28,8 @@ ThisBuild / organization := "za.co.absa.pramen" ThisBuild / scalaVersion := scala212 ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213) -ThisBuild / scalacOptions := Seq("-unchecked", "-deprecation") +ThisBuild / scalacOptions := Seq("-unchecked", "-deprecation", "-target:jvm-1.8") +ThisBuild / javacOptions := Seq("-source", "1.8", "-target", "1.8") ThisBuild / versionScheme := Some("early-semver") diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index 4b8292ac1..12a4f3641 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -157,11 +157,6 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa private def getData(filter: Column): Seq[DataChunk] = { getDf(filter) .collect() - .groupBy(v => (v.tableName, v.infoDate)) - .map { case (_, listChunks) => - listChunks.maxBy(c => c.jobFinished) - } - .toArray[DataChunk] .sortBy(_.jobFinished) } From 6f4df67fbbfcc53740183a7a59b8b0606347b68e Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 2 Jan 2026 11:26:31 +0100 Subject: [PATCH 07/12] #677 Separate appends and overwrites when saving bookkeeping records. 
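setRecordCount() now takes an overwrite flag. The record is always appended first; when overwrite is true, deleteNonCurrentBatchRecords() then removes records written by other batches for the same table and information date, so a rerun replaces earlier bookkeeping entries, while overwrite = false keeps them (this is what the sink job uses for incremental runs). A usage sketch follows; the package name, table names, counts and timestamps are made up for illustration:

    package za.co.absa.pramen.core.example

    import java.time.LocalDate
    import za.co.absa.pramen.core.bookkeeper.BookkeeperNull

    object SetRecordCountExample {
      def main(args: Array[String]): Unit = {
        val bookkeeper = new BookkeeperNull() // any Bookkeeper implementation would do
        val infoDate = LocalDate.of(2026, 1, 2)

        // Rerun of a snapshot table: only the current batch's record survives for this info date.
        bookkeeper.setRecordCount("my_table", infoDate, 100, 100, 1700000000L, 1700000100L,
          isTableTransient = false, overwrite = true)

        // Incremental sink run: append a record and keep the records written by earlier batches.
        bookkeeper.setRecordCount("my_table->sink", infoDate, 10, 10, 1700000000L, 1700000100L,
          isTableTransient = false, overwrite = false)
      }
    }
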
--- .../pramen/core/bookkeeper/Bookkeeper.scala | 3 +- .../core/bookkeeper/BookkeeperBase.scala | 8 +- .../core/bookkeeper/BookkeeperDeltaPath.scala | 11 +- .../bookkeeper/BookkeeperDeltaTable.scala | 17 ++- .../core/bookkeeper/BookkeeperJdbc.scala | 25 +++- .../core/bookkeeper/BookkeeperMongoDb.scala | 14 ++- .../core/bookkeeper/BookkeeperNull.scala | 2 + .../core/bookkeeper/BookkeeperText.scala | 4 + .../pramen/core/metastore/MetastoreImpl.scala | 7 +- .../pipeline/PythonTransformationJob.scala | 3 +- .../absa/pramen/core/pipeline/SinkJob.scala | 3 +- .../core/runner/task/TaskRunnerBase.scala | 2 +- .../pramen/core/utils/AlgorithmUtils.scala | 14 +++ .../absa/pramen/core/utils/SlickUtils.scala | 2 +- .../mocks/bookkeeper/SyncBookkeeperMock.scala | 3 +- .../core/pipeline/IngestionJobSuite.scala | 12 +- .../core/pipeline/TransferJobSuite.scala | 4 +- .../core/source/SourceValidationSuite.scala | 2 +- .../bookkeeper/BookkeeperCommonSuite.scala | 115 ++++++++++++------ .../BookkeeperDeltaPathLongSuite.scala | 8 +- .../BookkeeperDeltaTableLongSuite.scala | 44 +++++-- .../bookkeeper/BookkeeperJdbcSuite.scala | 8 +- .../tests/bookkeeper/BookkeeperMemSuite.scala | 18 +-- .../BookkeeperMongoDbLongSuite.scala | 8 +- .../bookkeeper/BookkeeperTextLongSuite.scala | 20 +-- .../BookkeeperTransientSuiteEager.scala | 8 +- .../TaskRunnerMultithreadedSuite.scala | 2 +- 27 files changed, 257 insertions(+), 110 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index 3f3af4240..facc820b4 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -54,7 +54,8 @@ trait Bookkeeper { outputRecordCount: Long, jobStarted: Long, jobFinished: Long, - isTableTransient: Boolean): Unit + isTableTransient: Boolean, + overwrite: Boolean): Unit private[pramen] def saveSchema(table: String, infoDate: LocalDate, schema: StructType): Unit diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala index 4bfd69412..749f11270 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala @@ -32,6 +32,8 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long + def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit + private[pramen] def saveRecordCountToStorage(table: String, infoDate: LocalDate, inputRecordCount: Long, @@ -45,7 +47,8 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte outputRecordCount: Long, jobStarted: Long, jobFinished: Long, - isTableTransient: Boolean): Unit = { + isTableTransient: Boolean, + overwrite: Boolean): Unit = { if (isTableTransient || !isBookkeepingEnabled) { val tableLowerCase = table.toLowerCase val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) @@ -56,6 +59,9 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte } } else { saveRecordCountToStorage(table, infoDate, 
inputRecordCount, outputRecordCount, jobStarted, jobFinished) + if (overwrite) { + deleteNonCurrentBatchRecords(table, infoDate) + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala index e6126bfa4..0d9ee4eaa 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaPath.scala @@ -16,6 +16,7 @@ package za.co.absa.pramen.core.bookkeeper +import io.delta.tables.DeltaTable import org.apache.hadoop.fs.Path import org.apache.spark.sql.functions._ import org.apache.spark.sql.{Column, Dataset, SaveMode, SparkSession} @@ -23,7 +24,7 @@ import za.co.absa.pramen.core.bookkeeper.model.TableSchemaJson import za.co.absa.pramen.core.model.{DataChunk, TableSchema} import za.co.absa.pramen.core.utils.FsUtils -import java.time.Instant +import java.time.{Instant, LocalDate} import scala.reflect.ClassTag import scala.reflect.runtime.universe @@ -67,6 +68,14 @@ class BookkeeperDeltaPath(bookkeepingPath: String, batchId: Long)(implicit spark .save(recordsPath.toUri.toString) } + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val infoDateStr = DataChunk.dateFormatter.format(infoDate) + val filter = (col("tableName") === lit(table)) && (col("infoDate") === lit(infoDateStr)) && (col("batchId") =!= lit(batchId)) + + val deltaTable = DeltaTable.forPath(spark, recordsPath.toUri.toString) + deltaTable.delete(filter) + } + override def getSchemasDeltaDf: Dataset[TableSchemaJson] = { spark .read diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index b4f05a462..be876ab9f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -16,12 +16,13 @@ package za.co.absa.pramen.core.bookkeeper -import org.apache.spark.sql.functions.col +import io.delta.tables.DeltaTable +import org.apache.spark.sql.functions.{col, lit} import org.apache.spark.sql.{Column, Dataset, SaveMode, SparkSession} import za.co.absa.pramen.core.bookkeeper.model.TableSchemaJson import za.co.absa.pramen.core.model.{DataChunk, TableSchema} -import java.time.Instant +import java.time.{Instant, LocalDate} import scala.reflect.ClassTag import scala.reflect.runtime.universe @@ -61,11 +62,20 @@ class BookkeeperDeltaTable(database: Option[String], val df = Seq(dataChunks).toDF() df.write + .format("delta") .mode(SaveMode.Append) .option("mergeSchema", "true") .saveAsTable(recordsFullTableName) } + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val infoDateStr = DataChunk.dateFormatter.format(infoDate) + val filter = (col("tableName") === lit(table)) && (col("infoDate") === lit(infoDateStr)) && (col("batchId") =!= lit(batchId)) + + val deltaTable = DeltaTable.forName(spark, recordsFullTableName) + deltaTable.delete(filter) + } + override def getSchemasDeltaDf: Dataset[TableSchemaJson] = { spark.table(schemasFullTableName).as[TableSchemaJson] } @@ -76,6 +86,7 @@ class BookkeeperDeltaTable(database: Option[String], ).toDF() df.write + .format("delta") .mode(SaveMode.Append) .option("mergeSchema", "true") .saveAsTable(schemasFullTableName) @@ 
-85,7 +96,7 @@ class BookkeeperDeltaTable(database: Option[String], val df = Seq.empty[T].toDS df.write - .mode(SaveMode.Overwrite) + .format("delta") .saveAsTable(pathOrTable) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index 69b76bfdc..f152f3695 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -24,7 +24,8 @@ import za.co.absa.pramen.core.model.{DataChunk, TableSchema} import za.co.absa.pramen.core.rdb.PramenDb.DEFAULT_RETRIES import za.co.absa.pramen.core.reader.JdbcUrlSelector import za.co.absa.pramen.core.reader.model.JdbcConfig -import za.co.absa.pramen.core.utils.SlickUtils +import za.co.absa.pramen.core.utils.SlickUtils.{WARN_IF_LONGER_MS, log} +import za.co.absa.pramen.core.utils.{AlgorithmUtils, SlickUtils, TimeUtils} import java.time.LocalDate import scala.util.control.NonFatal @@ -111,7 +112,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b count } - private[pramen] override def saveRecordCountToStorage(table: String, + override def saveRecordCountToStorage(table: String, infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, @@ -130,6 +131,26 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b } } + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val dateStr = DataChunk.dateFormatter.format(infoDate) + + val query = BookkeepingRecords.records + .filter(r => r.pramenTableName === table && r.infoDate === dateStr && r.batchId =!= batchId) + .delete + + try { + AlgorithmUtils.resultWithTimeout(WARN_IF_LONGER_MS) { + db.run(query).execute() + } { actualTimeMs => + val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs) + val sql = query.statements.mkString("; ") + log.warn(s"Action execution time: $elapsedTime. 
SQL: $sql") + } + } catch { + case NonFatal(ex) => throw new RuntimeException(s"Unable to delete non-current batch records from the bookkeeping table.", ex) + } + } + private[pramen] override def getOffsetManager: OffsetManager = { offsetManagement } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index a484191c7..f41a56f26 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -99,7 +99,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext chunks } - private[pramen] override def saveRecordCountToStorage(table: String, + override def saveRecordCountToStorage(table: String, infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, @@ -112,6 +112,18 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext collection.insertOne(chunk).execute() } + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val dateStr = DataChunk.dateFormatter.format(infoDate) + + val filter = Filters.and( + Filters.eq("tableName", table), + Filters.eq("infoDate", dateStr), + Filters.ne("batchId", batchId) + ) + + collection.deleteMany(filter).execute() + } + private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Bson = { val baseFilter = (infoDateBeginOpt, infoDateEndOpt) match { case (Some(infoDateBegin), Some(infoDateEnd)) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala index 1ed0a7119..a1983129c 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala @@ -47,4 +47,6 @@ class BookkeeperNull() extends BookkeeperBase(false, 0L) { override def getLatestSchema(table: String, until: LocalDate): Option[(StructType, LocalDate)] = None override def saveSchema(table: String, infoDate: LocalDate, schema: StructType): Unit = {} + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index 12a4f3641..d264fc23e 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -192,4 +192,8 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa lock.release() } } + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + + } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala index 306d61634..c5ae0d247 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala @@ -140,7 +140,12 @@ class MetastoreImpl(appConfig: Config, stats.recordCount.foreach{recordCount => if 
(!skipBookKeepingUpdates && !nothingAppended) { - bookkeeper.setRecordCount(tableName, infoDate, inputRecordCount.getOrElse(recordCount), recordCount, start, finish, isTransient) + val overwrite = saveModeOverride.contains(SaveMode.Overwrite) + val recordCountToRecord = if (overwrite) + recordCount + else + stats.recordCountAppended.getOrElse(recordCount) + bookkeeper.setRecordCount(tableName, infoDate, inputRecordCount.getOrElse(recordCountToRecord), recordCountToRecord, start, finish, isTransient, overwrite) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala index b3014e9d8..765f220e0 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala @@ -146,7 +146,8 @@ class PythonTransformationJob(operationDef: OperationDef, recordCount, jobStarted.getEpochSecond, jobFinished.getEpochSecond, - isTableTransient = false) + isTableTransient = false, + overwrite = true) } SaveResult(stats) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala index c6f6dab5e..2b670547a 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala @@ -168,7 +168,8 @@ class SinkJob(operationDef: OperationDef, sinkResult.recordsSent, jobStarted.getEpochSecond, jobFinished.getEpochSecond, - isTransient + isTransient, + overwrite = false ) if (isIncremental) { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala index a8042703e..f78f0835b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala @@ -327,7 +327,7 @@ abstract class TaskRunnerBase(conf: Config, log.info(s"SKIP validation failure for the task: $outputTableName for date: ${task.infoDate}. 
Reason: $msg") if (bookkeeper.getLatestDataChunk(outputTableName, task.infoDate).isEmpty) { val isTransient = task.job.outputTable.format.isTransient - bookkeeper.setRecordCount(outputTableName, task.infoDate, status.inputRecordsCount.getOrElse(0L), 0, started.getEpochSecond, Instant.now().getEpochSecond, isTransient) + bookkeeper.setRecordCount(outputTableName, task.infoDate, status.inputRecordsCount.getOrElse(0L), 0, started.getEpochSecond, Instant.now().getEpochSecond, isTransient, overwrite = false) } Left(TaskResult(task.job.taskDef, RunStatus.Skipped(msg), getRunInfo(task.infoDate, started), applicationId, isTransient, isRawFileBased, newSchemaRegistered = false, Nil, status.dependencyWarnings, Nil, options)) case Reason.SkipOnce(msg) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala index 3d0dd6024..14c38ba60 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala @@ -17,7 +17,9 @@ package za.co.absa.pramen.core.utils import org.slf4j.Logger +import za.co.absa.pramen.core.utils.SlickUtils.log +import java.time.{Duration, Instant} import scala.annotation.tailrec import scala.collection.mutable @@ -92,4 +94,16 @@ object AlgorithmUtils { } } } + + final def resultWithTimeout[R](timeoutMs: Long)(action: => R)(onTimeout: Long => Unit): R = { + val start = Instant.now + val result = action + val finish = Instant.now + + val duration = Duration.between(start, finish).toMillis + if (duration > timeoutMs) { + onTimeout(timeoutMs) + } + result + } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/SlickUtils.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/SlickUtils.scala index 865ae6366..e777d08e1 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/SlickUtils.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/SlickUtils.scala @@ -30,7 +30,7 @@ object SlickUtils { private val log = LoggerFactory.getLogger(this.getClass) - private val WARN_IF_LONGER_MS = 1000L + val WARN_IF_LONGER_MS = 1000L /** * Synchronously executes a query against a JDBC connection. 
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index 105a96a14..57481188d 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -96,7 +96,8 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { outputRecordCount: Long, jobStarted: Long, jobFinished: Long, - isTableTransient: Boolean): Unit = { + isTableTransient: Boolean, + overwrite: Boolean): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) val chunk = DataChunk(table, diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala index 4de5e7378..fdf5f599e 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala @@ -122,7 +122,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "track already ran" in { val (bk, _, job) = getUseCase() - bk.setRecordCount("table1", infoDate, 4, 4, 123, 456, isTableTransient = false) + bk.setRecordCount("table1", infoDate, 4, 4, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -133,7 +133,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "some records, default minimum records" in { val (bk, _, job) = getUseCase() - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false) + bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -143,7 +143,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "some records, custom minimum records" in { val (bk, _, job) = getUseCase(minRecords = Some(3)) - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false) + bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -199,7 +199,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "needs update, some records, custom minimum records" in { val (bk, _, job) = getUseCase(minRecords = Some(5)) - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false) + bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -214,7 +214,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis val noDataInfoDate = infoDate.plusDays(1) - bk.setRecordCount("table1", noDataInfoDate, 4, 4, 123, 456, isTableTransient = false) + bk.setRecordCount("table1", noDataInfoDate, 4, 4, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(noDataInfoDate, runReason, conf, Nil) @@ -226,7 +226,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis val noDataInfoDate = infoDate.plusDays(1) - bk.setRecordCount("table1", noDataInfoDate, 30, 30, 123, 456, isTableTransient = false) + 
bk.setRecordCount("table1", noDataInfoDate, 30, 30, 123, 456, isTableTransient = false, overwrite = true) val result = job.preRunCheckJob(noDataInfoDate, runReason, conf, Nil) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala index e54f0de0c..7434ce8c7 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala @@ -72,7 +72,7 @@ class TransferJobSuite extends AnyWordSpec with SparkTestBase with TextCompariso " the number of records do not match" in { val (job, bk) = getUseCase(numberOfRecords = 7) - bk.setRecordCount("table1->sink", infoDate, 3, 3, 10000, 1001, isTableTransient = false) + bk.setRecordCount("table1->sink", infoDate, 3, 3, 10000, 1001, isTableTransient = false, overwrite = true) val actual = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -82,7 +82,7 @@ class TransferJobSuite extends AnyWordSpec with SparkTestBase with TextCompariso "return AlreadyRan when the number of records didn't change" in { val (job, bk) = getUseCase(numberOfRecords = 7) - bk.setRecordCount("table1->sink", infoDate, 7, 3, 10000, 1001, isTableTransient = false) + bk.setRecordCount("table1->sink", infoDate, 7, 3, 10000, 1001, isTableTransient = false, overwrite = true) val actual = job.preRunCheckJob(infoDate, runReason, conf, Nil) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala index 5f2684a37..ff230e46c 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala @@ -137,7 +137,7 @@ class SourceValidationSuite extends AnyWordSpec with BeforeAndAfterAll with Temp val state = new PipelineStateSpy - bookkeeper.setRecordCount("table_out", runDateIn.minusDays(1), 1, 1, 0, 0, isTableTransient = false) + bookkeeper.setRecordCount("table_out", runDateIn.minusDays(1), 1, 1, 0, 0, isTableTransient = false, overwrite = true) val sourceTable = SourceTableFactory.getDummySourceTable() diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala index 0af207b56..d8d7fd119 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala @@ -18,14 +18,13 @@ package za.co.absa.pramen.core.tests.bookkeeper import org.apache.spark.sql.types._ import org.scalatest.wordspec.AnyWordSpec -import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperDeltaTable} +import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperDeltaTable, BookkeeperHadoop, BookkeeperText} import za.co.absa.pramen.core.model.DataChunk import java.time.LocalDate class BookkeeperCommonSuite extends AnyWordSpec { - - def testBookKeeper(getBookkeeper: () => Bookkeeper): Unit = { + def testBookKeeper(getBookkeeper: Long => Bookkeeper): Unit = { val infoDate1 = LocalDate.of(2020, 8, 11) val infoDate2 = LocalDate.of(2020, 8, 12) val infoDate3 = LocalDate.of(2020, 8, 13) @@ -42,15 +41,15 @@ class BookkeeperCommonSuite extends AnyWordSpec { 
"getLatestProcessedDate()" should { "return None if there are no entries" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) assert(bk.getLatestProcessedDate("table").isEmpty) } "return a date when there is an entry" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table") @@ -59,9 +58,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "return None if the passed date is too old" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table", Some(infoDate1)) @@ -69,9 +68,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "return a date when there is an entry and until date is passed" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table", Some(infoDate2)) @@ -80,11 +79,11 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "return the latest date when there are several dates" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table") @@ -95,17 +94,17 @@ class BookkeeperCommonSuite extends AnyWordSpec { "getLatestDataChunk()" should { "return None if there are no entries" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) assert(bk.getLatestDataChunk("table", infoDate1).isEmpty) } "return the latest date from the specified periods" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) val chunkOpt = bk.getLatestDataChunk("table", infoDate3) val infoDate3Str = infoDate3.format(DataChunk.dateFormatter) @@ -124,17 +123,17 @@ class 
BookkeeperCommonSuite extends AnyWordSpec { "getDataChunksCount()" should { "return 0 if there are no entries" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) assert(bk.getDataChunksCount("table", Option(infoDate1), Option(infoDate1)) == 0) } "return the number of entries if there are entries" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) val chunksCount = bk.getDataChunksCount("table", Option(infoDate1), Option(infoDate2)) @@ -143,11 +142,11 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "setRecordCount()" should { - "add a new entry" in { - val bk = getBookkeeper() + "add a new entry for the same batch" in { + val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) val latestChunkOpt = bk.getLatestDataChunk("table", infoDate1) val chunks = bk.getDataChunks("table", infoDate1, None) @@ -159,11 +158,47 @@ class BookkeeperCommonSuite extends AnyWordSpec { assert(latestChunkOpt.get.jobFinished == 1597318839) } + "replace the entry for a different batch" in { + val bk1 = getBookkeeper(123L) + val bk2 = getBookkeeper(456L) + + // The feature is not implemented in these implementations + assume(!bk1.isInstanceOf[BookkeeperText]) + + bk1.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk2.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) + + val latestChunkOpt = bk2.getLatestDataChunk("table", infoDate1) + val chunks = bk2.getDataChunks("table", infoDate1, None) + + assert(latestChunkOpt.isDefined) + assert(chunks.length == 1) + + assert(latestChunkOpt.get.infoDate == "2020-08-11") + assert(latestChunkOpt.get.jobFinished == 1597318839) + } + + "add the entry for a different batch non overwrite" in { + val bk1 = getBookkeeper(123L) + val bk2 = getBookkeeper(456L) + + bk1.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = false) + bk2.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = false) + + val latestChunkOpt = bk2.getLatestDataChunk("table", infoDate1) + val chunks = bk2.getDataChunks("table", infoDate1, None) + + assert(latestChunkOpt.isDefined) + assert(chunks.length == 2) + + assert(latestChunkOpt.get.infoDate == "2020-08-11") + assert(latestChunkOpt.get.jobFinished == 1597318839) + } } "saveSchema" should { "overwrite a schema entry" in { - val bk = getBookkeeper() + val bk = 
getBookkeeper(123L) bk.saveSchema("table", infoDate2, schema1) Thread.sleep(10) @@ -178,7 +213,7 @@ class BookkeeperCommonSuite extends AnyWordSpec { "getLatestSchema" should { "return the latest schema" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) bk.saveSchema("table", infoDate1, schema1) bk.saveSchema("table", infoDate2, schema2) @@ -197,7 +232,7 @@ class BookkeeperCommonSuite extends AnyWordSpec { } "return None is schema is not available" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) bk.saveSchema("table", infoDate2, schema1) @@ -211,20 +246,20 @@ class BookkeeperCommonSuite extends AnyWordSpec { "Multiple bookkeepers" should { "share the state" in { - val bk = getBookkeeper() + val bk = getBookkeeper(123L) // A workaround for BookkeeperDeltaTable which outputs to different tables in unit tests - val bk2 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper() - val bk3 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper() + val bk2 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper(123L) + val bk3 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318840, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318841, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318840, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318841, isTableTransient = false, overwrite = true) - bk2.setRecordCount("table", infoDate2, 101, 10, 1597318833, 1597318843, isTableTransient = false) - bk2.setRecordCount("table", infoDate3, 201, 20, 1597318838, 1597318844, isTableTransient = false) + bk2.setRecordCount("table", infoDate2, 101, 10, 1597318833, 1597318843, isTableTransient = false, overwrite = true) + bk2.setRecordCount("table", infoDate3, 201, 20, 1597318838, 1597318844, isTableTransient = false, overwrite = true) - bk3.setRecordCount("table", infoDate3, 102, 10, 1597318833, 1597318842, isTableTransient = false) - bk3.setRecordCount("table", infoDate2, 202, 20, 1597318838, 1597318843, isTableTransient = false) + bk3.setRecordCount("table", infoDate3, 102, 10, 1597318833, 1597318842, isTableTransient = false, overwrite = true) + bk3.setRecordCount("table", infoDate2, 202, 20, 1597318838, 1597318843, isTableTransient = false, overwrite = true) val chunk1 = bk.getLatestDataChunk("table", infoDate3) val chunk2 = bk2.getLatestDataChunk("table", infoDate3) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala index 3eab1bdcd..d80082267 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaPathLongSuite.scala @@ -38,20 +38,20 @@ class BookkeeperDeltaPathLongSuite extends BookkeeperCommonSuite with SparkTestB deleteDir(tmpDir) } - def getBookkeeper: BookkeeperDeltaPath = { - new BookkeeperDeltaPath(tmpDir, 123L) + def getBookkeeper(batchId: Long): BookkeeperDeltaPath = { + new BookkeeperDeltaPath(tmpDir, batchId) } "BookkeeperHadoopDeltaPath" when { "initialized" should { "Initialize bookkeeping directory" in { - getBookkeeper + getBookkeeper(123L) assert(fsUtils.exists(new Path(tmpDir, 
s"$bookkeepingRootPath/$recordsDirName"))) assert(fsUtils.exists(new Path(tmpDir, locksDirName))) } } - testBookKeeper(() => getBookkeeper) + testBookKeeper(batchId => getBookkeeper(batchId)) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala index bad7cf09d..9f908cb0c 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala @@ -16,14 +16,18 @@ package za.co.absa.pramen.core.tests.bookkeeper -import org.scalatest.BeforeAndAfterAll +import io.delta.tables.DeltaTable +import org.apache.spark.sql.functions.lit +import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, fullstacks} import za.co.absa.pramen.core.base.SparkTestBase import za.co.absa.pramen.core.bookkeeper.BookkeeperDeltaTable import java.io.File import scala.util.Random -class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTestBase with BeforeAndAfterAll { +class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTestBase with BeforeAndAfter with BeforeAndAfterAll { + val bookkeepingTablePrefix: String = getNewTablePrefix + override protected def beforeAll(): Unit = { super.beforeAll() cleanUpWarehouse() @@ -34,24 +38,44 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest super.afterAll() } - def getBookkeeper(prefix: String): BookkeeperDeltaTable = { - new BookkeeperDeltaTable(None, prefix, 123L) + before { + val fullRecordsTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.recordsTable) + val fullSchemasTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.schemasTable) + + if (spark.catalog.tableExists(fullRecordsTableName)) { + val deltaTable = DeltaTable.forName(spark, fullRecordsTableName) + deltaTable.delete(lit(true)) + } + + if (spark.catalog.tableExists(fullSchemasTableName)) { + val deltaTable = DeltaTable.forName(spark, fullSchemasTableName) + deltaTable.delete(lit(true)) + } + } + + def getBookkeeper(prefix: String, batchId: Long): BookkeeperDeltaTable = { + new BookkeeperDeltaTable(None, prefix, batchId) } "BookkeeperHadoopDeltaTable" when { - testBookKeeper { () => - val rndInt = Math.abs(Random.nextInt()) - getBookkeeper(s"tbl${rndInt}_") + testBookKeeper { batchId => + getBookkeeper(bookkeepingTablePrefix, batchId) } "test tables are created properly" in { - getBookkeeper(s"tbl0000_") + val prefix = getNewTablePrefix + getBookkeeper(prefix, 123L) - assert(spark.catalog.tableExists("tbl0000_bookkeeping")) - assert(spark.catalog.tableExists("tbl0000_schemas")) + assert(spark.catalog.tableExists(s"${prefix}bookkeeping")) + assert(spark.catalog.tableExists(s"${prefix}schemas")) } } + private def getNewTablePrefix: String = { + val rndInt = Math.abs(Random.nextInt()) + s"tbl${rndInt}_" + } + private def cleanUpWarehouse(): Unit = { val warehouseDir = new File("spark-warehouse") if (warehouseDir.exists()) { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperJdbcSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperJdbcSuite.scala index 010707fcf..fcbf63365 100644 --- 
a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperJdbcSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperJdbcSuite.scala @@ -37,14 +37,14 @@ class BookkeeperJdbcSuite extends BookkeeperCommonSuite with RelationalDbFixture super.afterAll() } - def getBookkeeper: Bookkeeper = { - new BookkeeperJdbc(pramenDb.slickDb, 0L) + def getBookkeeper(batchId: Long): Bookkeeper = { + new BookkeeperJdbc(pramenDb.slickDb, batchId) } "BookkeeperJdbc" when { "initialized" should { "Initialize an empty database" in { - getBookkeeper + getBookkeeper(0L) assert(getTables.exists(_.equalsIgnoreCase("bookkeeping"))) assert(getTables.exists(_.equalsIgnoreCase("schemas"))) @@ -53,6 +53,6 @@ class BookkeeperJdbcSuite extends BookkeeperCommonSuite with RelationalDbFixture } } - testBookKeeper(() => getBookkeeper) + testBookKeeper(batchId => getBookkeeper(batchId)) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala index e57af94d6..65bdef385 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala @@ -43,7 +43,7 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return a date when there is an entry" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table") @@ -52,9 +52,9 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return the latest date when there are several dates" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false, overwrite = true) val dateOpt = bk.getLatestProcessedDate("table") @@ -69,9 +69,9 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return the latest date from the specified periods" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) val chunkOpt = bk.getLatestDataChunk("table", infoDate3) val infoDate3Str = infoDate3.format(DataChunk.dateFormatter) @@ -90,8 +90,8 @@ class BookkeeperMemSuite extends 
AnyWordSpec with BeforeAndAfter { "setRecordCount()" should { "overwrite the previous entry" in { - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) val chunks = bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table", infoDate1, None) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala index 7142df4a0..036511fd8 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMongoDbLongSuite.scala @@ -36,15 +36,15 @@ class BookkeeperMongoDbLongSuite extends BookkeeperCommonSuite with MongoDbFixtu } } - def getBookkeeper: Bookkeeper = { - new BookkeeperMongoDb(connection, 123L) + def getBookkeeper(batchId: Long): Bookkeeper = { + new BookkeeperMongoDb(connection, batchId) } if (mongoDbExecutable.nonEmpty) { "BookkeeperMongoDb" when { "initialized" should { "Initialize an empty database" in { - getBookkeeper + getBookkeeper(123L) assert(db.doesCollectionExists(collectionName)) @@ -53,7 +53,7 @@ class BookkeeperMongoDbLongSuite extends BookkeeperCommonSuite with MongoDbFixtu } } - testBookKeeper(() => getBookkeeper) + testBookKeeper(batchId => getBookkeeper(batchId)) } } else { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala index 6f82f87c9..6722c6f51 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTextLongSuite.scala @@ -41,26 +41,26 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w deleteDir(tmpDir) } - def getBookkeeper: BookkeeperText = { - new BookkeeperText(tmpDir, 123L) + def getBookkeeper(batchId: Long): BookkeeperText = { + new BookkeeperText(tmpDir, batchId) } "BookkeeperHadoopText" when { "initialized" should { "Initialize bookkeeping directory" in { - getBookkeeper + getBookkeeper(123L) assert(fsUtils.exists(new Path(tmpDir, s"$bookkeepingRootPath/$recordsDirName"))) assert(fsUtils.exists(new Path(tmpDir, locksDirName))) } } - testBookKeeper(() => getBookkeeper) + testBookKeeper(batchId => getBookkeeper(batchId)) } "getFilter" should { "get a ranged filter" in { - val bk = getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), Some(LocalDate.of(2021, 1, 2)), None).toString() @@ -68,7 +68,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } "get a ranged filter with batch id" in { - val bk = getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), Some(LocalDate.of(2021, 1, 2)), Some(123L)).toString() @@ -76,7 +76,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } "get a from filter" in { - val bk = 
getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", Some(LocalDate.of(2021, 1, 1)), None, None).toString() @@ -84,7 +84,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } "get a to filter" in { - val bk = getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", None, Some(LocalDate.of(2021, 1, 2)), None).toString() @@ -92,7 +92,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } "get a batchid filter" in { - val bk = getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", None, None, Some(123L)).toString() @@ -100,7 +100,7 @@ class BookkeeperTextLongSuite extends BookkeeperCommonSuite with SparkTestBase w } "get a table filter" in { - val bk = getBookkeeper + val bk = getBookkeeper(123L) val actual = bk.getFilter("table1", None, None, None).toString() diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala index ea8c5a9e9..0d2f25e80 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala @@ -69,7 +69,7 @@ class BookkeeperTransientSuiteEager extends AnyWordSpec { "setRecordCount()" should { "return the newly added record" in { val bk = getBookkeeper - bk.setRecordCount("table1", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true) + bk.setRecordCount("table1", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true, overwrite = true) val chunks = bk.getTransientDataChunks("table1", Option(infoDate2), Option(infoDate2), None) @@ -80,9 +80,9 @@ class BookkeeperTransientSuiteEager extends AnyWordSpec { def getBookkeeper: BookkeeperBase = { val bk = new BookkeeperNull - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318836, isTableTransient = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318837, isTableTransient = true) + bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true, overwrite = true) + bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318836, isTableTransient = true, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318837, isTableTransient = true, overwrite = true) bk } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala index 545b70180..881b57b4e 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala @@ -188,7 +188,7 @@ class TaskRunnerMultithreadedSuite extends AnyWordSpec with SparkTestBase { val state = new PipelineStateSpy - bookkeeper.setRecordCount("table_out", runDate.minusDays(1), 1, 1, 0, 0, isTableTransient = false) + bookkeeper.setRecordCount("table_out", runDate.minusDays(1), 1, 1, 0, 0, isTableTransient = false, overwrite = true) val stats = MetaTableStats(Some(2), None, Some(100)) From 
cc942c1c6937350cfe4d570ff1510ca58d2e672f Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 2 Jan 2026 12:59:55 +0100 Subject: [PATCH 08/12] #677 Add the number of records appended to bookkeeping records. --- .../co/absa/pramen/api/MetaTableRunInfo.scala | 1 + .../pramen/core/bookkeeper/Bookkeeper.scala | 4 +- .../core/bookkeeper/BookkeeperBase.scala | 11 +- .../core/bookkeeper/BookkeeperDeltaBase.scala | 13 +- .../core/bookkeeper/BookkeeperJdbc.scala | 24 ++-- .../core/bookkeeper/BookkeeperMongoDb.scala | 24 ++-- .../core/bookkeeper/BookkeeperNull.scala | 1 + .../core/bookkeeper/BookkeeperText.scala | 3 +- .../bookkeeper/model/BookkeepingRecord.scala | 4 +- .../model/BookkeepingRecordMongoDb.scala | 28 +++++ .../bookkeeper/model/BookkeepingRecords.scala | 4 +- .../pramen/core/metastore/MetastoreImpl.scala | 8 +- .../core/metastore/MetastoreReaderBase.scala | 1 + .../co/absa/pramen/core/model/DataChunk.scala | 13 ++ .../pipeline/PythonTransformationJob.scala | 4 +- .../absa/pramen/core/pipeline/SinkJob.scala | 4 +- .../za/co/absa/pramen/core/rdb/PramenDb.scala | 1 + .../core/runner/task/TaskRunnerBase.scala | 2 +- .../bookkeeper/BookkeeperDeltaTable.scala | 117 +++++++++++++++++ .../bookkeeper/BookkeeperDeltaTable.scala | 0 .../bookkeeper/BookkeeperDeltaTable.scala | 119 ++++++++++++++++++ .../pramen/core/mocks/DataChunkFactory.scala | 2 + .../mocks/bookkeeper/SyncBookkeeperMock.scala | 15 +-- .../core/pipeline/IngestionJobSuite.scala | 12 +- .../core/pipeline/TransferJobSuite.scala | 4 +- .../core/source/SourceValidationSuite.scala | 2 +- .../bookkeeper/BookkeeperCommonSuite.scala | 48 +++---- .../tests/bookkeeper/BookkeeperMemSuite.scala | 18 +-- .../bookkeeper/BookkeeperNullSuite.scala | 6 +- .../BookkeeperTransientSuiteEager.scala | 8 +- .../TaskRunnerMultithreadedSuite.scala | 2 +- 31 files changed, 394 insertions(+), 109 deletions(-) create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala create mode 100644 pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala rename pramen/core/src/main/{scala => scala_2.12}/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala (100%) create mode 100644 pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala index 36267477f..c3b7d35d0 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableRunInfo.scala @@ -23,5 +23,6 @@ case class MetaTableRunInfo(tableName: String, batchId: Long, inputRecordCount: Long, outputRecordCount: Long, + appendedRecordCount: Option[Long], jobStarted: Instant, jobFinished: Instant) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index facc820b4..a7ef79e9b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -52,10 +52,10 @@ trait Bookkeeper { infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, + appendedRecordCount: Option[Long], jobStarted: Long, jobFinished: Long, - isTableTransient: Boolean, - overwrite: Boolean): Unit + isTableTransient: Boolean): Unit
private[pramen] def saveSchema(table: String, infoDate: LocalDate, schema: StructType): Unit diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala index 749f11270..2c54fc660 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala @@ -38,6 +38,7 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, + recordsAppended: Option[Long], jobStarted: Long, jobFinished: Long): Unit @@ -45,21 +46,21 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, + recordsAppended: Option[Long], jobStarted: Long, jobFinished: Long, - isTableTransient: Boolean, - overwrite: Boolean): Unit = { + isTableTransient: Boolean): Unit = { if (isTableTransient || !isBookkeepingEnabled) { val tableLowerCase = table.toLowerCase - val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) + val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) this.synchronized { val dataChunks = transientDataChunks.getOrElse(tableLowerCase, Array.empty[DataChunk]) val newDataChunks = (dataChunks :+ dataChunk).sortBy(_.jobFinished) transientDataChunks += tableLowerCase -> newDataChunks } } else { - saveRecordCountToStorage(table, infoDate, inputRecordCount, outputRecordCount, jobStarted, jobFinished) - if (overwrite) { + saveRecordCountToStorage(table, infoDate, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished) + if (recordsAppended.isEmpty) { deleteNonCurrentBatchRecords(table, infoDate) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala index 6651eb2e7..5738963ab 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala @@ -76,14 +76,15 @@ abstract class BookkeeperDeltaBase(batchId: Long) extends BookkeeperHadoop(batch } final private[pramen] override def saveRecordCountToStorage(table: String, - infoDate: LocalDate, - inputRecordCount: Long, - outputRecordCount: Long, - jobStarted: Long, - jobFinished: Long): Unit = { + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long): Unit = { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) saveRecordCountDelta(chunk) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index f152f3695..fe99366f8 100644 --- 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -55,7 +55,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b val chunks = try { SlickUtils.executeQuery[BookkeepingRecords, BookkeepingRecord](db, query) - .map(toChunk) + .map(DataChunk.fromRecord) } catch { case NonFatal(ex) => throw new RuntimeException(s"Unable to read from the bookkeeping table.", ex) } @@ -73,7 +73,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b try { SlickUtils.executeQuery[BookkeepingRecords, BookkeepingRecord](db, query) - .map(toChunk) + .map(DataChunk.fromRecord) .toArray[DataChunk] .sortBy(_.jobFinished) } catch { @@ -88,7 +88,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b try { val records = SlickUtils.executeQuery[BookkeepingRecords, BookkeepingRecord](db, query) - .map(toChunk) + .map(DataChunk.fromRecord) .toArray[DataChunk] if (records.length > 1) @@ -113,14 +113,15 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b } override def saveRecordCountToStorage(table: String, - infoDate: LocalDate, - inputRecordCount: Long, - outputRecordCount: Long, - jobStarted: Long, - jobFinished: Long): Unit = { + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId)) + val record = BookkeepingRecord(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, Option(batchId)) try { db.run( @@ -155,11 +156,6 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b offsetManagement } - private def toChunk(r: BookkeepingRecord): DataChunk = { - DataChunk( - r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) - } - private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Query[BookkeepingRecords, BookkeepingRecord, Seq] = { val baseFilter = (infoDateBeginOpt, infoDateEndOpt) match { case (Some(infoDateBegin), Some(infoDateEnd)) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index f41a56f26..03a55af17 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -25,6 +25,7 @@ import org.mongodb.scala.bson.codecs.Macros._ import org.mongodb.scala.bson.conversions.Bson import org.mongodb.scala.model.{Filters, Sorts} import org.slf4j.LoggerFactory +import za.co.absa.pramen.core.bookkeeper.model.{BookkeepingRecord, BookkeepingRecordMongoDb} import za.co.absa.pramen.core.dao.MongoDb import za.co.absa.pramen.core.dao.model.{ASC, IndexField} import za.co.absa.pramen.core.model.{DataChunk, TableSchema} @@ -41,18 +42,17 @@ object BookkeeperMongoDb { } class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) extends 
BookkeeperBase(true, batchId) { - import BookkeeperMongoDb._ import za.co.absa.pramen.core.dao.ScalaMongoImplicits._ private val log = LoggerFactory.getLogger(this.getClass) - private val codecRegistry: CodecRegistry = fromRegistries(fromProviders(classOf[DataChunk], classOf[TableSchema]), DEFAULT_CODEC_REGISTRY) + private val codecRegistry: CodecRegistry = fromRegistries(fromProviders(classOf[BookkeepingRecordMongoDb], classOf[TableSchema]), DEFAULT_CODEC_REGISTRY) private val db = mongoDbConnection.getDatabase initCollection() - private val collection = db.getCollection[DataChunk](collectionName).withCodecRegistry(codecRegistry) + private val collection = db.getCollection[BookkeepingRecordMongoDb](collectionName).withCodecRegistry(codecRegistry) private val schemaCollection = db.getCollection[TableSchema](schemaCollectionName).withCodecRegistry(codecRegistry) override val bookkeepingEnabled: Boolean = true @@ -86,6 +86,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext .limit(1) .execute() .headOption + .map(DataChunk.fromRecordMongo) } def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { @@ -96,20 +97,21 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext val chunks = collection.find(getFilter(table, Option(infoDate), Option(infoDate), batchId)).execute() .sortBy(_.jobFinished) log.info(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") - chunks + chunks.map(DataChunk.fromRecordMongo) } override def saveRecordCountToStorage(table: String, - infoDate: LocalDate, - inputRecordCount: Long, - outputRecordCount: Long, - jobStarted: Long, - jobFinished: Long): Unit = { + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) + val record = BookkeepingRecordMongoDb(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, Option(batchId)) - collection.insertOne(chunk).execute() + collection.insertOne(record).execute() } override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala index a1983129c..3b69b01fe 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperNull.scala @@ -41,6 +41,7 @@ class BookkeeperNull() extends BookkeeperBase(false, 0L) { infoDate: LocalDate, inputRecordCount: Long, outputRecordCount: Long, + recordsAppended: Option[Long], jobStarted: Long, jobFinished: Long): Unit = {} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index d264fc23e..1052bb90d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -95,6 +95,7 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa infoDate: 
LocalDate, inputRecordCount: Long, outputRecordCount: Long, + recordsAppended: Option[Long], jobStarted: Long, jobFinished: Long): Unit = { val lock: TokenLockHadoopPath = getLock @@ -102,7 +103,7 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa try { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, batchId) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) val csv = CsvUtils.getRecord(chunk, '|') fsUtils.appendFile(bkFilePath, csv) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala index 5c0bee4c4..8d2e387ec 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecord.scala @@ -22,9 +22,7 @@ case class BookkeepingRecord(pramenTableName: String, infoDateEnd: String, inputRecordCount: Long, outputRecordCount: Long, + appendedRecordCount: Option[Long], jobStarted: Long, jobFinished: Long, batchId: Option[Long]) - - - diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala new file mode 100644 index 000000000..6fe29a32e --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.pramen.core.bookkeeper.model + +case class BookkeepingRecordMongoDb(tableName: String, + infoDate: String, /* Use String to workaround serialization issues */ + infoDateBegin: String, + infoDateEnd: String, + inputRecordCount: Long, + outputRecordCount: Long, + appendedRecordCount: Option[Long], + jobStarted: Long, + jobFinished: Long, + batchId: Option[Long]) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala index d05ffd7c6..fa44a1056 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala @@ -26,11 +26,13 @@ class BookkeepingRecords(tag: Tag) extends Table[BookkeepingRecord](tag, "bookke def infoDateEnd = column[String]("info_date_end", O.Length(20)) def inputRecordCount = column[Long]("input_record_count") def outputRecordCount = column[Long]("output_record_count") + def appendedRecordCount = column[Option[Long]]("appended_record_count") def jobStarted = column[Long]("job_started") def jobFinished = column[Long]("job_finished") def batchId = column[Option[Long]]("batch_id") + def writeMode = column[Option[Int]]("write_mode") def * = (pramenTableName, infoDate, infoDateBegin, infoDateEnd, - inputRecordCount, outputRecordCount, + inputRecordCount, outputRecordCount, appendedRecordCount, jobStarted, jobFinished, batchId) <> (BookkeepingRecord.tupled, BookkeepingRecord.unapply) def idx1 = index("bk_idx_1", (pramenTableName, infoDate), unique = false) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala index c5ae0d247..47bf65856 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreImpl.scala @@ -141,11 +141,11 @@ class MetastoreImpl(appConfig: Config, stats.recordCount.foreach{recordCount => if (!skipBookKeepingUpdates && !nothingAppended) { val overwrite = saveModeOverride.contains(SaveMode.Overwrite) - val recordCountToRecord = if (overwrite) - recordCount + val recordsAppended = if (overwrite) + None else - stats.recordCountAppended.getOrElse(recordCount) - bookkeeper.setRecordCount(tableName, infoDate, inputRecordCount.getOrElse(recordCountToRecord), recordCountToRecord, start, finish, isTransient, overwrite) + stats.recordCountAppended + bookkeeper.setRecordCount(tableName, infoDate, inputRecordCount.getOrElse(recordCount), recordCount, recordsAppended, start, finish, isTransient) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala index 91c30cdd5..41c136529 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala @@ -79,6 +79,7 @@ abstract class MetastoreReaderBase(metastore: Metastore, chunk.batchId, chunk.inputRecordCount, chunk.outputRecordCount, + chunk.appendedRecordCount, Instant.ofEpochSecond(chunk.jobStarted), Instant.ofEpochSecond(chunk.jobFinished) ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala 
b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala index 25d472001..db6a2a773 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala @@ -16,6 +16,8 @@ package za.co.absa.pramen.core.model +import za.co.absa.pramen.core.bookkeeper.model.{BookkeepingRecord, BookkeepingRecordMongoDb} + import java.time.format.DateTimeFormatter case class DataChunk(tableName: String, @@ -24,6 +26,7 @@ case class DataChunk(tableName: String, infoDateEnd: String, inputRecordCount: Long, outputRecordCount: Long, + appendedRecordCount: Option[Long], jobStarted: Long, jobFinished: Long, batchId: Long) @@ -33,4 +36,14 @@ object DataChunk { /* This is how info dates are stored */ val datePersistFormat = "yyyy-MM-dd" val dateFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern(datePersistFormat) + + def fromRecord(r: BookkeepingRecord): DataChunk = { + DataChunk( + r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.appendedRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) + } + + def fromRecordMongo(r: BookkeepingRecordMongoDb): DataChunk = { + DataChunk( + r.tableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.appendedRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) + } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala index 765f220e0..75482b42d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/PythonTransformationJob.scala @@ -144,10 +144,10 @@ class PythonTransformationJob(operationDef: OperationDef, infoDate, recordCount, recordCount, + None, jobStarted.getEpochSecond, jobFinished.getEpochSecond, - isTableTransient = false, - overwrite = true) + isTableTransient = false) } SaveResult(stats) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala index 2b670547a..20bc2b214 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/pipeline/SinkJob.scala @@ -166,10 +166,10 @@ class SinkJob(operationDef: OperationDef, infoDate, inputRecordCount.getOrElse(sinkResult.recordsSent), sinkResult.recordsSent, + Option(sinkResult.recordsSent), jobStarted.getEpochSecond, jobFinished.getEpochSecond, - isTransient, - overwrite = false + isTransient ) if (isIncremental) { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala index b966903a6..5f686a106 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/rdb/PramenDb.scala @@ -96,6 +96,7 @@ class PramenDb(val jdbcConfig: JdbcConfig, if (0 < dbVersion && dbVersion < 9) { addColumn(BookkeepingRecords.records.baseTableRow.tableName, "batch_id", "bigint") + addColumn(BookkeepingRecords.records.baseTableRow.tableName, "appended_record_count", "bigint") addColumn(JournalTasks.journalTasks.baseTableRow.tableName, "batch_id", "bigint") } } diff --git 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala index f78f0835b..3a79b39e8 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/runner/task/TaskRunnerBase.scala @@ -327,7 +327,7 @@ abstract class TaskRunnerBase(conf: Config, log.info(s"SKIP validation failure for the task: $outputTableName for date: ${task.infoDate}. Reason: $msg") if (bookkeeper.getLatestDataChunk(outputTableName, task.infoDate).isEmpty) { val isTransient = task.job.outputTable.format.isTransient - bookkeeper.setRecordCount(outputTableName, task.infoDate, status.inputRecordsCount.getOrElse(0L), 0, started.getEpochSecond, Instant.now().getEpochSecond, isTransient, overwrite = false) + bookkeeper.setRecordCount(outputTableName, task.infoDate, status.inputRecordsCount.getOrElse(0L), 0, None, started.getEpochSecond, Instant.now().getEpochSecond, isTransient) } Left(TaskResult(task.job.taskDef, RunStatus.Skipped(msg), getRunInfo(task.infoDate, started), applicationId, isTransient, isRawFileBased, newSchemaRegistered = false, Nil, status.dependencyWarnings, Nil, options)) case Reason.SkipOnce(msg) => diff --git a/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala new file mode 100644 index 000000000..85f5a9d28 --- /dev/null +++ b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -0,0 +1,117 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.pramen.core.bookkeeper + +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.{Column, Dataset, SaveMode, SparkSession} +import za.co.absa.pramen.core.bookkeeper.model.TableSchemaJson +import za.co.absa.pramen.core.model.{DataChunk, TableSchema} + +import java.time.{Instant, LocalDate} +import scala.reflect.ClassTag +import scala.reflect.runtime.universe + +object BookkeeperDeltaTable { + val recordsTable = "bookkeeping" + val schemasTable = "schemas" + + def getFullTableName(databaseOpt: Option[String], tablePrefix: String, tableName: String): String = { + databaseOpt match { + case Some(db) => s"$db.$tablePrefix$tableName" + case None => s"$tablePrefix$tableName" + } + } +} + +class BookkeeperDeltaTable(database: Option[String], + tablePrefix: String, + batchId: Long) + (implicit spark: SparkSession) extends BookkeeperDeltaBase(batchId) { + import BookkeeperDeltaTable._ + import spark.implicits._ + + private val recordsFullTableName = getFullTableName(database, tablePrefix, recordsTable) + private val schemasFullTableName = getFullTableName(database, tablePrefix, schemasTable) + + init() + + override def getBkDf(filter: Column): Dataset[DataChunk] = { + val df = spark.table(recordsFullTableName).as[DataChunk] + + df.filter(filter) + .orderBy(col("jobFinished")) + .as[DataChunk] + } + + override def saveRecordCountDelta(dataChunks: DataChunk): Unit = { + val df = Seq(dataChunks).toDF() + + df.write + .format("delta") + .mode(SaveMode.Append) + .option("mergeSchema", "true") + .saveAsTable(recordsFullTableName) + } + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val infoDateStr = DataChunk.dateFormatter.format(infoDate) + val filter = (col("tableName") === lit(table)) && (col("infoDate") === lit(infoDateStr)) && (col("batchId") =!= lit(batchId)) + + spark.sql(s"DELETE FROM $recordsFullTableName WHERE tableName='$table' AND infoDate='$infoDateStr' AND batchId != $batchId").count() + } + + override def getSchemasDeltaDf: Dataset[TableSchemaJson] = { + spark.table(schemasFullTableName).as[TableSchemaJson] + } + + override def saveSchemaDelta(schema: TableSchema): Unit = { + val df = Seq( + TableSchemaJson(schema.tableName, schema.infoDate, schema.schemaJson, Instant.now().toEpochMilli) + ).toDF() + + df.write + .format("delta") + .mode(SaveMode.Append) + .option("mergeSchema", "true") + .saveAsTable(schemasFullTableName) + } + + override def writeEmptyDataset[T <: Product : universe.TypeTag : ClassTag](pathOrTable: String): Unit = { + val df = Seq.empty[T].toDS + + df.write + .format("delta") + .saveAsTable(pathOrTable) + } + + def init(): Unit = { + initRecordsDirectory() + initSchemasDirectory() + } + + private def initRecordsDirectory(): Unit = { + if (!spark.catalog.tableExists(recordsFullTableName)) { + writeEmptyDataset[DataChunk](recordsFullTableName) + } + } + + private def initSchemasDirectory(): Unit = { + if (!spark.catalog.tableExists(schemasFullTableName)) { + writeEmptyDataset[TableSchemaJson](schemasFullTableName) + } + } +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala similarity index 100% rename from pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala rename to pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala diff --git
a/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala new file mode 100644 index 000000000..be876ab9f --- /dev/null +++ b/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -0,0 +1,119 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.bookkeeper + +import io.delta.tables.DeltaTable +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.{Column, Dataset, SaveMode, SparkSession} +import za.co.absa.pramen.core.bookkeeper.model.TableSchemaJson +import za.co.absa.pramen.core.model.{DataChunk, TableSchema} + +import java.time.{Instant, LocalDate} +import scala.reflect.ClassTag +import scala.reflect.runtime.universe + +object BookkeeperDeltaTable { + val recordsTable = "bookkeeping" + val schemasTable = "schemas" + + def getFullTableName(databaseOpt: Option[String], tablePrefix: String, tableName: String): String = { + databaseOpt match { + case Some(db) => s"$db.$tablePrefix$tableName" + case None => s"$tablePrefix$tableName" + } + } +} + +class BookkeeperDeltaTable(database: Option[String], + tablePrefix: String, + batchId: Long) + (implicit spark: SparkSession) extends BookkeeperDeltaBase(batchId) { + import BookkeeperDeltaTable._ + import spark.implicits._ + + private val recordsFullTableName = getFullTableName(database, tablePrefix, recordsTable) + private val schemasFullTableName = getFullTableName(database, tablePrefix, schemasTable) + + init() + + override def getBkDf(filter: Column): Dataset[DataChunk] = { + val df = spark.table(recordsFullTableName).as[DataChunk] + + df.filter(filter) + .orderBy(col("jobFinished")) + .as[DataChunk] + } + + override def saveRecordCountDelta(dataChunks: DataChunk): Unit = { + val df = Seq(dataChunks).toDF() + + df.write + .format("delta") + .mode(SaveMode.Append) + .option("mergeSchema", "true") + .saveAsTable(recordsFullTableName) + } + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + val infoDateStr = DataChunk.dateFormatter.format(infoDate) + val filter = (col("tableName") === lit(table)) && (col("infoDate") === lit(infoDateStr)) && (col("batchId") =!= lit(batchId)) + + val deltaTable = DeltaTable.forName(spark, recordsFullTableName) + deltaTable.delete(filter) + } + + override def getSchemasDeltaDf: Dataset[TableSchemaJson] = { + spark.table(schemasFullTableName).as[TableSchemaJson] + } + + override def saveSchemaDelta(schema: TableSchema): Unit = { + val df = Seq( + TableSchemaJson(schema.tableName, schema.infoDate, schema.schemaJson, Instant.now().toEpochMilli) + ).toDF() + + df.write + .format("delta") + .mode(SaveMode.Append) + .option("mergeSchema", "true") + .saveAsTable(schemasFullTableName) + } + + override def writeEmptyDataset[T <: Product : universe.TypeTag : ClassTag](pathOrTable: String): Unit = { + val df 
= Seq.empty[T].toDS + + df.write + .format("delta") + .saveAsTable(pathOrTable) + } + + def init(): Unit = { + initRecordsDirectory() + initSchemasDirectory() + } + + private def initRecordsDirectory(): Unit = { + if (!spark.catalog.tableExists(recordsFullTableName)) { + writeEmptyDataset[DataChunk](recordsFullTableName) + } + } + + private def initSchemasDirectory(): Unit = { + if (!spark.catalog.tableExists(schemasFullTableName)) { + writeEmptyDataset[TableSchemaJson](schemasFullTableName) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala index 76d33a794..b353d287c 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala @@ -23,6 +23,7 @@ object DataChunkFactory { infoDate: String = "2022-01-15", inputRecordCount: Long = 1000, outputRecordCount: Long = 1000, + recordsAppended: Option[Long] = None, jobStarted: Long = 10000, jobFinished: Long = 20000, batchId: Long = 123L): DataChunk = { @@ -32,6 +33,7 @@ object DataChunkFactory { infoDateEnd = infoDate, inputRecordCount = inputRecordCount, outputRecordCount = outputRecordCount, + appendedRecordCount = recordsAppended, jobStarted = jobStarted, jobFinished = jobFinished, batchId = batchId) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index 57481188d..986c86bcc 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -91,13 +91,13 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { } private[pramen] override def setRecordCount(table: String, - infoDate: LocalDate, - inputRecordCount: Long, - outputRecordCount: Long, - jobStarted: Long, - jobFinished: Long, - isTableTransient: Boolean, - overwrite: Boolean): Unit = { + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long, + isTableTransient: Boolean): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) val chunk = DataChunk(table, @@ -106,6 +106,7 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { dateStr, inputRecordCount, outputRecordCount, + recordsAppended, jobStarted, jobFinished, batchId) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala index fdf5f599e..9285828bd 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/IngestionJobSuite.scala @@ -122,7 +122,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "track already ran" in { val (bk, _, job) = getUseCase() - bk.setRecordCount("table1", infoDate, 4, 4, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", infoDate, 4, 4, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -133,7 +133,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "some records, default 
minimum records" in { val (bk, _, job) = getUseCase() - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", infoDate, 100, 100, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -143,7 +143,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "some records, custom minimum records" in { val (bk, _, job) = getUseCase(minRecords = Some(3)) - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", infoDate, 100, 100, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -199,7 +199,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis "needs update, some records, custom minimum records" in { val (bk, _, job) = getUseCase(minRecords = Some(5)) - bk.setRecordCount("table1", infoDate, 100, 100, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", infoDate, 100, 100, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -214,7 +214,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis val noDataInfoDate = infoDate.plusDays(1) - bk.setRecordCount("table1", noDataInfoDate, 4, 4, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", noDataInfoDate, 4, 4, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(noDataInfoDate, runReason, conf, Nil) @@ -226,7 +226,7 @@ class IngestionJobSuite extends AnyWordSpec with SparkTestBase with TextComparis val noDataInfoDate = infoDate.plusDays(1) - bk.setRecordCount("table1", noDataInfoDate, 30, 30, 123, 456, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1", noDataInfoDate, 30, 30, None, 123, 456, isTableTransient = false) val result = job.preRunCheckJob(noDataInfoDate, runReason, conf, Nil) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala index 7434ce8c7..98dcb4596 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/pipeline/TransferJobSuite.scala @@ -72,7 +72,7 @@ class TransferJobSuite extends AnyWordSpec with SparkTestBase with TextCompariso " the number of records do not match" in { val (job, bk) = getUseCase(numberOfRecords = 7) - bk.setRecordCount("table1->sink", infoDate, 3, 3, 10000, 1001, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1->sink", infoDate, 3, 3, None, 10000, 1001, isTableTransient = false) val actual = job.preRunCheckJob(infoDate, runReason, conf, Nil) @@ -82,7 +82,7 @@ class TransferJobSuite extends AnyWordSpec with SparkTestBase with TextCompariso "return AlreadyRan when the number of records didn't change" in { val (job, bk) = getUseCase(numberOfRecords = 7) - bk.setRecordCount("table1->sink", infoDate, 7, 3, 10000, 1001, isTableTransient = false, overwrite = true) + bk.setRecordCount("table1->sink", infoDate, 7, 3, None, 10000, 1001, isTableTransient = false) val actual = job.preRunCheckJob(infoDate, runReason, conf, Nil) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala 
b/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala index ff230e46c..5060cda39 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/source/SourceValidationSuite.scala @@ -137,7 +137,7 @@ class SourceValidationSuite extends AnyWordSpec with BeforeAndAfterAll with Temp val state = new PipelineStateSpy - bookkeeper.setRecordCount("table_out", runDateIn.minusDays(1), 1, 1, 0, 0, isTableTransient = false, overwrite = true) + bookkeeper.setRecordCount("table_out", runDateIn.minusDays(1), 1, 1, None, 0, 0, isTableTransient = false) val sourceTable = SourceTableFactory.getDummySourceTable() diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala index d8d7fd119..81841467b 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala @@ -49,7 +49,7 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return a date when there is an entry" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) val dateOpt = bk.getLatestProcessedDate("table") @@ -60,7 +60,7 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return None if the passed date is too old" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) val dateOpt = bk.getLatestProcessedDate("table", Some(infoDate1)) @@ -70,7 +70,7 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return a date when there is an entry and until date is passed" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) val dateOpt = bk.getLatestProcessedDate("table", Some(infoDate2)) @@ -81,9 +81,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return the latest date when there are several dates" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318830, 1597318835, isTableTransient = false) val dateOpt = bk.getLatestProcessedDate("table") @@ -102,9 +102,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return the latest date from the specified periods" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 
1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318831, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318832, 1597318836, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318833, 1597318837, isTableTransient = false) val chunkOpt = bk.getLatestDataChunk("table", infoDate3) val infoDate3Str = infoDate3.format(DataChunk.dateFormatter) @@ -131,9 +131,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { "return the number of entries if there are entries" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318833, 1597318837, isTableTransient = false) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318831, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318832, 1597318836, isTableTransient = false) val chunksCount = bk.getDataChunksCount("table", Option(infoDate1), Option(infoDate2)) @@ -145,8 +145,8 @@ class BookkeeperCommonSuite extends AnyWordSpec { "add a new entry for the same batch" in { val bk = getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 100, 10, None, 1597318833, 1597318837, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 200, 20, None, 1597318838, 1597318839, isTableTransient = false) val latestChunkOpt = bk.getLatestDataChunk("table", infoDate1) val chunks = bk.getDataChunks("table", infoDate1, None) @@ -165,8 +165,8 @@ class BookkeeperCommonSuite extends AnyWordSpec { // The feature is not implemented in these implementations assume(!bk1.isInstanceOf[BookkeeperText]) - bk1.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) - bk2.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) + bk1.setRecordCount("table", infoDate1, 100, 10, None, 1597318833, 1597318837, isTableTransient = false) + bk2.setRecordCount("table", infoDate1, 200, 20, None, 1597318838, 1597318839, isTableTransient = false) val latestChunkOpt = bk2.getLatestDataChunk("table", infoDate1) val chunks = bk2.getDataChunks("table", infoDate1, None) @@ -182,8 +182,8 @@ class BookkeeperCommonSuite extends AnyWordSpec { val bk1 = getBookkeeper(123L) val bk2 = getBookkeeper(456L) - bk1.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = false) - bk2.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = false) + bk1.setRecordCount("table", infoDate1, 100, 10, Some(1), 1597318833, 1597318837, isTableTransient = false) + 
bk2.setRecordCount("table", infoDate1, 200, 20, Some(1), 1597318838, 1597318839, isTableTransient = false) val latestChunkOpt = bk2.getLatestDataChunk("table", infoDate1) val chunks = bk2.getDataChunks("table", infoDate1, None) @@ -252,14 +252,14 @@ class BookkeeperCommonSuite extends AnyWordSpec { val bk2 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper(123L) val bk3 = if (bk.isInstanceOf[BookkeeperDeltaTable]) bk else getBookkeeper(123L) - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318840, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318841, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 100, 10, None, 1597318833, 1597318840, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 200, 20, None, 1597318838, 1597318841, isTableTransient = false) - bk2.setRecordCount("table", infoDate2, 101, 10, 1597318833, 1597318843, isTableTransient = false, overwrite = true) - bk2.setRecordCount("table", infoDate3, 201, 20, 1597318838, 1597318844, isTableTransient = false, overwrite = true) + bk2.setRecordCount("table", infoDate2, 101, 10, None, 1597318833, 1597318843, isTableTransient = false) + bk2.setRecordCount("table", infoDate3, 201, 20, None, 1597318838, 1597318844, isTableTransient = false) - bk3.setRecordCount("table", infoDate3, 102, 10, 1597318833, 1597318842, isTableTransient = false, overwrite = true) - bk3.setRecordCount("table", infoDate2, 202, 20, 1597318838, 1597318843, isTableTransient = false, overwrite = true) + bk3.setRecordCount("table", infoDate3, 102, 10, None, 1597318833, 1597318842, isTableTransient = false) + bk3.setRecordCount("table", infoDate2, 202, 20, None, 1597318838, 1597318843, isTableTransient = false) val chunk1 = bk.getLatestDataChunk("table", infoDate3) val chunk2 = bk2.getLatestDataChunk("table", infoDate3) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala index 65bdef385..bf3c82054 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperMemSuite.scala @@ -43,7 +43,7 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return a date when there is an entry" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) val dateOpt = bk.getLatestProcessedDate("table") @@ -52,9 +52,9 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return the latest date when there are several dates" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318835, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318830, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318830, 1597318835, isTableTransient = 
false) val dateOpt = bk.getLatestProcessedDate("table") @@ -69,9 +69,9 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { } "return the latest date from the specified periods" in { - bk.setRecordCount("table", infoDate2, 100, 10, 1597318831, 1597318835, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318832, 1597318836, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318833, 1597318837, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318831, 1597318835, isTableTransient = false) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318832, 1597318836, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318833, 1597318837, isTableTransient = false) val chunkOpt = bk.getLatestDataChunk("table", infoDate3) val infoDate3Str = infoDate3.format(DataChunk.dateFormatter) @@ -90,8 +90,8 @@ class BookkeeperMemSuite extends AnyWordSpec with BeforeAndAfter { "setRecordCount()" should { "overwrite the previous entry" in { - bk.setRecordCount("table", infoDate1, 100, 10, 1597318833, 1597318837, isTableTransient = false, overwrite = true) - bk.setRecordCount("table", infoDate1, 200, 20, 1597318838, 1597318839, isTableTransient = false, overwrite = true) + bk.setRecordCount("table", infoDate1, 100, 10, None, 1597318833, 1597318837, isTableTransient = false) + bk.setRecordCount("table", infoDate1, 200, 20, None, 1597318838, 1597318839, isTableTransient = false) val chunks = bk.asInstanceOf[SyncBookkeeperMock].getDataChunks("table", infoDate1, None) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperNullSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperNullSuite.scala index aac61b580..9518feb5b 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperNullSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperNullSuite.scala @@ -69,9 +69,9 @@ class BookkeeperNullSuite extends AnyWordSpec { def getBookkeeper: BookkeeperBase = { val bk = new BookkeeperNull - bk.saveRecordCountToStorage("table", infoDate2, 100, 10, 1597318830, 1597318835) - bk.saveRecordCountToStorage("table", infoDate3, 200, 20, 1597318830, 1597318835) - bk.saveRecordCountToStorage("table", infoDate1, 400, 40, 1597318830, 1597318835) + bk.saveRecordCountToStorage("table", infoDate2, 100, 10, None, 1597318830, 1597318835) + bk.saveRecordCountToStorage("table", infoDate3, 200, 20, None, 1597318830, 1597318835) + bk.saveRecordCountToStorage("table", infoDate1, 400, 40, None, 1597318830, 1597318835) bk } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala index 0d2f25e80..e79154777 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperTransientSuiteEager.scala @@ -69,7 +69,7 @@ class BookkeeperTransientSuiteEager extends AnyWordSpec { "setRecordCount()" should { "return the newly added record" in { val bk = getBookkeeper - bk.setRecordCount("table1", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true, overwrite = true) + bk.setRecordCount("table1", infoDate2, 100, 
10, None, 1597318830, 1597318835, isTableTransient = true) val chunks = bk.getTransientDataChunks("table1", Option(infoDate2), Option(infoDate2), None) @@ -80,9 +80,9 @@ def getBookkeeper: BookkeeperBase = { val bk = new BookkeeperNull - bk.setRecordCount("table", infoDate2, 100, 10, 1597318830, 1597318835, isTableTransient = true, overwrite = true) - bk.setRecordCount("table", infoDate3, 200, 20, 1597318830, 1597318836, isTableTransient = true, overwrite = true) - bk.setRecordCount("table", infoDate1, 400, 40, 1597318830, 1597318837, isTableTransient = true, overwrite = true) + bk.setRecordCount("table", infoDate2, 100, 10, None, 1597318830, 1597318835, isTableTransient = true) + bk.setRecordCount("table", infoDate3, 200, 20, None, 1597318830, 1597318836, isTableTransient = true) + bk.setRecordCount("table", infoDate1, 400, 40, None, 1597318830, 1597318837, isTableTransient = true) bk } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala index 881b57b4e..ecee53c74 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/runner/jobrunner/TaskRunnerMultithreadedSuite.scala @@ -188,7 +188,7 @@ class TaskRunnerMultithreadedSuite extends AnyWordSpec with SparkTestBase { val state = new PipelineStateSpy - bookkeeper.setRecordCount("table_out", runDate.minusDays(1), 1, 1, 0, 0, isTableTransient = false, overwrite = true) + bookkeeper.setRecordCount("table_out", runDate.minusDays(1), 1, 1, None, 0, 0, isTableTransient = false) val stats = MetaTableStats(Some(2), None, Some(100)) From e41464ac1bde6fe48804e98e4191fad8a15d4df1 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 2 Jan 2026 15:13:26 +0100 Subject: [PATCH 09/12] #677 Fix PR suggestions, improve compatibility of bookkeeping for non-relational storage.
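DataChunk keeps the original column order and only appends the new optional fields (batchId, appendedRecordCount) at the end, because CSV-based bookkeeping (BookkeeperText) serializes records with CsvUtils.getRecord, which reflects over the declared fields; reordering them would break files written by older versions. The sketch below is illustrative only: the Example case class and toCsvRecord helper are hypothetical and not part of this patch, but they mirror the reflection approach of CsvUtils.getRecord and show how optional fields are expected to render (None becomes an empty CSV cell).

  // Hypothetical, minimal re-implementation of the CsvUtils.getRecord idea, for illustration only.
  // Fields are emitted in declaration order, which is why DataChunk keeps its column order stable.
  case class Example(tableName: String, outputRecordCount: Long, batchId: Option[Long])

  def toCsvRecord(obj: AnyRef, separator: Char): String =
    obj.getClass.getDeclaredFields
      .filterNot(_.getName.contains('$'))
      .map { field =>
        field.setAccessible(true)
        field.get(obj) match {
          case Some(x) => x.toString // unwrap defined optional values
          case None    => ""         // missing optional values become empty cells
          case x       => x.toString
        }
      }
      .map(_.replace(separator, ' '))
      .mkString("", s"$separator", "\n")

  // toCsvRecord(Example("table1", 100L, Some(123L)), '|') == "table1|100|123\n"
  // toCsvRecord(Example("table1", 100L, None), '|')       == "table1|100|\n"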
--- pramen/build.sbt | 2 +- .../core/bookkeeper/BookkeeperBase.scala | 2 +- .../core/bookkeeper/BookkeeperDeltaBase.scala | 2 +- .../core/bookkeeper/BookkeeperMongoDb.scala | 10 +++---- .../core/bookkeeper/BookkeeperText.scala | 2 +- .../model/BookkeepingRecordMongoDb.scala | 28 ------------------- .../bookkeeper/model/BookkeepingRecords.scala | 1 - .../core/journal/JournalHadoopCsv.scala | 2 +- .../core/journal/model/TaskCompletedCsv.scala | 2 +- .../core/metastore/MetastoreReaderBase.scala | 2 +- .../co/absa/pramen/core/model/DataChunk.scala | 21 +++++++------- .../co/absa/pramen/core/utils/CsvUtils.scala | 9 +++++- .../bookkeeper/BookkeeperDeltaTable.scala | 9 ++---- .../bookkeeper/BookkeeperDeltaTable.scala | 4 +-- .../bookkeeper/BookkeeperDeltaTable.scala | 4 +-- .../pramen/core/mocks/DataChunkFactory.scala | 6 ++-- .../mocks/bookkeeper/SyncBookkeeperMock.scala | 4 +-- .../bookkeeper/BookkeeperCommonSuite.scala | 6 +++- .../BookkeeperDeltaTableLongSuite.scala | 28 ++++++++++--------- ...PipelineNotificationBuilderHtmlSuite.scala | 2 +- 20 files changed, 62 insertions(+), 84 deletions(-) delete mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala diff --git a/pramen/build.sbt b/pramen/build.sbt index 2c962f953..7c69214b4 100644 --- a/pramen/build.sbt +++ b/pramen/build.sbt @@ -29,7 +29,7 @@ ThisBuild / scalaVersion := scala212 ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213) ThisBuild / scalacOptions := Seq("-unchecked", "-deprecation", "-target:jvm-1.8") -ThisBuild / javacOptions := Seq("-source", "1.8", "-tagret", "1.8") +ThisBuild / javacOptions := Seq("-source", "1.8", "-target", "1.8") ThisBuild / versionScheme := Some("early-semver") diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala index 2c54fc660..cb50b1174 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala @@ -52,7 +52,7 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte isTableTransient: Boolean): Unit = { if (isTableTransient || !isBookkeepingEnabled) { val tableLowerCase = table.toLowerCase - val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) + val dataChunk = DataChunk(table, infoDate.toString, infoDate.toString, infoDate.toString, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId), recordsAppended) this.synchronized { val dataChunks = transientDataChunks.getOrElse(tableLowerCase, Array.empty[DataChunk]) val newDataChunks = (dataChunks :+ dataChunk).sortBy(_.jobFinished) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala index 5738963ab..69b89eaf5 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala @@ -84,7 +84,7 @@ abstract class BookkeeperDeltaBase(batchId: Long) extends BookkeeperHadoop(batch jobFinished: Long): Unit = { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, 
outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId), recordsAppended) saveRecordCountDelta(chunk) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index 03a55af17..de9cd5b24 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -25,7 +25,6 @@ import org.mongodb.scala.bson.codecs.Macros._ import org.mongodb.scala.bson.conversions.Bson import org.mongodb.scala.model.{Filters, Sorts} import org.slf4j.LoggerFactory -import za.co.absa.pramen.core.bookkeeper.model.{BookkeepingRecord, BookkeepingRecordMongoDb} import za.co.absa.pramen.core.dao.MongoDb import za.co.absa.pramen.core.dao.model.{ASC, IndexField} import za.co.absa.pramen.core.model.{DataChunk, TableSchema} @@ -47,12 +46,12 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext private val log = LoggerFactory.getLogger(this.getClass) - private val codecRegistry: CodecRegistry = fromRegistries(fromProviders(classOf[BookkeepingRecordMongoDb], classOf[TableSchema]), DEFAULT_CODEC_REGISTRY) + private val codecRegistry: CodecRegistry = fromRegistries(fromProviders(classOf[DataChunk], classOf[TableSchema]), DEFAULT_CODEC_REGISTRY) private val db = mongoDbConnection.getDatabase initCollection() - private val collection = db.getCollection[BookkeepingRecordMongoDb](collectionName).withCodecRegistry(codecRegistry) + private val collection = db.getCollection[DataChunk](collectionName).withCodecRegistry(codecRegistry) private val schemaCollection = db.getCollection[TableSchema](schemaCollectionName).withCodecRegistry(codecRegistry) override val bookkeepingEnabled: Boolean = true @@ -86,7 +85,6 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext .limit(1) .execute() .headOption - .map(DataChunk.fromRecordMongo) } def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { @@ -97,7 +95,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext val chunks = collection.find(getFilter(table, Option(infoDate), Option(infoDate), batchId)).execute() .sortBy(_.jobFinished) log.info(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") - chunks.map(DataChunk.fromRecordMongo) + chunks } override def saveRecordCountToStorage(table: String, @@ -109,7 +107,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext jobFinished: Long): Unit = { val dateStr = DataChunk.dateFormatter.format(infoDate) - val record = BookkeepingRecordMongoDb(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, Option(batchId)) + val record = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId), recordsAppended) collection.insertOne(record).execute() } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index 1052bb90d..b6f0fef4e 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala 
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -103,7 +103,7 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa try { val dateStr = getDateStr(infoDate) - val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, recordsAppended, jobStarted, jobFinished, batchId) + val chunk = DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Option(batchId), recordsAppended) val csv = CsvUtils.getRecord(chunk, '|') fsUtils.appendFile(bkFilePath, csv) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala deleted file mode 100644 index 6fe29a32e..000000000 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecordMongoDb.scala +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2022 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package za.co.absa.pramen.core.bookkeeper.model - -case class BookkeepingRecordMongoDb(tableName: String, - infoDate: String, /* Use String to workaround serialization issues */ - infoDateBegin: String, - infoDateEnd: String, - inputRecordCount: Long, - outputRecordCount: Long, - appendedRecordCount: Option[Long], - jobStarted: Long, - jobFinished: Long, - batchId: Option[Long]) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala index fa44a1056..2562d6503 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/model/BookkeepingRecords.scala @@ -30,7 +30,6 @@ class BookkeepingRecords(tag: Tag) extends Table[BookkeepingRecord](tag, "bookke def jobStarted = column[Long]("job_started") def jobFinished = column[Long]("job_finished") def batchId = column[Option[Long]]("batch_id") - def writeMode = column[Option[Int]]("write_mode") def * = (pramenTableName, infoDate, infoDateBegin, infoDateEnd, inputRecordCount, outputRecordCount, appendedRecordCount, jobStarted, jobFinished, batchId) <> (BookkeepingRecord.tupled, BookkeepingRecord.unapply) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala index 1ec05b92e..a27b7a0e5 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalHadoopCsv.scala @@ -89,7 +89,7 @@ class JournalHadoopCsv(journalPath: String) environmentName = v.environmentName, tenant = v.tenant, country = v.country, - batchId = v.batchId + batchId = v.batchId.getOrElse(0L) )) } diff --git 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala index 8340f3e3a..ee49e758d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/model/TaskCompletedCsv.scala @@ -38,5 +38,5 @@ case class TaskCompletedCsv( environmentName: Option[String], tenant: Option[String], country: Option[String], - batchId: Long + batchId: Option[Long] ) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala index 41c136529..b082a8e54 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderBase.scala @@ -76,7 +76,7 @@ abstract class MetastoreReaderBase(metastore: Metastore, MetaTableRunInfo( tableName, LocalDate.parse(chunk.infoDate), - chunk.batchId, + chunk.batchId.getOrElse(0L), chunk.inputRecordCount, chunk.outputRecordCount, chunk.appendedRecordCount, diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala index db6a2a773..03444ab2b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/model/DataChunk.scala @@ -16,20 +16,25 @@ package za.co.absa.pramen.core.model -import za.co.absa.pramen.core.bookkeeper.model.{BookkeepingRecord, BookkeepingRecordMongoDb} +import za.co.absa.pramen.core.bookkeeper.model.BookkeepingRecord import java.time.format.DateTimeFormatter +/** + * Represents a data chunk for storing bookkeeping records in non-relational storage (e.g., CSV files). + * The order of columns must be preserved to maintain compatibility with existing CSV files + * since the field order in CSV directly depends on it. 
+ */ case class DataChunk(tableName: String, - infoDate: String, /* Use String to workaround serialization issues */ + infoDate: String, infoDateBegin: String, infoDateEnd: String, inputRecordCount: Long, outputRecordCount: Long, - appendedRecordCount: Option[Long], jobStarted: Long, jobFinished: Long, - batchId: Long) + batchId: Option[Long], + appendedRecordCount: Option[Long]) object DataChunk { @@ -38,12 +43,6 @@ object DataChunk { val dateFormatter: DateTimeFormatter = DateTimeFormatter.ofPattern(datePersistFormat) def fromRecord(r: BookkeepingRecord): DataChunk = { - DataChunk( - r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.appendedRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) - } - - def fromRecordMongo(r: BookkeepingRecordMongoDb): DataChunk = { - DataChunk( - r.tableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.appendedRecordCount, r.jobStarted, r.jobFinished, r.batchId.getOrElse(0L)) + DataChunk(r.pramenTableName, r.infoDate, r.infoDateBegin, r.infoDateEnd, r.inputRecordCount, r.outputRecordCount, r.jobStarted, r.jobFinished, r.batchId, r.appendedRecordCount) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/CsvUtils.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/CsvUtils.scala index c16842c3c..5be7bada3 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/CsvUtils.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/CsvUtils.scala @@ -38,7 +38,14 @@ object CsvUtils { .filterNot(_.getName.contains('$')) .map(field => { field.setAccessible(true) - field.get(obj).toString.replace(separator, ' ') + val ref = field.get(obj) + // Handle Option[_] + val str = ref match { + case Some(x) => x.toString + case None => "" + case x => x.toString + } + str.replace(separator, ' ') }).mkString("", s"$separator", "\n") } diff --git a/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index 85f5a9d28..71206f2af 100644 --- a/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -57,8 +57,8 @@ class BookkeeperDeltaTable(database: Option[String], .as[DataChunk] } - override def saveRecordCountDelta(dataChunks: DataChunk): Unit = { - val df = Seq(dataChunks).toDF() + override def saveRecordCountDelta(dataChunk: DataChunk): Unit = { + val df = Seq(dataChunk).toDF() df.write .format("delta") @@ -68,10 +68,7 @@ class BookkeeperDeltaTable(database: Option[String], } override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { - val infoDateStr = DataChunk.dateFormatter.format(infoDate) - val filter = (col("tableName") === lit(table)) && (col("infoDate") === lit(infoDateStr)) && (col("batchId") =!= lit(batchId)) - - spark.sql(s"DROM FROM $schemasFullTableName WHERE tableName='$table' AND infoDate='$infoDateStr' AND batchId != $batchId").count() + // This is not supported for Delta tables using Spark 2.* } override def getSchemasDeltaDf: Dataset[TableSchemaJson] = { diff --git a/pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index be876ab9f..5c150abc1 100644 --- 
a/pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala_2.12/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -58,8 +58,8 @@ class BookkeeperDeltaTable(database: Option[String], .as[DataChunk] } - override def saveRecordCountDelta(dataChunks: DataChunk): Unit = { - val df = Seq(dataChunks).toDF() + override def saveRecordCountDelta(dataChunk: DataChunk): Unit = { + val df = Seq(dataChunk).toDF() df.write .format("delta") diff --git a/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index be876ab9f..5c150abc1 100644 --- a/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala_2.13/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -58,8 +58,8 @@ class BookkeeperDeltaTable(database: Option[String], .as[DataChunk] } - override def saveRecordCountDelta(dataChunks: DataChunk): Unit = { - val df = Seq(dataChunks).toDF() + override def saveRecordCountDelta(dataChunk: DataChunk): Unit = { + val df = Seq(dataChunk).toDF() df.write .format("delta") diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala index b353d287c..7f51e1e50 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/DataChunkFactory.scala @@ -26,16 +26,16 @@ object DataChunkFactory { recordsAppended: Option[Long] = None, jobStarted: Long = 10000, jobFinished: Long = 20000, - batchId: Long = 123L): DataChunk = { + batchId: Option[Long] = Some(123L)): DataChunk = { DataChunk(tableName = tableName, infoDate = infoDate, infoDateBegin = infoDate, infoDateEnd = infoDate, inputRecordCount = inputRecordCount, outputRecordCount = outputRecordCount, - appendedRecordCount = recordsAppended, jobStarted = jobStarted, jobFinished = jobFinished, - batchId = batchId) + batchId = batchId, + appendedRecordCount = recordsAppended) } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index 986c86bcc..4635dc86d 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -106,10 +106,10 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { dateStr, inputRecordCount, outputRecordCount, - recordsAppended, jobStarted, jobFinished, - batchId) + Option(batchId), + recordsAppended) chunks += (table, infoDate) -> chunk } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala index 81841467b..3ee34967b 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala @@ -16,9 +16,10 @@ package za.co.absa.pramen.core.tests.bookkeeper +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ import org.scalatest.wordspec.AnyWordSpec 
-import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperDeltaTable, BookkeeperHadoop, BookkeeperText} +import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperDeltaTable, BookkeeperText} import za.co.absa.pramen.core.model.DataChunk import java.time.LocalDate @@ -182,6 +183,9 @@ class BookkeeperCommonSuite extends AnyWordSpec { val bk1 = getBookkeeper(123L) val bk2 = getBookkeeper(456L) + // The feature is not supported with Delta Table implementation and Spark 2.x + assume(!SparkSession.active.version.startsWith("2.") || !bk1.isInstanceOf[BookkeeperDeltaTable]) + bk1.setRecordCount("table", infoDate1, 100, 10, Some(1), 1597318833, 1597318837, isTableTransient = false) bk2.setRecordCount("table", infoDate1, 200, 20, Some(1), 1597318838, 1597318839, isTableTransient = false) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala index 9f908cb0c..ef0bea843 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala @@ -16,9 +16,7 @@ package za.co.absa.pramen.core.tests.bookkeeper -import io.delta.tables.DeltaTable -import org.apache.spark.sql.functions.lit -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, fullstacks} +import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} import za.co.absa.pramen.core.base.SparkTestBase import za.co.absa.pramen.core.bookkeeper.BookkeeperDeltaTable @@ -39,17 +37,17 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest } before { - val fullRecordsTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.recordsTable) - val fullSchemasTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.schemasTable) + if (!spark.version.startsWith("2.")) { + val fullRecordsTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.recordsTable) + val fullSchemasTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.schemasTable) - if (spark.catalog.tableExists(fullRecordsTableName)) { - val deltaTable = DeltaTable.forName(spark, fullRecordsTableName) - deltaTable.delete(lit(true)) - } + if (spark.catalog.tableExists(fullRecordsTableName)) { + spark.sql(s"DELETE FROM $fullRecordsTableName WHERE true").count() + } - if (spark.catalog.tableExists(fullSchemasTableName)) { - val deltaTable = DeltaTable.forName(spark, fullSchemasTableName) - deltaTable.delete(lit(true)) + if (spark.catalog.tableExists(fullSchemasTableName)) { + spark.sql(s"DELETE FROM $fullRecordsTableName WHERE true").count() + } } } @@ -59,7 +57,11 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest "BookkeeperHadoopDeltaTable" when { testBookKeeper { batchId => - getBookkeeper(bookkeepingTablePrefix, batchId) + if (spark.version.startsWith("2.")) { + getBookkeeper(getNewTablePrefix, batchId) + } else { + getBookkeeper(bookkeepingTablePrefix, batchId) + } } "test tables are created properly" in { diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/notify/pipeline/PipelineNotificationBuilderHtmlSuite.scala 
b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/notify/pipeline/PipelineNotificationBuilderHtmlSuite.scala index e336082f9..40c9922af 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/notify/pipeline/PipelineNotificationBuilderHtmlSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/notify/pipeline/PipelineNotificationBuilderHtmlSuite.scala @@ -175,7 +175,7 @@ class PipelineNotificationBuilderHtmlSuite extends AnyWordSpec with TextComparis builder.addCompletedTask(TaskResultFactory.getDummyTaskResult( runStatus = TestPrototypes.runStatusWarning, schemaDifferences = SchemaDifferenceFactory.getDummySchemaDifference() :: Nil, - notificationTargetErrors = Seq(NotificationFailure("table1", "my_tagret", LocalDate.parse("2020-02-18"), new RuntimeException("Target 1 exception")))) + notificationTargetErrors = Seq(NotificationFailure("table1", "my_target", LocalDate.parse("2020-02-18"), new RuntimeException("Target 1 exception")))) ) val actual = builder.renderBody() From b5a52c79eb1a3f564b090917fec7e864261a6cc2 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 2 Jan 2026 15:27:49 +0100 Subject: [PATCH 10/12] #677 Fix more PR suggestions and nitpicks. --- .../absa/pramen/core/bookkeeper/BookkeeperBase.scala | 2 +- .../pramen/core/bookkeeper/BookkeeperDeltaBase.scala | 2 +- .../absa/pramen/core/bookkeeper/BookkeeperJdbc.scala | 2 +- .../pramen/core/bookkeeper/BookkeeperMongoDb.scala | 11 +++++++++-- .../absa/pramen/core/bookkeeper/BookkeeperText.scala | 3 ++- .../za/co/absa/pramen/core/utils/AlgorithmUtils.scala | 7 +++---- .../pramen/core/bookkeeper/BookkeeperDeltaTable.scala | 2 +- .../core/mocks/bookkeeper/SyncBookkeeperMock.scala | 2 +- .../bookkeeper/BookkeeperDeltaTableLongSuite.scala | 2 +- 9 files changed, 20 insertions(+), 13 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala index cb50b1174..a199de637 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperBase.scala @@ -148,7 +148,7 @@ abstract class BookkeeperBase(isBookkeepingEnabled: Boolean, batchId: Long) exte batchId match { case Some(id) => - allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate && chunk.batchId == id) + allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate && chunk.batchId.contains(id)) case None => allChunks.filter(chunk => chunk.infoDate >= minDate && chunk.infoDate <= maxDate) } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala index 69b89eaf5..941c951fc 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaBase.scala @@ -113,7 +113,7 @@ abstract class BookkeeperDeltaBase(batchId: Long) extends BookkeeperHadoop(batch private[core] def getBkAllData(filter: Column): Seq[DataChunk] = { getBkDf(filter) .collect() - .sortBy(_.infoDate) + .sortBy(_.jobFinished) } private[core] def getBkData(filter: Column): Seq[DataChunk] = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala 
b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index fe99366f8..bf5297543 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -140,7 +140,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b .delete try { - AlgorithmUtils.resultWithTimeout(WARN_IF_LONGER_MS) { + AlgorithmUtils.runActionWithElapsedTimeEvent(WARN_IF_LONGER_MS) { db.run(query).execute() } { actualTimeMs => val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala index de9cd5b24..9eca65253 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperMongoDb.scala @@ -29,6 +29,7 @@ import za.co.absa.pramen.core.dao.MongoDb import za.co.absa.pramen.core.dao.model.{ASC, IndexField} import za.co.absa.pramen.core.model.{DataChunk, TableSchema} import za.co.absa.pramen.core.mongo.MongoDbConnection +import za.co.absa.pramen.core.utils.{AlgorithmUtils, TimeUtils} import java.time.LocalDate import scala.util.control.NonFatal @@ -46,6 +47,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext private val log = LoggerFactory.getLogger(this.getClass) + private val queryWarningTimeoutMs = 10000L private val codecRegistry: CodecRegistry = fromRegistries(fromProviders(classOf[DataChunk], classOf[TableSchema]), DEFAULT_CODEC_REGISTRY) private val db = mongoDbConnection.getDatabase @@ -94,7 +96,7 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchId: Option[Long]): Seq[DataChunk] = { val chunks = collection.find(getFilter(table, Option(infoDate), Option(infoDate), batchId)).execute() .sortBy(_.jobFinished) - log.info(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") + log.debug(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") chunks } @@ -121,7 +123,12 @@ class BookkeeperMongoDb(mongoDbConnection: MongoDbConnection, batchId: Long) ext Filters.ne("batchId", batchId) ) - collection.deleteMany(filter).execute() + AlgorithmUtils.runActionWithElapsedTimeEvent(queryWarningTimeoutMs) { + collection.deleteMany(filter).execute() + }{ actualTimeMs => + val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs) + log.warn(s"MongoDB query took too long ($elapsedTime) while deleting from $collectionName, tableName='$table', infoDate='$infoDate', batchId!=$batchId") + } } private def getFilter(tableName: String, infoDateBeginOpt: Option[LocalDate], infoDateEndOpt: Option[LocalDate], batchId: Option[Long]): Bson = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala index b6f0fef4e..42fe1b60d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperText.scala @@ -195,6 +195,7 @@ class BookkeeperText(bookkeepingPath: String, batchId: Long)(implicit spark: Spa } override def deleteNonCurrentBatchRecords(table: String, infoDate: 
LocalDate): Unit = { - + // No-op: CSV-based storage doesn't support efficient in-place deletion. + // Cross-batch replacement is not supported for BookkeeperText. } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala index 14c38ba60..cf2ea6bee 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/utils/AlgorithmUtils.scala @@ -17,7 +17,6 @@ package za.co.absa.pramen.core.utils import org.slf4j.Logger -import za.co.absa.pramen.core.utils.SlickUtils.log import java.time.{Duration, Instant} import scala.annotation.tailrec @@ -95,14 +94,14 @@ object AlgorithmUtils { } } - final def resultWithTimeout[R](timeoutMs: Long)(action: => R)(onTimeout: Long => Unit): R = { + final def runActionWithElapsedTimeEvent[R](maxTimeMs: Long)(action: => R)(onMaxTimeBreach: Long => Unit): R = { val start = Instant.now val result = action val finish = Instant.now val duration = Duration.between(start, finish).toMillis - if (duration > timeoutMs) { - onTimeout(timeoutMs) + if (duration > maxTimeMs) { + onMaxTimeBreach(duration) } result } diff --git a/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala index 71206f2af..476b41619 100644 --- a/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala +++ b/pramen/core/src/main/scala_2.11/za/co/absa/pramen/core/bookkeeper/BookkeeperDeltaTable.scala @@ -16,7 +16,7 @@ package za.co.absa.pramen.core.bookkeeper -import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Column, Dataset, SaveMode, SparkSession} import za.co.absa.pramen.core.bookkeeper.model.TableSchemaJson import za.co.absa.pramen.core.model.{DataChunk, TableSchema} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index 4635dc86d..17629d743 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -62,7 +62,7 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { if (isInsidePeriod) { batchId match { case Some(id) => - if (chunk.batchId == id) { + if (chunk.batchId.contains(id)) { Some(chunk) } else { None diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala index ef0bea843..695c29994 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala @@ -46,7 +46,7 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest } if (spark.catalog.tableExists(fullSchemasTableName)) { - spark.sql(s"DELETE FROM $fullRecordsTableName WHERE true").count() + spark.sql(s"DELETE FROM $fullSchemasTableName WHERE true").count() } } } From 3791376422db3e1b20e7d0838a5ac6ccced06a0b Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko 
Date: Mon, 5 Jan 2026 08:53:49 +0100 Subject: [PATCH 11/12] #677 Fix even more PR suggestions and nitpicks. --- .../absa/pramen/core/bookkeeper/BookkeeperJdbc.scala | 2 +- .../core/mocks/bookkeeper/SyncBookkeeperMock.scala | 10 +--------- .../core/tests/bookkeeper/BookkeeperCommonSuite.scala | 7 ++++++- .../bookkeeper/BookkeeperDeltaTableLongSuite.scala | 4 ++-- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index bf5297543..cf65f3bbf 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -185,7 +185,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b } batchId match { - case Some(id) => baseFilter.filter(r => r.batchId === id) + case Some(id) => baseFilter.filter(r => r.batchId === Option(id)) case None => baseFilter } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala index 17629d743..6a6eb3a8b 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/bookkeeper/SyncBookkeeperMock.scala @@ -60,15 +60,7 @@ class SyncBookkeeperMock(batchId: Long = 123L) extends Bookkeeper { chunks.toList.flatMap { case ((tblName, date), chunk) => val isInsidePeriod = tblName == table && date.equals(infoDate) if (isInsidePeriod) { - batchId match { - case Some(id) => - if (chunk.batchId.contains(id)) { - Some(chunk) - } else { - None - } - case None => Some(chunk) - } + if (batchId.forall(chunk.batchId.contains)) Some(chunk) else None } else { None } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala index 3ee34967b..ec9f54a19 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperCommonSuite.scala @@ -23,6 +23,7 @@ import za.co.absa.pramen.core.bookkeeper.{Bookkeeper, BookkeeperDeltaTable, Book import za.co.absa.pramen.core.model.DataChunk import java.time.LocalDate +import scala.util.Try class BookkeeperCommonSuite extends AnyWordSpec { def testBookKeeper(getBookkeeper: Long => Bookkeeper): Unit = { @@ -183,8 +184,12 @@ class BookkeeperCommonSuite extends AnyWordSpec { val bk1 = getBookkeeper(123L) val bk2 = getBookkeeper(456L) + val sparkVersion = Try { + SparkSession.active.version + }.toOption.getOrElse("") + // The feature is not supported with Delta Table implementation and Spark 2.x - assume(!SparkSession.active.version.startsWith("2.") || !bk1.isInstanceOf[BookkeeperDeltaTable]) + assume(!sparkVersion.startsWith("2.") || !bk1.isInstanceOf[BookkeeperDeltaTable]) bk1.setRecordCount("table", infoDate1, 100, 10, Some(1), 1597318833, 1597318837, isTableTransient = false) bk2.setRecordCount("table", infoDate1, 200, 20, Some(1), 1597318838, 1597318839, isTableTransient = false) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala 
b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala index 695c29994..0d0027bc8 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDeltaTableLongSuite.scala @@ -42,11 +42,11 @@ class BookkeeperDeltaTableLongSuite extends BookkeeperCommonSuite with SparkTest val fullSchemasTableName = BookkeeperDeltaTable.getFullTableName(None, bookkeepingTablePrefix, BookkeeperDeltaTable.schemasTable) if (spark.catalog.tableExists(fullRecordsTableName)) { - spark.sql(s"DELETE FROM $fullRecordsTableName WHERE true").count() + spark.sql(s"DELETE FROM $fullRecordsTableName WHERE true") } if (spark.catalog.tableExists(fullSchemasTableName)) { - spark.sql(s"DELETE FROM $fullSchemasTableName WHERE true").count() + spark.sql(s"DELETE FROM $fullSchemasTableName WHERE true") } } } From acacfba6025751c6cabb7b3b056cfa4d0bae4654 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Mon, 5 Jan 2026 09:19:44 +0100 Subject: [PATCH 12/12] #677 Filter by Option(batchId) to better match what the Slick framework expects. --- .../za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala index cf65f3bbf..ee76b8ada 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperJdbc.scala @@ -136,7 +136,7 @@ class BookkeeperJdbc(db: Database, batchId: Long) extends BookkeeperBase(true, b val dateStr = DataChunk.dateFormatter.format(infoDate) val query = BookkeepingRecords.records - .filter(r => r.pramenTableName === table && r.infoDate === dateStr && r.batchId =!= batchId) + .filter(r => r.pramenTableName === table && r.infoDate === dateStr && r.batchId =!= Option(batchId)) .delete try {