README.md: 9 changes (8 additions, 1 deletion)

@@ -941,8 +941,14 @@ pramen.sources = [

# [Optional] Set name for the struct field that contains Kafka record metadata
#custom.kafka.column = "kafka"

# [Optional] Set name for the Kafka key column
#key.column.name = "kafka_key"

# The Kafka key serializer when key.naming.strategy is NOT defined. Can be "none", "binary", "string".
# When key.naming.strategy IS defined in schema.registry, Avro deserialization is used automatically.
# Default is "binary".
#key.column.serializer = "none"

kafka {
bootstrap.servers = "mybroker1:9092,mybroker2:9092"
@@ -956,6 +962,7 @@ pramen.sources = [
schema.registry {
url = "https://my.schema.registry:8081"
value.naming.strategy = "topic.name"
#key.naming.strategy = "topic.name"

# Arbitrary options for Schema registry
basic.auth.credentials.source = "..."
@@ -982,7 +989,7 @@ pramen.operations = [

tables = [
{
input.table = "my_kafka_topic1"
input.topic = "my_kafka_topic1"
output.metastore.table = "table1"
}
]
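
For illustration, the three non-Avro `key.column.serializer` modes described above map onto simple Spark transformations of the raw binary key column that the Kafka reader exposes. A minimal runnable sketch — the object and method names here are illustrative, not part of this PR; the actual logic lives in `KafkaAvroSource`:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

object KeySerializerSketch {
  // How each non-Avro 'key.column.serializer' value treats the raw Kafka 'key'
  // column, which Spark's Kafka reader always exposes as binary.
  def applyKeySerializer(df: DataFrame, serializer: String, keyColumn: String): DataFrame =
    serializer match {
      case "none"   => df                                                    // no key column in the output
      case "binary" => df.withColumn(keyColumn, col("key"))                  // raw bytes (the default)
      case "string" => df.withColumn(keyColumn, col("key").cast(StringType)) // plain-text keys
      case other    => throw new IllegalArgumentException(s"Unknown Kafka key serializer '$other'.")
    }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("key-serializer-sketch").getOrCreate()
    import spark.implicits._

    // Stand-in for a Kafka read: a single binary 'key' column.
    val df = Seq("id-1".getBytes("UTF-8")).toDF("key")
    applyKeySerializer(df, "string", "kafka_key").show() // adds kafka_key = "id-1"
    spark.stop()
  }
}
```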
QueryBuilder.scala

@@ -25,19 +25,22 @@ object QueryBuilder {
val PATH_KEY = "path"
val TABLE_KEY = "table"
val DB_TABLE_KEY = "db.table" // Same as table - for backwards compatibility and config readability
val TOPIC_KEY = "topic" // Same as table - for sources where 'topic' better matches the terminology (e.g. Kafka)

def fromConfig(conf: Config, prefix: String, parentPath: String): Query = {
val p = if (prefix.isEmpty) "" else s"$prefix."

val hasSql = conf.hasPath(s"$p$SQL_KEY")
val hasPath = conf.hasPath(s"$p$PATH_KEY")
val hasDbTable = conf.hasPath(s"$p$TABLE_KEY") || conf.hasPath(s"$p$DB_TABLE_KEY")
val hasDbTable = conf.hasPath(s"$p$TABLE_KEY") || conf.hasPath(s"$p$DB_TABLE_KEY") || conf.hasPath(s"$p$TOPIC_KEY")
val hesSomething = if (prefix.isEmpty) !conf.isEmpty else conf.hasPath(prefix)

val tableDef = if (conf.hasPath(s"$p$TABLE_KEY")) {
Some(conf.getString(s"$p$TABLE_KEY"))
} else if (conf.hasPath(s"$p$DB_TABLE_KEY")) {
Some(conf.getString(s"$p$DB_TABLE_KEY"))
} else if (conf.hasPath(s"$p$TOPIC_KEY")) {
Some(conf.getString(s"$p$TOPIC_KEY"))
} else {
None
}
Expand All @@ -50,9 +53,9 @@ object QueryBuilder {
case _ =>
val parent = if (parentPath.isEmpty) "" else s" at $parentPath"
if (prefix.isEmpty)
throw new IllegalArgumentException(s"No options are specified for the query. Usually, it is one of: '$SQL_KEY', '$PATH_KEY', '$TABLE_KEY', '$DB_TABLE_KEY'$parent.")
throw new IllegalArgumentException(s"No options are specified for the query. Usually, it is one of: '$SQL_KEY', '$PATH_KEY', '$TABLE_KEY', '$DB_TABLE_KEY', '$TOPIC_KEY'$parent.")
else
throw new IllegalArgumentException(s"No options are specified for the '$prefix' query. Usually, it is one of: '$p$SQL_KEY', '$p$PATH_KEY', '$p$TABLE_KEY', '$p$DB_TABLE_KEY'$parent.")
throw new IllegalArgumentException(s"No options are specified for the '$prefix' query. Usually, it is one of: '$p$SQL_KEY', '$p$PATH_KEY', '$p$TABLE_KEY', '$p$DB_TABLE_KEY', '$p$TOPIC_KEY'$parent.")
}
}
}
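
The practical effect of the new `TOPIC_KEY` alias: a source table can now be declared with `input.topic` and resolves exactly like `input.table`. A minimal sketch, assuming pramen-core is on the classpath; the `QueryBuilder` package path below is a guess, since the diff does not show it:

```scala
import com.typesafe.config.ConfigFactory
// Assumed import: the diff does not show which package QueryBuilder lives in.
import za.co.absa.pramen.core.utils.QueryBuilder

object TopicAliasSketch {
  def main(args: Array[String]): Unit = {
    // The same HOCON shape used in the README's operation definition.
    val conf = ConfigFactory.parseString("input.topic = my_kafka_topic1")

    // With this PR, 'topic' is accepted alongside 'table' and 'db.table',
    // so this resolves to Query.Table("my_kafka_topic1").
    val query = QueryBuilder.fromConfig(conf, "input", "")
    println(query)
  }
}
```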
QueryBuilderSuite.scala

@@ -30,6 +30,8 @@ class QueryBuilderSuite extends AnyWordSpec {
TestCase("", """data.file.1 = "/some/data/file"""", Query.Custom(Map("data.file.1" -> "/some/data/file"))),
TestCase("input", """input.sql = "SELECT * FROM table"""", Query.Sql("SELECT * FROM table")),
TestCase("input", """input.table = table1""", Query.Table("table1")),
TestCase("input", """input.db.table = table1""", Query.Table("table1")),
TestCase("input", """input.topic = topic1""", Query.Table("topic1")),
TestCase("input", """input.path = /some/path""", Query.Path("/some/path")),
TestCase("input", "input.data.file.1 = /some/data/file1\ninput.data.file.2 = /some/data/file2",
Query.Custom(Map("data.file.1" -> "/some/data/file1", "data.file.2" -> "/some/data/file2")))
@@ -53,7 +55,7 @@ class QueryBuilderSuite extends AnyWordSpec {
QueryBuilder.fromConfig(conf, "", "")
}

assert(exception.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table'.")
assert(exception.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.")
}

"throw an exception when the prefix is empty" in {
@@ -63,7 +65,7 @@
QueryBuilder.fromConfig(conf, "input", "")
}

assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table'.")
assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.")
}

"throw an exception when the prefix is empty and parent is specified" in {
@@ -73,7 +75,7 @@
QueryBuilder.fromConfig(conf, "input", "my.parent")
}

assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table' at my.parent.")
assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.")
}
}
}
SourceTableParserSuite.scala

@@ -119,7 +119,7 @@ class SourceTableParserSuite extends AnyWordSpec {
SourceTableParser.fromConfig(conf, "source.tables")
}

assert(ex.getMessage.contains("No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table' at source.tables[0]."))
assert(ex.getMessage.contains("No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at source.tables[0]."))
}

"throw an exception in case of duplicate entries" in {
KafkaAvroSource.scala

@@ -19,6 +19,7 @@ package za.co.absa.pramen.extras.source
import com.typesafe.config.Config
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, struct}
import org.apache.spark.sql.types.StringType
import org.slf4j.LoggerFactory
import za.co.absa.abris.avro.functions.from_avro
import za.co.absa.abris.config.AbrisConfig
@@ -48,9 +49,15 @@ import java.time.LocalDate
*
* # [Optional] Set name for the struct field that contains Kafka record metadata
* custom.kafka.column = "kafka"
*
* # [Optional] Set name for the Kafka key column
* key.column.name = "kafka_key"
*
* # The Kafka key serializer when 'key.naming.strategy' is NOT defined. Can be "none", "binary", "string".
* # When 'key.naming.strategy' IS defined in 'schema.registry', Avro deserialization is used automatically.
* # Default is "binary".
* #key.column.serializer = "none"
*
* kafka {
* bootstrap.servers = "mybroker1:9092,mybroker2:9092"
*
@@ -66,6 +73,7 @@ import java.time.LocalDate
*
* # Can be one of: topic.name, record.name, topic.record.name
* value.naming.strategy = "topic.name"
* #key.naming.strategy = "topic.name"
*
* # If you want to force the specific schema id. Otherwise, the latest schema id will be used.
* # key.schema.id =
@@ -114,6 +122,9 @@ class KafkaAvroSource(sourceConfig: Config,

private val kafkaColumnName = ConfigUtils.getOptionString(sourceConfig, CUSTOM_KAFKA_COLUMN_KEY).getOrElse("kafka")
private val keyColumnName = ConfigUtils.getOptionString(sourceConfig, KEY_COLUMN_KEY).getOrElse("kafka_key")
private val keyColumnSerializer = ConfigUtils.getOptionString(sourceConfig, KEY_COLUMN_SERIALIZER_KEY).getOrElse("binary").toLowerCase.trim
private val tempKafkaColumnName = "tmp_pramen_kafka"
private val tempKafkaKeyColumnName = "tmp_pramen_kafka_key"

override def hasInfoDateColumn(query: Query): Boolean = false

@@ -206,32 +217,46 @@ class KafkaAvroSource(sourceConfig: Config,
col("timestampType").as("timestamp_type")
))

val hasKey = kafkaAvroConfig.keyNamingStrategy.isDefined || keyColumnSerializer != "none"

val df2 = kafkaAvroConfig.keyNamingStrategy match {
case Some(keyNamingStrategy) =>
val abrisKeyConfig = keyNamingStrategy
.applyNamingStrategyToAbrisConfig(abrisValueBase, topic, isKey = true)
.usingSchemaRegistry(schemaRegistryClientConfig)
df1.withColumn("tmp_pramen_kafka_key", from_avro(col("key"), abrisKeyConfig))
df1.withColumn(tempKafkaKeyColumnName, from_avro(col("key"), abrisKeyConfig))
case None =>
df1.withColumn("tmp_pramen_kafka_key", col("key"))
keyColumnSerializer match {
case "none" => df1
case "binary" => df1.withColumn(tempKafkaKeyColumnName, col("key"))
case "string" => df1.withColumn(tempKafkaKeyColumnName, col("key").cast(StringType))
case "avro" => throw new IllegalArgumentException("For the 'avro' serializer of Kafka topic key, 'schema.registry.key.naming.strategy' needs to be set.")
case x => throw new IllegalArgumentException(s"Unknown Kafka key serializer '$x'. Can be one of: none, binary, string, avro.")
}
}

val payloadFields = df2.select("data.*").schema.fieldNames.toSet
if (payloadFields.contains(kafkaColumnName)) {
log.warn(s"Payload field '$kafkaColumnName' conflicts with Kafka metadata struct name and will be replaced.")
log.warn(s"Payload field '$kafkaColumnName' conflicts with reserved Kafka metadata struct name and will be replaced.")
}
if (payloadFields.contains(keyColumnName)) {
log.warn(s"Payload field '$keyColumnName' conflicts with Kafka key column name and will be replaced.")
log.warn(s"Payload field '$keyColumnName' conflicts with reserved Kafka key column name and will be removed.")
}

// Put data fields to the root level of the schema, and if data struct already has kafka_key and kafka fields,
// drop them
val dfFinal = df2
.select("tmp_pramen_kafka_key", "data.*", "tmp_pramen_kafka")
.drop(kafkaColumnName)
.drop(keyColumnName)
.withColumnRenamed("tmp_pramen_kafka", kafkaColumnName)
.withColumnRenamed("tmp_pramen_kafka_key", keyColumnName)
val dfFinal = if (hasKey) {
df2.select(tempKafkaKeyColumnName, "data.*", tempKafkaColumnName)
.drop(kafkaColumnName)
.drop(keyColumnName)
.withColumnRenamed(tempKafkaColumnName, kafkaColumnName)
.withColumnRenamed(tempKafkaKeyColumnName, keyColumnName)
} else {
df2.select("data.*", tempKafkaColumnName)
.drop(kafkaColumnName)
.drop(keyColumnName)
.withColumnRenamed(tempKafkaColumnName, kafkaColumnName)
}

SourceResult(dfFinal)
}
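
The select/drop/rename chain above flattens the payload to the root of the schema, with the key column (when present) first and the metadata struct last. A standalone sketch of the same column surgery on stand-in data; the payload fields 'id' and 'amount' are hypothetical:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.struct

object FinalLayoutSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("layout-sketch").getOrCreate()
    import spark.implicits._

    // Stand-in for df2: a deserialized key, a payload struct, and a metadata struct.
    val df2 = Seq(("k1", "a", 1, "t1", 0L))
      .toDF("tmp_pramen_kafka_key", "id", "amount", "topic", "offset")
      .select(
        $"tmp_pramen_kafka_key",
        struct($"id", $"amount").as("data"),
        struct($"topic", $"offset").as("tmp_pramen_kafka"))

    // Same reshaping as in the diff, for the default column names.
    val dfFinal = df2
      .select("tmp_pramen_kafka_key", "data.*", "tmp_pramen_kafka")
      .withColumnRenamed("tmp_pramen_kafka", "kafka")
      .withColumnRenamed("tmp_pramen_kafka_key", "kafka_key")

    dfFinal.printSchema()
    // root
    //  |-- kafka_key: string
    //  |-- id: string
    //  |-- amount: integer
    //  |-- kafka: struct (topic: string, offset: long)
    spark.stop()
  }
}
```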
@@ -243,6 +268,7 @@ object KafkaAvroSource extends ExternalChannelFactoryV2[KafkaAvroSource] {
val TOPIC_NAME_KEY = "topic.name"
val CUSTOM_KAFKA_COLUMN_KEY = "custom.kafka.column"
val KEY_COLUMN_KEY = "key.column.name"
val KEY_COLUMN_SERIALIZER_KEY = "key.column.serializer"

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Public constant looks good; ensure config default is discoverable.

Confirm key.column.serializer default is present in reference.conf for visibility.


🏁 Script executed:

```shell
#!/bin/bash
# Verify default presence and examples for key.column.serializer
fd -a reference.conf
rg -nC2 'key\.column\.serializer' -g 'pramen/**/reference.conf'
```

Length of output: 196


I need to inspect the reference.conf files directly to verify if the configuration default is present.


---

🏁 Script executed:

```shell
#!/bin/bash
# Search for key.column.serializer in all found reference.conf files
echo "=== Searching in extras reference.conf ==="
rg -nC2 'key\.column\.serializer' /home/jailuser/git/pramen/extras/src/main/resources/reference.conf

echo -e "\n=== Searching in core reference.conf ==="
rg -nC2 'key\.column\.serializer' /home/jailuser/git/pramen/core/src/main/resources/reference.conf

echo -e "\n=== Checking if extras reference.conf exists and contains kafka config ==="
cat /home/jailuser/git/pramen/extras/src/main/resources/reference.conf 2>/dev/null | head -50
```

Length of output: 1132


Let me verify more thoroughly by checking the full contents and looking for any Kafka configuration:


---

🏁 Script executed:

```shell
#!/bin/bash
# Check full extras reference.conf for Kafka sections
echo "=== Full extras reference.conf ==="
cat /home/jailuser/git/pramen/extras/src/main/resources/reference.conf

echo -e "\n=== Search for any 'kafka' in reference.conf files ==="
rg -in 'kafka' /home/jailuser/git/pramen/extras/src/main/resources/reference.conf /home/jailuser/git/pramen/core/src/main/resources/reference.conf

echo -e "\n=== Check KafkaAvroSource for default value handling ==="
rg -nC3 'key\.column\.serializer\|KEY_COLUMN_SERIALIZER' /home/jailuser/git/pramen/extras/src/main/scala/za/co/absa/pramen/extras/source/KafkaAvroSource.scala
```

Length of output: 1120


Let me verify the file path and view the KafkaAvroSource code directly:

```shell
#!/bin/bash
# Find the correct KafkaAvroSource file
echo "=== Finding KafkaAvroSource.scala ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f

echo -e "\n=== View context around line 271 ==="
fd -e scala "KafkaAvroSource" /home/jailuser/git/pramen --exec cat {} \; | sed -n '265,280p'
```

---


🏁 Script executed:

```shell
#!/bin/bash
# Find KafkaAvroSource.scala and view context around line 271
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f

echo -e "\n=== View lines 265-280 of KafkaAvroSource.scala ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f -exec sed -n '265,280p' {} +

echo -e "\n=== View surrounding context for KEY_COLUMN_SERIALIZER_KEY ==="
find /home/jailuser/git/pramen -name "KafkaAvroSource.scala" -type f -exec grep -nC5 "KEY_COLUMN_SERIALIZER" {} +
```

Length of output: 2062


Let me check if there are any Kafka configuration examples elsewhere in the codebase:


---

🏁 Script executed:

```shell
#!/bin/bash
# Check if Kafka config is documented elsewhere
echo "=== Search for Kafka config examples in test resources or documentation ==="
find /home/jailuser/git/pramen -type f \( -name "*.conf" -o -name "*.md" -o -name "*.txt" \) -exec grep -l "key\.column" {} +

echo -e "\n=== Check test resources for Kafka configuration examples ==="
find /home/jailuser/git/pramen -path "*/test/resources/*" -name "*.conf" -type f -exec cat {} \;
```

Length of output: 56619


Add Kafka configuration defaults to reference.conf for discoverability.

Verification confirms key.column.serializer is absent from reference.conf. While the code provides a hardcoded default of "binary", this configuration is not documented or discoverable. Add a Kafka source configuration section to pramen/extras/src/main/resources/reference.conf listing available options (including key.column.serializer, key.column.name, custom.kafka.column, and topic.name) with their defaults for user visibility.
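
A sketch of the kind of section the reviewer is asking for, wrapped in `ConfigFactory.parseString` only to show that it parses cleanly. The `pramen.extras.source.kafka` namespace is hypothetical; the defaults mirror the hardcoded ones visible in this diff:

```scala
import com.typesafe.config.ConfigFactory

// Proposed reference.conf fragment (HOCON), validated here via parseString.
// The namespace is hypothetical; defaults mirror the hardcoded ones in KafkaAvroSource.
object ReferenceConfSketch {
  val proposedFragment: String =
    """
      |pramen.extras.source.kafka {
      |  # Name of the struct field that carries Kafka record metadata.
      |  custom.kafka.column = "kafka"
      |
      |  # Name of the output column holding the Kafka key.
      |  key.column.name = "kafka_key"
      |
      |  # Key handling when schema.registry.key.naming.strategy is not set:
      |  # one of "none", "binary" (default), "string".
      |  key.column.serializer = "binary"
      |}
      |""".stripMargin

  def main(args: Array[String]): Unit = {
    val conf = ConfigFactory.parseString(proposedFragment)
    println(conf.getString("pramen.extras.source.kafka.key.column.serializer")) // binary
  }
}
```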

🤖 Prompt for AI Agents
In
pramen/extras/src/main/scala/za/co/absa/pramen/extras/source/KafkaAvroSource.scala
around line 271, the constant KEY_COLUMN_SERIALIZER_KEY
("key.column.serializer") is used but not documented in reference.conf; update
pramen/extras/src/main/resources/reference.conf to add a kafka source
configuration section (e.g., pramen.extras.source.kafka or similar existing
namespace) that lists the relevant keys and their defaults for discoverability:
key.column.serializer = "binary", key.column.name = "kafka_key", custom.kafka.column =
"kafka" (the hardcoded defaults in the code), and topic.name (no hardcoded default), plus
brief comments describing accepted options where useful; ensure keys match the
exact names used in code and place the section in alphabetical/consistent order
with existing configs.


val KAFKA_TOKENS_TO_REDACT = Set("password", "jaas.config", "auth.user.info")
