Commit ec71a1d

#660 Implement incremental offset commits for re-runs so normal runs can be run after reruns.
1 parent a74ab20 commit ec71a1d
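
This change makes the incremental metastore reader rerun-aware: for ReaderMode.IncrementalValidation and ReaderMode.IncrementalRun, a rerun now reads the whole info-date partition and re-registers the day's committed offset range, so the offset bookkeeping stays consistent and a subsequent normal run for the same day starts from the committed offsets.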

4 files changed: +172 -3 lines changed

4 files changed

+172
-3
lines changed

pramen/core/src/main/scala/za/co/absa/pramen/core/metastore/MetastoreReaderIncrementalImpl.scala

Lines changed: 42 additions & 3 deletions
@@ -47,9 +47,14 @@ class MetastoreReaderIncrementalImpl(metastore: Metastore,
     if (readMode == ReaderMode.IncrementalPostProcessing && !isRerun) {
       log.info(s"Getting the current batch for table '$tableName' at '$infoDate'...")
       metastore.getBatch(tableName, infoDate, None)
-    } else if ((readMode == ReaderMode.IncrementalValidation || readMode == ReaderMode.IncrementalRun) && !isRerun) {
-      log.info(s"Getting the current incremental chunk for table '$tableName' at '$infoDate'...")
-      getIncremental(tableName, infoDate)
+    } else if (readMode == ReaderMode.IncrementalValidation || readMode == ReaderMode.IncrementalRun) {
+      if (isRerun) {
+        log.info(s"Getting the current incremental chunk for table rerun '$tableName' at '$infoDate'...")
+        getIncrementalForRerun(tableName, infoDate)
+      } else {
+        log.info(s"Getting the current incremental chunk for table '$tableName' at '$infoDate'...")
+        getIncremental(tableName, infoDate)
+      }
     } else {
       log.info(s"Getting daily data for table '$tableName' at '$infoDate'...")
       metastore.getTable(tableName, Option(infoDate), Option(infoDate))
@@ -89,6 +94,13 @@ class MetastoreReaderIncrementalImpl(metastore: Metastore,
     getIncrementalDf(tableName, trackingName, infoDate, commitChanges)
   }
 
+  private def getIncrementalForRerun(tableName: String, infoDate: LocalDate): DataFrame = {
+    val commitChanges = readMode == ReaderMode.IncrementalRun
+    val trackingName = s"$tableName->$outputTable"
+
+    getIncrementalDfForRerun(tableName, trackingName, infoDate, commitChanges)
+  }
+
   private def getIncrementalDf(tableName: String, trackingName: String, infoDate: LocalDate, commit: Boolean): DataFrame = {
     val tableDef = metastore.getTableDef(tableName)
     val om = bookkeeper.getOffsetManager
@@ -133,4 +145,31 @@ class MetastoreReaderIncrementalImpl(metastore: Metastore,
 
     df
   }
+
+  private def getIncrementalDfForRerun(tableName: String, trackingName: String, infoDate: LocalDate, commit: Boolean): DataFrame = {
+    val tableDef = metastore.getTableDef(tableName)
+    val om = bookkeeper.getOffsetManager
+    val offsets = om.getMaxInfoDateAndOffset(trackingName, Option(infoDate))
+    val tableDf = metastore.getTable(tableName, Option(infoDate), Option(infoDate))
+
+    if (commit && !trackingTables.exists(t => t.trackingName == trackingName && t.infoDate == infoDate)) {
+      log.info(s"Starting offset commit for table rerun '$trackingName' for '$infoDate'")
+
+      val trackingTable = TrackingTable(
+        Thread.currentThread().getId,
+        tableName,
+        outputTable,
+        trackingName,
+        tableDef.batchIdColumn,
+        offsets.map(_.minimumOffset),
+        offsets.map(_.maximumOffset),
+        infoDate,
+        Instant.now()
+      )
+
+      trackingTables += trackingTable
+    }
+
+    tableDf
+  }
 }
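
Taken together, the rerun path does two things: it returns the whole partition for the info date instead of a fresh incremental chunk, and it re-registers the already-committed offset range so the end-of-run commit reflects the rerun. A minimal standalone sketch of that idea, with simplified stand-in types (OffsetRange, TrackingEntry, and RerunReader are illustrative names, not Pramen's API):

import java.time.{Instant, LocalDate}
import scala.collection.mutable.ListBuffer

// Illustrative stand-ins for Pramen's OffsetManager/TrackingTable types.
final case class OffsetRange(minimumOffset: Long, maximumOffset: Long)
final case class TrackingEntry(trackingName: String, infoDate: LocalDate,
                               min: Option[Long], max: Option[Long], createdAt: Instant)

class RerunReader(committedRange: (String, LocalDate) => Option[OffsetRange]) {
  private val trackingTables = new ListBuffer[TrackingEntry]

  // On a rerun: read the whole info-date partition (not an incremental chunk)
  // and re-register the already-committed offset range for the final commit.
  def readForRerun[A](tableName: String, outputTable: String, infoDate: LocalDate,
                      readPartition: (String, LocalDate) => Seq[A],
                      commit: Boolean): Seq[A] = {
    val trackingName = s"$tableName->$outputTable"
    val offsets = committedRange(trackingName, infoDate)
    val rows = readPartition(tableName, infoDate)

    // At most one tracking entry per (trackingName, infoDate) pair, mirroring
    // the duplicate check in getIncrementalDfForRerun above.
    if (commit && !trackingTables.exists(t => t.trackingName == trackingName && t.infoDate == infoDate)) {
      trackingTables += TrackingEntry(trackingName, infoDate,
        offsets.map(_.minimumOffset), offsets.map(_.maximumOffset), Instant.now())
    }
    rows
  }
}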

pramen/core/src/test/scala/za/co/absa/pramen/core/integration/IncrementalPipelineDeltaLongSuite.scala

Lines changed: 8 additions & 0 deletions
@@ -113,5 +113,13 @@ class IncrementalPipelineDeltaLongSuite extends IncrementalPipelineLongFixture {
     "transformer picks up doubly ingested offsets" in {
       testTransformerPicksUpFromDoubleIngestedData(format)
     }
+
+    "run normal run after a rerun" in {
+      testNormalRunAfterRerun(format)
+    }
+
+    "run normal run then rerun, then normal run again for the same day" in {
+      testNormalRunAfterRerunAfterNormalRun(format)
+    }
   }
 }
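
The same two scenarios are registered for the Parquet format in the last file of this commit; both suites delegate to the shared methods added to IncrementalPipelineLongFixture below.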

pramen/core/src/test/scala/za/co/absa/pramen/core/integration/IncrementalPipelineLongFixture.scala

Lines changed: 114 additions & 0 deletions
@@ -1279,6 +1279,120 @@ class IncrementalPipelineLongFixture extends AnyWordSpec
     succeed
   }
 
+  def testNormalRunAfterRerun(metastoreFormat: String): Assertion = {
+    val csv1DataStr = s"id,name,info_date\n1,John,$infoDate\n2,Jack,$infoDate\n"
+    val csv2DataStr = s"id,name,info_date\n3,Jill,$infoDate\n4,Mary,$infoDate\n"
+
+    val expectedStr1: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |""".stripMargin
+
+    val expectedStr2: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |{"id":3,"name":"Jill"}
+        |{"id":4,"name":"Mary"}
+        |""".stripMargin
+
+    withTempDirectory("incremental1") { tempDir =>
+      val fsUtils = new FsUtils(spark.sparkContext.hadoopConfiguration, tempDir)
+
+      val path1 = new Path(tempDir, new Path("landing", "landing_file1.csv"))
+      val path2 = new Path(tempDir, new Path("landing", "landing_file2.csv"))
+
+      val table1Path = new Path(tempDir, "table1")
+      val table2Path = new Path(tempDir, "table2")
+
+      fsUtils.writeFile(path1, csv1DataStr)
+      val conf1 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema, isRerun = true)
+      val exitCode1 = AppRunner.runPipeline(conf1)
+      assert(exitCode1 == 0)
+
+      fsUtils.writeFile(path2, csv2DataStr)
+      val conf2 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode2 = AppRunner.runPipeline(conf2)
+      assert(exitCode2 == 0)
+
+      val dfTable1 = spark.read.format(metastoreFormat).load(table1Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val dfTable2 = spark.read.format(metastoreFormat).load(table2Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val actualTable1 = dfTable1.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+      val actualTable2 = dfTable2.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+
+      compareText(actualTable1, expectedStr2)
+      compareText(actualTable2, expectedStr2)
+
+      val batchIds = dfTable1.select(BATCH_ID_COLUMN).distinct().collect()
+
+      assert(batchIds.length == 2)
+
+      val om = new OffsetManagerJdbc(pramenDb.db, 123L)
+
+      val offsets = om.getOffsets("table1->table2", infoDate).map(_.asInstanceOf[CommittedOffset])
+      assert(offsets.length == 1)
+    }
+    succeed
+  }
+
+  def testNormalRunAfterRerunAfterNormalRun(metastoreFormat: String): Assertion = {
+    val csv1DataStr = s"id,name,info_date\n1,John,$infoDate\n2,Jack,$infoDate\n"
+    val csv2DataStr = s"id,name,info_date\n3,Jill,$infoDate\n4,Mary,$infoDate\n"
+    val csv3DataStr = s"id,name,info_date\n5,Jane,$infoDate\n6,Kate,$infoDate\n"
+
+    val expectedStr: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |{"id":3,"name":"Jill"}
+        |{"id":4,"name":"Mary"}
+        |{"id":5,"name":"Jane"}
+        |{"id":6,"name":"Kate"}
+        |""".stripMargin
+
+    withTempDirectory("incremental1") { tempDir =>
+      val fsUtils = new FsUtils(spark.sparkContext.hadoopConfiguration, tempDir)
+
+      val path1 = new Path(tempDir, new Path("landing", "landing_file1.csv"))
+      val path2 = new Path(tempDir, new Path("landing", "landing_file2.csv"))
+      val path3 = new Path(tempDir, new Path("landing", "landing_file3.csv"))
+
+      val table1Path = new Path(tempDir, "table1")
+      val table2Path = new Path(tempDir, "table2")
+
+      fsUtils.writeFile(path1, csv1DataStr)
+      val conf1 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode1 = AppRunner.runPipeline(conf1)
+      assert(exitCode1 == 0)
+
+      fsUtils.writeFile(path2, csv2DataStr)
+      val conf2 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema, isRerun = true)
+      val exitCode2 = AppRunner.runPipeline(conf2)
+      assert(exitCode2 == 0)
+
+      fsUtils.writeFile(path3, csv3DataStr)
+      val conf3 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode3 = AppRunner.runPipeline(conf3)
+      assert(exitCode3 == 0)
+
+      val dfTable1 = spark.read.format(metastoreFormat).load(table1Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val dfTable2 = spark.read.format(metastoreFormat).load(table2Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val actualTable1 = dfTable1.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+      val actualTable2 = dfTable2.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+
+      compareText(actualTable1, expectedStr)
+      compareText(actualTable2, expectedStr)
+
+      val batchIds = dfTable1.select(BATCH_ID_COLUMN).distinct().collect()
+
+      assert(batchIds.length == 2)
+
+      val om = new OffsetManagerJdbc(pramenDb.db, 123L)
+
+      val offsets = om.getOffsets("table1->table2", infoDate).map(_.asInstanceOf[CommittedOffset])
+      assert(offsets.length == 1)
+    }
+    succeed
+  }
+
   def getConfig(basePath: String,
                 metastoreFormat: String,
                 isRerun: Boolean = false,
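
In both scenarios the assertions are the same: all ingested records land in table1 and table2, table1 carries two distinct batch ids for the info date, and exactly one CommittedOffset remains for the "table1->table2" tracking pair, confirming that a rerun leaves a single committed offset range that subsequent normal runs extend.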

pramen/core/src/test/scala/za/co/absa/pramen/core/integration/IncrementalPipelineParquetLongSuite.scala

Lines changed: 8 additions & 0 deletions
@@ -113,5 +113,13 @@ class IncrementalPipelineParquetLongSuite extends IncrementalPipelineLongFixture
     "transformer picks up doubly ingested offsets" in {
       testTransformerPicksUpFromDoubleIngestedData(format)
     }
+
+    "run normal run after a rerun" in {
+      testNormalRunAfterRerun(format)
+    }
+
+    "run normal run then rerun, then normal run again for the same day" in {
+      testNormalRunAfterRerunAfterNormalRun(format)
+    }
   }
 }
