@@ -1279,6 +1279,120 @@ class IncrementalPipelineLongFixture extends AnyWordSpec
     succeed
   }
 
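+  // A rerun for the information date followed by a normal incremental run:
+  // both metastore tables should end up with the records from both landing files.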
+  def testNormalRunAfterRerun(metastoreFormat: String): Assertion = {
+    val csv1DataStr = s"id,name,info_date\n1,John,$infoDate\n2,Jack,$infoDate\n"
+    val csv2DataStr = s"id,name,info_date\n3,Jill,$infoDate\n4,Mary,$infoDate\n"
+
+    val expectedStr1: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |""".stripMargin
+
+    val expectedStr2: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |{"id":3,"name":"Jill"}
+        |{"id":4,"name":"Mary"}
+        |""".stripMargin
+
+    withTempDirectory("incremental1") { tempDir =>
+      val fsUtils = new FsUtils(spark.sparkContext.hadoopConfiguration, tempDir)
+
+      val path1 = new Path(tempDir, new Path("landing", "landing_file1.csv"))
+      val path2 = new Path(tempDir, new Path("landing", "landing_file2.csv"))
+
+      val table1Path = new Path(tempDir, "table1")
+      val table2Path = new Path(tempDir, "table2")
+
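+      // The first pipeline run is a rerun for the information date.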
+      fsUtils.writeFile(path1, csv1DataStr)
+      val conf1 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema, isRerun = true)
+      val exitCode1 = AppRunner.runPipeline(conf1)
+      assert(exitCode1 == 0)
+
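+      // The second run is a normal incremental run that should pick up only the new landing file.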
+      fsUtils.writeFile(path2, csv2DataStr)
+      val conf2 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode2 = AppRunner.runPipeline(conf2)
+      assert(exitCode2 == 0)
+
+      val dfTable1 = spark.read.format(metastoreFormat).load(table1Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val dfTable2 = spark.read.format(metastoreFormat).load(table2Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val actualTable1 = dfTable1.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+      val actualTable2 = dfTable2.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+
+      compareText(actualTable1, expectedStr2)
+      compareText(actualTable2, expectedStr2)
+
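+      // Each run writes with its own batch id, so two distinct batch ids are expected.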
+      val batchIds = dfTable1.select(BATCH_ID_COLUMN).distinct().collect()
+
+      assert(batchIds.length == 2)
+
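+      // The table1 -> table2 dependency should have a single committed offset for the info date.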
+      val om = new OffsetManagerJdbc(pramenDb.db, 123L)
+
+      val offsets = om.getOffsets("table1->table2", infoDate).map(_.asInstanceOf[CommittedOffset])
+      assert(offsets.length == 1)
+    }
+    succeed
+  }
+
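+  // A normal run, then a rerun for the same information date, then another normal run:
+  // all three landing files should be reflected in both tables exactly once.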
+  def testNormalRunAfterRerunAfterNormalRun(metastoreFormat: String): Assertion = {
+    val csv1DataStr = s"id,name,info_date\n1,John,$infoDate\n2,Jack,$infoDate\n"
+    val csv2DataStr = s"id,name,info_date\n3,Jill,$infoDate\n4,Mary,$infoDate\n"
+    val csv3DataStr = s"id,name,info_date\n5,Jane,$infoDate\n6,Kate,$infoDate\n"
+
+    val expectedStr: String =
+      """{"id":1,"name":"John"}
+        |{"id":2,"name":"Jack"}
+        |{"id":3,"name":"Jill"}
+        |{"id":4,"name":"Mary"}
+        |{"id":5,"name":"Jane"}
+        |{"id":6,"name":"Kate"}
+        |""".stripMargin
+
+    withTempDirectory("incremental1") { tempDir =>
+      val fsUtils = new FsUtils(spark.sparkContext.hadoopConfiguration, tempDir)
+
+      val path1 = new Path(tempDir, new Path("landing", "landing_file1.csv"))
+      val path2 = new Path(tempDir, new Path("landing", "landing_file2.csv"))
+      val path3 = new Path(tempDir, new Path("landing", "landing_file3.csv"))
+
+      val table1Path = new Path(tempDir, "table1")
+      val table2Path = new Path(tempDir, "table2")
+
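+      // First normal incremental run picks up the first landing file.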
+      fsUtils.writeFile(path1, csv1DataStr)
+      val conf1 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode1 = AppRunner.runPipeline(conf1)
+      assert(exitCode1 == 0)
+
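+      // Rerun for the same information date after the second landing file has arrived.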
+      fsUtils.writeFile(path2, csv2DataStr)
+      val conf2 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema, isRerun = true)
+      val exitCode2 = AppRunner.runPipeline(conf2)
+      assert(exitCode2 == 0)
+
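+      // Another normal incremental run picks up the third landing file.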
+      fsUtils.writeFile(path3, csv3DataStr)
+      val conf3 = getConfig(tempDir, metastoreFormat, hasInfoDate = true, inferSchema = false, csvSchema = csvWithInfoDateSchema)
+      val exitCode3 = AppRunner.runPipeline(conf3)
+      assert(exitCode3 == 0)
+
+      val dfTable1 = spark.read.format(metastoreFormat).load(table1Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val dfTable2 = spark.read.format(metastoreFormat).load(table2Path.toString).filter(col(INFO_DATE_COLUMN) === Date.valueOf(infoDate))
+      val actualTable1 = dfTable1.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+      val actualTable2 = dfTable2.select("id", "name").orderBy("id").toJSON.collect().mkString("\n")
+
+      compareText(actualTable1, expectedStr)
+      compareText(actualTable2, expectedStr)
+
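+      // The rerun replaces the batch of the first run, so only two distinct batch ids are expected after three runs.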
+      val batchIds = dfTable1.select(BATCH_ID_COLUMN).distinct().collect()
+
+      assert(batchIds.length == 2)
+
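+      // The table1 -> table2 dependency should again have a single committed offset for the info date.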
+      val om = new OffsetManagerJdbc(pramenDb.db, 123L)
+
+      val offsets = om.getOffsets("table1->table2", infoDate).map(_.asInstanceOf[CommittedOffset])
+      assert(offsets.length == 1)
+    }
+    succeed
+  }
+
   def getConfig(basePath: String,
                 metastoreFormat: String,
                 isRerun: Boolean = false,