@@ -3,6 +3,7 @@ package scheduler
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"testing"
 	"time"
 
@@ -269,6 +270,7 @@ type ReplicaSchedulerTestCase struct {
 	replicaNodeSoftAntiAffinity      string
 	replicaZoneSoftAntiAffinity      string
 	replicaDiskSoftAntiAffinity      string
+	replicaAutoBalance               string
 	ReplicaReplenishmentWaitInterval string
 
 	// some test cases only try to schedule a subset of a volume's replicas
@@ -1156,6 +1158,11 @@ func (s *TestSuite) TestReplicaScheduler(c *C) {
 	tc.firstNilReplica = -1
 	testCases["non-reusable replica after interval expires"] = tc
 
+	// Test scheduling on the right node when "best-effort" auto balancing is enabled and an incorrect node has a
+	// disk with less load.
+	tc = generateBestEffortAutoBalanceScheduleTestCase()
+	testCases["scheduling on the right node with \"best-effort\" auto balancing"] = tc
+
 	for name, tc := range testCases {
 		fmt.Printf("testing %v\n", name)
 
@@ -1323,6 +1330,108 @@ func generateFailedReplicaTestCase(
 	return
 }
 
+// Test scheduling on the right node when "best-effort" auto balancing is enabled and an incorrect node has a
+// disk with less load.
+func generateBestEffortAutoBalanceScheduleTestCase() *ReplicaSchedulerTestCase {
+	tc := &ReplicaSchedulerTestCase{
+		engineImage:        newEngineImage(TestEngineImage, longhorn.EngineImageStateDeployed),
+		allReplicas:        make(map[string]*longhorn.Replica),
+		replicasToSchedule: map[string]struct{}{},
+		firstNilReplica:    -1,
+	}
+
+	daemon1 := newDaemonPod(corev1.PodRunning, TestDaemon1, TestNamespace, TestNode1, TestIP1)
+	daemon2 := newDaemonPod(corev1.PodRunning, TestDaemon2, TestNamespace, TestNode2, TestIP2)
+	tc.daemons = []*corev1.Pod{
+		daemon1,
+		daemon2,
+	}
+
+	// Create 2 nodes in the same zone
+	node1 := newNode(TestNode1, TestNamespace, TestZone1, true, longhorn.ConditionStatusTrue)
+	node2 := newNode(TestNode2, TestNamespace, TestZone1, true, longhorn.ConditionStatusTrue)
+	tc.engineImage.Status.NodeDeploymentMap[node1.Name] = true
+	tc.engineImage.Status.NodeDeploymentMap[node2.Name] = true
+
+	// For the test, create a volume that requires 2 replicas
+	tc.volume = newVolume(TestVolumeName, 2)
+
+	// Give each node 2 disks
+	addDisks := func(node *longhorn.Node, index int64, disk longhorn.DiskSpec, storageAvailable int64, hasReplica bool) (diskID string) {
+		if node.Spec.Disks == nil {
+			node.Spec.Disks = make(map[string]longhorn.DiskSpec)
+		}
+		if node.Status.DiskStatus == nil {
+			node.Status.DiskStatus = make(map[string]*longhorn.DiskStatus)
+		}
+
+		var scheduledReplica map[string]int64
+		if hasReplica {
+			replica := newReplicaForVolume(tc.volume)
+			replica.Spec.NodeID = node.ObjectMeta.Name
+			tc.allReplicas[replica.Name] = replica
+			scheduledReplica = map[string]int64{replica.Name: TestVolumeSize}
+		}
+
+		id := getDiskID(node.ObjectMeta.Name, strconv.FormatInt(index, 10))
+		node.Spec.Disks[id] = disk
+		node.Status.DiskStatus[id] = &longhorn.DiskStatus{
+			StorageAvailable: storageAvailable,
+			StorageScheduled: 0,
+			StorageMaximum:   TestDiskSize,
+			Conditions: []longhorn.Condition{
+				newCondition(longhorn.DiskConditionTypeSchedulable, longhorn.ConditionStatusTrue),
+			},
+			DiskUUID:         id,
+			Type:             longhorn.DiskTypeFilesystem,
+			ScheduledReplica: scheduledReplica,
+		}
+		return id
+	}
+
+	node1disk1 := newDisk(TestDefaultDataPath, true, 0)
+	node1disk2 := newDisk(TestDefaultDataPath, true, 0)
+	addDisks(node1, 1, node1disk1, TestDiskAvailableSize, true)  // A replica is already scheduled on this disk
+	addDisks(node1, 2, node1disk2, TestDiskAvailableSize, false) // No replica, but the current bug causes the scheduler to pick this disk
+
+	node2disk1 := newDisk(TestDefaultDataPath, true, 0)
+	node2disk2 := newDisk(TestDefaultDataPath, true, 0)
+	addDisks(node2, 1, node2disk1, TestDiskAvailableSize/2-100, false)               // No replica
+	expectedDiskID := addDisks(node2, 2, node2disk2, TestDiskAvailableSize/2, false) // No replica; the scheduler should choose this disk since it has the most storage available of the valid options
+
+	// TODO: Setting the next line to "best-effort" causes the test to fail. Removing this line bypasses the bug and makes the test pass.
+	tc.replicaAutoBalance = "best-effort"    // The current scheduling bug happens with "best-effort" only.
+	tc.replicaDiskSoftAntiAffinity = "false" // Do not allow scheduling of replicas on the same disk.
+	tc.replicaNodeSoftAntiAffinity = "false" // Do not allow scheduling of replicas on the same node.
+	tc.replicaZoneSoftAntiAffinity = "true"  // Allow scheduling in the same zone; both nodes are in the same one. Otherwise the scheduler takes a shortcut that hides the bug.
+
+	tc.nodes = map[string]*longhorn.Node{
+		TestNode1: node1,
+		TestNode2: node2,
+	}
+
+	// Add a replica that still needs to be scheduled
+	replicaToSchedule := newReplicaForVolume(tc.volume)
+	tc.allReplicas[replicaToSchedule.Name] = replicaToSchedule
+
+	// Only test scheduling for replicaToSchedule; we don't want to schedule the replica that is already on node1disk1 again
+	tc.replicasToSchedule[replicaToSchedule.Name] = struct{}{}
+
+	// Expect the replica to be scheduled on node2disk2:
+	// - node1disk1 should not be possible; there is already a replica on the node and on the disk
+	// - node1disk2 should not be possible; there is already a replica on the node
+	// - node2disk1 is possible, but does not have the most available storage space left
+	// - node2disk2 is possible and has the most storage left of all valid options; this should be the candidate
+	tc.expectedNodes = map[string]*longhorn.Node{
+		TestNode2: node2,
+	}
+	tc.expectedDisks = map[string]struct{}{
+		expectedDiskID: {},
+	}
+
+	return tc
+}
+
 func setSettings(tc *ReplicaSchedulerTestCase, lhClient *lhfake.Clientset, sIndexer cache.Indexer, c *C) {
 	// Set default-instance-manager-image setting
 	s := initSettings(string(types.SettingNameDefaultInstanceManagerImage), TestInstanceManagerImage)
@@ -1378,6 +1487,17 @@ func setSettings(tc *ReplicaSchedulerTestCase, lhClient *lhfake.Clientset, sIndexer cache.Indexer, c *C) {
 		err = sIndexer.Add(setting)
 		c.Assert(err, IsNil)
 	}
+	// Set replica auto-balance setting
+	if tc.replicaAutoBalance != "" {
+		s := initSettings(
+			string(types.SettingNameReplicaAutoBalance),
+			tc.replicaAutoBalance)
+		setting, err :=
+			lhClient.LonghornV1beta2().Settings(TestNamespace).Create(context.TODO(), s, metav1.CreateOptions{})
+		c.Assert(err, IsNil)
+		err = sIndexer.Add(setting)
+		c.Assert(err, IsNil)
+	}
 	// Set replica replenishment wait interval setting
 	if tc.ReplicaReplenishmentWaitInterval != "" {
 		s := initSettings(