@@ -468,7 +468,7 @@ func NewMPIJobControllerWithClock(
468
468
oldExpandReplicas : make (map [string ]int32 ),
469
469
runningJobs : pqRunning ,
470
470
queuedJobs : pqQueued ,
471
- freeSlots : 60 ,
471
+ freeSlots : 10 ,
472
472
rescaleGap : 1 * time .Second , // 1 second (comment previously said 3 minutes — TODO confirm intended gap)
473
473
}
474
474
// FIXME fix the free slots!
@@ -1534,10 +1534,12 @@ func (c *MPIJobController) calculateWorkerReplicas(mpiJob *kubeflow.MPIJob) (int
1534
1534
klog .Infof ("Queued job 1 %s, %d" , getJobKey (mpiJob ), numWorkersToFree )
1535
1535
return 0 , jobQueuedError
1536
1536
} else {
1537
- numWorkersToFree = * worker .MinReplicas - int32 (c .freeSlots ) + 1
1538
- index = len (c .runningJobs ) - 1
1537
+ //numWorkersToFree = *worker.MinReplicas - int32(c.freeSlots) + 1
1538
+ maxWorkersToFree := * worker .MaxReplicas - int32 (c .freeSlots ) + 1
1539
+ minWorkersToFree := * worker .MinReplicas - int32 (c .freeSlots ) + 1
1540
+ index := len (c .runningJobs ) - 1
1539
1541
for {
1540
- if numWorkersToFree == 0 || index < 0 {
1542
+ if maxWorkersToFree == 0 || index < 0 {
1541
1543
break
1542
1544
}
1543
1545
@@ -1561,7 +1563,7 @@ func (c *MPIJobController) calculateWorkerReplicas(mpiJob *kubeflow.MPIJob) (int
1561
1563
jobMinReplicas := * it .mpiJob .Spec .MPIReplicaSpecs [kubeflow .MPIReplicaTypeWorker ].MinReplicas
1562
1564
if int32 (len (workerPodList )) > jobMinReplicas && c .lastAction [getJobKey (& it .mpiJob )].Add (c .rescaleGap ).Before (c .clock .Now ()) {
1563
1565
newPodCount := int32 (math .Max (float64 (jobMinReplicas ),
1564
- float64 (len (workerPodList )- int (numWorkersToFree ))))
1566
+ float64 (len (workerPodList )- int (maxWorkersToFree ))))
1565
1567
klog .Infof ("Setting replicas for %s to %d" , getJobKey (& it .mpiJob ), newPodCount )
1566
1568
1567
1569
err := c .sendRescaleSignal (& it .mpiJob , int32 (len (workerPodList )), newPodCount )
@@ -1572,20 +1574,21 @@ func (c *MPIJobController) calculateWorkerReplicas(mpiJob *kubeflow.MPIJob) (int
1572
1574
}
1573
1575
1574
1576
c .latestReplicas [getJobKey (& it .mpiJob )] = newPodCount
1575
- numWorkersToFree -= int32 (len (workerPodList ) - int (newPodCount ))
1577
+ maxWorkersToFree -= int32 (len (workerPodList ) - int (newPodCount ))
1578
+ minWorkersToFree -= int32 (len (workerPodList ) - int (newPodCount ))
1576
1579
c .freeSlots += len (workerPodList ) - int (newPodCount )
1577
1580
1578
1581
c .queue .AddRateLimited (getJobKey (& it .mpiJob ))
1579
1582
}
1580
1583
}
1581
- if numWorkersToFree > 0 {
1584
+ if minWorkersToFree > 0 {
1582
1585
// queue this job
1583
1586
//c.enqueueJobInternal(mpiJob)
1584
- klog .Infof ("Queued job 2 %s, %d" , getJobKey (mpiJob ), numWorkersToFree )
1587
+ klog .Infof ("Queued job 2 %s, %d" , getJobKey (mpiJob ), minWorkersToFree )
1585
1588
return 0 , jobQueuedError
1586
1589
}
1587
1590
}
1588
- return * worker . MinReplicas , nil
1591
+ return int32 ( c . freeSlots ) - 1 , nil
1589
1592
} else {
1590
1593
return replicas , nil
1591
1594
}
0 commit comments