Skip to content

Commit 372a082

Browse files
committed
Use IP address to avoid DNS lookup
1 parent f8d26c1 commit 372a082

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

pkg/controller/mpi_job_controller.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,11 +1277,11 @@ func (c *MPIJobController) getConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.Config
12771277
// one if it doesn't exist.
12781278
func (c *MPIJobController) getOrCreateConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.ConfigMap, error) {
12791279
klog.Infof("create config called for %s", getJobKey(mpiJob))
1280-
newCM := newConfigMap(mpiJob, c.workerReplicas(mpiJob))
12811280
podList, err := c.getRunningWorkerPods(mpiJob)
12821281
if err != nil {
12831282
return nil, err
12841283
}
1284+
newCM := newConfigMap(mpiJob, c.workerReplicas(mpiJob), podList)
12851285
updateDiscoverHostsInConfigMap(newCM, mpiJob, podList)
12861286

12871287
cm, err := c.configMapLister.ConfigMaps(mpiJob.Namespace).Get(mpiJob.Name + configSuffix)
@@ -1935,7 +1935,7 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
19351935
// newConfigMap creates a new ConfigMap containing configurations for an MPIJob
19361936
// resource. It also sets the appropriate OwnerReferences on the resource so
19371937
// handleObject can discover the MPIJob resource that 'owns' it.
1938-
func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigMap {
1938+
func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32, workerPods []*corev1.Pod) *corev1.ConfigMap {
19391939
var buffer bytes.Buffer
19401940
slots := ptr.Deref(mpiJob.Spec.SlotsPerWorker, 1)
19411941
// note that pod.spec.dnsConfig also affects the svc resolution
@@ -1955,8 +1955,22 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM
19551955
for i := 0; i < int(*mpiJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].MaxReplicas); i++ {
19561956
name := workerName(mpiJob, i)
19571957

1958-
//buffer.WriteString(fmt.Sprintf("host %s.%s ++cpus %d\n", name, mpiJob.Name, slots))
1959-
buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
1958+
// Find the corresponding pod for this worker
1959+
var podIP string
1960+
for _, pod := range workerPods {
1961+
if pod.Name == name && pod.Status.PodIP != "" {
1962+
podIP = pod.Status.PodIP
1963+
break
1964+
}
1965+
}
1966+
1967+
// Use IP address if available, otherwise fall back to DNS name
1968+
if podIP != "" {
1969+
buffer.WriteString(fmt.Sprintf("%s slots=%d\n", podIP, slots))
1970+
} else {
1971+
// Fall back to the DNS name if the pod IP is not available
1972+
buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
1973+
}
19601974
/*switch mpiJob.Spec.MPIImplementation {
19611975
case kubeflow.MPIImplementationOpenMPI:
19621976
buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))

0 commit comments

Comments
 (0)