@@ -1277,11 +1277,11 @@ func (c *MPIJobController) getConfigMap(mpiJob *kubeflow.MPIJob) (*corev1.Config
1277
1277
// one if it doesn't exist.
1278
1278
func (c * MPIJobController ) getOrCreateConfigMap (mpiJob * kubeflow.MPIJob ) (* corev1.ConfigMap , error ) {
1279
1279
klog .Infof ("create config called for %s" , getJobKey (mpiJob ))
1280
- newCM := newConfigMap (mpiJob , c .workerReplicas (mpiJob ))
1281
1280
podList , err := c .getRunningWorkerPods (mpiJob )
1282
1281
if err != nil {
1283
1282
return nil , err
1284
1283
}
1284
+ newCM := newConfigMap (mpiJob , c .workerReplicas (mpiJob ), podList )
1285
1285
updateDiscoverHostsInConfigMap (newCM , mpiJob , podList )
1286
1286
1287
1287
cm , err := c .configMapLister .ConfigMaps (mpiJob .Namespace ).Get (mpiJob .Name + configSuffix )
@@ -1935,7 +1935,7 @@ func (c *MPIJobController) doUpdateJobStatus(mpiJob *kubeflow.MPIJob) error {
1935
1935
// newConfigMap creates a new ConfigMap containing configurations for an MPIJob
1936
1936
// resource. It also sets the appropriate OwnerReferences on the resource so
1937
1937
// handleObject can discover the MPIJob resource that 'owns' it.
1938
- func newConfigMap (mpiJob * kubeflow.MPIJob , workerReplicas int32 ) * corev1.ConfigMap {
1938
+ func newConfigMap (mpiJob * kubeflow.MPIJob , workerReplicas int32 , workerPods [] * corev1. Pod ) * corev1.ConfigMap {
1939
1939
var buffer bytes.Buffer
1940
1940
slots := ptr .Deref (mpiJob .Spec .SlotsPerWorker , 1 )
1941
1941
// note that pod.spec.dnsConfig also affect the svc resolution
@@ -1955,8 +1955,22 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM
1955
1955
for i := 0 ; i < int (* mpiJob .Spec .MPIReplicaSpecs [kubeflow .MPIReplicaTypeWorker ].MaxReplicas ); i ++ {
1956
1956
name := workerName (mpiJob , i )
1957
1957
1958
- //buffer.WriteString(fmt.Sprintf("host %s.%s ++cpus %d\n", name, mpiJob.Name, slots))
1959
- buffer .WriteString (fmt .Sprintf ("%s.%s.%s.svc slots=%d\n " , name , mpiJob .Name , mpiJob .Namespace , slots ))
1958
+ // Find the corresponding pod for this worker
1959
+ var podIP string
1960
+ for _ , pod := range workerPods {
1961
+ if pod .Name == name && pod .Status .PodIP != "" {
1962
+ podIP = pod .Status .PodIP
1963
+ break
1964
+ }
1965
+ }
1966
+
1967
+ // Use IP address if available, otherwise fall back to DNS name
1968
+ if podIP != "" {
1969
+ buffer .WriteString (fmt .Sprintf ("%s slots=%d\n " , podIP , slots ))
1970
+ } else {
1971
+ // Fallback to DNS name if IP is not available
1972
+ buffer .WriteString (fmt .Sprintf ("%s.%s.%s.svc slots=%d\n " , name , mpiJob .Name , mpiJob .Namespace , slots ))
1973
+ }
1960
1974
/*switch mpiJob.Spec.MPIImplementation {
1961
1975
case kubeflow.MPIImplementationOpenMPI:
1962
1976
buffer.WriteString(fmt.Sprintf("%s.%s.%s.svc slots=%d\n", name, mpiJob.Name, mpiJob.Namespace, slots))
0 commit comments