From b8c064741ce048cfa9622d7fce23c5dd55b67378 Mon Sep 17 00:00:00 2001 From: Tao Yang Date: Tue, 5 Aug 2025 10:22:46 +0800 Subject: [PATCH 1/4] YARN-11843. Fix potential deadlock when auto-correction of container allocation is enabled --- .../resourcemanager/scheduler/AbstractYarnScheduler.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 3343c5f93118d..c0b671b276eb4 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -724,9 +724,7 @@ protected void autoCorrectContainerAllocation(List resourceRequ if (extraContainers > 0) { // Change the state of the container from ALLOCATED to EXPIRED since it is not required. LOG.debug("Removing extra container:{}", rmContainer.getContainer()); - completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus( - rmContainer.getContainerId(), SchedulerUtils.EXPIRED_CONTAINER), - RMContainerEventType.EXPIRE); + asyncContainerRelease(rmContainer); application.newlyAllocatedContainers.remove(rmContainer); extraContainers--; } From cc6bc5fbade22da6b88033c31f3fc8996b737262 Mon Sep 17 00:00:00 2001 From: Tao Yang Date: Tue, 5 Aug 2025 10:38:11 +0800 Subject: [PATCH 2/4] Update comment. --- .../server/resourcemanager/scheduler/AbstractYarnScheduler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index c0b671b276eb4..929254815d876 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -722,7 +722,7 @@ protected void autoCorrectContainerAllocation(List resourceRequ if (allocatedContainers != null) { for (RMContainer rmContainer : allocatedContainers) { if (extraContainers > 0) { - // Change the state of the container from ALLOCATED to EXPIRED since it is not required. + // Change the state of the container from ALLOCATED to RELEASED since it is not required. LOG.debug("Removing extra container:{}", rmContainer.getContainer()); asyncContainerRelease(rmContainer); application.newlyAllocatedContainers.remove(rmContainer); From 29cd1d7e065f1193f52ac6a3f5de5b56b4bde03c Mon Sep 17 00:00:00 2001 From: Tao Yang Date: Tue, 5 Aug 2025 15:51:32 +0800 Subject: [PATCH 3/4] Fix checkstyle warning. --- .../resourcemanager/scheduler/AbstractYarnScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index 929254815d876..cff8c51d48dea 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -722,7 +722,8 @@ protected void autoCorrectContainerAllocation(List resourceRequ if (allocatedContainers != null) { for (RMContainer rmContainer : allocatedContainers) { if (extraContainers > 0) { - // Change the state of the container from ALLOCATED to RELEASED since it is not required. + // Change the state of the container from ALLOCATED to RELEASED + // since it is not required. LOG.debug("Removing extra container:{}", rmContainer.getContainer()); asyncContainerRelease(rmContainer); application.newlyAllocatedContainers.remove(rmContainer); From f05899c153c235392f048360f3752022ef7017db Mon Sep 17 00:00:00 2001 From: Tao Yang Date: Mon, 18 Aug 2025 07:52:22 +0800 Subject: [PATCH 4/4] Adjust expected state from EXIPRED to RELEASED in AbstractYarnScheduler#recoverResourceRequestForContainer and add UT. --- .../hadoop/yarn/event/DrainDispatcher.java | 2 +- .../scheduler/AbstractYarnScheduler.java | 4 +-- .../scheduler/TestAbstractYarnScheduler.java | 28 +++++++++++++++++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/event/DrainDispatcher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/event/DrainDispatcher.java index 2045eb6309200..9e6e776949e27 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/event/DrainDispatcher.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/event/DrainDispatcher.java @@ -96,7 +96,7 @@ public void handle(Event event) { } @Override - protected boolean isDrained() { + public boolean isDrained() { synchronized (mutex) { return drained; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java index cff8c51d48dea..fca45b3f6ba82 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AbstractYarnScheduler.java @@ -769,10 +769,10 @@ private void recoverResourceRequestForContainer(RMContainer rmContainer) { } // when auto correct container allocation is enabled, there can be a case when extra containers - // go to expired state from allocated state. When such scenario happens do not re-attempt the + // go to released state from allocated state. When such scenario happens do not re-attempt the // container request since this is expected. if (autoCorrectContainerAllocation && - RMContainerState.EXPIRED.equals(rmContainer.getState())) { + RMContainerState.RELEASED.equals(rmContainer.getState())) { return; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java index d6baacebd47ab..864f0e1d54f4f 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java @@ -42,6 +42,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.Service; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableList; import org.apache.hadoop.util.Lists; import org.apache.hadoop.util.Sets; import org.apache.hadoop.util.Time; @@ -86,6 +87,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; +import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ContainerRequest; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; @@ -291,7 +293,7 @@ private void testMaximumAllocationVCoresHelper( */ @ParameterizedTest(name = "{0}") @MethodSource("getParameters") - public void testAutoCorrectContainerAllocation(SchedulerType type) throws IOException { + public void testAutoCorrectContainerAllocation(SchedulerType type) throws Exception { initTestAbstractYarnScheduler(type); Configuration conf = new Configuration(getConf()); conf.setBoolean(YarnConfiguration.RM_SCHEDULER_AUTOCORRECT_CONTAINER_ALLOCATION, true); @@ -372,7 +374,8 @@ private RMContainer createMockRMContainer(int containerId, NodeId nodeId, Container container = mock(Container.class); // Mock the Container instance with the specified parameters - when(container.getResource()).thenReturn(Resource.newInstance(memory, 1)); + Resource resource = Resource.newInstance(memory, 1); + when(container.getResource()).thenReturn(resource); when(container.getPriority()).thenReturn(priority); when(container.getId()).thenReturn(ContainerId.newContainerId(appAttemptId, containerId)); when(container.getNodeId()).thenReturn(nodeId); @@ -388,6 +391,10 @@ private RMContainer createMockRMContainer(int containerId, NodeId nodeId, when(rmContainer.getContainer()).thenReturn(container); when(rmContainer.getContainerId()).thenReturn( ContainerId.newContainerId(appAttemptId, containerId)); + ResourceRequest resourceRequest = + BuilderUtils.newResourceRequest(priority, "*", resource, 1); + when(rmContainer.getContainerRequest()).thenReturn( + new ContainerRequest(ImmutableList.of(resourceRequest))); return rmContainer; } @@ -470,7 +477,7 @@ private void testContainerAskAndNewlyAllocatedContainerOne(AbstractYarnScheduler */ private void testContainerAskZeroAndNewlyAllocatedContainerOne(AbstractYarnScheduler scheduler, SchedulerApplicationAttempt application, SchedulerNode schedulerNode, NodeId nodeId, - Priority priority, ApplicationAttemptId appAttemptId) { + Priority priority, ApplicationAttemptId appAttemptId) throws Exception { // Create a resource request with 0 containers, 1024 MB memory, and GUARANTEED execution type ResourceRequest resourceRequest = createResourceRequest(1024, 1, 0, priority, 0L, @@ -485,14 +492,29 @@ private void testContainerAskZeroAndNewlyAllocatedContainerOne(AbstractYarnSched // Add the RMContainer to the newly allocated containers of the application application.addToNewlyAllocatedContainers(schedulerNode, rmContainer1); + // Simulate the container is released and expect it won't be recovered + // when calling AbstractYarnScheduler#recoverResourceRequestForContainer + when(rmContainer1.getState()).thenReturn(RMContainerState.RELEASED); + // Call the autoCorrectContainerAllocation method scheduler.autoCorrectContainerAllocation(containerAsk, application); + // Make sure all event is handled + Dispatcher dispatcher = scheduler.rmContext.getDispatcher(); + if (dispatcher instanceof DrainDispatcher) { + GenericTestUtils.waitFor( + () -> ((DrainDispatcher) dispatcher).isDrained(), + 500, 3000); + } + // Assert that the container ask remains 0 assertEquals(0, resourceRequest.getNumContainers()); // Assert that there are no newly allocated containers assertEquals(0, application.pullNewlyAllocatedContainers().size()); + + // Assert that the next pending ask is null + assertNull(application.appSchedulingInfo.getNextPendingAsk()); } /**