From a015e8c1024e0cc7f420df3b3f1999b1ccdfa1c4 Mon Sep 17 00:00:00 2001 From: Paulo Aragao Date: Thu, 5 Jun 2025 11:28:12 +0100 Subject: [PATCH 1/2] added new tool to scale up-down nodes on an instance group --- 1.architectures/0.common/README.md | 6 + .../update-instance-group-instance-count.yaml | 423 ++++++++++++++++++ 2 files changed, 429 insertions(+) create mode 100644 1.architectures/0.common/update-instance-group-instance-count.yaml diff --git a/1.architectures/0.common/README.md b/1.architectures/0.common/README.md index 7086c7431..8a3d5b625 100644 --- a/1.architectures/0.common/README.md +++ b/1.architectures/0.common/README.md @@ -12,3 +12,9 @@ This template creates a S3 Bucket with all public access disabled. To deploy it, This template deploys a stack to receive human-readable email notifications for HyperPod cluster status changes and node health events. See the [workshop page](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/07-tips-and-tricks/26-event-bridge) for more details. [ 
 1-Click Deploy πŸš€β€ƒ
 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/e3752eec-63b5-4033-9720-fa68d35164e9/hyperpod-event-bridge-email.yaml&stackName=hyperpod-event-bridge-email) + +## Create a scheduler to scale up and down the number of nodes in an instance group + +This template deploys an AWS Lambda lamdba function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. + +[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/0.common/update-instance-group-instance-count.yaml b/1.architectures/0.common/update-instance-group-instance-count.yaml new file mode 100644 index 000000000..2352e4f02 --- /dev/null +++ b/1.architectures/0.common/update-instance-group-instance-count.yaml @@ -0,0 +1,423 @@ +Parameters: + HyperpodClusterName: + Type: String + Default: ml-cluster + Description: Name of the SageMaker HyperPod cluster to work with + InstanceGroupName: + Type: String + Default: accelerated-worker-group-1 + Description: Name of the instance group to work with + ScaleDownCount: + Type: String + Default: "0" + Description: Number of instances when scaling down + ScaleUpCount: + Type: String + Default: "8" + Description: Number of instances when scaling up + ScaleDownCron: + Type: String + Default: cron(0 20 * * ? *) + Description: Cron expression to be used for the scale down rule + ScaleUpCron: + Type: String + Default: cron(0 8 * * ? 
*) + Description: Cron expression to be used for the scale up rule +Resources: + UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + Version: "2012-10-17" + Description: Role for the Lambda function to update the instance count of a SageMaker HyperPod instance group + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - sagemaker:DescribeCluster + - sagemaker:UpdateCluster + - sagemaker:BatchDeleteClusterNodes + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:aws:sagemaker:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :cluster/* + - Action: + - eks:AssociateAccessPolicy + - eks:CreateAccessEntry + - eks:DeleteAccessEntry + - eks:DescribeAccessEntry + - eks:DescribeCluster + Effect: Allow + Resource: + - Fn::Join: + - "" + - - "arn:aws:eks:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :access-entry/*/*/*/*/* + - Fn::Join: + - "" + - - "arn:aws:eks:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :cluster/* + - Action: iam:PassRole + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:aws:iam::" + - Ref: AWS::AccountId + - :role/* + Version: "2012-10-17" + PolicyName: UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5 + Roles: + - Ref: UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + UpdateHyperpodInstanceGroupInstanceCountCAF78010: + Type: AWS::Lambda::Function + Properties: + Architectures: + - arm64 + Code: + ZipFile: " + + import boto3 + + import json + + import os + + + def lambda_handler(event, context): + + \ \"\"\" + + \ AWS Lambda function to update the number of instances in a SageMaker 
HyperPod instance group. + + \ Uses environment variables for configuration: + + \ - HYPERPOD_CLUSTER_NAME: Name of the HyperPod cluster + + \ - HYPERPOD_INSTANCE_GROUP: Name of the instance group to update (default: accelerated-worker-group) + + \ - HYPERPOD_INSTANCE_COUNT: Number of instances to set (default: 0) + + \ \"\"\" + + \ # Get parameters from environment variables, with optional override from event + + \ cluster_name = event.get('cluster_name', os.environ.get('HYPERPOD_CLUSTER_NAME')) + + \ instance_group = event.get('instance_group', os.environ.get('HYPERPOD_INSTANCE_GROUP')) + + \ instance_count = int(event.get('instance_count')) + + + \ # Validate required parameters + + \ if not cluster_name: + + \ return { + + \ 'statusCode': 400, + + \ 'body': json.dumps('Error: cluster_name is required (set via event or HYPERPOD_CLUSTER_NAME environment variable)') + + \ } + + + \ # Get region from event, environment variable, or default + + \ region = event.get('region', os.environ.get('AWS_REGION', 'us-east-1')) + + + \ # Initialize SageMaker client + + \ sagemaker_client = boto3.client('sagemaker', region_name=region) + + + \ try: + + \ # Get current cluster configuration + + \ response = sagemaker_client.describe_cluster( + + \ ClusterName=cluster_name + + \ ) + + + \ # Find the target instance group + + \ instance_groups = response.get('InstanceGroups', []) + + \ target_group = None + + \ other_groups = [] + + + \ for group in instance_groups: + + \ if group.get('InstanceGroupName') == instance_group: + + \ target_group = group + + \ else: + + \ other_groups.append(group) + + + \ if not target_group: + + \ return { + + \ 'statusCode': 404, + + \ 'body': json.dumps(f'Error: {instance_group} not found in the cluster') + + \ } + + + \ # Create a copy of the target group with updated instance count + + \ updated_target_group = { + + \ 'InstanceGroupName': instance_group, + + \ 'InstanceCount': instance_count, + + \ 'InstanceType': 
target_group.get('InstanceType'), + + \ 'LifeCycleConfig': target_group.get('LifeCycleConfig'), + + \ 'ExecutionRole': target_group.get('ExecutionRole') + + \ } + + + \ # Include InstanceStorageConfigs if present in the original configuration + + \ if 'InstanceStorageConfigs' in target_group: + + \ updated_target_group['InstanceStorageConfigs'] = target_group['InstanceStorageConfigs'] + + + \ # Prepare the update request with all instance groups + + \ # We need to include all instance groups in the update request + + \ update_groups = [updated_target_group] + + + \ # Include other instance groups unchanged + + \ for group in other_groups: + + \ update_group = { + + \ 'InstanceGroupName': group.get('InstanceGroupName'), + + \ 'InstanceCount': group.get('CurrentCount'), + + \ 'InstanceType': group.get('InstanceType'), + + \ 'LifeCycleConfig': group.get('LifeCycleConfig'), + + \ 'ExecutionRole': group.get('ExecutionRole') + + \ } + + + \ # Include InstanceStorageConfigs if present + + \ if 'InstanceStorageConfigs' in group: + + \ update_group['InstanceStorageConfigs'] = group['InstanceStorageConfigs'] + + + \ update_groups.append(update_group) + + + \ # Get node recovery setting if present + + \ node_recovery = response.get('NodeRecovery', 'Automatic') + + \ print('update groups: ', update_groups) + + \ # Update the cluster with all instance groups + + \ update_response = sagemaker_client.update_cluster( + + \ ClusterName=cluster_name, + + \ InstanceGroups=update_groups, + + \ NodeRecovery=node_recovery + + \ ) + + + \ return { + + \ 'statusCode': 200, + + \ 'body': json.dumps({ + + \ 'message': f'Successfully updated {instance_group} to {instance_count} instances', + + \ 'update_details': update_response + + \ }) + + \ } + + \ except Exception as e: + + \ return { + + \ 'statusCode': 500, + + \ 'body': json.dumps(f'Error updating cluster: {str(e)}') + + \ } + + \ " + Environment: + Variables: + HYPERPOD_CLUSTER_NAME: + Ref: HyperpodClusterName + 
HYPERPOD_INSTANCE_GROUP: + Ref: InstanceGroupName + Handler: index.lambda_handler + MemorySize: 128 + Role: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + - Arn + Runtime: python3.13 + Timeout: 30 + DependsOn: + - UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5 + - UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + UpdateHyperpodInstanceGroupInstanceCountLogGroupF950D618: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: + Fn::Join: + - "" + - - /aws/lambda/ + - Ref: UpdateHyperpodInstanceGroupInstanceCountCAF78010 + RetentionInDays: 731 + UpdateReplacePolicy: Retain + DeletionPolicy: Retain + HyperPodScaleDownRule667EC34E: + Type: AWS::Events::Rule + Properties: + Description: + Fn::Join: + - "" + - - "Scale down " + - Ref: InstanceGroupName + - " in " + - Ref: HyperpodClusterName + - " to " + - Ref: ScaleDownCount + - " instances" + ScheduleExpression: + Ref: ScaleDownCron + State: ENABLED + Targets: + - Arn: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Id: Target0 + Input: + Fn::Join: + - "" + - - '{"cluster_name":"' + - Ref: HyperpodClusterName + - '","instance_group":"' + - Ref: InstanceGroupName + - '","instance_count":"' + - Ref: ScaleDownCount + - '","action":"scale-down"}' + HyperPodScaleDownRuleAllowEventRuleHyperpodInstanceCountUpdateUpdateHyperpodInstanceGroupInstanceCountF26B980BD6564B21: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Principal: events.amazonaws.com + SourceArn: + Fn::GetAtt: + - HyperPodScaleDownRule667EC34E + - Arn + HyperPodScaleUpRule47956CBB: + Type: AWS::Events::Rule + Properties: + Description: + Fn::Join: + - "" + - - "Scale up " + - Ref: InstanceGroupName + - " in " + - Ref: HyperpodClusterName + - " to " + - Ref: ScaleUpCount + - " instances" + ScheduleExpression: + Ref: ScaleUpCron + State: ENABLED + Targets: + - 
Arn: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Id: Target0 + Input: + Fn::Join: + - "" + - - '{"cluster_name":"' + - Ref: HyperpodClusterName + - '","instance_group":"' + - Ref: InstanceGroupName + - '","instance_count":"' + - Ref: ScaleUpCount + - '","action":"scale-up"}' + HyperPodScaleUpRuleAllowEventRuleHyperpodInstanceCountUpdateUpdateHyperpodInstanceGroupInstanceCountF26B980B6A706B98: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Principal: events.amazonaws.com + SourceArn: + Fn::GetAtt: + - HyperPodScaleUpRule47956CBB + - Arn + From f216f6cb92f937c1847869268803fd52ca76eb65 Mon Sep 17 00:00:00 2001 From: Paulo Aragao Date: Thu, 5 Jun 2025 13:33:58 +0100 Subject: [PATCH 2/2] changed location of file --- 1.architectures/0.common/README.md | 5 ----- 1.architectures/5.sagemaker-hyperpod/tools/README.md | 6 ++++++ .../tools}/update-instance-group-instance-count.yaml | 0 3 files changed, 6 insertions(+), 5 deletions(-) rename 1.architectures/{0.common => 5.sagemaker-hyperpod/tools}/update-instance-group-instance-count.yaml (100%) diff --git a/1.architectures/0.common/README.md b/1.architectures/0.common/README.md index 8a3d5b625..cdaf33f25 100644 --- a/1.architectures/0.common/README.md +++ b/1.architectures/0.common/README.md @@ -13,8 +13,3 @@ This template deploys a stack to receive human-readable email notifications for [ 
 1-Click Deploy πŸš€β€ƒ
 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/e3752eec-63b5-4033-9720-fa68d35164e9/hyperpod-event-bridge-email.yaml&stackName=hyperpod-event-bridge-email) -## Create a scheduler to scale up and down the number of nodes in an instance group - -This template deploys an AWS Lambda lamdba function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. - -[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/5.sagemaker-hyperpod/tools/README.md b/1.architectures/5.sagemaker-hyperpod/tools/README.md index 265060a23..6438d1071 100644 --- a/1.architectures/5.sagemaker-hyperpod/tools/README.md +++ b/1.architectures/5.sagemaker-hyperpod/tools/README.md @@ -10,3 +10,9 @@ Utility to dump details of all nodes in a cluster, into a csv file. **Usage:** `python dump_cluster_nodes_info.py –cluster-name ` **Output:** β€œnodes.csv” file in the current directory, containing details of all nodes in the cluster + +## Create a scheduler to scale up and down the number of nodes in an instance group + +This template deploys an AWS Lambda function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. + +[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/0.common/update-instance-group-instance-count.yaml b/1.architectures/5.sagemaker-hyperpod/tools/update-instance-group-instance-count.yaml similarity index 100% rename from 1.architectures/0.common/update-instance-group-instance-count.yaml rename to 1.architectures/5.sagemaker-hyperpod/tools/update-instance-group-instance-count.yaml