From a015e8c1024e0cc7f420df3b3f1999b1ccdfa1c4 Mon Sep 17 00:00:00 2001 From: Paulo Aragao Date: Thu, 5 Jun 2025 11:28:12 +0100 Subject: [PATCH 1/2] added new tool to scale up-down nodes on an instance group --- 1.architectures/0.common/README.md | 6 + .../update-instance-group-instance-count.yaml | 423 ++++++++++++++++++ 2 files changed, 429 insertions(+) create mode 100644 1.architectures/0.common/update-instance-group-instance-count.yaml diff --git a/1.architectures/0.common/README.md b/1.architectures/0.common/README.md index 7086c7431..8a3d5b625 100644 --- a/1.architectures/0.common/README.md +++ b/1.architectures/0.common/README.md @@ -12,3 +12,9 @@ This template creates a S3 Bucket with all public access disabled. To deploy it, This template deploys a stack to receive human-readable email notifications for HyperPod cluster status changes and node health events. See the [workshop page](https://catalog.workshops.aws/sagemaker-hyperpod/en-US/07-tips-and-tricks/26-event-bridge) for more details. [ 
 1-Click Deploy πŸš€β€ƒ
 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/e3752eec-63b5-4033-9720-fa68d35164e9/hyperpod-event-bridge-email.yaml&stackName=hyperpod-event-bridge-email) + +## Create a scheduler to scale up and down the number of nodes in an instance group + +This template deploys an AWS Lambda lamdba function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. + +[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/0.common/update-instance-group-instance-count.yaml b/1.architectures/0.common/update-instance-group-instance-count.yaml new file mode 100644 index 000000000..2352e4f02 --- /dev/null +++ b/1.architectures/0.common/update-instance-group-instance-count.yaml @@ -0,0 +1,423 @@ +Parameters: + HyperpodClusterName: + Type: String + Default: ml-cluster + Description: Name of the SageMaker HyperPod cluster to work with + InstanceGroupName: + Type: String + Default: accelerated-worker-group-1 + Description: Name of the instance group to work with + ScaleDownCount: + Type: String + Default: "0" + Description: Number of instances when scaling down + ScaleUpCount: + Type: String + Default: "8" + Description: Number of instances when scaling up + ScaleDownCron: + Type: String + Default: cron(0 20 * * ? *) + Description: Cron expression to be used for the scale down rule + ScaleUpCron: + Type: String + Default: cron(0 8 * * ? 
*) + Description: Cron expression to be used for the scale up rule +Resources: + UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Action: sts:AssumeRole + Effect: Allow + Principal: + Service: lambda.amazonaws.com + Version: "2012-10-17" + Description: Role for the Lambda function to update the instance count of a SageMaker HyperPod instance group + ManagedPolicyArns: + - Fn::Join: + - "" + - - "arn:" + - Ref: AWS::Partition + - :iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5: + Type: AWS::IAM::Policy + Properties: + PolicyDocument: + Statement: + - Action: + - sagemaker:DescribeCluster + - sagemaker:UpdateCluster + - sagemaker:BatchDeleteClusterNodes + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:aws:sagemaker:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :cluster/* + - Action: + - eks:AssociateAccessPolicy + - eks:CreateAccessEntry + - eks:DeleteAccessEntry + - eks:DescribeAccessEntry + - eks:DescribeCluster + Effect: Allow + Resource: + - Fn::Join: + - "" + - - "arn:aws:eks:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :access-entry/*/*/*/*/* + - Fn::Join: + - "" + - - "arn:aws:eks:" + - Ref: AWS::Region + - ":" + - Ref: AWS::AccountId + - :cluster/* + - Action: iam:PassRole + Effect: Allow + Resource: + Fn::Join: + - "" + - - "arn:aws:iam::" + - Ref: AWS::AccountId + - :role/* + Version: "2012-10-17" + PolicyName: UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5 + Roles: + - Ref: UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + UpdateHyperpodInstanceGroupInstanceCountCAF78010: + Type: AWS::Lambda::Function + Properties: + Architectures: + - arm64 + Code: + ZipFile: " + + import boto3 + + import json + + import os + + + def lambda_handler(event, context): + + \ \"\"\" + + \ AWS Lambda function to update the number of instances in a SageMaker 
HyperPod instance group. + + \ Uses environment variables for configuration: + + \ - HYPERPOD_CLUSTER_NAME: Name of the HyperPod cluster + + \ - HYPERPOD_INSTANCE_GROUP: Name of the instance group to update (default: accelerated-worker-group) + + \ - HYPERPOD_INSTANCE_COUNT: Number of instances to set (default: 0) + + \ \"\"\" + + \ # Get parameters from environment variables, with optional override from event + + \ cluster_name = event.get('cluster_name', os.environ.get('HYPERPOD_CLUSTER_NAME')) + + \ instance_group = event.get('instance_group', os.environ.get('HYPERPOD_INSTANCE_GROUP')) + + \ instance_count = int(event.get('instance_count')) + + + \ # Validate required parameters + + \ if not cluster_name: + + \ return { + + \ 'statusCode': 400, + + \ 'body': json.dumps('Error: cluster_name is required (set via event or HYPERPOD_CLUSTER_NAME environment variable)') + + \ } + + + \ # Get region from event, environment variable, or default + + \ region = event.get('region', os.environ.get('AWS_REGION', 'us-east-1')) + + + \ # Initialize SageMaker client + + \ sagemaker_client = boto3.client('sagemaker', region_name=region) + + + \ try: + + \ # Get current cluster configuration + + \ response = sagemaker_client.describe_cluster( + + \ ClusterName=cluster_name + + \ ) + + + \ # Find the target instance group + + \ instance_groups = response.get('InstanceGroups', []) + + \ target_group = None + + \ other_groups = [] + + + \ for group in instance_groups: + + \ if group.get('InstanceGroupName') == instance_group: + + \ target_group = group + + \ else: + + \ other_groups.append(group) + + + \ if not target_group: + + \ return { + + \ 'statusCode': 404, + + \ 'body': json.dumps(f'Error: {instance_group} not found in the cluster') + + \ } + + + \ # Create a copy of the target group with updated instance count + + \ updated_target_group = { + + \ 'InstanceGroupName': instance_group, + + \ 'InstanceCount': instance_count, + + \ 'InstanceType': 
target_group.get('InstanceType'), + + \ 'LifeCycleConfig': target_group.get('LifeCycleConfig'), + + \ 'ExecutionRole': target_group.get('ExecutionRole') + + \ } + + + \ # Include InstanceStorageConfigs if present in the original configuration + + \ if 'InstanceStorageConfigs' in target_group: + + \ updated_target_group['InstanceStorageConfigs'] = target_group['InstanceStorageConfigs'] + + + \ # Prepare the update request with all instance groups + + \ # We need to include all instance groups in the update request + + \ update_groups = [updated_target_group] + + + \ # Include other instance groups unchanged + + \ for group in other_groups: + + \ update_group = { + + \ 'InstanceGroupName': group.get('InstanceGroupName'), + + \ 'InstanceCount': group.get('CurrentCount'), + + \ 'InstanceType': group.get('InstanceType'), + + \ 'LifeCycleConfig': group.get('LifeCycleConfig'), + + \ 'ExecutionRole': group.get('ExecutionRole') + + \ } + + + \ # Include InstanceStorageConfigs if present + + \ if 'InstanceStorageConfigs' in group: + + \ update_group['InstanceStorageConfigs'] = group['InstanceStorageConfigs'] + + + \ update_groups.append(update_group) + + + \ # Get node recovery setting if present + + \ node_recovery = response.get('NodeRecovery', 'Automatic') + + \ print('update groups: ', update_groups) + + \ # Update the cluster with all instance groups + + \ update_response = sagemaker_client.update_cluster( + + \ ClusterName=cluster_name, + + \ InstanceGroups=update_groups, + + \ NodeRecovery=node_recovery + + \ ) + + + \ return { + + \ 'statusCode': 200, + + \ 'body': json.dumps({ + + \ 'message': f'Successfully updated {instance_group} to {instance_count} instances', + + \ 'update_details': update_response + + \ }) + + \ } + + \ except Exception as e: + + \ return { + + \ 'statusCode': 500, + + \ 'body': json.dumps(f'Error updating cluster: {str(e)}') + + \ } + + \ " + Environment: + Variables: + HYPERPOD_CLUSTER_NAME: + Ref: HyperpodClusterName + 
HYPERPOD_INSTANCE_GROUP: + Ref: InstanceGroupName + Handler: index.lambda_handler + MemorySize: 128 + Role: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + - Arn + Runtime: python3.13 + Timeout: 30 + DependsOn: + - UpdateHyperpodInstanceGroupInstanceCountRoleDefaultPolicy47328BA5 + - UpdateHyperpodInstanceGroupInstanceCountRoleCAFAD403 + UpdateHyperpodInstanceGroupInstanceCountLogGroupF950D618: + Type: AWS::Logs::LogGroup + Properties: + LogGroupName: + Fn::Join: + - "" + - - /aws/lambda/ + - Ref: UpdateHyperpodInstanceGroupInstanceCountCAF78010 + RetentionInDays: 731 + UpdateReplacePolicy: Retain + DeletionPolicy: Retain + HyperPodScaleDownRule667EC34E: + Type: AWS::Events::Rule + Properties: + Description: + Fn::Join: + - "" + - - "Scale down " + - Ref: InstanceGroupName + - " in " + - Ref: HyperpodClusterName + - " to " + - Ref: ScaleDownCount + - " instances" + ScheduleExpression: + Ref: ScaleDownCron + State: ENABLED + Targets: + - Arn: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Id: Target0 + Input: + Fn::Join: + - "" + - - '{"cluster_name":"' + - Ref: HyperpodClusterName + - '","instance_group":"' + - Ref: InstanceGroupName + - '","instance_count":"' + - Ref: ScaleDownCount + - '","action":"scale-down"}' + HyperPodScaleDownRuleAllowEventRuleHyperpodInstanceCountUpdateUpdateHyperpodInstanceGroupInstanceCountF26B980BD6564B21: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Principal: events.amazonaws.com + SourceArn: + Fn::GetAtt: + - HyperPodScaleDownRule667EC34E + - Arn + HyperPodScaleUpRule47956CBB: + Type: AWS::Events::Rule + Properties: + Description: + Fn::Join: + - "" + - - "Scale up " + - Ref: InstanceGroupName + - " in " + - Ref: HyperpodClusterName + - " to " + - Ref: ScaleUpCount + - " instances" + ScheduleExpression: + Ref: ScaleUpCron + State: ENABLED + Targets: + - 
Arn: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Id: Target0 + Input: + Fn::Join: + - "" + - - '{"cluster_name":"' + - Ref: HyperpodClusterName + - '","instance_group":"' + - Ref: InstanceGroupName + - '","instance_count":"' + - Ref: ScaleUpCount + - '","action":"scale-up"}' + HyperPodScaleUpRuleAllowEventRuleHyperpodInstanceCountUpdateUpdateHyperpodInstanceGroupInstanceCountF26B980B6A706B98: + Type: AWS::Lambda::Permission + Properties: + Action: lambda:InvokeFunction + FunctionName: + Fn::GetAtt: + - UpdateHyperpodInstanceGroupInstanceCountCAF78010 + - Arn + Principal: events.amazonaws.com + SourceArn: + Fn::GetAtt: + - HyperPodScaleUpRule47956CBB + - Arn + From f216f6cb92f937c1847869268803fd52ca76eb65 Mon Sep 17 00:00:00 2001 From: Paulo Aragao Date: Thu, 5 Jun 2025 13:33:58 +0100 Subject: [PATCH 2/2] changed location of file --- 1.architectures/0.common/README.md | 5 ----- 1.architectures/5.sagemaker-hyperpod/tools/README.md | 6 ++++++ .../tools}/update-instance-group-instance-count.yaml | 0 3 files changed, 6 insertions(+), 5 deletions(-) rename 1.architectures/{0.common => 5.sagemaker-hyperpod/tools}/update-instance-group-instance-count.yaml (100%) diff --git a/1.architectures/0.common/README.md b/1.architectures/0.common/README.md index 8a3d5b625..cdaf33f25 100644 --- a/1.architectures/0.common/README.md +++ b/1.architectures/0.common/README.md @@ -13,8 +13,3 @@ This template deploys a stack to receive human-readable email notifications for [ 
 1-Click Deploy πŸš€β€ƒ
 
](https://console.aws.amazon.com/cloudformation/home?#/stacks/quickcreate?templateURL=https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/e3752eec-63b5-4033-9720-fa68d35164e9/hyperpod-event-bridge-email.yaml&stackName=hyperpod-event-bridge-email) -## Create a scheduler to scale up and down the number of nodes in an instance group - -This template deploys an AWS Lambda lamdba function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. - -[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/5.sagemaker-hyperpod/tools/README.md b/1.architectures/5.sagemaker-hyperpod/tools/README.md index 265060a23..6438d1071 100644 --- a/1.architectures/5.sagemaker-hyperpod/tools/README.md +++ b/1.architectures/5.sagemaker-hyperpod/tools/README.md @@ -10,3 +10,9 @@ Utility to dump details of all nodes in a cluster, into a csv file. **Usage:** `python dump_cluster_nodes_info.py –cluster-name ` **Output:** β€œnodes.csv” file in the current directory, containing details of all nodes in the cluster + +## Create a scheduler to scale up and down the number of nodes in an instance group + +This template deploys an AWS Lambda function which is triggered by an Amazon EventBridge Rule to scale up and down the number of nodes based on a cron expression. + +[
1-Click Deploy πŸš€
](https://ws-assets-prod-iad-r-iad-ed304a55c2ca1aee.s3.us-east-1.amazonaws.com/2433d39e-ccfe-4c00-9d3d-9917b729258e/update-instance-group-instance-count.yaml) \ No newline at end of file diff --git a/1.architectures/0.common/update-instance-group-instance-count.yaml b/1.architectures/5.sagemaker-hyperpod/tools/update-instance-group-instance-count.yaml similarity index 100% rename from 1.architectures/0.common/update-instance-group-instance-count.yaml rename to 1.architectures/5.sagemaker-hyperpod/tools/update-instance-group-instance-count.yaml