From 0aa116593fb7f2871ce213a4b4efff42b83a7d33 Mon Sep 17 00:00:00 2001 From: Vohra Date: Mon, 12 Nov 2018 16:52:12 -0500 Subject: [PATCH 1/5] updated deeplearning.template --- cfn-template/deeplearning.template | 274 ++++++++++++++++++++++++----- 1 file changed, 227 insertions(+), 47 deletions(-) diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index 50de1f5..9a3a9e3 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -2,6 +2,60 @@ "AWSTemplateFormatVersion" : "2010-09-09", "Description" : "Launches a Deep Learning Cluster with one Master and variable number of Workers.", "Parameters" : { + "S3Bucket" : { + "Description" : "S3 bucket name that contains training code, data and scripts, e.g. my-s3-bucket ", + "Type" : "String" + }, + "TarData" : { + "Description" : "Data tar file prefix in S3Bucket copied to EFS, or copied and extracted on worker EBS file system, e.g. data.tar", + "Type" : "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "TarSource" : { + "Description" : "Source tar file prefix in S3Bucket copied and extracted on worker EBS file system, e.g. src.tar" , + "Type" : "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "RunScript" : { + "Default": "run.sh", + "Description" : "Bash shell run script prefix in S3Bucket for starting training on master, e.g. run.sh", + "Type" : "String" + }, + "SetupScript" : { + "Default": "setup.sh", + "Description" : "Bash shell setup script prefix in S3Bucket for setting up training environment on each worker, e.g. 
setup.sh", + "Type" : "String" + }, + "MyVpcCIDR" : { + "Default": "192.168.0.0/26", + "Description" : "My VPC CIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "PublicSubnetCIDR" : { + "Default": "192.168.0.0/27", + "Description" : "Public Subnet CIDR in MyVpcCIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "PrivateSubnetCIDR" : { + "Default": "192.168.0.32/27", + "Description" : "Private Subnet CIDR in MyVpcCIDR", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "Type" : "String" + }, + "EbsDeviceName" : { + "Default": "/dev/sda1", + "Description" : "Ebs device name", + "Type" : "String", + "AllowedValues": [ "/dev/sda1" ] + }, + "EbsVolumeSize" : { + "Default": "150", + "Description" : "Ebs volume size", + "Type" : "String", + "AllowedValues": [ "100", "150", "200", "250", "300", "350", "400", "450", "500"] + }, "KeyName" : { "Description" : "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances", "Type" : "AWS::EC2::KeyPair::KeyName" @@ -13,9 +67,9 @@ "Default" : "1" }, "InstanceType" : { - "Description" : "The EC2 instance type for workers.For GPUs choose g2.xx or p2.xx", + "Description" : "The EC2 instance type for workers. For latest GPUs choose p3.xx", "Type" : "String", - "Default" : "p3.2xlarge", + "Default" : "p3.16xlarge", "AllowedValues" : [ "p2.16xlarge", "p2.8xlarge", @@ -79,10 +133,15 @@ "ImageType" : { "Description" : "Linux Flavor(Amazon Linux or Ubuntu)", "Type" : "String", - "Default" : "AmazonLinux", + "Default" : "Ubuntu", "AllowedValues" : [ "AmazonLinux", "Ubuntu" ], "ConstraintDescription" : "Amazon Supported Image Type" }, + "AMIOverride" : { + "Description" : "Advanced option to override Deep Learning AMI of specified ImageType available in region", + "Type" : "String", + "AllowedPattern": "(ami-[0-9a-z]{17})?" 
+ }, "SSHLocation": { "Description": "Restrict SSH access to a valid CIDR range, this should be a valid CIDR IP address range that you want to allow access to your Master and Stack.", "Type": "String", @@ -102,41 +161,52 @@ "Description" : "The Linux mount point for the EFS volume", "Type": "String", "MinLength": "1", - "Default": "myEFSvolume" + "Default": "efs" + }, + "EFSServesData" : { + "Description" : "Use EFS for serving data to workers", + "Type": "String", + "Default": "false", + "AllowedValues": ["true", "false"] + }, + "ActivateCondaEnv" : { + "Description" : "Activate conda environment", + "Type": "String", + "Default": "tensorflow_p36", + "AllowedValues": ["base", "caffe2_p27", "caffe_p27", "caffe_p35", "chainer_p27", "chainer_p36", "cntk_p27", "cntk_p36", "mxnet_p27", "mxnet_p36", "python2", "python3", "pytorch_p27", "pytorch_p36", "tensorflow_p36", "tensorflow_p27", "theano_p27", "theano_p36" ] } }, "Conditions" : { - "CreateNewFileSystem" : { "Fn::Equals" : [{"Ref": "EFSFileSystemId"}, ""] } + "CreateNewFileSystem" : { "Fn::Equals" : [{"Ref": "EFSFileSystemId"}, ""] }, + "CopyDataToEFS" : { "Fn::Equals" : [{"Ref": "EFSServesData"}, "true"] }, + "OverrideAMI" : { "Fn::Not" : [{ "Fn::Equals" : [{"Ref": "AMIOverride"}, ""]} ]} }, "Mappings" : { - "AmazonLinux" : { - "us-east-1" : { "AMI" : "ami-9706e5ea" }, - "us-west-2" : { "AMI" : "ami-dc70ffa4" }, - "eu-west-1" : { "AMI" : "ami-8caad3f5" }, - "us-east-2" : { "AMI" : "ami-f4586f91" }, - "ap-southeast-2" : { "AMI" : "ami-bbd710d9" }, - "ap-northeast-1" : { "AMI" : "ami-5ba3d93d" }, - "ap-northeast-2" : { "AMI" : "ami-d0d67bbe" }, - "ap-south-1" : { "AMI" : "ami-359ec25a" }, - "eu-central-1" : { "AMI" : "ami-ca3351a5" }, - "ap-southeast-1" : { "AMI" : "ami-ded39da2" } - }, "Ubuntu" : { - "us-east-1" : { "AMI" : "ami-173bd86a" }, - "us-west-2" : { "AMI" : "ami-5a77f822" }, - "eu-west-1" : { "AMI" : "ami-2fb0c956" }, - "us-east-2" : { "AMI" : "ami-295b6c4c" }, - "ap-southeast-2" : { "AMI" : 
"ami-64d51206" }, - "ap-northeast-1" : { "AMI" : "ami-bcafd5da" }, - "ap-northeast-2" : { "AMI" : "ami-1ad17c74" }, - "ap-south-1" : { "AMI" : "ami-959fc3fa" }, - "eu-central-1" : { "AMI" : "ami-3a254755" }, - "ap-southeast-1" : { "AMI" : "ami-63d9971f" } - }, - "SubnetConfig" : { - "VPC" : { "CIDR" : "10.0.0.0/16" }, - "Public" : { "CIDR" : "10.0.0.0/24" }, - "Private" : { "CIDR" : "10.0.1.0/24" } + "us-east-1" : { "AMI" : "ami-06ddeee5b31a0308f" }, + "us-east-2" : { "AMI" : "ami-0c3db42b191436c09" }, + "us-west-2" : { "AMI" : "ami-0b43cec40e1390f34" }, + "eu-west-1" : { "AMI" : "ami-028a25f96614a39e9" }, + "eu-central-1" : { "AMI" : "ami-01c9f9114af1f4b1c" }, + "ap-southeast-1" : { "AMI" : "ami-01ce8149d2e4ae3d5" }, + "ap-southeast-2" : { "AMI" : "ami-0a99d35d07c323ea4" }, + "ap-south-1" : { "AMI" : "ami-0072f3cbed4acac8b" }, + "ap-northeast-1" : { "AMI" : "ami-03b728b620f1dbcb6" }, + "ap-northeast-2" : { "AMI" : "ami-0403080dacdf781b0" } + + }, + "AmazonLinux" : { + "us-east-1" : { "AMI" : "ami-09f0be2e7a739216e" }, + "us-east-2" : { "AMI" : "ami-031ff0b59155d29a8" }, + "us-west-2" : { "AMI" : "ami-0bbf2b19fefd40f4c" }, + "eu-west-1" : { "AMI" : "ami-0378df3826cda83a5" }, + "eu-central-1" : { "AMI" : "ami-0e19c933ff07c2451" }, + "ap-southeast-1" : { "AMI" : "ami-0d7204303a6281932" }, + "ap-southeast-2" : { "AMI" : "ami-068e883b3301d36e1" }, + "ap-south-1" : { "AMI" : "ami-0072f3cbed4acac8b" }, + "ap-northeast-1" : { "AMI" : "ami-03223d7b5bc1cc101" }, + "ap-northeast-2" : { "AMI" : "ami-04d4182ecb149455a" } + }, "S3" : { "us-east-1" : { "URL" : "https://s3.amazonaws.com/" }, @@ -154,9 +224,9 @@ "S3SourceBucket" : { "BucketNameSuffix" : "-aws-dl-cfn" }, "Setup" : { "Filename" : "dl_cfn_setup_v2.py" }, "LambdaFunction" : { "FileName": "dl_cfn_setup_lambda.zip" }, - "TimeoutValues" : { "WaitConditionTimeout" : "3300", "MasterLaunchTimeout" : "600"}, + "TimeoutValues" : { "WaitConditionTimeout" : "3600", "MasterLaunchTimeout" : "1200"}, "DefaultUser" : 
{"AmazonLinux": "ec2-user", "Ubuntu": "ubuntu"}, - "CfnPath" : {"AmazonLinux": "/opt/aws/bin", "Ubuntu": "/usr/local/bin"} + "CfnPath" : {"Ubuntu": "/usr/local/bin"} } }, "Resources" : { @@ -251,7 +321,33 @@ } ] }, "Path" : "/", - "Policies" : [ { + "Policies" : [ + { + "PolicyName": "s3-read-write", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:Get*", + "s3:List*" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:Put*" + ], + "Resource": [ + { "Fn::Join": [ "", [ "arn:aws:s3:::", { "Ref": "S3Bucket" } ] ] }, + { "Fn::Join": [ "", [ "arn:aws:s3:::", { "Ref": "S3Bucket" }, "/*" ] ] } + ] + } + ] + } + }, + { "PolicyName" : "instance", "PolicyDocument" : { "Statement" : [ { @@ -458,9 +554,9 @@ "WorkerLaunchConfig" : { "Type" : "AWS::AutoScaling::LaunchConfiguration", "Properties" : { - "ImageId" : { - "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ] - }, + "ImageId" : { "Fn::If": [ "OverrideAMI", {"Ref": "AMIOverride"}, + {"Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ]} + ] }, "InstanceType" : { "Ref" : "InstanceType" }, @@ -470,6 +566,12 @@ "SecurityGroups" : [ {"Ref" : "WorkerSecurityGroup"} ], + "BlockDeviceMappings" : [ + { + "DeviceName" : {"Ref": "EbsDeviceName" }, + "Ebs" : { "VolumeSize" : {"Ref": "EbsVolumeSize"}, "VolumeType" : "gp2" } + } + ], "UserData" : { "Fn::Base64" : { "Fn::Join" : [ "", @@ -484,6 +586,18 @@ "mkdir -p /opt/deeplearning", "\n", + "sudo ln -s /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + + "echo 'conda activate ", + {"Ref": "ActivateCondaEnv"}, + "' >> /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/.bash_login", + "\n", + "# run cfn-init. 
\n", { "Fn::FindInMap" : [ "Other", "CfnPath", {"Ref" : "ImageType" } ]}, "\\/cfn-init -v --region ", { "Ref" : "AWS::Region" }, @@ -515,8 +629,27 @@ "02_mount" : { "command" : {"Fn::Join" : [ "", [ "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", {"Fn::If" : [ "CreateNewFileSystem", {"Ref" : "FileSystem"}, {"Ref" : "EFSFileSystemId"} ]}, ".efs.", { "Ref" : "AWS::Region" }, ".amazonaws.com:/ /", {"Ref": "EFSMountPoint"} ]]} }, - "03_permissions" : { - "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " /", { "Ref" : "EFSMountPoint" }]]} + "03_data" : { + "command" : {"Fn::Join" : [ "", [ "touch /", {"Ref": "EFSMountPoint"}, "/", { "Fn::If" : [ "CopyDataToEFS", "data.txt", "nodata.txt" ] } ]]} + }, + "04_data_ebs" : { + "test": {"Fn::Join" : [ "", [ "test ! -e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "05_src_ebs" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarSource" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "06_ebs_tar" : { + "command" : {"Fn::Join" : [ "", [ "for file in /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.tar ; do tar -xf $file --directory ", " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " ; done"]]} + }, + "07_setup_script" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "SetupScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "08_permissions" : { + "command" : {"Fn::Join" : [ "", 
[ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " -R /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "09_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chmod u+x /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.sh" ]]} } } }, @@ -554,9 +687,9 @@ "Type" : "AWS::AutoScaling::LaunchConfiguration", "Properties" : { "AssociatePublicIpAddress" : "true", - "ImageId" : { - "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ] - }, + "ImageId" : { "Fn::If": [ "OverrideAMI", {"Ref": "AMIOverride"}, + { "Fn::FindInMap" : [ {"Ref" : "ImageType" }, { "Ref" : "AWS::Region" }, "AMI" ]} + ] }, "InstanceType" : { "Ref" : "InstanceType" }, @@ -567,6 +700,12 @@ { "Ref" : "MasterSecurityGroup" }, { "Ref" : "AdminSSHSecurityGroup" } ], + "BlockDeviceMappings" : [ + { + "DeviceName" : {"Ref": "EbsDeviceName" }, + "Ebs" : { "VolumeSize" : {"Ref": "EbsVolumeSize"}, "VolumeType" : "gp2" } + } + ], "UserData" : { "Fn::Base64" : { "Fn::Join" : [ "", @@ -580,6 +719,18 @@ "mkdir -p /opt/deeplearning", "\n", + "sudo ln -s /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + + "echo 'conda activate ", + {"Ref": "ActivateCondaEnv"}, + "' >> /home/", + { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, + "/.bash_login", + "\n", + "# run cfn-init. 
\n", { "Fn::FindInMap" : [ "Other", "CfnPath", {"Ref" : "ImageType" } ]}, "\\/cfn-init -v --region ", { "Ref" : "AWS::Region" }, @@ -611,7 +762,36 @@ "02_mount" : { "command" : {"Fn::Join" : [ "", [ "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", { "Fn::If" : [ "CreateNewFileSystem", {"Ref" : "FileSystem"}, {"Ref" : "EFSFileSystemId"} ] }, ".efs.", { "Ref" : "AWS::Region" }, ".amazonaws.com:/ /", {"Ref": "EFSMountPoint"} ]]} }, - "03_permissions" : { + "03_data" : { + "command" : {"Fn::Join" : [ "", [ "touch /", {"Ref": "EFSMountPoint"}, "/", { "Fn::If" : [ "CopyDataToEFS", "data.txt", "nodata.txt" ] } ]]} + }, + "04_data_efs" : { + "test": {"Fn::Join" : [ "", [ "test -e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /", {"Ref": "EFSMountPoint"} ]]} + }, + "05_data_ebs" : { + "test": {"Fn::Join" : [ "", [ "test ! -e /", {"Ref": "EFSMountPoint"}, "/data.txt" ]]}, + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarData" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "06_src_ebs" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "TarSource" } , " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "07_ebs_tar" : { + "command" : {"Fn::Join" : [ "", [ "for file in /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.tar ; do tar -xf $file --directory ", " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " ; done"]]} + }, + "08_run_script" : { + "command" : {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "RunScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "09_setup_script" : { + "command" 
: {"Fn::Join" : [ "", [ "aws s3 cp s3://", {"Ref" : "S3Bucket" } , "/", { "Ref" : "SetupScript" }, " /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "10_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " -R /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}]]} + }, + "11_permissions" : { + "command" : {"Fn::Join" : [ "", [ "chmod u+x /home/", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, "/*.sh" ]]} + }, + "12_permissions" : { "command" : {"Fn::Join" : [ "", [ "chown ",{ "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, ":", { "Fn::FindInMap" : [ "Other", "DefaultUser", {"Ref" : "ImageType" } ]}, " /", { "Ref" : "EFSMountPoint" }]]} } } @@ -768,7 +948,7 @@ "Vpc" : { "Type" : "AWS::EC2::VPC", "Properties" : { - "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "VPC", "CIDR" ]}, + "CidrBlock" : {"Ref": "MyVpcCIDR"}, "EnableDnsSupport" : "true", "EnableDnsHostnames" : "true", "Tags" : [ @@ -798,7 +978,7 @@ "Properties" : { "VpcId" : {"Ref" : "Vpc"}, "AvailabilityZone" : { "Fn::GetAtt" : [ "PrivateSubnet", "AvailabilityZone" ] } , - "CidrBlock": { "Fn::FindInMap" : [ "SubnetConfig", "Public", "CIDR" ]}, + "CidrBlock": { "Ref" : "PublicSubnetCIDR"}, "Tags" : [ { "Key" : "Network", "Value" : "Public" }, { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } @@ -809,7 +989,7 @@ "Type" : "AWS::EC2::Subnet", "Properties" : { "VpcId" : { "Ref" : "Vpc" }, - "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "Private", "CIDR" ]}, + "CidrBlock" : { "Ref" : "PrivateSubnetCIDR"}, "Tags" : [ { "Key" : "Network", "Value" : "Private" }, { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }} From 4a9037dadd3836fa2064f8d7b0d3a5b881ac42fc Mon Sep 17 00:00:00 2001 From: Ajay Vohra Date: Tue, 27 
Nov 2018 17:47:39 -0500 Subject: [PATCH 2/5] added support for EbsOptimized flag for the cluster instances add support for EbsOptimized flag on EC2 instances --- cfn-template/deeplearning.template | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index 9a3a9e3..b65f7fc 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -169,6 +169,12 @@ "Default": "false", "AllowedValues": ["true", "false"] }, + "EBSOptimized" : { + "Description": "Is the instance EBS optimized? Not all instance types support EBS optimized option.", + "Type": "String", + "Default": "true", + "AllowedValues": [ "false", "true"] + }, "ActivateCondaEnv" : { "Description" : "Activate conda environment", "Type": "String", @@ -560,6 +566,9 @@ "InstanceType" : { "Ref" : "InstanceType" }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, "IamInstanceProfile" : { "Ref" : "InstanceProfile" }, @@ -693,6 +702,9 @@ "InstanceType" : { "Ref" : "InstanceType" }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, "IamInstanceProfile" : { "Ref" : "InstanceProfile" }, From 3b5bd5ed86c7ce6c3118d29eca272c18d605ba62 Mon Sep 17 00:00:00 2001 From: Ajay Vohra Date: Thu, 27 Dec 2018 19:02:33 +0000 Subject: [PATCH 3/5] placement group, availability zone and p3dn.24xlarge support --- cfn-template/deeplearning.template | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index b65f7fc..6a6b156 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -32,6 +32,11 @@ "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", "Type" : "String" }, + "MyAvailabilityZone" : { + "Description" : "My availability zone, e.g. 
us-east-1d", + "AllowedPattern": "[a-z]+-[a-z]+-[1-9]{1}[a-z]{1}", + "Type" : "String" + }, "PublicSubnetCIDR" : { "Default": "192.168.0.0/27", "Description" : "Public Subnet CIDR in MyVpcCIDR", @@ -69,7 +74,7 @@ "InstanceType" : { "Description" : "The EC2 instance type for workers. For latest GPUs choose p3.xx", "Type" : "String", - "Default" : "p3.16xlarge", + "Default" : "p3dn.24xlarge", "AllowedValues" : [ "p2.16xlarge", "p2.8xlarge", @@ -77,6 +82,7 @@ "p3.2xlarge", "p3.8xlarge", "p3.16xlarge", + "p3dn.24xlarge", "g2.8xlarge", "g2.2xlarge", "t2.small", @@ -236,6 +242,12 @@ } }, "Resources" : { + "ResourcePlacementGroup": { + "Type" : "AWS::EC2::PlacementGroup", + "Properties" : { + "Strategy" : "cluster" + } + }, "ResourceMetadataLambdaFunction": { "Type": "AWS::Lambda::Function", "DependsOn" : ["MasterQueue"], @@ -851,6 +863,9 @@ "DesiredCapacity" : "1", "MinSize" : "1", "MaxSize" : "1", + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, "LaunchConfigurationName" : { "Ref" : "MasterLaunchConfig"}, "VPCZoneIdentifier" : [{ "Ref" : "PublicSubnet"}], "NotificationConfiguration" : { @@ -884,6 +899,9 @@ "Properties" : { "MinSize" : "0", "MaxSize" : { "Ref" : "WorkerCount" }, + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, "DesiredCapacity" : { "Ref" : "WorkerCount" }, "LaunchConfigurationName" : { "Ref" : "WorkerLaunchConfig" @@ -1002,6 +1020,7 @@ "Properties" : { "VpcId" : { "Ref" : "Vpc" }, "CidrBlock" : { "Ref" : "PrivateSubnetCIDR"}, + "AvailabilityZone": { "Ref": "MyAvailabilityZone"}, "Tags" : [ { "Key" : "Network", "Value" : "Private" }, { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }} From a6286b79d62fe6d92c7d2fc52e7cc941496fc94a Mon Sep 17 00:00:00 2001 From: Ajay Vohra Date: Tue, 8 Jan 2019 15:57:08 +0000 Subject: [PATCH 4/5] update template and add template for private VPC --- cfn-template/deeplearning.template | 42 +- cfn-template/private-deeplearning.template | 2443 ++++++++++++++++++++ 2 files changed, 2464 
insertions(+), 21 deletions(-) create mode 100644 cfn-template/private-deeplearning.template diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index 6a6b156..976bd5c 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -74,7 +74,7 @@ "InstanceType" : { "Description" : "The EC2 instance type for workers. For latest GPUs choose p3.xx", "Type" : "String", - "Default" : "p3dn.24xlarge", + "Default" : "p3.16xlarge", "AllowedValues" : [ "p2.16xlarge", "p2.8xlarge", @@ -195,29 +195,29 @@ }, "Mappings" : { "Ubuntu" : { - "us-east-1" : { "AMI" : "ami-06ddeee5b31a0308f" }, - "us-east-2" : { "AMI" : "ami-0c3db42b191436c09" }, - "us-west-2" : { "AMI" : "ami-0b43cec40e1390f34" }, - "eu-west-1" : { "AMI" : "ami-028a25f96614a39e9" }, - "eu-central-1" : { "AMI" : "ami-01c9f9114af1f4b1c" }, - "ap-southeast-1" : { "AMI" : "ami-01ce8149d2e4ae3d5" }, - "ap-southeast-2" : { "AMI" : "ami-0a99d35d07c323ea4" }, - "ap-south-1" : { "AMI" : "ami-0072f3cbed4acac8b" }, - "ap-northeast-1" : { "AMI" : "ami-03b728b620f1dbcb6" }, - "ap-northeast-2" : { "AMI" : "ami-0403080dacdf781b0" } + "us-east-1" : { "AMI" : "ami-0f9e8c4a1305ecd22" }, + "us-east-2" : { "AMI" : "ami-0c9ae74667b049f59" }, + "us-west-2" : { "AMI" : "ami-0d0ff0945ae093aea" }, + "eu-west-1" : { "AMI" : "ami-0827ddd2d8e38aa56" }, + "eu-central-1" : { "AMI" : "ami-03580946c6347e2f8" }, + "ap-southeast-1" : { "AMI" : "ami-09dfb1478dc499b95" }, + "ap-southeast-2" : { "AMI" : "ami-0a8f8e89b02a21088" }, + "ap-south-1" : { "AMI" : "ami-07ffe4e02cf5c2bd0" }, + "ap-northeast-1" : { "AMI" : "ami-031ce7af929321d3a" }, + "ap-northeast-2" : { "AMI" : "ami-09c2b38b2fbb748ce" } }, "AmazonLinux" : { - "us-east-1" : { "AMI" : "ami-09f0be2e7a739216e" }, - "us-east-2" : { "AMI" : "ami-031ff0b59155d29a8" }, - "us-west-2" : { "AMI" : "ami-0bbf2b19fefd40f4c" }, - "eu-west-1" : { "AMI" : "ami-0378df3826cda83a5" }, - "eu-central-1" : { "AMI" : "ami-0e19c933ff07c2451" }, - 
"ap-southeast-1" : { "AMI" : "ami-0d7204303a6281932" }, - "ap-southeast-2" : { "AMI" : "ami-068e883b3301d36e1" }, - "ap-south-1" : { "AMI" : "ami-0072f3cbed4acac8b" }, - "ap-northeast-1" : { "AMI" : "ami-03223d7b5bc1cc101" }, - "ap-northeast-2" : { "AMI" : "ami-04d4182ecb149455a" } + "us-east-1" : { "AMI" : "ami-0a4b759b63b333b0e" }, + "us-east-2" : { "AMI" : "ami-0f71284ab59a38265" }, + "us-west-2" : { "AMI" : "ami-0305a0d7a68489e58" }, + "eu-west-1" : { "AMI" : "ami-0d2e5838a2908742f" }, + "eu-central-1" : { "AMI" : "ami-09b5cb82b50e3c9e9" }, + "ap-southeast-1" : { "AMI" : "ami-0abbc7f71da968649" }, + "ap-southeast-2" : { "AMI" : "ami-0b6d01aebbf6a1490" }, + "ap-south-1" : { "AMI" : "ami-0e17a6861b2574143" }, + "ap-northeast-1" : { "AMI" : "ami-0165fe49c30cad525" }, + "ap-northeast-2" : { "AMI" : "ami-0b54ee3b4c6e0b975" } }, "S3" : { diff --git a/cfn-template/private-deeplearning.template b/cfn-template/private-deeplearning.template new file mode 100644 index 0000000..107c24a --- /dev/null +++ b/cfn-template/private-deeplearning.template @@ -0,0 +1,2443 @@ +{ + "AWSTemplateFormatVersion": "2010-09-09", + "Description": "Launches a Deep Learning Cluster with one Master and variable number of Workers.", + "Parameters": { + "S3Bucket": { + "Description": "S3 bucket name that contains training code, data and scripts, e.g. my-s3-bucket ", + "Type": "String" + }, + "TarData": { + "Description": "Data tar file prefix in S3Bucket copied to EFS, or copied and extracted on worker EBS file system, e.g. data.tar", + "Type": "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "TarSource": { + "Description": "Source tar file prefix in S3Bucket copied and extracted on worker EBS file system, e.g. src.tar", + "Type": "String", + "AllowedPattern": ".+\\.tar(\\.gz)?" + }, + "RunScript": { + "Default": "run.sh", + "Description": "Bash shell run script prefix in S3Bucket for starting training on master, e.g. 
run.sh", + "Type": "String" + }, + "SetupScript": { + "Default": "setup.sh", + "Description": "Bash shell setup script prefix in S3Bucket for setting up training environment on each worker, e.g. setup.sh", + "Type": "String" + }, + "MyVpcId": { + "Description": "My VPC ID", + "Type": "AWS::EC2::VPC::Id" + }, + "PrivateSubnetId": { + "Description": "My Subnet ID", + "Type": "AWS::EC2::Subnet::Id" + }, + "EbsDeviceName": { + "Default": "/dev/sda1", + "Description": "Ebs device name", + "Type": "String", + "AllowedValues": [ + "/dev/sda1" + ] + }, + "EbsVolumeSize": { + "Default": "150", + "Description": "Ebs volume size", + "Type": "String", + "AllowedValues": [ + "100", + "150", + "200", + "250", + "300", + "350", + "400", + "450", + "500" + ] + }, + "KeyName": { + "Description": "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "WorkerCount": { + "Description": "The number of worker instances (launches +1 instance for the Master).", + "Type": "Number", + "MinValue": "1", + "Default": "1" + }, + "InstanceType": { + "Description": "The EC2 instance type for workers. 
For latest GPUs choose p3.xx", + "Type": "String", + "Default": "p3.16xlarge", + "AllowedValues": [ + "p2.16xlarge", + "p2.8xlarge", + "p2.xlarge", + "p3.2xlarge", + "p3.8xlarge", + "p3.16xlarge", + "p3dn.24xlarge", + "g2.8xlarge", + "g2.2xlarge", + "t2.small", + "t2.medium", + "t2.large", + "t2.xlarge", + "t2.2xlarge", + "m4.large", + "m4.xlarge", + "m4.2xlarge", + "m4.4xlarge", + "m4.10xlarge", + "m4.16xlarge", + "m3.medium", + "m3.large", + "m3.xlarge", + "m3.2xlarge", + "c4.large", + "c4.xlarge", + "c4.2xlarge", + "c4.4xlarge", + "c4.8xlarge", + "c3.large", + "c3.xlarge", + "c3.2xlarge", + "c3.4xlarge", + "c3.8xlarge", + "x1.16xlarge", + "x1.32xlarge", + "r4.large", + "r4.xlarge", + "r4.2xlarge", + "r4.4xlarge", + "r4.8xlarge", + "r4.16xlarge", + "r3.large", + "r3.xlarge", + "r3.2xlarge", + "r3.4xlarge", + "r3.8xlarge", + "i2.xlarge", + "i2.2xlarge", + "i2.4xlarge", + "i2.8xlarge", + "d2.xlarge", + "d2.2xlarge", + "d2.4xlarge", + "d2.8xlarge", + "f1.2xlarge", + "f1.16xlarge" + ], + "ConstraintDescription": "Must be a valid CPU optimized or GPU EC2 instance type." + }, + "ImageType": { + "Description": "Linux Flavor(Amazon Linux or Ubuntu)", + "Type": "String", + "Default": "Ubuntu", + "AllowedValues": [ + "AmazonLinux", + "Ubuntu" + ], + "ConstraintDescription": "Amazon Supported Image Type" + }, + "AMIOverride": { + "Description": "Advanced option to override Deep Learning AMI of specified ImageType available in region", + "Type": "String", + "AllowedPattern": "(ami-[0-9a-z]{17})?" 
+ }, + "SSHLocation": { + "Description": "Restrict SSH access to a valid CIDR range, this should be a valid CIDR IP address range that you want to allow access to your Master and Stack.", + "Type": "String", + "MinLength": "9", + "MaxLength": "18", + "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", + "ConstraintDescription": "Must be a valid CIDR range of the form x.x.x.x/x" + }, + "EFSFileSystemId": { + "Description": "Existing Amazon EFS File System Id or leave it blank to create a new EFS File System.", + "Type": "String", + "AllowedPattern": "(^fs-[0-9a-f]{8,8})$|()$", + "Default": "", + "ConstraintDescription": "Should be a Valid EFS File System Id" + }, + "EFSMountPoint": { + "Description": "The Linux mount point for the EFS volume", + "Type": "String", + "MinLength": "1", + "Default": "efs" + }, + "EFSServesData": { + "Description": "Use EFS for serving data to workers", + "Type": "String", + "Default": "false", + "AllowedValues": [ + "true", + "false" + ] + }, + "EBSOptimized": { + "Description": "Is the instance EBS optimized? 
Not all instance types support EBS optimized option.", + "Type": "String", + "Default": "true", + "AllowedValues": [ + "false", + "true" + ] + }, + "ActivateCondaEnv": { + "Description": "Activate conda environment", + "Type": "String", + "Default": "tensorflow_p36", + "AllowedValues": [ + "base", + "caffe2_p27", + "caffe_p27", + "caffe_p35", + "chainer_p27", + "chainer_p36", + "cntk_p27", + "cntk_p36", + "mxnet_p27", + "mxnet_p36", + "python2", + "python3", + "pytorch_p27", + "pytorch_p36", + "tensorflow_p36", + "tensorflow_p27", + "theano_p27", + "theano_p36" + ] + } + }, + "Conditions": { + "CreateNewFileSystem": { + "Fn::Equals": [ + { + "Ref": "EFSFileSystemId" + }, + "" + ] + }, + "CopyDataToEFS": { + "Fn::Equals": [ + { + "Ref": "EFSServesData" + }, + "true" + ] + }, + "OverrideAMI": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Ref": "AMIOverride" + }, + "" + ] + } + ] + } + }, + "Mappings": { + "Ubuntu": { + "us-east-1": { + "AMI": "ami-0f9e8c4a1305ecd22" + }, + "us-east-2": { + "AMI": "ami-0c9ae74667b049f59" + }, + "us-west-2": { + "AMI": "ami-0d0ff0945ae093aea" + }, + "eu-west-1": { + "AMI": "ami-0827ddd2d8e38aa56" + }, + "eu-central-1": { + "AMI": "ami-03580946c6347e2f8" + }, + "ap-southeast-1": { + "AMI": "ami-09dfb1478dc499b95" + }, + "ap-southeast-2": { + "AMI": "ami-0a8f8e89b02a21088" + }, + "ap-south-1": { + "AMI": "ami-07ffe4e02cf5c2bd0" + }, + "ap-northeast-1": { + "AMI": "ami-031ce7af929321d3a" + }, + "ap-northeast-2": { + "AMI": "ami-09c2b38b2fbb748ce" + } + }, + "AmazonLinux": { + "us-east-1": { + "AMI": "ami-0a4b759b63b333b0e" + }, + "us-east-2": { + "AMI": "ami-0f71284ab59a38265" + }, + "us-west-2": { + "AMI": "ami-0305a0d7a68489e58" + }, + "eu-west-1": { + "AMI": "ami-0d2e5838a2908742f" + }, + "eu-central-1": { + "AMI": "ami-09b5cb82b50e3c9e9" + }, + "ap-southeast-1": { + "AMI": "ami-0abbc7f71da968649" + }, + "ap-southeast-2": { + "AMI": "ami-0b6d01aebbf6a1490" + }, + "ap-south-1": { + "AMI": "ami-0e17a6861b2574143" + }, + "ap-northeast-1": 
{ + "AMI": "ami-0165fe49c30cad525" + }, + "ap-northeast-2": { + "AMI": "ami-0b54ee3b4c6e0b975" + } + }, + "S3": { + "us-east-1": { + "URL": "https://s3.amazonaws.com/" + }, + "us-west-2": { + "URL": "https://s3-us-west-2.amazonaws.com/" + }, + "eu-west-1": { + "URL": "https://s3-eu-west-1.amazonaws.com/" + }, + "us-east-2": { + "URL": "https://s3-us-east-2.amazonaws.com/" + }, + "ap-southeast-2": { + "URL": "https://s3-ap-southeast-2.amazonaws.com/" + }, + "ap-northeast-1": { + "URL": "https://s3-ap-northeast-1.amazonaws.com/" + }, + "ap-northeast-2": { + "URL": "https://s3-ap-northeast-2.amazonaws.com/" + }, + "ap-south-1": { + "URL": "https://s3-ap-south-1.amazonaws.com/" + }, + "eu-central-1": { + "URL": "https://s3-eu-central-1.amazonaws.com/" + }, + "ap-southeast-1": { + "URL": "https://s3-ap-southeast-1.amazonaws.com/" + } + }, + "Other": { + "S3SourceBucket": { + "BucketNameSuffix": "-aws-dl-cfn" + }, + "Setup": { + "Filename": "dl_cfn_setup_v2.py" + }, + "LambdaFunction": { + "FileName": "dl_cfn_setup_lambda.zip" + }, + "TimeoutValues": { + "WaitConditionTimeout": "3600", + "MasterLaunchTimeout": "1200" + }, + "DefaultUser": { + "AmazonLinux": "ec2-user", + "Ubuntu": "ubuntu" + }, + "CfnPath": { + "Ubuntu": "/usr/local/bin" + } + } + }, + "Resources": { + "ResourcePlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, + "ResourceMetadataLambdaFunction": { + "Type": "AWS::Lambda::Function", + "DependsOn": [ + "MasterQueue" + ], + "Properties": { + "Handler": "lambda_function.lambda_handler", + "Role": { + "Fn::GetAtt": [ + "LambdaExecutionRole", + "Arn" + ] + }, + "Code": { + "S3Bucket": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "S3Key": { + "Fn::FindInMap": [ + "Other", + "LambdaFunction", + "FileName" + ] + } + }, + "MemorySize": "256", + "Timeout": "60", + "Runtime": "python2.7", + 
"Environment": { + "Variables": { + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackName" + }, + "AWS_DL_MASTER_SQS_URL": { + "Ref": "MasterQueue" + } + } + } + } + }, + "LambdaExecutionRole": { + "Type": "AWS::IAM::Role", + "DependsOn": [ + "MasterQueue" + ], + "Properties": { + "ManagedPolicyArns": [ + "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" + ], + "AssumeRolePolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "lambda.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + } + ] + }, + "Path": "/", + "Policies": [ + { + "PolicyName": "AWSDeepLearningLambdaExecutionRole", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:SetDesiredCapacity", + "autoscaling:SuspendProcesses", + "cloudformation:DescribeStackResource", + "cloudformation:SignalResource" + ], + "Resource": "*" + } + ] + } + }, + { + "PolicyName": "AllowLambdaSQSSend", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:sendmessage" + ], + "Resource": { + "Fn::GetAtt": [ + "MasterQueue", + "Arn" + ] + } + } + ] + } + } + ] + } + }, + "PermissionForSNSToInvokeLambda": { + "Type": "AWS::Lambda::Permission", + "Properties": { + "FunctionName": { + "Fn::GetAtt": [ + "ResourceMetadataLambdaFunction", + "Arn" + ] + }, + "Action": "lambda:InvokeFunction", + "Principal": "sns.amazonaws.com", + "SourceArn": { + "Ref": "ResourceMetadataSNSTopic" + } + } + }, + "InstanceRole": { + "Type": "AWS::IAM::Role", + "Properties": { + "RoleName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + } + ] + }, + "Path": "/", + "Policies": [ + { + 
"PolicyName": "s3-read-write", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:Get*", + "s3:List*" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "s3:Put*" + ], + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + { + "Ref": "S3Bucket" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + { + "Ref": "S3Bucket" + }, + "/*" + ] + ] + } + ] + } + ] + } + }, + { + "PolicyName": "instance", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "ec2:DescribeInstances", + "cloudformation:DescribeStackResource" + ], + "Resource": "*" + } + ] + } + }, + { + "PolicyName": "allow-sqs-receive-send-delete-master", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:DeleteMessage", + "sqs:ReceiveMessage", + "sqs:SendMessage", + "sqs:GetQueueUrl" + ], + "Resource": { + "Fn::GetAtt": [ + "MasterQueue", + "Arn" + ] + } + } + ] + } + }, + { + "PolicyName": "allow-sqs-receive-send-delete-worker", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "sqs:DeleteMessage", + "sqs:ReceiveMessage", + "sqs:SendMessage", + "sqs:GetQueueUrl" + ], + "Resource": { + "Fn::GetAtt": [ + "WorkerQueue", + "Arn" + ] + } + } + ] + } + }, + { + "PolicyName": "allow-to-send-signal-to-WaitConditionHandle", + "PolicyDocument": { + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:*" + ], + "Resource": { + "Fn::Join": [ + "", + [ + "arn:aws:s3:::", + "cloudformation-waitcondition-", + { + "Ref": "AWS::Region" + }, + "/*" + ] + ] + } + } + ] + } + } + ] + } + }, + "InstanceProfile": { + "Type": "AWS::IAM::InstanceProfile", + "DependsOn": "InstanceRole", + "Properties": { + "Path": "/", + "Roles": [ + { + "Ref": "InstanceRole" + } + ] + } + }, + "AdminSSHSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + 
"GroupDescription": "Security group that controls SSH access to the Master instance.", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_SSH" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "SSHLocation" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "MasterSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "GroupDescription": "Enable Port access to and from the Master on the Private Interface.", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Master" + ] + ] + } + } + ], + "SecurityGroupIngress": [], + "SecurityGroupEgress": [] + } + }, + "MasterSecurityIngress1": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress2": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress3": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + 
"WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "MasterSecurityIngress4": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "MasterSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "WorkerSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "DependsOn": [ + "MasterSecurityGroup" + ], + "Properties": { + "GroupDescription": "Enable Port access to and from the Worker on the Private Interface", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Worker" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + }, + { + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "WorkerSecurityIngress3": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "tcp", + "FromPort": "0", + "ToPort": "65535", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "WorkerSecurityIngress4": { + "Type": "AWS::EC2::SecurityGroupIngress", + "DependsOn": [ + "WorkerSecurityGroup" + ], + "Properties": { + "GroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + }, + "IpProtocol": "icmp", + "FromPort": "-1", + "ToPort": "-1", + "SourceSecurityGroupId": { + "Fn::GetAtt": [ + "WorkerSecurityGroup", + "GroupId" + ] + } + } + }, + "MountTargetSecurityGroup": { + "Type": 
"AWS::EC2::SecurityGroup", + "DependsOn": [ + "MasterSecurityGroup", + "WorkerSecurityGroup" + ], + "Properties": { + "GroupDescription": "Security group for mount target", + "VpcId": { + "Ref": "MyVpcId" + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "_Master" + ] + ] + } + } + ], + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "2049", + "ToPort": "2049", + "SourceSecurityGroupId": { + "Ref": "MasterSecurityGroup" + } + }, + { + "IpProtocol": "tcp", + "FromPort": "2049", + "ToPort": "2049", + "SourceSecurityGroupId": { + "Ref": "WorkerSecurityGroup" + } + } + ], + "SecurityGroupEgress": [] + } + }, + "FileSystem": { + "Type": "AWS::EFS::FileSystem", + "Condition": "CreateNewFileSystem", + "DeletionPolicy": "Retain", + "Properties": { + "PerformanceMode": "generalPurpose", + "FileSystemTags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ] + } + }, + "MountTarget": { + "Type": "AWS::EFS::MountTarget", + "Properties": { + "FileSystemId": { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + "SubnetId": { + "Ref": "PrivateSubnetId" + }, + "SecurityGroups": [ + { + "Ref": "MountTargetSecurityGroup" + } + ] + } + }, + "WorkerLaunchConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "ImageId": { + "Fn::If": [ + "OverrideAMI", + { + "Ref": "AMIOverride" + }, + { + "Fn::FindInMap": [ + { + "Ref": "ImageType" + }, + { + "Ref": "AWS::Region" + }, + "AMI" + ] + } + ] + }, + "InstanceType": { + "Ref": "InstanceType" + }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, + "IamInstanceProfile": { + "Ref": "InstanceProfile" + }, + "SecurityGroups": [ + { + "Ref": "WorkerSecurityGroup" + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": { + "Ref": "EbsDeviceName" + }, + "Ebs": { + "VolumeSize": { + "Ref": "EbsVolumeSize" + }, + "VolumeType": "gp2" + } + } + ], + 
"UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#!/bin/bash -xe", + "\n", + "# setup ssh-forwarding. ", + "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", + "\n", + "mkdir -p /opt/deeplearning", + "\n", + "sudo ln -s /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + "echo 'conda activate ", + { + "Ref": "ActivateCondaEnv" + }, + "' >> /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/.bash_login", + "\n", + "# run cfn-init. \n", + { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + }, + "\\/cfn-init -v --region ", + { + "Ref": "AWS::Region" + }, + " --configsets Setup ", + " -s ", + { + "Ref": "AWS::StackId" + }, + " -r WorkerLaunchConfig ", + "\n", + "" + ] + ] + } + }, + "KeyName": { + "Ref": "KeyName" + } + }, + "Metadata": { + "AWS::CloudFormation::Init": { + "configSets": { + "Setup": [ + "efs-config", + "download-setup", + "deeplearning-config" + ] + }, + "efs-config": { + "commands": { + "00_install_nfs": { + "command": { + "Fn::Join": [ + "", + [ + "if [ \"AmazonLinux\" = \"", + { + "Ref": "ImageType" + }, + "\" ];", + "then yum -y -q install nfs-utils; else apt-get -qq -y install nfs-common ; fi" + ] + ] + } + }, + "01_createdir": { + "command": { + "Fn::Join": [ + "", + [ + "mkdir -p /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "02_mount": { + "command": { + "Fn::Join": [ + "", + [ + "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", + { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + ".efs.", + { + "Ref": "AWS::Region" + }, + ".amazonaws.com:/ /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "03_data": { + "command": { + "Fn::Join": [ + "", + [ + "touch /", + { 
+ "Ref": "EFSMountPoint" + }, + "/", + { + "Fn::If": [ + "CopyDataToEFS", + "data.txt", + "nodata.txt" + ] + } + ] + ] + } + }, + "04_data_ebs": { + "test": { + "Fn::Join": [ + "", + [ + "test ! -e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "05_src_ebs": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarSource" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "06_ebs_tar": { + "command": { + "Fn::Join": [ + "", + [ + "for file in /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.tar ; do tar -xf $file --directory ", + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " ; done" + ] + ] + } + }, + "07_setup_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "SetupScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "08_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " -R /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "09_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chmod u+x /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.sh" + ] + ] + } + } + } + }, + 
"download-setup": { + "files": { + "/opt/deeplearning/dl_cfn_setup_v2.py": { + "source": { + "Fn::Join": [ + "", + [ + { + "Fn::FindInMap": [ + "S3", + { + "Ref": "AWS::Region" + }, + "URL" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "/", + { + "Fn::FindInMap": [ + "Other", + "Setup", + "Filename" + ] + } + ] + ] + } + } + } + }, + "deeplearning-config": { + "commands": { + "01_setup": { + "command": "python /opt/deeplearning/dl_cfn_setup_v2.py | tee -a /var/log/cloud-init-output.log", + "cwd": "/opt/deeplearning", + "env": { + "AWS_DL_NODE_TYPE": "Worker", + "AWS_DL_MASTER_QUEUE": { + "Fn::GetAtt": [ + "MasterQueue", + "QueueName" + ] + }, + "AWS_DL_WORKER_QUEUE": { + "Fn::GetAtt": [ + "WorkerQueue", + "QueueName" + ] + }, + "AWS_DL_WAITCONDITION_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + }, + "AWS_DL_MASTERLAUNCH_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackId" + }, + "AWS_DL_WAIT_HANDLE": { + "Ref": "myWaitHandle" + }, + "AWS_DL_ROLE_NAME": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AWS_DL_DEFAULT_USER": { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "AWS_REGION": { + "Ref": "AWS::Region" + }, + "EFS_MOUNT": { + "Fn::Join": [ + "", + [ + "/", + { + "Ref": "EFSMountPoint" + } + ] + ] + }, + "CFN_PATH": { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + } + } + } + } + } + } + } + }, + "MasterLaunchConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "false", + "ImageId": { + "Fn::If": [ + "OverrideAMI", + { + "Ref": "AMIOverride" + }, + { + "Fn::FindInMap": [ + { + "Ref": "ImageType" + }, + { + "Ref": 
"AWS::Region" + }, + "AMI" + ] + } + ] + }, + "InstanceType": { + "Ref": "InstanceType" + }, + "EbsOptimized": { + "Ref": "EBSOptimized" + }, + "IamInstanceProfile": { + "Ref": "InstanceProfile" + }, + "SecurityGroups": [ + { + "Ref": "MasterSecurityGroup" + }, + { + "Ref": "AdminSSHSecurityGroup" + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": { + "Ref": "EbsDeviceName" + }, + "Ebs": { + "VolumeSize": { + "Ref": "EbsVolumeSize" + }, + "VolumeType": "gp2" + } + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#!/bin/bash -xe", + "\n", + "# setup ssh-forwarding. \n", + "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", + "\n", + "mkdir -p /opt/deeplearning", + "\n", + "sudo ln -s /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/anaconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh", + "\n", + "echo 'conda activate ", + { + "Ref": "ActivateCondaEnv" + }, + "' >> /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/.bash_login", + "\n", + "# run cfn-init. 
\n", + { + "Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + }, + "\\/cfn-init -v --region ", + { + "Ref": "AWS::Region" + }, + " --configsets Setup ", + " -s ", + { + "Ref": "AWS::StackId" + }, + " -r MasterLaunchConfig ", + "\n", + "" + ] + ] + } + }, + "KeyName": { + "Ref": "KeyName" + } + }, + "Metadata": { + "AWS::CloudFormation::Init": { + "configSets": { + "Setup": [ + "efs-config", + "download-setup", + "deeplearning-config" + ] + }, + "efs-config": { + "commands": { + "00_install_nfs": { + "command": { + "Fn::Join": [ + "", + [ + "if [ \"AmazonLinux\" = \"", + { + "Ref": "ImageType" + }, + "\" ];", + "then yum -y -q install nfs-utils; else apt-get -qq -y install nfs-common ; fi" + ] + ] + } + }, + "01_createdir": { + "command": { + "Fn::Join": [ + "", + [ + "mkdir -p /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "02_mount": { + "command": { + "Fn::Join": [ + "", + [ + "sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 ", + { + "Fn::If": [ + "CreateNewFileSystem", + { + "Ref": "FileSystem" + }, + { + "Ref": "EFSFileSystemId" + } + ] + }, + ".efs.", + { + "Ref": "AWS::Region" + }, + ".amazonaws.com:/ /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "03_data": { + "command": { + "Fn::Join": [ + "", + [ + "touch /", + { + "Ref": "EFSMountPoint" + }, + "/", + { + "Fn::If": [ + "CopyDataToEFS", + "data.txt", + "nodata.txt" + ] + } + ] + ] + } + }, + "04_data_efs": { + "test": { + "Fn::Join": [ + "", + [ + "test -e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + }, + "05_data_ebs": { + "test": { + "Fn::Join": [ + "", + [ + "test ! 
-e /", + { + "Ref": "EFSMountPoint" + }, + "/data.txt" + ] + ] + }, + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarData" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "06_src_ebs": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "TarSource" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "07_ebs_tar": { + "command": { + "Fn::Join": [ + "", + [ + "for file in /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "/*.tar ; do tar -xf $file --directory ", + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " ; done" + ] + ] + } + }, + "08_run_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "RunScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "09_setup_script": { + "command": { + "Fn::Join": [ + "", + [ + "aws s3 cp s3://", + { + "Ref": "S3Bucket" + }, + "/", + { + "Ref": "SetupScript" + }, + " /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "10_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " -R /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + } + ] + ] + } + }, + "11_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chmod u+x /home/", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + 
{ + "Ref": "ImageType" + } + ] + }, + "/*.sh" + ] + ] + } + }, + "12_permissions": { + "command": { + "Fn::Join": [ + "", + [ + "chown ", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + ":", + { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + " /", + { + "Ref": "EFSMountPoint" + } + ] + ] + } + } + } + }, + "download-setup": { + "files": { + "/opt/deeplearning/dl_cfn_setup_v2.py": { + "source": { + "Fn::Join": [ + "", + [ + { + "Fn::FindInMap": [ + "S3", + { + "Ref": "AWS::Region" + }, + "URL" + ] + }, + { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::Region" + }, + { + "Fn::FindInMap": [ + "Other", + "S3SourceBucket", + "BucketNameSuffix" + ] + } + ] + ] + }, + "/", + { + "Fn::FindInMap": [ + "Other", + "Setup", + "Filename" + ] + } + ] + ] + } + } + } + }, + "deeplearning-config": { + "commands": { + "01_setup": { + "command": "python /opt/deeplearning/dl_cfn_setup_v2.py | tee -a /var/log/cloud-init-output.log", + "cwd": "/opt/deeplearning", + "env": { + "AWS_DL_NODE_TYPE": "Master", + "AWS_DL_MASTER_QUEUE": { + "Fn::GetAtt": [ + "MasterQueue", + "QueueName" + ] + }, + "AWS_DL_WORKER_QUEUE": { + "Fn::GetAtt": [ + "WorkerQueue", + "QueueName" + ] + }, + "AWS_DL_WAITCONDITION_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + }, + "AWS_DL_MASTERLAUNCH_TIMEOUT": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "AWS_DL_STACK_ID": { + "Ref": "AWS::StackId" + }, + "AWS_DL_WAIT_HANDLE": { + "Ref": "myWaitHandle" + }, + "AWS_DL_ROLE_NAME": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-InstanceRole" + ] + ] + }, + "AWS_DL_DEFAULT_USER": { + "Fn::FindInMap": [ + "Other", + "DefaultUser", + { + "Ref": "ImageType" + } + ] + }, + "AWS_REGION": { + "Ref": "AWS::Region" + }, + "EFS_MOUNT": { + "Fn::Join": [ + "", + [ + "/", + { + "Ref": "EFSMountPoint" + } + ] + ] + }, + "CFN_PATH": { + 
"Fn::FindInMap": [ + "Other", + "CfnPath", + { + "Ref": "ImageType" + } + ] + } + } + } + } + } + } + } + }, + "MasterAutoScalingGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": [ + "MasterLaunchConfig", + "MountTarget", + "MasterQueue", + "WorkerQueue" + ], + "CreationPolicy": { + "ResourceSignal": { + "Timeout": { + "Fn::Join": [ + "", + [ + "PT", + { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "MasterLaunchTimeout" + ] + }, + "S" + ] + ] + }, + "Count": "1" + } + }, + "Properties": { + "DesiredCapacity": "1", + "MinSize": "1", + "MaxSize": "1", + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, + "LaunchConfigurationName": { + "Ref": "MasterLaunchConfig" + }, + "VPCZoneIdentifier": [ + { + "Ref": "PrivateSubnetId" + } + ], + "NotificationConfiguration": { + "TopicARN": { + "Ref": "ResourceMetadataSNSTopic" + }, + "NotificationTypes": [ + "autoscaling:EC2_INSTANCE_LAUNCH", + "autoscaling:EC2_INSTANCE_LAUNCH_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE" + ] + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-Master" + ] + ] + }, + "PropagateAtLaunch": true + }, + { + "Key": "NodeType", + "Value": "Master", + "PropagateAtLaunch": true + } + ] + } + }, + "WorkerAutoScalingGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": [ + "WorkerLaunchConfig", + "MountTarget", + "MasterQueue", + "WorkerQueue", + "MasterAutoScalingGroup" + ], + "Properties": { + "MinSize": "0", + "MaxSize": { + "Ref": "WorkerCount" + }, + "PlacementGroup": { + "Ref": "ResourcePlacementGroup" + }, + "DesiredCapacity": { + "Ref": "WorkerCount" + }, + "LaunchConfigurationName": { + "Ref": "WorkerLaunchConfig" + }, + "VPCZoneIdentifier": [ + { + "Ref": "PrivateSubnetId" + } + ], + "NotificationConfiguration": { + "TopicARN": { + "Ref": "ResourceMetadataSNSTopic" + }, + "NotificationTypes": [ + "autoscaling:EC2_INSTANCE_LAUNCH", 
+ "autoscaling:EC2_INSTANCE_LAUNCH_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE_ERROR", + "autoscaling:EC2_INSTANCE_TERMINATE" + ] + }, + "Tags": [ + { + "Key": "Name", + "Value": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-Worker" + ] + ] + }, + "PropagateAtLaunch": true + }, + { + "Key": "NodeType", + "Value": "Worker", + "PropagateAtLaunch": true + } + ] + } + }, + "MasterQueue": { + "Type": "AWS::SQS::Queue", + "Properties": { + "QueueName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn-master" + ] + ] + } + } + }, + "WorkerQueue": { + "Type": "AWS::SQS::Queue", + "Properties": { + "QueueName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn-worker" + ] + ] + } + } + }, + "ResourceMetadataSNSTopic": { + "Type": "AWS::SNS::Topic", + "Properties": { + "DisplayName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn" + ] + ] + }, + "Subscription": [ + { + "Endpoint": { + "Fn::GetAtt": [ + "ResourceMetadataLambdaFunction", + "Arn" + ] + }, + "Protocol": "lambda" + } + ], + "TopicName": { + "Fn::Join": [ + "", + [ + { + "Ref": "AWS::StackName" + }, + "-aws-dl-cfn" + ] + ] + } + } + }, + "myWaitHandle": { + "Type": "AWS::CloudFormation::WaitConditionHandle", + "Properties": {} + }, + "myWaitCondition": { + "Type": "AWS::CloudFormation::WaitCondition", + "Properties": { + "Handle": { + "Ref": "myWaitHandle" + }, + "Timeout": { + "Fn::FindInMap": [ + "Other", + "TimeoutValues", + "WaitConditionTimeout" + ] + } + } + } + }, + "Outputs": { + "AdminSSHSecurityGroup": { + "Description": "Security Group that restricts Inbound IPs to SSH into the Master", + "Value": { + "Ref": "AdminSSHSecurityGroup" + } + }, + "MasterAutoScalingGroup": { + "Description": "Autoscaling Group that contains the Master Instance", + "Value": { + "Ref": "MasterAutoScalingGroup" + } + }, + "WorkerAutoScalingGroup": { + "Description": "Autoscaling Group that contains the Workers", + 
"Value": { + "Ref": "WorkerAutoScalingGroup" + } + }, + "MountTargetID": { + "Description": "EFS Mount target ID", + "Value": { + "Ref": "MountTarget" + } + } + } +} \ No newline at end of file From 153180a95b978c5ea12259aea5e85f4da524cbc9 Mon Sep 17 00:00:00 2001 From: Ajay Vohra Date: Tue, 29 Jan 2019 20:34:03 +0000 Subject: [PATCH 5/5] upgrade to version 21.0 --- cfn-template/deeplearning.template | 42 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cfn-template/deeplearning.template b/cfn-template/deeplearning.template index 976bd5c..2b29f7a 100644 --- a/cfn-template/deeplearning.template +++ b/cfn-template/deeplearning.template @@ -195,29 +195,29 @@ }, "Mappings" : { "Ubuntu" : { - "us-east-1" : { "AMI" : "ami-0f9e8c4a1305ecd22" }, - "us-east-2" : { "AMI" : "ami-0c9ae74667b049f59" }, - "us-west-2" : { "AMI" : "ami-0d0ff0945ae093aea" }, - "eu-west-1" : { "AMI" : "ami-0827ddd2d8e38aa56" }, - "eu-central-1" : { "AMI" : "ami-03580946c6347e2f8" }, - "ap-southeast-1" : { "AMI" : "ami-09dfb1478dc499b95" }, - "ap-southeast-2" : { "AMI" : "ami-0a8f8e89b02a21088" }, - "ap-south-1" : { "AMI" : "ami-07ffe4e02cf5c2bd0" }, - "ap-northeast-1" : { "AMI" : "ami-031ce7af929321d3a" }, - "ap-northeast-2" : { "AMI" : "ami-09c2b38b2fbb748ce" } + "us-east-1" : { "AMI" : "ami-09a706a24845d0723" }, + "us-east-2" : { "AMI" : "ami-003ce277a8a9c0014" }, + "us-west-2" : { "AMI" : "ami-0b294f219d14e6a82" }, + "eu-west-1" : { "AMI" : "ami-086062166ec8340ac" }, + "eu-central-1" : { "AMI" : "ami-0f57552c8fc9e228f" }, + "ap-southeast-1" : { "AMI" : "ami-077b987c8b7a6462e" }, + "ap-southeast-2" : { "AMI" : "ami-0512a7cd86ea45901" }, + "ap-south-1" : { "AMI" : "ami-01e5f909b3c234383" }, + "ap-northeast-1" : { "AMI" : "ami-07a65197e224510c7" }, + "ap-northeast-2" : { "AMI" : "ami-098cb0cca04bdac5a" } }, "AmazonLinux" : { - "us-east-1" : { "AMI" : "ami-0a4b759b63b333b0e" }, - "us-east-2" : { "AMI" : "ami-0f71284ab59a38265" }, - "us-west-2" : { "AMI" : 
"ami-0305a0d7a68489e58" }, - "eu-west-1" : { "AMI" : "ami-0d2e5838a2908742f" }, - "eu-central-1" : { "AMI" : "ami-09b5cb82b50e3c9e9" }, - "ap-southeast-1" : { "AMI" : "ami-0abbc7f71da968649" }, - "ap-southeast-2" : { "AMI" : "ami-0b6d01aebbf6a1490" }, - "ap-south-1" : { "AMI" : "ami-0e17a6861b2574143" }, - "ap-northeast-1" : { "AMI" : "ami-0165fe49c30cad525" }, - "ap-northeast-2" : { "AMI" : "ami-0b54ee3b4c6e0b975" } + "us-east-1" : { "AMI" : "ami-0f5788229b53809c9" }, + "us-east-2" : { "AMI" : "ami-001f9c1ca57fbc7a2" }, + "us-west-2" : { "AMI" : "ami-0c0c1a8d6a4695fdc" }, + "eu-west-1" : { "AMI" : "ami-088b2e2cc2498f3ca" }, + "eu-central-1" : { "AMI" : "ami-055ab192b68ca4d2f" }, + "ap-southeast-1" : { "AMI" : "ami-044c38d8c0100ea15" }, + "ap-southeast-2" : { "AMI" : "ami-02c907307d02dc462" }, + "ap-south-1" : { "AMI" : "ami-074811debc0b11bdf" }, + "ap-northeast-1" : { "AMI" : "ami-08a7740ff4d3fd90f" }, + "ap-northeast-2" : { "AMI" : "ami-07b22a7626892dd48" } }, "S3" : { @@ -899,7 +899,7 @@ "Properties" : { "MinSize" : "0", "MaxSize" : { "Ref" : "WorkerCount" }, - "PlacementGroup": { + "PlacementGroup": { "Ref": "ResourcePlacementGroup" }, "DesiredCapacity" : { "Ref" : "WorkerCount" },