| { | |
| "AWSTemplateFormatVersion" : "2010-09-09", | |
| "Description" : "Launches a Deep Learning Cluster with one Master and variable number of Workers.", | |
| "Parameters" : { | |
| "KeyName" : { | |
| "Description" : "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances", | |
| "Type" : "AWS::EC2::KeyPair::KeyName" | |
| }, | |
| "WorkerCount" : { | |
| "Description" : "The number of worker instances (launches +1 instance for the Master).", | |
| "Type" : "Number", | |
| "MinValue" : "1", | |
| "Default" : "1" | |
| }, | |
| "InstanceType" : { | |
| "Description" : "The EC2 instance type for all instances.", | |
| "Type" : "String", | |
| "Default" : "g2.2xlarge", | |
| "AllowedValues" : [ "g2.2xlarge", "g2.8xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge" ], | |
| "ConstraintDescription" : "Must be a valid GPU EC2 instance type." | |
| }, | |
| "SSHLocation": { | |
| "Description": "Restrict SSH access to a valid CIDR range, this should be a valid CIDR IP address range that you want to allow access to your Master and Stack.", | |
| "Type": "String", | |
| "MinLength": "9", | |
| "MaxLength": "18", | |
| "AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})", | |
| "ConstraintDescription": "Must be a valid CIDR range of the form x.x.x.x/x" | |
| } | |
| }, | |
| "Mappings" : { | |
| "AmazonLinux" : { | |
| "us-east-1" : { "AMI" : "ami-7d93bf6a" }, | |
| "us-west-2" : { "AMI" : "ami-8b08abeb" }, | |
| "eu-west-1" : { "AMI" : "ami-4844153b" } | |
| }, | |
| "SubnetConfig" : { | |
| "VPC" : { "CIDR" : "10.0.0.0/16" }, | |
| "Public" : { "CIDR" : "10.0.0.0/24" }, | |
| "Private" : { "CIDR" : "10.0.1.0/24" } | |
| } | |
| }, | |
| "Resources" : { | |
| "InstanceRole" : { | |
| "Type" : "AWS::IAM::Role", | |
| "Properties" : { | |
| "AssumeRolePolicyDocument" : { | |
| "Statement" : [ { | |
| "Effect" : "Allow", | |
| "Principal" : { | |
| "Service" : [ "ec2.amazonaws.com" ] | |
| }, | |
| "Action" : [ "sts:AssumeRole" ] | |
| } ] | |
| }, | |
| "Path" : "/", | |
| "Policies" : [ { | |
| "PolicyName" : "instance", | |
| "PolicyDocument" : { | |
| "Statement" : [ { | |
| "Effect" : "Allow", | |
| "Action" : [ "autoscaling:DescribeAutoScalingGroups", "autoscaling:DescribeAutoScalingInstances", "ec2:DescribeInstances", "cloudformation:DescribeStackResource"], | |
| "Resource" : "*" | |
| } ] | |
| } | |
| } ] | |
| } | |
| }, | |
| "InstanceProfile" : { | |
| "Type" : "AWS::IAM::InstanceProfile", | |
| "DependsOn" : "InstanceRole", | |
| "Properties" : { | |
| "Path" : "/", | |
| "Roles" : [ { | |
| "Ref" : "InstanceRole" | |
| } ] | |
| } | |
| }, | |
| "AdminSSHSecurityGroup" : { | |
| "Type" : "AWS::EC2::SecurityGroup", | |
| "Properties" : { | |
| "GroupDescription" : "Security group that controls SSH access to the Master instance.", | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "Tags" : [ | |
| { "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_SSH" ] ] } } | |
| ], | |
| "SecurityGroupIngress" : [ | |
| { "IpProtocol" : "tcp", "FromPort" : "22", "ToPort" : "22", "CidrIp" : { "Ref" : "SSHLocation" } } | |
| ], | |
| "SecurityGroupEgress" : [ | |
| ] | |
| } | |
| }, | |
| "MasterSecurityGroup" : { | |
| "Type" : "AWS::EC2::SecurityGroup", | |
| "Properties" : { | |
| "GroupDescription" : "Enable Port access to and from the Master on the Private Interface.", | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "Tags" : [ | |
| { "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_Master" ] ] } } | |
| ], | |
| "SecurityGroupIngress" : [ | |
| ], | |
| "SecurityGroupEgress" : [ | |
| ] | |
| } | |
| }, | |
| "MasterSecurityIngress1" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["MasterSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "tcp", | |
| "FromPort" : "0", | |
| "ToPort" : "65535", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "MasterSecurityIngress2" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "icmp", | |
| "FromPort" : "-1", | |
| "ToPort" : "-1", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "MasterSecurityIngress3" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "tcp", | |
| "FromPort" : "0", | |
| "ToPort" : "65535", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "MasterSecurityIngress4" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "icmp", | |
| "FromPort" : "-1", | |
| "ToPort" : "-1", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "WorkerSecurityGroup" : { | |
| "Type" : "AWS::EC2::SecurityGroup", | |
| "DependsOn" : ["MasterSecurityGroup"], | |
| "Properties" : { | |
| "GroupDescription" : "Enable Port access to and from the Worker on the Private Interface", | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "Tags" : [ | |
| { "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_Worker"] ]} } | |
| ], | |
| "SecurityGroupIngress" : [ | |
| { "IpProtocol" : "tcp", "FromPort" : "0", "ToPort" : "65535", "SourceSecurityGroupId" : { "Ref" : "MasterSecurityGroup" } }, | |
| { "IpProtocol" : "icmp", "FromPort" : "-1", "ToPort" : "-1", "SourceSecurityGroupId" : { "Ref" : "MasterSecurityGroup" } } | |
| ], | |
| "SecurityGroupEgress" : [ | |
| ] | |
| } | |
| }, | |
| "WorkerSecurityIngress3" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["WorkerSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "tcp", | |
| "FromPort" : "0", | |
| "ToPort" : "65535", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "WorkerSecurityIngress4" : { | |
| "Type" : "AWS::EC2::SecurityGroupIngress", | |
| "DependsOn" : ["WorkerSecurityGroup"], | |
| "Properties" : { | |
| "GroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }, | |
| "IpProtocol" : "icmp", | |
| "FromPort" : "-1", | |
| "ToPort" : "-1", | |
| "SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] } | |
| } | |
| }, | |
| "WorkerLaunchConfig" : { | |
| "Type" : "AWS::AutoScaling::LaunchConfiguration", | |
| "Properties" : { | |
| "ImageId" : { | |
| "Fn::FindInMap" : [ "AmazonLinux", { "Ref" : "AWS::Region" }, "AMI" ] | |
| }, | |
| "InstanceType" : { | |
| "Ref" : "InstanceType" | |
| }, | |
| "IamInstanceProfile" : { | |
| "Ref" : "InstanceProfile" | |
| }, | |
| "SecurityGroups" : [ | |
| {"Ref" : "WorkerSecurityGroup"} | |
| ], | |
| "UserData" : { | |
| "Fn::Base64" : { | |
| "Fn::Join" : [ "", | |
| [ | |
| "#!/bin/bash -xe", | |
| "\n", | |
| "# setup ssh-forwarding. \n", | |
| "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", | |
| "\n", | |
| "" | |
| ] | |
| ] | |
| } | |
| }, | |
| "KeyName" : { | |
| "Ref" : "KeyName" | |
| } | |
| } | |
| }, | |
| "MasterLaunchConfig" : { | |
| "Type" : "AWS::AutoScaling::LaunchConfiguration", | |
| "Properties" : { | |
| "AssociatePublicIpAddress" : "true", | |
| "ImageId" : { | |
| "Fn::FindInMap" : [ "AmazonLinux", { "Ref" : "AWS::Region" }, "AMI" ] | |
| }, | |
| "InstanceType" : { | |
| "Ref" : "InstanceType" | |
| }, | |
| "IamInstanceProfile" : { | |
| "Ref" : "InstanceProfile" | |
| }, | |
| "SecurityGroups" : [ | |
| { "Ref" : "MasterSecurityGroup" }, | |
| { "Ref" : "AdminSSHSecurityGroup" } | |
| ], | |
| "UserData" : { | |
| "Fn::Base64" : { | |
| "Fn::Join" : [ "", | |
| [ | |
| "#!/bin/bash -xe", | |
| "\n", | |
| "# setup ssh-forwarding. \n", | |
| "sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config", | |
| "\n", | |
| "mkdir -p /opt/deeplearning", | |
| "\n", | |
| "# run cfn-init. \n", | |
| "export CFN_PATH=\\/opt\\/aws\\/bin", | |
| "\n", | |
| "$CFN_PATH\\/cfn-init -v --region ", { "Ref" : "AWS::Region" }, | |
| " -s ", | |
| { "Ref" : "AWS::StackId" }, | |
| " -r MasterLaunchConfig ", | |
| "\n", | |
| "" | |
| ] | |
| ] | |
| } | |
| }, | |
| "KeyName" : { | |
| "Ref" : "KeyName" | |
| } | |
| }, | |
| "Metadata" : { | |
| "AWS::CloudFormation::Init" : { | |
| "config" : { | |
| "commands" : { | |
| "test" : { | |
| "command" : "/opt/deeplearning/fetch-hosts.sh" | |
| } | |
| }, | |
| "files" : { | |
| "/opt/deeplearning/fetch-hosts.sh": { | |
| "content" : { "Fn::Join" : ["", [ | |
| "#!/bin/bash -xe", | |
| "\n", | |
| "# setup deep learning Master ip,dns alias", | |
| "\n", | |
| "num_instance=1", | |
| "\n", | |
| "instance=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" autoscaling describe-auto-scaling-groups --no-paginate --query \"AutoScalingGroups[?Tags[?Value=='",{"Ref" : "AWS::StackName"},"']]|[?Tags[?Key=='NodeType']]|[?Tags[?Value=='Master']].Instances[*].InstanceId\" --output text | tr \"\\t\" \"\\n\")", | |
| "\n", | |
| "ip=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" ec2 describe-instances --instance-ids $instance --output text --query \"Reservations[*].Instances[*].PrivateIpAddress\")", | |
| "\n", | |
| "echo \"$ip deeplearning-master\" >>/etc/hosts", | |
| "\n", | |
| "echo \"$ip deeplearning-worker$num_instance\" >>/etc/hosts", | |
| "\n", | |
| "echo \"deeplearning-worker$num_instance\" >>/opt/deeplearning/workers", | |
| "\n", | |
| "# setup deep learning workers ip,dns alias", | |
| "\n", | |
| "for instance in `aws --region \"", { "Ref" : "AWS::Region" }, "\" autoscaling describe-auto-scaling-groups --no-paginate --query \"AutoScalingGroups[?Tags[?Value=='",{"Ref" : "AWS::StackName"},"']]|[?Tags[?Key=='NodeType']]|[?Tags[?Value=='Worker']].Instances[*].InstanceId\" --output text | tr \"\\t\" \"\\n\"`", | |
| "\n", | |
| "do", | |
| "\n", | |
| "let \"num_instance += 1\"", | |
| "\n", | |
| "ip=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" ec2 describe-instances --instance-ids $instance --output text --query \"Reservations[*].Instances[*].PrivateIpAddress\")", | |
| "\n", | |
| "echo \"$ip deeplearning-worker$num_instance\" >>/etc/hosts", | |
| "\n", | |
| "echo \"deeplearning-worker$num_instance\" >>/opt/deeplearning/workers", | |
| "\n", | |
| "done;", | |
| "\n", | |
| "# set deep learning environment variables", | |
| "\n", | |
| "echo \"export DEEPLEARNING_WORKERS_PATH=/opt/deeplearning/workers\" >>/etc/profile.d/deeplearning.sh", | |
| "\n", | |
| "echo \"export DEEPLEARNING_WORKERS_COUNT=$(wc -l < \\/opt\\/deeplearning\\/workers)\" >>/etc/profile.d/deeplearning.sh", | |
| "\n", | |
| "echo \"export DEEPLEARNING_WORKER_GPU_COUNT=$(nvidia-smi -L | wc -l)\" >>/etc/profile.d/deeplearning.sh", | |
| "\n", | |
| "" | |
| ]]}, | |
| "mode" : "000544", | |
| "owner" : "root", | |
| "group" : "root" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "MasterAutoScalingGroup" : { | |
| "Type" : "AWS::AutoScaling::AutoScalingGroup", | |
| "DependsOn" : ["MasterLaunchConfig", "WorkerAutoScalingGroup"], | |
| "Properties" : { | |
| "DesiredCapacity" : "1", | |
| "MinSize" : "1", | |
| "MaxSize" : "1", | |
| "LaunchConfigurationName" : { "Ref" : "MasterLaunchConfig"}, | |
| "VPCZoneIdentifier" : [{ "Ref" : "PublicSubnet"}], | |
| "Tags" : [ { | |
| "Key" : "Name", | |
| "Value" : { | |
| "Ref" : "AWS::StackName" | |
| }, | |
| "PropagateAtLaunch" : true | |
| }, | |
| { | |
| "Key" : "NodeType", | |
| "Value" : "Master", | |
| "PropagateAtLaunch" : true | |
| } | |
| ] | |
| } | |
| }, | |
| "WorkerAutoScalingGroup" : { | |
| "Type" : "AWS::AutoScaling::AutoScalingGroup", | |
| "DependsOn" : ["WorkerLaunchConfig"], | |
| "Properties" : { | |
| "DesiredCapacity" : { | |
| "Ref" : "WorkerCount" | |
| }, | |
| "MinSize" : { | |
| "Ref" : "WorkerCount" | |
| }, | |
| "MaxSize" : { | |
| "Ref" : "WorkerCount" | |
| }, | |
| "LaunchConfigurationName" : { | |
| "Ref" : "WorkerLaunchConfig" | |
| }, | |
| "VPCZoneIdentifier" : [ { "Ref" : "PrivateSubnet" } ], | |
| "Tags" : [ { | |
| "Key" : "Name", | |
| "Value" : { | |
| "Ref" : "AWS::StackName" | |
| }, | |
| "PropagateAtLaunch" : true | |
| }, | |
| { | |
| "Key" : "NodeType", | |
| "Value" : "Worker", | |
| "PropagateAtLaunch" : true | |
| } | |
| ] | |
| } | |
| }, | |
| "NATGatewayEIP" : { | |
| "Type" : "AWS::EC2::EIP", | |
| "Properties" : {"Domain" : "vpc"} | |
| }, | |
| "Vpc" : { | |
| "Type" : "AWS::EC2::VPC", | |
| "Properties" : { | |
| "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "VPC", "CIDR" ]}, | |
| "EnableDnsSupport" : "true", | |
| "EnableDnsHostnames" : "true", | |
| "Tags" : [ | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } | |
| ] | |
| } | |
| }, | |
| "InternetGateway" : { | |
| "Type" : "AWS::EC2::InternetGateway", | |
| "Properties" : { | |
| "Tags" : [ | |
| { "Key" : "Network", "Value" : "Public" }, | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } | |
| ] | |
| } | |
| }, | |
| "GatewayToInternet" : { | |
| "Type" : "AWS::EC2::VPCGatewayAttachment", | |
| "Properties" : { | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "InternetGatewayId" : { "Ref" : "InternetGateway" } | |
| } | |
| }, | |
| "PublicSubnet" : { | |
| "Type" : "AWS::EC2::Subnet", | |
| "DependsOn" : ["PrivateSubnet"], | |
| "Properties" : { | |
| "VpcId" : {"Ref" : "Vpc"}, | |
| "AvailabilityZone" : { "Fn::GetAtt" : [ "PrivateSubnet", "AvailabilityZone" ] } , | |
| "CidrBlock": { "Fn::FindInMap" : [ "SubnetConfig", "Public", "CIDR" ]}, | |
| "Tags" : [ | |
| { "Key" : "Network", "Value" : "Public" }, | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } | |
| ] | |
| } | |
| }, | |
| "PrivateSubnet" : { | |
| "Type" : "AWS::EC2::Subnet", | |
| "Properties" : { | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "Private", "CIDR" ]}, | |
| "Tags" : [ | |
| { "Key" : "Network", "Value" : "Private" }, | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }} | |
| ] | |
| } | |
| }, | |
| "NATGateway" : { | |
| "Type" : "AWS::EC2::NatGateway", | |
| "DependsOn" : "GatewayToInternet", | |
| "Properties" : { | |
| "AllocationId" : { | |
| "Fn::GetAtt" : [ | |
| "NATGatewayEIP", | |
| "AllocationId" | |
| ] | |
| }, | |
| "SubnetId" : { | |
| "Ref" : "PublicSubnet" | |
| } | |
| } | |
| }, | |
| "PublicRouteTable" : { | |
| "Type" : "AWS::EC2::RouteTable", | |
| "Properties" : { | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "Tags" : [ | |
| { "Key" : "Network", "Value" : "Public" }, | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } } | |
| ] | |
| } | |
| }, | |
| "PublicRoute" : { | |
| "Type" : "AWS::EC2::Route", | |
| "Properties" : { | |
| "RouteTableId" : { "Ref" : "PublicRouteTable" }, | |
| "DestinationCidrBlock" : "0.0.0.0/0", | |
| "GatewayId" : { "Ref" : "InternetGateway" } | |
| } | |
| }, | |
| "PublicSubnetRouteAssociation" : { | |
| "Type" : "AWS::EC2::SubnetRouteTableAssociation", | |
| "Properties" : { | |
| "SubnetId" : { "Ref" : "PublicSubnet" }, | |
| "RouteTableId" : { "Ref" : "PublicRouteTable" } | |
| } | |
| }, | |
| "PrivateRouteTable" : { | |
| "Type" : "AWS::EC2::RouteTable", | |
| "Properties" : { | |
| "VpcId" : { "Ref" : "Vpc" }, | |
| "Tags" : [ | |
| { "Key" : "Network", "Value" : "Private" }, | |
| { "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }} | |
| ] | |
| } | |
| }, | |
| "PrivateRoute" : { | |
| "Type" : "AWS::EC2::Route", | |
| "Properties" : { | |
| "RouteTableId" : { "Ref" : "PrivateRouteTable" }, | |
| "DestinationCidrBlock" : "0.0.0.0/0", | |
| "NatGatewayId" : { "Ref" : "NATGateway" } | |
| } | |
| }, | |
| "PrivateSubnetRouteAssociation" : { | |
| "Type" : "AWS::EC2::SubnetRouteTableAssociation", | |
| "Properties" : { | |
| "SubnetId" : { "Ref" : "PrivateSubnet" }, | |
| "RouteTableId" : { "Ref" : "PrivateRouteTable" } | |
| } | |
| } | |
| }, | |
| "Outputs" : { | |
| "AdminSSHSecurityGroup" : { | |
| "Description" : "Security Group that restricts Inbound IPs to SSH into the Master", | |
| "Value" : { | |
| "Ref" : "AdminSSHSecurityGroup" | |
| } | |
| }, | |
| "MasterAutoScalingGroup" : { | |
| "Description" : "Autoscaling Group that contains the Master Instance", | |
| "Value" : { | |
| "Ref" : "MasterAutoScalingGroup" | |
| } | |
| } | |
| } | |
| } |