{
"AWSTemplateFormatVersion" : "2010-09-09",
"Description" : "Launches a Deep Learning Cluster with one Master and a variable number of Workers.",
"Parameters" : {
"KeyName" : {
"Description" : "Name of an existing Amazon EC2 KeyPair to enable SSH access to the instances",
"Type" : "AWS::EC2::KeyPair::KeyName"
},
"WorkerCount" : {
"Description" : "The number of worker instances to launch (one additional instance is launched for the Master).",
"Type" : "Number",
"MinValue" : "1",
"Default" : "1"
},
"InstanceType" : {
"Description" : "The EC2 instance type for all instances.",
"Type" : "String",
"Default" : "g2.2xlarge",
"AllowedValues" : [ "g2.2xlarge", "g2.8xlarge", "p2.xlarge", "p2.8xlarge", "p2.16xlarge" ],
"ConstraintDescription" : "Must be a valid GPU EC2 instance type."
},
"SSHLocation": {
"Description": "Restricts SSH access to the given CIDR range. This must be a valid CIDR IP address range from which you want to allow SSH access to your Master and Stack.",
"Type": "String",
"MinLength": "9",
"MaxLength": "18",
"AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})",
"ConstraintDescription": "Must be a valid CIDR range of the form x.x.x.x/x"
}
},
"Mappings" : {
"AmazonLinux" : {
"us-east-1" : { "AMI" : "ami-7d93bf6a" },
"us-west-2" : { "AMI" : "ami-8b08abeb" },
"eu-west-1" : { "AMI" : "ami-4844153b" }
},
"SubnetConfig" : {
"VPC" : { "CIDR" : "10.0.0.0/16" },
"Public" : { "CIDR" : "10.0.0.0/24" },
"Private" : { "CIDR" : "10.0.1.0/24" }
}
},
"Resources" : {
"InstanceRole" : {
"Type" : "AWS::IAM::Role",
"Properties" : {
"AssumeRolePolicyDocument" : {
"Statement" : [ {
"Effect" : "Allow",
"Principal" : {
"Service" : [ "ec2.amazonaws.com" ]
},
"Action" : [ "sts:AssumeRole" ]
} ]
},
"Path" : "/",
"Policies" : [ {
"PolicyName" : "instance",
"PolicyDocument" : {
"Statement" : [ {
"Effect" : "Allow",
"Action" : [ "autoscaling:DescribeAutoScalingGroups", "autoscaling:DescribeAutoScalingInstances", "ec2:DescribeInstances", "cloudformation:DescribeStackResource"],
"Resource" : "*"
} ]
}
} ]
}
},
"InstanceProfile" : {
"Type" : "AWS::IAM::InstanceProfile",
"DependsOn" : "InstanceRole",
"Properties" : {
"Path" : "/",
"Roles" : [ {
"Ref" : "InstanceRole"
} ]
}
},
"AdminSSHSecurityGroup" : {
"Type" : "AWS::EC2::SecurityGroup",
"Properties" : {
"GroupDescription" : "Security group that controls SSH access to the Master instance.",
"VpcId" : { "Ref" : "Vpc" },
"Tags" : [
{ "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_SSH" ] ] } }
],
"SecurityGroupIngress" : [
{ "IpProtocol" : "tcp", "FromPort" : "22", "ToPort" : "22", "CidrIp" : { "Ref" : "SSHLocation" } }
],
"SecurityGroupEgress" : [
]
}
},
"MasterSecurityGroup" : {
"Type" : "AWS::EC2::SecurityGroup",
"Properties" : {
"GroupDescription" : "Enable Port access to and from the Master on the Private Interface.",
"VpcId" : { "Ref" : "Vpc" },
"Tags" : [
{ "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_Master" ] ] } }
],
"SecurityGroupIngress" : [
],
"SecurityGroupEgress" : [
]
}
},
"MasterSecurityIngress1" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["MasterSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] },
"IpProtocol" : "tcp",
"FromPort" : "0",
"ToPort" : "65535",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }
}
},
"MasterSecurityIngress2" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] },
"IpProtocol" : "icmp",
"FromPort" : "-1",
"ToPort" : "-1",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] }
}
},
"MasterSecurityIngress3" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] },
"IpProtocol" : "tcp",
"FromPort" : "0",
"ToPort" : "65535",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }
}
},
"MasterSecurityIngress4" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["MasterSecurityGroup", "WorkerSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "MasterSecurityGroup", "GroupId" ] },
"IpProtocol" : "icmp",
"FromPort" : "-1",
"ToPort" : "-1",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }
}
},
"WorkerSecurityGroup" : {
"Type" : "AWS::EC2::SecurityGroup",
"DependsOn" : ["MasterSecurityGroup"],
"Properties" : {
"GroupDescription" : "Enable Port access to and from the Worker on the Private Interface",
"VpcId" : { "Ref" : "Vpc" },
"Tags" : [
{ "Key" : "Name", "Value" : {"Fn::Join" : ["", [{ "Ref" : "AWS::StackName" }, "_Worker"] ]} }
],
"SecurityGroupIngress" : [
{ "IpProtocol" : "tcp", "FromPort" : "0", "ToPort" : "65535", "SourceSecurityGroupId" : { "Ref" : "MasterSecurityGroup" } },
{ "IpProtocol" : "icmp", "FromPort" : "-1", "ToPort" : "-1", "SourceSecurityGroupId" : { "Ref" : "MasterSecurityGroup" } }
],
"SecurityGroupEgress" : [
]
}
},
"WorkerSecurityIngress3" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["WorkerSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] },
"IpProtocol" : "tcp",
"FromPort" : "0",
"ToPort" : "65535",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }
}
},
"WorkerSecurityIngress4" : {
"Type" : "AWS::EC2::SecurityGroupIngress",
"DependsOn" : ["WorkerSecurityGroup"],
"Properties" : {
"GroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] },
"IpProtocol" : "icmp",
"FromPort" : "-1",
"ToPort" : "-1",
"SourceSecurityGroupId" : { "Fn::GetAtt": [ "WorkerSecurityGroup", "GroupId" ] }
}
},
"WorkerLaunchConfig" : {
"Type" : "AWS::AutoScaling::LaunchConfiguration",
"Properties" : {
"ImageId" : {
"Fn::FindInMap" : [ "AmazonLinux", { "Ref" : "AWS::Region" }, "AMI" ]
},
"InstanceType" : {
"Ref" : "InstanceType"
},
"IamInstanceProfile" : {
"Ref" : "InstanceProfile"
},
"SecurityGroups" : [
{"Ref" : "WorkerSecurityGroup"}
],
"UserData" : {
"Fn::Base64" : {
"Fn::Join" : [ "",
[
"#!/bin/bash -xe",
"\n",
"# setup ssh-forwarding. \n",
"sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config",
"\n",
""
]
]
}
},
"KeyName" : {
"Ref" : "KeyName"
}
}
},
"MasterLaunchConfig" : {
"Type" : "AWS::AutoScaling::LaunchConfiguration",
"Properties" : {
"AssociatePublicIpAddress" : "true",
"ImageId" : {
"Fn::FindInMap" : [ "AmazonLinux", { "Ref" : "AWS::Region" }, "AMI" ]
},
"InstanceType" : {
"Ref" : "InstanceType"
},
"IamInstanceProfile" : {
"Ref" : "InstanceProfile"
},
"SecurityGroups" : [
{ "Ref" : "MasterSecurityGroup" },
{ "Ref" : "AdminSSHSecurityGroup" }
],
"UserData" : {
"Fn::Base64" : {
"Fn::Join" : [ "",
[
"#!/bin/bash -xe",
"\n",
"# setup ssh-forwarding. \n",
"sed -i \"s/^#\\(\\s\\+\\)ForwardAgent\\(\\s\\+\\)no/\\ \\1ForwardAgent\\2yes/g\" /etc/ssh/ssh_config",
"\n",
"mkdir -p /opt/deeplearning",
"\n",
"# run cfn-init. \n",
"export CFN_PATH=\\/opt\\/aws\\/bin",
"\n",
"$CFN_PATH\\/cfn-init -v --region ", { "Ref" : "AWS::Region" },
" -s ",
{ "Ref" : "AWS::StackId" },
" -r MasterLaunchConfig ",
"\n",
""
]
]
}
},
"KeyName" : {
"Ref" : "KeyName"
}
},
"Metadata" : {
"AWS::CloudFormation::Init" : {
"config" : {
"commands" : {
"test" : {
"command" : "/opt/deeplearning/fetch-hosts.sh"
}
},
"files" : {
"/opt/deeplearning/fetch-hosts.sh": {
"content" : { "Fn::Join" : ["", [
"#!/bin/bash -xe",
"\n",
"# setup deep learning Master ip,dns alias",
"\n",
"num_instance=1",
"\n",
"instance=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" autoscaling describe-auto-scaling-groups --no-paginate --query \"AutoScalingGroups[?Tags[?Value=='",{"Ref" : "AWS::StackName"},"']]|[?Tags[?Key=='NodeType']]|[?Tags[?Value=='Master']].Instances[*].InstanceId\" --output text | tr \"\\t\" \"\\n\")",
"\n",
"ip=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" ec2 describe-instances --instance-ids $instance --output text --query \"Reservations[*].Instances[*].PrivateIpAddress\")",
"\n",
"echo \"$ip deeplearning-master\" >>/etc/hosts",
"\n",
"echo \"$ip deeplearning-worker$num_instance\" >>/etc/hosts",
"\n",
"echo \"deeplearning-worker$num_instance\" >>/opt/deeplearning/workers",
"\n",
"# setup deep learning workers ip,dns alias",
"\n",
"for instance in `aws --region \"", { "Ref" : "AWS::Region" }, "\" autoscaling describe-auto-scaling-groups --no-paginate --query \"AutoScalingGroups[?Tags[?Value=='",{"Ref" : "AWS::StackName"},"']]|[?Tags[?Key=='NodeType']]|[?Tags[?Value=='Worker']].Instances[*].InstanceId\" --output text | tr \"\\t\" \"\\n\"`",
"\n",
"do",
"\n",
"let \"num_instance += 1\"",
"\n",
"ip=$(aws --region \"", { "Ref" : "AWS::Region" }, "\" ec2 describe-instances --instance-ids $instance --output text --query \"Reservations[*].Instances[*].PrivateIpAddress\")",
"\n",
"echo \"$ip deeplearning-worker$num_instance\" >>/etc/hosts",
"\n",
"echo \"deeplearning-worker$num_instance\" >>/opt/deeplearning/workers",
"\n",
"done;",
"\n",
"# set deep learning environment variables",
"\n",
"echo \"export DEEPLEARNING_WORKERS_PATH=/opt/deeplearning/workers\" >>/etc/profile.d/deeplearning.sh",
"\n",
"echo \"export DEEPLEARNING_WORKERS_COUNT=$(wc -l < \\/opt\\/deeplearning\\/workers)\" >>/etc/profile.d/deeplearning.sh",
"\n",
"echo \"export DEEPLEARNING_WORKER_GPU_COUNT=$(nvidia-smi -L | wc -l)\" >>/etc/profile.d/deeplearning.sh",
"\n",
""
]]},
"mode" : "000544",
"owner" : "root",
"group" : "root"
}
}
}
}
}
},
"MasterAutoScalingGroup" : {
"Type" : "AWS::AutoScaling::AutoScalingGroup",
"DependsOn" : ["MasterLaunchConfig", "WorkerAutoScalingGroup"],
"Properties" : {
"DesiredCapacity" : "1",
"MinSize" : "1",
"MaxSize" : "1",
"LaunchConfigurationName" : { "Ref" : "MasterLaunchConfig"},
"VPCZoneIdentifier" : [{ "Ref" : "PublicSubnet"}],
"Tags" : [ {
"Key" : "Name",
"Value" : {
"Ref" : "AWS::StackName"
},
"PropagateAtLaunch" : true
},
{
"Key" : "NodeType",
"Value" : "Master",
"PropagateAtLaunch" : true
}
]
}
},
"WorkerAutoScalingGroup" : {
"Type" : "AWS::AutoScaling::AutoScalingGroup",
"DependsOn" : ["WorkerLaunchConfig"],
"Properties" : {
"DesiredCapacity" : {
"Ref" : "WorkerCount"
},
"MinSize" : {
"Ref" : "WorkerCount"
},
"MaxSize" : {
"Ref" : "WorkerCount"
},
"LaunchConfigurationName" : {
"Ref" : "WorkerLaunchConfig"
},
"VPCZoneIdentifier" : [ { "Ref" : "PrivateSubnet" } ],
"Tags" : [ {
"Key" : "Name",
"Value" : {
"Ref" : "AWS::StackName"
},
"PropagateAtLaunch" : true
},
{
"Key" : "NodeType",
"Value" : "Worker",
"PropagateAtLaunch" : true
}
]
}
},
"NATGatewayEIP" : {
"Type" : "AWS::EC2::EIP",
"Properties" : {"Domain" : "vpc"}
},
"Vpc" : {
"Type" : "AWS::EC2::VPC",
"Properties" : {
"CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "VPC", "CIDR" ]},
"EnableDnsSupport" : "true",
"EnableDnsHostnames" : "true",
"Tags" : [
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } }
]
}
},
"InternetGateway" : {
"Type" : "AWS::EC2::InternetGateway",
"Properties" : {
"Tags" : [
{ "Key" : "Network", "Value" : "Public" },
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } }
]
}
},
"GatewayToInternet" : {
"Type" : "AWS::EC2::VPCGatewayAttachment",
"Properties" : {
"VpcId" : { "Ref" : "Vpc" },
"InternetGatewayId" : { "Ref" : "InternetGateway" }
}
},
"PublicSubnet" : {
"Type" : "AWS::EC2::Subnet",
"DependsOn" : ["PrivateSubnet"],
"Properties" : {
"VpcId" : {"Ref" : "Vpc"},
"AvailabilityZone" : { "Fn::GetAtt" : [ "PrivateSubnet", "AvailabilityZone" ] } ,
"CidrBlock": { "Fn::FindInMap" : [ "SubnetConfig", "Public", "CIDR" ]},
"Tags" : [
{ "Key" : "Network", "Value" : "Public" },
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } }
]
}
},
"PrivateSubnet" : {
"Type" : "AWS::EC2::Subnet",
"Properties" : {
"VpcId" : { "Ref" : "Vpc" },
"CidrBlock" : { "Fn::FindInMap" : [ "SubnetConfig", "Private", "CIDR" ]},
"Tags" : [
{ "Key" : "Network", "Value" : "Private" },
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }}
]
}
},
"NATGateway" : {
"Type" : "AWS::EC2::NatGateway",
"DependsOn" : "GatewayToInternet",
"Properties" : {
"AllocationId" : {
"Fn::GetAtt" : [
"NATGatewayEIP",
"AllocationId"
]
},
"SubnetId" : {
"Ref" : "PublicSubnet"
}
}
},
"PublicRouteTable" : {
"Type" : "AWS::EC2::RouteTable",
"Properties" : {
"VpcId" : { "Ref" : "Vpc" },
"Tags" : [
{ "Key" : "Network", "Value" : "Public" },
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" } }
]
}
},
"PublicRoute" : {
"Type" : "AWS::EC2::Route",
"Properties" : {
"RouteTableId" : { "Ref" : "PublicRouteTable" },
"DestinationCidrBlock" : "0.0.0.0/0",
"GatewayId" : { "Ref" : "InternetGateway" }
}
},
"PublicSubnetRouteAssociation" : {
"Type" : "AWS::EC2::SubnetRouteTableAssociation",
"Properties" : {
"SubnetId" : { "Ref" : "PublicSubnet" },
"RouteTableId" : { "Ref" : "PublicRouteTable" }
}
},
"PrivateRouteTable" : {
"Type" : "AWS::EC2::RouteTable",
"Properties" : {
"VpcId" : { "Ref" : "Vpc" },
"Tags" : [
{ "Key" : "Network", "Value" : "Private" },
{ "Key" : "Name", "Value" : { "Ref" : "AWS::StackName" }}
]
}
},
"PrivateRoute" : {
"Type" : "AWS::EC2::Route",
"Properties" : {
"RouteTableId" : { "Ref" : "PrivateRouteTable" },
"DestinationCidrBlock" : "0.0.0.0/0",
"NatGatewayId" : { "Ref" : "NATGateway" }
}
},
"PrivateSubnetRouteAssociation" : {
"Type" : "AWS::EC2::SubnetRouteTableAssociation",
"Properties" : {
"SubnetId" : { "Ref" : "PrivateSubnet" },
"RouteTableId" : { "Ref" : "PrivateRouteTable" }
}
}
},
"Outputs" : {
"AdminSSHSecurityGroup" : {
"Description" : "Security Group that restricts Inbound IPs to SSH into the Master",
"Value" : {
"Ref" : "AdminSSHSecurityGroup"
}
},
"MasterAutoScalingGroup" : {
"Description" : "Autoscaling Group that contains the Master Instance",
"Value" : {
"Ref" : "MasterAutoScalingGroup"
}
}
}
}