#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Multi arch dockerized build tool."""
__author__ = 'Marco de Abreu, Kellen Sunderland, Anton Chernov, Pedro Larroy, Leonard Lausen'
__version__ = '0.4'
import argparse
import pprint
import os
import signal
import subprocess
from itertools import chain
from subprocess import check_call
from typing import *
import yaml
from util import *
def get_platforms() -> List[str]:
    """Return the names of all services declared in docker/docker-compose.yml.

    Each service name is used by this tool as a platform identifier.
    """
    with open("docker/docker-compose.yml", "r") as compose_file:
        config = yaml.load(compose_file.read(), yaml.SafeLoader)
    services = config["services"]
    return [*services.keys()]
def get_docker_tag(platform: str, registry: str) -> str:
    """:return: docker tag to be used for the container

    The tag is the ``image`` entry of the platform's service in
    docker/docker-compose.yml, with the ``${DOCKER_CACHE_REGISTRY}``
    placeholder substituted by ``registry``.
    """
    with open("docker/docker-compose.yml", "r") as compose_file:
        config = yaml.load(compose_file.read(), yaml.SafeLoader)
    image_template = config["services"][platform]["image"]
    return image_template.replace('${DOCKER_CACHE_REGISTRY}', registry)
def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
                 cache_intermediate: bool = False) -> None:
    """
    Build a container for the given platform

    :param platform: Platform (service name in docker/docker-compose.yml)
    :param registry: Dockerhub registry name, exported to docker-compose as DOCKER_CACHE_REGISTRY
    :param num_retries: Number of retries to build the docker image
    :param no_cache: pass --no-cache to docker-compose to rebuild the images from scratch
    :param cache_intermediate: pass --no-rm to docker-compose so intermediate containers are kept
    :raises subprocess.CalledProcessError: presumably once the retries are exhausted
        (depends on the project's ``retry`` helper -- TODO confirm)
    """
    logging.info('Building docker container \'%s\' based on ci/docker/docker-compose.yml', platform)
    # The image gets a user with the same uid and gid as the executing non-root user, so that
    # files created in the container match the permissions of the local user.
    cmd = ['docker-compose', '-f', 'docker/docker-compose.yml', 'build',
           "--build-arg", "USER_ID={}".format(os.getuid()),
           "--build-arg", "GROUP_ID={}".format(os.getgid())]
    if no_cache:
        # Previously the flag was accepted but silently ignored.
        cmd.append('--no-cache')
    if cache_intermediate:
        cmd.append('--no-rm')
    cmd.append(platform)

    env = os.environ.copy()
    env["DOCKER_CACHE_REGISTRY"] = registry

    @retry(subprocess.CalledProcessError, tries=num_retries)
    def run_cmd(env=None):
        logging.info("Running command: '%s'", ' '.join(cmd))
        check_call(cmd, env=env)

    run_cmd(env=env)
def buildir() -> str:
    """:return: path of the ``build`` directory directly under the MXNet root"""
    mxnet_root = get_mxnet_root()
    return os.path.join(mxnet_root, "build")
def default_ccache_dir() -> str:
    """:return: ccache directory for the current platform

    Resolution order:

    1. ``$CCACHE_DIR`` (resolved with realpath) if it is set and can be created.
    2. ``/tmp/_mxnet_ccache`` on macOS.
    3. ``~/.ccache`` otherwise (returned without being created here).
    """
    # Share ccache across containers
    if 'CCACHE_DIR' in os.environ:
        ccache_dir = os.path.realpath(os.environ['CCACHE_DIR'])
        try:
            os.makedirs(ccache_dir, exist_ok=True)
            return ccache_dir
        except OSError:
            # Any failure to create the directory (permissions, read-only filesystem, a path
            # component that is a regular file, ...) falls back instead of aborting.
            logging.info('Unable to make dirs at %s, falling back to local temp dir', ccache_dir)
    # In osx tmpdir is not mountable by default
    import platform
    if platform.system() == 'Darwin':
        ccache_dir = "/tmp/_mxnet_ccache"
        os.makedirs(ccache_dir, exist_ok=True)
        return ccache_dir
    return os.path.join(os.path.expanduser("~"), ".ccache")
def container_run(platform: str,
                  nvidia_runtime: bool,
                  docker_registry: str,
                  shared_memory_size: str,
                  local_ccache_dir: str,
                  command: List[str],
                  environment: Dict[str, str],
                  dry_run: bool = False) -> int:
    """Run command in a container

    :param platform: platform (docker-compose service) whose image is run
    :param nvidia_runtime: request GPU access (``--gpus all``, falling back to ``--runtime nvidia``)
    :param docker_registry: registry name used to resolve the image tag
    :param shared_memory_size: value passed to ``--shm-size``
    :param local_ccache_dir: host directory mounted at /work/ccache
    :param command: command and arguments to execute inside the container
    :param environment: environment variables to set in the container; not modified
    :param dry_run: only log the docker command instead of executing it
    :return: 0; a failing docker invocation raises instead of returning non-zero
    :raises subprocess.CalledProcessError: if ``docker run`` exits with a non-zero status
    """
    # Work on a copy so the caller's dictionary is left untouched.
    container_env = dict(environment)
    # The ccache settings are forced rather than defaulted, because they describe paths inside
    # the container. Only CCACHE_MAXSIZE may be overridden, via the host environment.
    container_env.update({
        'CCACHE_MAXSIZE': '500G',
        'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not shared
        'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is mounted
        'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache verification.
    })
    container_env.update({k: os.environ[k] for k in ['CCACHE_MAXSIZE'] if k in os.environ})
    # whether this is a release build or not
    container_env.setdefault('RELEASE_BUILD', 'false')

    tag = get_docker_tag(platform=platform, registry=docker_registry)
    mx_root = get_mxnet_root()
    local_build_folder = buildir()
    # We need to create it first, otherwise it will be created by the docker daemon with root only permissions
    os.makedirs(local_build_folder, exist_ok=True)
    os.makedirs(local_ccache_dir, exist_ok=True)
    logging.info("Using ccache directory: %s", local_ccache_dir)

    # Log environment
    logging.info("environment ---> {0}".format(container_env))

    # Build docker command
    docker_arg_list = [
        "--cap-add", "SYS_PTRACE",  # Required by ASAN
        '--rm',
        '--shm-size={}'.format(shared_memory_size),
        # mount mxnet root
        '-v', "{}:/work/mxnet".format(mx_root),
        # mount mxnet/build for storing build
        '-v', "{}:/work/build".format(local_build_folder),
        '-v', "{}:/work/ccache".format(local_ccache_dir),
        '-u', '{}:{}'.format(os.getuid(), os.getgid()),
    ]
    # Forward every variable, including the ones supplied by the caller; previously only the
    # fixed ccache/RELEASE_BUILD entries reached the container.
    for name, value in container_env.items():
        docker_arg_list += ['-e', '{}={}'.format(name, value)]
    docker_arg_list.append(tag)
    docker_arg_list.extend(command)

    def docker_run_cmd(cmd):
        logging.info("Running %s in container %s", command, tag)
        logging.info("Executing command:\n%s\n", ' \\\n\t'.join(cmd))
        subprocess.run(cmd, stdout=sys.stdout, stderr=sys.stderr, check=True)

    if dry_run:
        # Show what would be executed so it can be inspected or run manually.
        preview = ['docker', 'run'] + (['--gpus', 'all'] if nvidia_runtime else []) + docker_arg_list
        logging.info("Dry run, command not executed:\n%s\n", ' \\\n\t'.join(preview))
        return 0

    if not nvidia_runtime:
        docker_run_cmd(['docker', 'run'] + docker_arg_list)
        return 0

    try:
        docker_run_cmd(['docker', 'run', '--gpus', 'all'] + docker_arg_list)
    except subprocess.CalledProcessError as e:
        # 125 is reported when docker itself fails rather than the contained command
        # (presumably because --gpus is unsupported); retry with the nvidia runtime.
        if e.returncode != 125:
            raise
        docker_run_cmd(['docker', 'run', '--runtime', 'nvidia'] + docker_arg_list)
    return 0
def list_platforms() -> str:
    """:return: printable listing of the supported platforms, one per line"""
    platform_lines = '\n'.join(get_platforms())
    return "\nSupported platforms:\n{}".format(platform_lines)
def load_docker_cache(platform, tag, docker_registry) -> None:
    """Imports tagged container from the given docker registry

    Pulls the image of ``platform`` through docker-compose with
    DOCKER_CACHE_REGISTRY set to ``docker_registry``. Does nothing except log
    a message when no registry is given. ``tag`` is accepted but not used.
    """
    if not docker_registry:
        logging.info('Distributed docker cache disabled')
        return
    pull_cmd = ['docker-compose', '-f', 'docker/docker-compose.yml', 'pull', platform]
    pull_env = dict(os.environ, DOCKER_CACHE_REGISTRY=docker_registry)
    logging.info("Running command: 'DOCKER_CACHE_REGISTRY=%s %s'", docker_registry, ' '.join(pull_cmd))
    check_call(pull_cmd, env=pull_env)
def log_environment():
    """Log EC2 instance information (when available) and the process environment.

    The instance information is logged at INFO level, the pretty-printed
    environment variables at DEBUG level.
    """
    ec2_info = ec2_instance_info()
    if ec2_info:
        logging.info("EC2: %s", ec2_info)
    formatted_env = pprint.PrettyPrinter(indent=4).pformat(dict(os.environ))
    logging.debug("Build environment: %s", formatted_env)
def main() -> int:
    """Command line entry point.

    Parses the arguments and then, depending on them:

    * ``--list``: prints the supported platforms.
    * ``--platform``: optionally pulls the cached image, builds the container (unless
      ``--run-only``) and runs either the given command, a dry run (``--print-docker-run``
      without a command) or the default ``build_<platform>`` function in it.
    * otherwise: prints the help, the supported platforms and usage examples.

    :return: exit status for the process, 0 on success
    """
    config_logging()
    logging.info("MXNet container based build tool.")
    log_environment()
    chdir_to_script_directory()

    parser = argparse.ArgumentParser(
        description="Utility for building and testing MXNet on docker containers", epilog="")
    parser.add_argument("-p", "--platform", type=str,
                        help="Platform. See ci/docker/docker-compose.yml for list of supported "
                             "platforms (services).")
    parser.add_argument("-b", "--build-only",
                        help="Only build the container, don't build the project",
                        action='store_true')
    parser.add_argument("-R", "--run-only",
                        help="Only run the container, don't rebuild the container",
                        action='store_true')
    parser.add_argument("-n", "--nvidiadocker",
                        help="Use nvidia docker",
                        action='store_true')
    parser.add_argument("--shm-size",
                        help="Size of the shared memory /dev/shm allocated in the container (e.g '1g')",
                        default='500m',
                        dest="shared_memory_size")
    parser.add_argument("-l", "--list",
                        help="List platforms",
                        action='store_true')
    parser.add_argument("--print-docker-run",
                        help="print docker run command for manual inspection",
                        action='store_true')
    parser.add_argument("-d", "--docker-registry",
                        help="Dockerhub registry name to retrieve cache from.",
                        default='mxnetci',
                        type=str)
    parser.add_argument("-r", "--docker-build-retries",
                        help="Number of times to retry building the docker image. Default is 1",
                        default=1,
                        type=int)
    parser.add_argument("--no-pull", action="store_true",
                        help="Don't pull from dockerhub registry to initialize cache.")
    parser.add_argument("--no-cache", action="store_true",
                        help="passes --no-cache to docker build")
    parser.add_argument("--cache-intermediate", action="store_true",
                        help="passes --rm=false to docker build")
    parser.add_argument("-e", "--environment", nargs="*", default=[],
                        help="Environment variables for the docker container. "
                             "Specify with a list containing either names or name=value")
    parser.add_argument("command",
                        help="command to run in the container",
                        nargs='*', action='append', type=str)
    parser.add_argument("--ccache-dir",
                        default=default_ccache_dir(),
                        help="ccache directory",
                        type=str)
    args = parser.parse_args()

    # 'append' combined with nargs='*' yields a list of lists; flatten it.
    command = list(chain.from_iterable(args.command))

    environment = {}
    for entry in args.environment:
        if '=' in entry:
            # Split on the first '=' only, so values may themselves contain '='.
            name, value = entry.split('=', 1)
        elif entry in os.environ:
            # A bare name forwards the value from the calling environment.
            name, value = entry, os.environ[entry]
        else:
            parser.error("environment variable '{}' is not set; "
                         "use NAME=value or export it first".format(entry))
        environment[name] = value

    if args.list:
        print(list_platforms())
    elif args.platform:
        platform = args.platform
        tag = get_docker_tag(platform=platform, registry=args.docker_registry)
        if args.docker_registry and not args.no_pull:
            load_docker_cache(platform=platform, tag=tag, docker_registry=args.docker_registry)
        if not args.run_only:
            build_docker(platform=platform, registry=args.docker_registry,
                         num_retries=args.docker_build_retries,
                         no_cache=args.no_cache, cache_intermediate=args.cache_intermediate)
        else:
            logging.info("Skipping docker build step.")

        if args.build_only:
            logging.warning("Container was just built. Exiting due to build-only.")
            return 0

        dry_run = False
        if command:
            pass
        elif args.print_docker_run:
            dry_run = True
        else:
            # With no commands, execute a build function for the target platform
            command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
            logging.info("No command specified, trying default build: %s", ' '.join(command))

        ret = container_run(
            platform=platform, nvidia_runtime=args.nvidiadocker,
            shared_memory_size=args.shared_memory_size, command=command,
            docker_registry=args.docker_registry,
            local_ccache_dir=args.ccache_dir, dry_run=dry_run, environment=environment)
        if ret != 0:
            logging.critical("Execution of %s failed with status: %d", command, ret)
            return ret
    else:
        parser.print_help()
        # The listing is returned as a string and has to be printed explicitly.
        print(list_platforms())
        print("""
Examples:

./build.py -p armv7

    Will build a docker container with cross compilation tools and build MXNet for armv7 by
    running: ci/docker/runtime_functions.sh build_armv7 inside the container.

./build.py -p armv7 ls

    Will execute the given command inside the armv7 container

./build.py -p armv7 --print-docker-run

    Will print a docker run command to get inside the container in a shell

./build.py -l

    Lists the supported platforms

""")
    return 0
# Script entry point: the return value of main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())