python/mxnet/rtc.py - mxnet - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """Interface to runtime cuda kernel compile module."""

 from array import array
 import re
 import ctypes
 import numpy as np

 from .base import _LIB, mx_uint, c_array, c_array_buf, c_str_array, check_call
 from .base import c_str, CudaModuleHandle, CudaKernelHandle, numeric_types, string_types
 from .ndarray import dtype_np_to_mx, dtype_mx_to_np, NDArray

 _DTYPE_CPP_TO_NP = {
     'float': np.float32,
     'double': np.float64,
     '__half': np.float16,
     'uint8_t': np.uint8,
     'int': np.int32,
     'int32_t': np.int32,
     'int8_t': np.int8,
     'char': np.int8,
     'int64_t': np.int64,
 }

 class CudaModule(object):
     r"""Compile and run CUDA code from Python.

     In CUDA 7.5, you need to prepend your kernel definitions
     with 'extern "C"' to avoid name mangling::

         source = r'''
         extern "C" __global__ void axpy(const float *x, float *y, float alpha) {
             int i = threadIdx.x + blockIdx.x * blockDim.x;
             y[i] += alpha * x[i];
         }
         '''
         module = mx.rtc.CudaModule(source)
         func = module.get_kernel("axpy", "const float *x, float *y, float alpha")
         x = mx.nd.ones((10,), ctx=mx.gpu(0))
         y = mx.nd.zeros((10,), ctx=mx.gpu(0))
         func.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
         print(y)

     Starting from CUDA 8.0, you can instead export functions by name.
     This also allows you to use templates::

         source = r'''
         template<typename DType>
         __global__ void axpy(const DType *x, DType *y, DType alpha) {
             int i = threadIdx.x + blockIdx.x * blockDim.x;
             y[i] += alpha * x[i];
         }
         '''
         module = mx.rtc.CudaModule(source, exports=['axpy<float>', 'axpy<double>'])
         func32 = module.get_kernel("axpy<float>", "const float *x, float *y, float alpha")
         x = mx.nd.ones((10,), dtype='float32', ctx=mx.gpu(0))
         y = mx.nd.zeros((10,), dtype='float32', ctx=mx.gpu(0))
         func32.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
         print(y)

         func64 = module.get_kernel("axpy<double>", "const double *x, double *y, double alpha")
         x = mx.nd.ones((10,), dtype='float64', ctx=mx.gpu(0))
         y = mx.nd.zeros((10,), dtype='float64', ctx=mx.gpu(0))
         func32.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
         print(y)


     Parameters
     ----------
     source : str
         Complete source code.
     options : tuple of str
         Compiler flags. For example, use "-I/usr/local/cuda/include" to
         add cuda headers to include path.
     exports : tuple of str
         Export kernel names.
     """
     def __init__(self, source, options=(), exports=()):
         if isinstance(options, string_types):
             options = (options,)
         if isinstance(exports, string_types):
             exports = (exports,)
         self.handle = CudaModuleHandle()
         check_call(_LIB.MXRtcCudaModuleCreate(
             c_str(source),
             len(options),
             c_str_array(options),
             len(exports),
             c_str_array(exports),
             ctypes.byref(self.handle)))

     def __del__(self):
         check_call(_LIB.MXRtcCudaModuleFree(self.handle))

     def get_kernel(self, name, signature):
         r"""Get CUDA kernel from compiled module.

         Parameters
         ----------
         name : str
             String name of the kernel.
         signature : str
             Function signature for the kernel. For example, if a kernel is
             declared as::

                 extern "C" __global__ void axpy(const float *x, double *y, int alpha)

             Then its signature should be::

                 const float *x, double *y, int alpha

             or::

                 const float *, double *, int

             Note that `*` in signature marks an argument as array and
             `const` marks an argument as constant (input) array.

         Returns
         -------
         CudaKernel
             CUDA kernels that can be launched on GPUs.
         """
         hdl = CudaKernelHandle()
         is_ndarray = []
         is_const = []
         dtypes = []
         pattern = re.compile(r"""^(const)?\s?([\w_]+)\s?(\*)?\s?([\w_]+)?$""")
         args = re.sub(r"\s+", " ", signature).split(",")
         for arg in args:
             sanitized_arg = " ".join(arg.split())
             match = pattern.match(sanitized_arg)
             if not match or match.groups()[1] == 'const':
                 raise ValueError(
                     f'Invalid function prototype "{sanitized_arg}". Must be in the '
                     'form of "(const) type (*) (name)"')
             is_const.append(bool(match.groups()[0]))
             dtype = match.groups()[1]
             is_ndarray.append(bool(match.groups()[2]))
             if dtype not in _DTYPE_CPP_TO_NP:
                 raise TypeError(
                     "Unsupported kernel argument type {}. Supported types are: {}.".format(
                         sanitized_arg, ','.join(_DTYPE_CPP_TO_NP.keys())))
             dtypes.append(dtype_np_to_mx(_DTYPE_CPP_TO_NP[dtype]))

         check_call(_LIB.MXRtcCudaKernelCreate(
             self.handle,
             c_str(name),
             len(dtypes),
             c_array_buf(ctypes.c_int, array('i', is_ndarray)),
             c_array_buf(ctypes.c_int, array('i', is_const)),
             c_array_buf(ctypes.c_int, array('i', dtypes)),
             ctypes.byref(hdl)))

         return CudaKernel(hdl, name, is_ndarray, dtypes)

 class CudaKernel(object):
     """Constructs CUDA kernel. Should be created by `CudaModule.get_kernel`,
     not intended to be used by users.
     """
     def __init__(self, handle, name, is_ndarray, dtypes):
         self.handle = handle
         self._name = name
         self._is_ndarray = is_ndarray
         self._dtypes = [dtype_mx_to_np(i) for i in dtypes]

     def __del__(self):
         check_call(_LIB.MXRtcCudaKernelFree(self.handle))

     def launch(self, args, ctx, grid_dims, block_dims, shared_mem=0):
         """Launch cuda kernel.

         Parameters
         ----------
         args : tuple of NDArray or numbers
             List of arguments for kernel. NDArrays are expected for pointer
             types (e.g. `float*`, `double*`) while numbers are expected for
             non-pointer types (e.g. `int`, `float`).
         ctx : Context
             The context to launch kernel on. Must be GPU context.
         grid_dims : tuple of 3 integers
             Grid dimensions for CUDA kernel.
         block_dims : tuple of 3 integers
             Block dimensions for CUDA kernel.
         shared_mem : integer, optional
             Size of dynamically allocated shared memory. Defaults to 0.
         """
         assert ctx.device_type == 'gpu', "Cuda kernel can only be launched on GPU"
         assert len(grid_dims) == 3, "grid_dims must be a tuple of 3 integers"
         assert len(block_dims) == 3, "grid_dims must be a tuple of 3 integers"
         assert len(args) == len(self._dtypes), \
             f"CudaKernel({self._name}) expects {len(self._dtypes)} arguments but got {len(args)}"
         void_args = []
         ref_holder = []
         for i, (arg, is_nd, dtype) in enumerate(zip(args, self._is_ndarray, self._dtypes)):
             if is_nd:
                 assert isinstance(arg, NDArray), \
                     f"The {i}-th argument is expected to be a NDArray but got {type(arg)}"
                 void_args.append(arg.handle)
             else:
                 assert isinstance(arg, numeric_types), \
                     f"The {i}-th argument is expected to be a number, but got {type(arg)}"
                 ref_holder.append(np.array(arg, dtype=dtype))
                 void_args.append(ref_holder[-1].ctypes.data_as(ctypes.c_void_p))

         check_call(_LIB.MXRtcCudaKernelCall(
             self.handle,
             ctx.device_id,
             c_array(ctypes.c_void_p, void_args),
             mx_uint(grid_dims[0]), mx_uint(grid_dims[1]), mx_uint(grid_dims[2]),
             mx_uint(block_dims[0]), mx_uint(block_dims[1]), mx_uint(block_dims[2]),
             mx_uint(shared_mem)))
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""Interface to runtime cuda kernel compile module."""

	from array import array
	import re
	import ctypes
	import numpy as np

	from .base import _LIB, mx_uint, c_array, c_array_buf, c_str_array, check_call
	from .base import c_str, CudaModuleHandle, CudaKernelHandle, numeric_types, string_types
	from .ndarray import dtype_np_to_mx, dtype_mx_to_np, NDArray

	_DTYPE_CPP_TO_NP = {
	'float': np.float32,
	'double': np.float64,
	'__half': np.float16,
	'uint8_t': np.uint8,
	'int': np.int32,
	'int32_t': np.int32,
	'int8_t': np.int8,
	'char': np.int8,
	'int64_t': np.int64,
	}

	class CudaModule(object):
	r"""Compile and run CUDA code from Python.

	In CUDA 7.5, you need to prepend your kernel definitions
	with 'extern "C"' to avoid name mangling::

	source = r'''
	extern "C" __global__ void axpy(const float x, float y, float alpha) {
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	y[i] += alpha * x[i];
	}
	'''
	module = mx.rtc.CudaModule(source)
	func = module.get_kernel("axpy", "const float x, float y, float alpha")
	x = mx.nd.ones((10,), ctx=mx.gpu(0))
	y = mx.nd.zeros((10,), ctx=mx.gpu(0))
	func.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
	print(y)

	Starting from CUDA 8.0, you can instead export functions by name.
	This also allows you to use templates::

	source = r'''
	template<typename DType>
	__global__ void axpy(const DType x, DType y, DType alpha) {
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	y[i] += alpha * x[i];
	}
	'''
	module = mx.rtc.CudaModule(source, exports=['axpy<float>', 'axpy<double>'])
	func32 = module.get_kernel("axpy<float>", "const float x, float y, float alpha")
	x = mx.nd.ones((10,), dtype='float32', ctx=mx.gpu(0))
	y = mx.nd.zeros((10,), dtype='float32', ctx=mx.gpu(0))
	func32.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
	print(y)

	func64 = module.get_kernel("axpy<double>", "const double x, double y, double alpha")
	x = mx.nd.ones((10,), dtype='float64', ctx=mx.gpu(0))
	y = mx.nd.zeros((10,), dtype='float64', ctx=mx.gpu(0))
	func32.launch([x, y, 3.0], mx.gpu(0), (1, 1, 1), (10, 1, 1))
	print(y)


	Parameters
	----------
	source : str
	Complete source code.
	options : tuple of str
	Compiler flags. For example, use "-I/usr/local/cuda/include" to
	add cuda headers to include path.
	exports : tuple of str
	Export kernel names.
	"""
	def __init__(self, source, options=(), exports=()):
	if isinstance(options, string_types):
	options = (options,)
	if isinstance(exports, string_types):
	exports = (exports,)
	self.handle = CudaModuleHandle()
	check_call(_LIB.MXRtcCudaModuleCreate(
	c_str(source),
	len(options),
	c_str_array(options),
	len(exports),
	c_str_array(exports),
	ctypes.byref(self.handle)))

	def __del__(self):
	check_call(_LIB.MXRtcCudaModuleFree(self.handle))

	def get_kernel(self, name, signature):
	r"""Get CUDA kernel from compiled module.

	Parameters
	----------
	name : str
	String name of the kernel.
	signature : str
	Function signature for the kernel. For example, if a kernel is
	declared as::

	extern "C" __global__ void axpy(const float x, double y, int alpha)

	Then its signature should be::

	const float x, double y, int alpha

	or::

	const float , double , int

	Note that `*` in signature marks an argument as array and
	`const` marks an argument as constant (input) array.

	Returns
	-------
	CudaKernel
	CUDA kernels that can be launched on GPUs.
	"""
	hdl = CudaKernelHandle()
	is_ndarray = []
	is_const = []
	dtypes = []
	pattern = re.compile(r"""^(const)?\s?([\w_]+)\s?(\*)?\s?([\w_]+)?$""")
	args = re.sub(r"\s+", " ", signature).split(",")
	for arg in args:
	sanitized_arg = " ".join(arg.split())
	match = pattern.match(sanitized_arg)
	if not match or match.groups()[1] == 'const':
	raise ValueError(
	f'Invalid function prototype "{sanitized_arg}". Must be in the '
	'form of "(const) type (*) (name)"')
	is_const.append(bool(match.groups()[0]))
	dtype = match.groups()[1]
	is_ndarray.append(bool(match.groups()[2]))
	if dtype not in _DTYPE_CPP_TO_NP:
	raise TypeError(
	"Unsupported kernel argument type {}. Supported types are: {}.".format(
	sanitized_arg, ','.join(_DTYPE_CPP_TO_NP.keys())))
	dtypes.append(dtype_np_to_mx(_DTYPE_CPP_TO_NP[dtype]))

	check_call(_LIB.MXRtcCudaKernelCreate(
	self.handle,
	c_str(name),
	len(dtypes),
	c_array_buf(ctypes.c_int, array('i', is_ndarray)),
	c_array_buf(ctypes.c_int, array('i', is_const)),
	c_array_buf(ctypes.c_int, array('i', dtypes)),
	ctypes.byref(hdl)))

	return CudaKernel(hdl, name, is_ndarray, dtypes)

	class CudaKernel(object):
	"""Constructs CUDA kernel. Should be created by `CudaModule.get_kernel`,
	not intended to be used by users.
	"""
	def __init__(self, handle, name, is_ndarray, dtypes):
	self.handle = handle
	self._name = name
	self._is_ndarray = is_ndarray
	self._dtypes = [dtype_mx_to_np(i) for i in dtypes]

	def __del__(self):
	check_call(_LIB.MXRtcCudaKernelFree(self.handle))

	def launch(self, args, ctx, grid_dims, block_dims, shared_mem=0):
	"""Launch cuda kernel.

	Parameters
	----------
	args : tuple of NDArray or numbers
	List of arguments for kernel. NDArrays are expected for pointer
	types (e.g. `float`, `double`) while numbers are expected for
	non-pointer types (e.g. `int`, `float`).
	ctx : Context
	The context to launch kernel on. Must be GPU context.
	grid_dims : tuple of 3 integers
	Grid dimensions for CUDA kernel.
	block_dims : tuple of 3 integers
	Block dimensions for CUDA kernel.
	shared_mem : integer, optional
	Size of dynamically allocated shared memory. Defaults to 0.
	"""
	assert ctx.device_type == 'gpu', "Cuda kernel can only be launched on GPU"
	assert len(grid_dims) == 3, "grid_dims must be a tuple of 3 integers"
	assert len(block_dims) == 3, "grid_dims must be a tuple of 3 integers"
	assert len(args) == len(self._dtypes), \
	f"CudaKernel({self._name}) expects {len(self._dtypes)} arguments but got {len(args)}"
	void_args = []
	ref_holder = []
	for i, (arg, is_nd, dtype) in enumerate(zip(args, self._is_ndarray, self._dtypes)):
	if is_nd:
	assert isinstance(arg, NDArray), \
	f"The {i}-th argument is expected to be a NDArray but got {type(arg)}"
	void_args.append(arg.handle)
	else:
	assert isinstance(arg, numeric_types), \
	f"The {i}-th argument is expected to be a number, but got {type(arg)}"
	ref_holder.append(np.array(arg, dtype=dtype))
	void_args.append(ref_holder[-1].ctypes.data_as(ctypes.c_void_p))

	check_call(_LIB.MXRtcCudaKernelCall(
	self.handle,
	ctx.device_id,
	c_array(ctypes.c_void_p, void_args),
	mx_uint(grid_dims[0]), mx_uint(grid_dims[1]), mx_uint(grid_dims[2]),
	mx_uint(block_dims[0]), mx_uint(block_dims[1]), mx_uint(block_dims[2]),
	mx_uint(shared_mem)))