tutorials/language/scan.py - tvm - Git at Google

 """
 Scan and Recurrent Kernel
 =========================
 **Author**: `Tianqi Chen <https://tqchen.github.io>`_

 This is an introduction material on how to do recurrent computing in TVM.
 Recurrent computing is a typical pattern in neural networks.
 """
 from __future__ import absolute_import, print_function

 import tvm
 import numpy as np

 ######################################################################
 # TVM supports a scan operator to describe symbolic loop.
 # The following scan op computes cumsum over columns of X.
 #
 # The scan is carried over the highest dimension of the tensor.
 # :code:`s_state` is a placeholder that describes the transition state of the scan.
 # :code:`s_init` describes how we can initialize the first k timesteps.
 # Here since s_init's first dimension is 1, it describes how we initialize
 # The state at first timestep.
 #
 # :code:`s_update` describes how to update the value at timestep t. The update
 # value can refer back to the values of previous timestep via state placeholder.
 # Note that while it is invalid to refer to :code:`s_state` at current or later timestep.
 #
 # The scan takes in state placeholder, initial value and update description.
 # It is also recommended(although not necessary) to list the inputs to the scan cell.
 # The result of the scan is a tensor, giving the result of :code:`s_state` after the
 # update over the time domain.
 #
 m = tvm.var("m")
 n = tvm.var("n")
 X = tvm.placeholder((m, n), name="X")
 s_state = tvm.placeholder((m, n))
 s_init = tvm.compute((1, n), lambda _, i: X[0, i])
 s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
 s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X])

 ######################################################################
 # Schedule the Scan Cell
 # ----------------------
 # We can schedule the body of the scan by scheduling the update and
 # init part seperately. Note that it is invalid to schedule the
 # first iteration dimension of the update part.
 # To split on the time iteration, user can schedule on scan_op.scan_axis instead.
 #
 s = tvm.create_schedule(s_scan.op)
 num_thread = 256
 block_x = tvm.thread_axis("blockIdx.x")
 thread_x = tvm.thread_axis("threadIdx.x")
 xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
 s[s_init].bind(xo, block_x)
 s[s_init].bind(xi, thread_x)
 xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
 s[s_update].bind(xo, block_x)
 s[s_update].bind(xi, thread_x)
 print(tvm.lower(s, [X, s_scan], simple_mode=True))

 ######################################################################
 # Build and Verify
 # ----------------
 # We can build the scan kernel like other tvm kernels, here we use
 # numpy to verify the correctness of the result.
 #
 fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
 ctx = tvm.gpu(0)
 n = 1024
 m = 10
 a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
 a = tvm.nd.array(a_np, ctx)
 b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
 fscan(a, b)
 tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))

 ######################################################################
 # Multi-Stage Scan Cell
 # ---------------------
 # In the above example we described the scan cell using one Tensor
 # computation stage in s_update. It is possible to use multiple
 # Tensor stages in the scan cell.
 #
 # The following lines demonstrate a scan with two stage operations
 # in the scan cell.
 #
 m = tvm.var("m")
 n = tvm.var("n")
 X = tvm.placeholder((m, n), name="X")
 s_state = tvm.placeholder((m, n))
 s_init = tvm.compute((1, n), lambda _, i: X[0, i])
 s_update_s1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1")
 s_update_s2 = tvm.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
 s_scan = tvm.scan(s_init, s_update_s2, s_state, inputs=[X])

 ######################################################################
 # These intermediate tensors can also be scheduled normally.
 # To ensure correctness, TVM creates a group constraint to forbid
 # the body of scan to be compute_at locations outside the scan loop.
 #
 s = tvm.create_schedule(s_scan.op)
 xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
 s[s_update_s1].compute_at(s[s_update_s2], xo)
 print(tvm.lower(s, [X, s_scan], simple_mode=True))

 ######################################################################
 # Multiple States
 # ---------------
 # For complicated applications like RNN, we might need more than one
 # recurrent state. Scan support multiple recurrent states.
 # The following example demonstrates how we can build recurrence with two states.
 #
 m = tvm.var("m")
 n = tvm.var("n")
 l = tvm.var("l")
 X = tvm.placeholder((m, n), name="X")
 s_state1 = tvm.placeholder((m, n))
 s_state2 = tvm.placeholder((m, l))
 s_init1 = tvm.compute((1, n), lambda _, i: X[0, i])
 s_init2 = tvm.compute((1, l), lambda _, i: 0.0)
 s_update1 = tvm.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])
 s_update2 = tvm.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])
 s_scan1, s_scan2 = tvm.scan([s_init1, s_init2],
                             [s_update1, s_update2],
                             [s_state1, s_state2], inputs=[X])
 s = tvm.create_schedule(s_scan1.op)
 print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))

 ######################################################################
 # Summary
 # -------
 # This tutorial provides a walk through of scan primitive.
 #
 # - Describe scan with init and update.
 # - Schedule the scan cells as normal schedule.
 # - For complicated workload, use multiple states and steps in scan cell.
	"""
	Scan and Recurrent Kernel
	=========================
	Author: `Tianqi Chen <https://tqchen.github.io>`_

	This is an introduction material on how to do recurrent computing in TVM.
	Recurrent computing is a typical pattern in neural networks.
	"""
	from __future__ import absolute_import, print_function

	import tvm
	import numpy as np

	######################################################################
	# TVM supports a scan operator to describe symbolic loop.
	# The following scan op computes cumsum over columns of X.
	#
	# The scan is carried over the highest dimension of the tensor.
	# :code:`s_state` is a placeholder that describes the transition state of the scan.
	# :code:`s_init` describes how we can initialize the first k timesteps.
	# Here since s_init's first dimension is 1, it describes how we initialize
	# The state at first timestep.
	#
	# :code:`s_update` describes how to update the value at timestep t. The update
	# value can refer back to the values of previous timestep via state placeholder.
	# Note that while it is invalid to refer to :code:`s_state` at current or later timestep.
	#
	# The scan takes in state placeholder, initial value and update description.
	# It is also recommended(although not necessary) to list the inputs to the scan cell.
	# The result of the scan is a tensor, giving the result of :code:`s_state` after the
	# update over the time domain.
	#
	m = tvm.var("m")
	n = tvm.var("n")
	X = tvm.placeholder((m, n), name="X")
	s_state = tvm.placeholder((m, n))
	s_init = tvm.compute((1, n), lambda _, i: X[0, i])
	s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
	s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X])

	######################################################################
	# Schedule the Scan Cell
	# ----------------------
	# We can schedule the body of the scan by scheduling the update and
	# init part seperately. Note that it is invalid to schedule the
	# first iteration dimension of the update part.
	# To split on the time iteration, user can schedule on scan_op.scan_axis instead.
	#
	s = tvm.create_schedule(s_scan.op)
	num_thread = 256
	block_x = tvm.thread_axis("blockIdx.x")
	thread_x = tvm.thread_axis("threadIdx.x")
	xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
	s[s_init].bind(xo, block_x)
	s[s_init].bind(xi, thread_x)
	xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
	s[s_update].bind(xo, block_x)
	s[s_update].bind(xi, thread_x)
	print(tvm.lower(s, [X, s_scan], simple_mode=True))

	######################################################################
	# Build and Verify
	# ----------------
	# We can build the scan kernel like other tvm kernels, here we use
	# numpy to verify the correctness of the result.
	#
	fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan")
	ctx = tvm.gpu(0)
	n = 1024
	m = 10
	a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
	a = tvm.nd.array(a_np, ctx)
	b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
	fscan(a, b)
	tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))

	######################################################################
	# Multi-Stage Scan Cell
	# ---------------------
	# In the above example we described the scan cell using one Tensor
	# computation stage in s_update. It is possible to use multiple
	# Tensor stages in the scan cell.
	#
	# The following lines demonstrate a scan with two stage operations
	# in the scan cell.
	#
	m = tvm.var("m")
	n = tvm.var("n")
	X = tvm.placeholder((m, n), name="X")
	s_state = tvm.placeholder((m, n))
	s_init = tvm.compute((1, n), lambda _, i: X[0, i])
	s_update_s1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1")
	s_update_s2 = tvm.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2")
	s_scan = tvm.scan(s_init, s_update_s2, s_state, inputs=[X])

	######################################################################
	# These intermediate tensors can also be scheduled normally.
	# To ensure correctness, TVM creates a group constraint to forbid
	# the body of scan to be compute_at locations outside the scan loop.
	#
	s = tvm.create_schedule(s_scan.op)
	xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32)
	s[s_update_s1].compute_at(s[s_update_s2], xo)
	print(tvm.lower(s, [X, s_scan], simple_mode=True))

	######################################################################
	# Multiple States
	# ---------------
	# For complicated applications like RNN, we might need more than one
	# recurrent state. Scan support multiple recurrent states.
	# The following example demonstrates how we can build recurrence with two states.
	#
	m = tvm.var("m")
	n = tvm.var("n")
	l = tvm.var("l")
	X = tvm.placeholder((m, n), name="X")
	s_state1 = tvm.placeholder((m, n))
	s_state2 = tvm.placeholder((m, l))
	s_init1 = tvm.compute((1, n), lambda _, i: X[0, i])
	s_init2 = tvm.compute((1, l), lambda _, i: 0.0)
	s_update1 = tvm.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i])
	s_update2 = tvm.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0])
	s_scan1, s_scan2 = tvm.scan([s_init1, s_init2],
	[s_update1, s_update2],
	[s_state1, s_state2], inputs=[X])
	s = tvm.create_schedule(s_scan1.op)
	print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True))

	######################################################################
	# Summary
	# -------
	# This tutorial provides a walk through of scan primitive.
	#
	# - Describe scan with init and update.
	# - Schedule the scan cells as normal schedule.
	# - For complicated workload, use multiple states and steps in scan cell.