tests/python/target/test_arm_target.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import subprocess
 import tempfile
 import re

 import pytest
 import numpy as np

 import tvm
 from tvm.script import tir as T
 from tvm.target import codegen

 llvm_version, arm_target, input_dtype, kernel_dtype, is_supported = tvm.testing.parameters(
     # Testing mcpu type
     (8, "c -mcpu=cortex-m4", "int8", "int8", False),
     (8, "c -mcpu=cortex-m7", "int8", "int8", False),
     (8, "c -mcpu=cortex-m33", "int8", "int8", False),
     (8, "c -mcpu=cortex-m55", "int8", "int8", False),
     (8, "c -mcpu=cortex-m3", "int8", "int8", False),
     (7, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", False),
     (8, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
     (9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
     (8, "llvm -mtriple=arm-linux-gnueabi", "int8", "int8", False),
     (7, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", False),
     (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", True),
     (9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
     (8, "llvm -mtriple=aarch64-linux-gnu", "int8", "int8", True),
     # Testing dtype
     (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int8", False),
     (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int8", "int16", False),
     (8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int16", False),
 )


 @pytest.fixture(scope="session")
 def sve_device_vector_length():
     c_code = r"""
     #include <stdio.h>
     #include <arm_sve.h>

     int main() {
         printf("%ld\n", svcntb() * 8);
     }
     """

     with tempfile.TemporaryDirectory() as tmp_dir:
         c_path = f"{tmp_dir}/vl.c"
         o_path = f"{tmp_dir}/out.o"
         with open(c_path, "w") as f:
             f.write(c_code)
         tvm.contrib.cc.create_executable(o_path, c_path, ["-march=native"])
         out = subprocess.check_output(o_path, shell=True).strip().decode()

     return int(out)


 @tvm.testing.requires_aarch64_sve
 def test_scalable_div(sve_device_vector_length):
     np.random.seed(0)
     target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
     dev = tvm.cpu(0)

     @T.prim_func
     def my_func(a: T.handle):
         A = T.match_buffer(a, (1,), "int32")
         T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
         A[0] = T.Div(10000, 4 * T.vscale())

     mod = tvm.compile(my_func, target=target)

     A_nd = tvm.runtime.tensor(np.empty((1,), dtype="int32"), device=dev)
     mod(A_nd)

     ref = 10000 // (sve_device_vector_length // 32)
     tvm.testing.assert_allclose(A_nd.numpy()[0], ref)


 @tvm.testing.requires_aarch64_sve
 def test_scalable_buffer_load_store(sve_device_vector_length):
     np.random.seed(0)
     target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
     num_elements = sve_device_vector_length // 32
     dev = tvm.cpu(0)

     @T.prim_func
     def my_func(a: T.handle, b: T.handle):
         A = T.match_buffer(a, (num_elements,), "float32")
         B = T.match_buffer(b, (num_elements,), "float32")
         T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
         B[T.ramp(0, 1, 4 * T.vscale())] = A[T.ramp(0, 1, 4 * T.vscale())]

     mod = tvm.compile(my_func, target=target)

     A_np = np.random.uniform(size=(num_elements,)).astype("float32")
     B_np = np.zeros((num_elements,)).astype("float32")
     A_nd = tvm.runtime.tensor(A_np, device=dev)
     B_nd = tvm.runtime.tensor(B_np, device=dev)
     mod(A_nd, B_nd)

     tvm.testing.assert_allclose(B_nd.numpy(), A_np)


 @tvm.testing.requires_aarch64_sve
 def test_scalable_loop_bound(sve_device_vector_length):
     np.random.seed(0)

     dtype = "float32"
     num_elements = sve_device_vector_length // 32
     target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
     dev = tvm.cpu(0)

     @T.prim_func
     def my_func(a: T.handle, b: T.handle):
         A = T.match_buffer(a, (num_elements,), "float32")
         B = T.match_buffer(b, (num_elements,), "float32")
         T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
         for i in T.serial(0, 4 * T.vscale()):
             B[i] = A[i]

     mod = tvm.compile(my_func, target=target)

     A_np = np.random.uniform(size=(num_elements,)).astype(dtype)
     B_np = np.zeros((num_elements,)).astype(dtype)
     A_nd = tvm.runtime.tensor(A_np, device=dev)
     B_nd = tvm.runtime.tensor(B_np, device=dev)
     mod(A_nd, B_nd)

     tvm.testing.assert_allclose(B_nd.numpy(), A_np)


 @tvm.testing.requires_aarch64_sve
 def test_scalable_broadcast(sve_device_vector_length):
     target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
     num_elements = sve_device_vector_length // 32
     dev = tvm.cpu(0)

     @T.prim_func
     def my_func(a: T.handle):
         A = T.match_buffer(a, (num_elements,), "float32")
         T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
         A[T.ramp(0, 1, 4 * T.vscale())] = T.broadcast(1, 4 * T.vscale())

     mod = tvm.compile(my_func, target=target)

     A_np = np.zeros((num_elements,)).astype("float32")
     A_nd = tvm.runtime.tensor(A_np, device=dev)
     mod(A_nd)

     ref = np.ones((num_elements,))
     tvm.testing.assert_allclose(A_nd.numpy(), ref)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import subprocess
	import tempfile
	import re

	import pytest
	import numpy as np

	import tvm
	from tvm.script import tir as T
	from tvm.target import codegen

	llvm_version, arm_target, input_dtype, kernel_dtype, is_supported = tvm.testing.parameters(
	# Testing mcpu type
	(8, "c -mcpu=cortex-m4", "int8", "int8", False),
	(8, "c -mcpu=cortex-m7", "int8", "int8", False),
	(8, "c -mcpu=cortex-m33", "int8", "int8", False),
	(8, "c -mcpu=cortex-m55", "int8", "int8", False),
	(8, "c -mcpu=cortex-m3", "int8", "int8", False),
	(7, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", False),
	(8, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
	(9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
	(8, "llvm -mtriple=arm-linux-gnueabi", "int8", "int8", False),
	(7, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", False),
	(8, "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.4a,+dotprod", "int8", "int8", True),
	(9, "llvm -mtriple=arm-linux-gnueabi -mattr=+neon", "int8", "int8", True),
	(8, "llvm -mtriple=aarch64-linux-gnu", "int8", "int8", True),
	# Testing dtype
	(8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int8", False),
	(8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int8", "int16", False),
	(8, "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", "int16", "int16", False),
	)


	@pytest.fixture(scope="session")
	def sve_device_vector_length():
	c_code = r"""
	#include <stdio.h>
	#include <arm_sve.h>

	int main() {
	printf("%ld\n", svcntb() * 8);
	}
	"""

	with tempfile.TemporaryDirectory() as tmp_dir:
	c_path = f"{tmp_dir}/vl.c"
	o_path = f"{tmp_dir}/out.o"
	with open(c_path, "w") as f:
	f.write(c_code)
	tvm.contrib.cc.create_executable(o_path, c_path, ["-march=native"])
	out = subprocess.check_output(o_path, shell=True).strip().decode()

	return int(out)


	@tvm.testing.requires_aarch64_sve
	def test_scalable_div(sve_device_vector_length):
	np.random.seed(0)
	target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
	dev = tvm.cpu(0)

	@T.prim_func
	def my_func(a: T.handle):
	A = T.match_buffer(a, (1,), "int32")
	T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
	A[0] = T.Div(10000, 4 * T.vscale())

	mod = tvm.compile(my_func, target=target)

	A_nd = tvm.runtime.tensor(np.empty((1,), dtype="int32"), device=dev)
	mod(A_nd)

	ref = 10000 // (sve_device_vector_length // 32)
	tvm.testing.assert_allclose(A_nd.numpy()[0], ref)


	@tvm.testing.requires_aarch64_sve
	def test_scalable_buffer_load_store(sve_device_vector_length):
	np.random.seed(0)
	target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
	num_elements = sve_device_vector_length // 32
	dev = tvm.cpu(0)

	@T.prim_func
	def my_func(a: T.handle, b: T.handle):
	A = T.match_buffer(a, (num_elements,), "float32")
	B = T.match_buffer(b, (num_elements,), "float32")
	T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
	B[T.ramp(0, 1, 4 * T.vscale())] = A[T.ramp(0, 1, 4 * T.vscale())]

	mod = tvm.compile(my_func, target=target)

	A_np = np.random.uniform(size=(num_elements,)).astype("float32")
	B_np = np.zeros((num_elements,)).astype("float32")
	A_nd = tvm.runtime.tensor(A_np, device=dev)
	B_nd = tvm.runtime.tensor(B_np, device=dev)
	mod(A_nd, B_nd)

	tvm.testing.assert_allclose(B_nd.numpy(), A_np)


	@tvm.testing.requires_aarch64_sve
	def test_scalable_loop_bound(sve_device_vector_length):
	np.random.seed(0)

	dtype = "float32"
	num_elements = sve_device_vector_length // 32
	target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
	dev = tvm.cpu(0)

	@T.prim_func
	def my_func(a: T.handle, b: T.handle):
	A = T.match_buffer(a, (num_elements,), "float32")
	B = T.match_buffer(b, (num_elements,), "float32")
	T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
	for i in T.serial(0, 4 * T.vscale()):
	B[i] = A[i]

	mod = tvm.compile(my_func, target=target)

	A_np = np.random.uniform(size=(num_elements,)).astype(dtype)
	B_np = np.zeros((num_elements,)).astype(dtype)
	A_nd = tvm.runtime.tensor(A_np, device=dev)
	B_nd = tvm.runtime.tensor(B_np, device=dev)
	mod(A_nd, B_nd)

	tvm.testing.assert_allclose(B_nd.numpy(), A_np)


	@tvm.testing.requires_aarch64_sve
	def test_scalable_broadcast(sve_device_vector_length):
	target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
	num_elements = sve_device_vector_length // 32
	dev = tvm.cpu(0)

	@T.prim_func
	def my_func(a: T.handle):
	A = T.match_buffer(a, (num_elements,), "float32")
	T.func_attr({"global_symbol": "my_module", "tir.noalias": True})
	A[T.ramp(0, 1, 4 * T.vscale())] = T.broadcast(1, 4 * T.vscale())

	mod = tvm.compile(my_func, target=target)

	A_np = np.zeros((num_elements,)).astype("float32")
	A_nd = tvm.runtime.tensor(A_np, device=dev)
	mod(A_nd)

	ref = np.ones((num_elements,))
	tvm.testing.assert_allclose(A_nd.numpy(), ref)