tests/python/relax/backend/adreno/test_clml_ops.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # ruff: noqa: E501, F401, F841
 """CLML integration operator tests."""

 import inspect
 import json
 import os

 import numpy as np
 import pytest
 from mod_utils import (
     get_avgpool_expected_codegen,
     get_batchnorm_mod,
     get_binary_op_mod,
     get_clml_conv2d_codegen,
     get_conv2d_transpose_expected_codegen,
     get_dequant_matmul_module,
     get_dequant_vec_matmul_module,
     get_global_avgpool_expected_codegen,
     get_global_maxpool_expected_codegen,
     get_maxpool_expected_codegen,
     get_relax_avgpool_mod,
     get_relax_conv2d_mod,
     get_relax_conv2d_transpose_mod,
     get_relax_global_avgpool_mod,
     get_relax_global_maxpool_mod,
     get_relax_maxpool_mod,
     get_relax_reshape_codegen,
     get_relax_reshape_mod,
     get_unary_op_mod,
 )
 from utils import skip_unless_adreno_clml, verify_results

 import tvm
 import tvm.testing
 from tvm import relax, rpc
 from tvm.relax.backend.adreno import clml
 from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad, OpenCLMLOffLoadForLLM
 from tvm.script import ir as I
 from tvm.script import relax as R
 from tvm.script import tirx as T
 from tvm.script.ir_builder import IRBuilder
 from tvm.script.ir_builder import relax as relax_builder

 CLML_VERSION = clml.clml_sdk_version()
 TARGET_CLML_VERSION = int(os.environ.get("ADRENO_TARGET_CLML_VERSION", 4))
 clml_target = tvm.target.Target("qcom/adreno-opencl-clml")
 ref_target = tvm.target.Target("opencl")


 def verify_clml_codegen(clml_mod, clml_codegen):
     clml_mod = OpenCLMLOffLoadForLLM(clml_target)(clml_mod)
     clml_mod = OpenCLMLOffLoad()(clml_mod)

     source = clml_mod.attrs["external_mods"][0].inspect_source()
     codegen = json.loads(source)["nodes"]
     for node in range(len(codegen)):
         if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
             codegen[node]["name"] = ""
         if codegen[node]["op"] == "kernel":
             codegen[node]["name"] = ""

     codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
     known_good_codegen_str = json.dumps(clml_codegen, sort_keys=True, indent=2)
     assert codegen_str == known_good_codegen_str, (
         f"The JSON produced by codegen does not match the expected result. \n"
         f"Actual={codegen_str} \n"
         f"Expected={known_good_codegen_str}"
     )


 def verify(
     mod, clml_codegen, inputs_np, params_np, target_minimum_clml_version=None, target_test=True
 ):
     mod = tvm.relax.transform.BindParams("main", params_np)(mod)
     codegen_mod, clml_mod = mod.clone(), mod
     verify_clml_codegen(codegen_mod, clml_codegen)

     if (
         target_minimum_clml_version is not None
         and TARGET_CLML_VERSION < target_minimum_clml_version
     ):
         print(f"Skipped Eval Tests for {inspect.stack()[1].function} function", flush=True)
         return

     if "ADRENO_TARGET" not in os.environ:
         return

     if target_test:
         verify_results(clml_mod, target=clml_target, ref_target=ref_target)


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "kernel_h, kernel_w, padding, stride, dilation, out_channels, shape, has_bias, has_bn, has_activation, has_pad, is_depthwise",
     [
         (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, True, False),
         (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, False, False),
         # (5, 5, (2, 2), (1, 1), (1, 1), 16, (16, 64, 64), False, True, True, False, False),
         # (7, 7, (3, 3), (2, 2), (1, 1), 32, (3, 224, 224), True, False, True, True, False),
         (3, 3, (0, 0), (1, 1), (1, 1), 512, (256, 14, 14), True, False, True, False, False),
         (1, 1, (0, 0), (1, 1), (1, 1), 1024, (512, 7, 7), True, False, True, False, False),
         (1, 3, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), True, False, True, False, False),
         (3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), False, True, True, True, False),
     ],
 )
 def test_conv2d_offload(
     kernel_h,
     kernel_w,
     padding,
     stride,
     dilation,
     out_channels,
     shape,
     has_bias,
     has_bn,
     has_activation,
     has_pad,
     is_depthwise,
     dtype,
 ):
     low, high = -0.01, 0.01
     rtol, atol = 1e-3, 1e-3
     if CLML_VERSION > 3:
         rtol, atol = 1e-2, 1e-2  # @clml precision

     data_shape = (1, *shape)
     if is_depthwise:
         groups = data_shape[1] // out_channels
     else:
         groups = 1
     padding = (padding[0], padding[1], padding[0], padding[1])

     weight_format = "IOHW" if is_depthwise else "OIHW"
     weight_shape = (out_channels, data_shape[1] // groups, kernel_h, kernel_w)

     data = np.random.uniform(low, high, size=data_shape).astype(dtype)
     weight = np.random.uniform(low, high, size=weight_shape).astype(dtype)
     bias = np.random.uniform(low, high, size=(1, weight_shape[0], 1, 1)).astype(dtype)

     gamma = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
     beta = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
     mean = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
     variance = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)

     inputs_np = [data]
     params_np = {"weight": weight}
     if has_bias:
         params_np["bias"] = bias
     if has_bn:
         params_np.update({"gamma": gamma, "beta": beta, "mean": mean, "variance": variance})

     mod = get_relax_conv2d_mod(
         data_shape,
         weight_shape,
         stride=stride,
         dilation=dilation,
         padding=padding,
         weight_layout=weight_format,
         groups=groups,
         dtype=dtype,
         has_bias=has_bias,
         has_bn=has_bn,
         has_activation=has_activation,
         has_pad=has_pad,
         is_depthwise=is_depthwise,
     )
     clml_codegen = get_clml_conv2d_codegen(
         data_shape,
         weight_shape,
         stride=stride,
         dilation=dilation,
         padding=padding,
         weight_layout=weight_format,
         groups=groups,
         dtype=dtype,
         has_bias=has_bias,
         has_bn=has_bn,
         has_activation=has_activation,
         has_pad=has_pad,
         is_depthwise=is_depthwise,
     )
     verify(mod, clml_codegen, inputs_np, params_np)


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "dshape, kshape, channels, kernel_size, strides, padding, out_shape",
     [
         ((1, 256, 100, 100), (64, 256, 4, 4), 64, (4, 4), (2, 2), (0, 0, 0, 0), (1, 64, 202, 202)),
         ((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), (1, 64, 400, 400)),
         ((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), (1, 64, 400, 400)),
         ((1, 64, 400, 400), (16, 64, 4, 4), 16, (4, 4), (2, 2), (1, 1, 1, 1), (1, 16, 800, 800)),
     ],
 )
 def test_conv2d_transpose(
     dshape, kshape, channels, kernel_size, strides, padding, dtype, out_shape
 ):
     low, high = -1, 1

     data = np.random.uniform(low, high, size=dshape).astype(dtype)
     weight = np.random.uniform(low, high, size=kshape).astype(dtype)

     inputs_np = [data]
     params_np = {"weight": weight}

     mod = get_relax_conv2d_transpose_mod(
         dshape,
         kshape,
         channels=channels,
         stride=strides,
         padding=padding,
         dtype=dtype,
     )

     clml_codegen = get_conv2d_transpose_expected_codegen(
         dshape=dshape,
         kshape=kshape,
         channels=channels,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         dilation=(1, 1),
         dtype=dtype,
         output_shape=out_shape,
     )
     verify(mod, clml_codegen, inputs_np, params_np, target_test=False)


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.skipif(
     CLML_VERSION < 3,
     reason="Requires compiler supporting CLML v5 or above",
 )
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 64, 14, 14), 1, 3e-4],
         [(1, 14, 256, 256), 1, 3e-4],
         [(1, 14, 256, 256), 1, 3e-4],
         [(1, 256, 1, 1), 1, 3e-4],
     ],
 )
 def test_batchnorm(dtype, trials):
     low, high = 0, 1
     (input_shape, axis, epsilon) = trials
     channels = input_shape[axis]

     def _get_axis_tuple(axis):
         if axis == 0:
             return (1, 2, 3)
         elif axis == 1:
             return (0, 2, 3)
         elif axis == 2:
             return (0, 1, 3)
         else:
             return (0, 1, 2)

     data = np.random.uniform(low, high, size=(input_shape)).astype(dtype)
     gamma = np.random.uniform(low, high, size=(channels)).astype(dtype)
     beta = np.random.uniform(low, high, size=(channels)).astype(dtype)
     mean = np.mean(data, _get_axis_tuple(axis), keepdims=False)
     variance = np.var(data, _get_axis_tuple(axis), keepdims=False)

     inputs_np = [data]
     params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, "moving_var": variance}
     mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype)
     clml_codegen = [
         {
             "attrs": {"dtype": [dtype], "shape": [input_shape]},
             "name": "",
             "op": "input",
         },
         {"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
         {"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
         {"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
         {"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
         {
             "attrs": {
                 "axis": axis,
                 "center": 1,
                 "dtype": [dtype],
                 "momentum": 0.10000000000000001,
                 "epsilon": 0.00029999999999999997,
                 "num_inputs": 5,
                 "num_outputs": 1,
                 "scale": 1,
                 "training": 1,
                 "shape": [input_shape],
             },
             "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]],
             "name": "",
             "op": "kernel",
         },
     ]
     verify(mod, clml_codegen, inputs_np, params_np)


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "a_shape, b_shape, op",
     [
         ((1, 64, 14, 14), (1, 64, 14, 14), R.add),
         ((1, 256), (1, 256), R.add),
         ((1, 64, 14, 14), (1, 64, 14, 14), R.subtract),
         ((1, 256), (1, 256), R.subtract),
         ((1, 64, 14, 14), (1, 64, 14, 14), R.multiply),
         ((1, 256), (1, 256), R.multiply),
         ((1, 64, 14, 14), (1, 64, 14, 14), R.divide),
         ((1, 256), (1, 256), R.divide),
         ((1, 64, 14, 14), (1, 64, 14, 14), R.minimum),
         ((1, 256), (1, 256), R.minimum),
         ((1, 64, 14, 14), (1, 64, 14, 14), R.maximum),
         ((1, 256), (1, 256), R.maximum),
     ],
 )
 @pytest.mark.gpu
 @skip_unless_adreno_clml
 def test_binary_ops(a_shape, b_shape, op, dtype):
     (mod, inputs_np) = get_binary_op_mod(a_shape, b_shape, op, dtype)
     clml_codegen = [
         {
             "attrs": {
                 "dtype": [dtype],
                 "shape": [a_shape],
             },
             "name": "",
             "op": "input",
         },
         {
             "attrs": {
                 "dtype": [dtype],
                 "shape": [b_shape],
             },
             "name": "",
             "op": "input",
         },
         {
             "attrs": {
                 "dtype": [dtype],
                 "num_inputs": 2,
                 "num_outputs": 1,
                 "shape": [a_shape],
             },
             "inputs": [[0, 0, 0], [1, 0, 0]],
             "name": "",
             "op": "kernel",
         },
     ]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize(
     "dtype",
     [
         "float32",
     ],
 )
 @pytest.mark.parametrize(
     "a_shape, op",
     [
         ((1, 64, 14, 14), R.nn.relu),
         ((1, 256, 1, 1), R.nn.relu),
         ((1, 14, 256, 256), R.nn.relu),
         ((1, 14, 14, 256), R.nn.relu),
     ],
 )
 @pytest.mark.gpu
 @skip_unless_adreno_clml
 def test_unary_ops(a_shape, op, dtype):
     (mod, inputs_np) = get_unary_op_mod(a_shape, op, dtype)
     clml_codegen = [
         {
             "attrs": {
                 "dtype": [dtype],
                 "shape": [a_shape],
             },
             "name": "",
             "op": "input",
         },
         {
             "attrs": {
                 "activation_type": "relu",
                 "dtype": [dtype],
                 "num_inputs": 1,
                 "num_outputs": 1,
                 "shape": [a_shape],
             },
             "inputs": [[0, 0, 0]],
             "name": "",
             "op": "kernel",
         },
     ]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
         [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
         [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
         # With padding is realized as nn.pad + pool
         # [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
         # [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
         # [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
         # [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
     ],
 )
 def test_max_pool(dtype, trials):
     low, high = -1, 1
     (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
     mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad)
     clml_codegen = get_maxpool_expected_codegen(
         input_shape, pool_size, stride, padding, "maxpool2d", dtype
     )

     inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
         [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
         [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
         # With padding is realized as nn.pad + pool
         # [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
         # [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
         # [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
         # [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
     ],
 )
 def test_avg_pool(dtype, trials):
     low, high = -1, 1
     (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
     mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad)
     clml_codegen = get_avgpool_expected_codegen(
         input_shape, pool_size, stride, padding, "avg_pool2d", dtype
     )

     inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
     params_np = {}
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 3, 32, 32), (1, 4, -1, 32)],
         [(1, 4, 8, 32), (1, 4, -1, 16)],
         [(1, 64, 3, 3), (1, 32, 3, -1)],
     ],
 )
 def test_reshape(dtype, trials):
     low, high = -1, 1
     (input_shape, output_shape) = trials
     mod = get_relax_reshape_mod(input_shape, output_shape, dtype)
     clml_codegen = get_relax_reshape_codegen(input_shape, output_shape, dtype)

     inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.skip(reason="Codegen Comparision Failing")
 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 64, 147, 147), True],
         [(1, 256, 17, 17), False],
         [(1, 1024, 14, 14), True],
         [(1, 32, 256, 256), False],
     ],
 )
 def test_global_avg_pool(dtype, trials):
     """Test function for global average pooling."""
     low, high = -1, 1
     (input_shape, keep_dims) = trials
     N, C, H, W = input_shape
     pool_size, stride, padding = (H, W), (1, 1), (0, 0, 0, 0)
     mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype)
     clml_codegen = get_global_maxpool_expected_codegen(
         input_shape, pool_size, stride, padding, "global_max", dtype
     )

     inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.gpu
 @skip_unless_adreno_clml
 @pytest.mark.parametrize("dtype", ["float32"])
 @pytest.mark.parametrize(
     "trials",
     [
         [(1, 64, 147, 147), True],
         [(1, 256, 17, 17), False],
         [(1, 1024, 14, 14), True],
         [(1, 32, 256, 256), False],
     ],
 )
 def test_global_max_pool(dtype, trials):
     """Test function for global average pooling."""
     low, high = -1, 1
     (input_shape, keep_dims) = trials
     N, C, H, W = input_shape
     pool_size, stride, padding = (H, W), (1, 1), (0, 0, 0, 0)
     mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype)
     clml_codegen = get_global_maxpool_expected_codegen(
         input_shape, pool_size, stride, padding, "global_max", dtype
     )

     inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
     verify(mod, clml_codegen, inputs_np, {})


 @pytest.mark.skipif(
     CLML_VERSION < 5,
     reason="Requires target device with CLML v5 or above",
 )
 @pytest.mark.parametrize(
     "K, N, M",
     [
         (4096, 11008, 256),
         (2048, 32768, 128),
         (4096, 4096, 512),
         (4096, 22016, 64),
         (16384, 2048, 128),
         (2048, 2560, 1024),
         (3072, 9216, 256),
         (14336, 4096, 128),
         (1536, 17920, 128),
         (8960, 1536, 1024),
     ],
 )
 def test_dequant_matmul(K, N, M):
     x_data = np.random.uniform(-0.1, 0.1, size=(1, M, K)).astype("float16")
     weight = np.random.randint(0, 100, size=(K // 8, N)).astype("uint32")
     scale = np.random.uniform(-0.1, 0.1, size=(K // 32, N)).astype("float16")

     mod = get_dequant_matmul_module(K, N)
     clml_codegen = [
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[K // 8, N]]], "dtype": [["uint32"]]},
         },
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[K // 32, N]]], "dtype": [["float16"]]},
         },
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[1, -1, K]]], "dtype": [["float16"]]},
         },
         {
             "op": "kernel",
             "name": "",
             "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]],
             "attrs": {
                 "dtype": ["float16"],
                 "num_inputs": 3,
                 "num_outputs": 1,
                 "out_dtype": ["float16"],
                 "shape": [[1, -1, N]],
             },
         },
     ]

     inputs_np = [x_data, weight, scale]
     verify(mod, clml_codegen, inputs_np, {}, target_minimum_clml_version=5)


 @pytest.mark.skipif(
     CLML_VERSION < 5,
     reason="Requires compiler supporting CLML v5 or above",
 )
 @pytest.mark.parametrize(
     "K, N",
     [
         (4096, 11008),
         (2048, 32768),
         (4096, 4096),
         (4096, 22016),
         (16384, 2048),
         (2048, 2560),
         (3072, 9216),
         (4096, 28672),
         (14336, 4096),
         (1536, 17920),
         (8960, 1536),
     ],
 )
 def test_dequant_vec_matmul(K, N):
     x_data = np.random.uniform(-0.1, 0.1, size=(1, 1, K)).astype("float16")
     weight = np.random.randint(0, 100, size=(K // 8, N)).astype("uint32")
     scale = np.random.uniform(-0.1, 0.1, size=(K // 32, N)).astype("float16")

     mod = get_dequant_vec_matmul_module(K, N)
     clml_codegen = [
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[K // 8, -1]]], "dtype": [["uint32"]]},
         },
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[K // 32, -1]]], "dtype": [["float16"]]},
         },
         {
             "op": "input",
             "name": "",
             "attrs": {"shape": [[[1, 1, K]]], "dtype": [["float16"]]},
         },
         {
             "op": "kernel",
             "name": "",
             "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]],
             "attrs": {
                 "dtype": ["float16"],
                 "num_inputs": 3,
                 "num_outputs": 1,
                 "out_dtype": ["float16"],
                 "shape": [[1, 1, -1]],
             },
         },
     ]

     inputs_np = (x_data, weight, scale)
     verify(mod, clml_codegen, inputs_np, {}, target_minimum_clml_version=5)


 if __name__ == "__main__":
     tvm.testing.main()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	# ruff: noqa: E501, F401, F841
	"""CLML integration operator tests."""

	import inspect
	import json
	import os

	import numpy as np
	import pytest
	from mod_utils import (
	get_avgpool_expected_codegen,
	get_batchnorm_mod,
	get_binary_op_mod,
	get_clml_conv2d_codegen,
	get_conv2d_transpose_expected_codegen,
	get_dequant_matmul_module,
	get_dequant_vec_matmul_module,
	get_global_avgpool_expected_codegen,
	get_global_maxpool_expected_codegen,
	get_maxpool_expected_codegen,
	get_relax_avgpool_mod,
	get_relax_conv2d_mod,
	get_relax_conv2d_transpose_mod,
	get_relax_global_avgpool_mod,
	get_relax_global_maxpool_mod,
	get_relax_maxpool_mod,
	get_relax_reshape_codegen,
	get_relax_reshape_mod,
	get_unary_op_mod,
	)
	from utils import skip_unless_adreno_clml, verify_results

	import tvm
	import tvm.testing
	from tvm import relax, rpc
	from tvm.relax.backend.adreno import clml
	from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad, OpenCLMLOffLoadForLLM
	from tvm.script import ir as I
	from tvm.script import relax as R
	from tvm.script import tirx as T
	from tvm.script.ir_builder import IRBuilder
	from tvm.script.ir_builder import relax as relax_builder

	CLML_VERSION = clml.clml_sdk_version()
	TARGET_CLML_VERSION = int(os.environ.get("ADRENO_TARGET_CLML_VERSION", 4))
	clml_target = tvm.target.Target("qcom/adreno-opencl-clml")
	ref_target = tvm.target.Target("opencl")


	def verify_clml_codegen(clml_mod, clml_codegen):
	clml_mod = OpenCLMLOffLoadForLLM(clml_target)(clml_mod)
	clml_mod = OpenCLMLOffLoad()(clml_mod)

	source = clml_mod.attrs["external_mods"][0].inspect_source()
	codegen = json.loads(source)["nodes"]
	for node in range(len(codegen)):
	if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
	codegen[node]["name"] = ""
	if codegen[node]["op"] == "kernel":
	codegen[node]["name"] = ""

	codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
	known_good_codegen_str = json.dumps(clml_codegen, sort_keys=True, indent=2)
	assert codegen_str == known_good_codegen_str, (
	f"The JSON produced by codegen does not match the expected result. \n"
	f"Actual={codegen_str} \n"
	f"Expected={known_good_codegen_str}"
	)


	def verify(
	mod, clml_codegen, inputs_np, params_np, target_minimum_clml_version=None, target_test=True
	):
	mod = tvm.relax.transform.BindParams("main", params_np)(mod)
	codegen_mod, clml_mod = mod.clone(), mod
	verify_clml_codegen(codegen_mod, clml_codegen)

	if (
	target_minimum_clml_version is not None
	and TARGET_CLML_VERSION < target_minimum_clml_version
	):
	print(f"Skipped Eval Tests for {inspect.stack()[1].function} function", flush=True)
	return

	if "ADRENO_TARGET" not in os.environ:
	return

	if target_test:
	verify_results(clml_mod, target=clml_target, ref_target=ref_target)


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"kernel_h, kernel_w, padding, stride, dilation, out_channels, shape, has_bias, has_bn, has_activation, has_pad, is_depthwise",
	[
	(3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, True, False),
	(3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, False, False),
	# (5, 5, (2, 2), (1, 1), (1, 1), 16, (16, 64, 64), False, True, True, False, False),
	# (7, 7, (3, 3), (2, 2), (1, 1), 32, (3, 224, 224), True, False, True, True, False),
	(3, 3, (0, 0), (1, 1), (1, 1), 512, (256, 14, 14), True, False, True, False, False),
	(1, 1, (0, 0), (1, 1), (1, 1), 1024, (512, 7, 7), True, False, True, False, False),
	(1, 3, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), True, False, True, False, False),
	(3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), False, True, True, True, False),
	],
	)
	def test_conv2d_offload(
	kernel_h,
	kernel_w,
	padding,
	stride,
	dilation,
	out_channels,
	shape,
	has_bias,
	has_bn,
	has_activation,
	has_pad,
	is_depthwise,
	dtype,
	):
	low, high = -0.01, 0.01
	rtol, atol = 1e-3, 1e-3
	if CLML_VERSION > 3:
	rtol, atol = 1e-2, 1e-2 # @clml precision

	data_shape = (1, *shape)
	if is_depthwise:
	groups = data_shape[1] // out_channels
	else:
	groups = 1
	padding = (padding[0], padding[1], padding[0], padding[1])

	weight_format = "IOHW" if is_depthwise else "OIHW"
	weight_shape = (out_channels, data_shape[1] // groups, kernel_h, kernel_w)

	data = np.random.uniform(low, high, size=data_shape).astype(dtype)
	weight = np.random.uniform(low, high, size=weight_shape).astype(dtype)
	bias = np.random.uniform(low, high, size=(1, weight_shape[0], 1, 1)).astype(dtype)

	gamma = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
	beta = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
	mean = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
	variance = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)

	inputs_np = [data]
	params_np = {"weight": weight}
	if has_bias:
	params_np["bias"] = bias
	if has_bn:
	params_np.update({"gamma": gamma, "beta": beta, "mean": mean, "variance": variance})

	mod = get_relax_conv2d_mod(
	data_shape,
	weight_shape,
	stride=stride,
	dilation=dilation,
	padding=padding,
	weight_layout=weight_format,
	groups=groups,
	dtype=dtype,
	has_bias=has_bias,
	has_bn=has_bn,
	has_activation=has_activation,
	has_pad=has_pad,
	is_depthwise=is_depthwise,
	)
	clml_codegen = get_clml_conv2d_codegen(
	data_shape,
	weight_shape,
	stride=stride,
	dilation=dilation,
	padding=padding,
	weight_layout=weight_format,
	groups=groups,
	dtype=dtype,
	has_bias=has_bias,
	has_bn=has_bn,
	has_activation=has_activation,
	has_pad=has_pad,
	is_depthwise=is_depthwise,
	)
	verify(mod, clml_codegen, inputs_np, params_np)


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"dshape, kshape, channels, kernel_size, strides, padding, out_shape",
	[
	((1, 256, 100, 100), (64, 256, 4, 4), 64, (4, 4), (2, 2), (0, 0, 0, 0), (1, 64, 202, 202)),
	((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), (1, 64, 400, 400)),
	((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), (1, 64, 400, 400)),
	((1, 64, 400, 400), (16, 64, 4, 4), 16, (4, 4), (2, 2), (1, 1, 1, 1), (1, 16, 800, 800)),
	],
	)
	def test_conv2d_transpose(
	dshape, kshape, channels, kernel_size, strides, padding, dtype, out_shape
	):
	low, high = -1, 1

	data = np.random.uniform(low, high, size=dshape).astype(dtype)
	weight = np.random.uniform(low, high, size=kshape).astype(dtype)

	inputs_np = [data]
	params_np = {"weight": weight}

	mod = get_relax_conv2d_transpose_mod(
	dshape,
	kshape,
	channels=channels,
	stride=strides,
	padding=padding,
	dtype=dtype,
	)

	clml_codegen = get_conv2d_transpose_expected_codegen(
	dshape=dshape,
	kshape=kshape,
	channels=channels,
	kernel_size=kernel_size,
	strides=strides,
	padding=padding,
	dilation=(1, 1),
	dtype=dtype,
	output_shape=out_shape,
	)
	verify(mod, clml_codegen, inputs_np, params_np, target_test=False)


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.skipif(
	CLML_VERSION < 3,
	reason="Requires compiler supporting CLML v5 or above",
	)
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 64, 14, 14), 1, 3e-4],
	[(1, 14, 256, 256), 1, 3e-4],
	[(1, 14, 256, 256), 1, 3e-4],
	[(1, 256, 1, 1), 1, 3e-4],
	],
	)
	def test_batchnorm(dtype, trials):
	low, high = 0, 1
	(input_shape, axis, epsilon) = trials
	channels = input_shape[axis]

	def _get_axis_tuple(axis):
	if axis == 0:
	return (1, 2, 3)
	elif axis == 1:
	return (0, 2, 3)
	elif axis == 2:
	return (0, 1, 3)
	else:
	return (0, 1, 2)

	data = np.random.uniform(low, high, size=(input_shape)).astype(dtype)
	gamma = np.random.uniform(low, high, size=(channels)).astype(dtype)
	beta = np.random.uniform(low, high, size=(channels)).astype(dtype)
	mean = np.mean(data, _get_axis_tuple(axis), keepdims=False)
	variance = np.var(data, _get_axis_tuple(axis), keepdims=False)

	inputs_np = [data]
	params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, "moving_var": variance}
	mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype)
	clml_codegen = [
	{
	"attrs": {"dtype": [dtype], "shape": [input_shape]},
	"name": "",
	"op": "input",
	},
	{"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
	{"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
	{"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
	{"attrs": {"dtype": [dtype], "shape": [[channels]]}, "name": "", "op": "const"},
	{
	"attrs": {
	"axis": axis,
	"center": 1,
	"dtype": [dtype],
	"momentum": 0.10000000000000001,
	"epsilon": 0.00029999999999999997,
	"num_inputs": 5,
	"num_outputs": 1,
	"scale": 1,
	"training": 1,
	"shape": [input_shape],
	},
	"inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]],
	"name": "",
	"op": "kernel",
	},
	]
	verify(mod, clml_codegen, inputs_np, params_np)


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"a_shape, b_shape, op",
	[
	((1, 64, 14, 14), (1, 64, 14, 14), R.add),
	((1, 256), (1, 256), R.add),
	((1, 64, 14, 14), (1, 64, 14, 14), R.subtract),
	((1, 256), (1, 256), R.subtract),
	((1, 64, 14, 14), (1, 64, 14, 14), R.multiply),
	((1, 256), (1, 256), R.multiply),
	((1, 64, 14, 14), (1, 64, 14, 14), R.divide),
	((1, 256), (1, 256), R.divide),
	((1, 64, 14, 14), (1, 64, 14, 14), R.minimum),
	((1, 256), (1, 256), R.minimum),
	((1, 64, 14, 14), (1, 64, 14, 14), R.maximum),
	((1, 256), (1, 256), R.maximum),
	],
	)
	@pytest.mark.gpu
	@skip_unless_adreno_clml
	def test_binary_ops(a_shape, b_shape, op, dtype):
	(mod, inputs_np) = get_binary_op_mod(a_shape, b_shape, op, dtype)
	clml_codegen = [
	{
	"attrs": {
	"dtype": [dtype],
	"shape": [a_shape],
	},
	"name": "",
	"op": "input",
	},
	{
	"attrs": {
	"dtype": [dtype],
	"shape": [b_shape],
	},
	"name": "",
	"op": "input",
	},
	{
	"attrs": {
	"dtype": [dtype],
	"num_inputs": 2,
	"num_outputs": 1,
	"shape": [a_shape],
	},
	"inputs": [[0, 0, 0], [1, 0, 0]],
	"name": "",
	"op": "kernel",
	},
	]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize(
	"dtype",
	[
	"float32",
	],
	)
	@pytest.mark.parametrize(
	"a_shape, op",
	[
	((1, 64, 14, 14), R.nn.relu),
	((1, 256, 1, 1), R.nn.relu),
	((1, 14, 256, 256), R.nn.relu),
	((1, 14, 14, 256), R.nn.relu),
	],
	)
	@pytest.mark.gpu
	@skip_unless_adreno_clml
	def test_unary_ops(a_shape, op, dtype):
	(mod, inputs_np) = get_unary_op_mod(a_shape, op, dtype)
	clml_codegen = [
	{
	"attrs": {
	"dtype": [dtype],
	"shape": [a_shape],
	},
	"name": "",
	"op": "input",
	},
	{
	"attrs": {
	"activation_type": "relu",
	"dtype": [dtype],
	"num_inputs": 1,
	"num_outputs": 1,
	"shape": [a_shape],
	},
	"inputs": [[0, 0, 0]],
	"name": "",
	"op": "kernel",
	},
	]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
	[(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
	[(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
	# With padding is realized as nn.pad + pool
	# [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
	# [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
	# [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
	# [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
	],
	)
	def test_max_pool(dtype, trials):
	low, high = -1, 1
	(input_shape, pool_size, stride, dilation, padding, has_pad) = trials
	mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad)
	clml_codegen = get_maxpool_expected_codegen(
	input_shape, pool_size, stride, padding, "maxpool2d", dtype
	)

	inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
	[(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
	[(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
	# With padding is realized as nn.pad + pool
	# [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
	# [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
	# [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
	# [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
	],
	)
	def test_avg_pool(dtype, trials):
	low, high = -1, 1
	(input_shape, pool_size, stride, dilation, padding, has_pad) = trials
	mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad)
	clml_codegen = get_avgpool_expected_codegen(
	input_shape, pool_size, stride, padding, "avg_pool2d", dtype
	)

	inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
	params_np = {}
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 3, 32, 32), (1, 4, -1, 32)],
	[(1, 4, 8, 32), (1, 4, -1, 16)],
	[(1, 64, 3, 3), (1, 32, 3, -1)],
	],
	)
	def test_reshape(dtype, trials):
	low, high = -1, 1
	(input_shape, output_shape) = trials
	mod = get_relax_reshape_mod(input_shape, output_shape, dtype)
	clml_codegen = get_relax_reshape_codegen(input_shape, output_shape, dtype)

	inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.skip(reason="Codegen Comparision Failing")
	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 64, 147, 147), True],
	[(1, 256, 17, 17), False],
	[(1, 1024, 14, 14), True],
	[(1, 32, 256, 256), False],
	],
	)
	def test_global_avg_pool(dtype, trials):
	"""Test function for global average pooling."""
	low, high = -1, 1
	(input_shape, keep_dims) = trials
	N, C, H, W = input_shape
	pool_size, stride, padding = (H, W), (1, 1), (0, 0, 0, 0)
	mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype)
	clml_codegen = get_global_maxpool_expected_codegen(
	input_shape, pool_size, stride, padding, "global_max", dtype
	)

	inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.gpu
	@skip_unless_adreno_clml
	@pytest.mark.parametrize("dtype", ["float32"])
	@pytest.mark.parametrize(
	"trials",
	[
	[(1, 64, 147, 147), True],
	[(1, 256, 17, 17), False],
	[(1, 1024, 14, 14), True],
	[(1, 32, 256, 256), False],
	],
	)
	def test_global_max_pool(dtype, trials):
	"""Test function for global average pooling."""
	low, high = -1, 1
	(input_shape, keep_dims) = trials
	N, C, H, W = input_shape
	pool_size, stride, padding = (H, W), (1, 1), (0, 0, 0, 0)
	mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype)
	clml_codegen = get_global_maxpool_expected_codegen(
	input_shape, pool_size, stride, padding, "global_max", dtype
	)

	inputs_np = [np.random.uniform(low, high, size=input_shape).astype(dtype)]
	verify(mod, clml_codegen, inputs_np, {})


	@pytest.mark.skipif(
	CLML_VERSION < 5,
	reason="Requires target device with CLML v5 or above",
	)
	@pytest.mark.parametrize(
	"K, N, M",
	[
	(4096, 11008, 256),
	(2048, 32768, 128),
	(4096, 4096, 512),
	(4096, 22016, 64),
	(16384, 2048, 128),
	(2048, 2560, 1024),
	(3072, 9216, 256),
	(14336, 4096, 128),
	(1536, 17920, 128),
	(8960, 1536, 1024),
	],
	)
	def test_dequant_matmul(K, N, M):
	x_data = np.random.uniform(-0.1, 0.1, size=(1, M, K)).astype("float16")
	weight = np.random.randint(0, 100, size=(K // 8, N)).astype("uint32")
	scale = np.random.uniform(-0.1, 0.1, size=(K // 32, N)).astype("float16")

	mod = get_dequant_matmul_module(K, N)
	clml_codegen = [
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[K // 8, N]]], "dtype": [["uint32"]]},
	},
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[K // 32, N]]], "dtype": [["float16"]]},
	},
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[1, -1, K]]], "dtype": [["float16"]]},
	},
	{
	"op": "kernel",
	"name": "",
	"inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]],
	"attrs": {
	"dtype": ["float16"],
	"num_inputs": 3,
	"num_outputs": 1,
	"out_dtype": ["float16"],
	"shape": [[1, -1, N]],
	},
	},
	]

	inputs_np = [x_data, weight, scale]
	verify(mod, clml_codegen, inputs_np, {}, target_minimum_clml_version=5)


	@pytest.mark.skipif(
	CLML_VERSION < 5,
	reason="Requires compiler supporting CLML v5 or above",
	)
	@pytest.mark.parametrize(
	"K, N",
	[
	(4096, 11008),
	(2048, 32768),
	(4096, 4096),
	(4096, 22016),
	(16384, 2048),
	(2048, 2560),
	(3072, 9216),
	(4096, 28672),
	(14336, 4096),
	(1536, 17920),
	(8960, 1536),
	],
	)
	def test_dequant_vec_matmul(K, N):
	x_data = np.random.uniform(-0.1, 0.1, size=(1, 1, K)).astype("float16")
	weight = np.random.randint(0, 100, size=(K // 8, N)).astype("uint32")
	scale = np.random.uniform(-0.1, 0.1, size=(K // 32, N)).astype("float16")

	mod = get_dequant_vec_matmul_module(K, N)
	clml_codegen = [
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[K // 8, -1]]], "dtype": [["uint32"]]},
	},
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[K // 32, -1]]], "dtype": [["float16"]]},
	},
	{
	"op": "input",
	"name": "",
	"attrs": {"shape": [[[1, 1, K]]], "dtype": [["float16"]]},
	},
	{
	"op": "kernel",
	"name": "",
	"inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]],
	"attrs": {
	"dtype": ["float16"],
	"num_inputs": 3,
	"num_outputs": 1,
	"out_dtype": ["float16"],
	"shape": [[1, 1, -1]],
	},
	},
	]

	inputs_np = (x_data, weight, scale)
	verify(mod, clml_codegen, inputs_np, {}, target_minimum_clml_version=5)


	if __name__ == "__main__":
	tvm.testing.main()