blob: 3e0d5db9537991758298a6fa017a24cf87402ce1 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition
import tvm
import tvm.testing
from tvm import te
import numpy as np
from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake
from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32
import pytest
@tvm.testing.requires_llvm
@pytest.mark.skip("skip because feature not enabled")
def test_fc_int8_acc32():
m = 1024
n = 1024
k = 1024
X = te.placeholder((m, k), name="X", dtype="uint8")
W = te.placeholder((n, k), name="W", dtype="int8")
peak = 280
print("Peak {} Gops/s".format(peak))
memory_ops = m * k + n * k + 2 * m * n
gops_per_mm = 2 * m * n * k
# For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target
# (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the
# test, we should use cascadelake setting.
def verify(target="llvm -mcpu=cascadelake"):
if not tvm.testing.device_enabled(target):
print("skip because %s is not enabled..." % target)
return
ctx = tvm.context(target, 0)
pc = dot_16x1x16_uint8_int8_int32_cascadelake()
ak = te.reduce_axis((0, k), name="k")
packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8")
t_fc = te.compute(
(m, n),
lambda i, j: te.sum(
X[i, ak].astype("int32")
* packedW[j / 16, (ak / 4) * 16 + j % 16, ak % 4].astype("int32"),
axis=ak,
),
name="F",
)
t_sch = te.create_schedule(t_fc.op)
a_x, a_y = t_fc.op.axis
(a_k,) = t_fc.op.reduce_axis
a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16)
a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32)
a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4)
a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4)
t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki)
t_sch[t_fc].unroll(a_koi)
t_sch[t_fc].tensorize(a_yi, pc)
t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic")
t_evaluator = t_func.time_evaluator(t_func.entry_name, ctx, number=10)
# generate the plain data
a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8")
b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8")
packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8")
# This occurs in pre_compute stage
for r_idx in range(n // 16):
for s_idx in range(16 * (k // 4)):
for t_idx in range(4):
packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][
(s_idx // 16) * 4 + t_idx
]
x = tvm.nd.array(a_, ctx)
w = tvm.nd.array(packW, ctx)
y = tvm.nd.array(np.zeros((m, n), dtype="int32"), ctx)
result = t_evaluator(x, w, y)
gops_per_sec = gops_per_mm / result.mean / 1e9
# verify the correctness
tvm.testing.assert_allclose(y.asnumpy(), np.dot(a_, b_.T), rtol=0)
print(
"Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format(
result.mean * 1000, gops_per_sec, gops_per_sec / peak
)
)
t_func.export_library("tensorize_acc32.o")
verify()
if __name__ == "__main__":
# The test requires Cascade Lake and newer Intel machines to generate the
# correct AVX512 VNNI instruction. So, disabling the test.
# test_fc_int8_acc32()
pass