# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Core kernel of dot product of 4 Int8 operations"""
# pylint: disable=invalid-name
import tvm
from tvm import te
import tvm.target.codegen


def dot_16x1x16_uint8_int8_int32():
    """Dispatch the most optimized intrin depending on the target"""
    mcpu = tvm.target.Target.current().mcpu
    assert mcpu in (
        "skylake-avx512",
        "cascadelake",
    ), f"Fast int8 intrinsics require skylake-avx512 or cascadelake, got {mcpu}"
if mcpu == "skylake-avx512":
return dot_16x1x16_uint8_int8_int32_skylake()
# cascadelake
return dot_16x1x16_uint8_int8_int32_cascadelake()
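
# A minimal usage sketch (illustrative only, not part of the original module;
# the schedule `sch`, stage `conv`, and inner axis `oi` below are hypothetical):
# the TensorIntrin returned above is typically applied with TE tensorize on a
# schedule whose innermost tile matches the 16x4 shape of the intrinsic:
#
#     intrin = dot_16x1x16_uint8_int8_int32()
#     sch[conv].tensorize(oi, intrin)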


def dot_16x1x16_uint8_int8_int32_skylake():
    """
    Int8 dot product by every 4 elements using AVX512 Skylake instructions.

    This function takes two arrays of uint8 and int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k];
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcast to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in a tensorizing schedule
    """
int32_lanes = 16 # 16 int32 lanes in AVX512
num_int8_elements = 4 # 4 int8 elements in int32
data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel")
k = te.reduce_axis((0, num_int8_elements), name="k")
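    # The compute below declares the 16x4 -> 16 reduction that this intrinsic
    # replaces; tensorize pattern-matches it and substitutes the body emitted
    # by _intrin_func.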
C = te.compute(
(int32_lanes,),
lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
name="C",
)
a_buffer = tvm.tir.decl_buffer(
data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
)
b_buffer = tvm.tir.decl_buffer(
kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
)
def _intrin_func(ins, outs):
def _instr(index):
ib = tvm.tir.ir_builder.create()
if index == 1:
ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16")))
return ib.get()
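            # Broadcast the four uint8 data values: reinterpret data[0:4] as a
            # single int32, splat it across an int32x16 vector, then view the
            # result as int8x64 so every 4-byte group holds the same data values.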
a_int8 = ins[0].vload([0], "uint8x4")
re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
vec_ai32 = re_int32.astype("int32x16")
vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32)
vec_b = ins[1].vload([0, 0], "int8x64")
vec_one = tvm.tir.const(1, "int16x32")
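            # vpmaddubsw multiplies unsigned bytes of vec_a by signed bytes of
            # vec_b and adds adjacent pairs, giving 32 int16 partial sums.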
pair_reduction = tvm.tir.call_llvm_pure_intrin(
"int16x32",
"llvm.x86.avx512.pmaddubs.w.512",
tvm.tir.const(0, "uint32"),
vec_a,
vec_b,
)
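            # vpmaddwd against a vector of ones adds adjacent int16 pairs,
            # finishing the 4-element reduction into 16 int32 results.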
quad_reduction = tvm.tir.call_llvm_pure_intrin(
"int32x16",
"llvm.x86.avx512.pmaddw.d.512",
tvm.tir.const(0, "uint32"),
pair_reduction,
vec_one,
)
if index == 0:
ib.emit(outs[0].vstore(0, quad_reduction))
else:
ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16")))
return ib.get()
# body, reset, update
return _instr(0), _instr(1), _instr(2)
buffer_params = {"offset_factor": 1}
return te.decl_tensor_intrin(
C.op,
_intrin_func,
binds={data: a_buffer, kernel: b_buffer},
default_buffer_params=buffer_params,
)


def dot_16x1x16_uint8_int8_int16():
    """
    Int8 dot product by every 2 elements using AVX512 Skylake instructions.

    This function takes two arrays of uint8 and int8 datatype -- data[2] and
    kernel[4][32][2] -- and computes a dot product of data[2] with every
    2 elements of kernels, resulting in output[4][32] of int16 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[4][32][2],
                int16 output[4][32]){
            for (int i = 0; i < 4; i++){
                for (int j = 0; j < 32; j++){
                    output[i][j] = 0;
                    for (int k = 0; k < 2; k++){
                        output[i][j] += data[k] * kernel[i][j][k];
                    }
                }
            }
        }

    Physically, the kernel array sits in four AVX512 vector registers and
    the data[2] is broadcast to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in a tensorizing schedule
    """
    int16_lanes = 4 * 32  # 4*32 int16 lanes spread over 4 AVX512 vector registers
num_int8_elements = 2 # 2 int8 elements in int16
data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
kernel = te.placeholder((int16_lanes, num_int8_elements), dtype="int8", name="kernel")
k = te.reduce_axis((0, num_int8_elements), name="k")
C = te.compute(
(int16_lanes,),
lambda i: te.sum(data[k].astype("int16") * kernel[i, k].astype("int16"), axis=k),
name="C",
)
a_buffer = tvm.tir.decl_buffer(
data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
)
b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype="int8", name="b_buffer", offset_factor=1)
# strides=[te.var('ldw'), 1, 1])
def _intrin_func(ins, outs):
def _instr(index):
ib = tvm.tir.ir_builder.create()
if index == 1:
for i in range(4):
ib.emit(outs[0].vstore([i * 32], tvm.tir.const(0, "int16x32")))
return ib.get()
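            # Broadcast the two uint8 data values: reinterpret data[0:2] as a
            # single int16, splat it across an int16x32 vector, then view the
            # result as int8x64 so every 2-byte group holds the same data pair.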
a_int8 = ins[0].vload([0], "uint8x2")
re_int16 = tvm.tir.call_intrin("int16", "tir.reinterpret", a_int8)
vec_ai16 = re_int16.astype("int16x32")
vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai16)
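            # The 4*32 outputs span four AVX512 registers; each iteration
            # computes one 32-lane chunk with a single vpmaddubsw.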
for i in range(4):
vec_b = ins[1].vload([i * 32, 0], "int8x64")
pair_reduction = tvm.tir.call_llvm_pure_intrin(
"int16x32",
"llvm.x86.avx512.pmaddubs.w.512",
tvm.tir.const(0, "uint32"),
vec_a,
vec_b,
)
if index == 0:
ib.emit(outs[0].vstore([i * 32], pair_reduction))
else:
ib.emit(
outs[0].vstore(
[i * 32], pair_reduction + outs[0].vload([i * 32], "int16x32")
)
)
return ib.get()
# body, reset, update
return _instr(0), _instr(1), _instr(2)
buffer_params = {"offset_factor": 1}
return te.decl_tensor_intrin(
C.op,
_intrin_func,
binds={data: a_buffer, kernel: b_buffer},
default_buffer_params=buffer_params,
)


def dot_16x1x16_uint8_int8_int32_cascadelake():
    """
    Int8 dot product by every 4 elements using AVX512 VNNI Cascade Lake instructions.

    This function takes two arrays of uint8 and int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k];
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcast to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Cascade Lake int8 TensorIntrin that can be used in a tensorizing schedule
    """
int32_lanes = 16 # 16 int32 lanes in AVX512
num_int8_elements = 4 # 4 int8 elements in int32
data = te.placeholder((num_int8_elements,), dtype="uint8", name="data")
kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel")
k = te.reduce_axis((0, num_int8_elements), name="k")
C = te.compute(
(int32_lanes,),
lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
name="C",
)
a_buffer = tvm.tir.decl_buffer(
data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]
)
b_buffer = tvm.tir.decl_buffer(
kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]
)
def _intrin_func(ins, outs):
def _instr(index):
ib = tvm.tir.ir_builder.create()
if index == 1:
ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16")))
return ib.get()
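            # Same broadcast trick as the Skylake variant: view data[0:4] as one
            # int32 and splat it to int32x16 so every 4-byte group repeats the data.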
a_int8 = ins[0].vload([0], "uint8x4")
re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
vec_ai32 = re_int32.astype("int32x16")
vec_b = ins[1].vload([0, 0], "int8x64")
vnni_inst_name = "llvm.x86.avx512.vpdpbusd.512"
llvm_id = tvm.target.codegen.llvm_lookup_intrinsic_id(vnni_inst_name)
if llvm_id != 0: # VNNI is available for current LLVM version
vec_bi32 = tvm.tir.call_intrin("int32x16", "tir.reinterpret", vec_b)
vec_zero = tvm.tir.const(0, "int32x16")
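                # vpdpbusd computes, per int32 lane, the sum of four unsigned
                # byte x signed byte products and adds it to the accumulator
                # (a zero vector here), so one instruction does the full reduction.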
quad_reduction = tvm.tir.call_llvm_pure_intrin(
"int32x16",
"llvm.x86.avx512.vpdpbusd.512",
tvm.tir.const(0, "uint32"),
vec_zero,
vec_ai32,
vec_bi32,
)
else: # Fall back to the normal AVX512
vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32)
vec_one = tvm.tir.const(1, "int16x32")
pair_reduction = tvm.tir.call_llvm_pure_intrin(
"int16x32",
"llvm.x86.avx512.pmaddubs.w.512",
tvm.tir.const(0, "uint32"),
vec_a,
vec_b,
)
quad_reduction = tvm.tir.call_llvm_pure_intrin(
"int32x16",
"llvm.x86.avx512.pmaddw.d.512",
tvm.tir.const(0, "uint32"),
pair_reduction,
vec_one,
)
if index == 0:
ib.emit(outs[0].vstore(0, quad_reduction))
else:
ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], "int32x16")))
return ib.get()
# body, reset, update
return _instr(0), _instr(1), _instr(2)
buffer_params = {"offset_factor": 1}
return te.decl_tensor_intrin(
C.op,
_intrin_func,
binds={data: a_buffer, kernel: b_buffer},
default_buffer_params=buffer_params,
)