| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # 'License'); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
import numpy as np
| |
| import tvm |
| import tvm.testing |
| from tvm import autotvm |
| from tvm import te |
| from tvm.topi import testing |
| from tvm.topi.utils import get_const_tuple, simplify |
| from tvm.topi import nn |
| |
| |
| def compute_plus_one_rank3(shape): |
| X = te.placeholder(shape, name="X", dtype="float32") |
| Y = te.compute(shape, lambda i, j, k: X[i, j, k] + 1, name="Compute_Y") |
| return X, Y |
| |
| |
| def schedule_plus_one_rank3(X, Y): |
| s = te.create_schedule(Y.op) |
| # Xt = s.cache_read(X, "texture", [Y]) |
| # Xt = s.cache_read(X, "global", [Y]) |
| Xt = s.cache_read(X, "global.texture", [Y]) |
| |
| # copy to texture stage |
| x, y, c = s[Xt].op.axis |
| s[Xt].bind(x, te.thread_axis("blockIdx.x")) |
| s[Xt].bind(y, te.thread_axis("threadIdx.x")) |
| s[Xt].vectorize(c) |
| |
| # the compute stage |
| x, y, c = s[Y].op.axis |
| xo, yo, xi, yi = s[Y].tile(x, y, 4, 4) |
| s[Y].bind(xo, te.thread_axis("blockIdx.x")) |
| s[Y].bind(yo, te.thread_axis("threadIdx.x")) |
| s[Y].vectorize(c) |
| return s |
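

# Illustrative helper (ours, not part of the original test flow; never run by
# pytest): lowering the rank-3 schedule above makes the "global.texture"
# cache_read stage visible in the printed IR, which is handy when debugging.
def _example_lower_plus_one_rank3():
    X, Y = compute_plus_one_rank3((32, 32, 4))
    s = schedule_plus_one_rank3(X, Y)
    # simple_mode=True prints the scheduled loop nest without device codegen
    print(tvm.lower(s, [X, Y], simple_mode=True))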
| |
| |
| def compute_plus_one_rank5(shape): |
| X = te.placeholder(shape, name="X", dtype="float32") |
| Y = te.compute(shape, lambda i, j, k, l, m: X[i, j, k, l, m] + 1, name="Compute_Y") |
| return X, Y |
| |
| |
| def schedule_plus_one_rank5(X, Y): |
| s = te.create_schedule(Y.op) |
| Xt = s.cache_read(X, "global.texture", [Y]) |
| |
| # copy to texture stage |
| a, b, c, d, e = s[Xt].op.axis |
| abc = s[Xt].fuse(a, b, c) |
| s[Xt].bind(abc, te.thread_axis("blockIdx.x")) |
| s[Xt].bind(d, te.thread_axis("threadIdx.x")) |
| s[Xt].vectorize(e) |
| |
| # the compute stage |
| a, b, c, d, e = s[Y].op.axis |
| abc = s[Y].fuse(a, b, c) |
| xo, yo, xi, yi = s[Y].tile(abc, d, 4, 4) |
| s[Y].bind(xo, te.thread_axis("blockIdx.x")) |
| s[Y].bind(yo, te.thread_axis("threadIdx.x")) |
| s[Y].vectorize(e) |
| return s |
| |
| |
| def compute_matmul(shape): |
| A = te.placeholder(shape, name="A", dtype="float32") |
| B = te.placeholder(shape, name="B", dtype="float32") |
| k = te.reduce_axis((0, shape[1]), name="k") |
| C = te.compute( |
| (shape[0] * shape[2], shape[0] * shape[2]), |
| lambda i, j: te.sum( |
| A[i // shape[2], k, i % shape[2]].astype("float32") |
| * B[j // shape[2], k, j % shape[2]].astype("float32"), |
| axis=[k], |
| ), |
| name="Compute_MatMul", |
| ) |
| return A, B, C |
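

# Illustrative numpy sketch (helper name is ours): compute_matmul reads a
# (B, K, b) operand as a (B*b, K) matrix through i -> (i // b, i % b), which
# is the same mapping as a transpose followed by a row-major reshape.
def _example_matmul_packing():
    B, K, b = 4, 8, 2  # arbitrary small sizes
    a = np.random.rand(B, K, b).astype("float32")
    flat = a.transpose(0, 2, 1).reshape(B * b, K)  # row i: block i // b, lane i % b
    for i in range(B * b):
        np.testing.assert_allclose(flat[i], a[i // b, :, i % b])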
| |
| |
| def schedule_matmul(A, B, C, local=False): |
| s = te.create_schedule(C.op) |
| At = s.cache_read(A, "global.texture", [C]) |
| Bt = s.cache_read(B, "global.texture", [C]) |
| if local: |
| Al = s.cache_read(At, "local", [C]) |
| Bl = s.cache_read(Bt, "local", [C]) |
| Cl = s.cache_write(C, "local") |
| |
| bx = te.thread_axis("blockIdx.x") |
| tx = te.thread_axis("threadIdx.x") |
| |
| def copy_to_texture(stage): |
| _io, _k, _ii = s[stage].op.axis |
| s[stage].vectorize(_ii) |
| s[stage].bind(_io, bx) |
| s[stage].bind(_k, tx) |
| |
| copy_to_texture(At) |
| copy_to_texture(Bt) |
| |
| # copy to global stage |
| _i, _j = s[C].op.axis |
| xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) |
| s[C].unroll(xi) |
| s[C].vectorize(yi) |
| s[C].bind(xo, te.thread_axis("blockIdx.x")) |
| s[C].bind(yo, te.thread_axis("threadIdx.x")) |
| |
| # the compute stage |
| s[Cl].compute_at(s[C], yo) |
| (_k,) = Cl.op.reduce_axis |
| _x, _y = s[Cl].op.axis |
| s[Cl].reorder(_k, _x, _y) |
| s[Cl].unroll(_x) |
| s[Cl].vectorize(_y) |
| |
| if local: |
| s[Al].compute_at(s[Cl], _k) |
| s[Al].vectorize(s[Al].op.axis[-1]) |
| s[Bl].compute_at(s[Cl], _k) |
| s[Bl].vectorize(s[Bl].op.axis[-1]) |
| |
| return s |
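

# Sketch (ours; assumes an OpenCL-enabled TVM build and is never run by
# pytest) of how to inspect the kernel generated from the matmul schedule;
# the codegen assertions near the end of this file obtain the OpenCL source
# the same way.
def _example_matmul_opencl_source():
    A, B, C = compute_matmul((32, 64, 4))
    s = schedule_matmul(A, B, C)
    mod = tvm.driver.build(s, [A, B, C], target="opencl")
    # the OpenCL kernel text lives on the imported device module
    print(mod.imported_modules[0].get_source())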
| |
| |
| def compute_matmul_inner(shape): |
| A = te.placeholder(shape, name="A", dtype="float32") |
| B = te.placeholder(shape, name="B", dtype="float32") |
| k = te.reduce_axis((0, shape[1] * shape[2]), name="k") |
| # (M, K) x (N, K) |
| # (32, 256) x (32, 256) |
| # (32, 64, 4) x (32, 64, 4) |
| C = te.compute( |
| (shape[0], shape[0]), |
| lambda i, j: te.sum( |
| A[i, k // shape[2], k % shape[2]].astype("float32") |
| * B[j, k // shape[2], k % shape[2]].astype("float32"), |
| axis=[k], |
| ), |
| name="Compute_MatMul", |
| ) |
| return A, B, C |
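

# Illustrative numpy sketch (helper name is ours): compute_matmul_inner packs
# the reduction index k into the last two axes of a (M, K//b, b) operand as
# (k // b, k % b), which is exactly a row-major reshape of the K axis.
def _example_matmul_inner_packing():
    M, Ko, b = 4, 8, 2
    a = np.random.rand(M, Ko, b).astype("float32")
    flat = a.reshape(M, Ko * b)  # row-major: flat[m, k] == a[m, k // b, k % b]
    for k in range(Ko * b):
        np.testing.assert_allclose(flat[:, k], a[:, k // b, k % b])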
| |
| |
| def schedule_matmul_inner(A, B, C, local=False): |
| s = te.create_schedule(C.op) |
| At = s.cache_read(A, "global.texture", [C]) |
| Bt = s.cache_read(B, "global.texture", [C]) |
| if local: |
| Al = s.cache_read(At, "local", [C]) |
| Bl = s.cache_read(Bt, "local", [C]) |
| Cl = s.cache_write(C, "local") |
| |
| bx = te.thread_axis("blockIdx.x") |
| tx = te.thread_axis("threadIdx.x") |
| |
| def copy_to_texture(stage): |
| _i, _ko, _ki = s[stage].op.axis |
| s[stage].vectorize(_ki) |
| s[stage].bind(_i, bx) |
| s[stage].bind(_ko, tx) |
| |
| copy_to_texture(At) |
| copy_to_texture(Bt) |
| |
| # copy to global stage |
| _i, _j = s[C].op.axis |
| xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) |
| s[C].unroll(xi) |
| s[C].vectorize(yi) |
| s[C].bind(xo, te.thread_axis("blockIdx.x")) |
| s[C].bind(yo, te.thread_axis("threadIdx.x")) |
| |
| # the compute stage |
| s[Cl].compute_at(s[C], yo) |
| (_k,) = Cl.op.reduce_axis |
| _x, _y = s[Cl].op.axis |
| s[Cl].reorder(_x, _y, _k) |
| s[Cl].unroll(_x) |
| # TODO(csullivan): consider whether the below error is worth resolving |
| # s[Cl].vectorize(_y) # error |
| |
| if local: |
| s[Al].compute_at(s[Cl], _x) |
| s[Al].vectorize(s[Al].op.axis[-1]) |
| s[Bl].compute_at(s[Cl], _x) |
| s[Bl].vectorize(s[Bl].op.axis[-1]) |
| |
| return s |
| |
| |
| def compute_matmul_vector_accumulator(shapeA, shapeB): |
| # A x B |
| # (K/4, M, K%4) x (K, N/4, N%4) = (M, N) |
| # (32, 64, 4) x (128, 16, 4) = (64, 64) |
| A = te.placeholder(shapeA, name="A", dtype="float32") |
| B = te.placeholder(shapeB, name="B", dtype="float32") |
| k = te.reduce_axis((0, shapeB[0]), name="k") |
| C = te.compute( |
| (shapeA[1], shapeB[1] * shapeB[2]), |
| lambda i, j: te.sum( |
| A[k // shapeA[-1], i, k % shapeA[-1]].astype("float32") |
| * B[k, j // shapeB[-1], j % shapeB[-1]].astype("float32"), |
| axis=[k], |
| ), |
| name="Compute_MatMul", |
| ) |
| return A, B, C |
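

# Illustrative numpy sketch (helper name is ours) of the mixed packing used by
# compute_matmul_vector_accumulator: A packs the reduction index k as
# (k // 4, k % 4) around M, while B packs the output index j as (j // 4, j % 4);
# unpacking both recovers an ordinary (M, K) x (K, N) product.
def _example_vector_accumulator_layouts():
    Kq, M, v = 8, 4, 4  # K = Kq * v
    Kfull, Nq = Kq * v, 4  # N = Nq * v
    A = np.random.rand(Kq, M, v).astype("float32")
    B = np.random.rand(Kfull, Nq, v).astype("float32")
    a_mat = A.transpose(1, 0, 2).reshape(M, Kfull)  # a_mat[i, k] == A[k // v, i, k % v]
    b_mat = B.reshape(Kfull, Nq * v)  # b_mat[k, j] == B[k, j // v, j % v]
    assert a_mat[1, 6] == A[6 // v, 1, 6 % v]
    assert b_mat[3, 9] == B[3, 9 // v, 9 % v]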
| |
| |
| def schedule_matmul_vector_accumulator(A, B, C, local=False): |
| s = te.create_schedule(C.op) |
| At = s.cache_read(A, "global.texture", [C]) |
| Bt = s.cache_read(B, "global.texture", [C]) |
| if local: |
| Al = s.cache_read(At, "local", [C]) |
| Bl = s.cache_read(Bt, "local", [C]) |
| Cl = s.cache_write(C, "local") |
| |
| def copy_to_texture(stage): |
| _y, _x, _v = s[stage].op.axis |
        # TODO(csullivan): removing this vectorize results in numerical errors; autovectorize instead.
| s[stage].vectorize(_v) |
| s[stage].bind(_y, te.thread_axis("blockIdx.x")) |
| s[stage].bind(_x, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(At) |
| copy_to_texture(Bt) |
| |
| # copy to global stage |
| _i, _j = s[C].op.axis |
| xo, yo, xi, yi = s[C].tile(_i, _j, 4, 4) |
| s[C].unroll(xi) |
| s[C].vectorize(yi) |
| s[C].bind(xo, te.thread_axis("blockIdx.x")) |
| s[C].bind(yo, te.thread_axis("threadIdx.x")) |
| |
| # the compute stage |
| s[Cl].compute_at(s[C], yo) |
| (_k,) = Cl.op.reduce_axis |
| _a, _b = s[Cl].op.axis |
| _ko, _ki = s[Cl].split(_k, factor=4) |
| s[Cl].reorder(_ko, _a, _ki, _b) |
| s[Cl].unroll(_ki) |
| s[Cl].unroll(_a) |
| s[Cl].vectorize(_b) |
| |
| if local: |
| s[Al].compute_at(s[Cl], _a) |
| _aa, _ka, _ba = s[Al].op.axis |
| # TODO(csullivan)[BEFORE PR]: removing this vectorize command causes a crash. This needs to be autovectorized. |
| s[Al].vectorize(_ba) |
| s[Bl].compute_at(s[Cl], _ko) |
| _ab, _kb, _bb = s[Bl].op.axis |
| s[Bl].vectorize(_bb) |
| s[Bl].unroll(_ab) |
| |
| return s |
| |
| |
| def compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): |
    # conv2d([N, C, H, W, c], [1, 1, C, K, k])
| data = te.placeholder(input_shape, name="data", dtype="float32") |
| filt = te.placeholder(filter_shape, name="filter", dtype="float32") |
| c = te.reduce_axis((0, input_shape[1]), name="C") |
| c4 = te.reduce_axis((0, input_shape[-1]), name="c4") |
| kh = te.reduce_axis((0, filter_shape[0]), name="kh") |
| kw = te.reduce_axis((0, filter_shape[1]), name="kw") |
| conv = te.compute( |
| (input_shape[0], filter_shape[-2], input_shape[2], input_shape[3], filter_shape[-1]), |
| lambda n, ko, i, j, ki: te.sum( |
| data[n, c, i, j, c4].astype("float32") |
| * filt[kh, kw, c * input_shape[-1] + c4, ko, ki].astype("float32"), |
| axis=[kh, kw, c, c4], |
| ), |
| # name="Compute_conv2d_1x1_NCHWc_RSCKk", |
| name="conv2d_1x1", |
| ) |
| return data, filt, conv |
| |
| |
| def schedule_conv2d_1x1_NCHWc_RSCKk(data, filt, conv): |
    # inputs:  (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4)
    # outputs: (1, 128//4, 56, 56, 4)
| s = te.create_schedule(conv.op) |
| A, B, C = data, filt, conv |
| At = s.cache_read(A, "global.texture", [C]) |
| Bt = s.cache_read(B, "global.texture", [C]) |
| Al = s.cache_read(At, "local", [C]) |
| Bl = s.cache_read(Bt, "local", [C]) |
| Cl = s.cache_write(C, "local") |
| |
| def copy_to_texture(stage): |
| axes = s[stage].op.axis |
| fused = s[stage].fuse(*axes[:-1]) |
| block, thread = s[stage].split(fused, factor=32) |
| s[stage].vectorize(axes[-1]) |
| s[stage].bind(block, te.thread_axis("blockIdx.x")) |
| s[stage].bind(thread, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(At) |
| copy_to_texture(Bt) |
| |
| _n, _ko, _h, _w, _ki = s[C].op.axis |
| s[C].vectorize(_ki) |
| s[C].bind(_n, te.thread_axis("blockIdx.x")) |
| s[C].bind(_ko, te.thread_axis("threadIdx.x")) |
| |
| s[Cl].compute_at(s[C], _w) |
| _nl, _kol, _hl, _wl, _kil = s[Cl].op.axis |
| _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis |
| _clo, _cli = s[Cl].split(_cl, factor=4) |
| s[Cl].reorder(_clo, _cli, _cl4, _kil) |
| s[Cl].unroll(_cli) |
| s[Cl].unroll(_cl4) |
| s[Cl].vectorize(_kil) |
| |
| s[Al].compute_at(s[Cl], _cli) |
| s[Al].vectorize(s[Al].op.axis[-1]) |
| s[Bl].compute_at(s[Cl], _kwl) |
| s[Bl].vectorize(s[Bl].op.axis[-1]) |
| |
| return s |
| |
| |
| def compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): |
| # input_shape = [W, C, H, N, c] -> [W, C, H*N, c] |
| # filter_shape = [C, R, S, K, k] -> [C, R*S*K, k] |
| # output_shape: [WK, HN, k] -> [W, K, H, N, k] |
| data = te.placeholder(input_shape, name="data", dtype="float32") |
| filt = te.placeholder(filter_shape, name="filter", dtype="float32") |
| |
| packed_data = te.compute( |
| (input_shape[0], input_shape[1], input_shape[2] * input_shape[3], input_shape[4]), |
| lambda i, j, k, l: data[i, j, k // input_shape[3], k % input_shape[3], l], |
| name="packed_data", |
| ) |
| |
| # Logical transformation of Nd -> 3d tensor |
| # CRSKk -> C|RSK|k |
| # r = rsk // SK |
| # sk = rsk % SK |
| # s = sk // K == (rsk % SK) // K == (rsk // K) % S |
| # k = sk % K == (rsk % SK) % K == rsk % K |
| packed_filter = te.compute( |
| (filter_shape[0], filter_shape[1] * filter_shape[2] * filter_shape[3], filter_shape[4]), |
| lambda i, j, k: filt[ |
| i, |
| j // (filter_shape[3] * filter_shape[2]), |
| (j // filter_shape[3]) % filter_shape[2], |
| j % filter_shape[3], |
| k, |
| ], |
| name="packed_filter", |
| ) |
| |
| c = te.reduce_axis((0, input_shape[1]), name="C") |
| c4 = te.reduce_axis((0, input_shape[-1]), name="c4") |
| r = te.reduce_axis((0, filter_shape[1]), name="r") |
| s = te.reduce_axis((0, filter_shape[2]), name="s") |
| |
| conv = te.compute( |
| (input_shape[0], filter_shape[3], input_shape[2], input_shape[3], filter_shape[4]), |
| lambda w, ko, h, n, ki: te.sum( |
| packed_data[w, c, h * input_shape[3] + n, c4].astype("float32") |
| * packed_filter[ |
| c * input_shape[-1] + c4, ((r * filter_shape[2]) + s) * filter_shape[3] + ko, ki |
| ].astype("float32"), |
| axis=[r, s, c, c4], |
| ), |
| name="conv2d_1x1", |
| ) |
| return data, filt, packed_data, packed_filter, conv |
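

# Illustrative check (ours) of the index identities in the comment above: for
# j = (r*S + s)*K + k flattened from CRSKk, the factors are recovered as
# r == j // (S*K), s == (j // K) % S, and k == j % K.
def _example_crsk_flatten_identities():
    R, S, K = 2, 3, 5
    for r in range(R):
        for s in range(S):
            for k in range(K):
                j = (r * S + s) * K + k
                assert j // (S * K) == r
                assert (j // K) % S == s
                assert j % K == k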
| |
| |
| def schedule_conv2d_1x1_WCHNc_CRSKk(data, filt, packed_data, packed_filter, conv): |
| # data: [W, C, H*N, c] |
| # filter: [C, R*S*K, k] |
| # output: [W, K, H, N, k] |
| |
    # conv2d([N, C, H, W, c], [1, 1, C, K, k])
| # inputs: (1, 128//4, 56, 56, 4), (1, 1, 128, 128//4, 4) |
| |
| # data: (56, 128//4, 56*1, 4) = (56, 32, 56, 4) |
| # filt: (128, 1*1*128//4, 4) = (128, 32, 4) |
| # conv: (56, 32, 56, 1, 4) |
| |
| s = te.create_schedule(conv.op) |
| cfg = autotvm.get_config() |
| |
| s[packed_data].compute_inline() |
| s[packed_filter].compute_inline() |
| A, B, C = packed_data, packed_filter, conv |
| At = s.cache_read(A, "global.texture", [C]) |
| Bt = s.cache_read(B, "global.texture", [C]) |
| Al = s.cache_read(At, "local", [C]) |
| Bl = s.cache_read(Bt, "local", [C]) |
| Cl = s.cache_write(C, "local") |
| |
| def copy_to_texture(stage): |
| axes = s[stage].op.axis |
| fused = s[stage].fuse(*axes[:-1]) |
| block, thread = s[stage].split(fused, factor=32) |
| s[stage].vectorize(axes[-1]) |
| s[stage].bind(block, te.thread_axis("blockIdx.x")) |
| s[stage].bind(thread, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(At) |
| copy_to_texture(Bt) |
| |
| _w, _ko, _h, _n, _ki = s[C].op.axis |
| kernel_scope, _n = s[C].split(_n, nparts=1) |
| |
| cfg.define_split("tile_f", _ko, num_outputs=4) |
| cfg.define_split("tile_w", _w, num_outputs=4) |
| cfg.define_split("tile_h", _h, num_outputs=4) |
| cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) |
| |
| bk, vk, tk, ki = cfg["tile_f"].apply(s, C, _ko) |
| bw, vw, tw, wi = cfg["tile_w"].apply(s, C, _w) |
| bh, vh, th, hi = cfg["tile_h"].apply(s, C, _h) |
| s[C].reorder(bh, _n, vh, th, hi) |
| bhn = s[C].fuse(bh, _n) |
| |
| s[C].bind(bk, te.thread_axis("blockIdx.z")) |
| s[C].bind(bhn, te.thread_axis("blockIdx.y")) |
| s[C].bind(bw, te.thread_axis("blockIdx.x")) |
| s[C].bind(vk, te.thread_axis("vthread")) |
| s[C].bind(vh, te.thread_axis("vthread")) |
| s[C].bind(vw, te.thread_axis("vthread")) |
| s[C].bind(tk, te.thread_axis("threadIdx.z")) |
| s[C].bind(th, te.thread_axis("threadIdx.y")) |
| s[C].bind(tw, te.thread_axis("threadIdx.x")) |
| s[C].reorder(bw, bk, bhn, vw, vk, vh, tw, tk, th, ki, hi, wi, _ki) |
| s[C].vectorize(_ki) |
| |
| # TODO(csullivan): Try uneven workgroup split |
| # _wo, _wi = s[C].split(_w, factor=4) |
| # #_hno, _hni = s[C].split(_hn, factor=8) |
| # #s[C].reorder(_wo, _wi, _ko, _hno, _hni, _ki) |
| # s[C].reorder(_wo, _ko, _hn, _ki, _wi) |
| # s[C].unroll(_wi) |
| |
| # # mace: |
| # # const int out_ch_blk = get_global_id(0); |
| # # const int out_w_blk = get_global_id(1); |
| # # const int out_hb = get_global_id(2); |
| |
| # bx = te.thread_axis("blockIdx.x") |
| # by = te.thread_axis("blockIdx.y") |
| # bz = te.thread_axis("blockIdx.z") |
| # s[C].bind(_ko, bx) |
| # s[C].bind(_wo, by) |
| # s[C].bind(_hn, bz) |
| |
| # s[Cl].compute_at(s[C], _hn) |
| s[Cl].compute_at(s[C], th) |
| |
| _wl, _kol, _hl, _nl, _kil = s[Cl].op.axis |
| _khl, _kwl, _cl, _cl4 = s[Cl].op.reduce_axis |
| |
| cfg.define_split("tile_c", _cl, num_outputs=2) |
| cfg.define_split("tile_kh", _khl, num_outputs=2) |
| cfg.define_split("tile_kw", _kwl, num_outputs=2) |
| |
| _clo, _cli = cfg["tile_c"].apply(s, Cl, _cl) |
| _khlo, _khli = cfg["tile_kh"].apply(s, Cl, _khl) |
| _kwlo, _kwli = cfg["tile_kw"].apply(s, Cl, _kwl) |
    s[Cl].reorder(_clo, _khlo, _kwlo, _cli, _cl4, _khli, _kwli, _kol, _hl, _nl, _kil, _wl)
| s[Cl].unroll(_cl4) |
| s[Cl].unroll(_wl) |
| s[Cl].vectorize(_kil) |
| |
| _wla, _cla, _hnla, _cl4a = s[Al].op.axis |
| s[Al].compute_at(s[Cl], _cli) |
| s[Al].vectorize(_cl4a) |
| s[Al].unroll(_wla) |
| |
| _clb, _rskolb, _kilb = s[Bl].op.axis |
| s[Bl].compute_at(s[Cl], _cli) |
| s[Bl].vectorize(_kilb) |
| s[Bl].unroll(_clb) |
| |
| s[C].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) |
| |
| WO, K, HO, N, K4 = get_const_tuple(C.shape) |
| RSC, _, _ = get_const_tuple(B.shape) |
| cfg.add_flop(2 * N * K * K4 * HO * WO * RSC) |
| |
| return s |
| |
| |
| def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype=None): |
| """Convolution operator in NCHWc layout.""" |
| |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape |
| num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| # compute graph |
| pad_before = [0, 0, pad_top, pad_left, 0] |
| pad_after = [0, 0, pad_down, pad_right, 0] |
| temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") |
| |
    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| |
| # NCHWc x KCRSk |
| # texture: NCH|W|c |
| # texture: K|CRS|k |
    # c = crs // RS
    # rs = crs % RS
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
| Filter = te.compute( |
| (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), |
| lambda ffc, crs, ffb: Filter[ |
| ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb |
| ], |
| name="packed_filter", |
| ) |
| return te.compute( |
| (batch, num_filter_chunk, out_height, out_width, num_filter_block), |
| lambda nn, ffc, yy, xx, ffb: te.sum( |
| temp[ |
| nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb |
| ].astype(out_dtype) |
| * Filter[ |
| ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb |
| ].astype(out_dtype), |
| axis=[rcc, rcb, ry, rx], |
| ), |
| tag="conv2d_nchwc_kcrsk_texture", |
| ) |
| |
| |
| def schedule_conv2d_NCHWc_KCRSk(cfg, s, conv): |
| """schedule optimized for batch size = 1""" |
| |
| ##### space definition begin ##### |
| n, fc, y, x, fb = s[conv].op.axis |
| rcc, rcb, ry, rx = s[conv].op.reduce_axis |
| cfg.define_split("tile_fc", fc, num_outputs=4) |
| cfg.define_split("tile_y", y, num_outputs=4) |
| cfg.define_split("tile_x", x, num_outputs=4) |
| cfg.define_split("tile_rcc", rcc, num_outputs=2) |
| cfg.define_split("tile_ry", ry, num_outputs=2) |
| cfg.define_split("tile_rx", rx, num_outputs=2) |
| cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) |
| |
| pad_data, flattened_kernel = s[conv].op.input_tensors |
| kernel = s[flattened_kernel].op.input_tensors[0] |
| s[flattened_kernel].compute_inline() |
| |
| s[pad_data].compute_inline() |
| if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: |
| s[kernel].compute_inline() |
| kernel = flattened_kernel |
| |
| if conv.op in s.outputs: |
| output = conv |
| OL = s.cache_write(conv, "local") |
| else: |
| output = s.outputs[0].output(0) |
| s[conv].set_scope("local") |
| OL = conv |
| |
| # create cache stage |
| AT = s.cache_read(pad_data, "global.texture", [OL]) |
| WT = s.cache_read(kernel, "global.texture", [OL]) |
| |
| def copy_to_texture(stage): |
| axes = s[stage].op.axis |
| fused = s[stage].fuse(*axes[:-1]) |
| block, thread = s[stage].split(fused, factor=32) |
| s[stage].vectorize(axes[-1]) |
| s[stage].bind(block, te.thread_axis("blockIdx.x")) |
| s[stage].bind(thread, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(AT) |
| copy_to_texture(WT) |
| |
| AA = s.cache_read(AT, "shared", [OL]) |
| WW = s.cache_read(WT, "shared", [OL]) |
| |
| # tile and bind spatial axes |
| n, fc, y, x, fb = s[output].op.axis |
| |
| kernel_scope, n = s[output].split(n, nparts=1) |
| |
| bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) |
| by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) |
| bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) |
| |
| bf = s[output].fuse(n, bf) |
| s[output].bind(bf, te.thread_axis("blockIdx.z")) |
| s[output].bind(by, te.thread_axis("blockIdx.y")) |
| s[output].bind(bx, te.thread_axis("blockIdx.x")) |
| s[output].bind(vf, te.thread_axis("vthread")) |
| s[output].bind(vy, te.thread_axis("vthread")) |
| s[output].bind(vx, te.thread_axis("vthread")) |
| s[output].bind(tf, te.thread_axis("threadIdx.z")) |
| s[output].bind(ty, te.thread_axis("threadIdx.y")) |
| s[output].bind(tx, te.thread_axis("threadIdx.x")) |
| s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) |
| s[output].vectorize(fb) |
| s[OL].compute_at(s[output], tx) |
| |
| # tile reduction axes |
| n, fc, y, x, fb = s[OL].op.axis |
| |
| rcc, rcb, ry, rx = s[OL].op.reduce_axis |
| rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) |
| ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) |
| rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) |
| |
| # TODO(csullivan): check position of rcb |
| s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) |
| s[OL].vectorize(fb) |
| s[OL].unroll(rcb) |
| |
| s[AA].compute_at(s[OL], rxo) |
| s[WW].compute_at(s[OL], rxo) |
| # cooperative fetching |
| for load in [AA, WW]: |
| if load == WW: |
| n, fyx, v = s[load].op.axis |
| fused = s[load].fuse(n, fyx) |
| else: |
| n, f, y, x, v = s[load].op.axis |
| fused = s[load].fuse(n, f, y, x) |
| tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) |
| ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) |
| tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) |
| s[load].bind(tz, te.thread_axis("threadIdx.z")) |
| s[load].bind(ty, te.thread_axis("threadIdx.y")) |
| s[load].bind(tx, te.thread_axis("threadIdx.x")) |
| s[load].vectorize(v) |
| |
| # unroll |
| s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) |
| |
| N, OCC, OH, OW, OCB = get_const_tuple(output.shape) |
| _, ICKHKW, _ = get_const_tuple(kernel.shape) |
| |
| if isinstance(N, int): |
| cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) |
| |
| |
| def compute_conv2d_NCHWc_KCRSk_acc32(Input, Filter, stride, padding, dilation, out_dtype=None): |
| """Convolution operator in NCHWc layout.""" |
| |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape |
| num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| # compute graph |
| pad_before = [0, 0, pad_top, pad_left, 0] |
| pad_after = [0, 0, pad_down, pad_right, 0] |
| temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") |
| |
    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, in_channel_block), name="rcb")
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| |
| # NCHWc x KCRSk |
| # texture: NCH|W|c |
| # texture: K|CRS|k |
    # c = crs // RS
    # rs = crs % RS
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
| Filter = te.compute( |
| (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block), |
| lambda ffc, crs, ffb: Filter[ |
| ffc, crs // (kernel_h * kernel_w), (crs // kernel_w) % kernel_h, crs % kernel_w, ffb |
| ], |
| name="packed_filter", |
| ) |
| conv = te.compute( |
| (batch, num_filter_chunk, out_height, out_width, num_filter_block), |
| lambda nn, ffc, yy, xx, ffb: te.sum( |
| ( |
| temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] |
| * Filter[ffc, ((rcc * in_channel_block + rcb) * kernel_h + ry) * kernel_w + rx, ffb] |
| ).astype(out_dtype), |
| axis=[rcc, rcb, ry, rx], |
| ), |
| tag="conv2d_nchwc_kcrsk_texture", |
| ) |
| output = te.compute(conv.shape, lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32")) |
| return output |
| |
| |
| def schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): |
| """schedule optimized for batch size = 1""" |
| |
| conv = output.op.input_tensors[0] |
| |
| ##### space definition begin ##### |
| n, fc, y, x, fb = s[conv].op.axis |
| rcc, rcb, ry, rx = s[conv].op.reduce_axis |
| cfg.define_split("tile_fc", fc, num_outputs=4) |
| cfg.define_split("tile_y", y, num_outputs=4) |
| cfg.define_split("tile_x", x, num_outputs=4) |
| cfg.define_split("tile_rcc", rcc, num_outputs=2) |
| cfg.define_split("tile_ry", ry, num_outputs=2) |
| cfg.define_split("tile_rx", rx, num_outputs=2) |
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    ##### space definition end #####
| |
| pad_data, flattened_kernel = s[conv].op.input_tensors |
| kernel = s[flattened_kernel].op.input_tensors[0] |
| s[flattened_kernel].compute_inline() |
| |
| s[pad_data].compute_inline() |
| if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: |
| s[kernel].compute_inline() |
| kernel = flattened_kernel |
| |
| if conv.op in s.outputs: |
| output = conv |
| OL = s.cache_write(conv, "local") |
| else: |
| output = s.outputs[0].output(0) |
| s[conv].set_scope("local") |
| OL = conv |
| |
| # create cache stage |
| AT = s.cache_read(pad_data, "global.texture", [OL]) |
| WT = s.cache_read(kernel, "global.texture", [OL]) |
| |
| def copy_to_texture(stage): |
| axes = s[stage].op.axis |
| fused = s[stage].fuse(*axes[:-1]) |
| block, thread = s[stage].split(fused, factor=32) |
| s[stage].vectorize(axes[-1]) |
| s[stage].bind(block, te.thread_axis("blockIdx.x")) |
| s[stage].bind(thread, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(AT) |
| copy_to_texture(WT) |
| |
| AA = s.cache_read(AT, "shared", [OL]) |
| WW = s.cache_read(WT, "shared", [OL]) |
| |
| # tile and bind spatial axes |
| n, fc, y, x, fb = s[output].op.axis |
| |
| kernel_scope, n = s[output].split(n, nparts=1) |
| |
| bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) |
| by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) |
| bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) |
| |
| bf = s[output].fuse(n, bf) |
| s[output].bind(bf, te.thread_axis("blockIdx.z")) |
| s[output].bind(by, te.thread_axis("blockIdx.y")) |
| s[output].bind(bx, te.thread_axis("blockIdx.x")) |
| s[output].bind(vf, te.thread_axis("vthread")) |
| s[output].bind(vy, te.thread_axis("vthread")) |
| s[output].bind(vx, te.thread_axis("vthread")) |
| s[output].bind(tf, te.thread_axis("threadIdx.z")) |
| s[output].bind(ty, te.thread_axis("threadIdx.y")) |
| s[output].bind(tx, te.thread_axis("threadIdx.x")) |
| s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) |
| s[output].vectorize(fb) |
| |
| s[OL].compute_at(s[output], tx) |
| |
| # tile reduction axes |
| n, fc, y, x, fb = s[OL].op.axis |
| |
| rcc, rcb, ry, rx = s[OL].op.reduce_axis |
| rco, rci = cfg["tile_rcc"].apply(s, OL, rcc) |
| ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) |
| rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) |
| |
| # TODO(csullivan): check position of rcb |
| s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, rcb, n, fc, y, x, fb) |
| s[OL].vectorize(fb) |
| s[OL].unroll(rcb) |
| |
| s[AA].compute_at(s[OL], rxo) |
| s[WW].compute_at(s[OL], rxo) |
| # cooperative fetching |
| for load in [AA, WW]: |
| if load == WW: |
| n, fyx, v = s[load].op.axis |
| fused = s[load].fuse(n, fyx) |
| else: |
| n, f, y, x, v = s[load].op.axis |
| fused = s[load].fuse(n, f, y, x) |
| tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) |
| ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) |
| tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) |
| s[load].bind(tz, te.thread_axis("threadIdx.z")) |
| s[load].bind(ty, te.thread_axis("threadIdx.y")) |
| s[load].bind(tx, te.thread_axis("threadIdx.x")) |
| s[load].vectorize(v) |
| |
| # unroll |
| s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) |
| |
| N, OCC, OH, OW, OCB = get_const_tuple(output.shape) |
| _, ICKHKW, _ = get_const_tuple(kernel.shape) |
| |
| if isinstance(N, int): |
| cfg.add_flop(2 * N * OH * OW * OCC * OCB * ICKHKW) |
| |
| |
| def compute_depthwise_conv2d_NCHWc_KCRSk_acc32( |
| Input, Filter, stride, padding, dilation, out_dtype=None |
| ): |
| """Depthwise convolution operator in NCHWc layout.""" |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, channel_chunk, in_height, in_width, channel_block = Input.shape |
| _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape |
| |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| out_channel_chunk = simplify(channel_chunk * channel_multiplier) |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| # compute graph |
| pad_before = [0, 0, pad_top, pad_left, 0] |
| pad_after = [0, 0, pad_down, pad_right, 0] |
| temp = nn.pad(Input, pad_before, pad_after, name="pad_temp") |
| |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| |
| # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c] |
| # NCHWc x CMRS |
| # texture: NCH|W|c |
| # texture: C|MRS|c |
| # output: N |
    # m = mrs // RS
    # rs = mrs % RS
    # r = rs // S == (mrs // S) % R
    # s = rs % S == mrs % S
| Filter = te.compute( |
| (channel_chunk, channel_multiplier * kernel_h * kernel_w, channel_block), |
| lambda ffc, mrs, ffb: Filter[ |
| ffc, mrs // (kernel_h * kernel_w), (mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb |
| ], |
| name="packed_filter", |
| ) |
| |
| conv = te.compute( |
| (batch, out_channel_chunk, out_height, out_width, channel_block), |
| lambda nn, ffc, yy, xx, ffb: te.sum( |
| ( |
| temp[ |
| nn, |
| ffc // channel_multiplier, |
| yy * stride_h + ry * dilation_h, |
| xx * stride_w + rx * dilation_w, |
| ffb, |
| ] |
| * Filter[ |
| ffc // channel_multiplier, |
| ((ffc % channel_multiplier) * kernel_h + ry) * kernel_w + rx, |
| ffb, |
| ] |
| ).astype(out_dtype), |
| axis=[ry, rx], |
| ), |
| tag="depthwise_conv2d_nchwc_kcrsk_texture", |
| ) |
| return te.compute( |
| conv.shape, lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype("float32") |
| ) |
| |
| |
| def schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output): |
| """schedule optimized for batch size = 1""" |
| |
| conv = output.op.input_tensors[0] |
| |
| ##### space definition begin ##### |
| n, fc, y, x, fb = s[conv].op.axis |
| ry, rx = s[conv].op.reduce_axis |
| cfg.define_split("tile_fc", fc, num_outputs=4) |
| cfg.define_split("tile_y", y, num_outputs=4) |
| cfg.define_split("tile_x", x, num_outputs=4) |
| cfg.define_split("tile_ry", ry, num_outputs=2) |
| cfg.define_split("tile_rx", rx, num_outputs=2) |
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    ##### space definition end #####
| |
| pad_data, flattened_kernel = s[conv].op.input_tensors |
| kernel = s[flattened_kernel].op.input_tensors[0] |
| s[flattened_kernel].compute_inline() |
| |
| s[pad_data].compute_inline() |
| if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: |
| s[kernel].compute_inline() |
| kernel = flattened_kernel |
| |
| if conv.op in s.outputs: |
| output = conv |
| OL = s.cache_write(conv, "local") |
| else: |
| output = s.outputs[0].output(0) |
| s[conv].set_scope("local") |
| OL = conv |
| |
| # create cache stage |
| AT = s.cache_read(pad_data, "global.texture", [OL]) |
| WT = s.cache_read(kernel, "global.texture", [OL]) |
| |
| def copy_to_texture(stage): |
| axes = s[stage].op.axis |
| fused = s[stage].fuse(*axes[:-1]) |
| block, thread = s[stage].split(fused, factor=32) |
| s[stage].vectorize(axes[-1]) |
| s[stage].bind(block, te.thread_axis("blockIdx.x")) |
| s[stage].bind(thread, te.thread_axis("threadIdx.x")) |
| |
| copy_to_texture(AT) |
| copy_to_texture(WT) |
| |
| AA = s.cache_read(AT, "shared", [OL]) |
| WW = s.cache_read(WT, "shared", [OL]) |
| |
| # tile and bind spatial axes |
| n, fc, y, x, fb = s[output].op.axis |
| |
| kernel_scope, n = s[output].split(n, nparts=1) |
| |
| bf, vf, tf, fi = cfg["tile_fc"].apply(s, output, fc) |
| by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) |
| bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) |
| |
| bf = s[output].fuse(n, bf) |
| s[output].bind(bf, te.thread_axis("blockIdx.z")) |
| s[output].bind(by, te.thread_axis("blockIdx.y")) |
| s[output].bind(bx, te.thread_axis("blockIdx.x")) |
| s[output].bind(vf, te.thread_axis("vthread")) |
| s[output].bind(vy, te.thread_axis("vthread")) |
| s[output].bind(vx, te.thread_axis("vthread")) |
| s[output].bind(tf, te.thread_axis("threadIdx.z")) |
| s[output].bind(ty, te.thread_axis("threadIdx.y")) |
| s[output].bind(tx, te.thread_axis("threadIdx.x")) |
| s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi, fb) |
| s[output].vectorize(fb) |
| |
| s[OL].compute_at(s[output], tx) |
| |
| # tile reduction axes |
| n, fc, y, x, fb = s[OL].op.axis |
| |
| ry, rx = s[OL].op.reduce_axis |
| ryo, ryi = cfg["tile_ry"].apply(s, OL, ry) |
| rxo, rxi = cfg["tile_rx"].apply(s, OL, rx) |
| |
| s[OL].reorder(ryo, rxo, ryi, rxi, n, fc, y, x, fb) |
| s[OL].vectorize(fb) |
| |
| s[AA].compute_at(s[OL], rxo) |
| s[WW].compute_at(s[OL], rxo) |
| # cooperative fetching |
| for load in [AA, WW]: |
| if load == WW: |
| n, fyx, v = s[load].op.axis |
| fused = s[load].fuse(n, fyx) |
| else: |
| n, f, y, x, v = s[load].op.axis |
| fused = s[load].fuse(n, f, y, x) |
| tz, fused = s[load].split(fused, nparts=cfg["tile_fc"].size[2]) |
| ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) |
| tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) |
| s[load].bind(tz, te.thread_axis("threadIdx.z")) |
| s[load].bind(ty, te.thread_axis("threadIdx.y")) |
| s[load].bind(tx, te.thread_axis("threadIdx.x")) |
| s[load].vectorize(v) |
| |
| # unroll |
| s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) |
| |
| N, OCC, OH, OW, OCB = get_const_tuple(output.shape) |
| ICC, MKHKW, ICB = get_const_tuple(kernel.shape) |
| M = (OCC * OCB) // (ICC * ICB) |
| KHKW = MKHKW // M |
| |
| if isinstance(N, int): |
| cfg.add_flop(2 * N * OH * OW * OCC * OCB * KHKW) |
| |
| |
def scheduler(compute, schedule, *args, **kwargs):
    """Create the placeholders for `compute` and schedule them with `schedule`."""
| placeholders = compute(*args) |
| s = schedule(*placeholders, **kwargs) |
| return s, placeholders |
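

# Usage sketch for the helper above (ours; assumes an OpenCL-enabled build and
# is never collected by pytest): scheduler() pairs a compute function with its
# schedule so callers can create and build both in one step.
def _example_scheduler_usage():
    s, (X, Y) = scheduler(compute_plus_one_rank3, schedule_plus_one_rank3, (32, 32, 4))
    return tvm.driver.build(s, [X, Y], target="opencl", name="plus_one")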
| |
| |
| def conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape): |
| placeholders = compute_conv2d_1x1_NCHWc_RSCKk(input_shape, filter_shape) |
| s = schedule_conv2d_1x1_NCHWc_RSCKk(*placeholders) |
| return s, placeholders |
| |
| |
| def conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape): |
| placeholders = compute_conv2d_1x1_WCHNc_CRSKk(input_shape, filter_shape) |
| s = schedule_conv2d_1x1_WCHNc_CRSKk(*placeholders) |
| return s, (placeholders[0], placeholders[1], placeholders[-1]) |
| |
| |
| def conv2d_NCHWc_KCRSk(input_shape, filter_shape): |
| data = te.placeholder(input_shape, name="data", dtype="float32") |
| filt = te.placeholder(filter_shape, name="filter", dtype="float32") |
| conv = compute_conv2d_NCHWc_KCRSk(data, filt, [1, 1], [0, 0], [1, 1], "float32") |
| cfg = autotvm.get_config() |
    s = te.create_schedule(conv.op)
| schedule_conv2d_NCHWc_KCRSk(cfg, s, conv) |
| return s, (data, filt, conv) |
| |
| |
| def conv2d_NCHWc_KCRSk_fp32_acc(input_shape, filter_shape): |
| data = te.placeholder(input_shape, name="data", dtype="float32") |
| filt = te.placeholder(filter_shape, name="filter", dtype="float32") |
| output = compute_conv2d_NCHWc_KCRSk_acc32(data, filt, [1, 1], [0, 0], [1, 1], "float32") |
| cfg = autotvm.get_config() |
    s = te.create_schedule(output.op)
| schedule_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) |
| return s, (data, filt, output) |
| |
| |
| def depthwise_conv2d_NCHWc_KCRSk_acc32(input_shape, filter_shape): |
| data = te.placeholder(input_shape, name="data", dtype="float32") |
| filt = te.placeholder(filter_shape, name="filter", dtype="float32") |
| output = compute_depthwise_conv2d_NCHWc_KCRSk_acc32( |
| data, filt, [1, 1], [0, 0], [1, 1], "float32" |
| ) |
| cfg = autotvm.get_config() |
    s = te.create_schedule(output.op)
| schedule_depthwise_conv2d_NCHWc_KCRSk_acc32(cfg, s, output) |
| return s, (data, filt, output) |
| |
| |
| def ref_convolution(data, kernel, stride, pad): |
    # Imported lazily so that mxnet is only required when computing references.
    import mxnet as mx
| |
| groups = 1 |
| kernel_size = (kernel.shape[2], kernel.shape[3]) |
| num_filter = kernel.shape[0] |
| ref_res = mx.nd.Convolution( |
| data=mx.nd.array(data), |
| weight=mx.nd.array(kernel), |
| bias=None, |
| no_bias=True, |
| kernel=kernel_size, |
| stride=stride, |
| pad=pad, |
| num_filter=num_filter, |
| num_group=groups, |
| ) |
| return ref_res.asnumpy() |
| |
| |
| def ref_depthwise_convolution(data, kernel, stride, pad): |
    # Imported lazily so that mxnet is only required when computing references.
    import mxnet as mx
| |
| groups = kernel.shape[0] |
| kernel_size = (kernel.shape[2], kernel.shape[3]) |
| num_filter = kernel.shape[0] |
| multiplier = kernel.shape[1] |
| ref_res = mx.nd.Convolution( |
| data=mx.nd.array(data), |
| weight=mx.nd.array(kernel), |
| bias=None, |
| no_bias=True, |
| kernel=kernel_size, |
| stride=stride, |
| pad=pad, |
| num_filter=num_filter, |
| num_group=groups, |
| ) |
| return ref_res.asnumpy() |
| |
| |
def validate(workload, target, dev, input_shapes, *args, **kwargs):
    """Build the workload, run it, and compare against a numpy reference."""
| s, placeholders = workload(*input_shapes, *args, **kwargs) |
| func = tvm.driver.build(s, [*placeholders], target=target, name="TestFunction") |
| |
| args_tvm = [] |
| args_np = [] |
| for var in placeholders[:-1]: |
| var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype) |
| args_np.append(var_np) |
| args_tvm.append(tvm.nd.array(var_np, dev)) |
| args_tvm.append( |
| tvm.nd.array( |
| np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev |
| ) |
| ) |
| func(*args_tvm) |
| |
| if "plus_one" in workload.__name__: |
| np_result = args_np[0] + 1.0 |
| elif "matmul" in workload.__name__: |
| if "inner" in workload.__name__: |
| np_result = np.matmul( |
| args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0) |
| ) |
| elif "accum" in workload.__name__: |
| np_result = np.matmul( |
| args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64) |
| ) |
| else: |
| np_result = np.matmul( |
| args_np[0].transpose((0, 2, 1)).reshape(128, 64), |
| args_np[1].transpose(1, 0, 2).reshape(64, 128), |
| ) |
| elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__: |
| vec_length = args_np[1].shape[-1] |
| # nchwc -> nchw |
| args_np[0] = ( |
| args_np[0] |
| .transpose((0, 1, 4, 2, 3)) |
| .reshape( |
| args_np[0].shape[0], |
| args_np[0].shape[1] * args_np[0].shape[-1], |
| args_np[0].shape[2], |
| args_np[0].shape[3], |
| ) |
| ) |
| # rsckk -> rsck -> kcrs |
| args_np[1] = ( |
| args_np[1] |
| .reshape( |
| args_np[1].shape[0], |
| args_np[1].shape[1], |
| args_np[1].shape[2], |
| args_np[1].shape[3] * args_np[1].shape[4], |
| ) |
| .transpose((3, 2, 0, 1)) |
| ) |
| np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) |
| # nkhw -> nkhwk |
| np_result = np_result.reshape( |
| np_result.shape[0], |
| np_result.shape[1] // vec_length, |
| vec_length, |
| np_result.shape[2], |
| np_result.shape[3], |
| ).transpose(0, 1, 3, 4, 2) |
| elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__: |
| vec_length = args_np[1].shape[-1] |
| # wchnc -> nchw |
| args_np[0] = ( |
| args_np[0] |
| .transpose((3, 1, 4, 2, 0)) |
| .reshape( |
| args_np[0].shape[3], |
| args_np[0].shape[1] * args_np[0].shape[-1], |
| args_np[0].shape[2], |
| args_np[0].shape[0], |
| ) |
| ) |
| # crskk -> crsk -> kcrs |
| args_np[1] = ( |
| args_np[1] |
| .reshape( |
| args_np[1].shape[0], |
| args_np[1].shape[1], |
| args_np[1].shape[2], |
| args_np[1].shape[3] * args_np[1].shape[4], |
| ) |
| .transpose((3, 0, 1, 2)) |
| ) |
| np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) |
| # nkhw -> nkkhw -> wkhnk |
| np_result = np_result.reshape( |
| np_result.shape[0], |
| np_result.shape[1] // vec_length, |
| vec_length, |
| np_result.shape[2], |
| np_result.shape[3], |
| ).transpose(4, 1, 3, 0, 2) |
| elif "NCHW_KCRS" in workload.__name__: |
| np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) |
| elif "NCHWc_KCRSk" in workload.__name__: |
| vec_length = args_np[1].shape[-1] |
| # nchwc -> nchw |
| args_np[0] = ( |
| args_np[0] |
| .transpose((0, 1, 4, 2, 3)) |
| .reshape( |
| args_np[0].shape[0], |
| args_np[0].shape[1] * args_np[0].shape[-1], |
| args_np[0].shape[2], |
| args_np[0].shape[3], |
| ) |
| ) |
| # kcrsk/cmrsc -> kcrs/cmrs |
| args_np[1] = ( |
| args_np[1] |
| .transpose((0, 4, 1, 2, 3)) |
| .reshape( |
| args_np[1].shape[0] * args_np[1].shape[4], |
| args_np[1].shape[1], |
| args_np[1].shape[2], |
| args_np[1].shape[3], |
| ) |
| ) |
| if "depthwise" in workload.__name__: |
| # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID") |
| np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], []) |
| else: |
| # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0) |
| np_result = ref_convolution(args_np[0], args_np[1], [], []) |
| # nkhw -> nkhwk |
| np_result = np_result.reshape( |
| np_result.shape[0], |
| np_result.shape[1] // vec_length, |
| vec_length, |
| np_result.shape[2], |
| np_result.shape[3], |
| ).transpose(0, 1, 3, 4, 2) |
    np.testing.assert_allclose(args_tvm[-1].numpy(), np_result, rtol=1e-2, atol=1e-2)
| |
| |
| class BaseSingleShapeValidator: |
| @tvm.testing.parametrize_targets("opencl") |
| def test_unary(self, test_func, input_shape, target, dev): |
| validate(test_func, target, dev, [input_shape]) |
| |
| |
| class TestPlusOneRank3(BaseSingleShapeValidator): |
| input_shape = tvm.testing.parameter((32, 32, 4)) |
| |
| def plus_one(input_shape): |
| return scheduler(compute_plus_one_rank3, schedule_plus_one_rank3, input_shape) |
| |
| test_func = tvm.testing.parameter(plus_one) |
| |
| |
| class TestPlusOneRank5(BaseSingleShapeValidator): |
| input_shape = tvm.testing.parameter((32, 2, 4, 4, 4)) |
| |
| def plus_one(input_shape): |
| return scheduler(compute_plus_one_rank5, schedule_plus_one_rank5, input_shape) |
| |
| test_func = tvm.testing.parameter(plus_one) |
| |
| |
| class TestMatmul: |
| input_shape = tvm.testing.parameter((32, 64, 4)) |
| local = tvm.testing.parameter(False, True) |
| |
| def matmul(input_shape, local): |
| return scheduler(compute_matmul, schedule_matmul, input_shape, local=local) |
| |
| def matmul_inner(input_shape, local): |
| return scheduler(compute_matmul_inner, schedule_matmul_inner, input_shape, local=local) |
| |
| test_func = tvm.testing.parameter(matmul, matmul_inner) |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_matmul(self, test_func, input_shape, local, target, dev): |
| validate(test_func, target, dev, [input_shape], local=local) |
| |
| |
| class TestMatmulVectorAccumulator: |
| shapeA = tvm.testing.parameter((32, 64, 4)) |
| shapeB = tvm.testing.parameter((128, 16, 4)) |
| local = tvm.testing.parameter(False, True) |
| |
| def matmul_vector_accumulator(shapeA, shapeB, local): |
| return scheduler( |
| compute_matmul_vector_accumulator, |
| schedule_matmul_vector_accumulator, |
| shapeA, |
| shapeB, |
| local=local, |
| ) |
| |
| test_func = tvm.testing.parameter(matmul_vector_accumulator) |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_matmul_vec_acc(self, test_func, shapeA, shapeB, local, target, dev): |
| validate(test_func, target, dev, [shapeA, shapeB], local=local) |
| |
| |
| class BaseConv2DValidator: |
| @tvm.testing.parametrize_targets("opencl") |
| def test_conv2d(self, test_func, input_shapes, target, dev): |
| validate(test_func, target, dev, input_shapes) |
| |
| |
| class TestConv2dNCHWcRSCKk(BaseConv2DValidator): |
| input_shapes = tvm.testing.parameter([(1, 32, 56, 56, 4), (1, 1, 128, 32, 4)]) |
| test_func = tvm.testing.parameter(conv2d_1x1_NCHWc_RSCKk) |
| |
| |
| class TestConv2dWCHNcCRSKk(BaseConv2DValidator): |
| input_shapes = tvm.testing.parameter([(56, 32, 56, 1, 4), (128, 1, 1, 32, 4)]) |
| test_func = tvm.testing.parameter(conv2d_1x1_WCHNc_CRSKk) |
| |
| |
| class TestConv2dNCHWcKCRSk(BaseConv2DValidator): |
| input_shapes = tvm.testing.parameter( |
| [(1, 32, 56, 56, 4), (32, 128, 1, 1, 4)], [(1, 32, 112, 112, 4), (32, 128, 3, 3, 4)] |
| ) |
| test_func = tvm.testing.parameter(conv2d_NCHWc_KCRSk, conv2d_NCHWc_KCRSk_fp32_acc) |
| |
| |
| class TestDepthwiseConv2dNCHWcKCRSk(BaseConv2DValidator): |
| input_shapes = tvm.testing.parameter([(1, 24, 257, 257, 4), (24, 1, 3, 3, 4)]) |
| test_func = tvm.testing.parameter(depthwise_conv2d_NCHWc_KCRSk_acc32) |
| |
| |
| def simple_texture_to_scalar_common( |
| target, input_info, output_info, find_patterns, dtype, cast_type |
| ): |
| def _compute(): |
| p0 = te.placeholder(input_info[1], name="p0", dtype=dtype) |
| p0_comp = te.compute(input_info[1], lambda *i: p0(*i), name="p0_comp") |
| if len(output_info[1]) == 4 and len(input_info[1]) == 5: |
| out = te.compute( |
| output_info[1], |
| lambda n, c, h, w: p0_comp[n][c // 4][h][w][c % 4].astype(cast_type), |
| name="out", |
| ) |
| elif len(output_info[1]) == 5 and len(input_info[1]) == 5: |
| out = te.compute( |
| output_info[1], |
| lambda n, c, h, w, cb: p0_comp[n][c][h][w][cb].astype(cast_type), |
| name="out", |
| ) |
| else: |
| raise Exception("Impossible case") |
| dummy_out = te.compute(output_info[1], lambda *i: out(*i), name="dummy_out") |
| return p0, dummy_out |
| |
| def _schedule(dummy_out): |
| from tvm.topi.adreno.utils import bind_data_copy |
| |
| s = te.create_schedule(dummy_out.op) |
| out = s[dummy_out].op.input_tensors[0] |
| p0_comp = s[out].op.input_tensors[0] |
| s[p0_comp].set_scope(input_info[0]) |
| bind_data_copy(s[p0_comp]) |
| s[out].set_scope(output_info[0]) |
| bind_data_copy(s[out]) |
| bind_data_copy(s[dummy_out]) |
| return s |
| |
| p0, dummy_out = _compute() |
| s = _schedule(dummy_out) |
| |
| fun = tvm.build(s, [p0, dummy_out], target) |
| dev = tvm.device(target, 0) |
| opencl_source = fun.imported_modules[0].get_source() |
| start_idx = 0 |
| for pattern in find_patterns: |
| start_idx = opencl_source.find(pattern, start_idx) |
        assert start_idx > -1, f"Pattern not found in the generated source: {pattern}"
| |
    input_np = np.random.uniform(size=input_info[1]).astype(dtype)
| input_tvm = tvm.nd.array(input_np, dev) |
| c = tvm.nd.empty(output_info[1], dtype, dev) |
    # Do not run the OpenCL code for FP16: the GPUs in CI do not support FP16 inference.
| if cast_type == "float32": |
| fun(input_tvm, c) |
    # For a rank-5 output the kernel is a plain copy, so accuracy is only checked for rank-4.
    if cast_type == "float32" and len(output_info[1]) == 4:
| np_result = input_np.transpose(0, 2, 3, 1, 4) # NCHW4c -> NHWC4c |
| np_result = np.squeeze(np_result, axis=3) |
| np_result = np_result.transpose(0, 3, 1, 2) # NHWC -> NCHW |
        np.testing.assert_allclose(c.numpy(), np_result, rtol=1e-2, atol=1e-2)
| |
| |
| class TestSimpleTextureToScalarFP16: |
| # (input [scope, shape], output [scope, shape], [find_patterns]) |
| input_info, output_info, find_patterns = tvm.testing.parameters( |
| # 1. Texture (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", |
| "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));", |
| ], |
| ), |
| # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) |
| ( |
| ["", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));" |
| ], |
| ), |
| # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["global.texture", (1, 1, 40, 40, 4)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));", |
| "write_imageh(out, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), (convert_half4(v_)));", |
| ], |
| ), |
| ) |
| dtype = tvm.testing.parameter("float32") |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_simple_texture_to_scalar_fp16( |
| self, input_info, output_info, find_patterns, dtype, target |
| ): |
| simple_texture_to_scalar_common( |
| target, input_info, output_info, find_patterns, dtype, "float16" |
| ) |
| |
| |
| class TestSimpleTextureToScalarFP32: |
| # (input [scope, shape], output [scope, shape], [find_patterns]) |
| input_info, output_info, find_patterns = tvm.testing.parameters( |
| # 1. Texture (NCHW4c) -> Buffer (NCHW) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", |
| "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)];", |
| ], |
| ), |
| # 2. Buffer (NCHW4c) -> Buffer (NCHW) |
| ( |
| ["", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "out[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))];" |
| ], |
| ), |
| ) |
| dtype = tvm.testing.parameter("float32") |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_simple_texture_to_scalar_fp32( |
| self, input_info, output_info, find_patterns, dtype, target |
| ): |
| simple_texture_to_scalar_common( |
| target, input_info, output_info, find_patterns, dtype, "float32" |
| ) |
| |
| |
| def texture_to_scalar_reuse_ssa_common( |
| target, input_info, output_info, find_patterns, dtype, cast_type |
| ): |
| def _compute(): |
| p0 = te.placeholder(input_info[1], name="p0", dtype=dtype) |
| p0_comp = te.compute(input_info[1], lambda *i: p0(*i), name="p0_comp") |
| if len(output_info[1]) == 4 and len(input_info[1]) == 5: |
| out = te.compute( |
| output_info[1], |
| lambda n, c, h, w: p0_comp[n][c // 4][h][w][c % 4].astype(cast_type), |
| name="out", |
| ) |
| out2 = te.compute( |
| output_info[1], |
| lambda n, c, h, w: out[n][c][h][w] |
| + p0_comp[n][c // 4][h][w][c % 4].astype(cast_type), |
| name="out", |
| ) |
| elif len(output_info[1]) == 5 and len(input_info[1]) == 5: |
| out = te.compute( |
| output_info[1], |
| lambda n, c, h, w, cb: p0_comp[n][c][h][w][cb].astype(cast_type), |
| name="out", |
| ) |
| out2 = te.compute( |
| output_info[1], |
| lambda n, c, h, w, cb: out[n][c][h][w][cb] |
| + p0_comp[n][c][h][w][cb].astype(cast_type), |
| name="out", |
| ) |
| else: |
            raise ValueError("unsupported combination of input/output ranks")
| out_sum = te.compute(output_info[1], lambda *i: out(*i) + out2(*i), name="out_sum") |
| dummy_out = te.compute(output_info[1], lambda *i: out_sum(*i), name="dummy_out") |
| return p0, dummy_out |
| |
| def _schedule(dummy_out): |
| from tvm.topi.adreno.utils import bind_data_copy |
| |
| s = te.create_schedule(dummy_out.op) |
| out_sum = s[dummy_out].op.input_tensors[0] |
| out, out2 = s[out_sum].op.input_tensors |
| p0_comp = s[out].op.input_tensors[0] |
| s[p0_comp].set_scope(input_info[0]) |
| bind_data_copy(s[p0_comp]) |
| s[out].set_scope(output_info[0]) |
| s[out2].set_scope(output_info[0]) |
| s[out2].compute_inline() |
| s[out].compute_inline() |
| s[out_sum].set_scope(output_info[0]) |
| bind_data_copy(s[out_sum]) |
| bind_data_copy(s[dummy_out]) |
| return s |
| |
| p0, dummy_out = _compute() |
| s = _schedule(dummy_out) |
| |
| fun = tvm.build(s, [p0, dummy_out], target) |
| dev = tvm.device(target, 0) |
| opencl_source = fun.imported_modules[0].get_source() |
| start_idx = 0 |
| for pattern in find_patterns: |
| start_idx = opencl_source.find(pattern, start_idx) |
        assert start_idx > -1, f"Pattern not found in the generated source: {pattern}"
| |
    input_np = np.random.uniform(size=input_info[1]).astype(dtype)
| input_tvm = tvm.nd.array(input_np, dev) |
| c = tvm.nd.empty(output_info[1], dtype, dev) |
    # Do not run the OpenCL code for FP16: the GPUs in CI do not support FP16 inference.
| if cast_type == "float32": |
| fun(input_tvm, c) |
    # For a rank-5 output the kernel is a plain copy, so accuracy is only checked for rank-4.
    if cast_type == "float32" and len(output_info[1]) == 4:
| np_result = input_np * 3 |
| np_result = np_result.transpose(0, 2, 3, 1, 4) # NCHW4c -> NHWC4c |
| np_result = np.squeeze(np_result, axis=3) |
| np_result = np_result.transpose(0, 3, 1, 2) # NHWC -> NCHW |
        np.testing.assert_allclose(c.numpy(), np_result, rtol=1e-2, atol=1e-2)
| |
| |
| class TestTextureToScalarReuseSSAFP16: |
| # (input [scope, shape], output [scope, shape], [find_patterns]) |
| input_info, output_info, find_patterns = tvm.testing.parameters( |
| # 1. Texture (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", |
| "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + ((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) + (convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]))));", |
| ], |
| ), |
| # 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW) |
| ( |
| ["", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| " out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))])) + (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]))));" |
| ], |
| ), |
| # 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["global.texture", (1, 1, 40, 40, 4)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5))));", |
| "write_imageh(out_sum, (int2)(((((convert_int(get_group_id(0))) * 24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), ((convert_half4(v_)) + ((convert_half4(v_)) + (convert_half4(v_)))));", |
| ], |
| ), |
| ) |
| dtype = tvm.testing.parameter("float32") |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_texture_to_scalar_reuse_ssa_fp16( |
| self, input_info, output_info, find_patterns, dtype, target |
| ): |
| texture_to_scalar_reuse_ssa_common( |
| target, input_info, output_info, find_patterns, dtype, "float16" |
| ) |
| |
| |
| class TestTextureToScalarReuseSSAFP32: |
| # (input [scope, shape], output [scope, shape], [find_patterns]) |
| input_info, output_info, find_patterns = tvm.testing.parameters( |
| # 1. Texture (NCHW4c) -> Buffer (NCHW) |
| ( |
| ["global.texture", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "float4 v_ = READ_IMAGEF(p0_comp, image_sampler, ((int2)(((convert_int(get_local_id(0))) % 40), ((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0))) / 40)))));", |
| "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + (((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] + ((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));", |
| ], |
| ), |
| # 2. Buffer (NCHW4c) -> Buffer (NCHW) |
| ( |
| ["", (1, 1, 40, 40, 4)], |
| ["", (1, 4, 40, 40)], |
| [ |
| "out_sum[(((convert_int(get_group_id(0))) * 800) + (convert_int(get_local_id(0))))] = (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + (p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))] + p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]));" |
| ], |
| ), |
| ) |
| dtype = tvm.testing.parameter("float32") |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_texture_to_scalar_reuse_ssa_fp32( |
| self, input_info, output_info, find_patterns, dtype, target |
| ): |
| texture_to_scalar_reuse_ssa_common( |
| target, input_info, output_info, find_patterns, dtype, "float32" |
| ) |
| |
| |
| class TestLocalArrayToTexture: |
| # 1. conv2d(Texture(NCHW4c), Texture(OIHW4o)) -> local_array[4] -> Texture (NCHW4c) |
| input_shape1, input_shape2, output_shape, find_patterns = tvm.testing.parameters( |
| ( |
| (1, 1, 40, 40, 4), |
| (2, 4, 3, 3, 4), |
| (1, 2, 38, 38, 4), |
| [ |
| "float out_local[4];", |
| "float4 v_ = READ_IMAGEF(p1_comp, image_sampler, ((int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), (((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) % 722) / 19))));", |
| "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw, (((((((convert_int(get_group_id(0))) * 32) + ((convert_int(get_local_id(0))) >> 2)) / 361) * 12) + (rcb * 3)) + rh))));", |
| "out_local[cb_c] = (out_local[cb_c] + (((float*)&v_)[rcb] * ((float*)&v__1)[cb_c]));", |
| "write_imagef(out, (int2)(((((convert_int(get_group_id(0))) * 14) + (convert_int(get_local_id(0)))) % 38), ((((convert_int(get_group_id(0))) * 64) + ((convert_int(get_local_id(0))) >> 1)) / 19)), vload4(0, out_local + 0));", |
| ], |
| ), |
| ) |
| dtype = tvm.testing.parameter("float32") |
| |
| @tvm.testing.parametrize_targets("opencl") |
| def test_local_array_to_texture( |
| self, input_shape1, input_shape2, output_shape, find_patterns, dtype, target |
| ): |
| def _compute(): |
| p1 = te.placeholder(input_shape1, name="p1", dtype=dtype) |
| p1_comp = te.compute(input_shape1, lambda *i: p1(*i), name="p1_comp") |
| p2 = te.placeholder(input_shape2, name="p2", dtype=dtype) |
| p2_comp = te.compute(input_shape2, lambda *i: p2(*i), name="p2_comp") |
| KH, KW = input_shape2[2], input_shape2[3] |
| IC, ICB = input_shape1[1], input_shape1[4] |
| rh = te.reduce_axis((0, KH), name="rh") |
| rw = te.reduce_axis((0, KW), name="rw") |
| rc = te.reduce_axis((0, IC), name="rc") |
| rcb = te.reduce_axis((0, ICB), name="rcb") |
| out = te.compute( |
| output_shape, |
| lambda n, c, h, w, cb: te.sum( |
| (p1_comp[n, rc, h, w, rcb] * p2_comp[c, rc * ICB + rcb, rh, rw, cb]).astype( |
| dtype |
| ), |
| axis=[rh, rw, rc, rcb], |
| ), |
| name="out", |
| ) |
| dummy_out = te.compute(output_shape, lambda *i: out(*i), name="dummy_out") |
| return p1, p2, dummy_out |
| |
| def _schedule(dummy_out): |
| from tvm.topi.adreno.utils import bind_data_copy |
| |
| s = te.create_schedule(dummy_out.op) |
| out = s[dummy_out].op.input_tensors[0] |
| p1_comp, p2_comp = s[out].op.input_tensors |
| bind_data_copy(s[p1_comp]) |
| s[p1_comp].set_scope("global.texture") |
| bind_data_copy(s[p2_comp]) |
| s[p2_comp].set_scope("global.texture") |
| OL = s.cache_write(out, "local") |
| n, c, h, w, cb = s[out].op.axis |
| fused = s[out].fuse(n, c, h, w) |
| bx, tx = s[out].split(fused, 128) |
| s[out].reorder(bx, tx, cb) |
| s[out].vectorize(cb) |
| s[out].set_scope("global.texture") |
| s[out].bind(bx, te.thread_axis("blockIdx.x")) |
| s[out].bind(tx, te.thread_axis("threadIdx.x")) |
| s[OL].compute_at(s[out], tx) |
| bind_data_copy(s[dummy_out]) |
| return s |
| |
| p1, p2, dummy_out = _compute() |
| s = _schedule(dummy_out) |
| |
| fun = tvm.build(s, [p1, p2, dummy_out], target) |
| dev = tvm.device(target, 0) |
| opencl_source = fun.imported_modules[0].get_source() |
| start_idx = 0 |
| for pattern in find_patterns: |
| start_idx = opencl_source.find(pattern, start_idx) |
            assert start_idx > -1, f"Pattern not found in the generated source: {pattern}"
| |
        input_np1 = np.random.uniform(size=input_shape1).astype(dtype)
        input_np2 = np.random.uniform(size=input_shape2).astype(dtype)
| input_tvm1 = tvm.nd.array(input_np1, dev) |
| input_tvm2 = tvm.nd.array(input_np2, dev) |
| c = tvm.nd.empty(output_shape, dtype, dev) |
| fun(input_tvm1, input_tvm2, c) |
| |
| |
| if __name__ == "__main__": |
| tvm.testing.main() |