| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import time |
| import gc |
| import sys |
| import mxnet as mx |
| from mxnet.gluon import nn |
| from mxnet.contrib import quantization |
| |
| #shape, num_hidden: |
| sizes = [ |
| (( 1, 224), 512), |
| (( 1, 224), 4096), |
| (( 16, 1024), 1024), |
| (( 32, 4096), 1024), |
| (( 32, 4096), 4096), |
| ((512, 512), 4096)] |
| |
| rounds = 1000 |
| warmup = 10 |
| |
| test_header = "--no_test_header" not in sys.argv |
| table_header = "--no_table_header" not in sys.argv |
| table_left_colums = "--no_size_column" not in sys.argv |
| dump_graph = "--dump_graph" in sys.argv |
| |
| def dump_graph_fn(net, postfix): |
| if dump_graph: |
| net.export("/tmp/fc_add_" + postfix) |
| |
| def operator_string(elemwise_add): |
| return 'elemwise_add' if elemwise_add else 'npi_add' |
| |
| def print_header(header): |
| print("\n") |
| print(header if test_header else "", "\n") |
| if table_header: |
| if table_left_colums: |
| print("| Shape | Hidden | Mean [ms] |" ) |
| print("|------------:|-------:|----------:|" ) |
| else: |
| print(" Mean [ms] |" ) |
| print("----------:|" ) |
| |
| def print_value(shape, hidden, mean): |
| if table_left_colums: |
| print(f"| ({shape[0]:4},{shape[1]:4}) | {hidden:6} | {mean:9.3f} |") |
| else: |
| print(f" {mean:9.3f} |") |
| |
| |
| def measure(net, data0, data1, data2, shape, nhid): |
| mx.nd.waitall() |
| gc.collect() |
| gc.disable() |
| for i in range(rounds + warmup): |
| if i == warmup: |
| start_time = time.time() |
| o = net(data0, data1, data2) |
| o.wait_to_read() |
| end_time = time.time() |
| run_time = (end_time - start_time) |
| print_value(shape, nhid, 1000 * run_time / rounds) |
| gc.enable() |
| |
| |
| class FCWithSum(nn.HybridBlock): |
| def __init__(self, num_in, num_hidden, elemwise_add, **kwargs): |
| super(FCWithSum, self).__init__(**kwargs) |
| self.fc0 = nn.Dense(units=num_hidden, in_units=num_in) |
| self.fc1 = nn.Dense(units=num_hidden) |
| self.elemwise_add = elemwise_add |
| |
| def forward(self, data0, data1, data2): |
| _fc0 = self.fc0(data0) |
| _fc1 = self.fc1(data1) |
| if self.elemwise_add: |
| _sum0 = mx.nd.elemwise_add(data2.as_nd_ndarray(), _fc0.as_nd_ndarray()).as_np_ndarray() |
| _sum1 = mx.nd.elemwise_add(_fc1.as_nd_ndarray(), _sum0.as_nd_ndarray()).as_np_ndarray() |
| else: |
| _sum0 = data2 + _fc0 |
| _sum1 = _fc1 + _sum0 |
| return _sum1 |
| |
| def benchmark_float(elemwise_add, broadcast=False): |
| header = operator_string(elemwise_add) + ', float' + (' , broadcast' if broadcast else "") |
| print_header(header) |
| for shape, nhid in sizes: |
| net = FCWithSum(shape[1], nhid, elemwise_add) |
| net.initialize() |
| net.hybridize(static_alloc=True, static_shape=True) |
| data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0) |
| data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0) |
| shape2 = (shape[0], nhid) |
| if broadcast and not elemwise_add: |
| # broadcast is allowed only for npi_add version |
| shape2 = (1, 1) |
| data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0) |
| net.optimize_for(data0, data1, data2, backend='ONEDNN') |
| measure(net, data0, data1, data2, shape, nhid) |
| dump_graph_fn(net, operator_string(elemwise_add) + '_float') |
| |
| class CalibIter(mx.io.DataIter): |
| def __init__(self, batch, data_shape, batch_size): |
| super(CalibIter, self).__init__(batch_size) |
| self.label_shape = (batch_size,) |
| self.data_shape = data_shape |
| if isinstance(data_shape, tuple): |
| self.provide_data = [('data', data_shape)] |
| else: |
| self.provide_data = data_shape |
| self.provide_label = [] |
| self.batch = batch |
| def __iter__(self): |
| yield self.batch |
| |
| def benchmark_int8(quantize_mode, quantize_granularity, elemwise_add, broadcast = False): |
| header = operator_string(elemwise_add) + ', mode = ' + quantize_mode + \ |
| ', granularity = ' + quantize_granularity + (' , broadcast' if broadcast else "") |
| print_header(header) |
| for shape, nhid in sizes: |
| net = FCWithSum(shape[1], nhid, elemwise_add) |
| net.initialize() |
| net.hybridize(static_alloc=True, static_shape=True) |
| data0 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0) |
| data1 = mx.np.random.uniform(size=shape, low=-1.0, high=1.0) |
| shape2 = (shape[0], nhid) |
| if broadcast and not elemwise_add: |
| # broadcast is allowed only for npi_add |
| shape2 = (shape[0], 1) |
| data2 = mx.np.random.uniform(size=shape2, low=-1.0, high=1.0) |
| data = mx.gluon.data.ArrayDataset(data0, data1, data2) |
| calib_data = mx.gluon.data.DataLoader(data, batch_size=1) |
| net = quantization.quantize_net(net, |
| device=mx.cpu(), |
| exclude_layers=None, |
| exclude_operators=None, |
| calib_mode='naive', |
| calib_data=calib_data, |
| num_calib_batches=1, |
| quantize_mode=quantize_mode, |
| quantize_granularity=quantize_granularity |
| ) |
| net.hybridize(static_alloc=True, static_shape=True) |
| measure(net, data0, data1, data2, shape, nhid) |
| dump_graph_fn(net, operator_string(elemwise_add) + \ |
| '_' + str(quantize_mode) + '_' + str(quantize_granularity)) |
| |
| for elemwise_add in [True, False]: |
| benchmark_float(elemwise_add) |
| |
| for quantize_mode in ['smart', 'full']: |
| for quantize_granularity in ['tensor-wise', 'channel-wise']: |
| for elemwise_add in [True, False]: |
| benchmark_int8(quantize_mode, quantize_granularity, elemwise_add) |
| |
| # Benchmark FC + npi_add with broadcasted input |
| benchmark_float(False, True) |
| |
| # Benchmark quantized FC + npi_add with broadcasted input |
| for quantize_mode in ['smart', 'full']: |
| for quantize_granularity in ['tensor-wise', 'channel-wise']: |
| benchmark_int8(quantize_mode, quantize_granularity, False, True) |