/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/contrib/arm_compute_lib/acl_runtime.cc
 * \brief A simple JSON runtime for Arm Compute Library.
 */
| |
| #include <tvm/ffi/function.h> |
| #include <tvm/ffi/reflection/registry.h> |
| #include <tvm/runtime/tensor.h> |
| |
| #include "../json/json_node.h" |
| #include "../json/json_runtime.h" |
| |
| #ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB |
| #include <arm_compute/core/Types.h> |
| #include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h> |
| #include <arm_compute/runtime/NEON/functions/NEConcatenateLayer.h> |
| #include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h> |
| #include <arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h> |
| #include <arm_compute/runtime/NEON/functions/NEElementwiseOperations.h> |
| #include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h> |
| #include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h> |
| #include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h> |
| |
| #include "acl_allocator.h" |
| #include "acl_utils.h" |
| #endif |
| |
| namespace tvm { |
| namespace runtime { |
| namespace contrib { |
| |
| using namespace tvm::runtime::json; |
| |
class ACLRuntime : public JSONRuntimeBase {
 public:
  /*!
   * \brief The ACL runtime module. Deserialize the provided functions
   * on creation and store in the layer cache.
   *
   * \param symbol_name The name of the function.
   * \param graph_json serialized JSON representation of a sub-graph.
   * \param const_names The names of each constant in the sub-graph.
   */
  explicit ACLRuntime(const std::string& symbol_name, const std::string& graph_json,
                      const ffi::Array<ffi::String>& const_names)
      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}

  /*!
   * \brief The type key of the module.
   *
   * \return module type key.
   */
  const char* kind() const override { return "arm_compute_lib"; }

  /*!
   * \brief Initialize runtime. Create ACL layer from JSON
   * representation.
   *
   * \param consts The constant params from compiled model.
   */
  void Init(const ffi::Array<Tensor>& consts) override {
    ICHECK_EQ(consts.size(), const_idx_.size())
        << "The number of input constants must match the number of required.";
    // Bind constant data first so BuildEngine can read const entries when
    // constructing the ACL tensors.
    SetupConstants(consts);
    BuildEngine();
  }

#ifdef TVM_GRAPH_EXECUTOR_ARM_COMPUTE_LIB
  /*!
   * \brief Unpack inputs and outputs and run inference on a given layer.
   *
   * \param args Access inputs and outputs.
   * \param function The layer to execute inference on.
   * \return Status of inference.
   */
  void Run() override {
    // Bind the current TVM input buffers to the cached ACL input tensors.
    // import_memory points the ACL tensor at the TVM buffer, so no copy is made.
    for (size_t nid_idx = 0; nid_idx < input_nodes_.size(); ++nid_idx) {
      auto nid = input_nodes_[nid_idx];
      if (nodes_[nid].GetOpType() == "input") {
        for (uint32_t eid_idx = 0; eid_idx < nodes_[nid].GetNumOutput(); eid_idx++) {
          uint32_t eid = EntryID(nid, eid_idx);
          void* data = data_entry_[eid]->data;
          auto key = std::pair<uint32_t, uint32_t>(nid, eid_idx);
          // Operators that populate json_inputid_to_layer_inputid (e.g.
          // concatenate) map (node id, output index) to an explicit ACL input
          // slot; otherwise fall back to the positional input index.
          if (layer_.json_inputid_to_layer_inputid.count(key) > 0) {
            CheckACLError(
                layer_.inputs[layer_.json_inputid_to_layer_inputid[key]].allocator()->import_memory(
                    data));
          } else {
            CheckACLError(layer_.inputs[nid_idx].allocator()->import_memory(data));
          }
        }
      }
    }

    // Likewise point the ACL output tensors at the TVM output buffers.
    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);
      void* data = data_entry_[eid]->data;
      CheckACLError(layer_.outputs[i].allocator()->import_memory(data));
    }

    this->layer_.function->run();
  }

 private:
  /*!
   * \brief Build ACL layer from JSON representation and cache.
   *
   * \note For the time being only one layer or operator is supported
   * per engine.
   */
  void BuildEngine() {
    std::shared_ptr<arm_compute::MemoryManagerOnDemand> mm = MakeACLMemoryManager();
    // Number of memory pools requested: only conv/depthwise-conv/dense layers
    // take the memory manager and therefore need auxiliary memory.
    int num_pools = 0;
    bool found_kernel_node = false;
    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      // A second kernel node is unsupported: the cache holds a single function.
      if (found_kernel_node) {
        LOG(FATAL)
            << "Arm Compute Library runtime module only supports one kernel node per function.";
      }
      if (node.GetOpType() == "kernel") {
        found_kernel_node = true;
        auto op_name = node.GetOpName();
        if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) {
          CreateConvolution2DLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) {
          CreateDepthwiseConvolution2DLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.dense" == op_name || "qnn.dense" == op_name) {
          CreateFullyConnectedLayer(&layer_, node, mm);
          num_pools++;
        } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
                   "nn.l2_pool2d" == op_name) {
          CreatePoolingLayer(&layer_, node);
        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
          CreateGlobalPoolingLayer(&layer_, node);
        } else if ("reshape" == op_name) {
          CreateReshapeLayer(&layer_, node);
        } else if ("maximum" == op_name) {
          CreateMaximumLayer(&layer_, node);
        } else if ("add" == op_name || "qnn.add" == op_name) {
          CreateAddLayer(&layer_, node);
        } else if ("concatenate" == op_name) {
          CreateConcatenateLayer(&layer_, node);
        } else {
          LOG(FATAL) << "Unsupported op: " << op_name;
        }
      }
    }
    // NOTE(review): if the sub-graph contained no kernel node, layer_.function
    // is null here and prepare() would dereference it — confirm the codegen
    // always emits exactly one kernel node.
    this->layer_.function->prepare();
    if (num_pools > 0) mm->populate(this->allocator_, num_pools);
  }

  /*!
   * \brief ACL objects we cache in order to avoid needing to construct
   * a new layer each time.
   */
  struct CachedLayer {
    // The configured ACL function; run() executes it with whatever buffers
    // are currently imported into inputs/outputs.
    std::shared_ptr<arm_compute::IFunction> function;
    std::vector<arm_compute::Tensor> inputs;
    std::vector<arm_compute::Tensor> outputs;
    // maps the input index of JSON node to the index of the ACL layer's inputs
    // this is optional (i.e.only when an operator uses the eid index)
    std::map<std::pair<uint32_t, uint32_t>, uint32_t> json_inputid_to_layer_inputid;
  };

  /*!
   * \brief Create an ACL tensor given the JSON representation. If scale
   * and offset are given, then create a quantized ACL tensor.
   *
   * \param tensor The tensor to represent.
   * \param scale (optional) The scale of the tensor as an input.
   * \param offset (optional) The offset of the tensor as an input.
   * \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
   * setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
   * _num_dimensions should be 3 rather than 1.
   * \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
   * dimensions of the shape.
   * \return ACL Tensor.
   */
  arm_compute::Tensor MakeACLTensorFromJSONEntry(const JSONGraphNodeEntry& tensor,
                                                 JSONGraphNodeEntry* scale = nullptr,
                                                 JSONGraphNodeEntry* offset = nullptr,
                                                 bool apply_dim_correction = true,
                                                 bool increase_dim_unit = true) {
    JSONGraphNode node = nodes_[tensor.id_];
    void* node_data = nullptr;
    // Only constants have their data bound at build time; runtime inputs get
    // their buffers imported in Run().
    if (node.GetOpType() == "const") {
      node_data = data_entry_[EntryID(tensor)]->data;
    }
    return MakeACLTensorFromJSONNode(node, scale, offset, node_data, apply_dim_correction,
                                     increase_dim_unit, tensor.index_);
  }

  /*!
   * \brief Create an ACL tensor given the JSON representation. If scale
   * and offset are given, then create a quantized ACL tensor.
   *
   * \param node The tensor to represent.
   * \param scale (optional) The scale of the tensor as an input.
   * \param offset (optional) The offset of the tensor as an input.
   * \param data (optional) Constant data of input node.
   * \param apply_dim_correction (Optional) Flag to state whether apply dimension correction after
   * setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but
   * _num_dimensions should be 3 rather than 1.
   * \param increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of
   * dimensions of the shape.
   * \param entry_index The entry index.
   * \return ACL Tensor.
   */
  arm_compute::Tensor MakeACLTensorFromJSONNode(
      const JSONGraphNode& node, JSONGraphNodeEntry* scale = nullptr,
      JSONGraphNodeEntry* offset = nullptr, void* data = nullptr, bool apply_dim_correction = true,
      bool increase_dim_unit = true, uint32_t entry_index = 0) {
    const DLTensor* scale_data = nullptr;
    const DLTensor* offset_data = nullptr;
    // Quantization parameters are only used when both scale and offset are
    // supplied; a lone scale or offset is ignored.
    if (scale && offset) {
      scale_data = data_entry_[EntryID(*scale)];
      offset_data = data_entry_[EntryID(*offset)];
    }
    return MakeACLTensor(node, data, scale_data, offset_data, apply_dim_correction,
                         increase_dim_unit, entry_index);
  }

  /*!
   * \brief Create a 2D convolution layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
   */
  void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
                                const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);

    int groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
    ICHECK(groups == 1) << "Arm Compute Library NEON convolution only supports group size of 1.";

    // Optional fused activation (e.g. relu) recorded by the codegen.
    arm_compute::ActivationLayerInfo act_info;
    if (node.HasAttr("activation_type")) {
      std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
      act_info = MakeACLActivationInfo(activation_type);
    }

    // NOTE(review): ACL Size2D is (width, height); confirm dilation[0]/[1]
    // ordering matches the attribute layout emitted by the codegen.
    arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));

    // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    bool has_bias;
    if (node.GetOpName() == "qnn.conv2d") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
      has_bias = num_inputs == 9;
      // qnn input layout (per the positions used below): [0] data, [1] weights,
      // [2]/[3] input/kernel zero points, [4]/[5] input/kernel scales,
      // [6] optional bias, then output scale and zero point.
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
      has_bias = num_inputs == 3;
      for (const auto& i : inputs) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    auto function = std::make_shared<arm_compute::NEConvolutionLayer>(mm);
    function->configure(&layer->inputs[0], &layer->inputs[1],
                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info,
                        arm_compute::WeightsInfo(), dilation_2d, act_info);
    layer->function = function;
  }

  /*!
   * \brief Create a 2D depthwise convolution layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL conv2d layer can request auxiliary memory from TVM.
   */
  void CreateDepthwiseConvolution2DLayer(
      CachedLayer* layer, const JSONGraphNode& node,
      const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides);

    // Optional fused activation (e.g. relu) recorded by the codegen.
    arm_compute::ActivationLayerInfo act_info;
    if (node.HasAttr("activation_type")) {
      std::string activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
      act_info = MakeACLActivationInfo(activation_type);
    }

    arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1]));

    // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    bool has_bias;
    if (node.GetOpName() == "qnn.depthwise_conv2d") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized convolution requires 9 inputs with a bias, 8 inputs without.";
      has_bias = num_inputs == 9;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Convolution requires 3 inputs with a bias, 2 inputs without.";
      has_bias = num_inputs == 3;
      for (const auto& i : inputs) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    // Depth multiplier is the final dimension in acl weights tensor (IWH*M*)
    int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3];

    auto function = std::make_shared<arm_compute::NEDepthwiseConvolutionLayer>(mm);
    function->configure(&layer->inputs[0], &layer->inputs[1],
                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info,
                        depth_multiplier, act_info, dilation_2d);
    layer->function = function;
  }

  /*!
   * \brief Create a fully connected (dense) layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   * \param mm The ACL fully connected layer can request auxiliary memory from TVM.
   */
  void CreateFullyConnectedLayer(CachedLayer* layer, const JSONGraphNode& node,
                                 const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& mm) {
    arm_compute::FullyConnectedLayerInfo fc_info;
    fc_info.set_weights_trained_layout(arm_compute::DataLayout::NHWC);

    // Collect inputs and outputs, handling both nn.dense and qnn.dense cases.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
    size_t num_inputs = inputs.size();
    bool has_bias;
    if (node.GetOpName() == "qnn.dense") {
      ICHECK(num_inputs >= 8U && num_inputs <= 9U)
          << "Quantized fully connected (dense) layer requires 9 inputs with a bias, 8 inputs "
             "without.";
      has_bias = num_inputs == 9;
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3]));
      if (has_bias) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6]));
      }
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias]));
    } else {
      ICHECK(num_inputs >= 2U && num_inputs <= 3U)
          << "Fully connected (dense) layer requires 3 inputs with a bias, 2 inputs without.";
      has_bias = num_inputs == 3;
      for (const auto& i : inputs) {
        layer->inputs.push_back(MakeACLTensorFromJSONEntry(i));
      }
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    }

    auto function = std::make_shared<arm_compute::NEFullyConnectedLayer>(mm);
    function->configure(&layer->inputs[0], &layer->inputs[1],
                        has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], fc_info);
    layer->function = function;
  }

  /*!
   * \brief Create a pooling layer.
   *
   * \note Currently max_pool2d, avg_pool2d and L2 pooling are supported.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    bool ceil_mode = std::stoi(node.GetAttr<std::vector<std::string>>("ceil_mode")[0]);
    arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides, ceil_mode);
    auto attr_pool_size = node.GetAttr<std::vector<std::string>>("pool_size");
    int pool_size_h = std::stoi(attr_pool_size[0]);
    int pool_size_w = std::stoi(attr_pool_size[1]);

    // Only applies to average pool and l2 pool.
    // ACL exclude pad option is inverse to Relays include pad option.
    bool exclude_pad = false;
    if (node.HasAttr("count_include_pad")) {
      int count_include_pad =
          std::stoi(node.GetAttr<std::vector<std::string>>("count_include_pad")[0]);
      exclude_pad = !count_include_pad;
    }

    arm_compute::PoolingType pool_type;
    if (node.GetOpName() == "nn.max_pool2d") {
      pool_type = arm_compute::PoolingType::MAX;
    } else if (node.GetOpName() == "nn.avg_pool2d") {
      pool_type = arm_compute::PoolingType::AVG;
    } else if (node.GetOpName() == "nn.l2_pool2d") {
      pool_type = arm_compute::PoolingType::L2;
    } else {
      LOG(FATAL) << "Pooling type not supported";
    }

    ICHECK(dilation.size() == 2 && dilation[0] == "1" && dilation[1] == "1")
        << "Dilation other than (1, 1) not supported";
    // NOTE(review): ACL Size2D is (width, height); confirm passing
    // (pool_size_h, pool_size_w) here matches the intended attribute order.
    arm_compute::PoolingLayerInfo pool_info =
        arm_compute::PoolingLayerInfo(pool_type, arm_compute::Size2D(pool_size_h, pool_size_w),
                                      arm_compute::DataLayout::NHWC, pad_stride_info, exclude_pad);

    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    auto function = std::make_shared<arm_compute::NEPoolingLayer>();
    function->configure(&layer->inputs[0], &layer->outputs[0], pool_info);
    layer->function = function;
  }

  /*!
   * \brief Create a global pooling layer.
   *
   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node) {
    arm_compute::PoolingType pool_type;
    if (node.GetOpName() == "nn.global_max_pool2d") {
      pool_type = arm_compute::PoolingType::MAX;
    } else if (node.GetOpName() == "nn.global_avg_pool2d") {
      pool_type = arm_compute::PoolingType::AVG;
    } else {
      LOG(FATAL) << "Pooling type not supported";
    }

    // The two-argument PoolingLayerInfo constructor configures global pooling
    // (pool size spans the full spatial dimensions).
    arm_compute::PoolingLayerInfo pool_info =
        arm_compute::PoolingLayerInfo(pool_type, arm_compute::DataLayout::NHWC);

    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));

    auto function = std::make_shared<arm_compute::NEPoolingLayer>();
    function->configure(&layer->inputs[0], &layer->outputs[0], pool_info);
    layer->function = function;
  }

  /*!
   * \brief Create a reshape layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node) {
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    auto function = std::make_shared<arm_compute::NEReshapeLayer>();
    function->configure(&layer->inputs[0], &layer->outputs[0]);
    layer->function = function;
  }

  /*!
   * \brief Create a maximum layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateMaximumLayer(CachedLayer* layer, const JSONGraphNode& node) {
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
    layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1]));
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    auto function = std::make_shared<arm_compute::NEElementwiseMax>();
    function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]);
    layer->function = function;
  }
  /*!
   * \brief Creates an add/qnn.add layer
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateAddLayer(CachedLayer* layer, const JSONGraphNode& node) {
    auto op_name = node.GetOpName();
    if ("add" == op_name) {
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1]));
      layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    } else if ("qnn.add" == op_name) {
      // qnn.add inputs: [0]/[1] operands, [2]-[5] per-operand scale/offset,
      // [6]/[7] output scale/offset.
      // NOTE(review): if GetInputs() returns by value, the element pointers
      // below are only valid for the duration of each full call expression —
      // which is the case here, but hoisting the vector would be safer.
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0], &node.GetInputs()[2],
                                                         &node.GetInputs()[3]));
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1], &node.GetInputs()[4],
                                                         &node.GetInputs()[5]));
      layer->outputs.push_back(
          MakeACLTensorFromJSONNode(node, &node.GetInputs()[6], &node.GetInputs()[7]));
    } else {
      LOG(FATAL) << "Unsupported form of add op: " + op_name;
    }

    auto f = std::make_shared<arm_compute::NEArithmeticAddition>();

    // SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 always saturates result
    f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0],
                 arm_compute::ConvertPolicy::SATURATE);
    layer->function = f;
  }

  /*!
   * \brief Create a Concatenate layer.
   *
   * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function.
   * \param node The JSON representation of the operator.
   */
  void CreateConcatenateLayer(CachedLayer* layer, const JSONGraphNode& node) {
    std::vector<std::string> axis = node.GetAttr<std::vector<std::string>>("axis");
    std::vector<const arm_compute::ITensor*> inputs;
    for (auto input : node.GetInputs()) {
      // apply_dim_correction is disabled so unit dimensions are preserved.
      layer->inputs.push_back(MakeACLTensorFromJSONEntry(input, nullptr, nullptr, false));
      // Record the (node id, output index) -> ACL input slot mapping so Run()
      // can import the right buffer for each concatenated operand.
      layer->json_inputid_to_layer_inputid[std::pair<uint32_t, uint32_t>(input.id_, input.index_)] =
          layer->inputs.size() - 1;
    }
    for (size_t i = 0; i < layer->inputs.size(); i++) {
      inputs.push_back(&layer->inputs[i]);
    }
    layer->outputs.push_back(MakeACLTensorFromJSONNode(node));
    int dimNum = layer->inputs[0].info()->num_dimensions();
    auto function = std::make_shared<arm_compute::NEConcatenateLayer>();
    // the shape of input tensor will be reversed after passing to ACL
    // for example a tensor with shape [1, 2, 3, 4] will be changed to
    // [4, 3, 2, 1] at ACL side. So the axis here should be preprocessed.
    auto a = std::stoi(axis[0]);
    function->configure(inputs, &layer->outputs[0], a < 0 ? -a - 1 : dimNum - a - 1);
    layer->function = function;
  }

  /*! \brief Allow ACL functions to request auxiliary memory from TVM. */
  ACLAllocator allocator_;
  /*!
   * \brief The network layers represented by acl functions.
   * \note Currently only supports a single layer.
   */
  CachedLayer layer_;
#else
  // Stub used when the ACL runtime is not compiled in: executing is a hard
  // error, since there is no engine to run.
  void Run() override {
    LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. "
               << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR.";
  }

  // Stub used when the ACL runtime is not compiled in: building only warns, so
  // modules can still be loaded/inspected on non-ACL hosts.
  void BuildEngine() {
    LOG(WARNING) << "Arm Compute Library engine is not initialized. "
                 << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR.";
  }
#endif
};
| ffi::Module ACLRuntimeCreate(const ffi::String& symbol_name, const ffi::String& graph_json, |
| const ffi::Array<ffi::String>& const_names) { |
| auto n = ffi::make_object<ACLRuntime>(symbol_name, graph_json, const_names); |
| return ffi::Module(n); |
| } |
| |
| TVM_FFI_STATIC_INIT_BLOCK() { |
| namespace refl = tvm::ffi::reflection; |
| refl::GlobalDef() |
| .def("runtime.arm_compute_lib_runtime_create", ACLRuntimeCreate) |
| .def("ffi.Module.load_from_bytes.arm_compute_lib", |
| JSONRuntimeBase::LoadFromBytes<ACLRuntime>); |
| } |
| } // namespace contrib |
| } // namespace runtime |
| } // namespace tvm |