/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file src/runtime/contrib/clml/clml_runtime.cc
* \brief A simple JSON runtime for CLML.
*/
#include "clml_runtime.h"
#include <tvm/ffi/reflection/registry.h>
#include <unordered_map>
#ifdef TVM_GRAPH_EXECUTOR_CLML
#include "clml_memory_planner.h"
#include "clml_utils.h"
#endif
#include <tvm/runtime/profiling.h>
namespace tvm {
namespace runtime {
namespace contrib {
using namespace tvm::runtime::json;
using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
#ifdef TVM_GRAPH_EXECUTOR_CLML
CLMLThreadEntry* CLMLWorkspace::GetThreadEntry() { return CLMLThreadEntry::ThreadLocal(); }
CLMLWorkspace* CLMLWorkspace::Global() {
static CLMLWorkspace* inst = new CLMLWorkspace();
return inst;
}
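/*!
* \brief Initialize the CLML workspace.
*
* Queries the OpenCL device extensions, checks for cl_qcom_ml_ops support,
* probes recordable queue and on-chip global memory capabilities, negotiates
* the CLML interface version and reads the tuning related environment variables.
*/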
CLMLWorkspace::CLMLWorkspace() {
cl_int result = 0;
workspace = cl::OpenCLWorkspace::Global();
workspace->Init();
tentry = workspace->GetThreadEntry();
device_id = workspace->GetCLDeviceID(tentry->device.device_id);
platform_id = workspace->device_info[device_id].platform_id;
// Query and log the supported OpenCL extensions
size_t reqd_size = 0;
result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size);
ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
std::vector<char> extn_buf(reqd_size);
result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr);
ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
std::string extensions(extn_buf.data());
LOG_CLML << "OpenCL Extensions:" << extensions;
if (extensions.find("cl_qcom_ml_ops") == std::string::npos) {
LOG(FATAL) << "CLML Runtime Init: Qualcomm cl_qcom_ml_ops extension not present.\n";
return;
}
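// Setting CLML_DISABLE_RECORDABLE_QUEUE (to any value) disables both recordable
// queues and on-chip global memory usage; otherwise support for each is detected
// from the device extensions.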
if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) {
is_recordable_queue = 0;
is_on_chip_memory = 0;
} else {
is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos);
is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos);
LOG_CLML << "Recordable Queues Support :" << is_recordable_queue;
LOG_CLML << "On chip Memory Support :" << is_on_chip_memory;
}
if (is_on_chip_memory) {
result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM,
sizeof(onchip_mem_size), &onchip_mem_size, nullptr);
ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):"
<< result;
LOG_CLML << "On chip memory size:" << onchip_mem_size;
}
// Query and Get CLML Interface
static const cl_uint MAX_VERSIONS = 256;
cl_int majorVersions[MAX_VERSIONS];
cl_int minorVersions[MAX_VERSIONS];
cl_uint numVersions = 0;
result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions);
ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
ICHECK(numVersions > 0u);
ICHECK(numVersions <= MAX_VERSIONS);
result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr);
ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
target_major = majorVersions[numVersions - 1];
target_minor = minorVersions[numVersions - 1];
LOG(WARNING) << "CLML Target Version:" << target_major << "." << target_minor;
if (target_major > CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
LOG(WARNING) << "Runtime is compiled with " << CL_QCOM_ML_OPS_H_MAJOR_VERSION
<< "where as target supports " << target_major
<< "\nTrying to use API interface version:" << CL_QCOM_ML_OPS_H_MAJOR_VERSION
<< "\nSome functionality may not work as expected ...";
target_major = CL_QCOM_ML_OPS_H_MAJOR_VERSION;
target_minor = 0;
}
clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor);
ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported."
<< "Compiled version: " << CL_QCOM_ML_OPS_H_MAJOR_VERSION << "."
<< CL_QCOM_ML_OPS_H_MINOR_VERSION
<< "Target Version:" << target_major << "." << target_minor;
char* tune_flag;
if ((tune_flag = getenv("CLML_IS_TUNING_RUN")))
is_tuning_run = std::stoi(tune_flag);
else
is_tuning_run = 0;
if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0;
}
typedef dmlc::ThreadLocalStore<CLMLThreadEntry> CLMLThreadStore;
CLMLThreadEntry* CLMLThreadEntry::ThreadLocal() { return CLMLThreadStore::Get(); }
#endif
class CLMLRuntime : public JSONRuntimeBase {
public:
/*!
* \brief The CLML runtime module. Deserialize the provided functions
* on creation and store in the layer cache.
*
* \param symbol_name The name of the function.
* \param graph_json serialized JSON representation of a sub-graph.
* \param const_names The names of each constant in the sub-graph.
*/
explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
const ffi::Array<ffi::String>& const_names)
: JSONRuntimeBase(symbol_name, graph_json, const_names), clml_symbol(symbol_name) {}
~CLMLRuntime() {
#ifdef TVM_GRAPH_EXECUTOR_CLML
cl_int result = 0;
if (this->layer_.tuning_cache) {
CLML_CALL(clReleaseMLTuningCacheQCOM, this->layer_.tuning_cache);
}
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
auto tensor_desc = it->second.tensor_desc;
CLML_CALL(clReleaseMLTensorQCOM, tensor_desc->tensor)
if (this->layer_.ddr_storage_ref_map.find(tensor_desc->memory) !=
this->layer_.ddr_storage_ref_map.end()) {
ReleaseDDRMemory(tensor_desc->memory);
} else {
result = clReleaseMemObject(tensor_desc->memory);
ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result;
}
}
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
CLML_CALL(clReleaseMLOpQCOM, this->layer_.function[i])
}
for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end();
it++) {
CLML_CALL(clReleaseMLTensorQCOM, it->second->tensor)
}
for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end();
it++) {
CLML_CALL(clReleaseMLTensorQCOM, (*it)->tensor)
}
CLML_CALL(clReleaseMLTensorMemoryDescriptorSetQCOM, layer_.descriptorSet)
if (this->layer_.recordable_queue) {
clReleaseCommandQueue(this->layer_.recordable_queue);
}
#endif
}
/*!
* \brief The type key of the module.
*
* \return module type key.
*/
const char* kind() const override { return "clml"; }
/*!
* \brief Initialize runtime. Create CLML layer from JSON
* representation.
*
* \param consts The constant params from compiled model.
*/
void Init(const ffi::Array<Tensor>& consts) override {
ICHECK_EQ(consts.size(), const_idx_.size())
<< "The number of input constants must match the number required.";
SetupConstants(consts);
#ifdef TVM_GRAPH_EXECUTOR_CLML
InitCLML();
#endif
BuildEngine();
}
#ifdef TVM_GRAPH_EXECUTOR_CLML
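/*!
* \brief Setup the CLML context for this sub graph.
*
* Creates the recordable command queue and recording object when supported,
* creates the tuning cache object and, when not in a tuning run, loads the
* cached tuning blob for this symbol from the tuning file.
*/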
void InitCLML() {
// Setup CLML Context
cl_int result = 0;
cws = CLMLWorkspace::Global();
if (cws->is_recordable_queue) {
this->layer_.recordable_queue =
clCreateCommandQueue(CLML_CTX, cws->device_id, CL_QUEUE_RECORDABLE_QCOM, &result);
ICHECK(result == CL_SUCCESS) << "clCreateCommandQueue - Recordable:" << result;
this->layer_.recording = clNewRecordingQCOM(this->layer_.recordable_queue, &result);
ICHECK(result == CL_SUCCESS) << "clNewRecordingQCOM:" << result;
}
// Create the tuning cache object. For a tuning run it starts empty; otherwise
// the cached blob for this symbol is loaded from the tuning file below.
CLML_CALL(clCreateMLTuningCacheQCOM, &layer_.tuning_cache)
if (!cws->is_tuning_run && cws->tuning_file) {
std::vector<unsigned char> tune_buffer;
std::string tune_blob;
LoadBinaryFromFile(cws->tuning_file, &tune_blob);
dmlc::MemoryStringStream mstrm(const_cast<std::string*>(&tune_blob));
dmlc::Stream* strm = &mstrm;
uint64_t header, reserve;
std::string tune_symbol;
while (strm->Read(&header)) {
if (header != kTVMCLMLTuningCacheMagic) break;
if (!strm->Read(&reserve)) break;
if (!strm->Read(&tune_symbol)) break;
if (tune_symbol == clml_symbol) {
strm->Read(&tune_buffer);
break;
} else {
std::vector<unsigned char> tmp_buf;
if (!strm->Read(&tmp_buf)) break;
}
}
if (tune_buffer.size()) {
LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol
<< " size:" << tune_buffer.size();
CLML_CALL(clLoadMLTuningCacheQCOM, layer_.tuning_cache, tune_buffer.size(),
tune_buffer.data())
} else {
LOG(WARNING) << "Tuning cache not cound for symbol :" << clml_symbol << " in file "
<< cws->tuning_file;
}
}
}
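/*!
* \brief Dump the sub graph and all intermediate tensors for debugging.
*
* Executes the CLML ops one by one and copies every tensor in the storage map
* back to the CPU. The tensors are serialized via runtime.SaveParams and
* hex encoded into the returned JSON string along with the graph JSON.
*/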
std::string DebugDump(void) override {
if (cws->is_recordable_queue) {
LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the "
"same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime.";
}
cl_command_queue queue = CLML_QUEUE;
ffi::Map<ffi::String, Tensor> dump_tensors;
std::ostringstream os;
dmlc::JSONWriter writer(&os);
writer.BeginObject();
writer.WriteObjectKeyValue("graph", graph_json_);
int op_index = 0;
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
int nid = it->first;
auto clml_desc = it->second.tensor_desc;
auto node = it->second.node;
if ("kernel" == node.GetOpType()) {
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index],
this->layer_.descriptorSet, 0, nullptr, nullptr);
OPENCL_CALL(clFinish(queue));
op_index++;
}
// Dump tensor to CPU
std::vector<int64_t> shape = node.GetOpShape()[0];
DLDataType tvm_dtype = node.GetOpDataType()[0];
Tensor narr = Tensor::Empty(ffi::Shape(shape), tvm_dtype, {kDLCPU, 0});
CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data);
// Naming convention
std::string node_name;
bool is_out = false;
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
if (eid == static_cast<uint32_t>(nid)) is_out = true;
}
if (is_out) {
node_name = clml_symbol + "_layer_out_" + std::to_string(nid);
} else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) {
node_name = node.GetOpName();
} else {
node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid);
}
dump_tensors.Set(node_name, narr);
}
const auto f = tvm::ffi::Function::GetGlobal("runtime.SaveParams");
if (f.has_value()) {
std::string dump_bytes = (*f)(dump_tensors);
std::ostringstream oss;
/*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. */
for (size_t i = 0; i < dump_bytes.size(); ++i) {
oss << std::setw(2) << std::setfill('0') << std::hex << static_cast<int>(dump_bytes[i]);
}
writer.WriteObjectKeyValue("tensors", oss.str());
}
writer.EndObject();
return os.str();
}
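/*!
* \brief Run the sub graph under the TVM profiler.
*
* Records CopyIn, per operator and CopyOut calls along with argument shape
* metadata so that each CLML op shows up as an individual entry in the report.
*
* \param prof The profiler instance collecting the calls.
*/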
void RunProfile(profiling::Profiler* prof) override {
cl_command_queue queue = CLML_QUEUE;
std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
std::vector<profiling::MetricCollector> cs;
std::vector<Device> devices;
devices.push_back(cws->tentry->device);
for (size_t i = 0; i < input_nodes_.size(); ++i) {
auto nid = input_nodes_[i];
uint32_t eid = EntryID(nid, 0);
if (nodes_[nid].GetOpType() == "input") {
// Assuming all inputs are from OpenCL
if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
std::unordered_map<std::string, ObjectRef> metrics;
std::string shape_str;
std::vector<int64_t> shape = nodes_[nid].GetOpShape()[0];
DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
prof->StartCall("CopyIn", cws->tentry->device, metrics);
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
layer_.inputs[nid]->memory, 0, nullptr, evt);
prof->StopCall();
}
}
}
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
std::unordered_map<std::string, ObjectRef> metrics;
auto node = this->layer_.op_node_map[this->layer_.function[i]].second;
std::string shape_str;
for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_];
std::vector<int64_t> shape = in_node.GetOpShape()[0];
DLDataType tvm_dtype = in_node.GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
shape_str.append(", ");
}
// Assuming one output per operation
std::vector<int64_t> shape = node.GetOpShape()[0];
DLDataType tvm_dtype = node.GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
// Launch call
prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device,
metrics);
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0,
nullptr, evt);
prof->StopCall();
}
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
// Assuming all outputs are to OpenCL
if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
std::unordered_map<std::string, ObjectRef> metrics;
std::string shape_str;
std::vector<int64_t> shape = nodes_[eid].GetOpShape()[0];
DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
prof->StartCall("CopyOut", cws->tentry->device, metrics);
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
layer_.out_placeholder[i]->memory, 0, nullptr, evt);
prof->StopCall();
}
}
return;
}
/*!
* \brief Unpack inputs and outputs and run inference on the cached layer.
*
* Copies input data into the CLML tensors, enqueues each CLML op (or the
* pre-recorded queue) and copies the results back to the output tensors.
*/
void Run() override {
LOG_CLML << "Run Start";
cl_command_queue queue = CLML_QUEUE;
std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
for (size_t i = 0; i < input_nodes_.size(); ++i) {
auto nid = input_nodes_[i];
uint32_t eid = EntryID(nid, 0);
if (nodes_[nid].GetOpType() == "input") {
void* data = data_entry_[eid]->data;
size_t isize = 1;
for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
isize *= data_entry_[eid]->shape[j];
}
if (kDLCPU == data_entry_[eid]->device.device_type) {
CopyDataToCLMLTensor(layer_.inputs[nid], data);
} else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
LOG_CLML << "Enqueue CLML Copy";
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
layer_.inputs[nid]->memory, 0, nullptr, evt);
LOG_CLML << "Enqueue CLML Copy Completed";
} else {
DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
void* tmpptr = reinterpret_cast<void*>(malloc(isize * dtype_size));
TVMTensorCopyToBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
isize * dtype_size);
CopyDataToCLMLTensor(layer_.inputs[nid], tmpptr);
free(tmpptr);
}
}
}
LOG_CLML << "Inputs Set";
int64_t duration = 0;
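// With recordable queues the whole pre-recorded op sequence (built in BuildEngine)
// is replayed with a single enqueue; otherwise each CLML op is enqueued individually.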
if (cws->is_recordable_queue) {
LOG_CLML << "Execution by Rec Queue";
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
t = f->operator()(cws->tentry->device);
t->Start();
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, evt);
t->Stop();
duration += t->SyncAndGetElapsedNanos();
} else {
CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, nullptr);
}
} else {
LOG_CLML << "Execution by Normal Queue";
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
// Make CLML subgraph execution accounted for by the OpenCLTimerNode.
LOG_CLML << "Run Layer:" << this->layer_.layer_names[i];
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
t = f->operator()(cws->tentry->device);
t->Start();
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
0, nullptr, evt);
t->Stop();
duration += t->SyncAndGetElapsedNanos();
LOG_CLML << "Layer:" << this->layer_.layer_names[i]
<< " Duration:" << t->SyncAndGetElapsedNanos();
} else {
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
0, nullptr, nullptr);
}
}
}
if (cws->workspace->IsProfiling(cws->tentry->device)) {
LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration;
}
LOG_CLML << "Run Completed";
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
void* data = data_entry_[eid]->data;
size_t osize = 1;
for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
osize *= data_entry_[eid]->shape[j];
}
if (kDLCPU == data_entry_[eid]->device.device_type) {
CopyDataFromCLMLTensor(layer_.outputs[i], data);
} else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
layer_.out_placeholder[i]->memory, 0, nullptr, evt);
} else {
DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
void* tmpptr = reinterpret_cast<void*>(malloc(osize * dtype_size));
CopyDataFromCLMLTensor(layer_.outputs[i], tmpptr);
TVMTensorCopyFromBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
osize * dtype_size);
free(tmpptr);
}
}
LOG_CLML << "Run End";
}
private:
/*!
* \brief Check whether the given node id is a graph output tensor.
*
*/
bool IsOutputTensor(int nid) {
for (size_t i = 0; i < outputs_.size(); ++i) {
if (nid == outputs_[i].id_) return true;
}
return false;
}
/*!
* \brief Initialize memory pool.
*
*/
void InitMemoryPool(void) {
layer_.on_chip_pool_size.clear();
layer_.on_chip_pool_size.insert({0, cws->onchip_mem_size});
layer_.on_chip_pool_alloc_info.clear();
layer_.alloc_ping_pong = true;
layer_.in_chip_total_free = cws->onchip_mem_size;
layer_.in_chip_total_alloc = 0;
layer_.on_chip_alert_fail = 0;
}
/*!
* \brief Plan memory for activations, allocating on-chip global memory wherever possible.
*
*/
void PlanMemory() {
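// Two pass planning: first build reference counts and life spans for every
// activation tensor, then walk the graph allocating on-chip memory where it
// fits (falling back to a reusable DDR pool) and freeing each input once its
// consumer has been planned.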
InitMemoryPool();
// Build the ref count table for all activation tensors.
LOG_MEM << "Build Ref Map";
for (size_t nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if (node.GetOpType() == "kernel") {
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
for (auto& input_node : inputs) {
if (nodes_[input_node.id_].GetOpType() != "const") {
if (layer_.storage_ref_map.find(input_node.id_) == layer_.storage_ref_map.end()) {
layer_.storage_ref_map.insert({input_node.id_, 1});
layer_.life_span.insert({input_node.id_, nid});
} else {
layer_.storage_ref_map[input_node.id_]++;
layer_.life_span[input_node.id_] = nid;
}
}
}
}
}
LOG_MEM << "Print Ref Map";
for (auto it = layer_.storage_ref_map.begin(); it != layer_.storage_ref_map.end(); it++) {
LOG_MEM << "RefMap:" << it->first << " Count:" << it->second
<< "Life Span:" << layer_.life_span[it->first];
}
for (size_t nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
uint32_t size = 0;
if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
// Possible that some nodes are not consumed by any operation
// Example being nn.pad second argument.
continue;
}
CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, layer_.storage_map[nid].tensor_desc->tensor,
&size);
if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) {
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
LOG_MEM << "Request :" << size << " Nid:" << nid;
size_t offset = -1;
// On-chip memory is used only for intermediate tensors within the recording scope.
if ((cws->is_on_chip_memory) && (!IsOutputTensor(nid)) && (node.GetOpType() != "input")) {
offset = RequestOnChipMemory(&this->layer_, size);
}
if (-1 != offset) {
LOG_MEM << "Got On-Chip Mem:" << offset << "Nid:" << nid;
layer_.on_chip_pool_alloc_info.insert({offset, nid});
layer_.on_chip_alloc_plan.insert({nid, std::make_pair(size, offset)});
} else {
layer_.on_chip_reject.insert({nid, size});
// DDR Allocation
auto ddr_mem = RequestDDRMemory(&this->layer_, size);
LOG_MEM << "Alloc DDR from global pool for nid:" << nid << " Type:" << node.GetOpType();
layer_.ddr_alloc_plan.insert({nid, ddr_mem});
}
// Now free up the input tensors on-chip memory for reuse.
for (auto& input_node : inputs) {
if (nodes_[input_node.id_].GetOpType() != "const") {
LOG_MEM << "Free Input Mem:" << input_node.id_;
FreeMemory(&this->layer_, input_node.id_);
}
}
}
}
// Stats dump
size_t in_chip_total_alloc = 0;
size_t total_reject = 0;
for (auto it = layer_.on_chip_alloc_plan.begin(); it != layer_.on_chip_alloc_plan.end(); it++) {
LOG_STATS << " On-chip Alloc:" << it->first << " Size:" << it->second.first
<< " Offset:" << it->second.second;
in_chip_total_alloc += it->second.first;
}
for (auto it = layer_.on_chip_reject.begin(); it != layer_.on_chip_reject.end(); it++) {
LOG_STATS << "Reject:" << it->first << " Size:" << it->second;
total_reject += it->second;
}
LOG_STATS << "Total On-chip Alloc:" << in_chip_total_alloc + total_reject
<< " On-Chip:" << in_chip_total_alloc << " Reject:" << total_reject
<< " Alert Fail:" << layer_.on_chip_alert_fail;
auto cws = CLMLWorkspace::Global();
for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
}
for (auto it = this->layer_.ddr_storage_ref_map.begin();
it != this->layer_.ddr_storage_ref_map.end(); it++) {
LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
}
}
/*!
* \brief Create a CLML tensor from a JSON node entry. Look up the storage map before creation.
* Also creates the input copy placeholder tensor for input nodes.
*
* \param nid The node index of graph JSON.
* \param shape shape information of tensor
* \param layout the tensor layout to be used
* \param dtype tensor data type
* \return CLML Tensor descriptor.
*/
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
size_t nid, std::vector<size_t> shape, cl_ml_tensor_layout_qcom layout, cl_uint dtype) {
const JSONGraphNode node = nodes_[nid];
cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_CNN_QCOM;
if (this->layer_.storage_map.find(nid) != this->layer_.storage_map.end()) {
if (nullptr != layer_.storage_map[nid].tensor_desc) {
return this->layer_.storage_map[nid].tensor_desc;
}
} else {
this->layer_.storage_map.insert({nid, NodeDescriptor()});
this->layer_.storage_map[nid].node = node;
}
void* node_data = nullptr;
if (node.GetOpType() == "const") {
uint32_t eid = EntryID(nid, 0);
node_data = data_entry_[eid]->data;
usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
ICHECK(CL_TENSOR_USAGE_INVALID_QCOM == this->layer_.storage_map[nid].usage)
<< "Parameter already has a usage reservation!";
}
if (CL_TENSOR_USAGE_INVALID_QCOM != this->layer_.storage_map[nid].usage) {
// Respect special reservation on usage.
usage = this->layer_.storage_map[nid].usage;
} else {
this->layer_.storage_map[nid].usage = usage;
}
if (this->layer_.storage_map[nid].custom_layout) {
// Respect special reservation on layout.
layout = this->layer_.storage_map[nid].layout;
} else {
this->layer_.storage_map[nid].layout = layout;
}
auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape);
this->layer_.storage_map[nid].tensor_desc = clml_tensor;
this->layer_.storage_map[nid].usage = usage;
this->layer_.storage_map[nid].layout = layout;
LOG_CLML << "Storage Map Alloc:" << nid << " Name:" << node.GetOpName() << " Usage: " << usage
<< " Layout:" << layout;
if ("input" == node.GetOpType()) {
this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].tensor_desc});
// Input copy placeholder Tensor
if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
this->layer_.in_placeholder.insert(
{nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, usage, dtype,
node_data, shape)});
} else {
this->layer_.in_placeholder.insert(
{nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape)});
}
}
return clml_tensor;
}
/*!
* \brief Build CLML layer from JSON representation and cache.
*
* \note For the time being only one layer or operator is supported
* per engine.
*/
void BuildEngine() {
size_t nid;
// Create tensors for the operators that need a layout format
// other than CL_TENSOR_LAYOUT_OPTIMAL_QCOM.
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), "nn.softmax"))
CreateSoftmaxLayerTensor(&layer_, node, nid);
}
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if (node.GetOpType() == "input") {
// Layers may request a different layout. Defer the input allocation.
} else if (node.GetOpType() == "kernel") {
auto op_name = node.GetOpName();
if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, "nn.pad_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
else if (PatternMatch(op_name, "nn.conv2d_transpose"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
else if (PatternMatch(op_name, "nn.relu"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
else if (PatternMatch(op_name, "nn.batch_norm"))
CreateBatchNormLayer(&layer_, node, nid);
else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
"nn.l2_pool2d" == op_name || PatternMatch(op_name, "nn.max_pool2d") ||
PatternMatch(op_name, "nn.avg_pool2d"))
CreatePoolingLayer(&layer_, node, nid);
else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name ||
PatternMatch(op_name, "nn.global_avg_pool2d") ||
PatternMatch(op_name, "nn.global_max_pool2d"))
CreateGlobalPoolingLayer(&layer_, node, nid);
else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
CreateReshapeLayer(&layer_, node, nid);
else if ("concatenate" == op_name)
CreateConcatLayer(&layer_, node, nid);
else if ("nn.dense" == op_name)
CreateDenseLayer(&layer_, node, nid);
else if ("nn.softmax" == op_name || PatternMatch(op_name, "nn.softmax"))
CreateSoftMaxLayer(&layer_, node, nid);
else if ("nn.pad" == op_name)
CreatePadLayer(&layer_, node, nid);
else if ("nn.batch_flatten" == op_name)
CreateBatchFlattenLayer(&layer_, node, nid);
else if ("clip" == op_name)
CreateClipLayer(&layer_, node, nid);
else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
"minimum" == op_name || "maximum" == op_name || "divide" == op_name ||
PatternMatch(op_name, "relax.add") || PatternMatch(op_name, "relax.subtract") ||
PatternMatch(op_name, "relax.multiply") ||
PatternMatch(op_name, "relax.minimum") || PatternMatch(op_name, "relax.maximum") ||
PatternMatch(op_name, "relax.divide"))
CreateBinaryLayer(&layer_, node, nid);
else if ("nn.depth_to_space" == op_name)
CreateDepthToSpaceLayer(&layer_, node, nid);
else if ("nn.upsampling" == op_name)
CreateResizeLayer(&layer_, node, nid);
else if ("nn.batch_matmul" == op_name)
CreateBatchMatmulLayer(&layer_, node, nid);
else
LOG(FATAL) << "Unsupported op: " << op_name;
this->layer_.layer_names.push_back(op_name);
// Keep map of function and Node to use in profiling
this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
} else if (node.GetOpType() != "const") {
LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
}
}
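// Create NCHW placeholder tensors for the sub graph outputs so that results
// can be copied out of the optimal-layout output tensors.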
for (size_t i = 0; i < outputs_.size(); ++i) {
nid = outputs_[i].id_;
DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
// Handle customized shapes here
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
this->layer_.out_shapes[nid]));
} else {
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
}
}
// Plan memory utilization
PlanMemory();
// Allocate device memories and initialize the params, if any
cl_int result = 0;
size_t alloc_on_chip = 0;
size_t alloc_ddr = 0;
size_t alloc_ddr_reuse = 0;
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
auto tensor_desc = it->second.tensor_desc;
uint32_t mem_size = 0;
result = CL_OUT_OF_HOST_MEMORY;
CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);
JSONGraphNode node = it->second.node;
void* node_data = nullptr;
size_t on_chip_mem_offset = -1;
if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) {
LOG_MEM << "Found GMEM Alloc:" << it->first
<< " Size:" << layer_.on_chip_alloc_plan[it->first].first
<< " Offset:" << layer_.on_chip_alloc_plan[it->first].second;
on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second;
alloc_on_chip += mem_size;
tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset);
} else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) {
LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType();
tensor_desc->memory = layer_.ddr_alloc_plan[it->first];
alloc_ddr_reuse += mem_size;
//} else if ((node.GetOpType() == "input") || IsOutputTensor(it->first) || (node.GetOpType()
//== "const")) {
} else if (node.GetOpType() == "const") {
LOG_MEM << "DDR Alloc for Const/Input/Output";
tensor_desc->memory = AllocateDDRTensorMemory(mem_size);
alloc_ddr += mem_size;
} else {
LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first
<< " Type:" << node.GetOpType();
}
if (node.GetOpType() == "const") {
node_data = data_entry_[EntryID(it->first, 0)]->data;
if (node_data != nullptr) {
CopyDataToCLMLTensor(tensor_desc, node_data);
}
}
this->layer_.tensorMemDescs.push_back(*tensor_desc);
}
LOG_STATS << "Total On-Chip Allocation :" << alloc_on_chip;
LOG_STATS << "Total DDR Reuse Allocation:" << alloc_ddr_reuse;
LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr;
size_t ddr_global_pool = 0;
size_t ddr_local_pool = 0;
auto cws = CLMLWorkspace::Global();
for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
ddr_global_pool += it->second.first;
}
LOG_STATS << "Total Global Pool:" << ddr_global_pool;
for (auto it = this->layer_.ddr_storage_ref_map.begin();
it != this->layer_.ddr_storage_ref_map.end(); it++) {
LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
ddr_local_pool += it->second.first;
}
LOG_STATS << "Total Local Pool:" << ddr_local_pool;
// Setup descriptor set
CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);
CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
this->layer_.tensorMemDescs.data());
if (cws->is_tuning_run) {
LOG_CLML << "CLML Tunning In Progress:";
// Let the command queue recreated in profiling mode.
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i];
CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
this->layer_.tuning_cache, nullptr);
}
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);
size_t cache_len_bytes = 0;
size_t len_ret = 0;
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);
std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
saved_cache.data(), &len_ret);
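// Each record appended to the tuning file has the layout:
//   uint64 magic (kTVMCLMLTuningCacheMagic) | uint64 reserved |
//   string symbol name | byte vector with the saved CLML tuning cache.
// InitCLML scans these records to find the blob matching clml_symbol.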
std::string tune_str;
dmlc::MemoryStringStream mstrm(&tune_str);
dmlc::Stream* strm = &mstrm;
uint64_t header = kTVMCLMLTuningCacheMagic;
uint64_t reserved = 0x0;
strm->Write(header);
strm->Write(reserved);
strm->Write(clml_symbol);
strm->Write(saved_cache);
std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
fs.write(&tune_str[0], tune_str.length());
LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
<< tune_str.length() << " with tuning blob len " << saved_cache.size();
}
if (cws->is_recordable_queue) {
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
this->layer_.descriptorSet, 0, nullptr, nullptr);
}
result = clEndRecordingQCOM(this->layer_.recording);
ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result;
}
}
/*!
* \brief Create a 2D convolution layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param mode The conv2d mode type - CL_CONVOLUTION_MODE_CONVOLUTION_QCOM
* or CL_CONVOLUTION_MODE_DEPTHWISE_QCOM
* or CL_CONVOLUTION_MODE_TRANSPOSE_QCOM.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
cl_convolution_mode_qcom mode, size_t nid) {
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
if (!node.HasAttr("padding")) {
clml_padding.resize(4);
std::fill(clml_padding.begin(), clml_padding.end(), 0);
}
cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[0], clml_padding[1]};
cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[2], clml_padding[3]};
std::vector<cl_uint> v_strides = GetVectorValues(strides);
std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], v_strides[1]};
cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_dilation[0], v_dilation[1]};
cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
} else {
groups = 1; // Don't need to pass groups to depthwise
}
bool has_act = false;
std::string activation_type;
cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
if (node.HasAttr("activation_type")) {
activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
ICHECK(activation_type == "relu" || activation_type == "relu6")
<< "Unknown activation type:" << activation_type;
if (activation_type == "relu") {
clml_act_type = CL_ACTIVATION_RELU;
} else {
clml_act_type = CL_ACTIVATION_RELU6;
}
has_act = true;
}
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode};
// Collect inputs and outputs, handling nn.conv2d.
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
size_t num_inputs = inputs.size();
bool has_bias;
bool has_bn;
ICHECK(num_inputs >= 2 && num_inputs <= 7)
<< "Convolution expects between 2 and 7 inputs (data, weight, optional bias, optional batchnorm params)";
has_bias = (num_inputs == 3) || (num_inputs == 7);
has_bn = (num_inputs == 6) || (num_inputs == 7);
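// Input ordering: data, weight, optional bias, then optional batchnorm params
// (scale, bias, mean, variance). 3 or 7 inputs imply a bias; 6 or 7 imply a
// fused batchnorm.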
// Input
auto input =
MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Weight
auto weight =
MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Bias
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
bias =
MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
} else {
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
bias->tensor = layer_.unusedTensor;
}
// Output
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_convolution_desc_qcom conv_desc{mode,
groups,
4,
{clml_padding_b[0], clml_padding_b[1]},
{clml_padding_a[0], clml_padding_a[1]},
{clml_strides[0], clml_strides[1]},
{clml_dilation[0], clml_dilation[1]},
0,
cl_arithmetic_mode};
cl_ml_op_qcom op = nullptr;
if (!has_bn) {
if (!has_act) {
CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, &op, nullptr);
} else {
CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
&act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
&op, layer_.tuning_cache);
}
layer->function.push_back(op);
} else {
int bn_index = has_bias ? 3 : 2;
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("batchnorm")[1]);
std::vector<cl_ml_op_properties_qcom> opProperties;
opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
std::vector<size_t> bn_shape = {1, 1, 1, 1};
bn_shape[axis] = bn_dims.n;
auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
if (!has_act) {
CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, opProperties.data(),
&conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
layer_.tuning_cache);
} else {
CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX,
opProperties.data(), &conv_desc, &bn_desc, &act_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor,
bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
}
layer->function.push_back(op);
}
return;
}
/*!
* \brief Create a ReLU(X) layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateReLULayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid,
cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode};
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpActivationForwardQCOM, CLML_CTX, nullptr, &act_desc, input->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Activation Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a batch norm layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchNormLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("epsilon")[0]);
std::vector<cl_ml_op_properties_qcom> opProperties;
opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
auto bn_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
std::vector<size_t> bn_shape = {1, 1, 1, 1};
bn_shape[axis] = bn_dims.n;
auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpBatchNormForwardQCOM, CLML_CTX, opProperties.data(), &bn_desc,
input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Batchnorm Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a pooling layer.
*
* \note Currently nn.max_pool2d and nn.avg_pool2d are supported.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
std::vector<cl_uint> clml_window = GetVectorValues(windows);
std::vector<cl_uint> clml_stride = GetVectorValues(strides);
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
cl_ml_op_pooling_desc_qcom pool_desc = {
((node.GetOpName() == "nn.max_pool2d") || PatternMatch(node.GetOpName(), "nn.max_pool2d"))
? CL_POOLING_MODE_MAX_QCOM
: CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
4, // reserved
{clml_padding[0], clml_padding[1]},
{clml_padding[2], clml_padding[3]},
{clml_stride[0], clml_stride[1]},
{clml_window[0], clml_window[1]},
CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode,
};
cl_ml_tensor_desc_qcom desc = {};
cl_ml_tensor_qcom unusedTensor = nullptr;
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&unusedTensor);
ICHECK(unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Pooling Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a global pooling layer.
*
* \note Currently global_max_pool2d and global_avg_pool2d are supported.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
cl_ml_op_pooling_desc_qcom pool_desc = {
((node.GetOpName() == "nn.global_max_pool2d") ||
PatternMatch(node.GetOpName(), "nn.global_max_pool2d"))
? CL_POOLING_MODE_MAX_QCOM
: CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
4, // reserved
{0, 0},
{0, 0},
{1, 1},
{in_dims.w, in_dims.h},
CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode,
};
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Pooling Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create Softmax layer tensors with a supported layout.
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_tensor_layout_qcom layout;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
// Enable NHWC or NCHW layout for 4D tensors based on the axis value
if (out_dims.h >= 1 && out_dims.w >= 1) {
if (axis == 3 || axis == -1) {
layout = CL_TENSOR_LAYOUT_NHWC_QCOM;
} else {
layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
}
} else { // default layout for 2D
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
}
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
return;
}
/*!
* \brief Create a SoftMax layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
cl_softmax_mode_qcom mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, mode,
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpSoftmaxQCOM, CLML_CTX, nullptr, &softmax_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "SoftMax Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Pad layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreatePadLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
if (pad_mode == "constant")
clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
else if (pad_mode == "edge")
clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
else if (pad_mode == "reflect")
clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
else
LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;
cl_ml_op_pad_desc_qcom pad_desc{
clml_pad_mode,
{0, 0},
{clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpPadQCOM, CLML_CTX, nullptr, &pad_desc, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Pad Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Batch Flatten layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchFlattenLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Reshape Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Reshape layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Reshape Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a concat layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateConcatLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
int inputSize = input_.size();
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_uint axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
for (int i = 0; i < inputSize; i++) {
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
concatInputs[i] = input->tensor;
}
cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpConcatQCOM, CLML_CTX, nullptr, &concatDesc, concatInputs, output->tensor,
&op, layer_.tuning_cache);
ICHECK(op) << "Concat Error";
layer->function.push_back(op);
delete[] concatInputs;
return;
}
/*!
* \brief Create a dense layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDenseLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
size_t num_inputs = node.GetInputs().size();
bool has_bias = (num_inputs == 3);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
bool is_vec_matmul = false;
if (in_dims.n == 1 && has_bias) {
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
is_vec_matmul = true;
}
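// Two lowering paths: a single-row activation with a bias maps directly to the CLML
// FullyConnected op; anything else is lowered to a GEMM followed by an optional
// element-wise ADD for the bias.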
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.n, wt_dims.c},
layout, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, {}, layout, cl_dtype);
} else {
// No bias input: reuse the shared placeholder (unused) tensor instead of a real bias tensor.
bias->tensor = layer_.unusedTensor;
}
if (is_vec_matmul) {
cl_fc_weight_transform_qcom w_transform = CL_FC_WEIGHT_TRANSFORM_NONE_QCOM;
if (in_dims.c == wt_dims.c) w_transform = CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM;
cl_ml_op_fully_connected_desc_qcom fc_desc{1, // refer clml_ops.txt for struct
w_transform, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpFullyConnectedQCOM, CLML_CTX, nullptr, &fc_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "FC layer Error";
layer->function.push_back(op);
} else {
cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
if (in_dims.c == wt_dims.c) b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n, // m
wt_dims.n, // n
wt_dims.c, // k
CL_GEMM_TRANSFORM_NONE_QCOM, // A transform
b_transform, // B transform
{{1.0}, CL_FLOAT}, // alpha
{{0.0}, CL_FLOAT}, // beta
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Gemm layer Error";
layer->function.push_back(op);
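// The bias, when present, is applied as a separate element-wise ADD after the GEMM.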
if (has_bias) {
cl_ml_op_binary_desc_qcom binaryDesc = {CL_TENSOR_OP_ADD_QCOM,
{{1.0}, CL_FLOAT}, // alpha
{{1.0}, CL_FLOAT}, // beta
{{1.0}, CL_FLOAT}, // gamma
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binaryDesc, bias->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Binary Op Error";
layer->function.push_back(op);
}
}
return;
}
/*!
* \brief Create the tensors for a dense layer with the supported layout (no op is created here).
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDenseLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
size_t num_inputs = node.GetInputs().size();
bool has_bias = (num_inputs == 3);
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
if (in_dims.n == 1 && has_bias) {
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
}
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.n, wt_dims.c},
layout, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
return;
}
/*!
* \brief Create a batch_matmul layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchMatmulLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
std::vector<int64_t> out_shape = node.GetOpShape()[0];
std::vector<size_t> clml_out_shape;
clml_out_shape.push_back(out_shape[1]);
clml_out_shape.push_back(out_shape[2]);
clml_out_shape.push_back(1);
clml_out_shape.push_back(1);
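// Map the 3-D relax output (batch, M, N) onto a 4-D CLML NCHW tensor as {M, N, 1, 1};
// only batch_size == 1 is handled here.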
auto output =
MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
layer->out_shapes.insert({nid, clml_out_shape});
cl_bool b_transpose = std::stoi(node.GetAttr<std::vector<std::string>>("transpose_b")[0]);
cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
if (b_transpose) {
b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
}
cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.c, // m
wt_dims.c, // n
wt_dims.h, // k
CL_GEMM_TRANSFORM_NONE_QCOM, // A transform
b_transform, // B transform
{{1.0}, CL_FLOAT}, // alpha
{{0.0}, CL_FLOAT}, // beta
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "BatchMatmul Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create the tensors for a batch_matmul layer (batch_size=1 supported) with the supported layout.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchMatmulLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
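// Tensor-creation pass: mirrors the tensor setup in CreateBatchMatmulLayer without
// creating the GEMM op.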
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
std::vector<int64_t> out_shape = node.GetOpShape()[0];
std::vector<size_t> clml_out_shape;
clml_out_shape.push_back(out_shape[1]);
clml_out_shape.push_back(out_shape[2]);
clml_out_shape.push_back(1);
clml_out_shape.push_back(1);
auto output =
MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
layer->out_shapes.insert({nid, clml_out_shape});
return;
}
/*!
* \brief Create a Clip(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateClipLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_float a_max = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
cl_float a_min = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
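// Clip-by-value: the a_max / a_min attributes of the node become the two scalar
// operands of the CLML clip descriptor.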
cl_ml_op_clip_desc_qcom clip_desc = {
CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
CLML_CALL_clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, input->tensor, output->tensor,
&op, layer_.tuning_cache);
ICHECK(op) << "Clip Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Binary layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBinaryLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::string op_name = node.GetOpName();
cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
if (op_name == "subtract" || PatternMatch(op_name, "relax.subtract"))
binary_op = CL_TENSOR_OP_SUB_QCOM;
else if (op_name == "multiply" || PatternMatch(op_name, "relax.multiply"))
binary_op = CL_TENSOR_OP_MUL_QCOM;
else if (op_name == "divide" || PatternMatch(op_name, "relax.divide"))
binary_op = CL_TENSOR_OP_DIV_QCOM;
else if (op_name == "minimum" || PatternMatch(op_name, "relax.minimum"))
binary_op = CL_TENSOR_OP_MIN_QCOM;
else if (op_name == "maximum" || PatternMatch(op_name, "relax.maximum"))
binary_op = CL_TENSOR_OP_MAX_QCOM;
else if (op_name == "add" || PatternMatch(op_name, "relax.add"))
binary_op = CL_TENSOR_OP_ADD_QCOM;
else
LOG(FATAL) << "Undefined binary op:" << op_name;
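// The binary descriptor also carries alpha/beta/gamma scaling coefficients; they are
// set to 1.0, 1.0 and 0.0 here.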
cl_ml_op_binary_desc_qcom binary_desc = {
binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
LOG_CLML << "Binary op: " << op_name;
CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binary_desc, input_a->tensor,
input_b->tensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << op_name << " Node Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a DepthToSpace(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDepthToSpaceLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_uint block_size = std::stoi(node.GetAttr<std::vector<std::string>>("block_size")[0]);
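// depth_to_space rearranges block_size x block_size groups of channels into spatial
// positions; block_size comes straight from the node attribute.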
cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpDepthToSpaceQCOM, CLML_CTX, nullptr, &dtos_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "DepthToSpace Layer Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Resize(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateResizeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_bool align_corners = std::stoi(node.GetAttr<std::vector<std::string>>("align_corners")[0]);
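// Resize is lowered to CLML's bilinear resize op; align_corners is the only attribute
// mapped from the node.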
cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpResizeBilinearQCOM, CLML_CTX, nullptr, &resize_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Resize Layer Error";
layer->function.push_back(op);
return;
}
/*!
* \brief The network layers represented by CLML functions.
* \note Currently only supports a single layer.
*/
// This layer instance
CachedLayer layer_;
// CLML Workspace
CLMLWorkspace* cws;
#else
void Run() override {
LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. "
<< "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
void BuildEngine() {
LOG(WARNING) << "CLML engine is not initialized. "
<< "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
#endif
bool CanDebug() override { return true; }
/*! CLML subgraph symbol in the TVM main module */
std::string clml_symbol;
};
ffi::Module CLMLRuntimeCreate(const ffi::String& symbol_name, const ffi::String& graph_json,
const ffi::Array<ffi::String>& const_names) {
auto n = ffi::make_object<CLMLRuntime>(symbol_name, graph_json, const_names);
return ffi::Module(n);
}
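// Register the module-creation entry point and the byte-stream loader with the TVM FFI
// global registry.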
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def("runtime.clml_runtime_create", CLMLRuntimeCreate)
.def("ffi.Module.load_from_bytes.clml", JSONRuntimeBase::LoadFromBytes<CLMLRuntime>);
}
} // namespace contrib
} // namespace runtime
} // namespace tvm