/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file src/runtime/contrib/clml/clml_runtime.cc
* \brief A simple JSON runtime for CLML.
*/
#include "clml_runtime.h"
#include <tvm/ffi/reflection/registry.h>
#include <unordered_map>
#ifdef TVM_GRAPH_EXECUTOR_CLML
#include "clml_memory_planner.h"
#include "clml_utils.h"
#endif
#include <tvm/runtime/profiling.h>
namespace tvm {
namespace runtime {
namespace contrib {
using namespace tvm::runtime::json;
using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
#ifdef TVM_GRAPH_EXECUTOR_CLML
CLMLThreadEntry* CLMLWorkspace::GetThreadEntry() { return CLMLThreadEntry::ThreadLocal(); }
CLMLWorkspace* CLMLWorkspace::Global() {
static CLMLWorkspace* inst = new CLMLWorkspace();
return inst;
}
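/*!
* \brief Initialize the CLML workspace.
*
* Queries the OpenCL device extensions, checks for cl_qcom_ml_ops support,
* probes recordable queue and on-chip global memory capabilities, negotiates
* the CLML interface version and reads the tuning related environment variables.
*/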
CLMLWorkspace::CLMLWorkspace() {
cl_int result = 0;
workspace = cl::OpenCLWorkspace::Global();
workspace->Init();
tentry = workspace->GetThreadEntry();
device_id = workspace->GetCLDeviceID(tentry->device.device_id);
platform_id = workspace->device_info[device_id].platform_id;
// Query and log the supported OpenCL extensions
size_t reqd_size = 0;
result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, nullptr, &reqd_size);
ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
std::vector<char> extn_buf(reqd_size);
result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr);
ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
std::string extensions(extn_buf.data());
LOG_CLML << "OpenCL Extensions:" << extensions;
if (extensions.find("cl_qcom_ml_ops") == std::string::npos) {
LOG(FATAL) << "CLML Runtime Init: Qualcomm cl_qcom_ml_ops extension not present.\n";
return;
}
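// Setting CLML_DISABLE_RECORDABLE_QUEUE (to any value) disables both recordable
// queues and on-chip global memory usage; otherwise support for each is detected
// from the device extensions.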
if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) {
is_recordable_queue = 0;
is_on_chip_memory = 0;
} else {
is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos);
is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos);
LOG_CLML << "Recordable Queues Support :" << is_recordable_queue;
LOG_CLML << "On chip Memory Support :" << is_on_chip_memory;
}
if (is_on_chip_memory) {
result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM,
sizeof(onchip_mem_size), &onchip_mem_size, nullptr);
ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):"
<< result;
LOG_CLML << "On chip memory size:" << onchip_mem_size;
}
// Query and Get CLML Interface
static const cl_uint MAX_VERSIONS = 256;
cl_int majorVersions[MAX_VERSIONS];
cl_int minorVersions[MAX_VERSIONS];
cl_uint numVersions = 0;
result = clQueryMLInterfaceVersionsQCOM(nullptr, nullptr, 0, &numVersions);
ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
ICHECK(numVersions > 0u);
ICHECK(numVersions <= MAX_VERSIONS);
result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, numVersions, nullptr);
ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << result;
target_major = majorVersions[numVersions - 1];
target_minor = minorVersions[numVersions - 1];
LOG(WARNING) << "CLML Target Version:" << target_major << "." << target_minor;
if (target_major > CL_QCOM_ML_OPS_H_MAJOR_VERSION) {
LOG(WARNING) << "Runtime is compiled with " << CL_QCOM_ML_OPS_H_MAJOR_VERSION
<< "where as target supports " << target_major
<< "\nTrying to use API interface version:" << CL_QCOM_ML_OPS_H_MAJOR_VERSION
<< "\nSome functionality may not work as expected ...";
target_major = CL_QCOM_ML_OPS_H_MAJOR_VERSION;
target_minor = 0;
}
clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor);
ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported."
<< "Compiled version: " << CL_QCOM_ML_OPS_H_MAJOR_VERSION << "."
<< CL_QCOM_ML_OPS_H_MINOR_VERSION
<< "Target Version:" << target_major << "." << target_minor;
char* tune_flag;
if ((tune_flag = getenv("CLML_IS_TUNING_RUN")))
is_tuning_run = std::stoi(tune_flag);
else
is_tuning_run = 0;
if (!(tuning_file = getenv("CLML_TUNING_CACHE"))) this->is_tuning_run = 0;
}
typedef dmlc::ThreadLocalStore<CLMLThreadEntry> CLMLThreadStore;
CLMLThreadEntry* CLMLThreadEntry::ThreadLocal() { return CLMLThreadStore::Get(); }
#endif
class CLMLRuntime : public JSONRuntimeBase {
public:
/*!
* \brief The CLML runtime module. Deserialize the provided functions
* on creation and store in the layer cache.
*
* \param symbol_name The name of the function.
* \param graph_json serialized JSON representation of a sub-graph.
* \param const_names The names of each constant in the sub-graph.
*/
explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
const ffi::Array<ffi::String>& const_names)
: JSONRuntimeBase(symbol_name, graph_json, const_names), clml_symbol(symbol_name) {}
~CLMLRuntime() {
#ifdef TVM_GRAPH_EXECUTOR_CLML
cl_int result = 0;
if (this->layer_.tuning_cache) {
CLML_CALL(clReleaseMLTuningCacheQCOM, this->layer_.tuning_cache);
}
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
auto tensor_desc = it->second.tensor_desc;
CLML_CALL(clReleaseMLTensorQCOM, tensor_desc->tensor)
if (this->layer_.ddr_storage_ref_map.find(tensor_desc->memory) !=
this->layer_.ddr_storage_ref_map.end()) {
ReleaseDDRMemory(tensor_desc->memory);
} else {
result = clReleaseMemObject(tensor_desc->memory);
ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result;
}
}
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
CLML_CALL(clReleaseMLOpQCOM, this->layer_.function[i])
}
for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end();
it++) {
CLML_CALL(clReleaseMLTensorQCOM, it->second->tensor)
}
for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end();
it++) {
CLML_CALL(clReleaseMLTensorQCOM, (*it)->tensor)
}
CLML_CALL(clReleaseMLTensorMemoryDescriptorSetQCOM, layer_.descriptorSet)
if (this->layer_.recordable_queue) {
clReleaseCommandQueue(this->layer_.recordable_queue);
}
#endif
}
/*!
* \brief The type key of the module.
*
* \return module type key.
*/
const char* kind() const override { return "clml"; }
/*!
* \brief Initialize runtime. Create CLML layer from JSON
* representation.
*
* \param consts The constant params from compiled model.
*/
void Init(const ffi::Array<Tensor>& consts) override {
ICHECK_EQ(consts.size(), const_idx_.size())
<< "The number of input constants must match the number required.";
SetupConstants(consts);
#ifdef TVM_GRAPH_EXECUTOR_CLML
InitCLML();
#endif
BuildEngine();
}
#ifdef TVM_GRAPH_EXECUTOR_CLML
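/*!
* \brief Setup the CLML context for this sub graph.
*
* Creates the recordable command queue and recording object when supported,
* creates the tuning cache object and, when not in a tuning run, loads the
* cached tuning blob for this symbol from the tuning file.
*/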
void InitCLML() {
// Setup CLML Context
cl_int result = 0;
cws = CLMLWorkspace::Global();
if (cws->is_recordable_queue) {
this->layer_.recordable_queue =
clCreateCommandQueue(CLML_CTX, cws->device_id, CL_QUEUE_RECORDABLE_QCOM, &result);
ICHECK(result == CL_SUCCESS) << "clCreateCommandQueue - Recordable:" << result;
this->layer_.recording = clNewRecordingQCOM(this->layer_.recordable_queue, &result);
ICHECK(result == CL_SUCCESS) << "clNewRecordingQCOM:" << result;
}
// Create the tuning cache object. For a tuning run it starts empty; otherwise
// the cached blob for this symbol is loaded from the tuning file below.
CLML_CALL(clCreateMLTuningCacheQCOM, &layer_.tuning_cache)
if (!cws->is_tuning_run && cws->tuning_file) {
std::vector<unsigned char> tune_buffer;
std::string tune_blob;
LoadBinaryFromFile(cws->tuning_file, &tune_blob);
dmlc::MemoryStringStream mstrm(const_cast<std::string*>(&tune_blob));
dmlc::Stream* strm = &mstrm;
uint64_t header, reserve;
std::string tune_symbol;
while (strm->Read(&header)) {
if (header != kTVMCLMLTuningCacheMagic) break;
if (!strm->Read(&reserve)) break;
if (!strm->Read(&tune_symbol)) break;
if (tune_symbol == clml_symbol) {
strm->Read(&tune_buffer);
break;
} else {
std::vector<unsigned char> tmp_buf;
if (!strm->Read(&tmp_buf)) break;
}
}
if (tune_buffer.size()) {
LOG(INFO) << "Loading tuning cache for symbol:" << clml_symbol
<< " size:" << tune_buffer.size();
CLML_CALL(clLoadMLTuningCacheQCOM, layer_.tuning_cache, tune_buffer.size(),
tune_buffer.data())
} else {
LOG(WARNING) << "Tuning cache not cound for symbol :" << clml_symbol << " in file "
<< cws->tuning_file;
}
}
}
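/*!
* \brief Dump the sub graph and all intermediate tensors for debugging.
*
* Executes the CLML ops one by one and copies every tensor in the storage map
* back to the CPU. The tensors are serialized via runtime.SaveParams and
* hex encoded into the returned JSON string along with the graph JSON.
*/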
std::string DebugDump(void) override {
if (cws->is_recordable_queue) {
LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the "
"same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime.";
}
cl_command_queue queue = CLML_QUEUE;
ffi::Map<ffi::String, Tensor> dump_tensors;
std::ostringstream os;
dmlc::JSONWriter writer(&os);
writer.BeginObject();
writer.WriteObjectKeyValue("graph", graph_json_);
int op_index = 0;
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
int nid = it->first;
auto clml_desc = it->second.tensor_desc;
auto node = it->second.node;
if ("kernel" == node.GetOpType()) {
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index],
this->layer_.descriptorSet, 0, nullptr, nullptr);
OPENCL_CALL(clFinish(queue));
op_index++;
}
// Dump tensor to CPU
std::vector<int64_t> shape = node.GetOpShape()[0];
DLDataType tvm_dtype = node.GetOpDataType()[0];
Tensor narr = Tensor::Empty(ffi::Shape(shape), tvm_dtype, {kDLCPU, 0});
CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data);
// Naming convention
std::string node_name;
bool is_out = false;
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
if (eid == static_cast<uint32_t>(nid)) is_out = true;
}
if (is_out) {
node_name = clml_symbol + "_layer_out_" + std::to_string(nid);
} else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) {
node_name = node.GetOpName();
} else {
node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid);
}
dump_tensors.Set(node_name, narr);
}
const auto f = tvm::ffi::Function::GetGlobal("runtime.SaveParams");
if (f.has_value()) {
std::string dump_bytes = (*f)(dump_tensors);
std::ostringstream oss;
/*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. */
for (size_t i = 0; i < dump_bytes.size(); ++i) {
oss << std::setw(2) << std::setfill('0') << std::hex << static_cast<int>(dump_bytes[i]);
}
writer.WriteObjectKeyValue("tensors", oss.str());
}
writer.EndObject();
return os.str();
}
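/*!
* \brief Run the sub graph under the TVM profiler.
*
* Records CopyIn, per operator and CopyOut calls along with argument shape
* metadata so that each CLML op shows up as an individual entry in the report.
*
* \param prof The profiler instance collecting the calls.
*/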
void RunProfile(profiling::Profiler* prof) override {
cl_command_queue queue = CLML_QUEUE;
std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
std::vector<profiling::MetricCollector> cs;
std::vector<Device> devices;
devices.push_back(cws->tentry->device);
for (size_t i = 0; i < input_nodes_.size(); ++i) {
auto nid = input_nodes_[i];
uint32_t eid = EntryID(nid, 0);
if (nodes_[nid].GetOpType() == "input") {
// Assuming all inputs are from OpenCL
if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
std::unordered_map<std::string, ObjectRef> metrics;
std::string shape_str;
std::vector<int64_t> shape = nodes_[nid].GetOpShape()[0];
DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
prof->StartCall("CopyIn", cws->tentry->device, metrics);
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
layer_.inputs[nid]->memory, 0, nullptr, evt);
prof->StopCall();
}
}
}
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
std::unordered_map<std::string, ObjectRef> metrics;
auto node = this->layer_.op_node_map[this->layer_.function[i]].second;
std::string shape_str;
for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_];
std::vector<int64_t> shape = in_node.GetOpShape()[0];
DLDataType tvm_dtype = in_node.GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
shape_str.append(", ");
}
// Assuming one output per operation
std::vector<int64_t> shape = node.GetOpShape()[0];
DLDataType tvm_dtype = node.GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
// Launch call
prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device,
metrics);
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0,
nullptr, evt);
prof->StopCall();
}
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
// Assuming all outputs are to OpenCL
if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
std::unordered_map<std::string, ObjectRef> metrics;
std::string shape_str;
std::vector<int64_t> shape = nodes_[eid].GetOpShape()[0];
DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0];
shape_str.append(profiling::ShapeString(shape, tvm_dtype));
metrics["Argument Shapes"] = ffi::String(shape_str);
prof->StartCall("CopyOut", cws->tentry->device, metrics);
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
layer_.out_placeholder[i]->memory, 0, nullptr, evt);
prof->StopCall();
}
}
return;
}
/*!
* \brief Unpack inputs and outputs and run inference on the cached layer.
*
* Copies input data into the CLML tensors, enqueues each CLML op (or the
* pre-recorded queue) and copies the results back to the output tensors.
*/
void Run() override {
LOG_CLML << "Run Start";
cl_command_queue queue = CLML_QUEUE;
std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
for (size_t i = 0; i < input_nodes_.size(); ++i) {
auto nid = input_nodes_[i];
uint32_t eid = EntryID(nid, 0);
if (nodes_[nid].GetOpType() == "input") {
void* data = data_entry_[eid]->data;
size_t isize = 1;
for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
isize *= data_entry_[eid]->shape[j];
}
if (kDLCPU == data_entry_[eid]->device.device_type) {
CopyDataToCLMLTensor(layer_.inputs[nid], data);
} else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
LOG_CLML << "Enqueue CLML Copy";
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
layer_.inputs[nid]->memory, 0, nullptr, evt);
LOG_CLML << "Enqueue CLML Copy Completed";
} else {
DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
void* tmpptr = reinterpret_cast<void*>(malloc(isize * dtype_size));
TVMTensorCopyToBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
isize * dtype_size);
CopyDataToCLMLTensor(layer_.inputs[nid], tmpptr);
free(tmpptr);
}
}
}
LOG_CLML << "Inputs Set";
int64_t duration = 0;
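// With recordable queues the whole pre-recorded op sequence (built in BuildEngine)
// is replayed with a single enqueue; otherwise each CLML op is enqueued individually.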
if (cws->is_recordable_queue) {
LOG_CLML << "Execution by Rec Queue";
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
t = f->operator()(cws->tentry->device);
t->Start();
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, evt);
t->Stop();
duration += t->SyncAndGetElapsedNanos();
} else {
CLML_CALL(clEnqueueRecordingMLOpQCOM, queue, this->layer_.recording, 0, nullptr, 0, nullptr,
0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, nullptr);
}
} else {
LOG_CLML << "Execution by Normal Queue";
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
// Make CLML subgraph execution accounted for by the OpenCLTimerNode.
LOG_CLML << "Run Layer:" << this->layer_.layer_names[i];
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
t = f->operator()(cws->tentry->device);
t->Start();
queue = CLML_QUEUE;
evts.resize(evts.size() + 1);
cl_event* evt = &(evts.back());
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
0, nullptr, evt);
t->Stop();
duration += t->SyncAndGetElapsedNanos();
LOG_CLML << "Layer:" << this->layer_.layer_names[i]
<< " Duration:" << t->SyncAndGetElapsedNanos();
} else {
CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
0, nullptr, nullptr);
}
}
}
if (cws->workspace->IsProfiling(cws->tentry->device)) {
LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration;
}
LOG_CLML << "Run Completed";
for (size_t i = 0; i < outputs_.size(); ++i) {
uint32_t eid = EntryID(outputs_[i]);
void* data = data_entry_[eid]->data;
size_t osize = 1;
for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
osize *= data_entry_[eid]->shape[j];
}
if (kDLCPU == data_entry_[eid]->device.device_type) {
CopyDataFromCLMLTensor(layer_.outputs[i], data);
} else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
cl_event cpy_evt = nullptr;
cl_event* evt = &cpy_evt;
if (cws->workspace->IsProfiling(cws->tentry->device)) {
evts.resize(evts.size() + 1);
evt = &(evts.back());
}
CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
layer_.out_placeholder[i]->memory, 0, nullptr, evt);
} else {
DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
void* tmpptr = reinterpret_cast<void*>(malloc(osize * dtype_size));
CopyDataFromCLMLTensor(layer_.outputs[i], tmpptr);
TVMTensorCopyFromBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
osize * dtype_size);
free(tmpptr);
}
}
LOG_CLML << "Run End";
}
private:
/*!
* \brief Check whether the given node id is a graph output tensor.
*
*/
bool IsOutputTensor(int nid) {
for (size_t i = 0; i < outputs_.size(); ++i) {
if (nid == outputs_[i].id_) return true;
}
return false;
}
/*!
* \brief Initialize memory pool.
*
*/
void InitMemoryPool(void) {
layer_.on_chip_pool_size.clear();
layer_.on_chip_pool_size.insert({0, cws->onchip_mem_size});
layer_.on_chip_pool_alloc_info.clear();
layer_.alloc_ping_pong = true;
layer_.in_chip_total_free = cws->onchip_mem_size;
layer_.in_chip_total_alloc = 0;
layer_.on_chip_alert_fail = 0;
}
/*!
* \brief Plan memory for activations, allocating on-chip global memory wherever possible.
*
*/
void PlanMemory() {
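// Two pass planning: first build reference counts and life spans for every
// activation tensor, then walk the graph allocating on-chip memory where it
// fits (falling back to a reusable DDR pool) and freeing each input once its
// consumer has been planned.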
InitMemoryPool();
// Build the ref count table for all activation tensors.
LOG_MEM << "Build Ref Map";
for (size_t nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if (node.GetOpType() == "kernel") {
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
for (auto& input_node : inputs) {
if (nodes_[input_node.id_].GetOpType() != "const") {
if (layer_.storage_ref_map.find(input_node.id_) == layer_.storage_ref_map.end()) {
layer_.storage_ref_map.insert({input_node.id_, 1});
layer_.life_span.insert({input_node.id_, nid});
} else {
layer_.storage_ref_map[input_node.id_]++;
layer_.life_span[input_node.id_] = nid;
}
}
}
}
}
LOG_MEM << "Print Ref Map";
for (auto it = layer_.storage_ref_map.begin(); it != layer_.storage_ref_map.end(); it++) {
LOG_MEM << "RefMap:" << it->first << " Count:" << it->second
<< "Life Span:" << layer_.life_span[it->first];
}
for (size_t nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
uint32_t size = 0;
if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
// Possible that some nodes are not consumed by any operation
// Example being nn.pad second argument.
continue;
}
CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, layer_.storage_map[nid].tensor_desc->tensor,
&size);
if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) {
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
LOG_MEM << "Request :" << size << " Nid:" << nid;
size_t offset = -1;
// On-chip memory is used only for intermediate tensors within the recording scope.
if ((cws->is_on_chip_memory) && (!IsOutputTensor(nid)) && (node.GetOpType() != "input")) {
offset = RequestOnChipMemory(&this->layer_, size);
}
if (-1 != offset) {
LOG_MEM << "Got On-Chip Mem:" << offset << "Nid:" << nid;
layer_.on_chip_pool_alloc_info.insert({offset, nid});
layer_.on_chip_alloc_plan.insert({nid, std::make_pair(size, offset)});
} else {
layer_.on_chip_reject.insert({nid, size});
// DDR Allocation
auto ddr_mem = RequestDDRMemory(&this->layer_, size);
LOG_MEM << "Alloc DDR from global pool for nid:" << nid << " Type:" << node.GetOpType();
layer_.ddr_alloc_plan.insert({nid, ddr_mem});
}
// Now free up the input tensors on-chip memory for reuse.
for (auto& input_node : inputs) {
if (nodes_[input_node.id_].GetOpType() != "const") {
LOG_MEM << "Free Input Mem:" << input_node.id_;
FreeMemory(&this->layer_, input_node.id_);
}
}
}
}
// Stats dump
size_t in_chip_total_alloc = 0;
size_t total_reject = 0;
for (auto it = layer_.on_chip_alloc_plan.begin(); it != layer_.on_chip_alloc_plan.end(); it++) {
LOG_STATS << " On-chip Alloc:" << it->first << " Size:" << it->second.first
<< " Offset:" << it->second.second;
in_chip_total_alloc += it->second.first;
}
for (auto it = layer_.on_chip_reject.begin(); it != layer_.on_chip_reject.end(); it++) {
LOG_STATS << "Reject:" << it->first << " Size:" << it->second;
total_reject += it->second;
}
LOG_STATS << "Total On-chip Alloc:" << in_chip_total_alloc + total_reject
<< " On-Chip:" << in_chip_total_alloc << " Reject:" << total_reject
<< " Alert Fail:" << layer_.on_chip_alert_fail;
auto cws = CLMLWorkspace::Global();
for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
}
for (auto it = this->layer_.ddr_storage_ref_map.begin();
it != this->layer_.ddr_storage_ref_map.end(); it++) {
LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
}
}
/*!
* \brief Create a CLML tensor from a JSON node entry. Look up the storage map before creation.
* Also creates the input copy placeholder tensor for input nodes.
*
* \param nid The node index of graph JSON.
* \param shape shape information of tensor
* \param layout the tensor layout to be used
* \param dtype tensor data type
* \return CLML Tensor descriptor.
*/
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
size_t nid, std::vector<size_t> shape, cl_ml_tensor_layout_qcom layout, cl_uint dtype) {
const JSONGraphNode node = nodes_[nid];
cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_CNN_QCOM;
if (this->layer_.storage_map.find(nid) != this->layer_.storage_map.end()) {
if (nullptr != layer_.storage_map[nid].tensor_desc) {
return this->layer_.storage_map[nid].tensor_desc;
}
} else {
this->layer_.storage_map.insert({nid, NodeDescriptor()});
this->layer_.storage_map[nid].node = node;
}
void* node_data = nullptr;
if (node.GetOpType() == "const") {
uint32_t eid = EntryID(nid, 0);
node_data = data_entry_[eid]->data;
usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
ICHECK(CL_TENSOR_USAGE_INVALID_QCOM == this->layer_.storage_map[nid].usage)
<< "Parameter already has a usage reservation!";
}
if (CL_TENSOR_USAGE_INVALID_QCOM != this->layer_.storage_map[nid].usage) {
// Respect special reservation on usage.
usage = this->layer_.storage_map[nid].usage;
} else {
this->layer_.storage_map[nid].usage = usage;
}
if (this->layer_.storage_map[nid].custom_layout) {
// Respect special reservation on layout.
layout = this->layer_.storage_map[nid].layout;
} else {
this->layer_.storage_map[nid].layout = layout;
}
auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape);
this->layer_.storage_map[nid].tensor_desc = clml_tensor;
this->layer_.storage_map[nid].usage = usage;
this->layer_.storage_map[nid].layout = layout;
LOG_CLML << "Storage Map Alloc:" << nid << " Name:" << node.GetOpName() << " Usage: " << usage
<< " Layout:" << layout;
if ("input" == node.GetOpType()) {
this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].tensor_desc});
// Input copy placeholder Tensor
if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
this->layer_.in_placeholder.insert(
{nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, usage, dtype,
node_data, shape)});
} else {
this->layer_.in_placeholder.insert(
{nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, node_data, shape)});
}
}
return clml_tensor;
}
/*!
* \brief Build CLML layer from JSON representation and cache.
*
* \note For the time being only one layer or operator is supported
* per engine.
*/
void BuildEngine() {
size_t nid;
// Create tensors for the operators that need a layout format
// other than CL_TENSOR_LAYOUT_OPTIMAL_QCOM.
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), "nn.softmax"))
CreateSoftmaxLayerTensor(&layer_, node, nid);
}
for (nid = 0; nid < nodes_.size(); ++nid) {
const auto& node = nodes_[nid];
if (node.GetOpType() == "input") {
// Layers may request a different layout. Defer the input allocation.
} else if (node.GetOpType() == "kernel") {
auto op_name = node.GetOpName();
if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, "nn.pad_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
else if (PatternMatch(op_name, "nn.conv2d_transpose"))
CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
else if (PatternMatch(op_name, "nn.relu"))
CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
else if (PatternMatch(op_name, "nn.batch_norm"))
CreateBatchNormLayer(&layer_, node, nid);
else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
"nn.l2_pool2d" == op_name || PatternMatch(op_name, "nn.max_pool2d") ||
PatternMatch(op_name, "nn.avg_pool2d"))
CreatePoolingLayer(&layer_, node, nid);
else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name ||
PatternMatch(op_name, "nn.global_avg_pool2d") ||
PatternMatch(op_name, "nn.global_max_pool2d"))
CreateGlobalPoolingLayer(&layer_, node, nid);
else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
CreateReshapeLayer(&layer_, node, nid);
else if ("concatenate" == op_name)
CreateConcatLayer(&layer_, node, nid);
else if ("nn.dense" == op_name)
CreateDenseLayer(&layer_, node, nid);
else if ("nn.softmax" == op_name || PatternMatch(op_name, "nn.softmax"))
CreateSoftMaxLayer(&layer_, node, nid);
else if ("nn.pad" == op_name)
CreatePadLayer(&layer_, node, nid);
else if ("nn.batch_flatten" == op_name)
CreateBatchFlattenLayer(&layer_, node, nid);
else if ("clip" == op_name)
CreateClipLayer(&layer_, node, nid);
else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
"minimum" == op_name || "maximum" == op_name || "divide" == op_name ||
PatternMatch(op_name, "relax.add") || PatternMatch(op_name, "relax.subtract") ||
PatternMatch(op_name, "relax.multiply") ||
PatternMatch(op_name, "relax.minimum") || PatternMatch(op_name, "relax.maximum") ||
PatternMatch(op_name, "relax.divide"))
CreateBinaryLayer(&layer_, node, nid);
else if ("nn.depth_to_space" == op_name)
CreateDepthToSpaceLayer(&layer_, node, nid);
else if ("nn.upsampling" == op_name)
CreateResizeLayer(&layer_, node, nid);
else if ("nn.batch_matmul" == op_name)
CreateBatchMatmulLayer(&layer_, node, nid);
else
LOG(FATAL) << "Unsupported op: " << op_name;
this->layer_.layer_names.push_back(op_name);
// Keep map of function and Node to use in profiling
this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
} else if (node.GetOpType() != "const") {
LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
}
}
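// Create NCHW placeholder tensors for the sub graph outputs so that results
// can be copied out of the optimal-layout output tensors.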
for (size_t i = 0; i < outputs_.size(); ++i) {
nid = outputs_[i].id_;
DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
// Handle customized shapes here
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype, nullptr,
this->layer_.out_shapes[nid]));
} else {
this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, CL_TENSOR_USAGE_CNN_QCOM, cl_dtype));
}
}
// Plan memory utilization
PlanMemory();
// Allocate device memories and initialize the params, if any
cl_int result = 0;
size_t alloc_on_chip = 0;
size_t alloc_ddr = 0;
size_t alloc_ddr_reuse = 0;
for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
auto tensor_desc = it->second.tensor_desc;
uint32_t mem_size = 0;
result = CL_OUT_OF_HOST_MEMORY;
CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, &mem_size);
JSONGraphNode node = it->second.node;
void* node_data = nullptr;
size_t on_chip_mem_offset = -1;
if (layer_.on_chip_alloc_plan.find(it->first) != layer_.on_chip_alloc_plan.end()) {
LOG_MEM << "Found GMEM Alloc:" << it->first
<< " Size:" << layer_.on_chip_alloc_plan[it->first].first
<< " Offset:" << layer_.on_chip_alloc_plan[it->first].second;
on_chip_mem_offset = layer_.on_chip_alloc_plan[it->first].second;
alloc_on_chip += mem_size;
tensor_desc->memory = AllocateOnChipTensorMemory(mem_size, on_chip_mem_offset);
} else if (layer_.ddr_alloc_plan.find(it->first) != layer_.ddr_alloc_plan.end()) {
LOG_MEM << "DDR Alloc for nid:" << it->first << " Type:" << node.GetOpType();
tensor_desc->memory = layer_.ddr_alloc_plan[it->first];
alloc_ddr_reuse += mem_size;
//} else if ((node.GetOpType() == "input") || IsOutputTensor(it->first) || (node.GetOpType()
//== "const")) {
} else if (node.GetOpType() == "const") {
LOG_MEM << "DDR Alloc for Const/Input/Output";
tensor_desc->memory = AllocateDDRTensorMemory(mem_size);
alloc_ddr += mem_size;
} else {
LOG(FATAL) << "Mem allocation not found on DDR as well as On-Chip nid: " << it->first
<< " Type:" << node.GetOpType();
}
if (node.GetOpType() == "const") {
node_data = data_entry_[EntryID(it->first, 0)]->data;
if (node_data != nullptr) {
CopyDataToCLMLTensor(tensor_desc, node_data);
}
}
this->layer_.tensorMemDescs.push_back(*tensor_desc);
}
LOG_STATS << "Total On-Chip Allocation :" << alloc_on_chip;
LOG_STATS << "Total DDR Reuse Allocation:" << alloc_ddr_reuse;
LOG_STATS << "Total DDR fixed allocation:" << alloc_ddr;
size_t ddr_global_pool = 0;
size_t ddr_local_pool = 0;
auto cws = CLMLWorkspace::Global();
for (auto it = cws->ddr_global_pool.begin(); it != cws->ddr_global_pool.end(); it++) {
LOG_STATS << "DDR Global pool - size:" << it->second.first << " Ref:" << it->second.second;
ddr_global_pool += it->second.first;
}
LOG_STATS << "Total Global Pool:" << ddr_global_pool;
for (auto it = this->layer_.ddr_storage_ref_map.begin();
it != this->layer_.ddr_storage_ref_map.end(); it++) {
LOG_STATS << "DDR Local pool - size:" << it->second.first << " Ref cnt:" << it->second.second;
ddr_local_pool += it->second.first;
}
LOG_STATS << "Total Local Pool:" << ddr_local_pool;
// Setup descriptor set
CLML_CALL(clCreateMLTensorMemoryDescriptorSetQCOM, &this->layer_.descriptorSet);
CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, this->layer_.descriptorSet,
static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
this->layer_.tensorMemDescs.data());
if (cws->is_tuning_run) {
LOG_CLML << "CLML Tunning In Progress:";
// Let the command queue recreated in profiling mode.
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i];
CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
this->layer_.tuning_cache, nullptr);
}
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, false);
size_t cache_len_bytes = 0;
size_t len_ret = 0;
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, 0, nullptr, &cache_len_bytes);
std::vector<unsigned char> saved_cache(cache_len_bytes, 0);
CLML_CALL(clSaveMLTuningCacheQCOM, layer_.tuning_cache, saved_cache.size(),
saved_cache.data(), &len_ret);
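// Each record appended to the tuning file has the layout:
//   uint64 magic (kTVMCLMLTuningCacheMagic) | uint64 reserved |
//   string symbol name | byte vector with the saved CLML tuning cache.
// InitCLML scans these records to find the blob matching clml_symbol.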
std::string tune_str;
dmlc::MemoryStringStream mstrm(&tune_str);
dmlc::Stream* strm = &mstrm;
uint64_t header = kTVMCLMLTuningCacheMagic;
uint64_t reserved = 0x0;
strm->Write(header);
strm->Write(reserved);
strm->Write(clml_symbol);
strm->Write(saved_cache);
std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
fs.write(&tune_str[0], tune_str.length());
LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
<< tune_str.length() << " with tuning blob len " << saved_cache.size();
}
if (cws->is_recordable_queue) {
for (size_t i = 0; i < this->layer_.function.size(); ++i) {
CLML_CALL(clEnqueueMLOpQCOM, this->layer_.recordable_queue, this->layer_.function[i],
this->layer_.descriptorSet, 0, nullptr, nullptr);
}
result = clEndRecordingQCOM(this->layer_.recording);
ICHECK(result == CL_SUCCESS) << "clEndRecordingQCOM:" << result;
}
}
/*!
* \brief Create a 2D convolution layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param mode The conv2d mode type - CL_CONVOLUTION_MODE_CONVOLUTION_QCOM
* or CL_CONVOLUTION_MODE_DEPTHWISE_QCOM
* or CL_CONVOLUTION_MODE_TRANSPOSE_QCOM.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
cl_convolution_mode_qcom mode, size_t nid) {
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
if (!node.HasAttr("padding")) {
clml_padding.resize(4);
std::fill(clml_padding.begin(), clml_padding.end(), 0);
}
cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[0], clml_padding[1]};
cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {clml_padding[2], clml_padding[3]};
std::vector<cl_uint> v_strides = GetVectorValues(strides);
std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], v_strides[1]};
cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_dilation[0], v_dilation[1]};
cl_uint groups = std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
} else {
groups = 1; // Don't need to pass groups to depthwise
}
bool has_act = false;
std::string activation_type;
cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
if (node.HasAttr("activation_type")) {
activation_type = node.GetAttr<std::vector<std::string>>("activation_type")[0];
ICHECK(activation_type == "relu" || activation_type == "relu6")
<< "Unknown activation type:" << activation_type;
if (activation_type == "relu") {
clml_act_type = CL_ACTIVATION_RELU;
} else {
clml_act_type = CL_ACTIVATION_RELU6;
}
has_act = true;
}
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode};
// Collect inputs and outputs, handling nn.conv2d.
std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
size_t num_inputs = inputs.size();
bool has_bias;
bool has_bn;
ICHECK(num_inputs >= 2 && num_inputs <= 7)
<< "Convolution expects between 2 and 7 inputs (data, weight, optional bias, optional batchnorm params)";
has_bias = (num_inputs == 3) || (num_inputs == 7);
has_bn = (num_inputs == 6) || (num_inputs == 7);
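// Input ordering: data, weight, optional bias, then optional batchnorm params
// (scale, bias, mean, variance). 3 or 7 inputs imply a bias; 6 or 7 imply a
// fused batchnorm.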
// Input
auto input =
MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Weight
auto weight =
MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
// Bias
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
bias =
MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
} else {
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
bias->tensor = layer_.unusedTensor;
}
// Output
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_convolution_desc_qcom conv_desc{mode,
groups,
4,
{clml_padding_b[0], clml_padding_b[1]},
{clml_padding_a[0], clml_padding_a[1]},
{clml_strides[0], clml_strides[1]},
{clml_dilation[0], clml_dilation[1]},
0,
cl_arithmetic_mode};
cl_ml_op_qcom op = nullptr;
if (!has_bn) {
if (!has_act) {
CLML_CALL(clCreateMLOpConvolutionForwardQCOM, CLML_CTX, nullptr, &conv_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, &op, nullptr);
} else {
CLML_CALL(clCreateMLOpFusedConvolutionActivationForwardQCOM, CLML_CTX, nullptr, &conv_desc,
&act_desc, input->tensor, weight->tensor, bias->tensor, nullptr, output->tensor,
&op, layer_.tuning_cache);
}
layer->function.push_back(op);
} else {
int bn_index = has_bias ? 3 : 2;
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("batchnorm")[1]);
std::vector<cl_ml_op_properties_qcom> opProperties;
opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
std::vector<size_t> bn_shape = {1, 1, 1, 1};
bn_shape[axis] = bn_dims.n;
auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
if (!has_act) {
CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, opProperties.data(),
&conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, output->tensor,
bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op,
layer_.tuning_cache);
} else {
CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, CLML_CTX,
opProperties.data(), &conv_desc, &bn_desc, &act_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, nullptr, bn_mean->tensor,
bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, layer_.tuning_cache);
}
layer->function.push_back(op);
}
return;
}
/*!
* \brief Create a ReLU(X) layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateReLULayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid,
cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode};
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpActivationForwardQCOM, CLML_CTX, nullptr, &act_desc, input->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Activation Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a batch norm layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchNormLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
float epsilon = std::stof(node.GetAttr<std::vector<std::string>>("epsilon")[0]);
std::vector<cl_ml_op_properties_qcom> opProperties;
opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
auto bn_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
std::vector<size_t> bn_shape = {1, 1, 1, 1};
bn_shape[axis] = bn_dims.n;
auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4].id_, bn_shape,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpBatchNormForwardQCOM, CLML_CTX, opProperties.data(), &bn_desc,
input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Batchnorm Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a pooling layer.
*
* \note Currently nn.max_pool2d and nn.avg_pool2d are supported.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
std::vector<cl_uint> clml_window = GetVectorValues(windows);
std::vector<cl_uint> clml_stride = GetVectorValues(strides);
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
cl_ml_op_pooling_desc_qcom pool_desc = {
((node.GetOpName() == "nn.max_pool2d") || PatternMatch(node.GetOpName(), "nn.max_pool2d"))
? CL_POOLING_MODE_MAX_QCOM
: CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
4, // reserved
{clml_padding[0], clml_padding[1]},
{clml_padding[2], clml_padding[3]},
{clml_stride[0], clml_stride[1]},
{clml_window[0], clml_window[1]},
CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode,
};
cl_ml_tensor_desc_qcom desc = {};
cl_ml_tensor_qcom unusedTensor = nullptr;
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&unusedTensor);
ICHECK(unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Pooling Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a global pooling layer.
*
* \note Currently global_max_pool2d and global_avg_pool2d are supported.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
cl_ml_op_pooling_desc_qcom pool_desc = {
((node.GetOpName() == "nn.global_max_pool2d") ||
PatternMatch(node.GetOpName(), "nn.global_max_pool2d"))
? CL_POOLING_MODE_MAX_QCOM
: CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
4, // reserved
{0, 0},
{0, 0},
{1, 1},
{in_dims.w, in_dims.h},
CL_PROPAGATE_NAN_QCOM,
cl_arithmetic_mode,
};
cl_ml_tensor_desc_qcom desc = {};
desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
CLML_CALL_clCreateMLTensorQCOM(CLML_CTX, nullptr, &desc, CL_TENSOR_USAGE_UNUSED_QCOM,
&layer_.unusedTensor);
ICHECK(layer_.unusedTensor) << "clCreateMLTensorQCOM: unusedTensor";
CLML_CALL(clCreateMLOpPoolingForwardQCOM, CLML_CTX, nullptr, &pool_desc, input->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Pooling Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create Softmax layer tensors with a supported layout.
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_tensor_layout_qcom layout;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
// Enable NHWC or NCHW layout for 4D tensors based on the axis value
if (out_dims.h >= 1 && out_dims.w >= 1) {
if (axis == 3 || axis == -1) {
layout = CL_TENSOR_LAYOUT_NHWC_QCOM;
} else {
layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
}
} else { // default layout for 2D
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
}
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
return;
}
/*!
* \brief Create a SoftMax layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
cl_softmax_mode_qcom mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, mode,
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpSoftmaxQCOM, CLML_CTX, nullptr, &softmax_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "SoftMax Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Pad layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreatePadLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
std::vector<cl_uint> clml_padding = GetVectorValues(padding);
cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
if (pad_mode == "constant")
clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
else if (pad_mode == "edge")
clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
else if (pad_mode == "reflect")
clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
else
LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;
cl_ml_op_pad_desc_qcom pad_desc{
clml_pad_mode,
{0, 0},
{clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpPadQCOM, CLML_CTX, nullptr, &pad_desc, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Pad Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Batch Flatten layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchFlattenLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Reshape Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Reshape layer.
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
CLML_CALL(clCreateMLOpReshapeQCOM, CLML_CTX, nullptr, input->tensor, output->tensor, &op,
layer_.tuning_cache);
ICHECK(op) << "Reshape Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a concat layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateConcatLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
int inputSize = input_.size();
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_uint axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
for (int i = 0; i < inputSize; i++) {
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
concatInputs[i] = input->tensor;
}
cl_ml_op_concat_desc_qcom concatDesc = {axis, (cl_uint)inputSize, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpConcatQCOM, CLML_CTX, nullptr, &concatDesc, concatInputs, output->tensor,
&op, layer_.tuning_cache);
ICHECK(op) << "Concat Error";
layer->function.push_back(op);
delete[] concatInputs;
return;
}
/*!
* \brief Create a dense layer.
*
*
* \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDenseLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
size_t num_inputs = node.GetInputs().size();
bool has_bias = (num_inputs == 3);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
bool is_vec_matmul = false;
if (in_dims.n == 1 && has_bias) {
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
is_vec_matmul = true;
}
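// Two lowering paths: a single-row activation with a bias maps directly to the CLML
// FullyConnected op; anything else is lowered to a GEMM followed by an optional
// element-wise ADD for the bias.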
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.n, wt_dims.c},
layout, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
if (has_bias) {
bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, {}, layout, cl_dtype);
} else {
// No bias input: reuse the shared placeholder (unused) tensor instead of a real bias tensor.
bias->tensor = layer_.unusedTensor;
}
if (is_vec_matmul) {
cl_fc_weight_transform_qcom w_transform = CL_FC_WEIGHT_TRANSFORM_NONE_QCOM;
if (in_dims.c == wt_dims.c) w_transform = CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM;
cl_ml_op_fully_connected_desc_qcom fc_desc{1, // refer clml_ops.txt for struct
w_transform, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpFullyConnectedQCOM, CLML_CTX, nullptr, &fc_desc, input->tensor,
weight->tensor, bias->tensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "FC layer Error";
layer->function.push_back(op);
} else {
cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
if (in_dims.c == wt_dims.c) b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n, // m
wt_dims.n, // n
wt_dims.c, // k
CL_GEMM_TRANSFORM_NONE_QCOM, // A transform
b_transform, // B transform
{{1.0}, CL_FLOAT}, // alpha
{{0.0}, CL_FLOAT}, // beta
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Gemm layer Error";
layer->function.push_back(op);
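// The bias, when present, is applied as a separate element-wise ADD after the GEMM.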
if (has_bias) {
cl_ml_op_binary_desc_qcom binaryDesc = {CL_TENSOR_OP_ADD_QCOM,
{{1.0}, CL_FLOAT}, // alpha
{{1.0}, CL_FLOAT}, // beta
{{1.0}, CL_FLOAT}, // gamma
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binaryDesc, bias->tensor,
layer_.unusedTensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Binary Op Error";
layer->function.push_back(op);
}
}
return;
}
/*!
* \brief Create the tensors for a dense layer with the supported layout (no op is created here).
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDenseLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
size_t num_inputs = node.GetInputs().size();
bool has_bias = (num_inputs == 3);
cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
if (in_dims.n == 1 && has_bias) {
layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
}
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.n, wt_dims.c},
layout, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
return;
}
/*!
* \brief Create a batch_matmul layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchMatmulLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
std::vector<int64_t> out_shape = node.GetOpShape()[0];
std::vector<size_t> clml_out_shape;
clml_out_shape.push_back(out_shape[1]);
clml_out_shape.push_back(out_shape[2]);
clml_out_shape.push_back(1);
clml_out_shape.push_back(1);
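// Map the 3-D relax output (batch, M, N) onto a 4-D CLML NCHW tensor as {M, N, 1, 1};
// only batch_size == 1 is handled here.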
auto output =
MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
layer->out_shapes.insert({nid, clml_out_shape});
cl_bool b_transpose = std::stoi(node.GetAttr<std::vector<std::string>>("transpose_b")[0]);
cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
if (b_transpose) {
b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
}
cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.c, // m
wt_dims.c, // n
wt_dims.h, // k
CL_GEMM_TRANSFORM_NONE_QCOM, // A transform
b_transform, // B transform
{{1.0}, CL_FLOAT}, // alpha
{{0.0}, CL_FLOAT}, // beta
cl_arithmetic_mode};
CLML_CALL(clCreateMLOpGemmQCOM, CLML_CTX, nullptr, &gemmDesc, input->tensor, weight->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "BatchMatmul Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create the tensors for a batch_matmul layer (batch_size=1 supported) with the supported layout.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBatchMatmulLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
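// Tensor-creation pass: mirrors the tensor setup in CreateBatchMatmulLayer without
// creating the GEMM op.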
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {in_dims.c, in_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, wt_dims.c, wt_dims.h},
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
std::vector<int64_t> out_shape = node.GetOpShape()[0];
std::vector<size_t> clml_out_shape;
clml_out_shape.push_back(out_shape[1]);
clml_out_shape.push_back(out_shape[2]);
clml_out_shape.push_back(1);
clml_out_shape.push_back(1);
auto output =
MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
layer->out_shapes.insert({nid, clml_out_shape});
return;
}
/*!
* \brief Create a Clip(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateClipLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_float a_max = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
cl_float a_min = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
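// Clip-by-value: the a_max / a_min attributes of the node become the two scalar
// operands of the CLML clip descriptor.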
cl_ml_op_clip_desc_qcom clip_desc = {
CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};
CLML_CALL_clCreateMLOpClipQCOM(CLML_CTX, nullptr, &clip_desc, input->tensor, output->tensor,
&op, layer_.tuning_cache);
ICHECK(op) << "Clip Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Binary layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateBinaryLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
std::string op_name = node.GetOpName();
cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
if (op_name == "subtract" || PatternMatch(op_name, "relax.subtract"))
binary_op = CL_TENSOR_OP_SUB_QCOM;
else if (op_name == "multiply" || PatternMatch(op_name, "relax.multiply"))
binary_op = CL_TENSOR_OP_MUL_QCOM;
else if (op_name == "divide" || PatternMatch(op_name, "relax.divide"))
binary_op = CL_TENSOR_OP_DIV_QCOM;
else if (op_name == "minimum" || PatternMatch(op_name, "relax.minimum"))
binary_op = CL_TENSOR_OP_MIN_QCOM;
else if (op_name == "maximum" || PatternMatch(op_name, "relax.maximum"))
binary_op = CL_TENSOR_OP_MAX_QCOM;
else if (op_name == "add" || PatternMatch(op_name, "relax.add"))
binary_op = CL_TENSOR_OP_ADD_QCOM;
else
LOG(FATAL) << "Undefined binary op:" << op_name;
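// The binary descriptor also carries alpha/beta/gamma scaling coefficients; they are
// set to 1.0, 1.0 and 0.0 here.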
cl_ml_op_binary_desc_qcom binary_desc = {
binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
LOG_CLML << "Binary op: " << op_name;
CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &binary_desc, input_a->tensor,
input_b->tensor, output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << op_name << " Node Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a DepthToSpace(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateDepthToSpaceLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_uint block_size = std::stoi(node.GetAttr<std::vector<std::string>>("block_size")[0]);
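// depth_to_space rearranges block_size x block_size groups of channels into spatial
// positions; block_size comes straight from the node attribute.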
cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpDepthToSpaceQCOM, CLML_CTX, nullptr, &dtos_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "DepthToSpace Layer Error";
layer->function.push_back(op);
return;
}
/*!
* \brief Create a Resize(X) layer.
*
* \param layer The CLML layer to build, containing inputs, outputs and the CLML function.
* \param node The JSON representation of the operator.
* \param nid The node index of JSON graph node, which points to this operator.
*/
void CreateResizeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
auto output = MakeCLMLTensorFromJSONEntry(nid, {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
cl_bool align_corners = std::stoi(node.GetAttr<std::vector<std::string>>("align_corners")[0]);
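// Resize is lowered to CLML's bilinear resize op; align_corners is the only attribute
// mapped from the node.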
cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, cl_arithmetic_mode};
CLML_CALL(clCreateMLOpResizeBilinearQCOM, CLML_CTX, nullptr, &resize_desc, input->tensor,
output->tensor, &op, layer_.tuning_cache);
ICHECK(op) << "Resize Layer Error";
layer->function.push_back(op);
return;
}
/*!
* \brief The network layers represented by CLML functions.
* \note Currently only supports a single layer.
*/
// This layer instance
CachedLayer layer_;
// CLML Workspace
CLMLWorkspace* cws;
#else
void Run() override {
LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. "
<< "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
void BuildEngine() {
LOG(WARNING) << "CLML engine is not initialized. "
<< "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
#endif
bool CanDebug() override { return true; }
/*! CLML subgraph symbol in the TVM main module */
std::string clml_symbol;
};
ffi::Module CLMLRuntimeCreate(const ffi::String& symbol_name, const ffi::String& graph_json,
const ffi::Array<ffi::String>& const_names) {
auto n = ffi::make_object<CLMLRuntime>(symbol_name, graph_json, const_names);
return ffi::Module(n);
}
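// Register the module-creation entry point and the byte-stream loader with the TVM FFI
// global registry.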
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def("runtime.clml_runtime_create", CLMLRuntimeCreate)
.def("ffi.Module.load_from_bytes.clml", JSONRuntimeBase::LoadFromBytes<CLMLRuntime>);
}
} // namespace contrib
} // namespace runtime
} // namespace tvm