blob: 96905598737c1a39b404f1b803bec84f255ba914 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file runtime/contrib/tensorrt/tensorrt_builder.h
* \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine
* which can be used for inference.
*/
#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
#include <tvm/runtime/tensor.h>
#include <string>
#include <unordered_map>
#include <vector>
#include "../json/json_node.h"
#include "NvInfer.h"
#include "tensorrt_logger.h"
#include "tensorrt_ops.h"
namespace tvm {
namespace runtime {
namespace contrib {
using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
/*!
* \brief The product of TensorRTBuilder which provides everything needed to
* perform inference.
*/
struct TensorRTEngineAndContext {
  /*! \brief The compiled TensorRT engine. */
  nvinfer1::ICudaEngine* engine = nullptr;
  /*! \brief Execution context used to run inference with \p engine. */
  nvinfer1::IExecutionContext* context = nullptr;
  /*! \brief Names of the network inputs, in binding order. */
  std::vector<std::string> inputs;
  /*! \brief Names of the network outputs, in binding order. */
  std::vector<std::string> outputs;
};
/*!
* \brief Converts a JSONRuntime graph into a TensorRT engine and execution context. Inputs,
* constants, layers, and outputs can be added to construct the TensorRT network definition.
* BuildEngine() will then use the network definition to build the TensorRT engine and context which
* can be used to run inference - this phase can take a long time because TensorRT will query the
* performance of all available kernels and fusions to optimize the engine.
*/
class TensorRTBuilder {
 public:
  /*!
   * \brief Create TensorRT builder.
   * \param logger TensorRT logger to use for errors and warnings.
   * \param data_entry The input and output tensors from TVM; referenced (not copied) for the
   * lifetime of the builder.
   * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
   * \param use_implicit_batch Whether to use implicit batch mode (default)
   * \param use_fp16 Whether to automatically convert a model to fp16
   * \param batch_size If use_implicit_batch is true, the batch size the engine will be built for.
   * \param calibrator Calibrator used for int8 quantization; nullptr for fp32/fp16 precision.
   */
  TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
                  size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, int batch_size,
                  nvinfer1::IInt8Calibrator* calibrator = nullptr);
  /*!
   * \brief Add TensorRT input(s) for input node in network definition.
   * \param nid The input node id.
   * \param entry_id The index into data_entry_ for first entry in node.
   * \param node The input node.
   */
  void AddInput(int nid, uint32_t entry_id, const JSONGraphNode& node);
  /*!
   * \brief Add TensorRT weight for input constant in network definition.
   * \param nid The node id of the constant.
   * \param data The data tensor on CPU.
   */
  void AddConstant(int nid, const DLTensor* data);
  /*!
   * \brief Add TensorRT layer for op node in network definition.
   * \param nid The node id of the op.
   * \param node The op node.
   */
  void AddLayer(int nid, const JSONGraphNode& node);
  /*!
   * \brief Mark TensorRT output in network definition.
   * \param entry The output node entry.
   * \param entry_id The output node entry id.
   */
  void AddOutput(const JSONGraphNodeEntry& entry, uint32_t entry_id);
  /*!
   * \brief Takes network definition and "compiles" a TensorRT engine which can be used for
   * inference. This step is time consuming: TensorRT profiles candidate kernels and fusions to
   * optimize the engine.
   * \return TRT engine, context, and input/output information.
   */
  TensorRTEngineAndContext BuildEngine();
 private:
  /*! \brief Convert a DLTensor to a TensorRT weight. */
  nvinfer1::Weights GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device);
  /*! \brief Convert an input to a Tensor if it is a Weight */
  nvinfer1::ITensor* GetInputAsTensor(const TensorRTOpInput& input);
  /*! \brief Clean up resources used to create engine. */
  void CleanUp();
  /*! \brief Maps a node to its outputs. */
  std::unordered_map<int, std::vector<TensorRTOpInput>> node_output_map_;
  /*! \brief TensorRT builder. */
  nvinfer1::IBuilder* builder_ = nullptr;
#if TRT_VERSION_GE(6, 0, 1)
  /*! \brief TensorRT builder config (TensorRT >= 6.0.1 only). */
  nvinfer1::IBuilderConfig* config_ = nullptr;
#endif
  /*! \brief TensorRT network definition. */
  nvinfer1::INetworkDefinition* network_ = nullptr;
  /*! \brief List of all weights held in memory. */
  std::vector<nvinfer1::Weights> trt_weights_;
  /*! \brief Input and output tensors from TVM. */
  const std::vector<const DLTensor*>& data_entry_;
  /*! \brief Map TensorRT binding name to index in data_entry_. */
  std::unordered_map<std::string, uint32_t> entry_id_map_;
  /*! \brief Max workspace size in bytes for TRT. */
  size_t max_workspace_size_;
  /*! \brief Whether to use implicit batch mode. */
  bool use_implicit_batch_;
  /*! \brief Whether to automatically convert model to 16-bit floating point precision. */
  bool use_fp16_;
  /*! \brief Whether to automatically convert model to int8 precision. */
  bool use_int8_;
  /*! \brief Batch size to optimize for. */
  int batch_size_;
  /*! \brief Input names. */
  std::vector<std::string> network_input_names_;
  /*! \brief Output names. */
  std::vector<std::string> network_output_names_;
  /*! \brief Calibrator used to feed batch data in int8 mode; nullptr for fp16/fp32 precision. */
  nvinfer1::IInt8Calibrator* calibrator_;
};
} // namespace contrib
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_