blob: 96905598737c1a39b404f1b803bec84f255ba914 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file runtime/contrib/tensorrt/tensorrt_builder.h
* \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine
* which can be used for inference.
*/
#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_
#include <tvm/runtime/tensor.h>
#include <string>
#include <unordered_map>
#include <vector>
#include "../json/json_node.h"
#include "NvInfer.h"
#include "tensorrt_logger.h"
#include "tensorrt_ops.h"
namespace tvm {
namespace runtime {
namespace contrib {
using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
/*!
* \brief The product of TensorRTBuilder which provides everything needed to
* perform inference.
*/
struct TensorRTEngineAndContext {
  /*! \brief The compiled TensorRT engine. */
  nvinfer1::ICudaEngine* engine = nullptr;
  /*! \brief Execution context used to run inference with \p engine. */
  nvinfer1::IExecutionContext* context = nullptr;
  /*! \brief Names of the network inputs, in binding order. */
  std::vector<std::string> inputs;
  /*! \brief Names of the network outputs, in binding order. */
  std::vector<std::string> outputs;
};
/*!
* \brief Converts a JSONRuntime graph into a TensorRT engine and execution context. Inputs,
* constants, layers, and outputs can be added to construct the TensorRT network definition.
* BuildEngine() will then use the network definition to build the TensorRT engine and context which
* can be used to run inference - this phase can take a long time because TensorRT will query the
* performance of all available kernels and fusions to optimize the engine.
*/
class TensorRTBuilder {
 public:
  /*!
   * \brief Create TensorRT builder.
   * \param logger TensorRT logger to use for errors and warnings.
   * \param data_entry The input and output tensors from TVM; referenced (not copied) for the
   * lifetime of the builder.
   * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
   * \param use_implicit_batch Whether to use implicit batch mode (default)
   * \param use_fp16 Whether to automatically convert a model to fp16
   * \param batch_size If use_implicit_batch is true, the batch size the engine will be built for.
   * \param calibrator Calibrator used for int8 quantization; nullptr for fp32/fp16 precision.
   */
  TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
                  size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, int batch_size,
                  nvinfer1::IInt8Calibrator* calibrator = nullptr);
  /*!
   * \brief Add TensorRT input(s) for input node in network definition.
   * \param nid The input node id.
   * \param entry_id The index into data_entry_ for first entry in node.
   * \param node The input node.
   */
  void AddInput(int nid, uint32_t entry_id, const JSONGraphNode& node);
  /*!
   * \brief Add TensorRT weight for input constant in network definition.
   * \param nid The node id of the constant.
   * \param data The data tensor on CPU.
   */
  void AddConstant(int nid, const DLTensor* data);
  /*!
   * \brief Add TensorRT layer for op node in network definition.
   * \param nid The node id of the op.
   * \param node The op node.
   */
  void AddLayer(int nid, const JSONGraphNode& node);
  /*!
   * \brief Mark TensorRT output in network definition.
   * \param entry The output node entry.
   * \param entry_id The output node entry id.
   */
  void AddOutput(const JSONGraphNodeEntry& entry, uint32_t entry_id);
  /*!
   * \brief Takes network definition and "compiles" a TensorRT engine which can be used for
   * inference. This step is time consuming: TensorRT profiles candidate kernels and fusions to
   * optimize the engine.
   * \return TRT engine, context, and input/output information.
   */
  TensorRTEngineAndContext BuildEngine();
 private:
  /*! \brief Convert a DLTensor to a TensorRT weight. */
  nvinfer1::Weights GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device);
  /*! \brief Convert an input to a Tensor if it is a Weight */
  nvinfer1::ITensor* GetInputAsTensor(const TensorRTOpInput& input);
  /*! \brief Clean up resources used to create engine. */
  void CleanUp();
  /*! \brief Maps a node to its outputs. */
  std::unordered_map<int, std::vector<TensorRTOpInput>> node_output_map_;
  /*! \brief TensorRT builder. */
  nvinfer1::IBuilder* builder_ = nullptr;
#if TRT_VERSION_GE(6, 0, 1)
  /*! \brief TensorRT builder config (TensorRT >= 6.0.1 only). */
  nvinfer1::IBuilderConfig* config_ = nullptr;
#endif
  /*! \brief TensorRT network definition. */
  nvinfer1::INetworkDefinition* network_ = nullptr;
  /*! \brief List of all weights held in memory. */
  std::vector<nvinfer1::Weights> trt_weights_;
  /*! \brief Input and output tensors from TVM. */
  const std::vector<const DLTensor*>& data_entry_;
  /*! \brief Map TensorRT binding name to index in data_entry_. */
  std::unordered_map<std::string, uint32_t> entry_id_map_;
  /*! \brief Max workspace size in bytes for TRT. */
  size_t max_workspace_size_;
  /*! \brief Whether to use implicit batch mode. */
  bool use_implicit_batch_;
  /*! \brief Whether to automatically convert model to 16-bit floating point precision. */
  bool use_fp16_;
  /*! \brief Whether to automatically convert model to int8 precision. */
  bool use_int8_;
  /*! \brief Batch size to optimize for. */
  int batch_size_;
  /*! \brief Input names. */
  std::vector<std::string> network_input_names_;
  /*! \brief Output names. */
  std::vector<std::string> network_output_names_;
  /*! \brief Calibrator used to feed batch data in int8 mode; nullptr for fp16/fp32 precision. */
  nvinfer1::IInt8Calibrator* calibrator_;
};
} // namespace contrib
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_