| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite) |
| ================================================================ |
| **Author**: `Siju Samuel <https://github.com/siju-samuel>`_ |
| |
Welcome to part 3 of the Deploy Framework-Prequantized Model with TVM tutorial.
In this part, we will start with a quantized TFLite graph, then compile and execute it via TVM.
| |
| |
| For more details on quantizing the model using TFLite, readers are encouraged to |
| go through `Converting Quantized Models |
| <https://www.tensorflow.org/lite/convert/quantization>`_. |
| |
| The TFLite models can be downloaded from this `link |
| <https://www.tensorflow.org/lite/guide/hosted_models>`_. |
| |
To get started, the TensorFlow and TFLite packages need to be installed as prerequisites.
| |
| .. code-block:: bash |
| |
| # install tensorflow and tflite |
| pip install tensorflow==2.1.0 |
| pip install tflite==2.1.0 |
| |
Now please check that the TFLite package was installed successfully: ``python -c "import tflite"``
| |
| """ |
| |
| ############################################################################### |
| # Necessary imports |
| # ----------------- |
| import os |
| |
| import numpy as np |
| import tflite |
| |
| import tvm |
| from tvm import relay |
| |
| |
| ###################################################################### |
| # Download pretrained Quantized TFLite model |
| # ------------------------------------------ |
| |
# Download the quantized MobileNet V2 TFLite model provided by Google
| from tvm.contrib.download import download_testdata |
| |
| model_url = ( |
| "https://storage.googleapis.com/download.tensorflow.org/models/" |
| "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz" |
| ) |
| |
# Download the model tar file and extract it to get mobilenet_v2_1.0_224_quant.tflite
| model_path = download_testdata( |
| model_url, "mobilenet_v2_1.0_224_quant.tgz", module=["tf", "official"] |
| ) |
| model_dir = os.path.dirname(model_path) |
| |
| |
| ###################################################################### |
# Utils for extracting tar files
# ------------------------------
| def extract(path): |
| import tarfile |
| |
| if path.endswith("tgz") or path.endswith("gz"): |
| dir_path = os.path.dirname(path) |
| tar = tarfile.open(path) |
| tar.extractall(path=dir_path) |
| tar.close() |
| else: |
| raise RuntimeError("Could not decompress the file: " + path) |
| |
| |
| extract(model_path) |
| |
| |
| ###################################################################### |
| # Load a test image |
| # ----------------- |
| |
| ####################################################################### |
| # Get a real image for e2e testing |
| # -------------------------------- |
| def get_real_image(im_height, im_width): |
| from PIL import Image |
| |
| repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/" |
| img_name = "elephant-299.jpg" |
| image_url = os.path.join(repo_base, img_name) |
| img_path = download_testdata(image_url, img_name, module="data") |
| image = Image.open(img_path).resize((im_height, im_width)) |
| x = np.array(image).astype("uint8") |
| data = np.reshape(x, (1, im_height, im_width, 3)) |
| return data |
| |
| |
| data = get_real_image(224, 224) |
| |
| ###################################################################### |
# Load a TFLite model
| # ------------------- |
| |
| ###################################################################### |
# Now we can open mobilenet_v2_1.0_224_quant.tflite
| tflite_model_file = os.path.join(model_dir, "mobilenet_v2_1.0_224_quant.tflite") |
with open(tflite_model_file, "rb") as f:
    tflite_model_buf = f.read()
| |
| # Get TFLite model from buffer |
| try: |
| import tflite |
| |
| tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) |
| except AttributeError: |
| import tflite.Model |
| |
| tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) |
| |
| ############################################################################### |
# Let's run inference on the TFLite pre-quantized model and get the TFLite prediction.
| def run_tflite_model(tflite_model_buf, input_data): |
| """ Generic function to execute TFLite """ |
| try: |
| from tensorflow import lite as interpreter_wrapper |
| except ImportError: |
| from tensorflow.contrib import lite as interpreter_wrapper |
| |
| input_data = input_data if isinstance(input_data, list) else [input_data] |
| |
| interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) |
| interpreter.allocate_tensors() |
| |
| input_details = interpreter.get_input_details() |
| output_details = interpreter.get_output_details() |
| |
| # set input |
| assert len(input_data) == len(input_details) |
| for i in range(len(input_details)): |
| interpreter.set_tensor(input_details[i]["index"], input_data[i]) |
| |
| # Run |
| interpreter.invoke() |
| |
| # get output |
| tflite_output = list() |
| for i in range(len(output_details)): |
| tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) |
| |
| return tflite_output |
| |
| |
| ############################################################################### |
# Let's run inference on the TVM-compiled pre-quantized model and get the TVM prediction.
| def run_tvm(lib): |
| from tvm.contrib import graph_runtime |
| |
| rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0))) |
| rt_mod.set_input("input", data) |
| rt_mod.run() |
| tvm_res = rt_mod.get_output(0).asnumpy() |
| tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1] |
| return tvm_pred, rt_mod |
| |
| |
| ############################################################################### |
| # TFLite inference |
| # ---------------- |
| |
| ############################################################################### |
| # Run TFLite inference on the quantized model. |
| tflite_res = run_tflite_model(tflite_model_buf, data) |
| tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1] |
| |
| ############################################################################### |
| # TVM compilation and inference |
| # ----------------------------- |
| |
| ############################################################################### |
# We use the TFLite-Relay parser to convert the TFLite pre-quantized graph into Relay IR. Note that
# the frontend parser call for a pre-quantized model is exactly the same as the frontend parser call
# for an FP32 model. We encourage you to remove the comment from print(mod) and inspect the Relay
# module. You will see many QNN operators, such as Requantize, Quantize and QNN Conv2D.
| dtype_dict = {"input": data.dtype.name} |
| shape_dict = {"input": data.shape} |
| |
| mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict) |
| # print(mod) |
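
###############################################################################
# As an optional aid (a minimal sketch on top of the tutorial flow, assuming
# ``relay.analysis.post_order_visit`` and ``tvm.ir.Op`` are available in your
# TVM build), we can count the operators in the module instead of printing the
# whole IR. The QNN operators (qnn.quantize, qnn.requantize, qnn.conv2d, ...)
# are then easy to spot.
op_freqs = {}


def count_ops(expr):
    # Count every operator call visited in the main Relay function.
    if isinstance(expr, relay.Call) and isinstance(expr.op, tvm.ir.Op):
        op_freqs[expr.op.name] = op_freqs.get(expr.op.name, 0) + 1


relay.analysis.post_order_visit(mod["main"], count_ops)
print("Operator frequencies:", sorted(op_freqs.items()))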
| |
| ############################################################################### |
# Let's now compile the Relay module. We use the "llvm" target here. Please replace it with the
| # target platform that you are interested in. |
| target = "llvm" |
| with tvm.transform.PassContext(opt_level=3): |
| lib = relay.build_module.build(mod, target=target, params=params) |
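
###############################################################################
# As an optional extra (a minimal sketch; the file name used here is just an
# example), the compiled module can be saved to disk with ``export_library``
# and loaded back later with ``tvm.runtime.load_module``, e.g. on a deployment
# machine.
lib_path = os.path.join(model_dir, "deploy_lib.tar")
lib.export_library(lib_path)
loaded_lib = tvm.runtime.load_module(lib_path)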
| |
| ############################################################################### |
# Finally, let's run inference on the TVM-compiled module.
| tvm_pred, rt_mod = run_tvm(lib) |
| |
| ############################################################################### |
| # Accuracy comparison |
| # ------------------- |
| |
| ############################################################################### |
# Print the top-5 labels for TFLite and TVM inference.
# We compare labels rather than raw output values because the requantize implementation differs
# between TFLite and Relay, which causes the final output numbers to mismatch. So, we test
# accuracy via labels.
| |
| print("TVM Top-5 labels:", tvm_pred) |
| print("TFLite Top-5 labels:", tflite_pred) |
| |
| |
| ########################################################################## |
| # Measure performance |
| # ------------------- |
# Here we give an example of how to measure the performance of TVM-compiled models.
| n_repeat = 100 # should be bigger to make the measurement more accurate |
| ctx = tvm.cpu(0) |
| ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) |
| prof_res = np.array(ftimer().results) * 1e3 |
| print("Elapsed average ms:", np.mean(prof_res)) |
| |
| ###################################################################### |
| # .. note:: |
| # |
| # Unless the hardware has special support for fast 8 bit instructions, quantized models are |
| # not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does |
| # quantized convolution in 16 bit, even if the model itself is 8 bit. |
| # |
| # For x86, the best performance can be achieved on CPUs with AVX512 instructions set. |
| # In this case, TVM utilizes the fastest available 8 bit instructions for the given target. |
| # This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer). |
| # For EC2 C5.12x large instance, TVM latency for this tutorial is ~2 ms. |
| # |
| # Intel conv2d NCHWc schedule on ARM gives better end-to-end latency compared to ARM NCHW |
| # conv2d spatial pack schedule for many TFLite networks. ARM winograd performance is higher but |
| # it has a high memory footprint. |
| # |
| # Moreover, the following general tips for CPU performance equally applies: |
| # |
| # * Set the environment variable TVM_NUM_THREADS to the number of physical cores |
| # * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or |
| # "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future) |
| # * Perform autotuning - `Auto-tuning a convolution network for x86 CPU |
| # <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html>`_. |
| # * To get best inference performance on ARM CPU, change target argument according to your |
| # device and follow `Auto-tuning a convolution network for ARM CPU |
| # <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`_. |