| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite) |
| ================================================================ |
| **Author**: `Siju Samuel <https://github.com/siju-samuel>`_ |
| |
Welcome to part 3 of the Deploy Framework-Prequantized Model with TVM tutorial.
In this part, we will start with a quantized TFLite graph, then compile and execute it via TVM.
| |
| |
| For more details on quantizing the model using TFLite, readers are encouraged to |
| go through `Converting Quantized Models |
| <https://www.tensorflow.org/lite/convert/quantization>`_. |
| |
| The TFLite models can be downloaded from this `link |
| <https://www.tensorflow.org/lite/guide/hosted_models>`_. |
| |
To get started, the TensorFlow and TFLite packages need to be installed as prerequisites.
| |
| .. code-block:: bash |
| |
| # install tensorflow and tflite |
| pip install tensorflow==2.1.0 |
| pip install tflite==2.1.0 |
| |
Now please check that the TFLite package was installed successfully: ``python -c "import tflite"``
| |
| """ |
| |
| ############################################################################### |
| # Necessary imports |
| # ----------------- |
| import os |
| |
| import numpy as np |
| import tflite |
| |
| import tvm |
| from tvm import relay |
| |
| |
| ###################################################################### |
| # Download pretrained Quantized TFLite model |
| # ------------------------------------------ |
| |
# Download the quantized MobileNet V2 TFLite model provided by Google
| from tvm.contrib.download import download_testdata |
| |
| model_url = ( |
| "https://storage.googleapis.com/download.tensorflow.org/models/" |
| "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz" |
| ) |
| |
# Download the model tar file and extract it to get mobilenet_v2_1.0_224_quant.tflite
| model_path = download_testdata( |
| model_url, "mobilenet_v2_1.0_224_quant.tgz", module=["tf", "official"] |
| ) |
| model_dir = os.path.dirname(model_path) |
| |
| |
| ###################################################################### |
# Utils for extracting tar files
# ------------------------------
| def extract(path): |
| import tarfile |
| |
| if path.endswith("tgz") or path.endswith("gz"): |
| dir_path = os.path.dirname(path) |
| tar = tarfile.open(path) |
| tar.extractall(path=dir_path) |
| tar.close() |
| else: |
| raise RuntimeError("Could not decompress the file: " + path) |
| |
| |
| extract(model_path) |
| |
| |
| ###################################################################### |
| # Load a test image |
| # ----------------- |
| |
| ####################################################################### |
| # Get a real image for e2e testing |
| # -------------------------------- |
| def get_real_image(im_height, im_width): |
| from PIL import Image |
| |
| repo_base = "https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/" |
| img_name = "elephant-299.jpg" |
| image_url = os.path.join(repo_base, img_name) |
| img_path = download_testdata(image_url, img_name, module="data") |
| image = Image.open(img_path).resize((im_height, im_width)) |
| x = np.array(image).astype("uint8") |
| data = np.reshape(x, (1, im_height, im_width, 3)) |
| return data |
| |
| |
| data = get_real_image(224, 224) |
| |
| ###################################################################### |
# Load a TFLite model
| # ------------------- |
| |
| ###################################################################### |
# Now we can open mobilenet_v2_1.0_224_quant.tflite
| tflite_model_file = os.path.join(model_dir, "mobilenet_v2_1.0_224_quant.tflite") |
with open(tflite_model_file, "rb") as f:
    tflite_model_buf = f.read()
| |
| # Get TFLite model from buffer |
| try: |
| import tflite |
| |
| tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) |
| except AttributeError: |
| import tflite.Model |
| |
| tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) |
| |
| ############################################################################### |
# Let's run inference on the TFLite pre-quantized model and get the TFLite prediction.
| def run_tflite_model(tflite_model_buf, input_data): |
| """ Generic function to execute TFLite """ |
| try: |
| from tensorflow import lite as interpreter_wrapper |
| except ImportError: |
| from tensorflow.contrib import lite as interpreter_wrapper |
| |
| input_data = input_data if isinstance(input_data, list) else [input_data] |
| |
| interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) |
| interpreter.allocate_tensors() |
| |
| input_details = interpreter.get_input_details() |
| output_details = interpreter.get_output_details() |
| |
| # set input |
| assert len(input_data) == len(input_details) |
| for i in range(len(input_details)): |
| interpreter.set_tensor(input_details[i]["index"], input_data[i]) |
| |
| # Run |
| interpreter.invoke() |
| |
| # get output |
| tflite_output = list() |
| for i in range(len(output_details)): |
| tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) |
| |
| return tflite_output |
| |
| |
| ############################################################################### |
# Let's run inference on the TVM-compiled pre-quantized model and get the TVM prediction.
| def run_tvm(lib): |
| from tvm.contrib import graph_runtime |
| |
| rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu(0))) |
| rt_mod.set_input("input", data) |
| rt_mod.run() |
| tvm_res = rt_mod.get_output(0).asnumpy() |
| tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1] |
| return tvm_pred, rt_mod |
| |
| |
| ############################################################################### |
| # TFLite inference |
| # ---------------- |
| |
| ############################################################################### |
| # Run TFLite inference on the quantized model. |
| tflite_res = run_tflite_model(tflite_model_buf, data) |
| tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1] |
| |
| ############################################################################### |
| # TVM compilation and inference |
| # ----------------------------- |
| |
| ############################################################################### |
# We use the TFLite-Relay parser to convert the TFLite pre-quantized graph into Relay IR. Note that
# the frontend parser call for a pre-quantized model is exactly the same as the frontend parser call
# for an FP32 model. We encourage you to remove the comment from print(mod) and inspect the Relay
# module. You will see many QNN operators, such as Requantize, Quantize and QNN Conv2D.
| dtype_dict = {"input": data.dtype.name} |
| shape_dict = {"input": data.shape} |
| |
| mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict) |
| # print(mod) |
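
###############################################################################
# As an optional aid (a minimal sketch on top of the tutorial flow, assuming
# ``relay.analysis.post_order_visit`` and ``tvm.ir.Op`` are available in your
# TVM build), we can count the operators in the module instead of printing the
# whole IR. The QNN operators (qnn.quantize, qnn.requantize, qnn.conv2d, ...)
# are then easy to spot.
op_freqs = {}


def count_ops(expr):
    # Count every operator call visited in the main Relay function.
    if isinstance(expr, relay.Call) and isinstance(expr.op, tvm.ir.Op):
        op_freqs[expr.op.name] = op_freqs.get(expr.op.name, 0) + 1


relay.analysis.post_order_visit(mod["main"], count_ops)
print("Operator frequencies:", sorted(op_freqs.items()))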
| |
| ############################################################################### |
# Let's now compile the Relay module. We use the "llvm" target here. Please replace it with the
| # target platform that you are interested in. |
| target = "llvm" |
| with tvm.transform.PassContext(opt_level=3): |
| lib = relay.build_module.build(mod, target=target, params=params) |
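
###############################################################################
# As an optional extra (a minimal sketch; the file name used here is just an
# example), the compiled module can be saved to disk with ``export_library``
# and loaded back later with ``tvm.runtime.load_module``, e.g. on a deployment
# machine.
lib_path = os.path.join(model_dir, "deploy_lib.tar")
lib.export_library(lib_path)
loaded_lib = tvm.runtime.load_module(lib_path)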
| |
| ############################################################################### |
# Finally, let's run inference on the TVM-compiled module.
| tvm_pred, rt_mod = run_tvm(lib) |
| |
| ############################################################################### |
| # Accuracy comparison |
| # ------------------- |
| |
| ############################################################################### |
# Print the top-5 labels for TFLite and TVM inference.
# We compare labels rather than raw output values because the requantize implementation differs
# between TFLite and Relay, which causes the final output numbers to mismatch. So, we test
# accuracy via labels.
| |
| print("TVM Top-5 labels:", tvm_pred) |
| print("TFLite Top-5 labels:", tflite_pred) |
| |
| |
| ########################################################################## |
| # Measure performance |
| # ------------------- |
# Here we give an example of how to measure the performance of TVM-compiled models.
| n_repeat = 100 # should be bigger to make the measurement more accurate |
| ctx = tvm.cpu(0) |
| ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=n_repeat) |
| prof_res = np.array(ftimer().results) * 1e3 |
| print("Elapsed average ms:", np.mean(prof_res)) |
| |
| ###################################################################### |
| # .. note:: |
| # |
| # Unless the hardware has special support for fast 8 bit instructions, quantized models are |
| # not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does |
| # quantized convolution in 16 bit, even if the model itself is 8 bit. |
| # |
| # For x86, the best performance can be achieved on CPUs with AVX512 instructions set. |
| # In this case, TVM utilizes the fastest available 8 bit instructions for the given target. |
| # This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer). |
| # For EC2 C5.12x large instance, TVM latency for this tutorial is ~2 ms. |
| # |
| # Intel conv2d NCHWc schedule on ARM gives better end-to-end latency compared to ARM NCHW |
| # conv2d spatial pack schedule for many TFLite networks. ARM winograd performance is higher but |
| # it has a high memory footprint. |
| # |
| # Moreover, the following general tips for CPU performance equally applies: |
| # |
| # * Set the environment variable TVM_NUM_THREADS to the number of physical cores |
| # * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or |
| # "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future) |
| # * Perform autotuning - `Auto-tuning a convolution network for x86 CPU |
| # <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html>`_. |
| # * To get best inference performance on ARM CPU, change target argument according to your |
| # device and follow `Auto-tuning a convolution network for ARM CPU |
| # <https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_arm.html>`_. |