| |
| |
| |
| |
| |
| |
<!DOCTYPE html>
<html class="writer-html5" lang="en">
<head>
  <meta charset="utf-8">

  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>Deploy a Quantized Model on Cuda — tvm 0.9.dev0 documentation</title>

  <!-- Stylesheets: each loaded exactly once (theme.css and pygments.css were previously duplicated). -->
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
  <link rel="stylesheet" href="../../_static/css/theme.css">
  <link rel="stylesheet" href="../../_static/pygments.css">
  <link rel="stylesheet" href="../../_static/gallery.css">
  <link rel="stylesheet" href="../../_static/css/tlcpack_theme.css">

  <link rel="shortcut icon" href="../../_static/tvm-logo-square.png">

  <!-- Sphinx runtime options: loaded once (was duplicated with the same id, which is invalid HTML). -->
  <script id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
  <script src="../../_static/jquery.js"></script>
  <script src="../../_static/underscore.js"></script>
  <script src="../../_static/doctools.js"></script>

  <script src="../../_static/js/theme.js"></script>
  <script src="../../_static/js/tlcpack_theme.js"></script>

  <link rel="index" title="Index" href="../../genindex.html">
  <link rel="search" title="Search" href="../../search.html">
  <link rel="next" title="Deploy a Hugging Face Pruned Model on CPU" href="deploy_sparse.html">
  <link rel="prev" title="Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)" href="deploy_prequantized_tflite.html">
</head>
| |
| <body class="wy-body-for-nav"> |
| |
| |
| <div class="wy-grid-for-nav"> |
| |
| |
<header class="header">
  <div class="innercontainer">
    <div class="headerInner d-flex justify-content-between align-items-center">
      <div class="headerLogo">
        <!-- All attribute values quoted (several src/href values were previously unquoted). -->
        <a href="https://tvm.apache.org/"><img src="https://tvm.apache.org/assets/images/logo.svg" alt="logo"></a>
      </div>

      <div id="headMenu" class="headerNav">
        <button type="button" id="closeHeadMenu" class="navCloseBtn"><img src="../../_static/img/close-icon.svg" alt="Close"></button>
        <ul class="nav">
          <li class="nav-item">
            <a class="nav-link" href="https://tvm.apache.org/community">Community</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://tvm.apache.org/download">Download</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://tvm.apache.org/vta">VTA</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://tvm.apache.org/blog">Blog</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://tvm.apache.org/docs">Docs</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://tvmconf.org">Conference</a>
          </li>
          <li class="nav-item">
            <a class="nav-link" href="https://github.com/apache/tvm/">GitHub</a>
          </li>
        </ul>
        <!-- Mobile (collapsed-menu) variant of the ASF links dropdown. -->
        <div class="responsivetlcdropdown">
          <button type="button" class="btn-link">
            ASF
          </button>
          <ul>
            <li>
              <a href="https://apache.org/">Apache Homepage</a>
            </li>
            <li>
              <a href="https://www.apache.org/licenses/">License</a>
            </li>
            <li>
              <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
            </li>
            <li>
              <a href="https://www.apache.org/security/">Security</a>
            </li>
            <li>
              <a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
            </li>
            <li>
              <a href="https://www.apache.org/events/current-event">Events</a>
            </li>
          </ul>
        </div>
      </div>
      <div class="responsiveMenuIcon">
        <button type="button" id="menuBtn" class="btn-menu"><img src="../../_static/img/menu-icon.svg" alt="Menu Icon"></button>
      </div>

      <!-- Desktop variant of the ASF links dropdown (Bootstrap-driven). -->
      <div class="tlcDropdown">
        <div class="dropdown">
          <button type="button" class="btn-link dropdown-toggle" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
            ASF
          </button>
          <div class="dropdown-menu dropdown-menu-right">
            <ul>
              <li>
                <a href="https://apache.org/">Apache Homepage</a>
              </li>
              <li>
                <a href="https://www.apache.org/licenses/">License</a>
              </li>
              <li>
                <a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
              </li>
              <li>
                <a href="https://www.apache.org/security/">Security</a>
              </li>
              <li>
                <a href="https://www.apache.org/foundation/thanks.html">Thanks</a>
              </li>
              <li>
                <a href="https://www.apache.org/events/current-event">Events</a>
              </li>
            </ul>
          </div>
        </div>
      </div>
    </div>
  </div>
</header>
| |
<nav data-toggle="wy-nav-shift" class="wy-nav-side fixed">
  <div class="wy-side-scroll">
    <div class="wy-side-nav-search">
      <a href="../../index.html">
        <img src="../../_static/tvm-logo-small.png" class="logo" alt="Logo">
      </a>

      <div class="version">
        0.9.dev0
      </div>

      <div role="search">
        <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
          <!-- aria-label added: placeholder alone is not an accessible name for the search field. -->
          <input type="text" name="q" placeholder="Search docs" aria-label="Search docs">
          <input type="hidden" name="check_keywords" value="yes">
          <input type="hidden" name="area" value="default">
        </form>
      </div>
    </div>

    <!-- Sphinx toctree: the "current" classes mark the path to this page. -->
    <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
      <p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../install/index.html">Installing TVM</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../contribute/index.html">Contributor Guide</a></li>
      </ul>
      <p class="caption" role="heading"><span class="caption-text">User Guide</span></p>
      <ul class="current">
        <li class="toctree-l1"><a class="reference internal" href="../../tutorial/index.html">User Tutorial</a></li>
        <li class="toctree-l1 current"><a class="reference internal" href="../index.html">How To Guides</a><ul class="current">
          <li class="toctree-l2"><a class="reference internal" href="../compile_models/index.html">Compile Deep Learning Models</a></li>
          <li class="toctree-l2 current"><a class="reference internal" href="../deploy/index.html">Deploy Models and Integrate TVM</a><ul class="current">
            <li class="toctree-l3"><a class="reference internal" href="../deploy/index.html#build-the-tvm-runtime-library">Build the TVM runtime library</a></li>
            <li class="toctree-l3"><a class="reference internal" href="../deploy/index.html#cross-compile-the-tvm-runtime-for-other-architectures">Cross compile the TVM runtime for other architectures</a></li>
            <li class="toctree-l3"><a class="reference internal" href="../deploy/index.html#optimize-and-tune-models-for-target-devices">Optimize and tune models for target devices</a></li>
            <li class="toctree-l3"><a class="reference internal" href="../deploy/index.html#deploy-optimized-model-on-target-devices">Deploy optimized model on target devices</a></li>
            <li class="toctree-l3 current"><a class="reference internal" href="../deploy/index.html#additional-deployment-how-tos">Additional Deployment How-Tos</a><ul class="current">
              <li class="toctree-l4 current"><a class="reference internal" href="index.html">Deploy Deep Learning Models</a></li>
            </ul>
            </li>
          </ul>
          </li>
          <li class="toctree-l2"><a class="reference internal" href="../work_with_relay/index.html">Work With Relay</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../work_with_schedules/index.html">Work With Tensor Expression and Schedules</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../optimize_operators/index.html">Optimize Tensor Operators</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../tune_with_autotvm/index.html">Auto-Tune with Templates and AutoTVM</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../tune_with_autoscheduler/index.html">Use AutoScheduler for Template-Free Scheduling</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../work_with_microtvm/index.html">Work With microTVM</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../extend_tvm/index.html">Extend TVM</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../profile/index.html">Profile Models</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../../errors.html">Handle TVM Errors</a></li>
          <li class="toctree-l2"><a class="reference internal" href="../../faq.html">Frequently Asked Questions</a></li>
        </ul>
        </li>
      </ul>
      <p class="caption" role="heading"><span class="caption-text">Developer Guide</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../dev/tutorial/index.html">Developer Tutorial</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../dev/how_to/how_to.html">Developer How-To Guide</a></li>
      </ul>
      <p class="caption" role="heading"><span class="caption-text">Architecture Guide</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../arch/index.html">Design and Architecture</a></li>
      </ul>
      <p class="caption" role="heading"><span class="caption-text">Topic Guides</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../topic/microtvm/index.html">microTVM: TVM on bare-metal</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../topic/vta/index.html">VTA: Versatile Tensor Accelerator</a></li>
      </ul>
      <p class="caption" role="heading"><span class="caption-text">Reference Guide</span></p>
      <ul>
        <li class="toctree-l1"><a class="reference internal" href="../../reference/langref/index.html">Language Reference</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../reference/api/python/index.html">Python API</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../reference/api/links.html">Other APIs</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../reference/publications.html">Publications</a></li>
        <li class="toctree-l1"><a class="reference internal" href="../../genindex.html">Index</a></li>
      </ul>
    </div>
  </div>
</nav>
| |
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

  <nav class="wy-nav-top" aria-label="top navigation" data-toggle="wy-nav-top">

    <div class="togglemenu">

    </div>
    <div class="nav-content">
      <!-- tvm -->
      Table of contents
    </div>

  </nav>


  <div class="wy-nav-content">

    <div class="rst-content">

      <div role="navigation" aria-label="breadcrumbs navigation">

        <ul class="wy-breadcrumbs">

          <li><a href="../../index.html">Docs</a> <span class="br-arrow">&gt;</span></li>

          <li><a href="../index.html">How To Guides</a> <span class="br-arrow">&gt;</span></li>

          <li><a href="../deploy/index.html">Deploy Models and Integrate TVM</a> <span class="br-arrow">&gt;</span></li>

          <li><a href="index.html">Deploy Deep Learning Models</a> <span class="br-arrow">&gt;</span></li>

          <li>Deploy a Quantized Model on Cuda</li>

          <li class="wy-breadcrumbs-aside">
            <!-- Link to the reStructuredText source this page was generated from. -->
            <a href="../../_sources/how_to/deploy_models/deploy_quantized.rst.txt" rel="nofollow"> <img src="../../_static/img/source.svg" alt="View page source"></a>
          </li>

        </ul>

        <hr>
      </div>
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <div class="sphx-glr-download-link-note admonition note"> |
| <p class="admonition-title">Note</p> |
| <p>Click <a class="reference internal" href="#sphx-glr-download-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">here</span></a> to download the full example code</p> |
| </div> |
| <div class="sphx-glr-example-title section" id="deploy-a-quantized-model-on-cuda"> |
| <span id="sphx-glr-how-to-deploy-models-deploy-quantized-py"></span><h1>Deploy a Quantized Model on Cuda<a class="headerlink" href="#deploy-a-quantized-model-on-cuda" title="Permalink to this headline">¶</a></h1> |
| <p><strong>Author</strong>: <a class="reference external" href="https://github.com/vinx13">Wuwei Lin</a></p> |
| <p>This article is an introductory tutorial of automatic quantization with TVM. |
| Automatic quantization is one of the quantization modes in TVM. More details on |
| the quantization story in TVM can be found |
| <a class="reference external" href="https://discuss.tvm.apache.org/t/quantization-story/3920">here</a>. |
| In this tutorial, we will import a GluonCV pre-trained model on ImageNet to |
| Relay, quantize the Relay model and then perform the inference.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tvm</span> |
| <span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">te</span> |
| <span class="kn">from</span> <span class="nn">tvm</span> <span class="k">import</span> <span class="n">relay</span> |
| <span class="kn">import</span> <span class="nn">mxnet</span> <span class="k">as</span> <span class="nn">mx</span> |
| <span class="kn">from</span> <span class="nn">tvm.contrib.download</span> <span class="k">import</span> <span class="n">download_testdata</span> |
| <span class="kn">from</span> <span class="nn">mxnet</span> <span class="k">import</span> <span class="n">gluon</span> |
| <span class="kn">import</span> <span class="nn">logging</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| |
| <span class="n">batch_size</span> <span class="o">=</span> <span class="mi">1</span> |
| <span class="n">model_name</span> <span class="o">=</span> <span class="s2">"resnet18_v1"</span> |
| <span class="n">target</span> <span class="o">=</span> <span class="s2">"cuda"</span> |
| <span class="n">dev</span> <span class="o">=</span> <span class="n">tvm</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="n">target</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <div class="section" id="prepare-the-dataset"> |
| <h2>Prepare the Dataset<a class="headerlink" href="#prepare-the-dataset" title="Permalink to this headline">¶</a></h2> |
| <p>We will demonstrate how to prepare the calibration dataset for quantization. |
| We first download the validation set of ImageNet and pre-process the dataset.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">calibration_rec</span> <span class="o">=</span> <span class="n">download_testdata</span><span class="p">(</span> |
| <span class="s2">"http://data.mxnet.io.s3-website-us-west-1.amazonaws.com/data/val_256_q90.rec"</span><span class="p">,</span> |
| <span class="s2">"val_256_q90.rec"</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">get_val_data</span><span class="p">(</span><span class="n">num_workers</span><span class="o">=</span><span class="mi">4</span><span class="p">):</span> |
| <span class="n">mean_rgb</span> <span class="o">=</span> <span class="p">[</span><span class="mf">123.68</span><span class="p">,</span> <span class="mf">116.779</span><span class="p">,</span> <span class="mf">103.939</span><span class="p">]</span> |
| <span class="n">std_rgb</span> <span class="o">=</span> <span class="p">[</span><span class="mf">58.393</span><span class="p">,</span> <span class="mf">57.12</span><span class="p">,</span> <span class="mf">57.375</span><span class="p">]</span> |
| |
| <span class="k">def</span> <span class="nf">batch_fn</span><span class="p">(</span><span class="n">batch</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">batch</span><span class="o">.</span><span class="n">data</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">(),</span> <span class="n">batch</span><span class="o">.</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">asnumpy</span><span class="p">()</span> |
| |
| <span class="n">img_size</span> <span class="o">=</span> <span class="mi">299</span> <span class="k">if</span> <span class="n">model_name</span> <span class="o">==</span> <span class="s2">"inceptionv3"</span> <span class="k">else</span> <span class="mi">224</span> |
| <span class="n">val_data</span> <span class="o">=</span> <span class="n">mx</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">ImageRecordIter</span><span class="p">(</span> |
| <span class="n">path_imgrec</span><span class="o">=</span><span class="n">calibration_rec</span><span class="p">,</span> |
| <span class="n">preprocess_threads</span><span class="o">=</span><span class="n">num_workers</span><span class="p">,</span> |
| <span class="n">shuffle</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">batch_size</span><span class="o">=</span><span class="n">batch_size</span><span class="p">,</span> |
| <span class="n">resize</span><span class="o">=</span><span class="mi">256</span><span class="p">,</span> |
| <span class="n">data_shape</span><span class="o">=</span><span class="p">(</span><span class="mi">3</span><span class="p">,</span> <span class="n">img_size</span><span class="p">,</span> <span class="n">img_size</span><span class="p">),</span> |
| <span class="n">mean_r</span><span class="o">=</span><span class="n">mean_rgb</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> |
| <span class="n">mean_g</span><span class="o">=</span><span class="n">mean_rgb</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">mean_b</span><span class="o">=</span><span class="n">mean_rgb</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> |
| <span class="n">std_r</span><span class="o">=</span><span class="n">std_rgb</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> |
| <span class="n">std_g</span><span class="o">=</span><span class="n">std_rgb</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> |
| <span class="n">std_b</span><span class="o">=</span><span class="n">std_rgb</span><span class="p">[</span><span class="mi">2</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">val_data</span><span class="p">,</span> <span class="n">batch_fn</span> |
| </pre></div> |
| </div> |
| <p>The calibration dataset should be an iterable object. We define the |
| calibration dataset as a generator object in Python. In this tutorial, we |
| only use a few samples for calibration.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">calibration_samples</span> <span class="o">=</span> <span class="mi">10</span> |
| |
| |
| <span class="k">def</span> <span class="nf">calibrate_dataset</span><span class="p">():</span> |
| <span class="n">val_data</span><span class="p">,</span> <span class="n">batch_fn</span> <span class="o">=</span> <span class="n">get_val_data</span><span class="p">()</span> |
| <span class="n">val_data</span><span class="o">.</span><span class="n">reset</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">batch</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">val_data</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">i</span> <span class="o">*</span> <span class="n">batch_size</span> <span class="o">>=</span> <span class="n">calibration_samples</span><span class="p">:</span> |
| <span class="k">break</span> |
| <span class="n">data</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">batch_fn</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span> |
| <span class="k">yield</span> <span class="p">{</span><span class="s2">"data"</span><span class="p">:</span> <span class="n">data</span><span class="p">}</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="import-the-model"> |
| <h2>Import the model<a class="headerlink" href="#import-the-model" title="Permalink to this headline">¶</a></h2> |
| <p>We use the Relay MxNet frontend to import a model from the Gluon model zoo.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_model</span><span class="p">():</span> |
| <span class="n">gluon_model</span> <span class="o">=</span> <span class="n">gluon</span><span class="o">.</span><span class="n">model_zoo</span><span class="o">.</span><span class="n">vision</span><span class="o">.</span><span class="n">get_model</span><span class="p">(</span><span class="n">model_name</span><span class="p">,</span> <span class="n">pretrained</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">img_size</span> <span class="o">=</span> <span class="mi">299</span> <span class="k">if</span> <span class="n">model_name</span> <span class="o">==</span> <span class="s2">"inceptionv3"</span> <span class="k">else</span> <span class="mi">224</span> |
| <span class="n">data_shape</span> <span class="o">=</span> <span class="p">(</span><span class="n">batch_size</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">img_size</span><span class="p">,</span> <span class="n">img_size</span><span class="p">)</span> |
| <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <a href="../../reference/api/python/relay/frontend.html#tvm.relay.frontend.from_mxnet" title="View documentation for tvm.relay.frontend.from_mxnet"><span class="n">relay</span><span class="o">.</span><span class="n">frontend</span><span class="o">.</span><span class="n">from_mxnet</span></a><span class="p">(</span><span class="n">gluon_model</span><span class="p">,</span> <span class="p">{</span><span class="s2">"data"</span><span class="p">:</span> <span class="n">data_shape</span><span class="p">})</span> |
| <span class="k">return</span> <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="quantize-the-model"> |
| <h2>Quantize the Model<a class="headerlink" href="#quantize-the-model" title="Permalink to this headline">¶</a></h2> |
| <p>In quantization, we need to find the scale for each weight and intermediate |
| feature map tensor of each layer.</p> |
| <p>For weights, the scales are directly calculated based on the value of the |
| weights. Two modes are supported: <cite>power2</cite> and <cite>max</cite>. Both modes find the |
| maximum value within the weight tensor first. In <cite>power2</cite> mode, the maximum |
is rounded down to a power of two. If the scales of both weights and
intermediate feature maps are powers of two, we can leverage bit shifting for
multiplications. This makes it computationally more efficient. In <cite>max</cite> mode,
| the maximum is used as the scale. Without rounding, <cite>max</cite> mode might have |
| better accuracy in some cases. When the scales are not powers of two, fixed |
| point multiplications will be used.</p> |
| <p>For intermediate feature maps, we can find the scales with data-aware |
| quantization. Data-aware quantization takes a calibration dataset as the |
| input argument. Scales are calculated by minimizing the KL divergence between |
| distribution of activation before and after quantization. |
| Alternatively, we can also use pre-defined global scales. This saves the time |
| for calibration. But the accuracy might be impacted.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">quantize</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">data_aware</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">data_aware</span><span class="p">:</span> |
| <span class="k">with</span> <span class="n">relay</span><span class="o">.</span><span class="n">quantize</span><span class="o">.</span><span class="n">qconfig</span><span class="p">(</span><span class="n">calibrate_mode</span><span class="o">=</span><span class="s2">"kl_divergence"</span><span class="p">,</span> <span class="n">weight_scale</span><span class="o">=</span><span class="s2">"max"</span><span class="p">):</span> |
| <span class="n">mod</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">quantize</span><span class="o">.</span><span class="n">quantize</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">dataset</span><span class="o">=</span><span class="n">calibrate_dataset</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">with</span> <span class="n">relay</span><span class="o">.</span><span class="n">quantize</span><span class="o">.</span><span class="n">qconfig</span><span class="p">(</span><span class="n">calibrate_mode</span><span class="o">=</span><span class="s2">"global_scale"</span><span class="p">,</span> <span class="n">global_scale</span><span class="o">=</span><span class="mf">8.0</span><span class="p">):</span> |
| <span class="n">mod</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">quantize</span><span class="o">.</span><span class="n">quantize</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">mod</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="run-inference"> |
| <h2>Run Inference<a class="headerlink" href="#run-inference" title="Permalink to this headline">¶</a></h2> |
| <p>We create a Relay VM to build and execute the model.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">run_inference</span><span class="p">(</span><span class="n">mod</span><span class="p">):</span> |
| <span class="n">model</span> <span class="o">=</span> <span class="n">relay</span><span class="o">.</span><span class="n">create_executor</span><span class="p">(</span><span class="s2">"vm"</span><span class="p">,</span> <span class="n">mod</span><span class="p">,</span> <span class="n">dev</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span><span class="o">.</span><span class="n">evaluate</span><span class="p">()</span> |
| <span class="n">val_data</span><span class="p">,</span> <span class="n">batch_fn</span> <span class="o">=</span> <span class="n">get_val_data</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">batch</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">val_data</span><span class="p">):</span> |
| <span class="n">data</span><span class="p">,</span> <span class="n">label</span> <span class="o">=</span> <span class="n">batch_fn</span><span class="p">(</span><span class="n">batch</span><span class="p">)</span> |
| <span class="n">prediction</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">i</span> <span class="o">></span> <span class="mi">10</span><span class="p">:</span> <span class="c1"># only run inference on a few samples in this tutorial</span> |
| <span class="k">break</span> |
| |
| |
| <span class="k">def</span> <span class="nf">main</span><span class="p">():</span> |
| <span class="n">mod</span><span class="p">,</span> <span class="n">params</span> <span class="o">=</span> <span class="n">get_model</span><span class="p">()</span> |
| <span class="n">mod</span> <span class="o">=</span> <span class="n">quantize</span><span class="p">(</span><span class="n">mod</span><span class="p">,</span> <span class="n">params</span><span class="p">,</span> <span class="n">data_aware</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="n">run_inference</span><span class="p">(</span><span class="n">mod</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">main</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| <p class="sphx-glr-script-out">Out:</p> |
| <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/workspace/python/tvm/relay/build_module.py:439: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function) |
| DeprecationWarning, |
| </pre></div> |
| </div> |
| <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 21.742 seconds)</p> |
| <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py"> |
| <div class="sphx-glr-download docutils container"> |
| <p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p> |
| </div> |
| <div class="sphx-glr-download docutils container"> |
| <p><a class="reference download internal" download="" href="../../_downloads/a269cb38341b190be980a0bd3ea8a625/deploy_quantized.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">deploy_quantized.ipynb</span></code></a></p> |
| </div> |
| </div> |
| <p class="sphx-glr-signature"><a class="reference external" href="https://sphinx-gallery.github.io">Gallery generated by Sphinx-Gallery</a></p> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| </div> |
| |
| |
<footer>

  <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">

    <a href="deploy_sparse.html" class="btn btn-neutral float-right" title="Deploy a Hugging Face Pruned Model on CPU" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>

    <a href="deploy_prequantized_tflite.html" class="btn btn-neutral float-left" title="Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>

  </div>

  <!-- Back-to-top button (path fixed: was _static//img with a double slash). -->
  <div id="button" class="backtop"><img src="../../_static/img/right.svg" alt="Back to top"> </div>
  <section class="footerSec">
    <div class="footerHeader">
      <ul class="d-flex align-md-items-center justify-content-between flex-column flex-md-row">
        <li class="copywrite d-flex align-items-center">
          <h5 id="copy-right-info">© 2020 Apache Software Foundation | All rights reserved</h5>
        </li>
      </ul>

    </div>

    <ul>
      <li class="footernote">Copyright © 2020 The Apache Software Foundation. Apache TVM, Apache, the Apache feather, and the Apache TVM project logo are either trademarks or registered trademarks of the Apache Software Foundation.</li>
    </ul>

  </section>
</footer>
| </div> |
| </div> |
| |
</section>

</div>


<!-- All scripts kept inside <body>; the original closed </body> twice (before and
     after these scripts), which is invalid document structure. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.12.9/umd/popper.min.js" integrity="sha384-ApNbgh9B+Y1QKtv3Rn7W3mgPxhU9K/ScQsAP7hUibX39j7fakFPskvXusvfa0b4Q" crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>

<script>
  jQuery(function () {
    SphinxRtdTheme.Navigation.enable(true);
  });
</script>

<!-- Theme Analytics -->
<script>
  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');

  ga('create', 'UA-75982049-2', 'auto');
  ga('send', 'pageview');
</script>

</body>
</html>