<!-- blob: 176ec39f3b9ae44ce61ed8326448c65f55e60cae [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="Text API" property="og:title">
<meta content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" property="og:image">
<meta content="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/og-logo.png" property="og:image:secure_url">
<meta content="Text API" property="og:description"/>
<title>Text API — mxnet documentation</title>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/>
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="../../../_static/basic.css" rel="stylesheet" type="text/css">
<link href="../../../_static/pygments.css" rel="stylesheet" type="text/css">
<link href="../../../_static/mxnet.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript">
// Sphinx runtime configuration read by doctools.js and the search scripts.
// Values are emitted by the doc build; URL_ROOT is the path back to the site root.
var DOCUMENTATION_OPTIONS = {
  URL_ROOT: "../../../",
  VERSION: "",
  COLLAPSE_INDEX: false,
  FILE_SUFFIX: ".html",
  HAS_SOURCE: true,
  SOURCELINK_SUFFIX: ".txt"
};
</script>
<script src="https://code.jquery.com/jquery-1.11.1.min.js" type="text/javascript"></script>
<script src="../../../_static/underscore.js" type="text/javascript"></script>
<script src="../../../_static/searchtools_custom.js" type="text/javascript"></script>
<script src="../../../_static/doctools.js" type="text/javascript"></script>
<script src="../../../_static/selectlang.js" type="text/javascript"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<!-- On DOM ready, fetch the prebuilt search index and initialize client-side search
     (Search is defined in searchtools_custom.js loaded above). -->
<script type="text/javascript"> jQuery(function() { Search.loadIndex("/searchindex.js"); Search.init();}); </script>
<script>
// Standard Google Analytics (analytics.js) bootstrap snippet, kept verbatim:
// it stubs window.ga, queues calls, and async-loads the real library by
// inserting a <script> before the first existing one.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new
Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Property UA-96378503-1; record a pageview for this page.
ga('create', 'UA-96378503-1', 'auto');
ga('send', 'pageview');
</script>
<!-- -->
<!-- <script type="text/javascript" src="../../../_static/jquery.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../../_static/underscore.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="../../../_static/doctools.js"></script> -->
<!-- -->
<!-- <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> -->
<!-- -->
<link href="../../../genindex.html" rel="index" title="Index">
<link href="../../../search.html" rel="search" title="Search"/>
<link href="../index.html" rel="up" title="MXNet - Python API"/>
<link href="onnx.html" rel="next" title="ONNX-MXNet API"/>
<link href="contrib.html" rel="prev" title="Contrib Package"/>
<link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/>
</head>
<body background="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-background-compressed.jpeg" role="document">
<div class="content-block"><div class="navbar navbar-fixed-top">
<div class="container" id="navContainer">
<div class="innder" id="header-inner">
<h1 id="logo-wrap">
<a href="../../../" id="logo"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet_logo.png"/></a>
</h1>
<nav class="nav-bar" id="main-nav">
<a class="main-nav-link" href="../../../install/index.html">Install</a>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Gluon <span class="caret"></span></a>
<ul class="dropdown-menu navbar-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../../../gluon/index.html">About</a></li>
<li><a class="main-nav-link" href="http://gluon.mxnet.io">Tutorials</a></li>
</ul>
</span>
<span id="dropdown-menu-position-anchor">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a>
<ul class="dropdown-menu navbar-menu" id="package-dropdown-menu">
<li><a class="main-nav-link" href="../../../api/python/index.html">Python</a></li>
<li><a class="main-nav-link" href="../../../api/c++/index.html">C++</a></li>
<li><a class="main-nav-link" href="../../../api/clojure/index.html">Clojure</a></li>
<li><a class="main-nav-link" href="../../../api/julia/index.html">Julia</a></li>
<li><a class="main-nav-link" href="../../../api/perl/index.html">Perl</a></li>
<li><a class="main-nav-link" href="../../../api/r/index.html">R</a></li>
<li><a class="main-nav-link" href="../../../api/scala/index.html">Scala</a></li>
</ul>
</span>
<span id="dropdown-menu-position-anchor-docs">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Docs <span class="caret"></span></a>
<ul class="dropdown-menu navbar-menu" id="package-dropdown-menu-docs">
<li><a class="main-nav-link" href="../../../faq/index.html">FAQ</a></li>
<li><a class="main-nav-link" href="../../../tutorials/index.html">Tutorials</a></li>
<li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet/tree/master/example">Examples</a></li>
<li><a class="main-nav-link" href="../../../architecture/index.html">Architecture</a></li>
<li><a class="main-nav-link" href="../../../api/python/gluon/model_zoo.html">Model Zoo</a></li>
<li><a class="main-nav-link" href="../../../api/python/contrib/onnx.html">ONNX</a></li>
</ul>
</span>
<span id="dropdown-menu-position-anchor-community">
<a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">Community <span class="caret"></span></a>
<ul class="dropdown-menu navbar-menu" id="package-dropdown-menu-community">
<li><a class="main-nav-link" href="http://discuss.mxnet.io">Forum</a></li>
<li><a class="main-nav-link" href="https://github.com/apache/incubator-mxnet">Github</a></li>
<li><a class="main-nav-link" href="../../../community/contribute.html">Contribute</a></li>
<li><a class="main-nav-link" href="../../../community/ecosystem.html">Ecosystem</a></li>
<li><a class="main-nav-link" href="../../../community/powered_by.html">Powered By</a></li>
</ul>
</span>
<span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/">master</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/versions/1.2.1/index.html">1.2.1</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/versions/1.1.0/index.html">1.1.0</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/versions/1.0.0/index.html">1.0.0</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/versions/0.12.1/index.html">0.12.1</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/versions/0.11.0/index.html">0.11.0</a></li></ul></span></nav>
<script>
// Path from this page back to the documentation root (three directory levels up).
function getRootPath() { return "../../../"; }
</script>
<div class="burgerIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"></a>
<ul class="dropdown-menu" id="burgerMenu">
<li><a href="../../../install/index.html">Install</a></li>
<li><a class="main-nav-link" href="../../../tutorials/index.html">Tutorials</a></li>
<li class="dropdown-submenu dropdown">
<a aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" role="button" tabindex="-1">Community</a>
<ul class="dropdown-menu">
<li><a href="http://discuss.mxnet.io" tabindex="-1">Forum</a></li>
<li><a href="https://github.com/apache/incubator-mxnet" tabindex="-1">Github</a></li>
<li><a href="../../../community/contribute.html" tabindex="-1">Contribute</a></li>
<li><a href="../../../community/ecosystem.html" tabindex="-1">Ecosystem</a></li>
<li><a href="../../../community/powered_by.html" tabindex="-1">Powered By</a></li>
</ul>
</li>
<li class="dropdown-submenu">
<a aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" role="button" tabindex="-1">API</a>
<ul class="dropdown-menu">
<li><a href="../../../api/python/index.html" tabindex="-1">Python</a>
</li>
<li><a href="../../../api/c++/index.html" tabindex="-1">C++</a>
</li>
<li><a href="../../../api/clojure/index.html" tabindex="-1">Clojure</a>
</li>
<li><a href="../../../api/julia/index.html" tabindex="-1">Julia</a>
</li>
<li><a href="../../../api/perl/index.html" tabindex="-1">Perl</a>
</li>
<li><a href="../../../api/r/index.html" tabindex="-1">R</a>
</li>
<li><a href="../../../api/scala/index.html" tabindex="-1">Scala</a>
</li>
</ul>
</li>
<li class="dropdown-submenu">
<a aria-expanded="true" aria-haspopup="true" class="dropdown-toggle burger-link" data-toggle="dropdown" href="#" tabindex="-1">Docs</a>
<ul class="dropdown-menu">
<li><a href="../../../tutorials/index.html" tabindex="-1">Tutorials</a></li>
<li><a href="../../../faq/index.html" tabindex="-1">FAQ</a></li>
<li><a href="../../../architecture/index.html" tabindex="-1">Architecture</a></li>
<li><a href="https://github.com/apache/incubator-mxnet/tree/master/example" tabindex="-1">Examples</a></li>
<li><a href="../../../api/python/gluon/model_zoo.html" tabindex="-1">Gluon Model Zoo</a></li>
</ul>
</li>
<li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li>
<li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/">master</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/versions/1.2.1/index.html">1.2.1</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/versions/1.1.0/index.html">1.1.0</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/versions/1.0.0/index.html">1.0.0</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/versions/0.12.1/index.html">0.12.1</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/versions/0.11.0/index.html">0.11.0</a></li></ul></li></ul>
</div>
<div class="plusIcon dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a>
<ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul>
</div>
<div id="search-input-wrap">
<form action="../../../search.html" autocomplete="off" class="" method="get" role="search">
<div class="form-group inner-addon left-addon">
<i class="glyphicon glyphicon-search"></i>
<input class="form-control" name="q" placeholder="Search" type="text"/>
</div>
<input name="check_keywords" type="hidden" value="yes">
<input name="area" type="hidden" value="default"/>
</form>
<div id="search-preview"></div>
</div>
<div id="searchIcon">
<span aria-hidden="true" class="glyphicon glyphicon-search"></span>
</div>
<!-- <div id="lang-select-wrap"> -->
<!-- <label id="lang-select-label"> -->
<!-- <\!-- <i class="fa fa-globe"></i> -\-> -->
<!-- <span></span> -->
<!-- </label> -->
<!-- <select id="lang-select"> -->
<!-- <option value="en">Eng</option> -->
<!-- <option value="zh">中文</option> -->
<!-- </select> -->
<!-- </div> -->
<!-- <a id="mobile-nav-toggle">
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
<span class="mobile-nav-toggle-bar"></span>
</a> -->
</div>
</div>
</div>
<script type="text/javascript">
// Override the body's background image (set via the deprecated `background`
// attribute on <body>) with a plain white background for content pages.
$('body').css('background', 'white');
</script>
<div class="container">
<div class="row">
<div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="../index.html">Python Documents</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../index.html#ndarray-api">NDArray API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#symbol-api">Symbol API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#module-api">Module API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#autograd-api">Autograd API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#gluon-api">Gluon API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#io-api">IO API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#image-api">Image API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#optimization-api">Optimization API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#callback-api">Callback API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#metric-api">Metric API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#profiler-api">Profiler API</a></li>
<li class="toctree-l2"><a class="reference internal" href="../index.html#run-time-compilation-api">Run-Time Compilation API</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="../index.html#contrib-package">Contrib Package</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="contrib.html">Contrib Package</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">Text API</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#overview">Overview</a></li>
<li class="toctree-l4"><a class="reference internal" href="#vocabulary">Vocabulary</a></li>
<li class="toctree-l4"><a class="reference internal" href="#text-token-embedding">Text token embedding</a></li>
<li class="toctree-l4"><a class="reference internal" href="#text-utilities">Text utilities</a></li>
<li class="toctree-l4"><a class="reference internal" href="#api-reference">API Reference</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="onnx.html">ONNX-MXNet API</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../r/index.html">R Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../julia/index.html">Julia Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../c++/index.html">C++ Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../scala/index.html">Scala Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../perl/index.html">Perl Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../faq/index.html">HowTo Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../architecture/index.html">System Documents</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../tutorials/index.html">Tutorials</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../community/contribute.html">Community</a></li>
</ul>
</div>
</div>
<div class="content">
<div class="page-tracker"></div>
<div class="section" id="text-api">
<h1>Text API<a class="headerlink" href="#text-api" title="Permalink to this headline"></a></h1>
<div class="section" id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Permalink to this headline"></a></h2>
<p>The <code class="docutils literal"><span class="pre">mxnet.contrib.text</span></code> APIs refer to classes and functions related to text data processing, such
as building indices and loading pre-trained embedding vectors for text tokens and storing them in the
<code class="docutils literal"><span class="pre">mxnet.ndarray.NDArray</span></code> format.</p>
<div class="admonition warning">
<p class="first admonition-title">Warning</p>
<p class="last">This package contains experimental APIs and may change in the near future.</p>
</div>
<p>This document lists the text APIs in mxnet:</p>
<table border="1" class="longtable docutils">
<colgroup>
<col width="10%"/>
<col width="90%"/>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td><a class="reference internal" href="#module-mxnet.contrib.text.embedding" title="mxnet.contrib.text.embedding"><code class="xref py py-obj docutils literal"><span class="pre">mxnet.contrib.text.embedding</span></code></a></td>
<td>Text token embeddings.</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="#module-mxnet.contrib.text.vocab" title="mxnet.contrib.text.vocab"><code class="xref py py-obj docutils literal"><span class="pre">mxnet.contrib.text.vocab</span></code></a></td>
<td>Text token indexer.</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="#module-mxnet.contrib.text.utils" title="mxnet.contrib.text.utils"><code class="xref py py-obj docutils literal"><span class="pre">mxnet.contrib.text.utils</span></code></a></td>
<td>Provide utilities for text data processing.</td>
</tr>
</tbody>
</table>
<p>All the code demonstrated in this document assumes that the following modules or packages are
imported.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">from</span> <span class="nn">mxnet</span> <span class="kn">import</span> <span class="n">gluon</span>
<span class="gp">>>> </span><span class="kn">from</span> <span class="nn">mxnet</span> <span class="kn">import</span> <span class="n">nd</span>
<span class="gp">>>> </span><span class="kn">from</span> <span class="nn">mxnet.contrib</span> <span class="kn">import</span> <span class="n">text</span>
<span class="gp">>>> </span><span class="kn">import</span> <span class="nn">collections</span>
</pre></div>
</div>
<div class="section" id="looking-up-pre-trained-word-embeddings-for-indexed-words">
<h3>Looking up pre-trained word embeddings for indexed words<a class="headerlink" href="#looking-up-pre-trained-word-embeddings-for-indexed-words" title="Permalink to this headline"></a></h3>
<p>As a common use case, let us look up pre-trained word embedding vectors for indexed words in just a
few lines of code.</p>
<p>To begin with, suppose that we have a simple text data set in the string format. We can count
word frequency in the data set.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">text_data</span> <span class="o">=</span> <span class="s2">" hello world </span><span class="se">\n</span><span class="s2"> hello nice world </span><span class="se">\n</span><span class="s2"> hi world </span><span class="se">\n</span><span class="s2">"</span>
<span class="gp">>>> </span><span class="n">counter</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">count_tokens_from_str</span><span class="p">(</span><span class="n">text_data</span><span class="p">)</span>
</pre></div>
</div>
<p>The obtained <code class="docutils literal"><span class="pre">counter</span></code> has key-value pairs whose keys are words and values are word frequencies.
Suppose that we want to build indices for all the keys in <code class="docutils literal"><span class="pre">counter</span></code> and load the defined fastText
word embedding for all such indexed words. First, we need a Vocabulary object with <code class="docutils literal"><span class="pre">counter</span></code> as its
argument</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">vocab</span><span class="o">.</span><span class="n">Vocabulary</span><span class="p">(</span><span class="n">counter</span><span class="p">)</span>
</pre></div>
</div>
<p>We can create a fastText word embedding object by specifying the embedding name <code class="docutils literal"><span class="pre">fasttext</span></code> and
the pre-trained file <code class="docutils literal"><span class="pre">wiki.simple.vec</span></code>. We also specify that the indexed tokens for loading the
fastText word embedding come from the defined Vocabulary object <code class="docutils literal"><span class="pre">my_vocab</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s1">'fasttext'</span><span class="p">,</span> <span class="n">pretrained_file_name</span><span class="o">=</span><span class="s1">'wiki.simple.vec'</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">vocabulary</span><span class="o">=</span><span class="n">my_vocab</span><span class="p">)</span>
</pre></div>
</div>
<p>Now we are ready to look up the fastText word embedding vectors for indexed words, such as ‘hello’
and ‘world’.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">get_vecs_by_tokens</span><span class="p">([</span><span class="s1">'hello'</span><span class="p">,</span> <span class="s1">'world'</span><span class="p">])</span>
<span class="go">[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01</span>
<span class="go"> ...</span>
<span class="go"> -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]</span>
<span class="go"> [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01</span>
<span class="go"> ...</span>
<span class="go"> -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]</span>
<span class="go"><NDArray 2x300 @cpu(0)></span>
</pre></div>
</div>
</div>
<div class="section" id="using-pre-trained-word-embeddings-in-gluon">
<h3>Using pre-trained word embeddings in <code class="docutils literal"><span class="pre">gluon</span></code><a class="headerlink" href="#using-pre-trained-word-embeddings-in-gluon" title="Permalink to this headline"></a></h3>
<p>To demonstrate how to use pre-trained word embeddings in the <code class="docutils literal"><span class="pre">gluon</span></code> package, let us first obtain
indices of the words ‘hello’ and ‘world’.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">to_indices</span><span class="p">([</span><span class="s1">'hello'</span><span class="p">,</span> <span class="s1">'world'</span><span class="p">])</span>
<span class="go">[2, 1]</span>
</pre></div>
</div>
<p>We can obtain the vector representation for the words ‘hello’ and ‘world’ by specifying their
indices (2 and 1) and the <code class="docutils literal"><span class="pre">my_embedding.idx_to_vec</span></code> in <code class="docutils literal"><span class="pre">mxnet.gluon.nn.Embedding</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">layer</span> <span class="o">=</span> <span class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Embedding</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">my_embedding</span><span class="p">),</span> <span class="n">my_embedding</span><span class="o">.</span><span class="n">vec_len</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="o">.</span><span class="n">initialize</span><span class="p">()</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">set_data</span><span class="p">(</span><span class="n">my_embedding</span><span class="o">.</span><span class="n">idx_to_vec</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">layer</span><span class="p">(</span><span class="n">nd</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">]))</span>
<span class="go">[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01</span>
<span class="go"> ...</span>
<span class="go"> -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]</span>
<span class="go"> [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01</span>
<span class="go"> ...</span>
<span class="go"> -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]</span>
<span class="go"><NDArray 2x300 @cpu(0)></span>
</pre></div>
</div>
</div>
</div>
<div class="section" id="vocabulary">
<h2>Vocabulary<a class="headerlink" href="#vocabulary" title="Permalink to this headline"></a></h2>
<p>The vocabulary builds indices for text tokens. Such indexed tokens can be used by token embedding
instances. The input counter whose keys are candidate indices may be obtained via
<a class="reference external" href="#mxnet.contrib.text.utils.count_tokens_from_str"><code class="docutils literal"><span class="pre">count_tokens_from_str</span></code></a>.</p>
<table border="1" class="longtable docutils">
<colgroup>
<col width="10%"/>
<col width="90%"/>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.vocab.Vocabulary" title="mxnet.contrib.text.vocab.Vocabulary"><code class="xref py py-obj docutils literal"><span class="pre">Vocabulary</span></code></a></td>
<td>Indexing for text tokens.</td>
</tr>
</tbody>
</table>
<p>Suppose that we have a simple text data set in the string format. We can count word frequency in the
data set.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">text_data</span> <span class="o">=</span> <span class="s2">" hello world </span><span class="se">\n</span><span class="s2"> hello nice world </span><span class="se">\n</span><span class="s2"> hi world </span><span class="se">\n</span><span class="s2">"</span>
<span class="gp">>>> </span><span class="n">counter</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">count_tokens_from_str</span><span class="p">(</span><span class="n">text_data</span><span class="p">)</span>
</pre></div>
</div>
<p>The obtained <code class="docutils literal"><span class="pre">counter</span></code> has key-value pairs whose keys are words and values are word frequencies.
Suppose that we want to build indices for the 2 most frequent keys in <code class="docutils literal"><span class="pre">counter</span></code> with the unknown
token representation ‘&lt;unk&gt;’ and a reserved token ‘&lt;pad&gt;’.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">vocab</span><span class="o">.</span><span class="n">Vocabulary</span><span class="p">(</span><span class="n">counter</span><span class="p">,</span> <span class="n">most_freq_count</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">unknown_token</span><span class="o">=</span><span class="s1">'&lt;unk&gt;'</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">reserved_tokens</span><span class="o">=</span><span class="p">[</span><span class="s1">'&lt;pad&gt;'</span><span class="p">])</span>
</pre></div>
</div>
<p>We can access properties such as <code class="docutils literal"><span class="pre">token_to_idx</span></code> (mapping tokens to indices), <code class="docutils literal"><span class="pre">idx_to_token</span></code> (mapping
indices to tokens), <code class="docutils literal"><span class="pre">vec_len</span></code> (length of each embedding vector), and <code class="docutils literal"><span class="pre">unknown_token</span></code> (representation
of any unknown token) and <code class="docutils literal"><span class="pre">reserved_tokens</span></code>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_vocab</span><span class="o">.</span><span class="n">token_to_idx</span>
<span class="go">{'&amp;lt;unk&amp;gt;': 0, '&amp;lt;pad&amp;gt;': 1, 'world': 2, 'hello': 3}</span>
<span class="gp">>>> </span><span class="n">my_vocab</span><span class="o">.</span><span class="n">idx_to_token</span>
<span class="go">['&amp;lt;unk&amp;gt;', '&amp;lt;pad&amp;gt;', 'world', 'hello']</span>
<span class="gp">>>> </span><span class="n">my_vocab</span><span class="o">.</span><span class="n">unknown_token</span>
<span class="go">'&amp;lt;unk&amp;gt;'</span>
<span class="gp">>>> </span><span class="n">my_vocab</span><span class="o">.</span><span class="n">reserved_tokens</span>
<span class="go">['&amp;lt;pad&amp;gt;']</span>
<span class="gp">>>> </span><span class="nb">len</span><span class="p">(</span><span class="n">my_vocab</span><span class="p">)</span>
<span class="go">4</span>
</pre></div>
</div>
<p>Besides the specified unknown token ‘&lt;unk&gt;’ and reserved_token ‘&lt;pad&gt;’ are indexed, the 2 most
frequent words ‘world’ and ‘hello’ are also indexed.</p>
</div>
<div class="section" id="text-token-embedding">
<h2>Text token embedding<a class="headerlink" href="#text-token-embedding" title="Permalink to this headline"></a></h2>
<p>To load token embeddings from an externally hosted pre-trained token embedding file, such as those
of GloVe and FastText, use
<a class="reference external" href="#mxnet.contrib.text.embedding.create"><code class="docutils literal"><span class="pre">embedding.create(embedding_name,</span> <span class="pre">pretrained_file_name)</span></code></a>.</p>
<p>To get all the available <code class="docutils literal"><span class="pre">embedding_name</span></code> and <code class="docutils literal"><span class="pre">pretrained_file_name</span></code>, use
<a class="reference external" href="#mxnet.contrib.text.embedding.get_pretrained_file_names"><code class="docutils literal"><span class="pre">embedding.get_pretrained_file_names()</span></code></a>.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">get_pretrained_file_names</span><span class="p">()</span>
<span class="go">{'glove': ['glove.42B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.100d.txt', ...],</span>
<span class="go">'fasttext': ['wiki.en.vec', 'wiki.simple.vec', 'wiki.zh.vec', ...]}</span>
</pre></div>
</div>
<p>Alternatively, to load embedding vectors from a custom pre-trained text token
embedding file, use <a class="reference external" href="#mxnet.contrib.text.embedding.CustomEmbedding"><code class="docutils literal"><span class="pre">CustomEmbedding</span></code></a>.</p>
<p>Moreover, to load composite embedding vectors, such as to concatenate embedding vectors,
use <a class="reference external" href="#mxnet.contrib.text.embedding.CompositeEmbedding"><code class="docutils literal"><span class="pre">CompositeEmbedding</span></code></a>.</p>
<p>The indexed tokens in a text token embedding may come from a vocabulary or from the loaded embedding
vectors. In the former case, only the indexed tokens in a vocabulary are associated with the loaded
embedding vectors, such as loaded from a pre-trained token embedding file. In the latter case, all
the tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding
file, are taken as the indexed tokens of the embedding.</p>
<table border="1" class="longtable docutils">
<colgroup>
<col width="10%"/>
<col width="90%"/>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.register" title="mxnet.contrib.text.embedding.register"><code class="xref py py-obj docutils literal"><span class="pre">register</span></code></a></td>
<td>Registers a new token embedding.</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.create" title="mxnet.contrib.text.embedding.create"><code class="xref py py-obj docutils literal"><span class="pre">create</span></code></a></td>
<td>Creates an instance of token embedding.</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.get_pretrained_file_names" title="mxnet.contrib.text.embedding.get_pretrained_file_names"><code class="xref py py-obj docutils literal"><span class="pre">get_pretrained_file_names</span></code></a></td>
<td>Get valid token embedding names and their pre-trained file names.</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.GloVe" title="mxnet.contrib.text.embedding.GloVe"><code class="xref py py-obj docutils literal"><span class="pre">GloVe</span></code></a></td>
<td>The GloVe word embedding.</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.FastText" title="mxnet.contrib.text.embedding.FastText"><code class="xref py py-obj docutils literal"><span class="pre">FastText</span></code></a></td>
<td>The fastText word embedding.</td>
</tr>
<tr class="row-even"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.CustomEmbedding" title="mxnet.contrib.text.embedding.CustomEmbedding"><code class="xref py py-obj docutils literal"><span class="pre">CustomEmbedding</span></code></a></td>
<td>User-defined token embedding.</td>
</tr>
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.embedding.CompositeEmbedding" title="mxnet.contrib.text.embedding.CompositeEmbedding"><code class="xref py py-obj docutils literal"><span class="pre">CompositeEmbedding</span></code></a></td>
<td>Composite token embeddings.</td>
</tr>
</tbody>
</table>
<div class="section" id="indexed-tokens-are-from-a-vocabulary">
<span id="indexed-tokens-are-from-a-vocabulary"></span><h3>Indexed tokens are from a vocabulary<a class="headerlink" href="#indexed-tokens-are-from-a-vocabulary" title="Permalink to this headline"></a></h3>
<p>One can specify that only the indexed tokens in a vocabulary are associated with the loaded
embedding vectors, such as loaded from a pre-trained token embedding file.</p>
<p>To begin with, suppose that we have a simple text data set in the string format. We can count word
frequency in the data set.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">text_data</span> <span class="o">=</span> <span class="s2">" hello world </span><span class="se">\n</span><span class="s2"> hello nice world </span><span class="se">\n</span><span class="s2"> hi world </span><span class="se">\n</span><span class="s2">"</span>
<span class="gp">>>> </span><span class="n">counter</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">count_tokens_from_str</span><span class="p">(</span><span class="n">text_data</span><span class="p">)</span>
</pre></div>
</div>
<p>The obtained <code class="docutils literal"><span class="pre">counter</span></code> has key-value pairs whose keys are words and values are word frequencies.
Suppose that we want to build indices for the most frequent 2 keys in <code class="docutils literal"><span class="pre">counter</span></code> and load the defined
fastText word embedding with pre-trained file <code class="docutils literal"><span class="pre">wiki.simple.vec</span></code> for all these 2 words.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_vocab</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">vocab</span><span class="o">.</span><span class="n">Vocabulary</span><span class="p">(</span><span class="n">counter</span><span class="p">,</span> <span class="n">most_freq_count</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">>>> </span><span class="n">my_embedding</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s1">'fasttext'</span><span class="p">,</span> <span class="n">pretrained_file_name</span><span class="o">=</span><span class="s1">'wiki.simple.vec'</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">vocabulary</span><span class="o">=</span><span class="n">my_vocab</span><span class="p">)</span>
</pre></div>
</div>
<p>Now we are ready to look up the fastText word embedding vectors for indexed words.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">get_vecs_by_tokens</span><span class="p">([</span><span class="s1">'hello'</span><span class="p">,</span> <span class="s1">'world'</span><span class="p">])</span>
<span class="go">[[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01</span>
<span class="go"> ...</span>
<span class="go"> -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]</span>
<span class="go"> [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01</span>
<span class="go"> ...</span>
<span class="go"> -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]</span>
<span class="go"><NDArray 2x300 @cpu(0)></span>
</pre></div>
</div>
<p>We can also access properties such as <code class="docutils literal"><span class="pre">token_to_idx</span></code> (mapping tokens to indices), <code class="docutils literal"><span class="pre">idx_to_token</span></code>
(mapping indices to tokens), and <code class="docutils literal"><span class="pre">vec_len</span></code> (length of each embedding vector).</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">token_to_idx</span>
<span class="go">{'&amp;lt;unk&amp;gt;': 0, 'world': 1, 'hello': 2}</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">idx_to_token</span>
<span class="go">['&amp;lt;unk&amp;gt;', 'world', 'hello']</span>
<span class="gp">>>> </span><span class="nb">len</span><span class="p">(</span><span class="n">my_embedding</span><span class="p">)</span>
<span class="go">3</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">vec_len</span>
<span class="go">300</span>
</pre></div>
</div>
<p>If a token is unknown to <code class="docutils literal"><span class="pre">my_embedding</span></code>, its embedding vector is initialized according to the default
specification of the fastText embedding (all elements are 0).</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">get_vecs_by_tokens</span><span class="p">(</span><span class="s1">'nice'</span><span class="p">)</span>
<span class="go">[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.</span>
<span class="go"> ...</span>
<span class="go"> 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]</span>
<span class="go"><NDArray 300 @cpu(0)></span>
</pre></div>
</div>
</div>
<div class="section" id="indexed-tokens-are-from-the-loaded-embedding-vectors">
<span id="indexed-tokens-are-from-the-loaded-embedding-vectors"></span><h3>Indexed tokens are from the loaded embedding vectors<a class="headerlink" href="#indexed-tokens-are-from-the-loaded-embedding-vectors" title="Permalink to this headline"></a></h3>
<p>One can also use all the tokens from the loaded embedding vectors, such as loaded from a pre-trained
token embedding file, as the indexed tokens of the embedding.</p>
<p>To begin with, we can create a fastText word embedding object by specifying the embedding name
‘fasttext’ and the pre-trained file ‘wiki.simple.vec’. The argument <code class="docutils literal"><span class="pre">init_unknown_vec</span></code> specifies
default vector representation for any unknown token. To index all the tokens from this pre-trained
word embedding file, we do not need to specify any vocabulary.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span> <span class="o">=</span> <span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s1">'fasttext'</span><span class="p">,</span> <span class="n">pretrained_file_name</span><span class="o">=</span><span class="s1">'wiki.simple.vec'</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">init_unknown_vec</span><span class="o">=</span><span class="n">nd</span><span class="o">.</span><span class="n">zeros</span><span class="p">)</span>
</pre></div>
</div>
<p>We can access properties such as <code class="docutils literal"><span class="pre">token_to_idx</span></code> (mapping tokens to indices), <code class="docutils literal"><span class="pre">idx_to_token</span></code> (mapping
indices to tokens), <code class="docutils literal"><span class="pre">vec_len</span></code> (length of each embedding vector), and <code class="docutils literal"><span class="pre">unknown_token</span></code> (representation
of any unknown token, default value is ‘&lt;unk&gt;’).</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">token_to_idx</span><span class="p">[</span><span class="s1">'nice'</span><span class="p">]</span>
<span class="go">2586</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">idx_to_token</span><span class="p">[</span><span class="mi">2586</span><span class="p">]</span>
<span class="go">'nice'</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">vec_len</span>
<span class="go">300</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">unknown_token</span>
<span class="go">'&amp;lt;unk&amp;gt;'</span>
</pre></div>
</div>
<p>For every unknown token, if its representation ‘&lt;unk&gt;’ is encountered in the pre-trained token
embedding file, index 0 of property <code class="docutils literal"><span class="pre">idx_to_vec</span></code> maps to the pre-trained token embedding vector
loaded from the file; otherwise, index 0 of property <code class="docutils literal"><span class="pre">idx_to_vec</span></code> maps to the default token
embedding vector specified via <code class="docutils literal"><span class="pre">init_unknown_vec</span></code> (set to nd.zeros here). Since the pre-trained file
does not have a vector for the token ‘&lt;unk&gt;’, index 0 has to map to an additional token ‘&lt;unk&gt;’ and
the number of tokens in the embedding is 111,052.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="nb">len</span><span class="p">(</span><span class="n">my_embedding</span><span class="p">)</span>
<span class="go">111052</span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">idx_to_vec</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="go">[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.</span>
<span class="go"> ...</span>
<span class="go"> 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]</span>
<span class="go"><NDArray 300 @cpu(0)></span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">get_vecs_by_tokens</span><span class="p">(</span><span class="s1">'nice'</span><span class="p">)</span>
<span class="go">[ 0.49397001 0.39996001 0.24000999 -0.15121 -0.087512 0.37114</span>
<span class="go"> ...</span>
<span class="go"> 0.089521 0.29175001 -0.40917999 -0.089206 -0.1816 -0.36616999]</span>
<span class="go"><NDArray 300 @cpu(0)></span>
<span class="gp">>>> </span><span class="n">my_embedding</span><span class="o">.</span><span class="n">get_vecs_by_tokens</span><span class="p">([</span><span class="s1">'unknownT0kEN'</span><span class="p">,</span> <span class="s1">'unknownT0kEN'</span><span class="p">])</span>
<span class="go">[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.</span>
<span class="go"> ...</span>
<span class="go"> 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]</span>
<span class="go"> [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.</span>
<span class="go"> ...</span>
<span class="go"> 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]</span>
<span class="go"><NDArray 2x300 @cpu(0)></span>
</pre></div>
</div>
</div>
<div class="section" id="implement-a-new-text-token-embedding">
<span id="implement-a-new-text-token-embedding"></span><h3>Implement a new text token embedding<a class="headerlink" href="#implement-a-new-text-token-embedding" title="Permalink to this headline"></a></h3>
<p>To implement a new text token embedding, create a subclass of <code class="docutils literal"><span class="pre">mxnet.contrib.text.embedding._TokenEmbedding</span></code>.
Also add <code class="docutils literal"><span class="pre">@mxnet.contrib.text.embedding._TokenEmbedding.register</span></code> before this class. See
<a class="reference external" href="https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/text/embedding.py"><code class="docutils literal"><span class="pre">embedding.py</span></code></a>
for examples.</p>
</div>
</div>
<div class="section" id="text-utilities">
<span id="text-utilities"></span><h2>Text utilities<a class="headerlink" href="#text-utilities" title="Permalink to this headline"></a></h2>
<p>The following functions provide utilities for text data processing.</p>
<table border="1" class="longtable docutils">
<colgroup>
<col width="10%"/>
<col width="90%"/>
</colgroup>
<tbody valign="top">
<tr class="row-odd"><td><a class="reference internal" href="#mxnet.contrib.text.utils.count_tokens_from_str" title="mxnet.contrib.text.utils.count_tokens_from_str"><code class="xref py py-obj docutils literal"><span class="pre">count_tokens_from_str</span></code></a></td>
<td>Counts tokens in the specified string.</td>
</tr>
</tbody>
</table>
</div>
<div class="section" id="api-reference">
<span id="api-reference"></span><h2>API Reference<a class="headerlink" href="#api-reference" title="Permalink to this headline"></a></h2>
<script src="../../../_static/js/auto_module_index.js" type="text/javascript"></script><span class="target" id="module-mxnet.contrib.text.embedding"></span><p>Text token embeddings.</p>
<dl class="function">
<dt id="mxnet.contrib.text.embedding.register">
<code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">register</code><span class="sig-paren">(</span><em>embedding_cls</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#register"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.register" title="Permalink to this definition"></a></dt>
<dd><p>Registers a new token embedding.</p>
<p>Once an embedding is registered, we can create an instance of this embedding with
<a class="reference internal" href="#mxnet.contrib.text.embedding.create" title="mxnet.contrib.text.embedding.create"><code class="xref py py-func docutils literal"><span class="pre">create()</span></code></a>.</p>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="nd">@mxnet</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">register</span>
<span class="gp">... </span><span class="k">class</span> <span class="nc">MyTokenEmbed</span><span class="p">(</span><span class="n">mxnet</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">_TokenEmbedding</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">def</span> <span class="nf">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">pretrained_file_name</span><span class="o">=</span><span class="s1">'my_pretrain_file'</span><span class="p">):</span>
<span class="gp">... </span> <span class="k">pass</span>
<span class="gp">>>> </span><span class="n">embed</span> <span class="o">=</span> <span class="n">mxnet</span><span class="o">.</span><span class="n">contrib</span><span class="o">.</span><span class="n">text</span><span class="o">.</span><span class="n">embedding</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="s1">'MyTokenEmbed'</span><span class="p">)</span>
<span class="gp">>>> </span><span class="nb">print</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">embed</span><span class="p">))</span>
<span class="go"><class '__main__.MyTokenEmbed'></span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="mxnet.contrib.text.embedding.create">
<code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">create</code><span class="sig-paren">(</span><em>embedding_name</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#create"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.create" title="Permalink to this definition"></a></dt>
<dd><p>Creates an instance of token embedding.</p>
<p>Creates a token embedding instance by loading embedding vectors from an externally hosted
pre-trained token embedding file, such as those of GloVe and FastText. To get all the valid
<cite>embedding_name</cite> and <cite>pretrained_file_name</cite>, use
<cite>mxnet.contrib.text.embedding.get_pretrained_file_names()</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>embedding_name</strong> (<em>str</em>) – The token embedding name (case-insensitive).</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token embedding instance that loads embedding vectors from an externally hosted
pre-trained token embedding file.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">An instance of <cite>mxnet.contrib.text.glossary._TokenEmbedding</cite></td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="function">
<dt id="mxnet.contrib.text.embedding.get_pretrained_file_names">
<code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">get_pretrained_file_names</code><span class="sig-paren">(</span><em>embedding_name=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#get_pretrained_file_names"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.get_pretrained_file_names" title="Permalink to this definition"></a></dt>
<dd><p>Get valid token embedding names and their pre-trained file names.</p>
<p>To load token embedding vectors from an externally hosted pre-trained token embedding file,
such as those of GloVe and FastText, one should use
<cite>mxnet.contrib.text.embedding.create(embedding_name, pretrained_file_name)</cite>.
This method returns all the valid names of <cite>pretrained_file_name</cite> for the specified
<cite>embedding_name</cite>. If <cite>embedding_name</cite> is set to None, this method returns all the valid
names of <cite>embedding_name</cite> with their associated <cite>pretrained_file_name</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>embedding_name</strong> (<em>str</em><em> or </em><em>None</em><em>, </em><em>default None</em>) – The pre-trained token embedding name.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A list of all the valid pre-trained token embedding file names (<cite>pretrained_file_name</cite>)
for the specified token embedding name (<cite>embedding_name</cite>). If the text embedding name is
set to None, returns a dict mapping each valid token embedding name to a list of valid
pre-trained files (<cite>pretrained_file_name</cite>). They can be plugged into
<cite>mxnet.contrib.text.embedding.create(embedding_name,
pretrained_file_name)</cite>.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">dict or list</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="mxnet.contrib.text.embedding.GloVe">
<em class="property">class </em><code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">GloVe</code><span class="sig-paren">(</span><em>pretrained_file_name='glove.840B.300d.txt'</em>, <em>embedding_root='/home/jenkins_slave/.mxnet/embeddings'</em>, <em>init_unknown_vec=<function zeros></em>, <em>vocabulary=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#GloVe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.GloVe" title="Permalink to this definition"></a></dt>
<dd><p>The GloVe word embedding.</p>
<p>GloVe is an unsupervised learning algorithm for obtaining vector representations for words.
Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and
the resulting representations showcase interesting linear substructures of the word vector
space. (Source from <a class="reference external" href="https://nlp.stanford.edu/projects/glove/">https://nlp.stanford.edu/projects/glove/</a>)</p>
<p>Reference:</p>
<p>GloVe: Global Vectors for Word Representation.
Jeffrey Pennington, Richard Socher, and Christopher D. Manning.
<a class="reference external" href="https://nlp.stanford.edu/pubs/glove.pdf">https://nlp.stanford.edu/pubs/glove.pdf</a></p>
<p>Website:</p>
<p><a class="reference external" href="https://nlp.stanford.edu/projects/glove/">https://nlp.stanford.edu/projects/glove/</a></p>
<p>To get the updated URLs to the externally hosted pre-trained token embedding
files, visit <a class="reference external" href="https://nlp.stanford.edu/projects/glove/">https://nlp.stanford.edu/projects/glove/</a></p>
<p>License for pre-trained embeddings:</p>
<blockquote>
<div><a class="reference external" href="https://opendatacommons.org/licenses/pddl/">https://opendatacommons.org/licenses/pddl/</a></div></blockquote>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pretrained_file_name</strong> (<em>str</em><em>, </em><em>default 'glove.840B.300d.txt'</em>) – The name of the pre-trained token embedding file.</li>
<li><strong>embedding_root</strong> (<em>str</em><em>, </em><em>default $MXNET_HOME/embeddings</em>) – The root directory for storing embedding-related files.</li>
<li><strong>init_unknown_vec</strong> (<a class="reference internal" href="../callback/callback.html#module-mxnet.callback" title="mxnet.callback"><em>callback</em></a>) – The callback used to initialize the embedding vector for the unknown token.</li>
<li><strong>vocabulary</strong> (<a class="reference internal" href="#mxnet.contrib.text.vocab.Vocabulary" title="mxnet.contrib.text.vocab.Vocabulary"><code class="xref py py-class docutils literal"><span class="pre">Vocabulary</span></code></a>, default None) – It contains the tokens to index. Each indexed token will be associated with the loaded
embedding vectors, such as loaded from a pre-trained token embedding file. If None, all the
tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding
file, will be indexed.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>token_to_idx <span class="classifier-delimiter">:</span> <span class="classifier">dict mapping str to int</span></dt>
<dd>A dict mapping each token to its index integer.</dd>
<dt>idx_to_token <span class="classifier-delimiter">:</span> <span class="classifier">list of strs</span></dt>
<dd>A list of indexed tokens where the list indices and the token indices are aligned.</dd>
<dt>unknown_token <span class="classifier-delimiter">:</span> <span class="classifier">hashable object</span></dt>
<dd>The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.</dd>
<dt>reserved_tokens <span class="classifier-delimiter">:</span> <span class="classifier">list of strs or None</span></dt>
<dd>A list of reserved tokens that will always be indexed.</dd>
<dt>vec_len <span class="classifier-delimiter">:</span> <span class="classifier">int</span></dt>
<dd>The length of the embedding vector for each token.</dd>
<dt>idx_to_vec <span class="classifier-delimiter">:</span> <span class="classifier">mxnet.ndarray.NDArray</span></dt>
<dd>For all the indexed tokens in this embedding, this NDArray maps each token’s index to an
embedding vector. The largest valid index maps to the initialized embedding vector for every
reserved token, such as an unknown_token token and a padding token.</dd>
</dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.GloVe.get_vecs_by_tokens">
<code class="descname">get_vecs_by_tokens</code><span class="sig-paren">(</span><em>tokens</em>, <em>lower_case_backup=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.GloVe.get_vecs_by_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Look up embedding vectors of tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A token or a list of tokens.</li>
<li><strong>lower_case_backup</strong> (<em>bool</em><em>, </em><em>default False</em>) – If False, each token in the original case will be looked up; if True, each token in the
original case will be looked up first, if not found in the keys of the property
<cite>token_to_idx</cite>, the token in the lower case will be looked up.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The embedding vector(s) of the token(s). According to numpy conventions, if <cite>tokens</cite> is
a string, returns a 1-D NDArray of shape <cite>self.vec_len</cite>; if <cite>tokens</cite> is a list of
strings, returns a 2-D NDArray of shape=(len(tokens), self.vec_len).</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray">mxnet.ndarray.NDArray</a></p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.GloVe.to_indices">
<code class="descname">to_indices</code><span class="sig-paren">(</span><em>tokens</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.GloVe.to_indices" title="Permalink to this definition"></a></dt>
<dd><p>Converts tokens to indices according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A source token or tokens to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token index or a list of token indices according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">int or list of ints</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.GloVe.to_tokens">
<code class="descname">to_tokens</code><span class="sig-paren">(</span><em>indices</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.GloVe.to_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Converts token indices to tokens according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>indices</strong> (<em>int</em><em> or </em><em>list of ints</em>) – A source token index or token indices to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token or a list of tokens according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">str or list of strs</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.GloVe.update_token_vectors">
<code class="descname">update_token_vectors</code><span class="sig-paren">(</span><em>tokens</em>, <em>new_vectors</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.GloVe.update_token_vectors" title="Permalink to this definition"></a></dt>
<dd><p>Updates embedding vectors for tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>a list of strs</em>) – A token or a list of tokens whose embedding vectors are to be updated.</li>
<li><strong>new_vectors</strong> (<a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>mxnet.ndarray.NDArray</em></a>) – An NDArray to be assigned to the embedding vectors of <cite>tokens</cite>. Its length must be equal
to the number of <cite>tokens</cite> and its width must be equal to the dimension of embeddings of
the glossary. If <cite>tokens</cite> is a singleton, it must be 1-D or 2-D. If <cite>tokens</cite> is a list
of multiple strings, it must be 2-D.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.contrib.text.embedding.FastText">
<em class="property">class </em><code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">FastText</code><span class="sig-paren">(</span><em>pretrained_file_name='wiki.simple.vec'</em>, <em>embedding_root='~/.mxnet/embeddings'</em>, <em>init_unknown_vec=&lt;function zeros&gt;</em>, <em>vocabulary=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#FastText"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.FastText" title="Permalink to this definition"></a></dt>
<dd><p>The fastText word embedding.</p>
<p>FastText is an open-source, free, lightweight library that allows users to learn text
representations and text classifiers. It works on standard, generic hardware. Models can later
be reduced in size to even fit on mobile devices. (Source from <a class="reference external" href="https://fasttext.cc/">https://fasttext.cc/</a>)</p>
<p>References:</p>
<p>Enriching Word Vectors with Subword Information.
Piotr Bojanowski, Edouard Grave, Armand Joulin, and Tomas Mikolov.
<a class="reference external" href="https://arxiv.org/abs/1607.04606">https://arxiv.org/abs/1607.04606</a></p>
<p>Bag of Tricks for Efficient Text Classification.
Armand Joulin, Edouard Grave, Piotr Bojanowski, and Tomas Mikolov.
<a class="reference external" href="https://arxiv.org/abs/1607.01759">https://arxiv.org/abs/1607.01759</a></p>
<p>FastText.zip: Compressing text classification models.
Armand Joulin, Edouard Grave, Piotr Bojanowski, Matthijs Douze, Herve Jegou,
and Tomas Mikolov.
<a class="reference external" href="https://arxiv.org/abs/1612.03651">https://arxiv.org/abs/1612.03651</a></p>
<p>For ‘wiki.multi’ embeddings:
Word Translation Without Parallel Data
Alexis Conneau, Guillaume Lample, Marc’Aurelio Ranzato, Ludovic Denoyer,
and Herve Jegou.
<a class="reference external" href="https://arxiv.org/abs/1710.04087">https://arxiv.org/abs/1710.04087</a></p>
<p>Website:</p>
<p><a class="reference external" href="https://fasttext.cc/">https://fasttext.cc/</a></p>
<p>To get the updated URLs to the externally hosted pre-trained token embedding files, visit
<a class="reference external" href="https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md">https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md</a></p>
<p>License for pre-trained embeddings:</p>
<blockquote>
<div><a class="reference external" href="https://creativecommons.org/licenses/by-sa/3.0/">https://creativecommons.org/licenses/by-sa/3.0/</a></div></blockquote>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pretrained_file_name</strong> (<em>str</em><em>, </em><em>default 'wiki.simple.vec'</em>) – The name of the pre-trained token embedding file.</li>
<li><strong>embedding_root</strong> (<em>str</em><em>, </em><em>default $MXNET_HOME/embeddings</em>) – The root directory for storing embedding-related files.</li>
<li><strong>init_unknown_vec</strong> (<a class="reference internal" href="../callback/callback.html#module-mxnet.callback" title="mxnet.callback"><em>callback</em></a>) – The callback used to initialize the embedding vector for the unknown token.</li>
<li><strong>vocabulary</strong> (<a class="reference internal" href="#mxnet.contrib.text.vocab.Vocabulary" title="mxnet.contrib.text.vocab.Vocabulary"><code class="xref py py-class docutils literal"><span class="pre">Vocabulary</span></code></a>, default None) – It contains the tokens to index. Each indexed token will be associated with the loaded
embedding vectors, such as loaded from a pre-trained token embedding file. If None, all the
tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding
file, will be indexed.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>token_to_idx <span class="classifier-delimiter">:</span> <span class="classifier">dict mapping str to int</span></dt>
<dd>A dict mapping each token to its index integer.</dd>
<dt>idx_to_token <span class="classifier-delimiter">:</span> <span class="classifier">list of strs</span></dt>
<dd>A list of indexed tokens where the list indices and the token indices are aligned.</dd>
<dt>unknown_token <span class="classifier-delimiter">:</span> <span class="classifier">hashable object</span></dt>
<dd>The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.</dd>
<dt>reserved_tokens <span class="classifier-delimiter">:</span> <span class="classifier">list of strs or None</span></dt>
<dd>A list of reserved tokens that will always be indexed.</dd>
<dt>vec_len <span class="classifier-delimiter">:</span> <span class="classifier">int</span></dt>
<dd>The length of the embedding vector for each token.</dd>
<dt>idx_to_vec <span class="classifier-delimiter">:</span> <span class="classifier">mxnet.ndarray.NDArray</span></dt>
<dd>For all the indexed tokens in this embedding, this NDArray maps each token’s index to an
embedding vector. The largest valid index maps to the initialized embedding vector for every
reserved token, such as an unknown_token token and a padding token.</dd>
</dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.FastText.get_vecs_by_tokens">
<code class="descname">get_vecs_by_tokens</code><span class="sig-paren">(</span><em>tokens</em>, <em>lower_case_backup=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.FastText.get_vecs_by_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Look up embedding vectors of tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A token or a list of tokens.</li>
<li><strong>lower_case_backup</strong> (<em>bool</em><em>, </em><em>default False</em>) – If False, each token in the original case will be looked up; if True, each token in the
original case will be looked up first, if not found in the keys of the property
<cite>token_to_idx</cite>, the token in the lower case will be looked up.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The embedding vector(s) of the token(s). According to numpy conventions, if <cite>tokens</cite> is
a string, returns a 1-D NDArray of shape <cite>self.vec_len</cite>; if <cite>tokens</cite> is a list of
strings, returns a 2-D NDArray of shape=(len(tokens), self.vec_len).</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray">mxnet.ndarray.NDArray</a></p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.FastText.to_indices">
<code class="descname">to_indices</code><span class="sig-paren">(</span><em>tokens</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.FastText.to_indices" title="Permalink to this definition"></a></dt>
<dd><p>Converts tokens to indices according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A source token or tokens to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token index or a list of token indices according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">int or list of ints</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.FastText.to_tokens">
<code class="descname">to_tokens</code><span class="sig-paren">(</span><em>indices</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.FastText.to_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Converts token indices to tokens according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>indices</strong> (<em>int</em><em> or </em><em>list of ints</em>) – A source token index or token indices to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token or a list of tokens according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">str or list of strs</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.FastText.update_token_vectors">
<code class="descname">update_token_vectors</code><span class="sig-paren">(</span><em>tokens</em>, <em>new_vectors</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.FastText.update_token_vectors" title="Permalink to this definition"></a></dt>
<dd><p>Updates embedding vectors for tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>a list of strs</em>) – A token or a list of tokens whose embedding vectors are to be updated.</li>
<li><strong>new_vectors</strong> (<a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>mxnet.ndarray.NDArray</em></a>) – An NDArray to be assigned to the embedding vectors of <cite>tokens</cite>. Its length must be equal
to the number of <cite>tokens</cite> and its width must be equal to the dimension of embeddings of
the glossary. If <cite>tokens</cite> is a singleton, it must be 1-D or 2-D. If <cite>tokens</cite> is a list
of multiple strings, it must be 2-D.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.contrib.text.embedding.CustomEmbedding">
<em class="property">class </em><code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">CustomEmbedding</code><span class="sig-paren">(</span><em>pretrained_file_path</em>, <em>elem_delim=' '</em>, <em>encoding='utf8'</em>, <em>init_unknown_vec=&lt;function zeros&gt;</em>, <em>vocabulary=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#CustomEmbedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.CustomEmbedding" title="Permalink to this definition"></a></dt>
<dd><p>User-defined token embedding.</p>
<p>This is to load embedding vectors from a user-defined pre-trained text embedding file.</p>
<p>Denote by ‘[ed]’ the argument <cite>elem_delim</cite>. Denote by [v_ij] the j-th element of the token
embedding vector for [token_i], the expected format of a custom pre-trained token embedding file
is:</p>
<p>‘[token_1][ed][v_11][ed][v_12][ed]...[ed][v_1k]\n[token_2][ed][v_21][ed][v_22][ed]...[ed]
[v_2k]\n...’</p>
<p>where k is the length of the embedding vector <cite>vec_len</cite>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>pretrained_file_path</strong> (<em>str</em>) – The path to the custom pre-trained token embedding file.</li>
<li><strong>elem_delim</strong> (<em>str</em><em>, </em><em>default ' '</em>) – The delimiter for splitting a token and every embedding vector element value on the same
line of the custom pre-trained token embedding file.</li>
<li><strong>encoding</strong> (<em>str</em><em>, </em><em>default 'utf8'</em>) – The encoding scheme for reading the custom pre-trained token embedding file.</li>
<li><strong>init_unknown_vec</strong> (<a class="reference internal" href="../callback/callback.html#module-mxnet.callback" title="mxnet.callback"><em>callback</em></a>) – The callback used to initialize the embedding vector for the unknown token.</li>
<li><strong>vocabulary</strong> (<a class="reference internal" href="#mxnet.contrib.text.vocab.Vocabulary" title="mxnet.contrib.text.vocab.Vocabulary"><code class="xref py py-class docutils literal"><span class="pre">Vocabulary</span></code></a>, default None) – It contains the tokens to index. Each indexed token will be associated with the loaded
embedding vectors, such as loaded from a pre-trained token embedding file. If None, all the
tokens from the loaded embedding vectors, such as loaded from a pre-trained token embedding
file, will be indexed.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>token_to_idx <span class="classifier-delimiter">:</span> <span class="classifier">dict mapping str to int</span></dt>
<dd>A dict mapping each token to its index integer.</dd>
<dt>idx_to_token <span class="classifier-delimiter">:</span> <span class="classifier">list of strs</span></dt>
<dd>A list of indexed tokens where the list indices and the token indices are aligned.</dd>
<dt>unknown_token <span class="classifier-delimiter">:</span> <span class="classifier">hashable object</span></dt>
<dd>The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.</dd>
<dt>reserved_tokens <span class="classifier-delimiter">:</span> <span class="classifier">list of strs or None</span></dt>
<dd>A list of reserved tokens that will always be indexed.</dd>
<dt>vec_len <span class="classifier-delimiter">:</span> <span class="classifier">int</span></dt>
<dd>The length of the embedding vector for each token.</dd>
<dt>idx_to_vec <span class="classifier-delimiter">:</span> <span class="classifier">mxnet.ndarray.NDArray</span></dt>
<dd>For all the indexed tokens in this embedding, this NDArray maps each token’s index to an
embedding vector. The largest valid index maps to the initialized embedding vector for every
reserved token, such as an unknown_token token and a padding token.</dd>
</dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CustomEmbedding.get_vecs_by_tokens">
<code class="descname">get_vecs_by_tokens</code><span class="sig-paren">(</span><em>tokens</em>, <em>lower_case_backup=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CustomEmbedding.get_vecs_by_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Look up embedding vectors of tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A token or a list of tokens.</li>
<li><strong>lower_case_backup</strong> (<em>bool</em><em>, </em><em>default False</em>) – If False, each token in the original case will be looked up; if True, each token in the
original case will be looked up first, if not found in the keys of the property
<cite>token_to_idx</cite>, the token in the lower case will be looked up.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The embedding vector(s) of the token(s). According to numpy conventions, if <cite>tokens</cite> is
a string, returns a 1-D NDArray of shape <cite>self.vec_len</cite>; if <cite>tokens</cite> is a list of
strings, returns a 2-D NDArray of shape=(len(tokens), self.vec_len).</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray">mxnet.ndarray.NDArray</a></p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CustomEmbedding.to_indices">
<code class="descname">to_indices</code><span class="sig-paren">(</span><em>tokens</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CustomEmbedding.to_indices" title="Permalink to this definition"></a></dt>
<dd><p>Converts tokens to indices according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A source token or tokens to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token index or a list of token indices according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">int or list of ints</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CustomEmbedding.to_tokens">
<code class="descname">to_tokens</code><span class="sig-paren">(</span><em>indices</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CustomEmbedding.to_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Converts token indices to tokens according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>indices</strong> (<em>int</em><em> or </em><em>list of ints</em>) – A source token index or token indices to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token or a list of tokens according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">str or list of strs</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CustomEmbedding.update_token_vectors">
<code class="descname">update_token_vectors</code><span class="sig-paren">(</span><em>tokens</em>, <em>new_vectors</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CustomEmbedding.update_token_vectors" title="Permalink to this definition"></a></dt>
<dd><p>Updates embedding vectors for tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>a list of strs</em>) – A token or a list of tokens whose embedding vectors are to be updated.</li>
<li><strong>new_vectors</strong> (<a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>mxnet.ndarray.NDArray</em></a>) – An NDArray to be assigned to the embedding vectors of <cite>tokens</cite>. Its length must be equal
to the number of <cite>tokens</cite> and its width must be equal to the dimension of embeddings of
the glossary. If <cite>tokens</cite> is a singleton, it must be 1-D or 2-D. If <cite>tokens</cite> is a list
of multiple strings, it must be 2-D.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="mxnet.contrib.text.embedding.CompositeEmbedding">
<em class="property">class </em><code class="descclassname">mxnet.contrib.text.embedding.</code><code class="descname">CompositeEmbedding</code><span class="sig-paren">(</span><em>vocabulary</em>, <em>token_embeddings</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/embedding.html#CompositeEmbedding"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.embedding.CompositeEmbedding" title="Permalink to this definition"></a></dt>
<dd><p>Composite token embeddings.</p>
<p>For each indexed token in a vocabulary, multiple embedding vectors, such as concatenated
multiple embedding vectors, will be associated with it. Such embedding vectors can be loaded
from externally hosted or custom pre-trained token embedding files, such as via token embedding
instances.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>vocabulary</strong> (<a class="reference internal" href="#mxnet.contrib.text.vocab.Vocabulary" title="mxnet.contrib.text.vocab.Vocabulary"><code class="xref py py-class docutils literal"><span class="pre">Vocabulary</span></code></a>) – For each indexed token in a vocabulary, multiple embedding vectors, such as concatenated
multiple embedding vectors, will be associated with it.</li>
<li><strong>token_embeddings</strong> (instance or list of <cite>mxnet.contrib.text.embedding._TokenEmbedding</cite>) – One or multiple pre-trained token embeddings to load. If it is a list of multiple
embeddings, these embedding vectors will be concatenated for each token.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>token_to_idx <span class="classifier-delimiter">:</span> <span class="classifier">dict mapping str to int</span></dt>
<dd>A dict mapping each token to its index integer.</dd>
<dt>idx_to_token <span class="classifier-delimiter">:</span> <span class="classifier">list of strs</span></dt>
<dd>A list of indexed tokens where the list indices and the token indices are aligned.</dd>
<dt>unknown_token <span class="classifier-delimiter">:</span> <span class="classifier">hashable object</span></dt>
<dd>The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.</dd>
<dt>reserved_tokens <span class="classifier-delimiter">:</span> <span class="classifier">list of strs or None</span></dt>
<dd>A list of reserved tokens that will always be indexed.</dd>
<dt>vec_len <span class="classifier-delimiter">:</span> <span class="classifier">int</span></dt>
<dd>The length of the embedding vector for each token.</dd>
<dt>idx_to_vec <span class="classifier-delimiter">:</span> <span class="classifier">mxnet.ndarray.NDArray</span></dt>
<dd>For all the indexed tokens in this embedding, this NDArray maps each token’s index to an
embedding vector. The largest valid index maps to the initialized embedding vector for every
reserved token, such as an unknown_token token and a padding token.</dd>
</dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CompositeEmbedding.get_vecs_by_tokens">
<code class="descname">get_vecs_by_tokens</code><span class="sig-paren">(</span><em>tokens</em>, <em>lower_case_backup=False</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CompositeEmbedding.get_vecs_by_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Look up embedding vectors of tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A token or a list of tokens.</li>
<li><strong>lower_case_backup</strong> (<em>bool</em><em>, </em><em>default False</em>) – If False, each token in the original case will be looked up; if True, each token in the
original case will be looked up first, if not found in the keys of the property
<cite>token_to_idx</cite>, the token in the lower case will be looked up.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The embedding vector(s) of the token(s). According to numpy conventions, if <cite>tokens</cite> is
a string, returns a 1-D NDArray of shape <cite>self.vec_len</cite>; if <cite>tokens</cite> is a list of
strings, returns a 2-D NDArray of shape=(len(tokens), self.vec_len).</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray">mxnet.ndarray.NDArray</a></p>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CompositeEmbedding.to_indices">
<code class="descname">to_indices</code><span class="sig-paren">(</span><em>tokens</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CompositeEmbedding.to_indices" title="Permalink to this definition"></a></dt>
<dd><p>Converts tokens to indices according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A source token or tokens to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token index or a list of token indices according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">int or list of ints</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CompositeEmbedding.to_tokens">
<code class="descname">to_tokens</code><span class="sig-paren">(</span><em>indices</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CompositeEmbedding.to_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Converts token indices to tokens according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>indices</strong> (<em>int</em><em> or </em><em>list of ints</em>) – A source token index or token indices to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token or a list of tokens according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">str or list of strs</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.embedding.CompositeEmbedding.update_token_vectors">
<code class="descname">update_token_vectors</code><span class="sig-paren">(</span><em>tokens</em>, <em>new_vectors</em><span class="sig-paren">)</span><a class="headerlink" href="#mxnet.contrib.text.embedding.CompositeEmbedding.update_token_vectors" title="Permalink to this definition"></a></dt>
<dd><p>Updates embedding vectors for tokens.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>tokens</strong> (<em>str</em><em> or </em><em>a list of strs</em>) – A token or a list of tokens whose embedding vectors are to be updated.</li>
<li><strong>new_vectors</strong> (<a class="reference internal" href="../ndarray/ndarray.html#mxnet.ndarray.NDArray" title="mxnet.ndarray.NDArray"><em>mxnet.ndarray.NDArray</em></a>) – An NDArray to be assigned to the embedding vectors of <cite>tokens</cite>. Its length must be equal
to the number of <cite>tokens</cite> and its width must be equal to the dimension of embeddings of
the glossary. If <cite>tokens</cite> is a singleton, it must be 1-D or 2-D. If <cite>tokens</cite> is a list
of multiple strings, it must be 2-D.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<span class="target" id="module-mxnet.contrib.text.vocab"></span><p>Text token indexer.</p>
<dl class="class">
<dt id="mxnet.contrib.text.vocab.Vocabulary">
<em class="property">class </em><code class="descclassname">mxnet.contrib.text.vocab.</code><code class="descname">Vocabulary</code><span class="sig-paren">(</span><em>counter=None</em>, <em>most_freq_count=None</em>, <em>min_freq=1</em>, <em>unknown_token='&lt;unk&gt;'</em>, <em>reserved_tokens=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/vocab.html#Vocabulary"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.vocab.Vocabulary" title="Permalink to this definition"></a></dt>
<dd><p>Indexing for text tokens.</p>
<p>Build indices for the unknown token, reserved tokens, and input counter keys. Indexed tokens can
be used by token embeddings.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>counter</strong> (<em>collections.Counter</em><em> or </em><em>None</em><em>, </em><em>default None</em>) – Counts text token frequencies in the text data. Its keys will be indexed according to
frequency thresholds such as <cite>most_freq_count</cite> and <cite>min_freq</cite>. Keys of <cite>counter</cite>,
<cite>unknown_token</cite>, and values of <cite>reserved_tokens</cite> must be of the same hashable type.
Examples: str, int, and tuple.</li>
<li><strong>most_freq_count</strong> (<em>None</em><em> or </em><em>int</em><em>, </em><em>default None</em>) – The maximum possible number of the most frequent tokens in the keys of <cite>counter</cite> that can be
indexed. Note that this argument does not count any token from <cite>reserved_tokens</cite>. Suppose
that there are different keys of <cite>counter</cite> whose frequencies are the same, if indexing all of
them will exceed this argument value, such keys will be indexed one by one according to
their __cmp__() order until the frequency threshold is met. If this argument is None or
larger than its largest possible value restricted by <cite>counter</cite> and <cite>reserved_tokens</cite>, this
argument has no effect.</li>
<li><strong>min_freq</strong> (<em>int</em><em>, </em><em>default 1</em>) – The minimum frequency required for a token in the keys of <cite>counter</cite> to be indexed.</li>
<li><strong>unknown_token</strong> (<em>hashable object</em><em>, </em><em>default '&lt;unk&gt;'</em>) – The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation. Keys of <cite>counter</cite>, <cite>unknown_token</cite>, and values of
<cite>reserved_tokens</cite> must be of the same hashable type. Examples: str, int, and tuple.</li>
<li><strong>reserved_tokens</strong> (<em>list of hashable objects</em><em> or </em><em>None</em><em>, </em><em>default None</em>) – A list of reserved tokens that will always be indexed, such as special symbols representing
padding, beginning of sentence, and end of sentence. It cannot contain <cite>unknown_token</cite>, or
duplicate reserved tokens. Keys of <cite>counter</cite>, <cite>unknown_token</cite>, and values of
<cite>reserved_tokens</cite> must be of the same hashable type. Examples: str, int, and tuple.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="docutils">
<dt>token_to_idx <span class="classifier-delimiter">:</span> <span class="classifier">dict mapping str to int</span></dt>
<dd>A dict mapping each token to its index integer.</dd>
<dt>idx_to_token <span class="classifier-delimiter">:</span> <span class="classifier">list of strs</span></dt>
<dd>A list of indexed tokens where the list indices and the token indices are aligned.</dd>
<dt>unknown_token <span class="classifier-delimiter">:</span> <span class="classifier">hashable object</span></dt>
<dd>The representation for any unknown token. In other words, any unknown token will be indexed
as the same representation.</dd>
<dt>reserved_tokens <span class="classifier-delimiter">:</span> <span class="classifier">list of strs or None</span></dt>
<dd>A list of reserved tokens that will always be indexed.</dd>
</dl>
<dl class="method">
<dt id="mxnet.contrib.text.vocab.Vocabulary.to_indices">
<code class="descname">to_indices</code><span class="sig-paren">(</span><em>tokens</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/vocab.html#Vocabulary.to_indices"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.vocab.Vocabulary.to_indices" title="Permalink to this definition"></a></dt>
<dd><p>Converts tokens to indices according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>tokens</strong> (<em>str</em><em> or </em><em>list of strs</em>) – A source token or tokens to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token index or a list of token indices according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">int or list of ints</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="method">
<dt id="mxnet.contrib.text.vocab.Vocabulary.to_tokens">
<code class="descname">to_tokens</code><span class="sig-paren">(</span><em>indices</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/vocab.html#Vocabulary.to_tokens"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.vocab.Vocabulary.to_tokens" title="Permalink to this definition"></a></dt>
<dd><p>Converts token indices to tokens according to the vocabulary.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>indices</strong> (<em>int</em><em> or </em><em>list of ints</em>) – A source token index or token indices to be converted.</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">A token or a list of tokens according to the vocabulary.</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body">str or list of strs</td>
</tr>
</tbody>
</table>
</dd></dl>
</dd></dl>
<span class="target" id="module-mxnet.contrib.text.utils"></span><p>Provide utilities for text data processing.</p>
<dl class="function">
<dt id="mxnet.contrib.text.utils.count_tokens_from_str">
<code class="descclassname">mxnet.contrib.text.utils.</code><code class="descname">count_tokens_from_str</code><span class="sig-paren">(</span><em>source_str</em>, <em>token_delim=' '</em>, <em>seq_delim='\n'</em>, <em>to_lower=False</em>, <em>counter_to_update=None</em><span class="sig-paren">)</span><a class="reference internal" href="../../../_modules/mxnet/contrib/text/utils.html#count_tokens_from_str"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#mxnet.contrib.text.utils.count_tokens_from_str" title="Permalink to this definition"></a></dt>
<dd><p>Counts tokens in the specified string.</p>
<p>For token_delim=’&lt;td&gt;’ and seq_delim=’&lt;sd&gt;’, a specified string of two sequences of
tokens may look like:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="o"><</span><span class="n">td</span><span class="o">></span><span class="n">token1</span><span class="o"><</span><span class="n">td</span><span class="o">></span><span class="n">token2</span><span class="o"><</span><span class="n">td</span><span class="o">></span><span class="n">token3</span><span class="o"><</span><span class="n">td</span><span class="o">><</span><span class="n">sd</span><span class="o">><</span><span class="n">td</span><span class="o">></span><span class="n">token4</span><span class="o"><</span><span class="n">td</span><span class="o">></span><span class="n">token5</span><span class="o"><</span><span class="n">td</span><span class="o">><</span><span class="n">sd</span><span class="o">></span>
</pre></div>
</div>
<p>&lt;td&gt; and &lt;sd&gt; are regular expressions. Make use of \ to allow special characters as
delimiters. The list of
special characters can be found at <a class="reference external" href="https://docs.python.org/3/library/re.html">https://docs.python.org/3/library/re.html</a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name"/>
<col class="field-body"/>
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>source_str</strong> (<em>str</em>) – A source string of tokens.</li>
<li><strong>token_delim</strong> (<em>str</em><em>, </em><em>default ' '</em>) – A token delimiter.</li>
<li><strong>seq_delim</strong> (<em>str</em><em>, </em><em>default '\n'</em>) – A sequence delimiter.</li>
<li><strong>to_lower</strong> (<em>bool</em><em>, </em><em>default False</em>) – Whether to convert <cite>source_str</cite> to lower case.</li>
<li><strong>counter_to_update</strong> (<em>collections.Counter</em><em> or </em><em>None</em><em>, </em><em>default None</em>) – The collections.Counter instance to be updated with the token counts of <cite>source_str</cite>. If
None, return a new collections.Counter instance counting tokens from <cite>source_str</cite>.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The <cite>counter_to_update</cite> collections.Counter instance after being updated with the token
counts of <cite>source_str</cite>. If <cite>counter_to_update</cite> is None, return a new collections.Counter
instance counting tokens from <cite>source_str</cite>.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">collections.Counter</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Examples</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">source_str</span> <span class="o">=</span> <span class="s1">' Life is great ! </span><span class="se">\n</span><span class="s1"> life is good . </span><span class="se">\n</span><span class="s1">'</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">count_tokens_from_str</span><span class="p">(</span><span class="n">source_str</span><span class="p">,</span> <span class="s1">' '</span><span class="p">,</span> <span class="s1">'</span><span class="se">\n</span><span class="s1">'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="go">Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2})</span>
</pre></div>
</div>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">source_str</span> <span class="o">=</span> <span class="s1">'*Life*is*great*!*</span><span class="se">\n</span><span class="s1">*life*is*good*.*</span><span class="se">\n</span><span class="s1">'</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">count_tokens_from_str</span><span class="p">(</span><span class="n">source_str</span><span class="p">,</span> <span class="s1">'\*'</span><span class="p">,</span> <span class="s1">'</span><span class="se">\n</span><span class="s1">'</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="go">Counter({'is': 2, 'life': 2, '!': 1, 'great': 1, 'good': 1, '.': 1})</span>
</pre></div>
</div>
</dd></dl>
<script>auto_index("api-reference");</script></div>
</div>
</div>
</div>
<div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation">
<div class="sphinxsidebarwrapper">
<h3><a href="../../../index.html">Table Of Contents</a></h3>
<ul>
<li><a class="reference internal" href="#">Text API</a><ul>
<li><a class="reference internal" href="#overview">Overview</a><ul>
<li><a class="reference internal" href="#looking-up-pre-trained-word-embeddings-for-indexed-words">Looking up pre-trained word embeddings for indexed words</a></li>
<li><a class="reference internal" href="#using-pre-trained-word-embeddings-in-gluon">Using pre-trained word embeddings in <code class="docutils literal"><span class="pre">gluon</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#vocabulary">Vocabulary</a></li>
<li><a class="reference internal" href="#text-token-embedding">Text token embedding</a><ul>
<li><a class="reference internal" href="#indexed-tokens-are-from-a-vocabulary">Indexed tokens are from a vocabulary</a></li>
<li><a class="reference internal" href="#indexed-tokens-are-from-the-loaded-embedding-vectors">Indexed tokens are from the loaded embedding vectors</a></li>
<li><a class="reference internal" href="#implement-a-new-text-token-embedding">Implement a new text token embedding</a></li>
</ul>
</li>
<li><a class="reference internal" href="#text-utilities">Text utilities</a></li>
<li><a class="reference internal" href="#api-reference">API Reference</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div><div class="footer">
<div class="section-disclaimer">
<div class="container">
<div>
<img height="60" src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/apache_incubator_logo.png"/>
<p>
Apache MXNet is an effort undergoing incubation at The Apache Software Foundation (ASF), <strong>sponsored by the <i>Apache Incubator</i></strong>. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
</p>
<p>
"Copyright © 2017-2018, The Apache Software Foundation
Apache MXNet, MXNet, Apache, the Apache feather, and the Apache MXNet project logo are either registered trademarks or trademarks of the Apache Software Foundation."
</p>
</div>
</div>
</div>
</div> <!-- pagename != index -->
</div>
<script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<script src="../../../_static/js/sidebar.js" type="text/javascript"></script>
<script src="../../../_static/js/search.js" type="text/javascript"></script>
<script src="../../../_static/js/navbar.js" type="text/javascript"></script>
<script src="../../../_static/js/clipboard.min.js" type="text/javascript"></script>
<script src="../../../_static/js/copycode.js" type="text/javascript"></script>
<script src="../../../_static/js/page.js" type="text/javascript"></script>
<script type="text/javascript">
// Reveal the page once the DOM is ready; <body> starts hidden (via CSS) to
// avoid a flash of unstyled content while stylesheets and scripts load.
// Note: the original called .ready() on $('body'), a usage deprecated since
// jQuery 1.8 and removed in jQuery 3.0; $(fn) is the supported equivalent
// and fires at the same time (ready is always tied to the document).
$(function () {
$('body').css('visibility', 'visible');
});
</script>
</body>
</html>