| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"/> |
| <meta content="IE=edge" http-equiv="X-UA-Compatible"/> |
| <meta content="width=device-width, initial-scale=1" name="viewport"/> |
| <title>Speech LSTM — mxnet documentation</title> |
| <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css" integrity="sha384-1q8mTJOASx8j1Au+a5WDVnPi2lkFfwwEAa8hDDdjZlpLegxhjVME1fgjWPGmkzs7" rel="stylesheet"/> |
| <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.5.0/css/font-awesome.min.css" rel="stylesheet"/> |
| <link href="../../_static/basic.css" rel="stylesheet" type="text/css"/> |
| <link href="../../_static/pygments.css" rel="stylesheet" type="text/css"/> |
| <link href="../../_static/mxnet.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript"> |
| // Global per-page settings object; it is declared before the _static scripts that follow are loaded. |
| var DOCUMENTATION_OPTIONS = { |
| "URL_ROOT": "../../", |
| "VERSION": "", |
| "COLLAPSE_INDEX": false, |
| "FILE_SUFFIX": ".html", |
| "HAS_SOURCE": true, |
| "SOURCELINK_SUFFIX": "" |
| }; |
| </script> |
| <script src="../../_static/jquery-1.11.1.js" type="text/javascript"></script> |
| <script src="../../_static/underscore.js" type="text/javascript"></script> |
| <script src="../../_static/searchtools_custom.js" type="text/javascript"></script> |
| <script src="../../_static/doctools.js" type="text/javascript"></script> |
| <script src="../../_static/selectlang.js" type="text/javascript"></script> |
| <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script> |
| <script type="text/javascript"> |
| // When the DOM is ready, call Search.loadIndex with the index path and then Search.init. |
| // NOTE(review): the index path is absolute while URL_ROOT is relative ("../../") - confirm this resolves when the site is not served from the domain root. |
| jQuery(function () { |
| Search.loadIndex("/searchindex.js"); |
| Search.init(); |
| }); |
| </script> |
| <!-- Google Analytics loader: defines a queueing "ga" stub, inserts an async script element for analytics.js before the first script element, then queues creation of tracker UA-96378503-1 and a single pageview hit. --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new |
| Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-96378503-1', 'auto'); |
| ga('send', 'pageview'); |
| |
| </script> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../../_static/jquery.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../../_static/underscore.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="../../_static/doctools.js"></script> --> |
| <!-- --> |
| <!-- <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script> --> |
| <!-- --> |
| <link href="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/mxnet-icon.png" rel="icon" type="image/png"/> |
| </head> |
| <body role="document"><!-- Previous Navbar Layout |
| <div class="navbar navbar-default navbar-fixed-top"> |
| <div class="container"> |
| <div class="navbar-header"> |
| <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar"> |
| <span class="sr-only">Toggle navigation</span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <a href="../../" class="navbar-brand"> |
| <img src="http://data.mxnet.io/theme/mxnet.png"> |
| </a> |
| </div> |
| <div id="navbar" class="navbar-collapse collapse"> |
| <ul id="navbar" class="navbar navbar-left"> |
| |
| <li> <a href="../../get_started/index.html">Get Started</a> </li> |
| |
| <li> <a href="../../tutorials/index.html">Tutorials</a> </li> |
| |
| <li> <a href="../../how_to/index.html">How To</a> </li> |
| |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Packages <span class="caret"></span></a> |
| <ul class="dropdown-menu"> |
| |
| <li><a href="../../packages/python/index.html"> |
| Python |
| </a></li> |
| |
| <li><a href="../../packages/r/index.html"> |
| R |
| </a></li> |
| |
| <li><a href="../../packages/julia/index.html"> |
| Julia |
| </a></li> |
| |
| <li><a href="../../packages/c++/index.html"> |
| C++ |
| </a></li> |
| |
| <li><a href="../../packages/scala/index.html"> |
| Scala |
| </a></li> |
| |
| <li><a href="../../packages/perl/index.html"> |
| Perl |
| </a></li> |
| |
| </ul> |
| </li> |
| |
| <li> <a href="../../system/index.html">System</a> </li> |
| <li> |
| <form class="" role="search" action="../../search.html" method="get" autocomplete="off"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input type="text" name="q" class="form-control" placeholder="Search"> |
| </div> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| |
| </form> </li> |
| </ul> |
| <ul id="navbar" class="navbar navbar-right"> |
| <li> <a href="../../index.html"><span class="flag-icon flag-icon-us"></span></a> </li> |
| <li> <a href="../..//zh/index.html"><span class="flag-icon flag-icon-cn"></span></a> </li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| Previous Navbar Layout End --> |
| <div class="navbar navbar-fixed-top"> |
| <div class="container" id="navContainer"> |
| <div class="innder" id="header-inner"> |
| <h1 id="logo-wrap"> |
| <a href="../../" id="logo"><img alt="MXNet" src="http://data.mxnet.io/theme/mxnet.png"/></a> |
| </h1> |
| <nav class="nav-bar" id="main-nav"> |
| <a class="main-nav-link" href="../../get_started/install.html">Install</a> |
| <a class="main-nav-link" href="../../tutorials/index.html">Tutorials</a> |
| <a class="main-nav-link" href="../../how_to/index.html">How To</a> |
| <span id="dropdown-menu-position-anchor"> |
| <a aria-expanded="true" aria-haspopup="true" class="main-nav-link dropdown-toggle" data-toggle="dropdown" href="#" role="button">API <span class="caret"></span></a> |
| <ul class="dropdown-menu" id="package-dropdown-menu"> |
| <li><a class="main-nav-link" href="../../api/python/index.html">Python</a></li> |
| <li><a class="main-nav-link" href="../../api/scala/index.html">Scala</a></li> |
| <li><a class="main-nav-link" href="../../api/r/index.html">R</a></li> |
| <li><a class="main-nav-link" href="../../api/julia/index.html">Julia</a></li> |
| <li><a class="main-nav-link" href="../../api/c++/index.html">C++</a></li> |
| <li><a class="main-nav-link" href="../../api/perl/index.html">Perl</a></li> |
| </ul> |
| </span> |
| <a class="main-nav-link" href="../../architecture/index.html">Architecture</a> |
| <!-- <a class="main-nav-link" href="../../community/index.html">Community</a> --> |
| <a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a> |
| <span id="dropdown-menu-position-anchor-version" style="position: relative"><a href="#" class="main-nav-link dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="true">Versions(master)<span class="caret"></span></a><ul id="package-dropdown-menu" class="dropdown-menu"><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a class="main-nav-link" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></span></nav> |
| <script> |
| // Returns the relative path prefix "../../" (the same value as URL_ROOT in DOCUMENTATION_OPTIONS). |
| function getRootPath() { |
| return "../../"; |
| } |
| </script> |
| <div class="burgerIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">☰</a> |
| <ul class="dropdown-menu dropdown-menu-right" id="burgerMenu"> |
| <li><a href="../../get_started/install.html">Install</a></li> |
| <li><a href="../../tutorials/index.html">Tutorials</a></li> |
| <li><a href="../../how_to/index.html">How To</a></li> |
| <li class="dropdown-submenu"> |
| <a href="#" tabindex="-1">API</a> |
| <ul class="dropdown-menu"> |
| <li><a href="../../api/python/index.html" tabindex="-1">Python</a> |
| </li> |
| <li><a href="../../api/scala/index.html" tabindex="-1">Scala</a> |
| </li> |
| <li><a href="../../api/r/index.html" tabindex="-1">R</a> |
| </li> |
| <li><a href="../../api/julia/index.html" tabindex="-1">Julia</a> |
| </li> |
| <li><a href="../../api/c++/index.html" tabindex="-1">C++</a> |
| </li> |
| <li><a href="../../api/perl/index.html" tabindex="-1">Perl</a> |
| </li> |
| </ul> |
| </li> |
| <li><a href="../../architecture/index.html">Architecture</a></li> |
| <li><a class="main-nav-link" href="https://github.com/dmlc/mxnet">Github</a></li> |
| <li id="dropdown-menu-position-anchor-version-mobile" class="dropdown-submenu" style="position: relative"><a href="#" tabindex="-1">Versions(master)</a><ul class="dropdown-menu"><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/">v0.10.14</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/0.10/index.html">0.10</a></li><li><a tabindex="-1" href="http://mxnet.incubator.apache.org/test/versions/master/index.html">master</a></li></ul></li></ul> |
| </div> |
| <div class="plusIcon dropdown"> |
| <a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button"><span aria-hidden="true" class="glyphicon glyphicon-plus"></span></a> |
| <ul class="dropdown-menu dropdown-menu-right" id="plusMenu"></ul> |
| </div> |
| <div id="search-input-wrap"> |
| <form action="../../search.html" autocomplete="off" class="" method="get" role="search"> |
| <div class="form-group inner-addon left-addon"> |
| <i class="glyphicon glyphicon-search"></i> |
| <input class="form-control" name="q" placeholder="Search" type="text"/> |
| </div> |
| <input name="check_keywords" type="hidden" value="yes"/> |
| <input name="area" type="hidden" value="default"/> |
| </form> |
| <div id="search-preview"></div> |
| </div> |
| <div id="searchIcon"> |
| <span aria-hidden="true" class="glyphicon glyphicon-search"></span> |
| </div> |
| <!-- <div id="lang-select-wrap"> --> |
| <!-- <label id="lang-select-label"> --> |
| <!-- <\!-- <i class="fa fa-globe"></i> -\-> --> |
| <!-- <span></span> --> |
| <!-- </label> --> |
| <!-- <select id="lang-select"> --> |
| <!-- <option value="en">Eng</option> --> |
| <!-- <option value="zh">中文</option> --> |
| <!-- </select> --> |
| <!-- </div> --> |
| <!-- <a id="mobile-nav-toggle"> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| <span class="mobile-nav-toggle-bar"></span> |
| </a> --> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="row"> |
| <div aria-label="main navigation" class="sphinxsidebar leftsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/python/index.html">Python Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/r/index.html">R Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/julia/index.html">Julia Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/c++/index.html">C++ Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/scala/index.html">Scala Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../api/perl/index.html">Perl Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../how_to/index.html">HowTo Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../architecture/index.html">System Documents</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../index.html">Tutorials</a></li> |
| </ul> |
| </div> |
| </div> |
| <div class="content"> |
| <div class="section" id="speech-lstm"> |
| <span id="speech-lstm"></span><h1>Speech LSTM<a class="headerlink" href="#speech-lstm" title="Permalink to this headline">¶</a></h1> |
| <p>You can get the source code for these examples on <a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo">GitHub</a>.</p> |
| <div class="section" id="speech-acoustic-modeling-example"> |
| <span id="speech-acoustic-modeling-example"></span><h2>Speech Acoustic Modeling Example<a class="headerlink" href="#speech-acoustic-modeling-example" title="Permalink to this headline">¶</a></h2> |
| <p>The examples folder contains examples for speech recognition:</p> |
| <ul class="simple"> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/lstm_proj.py">lstm_proj.py</a>: Functions for building an LSTM network with and without a projection layer.</li> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/io_util.py">io_util.py</a>: Wrapper functions for <code class="docutils literal"><span class="pre">DataIter</span></code> over speech data.</li> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/train_lstm_proj.py">train_lstm_proj.py</a>: A script for training an LSTM acoustic model.</li> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/decode_mxnet.py">decode_mxnet.py</a>: A script for decoding an LSTMP acoustic model.</li> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/default.cfg">default.cfg</a>: Configuration for training on the <code class="docutils literal"><span class="pre">AMI</span></code> SDM1 dataset. You can use it as a template for writing other configuration files.</li> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/python_wrap">python_wrap</a>: C wrappers for Kaldi C++ code, built into an .so file. The Python code that loads the .so file and calls the C wrapper functions is in <code class="docutils literal"><span class="pre">io_func/feat_readers/reader_kaldi.py</span></code>.</li> |
| </ul> |
| <p>Connect to Kaldi:</p> |
| <ul class="simple"> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/decode_mxnet.sh">decode_mxnet.sh</a>: Called by Kaldi to decode an acoustic model trained by MXNet (select the <code class="docutils literal"><span class="pre">simple</span></code> method for decoding).</li> |
| </ul> |
| <p>A full recipe:</p> |
| <ul class="simple"> |
| <li><a class="reference external" href="https://github.com/dmlc/mxnet/tree/master/example/speech-demo/run_ami.sh">run_ami.sh</a>: A full recipe to train and decode an acoustic model on AMI. It takes features and alignment from Kaldi to train an acoustic model and decode it.</li> |
| </ul> |
| <p>To create the speech acoustic modeling example, use the following steps.</p> |
| <div class="section" id="build-kaldi"> |
| <span id="build-kaldi"></span><h3>Build Kaldi<a class="headerlink" href="#build-kaldi" title="Permalink to this headline">¶</a></h3> |
| <p>Build Kaldi as shared libraries if you have not already done so.</p> |
| <div class="highlight-bash"><div class="highlight"><pre><span></span><span class="nb">cd</span> kaldi/src |
| ./configure --shared <span class="c1"># and other options that you need</span> |
| make depend |
| make |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="build-the-python-wrapper"> |
| <span id="build-the-python-wrapper"></span><h3>Build the Python Wrapper<a class="headerlink" href="#build-the-python-wrapper" title="Permalink to this headline">¶</a></h3> |
| <ol class="simple"> |
| <li>Copy or link the attached <code class="docutils literal"><span class="pre">python_wrap</span></code> folder to <code class="docutils literal"><span class="pre">kaldi/src</span></code>.</li> |
| <li>Compile python_wrap/.</li> |
| </ol> |
| <div class="highlight-python"><div class="highlight"><pre><span></span>cd kaldi/src/python_wrap/ |
| make |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="extract-features-and-prepare-frame-level-labels"> |
| <span id="extract-features-and-prepare-frame-level-labels"></span><h3>Extract Features and Prepare Frame-level Labels<a class="headerlink" href="#extract-features-and-prepare-frame-level-labels" title="Permalink to this headline">¶</a></h3> |
| <p>The acoustic models use Mel filter-bank or MFCC as input features. You also need to use Kaldi to perform force-alignment to generate frame-level labels from the text transcriptions. For example, if you want to work on the <code class="docutils literal"><span class="pre">AMI</span></code> data <code class="docutils literal"><span class="pre">SDM1</span></code>, you can run <code class="docutils literal"><span class="pre">kaldi/egs/ami/s5/run_sdm.sh</span></code>. Before you can run the examples, you need to configure some paths in <code class="docutils literal"><span class="pre">kaldi/egs/ami/s5/cmd.sh</span></code> and <code class="docutils literal"><span class="pre">kaldi/egs/ami/s5/run_sdm.sh</span></code>. Refer to Kaldi’s documentation for details.</p> |
| <p>The default <code class="docutils literal"><span class="pre">run_sdm.sh</span></code> script generates the force-alignment labels in its stage 7, and saves the force-aligned labels in <code class="docutils literal"><span class="pre">exp/sdm1/tri3a_ali</span></code>. The default script generates MFCC features (13-dimensional). You can try training with the MFCC features, or you can create Mel filter-bank features yourself. For example, you can use a script like this to compute Mel filter-bank features using Kaldi:</p> |
| <div class="highlight-bash"><div class="highlight"><pre><span></span><span class="ch">#!/bin/bash -u</span> |
| |
| . ./cmd.sh |
| . ./path.sh |
| |
| <span class="c1"># SDM - Single Distant Microphone</span> |
| <span class="nv">micid</span><span class="o">=</span><span class="m">1</span> <span class="c1">#which mic from array should be used?</span> |
| <span class="nv">mic</span><span class="o">=</span>sdm<span class="nv">$micid</span> |
| |
| <span class="c1"># Set bash to 'debug' mode, it prints the commands (option '-x') and exits on :</span> |
| <span class="c1"># -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline',</span> |
| <span class="nb">set</span> -euxo pipefail |
| |
| <span class="c1"># Path where AMI gets downloaded (or where locally available):</span> |
| <span class="nv">AMI_DIR</span><span class="o">=</span><span class="nv">$PWD</span>/wav_db <span class="c1"># Default,</span> |
| <span class="nv">data_dir</span><span class="o">=</span><span class="nv">$PWD</span>/data/<span class="nv">$mic</span> |
| |
| <span class="c1"># make filter bank data</span> |
| <span class="k">for</span> dset in train dev eval<span class="p">;</span> <span class="k">do</span> |
| steps/make_fbank.sh --nj <span class="m">48</span> --cmd <span class="s2">"</span><span class="nv">$train_cmd</span><span class="s2">"</span> <span class="nv">$data_dir</span>/<span class="nv">$dset</span> <span class="se">\</span> |
| <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/log <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/data-fbank |
| steps/compute_cmvn_stats.sh <span class="nv">$data_dir</span>/<span class="nv">$dset</span> <span class="se">\</span> |
| <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/log <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/data |
| |
| apply-cmvn --utt2spk<span class="o">=</span>ark:<span class="nv">$data_dir</span>/<span class="nv">$dset</span>/utt2spk <span class="se">\</span> |
| scp:<span class="nv">$data_dir</span>/<span class="nv">$dset</span>/cmvn.scp scp:<span class="nv">$data_dir</span>/<span class="nv">$dset</span>/feats.scp <span class="se">\</span> |
| ark,scp:<span class="nv">$data_dir</span>/<span class="nv">$dset</span>/feats-cmvn.ark,<span class="nv">$data_dir</span>/<span class="nv">$dset</span>/feats-cmvn.scp |
| |
| mv <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/feats-cmvn.scp <span class="nv">$data_dir</span>/<span class="nv">$dset</span>/feats.scp |
| <span class="k">done</span> |
| </pre></div> |
| </div> |
| <p><code class="docutils literal"><span class="pre">apply-cmvn</span></code> provides mean-variance normalization. The default setup was applied per speaker. It’s more common to perform mean-variance normalization for the whole corpus, and then feed the results to the neural networks:</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span> compute-cmvn-stats scp:data/sdm1/train_fbank/feats.scp data/sdm1/train_fbank/cmvn_g.ark |
| apply-cmvn --norm-vars=true data/sdm1/train_fbank/cmvn_g.ark scp:data/sdm1/train_fbank/feats.scp ark,scp:data/sdm1/train_fbank_gcmvn/feats.ark,data/sdm1/train_fbank_gcmvn/feats.scp |
| </pre></div> |
| </div> |
| <p>Note that Kaldi always tries to find features in <code class="docutils literal"><span class="pre">feats.scp</span></code>. Ensure that the normalized features are organized as Kaldi expects them during decoding.</p> |
| <p>Finally, put the features and labels together in a file so that MXNet can find them. More specifically, for each data set (train, dev, eval), you will need to create a file similar to <code class="docutils literal"><span class="pre">train_mxnet.feats</span></code>, with the following contents:</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span>TRANSFORM scp:feat.scp |
| scp:label.scp |
| </pre></div> |
| </div> |
| <p><code class="docutils literal"><span class="pre">TRANSFORM</span></code> is the transformation you want to apply to the features. By default, we use <code class="docutils literal"><span class="pre">NO_FEATURE_TRANSFORM</span></code>. The <code class="docutils literal"><span class="pre">scp:</span></code> syntax is from Kaldi. <code class="docutils literal"><span class="pre">feat.scp</span></code> is typically the file from <code class="docutils literal"><span class="pre">data/sdm1/train/feats.scp</span></code>, and <code class="docutils literal"><span class="pre">label.scp</span></code> is converted from the force-aligned labels located in <code class="docutils literal"><span class="pre">exp/sdm1/tri3a_ali</span></code>. Because the force-alignments are generated only on the training data, we split the training set in two, using a 90/10 ratio, and then use the 1/10 holdout as the dev set (validation set). The script <a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/example/speech-demo/run_ami.sh">run_ami.sh</a> automatically splits and formats the file for MXNet. Before running it, set the path in the script correctly. The <a class="reference external" href="https://github.com/dmlc/mxnet/blob/master/example/speech-demo/run_ami.sh">run_ami.sh</a> script actually runs the full pipeline, including training the acoustic model and decoding. If the scripts ran successfully, you can skip the following sections.</p> |
| </div> |
| <div class="section" id="run-mxnet-acoustic-model-training"> |
| <span id="run-mxnet-acoustic-model-training"></span><h3>Run MXNet Acoustic Model Training<a class="headerlink" href="#run-mxnet-acoustic-model-training" title="Permalink to this headline">¶</a></h3> |
| <ol class="simple"> |
| <li>Return to the speech demo directory in MXNet. Make a copy of <code class="docutils literal"><span class="pre">default.cfg</span></code>, and edit the necessary parameters, such as the path to the dataset you just prepared.</li> |
| <li>Run <code class="docutils literal"><span class="pre">python</span> <span class="pre">train_lstm_proj.py</span> <span class="pre">--configfile=your-config.cfg</span></code>. For help, use <code class="docutils literal"><span class="pre">python</span> <span class="pre">train_lstm_proj.py</span> <span class="pre">--help</span></code>. You can set all of the configuration parameters in <code class="docutils literal"><span class="pre">default.cfg</span></code>, the customized config file, and through the command line (e.g., using <code class="docutils literal"><span class="pre">--train_batch_size=50</span></code>). The latter values override the former ones.</li> |
| </ol> |
| <p>Here are some example outputs from training on the TIMIT dataset:</p> |
| <div class="highlight-python"><div class="highlight"><pre><span></span>Example output for TIMIT: |
| Summary of dataset ================== |
| bucket of len 100 : 3 samples |
| bucket of len 200 : 346 samples |
| bucket of len 300 : 1496 samples |
| bucket of len 400 : 974 samples |
| bucket of len 500 : 420 samples |
| bucket of len 600 : 90 samples |
| bucket of len 700 : 11 samples |
| bucket of len 800 : 2 samples |
| Summary of dataset ================== |
| bucket of len 100 : 0 samples |
| bucket of len 200 : 28 samples |
| bucket of len 300 : 169 samples |
| bucket of len 400 : 107 samples |
| bucket of len 500 : 41 samples |
| bucket of len 600 : 6 samples |
| bucket of len 700 : 3 samples |
| bucket of len 800 : 0 samples |
| 2016-04-21 20:02:40,904 Epoch[0] Train-Acc_exlude_padding=0.154763 |
| 2016-04-21 20:02:40,904 Epoch[0] Time cost=91.574 |
| 2016-04-21 20:02:44,419 Epoch[0] Validation-Acc_exlude_padding=0.353552 |
| 2016-04-21 20:04:17,290 Epoch[1] Train-Acc_exlude_padding=0.447318 |
| 2016-04-21 20:04:17,290 Epoch[1] Time cost=92.870 |
| 2016-04-21 20:04:20,738 Epoch[1] Validation-Acc_exlude_padding=0.506458 |
| 2016-04-21 20:05:53,127 Epoch[2] Train-Acc_exlude_padding=0.557543 |
| 2016-04-21 20:05:53,128 Epoch[2] Time cost=92.390 |
| 2016-04-21 20:05:56,568 Epoch[2] Validation-Acc_exlude_padding=0.548100 |
| </pre></div> |
| </div> |
| <p>The final frame accuracy was approximately 62%.</p> |
| </div> |
| <div class="section" id="run-decode-on-the-trained-acoustic-model"> |
| <span id="run-decode-on-the-trained-acoustic-model"></span><h3>Run Decode on the Trained Acoustic Model<a class="headerlink" href="#run-decode-on-the-trained-acoustic-model" title="Permalink to this headline">¶</a></h3> |
| <ol class="simple"> |
| <li>Estimate senone priors by running <code class="docutils literal"><span class="pre">python</span> <span class="pre">make_stats.py</span> <span class="pre">--configfile=your-config.cfg</span> <span class="pre">|</span> <span class="pre">copy-feats</span> <span class="pre">ark:-</span> <span class="pre">ark:label_mean.ark</span></code> (edit necessary items, such as the path to the training dataset). This command generates the label counts in <code class="docutils literal"><span class="pre">label_mean.ark</span></code>.</li> |
| <li>Link to the necessary Kaldi decode setup, e.g., <code class="docutils literal"><span class="pre">local/</span></code> and <code class="docutils literal"><span class="pre">utils/</span></code> and run <code class="docutils literal"><span class="pre">./run_ami.sh</span> <span class="pre">--model</span> <span class="pre">prefix</span> <span class="pre">model</span> <span class="pre">--num_epoch</span> <span class="pre">num</span></code>.</li> |
| </ol> |
| <p>Here are the results for the TIMIT and AMI test sets (using the default setup, three-layer LSTM with projection layers):</p> |
| <table border="1" class="docutils"> |
| <colgroup> |
| <col width="50%"/> |
| <col width="50%"/> |
| </colgroup> |
| <thead valign="bottom"> |
| <tr class="row-odd"><th class="head">Corpus</th> |
| <th class="head">WER</th> |
| </tr> |
| </thead> |
| <tbody valign="top"> |
| <tr class="row-even"><td>TIMIT</td> |
| <td>18.9</td> |
| </tr> |
| <tr class="row-odd"><td>AMI</td> |
| <td>51.7 (42.2)</td> |
| </tr> |
| </tbody> |
| </table> |
| <p>For AMI, 42.2 was evaluated on non-overlapped speech. The Kaldi-HMM baseline was 67.2%, and DNN was 57.5%.</p> |
| </div> |
| </div> |
| <div class="section" id="next-steps"> |
| <span id="next-steps"></span><h2>Next Steps<a class="headerlink" href="#next-steps" title="Permalink to this headline">¶</a></h2> |
| <div class="toctree-wrapper compound"> |
| <ul> |
| <li class="toctree-l1"><a class="reference external" href="http://mxnet.io/tutorials/index.html">MXNet tutorials index</a></li> |
| </ul> |
| </div> |
| </div> |
| </div> |
| <div class="container"> |
| <div class="footer"> |
| <p> © 2015-2017 DMLC. All rights reserved. </p> |
| </div> |
| </div> |
| </div> |
| <div aria-label="main navigation" class="sphinxsidebar rightsidebar" role="navigation"> |
| <div class="sphinxsidebarwrapper"> |
| <h3><a href="../../index.html">Table Of Contents</a></h3> |
| <ul> |
| <li><a class="reference internal" href="#">Speech LSTM</a><ul> |
| <li><a class="reference internal" href="#speech-acoustic-modeling-example">Speech Acoustic Modeling Example</a><ul> |
| <li><a class="reference internal" href="#build-kaldi">Build Kaldi</a></li> |
| <li><a class="reference internal" href="#build-the-python-wrapper">Build the Python Wrapper</a></li> |
| <li><a class="reference internal" href="#extract-features-and-prepare-frame-level-labels">Extract Features and Prepare Frame-level Labels</a></li> |
| <li><a class="reference internal" href="#run-mxnet-acoustic-model-training">Run MXNet Acoustic Model Training</a></li> |
| <li><a class="reference internal" href="#run-decode-on-the-trained-acoustic-model">Run Decode on the Trained Acoustic Model</a></li> |
| </ul> |
| </li> |
| <li><a class="reference internal" href="#next-steps">Next Steps</a></li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> |
| </div> <!-- pagename != index --> |
| <script crossorigin="anonymous" integrity="sha384-0mSbJDEHialfmuBBQP6A4Qrprq5OVfW37PRR3j5ELqxss1yVqOtnepnHVP9aJ7xS" src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script> |
| <script src="../../_static/js/sidebar.js" type="text/javascript"></script> |
| <script src="../../_static/js/search.js" type="text/javascript"></script> |
| <script src="../../_static/js/navbar.js" type="text/javascript"></script> |
| <script src="../../_static/js/clipboard.min.js" type="text/javascript"></script> |
| <script src="../../_static/js/copycode.js" type="text/javascript"></script> |
| <script type="text/javascript"> |
| // Once the DOM is ready, force the body to be visible (presumably it is hidden by the stylesheet until then - confirm in mxnet.css). |
| $(document).ready(function () { |
| var pageBody = $('body'); |
| pageBody.css('visibility', 'visible'); |
| }); |
| </script> |
| </div></body> |
| </html> |