<!-- blob: 205c85bffb6e7adce126974bc3c116db3cb84ca7 [file] [log] [blame] -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc (1.8.0_121) on Fri Apr 14 22:11:36 PDT 2017 -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>DictionaryVectorizer (Mahout Map-Reduce 0.13.0 API)</title>
<meta name="date" content="2017-04-14">
<link rel="stylesheet" type="text/css" href="../../../../stylesheet.css" title="Style">
<script type="text/javascript" src="../../../../script.js"></script>
</head>
<body>
<script type="text/javascript"><!--
    // Copy this page's title onto the parent document unless the URL
    // contains the is-external=true marker. Any exception raised while
    // doing so is ignored by the empty catch.
    try {
        if (location.href.indexOf('is-external=true') < 0) {
            parent.document.title = "DictionaryVectorizer (Mahout Map-Reduce 0.13.0 API)";
        }
    } catch (err) {
    }

    // Page-level globals for the Method Summary table:
    //   methods         maps each summary row id (i0..i3) to a number; the
    //                   values look like bit flags matching the keys of
    //                   tabs (TODO confirm against script.js).
    //   tabs            maps a numeric key to [tab element id, tab label].
    //   altColor etc.   CSS class names for table rows and tabs.
    // Presumably read by show() in script.js, which the tab links call.
    var methods = {"i0":9,"i1":10,"i2":9,"i3":10},
        tabs = {65535:["t0","All Methods"],1:["t1","Static Methods"],2:["t2","Instance Methods"],8:["t4","Concrete Methods"]},
        altColor = "altColor",
        rowColor = "rowColor",
        tableTab = "tableTab",
        activeTableTab = "activeTableTab";
//-->
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
<!-- ========= START OF TOP NAVBAR ======= -->
<div class="topNav"><a name="navbar.top">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.top.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="class-use/DictionaryVectorizer.html">Use</a></li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../index-all.html">Index</a></li>
<li><a href="../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev&nbsp;Class</li>
<li><a href="../../../../org/apache/mahout/vectorizer/DocumentProcessor.html" title="class in org.apache.mahout.vectorizer"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../index.html?org/apache/mahout/vectorizer/DictionaryVectorizer.html" target="_top">Frames</a></li>
<li><a href="DictionaryVectorizer.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_top">
<li><a href="../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
    // Display the top "All Classes" list only when this window is the
    // topmost window; otherwise hide it.
    allClassesLink = document.getElementById("allclasses_navbar_top");
    allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li>Nested&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.top">
<!-- -->
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
<!-- ======== START OF CLASS DATA ======== -->
<div class="header">
<div class="subTitle">org.apache.mahout.vectorizer</div>
<h2 title="Class DictionaryVectorizer" class="title">Class DictionaryVectorizer</h2>
</div>
<div class="contentContainer">
<ul class="inheritance">
<li><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true" title="class or interface in java.lang">java.lang.Object</a></li>
<li>
<ul class="inheritance">
<li>org.apache.hadoop.conf.Configured</li>
<li>
<ul class="inheritance">
<li><a href="../../../../org/apache/mahout/common/AbstractJob.html" title="class in org.apache.mahout.common">org.apache.mahout.common.AbstractJob</a></li>
<li>
<ul class="inheritance">
<li>org.apache.mahout.vectorizer.DictionaryVectorizer</li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
<div class="description">
<ul class="blockList">
<li class="blockList">
<dl>
<dt>All Implemented Interfaces:</dt>
<dd>org.apache.hadoop.conf.Configurable, org.apache.hadoop.util.Tool, <a href="../../../../org/apache/mahout/vectorizer/Vectorizer.html" title="interface in org.apache.mahout.vectorizer">Vectorizer</a></dd>
</dl>
<hr>
<br>
<pre>public final class <span class="typeNameLabel">DictionaryVectorizer</span>
extends <a href="../../../../org/apache/mahout/common/AbstractJob.html" title="class in org.apache.mahout.common">AbstractJob</a>
implements <a href="../../../../org/apache/mahout/vectorizer/Vectorizer.html" title="interface in org.apache.mahout.vectorizer">Vectorizer</a></pre>
<div class="block">This class converts a set of input documents in the sequence file format to vectors. The Sequence file
input should have a <code>Text</code> key containing the unique document identifier and a <a href="../../../../org/apache/mahout/common/StringTuple.html" title="class in org.apache.mahout.common"><code>StringTuple</code></a>
value containing the tokenized document. You may use <a href="../../../../org/apache/mahout/vectorizer/DocumentProcessor.html" title="class in org.apache.mahout.vectorizer"><code>DocumentProcessor</code></a> to tokenize the document.
This is a dictionary-based Vectorizer.</div>
</li>
</ul>
</div>
<div class="summary">
<ul class="blockList">
<li class="blockList">
<!-- =========== FIELD SUMMARY =========== -->
<ul class="blockList">
<li class="blockList"><a name="field.summary">
<!-- -->
</a>
<h3>Field Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Field Summary table, listing fields, and an explanation">
<caption><span>Fields</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Field and Description</th>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static int</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#DEFAULT_MIN_SUPPORT">DEFAULT_MIN_SUPPORT</a></span></code>&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#DICTIONARY_FILE">DICTIONARY_FILE</a></span></code>&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#DOCUMENT_VECTOR_OUTPUT_FOLDER">DOCUMENT_VECTOR_OUTPUT_FOLDER</a></span></code>&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#MAX_NGRAMS">MAX_NGRAMS</a></span></code>&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#MIN_SUPPORT">MIN_SUPPORT</a></span></code>&nbsp;</td>
</tr>
</table>
<ul class="blockList">
<li class="blockList"><a name="fields.inherited.from.class.org.apache.mahout.common.AbstractJob">
<!-- -->
</a>
<h3>Fields inherited from class&nbsp;org.apache.mahout.common.<a href="../../../../org/apache/mahout/common/AbstractJob.html" title="class in org.apache.mahout.common">AbstractJob</a></h3>
<code><a href="../../../../org/apache/mahout/common/AbstractJob.html#argMap">argMap</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#inputFile">inputFile</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#inputPath">inputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#outputFile">outputFile</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#outputPath">outputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#tempPath">tempPath</a></code></li>
</ul>
</li>
</ul>
<!-- ========== METHOD SUMMARY =========== -->
<ul class="blockList">
<li class="blockList"><a name="method.summary">
<!-- -->
</a>
<h3>Method Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Method Summary table, listing methods, and an explanation">
<caption><span id="t0" class="activeTableTab"><span>All Methods</span><span class="tabEnd">&nbsp;</span></span><span id="t1" class="tableTab"><span><a href="javascript:show(1);">Static Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t2" class="tableTab"><span><a href="javascript:show(2);">Instance Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t4" class="tableTab"><span><a href="javascript:show(8);">Concrete Methods</a></span><span class="tabEnd">&nbsp;</span></span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Method and Description</th>
</tr>
<tr id="i0" class="altColor">
<td class="colFirst"><code>static void</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#createTermFrequencyVectors-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.String-org.apache.hadoop.conf.Configuration-int-int-float-float-boolean-int-int-boolean-boolean-">createTermFrequencyVectors</a></span>(org.apache.hadoop.fs.Path&nbsp;input,
org.apache.hadoop.fs.Path&nbsp;output,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>&nbsp;tfVectorsFolderName,
org.apache.hadoop.conf.Configuration&nbsp;baseConf,
int&nbsp;minSupport,
int&nbsp;maxNGramSize,
float&nbsp;minLLRValue,
float&nbsp;normPower,
boolean&nbsp;logNormalize,
int&nbsp;numReducers,
int&nbsp;chunkSizeInMegabytes,
boolean&nbsp;sequentialAccess,
boolean&nbsp;namedVectors)</code>
<div class="block">Create Term Frequency (Tf) Vectors from the input set of documents in <code>SequenceFile</code> format.</div>
</td>
</tr>
<tr id="i1" class="rowColor">
<td class="colFirst"><code>void</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#createVectors-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-org.apache.mahout.vectorizer.VectorizerConfig-">createVectors</a></span>(org.apache.hadoop.fs.Path&nbsp;input,
org.apache.hadoop.fs.Path&nbsp;output,
<a href="../../../../org/apache/mahout/vectorizer/VectorizerConfig.html" title="class in org.apache.mahout.vectorizer">VectorizerConfig</a>&nbsp;config)</code>&nbsp;</td>
</tr>
<tr id="i2" class="altColor">
<td class="colFirst"><code>static void</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#main-java.lang.String:A-">main</a></span>(<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>[]&nbsp;args)</code>&nbsp;</td>
</tr>
<tr id="i3" class="rowColor">
<td class="colFirst"><code>int</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../org/apache/mahout/vectorizer/DictionaryVectorizer.html#run-java.lang.String:A-">run</a></span>(<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>[]&nbsp;args)</code>&nbsp;</td>
</tr>
</table>
<ul class="blockList">
<li class="blockList"><a name="methods.inherited.from.class.org.apache.mahout.common.AbstractJob">
<!-- -->
</a>
<h3>Methods inherited from class&nbsp;org.apache.mahout.common.<a href="../../../../org/apache/mahout/common/AbstractJob.html" title="class in org.apache.mahout.common">AbstractJob</a></h3>
<code><a href="../../../../org/apache/mahout/common/AbstractJob.html#addFlag-java.lang.String-java.lang.String-java.lang.String-">addFlag</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addInputOption--">addInputOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addOption-org.apache.commons.cli2.Option-">addOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addOption-java.lang.String-java.lang.String-java.lang.String-">addOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addOption-java.lang.String-java.lang.String-java.lang.String-boolean-">addOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addOption-java.lang.String-java.lang.String-java.lang.String-java.lang.String-">addOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#addOutputOption--">addOutputOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#buildOption-java.lang.String-java.lang.String-java.lang.String-boolean-boolean-java.lang.String-">buildOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#buildOption-java.lang.String-java.lang.String-java.lang.String-boolean-int-int-boolean-java.lang.String-">buildOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getAnalyzerClassFromOption--">getAnalyzerClassFromOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getCLIOption-java.lang.String-">getCLIOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getConf--">getConf</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getDimensions-org.apache.hadoop.fs.Path-">getDimensions</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getFloat-java.lang.String-">getFloat</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getFloat-java.lang.String-float-">getFloat</a>, <a 
href="../../../../org/apache/mahout/common/AbstractJob.html#getGroup--">getGroup</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getInputFile--">getInputFile</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getInputPath--">getInputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getInt-java.lang.String-">getInt</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getInt-java.lang.String-int-">getInt</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOption-java.util.Map-java.lang.String-">getOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOption-java.lang.String-">getOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOption-java.lang.String-java.lang.String-">getOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOptions-java.lang.String-">getOptions</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOutputFile--">getOutputFile</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOutputPath--">getOutputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getOutputPath-java.lang.String-">getOutputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getTempPath--">getTempPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#getTempPath-java.lang.String-">getTempPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#hasOption-java.lang.String-">hasOption</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#keyFor-java.lang.String-">keyFor</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#maybePut-java.util.Map-org.apache.commons.cli2.CommandLine-org.apache.commons.cli2.Option...-">maybePut</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#parseArguments-java.lang.String:A-">parseArguments</a>, <a 
href="../../../../org/apache/mahout/common/AbstractJob.html#parseArguments-java.lang.String:A-boolean-boolean-">parseArguments</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#parseDirectories-org.apache.commons.cli2.CommandLine-boolean-boolean-">parseDirectories</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#prepareJob-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-">prepareJob</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#prepareJob-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.String-">prepareJob</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#prepareJob-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-">prepareJob</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#prepareJob-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-java.lang.Class-">prepareJob</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#setConf-org.apache.hadoop.conf.Configuration-">setConf</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#setS3SafeCombinedInputPath-org.apache.hadoop.mapreduce.Job-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-">setS3SafeCombinedInputPath</a>, <a href="../../../../org/apache/mahout/common/AbstractJob.html#shouldRunNextPhase-java.util.Map-java.util.concurrent.atomic.AtomicInteger-">shouldRunNextPhase</a></code></li>
</ul>
<ul class="blockList">
<li class="blockList"><a name="methods.inherited.from.class.java.lang.Object">
<!-- -->
</a>
<h3>Methods inherited from class&nbsp;java.lang.<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true" title="class or interface in java.lang">Object</a></h3>
<code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#clone--" title="class or interface in java.lang">clone</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#equals-java.lang.Object-" title="class or interface in java.lang">equals</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#finalize--" title="class or interface in java.lang">finalize</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#getClass--" title="class or interface in java.lang">getClass</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#hashCode--" title="class or interface in java.lang">hashCode</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#notify--" title="class or interface in java.lang">notify</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#notifyAll--" title="class or interface in java.lang">notifyAll</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#toString--" title="class or interface in java.lang">toString</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#wait--" title="class or interface in java.lang">wait</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#wait-long-" title="class or interface in java.lang">wait</a>, <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Object.html?is-external=true#wait-long-int-" title="class or interface in java.lang">wait</a></code></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
<div class="details">
<ul class="blockList">
<li class="blockList">
<!-- ============ FIELD DETAIL =========== -->
<ul class="blockList">
<li class="blockList"><a name="field.detail">
<!-- -->
</a>
<h3>Field Detail</h3>
<a name="DOCUMENT_VECTOR_OUTPUT_FOLDER">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>DOCUMENT_VECTOR_OUTPUT_FOLDER</h4>
<pre>public static final&nbsp;<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a> DOCUMENT_VECTOR_OUTPUT_FOLDER</pre>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../../../constant-values.html#org.apache.mahout.vectorizer.DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER">Constant Field Values</a></dd>
</dl>
</li>
</ul>
<a name="MIN_SUPPORT">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>MIN_SUPPORT</h4>
<pre>public static final&nbsp;<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a> MIN_SUPPORT</pre>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../../../constant-values.html#org.apache.mahout.vectorizer.DictionaryVectorizer.MIN_SUPPORT">Constant Field Values</a></dd>
</dl>
</li>
</ul>
<a name="MAX_NGRAMS">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>MAX_NGRAMS</h4>
<pre>public static final&nbsp;<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a> MAX_NGRAMS</pre>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../../../constant-values.html#org.apache.mahout.vectorizer.DictionaryVectorizer.MAX_NGRAMS">Constant Field Values</a></dd>
</dl>
</li>
</ul>
<a name="DEFAULT_MIN_SUPPORT">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>DEFAULT_MIN_SUPPORT</h4>
<pre>public static final&nbsp;int DEFAULT_MIN_SUPPORT</pre>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../../../constant-values.html#org.apache.mahout.vectorizer.DictionaryVectorizer.DEFAULT_MIN_SUPPORT">Constant Field Values</a></dd>
</dl>
</li>
</ul>
<a name="DICTIONARY_FILE">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>DICTIONARY_FILE</h4>
<pre>public static final&nbsp;<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a> DICTIONARY_FILE</pre>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../../../constant-values.html#org.apache.mahout.vectorizer.DictionaryVectorizer.DICTIONARY_FILE">Constant Field Values</a></dd>
</dl>
</li>
</ul>
</li>
</ul>
<!-- ============ METHOD DETAIL ========== -->
<ul class="blockList">
<li class="blockList"><a name="method.detail">
<!-- -->
</a>
<h3>Method Detail</h3>
<a name="createVectors-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-org.apache.mahout.vectorizer.VectorizerConfig-">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>createVectors</h4>
<pre>public&nbsp;void&nbsp;createVectors(org.apache.hadoop.fs.Path&nbsp;input,
org.apache.hadoop.fs.Path&nbsp;output,
<a href="../../../../org/apache/mahout/vectorizer/VectorizerConfig.html" title="class in org.apache.mahout.vectorizer">VectorizerConfig</a>&nbsp;config)
throws <a href="http://docs.oracle.com/javase/7/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io">IOException</a>,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/ClassNotFoundException.html?is-external=true" title="class or interface in java.lang">ClassNotFoundException</a>,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/InterruptedException.html?is-external=true" title="class or interface in java.lang">InterruptedException</a></pre>
<dl>
<dt><span class="overrideSpecifyLabel">Specified by:</span></dt>
<dd><code><a href="../../../../org/apache/mahout/vectorizer/Vectorizer.html#createVectors-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-org.apache.mahout.vectorizer.VectorizerConfig-">createVectors</a></code>&nbsp;in interface&nbsp;<code><a href="../../../../org/apache/mahout/vectorizer/Vectorizer.html" title="interface in org.apache.mahout.vectorizer">Vectorizer</a></code></dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io">IOException</a></code></dd>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/ClassNotFoundException.html?is-external=true" title="class or interface in java.lang">ClassNotFoundException</a></code></dd>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/InterruptedException.html?is-external=true" title="class or interface in java.lang">InterruptedException</a></code></dd>
</dl>
</li>
</ul>
<a name="createTermFrequencyVectors-org.apache.hadoop.fs.Path-org.apache.hadoop.fs.Path-java.lang.String-org.apache.hadoop.conf.Configuration-int-int-float-float-boolean-int-int-boolean-boolean-">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>createTermFrequencyVectors</h4>
<pre>public static&nbsp;void&nbsp;createTermFrequencyVectors(org.apache.hadoop.fs.Path&nbsp;input,
org.apache.hadoop.fs.Path&nbsp;output,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>&nbsp;tfVectorsFolderName,
org.apache.hadoop.conf.Configuration&nbsp;baseConf,
int&nbsp;minSupport,
int&nbsp;maxNGramSize,
float&nbsp;minLLRValue,
float&nbsp;normPower,
boolean&nbsp;logNormalize,
int&nbsp;numReducers,
int&nbsp;chunkSizeInMegabytes,
boolean&nbsp;sequentialAccess,
boolean&nbsp;namedVectors)
throws <a href="http://docs.oracle.com/javase/7/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io">IOException</a>,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/InterruptedException.html?is-external=true" title="class or interface in java.lang">InterruptedException</a>,
<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/ClassNotFoundException.html?is-external=true" title="class or interface in java.lang">ClassNotFoundException</a></pre>
<div class="block">Create Term Frequency (Tf) Vectors from the input set of documents in <code>SequenceFile</code> format. This
tries to fix the maximum memory used by the feature chunk per node, thereby splitting the process across
multiple map/reduce jobs.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>input</code> - input directory of the documents in <code>SequenceFile</code> format</dd>
<dd><code>output</code> - output directory where the <a href="http://mahout.apache.org/mahout-math/apidocs/org/apache/mahout/math/RandomAccessSparseVector.html?is-external=true" title="class or interface in org.apache.mahout.math"><code>RandomAccessSparseVector</code></a>s of the documents
are generated</dd>
<dd><code>tfVectorsFolderName</code> - The name of the folder in which the final output vectors will be stored</dd>
<dd><code>baseConf</code> - job configuration</dd>
<dd><code>minSupport</code> - the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
sparse vector</dd>
<dd><code>maxNGramSize</code> - 1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram</dd>
<dd><code>minLLRValue</code> - minimum value of the log likelihood ratio to be used to prune ngrams</dd>
<dd><code>normPower</code> - L_p norm to be computed</dd>
<dd><code>logNormalize</code> - whether to use log normalization</dd>
<dd><code>numReducers</code> - </dd>
<dd><code>chunkSizeInMegabytes</code> - the size in MB of the feature =&gt; id chunk to be kept in memory at each node during the Map/Reduce
stage. It is recommended that you calculate this based on the number of cores and the free memory
available to you per node. Say you have 2 cores and around 1GB of extra memory to spare: we
recommend you use a split size of around 400-500MB so that two simultaneous reducers can create
partial vectors without thrashing the system due to increased swapping.</dd>
<dd><code>sequentialAccess</code> - </dd>
<dd><code>namedVectors</code> - </dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io">IOException</a></code></dd>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/InterruptedException.html?is-external=true" title="class or interface in java.lang">InterruptedException</a></code></dd>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/ClassNotFoundException.html?is-external=true" title="class or interface in java.lang">ClassNotFoundException</a></code></dd>
</dl>
</li>
</ul>
<a name="run-java.lang.String:A-">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>run</h4>
<pre>public&nbsp;int&nbsp;run(<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>[]&nbsp;args)
throws <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Exception.html?is-external=true" title="class or interface in java.lang">Exception</a></pre>
<dl>
<dt><span class="overrideSpecifyLabel">Specified by:</span></dt>
<dd><code>run</code>&nbsp;in interface&nbsp;<code>org.apache.hadoop.util.Tool</code></dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Exception.html?is-external=true" title="class or interface in java.lang">Exception</a></code></dd>
</dl>
</li>
</ul>
<a name="main-java.lang.String:A-">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>main</h4>
<pre>public static&nbsp;void&nbsp;main(<a href="http://docs.oracle.com/javase/7/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang">String</a>[]&nbsp;args)
throws <a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Exception.html?is-external=true" title="class or interface in java.lang">Exception</a></pre>
<dl>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="http://docs.oracle.com/javase/7/docs/api/java/lang/Exception.html?is-external=true" title="class or interface in java.lang">Exception</a></code></dd>
</dl>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
<!-- ========= END OF CLASS DATA ========= -->
<!-- ======= START OF BOTTOM NAVBAR ====== -->
<div class="bottomNav"><a name="navbar.bottom">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.bottom.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="class-use/DictionaryVectorizer.html">Use</a></li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../index-all.html">Index</a></li>
<li><a href="../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev&nbsp;Class</li>
<li><a href="../../../../org/apache/mahout/vectorizer/DocumentProcessor.html" title="class in org.apache.mahout.vectorizer"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../index.html?org/apache/mahout/vectorizer/DictionaryVectorizer.html" target="_top">Frames</a></li>
<li><a href="DictionaryVectorizer.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_bottom">
<li><a href="../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
    // Display the bottom "All Classes" list only when this window is the
    // topmost window; otherwise hide it.
    allClassesLink = document.getElementById("allclasses_navbar_bottom");
    allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li>Nested&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.bottom">
<!-- -->
</a></div>
<!-- ======== END OF BOTTOM NAVBAR ======= -->
<p class="legalCopy"><small>Copyright &#169; 2008&#x2013;2017 <a href="http://www.apache.org/">The Apache Software Foundation</a>. All rights reserved.</small></p>
</body>
</html>