userguide/clustering/plsa.html - incubator-hivemall-site - Git at Google


 <!DOCTYPE HTML>
 <html lang="en">
     <head>
         <meta charset="UTF-8">
         <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
         <title>Probabilistic Latent Semantic Analysis · Hivemall User Manual</title>
         <meta http-equiv="X-UA-Compatible" content="IE=edge" />
         <meta name="description" content="">
         <meta name="generator" content="GitBook 3.2.3">


     <link rel="stylesheet" href="../gitbook/style.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-splitter/splitter.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-etoc/plugin.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-callouts/plugin.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-toggle-chapters/toggle.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-codeblock-filename/block.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-multipart/multipart.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-emphasize/plugin.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">


                 <link rel="stylesheet" href="../gitbook/gitbook-plugin-theme-api/theme-api.css">


     <meta name="HandheldFriendly" content="true"/>
     <meta name="viewport" content="width=device-width, initial-scale=1">
     <meta name="apple-mobile-web-app-capable" content="yes">
     <meta name="apple-mobile-web-app-status-bar-style" content="black">
     <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
     <link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">


     <link rel="next" href="../geospatial/latlon.html" />


     <link rel="prev" href="lda.html" />


     </head>
     <body>

 <div class="book">
     <div class="book-summary">


 <div id="book-search-input" role="search">
     <input type="text" placeholder="Type to search" aria-label="Search" />
 </div>


                 <nav role="navigation">


 <ul class="summary">


         <li>
             <a href="https://hivemall.incubator.apache.org/" target="_blank" class="custom-link"><i class="fa fa-home"></i> Home</a>
         </li>


     <li class="divider"></li>


         <li class="header">TABLE OF CONTENTS</li>


         <li class="chapter " data-level="1.1" data-path="../">

                 <a href="../">


                         <b>1.1.</b>

                     Introduction

                 </a>


         </li>

         <li class="chapter " data-level="1.2" data-path="../getting_started/">

                 <a href="../getting_started/">


                         <b>1.2.</b>

                     Getting Started

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="1.2.1" data-path="../getting_started/installation.html">

                 <a href="../getting_started/installation.html">


                         <b>1.2.1.</b>

                     Installation

                 </a>


         </li>

         <li class="chapter " data-level="1.2.2" data-path="../getting_started/permanent-functions.html">

                 <a href="../getting_started/permanent-functions.html">


                         <b>1.2.2.</b>

                     Install as permanent functions

                 </a>


         </li>

         <li class="chapter " data-level="1.2.3" data-path="../getting_started/input-format.html">

                 <a href="../getting_started/input-format.html">


                         <b>1.2.3.</b>

                     Input Format

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="1.3" data-path="../misc/funcs.html">

                 <a href="../misc/funcs.html">


                         <b>1.3.</b>

                     List of Functions

                 </a>


         </li>

         <li class="chapter " data-level="1.4" data-path="../tips/">

                 <a href="../tips/">


                         <b>1.4.</b>

                     Tips for Effective Hivemall

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="1.4.1" data-path="../tips/addbias.html">

                 <a href="../tips/addbias.html">


                         <b>1.4.1.</b>

                     Explicit add_bias() for better prediction

                 </a>


         </li>

         <li class="chapter " data-level="1.4.2" data-path="../tips/rand_amplify.html">

                 <a href="../tips/rand_amplify.html">


                         <b>1.4.2.</b>

                     Use rand_amplify() for better prediction results

                 </a>


         </li>

         <li class="chapter " data-level="1.4.3" data-path="../tips/rt_prediction.html">

                 <a href="../tips/rt_prediction.html">


                         <b>1.4.3.</b>

                     Real-time prediction on RDBMS

                 </a>


         </li>

         <li class="chapter " data-level="1.4.4" data-path="../tips/ensemble_learning.html">

                 <a href="../tips/ensemble_learning.html">


                         <b>1.4.4.</b>

                     Ensemble learning for stable prediction

                 </a>


         </li>

         <li class="chapter " data-level="1.4.5" data-path="../tips/mixserver.html">

                 <a href="../tips/mixserver.html">


                         <b>1.4.5.</b>

                     Mixing models for a better prediction convergence (MIX server)

                 </a>


         </li>

         <li class="chapter " data-level="1.4.6" data-path="../tips/emr.html">

                 <a href="../tips/emr.html">


                         <b>1.4.6.</b>

                     Run Hivemall on Amazon Elastic MapReduce

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="1.5" data-path="../tips/general_tips.html">

                 <a href="../tips/general_tips.html">


                         <b>1.5.</b>

                     General Hive/Hadoop Tips

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="1.5.1" data-path="../tips/rowid.html">

                 <a href="../tips/rowid.html">


                         <b>1.5.1.</b>

                     Adding rowid for each row

                 </a>


         </li>

         <li class="chapter " data-level="1.5.2" data-path="../tips/hadoop_tuning.html">

                 <a href="../tips/hadoop_tuning.html">


                         <b>1.5.2.</b>

                     Hadoop tuning for Hivemall

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="1.6" data-path="../troubleshooting/">

                 <a href="../troubleshooting/">


                         <b>1.6.</b>

                     Troubleshooting

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="1.6.1" data-path="../troubleshooting/oom.html">

                 <a href="../troubleshooting/oom.html">


                         <b>1.6.1.</b>

                     OutOfMemoryError in training

                 </a>


         </li>

         <li class="chapter " data-level="1.6.2" data-path="../troubleshooting/mapjoin_task_error.html">

                 <a href="../troubleshooting/mapjoin_task_error.html">


                         <b>1.6.2.</b>

                     SemanticException generate map join task error: Cannot serialize object

                 </a>


         </li>

         <li class="chapter " data-level="1.6.3" data-path="../troubleshooting/asterisk.html">

                 <a href="../troubleshooting/asterisk.html">


                         <b>1.6.3.</b>

                     Asterisk argument for UDTF does not work

                 </a>


         </li>

         <li class="chapter " data-level="1.6.4" data-path="../troubleshooting/num_mappers.html">

                 <a href="../troubleshooting/num_mappers.html">


                         <b>1.6.4.</b>

                     The number of mappers is less than input splits in Hadoop 2.x

                 </a>


         </li>

         <li class="chapter " data-level="1.6.5" data-path="../troubleshooting/mapjoin_classcastex.html">

                 <a href="../troubleshooting/mapjoin_classcastex.html">


                         <b>1.6.5.</b>

                     Map-side join causes ClassCastException on Tez

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part II - Generic Features</li>


         <li class="chapter " data-level="2.1" data-path="../misc/generic_funcs.html">

                 <a href="../misc/generic_funcs.html">


                         <b>2.1.</b>

                     List of Generic Hivemall Functions

                 </a>


         </li>

         <li class="chapter " data-level="2.2" data-path="../misc/topk.html">

                 <a href="../misc/topk.html">


                         <b>2.2.</b>

                     Efficient Top-K Query Processing

                 </a>


         </li>

         <li class="chapter " data-level="2.3" data-path="../misc/tokenizer.html">

                 <a href="../misc/tokenizer.html">


                         <b>2.3.</b>

                     Text Tokenizer

                 </a>


         </li>

         <li class="chapter " data-level="2.4" data-path="../misc/approx.html">

                 <a href="../misc/approx.html">


                         <b>2.4.</b>

                     Approximate Aggregate Functions

                 </a>


         </li>


         <li class="header">Part III - Feature Engineering</li>


         <li class="chapter " data-level="3.1" data-path="../ft_engineering/scaling.html">

                 <a href="../ft_engineering/scaling.html">


                         <b>3.1.</b>

                     Feature Scaling

                 </a>


         </li>

         <li class="chapter " data-level="3.2" data-path="../ft_engineering/hashing.html">

                 <a href="../ft_engineering/hashing.html">


                         <b>3.2.</b>

                     Feature Hashing

                 </a>


         </li>

         <li class="chapter " data-level="3.3" data-path="../ft_engineering/selection.html">

                 <a href="../ft_engineering/selection.html">


                         <b>3.3.</b>

                     Feature Selection

                 </a>


         </li>

         <li class="chapter " data-level="3.4" data-path="../ft_engineering/binning.html">

                 <a href="../ft_engineering/binning.html">


                         <b>3.4.</b>

                     Feature Binning

                 </a>


         </li>

         <li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html">

                 <a href="../ft_engineering/pairing.html">


                         <b>3.5.</b>

                     Feature Pairing

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html">

                 <a href="../ft_engineering/polynomial.html">


                         <b>3.5.1.</b>

                     Polynomial features

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html">

                 <a href="../ft_engineering/ft_trans.html">


                         <b>3.6.</b>

                     Feature Transformation

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="3.6.1" data-path="../ft_engineering/vectorization.html">

                 <a href="../ft_engineering/vectorization.html">


                         <b>3.6.1.</b>

                     Feature vectorization

                 </a>


         </li>

         <li class="chapter " data-level="3.6.2" data-path="../ft_engineering/quantify.html">

                 <a href="../ft_engineering/quantify.html">


                         <b>3.6.2.</b>

                     Quantify non-number features

                 </a>


         </li>

         <li class="chapter " data-level="3.6.3" data-path="../ft_engineering/binarize.html">

                 <a href="../ft_engineering/binarize.html">


                         <b>3.6.3.</b>

                     Binarize label

                 </a>


         </li>

         <li class="chapter " data-level="3.6.4" data-path="../ft_engineering/onehot.html">

                 <a href="../ft_engineering/onehot.html">


                         <b>3.6.4.</b>

                     One-hot encoding

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="3.7" data-path="../ft_engineering/term_vector.html">

                 <a href="../ft_engineering/term_vector.html">


                         <b>3.7.</b>

                     Term Vector Model

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="3.7.1" data-path="../ft_engineering/tfidf.html">

                 <a href="../ft_engineering/tfidf.html">


                         <b>3.7.1.</b>

                     TF-IDF Term Weighting

                 </a>


         </li>

         <li class="chapter " data-level="3.7.2" data-path="../ft_engineering/bm25.html">

                 <a href="../ft_engineering/bm25.html">


                         <b>3.7.2.</b>

                     Okapi BM25 Term Weighting

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part IV - Evaluation</li>


         <li class="chapter " data-level="4.1" data-path="../eval/binary_classification_measures.html">

                 <a href="../eval/binary_classification_measures.html">


                         <b>4.1.</b>

                     Binary Classification Metrics

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="4.1.1" data-path="../eval/auc.html">

                 <a href="../eval/auc.html">


                         <b>4.1.1.</b>

                     Area under the ROC curve

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="4.2" data-path="../eval/multilabel_classification_measures.html">

                 <a href="../eval/multilabel_classification_measures.html">


                         <b>4.2.</b>

                     Multi-label Classification Metrics

                 </a>


         </li>

         <li class="chapter " data-level="4.3" data-path="../eval/regression.html">

                 <a href="../eval/regression.html">


                         <b>4.3.</b>

                     Regression Metrics

                 </a>


         </li>

         <li class="chapter " data-level="4.4" data-path="../eval/rank.html">

                 <a href="../eval/rank.html">


                         <b>4.4.</b>

                     Ranking Measures

                 </a>


         </li>

         <li class="chapter " data-level="4.5" data-path="../eval/datagen.html">

                 <a href="../eval/datagen.html">


                         <b>4.5.</b>

                     Data Generation

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="4.5.1" data-path="../eval/lr_datagen.html">

                 <a href="../eval/lr_datagen.html">


                         <b>4.5.1.</b>

                     Logistic Regression data generation

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part V - Supervised Learning</li>


         <li class="chapter " data-level="5.1" data-path="../supervised_learning/prediction.html">

                 <a href="../supervised_learning/prediction.html">


                         <b>5.1.</b>

                     How Prediction Works

                 </a>


         </li>

         <li class="chapter " data-level="5.2" data-path="../supervised_learning/tutorial.html">

                 <a href="../supervised_learning/tutorial.html">


                         <b>5.2.</b>

                     Step-by-Step Tutorial on Supervised Learning

                 </a>


         </li>


         <li class="header">Part VI - Binary Classification</li>


         <li class="chapter " data-level="6.1" data-path="../binaryclass/general.html">

                 <a href="../binaryclass/general.html">


                         <b>6.1.</b>

                     Binary Classification

                 </a>


         </li>

         <li class="chapter " data-level="6.2" data-path="../binaryclass/a9a.html">

                 <a href="../binaryclass/a9a.html">


                         <b>6.2.</b>

                     a9a Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.2.1" data-path="../binaryclass/a9a_dataset.html">

                 <a href="../binaryclass/a9a_dataset.html">


                         <b>6.2.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.2.2" data-path="../binaryclass/a9a_generic.html">

                 <a href="../binaryclass/a9a_generic.html">


                         <b>6.2.2.</b>

                     General Binary Classifier

                 </a>


         </li>

         <li class="chapter " data-level="6.2.3" data-path="../binaryclass/a9a_lr.html">

                 <a href="../binaryclass/a9a_lr.html">


                         <b>6.2.3.</b>

                     Logistic Regression

                 </a>


         </li>

         <li class="chapter " data-level="6.2.4" data-path="../binaryclass/a9a_minibatch.html">

                 <a href="../binaryclass/a9a_minibatch.html">


                         <b>6.2.4.</b>

                     Mini-batch Gradient Descent

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="6.3" data-path="../binaryclass/news20.html">

                 <a href="../binaryclass/news20.html">


                         <b>6.3.</b>

                     News20 Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.3.1" data-path="../binaryclass/news20_dataset.html">

                 <a href="../binaryclass/news20_dataset.html">


                         <b>6.3.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.3.2" data-path="../binaryclass/news20_pa.html">

                 <a href="../binaryclass/news20_pa.html">


                         <b>6.3.2.</b>

                     Perceptron, Passive Aggressive

                 </a>


         </li>

         <li class="chapter " data-level="6.3.3" data-path="../binaryclass/news20_scw.html">

                 <a href="../binaryclass/news20_scw.html">


                         <b>6.3.3.</b>

                     CW, AROW, SCW

                 </a>


         </li>

         <li class="chapter " data-level="6.3.4" data-path="../binaryclass/news20_generic.html">

                 <a href="../binaryclass/news20_generic.html">


                         <b>6.3.4.</b>

                     General Binary Classifier

                 </a>


         </li>

         <li class="chapter " data-level="6.3.5" data-path="../binaryclass/news20_generic_bagging.html">

                 <a href="../binaryclass/news20_generic_bagging.html">


                         <b>6.3.5.</b>

                     Bagging classifiers

                 </a>


         </li>

         <li class="chapter " data-level="6.3.6" data-path="../binaryclass/news20_adagrad.html">

                 <a href="../binaryclass/news20_adagrad.html">


                         <b>6.3.6.</b>

                     AdaGradRDA, AdaGrad, AdaDelta

                 </a>


         </li>

         <li class="chapter " data-level="6.3.7" data-path="../binaryclass/news20_rf.html">

                 <a href="../binaryclass/news20_rf.html">


                         <b>6.3.7.</b>

                     Random Forest

                 </a>


         </li>

         <li class="chapter " data-level="6.3.8" data-path="../binaryclass/news20b_xgboost.html">

                 <a href="../binaryclass/news20b_xgboost.html">


                         <b>6.3.8.</b>

                     XGBoost

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010a.html">

                 <a href="../binaryclass/kdd2010a.html">


                         <b>6.4.</b>

                     KDD2010a Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010a_dataset.html">

                 <a href="../binaryclass/kdd2010a_dataset.html">


                         <b>6.4.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010a_scw.html">

                 <a href="../binaryclass/kdd2010a_scw.html">


                         <b>6.4.2.</b>

                     PA, CW, AROW, SCW

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="6.5" data-path="../binaryclass/kdd2010b.html">

                 <a href="../binaryclass/kdd2010b.html">


                         <b>6.5.</b>

                     KDD2010b Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.5.1" data-path="../binaryclass/kdd2010b_dataset.html">

                 <a href="../binaryclass/kdd2010b_dataset.html">


                         <b>6.5.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.5.2" data-path="../binaryclass/kdd2010b_arow.html">

                 <a href="../binaryclass/kdd2010b_arow.html">


                         <b>6.5.2.</b>

                     AROW

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="6.6" data-path="../binaryclass/webspam.html">

                 <a href="../binaryclass/webspam.html">


                         <b>6.6.</b>

                     Webspam Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.6.1" data-path="../binaryclass/webspam_dataset.html">

                 <a href="../binaryclass/webspam_dataset.html">


                         <b>6.6.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.6.2" data-path="../binaryclass/webspam_scw.html">

                 <a href="../binaryclass/webspam_scw.html">


                         <b>6.6.2.</b>

                     PA1, AROW, SCW

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="6.7" data-path="../binaryclass/titanic_rf.html">

                 <a href="../binaryclass/titanic_rf.html">


                         <b>6.7.</b>

                     Kaggle Titanic Tutorial

                 </a>


         </li>

         <li class="chapter " data-level="6.8" data-path="../binaryclass/criteo.html">

                 <a href="../binaryclass/criteo.html">


                         <b>6.8.</b>

                     Criteo Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="6.8.1" data-path="../binaryclass/criteo_dataset.html">

                 <a href="../binaryclass/criteo_dataset.html">


                         <b>6.8.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="6.8.2" data-path="../binaryclass/criteo_ffm.html">

                 <a href="../binaryclass/criteo_ffm.html">


                         <b>6.8.2.</b>

                     Field-Aware Factorization Machines

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part VII - Multiclass Classification</li>


         <li class="chapter " data-level="7.1" data-path="../multiclass/news20.html">

                 <a href="../multiclass/news20.html">


                         <b>7.1.</b>

                     News20 Multiclass Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="7.1.1" data-path="../multiclass/news20_dataset.html">

                 <a href="../multiclass/news20_dataset.html">


                         <b>7.1.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="7.1.2" data-path="../multiclass/news20_one-vs-the-rest_dataset.html">

                 <a href="../multiclass/news20_one-vs-the-rest_dataset.html">


                         <b>7.1.2.</b>

                     Data Preparation for one-vs-the-rest classifiers

                 </a>


         </li>

         <li class="chapter " data-level="7.1.3" data-path="../multiclass/news20_pa.html">

                 <a href="../multiclass/news20_pa.html">


                         <b>7.1.3.</b>

                     PA

                 </a>


         </li>

         <li class="chapter " data-level="7.1.4" data-path="../multiclass/news20_scw.html">

                 <a href="../multiclass/news20_scw.html">


                         <b>7.1.4.</b>

                     CW, AROW, SCW

                 </a>


         </li>

         <li class="chapter " data-level="7.1.5" data-path="../multiclass/news20_xgboost.html">

                 <a href="../multiclass/news20_xgboost.html">


                         <b>7.1.5.</b>

                     XGBoost

                 </a>


         </li>

         <li class="chapter " data-level="7.1.6" data-path="../multiclass/news20_ensemble.html">

                 <a href="../multiclass/news20_ensemble.html">


                         <b>7.1.6.</b>

                     Ensemble learning

                 </a>


         </li>

         <li class="chapter " data-level="7.1.7" data-path="../multiclass/news20_one-vs-the-rest.html">

                 <a href="../multiclass/news20_one-vs-the-rest.html">


                         <b>7.1.7.</b>

                     one-vs-the-rest Classifier

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="7.2" data-path="../multiclass/iris.html">

                 <a href="../multiclass/iris.html">


                         <b>7.2.</b>

                     Iris Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="7.2.1" data-path="../multiclass/iris_dataset.html">

                 <a href="../multiclass/iris_dataset.html">


                         <b>7.2.1.</b>

                     Data preparation

                 </a>


         </li>

         <li class="chapter " data-level="7.2.2" data-path="../multiclass/iris_scw.html">

                 <a href="../multiclass/iris_scw.html">


                         <b>7.2.2.</b>

                     SCW

                 </a>


         </li>

         <li class="chapter " data-level="7.2.3" data-path="../multiclass/iris_randomforest.html">

                 <a href="../multiclass/iris_randomforest.html">


                         <b>7.2.3.</b>

                     Random Forest

                 </a>


         </li>

         <li class="chapter " data-level="7.2.4" data-path="../multiclass/iris_xgboost.html">

                 <a href="../multiclass/iris_xgboost.html">


                         <b>7.2.4.</b>

                     XGBoost

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part VIII - Regression</li>


         <li class="chapter " data-level="8.1" data-path="../regression/general.html">

                 <a href="../regression/general.html">


                         <b>8.1.</b>

                     Regression

                 </a>


         </li>

         <li class="chapter " data-level="8.2" data-path="../regression/e2006.html">

                 <a href="../regression/e2006.html">


                         <b>8.2.</b>

                     E2006-tfidf Regression Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html">

                 <a href="../regression/e2006_dataset.html">


                         <b>8.2.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="8.2.2" data-path="../regression/e2006_generic.html">

                 <a href="../regression/e2006_generic.html">


                         <b>8.2.2.</b>

                     General Regressor

                 </a>


         </li>

         <li class="chapter " data-level="8.2.3" data-path="../regression/e2006_arow.html">

                 <a href="../regression/e2006_arow.html">


                         <b>8.2.3.</b>

                     Passive Aggressive, AROW

                 </a>


         </li>

         <li class="chapter " data-level="8.2.4" data-path="../regression/e2006_xgboost.html">

                 <a href="../regression/e2006_xgboost.html">


                         <b>8.2.4.</b>

                     XGBoost

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html">

                 <a href="../regression/kddcup12tr2.html">


                         <b>8.3.</b>

                     KDDCup 2012 Track 2 CTR Prediction Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html">

                 <a href="../regression/kddcup12tr2_dataset.html">


                         <b>8.3.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html">

                 <a href="../regression/kddcup12tr2_lr.html">


                         <b>8.3.2.</b>

                     Logistic Regression, Passive Aggressive

                 </a>


         </li>

         <li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html">

                 <a href="../regression/kddcup12tr2_lr_amplify.html">


                         <b>8.3.3.</b>

                     Logistic Regression with amplifier

                 </a>


         </li>

         <li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html">

                 <a href="../regression/kddcup12tr2_adagrad.html">


                         <b>8.3.4.</b>

                     AdaGrad, AdaDelta

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part IX - Recommendation</li>


         <li class="chapter " data-level="9.1" data-path="../recommend/cf.html">

                 <a href="../recommend/cf.html">


                         <b>9.1.</b>

                     Collaborative Filtering

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="9.1.1" data-path="../recommend/item_based_cf.html">

                 <a href="../recommend/item_based_cf.html">


                         <b>9.1.1.</b>

                     Item-based Collaborative Filtering

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="9.2" data-path="../recommend/news20.html">

                 <a href="../recommend/news20.html">


                         <b>9.2.</b>

                     News20 Related Article Recommendation Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="9.2.1" data-path="../multiclass/news20_dataset.html">

                 <a href="../multiclass/news20_dataset.html">


                         <b>9.2.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="9.2.2" data-path="../recommend/news20_jaccard.html">

                 <a href="../recommend/news20_jaccard.html">


                         <b>9.2.2.</b>

                     LSH/MinHash and Jaccard Similarity

                 </a>


         </li>

         <li class="chapter " data-level="9.2.3" data-path="../recommend/news20_knn.html">

                 <a href="../recommend/news20_knn.html">


                         <b>9.2.3.</b>

                     LSH/MinHash and Brute-force Search

                 </a>


         </li>

         <li class="chapter " data-level="9.2.4" data-path="../recommend/news20_bbit_minhash.html">

                 <a href="../recommend/news20_bbit_minhash.html">


                         <b>9.2.4.</b>

                     kNN search using b-Bits MinHash

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="9.3" data-path="../recommend/movielens.html">

                 <a href="../recommend/movielens.html">


                         <b>9.3.</b>

                     MovieLens Movie Recommendation Tutorial

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="9.3.1" data-path="../recommend/movielens_dataset.html">

                 <a href="../recommend/movielens_dataset.html">


                         <b>9.3.1.</b>

                     Data Preparation

                 </a>


         </li>

         <li class="chapter " data-level="9.3.2" data-path="../recommend/movielens_cf.html">

                 <a href="../recommend/movielens_cf.html">


                         <b>9.3.2.</b>

                     Item-based Collaborative Filtering

                 </a>


         </li>

         <li class="chapter " data-level="9.3.3" data-path="../recommend/movielens_mf.html">

                 <a href="../recommend/movielens_mf.html">


                         <b>9.3.3.</b>

                     Matrix Factorization

                 </a>


         </li>

         <li class="chapter " data-level="9.3.4" data-path="../recommend/movielens_fm.html">

                 <a href="../recommend/movielens_fm.html">


                         <b>9.3.4.</b>

                     Factorization Machine

                 </a>


         </li>

         <li class="chapter " data-level="9.3.5" data-path="../recommend/movielens_slim.html">

                 <a href="../recommend/movielens_slim.html">


                         <b>9.3.5.</b>

                     SLIM for fast top-k Recommendation

                 </a>


         </li>

         <li class="chapter " data-level="9.3.6" data-path="../recommend/movielens_cv.html">

                 <a href="../recommend/movielens_cv.html">


                         <b>9.3.6.</b>

                     10-fold Cross Validation (Matrix Factorization)

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part X - Anomaly Detection</li>


         <li class="chapter " data-level="10.1" data-path="../anomaly/lof.html">

                 <a href="../anomaly/lof.html">


                         <b>10.1.</b>

                     Outlier Detection using Local Outlier Factor (LOF)

                 </a>


         </li>

         <li class="chapter " data-level="10.2" data-path="../anomaly/sst.html">

                 <a href="../anomaly/sst.html">


                         <b>10.2.</b>

                     Change-Point Detection using Singular Spectrum Transformation (SST)

                 </a>


         </li>

         <li class="chapter " data-level="10.3" data-path="../anomaly/changefinder.html">

                 <a href="../anomaly/changefinder.html">


                         <b>10.3.</b>

                     ChangeFinder: Detecting Outlier and Change-Point Simultaneously

                 </a>


         </li>


         <li class="header">Part XI - Clustering</li>


         <li class="chapter " data-level="11.1" data-path="lda.html">

                 <a href="lda.html">


                         <b>11.1.</b>

                     Latent Dirichlet Allocation

                 </a>


         </li>

         <li class="chapter active" data-level="11.2" data-path="plsa.html">

                 <a href="plsa.html">


                         <b>11.2.</b>

                     Probabilistic Latent Semantic Analysis

                 </a>


         </li>


         <li class="header">Part XII - GeoSpatial Functions</li>


         <li class="chapter " data-level="12.1" data-path="../geospatial/latlon.html">

                 <a href="../geospatial/latlon.html">


                         <b>12.1.</b>

                     Lat/Lon functions

                 </a>


         </li>


         <li class="header">Part XIII - Hivemall on SparkSQL</li>


         <li class="chapter " data-level="13.1" data-path="../spark/getting_started/README.md">

                 <span>


                         <b>13.1.</b>

                     Getting Started

                 </span>


             <ul class="articles">


         <li class="chapter " data-level="13.1.1" data-path="../spark/getting_started/installation.html">

                 <a href="../spark/getting_started/installation.html">


                         <b>13.1.1.</b>

                     Installation

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="13.2" data-path="../spark/binaryclass/">

                 <a href="../spark/binaryclass/">


                         <b>13.2.</b>

                     Binary Classification

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="13.2.1" data-path="../spark/binaryclass/a9a_sql.html">

                 <a href="../spark/binaryclass/a9a_sql.html">


                         <b>13.2.1.</b>

                     a9a Tutorial for SQL

                 </a>


         </li>


             </ul>

         </li>

         <li class="chapter " data-level="13.3" data-path="../spark/binaryclass/">

                 <a href="../spark/binaryclass/">


                         <b>13.3.</b>

                     Regression

                 </a>


             <ul class="articles">


         <li class="chapter " data-level="13.3.1" data-path="../spark/regression/e2006_sql.html">

                 <a href="../spark/regression/e2006_sql.html">


                         <b>13.3.1.</b>

                     E2006-tfidf Regression Tutorial for SQL

                 </a>


         </li>


             </ul>

         </li>


         <li class="header">Part XIV - Hivemall on Docker</li>


         <li class="chapter " data-level="14.1" data-path="../docker/getting_started.html">

                 <a href="../docker/getting_started.html">


                         <b>14.1.</b>

                     Getting Started

                 </a>


         </li>


         <li class="header">Part XV - External References</li>


         <li class="chapter " data-level="15.1" >

                 <a target="_blank" href="https://github.com/daijyc/hivemall/wiki/PigHome">


                         <b>15.1.</b>

                     Hivemall on Apache Pig

                 </a>


         </li>


     <li class="divider"></li>

     <li>
         <a href="https://www.gitbook.com" target="_blank" rel="noopener" class="gitbook-link">
             Published with GitBook
         </a>
     </li>
 </ul>


                 </nav>


     </div>

     <div class="book-body">

             <div class="body-inner">


 <div class="book-header" role="navigation">


     <!-- Title -->
     <h1>
         <i class="fa fa-circle-o-notch fa-spin"></i>
         <a href=".." >Probabilistic Latent Semantic Analysis</a>
     </h1>
 </div>


                     <div class="page-wrapper" tabindex="-1" role="main">
                         <div class="page-inner">

 <div id="book-search-results">
     <div class="search-noresults">

                                 <section class="normal markdown-section">

                                 <!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
 -->
 <p>As described in <a href="lda.html">our user guide for Latent Dirichlet Allocation (LDA)</a>, Hivemall enables you to apply clustering to your data based on a topic modeling technique. While LDA is one of the most popular techniques, there is another approach named <strong>Probabilistic Latent Semantic Analysis</strong> (pLSA). In fact, pLSA is the predecessor of LDA, but it has an advantage in terms of running time.</p>
 <ul>
 <li>T. Hofmann. <a href="https://dl.acm.org/citation.cfm?id=312649" target="_blank">Probabilistic Latent Semantic Indexing</a>. SIGIR 1999, pp. 50-57.</li>
 <li>T. Hofmann. <a href="https://www.iro.umontreal.ca/~nie/IFT6255/Hofmann-UAI99.pdf" target="_blank">Probabilistic Latent Semantic Analysis</a>. UAI 1999, pp. 289-296.</li>
 </ul>
 <p>In order to efficiently handle large-scale data, our pLSA implementation is based on the following incremental variant of the original pLSA algorithm:</p>
 <ul>
 <li>H. Wu, et al. <a href="https://dl.acm.org/citation.cfm?id=1454026" target="_blank">Incremental Probabilistic Latent Semantic Analysis for Automatic Question Recommendation</a>. RecSys 2008, pp. 99-106.</li>
 </ul>
 <!-- toc --><div id="toc" class="toc">

 <ul>
 <li><a href="#usage">Usage</a></li>
 <li><a href="#difference-with-lda">Difference with LDA</a></li>
 <li><a href="#setting-hyper-parameter-alpha">Setting hyper-parameter <code>alpha</code></a></li>
 </ul>

 </div><!-- tocstop -->
 <div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>This feature is supported from Hivemall v0.5-rc.1 or later.</p></div></div>
 <h1 id="usage">Usage</h1>
 <p>Basically, you can use our pLSA function in a similar way to LDA.</p>
 <p>In particular, we have two pLSA functions, <code>train_plsa()</code> and <code>plsa_predict()</code>. These functions can be used almost interchangeably with <code>train_lda()</code> and <code>lda_predict()</code>. Thus, reading <a href="lda.html">our user guide for LDA</a> should be helpful before trying pLSA.</p>
 <p>In short, for the sample <code>docs</code> table we introduced in the LDA tutorial:</p>
 <table>
 <thead>
 <tr>
 <th style="text-align:center">docid</th>
 <th style="text-align:left">doc</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:left">&quot;Fruits and vegetables are healthy.&quot;</td>
 </tr>
 <tr>
 <td style="text-align:center">2</td>
 <td style="text-align:left">&quot;I like apples, oranges, and avocados. I do not like the flu or colds.&quot;</td>
 </tr>
 <tr>
 <td style="text-align:center">...</td>
 <td style="text-align:left">...</td>
 </tr>
 </tbody>
 </table>
 <p>a pLSA model can be built as follows:</p>
 <pre><code class="lang-sql">with word_counts as (
   <span class="hljs-keyword">select</span>
     docid,
     feature(word, <span class="hljs-keyword">count</span>(word)) <span class="hljs-keyword">as</span> f
   <span class="hljs-keyword">from</span>
     docs t1
     lateral <span class="hljs-keyword">view</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word
   <span class="hljs-keyword">where</span>
     <span class="hljs-keyword">not</span> is_stopword(word)
   <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
     docid, word
 ),
 <span class="hljs-keyword">input</span> <span class="hljs-keyword">as</span> (
   <span class="hljs-keyword">select</span> docid, collect_list(f) <span class="hljs-keyword">as</span> features
   <span class="hljs-keyword">from</span> word_counts
   <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> docid
 )
 <span class="hljs-keyword">select</span>
   train_plsa(features, <span class="hljs-string">&apos;-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01&apos;</span>) <span class="hljs-keyword">as</span> (label, word, prob)
 <span class="hljs-keyword">from</span>
   <span class="hljs-keyword">input</span>
 ;
 </code></pre>
 <table>
 <thead>
 <tr>
 <th style="text-align:center">label</th>
 <th style="text-align:center">word</th>
 <th style="text-align:center">prob</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">like</td>
 <td style="text-align:center">0.28549945</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">colds</td>
 <td style="text-align:center">0.14294468</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">apples</td>
 <td style="text-align:center">0.14291435</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">avocados</td>
 <td style="text-align:center">0.1428958</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">flu</td>
 <td style="text-align:center">0.14287639</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">oranges</td>
 <td style="text-align:center">0.1428691</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">healthy</td>
 <td style="text-align:center">1.2605103E-7</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">fruits</td>
 <td style="text-align:center">4.772253E-8</td>
 </tr>
 <tr>
 <td style="text-align:center">0</td>
 <td style="text-align:center">vegetables</td>
 <td style="text-align:center">1.929087E-8</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">vegetables</td>
 <td style="text-align:center">0.32713377</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">fruits</td>
 <td style="text-align:center">0.32713372</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">healthy</td>
 <td style="text-align:center">0.3271335</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">like</td>
 <td style="text-align:center">0.006977764</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">oranges</td>
 <td style="text-align:center">0.0025642214</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">flu</td>
 <td style="text-align:center">0.002507711</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">avocados</td>
 <td style="text-align:center">0.0023572792</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">apples</td>
 <td style="text-align:center">0.002213457</td>
 </tr>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:center">colds</td>
 <td style="text-align:center">0.001978546</td>
 </tr>
 </tbody>
 </table>
 <p>And prediction can be done as:</p>
 <pre><code class="lang-sql">with test as (
   <span class="hljs-keyword">select</span>
     docid,
     word,
     <span class="hljs-keyword">count</span>(word) <span class="hljs-keyword">as</span> <span class="hljs-keyword">value</span>
   <span class="hljs-keyword">from</span>
     docs t1
     LATERAL <span class="hljs-keyword">VIEW</span> explode(tokenize(doc, <span class="hljs-literal">true</span>)) t2 <span class="hljs-keyword">as</span> word
   <span class="hljs-keyword">where</span>
     <span class="hljs-keyword">not</span> is_stopword(word)
   <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
     docid, word
 ),
 topic <span class="hljs-keyword">as</span> (
   <span class="hljs-keyword">select</span>
     t.docid,
     plsa_predict(t.word, t.<span class="hljs-keyword">value</span>, m.label, m.prob, <span class="hljs-string">&apos;-topics 2&apos;</span>) <span class="hljs-keyword">as</span> probabilities
   <span class="hljs-keyword">from</span>
     <span class="hljs-keyword">test</span> t
     <span class="hljs-keyword">JOIN</span> plsa_model m <span class="hljs-keyword">ON</span> (t.word = m.word)
   <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
     t.docid
 )
 <span class="hljs-keyword">select</span>
   docid,
   probabilities,
   probabilities[<span class="hljs-number">0</span>].label,
   m.words <span class="hljs-comment">-- topic each document should be assigned</span>
 <span class="hljs-keyword">from</span>
   topic t
   <span class="hljs-keyword">JOIN</span> (
     <span class="hljs-keyword">select</span> label, collect_list(feature(word, prob)) <span class="hljs-keyword">as</span> words
     <span class="hljs-keyword">from</span> plsa_model
     <span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> label
   ) m <span class="hljs-keyword">on</span> t.probabilities[<span class="hljs-number">0</span>].label = m.label
 ;
 </code></pre>
 <table>
 <thead>
 <tr>
 <th style="text-align:center">docid</th>
 <th style="text-align:left">probabilities</th>
 <th style="text-align:center">label</th>
 <th style="text-align:left">m.words</th>
 </tr>
 </thead>
 <tbody>
 <tr>
 <td style="text-align:center">1</td>
 <td style="text-align:left">[{&quot;label&quot;:1,&quot;probability&quot;:0.72298235},{&quot;label&quot;:0,&quot;probability&quot;:0.27701768}]</td>
 <td style="text-align:center">1</td>
 <td style="text-align:left">[&quot;vegetables:0.32713377&quot;,&quot;fruits:0.32713372&quot;,&quot;healthy:0.3271335&quot;,&quot;like:0.006977764&quot;,&quot;oranges:0.0025642214&quot;,&quot;flu:0.002507711&quot;,&quot;avocados:0.0023572792&quot;,&quot;apples:0.002213457&quot;,&quot;colds:0.001978546&quot;]</td>
 </tr>
 <tr>
 <td style="text-align:center">2</td>
 <td style="text-align:left">[{&quot;label&quot;:0,&quot;probability&quot;:0.7052526},{&quot;label&quot;:1,&quot;probability&quot;:0.2947474}]</td>
 <td style="text-align:center">0</td>
 <td style="text-align:left">[&quot;like:0.28549945&quot;,&quot;colds:0.14294468&quot;,&quot;apples:0.14291435&quot;,&quot;avocados:0.1428958&quot;,&quot;flu:0.14287639&quot;,&quot;oranges:0.1428691&quot;,&quot;healthy:1.2605103E-7&quot;,&quot;fruits:4.772253E-8&quot;,&quot;vegetables:1.929087E-8&quot;]</td>
 </tr>
 </tbody>
 </table>
 <h1 id="difference-with-lda">Difference with LDA</h1>
 <p>The main advantage of using pLSA is its efficiency. Since its mathematical formulation and optimization logic are much simpler than those of LDA, pLSA generally requires a much shorter running time.</p>
 <p>In terms of accuracy, LDA could be better than pLSA. For example, the word <code>like</code>, which appears twice in the above sample document #2, gets larger probabilities in both topic #1 and topic #2, even though one of the documents does not contain the word. By contrast, LDA results (i.e., <em>lambda</em> values) are more clearly separated as shown in <a href="lda.html">the LDA page</a>. Thus, a pLSA model is likely to be biased.</p>
 <p>For the reasons mentioned above, we recommend that you first use LDA. After that, if you encounter problems such as slow running time or undesirable clustering results, try the alternative pLSA approach.</p>
 <h1 id="setting-hyper-parameter-alpha">Setting hyper-parameter <code>alpha</code></h1>
 <p>For training pLSA, we set a hyper-parameter <code>alpha</code> in the above example:</p>
 <pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(features, <span class="hljs-string">&apos;-topics 2 -eps 0.00001 -iter 2048 -alpha 0.01&apos;</span>)
 </code></pre>
 <p>This value controls <strong>how much iterative model update is affected by the old results</strong>.</p>
 <p>From an algorithmic point of view, training pLSA (and LDA) iteratively repeats certain operations and updates the target value (i.e., probability obtained as a result of <code>train_plsa()</code>). This iterative procedure gradually makes the probabilities more accurate. What <code>alpha</code> does is to control the degree of the change of probabilities in each step.</p>
 <p>Importantly, pLSA is likely to overfit a single mini-batch. As a result, <span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>P</mi><mo>(</mo><mi>w</mi><mi mathvariant="normal">&#x2223;</mi><mi>z</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">P(w|z)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.02691em;">w</span><span class="mord mathrm">&#x2223;</span><span class="mord mathit" style="margin-right:0.04398em;">z</span><span class="mclose">)</span></span></span></span> could take particularly bad values (i.e., <span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>P</mi><mo>(</mo><mi>w</mi><mi mathvariant="normal">&#x2223;</mi><mi>z</mi><mo>)</mo><mo>=</mo><mn>0</mn></mrow><annotation encoding="application/x-tex">P(w|z) = 0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.02691em;">w</span><span class="mord mathrm">&#x2223;</span><span class="mord mathit" style="margin-right:0.04398em;">z</span><span class="mclose">)</span><span class="mrel">=</span><span class="mord mathrm">0</span></span></span></span>), and <code>train_plsa()</code> sometimes fails with an exception like:</p>
 <pre><code>Perplexity would be Infinity. Try different mini-batch size `-s`, larger `-delta` and/or larger `-alpha`.
 </code></pre><p>In that case, you need to try different hyper-parameters to avoid overfitting as the exception suggests.</p>
 <p>For instance, the <a href="http://qwone.com/~jason/20Newsgroups/" target="_blank">20 newsgroups dataset</a>, which consists of 10906 realistic documents, empirically requires the following options:</p>
 <pre><code class="lang-sql"><span class="hljs-keyword">SELECT</span> train_plsa(features, <span class="hljs-string">&apos;-topics 20 -iter 10 -s 128 -delta 0.01 -alpha 512 -eps 0.1&apos;</span>)
 </code></pre>
 <p>Clearly, <code>alpha</code> is much larger than <code>0.01</code>, which was used for the dummy data above. Keep in mind that an appropriate value of <code>alpha</code> highly depends on the number of documents and the mini-batch size.</p>
 <div id="page-footer" class="localized-footer"><hr><!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
 -->
 <p><sub><font color="gray">
 Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator.
 </font></sub></p>
 </div>


                                 </section>

     </div>
     <div class="search-results">
         <div class="has-results">

             <h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
             <ul class="search-results-list"></ul>

         </div>
         <div class="no-results">

             <h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>

         </div>
     </div>
 </div>

                         </div>
                     </div>

             </div>


     </div>

     <script>
         // Queue a callback on the `gitbook` array (created here if it does not exist yet).
         // The callback passes this page's metadata and the book configuration to
         // gitbook.page.hasChanged(). The JSON payload must stay on a single line:
         // a raw line break inside a quoted string is a JavaScript syntax error.
         var gitbook = gitbook || [];
         gitbook.push(function() {
             gitbook.page.hasChanged({"page":{"title":"Probabilistic Latent Semantic Analysis","level":"11.2","depth":1,"next":{"title":"Lat/Lon functions","level":"12.1","depth":1,"path":"geospatial/latlon.md","ref":"geospatial/latlon.md","articles":[]},"previous":{"title":"Latent Dirichlet Allocation","level":"11.1","depth":1,"path":"clustering/lda.md","ref":"clustering/lda.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/tree/master/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"https://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"clustering/plsa.md","mtime":"2021-04-22T11:42:38.122Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2021-04-22T11:56:59.644Z"},"basePath":"..","book":{"language":""}});
         });
     </script>
 </div>


     <script src="../gitbook/gitbook.js"></script>
     <script src="../gitbook/theme.js"></script>


         <script src="../gitbook/gitbook-plugin-edit-link/plugin.js"></script>


         <script src="../gitbook/gitbook-plugin-github/plugin.js"></script>


         <script src="../gitbook/gitbook-plugin-splitter/splitter.js"></script>


         <script src="../gitbook/gitbook-plugin-etoc/plugin.js"></script>


         <script src="../gitbook/gitbook-plugin-toggle-chapters/toggle.js"></script>


         <script src="../gitbook/gitbook-plugin-anchorjs/anchor.min.js"></script>


         <script src="../gitbook/gitbook-plugin-anchorjs/anchor-style.js"></script>


         <script src="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.js"></script>


         <script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>


         <script src="../gitbook/gitbook-plugin-search/search.js"></script>


         <script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>


         <script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>


         <script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>


         <script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>


         <script src="../gitbook/gitbook-plugin-theme-api/theme-api.js"></script>


     </body>
 </html>