blob: 88fa901dffb86a616a052fcd51e52a4ce742e222 [file] [log] [blame]
<!DOCTYPE HTML>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Text Tokenizer · Hivemall User Manual</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-splitter/splitter.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-etoc/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-callouts/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-toggle-chapters/toggle.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-codeblock-filename/block.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-multipart/multipart.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-emphasize/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-theme-api/theme-api.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="approx.html" />
<link rel="prev" href="topk.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li>
<a href="https://hivemall.incubator.apache.org/" target="_blank" class="custom-link"><i class="fa fa-home"></i> Home</a>
</li>
<li class="divider"></li>
<li class="header">TABLE OF CONTENTS</li>
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
<b>1.1.</b>
Introduction
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../getting_started/">
<a href="../getting_started/">
<b>1.2.</b>
Getting Started
</a>
<ul class="articles">
<li class="chapter " data-level="1.2.1" data-path="../getting_started/installation.html">
<a href="../getting_started/installation.html">
<b>1.2.1.</b>
Installation
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="../getting_started/permanent-functions.html">
<a href="../getting_started/permanent-functions.html">
<b>1.2.2.</b>
Install as permanent functions
</a>
</li>
<li class="chapter " data-level="1.2.3" data-path="../getting_started/input-format.html">
<a href="../getting_started/input-format.html">
<b>1.2.3.</b>
Input Format
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="funcs.html">
<a href="funcs.html">
<b>1.3.</b>
List of Functions
</a>
</li>
<li class="chapter " data-level="1.4" data-path="../tips/">
<a href="../tips/">
<b>1.4.</b>
Tips for Effective Hivemall
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../tips/addbias.html">
<a href="../tips/addbias.html">
<b>1.4.1.</b>
Explicit add_bias() for better prediction
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../tips/rand_amplify.html">
<a href="../tips/rand_amplify.html">
<b>1.4.2.</b>
Use rand_amplify() to better prediction results
</a>
</li>
<li class="chapter " data-level="1.4.3" data-path="../tips/rt_prediction.html">
<a href="../tips/rt_prediction.html">
<b>1.4.3.</b>
Real-time prediction on RDBMS
</a>
</li>
<li class="chapter " data-level="1.4.4" data-path="../tips/ensemble_learning.html">
<a href="../tips/ensemble_learning.html">
<b>1.4.4.</b>
Ensemble learning for stable prediction
</a>
</li>
<li class="chapter " data-level="1.4.5" data-path="../tips/mixserver.html">
<a href="../tips/mixserver.html">
<b>1.4.5.</b>
Mixing models for a better prediction convergence (MIX server)
</a>
</li>
<li class="chapter " data-level="1.4.6" data-path="../tips/emr.html">
<a href="../tips/emr.html">
<b>1.4.6.</b>
Run Hivemall on Amazon Elastic MapReduce
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../tips/general_tips.html">
<a href="../tips/general_tips.html">
<b>1.5.</b>
General Hive/Hadoop Tips
</a>
<ul class="articles">
<li class="chapter " data-level="1.5.1" data-path="../tips/rowid.html">
<a href="../tips/rowid.html">
<b>1.5.1.</b>
Adding rowid for each row
</a>
</li>
<li class="chapter " data-level="1.5.2" data-path="../tips/hadoop_tuning.html">
<a href="../tips/hadoop_tuning.html">
<b>1.5.2.</b>
Hadoop tuning for Hivemall
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.6" data-path="../troubleshooting/">
<a href="../troubleshooting/">
<b>1.6.</b>
Troubleshooting
</a>
<ul class="articles">
<li class="chapter " data-level="1.6.1" data-path="../troubleshooting/oom.html">
<a href="../troubleshooting/oom.html">
<b>1.6.1.</b>
OutOfMemoryError in training
</a>
</li>
<li class="chapter " data-level="1.6.2" data-path="../troubleshooting/mapjoin_task_error.html">
<a href="../troubleshooting/mapjoin_task_error.html">
<b>1.6.2.</b>
SemanticException generate map join task error: Cannot serialize object
</a>
</li>
<li class="chapter " data-level="1.6.3" data-path="../troubleshooting/asterisk.html">
<a href="../troubleshooting/asterisk.html">
<b>1.6.3.</b>
Asterisk argument for UDTF does not work
</a>
</li>
<li class="chapter " data-level="1.6.4" data-path="../troubleshooting/num_mappers.html">
<a href="../troubleshooting/num_mappers.html">
<b>1.6.4.</b>
The number of mappers is less than input splits in Hadoop 2.x
</a>
</li>
<li class="chapter " data-level="1.6.5" data-path="../troubleshooting/mapjoin_classcastex.html">
<a href="../troubleshooting/mapjoin_classcastex.html">
<b>1.6.5.</b>
Map-side join causes ClassCastException on Tez
</a>
</li>
</ul>
</li>
<li class="header">Part II - Generic Features</li>
<li class="chapter " data-level="2.1" data-path="generic_funcs.html">
<a href="generic_funcs.html">
<b>2.1.</b>
List of Generic Hivemall Functions
</a>
</li>
<li class="chapter " data-level="2.2" data-path="topk.html">
<a href="topk.html">
<b>2.2.</b>
Efficient Top-K Query Processing
</a>
</li>
<li class="chapter active" data-level="2.3" data-path="tokenizer.html">
<a href="tokenizer.html">
<b>2.3.</b>
Text Tokenizer
</a>
</li>
<li class="chapter " data-level="2.4" data-path="approx.html">
<a href="approx.html">
<b>2.4.</b>
Approximate Aggregate Functions
</a>
</li>
<li class="header">Part III - Feature Engineering</li>
<li class="chapter " data-level="3.1" data-path="../ft_engineering/scaling.html">
<a href="../ft_engineering/scaling.html">
<b>3.1.</b>
Feature Scaling
</a>
</li>
<li class="chapter " data-level="3.2" data-path="../ft_engineering/hashing.html">
<a href="../ft_engineering/hashing.html">
<b>3.2.</b>
Feature Hashing
</a>
</li>
<li class="chapter " data-level="3.3" data-path="../ft_engineering/selection.html">
<a href="../ft_engineering/selection.html">
<b>3.3.</b>
Feature Selection
</a>
</li>
<li class="chapter " data-level="3.4" data-path="../ft_engineering/binning.html">
<a href="../ft_engineering/binning.html">
<b>3.4.</b>
Feature Binning
</a>
</li>
<li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html">
<a href="../ft_engineering/pairing.html">
<b>3.5.</b>
Feature Pairing
</a>
<ul class="articles">
<li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html">
<a href="../ft_engineering/polynomial.html">
<b>3.5.1.</b>
Polynomial features
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html">
<a href="../ft_engineering/ft_trans.html">
<b>3.6.</b>
Feature Transformation
</a>
<ul class="articles">
<li class="chapter " data-level="3.6.1" data-path="../ft_engineering/vectorization.html">
<a href="../ft_engineering/vectorization.html">
<b>3.6.1.</b>
Feature vectorization
</a>
</li>
<li class="chapter " data-level="3.6.2" data-path="../ft_engineering/quantify.html">
<a href="../ft_engineering/quantify.html">
<b>3.6.2.</b>
Quantify non-number features
</a>
</li>
<li class="chapter " data-level="3.6.3" data-path="../ft_engineering/binarize.html">
<a href="../ft_engineering/binarize.html">
<b>3.6.3.</b>
Binarize label
</a>
</li>
<li class="chapter " data-level="3.6.4" data-path="../ft_engineering/onehot.html">
<a href="../ft_engineering/onehot.html">
<b>3.6.4.</b>
One-hot encoding
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="3.7" data-path="../ft_engineering/term_vector.html">
<a href="../ft_engineering/term_vector.html">
<b>3.7.</b>
Term Vector Model
</a>
<ul class="articles">
<li class="chapter " data-level="3.7.1" data-path="../ft_engineering/tfidf.html">
<a href="../ft_engineering/tfidf.html">
<b>3.7.1.</b>
TF-IDF Term Weighting
</a>
</li>
<li class="chapter " data-level="3.7.2" data-path="../ft_engineering/bm25.html">
<a href="../ft_engineering/bm25.html">
<b>3.7.2.</b>
Okapi BM25 Term Weighting
</a>
</li>
</ul>
</li>
<li class="header">Part IV - Evaluation</li>
<li class="chapter " data-level="4.1" data-path="../eval/binary_classification_measures.html">
<a href="../eval/binary_classification_measures.html">
<b>4.1.</b>
Binary Classification Metrics
</a>
<ul class="articles">
<li class="chapter " data-level="4.1.1" data-path="../eval/auc.html">
<a href="../eval/auc.html">
<b>4.1.1.</b>
Area under the ROC curve
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="4.2" data-path="../eval/multilabel_classification_measures.html">
<a href="../eval/multilabel_classification_measures.html">
<b>4.2.</b>
Multi-label Classification Metrics
</a>
</li>
<li class="chapter " data-level="4.3" data-path="../eval/regression.html">
<a href="../eval/regression.html">
<b>4.3.</b>
Regression Metrics
</a>
</li>
<li class="chapter " data-level="4.4" data-path="../eval/rank.html">
<a href="../eval/rank.html">
<b>4.4.</b>
Ranking Measures
</a>
</li>
<li class="chapter " data-level="4.5" data-path="../eval/datagen.html">
<a href="../eval/datagen.html">
<b>4.5.</b>
Data Generation
</a>
<ul class="articles">
<li class="chapter " data-level="4.5.1" data-path="../eval/lr_datagen.html">
<a href="../eval/lr_datagen.html">
<b>4.5.1.</b>
Logistic Regression data generation
</a>
</li>
</ul>
</li>
<li class="header">Part V - Supervised Learning</li>
<li class="chapter " data-level="5.1" data-path="../supervised_learning/prediction.html">
<a href="../supervised_learning/prediction.html">
<b>5.1.</b>
How Prediction Works
</a>
</li>
<li class="chapter " data-level="5.2" data-path="../supervised_learning/tutorial.html">
<a href="../supervised_learning/tutorial.html">
<b>5.2.</b>
Step-by-Step Tutorial on Supervised Learning
</a>
</li>
<li class="header">Part VI - Binary Classification</li>
<li class="chapter " data-level="6.1" data-path="../binaryclass/general.html">
<a href="../binaryclass/general.html">
<b>6.1.</b>
Binary Classification
</a>
</li>
<li class="chapter " data-level="6.2" data-path="../binaryclass/a9a.html">
<a href="../binaryclass/a9a.html">
<b>6.2.</b>
a9a Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.2.1" data-path="../binaryclass/a9a_dataset.html">
<a href="../binaryclass/a9a_dataset.html">
<b>6.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.2.2" data-path="../binaryclass/a9a_generic.html">
<a href="../binaryclass/a9a_generic.html">
<b>6.2.2.</b>
General Binary Classifier
</a>
</li>
<li class="chapter " data-level="6.2.3" data-path="../binaryclass/a9a_lr.html">
<a href="../binaryclass/a9a_lr.html">
<b>6.2.3.</b>
Logistic Regression
</a>
</li>
<li class="chapter " data-level="6.2.4" data-path="../binaryclass/a9a_minibatch.html">
<a href="../binaryclass/a9a_minibatch.html">
<b>6.2.4.</b>
Mini-batch Gradient Descent
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.3" data-path="../binaryclass/news20.html">
<a href="../binaryclass/news20.html">
<b>6.3.</b>
News20 Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.3.1" data-path="../binaryclass/news20_dataset.html">
<a href="../binaryclass/news20_dataset.html">
<b>6.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.3.2" data-path="../binaryclass/news20_pa.html">
<a href="../binaryclass/news20_pa.html">
<b>6.3.2.</b>
Perceptron, Passive Aggressive
</a>
</li>
<li class="chapter " data-level="6.3.3" data-path="../binaryclass/news20_scw.html">
<a href="../binaryclass/news20_scw.html">
<b>6.3.3.</b>
CW, AROW, SCW
</a>
</li>
<li class="chapter " data-level="6.3.4" data-path="../binaryclass/news20_generic.html">
<a href="../binaryclass/news20_generic.html">
<b>6.3.4.</b>
General Binary Classifier
</a>
</li>
<li class="chapter " data-level="6.3.5" data-path="../binaryclass/news20_generic_bagging.html">
<a href="../binaryclass/news20_generic_bagging.html">
<b>6.3.5.</b>
Bagging classifiers
</a>
</li>
<li class="chapter " data-level="6.3.6" data-path="../binaryclass/news20_adagrad.html">
<a href="../binaryclass/news20_adagrad.html">
<b>6.3.6.</b>
AdaGradRDA, AdaGrad, AdaDelta
</a>
</li>
<li class="chapter " data-level="6.3.7" data-path="../binaryclass/news20_rf.html">
<a href="../binaryclass/news20_rf.html">
<b>6.3.7.</b>
Random Forest
</a>
</li>
<li class="chapter " data-level="6.3.8" data-path="../binaryclass/news20b_xgboost.html">
<a href="../binaryclass/news20b_xgboost.html">
<b>6.3.8.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.4" data-path="../binaryclass/kdd2010a.html">
<a href="../binaryclass/kdd2010a.html">
<b>6.4.</b>
KDD2010a Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.4.1" data-path="../binaryclass/kdd2010a_dataset.html">
<a href="../binaryclass/kdd2010a_dataset.html">
<b>6.4.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.4.2" data-path="../binaryclass/kdd2010a_scw.html">
<a href="../binaryclass/kdd2010a_scw.html">
<b>6.4.2.</b>
PA, CW, AROW, SCW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.5" data-path="../binaryclass/kdd2010b.html">
<a href="../binaryclass/kdd2010b.html">
<b>6.5.</b>
KDD2010b Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.5.1" data-path="../binaryclass/kdd2010b_dataset.html">
<a href="../binaryclass/kdd2010b_dataset.html">
<b>6.5.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.5.2" data-path="../binaryclass/kdd2010b_arow.html">
<a href="../binaryclass/kdd2010b_arow.html">
<b>6.5.2.</b>
AROW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.6" data-path="../binaryclass/webspam.html">
<a href="../binaryclass/webspam.html">
<b>6.6.</b>
Webspam Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.6.1" data-path="../binaryclass/webspam_dataset.html">
<a href="../binaryclass/webspam_dataset.html">
<b>6.6.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.6.2" data-path="../binaryclass/webspam_scw.html">
<a href="../binaryclass/webspam_scw.html">
<b>6.6.2.</b>
PA1, AROW, SCW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.7" data-path="../binaryclass/titanic_rf.html">
<a href="../binaryclass/titanic_rf.html">
<b>6.7.</b>
Kaggle Titanic Tutorial
</a>
</li>
<li class="chapter " data-level="6.8" data-path="../binaryclass/criteo.html">
<a href="../binaryclass/criteo.html">
<b>6.8.</b>
Criteo Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.8.1" data-path="../binaryclass/criteo_dataset.html">
<a href="../binaryclass/criteo_dataset.html">
<b>6.8.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.8.2" data-path="../binaryclass/criteo_ffm.html">
<a href="../binaryclass/criteo_ffm.html">
<b>6.8.2.</b>
Field-Aware Factorization Machines
</a>
</li>
</ul>
</li>
<li class="header">Part VII - Multiclass Classification</li>
<li class="chapter " data-level="7.1" data-path="../multiclass/news20.html">
<a href="../multiclass/news20.html">
<b>7.1.</b>
News20 Multiclass Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="7.1.1" data-path="../multiclass/news20_dataset.html">
<a href="../multiclass/news20_dataset.html">
<b>7.1.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="7.1.2" data-path="../multiclass/news20_one-vs-the-rest_dataset.html">
<a href="../multiclass/news20_one-vs-the-rest_dataset.html">
<b>7.1.2.</b>
Data Preparation for one-vs-the-rest classifiers
</a>
</li>
<li class="chapter " data-level="7.1.3" data-path="../multiclass/news20_pa.html">
<a href="../multiclass/news20_pa.html">
<b>7.1.3.</b>
PA
</a>
</li>
<li class="chapter " data-level="7.1.4" data-path="../multiclass/news20_scw.html">
<a href="../multiclass/news20_scw.html">
<b>7.1.4.</b>
CW, AROW, SCW
</a>
</li>
<li class="chapter " data-level="7.1.5" data-path="../multiclass/news20_xgboost.html">
<a href="../multiclass/news20_xgboost.html">
<b>7.1.5.</b>
XGBoost
</a>
</li>
<li class="chapter " data-level="7.1.6" data-path="../multiclass/news20_ensemble.html">
<a href="../multiclass/news20_ensemble.html">
<b>7.1.6.</b>
Ensemble learning
</a>
</li>
<li class="chapter " data-level="7.1.7" data-path="../multiclass/news20_one-vs-the-rest.html">
<a href="../multiclass/news20_one-vs-the-rest.html">
<b>7.1.7.</b>
one-vs-the-rest Classifier
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="7.2" data-path="../multiclass/iris.html">
<a href="../multiclass/iris.html">
<b>7.2.</b>
Iris Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="7.2.1" data-path="../multiclass/iris_dataset.html">
<a href="../multiclass/iris_dataset.html">
<b>7.2.1.</b>
Data preparation
</a>
</li>
<li class="chapter " data-level="7.2.2" data-path="../multiclass/iris_scw.html">
<a href="../multiclass/iris_scw.html">
<b>7.2.2.</b>
SCW
</a>
</li>
<li class="chapter " data-level="7.2.3" data-path="../multiclass/iris_randomforest.html">
<a href="../multiclass/iris_randomforest.html">
<b>7.2.3.</b>
Random Forest
</a>
</li>
<li class="chapter " data-level="7.2.4" data-path="../multiclass/iris_xgboost.html">
<a href="../multiclass/iris_xgboost.html">
<b>7.2.4.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="header">Part VIII - Regression</li>
<li class="chapter " data-level="8.1" data-path="../regression/general.html">
<a href="../regression/general.html">
<b>8.1.</b>
Regression
</a>
</li>
<li class="chapter " data-level="8.2" data-path="../regression/e2006.html">
<a href="../regression/e2006.html">
<b>8.2.</b>
E2006-tfidf Regression Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html">
<a href="../regression/e2006_dataset.html">
<b>8.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="8.2.2" data-path="../regression/e2006_generic.html">
<a href="../regression/e2006_generic.html">
<b>8.2.2.</b>
General Regressor
</a>
</li>
<li class="chapter " data-level="8.2.3" data-path="../regression/e2006_arow.html">
<a href="../regression/e2006_arow.html">
<b>8.2.3.</b>
Passive Aggressive, AROW
</a>
</li>
<li class="chapter " data-level="8.2.4" data-path="../regression/e2006_xgboost.html">
<a href="../regression/e2006_xgboost.html">
<b>8.2.4.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html">
<a href="../regression/kddcup12tr2.html">
<b>8.3.</b>
KDDCup 2012 Track 2 CTR Prediction Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html">
<a href="../regression/kddcup12tr2_dataset.html">
<b>8.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html">
<a href="../regression/kddcup12tr2_lr.html">
<b>8.3.2.</b>
Logistic Regression, Passive Aggressive
</a>
</li>
<li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html">
<a href="../regression/kddcup12tr2_lr_amplify.html">
<b>8.3.3.</b>
Logistic Regression with amplifier
</a>
</li>
<li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html">
<a href="../regression/kddcup12tr2_adagrad.html">
<b>8.3.4.</b>
AdaGrad, AdaDelta
</a>
</li>
</ul>
</li>
<li class="header">Part IX - Recommendation</li>
<li class="chapter " data-level="9.1" data-path="../recommend/cf.html">
<a href="../recommend/cf.html">
<b>9.1.</b>
Collaborative Filtering
</a>
<ul class="articles">
<li class="chapter " data-level="9.1.1" data-path="../recommend/item_based_cf.html">
<a href="../recommend/item_based_cf.html">
<b>9.1.1.</b>
Item-based Collaborative Filtering
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="9.2" data-path="../recommend/news20.html">
<a href="../recommend/news20.html">
<b>9.2.</b>
News20 Related Article Recommendation Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="9.2.1" data-path="../multiclass/news20_dataset.html">
<a href="../multiclass/news20_dataset.html">
<b>9.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="9.2.2" data-path="../recommend/news20_jaccard.html">
<a href="../recommend/news20_jaccard.html">
<b>9.2.2.</b>
LSH/MinHash and Jaccard Similarity
</a>
</li>
<li class="chapter " data-level="9.2.3" data-path="../recommend/news20_knn.html">
<a href="../recommend/news20_knn.html">
<b>9.2.3.</b>
LSH/MinHash and Brute-force Search
</a>
</li>
<li class="chapter " data-level="9.2.4" data-path="../recommend/news20_bbit_minhash.html">
<a href="../recommend/news20_bbit_minhash.html">
<b>9.2.4.</b>
kNN search using b-Bits MinHash
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="9.3" data-path="../recommend/movielens.html">
<a href="../recommend/movielens.html">
<b>9.3.</b>
MovieLens Movie Recommendation Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="9.3.1" data-path="../recommend/movielens_dataset.html">
<a href="../recommend/movielens_dataset.html">
<b>9.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="9.3.2" data-path="../recommend/movielens_cf.html">
<a href="../recommend/movielens_cf.html">
<b>9.3.2.</b>
Item-based Collaborative Filtering
</a>
</li>
<li class="chapter " data-level="9.3.3" data-path="../recommend/movielens_mf.html">
<a href="../recommend/movielens_mf.html">
<b>9.3.3.</b>
Matrix Factorization
</a>
</li>
<li class="chapter " data-level="9.3.4" data-path="../recommend/movielens_fm.html">
<a href="../recommend/movielens_fm.html">
<b>9.3.4.</b>
Factorization Machine
</a>
</li>
<li class="chapter " data-level="9.3.5" data-path="../recommend/movielens_slim.html">
<a href="../recommend/movielens_slim.html">
<b>9.3.5.</b>
SLIM for fast top-k Recommendation
</a>
</li>
<li class="chapter " data-level="9.3.6" data-path="../recommend/movielens_cv.html">
<a href="../recommend/movielens_cv.html">
<b>9.3.6.</b>
10-fold Cross Validation (Matrix Factorization)
</a>
</li>
</ul>
</li>
<li class="header">Part X - Anomaly Detection</li>
<li class="chapter " data-level="10.1" data-path="../anomaly/lof.html">
<a href="../anomaly/lof.html">
<b>10.1.</b>
Outlier Detection using Local Outlier Factor (LOF)
</a>
</li>
<li class="chapter " data-level="10.2" data-path="../anomaly/sst.html">
<a href="../anomaly/sst.html">
<b>10.2.</b>
Change-Point Detection using Singular Spectrum Transformation (SST)
</a>
</li>
<li class="chapter " data-level="10.3" data-path="../anomaly/changefinder.html">
<a href="../anomaly/changefinder.html">
<b>10.3.</b>
ChangeFinder: Detecting Outlier and Change-Point Simultaneously
</a>
</li>
<li class="header">Part XI - Clustering</li>
<li class="chapter " data-level="11.1" data-path="../clustering/lda.html">
<a href="../clustering/lda.html">
<b>11.1.</b>
Latent Dirichlet Allocation
</a>
</li>
<li class="chapter " data-level="11.2" data-path="../clustering/plsa.html">
<a href="../clustering/plsa.html">
<b>11.2.</b>
Probabilistic Latent Semantic Analysis
</a>
</li>
<li class="header">Part XII - GeoSpatial Functions</li>
<li class="chapter " data-level="12.1" data-path="../geospatial/latlon.html">
<a href="../geospatial/latlon.html">
<b>12.1.</b>
Lat/Lon functions
</a>
</li>
<li class="header">Part XIII - Hivemall on SparkSQL</li>
<li class="chapter " data-level="13.1" data-path="../spark/getting_started/README.md">
<span>
<b>13.1.</b>
Getting Started
</span>
<ul class="articles">
<li class="chapter " data-level="13.1.1" data-path="../spark/getting_started/installation.html">
<a href="../spark/getting_started/installation.html">
<b>13.1.1.</b>
Installation
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="13.2" data-path="../spark/binaryclass/">
<a href="../spark/binaryclass/">
<b>13.2.</b>
Binary Classification
</a>
<ul class="articles">
<li class="chapter " data-level="13.2.1" data-path="../spark/binaryclass/a9a_sql.html">
<a href="../spark/binaryclass/a9a_sql.html">
<b>13.2.1.</b>
a9a Tutorial for SQL
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="13.3" data-path="../spark/binaryclass/">
<a href="../spark/binaryclass/">
<b>13.3.</b>
Regression
</a>
<ul class="articles">
<li class="chapter " data-level="13.3.1" data-path="../spark/regression/e2006_sql.html">
<a href="../spark/regression/e2006_sql.html">
<b>13.3.1.</b>
E2006-tfidf Regression Tutorial for SQL
</a>
</li>
</ul>
</li>
<li class="header">Part XIV - Hivemall on Docker</li>
<li class="chapter " data-level="14.1" data-path="../docker/getting_started.html">
<a href="../docker/getting_started.html">
<b>14.1.</b>
Getting Started
</a>
</li>
<li class="header">Part XV - External References</li>
<li class="chapter " data-level="15.1" >
<a target="_blank" href="https://github.com/daijyc/hivemall/wiki/PigHome">
<b>15.1.</b>
Hivemall on Apache Pig
</a>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="_blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >Text Tokenizer</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- toc --><div id="toc" class="toc">
<ul>
<li><a href="#tokenizer-for-english-texts">Tokenizer for English Texts</a></li>
<li><a href="#tokenizer-for-non-english-texts">Tokenizer for Non-English Texts</a><ul>
<li><a href="#japanese-tokenizer">Japanese Tokenizer</a><ul>
<li><a href="#custom-dictionary">Custom dictionary</a></li>
<li><a href="#part-of-speech">Part-of-speech</a></li>
</ul>
</li>
<li><a href="#chinese-tokenizer">Chinese Tokenizer</a></li>
<li><a href="#korean-tokenizer">Korean Tokenizer</a><ul>
<li><a href="#custom-dictionary-1">Custom dictionary</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div><!-- tocstop -->
<h1 id="tokenizer-for-english-texts">Tokenizer for English Texts</h1>
<p>Hivemall provides simple English text tokenizer UDF that has following syntax:</p>
<pre><code class="lang-sql">tokenize(text input, optional boolean toLowerCase = false)
</code></pre>
<h1 id="tokenizer-for-non-english-texts">Tokenizer for Non-English Texts</h1>
<h2 id="japanese-tokenizer">Japanese Tokenizer</h2>
<p>Japanese text tokenizer UDF uses <a href="https://github.com/atilika/kuromoji" target="_blank">Kuromoji</a>. </p>
<p>The signature of the UDF is as follows:</p>
<pre><code class="lang-sql">-- uses Kuromoji default dictionary by the default
tokenize_ja(text input, optional const text mode = &quot;normal&quot;, optional const array&lt;string&gt; stopWords, const array&lt;string&gt; stopTags, const array&lt;string&gt; userDict)
-- tokenize_ja_neologd uses mecab-ipa-neologd for its dictionary.
tokenize_ja_neologd(text input, optional const text mode = &quot;normal&quot;, optional const array&lt;string&gt; stopWords, const array&lt;string&gt; stopTags, const array&lt;string&gt; userDict)
</code></pre>
<div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p><code>tokenize_ja_neologd</code> returns tokenized strings in an array by using the NEologd dictionary. <a href="https://github.com/neologd/mecab-ipadic-neologd" target="_blank">mecab-ipadic-NEologd</a> is a customized system dictionary for MeCab including new vocabularies extracted from many resources on the Web. </p></div></div>
<p>See differences between with and without Neologd as follows:</p>
<pre><code class="lang-sql">select tokenize_ja(&quot;&#x5F7C;&#x5973;&#x306F;&#x30DA;&#x30F3;&#x30D1;&#x30A4;&#x30CA;&#x30C3;&#x30DD;&#x30FC;&#x30A2;&#x30C3;&#x30DD;&#x30FC;&#x30DA;&#x30F3;&#x3068;&#x604B;&#x30C0;&#x30F3;&#x30B9;&#x3092;&#x8E0A;&#x3063;&#x305F;&#x3002;&quot;);
&gt;[&quot;&#x5F7C;&#x5973;&quot;,&quot;&#x30DA;&#x30F3;&#x30D1;&#x30A4;&#x30CA;&#x30C3;&#x30DD;&#x30FC;&#x30A2;&#x30C3;&#x30DD;&#x30FC;&#x30DA;&#x30F3;&quot;,&quot;&#x604B;&quot;,&quot;&#x30C0;&#x30F3;&#x30B9;&quot;,&quot;&#x8E0A;&#x308B;&quot;]
select tokenize_ja_neologd(&quot;&#x5F7C;&#x5973;&#x306F;&#x30DA;&#x30F3;&#x30D1;&#x30A4;&#x30CA;&#x30C3;&#x30DD;&#x30FC;&#x30A2;&#x30C3;&#x30DD;&#x30FC;&#x30DA;&#x30F3;&#x3068;&#x604B;&#x30C0;&#x30F3;&#x30B9;&#x3092;&#x8E0A;&#x3063;&#x305F;&#x3002;&quot;);
&gt; [&quot;&#x5F7C;&#x5973;&quot;,&quot;&#x30DA;&#x30F3;&#x30D1;&#x30A4;&#x30CA;&#x30C3;&#x30DD;&#x30FC;&#x30A2;&#x30C3;&#x30DD;&#x30FC;&#x30DA;&#x30F3;&quot;,&quot;&#x604B;&#x30C0;&#x30F3;&#x30B9;&quot;,&quot;&#x8E0A;&#x308B;&quot;]
</code></pre>
<p>You can print versions for Kuromoji UDFs as follows:</p>
<pre><code class="lang-sql">select tokenize_ja();
&gt; [&quot;8.8.2&quot;]
select tokenize_ja_neologd();
&gt; [&quot;8.8.2-20200910.2&quot;]
</code></pre>
<p>Its basic usage is as follows:</p>
<pre><code class="lang-sql"><span class="hljs-keyword">select</span> tokenize_ja(<span class="hljs-string">&quot;kuromoji&#x3092;&#x4F7F;&#x3063;&#x305F;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&#x306E;&#x30C6;&#x30B9;&#x30C8;&#x3067;&#x3059;&#x3002;&#x7B2C;&#x4E8C;&#x5F15;&#x6570;&#x306B;&#x306F;normal/search/extended&#x3092;&#x6307;&#x5B9A;&#x3067;&#x304D;&#x307E;&#x3059;&#x3002;&#x30C7;&#x30D5;&#x30A9;&#x30EB;&#x30C8;&#x3067;&#x306F;normal&#x30E2;&#x30FC;&#x30C9;&#x3067;&#x3059;&#x3002;&quot;</span>);
</code></pre>
<blockquote>
<p>[&quot;kuromoji&quot;,&quot;&#x4F7F;&#x3046;&quot;,&quot;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&quot;,&quot;&#x30C6;&#x30B9;&#x30C8;&quot;,&quot;&#x7B2C;&quot;,&quot;&#x4E8C;&quot;,&quot;&#x5F15;&#x6570;&quot;,&quot;normal&quot;,&quot;search&quot;,&quot;extended&quot;,&quot;&#x6307;&#x5B9A;&quot;,&quot;&#x30C7;&#x30D5;&#x30A9;&#x30EB;&#x30C8;&quot;,&quot;normal&quot;,&quot;&#x30E2;&#x30FC;&#x30C9;&quot;]</p>
</blockquote>
<p>In addition, the third and fourth arguments respectively allow you to use your own list of stop words and stop tags. For example, the following query simply ignores &quot;kuromoji&quot; (as a stop word) and the noun word &quot;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&quot; (as a stop tag):</p>
<pre><code class="lang-sql"><span class="hljs-keyword">select</span> tokenize_ja(<span class="hljs-string">&quot;kuromoji&#x3092;&#x4F7F;&#x3063;&#x305F;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&#x306E;&#x30C6;&#x30B9;&#x30C8;&#x3067;&#x3059;&#x3002;&quot;</span>, <span class="hljs-string">&quot;normal&quot;</span>, <span class="hljs-built_in">array</span>(<span class="hljs-string">&quot;kuromoji&quot;</span>), <span class="hljs-built_in">array</span>(<span class="hljs-string">&quot;&#x540D;&#x8A5E;-&#x4E00;&#x822C;&quot;</span>));
</code></pre>
<blockquote>
<p>[&quot;&#x3092;&quot;,&quot;&#x4F7F;&#x3046;&quot;,&quot;&#x305F;&quot;,&quot;&#x306E;&quot;,&quot;&#x30C6;&#x30B9;&#x30C8;&quot;,&quot;&#x3067;&#x3059;&quot;]</p>
</blockquote>
<pre><code class="lang-sql"><span class="hljs-keyword">select</span> tokenize_ja(<span class="hljs-string">&quot;kuromoji&#x3092;&#x4F7F;&#x3063;&#x305F;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&#x306E;&#x30C6;&#x30B9;&#x30C8;&#x3067;&#x3059;&#x3002;&quot;</span>, <span class="hljs-string">&quot;normal&quot;</span>, <span class="hljs-built_in">array</span>(<span class="hljs-string">&quot;kuromoji&quot;</span>), stoptags_exclude(<span class="hljs-built_in">array</span>(<span class="hljs-string">&quot;&#x540D;&#x8A5E;&quot;</span>)));
</code></pre>
<blockquote>
<p>[&quot;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&quot;,&quot;&#x30C6;&#x30B9;&#x30C8;&quot;]</p>
</blockquote>
<p><code>stoptags_exclude(array&lt;string&gt; tags, [, const string lang=&apos;ja&apos;])</code> is a useful UDF for getting <a href="https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt" target="_blank">stoptags</a> excluding given part-of-speech tags as seen below:</p>
<pre><code class="lang-sql"><span class="hljs-keyword">select</span> stoptags_exclude(<span class="hljs-built_in">array</span>(<span class="hljs-string">&quot;&#x540D;&#x8A5E;-&#x56FA;&#x6709;&#x540D;&#x8A5E;&quot;</span>));
</code></pre>
<blockquote>
<p>[&quot;&#x305D;&#x306E;&#x4ED6;&quot;,&quot;&#x305D;&#x306E;&#x4ED6;-&#x9593;&#x6295;&quot;,&quot;&#x30D5;&#x30A3;&#x30E9;&#x30FC;&quot;,&quot;&#x526F;&#x8A5E;&quot;,&quot;&#x526F;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x526F;&#x8A5E;-&#x52A9;&#x8A5E;&#x985E;&#x63A5;&#x7D9A;&quot;,&quot;&#x52A9;&#x52D5;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x4E26;&#x7ACB;&#x52A9;&#x8A5E;&quot;
,&quot;&#x52A9;&#x8A5E;-&#x4FC2;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x526F;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x526F;&#x52A9;&#x8A5E;&#xFF0F;&#x4E26;&#x7ACB;&#x52A9;&#x8A5E;&#xFF0F;&#x7D42;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x526F;&#x8A5E;&#x5316;&quot;,&quot;&#x52A9;&#x8A5E;-&#x63A5;&#x7D9A;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x683C;&#x52A9;&#x8A5E;
&quot;,&quot;&#x52A9;&#x8A5E;-&#x683C;&#x52A9;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x52A9;&#x8A5E;-&#x683C;&#x52A9;&#x8A5E;-&#x5F15;&#x7528;&quot;,&quot;&#x52A9;&#x8A5E;-&#x683C;&#x52A9;&#x8A5E;-&#x9023;&#x8A9E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x7279;&#x6B8A;&quot;,&quot;&#x52A9;&#x8A5E;-&#x7D42;&#x52A9;&#x8A5E;&quot;,&quot;&#x52A9;&#x8A5E;-&#x9023;&#x4F53;&#x5316;&quot;,&quot;&#x52A9;
&#x8A5E;-&#x9593;&#x6295;&#x52A9;&#x8A5E;&quot;,&quot;&#x52D5;&#x8A5E;&quot;,&quot;&#x52D5;&#x8A5E;-&#x63A5;&#x5C3E;&quot;,&quot;&#x52D5;&#x8A5E;-&#x81EA;&#x7ACB;&quot;,&quot;&#x52D5;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;&quot;,&quot;&#x540D;&#x8A5E;&quot;,&quot;&#x540D;&#x8A5E;-&#x30B5;&#x5909;&#x63A5;&#x7D9A;&quot;,&quot;&#x540D;&#x8A5E;-&#x30CA;&#x30A4;&#x5F62;&#x5BB9;&#x8A5E;&#x8A9E;&#x5E79;&quot;,
&quot;&#x540D;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x540D;&#x8A5E;-&#x4EE3;&#x540D;&#x8A5E;&quot;,&quot;&#x540D;&#x8A5E;-&#x4EE3;&#x540D;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x540D;&#x8A5E;-&#x4EE3;&#x540D;&#x8A5E;-&#x7E2E;&#x7D04;&quot;,&quot;&#x540D;&#x8A5E;-&#x526F;&#x8A5E;&#x53EF;&#x80FD;&quot;,&quot;&#x540D;&#x8A5E;-&#x52D5;&#x8A5E;&#x975E;&#x81EA;&#x7ACB;&#x7684;&quot;,&quot;&#x540D;
&#x8A5E;-&#x5F15;&#x7528;&#x6587;&#x5B57;&#x5217;&quot;,&quot;&#x540D;&#x8A5E;-&#x5F62;&#x5BB9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x30B5;&#x5909;&#x63A5;&#x7D9A;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x4E00;&#x822C;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x4EBA;&#x540D;&quot;,&quot;
&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x526F;&#x8A5E;&#x53EF;&#x80FD;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x52A9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x52A9;&#x6570;&#x8A5E;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x5730;&#x57DF;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x5F62;&#x5BB9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;
,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x5C3E;-&#x7279;&#x6B8A;&quot;,&quot;&#x540D;&#x8A5E;-&#x63A5;&#x7D9A;&#x8A5E;&#x7684;&quot;,&quot;&#x540D;&#x8A5E;-&#x6570;&quot;,&quot;&#x540D;&#x8A5E;-&#x7279;&#x6B8A;&quot;,&quot;&#x540D;&#x8A5E;-&#x7279;&#x6B8A;-&#x52A9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;,&quot;&#x540D;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;&quot;,&quot;&#x540D;&#x8A5E;-&#x975E;&#x81EA;
&#x7ACB;-&#x4E00;&#x822C;&quot;,&quot;&#x540D;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;-&#x526F;&#x8A5E;&#x53EF;&#x80FD;&quot;,&quot;&#x540D;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;-&#x52A9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;,&quot;&#x540D;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;-&#x5F62;&#x5BB9;&#x52D5;&#x8A5E;&#x8A9E;&#x5E79;&quot;,&quot;&#x5F62;&#x5BB9;&#x8A5E;&quot;,&quot;&#x5F62;&#x5BB9;&#x8A5E;-&#x63A5;
&#x5C3E;&quot;,&quot;&#x5F62;&#x5BB9;&#x8A5E;-&#x81EA;&#x7ACB;&quot;,&quot;&#x5F62;&#x5BB9;&#x8A5E;-&#x975E;&#x81EA;&#x7ACB;&quot;,&quot;&#x611F;&#x52D5;&#x8A5E;&quot;,&quot;&#x63A5;&#x7D9A;&#x8A5E;&quot;,&quot;&#x63A5;&#x982D;&#x8A5E;&quot;,&quot;&#x63A5;&#x982D;&#x8A5E;-&#x52D5;&#x8A5E;&#x63A5;&#x7D9A;&quot;,&quot;&#x63A5;&#x982D;&#x8A5E;-&#x540D;&#x8A5E;&#x63A5;&#x7D9A;&quot;,&quot;&#x63A5;&#x982D;
&#x8A5E;-&#x5F62;&#x5BB9;&#x8A5E;&#x63A5;&#x7D9A;&quot;,&quot;&#x63A5;&#x982D;&#x8A5E;-&#x6570;&#x63A5;&quot;,&quot;&#x672A;&#x77E5;&#x8A9E;&quot;,&quot;&#x8A18;&#x53F7;&quot;,&quot;&#x8A18;&#x53F7;-&#x30A2;&#x30EB;&#x30D5;&#x30A1;&#x30D9;&#x30C3;&#x30C8;&quot;,&quot;&#x8A18;&#x53F7;-&#x4E00;&#x822C;&quot;,&quot;&#x8A18;&#x53F7;-&#x53E5;&#x70B9;&quot;,&quot;&#x8A18;&#x53F7;-&#x62EC;&#x5F27;&#x9589;
&quot;,&quot;&#x8A18;&#x53F7;-&#x62EC;&#x5F27;&#x958B;&quot;,&quot;&#x8A18;&#x53F7;-&#x7A7A;&#x767D;&quot;,&quot;&#x8A18;&#x53F7;-&#x8AAD;&#x70B9;&quot;,&quot;&#x8A9E;&#x65AD;&#x7247;&quot;,&quot;&#x9023;&#x4F53;&#x8A5E;&quot;,&quot;&#x975E;&#x8A00;&#x8A9E;&#x97F3;&quot;]</p>
</blockquote>
<h3 id="custom-dictionary">Custom dictionary</h3>
<p>Moreover, the fifth argument <code>userDict</code> enables you to register a user-defined custom dictionary in <a href="https://github.com/atilika/kuromoji/blob/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt" target="_blank">Kuromoji official format</a>:</p>
<pre><code class="lang-sql"><span class="hljs-keyword">select</span> tokenize_ja(<span class="hljs-string">&quot;&#x65E5;&#x672C;&#x7D4C;&#x6E08;&#x65B0;&#x805E;&#xFF06;&#x95A2;&#x897F;&#x56FD;&#x969B;&#x7A7A;&#x6E2F;&quot;</span>, <span class="hljs-string">&quot;normal&quot;</span>, <span class="hljs-literal">null</span>, <span class="hljs-literal">null</span>,
<span class="hljs-built_in">array</span>(
<span class="hljs-string">&quot;&#x65E5;&#x672C;&#x7D4C;&#x6E08;&#x65B0;&#x805E;,&#x65E5;&#x672C; &#x7D4C;&#x6E08; &#x65B0;&#x805E;,&#x30CB;&#x30DB;&#x30F3; &#x30B1;&#x30A4;&#x30B6;&#x30A4; &#x30B7;&#x30F3;&#x30D6;&#x30F3;,&#x30AB;&#x30B9;&#x30BF;&#x30E0;&#x540D;&#x8A5E;&quot;</span>,
<span class="hljs-string">&quot;&#x95A2;&#x897F;&#x56FD;&#x969B;&#x7A7A;&#x6E2F;,&#x95A2;&#x897F; &#x56FD;&#x969B; &#x7A7A;&#x6E2F;,&#x30AB;&#x30F3;&#x30B5;&#x30A4; &#x30B3;&#x30AF;&#x30B5;&#x30A4; &#x30AF;&#x30A6;&#x30B3;&#x30A6;,&#x30C6;&#x30B9;&#x30C8;&#x540D;&#x8A5E;&quot;</span>
));
</code></pre>
<blockquote>
<p>[&quot;&#x65E5;&#x672C;&quot;,&quot;&#x7D4C;&#x6E08;&quot;,&quot;&#x65B0;&#x805E;&quot;,&quot;&#x95A2;&#x897F;&quot;,&quot;&#x56FD;&#x969B;&quot;,&quot;&#x7A7A;&#x6E2F;&quot;]</p>
</blockquote>
<p>Note that you can pass <code>null</code> to each of the third and fourth argument to explicitly use Kuromoji&apos;s <a href="https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stopwords.txt" target="_blank">default stop words</a> and <a href="https://github.com/apache/lucene-solr/blob/master/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/stoptags.txt" target="_blank">stop tags</a>.</p>
<p>If you have a large custom dictionary as an external file, <code>userDict</code> can also be <code>const string userDictURL</code> which indicates the URL of the external file hosted somewhere like Amazon S3:</p>
<pre><code class="lang-sql">select tokenize_ja(&quot;&#x65E5;&#x672C;&#x7D4C;&#x6E08;&#x65B0;&#x805E;&#xFF06;&#x95A2;&#x897F;&#x56FD;&#x969B;&#x7A7A;&#x6E2F;&quot;, &quot;normal&quot;, null, null,
&quot;https://raw.githubusercontent.com/atilika/kuromoji/909fd6b32bf4e9dc86b7599de5c9b50ca8f004a1/kuromoji-core/src/test/resources/userdict.txt&quot;);
&gt; [&quot;&#x65E5;&#x672C;&quot;,&quot;&#x7D4C;&#x6E08;&quot;,&quot;&#x65B0;&#x805E;&quot;,&quot;&#x95A2;&#x897F;&quot;,&quot;&#x56FD;&#x969B;&quot;,&quot;&#x7A7A;&#x6E2F;&quot;]
</code></pre>
<div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>Dictionary SHOULD be accessible through http/https protocol. And, it SHOULD be compressed using gzip with <code>.gz</code> suffix because the maximum dictionary size is limited to 32MB and read timeout is set to 60 sec. Also, connection must be established in 10 sec.</p><p>If you want to use HTTP Basic Authentication, please use the following form: <code>https://user:password@www.siteurl.com/my_dict.txt.gz</code> (see Sec 3.1 of <a href="https://www.ietf.org/rfc/rfc1738.txt" target="_blank">rfc1738</a>)</p></div></div>
<p>For detailed APIs, please refer to the Javadoc of <a href="https://lucene.apache.org/core/5_3_1/analyzers-kuromoji/org/apache/lucene/analysis/ja/JapaneseAnalyzer.html" target="_blank">JapaneseAnalyzer</a> as well.</p>
<h3 id="part-of-speech">Part-of-speech</h3>
<p>From Hivemall v0.6.0, the second argument can also accept the following option format:</p>
<pre><code> -mode &lt;arg&gt; The tokenization mode. One of [&apos;normal&apos;, &apos;search&apos;,
&apos;extended&apos;, &apos;default&apos; (normal)]
-pos Return part-of-speech information
</code></pre><p>Then, you can get part-of-speech information as follows:</p>
<pre><code class="lang-sql">WITH tmp as (
<span class="hljs-keyword">select</span>
tokenize_ja(<span class="hljs-string">&apos;kuromoji&#x3092;&#x4F7F;&#x3063;&#x305F;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&#x306E;&#x30C6;&#x30B9;&#x30C8;&#x3067;&#x3059;&#x3002;&apos;</span>,<span class="hljs-string">&apos;-mode search -pos&apos;</span>) <span class="hljs-keyword">as</span> r
)
<span class="hljs-keyword">select</span>
r.tokens,
r.pos,
r.tokens[<span class="hljs-number">0</span>] <span class="hljs-keyword">as</span> token0,
r.pos[<span class="hljs-number">0</span>] <span class="hljs-keyword">as</span> pos0
<span class="hljs-keyword">from</span>
tmp;
</code></pre>
<table>
<thead>
<tr>
<th style="text-align:center">tokens</th>
<th style="text-align:center">pos</th>
<th style="text-align:center">token0</th>
<th style="text-align:center">pos0</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:center">[&quot;kuromoji&quot;,&quot;&#x4F7F;&#x3046;&quot;,&quot;&#x5206;&#x304B;&#x3061;&#x66F8;&#x304D;&quot;,&quot;&#x30C6;&#x30B9;&#x30C8;&quot;]</td>
<td style="text-align:center">[&quot;&#x540D;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x52D5;&#x8A5E;-&#x81EA;&#x7ACB;&quot;,&quot;&#x540D;&#x8A5E;-&#x4E00;&#x822C;&quot;,&quot;&#x540D;&#x8A5E;-&#x30B5;&#x5909;&#x63A5;&#x7D9A;&quot;]</td>
<td style="text-align:center">kuromoji</td>
<td style="text-align:center">&#x540D;&#x8A5E;-&#x4E00;&#x822C;</td>
</tr>
</tbody>
</table>
<p>Note that when <code>-pos</code> option is specified, <code>tokenize_ja</code> returns a struct record containing <code>array&lt;string&gt; tokens</code> and <code>array&lt;string&gt; pos</code> as the elements.</p>
<h2 id="chinese-tokenizer">Chinese Tokenizer</h2>
<p>Chinese text tokenizer UDF uses <a href="https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html" target="_blank">SmartChineseAnalyzer</a>. </p>
<p>The signature of the UDF is as follows:</p>
<pre><code class="lang-sql">tokenize_cn(string line, optional const array&lt;string&gt; stopWords)
</code></pre>
<p>Its basic usage is as follows:</p>
<pre><code class="lang-sql">select tokenize_cn(&quot;Smartcn&#x4E3A;Apache2.0&#x534F;&#x8BAE;&#x7684;&#x5F00;&#x6E90;&#x4E2D;&#x6587;&#x5206;&#x8BCD;&#x7CFB;&#x7EDF;&#xFF0C;Java&#x8BED;&#x8A00;&#x7F16;&#x5199;&#xFF0C;&#x4FEE;&#x6539;&#x7684;&#x4E2D;&#x79D1;&#x9662;&#x8BA1;&#x7B97;&#x6240;ICTCLAS&#x5206;&#x8BCD;&#x7CFB;&#x7EDF;&#x3002;&quot;);
&gt; [smartcn, &#x4E3A;, apach, 2, 0, &#x534F;&#x8BAE;, &#x7684;, &#x5F00;&#x6E90;, &#x4E2D;&#x6587;, &#x5206;&#x8BCD;, &#x7CFB;&#x7EDF;, java, &#x8BED;&#x8A00;, &#x7F16;&#x5199;, &#x4FEE;&#x6539;, &#x7684;, &#x4E2D;&#x79D1;&#x9662;, &#x8BA1;&#x7B97;, &#x6240;, ictcla, &#x5206;&#x8BCD;, &#x7CFB;&#x7EDF;]
</code></pre>
<p>For detailed APIs, please refer to the Javadoc of <a href="https://lucene.apache.org/core/5_3_1/analyzers-smartcn/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.html" target="_blank">SmartChineseAnalyzer</a> as well.</p>
<h2 id="korean-tokenizer">Korean Tokenizer</h2>
<p>Korean tokenizer internally uses <a href="https://www.slideshare.net/elasticsearch/nori-the-official-elasticsearch-plugin-for-korean-language-analysis" target="_blank">lucene-analyzers-nori</a> for tokenization.</p>
<p>The signature of the UDF is as follows:</p>
<pre><code class="lang-sql">tokenize_ko(
String line [, const string mode = &quot;discard&quot; (or const string opts),
const array&lt;string&gt; stopWords,
const array&lt;string&gt;
stopTags,
const array&lt;string&gt; userDict (or const string userDictURL)]
) - returns tokenized strings in array&lt;string&gt;
</code></pre>
<div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>Instead of mode, the 2nd argument can take options starting with <code>-</code>.</p></div></div>
<p>You can get usage as follows:</p>
<pre><code class="lang-sql">select tokenize_ko(&quot;&quot;, &quot;-help&quot;);
usage: tokenize_ko(String line [, const string mode = &quot;discard&quot; (or const
string opts), const array&lt;string&gt; stopWords, const array&lt;string&gt;
stopTags, const array&lt;string&gt; userDict (or const string
userDictURL)]) - returns tokenized strings in array&lt;string&gt; [-help]
[-mode &lt;arg&gt;] [-outputUnknownUnigrams]
-help Show function help
-mode &lt;arg&gt; The tokenization mode. One of [&apos;node&apos;, &apos;discard&apos;
(default), &apos;mixed&apos;]
-outputUnknownUnigrams outputs unigrams for unknown words.
</code></pre>
<div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>For detailed options, please refer to the <a href="https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/KoreanAnalyzer.html" target="_blank">Lucene API document</a>. <code>none</code>, <code>discard</code> (default), or <code>mixed</code> are supported for the mode argument.</p></div></div>
<p>See the following examples for the usage.</p>
<pre><code class="lang-sql">-- show version of lucene-analyzers-nori
select tokenize_ko();
&gt; 8.8.2
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
-- explicitly using default options
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;-mode discard&apos;,
-- stopwords (null to use default)
-- see https://github.com/apache/incubator-hivemall/blob/master/nlp/src/main/resources/hivemall/nlp/tokenizer/ext/stopwords-ko.txt
null,
-- stoptags
-- see https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/POS.Tag.html
array(
&apos;E&apos;, -- Verbal endings
&apos;IC&apos;, -- Interjection
&apos;J&apos;, -- Ending Particle
&apos;MAG&apos;, -- General Adverb
&apos;MAJ&apos;, -- Conjunctive adverb
&apos;MM&apos;, -- Determiner
&apos;SP&apos;, -- Space
&apos;SSC&apos;, -- Closing brackets
&apos;SSO&apos;, -- Opening brackets
&apos;SC&apos;, -- Separator
&apos;SE&apos;, -- Ellipsis
&apos;XPN&apos;, -- Prefix
&apos;XSA&apos;, -- Adjective Suffix
&apos;XSN&apos;, -- Noun Suffix
&apos;XSV&apos;, -- Verb Suffix
&apos;UNA&apos;, -- Unknown
&apos;NA&apos;, -- Unknown
&apos;VSV&apos; -- Unknown
)
);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
-- None mode, without General Adverb (MAG)
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;,
-- No decomposition for compound.
&apos;-mode none&apos;,
-- stopwords (null to use default)
null,
array(
&apos;E&apos;, -- Verbal endings
&apos;IC&apos;, -- Interjection
&apos;J&apos;, -- Ending Particle
-- &apos;MAG&apos;, -- General Adverb
&apos;MAJ&apos;, -- Conjunctive adverb
&apos;MM&apos;, -- Determiner
&apos;SP&apos;, -- Space
&apos;SSC&apos;, -- Closing brackets
&apos;SSO&apos;, -- Opening brackets
&apos;SC&apos;, -- Separator
&apos;SE&apos;, -- Ellipsis
&apos;XPN&apos;, -- Prefix
&apos;XSA&apos;, -- Adjective Suffix
&apos;XSN&apos;, -- Noun Suffix
&apos;XSV&apos;, -- Verb Suffix
&apos;UNA&apos;, -- Unknown
&apos;NA&apos;, -- Unknown
&apos;VSV&apos; -- Unknown
)
);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC918;&#xC11C;&quot;,&quot;&#xC815;&#xB9D0;&quot;,&quot;&#xACE0;&#xB9C8;&#xC6CC;&#xC694;&quot;]
-- discard mode: Decompose compounds and discards the original form (default).
-- https://lucene.apache.org/core/8_8_2/analyzers-nori/org/apache/lucene/analysis/ko/KoreanTokenizer.DecompoundMode.html
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;-mode discard&apos;);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
-- default stopword (null), with stoptags
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;-mode discard&apos;, null, array(&apos;E&apos;, &apos;VV&apos;));
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xD558;&quot;,&quot;&#xC0C8;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xC744;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xD558;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xC815;&#xB9D0;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
-- mixed mode: Decompose compounds and keeps the original form.
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;mixed&apos;);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC918;&#xC11C;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xACE0;&#xB9C8;&#xC6CC;&#xC694;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;-mode mixed&apos;);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC918;&#xC11C;&quot;,&quot;&#xC8FC;&quot;,&quot;&#xACE0;&#xB9C8;&#xC6CC;&#xC694;&quot;,&quot;&#xACE0;&#xB9D9;&quot;]
-- none mode: No decomposition for compound.
select tokenize_ko(&apos;&#xC911;&#xC694;&#xD55C; &#xC0C8; &#xAE30;&#xB2A5;&#xC744; &#xAC1C;&#xBC1C;&#xD574;&#xC918;&#xC11C; &#xC815;&#xB9D0; &#xACE0;&#xB9C8;&#xC6CC;&#xC694;!&apos;, &apos;-mode none&apos;);
&gt; [&quot;&#xC911;&#xC694;&quot;,&quot;&#xAE30;&#xB2A5;&quot;,&quot;&#xAC1C;&#xBC1C;&quot;,&quot;&#xC918;&#xC11C;&quot;,&quot;&#xACE0;&#xB9C8;&#xC6CC;&#xC694;&quot;]
select tokenize_ko(&apos;Hello, world.&apos;, &apos;-mode none&apos;);
&gt; [&quot;hello&quot;,&quot;world&quot;]
select tokenize_ko(&apos;Hello, world.&apos;, &apos;-mode none -outputUnknownUnigrams&apos;);
&gt; [&quot;h&quot;,&quot;e&quot;,&quot;l&quot;,&quot;l&quot;,&quot;o&quot;,&quot;w&quot;,&quot;o&quot;,&quot;r&quot;,&quot;l&quot;,&quot;d&quot;]
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;);
&gt; [&quot;&#xB098;&quot;,&quot;c&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xC0AC;&#xB791;&quot;]
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, array(), null);
&gt; [&quot;&#xB098;&quot;,&quot;&#xB294;&quot;,&quot;c&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xB97C;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xB85C;&quot;,&quot;&#xC0AC;&#xB791;&quot;,&quot;&#xD558;&quot;,&quot;&#x11AB;&#xB2E4;&quot;]
-- default stopword (null), default stoptags (null)
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;);
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, null, null);
&gt; [&quot;&#xB098;&quot;,&quot;c&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xC0AC;&#xB791;&quot;]
-- no stopword (empty array), default stoptags (null)
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, array());
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, array(), null);
&gt; [&quot;&#xB098;&quot;,&quot;c&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xC0AC;&#xB791;&quot;]
-- no stopword (empty array), no stoptags (empty array), custom dict
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, array(), array(), array(&apos;C++&apos;));
&gt; [&quot;&#xB098;&quot;,&quot;&#xB294;&quot;,&quot;c++&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xB97C;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xB85C;&quot;,&quot;&#xC0AC;&#xB791;&quot;,&quot;&#xD558;&quot;,&quot;&#x11AB;&#xB2E4;&quot;]
-- default stopword (null), default stoptags (null), custom dict
select tokenize_ko(&apos;&#xB098;&#xB294; C++ &#xC5B8;&#xC5B4;&#xB97C; &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D; &#xC5B8;&#xC5B4;&#xB85C; &#xC0AC;&#xB791;&#xD55C;&#xB2E4;.&apos;, &apos;-mode discard&apos;, null, null, array(&apos;C++&apos;));
&gt; [&quot;&#xB098;&quot;,&quot;c++&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC5B8;&#xC5B4;&quot;,&quot;&#xC0AC;&#xB791;&quot;]
</code></pre>
<h3 id="custom-dictionary">Custom dictionary</h3>
<p>Moreover, the fifth argument <code>userDictURL</code> enables you to register a user-defined custom dictionary placed on an http/https-accessible external site. The dictionary format is described <a href="https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt" target="_blank">here in Lucene&apos;s repository</a>.</p>
<pre><code class="lang-sql">select tokenize_ko(&apos;&#xB098;&#xB294; c++ &#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&#xC744; &#xC990;&#xAE34;&#xB2E4;.&apos;, &apos;-mode discard&apos;, null, null, &apos;https://raw.githubusercontent.com/apache/lucene/main/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/userdict.txt&apos;);
&gt; [&quot;&#xB098;&quot;,&quot;c++&quot;,&quot;&#xD504;&#xB85C;&#xADF8;&#xB798;&#xBC0D;&quot;,&quot;&#xC990;&#xAE30;&quot;]
</code></pre>
<div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>Dictionary SHOULD be accessible through http/https protocol. And, it SHOULD be compressed using gzip with <code>.gz</code> suffix because the maximum dictionary size is limited to 32MB and read timeout is set to 60 sec. Also, connection must be established in 10 sec.</p></div></div>
<p><div id="page-footer" class="localized-footer"><hr><!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<p><sub><font color="gray">
Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator.
</font></sub></p>
</div></p>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"Text Tokenizer","level":"2.3","depth":1,"next":{"title":"Approximate Aggregate Functions","level":"2.4","depth":1,"path":"misc/approx.md","ref":"misc/approx.md","articles":[]},"previous":{"title":"Efficient Top-K Query Processing","level":"2.2","depth":1,"path":"misc/topk.md","ref":"misc/topk.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/tree/master/docs/gitbook"},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > 
h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"https://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"misc/tokenizer.md","mtime":"2021-05-14T03:34:47.827Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2021-05-14T03:36:16.688Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-edit-link/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-github/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-splitter/splitter.js"></script>
<script src="../gitbook/gitbook-plugin-etoc/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-toggle-chapters/toggle.js"></script>
<script src="../gitbook/gitbook-plugin-anchorjs/anchor.min.js"></script>
<script src="../gitbook/gitbook-plugin-anchorjs/anchor-style.js"></script>
<script src="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
<script src="../gitbook/gitbook-plugin-theme-api/theme-api.js"></script>
</body>
</html>