blob: acd6cb2e87f9fde665d1e0ee4e0d946ef9db4255 [file] [log] [blame]
<!DOCTYPE HTML>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>XGBoost · Hivemall User Manual</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-splitter/splitter.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-etoc/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-callouts/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-toggle-chapters/toggle.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-codeblock-filename/block.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-multipart/multipart.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-emphasize/plugin.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-theme-api/theme-api.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="kdd2010a.html" />
<link rel="prev" href="news20_rf.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li>
<a href="https://hivemall.incubator.apache.org/" target="_blank" class="custom-link"><i class="fa fa-home"></i> Home</a>
</li>
<li class="divider"></li>
<li class="header">TABLE OF CONTENTS</li>
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
<b>1.1.</b>
Introduction
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../getting_started/">
<a href="../getting_started/">
<b>1.2.</b>
Getting Started
</a>
<ul class="articles">
<li class="chapter " data-level="1.2.1" data-path="../getting_started/installation.html">
<a href="../getting_started/installation.html">
<b>1.2.1.</b>
Installation
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="../getting_started/permanent-functions.html">
<a href="../getting_started/permanent-functions.html">
<b>1.2.2.</b>
Install as permanent functions
</a>
</li>
<li class="chapter " data-level="1.2.3" data-path="../getting_started/input-format.html">
<a href="../getting_started/input-format.html">
<b>1.2.3.</b>
Input Format
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="../misc/funcs.html">
<a href="../misc/funcs.html">
<b>1.3.</b>
List of Functions
</a>
</li>
<li class="chapter " data-level="1.4" data-path="../tips/">
<a href="../tips/">
<b>1.4.</b>
Tips for Effective Hivemall
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../tips/addbias.html">
<a href="../tips/addbias.html">
<b>1.4.1.</b>
Explicit add_bias() for better prediction
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../tips/rand_amplify.html">
<a href="../tips/rand_amplify.html">
<b>1.4.2.</b>
Use rand_amplify() to better prediction results
</a>
</li>
<li class="chapter " data-level="1.4.3" data-path="../tips/rt_prediction.html">
<a href="../tips/rt_prediction.html">
<b>1.4.3.</b>
Real-time prediction on RDBMS
</a>
</li>
<li class="chapter " data-level="1.4.4" data-path="../tips/ensemble_learning.html">
<a href="../tips/ensemble_learning.html">
<b>1.4.4.</b>
Ensemble learning for stable prediction
</a>
</li>
<li class="chapter " data-level="1.4.5" data-path="../tips/mixserver.html">
<a href="../tips/mixserver.html">
<b>1.4.5.</b>
Mixing models for a better prediction convergence (MIX server)
</a>
</li>
<li class="chapter " data-level="1.4.6" data-path="../tips/emr.html">
<a href="../tips/emr.html">
<b>1.4.6.</b>
Run Hivemall on Amazon Elastic MapReduce
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../tips/general_tips.html">
<a href="../tips/general_tips.html">
<b>1.5.</b>
General Hive/Hadoop Tips
</a>
<ul class="articles">
<li class="chapter " data-level="1.5.1" data-path="../tips/rowid.html">
<a href="../tips/rowid.html">
<b>1.5.1.</b>
Adding rowid for each row
</a>
</li>
<li class="chapter " data-level="1.5.2" data-path="../tips/hadoop_tuning.html">
<a href="../tips/hadoop_tuning.html">
<b>1.5.2.</b>
Hadoop tuning for Hivemall
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.6" data-path="../troubleshooting/">
<a href="../troubleshooting/">
<b>1.6.</b>
Troubleshooting
</a>
<ul class="articles">
<li class="chapter " data-level="1.6.1" data-path="../troubleshooting/oom.html">
<a href="../troubleshooting/oom.html">
<b>1.6.1.</b>
OutOfMemoryError in training
</a>
</li>
<li class="chapter " data-level="1.6.2" data-path="../troubleshooting/mapjoin_task_error.html">
<a href="../troubleshooting/mapjoin_task_error.html">
<b>1.6.2.</b>
SemanticException generate map join task error: Cannot serialize object
</a>
</li>
<li class="chapter " data-level="1.6.3" data-path="../troubleshooting/asterisk.html">
<a href="../troubleshooting/asterisk.html">
<b>1.6.3.</b>
Asterisk argument for UDTF does not work
</a>
</li>
<li class="chapter " data-level="1.6.4" data-path="../troubleshooting/num_mappers.html">
<a href="../troubleshooting/num_mappers.html">
<b>1.6.4.</b>
The number of mappers is less than input splits in Hadoop 2.x
</a>
</li>
<li class="chapter " data-level="1.6.5" data-path="../troubleshooting/mapjoin_classcastex.html">
<a href="../troubleshooting/mapjoin_classcastex.html">
<b>1.6.5.</b>
Map-side join causes ClassCastException on Tez
</a>
</li>
</ul>
</li>
<li class="header">Part II - Generic Features</li>
<li class="chapter " data-level="2.1" data-path="../misc/generic_funcs.html">
<a href="../misc/generic_funcs.html">
<b>2.1.</b>
List of Generic Hivemall Functions
</a>
</li>
<li class="chapter " data-level="2.2" data-path="../misc/topk.html">
<a href="../misc/topk.html">
<b>2.2.</b>
Efficient Top-K Query Processing
</a>
</li>
<li class="chapter " data-level="2.3" data-path="../misc/tokenizer.html">
<a href="../misc/tokenizer.html">
<b>2.3.</b>
Text Tokenizer
</a>
</li>
<li class="chapter " data-level="2.4" data-path="../misc/approx.html">
<a href="../misc/approx.html">
<b>2.4.</b>
Approximate Aggregate Functions
</a>
</li>
<li class="header">Part III - Feature Engineering</li>
<li class="chapter " data-level="3.1" data-path="../ft_engineering/scaling.html">
<a href="../ft_engineering/scaling.html">
<b>3.1.</b>
Feature Scaling
</a>
</li>
<li class="chapter " data-level="3.2" data-path="../ft_engineering/hashing.html">
<a href="../ft_engineering/hashing.html">
<b>3.2.</b>
Feature Hashing
</a>
</li>
<li class="chapter " data-level="3.3" data-path="../ft_engineering/selection.html">
<a href="../ft_engineering/selection.html">
<b>3.3.</b>
Feature Selection
</a>
</li>
<li class="chapter " data-level="3.4" data-path="../ft_engineering/binning.html">
<a href="../ft_engineering/binning.html">
<b>3.4.</b>
Feature Binning
</a>
</li>
<li class="chapter " data-level="3.5" data-path="../ft_engineering/pairing.html">
<a href="../ft_engineering/pairing.html">
<b>3.5.</b>
Feature Pairing
</a>
<ul class="articles">
<li class="chapter " data-level="3.5.1" data-path="../ft_engineering/polynomial.html">
<a href="../ft_engineering/polynomial.html">
<b>3.5.1.</b>
Polynomial features
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="3.6" data-path="../ft_engineering/ft_trans.html">
<a href="../ft_engineering/ft_trans.html">
<b>3.6.</b>
Feature Transformation
</a>
<ul class="articles">
<li class="chapter " data-level="3.6.1" data-path="../ft_engineering/vectorization.html">
<a href="../ft_engineering/vectorization.html">
<b>3.6.1.</b>
Feature vectorization
</a>
</li>
<li class="chapter " data-level="3.6.2" data-path="../ft_engineering/quantify.html">
<a href="../ft_engineering/quantify.html">
<b>3.6.2.</b>
Quantify non-number features
</a>
</li>
<li class="chapter " data-level="3.6.3" data-path="../ft_engineering/binarize.html">
<a href="../ft_engineering/binarize.html">
<b>3.6.3.</b>
Binarize label
</a>
</li>
<li class="chapter " data-level="3.6.4" data-path="../ft_engineering/onehot.html">
<a href="../ft_engineering/onehot.html">
<b>3.6.4.</b>
One-hot encoding
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="3.7" data-path="../ft_engineering/term_vector.html">
<a href="../ft_engineering/term_vector.html">
<b>3.7.</b>
Term Vector Model
</a>
<ul class="articles">
<li class="chapter " data-level="3.7.1" data-path="../ft_engineering/tfidf.html">
<a href="../ft_engineering/tfidf.html">
<b>3.7.1.</b>
TF-IDF Term Weighting
</a>
</li>
<li class="chapter " data-level="3.7.2" data-path="../ft_engineering/bm25.html">
<a href="../ft_engineering/bm25.html">
<b>3.7.2.</b>
Okapi BM25 Term Weighting
</a>
</li>
</ul>
</li>
<li class="header">Part IV - Evaluation</li>
<li class="chapter " data-level="4.1" data-path="../eval/binary_classification_measures.html">
<a href="../eval/binary_classification_measures.html">
<b>4.1.</b>
Binary Classification Metrics
</a>
<ul class="articles">
<li class="chapter " data-level="4.1.1" data-path="../eval/auc.html">
<a href="../eval/auc.html">
<b>4.1.1.</b>
Area under the ROC curve
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="4.2" data-path="../eval/multilabel_classification_measures.html">
<a href="../eval/multilabel_classification_measures.html">
<b>4.2.</b>
Multi-label Classification Metrics
</a>
</li>
<li class="chapter " data-level="4.3" data-path="../eval/regression.html">
<a href="../eval/regression.html">
<b>4.3.</b>
Regression Metrics
</a>
</li>
<li class="chapter " data-level="4.4" data-path="../eval/rank.html">
<a href="../eval/rank.html">
<b>4.4.</b>
Ranking Measures
</a>
</li>
<li class="chapter " data-level="4.5" data-path="../eval/datagen.html">
<a href="../eval/datagen.html">
<b>4.5.</b>
Data Generation
</a>
<ul class="articles">
<li class="chapter " data-level="4.5.1" data-path="../eval/lr_datagen.html">
<a href="../eval/lr_datagen.html">
<b>4.5.1.</b>
Logistic Regression data generation
</a>
</li>
</ul>
</li>
<li class="header">Part V - Supervised Learning</li>
<li class="chapter " data-level="5.1" data-path="../supervised_learning/prediction.html">
<a href="../supervised_learning/prediction.html">
<b>5.1.</b>
How Prediction Works
</a>
</li>
<li class="chapter " data-level="5.2" data-path="../supervised_learning/tutorial.html">
<a href="../supervised_learning/tutorial.html">
<b>5.2.</b>
Step-by-Step Tutorial on Supervised Learning
</a>
</li>
<li class="header">Part VI - Binary Classification</li>
<li class="chapter " data-level="6.1" data-path="general.html">
<a href="general.html">
<b>6.1.</b>
Binary Classification
</a>
</li>
<li class="chapter " data-level="6.2" data-path="a9a.html">
<a href="a9a.html">
<b>6.2.</b>
a9a Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.2.1" data-path="a9a_dataset.html">
<a href="a9a_dataset.html">
<b>6.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.2.2" data-path="a9a_generic.html">
<a href="a9a_generic.html">
<b>6.2.2.</b>
General Binary Classifier
</a>
</li>
<li class="chapter " data-level="6.2.3" data-path="a9a_lr.html">
<a href="a9a_lr.html">
<b>6.2.3.</b>
Logistic Regression
</a>
</li>
<li class="chapter " data-level="6.2.4" data-path="a9a_minibatch.html">
<a href="a9a_minibatch.html">
<b>6.2.4.</b>
Mini-batch Gradient Descent
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.3" data-path="news20.html">
<a href="news20.html">
<b>6.3.</b>
News20 Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.3.1" data-path="news20_dataset.html">
<a href="news20_dataset.html">
<b>6.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.3.2" data-path="news20_pa.html">
<a href="news20_pa.html">
<b>6.3.2.</b>
Perceptron, Passive Aggressive
</a>
</li>
<li class="chapter " data-level="6.3.3" data-path="news20_scw.html">
<a href="news20_scw.html">
<b>6.3.3.</b>
CW, AROW, SCW
</a>
</li>
<li class="chapter " data-level="6.3.4" data-path="news20_generic.html">
<a href="news20_generic.html">
<b>6.3.4.</b>
General Binary Classifier
</a>
</li>
<li class="chapter " data-level="6.3.5" data-path="news20_generic_bagging.html">
<a href="news20_generic_bagging.html">
<b>6.3.5.</b>
Bagging classifiers
</a>
</li>
<li class="chapter " data-level="6.3.6" data-path="news20_adagrad.html">
<a href="news20_adagrad.html">
<b>6.3.6.</b>
AdaGradRDA, AdaGrad, AdaDelta
</a>
</li>
<li class="chapter " data-level="6.3.7" data-path="news20_rf.html">
<a href="news20_rf.html">
<b>6.3.7.</b>
Random Forest
</a>
</li>
<li class="chapter active" data-level="6.3.8" data-path="news20b_xgboost.html">
<a href="news20b_xgboost.html">
<b>6.3.8.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.4" data-path="kdd2010a.html">
<a href="kdd2010a.html">
<b>6.4.</b>
KDD2010a Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.4.1" data-path="kdd2010a_dataset.html">
<a href="kdd2010a_dataset.html">
<b>6.4.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.4.2" data-path="kdd2010a_scw.html">
<a href="kdd2010a_scw.html">
<b>6.4.2.</b>
PA, CW, AROW, SCW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.5" data-path="kdd2010b.html">
<a href="kdd2010b.html">
<b>6.5.</b>
KDD2010b Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.5.1" data-path="kdd2010b_dataset.html">
<a href="kdd2010b_dataset.html">
<b>6.5.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.5.2" data-path="kdd2010b_arow.html">
<a href="kdd2010b_arow.html">
<b>6.5.2.</b>
AROW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.6" data-path="webspam.html">
<a href="webspam.html">
<b>6.6.</b>
Webspam Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.6.1" data-path="webspam_dataset.html">
<a href="webspam_dataset.html">
<b>6.6.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.6.2" data-path="webspam_scw.html">
<a href="webspam_scw.html">
<b>6.6.2.</b>
PA1, AROW, SCW
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="6.7" data-path="titanic_rf.html">
<a href="titanic_rf.html">
<b>6.7.</b>
Kaggle Titanic Tutorial
</a>
</li>
<li class="chapter " data-level="6.8" data-path="criteo.html">
<a href="criteo.html">
<b>6.8.</b>
Criteo Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="6.8.1" data-path="criteo_dataset.html">
<a href="criteo_dataset.html">
<b>6.8.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="6.8.2" data-path="criteo_ffm.html">
<a href="criteo_ffm.html">
<b>6.8.2.</b>
Field-Aware Factorization Machines
</a>
</li>
</ul>
</li>
<li class="header">Part VII - Multiclass Classification</li>
<li class="chapter " data-level="7.1" data-path="../multiclass/news20.html">
<a href="../multiclass/news20.html">
<b>7.1.</b>
News20 Multiclass Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="7.1.1" data-path="../multiclass/news20_dataset.html">
<a href="../multiclass/news20_dataset.html">
<b>7.1.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="7.1.2" data-path="../multiclass/news20_one-vs-the-rest_dataset.html">
<a href="../multiclass/news20_one-vs-the-rest_dataset.html">
<b>7.1.2.</b>
Data Preparation for one-vs-the-rest classifiers
</a>
</li>
<li class="chapter " data-level="7.1.3" data-path="../multiclass/news20_pa.html">
<a href="../multiclass/news20_pa.html">
<b>7.1.3.</b>
PA
</a>
</li>
<li class="chapter " data-level="7.1.4" data-path="../multiclass/news20_scw.html">
<a href="../multiclass/news20_scw.html">
<b>7.1.4.</b>
CW, AROW, SCW
</a>
</li>
<li class="chapter " data-level="7.1.5" data-path="../multiclass/news20_xgboost.html">
<a href="../multiclass/news20_xgboost.html">
<b>7.1.5.</b>
XGBoost
</a>
</li>
<li class="chapter " data-level="7.1.6" data-path="../multiclass/news20_ensemble.html">
<a href="../multiclass/news20_ensemble.html">
<b>7.1.6.</b>
Ensemble learning
</a>
</li>
<li class="chapter " data-level="7.1.7" data-path="../multiclass/news20_one-vs-the-rest.html">
<a href="../multiclass/news20_one-vs-the-rest.html">
<b>7.1.7.</b>
one-vs-the-rest Classifier
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="7.2" data-path="../multiclass/iris.html">
<a href="../multiclass/iris.html">
<b>7.2.</b>
Iris Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="7.2.1" data-path="../multiclass/iris_dataset.html">
<a href="../multiclass/iris_dataset.html">
<b>7.2.1.</b>
Data preparation
</a>
</li>
<li class="chapter " data-level="7.2.2" data-path="../multiclass/iris_scw.html">
<a href="../multiclass/iris_scw.html">
<b>7.2.2.</b>
SCW
</a>
</li>
<li class="chapter " data-level="7.2.3" data-path="../multiclass/iris_randomforest.html">
<a href="../multiclass/iris_randomforest.html">
<b>7.2.3.</b>
Random Forest
</a>
</li>
<li class="chapter " data-level="7.2.4" data-path="../multiclass/iris_xgboost.html">
<a href="../multiclass/iris_xgboost.html">
<b>7.2.4.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="header">Part VIII - Regression</li>
<li class="chapter " data-level="8.1" data-path="../regression/general.html">
<a href="../regression/general.html">
<b>8.1.</b>
Regression
</a>
</li>
<li class="chapter " data-level="8.2" data-path="../regression/e2006.html">
<a href="../regression/e2006.html">
<b>8.2.</b>
E2006-tfidf Regression Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="8.2.1" data-path="../regression/e2006_dataset.html">
<a href="../regression/e2006_dataset.html">
<b>8.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="8.2.2" data-path="../regression/e2006_generic.html">
<a href="../regression/e2006_generic.html">
<b>8.2.2.</b>
General Regressor
</a>
</li>
<li class="chapter " data-level="8.2.3" data-path="../regression/e2006_arow.html">
<a href="../regression/e2006_arow.html">
<b>8.2.3.</b>
Passive Aggressive, AROW
</a>
</li>
<li class="chapter " data-level="8.2.4" data-path="../regression/e2006_xgboost.html">
<a href="../regression/e2006_xgboost.html">
<b>8.2.4.</b>
XGBoost
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="8.3" data-path="../regression/kddcup12tr2.html">
<a href="../regression/kddcup12tr2.html">
<b>8.3.</b>
KDDCup 2012 Track 2 CTR Prediction Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="8.3.1" data-path="../regression/kddcup12tr2_dataset.html">
<a href="../regression/kddcup12tr2_dataset.html">
<b>8.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="8.3.2" data-path="../regression/kddcup12tr2_lr.html">
<a href="../regression/kddcup12tr2_lr.html">
<b>8.3.2.</b>
Logistic Regression, Passive Aggressive
</a>
</li>
<li class="chapter " data-level="8.3.3" data-path="../regression/kddcup12tr2_lr_amplify.html">
<a href="../regression/kddcup12tr2_lr_amplify.html">
<b>8.3.3.</b>
Logistic Regression with amplifier
</a>
</li>
<li class="chapter " data-level="8.3.4" data-path="../regression/kddcup12tr2_adagrad.html">
<a href="../regression/kddcup12tr2_adagrad.html">
<b>8.3.4.</b>
AdaGrad, AdaDelta
</a>
</li>
</ul>
</li>
<li class="header">Part IX - Recommendation</li>
<li class="chapter " data-level="9.1" data-path="../recommend/cf.html">
<a href="../recommend/cf.html">
<b>9.1.</b>
Collaborative Filtering
</a>
<ul class="articles">
<li class="chapter " data-level="9.1.1" data-path="../recommend/item_based_cf.html">
<a href="../recommend/item_based_cf.html">
<b>9.1.1.</b>
Item-based Collaborative Filtering
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="9.2" data-path="../recommend/news20.html">
<a href="../recommend/news20.html">
<b>9.2.</b>
News20 Related Article Recommendation Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="9.2.1" data-path="../multiclass/news20_dataset.html">
<a href="../multiclass/news20_dataset.html">
<b>9.2.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="9.2.2" data-path="../recommend/news20_jaccard.html">
<a href="../recommend/news20_jaccard.html">
<b>9.2.2.</b>
LSH/MinHash and Jaccard Similarity
</a>
</li>
<li class="chapter " data-level="9.2.3" data-path="../recommend/news20_knn.html">
<a href="../recommend/news20_knn.html">
<b>9.2.3.</b>
LSH/MinHash and Brute-force Search
</a>
</li>
<li class="chapter " data-level="9.2.4" data-path="../recommend/news20_bbit_minhash.html">
<a href="../recommend/news20_bbit_minhash.html">
<b>9.2.4.</b>
kNN search using b-Bits MinHash
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="9.3" data-path="../recommend/movielens.html">
<a href="../recommend/movielens.html">
<b>9.3.</b>
MovieLens Movie Recommendation Tutorial
</a>
<ul class="articles">
<li class="chapter " data-level="9.3.1" data-path="../recommend/movielens_dataset.html">
<a href="../recommend/movielens_dataset.html">
<b>9.3.1.</b>
Data Preparation
</a>
</li>
<li class="chapter " data-level="9.3.2" data-path="../recommend/movielens_cf.html">
<a href="../recommend/movielens_cf.html">
<b>9.3.2.</b>
Item-based Collaborative Filtering
</a>
</li>
<li class="chapter " data-level="9.3.3" data-path="../recommend/movielens_mf.html">
<a href="../recommend/movielens_mf.html">
<b>9.3.3.</b>
Matrix Factorization
</a>
</li>
<li class="chapter " data-level="9.3.4" data-path="../recommend/movielens_fm.html">
<a href="../recommend/movielens_fm.html">
<b>9.3.4.</b>
Factorization Machine
</a>
</li>
<li class="chapter " data-level="9.3.5" data-path="../recommend/movielens_slim.html">
<a href="../recommend/movielens_slim.html">
<b>9.3.5.</b>
SLIM for fast top-k Recommendation
</a>
</li>
<li class="chapter " data-level="9.3.6" data-path="../recommend/movielens_cv.html">
<a href="../recommend/movielens_cv.html">
<b>9.3.6.</b>
10-fold Cross Validation (Matrix Factorization)
</a>
</li>
</ul>
</li>
<li class="header">Part X - Anomaly Detection</li>
<li class="chapter " data-level="10.1" data-path="../anomaly/lof.html">
<a href="../anomaly/lof.html">
<b>10.1.</b>
Outlier Detection using Local Outlier Factor (LOF)
</a>
</li>
<li class="chapter " data-level="10.2" data-path="../anomaly/sst.html">
<a href="../anomaly/sst.html">
<b>10.2.</b>
Change-Point Detection using Singular Spectrum Transformation (SST)
</a>
</li>
<li class="chapter " data-level="10.3" data-path="../anomaly/changefinder.html">
<a href="../anomaly/changefinder.html">
<b>10.3.</b>
ChangeFinder: Detecting Outlier and Change-Point Simultaneously
</a>
</li>
<li class="header">Part XI - Clustering</li>
<li class="chapter " data-level="11.1" data-path="../clustering/lda.html">
<a href="../clustering/lda.html">
<b>11.1.</b>
Latent Dirichlet Allocation
</a>
</li>
<li class="chapter " data-level="11.2" data-path="../clustering/plsa.html">
<a href="../clustering/plsa.html">
<b>11.2.</b>
Probabilistic Latent Semantic Analysis
</a>
</li>
<li class="header">Part XII - GeoSpatial Functions</li>
<li class="chapter " data-level="12.1" data-path="../geospatial/latlon.html">
<a href="../geospatial/latlon.html">
<b>12.1.</b>
Lat/Lon functions
</a>
</li>
<li class="header">Part XIII - Hivemall on SparkSQL</li>
<li class="chapter " data-level="13.1" data-path="../spark/getting_started/README.md">
<span>
<b>13.1.</b>
Getting Started
</span>
<ul class="articles">
<li class="chapter " data-level="13.1.1" data-path="../spark/getting_started/installation.html">
<a href="../spark/getting_started/installation.html">
<b>13.1.1.</b>
Installation
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="13.2" data-path="../spark/binaryclass/">
<a href="../spark/binaryclass/">
<b>13.2.</b>
Binary Classification
</a>
<ul class="articles">
<li class="chapter " data-level="13.2.1" data-path="../spark/binaryclass/a9a_sql.html">
<a href="../spark/binaryclass/a9a_sql.html">
<b>13.2.1.</b>
a9a Tutorial for SQL
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="13.3" data-path="../spark/binaryclass/">
<a href="../spark/binaryclass/">
<b>13.3.</b>
Regression
</a>
<ul class="articles">
<li class="chapter " data-level="13.3.1" data-path="../spark/regression/e2006_sql.html">
<a href="../spark/regression/e2006_sql.html">
<b>13.3.1.</b>
E2006-tfidf Regression Tutorial for SQL
</a>
</li>
</ul>
</li>
<li class="header">Part XIV - Hivemall on Docker</li>
<li class="chapter " data-level="14.1" data-path="../docker/getting_started.html">
<a href="../docker/getting_started.html">
<b>14.1.</b>
Getting Started
</a>
</li>
<li class="header">Part XV - External References</li>
<li class="chapter " data-level="15.1" >
<a target="_blank" href="https://github.com/daijyc/hivemall/wiki/PigHome">
<b>15.1.</b>
Hivemall on Apache Pig
</a>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="_blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >XGBoost</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<p>In this tutorial, we build a binary classification model using XGBoost.</p>
<!-- toc --><div id="toc" class="toc">
<ul>
<li><a href="#feature-vector-format-for-xgboost">Feature Vector format for XGBoost</a></li>
<li><a href="#label-format-in-binary-classification">Label format in Binary Classification</a></li>
<li><a href="#usage-and-hyperparameters">Usage and Hyperparameters</a></li>
<li><a href="#training">Training</a></li>
<li><a href="#prediction">prediction</a></li>
<li><a href="#evaluation">evaluation</a></li>
</ul>
</div><!-- tocstop -->
<h2 id="feature-vector-format-for-xgboost">Feature Vector format for XGBoost</h2>
<p>For feature vector, <code>train_xgboost</code> takes a sparse vector format (<code>array&lt;string&gt;</code>) or a dense vector format (<code>array&lt;double&gt;</code>).
In the feature vector, each feature takes a LIBSVM format:</p>
<pre><code>feature ::= &lt;index&gt;:&lt;weight&gt;
index ::= &lt;Non-negative INT&gt; (e.g., 0,1,2,...)
weight ::= &lt;DOUBLE&gt;
</code></pre><div class="panel panel-primary"><div class="panel-heading"><h3 class="panel-title" id="note"><i class="fa fa-edit"></i> Note</h3></div><div class="panel-body"><p>Unlike the original LIBSVM format, a feature vector does not need to be sorted in ascending order of feature index.</p></div></div>
<p>The target label format for binary classification follows <a href="http://hivemall.apache.org/userguide/getting_started/input-format.html#label-format-in-binary-classification" target="_blank">this rule</a>. Please refer to the <a href="https://xgboost.readthedocs.io/en/latest/tutorials/input_format.html" target="_blank">XGBoost documentation</a> as well.</p>
<h2 id="label-format-in-binary-classification">Label format in Binary Classification</h2>
<p>The label must be an INT typed column and the values are positive (+1) or negative (-1) as follows:</p>
<pre><code>&lt;label&gt; ::= 1 | -1
</code></pre><p>Alternatively, you can use the following format that represents 1 for a positive example and 0 for a negative example:</p>
<pre><code>&lt;label&gt; ::= 0 | 1
</code></pre><h2 id="usage-and-hyperparameters">Usage and Hyperparameters</h2>
<p>You can find the hyperparameters and their default settings by running the following query:</p>
<pre><code class="lang-sql">select train_xgboost();
usage: train_xgboost(array&lt;string|double&gt; features, int|double target [,
string options]) - Returns a relation consists of &lt;string model_id,
array&lt;string&gt; pred_model&gt; [-alpha &lt;arg&gt;] [-base_score &lt;arg&gt;]
[-booster &lt;arg&gt;] [-colsample_bylevel &lt;arg&gt;] [-colsample_bynode
&lt;arg&gt;] [-colsample_bytree &lt;arg&gt;] [-disable_default_eval_metric
&lt;arg&gt;] [-eta &lt;arg&gt;] [-eval_metric &lt;arg&gt;] [-feature_selector &lt;arg&gt;]
[-gamma &lt;arg&gt;] [-grow_policy &lt;arg&gt;] [-lambda &lt;arg&gt;] [-lambda_bias
&lt;arg&gt;] [-max_bin &lt;arg&gt;] [-max_delta_step &lt;arg&gt;] [-max_depth &lt;arg&gt;]
[-max_leaves &lt;arg&gt;] [-maximize_evaluation_metrics &lt;arg&gt;]
[-min_child_weight &lt;arg&gt;] [-normalize_type &lt;arg&gt;] [-num_class
&lt;arg&gt;] [-num_early_stopping_rounds &lt;arg&gt;] [-num_feature &lt;arg&gt;]
[-num_parallel_tree &lt;arg&gt;] [-num_pbuffer &lt;arg&gt;] [-num_round &lt;arg&gt;]
[-objective &lt;arg&gt;] [-one_drop &lt;arg&gt;] [-process_type &lt;arg&gt;]
[-rate_drop &lt;arg&gt;] [-refresh_leaf &lt;arg&gt;] [-sample_type &lt;arg&gt;]
[-scale_pos_weight &lt;arg&gt;] [-seed &lt;arg&gt;] [-silent &lt;arg&gt;]
[-sketch_eps &lt;arg&gt;] [-skip_drop &lt;arg&gt;] [-subsample &lt;arg&gt;] [-top_k
&lt;arg&gt;] [-tree_method &lt;arg&gt;] [-tweedie_variance_power &lt;arg&gt;]
[-updater &lt;arg&gt;] [-validation_ratio &lt;arg&gt;] [-verbosity &lt;arg&gt;]
-alpha,--reg_alpha &lt;arg&gt; L1 regularization term on weights.
Increasing this value will make
model more conservative. [default:
0.0]
-base_score &lt;arg&gt; Initial prediction score of all
instances, global bias [default:
0.5]
-booster &lt;arg&gt; Set a booster to use, gbtree or
gblinear or dart. [default: gbree]
-colsample_bylevel &lt;arg&gt; Subsample ratio of columns for each
level [default: 1.0]
-colsample_bynode &lt;arg&gt; Subsample ratio of columns for each
node [default: 1.0]
-colsample_bytree &lt;arg&gt; Subsample ratio of columns when
constructing each tree [default:
1.0]
-disable_default_eval_metric &lt;arg&gt; NFlag to disable default metric. Set
to &gt;0 to disable. [default: 0]
-eta,--learning_rate &lt;arg&gt; Step size shrinkage used in update
to prevents overfitting [default:
0.3]
-eval_metric &lt;arg&gt; Evaluation metrics for validation
data. A default metric is assigned
according to the objective:
- rmse: for regression
- error: for classification
- map: for ranking
For a list of valid inputs, see
XGBoost Parameters.
-feature_selector &lt;arg&gt; Feature selection and ordering
method. [Choices: cyclic (default),
shuffle, random, greedy, thrifty]
-gamma,--min_split_loss &lt;arg&gt; Minimum loss reduction required to
make a further partition on a leaf
node of the tree. [default: 0.0]
-grow_policy &lt;arg&gt; Controls a way new nodes are added
to the tree. Currently supported
only if tree_method is set to hist.
[default: depthwise, Choices:
depthwise, lossguide]
-lambda,--reg_lambda &lt;arg&gt; L2 regularization term on weights.
Increasing this value will make
model more conservative. [default:
1.0 for gbtree, 0.0 for gblinear]
-lambda_bias &lt;arg&gt; L2 regularization term on bias
[default: 0.0]
-max_bin &lt;arg&gt; Maximum number of discrete bins to
bucket continuous features. Only
used if tree_method is set to hist.
[default: 256]
-max_delta_step &lt;arg&gt; Maximum delta step we allow each
tree&apos;s weight estimation to be
[default: 0]
-max_depth &lt;arg&gt; Max depth of decision tree [default:
6]
-max_leaves &lt;arg&gt; Maximum number of nodes to be added.
Only relevant when
grow_policy=lossguide is set.
[default: 0]
-maximize_evaluation_metrics &lt;arg&gt; Maximize evaluation metrics
[default: false]
-min_child_weight &lt;arg&gt; Minimum sum of instance weight
(hessian) needed in a child
[default: 1.0]
-normalize_type &lt;arg&gt; Type of normalization algorithm.
[Choices: tree (default), forest]
-num_class &lt;arg&gt; Number of classes to classify
-num_early_stopping_rounds &lt;arg&gt; Minimum rounds required for early
stopping [default: 0]
-num_feature &lt;arg&gt; Feature dimension used in boosting
[default: set automatically by
xgboost]
-num_parallel_tree &lt;arg&gt; Number of parallel trees constructed
during each iteration. This option
is used to support boosted random
forest. [default: 1]
-num_pbuffer &lt;arg&gt; Size of prediction buffer [default:
set automatically by xgboost]
-num_round,--iters &lt;arg&gt; Number of boosting iterations
[default: 10]
-objective &lt;arg&gt; Specifies the learning task and the
corresponding learning objective.
Examples: reg:linear, reg:logistic,
multi:softmax. For a full list of
valid inputs, refer to XGBoost
Parameters. [default: reg:linear]
-one_drop &lt;arg&gt; When this flag is enabled, at least
one tree is always dropped during
the dropout. 0 or 1. [default: 0]
-process_type &lt;arg&gt; A type of boosting process to run.
[Choices: default, update]
-rate_drop &lt;arg&gt; Dropout rate in range [0.0, 1.0].
[default: 0.0]
-refresh_leaf &lt;arg&gt; This is a parameter of the refresh
updater plugin. When this flag is 1,
tree leafs as well as tree nodes&#x2019;
stats are updated. When it is 0,
only node stats are updated.
[default: 1]
-sample_type &lt;arg&gt; Type of sampling algorithm.
[Choices: uniform (default),
weighted]
-scale_pos_weight &lt;arg&gt; Control the balance of positive and
negative weights, useful for
unbalanced classes. A typical value
to consider: sum(negative instances)
/ sum(positive instances) [default:
1.0]
-seed &lt;arg&gt; Random number seed. [default: 43]
-silent &lt;arg&gt; Deprecated. Please use verbosity
instead. 0 means printing running
messages, 1 means silent mode
[default: 1]
-sketch_eps &lt;arg&gt; This roughly translates into O(1 /
sketch_eps) number of bins.
Compared to directly select number
of bins, this comes with theoretical
guarantee with sketch accuracy.
Only used for tree_method=approx.
Usually user does not have to tune
this. [default: 0.03]
-skip_drop &lt;arg&gt; Probability of skipping the dropout
procedure during a boosting
iteration in range [0.0, 1.0].
[default: 0.0]
-subsample &lt;arg&gt; Subsample ratio of the training
instance in range (0.0,1.0]
[default: 1.0]
-top_k &lt;arg&gt; The number of top features to select
in greedy and thrifty feature
selector. The value of 0 means using
all the features. [default: 0]
-tree_method &lt;arg&gt; The tree construction algorithm used
in XGBoost. [default: auto, Choices:
auto, exact, approx, hist]
-tweedie_variance_power &lt;arg&gt; Parameter that controls the variance
of the Tweedie distribution in range
[1.0, 2.0]. [default: 1.5]
-updater &lt;arg&gt; A comma-separated string that
defines the sequence of tree
updaters to run. For a full list of
valid inputs, please refer to
XGBoost Parameters. [default:
&apos;grow_colmaker,prune&apos; for gbtree,
&apos;shotgun&apos; for gblinear]
-validation_ratio &lt;arg&gt; Validation ratio in range [0.0,1.0]
[default: 0.2]
-verbosity &lt;arg&gt; Verbosity of printing messages.
Choices: 0 (silent), 1 (warning), 2
(info), 3 (debug). [default: 0]
</code></pre>
<p>The objective function <code>-objective</code> SHOULD be specified explicitly, although <code>-objective reg:linear</code> is used by default.
For the full list of objective functions, please refer to <a href="https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters" target="_blank">this xgboost v0.90 documentation</a>.</p>
<p>The following objectives are widely used for regression, binary classification, and multiclass classification, respectively.</p>
<ul>
<li><code>reg:squarederror</code> regression with squared loss.</li>
<li><code>binary:logistic</code> logistic regression for binary classification, output probability.</li>
<li><code>binary:hinge</code> hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.</li>
<li><code>multi:softmax</code> set XGBoost to do multiclass classification using the softmax objective, you also need to set <code>num_class</code> (number of classes).</li>
<li><code>multi:softprob</code> same as softmax, but outputs a vector of <code>ndata * nclass</code> values, which can be further reshaped to an <code>ndata * nclass</code> matrix. The result contains the predicted probability of each data point belonging to each class.</li>
</ul>
<p>Other hyperparameters worth tuning are:</p>
<ul>
<li><code>-booster gbtree</code> Which booster to use. The default gbtree (Gradient Boosting Trees) would be fine for most cases. Can be <code>gbtree</code>, <code>gblinear</code> or <code>dart</code>; gbtree and dart use tree based models while gblinear uses linear functions.</li>
<li><code>-eta 0.1</code> The learning rate, 0.3 by default. 0.05, 0.1, 0.3 are worth trying.</li>
<li><code>-max_depth 6</code> The maximum depth of the tree. The default value 6 would be fine for most cases. Recommended value range is 5-10.</li>
<li><code>-num_class 3</code> The number of classes MUST be specified for multiclass classification (i.e., <code>-objective multi:softmax</code> or <code>-objective multi:softprob</code>)</li>
<li><code>-num_round 10</code> The number of rounds for boosting. 10 or more would be preferred.</li>
<li><code>-num_early_stopping_rounds 3</code> The number of rounds required for early stopping. Without specifying <code>-num_early_stopping_rounds</code>, early stopping is NOT carried out. When <code>-num_round=100</code> and <code>-num_early_stopping_rounds=5</code>, training could be stopped early at the 15th iteration if there is no evaluation result better than the 10th iteration&apos;s (best one). Setting early stopping rounds to 3 or so would be preferred. </li>
<li><code>-validation_ratio 0.2</code> The ratio of data used for validation (early stopping). 0.2 would be enough for most cases. Note that 80% of the data is used for training when <code>-validation_ratio 0.2</code> is set.</li>
</ul>
<p>You can find the underlying XGBoost version by:</p>
<pre><code class="lang-sql">select xgboost_version();
&gt; 0.90
</code></pre>
<h2 id="training">Training</h2>
<p><code>train_xgboost</code> UDTF is used for training. </p>
<p>The function signature is <code>train_xgboost(array&lt;string|double&gt; features, double target [,string options])</code> and it returns a prediction model as a relation consisting of <code>&lt;string model_id, array&lt;string&gt; pred_model&gt;</code>.</p>
<pre><code class="lang-sql"><span class="hljs-comment">-- explicitly use 3 reducers</span>
<span class="hljs-comment">-- set mapred.reduce.tasks=3;</span>
<span class="hljs-keyword">drop</span> <span class="hljs-keyword">table</span> xgb_lr_model;
<span class="hljs-keyword">create</span> <span class="hljs-keyword">table</span> xgb_lr_model <span class="hljs-keyword">as</span>
<span class="hljs-keyword">select</span>
train_xgboost(features, label, <span class="hljs-string">&apos;-objective binary:logistic -num_round 10 -num_early_stopping_rounds 3&apos;</span>)
<span class="hljs-keyword">as</span> (model_id, <span class="hljs-keyword">model</span>)
<span class="hljs-keyword">from</span> (
<span class="hljs-keyword">select</span> features, label
<span class="hljs-keyword">from</span> news20b_train
cluster <span class="hljs-keyword">by</span> <span class="hljs-keyword">rand</span>(<span class="hljs-number">43</span>) <span class="hljs-comment">-- shuffle data to reducers</span>
) shuffled;
<span class="hljs-keyword">drop</span> <span class="hljs-keyword">table</span> xgb_hinge_model;
<span class="hljs-keyword">create</span> <span class="hljs-keyword">table</span> xgb_hinge_model <span class="hljs-keyword">as</span>
<span class="hljs-keyword">select</span>
train_xgboost(features, label, <span class="hljs-string">&apos;-objective binary:hinge -num_round 10 -num_early_stopping_rounds 3&apos;</span>)
<span class="hljs-keyword">as</span> (model_id, <span class="hljs-keyword">model</span>)
<span class="hljs-keyword">from</span> (
<span class="hljs-keyword">select</span> features, label
<span class="hljs-keyword">from</span> news20b_train
cluster <span class="hljs-keyword">by</span> <span class="hljs-keyword">rand</span>(<span class="hljs-number">43</span>) <span class="hljs-comment">-- shuffle data to reducers</span>
) shuffled;
</code></pre>
<div class="panel panel-warning"><div class="panel-heading"><h3 class="panel-title" id="caution"><i class="fa fa-exclamation-triangle"></i> Caution</h3></div><div class="panel-body"><p><code>cluster by rand()</code> is NOT required when training data is small and a single task is launched for XGBoost training.
<code>cluster by rand()</code> shuffles data at random and divides it among multiple XGBoost instances.</p></div></div>
<h2 id="prediction">Prediction</h2>
<pre><code class="lang-sql"><span class="hljs-keyword">drop</span> <span class="hljs-keyword">table</span> xgb_lr_predicted;
<span class="hljs-keyword">create</span> <span class="hljs-keyword">table</span> xgb_lr_predicted
<span class="hljs-keyword">as</span>
<span class="hljs-keyword">select</span>
<span class="hljs-keyword">rowid</span>,
array_avg(predicted) <span class="hljs-keyword">as</span> predicted,
<span class="hljs-keyword">avg</span>(predicted[<span class="hljs-number">0</span>]) <span class="hljs-keyword">as</span> prob
<span class="hljs-keyword">from</span> (
<span class="hljs-keyword">select</span>
<span class="hljs-comment">-- fast prediction by xgboost-predictor-java (https://github.com/komiya-atsushi/xgboost-predictor-java/)</span>
xgboost_predict(<span class="hljs-keyword">rowid</span>, features, model_id, <span class="hljs-keyword">model</span>) <span class="hljs-keyword">as</span> (<span class="hljs-keyword">rowid</span>, predicted)
<span class="hljs-comment">-- predict by xgboost4j (https://xgboost.readthedocs.io/en/stable/jvm/)</span>
<span class="hljs-comment">-- xgboost_batch_predict(rowid, features, model_id, model) as (rowid, predicted)</span>
<span class="hljs-keyword">from</span>
<span class="hljs-comment">-- for each model l </span>
<span class="hljs-comment">-- for each test r</span>
<span class="hljs-comment">-- predict</span>
xgb_lr_model l
<span class="hljs-keyword">LEFT</span> <span class="hljs-keyword">OUTER</span> <span class="hljs-keyword">JOIN</span> news20b_test r
) t
<span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span> <span class="hljs-keyword">rowid</span>;
<span class="hljs-keyword">drop</span> <span class="hljs-keyword">table</span> xgb_hinge_predicted;
<span class="hljs-keyword">create</span> <span class="hljs-keyword">table</span> xgb_hinge_predicted
<span class="hljs-keyword">as</span>
<span class="hljs-keyword">select</span>
<span class="hljs-keyword">rowid</span>,
<span class="hljs-comment">-- voting</span>
<span class="hljs-comment">-- if(sum(if(predicted[0]=1,1,0)) &gt; sum(if(predicted[0]=0,1,0)),1,-1) as predicted</span>
majority_vote(<span class="hljs-keyword">if</span>(predicted[<span class="hljs-number">0</span>]=<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">-1</span>)) <span class="hljs-keyword">as</span> predicted
<span class="hljs-keyword">from</span> (
<span class="hljs-keyword">select</span>
<span class="hljs-comment">-- binary:hinge is not supported in xgboost_predict</span>
<span class="hljs-comment">-- binary:hinge returns [1.0] or [0.0] for predicted</span>
xgboost_batch_predict(<span class="hljs-keyword">rowid</span>, features, model_id, <span class="hljs-keyword">model</span>)
<span class="hljs-keyword">as</span> (<span class="hljs-keyword">rowid</span>, predicted)
<span class="hljs-keyword">from</span>
<span class="hljs-comment">-- for each model l </span>
<span class="hljs-comment">-- for each test r</span>
<span class="hljs-comment">-- predict</span>
xgb_hinge_model l
<span class="hljs-keyword">LEFT</span> <span class="hljs-keyword">OUTER</span> <span class="hljs-keyword">JOIN</span> news20b_test r
) t
<span class="hljs-keyword">group</span> <span class="hljs-keyword">by</span>
<span class="hljs-keyword">rowid</span>
</code></pre>
<p>You can find the function signatures of <code>xgboost_predict</code> and <code>xgboost_batch_predict</code> by:</p>
<pre><code class="lang-sql">select xgboost_predict();
usage: xgboost_predict(PRIMITIVE rowid, array&lt;string|double&gt; features,
string model_id, array&lt;string&gt; pred_model [, string options]) -
Returns a prediction result as (string rowid, array&lt;double&gt;
predicted)
select xgboost_batch_predict();
usage: xgboost_batch_predict(PRIMITIVE rowid, array&lt;string|double&gt;
features, string model_id, array&lt;string&gt; pred_model [, string
options]) - Returns a prediction result as (string rowid,
array&lt;double&gt; predicted) [-batch_size &lt;arg&gt;]
-batch_size &lt;arg&gt; Number of rows to predict together [default: 128]
</code></pre>
<div class="panel panel-warning"><div class="panel-heading"><h3 class="panel-title" id="caution"><i class="fa fa-exclamation-triangle"></i> Caution</h3></div><div class="panel-body"><p><code>xgboost_predict</code> outputs a probability for <code>-objective binary:logistic</code>, while 0/1 is returned for <code>-objective binary:hinge</code>.</p><p><code>xgboost_predict</code> only supports the following models and objectives because it uses <a href="https://github.com/komiya-atsushi/xgboost-predictor-java" target="_blank">xgboost-predictor-java</a>:
Models: {gblinear, gbtree, dart}
Objective functions: {binary:logistic, binary:logitraw, multi:softmax, multi:softprob, reg:linear, reg:squarederror, rank:pairwise}</p><p>For other models and objectives, please use <code>xgboost_batch_predict</code> that uses <a href="https://xgboost.readthedocs.io/en/stable/jvm/" target="_blank">xgboost4j</a> instead.</p></div></div>
<h2 id="evaluation">Evaluation</h2>
<pre><code class="lang-sql">WITH submit as (
<span class="hljs-keyword">select</span>
t.label <span class="hljs-keyword">as</span> actual,
<span class="hljs-comment">-- probability thresholding by 0.5</span>
<span class="hljs-keyword">if</span>(p.prob &gt; <span class="hljs-number">0.5</span>,<span class="hljs-number">1</span>,<span class="hljs-number">-1</span>) <span class="hljs-keyword">as</span> predicted
<span class="hljs-keyword">from</span>
news20b_test t
<span class="hljs-keyword">JOIN</span> xgb_lr_predicted p
<span class="hljs-keyword">on</span> (t.<span class="hljs-keyword">rowid</span> = p.<span class="hljs-keyword">rowid</span>)
)
<span class="hljs-keyword">select</span>
<span class="hljs-keyword">sum</span>(<span class="hljs-keyword">if</span>(actual = predicted, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>)) / <span class="hljs-keyword">count</span>(<span class="hljs-number">1</span>) <span class="hljs-keyword">as</span> accuracy
<span class="hljs-keyword">from</span>
submit;
</code></pre>
<blockquote>
<p>0.8372698158526821 (logistic loss)</p>
</blockquote>
<pre><code class="lang-sql">WITH submit as (
<span class="hljs-keyword">select</span>
t.label <span class="hljs-keyword">as</span> actual,
p.predicted
<span class="hljs-keyword">from</span>
news20b_test t
<span class="hljs-keyword">JOIN</span> xgb_hinge_predicted p
<span class="hljs-keyword">on</span> (t.<span class="hljs-keyword">rowid</span> = p.<span class="hljs-keyword">rowid</span>)
)
<span class="hljs-keyword">select</span>
<span class="hljs-keyword">sum</span>(<span class="hljs-keyword">if</span>(actual=predicted,<span class="hljs-number">1</span>,<span class="hljs-number">0</span>)) / <span class="hljs-keyword">count</span>(<span class="hljs-number">1</span>) <span class="hljs-keyword">as</span> accuracy
<span class="hljs-keyword">from</span>
submit;
</code></pre>
<blockquote>
<p>0.7752201761409128 (hinge loss)</p>
</blockquote>
<div id="page-footer" class="localized-footer"><hr><!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<p><sub><font color="gray">
Apache Hivemall is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator.
</font></sub></p>
</div>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"XGBoost","level":"6.3.8","depth":2,"next":{"title":"KDD2010a Tutorial","level":"6.4","depth":1,"path":"binaryclass/kdd2010a.md","ref":"binaryclass/kdd2010a.md","articles":[{"title":"Data Preparation","level":"6.4.1","depth":2,"path":"binaryclass/kdd2010a_dataset.md","ref":"binaryclass/kdd2010a_dataset.md","articles":[]},{"title":"PA, CW, AROW, SCW","level":"6.4.2","depth":2,"path":"binaryclass/kdd2010a_scw.md","ref":"binaryclass/kdd2010a_scw.md","articles":[]}]},"previous":{"title":"Random Forest","level":"6.3.7","depth":2,"path":"binaryclass/news20_rf.md","ref":"binaryclass/news20_rf.md","articles":[]},"dir":"ltr"},"config":{"plugins":["theme-api","edit-link","github","splitter","etoc","callouts","toggle-chapters","anchorjs","codeblock-filename","expandable-chapters","multipart","codeblock-filename","katex","emphasize","localized-footer"],"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"pluginsConfig":{"emphasize":{},"callouts":{},"etoc":{"h2lb":3,"header":1,"maxdepth":3,"mindepth":1,"notoc":true},"github":{"url":"https://github.com/apache/incubator-hivemall/"},"splitter":{},"search":{},"downloadpdf":{"base":"https://github.com/apache/incubator-hivemall/docs/gitbook","label":"PDF","multilingual":false},"multipart":{},"localized-footer":{"filename":"FOOTER.md","hline":"true"},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"katex":{},"fontsettings":{"theme":"white","family":"sans","size":2,"font":"sans"},"highlight":{},"codeblock-filename":{},"theme-api":{"languages":[],"split":false,"theme":"dark"},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"edit-link":{"label":"Edit","base":"https://github.com/apache/incubator-hivemall/tree/master/docs/gitbook"},"theme-default":{"styles":{
"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":true},"anchorjs":{"selector":"h1,h2,h3,*:not(.callout) > h4,h5"},"toggle-chapters":{},"expandable-chapters":{}},"theme":"default","pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"variables":{},"title":"Hivemall User Manual","links":{"sidebar":{"<i class=\"fa fa-home\"></i> Home":"https://hivemall.incubator.apache.org/"}},"gitbook":"3.x.x","description":"User Manual for Apache Hivemall"},"file":{"path":"binaryclass/news20b_xgboost.md","mtime":"2021-04-22T11:42:38.110Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2021-04-22T11:56:59.644Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-edit-link/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-github/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-splitter/splitter.js"></script>
<script src="../gitbook/gitbook-plugin-etoc/plugin.js"></script>
<script src="../gitbook/gitbook-plugin-toggle-chapters/toggle.js"></script>
<script src="../gitbook/gitbook-plugin-anchorjs/anchor.min.js"></script>
<script src="../gitbook/gitbook-plugin-anchorjs/anchor-style.js"></script>
<script src="../gitbook/gitbook-plugin-expandable-chapters/expandable-chapters.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
<script src="../gitbook/gitbook-plugin-theme-api/theme-api.js"></script>
</body>
</html>