/* ----------------------------------------------------------------------- *//**
*
* @file DT_proto.hpp
*
*//* ----------------------------------------------------------------------- */
#ifndef MADLIB_MODULES_RP_DT_PROTO_HPP
#define MADLIB_MODULES_RP_DT_PROTO_HPP
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <iterator>
#include <limits> // std::numeric_limits
#include <dbconnector/dbconnector.hpp>
namespace madlib {
namespace modules {
namespace recursive_partitioning {
// Use Eigen
using namespace dbal;
using namespace dbal::eigen_integration;
using std::vector;
using std::string;
// stats_per_split is the number of statistics that need to be accumulated for
// a split. This differs between classification and regression: the value for
// classification is a function of the number of response values and is
// computed at runtime, whereas the value for regression is a constant.
// For classification, accumulate the following:
//   the weighted tuple count for each possible response value, plus the
//   unweighted tuple count
// For regression, accumulate 4 values for evaluating a split:
//   weight, weight * response, weight * response^2, num of unweighted rows
const uint16_t REGRESS_N_STATS = 4u;
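// Illustrative example: for a 3-class classification problem the per-split
// statistics vector is
//   (w_1, w_2, w_3, n_unweighted)  => stats_per_split = 3 + 1 = 4
// while for regression it is always
//   (w, w*y, w*y^2, n_unweighted)  => stats_per_split = REGRESS_N_STATS = 4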
template <class Container>
class DecisionTree: public DynamicStruct<DecisionTree<Container>, Container> {
public:
typedef DynamicStruct<DecisionTree, Container> Base;
MADLIB_DYNAMIC_STRUCT_TYPEDEFS;
enum { MSE, MISCLASS, ENTROPY, GINI };
enum { IN_PROCESS_LEAF=-1, FINISHED_LEAF=-2, NODE_NON_EXISTING=-3 };
enum { SURR_IS_MAJORITY=-1, SURR_NON_EXISTING=-2 };
// functions
DecisionTree(Init_type& inInitialization);
DecisionTree();
void bind(ByteStream_type& inStream);
DecisionTree& rebind(const uint16_t in_tree_depth,
const uint16_t in_n_y_labels,
const uint16_t in_max_n_surr,
const bool in_is_regression);
DecisionTree& incrementInPlace();
DecisionTree& my_tree();
Index search(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
uint64_t getMajorityCount(Index node_index) const;
bool getMajoritySplit(Index node_index) const;
bool getSurrSplit(Index node_index,
MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
ColumnVector predict(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
double predict_response(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
double predict_response(Index feature_index) const;
bool updatePrimarySplit(const Index node_index,
const int & max_feat,
const double & max_threshold,
const bool & max_is_cat,
const uint16_t & min_split,
const ColumnVector &true_stats,
const ColumnVector &false_stats);
template <class Accumulator>
bool expand(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth);
template <class Accumulator>
bool expand_by_sampling(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth,
const int &num_random_features);
Index parentIndex(Index current) const {
return current > 0 ? static_cast<Index>((current - 1) / 2) : 0;
}
Index trueChild(Index current) const { return 2 * current + 1; }
Index falseChild(Index current) const { return 2 * current + 2; }
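// Illustrative note: nodes are stored in implicit breadth-first (heap) order:
//            0
//          1   2
//         3 4 5 6
// e.g. trueChild(1) == 3, falseChild(1) == 4, and parentIndex(4) == 1.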
bool isInternalNode(const Index node_index) const {
// Internal nodes have a valid index pointing to the feature
// used in the split of the node.
return feature_indices(node_index) >= 0;
}
double impurity(const ColumnVector & stats) const;
double impurityGain(const ColumnVector &combined_stats,
const uint16_t &stats_per_split) const;
bool shouldSplit(const uint64_t &total_count,
const uint64_t &true_count,
const uint64_t &false_count,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth) const;
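// Illustrative note (rpart-style semantics assumed, not verified from this
// header): a node is split only if it contains at least min_split rows, each
// resulting child would contain at least min_bucket rows, and the node's
// depth is still below max_depth.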
template <class Accumulator>
void pickSurrogates(const Accumulator &state,
const MappedMatrix &con_splits);
ColumnVector statPredict(const ColumnVector &stats) const;
uint64_t statCount(const ColumnVector &stats) const;
double statWeightedCount(const ColumnVector &stats) const;
uint64_t nodeCount(const Index node_index) const;
double nodeWeightedCount(const Index node_index) const;
double computeMisclassification(const Index node_index) const;
double computeRisk(const Index node_index) const;
bool isChildPure(const ColumnVector &stats) const;
bool isNull(const double feature_val, const bool is_categorical) const {
return (is_categorical) ? (feature_val < 0) : std::isnan(feature_val);
}
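// Illustrative usage (encoding follows the body above): a categorical NULL
// is encoded as a negative level value, a continuous NULL as NaN, so
//   isNull(-1.0, true) == true
//   isNull(std::numeric_limits<double>::quiet_NaN(), false) == true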
uint16_t recomputeTreeDepth() const;
string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix, bool verbose);
string displayInternalNode(Index id,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dep_levels,
const std::string & id_prefix,
bool verbose);
string display(ArrayHandle<text*>&, ArrayHandle<text*>&, ArrayHandle<text*>&,
ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&, bool verbose);
string getCatLabels(Index, Index, Index, ArrayHandle<text*> &,
ArrayHandle<int> &);
string print_split(bool, bool, Index, double,
ArrayHandle<text*> &, ArrayHandle<text*> &,
ArrayHandle<text*> &, ArrayHandle<int> &);
string print(Index, ArrayHandle<text*>&, ArrayHandle<text*>&,
ArrayHandle<text*>&, ArrayHandle<int>&,
ArrayHandle<text*>&, uint16_t);
string surr_display(
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels);
int encodeIndex(const int &feature_index,
const int &is_categorical,
const int &n_cat_features) const;
void computeVariableImportance(ColumnVector& cat_var_importance,
ColumnVector& con_var_importance);
// attributes
// dimension information
uint16_type tree_depth; // 1 for root-only tree
uint16_type n_y_labels;
uint16_type max_n_surr;
bool_type is_regression; // = 0 for classification, = 1 for regression
uint16_type impurity_type; // can be 'mse', 'gini', 'entropy', 'misclass'
// All vectors below are of size n_nodes ( = 2^{tree_depth} - 1 )
// Note: in order to make DynamicStruct::DryRun work,
// non-scalars should be of size 0 when dimension is 0
// The complete tree is broken into multiple vectors: each vector being the
// collection of a single variable for all nodes.
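// e.g. a tree with tree_depth = 3 stores n_nodes = 2^3 - 1 = 7 entries in
// each of these vectors, whether or not all 7 nodes actually exist.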
// -1 means leaf, -2 means non-existing node
IntegerVector_type feature_indices;
// elements are used as integer values for categorical features
ColumnVector_type feature_thresholds;
// used as boolean array, 0 means continuous, otherwise categorical
IntegerVector_type is_categorical;
// Used to keep count of the number of non-null rows that are split to the
// left and right children for each internal node.
// For leaf nodes, the value is 0.
// This is useful when using surrogates to compute the majority count/split
// for an internal node. Size = n_nodes x 2
ColumnVector_type nonnull_split_count;
// FIXME: we use a ColumnVector (elements of 'double' type) to store
// big integer values, since we don't yet have a vector type with uint64_t elements.
// 'surr_indices' is of size n_nodes x max_n_surr
// If a particular node has fewer than max_n_surr surrogates, then
// the indices of the non-existing surrogates will be -1
IntegerVector_type surr_indices;
ColumnVector_type surr_thresholds; // used as integer if classification
// used as enumerated array to record the status of a surrogate split
// 0 = default value indicating invalid surrogate split
// 1 = categorical feature <= threshold
// -1 = categorical feature > threshold
// 2 = continuous feature <= threshold
// -2 = continuous feature > threshold
IntegerVector_type surr_status;
// used to keep count of the number of rows on which the primary and the
// surrogate splits agree
IntegerVector_type surr_agreement;
// 'predictions' is a matrix of size n_nodes x n_predictions,
// where n_predictions = stats_per_split;
// stats_per_split is set by the accumulator during training of the tree
Matrix_type predictions; // used as integer values for classification
};
// ------------------------------------------------------------------------
// TreeAccumulator is used for collecting statistics while training the nodes
// of a single level of the decision tree. The same accumulator is also used
// for computing the statistics for surrogate splits.
template <class Container, class DTree>
class TreeAccumulator
: public DynamicStruct<TreeAccumulator<Container, DTree>, Container> {
public:
typedef DynamicStruct<TreeAccumulator, Container> Base;
MADLIB_DYNAMIC_STRUCT_TYPEDEFS;
typedef DTree tree_type;
typedef std::tuple< tree_type,
MappedIntegerVector, // categorical feature values
MappedColumnVector, // continuous feature values
double, // response variable
double, // weight
MappedIntegerVector, // levels for each categorical feature
MappedMatrix // split values for each continuous feature
> tuple_type;
typedef std::tuple< tree_type,
MappedIntegerVector, // categorical feature values
MappedColumnVector, // continuous feature values
MappedIntegerVector, // levels for each categorical feature
MappedMatrix, // split values for each continuous feature
int // duplicate count for each tuple
// (used in random forest)
> surr_tuple_type;
// functions
TreeAccumulator(Init_type& inInitialization);
void bind(ByteStream_type& inStream);
void rebind(uint16_t n_bins, uint16_t n_cat_feat,
uint16_t n_con_feat, uint32_t n_total_levels,
uint16_t tree_depth, uint16_t n_stats, bool weights_as_rows,
uint32_t n_reachable_leaf_nodes);
TreeAccumulator& operator<<(const tuple_type& inTuple);
TreeAccumulator& operator<<(const surr_tuple_type& inTuple);
template <class C, class DT>
TreeAccumulator& operator<<(const TreeAccumulator<C, DT>& inOther);
bool empty() const { return this->n_rows == 0; }
// cat_features[feature_index] <= cat_value
Index indexCatStats(Index feature_index, int cat_value, bool is_split_true) const;
// con_features[feature_index] <= bin_threshold,
// bin_threshold = con_splits[feature_index, bin_index]
Index indexConStats(Index feature_index, Index bin_index, bool is_split_true) const;
Index computeSubIndex(Index start_Index, Index relative_index, bool is_split_true) const;
void updateNodeStats(bool is_regression, Index row_index,
const double response, const double weight);
// apply the tuple using indices
void updateStats(bool is_regression, bool is_cat, Index row_index,
Index stats_index, const double response,
const double weight);
// apply the tuple using indices
void updateSurrStats(const bool is_cat, const bool surr_agrees,
Index row_index, Index stats_index, const int dup_count);
// attributes
uint64_type n_rows; // number of rows mapped to this node
// a NULL state is returned if the accumulation is terminated due to an error
bool_type terminated;
// dimension information
uint16_type n_bins;
uint16_type n_cat_features;
uint16_type n_con_features;
// sum of num of levels in each categorical variable
uint32_type total_n_cat_levels;
// n_leaf_nodes = 2^{dt.tree_depth-1} for dt.tree_depth > 0
uint32_type n_leaf_nodes;
// Not all "leaf" nodes at a tree level are reachable. A leaf becomes
// non-reachable when one of its ancestor is itself a leaf.
// For a full tree, n_leaf_nodes = n_reachable_leaf_nodes
uint32_type n_reachable_leaf_nodes;
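// Illustrative example: with dt.tree_depth = 3, n_leaf_nodes = 2^{3-1} = 4;
// if one node at the previous level already finished as a leaf, its two
// children are unreachable and n_reachable_leaf_nodes = 2.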
// For regression, stats_per_split = 4, i.e. (w, w*y, w*y^2, 1)
// For classification, stats_per_split = (number of class labels + 1)
// i.e. (w_1, w_2, ..., w_c, 1)
// For surrogates calculation, stats_per_split = 1
uint16_type stats_per_split;
// treat weights as duplicated rows (used for random forest)
bool_type weights_as_rows;
// training statistics
// Cumulative sum of the number of levels of each categorical variable, with
// the first element as 0. This is helpful for computing the index into
// cat_stats for each categorical variable.
// size = n_cat_features
// last element = total_n_cat_levels - (number of levels of the last
// categorical variable)
IntegerVector_type cat_levels_cumsum; // used as integer array
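// Illustrative example: for three categorical features with 2, 3 and 4
// levels respectively, total_n_cat_levels = 9 and
// cat_levels_cumsum = [0, 2, 5]
// (size = 3 = n_cat_features; last element 5 = 9 - 4).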
// con_stats and cat_stats are matrices that contain the statistics used
// during training.
// cat_stats is a matrix of size:
// (n_reachable_leaf_nodes) x (total_n_cat_levels * stats_per_split * 2)
Matrix_type cat_stats;
// con_stats is a matrix:
// (n_reachable_leaf_nodes) x (n_con_features * n_bins * stats_per_split * 2)
Matrix_type con_stats;
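// Illustrative sizing sketch (example values assumed): with stats_per_split = 4
// (regression), n_bins = 5, n_con_features = 2 and total_n_cat_levels = 9,
// each cat_stats row holds 9 * 4 * 2 = 72 values and each con_stats row holds
// 2 * 5 * 4 * 2 = 80 values; the factor of 2 presumably separates the
// true/false branch statistics (cf. the is_split_true flag of
// indexCatStats/indexConStats).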
// node_stats keeps statistics of all the rows that land on a node; these
// statistics are used to determine the prediction made by the node.
// In the absence of any NULL values, these stats can be deduced from
// cat_stats/con_stats; in the presence of NULL values, they can differ.
Matrix_type node_stats;
// The above stats matrices are used as pseudo-sparse matrices, since not all
// leaf nodes are reachable (especially as the tree gets deeper).
IntegerVector_type stats_lookup;
};
// ------------------------------------------------------------------------
} // namespace recursive_partitioning
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_RP_DT_PROTO_HPP)