/* ----------------------------------------------------------------------- *//**
*
* @file DT_proto.hpp
*
*//* ----------------------------------------------------------------------- */
#ifndef MADLIB_MODULES_RP_DT_PROTO_HPP
#define MADLIB_MODULES_RP_DT_PROTO_HPP
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <iterator>
#include <limits> // std::numeric_limits
#include <dbconnector/dbconnector.hpp>
namespace madlib {
namespace modules {
namespace recursive_partitioning {
// Use Eigen
using namespace dbal;
using namespace dbal::eigen_integration;
using std::vector;
using std::string;
// stats_per_split is the number of statistics that need to be accumulated for
// a split. This differs between classification and regression: the value for
// classification is a function of the number of response values and is
// computed at runtime, whereas the value for regression is a constant.
// For classification, accumulate the following:
//   the weighted tuple count for each possible response value, plus the
//   unweighted tuple count
// For regression, accumulate 4 values for evaluating a split:
//   weight, weight * response, weight * response^2, num of unweighted rows
const uint16_t REGRESS_N_STATS = 4u;
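// Illustrative example: for a 3-class classification problem the per-split
// statistics vector is
//   (w_1, w_2, w_3, n_unweighted)  => stats_per_split = 3 + 1 = 4
// while for regression it is always
//   (w, w*y, w*y^2, n_unweighted)  => stats_per_split = REGRESS_N_STATS = 4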
template <class Container>
class DecisionTree: public DynamicStruct<DecisionTree<Container>, Container> {
public:
typedef DynamicStruct<DecisionTree, Container> Base;
MADLIB_DYNAMIC_STRUCT_TYPEDEFS;
enum { MSE, MISCLASS, ENTROPY, GINI };
enum { IN_PROCESS_LEAF=-1, FINISHED_LEAF=-2, NODE_NON_EXISTING=-3 };
enum { SURR_IS_MAJORITY=-1, SURR_NON_EXISTING=-2 };
// functions
DecisionTree(Init_type& inInitialization);
DecisionTree();
void bind(ByteStream_type& inStream);
DecisionTree& rebind(const uint16_t in_tree_depth,
const uint16_t in_n_y_labels,
const uint16_t in_max_n_surr,
const bool in_is_regression);
DecisionTree& incrementInPlace();
DecisionTree& my_tree();
Index search(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
uint64_t getMajorityCount(Index node_index) const;
bool getMajoritySplit(Index node_index) const;
bool getSurrSplit(Index node_index,
MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
ColumnVector predict(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
double predict_response(MappedIntegerVector cat_features,
MappedColumnVector con_features) const;
double predict_response(Index feature_index) const;
bool updatePrimarySplit(const Index node_index,
const int & max_feat,
const double & max_threshold,
const bool & max_is_cat,
const uint16_t & min_split,
const ColumnVector &true_stats,
const ColumnVector &false_stats);
template <class Accumulator>
bool expand(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth);
template <class Accumulator>
bool expand_by_sampling(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth,
const int &num_random_features);
Index parentIndex(Index current) const {
return current > 0 ? static_cast<Index>((current - 1) / 2) : 0;
}
Index trueChild(Index current) const { return 2 * current + 1; }
Index falseChild(Index current) const { return 2 * current + 2; }
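// Illustrative note: nodes are stored in implicit breadth-first (heap) order:
//            0
//          1   2
//         3 4 5 6
// e.g. trueChild(1) == 3, falseChild(1) == 4, and parentIndex(4) == 1.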
bool isInternalNode(const Index node_index) const {
// Internal nodes have a valid index pointing to the feature
// used in the split of the node.
return feature_indices(node_index) >= 0;
}
double impurity(const ColumnVector & stats) const;
double impurityGain(const ColumnVector &combined_stats,
const uint16_t &stats_per_split) const;
bool shouldSplit(const uint64_t &total_count,
const uint64_t &true_count,
const uint64_t &false_count,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth) const;
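// Illustrative note (rpart-style semantics assumed, not verified from this
// header): a node is split only if it contains at least min_split rows, each
// resulting child would contain at least min_bucket rows, and the node's
// depth is still below max_depth.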
template <class Accumulator>
void pickSurrogates(const Accumulator &state,
const MappedMatrix &con_splits);
ColumnVector statPredict(const ColumnVector &stats) const;
uint64_t statCount(const ColumnVector &stats) const;
double statWeightedCount(const ColumnVector &stats) const;
uint64_t nodeCount(const Index node_index) const;
double nodeWeightedCount(const Index node_index) const;
double computeMisclassification(const Index node_index) const;
double computeRisk(const Index node_index) const;
bool isChildPure(const ColumnVector &stats) const;
bool isNull(const double feature_val, const bool is_categorical) const {
return (is_categorical) ? (feature_val < 0) : std::isnan(feature_val);
}
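// Illustrative usage (encoding follows the body above): a categorical NULL
// is encoded as a negative level value, a continuous NULL as NaN, so
//   isNull(-1.0, true) == true
//   isNull(std::numeric_limits<double>::quiet_NaN(), false) == true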
uint16_t recomputeTreeDepth() const;
string displayLeafNode(Index id, ArrayHandle<text*> &dep_levels, const std::string & id_prefix, bool verbose);
string displayInternalNode(Index id,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dep_levels,
const std::string & id_prefix,
bool verbose);
string display(ArrayHandle<text*>&, ArrayHandle<text*>&, ArrayHandle<text*>&,
ArrayHandle<int>&, ArrayHandle<text*>&, const std::string&, bool verbose);
string getCatLabels(Index, Index, Index, ArrayHandle<text*> &,
ArrayHandle<int> &);
string print_split(bool, bool, Index, double,
ArrayHandle<text*> &, ArrayHandle<text*> &,
ArrayHandle<text*> &, ArrayHandle<int> &);
string print(Index, ArrayHandle<text*>&, ArrayHandle<text*>&,
ArrayHandle<text*>&, ArrayHandle<int>&,
ArrayHandle<text*>&, uint16_t);
string surr_display(
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels);
int encodeIndex(const int &feature_index,
const int &is_categorical,
const int &n_cat_features) const;
void computeVariableImportance(ColumnVector& cat_var_importance,
ColumnVector& con_var_importance);
// attributes
// dimension information
uint16_type tree_depth; // 1 for root-only tree
uint16_type n_y_labels;
uint16_type max_n_surr;
bool_type is_regression; // = 0 for classification, = 1 for regression
uint16_type impurity_type; // can be 'mse', 'gini', 'entropy', 'misclass'
// All vectors below are of size n_nodes ( = 2^{tree_depth} - 1 )
// Note: in order to make DynamicStruct::DryRun work,
// non-scalars should be of size 0 when dimension is 0
// The complete tree is broken into multiple vectors: each vector being the
// collection of a single variable for all nodes.
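// e.g. a tree with tree_depth = 3 stores n_nodes = 2^3 - 1 = 7 entries in
// each of these vectors, whether or not all 7 nodes actually exist.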
// -1 means leaf, -2 means non-existing node
IntegerVector_type feature_indices;
// elements are used as integer values for categorical features
ColumnVector_type feature_thresholds;
// used as boolean array, 0 means continuous, otherwise categorical
IntegerVector_type is_categorical;
// Used to keep count of the number of non-null rows that are split to the
// left and right children for each internal node.
// For leaf nodes, the value is 0.
// This is useful when using surrogates to compute the majority count/split
// for an internal node. Size = n_nodes x 2
ColumnVector_type nonnull_split_count;
// FIXME: we use a ColumnVector (elements of 'double' type) to store
// big integer values, since we don't yet have a vector type with uint64_t elements.
// 'surr_indices' is of size n_nodes x max_n_surr
// If a particular node has fewer than max_n_surr surrogates, then
// the indices of the non-existing surrogates will be -1
IntegerVector_type surr_indices;
ColumnVector_type surr_thresholds; // used as integer if classification
// used as enumerated array to record the status of a surrogate split
// 0 = default value indicating invalid surrogate split
// 1 = categorical feature <= threshold
// -1 = categorical feature > threshold
// 2 = continuous feature <= threshold
// -2 = continuous feature > threshold
IntegerVector_type surr_status;
// used to keep count of the number of rows on which the primary and the
// surrogate splits agree
IntegerVector_type surr_agreement;
// 'predictions' is a matrix of size n_nodes x n_predictions,
// where n_predictions = stats_per_split;
// stats_per_split is set by the accumulator during training of the tree
Matrix_type predictions; // used as integer values for classification
};
// ------------------------------------------------------------------------
// TreeAccumulator is used for collecting statistics while training the nodes
// of a single level of the decision tree. The same accumulator is also used
// for computing the statistics for surrogate splits.
template <class Container, class DTree>
class TreeAccumulator
: public DynamicStruct<TreeAccumulator<Container, DTree>, Container> {
public:
typedef DynamicStruct<TreeAccumulator, Container> Base;
MADLIB_DYNAMIC_STRUCT_TYPEDEFS;
typedef DTree tree_type;
typedef std::tuple< tree_type,
MappedIntegerVector, // categorical feature values
MappedColumnVector, // continuous feature values
double, // response variable
double, // weight
MappedIntegerVector, // levels for each categorical feature
MappedMatrix // split values for each continuous feature
> tuple_type;
typedef std::tuple< tree_type,
MappedIntegerVector, // categorical feature values
MappedColumnVector, // continuous feature values
MappedIntegerVector, // levels for each categorical feature
MappedMatrix, // split values for each continuous feature
int // duplicate count for each tuple
// (used in random forest)
> surr_tuple_type;
// functions
TreeAccumulator(Init_type& inInitialization);
void bind(ByteStream_type& inStream);
void rebind(uint16_t n_bins, uint16_t n_cat_feat,
uint16_t n_con_feat, uint32_t n_total_levels,
uint16_t tree_depth, uint16_t n_stats, bool weights_as_rows,
uint32_t n_reachable_leaf_nodes);
TreeAccumulator& operator<<(const tuple_type& inTuple);
TreeAccumulator& operator<<(const surr_tuple_type& inTuple);
template <class C, class DT>
TreeAccumulator& operator<<(const TreeAccumulator<C, DT>& inOther);
bool empty() const { return this->n_rows == 0; }
// cat_features[feature_index] <= cat_value
Index indexCatStats(Index feature_index, int cat_value, bool is_split_true) const;
// con_features[feature_index] <= bin_threshold,
// bin_threshold = con_splits[feature_index, bin_index]
Index indexConStats(Index feature_index, Index bin_index, bool is_split_true) const;
Index computeSubIndex(Index start_Index, Index relative_index, bool is_split_true) const;
void updateNodeStats(bool is_regression, Index row_index,
const double response, const double weight);
// apply the tuple using indices
void updateStats(bool is_regression, bool is_cat, Index row_index,
Index stats_index, const double response,
const double weight);
// apply the tuple using indices
void updateSurrStats(const bool is_cat, const bool surr_agrees,
Index row_index, Index stats_index, const int dup_count);
// attributes
uint64_type n_rows; // number of rows mapped to this node
// a NULL state is returned if the accumulation is terminated due to an error
bool_type terminated;
// dimension information
uint16_type n_bins;
uint16_type n_cat_features;
uint16_type n_con_features;
// sum of num of levels in each categorical variable
uint32_type total_n_cat_levels;
// n_leaf_nodes = 2^{dt.tree_depth-1} for dt.tree_depth > 0
uint32_type n_leaf_nodes;
// Not all "leaf" nodes at a tree level are reachable. A leaf becomes
// non-reachable when one of its ancestor is itself a leaf.
// For a full tree, n_leaf_nodes = n_reachable_leaf_nodes
uint32_type n_reachable_leaf_nodes;
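// Illustrative example: with dt.tree_depth = 3, n_leaf_nodes = 2^{3-1} = 4;
// if one node at the previous level already finished as a leaf, its two
// children are unreachable and n_reachable_leaf_nodes = 2.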
// For regression, stats_per_split = 4, i.e. (w, w*y, w*y^2, 1)
// For classification, stats_per_split = (number of class labels + 1)
// i.e. (w_1, w_2, ..., w_c, 1)
// For surrogates calculation, stats_per_split = 1
uint16_type stats_per_split;
// treat weights as duplicated rows (used for random forest)
bool_type weights_as_rows;
// training statistics
// Cumulative sum of the number of levels of each categorical variable, with
// the first element as 0. This is helpful for computing the index into
// cat_stats for each categorical variable.
// size = n_cat_features
// last element = total_n_cat_levels - (number of levels of the last
// categorical variable)
IntegerVector_type cat_levels_cumsum; // used as integer array
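// Illustrative example: for three categorical features with 2, 3 and 4
// levels respectively, total_n_cat_levels = 9 and
// cat_levels_cumsum = [0, 2, 5]
// (size = 3 = n_cat_features; last element 5 = 9 - 4).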
// con_stats and cat_stats are matrices that contain the statistics used
// during training.
// cat_stats is a matrix of size:
// (n_reachable_leaf_nodes) x (total_n_cat_levels * stats_per_split * 2)
Matrix_type cat_stats;
// con_stats is a matrix:
// (n_reachable_leaf_nodes) x (n_con_features * n_bins * stats_per_split * 2)
Matrix_type con_stats;
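// Illustrative sizing sketch (example values assumed): with stats_per_split = 4
// (regression), n_bins = 5, n_con_features = 2 and total_n_cat_levels = 9,
// each cat_stats row holds 9 * 4 * 2 = 72 values and each con_stats row holds
// 2 * 5 * 4 * 2 = 80 values; the factor of 2 presumably separates the
// true/false branch statistics (cf. the is_split_true flag of
// indexCatStats/indexConStats).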
// node_stats keeps statistics of all the rows that land on a node; these
// statistics are used to determine the prediction made by the node.
// In the absence of any NULL values, these stats can be deduced from
// cat_stats/con_stats; in the presence of NULL values, they can differ.
Matrix_type node_stats;
// The above stats matrices are used as pseudo-sparse matrices, since not all
// leaf nodes are reachable (especially as the tree gets deeper).
IntegerVector_type stats_lookup;
};
// ------------------------------------------------------------------------
} // namespace recursive_partitioning
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_RP_DT_PROTO_HPP)