/* ----------------------------------------------------------------------- *//**
*
* @file Decision_Tree_impl.hpp
*
*//* ----------------------------------------------------------------------- */
#ifndef MADLIB_MODULES_RP_DT_IMPL_HPP
#define MADLIB_MODULES_RP_DT_IMPL_HPP
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <iterator>
#include <limits> // std::numeric_limits
#include <dbconnector/dbconnector.hpp>
#include <boost/random/uniform_int.hpp>
#include <boost/random/variate_generator.hpp>
#include "DT_proto.hpp"
namespace madlib {
// Use Eigen
using namespace dbal::eigen_integration;
using boost::uniform_int;
using boost::variate_generator;
namespace modules {
namespace recursive_partitioning {
namespace {
typedef std::pair<int, double> argsort_pair;
bool argsort_comp(const argsort_pair& left,
const argsort_pair& right) {
return left.second > right.second;
}
IntegerVector argsort(const ColumnVector & x) {
IntegerVector indices(x.size());
std::vector<argsort_pair> data(x.size());
for(int i=0; i < x.size(); i++) {
data[i].first = i;
data[i].second = x(i);
}
std::sort(data.begin(), data.end(), argsort_comp);
for(size_t i=0; i < data.size(); i++) {
indices(i) = data[i].first;
}
return indices;
}
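// Illustrative example (a sketch, not part of the implementation):
// for x = [0.2, 0.9, 0.5], argsort(x) returns [1, 2, 0] -- the
// indices of x ordered by descending value, since argsort_comp
// compares with left.second > right.second.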
string
escape_quotes(const string &before) {
// From http://stackoverflow.com/questions/1162619/fastest-quote-escaping-implementation
string after;
after.reserve(before.length() + 4);
for (string::size_type i = 0; i < before.length(); ++i) {
switch (before[i]) {
case '"':
case '\\':
after += '\\';
// Fall through.
default:
after += before[i];
}
}
return after;
}
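// Illustrative example: for the input string
//   say "hi"
// escape_quotes returns
//   say \"hi\"
// i.e. '"' and '\' are prefixed with a backslash; every other
// character passes through unchanged.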
// ------------------------------------------------------------
double
computeEntropy(const double &p) {
if (p < 0.) { throw std::runtime_error("unexpected negative probability"); }
if (p == 0.) { return 0.; }
return -p * log2(p);
}
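// Worked example: computeEntropy(0.5) = -0.5 * log2(0.5) = 0.5.
// Summing over the class proportions {0.5, 0.5} of a two-class node
// therefore gives an entropy impurity of 1.0, the maximum for a
// binary split.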
// ------------------------------------------------------------
// Extract a string from ArrayHandle<text*>
inline
string
get_text(ArrayHandle<text*> &strs, size_t i) {
return std::string(VARDATA_ANY(strs[i]), VARSIZE_ANY(strs[i]) - VARHDRSZ);
}
} // anonymous namespace
// ------------------------------------------------------------------------
// Definitions for class Decision Tree
// ------------------------------------------------------------------------
template <class Container>
inline
DecisionTree<Container>::DecisionTree():
Base(defaultAllocator().allocateByteString<
dbal::FunctionContext, dbal::DoZero, dbal::ThrowBadAlloc>(0)) {
this->initialize();
}
// ------------------------------------------------------------
template <class Container>
inline
DecisionTree<Container>::DecisionTree(
Init_type& inInitialization): Base(inInitialization) {
this->initialize();
}
// ------------------------------------------------------------
template <class Container>
inline
void
DecisionTree<Container>::bind(ByteStream_type& inStream) {
inStream >> tree_depth
>> n_y_labels
>> max_n_surr
>> is_regression
>> impurity_type;
size_t n_nodes = 0;
size_t n_labels = 0;
size_t max_surrogates = 0;
if (!tree_depth.isNull()) {
n_nodes = static_cast<size_t>(pow(2.0, tree_depth) - 1);
// for classification, n_labels = n_y_labels + 1 since the last element
// is the count of actual (unweighted) tuples landing on a node;
// for regression, n_y_labels is the same as REGRESS_N_STATS
if (is_regression)
n_labels = static_cast<size_t>(n_y_labels);
else
n_labels = static_cast<size_t>(n_y_labels + 1);
max_surrogates = max_n_surr;
}
inStream
>> feature_indices.rebind(n_nodes)
>> feature_thresholds.rebind(n_nodes)
>> is_categorical.rebind(n_nodes)
>> nonnull_split_count.rebind(n_nodes * 2)
>> surr_indices.rebind(n_nodes * max_surrogates)
>> surr_thresholds.rebind(n_nodes * max_surrogates)
>> surr_status.rebind(n_nodes * max_surrogates)
>> surr_agreement.rebind(n_nodes * max_surrogates)
>> predictions.rebind(n_nodes, n_labels)
;
}
// ------------------------------------------------------------
template <class Container>
inline
DecisionTree<Container>&
DecisionTree<Container>::rebind(const uint16_t in_tree_depth,
const uint16_t in_y_labels,
const uint16_t in_max_n_surr,
const bool in_is_regression) {
tree_depth = in_tree_depth;
n_y_labels = in_y_labels;
max_n_surr = in_max_n_surr;
is_regression = in_is_regression;
this->resize();
return *this;
}
// ------------------------------------------------------------
template <class Container>
inline
DecisionTree<Container>&
DecisionTree<Container>::incrementInPlace() {
// back up current tree
size_t n_orig_nodes = static_cast<size_t>(pow(2.0, tree_depth) - 1);
DecisionTree<Container> orig = DecisionTree<Container>();
orig.rebind(tree_depth, n_y_labels, max_n_surr, is_regression);
orig.copy(*this);
// increment one level
tree_depth ++;
this->resize();
// restore from backup
is_regression = orig.is_regression;
impurity_type = orig.impurity_type;
feature_indices.segment(0, n_orig_nodes) = orig.feature_indices;
feature_thresholds.segment(0, n_orig_nodes) = orig.feature_thresholds;
is_categorical.segment(0, n_orig_nodes) = orig.is_categorical;
nonnull_split_count.segment(0, n_orig_nodes*2) = orig.nonnull_split_count;
if (max_n_surr > 0){
surr_indices.segment(0, n_orig_nodes*max_n_surr) = orig.surr_indices;
surr_thresholds.segment(0, n_orig_nodes*max_n_surr) = orig.surr_thresholds;
surr_status.segment(0, n_orig_nodes*max_n_surr) = orig.surr_status;
surr_agreement.segment(0, n_orig_nodes*max_n_surr) = orig.surr_agreement;
}
for (Index i = 0; i < orig.predictions.rows(); i++){
// resize adds rows at the end of predictions
predictions.row(i) = orig.predictions.row(i);
}
// mark all newly allocated leaves as non-existing nodes; they will be
// categorized as leaf nodes by the parent during expansion
size_t n_new_leaves = n_orig_nodes + 1;
feature_indices.segment(n_orig_nodes, n_new_leaves).setConstant(NODE_NON_EXISTING);
feature_thresholds.segment(n_orig_nodes, n_new_leaves).setConstant(0);
is_categorical.segment(n_orig_nodes, n_new_leaves).setConstant(0);
nonnull_split_count.segment(n_orig_nodes*2, n_new_leaves*2).setConstant(0);
if (max_n_surr > 0){
surr_indices.segment(n_orig_nodes*max_n_surr,
n_new_leaves*max_n_surr).setConstant(SURR_NON_EXISTING);
surr_thresholds.segment(n_orig_nodes*max_n_surr,
n_new_leaves*max_n_surr).setConstant(0);
surr_status.segment(n_orig_nodes*max_n_surr,
n_new_leaves*max_n_surr).setConstant(0);
surr_agreement.segment(n_orig_nodes*max_n_surr,
n_new_leaves*max_n_surr).setConstant(0);
}
for (size_t i = n_orig_nodes; i < n_orig_nodes + n_new_leaves; i++){
predictions.row(i).setConstant(0);
}
return *this;
}
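// Sizing sketch: a tree of depth d stores 2^d - 1 nodes, so
// incrementing a depth-2 tree (3 nodes) to depth 3 allocates
// n_new_leaves = n_orig_nodes + 1 = 4 new slots (indices 3..6),
// all marked NODE_NON_EXISTING until a parent splits into them.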
// ------------------------------------------------------------
template <class Container>
inline
uint64_t
DecisionTree<Container>::getMajorityCount(Index node_index) const {
// The majority count is defined as the larger of the tuple counts
// passed into the two split branches. We only count tuples that have
// a non-null value for the primary split of the node.
if (feature_indices(node_index) < 0)
throw std::runtime_error("Requested count for a leaf/non-existing node");
uint64_t true_count = static_cast<uint64_t>(nonnull_split_count(trueChild(node_index)));
uint64_t false_count = static_cast<uint64_t>(nonnull_split_count(falseChild(node_index)));
return true_count >= false_count ? true_count : false_count;
}
template <class Container>
inline
bool
DecisionTree<Container>::getMajoritySplit(Index node_index) const {
// The majority split is the branch receiving the larger count of
// tuples passed into the two split branches. We only count tuples
// that have a non-null value for the primary split of the node.
if (feature_indices(node_index) < 0)
throw std::runtime_error("Requested count for a leaf/non-existing node");
uint64_t true_count = static_cast<uint64_t>(nonnull_split_count(trueChild(node_index)));
uint64_t false_count = static_cast<uint64_t>(nonnull_split_count(falseChild(node_index)));
return (true_count >= false_count);
}
// -------------------------------------------------------------------------
template <class Container>
inline
bool
DecisionTree<Container>::getSurrSplit(
Index node_index,
MappedIntegerVector cat_features,
MappedColumnVector con_features) const {
Index surr_base_index;
for (surr_base_index = node_index * max_n_surr;
surr_base_index < (node_index+1) * max_n_surr; surr_base_index++){
Index surr_feat_index = surr_indices(surr_base_index);
if (surr_feat_index < 0)
break;
double surr_feat_threshold = surr_thresholds(surr_base_index);
bool split_response;
if (std::abs(surr_status(surr_base_index)) == 1){
if (!isNull(cat_features(surr_feat_index), true)){
split_response = cat_features(surr_feat_index) <= surr_feat_threshold;
// negative status is a reverse split (> relation)
return ((surr_status(surr_base_index) > 0) ?
(split_response) : (!split_response));
}
} else {
if (!isNull(con_features(surr_feat_index), false)){
split_response = con_features(surr_feat_index) <= surr_feat_threshold;
// negative status is a reverse split (> relation)
return ((surr_status(surr_base_index) > 0) ?
(split_response) : (!split_response));
}
}
}
return getMajoritySplit(node_index);
}
// -------------------------------------------------------------------------
template <class Container>
inline
Index
DecisionTree<Container>::search(MappedIntegerVector cat_features,
MappedColumnVector con_features) const {
Index current = 0;
int feature_index = feature_indices(current);
while (feature_index != IN_PROCESS_LEAF && feature_index != FINISHED_LEAF) {
assert(feature_index != NODE_NON_EXISTING);
bool is_split_true = false;
if (is_categorical(current) != 0) {
if (isNull(cat_features(feature_index), true)){
is_split_true = getSurrSplit(current, cat_features, con_features);
} else{
is_split_true = (cat_features(feature_index)
<= feature_thresholds(current));
}
} else {
if (isNull(con_features(feature_index), false)){
is_split_true = getSurrSplit(current, cat_features, con_features);
} else{
is_split_true = (con_features(feature_index)
<= feature_thresholds(current));
}
}
/* (i)
/ \
(2i+1) (2i+2)
*/
current = is_split_true ? trueChild(current) : falseChild(current);
feature_index = feature_indices(current);
}
return current;
}
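// Usage sketch (hypothetical caller; assumes a materialized tree
// bound to `tree` and mapped feature vectors `cat` and `con`):
//
//   Index leaf = tree.search(cat, con); // walks primary splits,
//                                       // falling back to surrogates
//                                       // or the majority branch
//   double response = tree.predict_response(leaf);
//
// With the complete-binary-tree layout above, the children of node i
// are 2i + 1 (true branch) and 2i + 2 (false branch); e.g. the root's
// false child is node 2, whose children are nodes 5 and 6.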
// ------------------------------------------------------------
template <class Container>
inline
ColumnVector
DecisionTree<Container>::predict(MappedIntegerVector cat_features,
MappedColumnVector con_features) const {
Index leaf_index = search(cat_features, con_features);
return statPredict(predictions.row(leaf_index));
}
// ------------------------------------------------------------
template <class Container>
inline
double
DecisionTree<Container>::predict_response(
MappedIntegerVector cat_features,
MappedColumnVector con_features) const {
ColumnVector curr_prediction = predict(cat_features, con_features);
if (is_regression){
return curr_prediction(0);
} else {
Index max_label;
curr_prediction.maxCoeff(&max_label);
return static_cast<double>(max_label);
}
}
// ------------------------------------------------------------
template <class Container>
inline
double
DecisionTree<Container>::predict_response(Index leaf_index) const {
ColumnVector curr_prediction = statPredict(predictions.row(leaf_index));
if (is_regression){
return curr_prediction(0);
} else {
Index max_label;
curr_prediction.maxCoeff(&max_label);
return static_cast<double>(max_label);
}
}
// ------------------------------------------------------------
template <class Container>
inline
double
DecisionTree<Container>::impurity(const ColumnVector &stats) const {
if (is_regression){
// only mean-squared error metric is supported
// variance is a measure of the mean-squared distance to all points
return stats(2) / stats(0) - pow(stats(1) / stats(0), 2);
} else {
ColumnVector proportions = statPredict(stats);
if (impurity_type == GINI){
return 1 - proportions.cwiseProduct(proportions).sum();
} else if (impurity_type == ENTROPY){
return proportions.unaryExpr(std::function<double(const double&)>(computeEntropy)).sum();
} else if (impurity_type == MISCLASS){
return 1 - proportions.maxCoeff();
} else
throw std::runtime_error("No impurity function set for a classification tree");
}
}
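// Worked example for a two-class node with proportions {0.5, 0.5}:
//   GINI:     1 - (0.25 + 0.25)              = 0.5
//   ENTROPY:  -0.5*log2(0.5) - 0.5*log2(0.5) = 1.0
//   MISCLASS: 1 - 0.5                        = 0.5
// For regression, stats = (sum w, sum w*y, sum w*y^2, ...), so the
// returned value is E[y^2] - E[y]^2, the weighted variance.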
// ------------------------------------------------------------
template <class Container>
inline
double
DecisionTree<Container>::impurityGain(const ColumnVector &combined_stats,
const uint16_t &stats_per_split) const {
double true_count = statWeightedCount(combined_stats.segment(0, stats_per_split));
double false_count = statWeightedCount(combined_stats.segment(stats_per_split, stats_per_split));
double total_count = true_count + false_count;
if (total_count == 0 || true_count == 0 || false_count == 0) {
// no gain if no tuples incoming or if all fall into one side
return 0.;
}
double true_weight = true_count / total_count;
double false_weight = false_count / total_count;
ColumnVector stats_sum = combined_stats.segment(0, stats_per_split) +
combined_stats.segment(stats_per_split, stats_per_split);
return (impurity(stats_sum) -
true_weight * impurity(combined_stats.segment(0, stats_per_split)) -
false_weight *
impurity(combined_stats.segment(stats_per_split, stats_per_split))
);
}
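// In formula form, with p_t = true_count / total_count and
// p_f = false_count / total_count:
//   gain = impurity(true_stats + false_stats)
//          - p_t * impurity(true_stats)
//          - p_f * impurity(false_stats)
// e.g. a 50/50 GINI parent (impurity 0.5) split into two pure children
// yields gain = 0.5 - 0.5 * 0 - 0.5 * 0 = 0.5.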
// ------------------------------------------------------------
template <class Container>
inline
bool
DecisionTree<Container>::updatePrimarySplit(
const Index node_index,
const int & max_feat,
const double & max_threshold,
const bool & max_is_cat,
const uint16_t & min_split,
const ColumnVector &true_stats,
const ColumnVector &false_stats) {
// current node
feature_indices(node_index) = max_feat;
is_categorical(node_index) = max_is_cat ? 1 : 0;
feature_thresholds(node_index) = max_threshold;
// update feature indices and prediction for children
feature_indices(trueChild(node_index)) = IN_PROCESS_LEAF;
predictions.row(trueChild(node_index)) = true_stats;
feature_indices(falseChild(node_index)) = IN_PROCESS_LEAF;
predictions.row(falseChild(node_index)) = false_stats;
// true_stats and false_stats only include the tuples for which the primary
// split is not NULL. The number of tuples in these stats need to be stored to
// compute a majority branch during surrogate training.
uint64_t true_count = statCount(true_stats);
uint64_t false_count = statCount(false_stats);
nonnull_split_count(trueChild(node_index)) = static_cast<double>(true_count);
nonnull_split_count(falseChild(node_index)) = static_cast<double>(false_count);
// Each child of the current node won't be split further if,
// 1. the child is pure (responses are too similar to split further)
// OR
// 2. the child is too small to split (count < min_split)
bool children_wont_split = ((isChildPure(true_stats) ||
true_count < min_split) &&
(isChildPure(false_stats) ||
false_count < min_split));
return children_wont_split;
}
// -------------------------------------------------------------------------
template <class Container>
template <class Accumulator>
inline
bool
DecisionTree<Container>::expand(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth) {
uint32_t n_non_leaf_nodes = static_cast<uint32_t>(state.n_leaf_nodes - 1);
bool children_not_allocated = true;
bool children_wont_split = true;
const uint16_t &sps = state.stats_per_split; // short form for brevity
for (Index i=0; i < state.n_leaf_nodes; i++) {
Index current = n_non_leaf_nodes + i;
if (feature_indices(current) == IN_PROCESS_LEAF) {
Index stats_i = static_cast<Index>(state.stats_lookup(i));
assert(stats_i >= 0);
// 1. Update predictions if necessary
if (statCount(predictions.row(current)) !=
statCount(state.node_stats.row(stats_i))){
// Predictions for each node are set by its parent using stats
// recorded while training the parent node. These stats do not
// include rows that had a NULL value for the primary split
// feature. The NULL count is included in 'node_stats' while
// training the current node.
predictions.row(current) = state.node_stats.row(stats_i);
// The presence of NULL rows indicates that the stats used for
// deciding 'children_wont_split' are inaccurate. Hence avoid
// using the flag to decide termination.
children_wont_split = false;
}
// 2. Compute the best feature to split current node
// if a leaf node exists, compute the gain in impurity for each split
// pick split with maximum gain and update node with split value
int max_feat = -1;
Index max_bin = -1;
bool max_is_cat = false;
double max_impurity_gain = -std::numeric_limits<double>::infinity();
ColumnVector max_stats;
// go through all categorical stats
int cumsum = 0;
for (int f=0; f < state.n_cat_features; ++f){ // each feature
for (int v=0; cumsum < state.cat_levels_cumsum(f); ++v, ++cumsum){
// each value of feature
Index fv_index = state.indexCatStats(f, v, true);
double gain = impurityGain(
state.cat_stats.row(stats_i).
segment(fv_index, sps * 2), sps);
if (gain > max_impurity_gain){
max_impurity_gain = gain;
max_feat = f;
max_bin = v;
max_is_cat = true;
max_stats = state.cat_stats.row(stats_i).segment(fv_index, sps * 2);
}
}
}
// go through all continuous stats
for (int f=0; f < state.n_con_features; ++f){ // each feature
for (Index b=0; b < state.n_bins; ++b){
// each bin of feature
Index fb_index = state.indexConStats(f, b, true);
double gain = impurityGain(
state.con_stats.row(stats_i).segment(fb_index, sps * 2), sps);
if (gain > max_impurity_gain){
max_impurity_gain = gain;
max_feat = f;
max_bin = b;
max_is_cat = false;
max_stats = state.con_stats.row(stats_i).segment(fb_index, sps * 2);
}
}
}
// 3. Create and update children if splitting current
uint64_t true_count = statCount(max_stats.segment(0, sps));
uint64_t false_count = statCount(max_stats.segment(sps, sps));
uint64_t total_count = statCount(predictions.row(current));
if (max_impurity_gain > 0 &&
shouldSplit(total_count, true_count, false_count,
min_split, min_bucket, max_depth)) {
double max_threshold;
if (max_is_cat)
max_threshold = static_cast<double>(max_bin);
else
max_threshold = con_splits(max_feat, max_bin);
if (children_not_allocated) {
// allocate the memory for child nodes if not allocated already
incrementInPlace();
children_not_allocated = false;
}
children_wont_split &=
updatePrimarySplit(
current,
static_cast<int>(max_feat),
max_threshold, max_is_cat,
min_split,
max_stats.segment(0, sps), // true_stats
max_stats.segment(sps, sps) // false_stats
);
} else {
feature_indices(current) = FINISHED_LEAF;
}
} // if leaf exists
} // for each leaf
// return true if tree expansion is finished
// we check (tree_depth >= max_depth + 1) since internally
// tree_depth starts from 1, whereas max_depth counts the root as depth 0
bool training_finished = (children_not_allocated ||
tree_depth >= (max_depth + 1) ||
children_wont_split);
if (training_finished){
// label any remaining IN_PROCESS_LEAF as FINISHED_LEAF
for (Index i=0; i < feature_indices.size(); i++) {
if (feature_indices(i) == IN_PROCESS_LEAF)
feature_indices(i) = FINISHED_LEAF;
}
}
return training_finished;
}
// -------------------------------------------------------------------------
template <class Container>
template <class Accumulator>
inline
void
DecisionTree<Container>::pickSurrogates(
const Accumulator &state,
const MappedMatrix &con_splits) {
uint16_t n_cats = state.n_cat_features;
uint16_t n_cons = state.n_con_features;
uint16_t n_bins = state.n_bins;
uint32_t n_cat_splits = state.total_n_cat_levels;
uint32_t n_con_splits = n_cons * n_bins;
// Sum pairs of alternating columns in cat_stats and con_stats.
// This assumes that stats_per_split = 2, hence cat_stats and con_stats
// have an even number of cols.
// We use the *_agg_matrix to add alternate columns of the *_stats
// matrix, creating a forward and a reverse agreement metric for
// each split.
// e.g. For cat_stats (1-indexed columns),
// add columns 1 and 3 to get the <= split agreement for the 1st cat split.
// add columns 2 and 4 to get the > split agreement for the 1st cat split.
// add columns 5 and 7 to get the <= split agreement for the 2nd cat split.
// add columns 6 and 8 to get the > split agreement for the 2nd cat split.
ColumnVector fwd_agg_vec(4);
fwd_agg_vec << 1, 0, 1, 0;
ColumnVector rev_agg_vec(4);
rev_agg_vec << 0, 1, 0, 1;
Matrix cat_agg_matrix = Matrix::Zero(n_cat_splits*4, n_cat_splits * 2);
for (Index i=0; i < cat_agg_matrix.cols(); i+=2){
cat_agg_matrix.col(i).segment(2*i, 4) = fwd_agg_vec;
cat_agg_matrix.col(i+1).segment(2*i, 4) = rev_agg_vec;
}
Matrix con_agg_matrix = Matrix::Zero(n_con_splits*4, n_con_splits*2);
for (Index i=0; i < con_agg_matrix.cols(); i+=2){
con_agg_matrix.col(i).segment(2*i, 4) = fwd_agg_vec;
con_agg_matrix.col(i+1).segment(2*i, 4) = rev_agg_vec;
}
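// Shape check (illustrative, stats_per_split = 2): for a single cat
// split, cat_stats has the four columns [a_fwd, a_rev, b_fwd, b_rev]
// and cat_agg_matrix is the 4 x 2 matrix
//   [1 0]
//   [0 1]
//   [1 0]
//   [0 1]
// so the product is [a_fwd + b_fwd, a_rev + b_rev], the forward and
// reverse agreement counts for that split.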
assert(state.cat_stats.cols() == cat_agg_matrix.rows());
assert(state.con_stats.cols() == con_agg_matrix.rows());
Matrix cat_stats_counts(state.cat_stats * cat_agg_matrix);
Matrix con_stats_counts(state.con_stats * con_agg_matrix);
// cat_stats_counts size = n_reachable_leaf_nodes x (n_cat_splits * 2)
// con_stats_counts size = n_reachable_leaf_nodes x (n_con_splits * 2)
// *_stats_counts now contains the agreement count for each split, where
// each even col represents the forward surrogate split count and
// each odd col represents the reverse surrogate split count.
// Number of nodes in the last layer = 2^(tree_depth - 1) (since depth
// starts from 1). For n_surr_nodes, we need the number of nodes in the
// second-to-last layer, so we use 2^(tree_depth - 2).
uint32_t n_surr_nodes = static_cast<uint32_t>(pow(2, tree_depth - 2));
uint32_t n_ancestors = static_cast<uint32_t>(n_surr_nodes - 1);
for (Index i=0; i < n_surr_nodes; i++){
Index curr_node = n_ancestors + i;
assert(curr_node >= 0 && curr_node < feature_indices.size());
if (feature_indices(curr_node) >= 0){
Index stats_i = static_cast<Index>(state.stats_lookup(i));
assert(stats_i >= 0);
// 1. Compute the max count and corresponding split threshold for
// each categorical and continuous feature
ColumnVector cat_max_thres = ColumnVector::Zero(n_cats);
ColumnVector cat_max_count = ColumnVector::Zero(n_cats);
IntegerVector cat_max_is_reverse = IntegerVector::Zero(n_cats);
Index prev_cum_levels = 0;
for (Index each_cat=0; each_cat < n_cats; each_cat++){
Index n_levels = state.cat_levels_cumsum(each_cat) - prev_cum_levels;
if (n_levels > 0){
Index max_label;
(cat_stats_counts.row(stats_i).segment(
prev_cum_levels * 2, n_levels * 2)).maxCoeff(&max_label);
// There are two stats per split, so max_label / 2 (integer
// division) gives the split index. An explicit floor
// operation is unnecessary since the threshold yields the
// same result for n and n + 0.5.
cat_max_thres(each_cat) = static_cast<double>(max_label / 2);
cat_max_count(each_cat) =
cat_stats_counts(stats_i, prev_cum_levels*2 + max_label);
// every odd col is for reverse, hence i % 2 == 1 for reverse index i
cat_max_is_reverse(each_cat) = max_label % 2;
prev_cum_levels = state.cat_levels_cumsum(each_cat);
}
}
ColumnVector con_max_thres = ColumnVector::Zero(n_cons);
ColumnVector con_max_count = ColumnVector::Zero(n_cons);
IntegerVector con_max_is_reverse = IntegerVector::Zero(n_cons);
for (Index each_con=0; each_con < n_cons; each_con++){
Index max_label;
(con_stats_counts.row(stats_i).segment(
each_con*n_bins*2, n_bins*2)).maxCoeff(&max_label);
con_max_thres(each_con) = con_splits(each_con, max_label / 2);
con_max_count(each_con) =
con_stats_counts(stats_i, each_con*n_bins*2 + max_label);
con_max_is_reverse(each_con) = (max_label % 2 == 1) ? 1 : 0;
}
// 2. Combine the best counts and sort them to get best
// variable/splits in descending order
ColumnVector all_counts(n_cats + n_cons);
all_counts.segment(0, n_cats) = cat_max_count;
all_counts.segment(n_cats, n_cons) = con_max_count;
IntegerVector sorted_surr_indices = argsort(all_counts);
// 3. Store the top max_n_surr (or fewer) surrogates in appropriate
// data structures
Index max_size = sorted_surr_indices.size() < max_n_surr ?
sorted_surr_indices.size() : max_n_surr;
Index surr_count = 0;
for(Index j=0; j < max_size; j++){
Index curr_surr = sorted_surr_indices(j);
if (all_counts(curr_surr) < double(getMajorityCount(curr_node))){
break;
}
Index to_update_surr = curr_node * max_n_surr + surr_count;
if (curr_surr < n_cats){ // curr_surr is categorical
// ensure primary not same as surrogate
if ((is_categorical(curr_node) != 1) ||
(feature_indices(curr_node) != curr_surr)) {
surr_indices(to_update_surr) = static_cast<int>(curr_surr);
surr_thresholds(to_update_surr) = cat_max_thres(curr_surr);
// reverse splits have negative status
surr_status(to_update_surr) =
(cat_max_is_reverse(curr_surr) == 1) ?
static_cast<int>(-1) : static_cast<int>(1);
surr_agreement(to_update_surr) = static_cast<int>(cat_max_count(curr_surr));
surr_count++;
}
} else { // curr_surr is continuous
curr_surr -= n_cats; // continuous indices after the categorical
// ensure primary not same as surrogate
if ((is_categorical(curr_node) != 0) ||
(feature_indices(curr_node) != curr_surr)) {
surr_indices(to_update_surr) = static_cast<int>(curr_surr);
surr_thresholds(to_update_surr) = con_max_thres(curr_surr);
// reverse splits have negative status
surr_status(to_update_surr) =
(con_max_is_reverse(curr_surr) == 1) ?
static_cast<int>(-2) : static_cast<int>(2);
surr_agreement(to_update_surr) = static_cast<int>(con_max_count(curr_surr));
surr_count++;
}
}
}
}
}
}
// -------------------------------------------------------------------------
template <class Container>
template <class Accumulator>
inline
bool
DecisionTree<Container>::expand_by_sampling(const Accumulator &state,
const MappedMatrix &con_splits,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth,
const int &n_random_features) {
uint32_t n_non_leaf_nodes = static_cast<uint32_t>(state.n_leaf_nodes - 1);
bool children_not_allocated = true;
bool children_wont_split = true;
//select cat and con features to be sampled
int total_cat_con_features = state.n_cat_features + state.n_con_features;
const uint16_t &sps = state.stats_per_split; // short form for brevity
//store indices from 0 to total_cat_con_features-1
int *cat_con_feature_indices = new int[total_cat_con_features];
NativeRandomNumberGenerator generator;
uniform_int<int> uni_dist;
variate_generator<NativeRandomNumberGenerator, uniform_int<int> > rvt(generator, uni_dist);
for (Index i=0; i < state.n_leaf_nodes; i++) {
Index current = n_non_leaf_nodes + i;
if (feature_indices(current) == IN_PROCESS_LEAF) {
Index stats_i = static_cast<Index>(state.stats_lookup(i));
assert(stats_i >= 0);
if (statCount(predictions.row(current)) !=
statCount(state.node_stats.row(stats_i))){
// Predictions for each node are set by its parent using stats
// recorded while training the parent node. These stats do not include
// rows that had a NULL value for the primary split feature.
// The NULL count is included in 'node_stats' while training the
// current node. Further, the presence of NULL rows indicates that
// the stats used for deciding 'children_wont_split' are inaccurate.
// Hence avoid using the flag to decide termination.
predictions.row(current) = state.node_stats.row(stats_i);
children_wont_split = false;
}
for (int j=0; j<total_cat_con_features; j++) {
cat_con_feature_indices[j] = j;
}
//randomly shuffle cat_con_feature_indices
std::random_shuffle(cat_con_feature_indices,
cat_con_feature_indices + total_cat_con_features, rvt);
// if a leaf node exists, compute the gain in impurity for each split
// pick split with maximum gain and update node with split value
int max_feat = -1;
Index max_bin = -1;
bool max_is_cat = false;
double max_impurity_gain = -std::numeric_limits<double>::infinity();
ColumnVector max_stats;
for (int index=0; index<n_random_features; index++) {
int f = cat_con_feature_indices[index];
if (f >= 0 && f < state.n_cat_features) {
//categorical feature
int v_end = f < 1 ?
state.cat_levels_cumsum(0) :
state.cat_levels_cumsum(f) - state.cat_levels_cumsum(f-1);
for (int v=0; v < v_end; ++v){
// each value of feature
Index fv_index = state.indexCatStats(f, v, true);
double gain = impurityGain(
state.cat_stats.row(stats_i).
segment(fv_index, sps * 2),
sps);
if (gain > max_impurity_gain){
max_impurity_gain = gain;
max_feat = f;
max_bin = v;
max_is_cat = true;
max_stats = state.cat_stats.row(stats_i).
segment(fv_index, sps * 2);
}
}
} else { //f >= state.n_cat_features
//continuous feature
f -= state.n_cat_features;
for (Index b=0; b < state.n_bins; ++b){
// each bin of feature
Index fb_index = state.indexConStats(f, b, true);
double gain = impurityGain(
state.con_stats.row(stats_i).
segment(fb_index, sps * 2),
sps);
if (gain > max_impurity_gain){
max_impurity_gain = gain;
max_feat = f;
max_bin = b;
max_is_cat = false;
max_stats = state.con_stats.row(stats_i).
segment(fb_index, sps * 2);
}
}
}
}
bool is_leaf_split = false;
if (max_impurity_gain > 0){
// Create and update child nodes if splitting current
uint64_t true_count = statCount(max_stats.segment(0, sps));
uint64_t false_count = statCount(max_stats.segment(sps, sps));
uint64_t total_count = statCount(predictions.row(current));
if (shouldSplit(total_count, true_count, false_count,
min_split, min_bucket, max_depth)) {
is_leaf_split = true;
double max_threshold;
if (max_is_cat)
max_threshold = static_cast<double>(max_bin);
else
max_threshold = con_splits(max_feat, max_bin);
if (children_not_allocated) {
// allocate the memory for child nodes if not allocated already
incrementInPlace();
children_not_allocated = false;
}
children_wont_split &=
updatePrimarySplit(
current, static_cast<int>(max_feat),
max_threshold, max_is_cat,
min_split,
max_stats.segment(0, sps), // true_stats
max_stats.segment(sps, sps) // false_stats
);
} // if shouldSplit
} //if max_impurity_gain > 0
if (not is_leaf_split)
feature_indices(current) = FINISHED_LEAF;
} // if leaf is in_process
} // for each leaf
// return true if tree expansion is finished
// we check (tree_depth >= max_depth + 1) since internally
// tree_depth starts from 1, whereas max_depth counts the root as depth 0
bool training_finished = (children_not_allocated ||
tree_depth >= (max_depth + 1) ||
children_wont_split);
if (training_finished){
// label any remaining IN_PROCESS_LEAF as FINISHED_LEAF
for (Index i=0; i < feature_indices.size(); i++) {
if (feature_indices(i) == IN_PROCESS_LEAF)
feature_indices(i) = FINISHED_LEAF;
}
}
delete[] cat_con_feature_indices;
return training_finished;
}
template <class Container>
inline
ColumnVector
DecisionTree<Container>::statPredict(const ColumnVector &stats) const {
// stats is assumed to be of size = stats_per_split
if (is_regression){
// regression stat -> (0) = sum of weights, (1) = weighted sum of responses
// we return the average response as prediction
return ColumnVector(stats.segment(1, 1) / stats(0));
} else {
// classification stat -> (i) = num of tuples for class i
// we return the proportion of each label
ColumnVector statsCopy(stats);
return statsCopy.head(n_y_labels) / static_cast<double>(stats.head(n_y_labels).sum());
}
}
// -------------------------------------------------------------------------
/**
* @brief Return the number of tuples accounted for in a 'stats' vector
*/
template <class Container>
inline
uint64_t
DecisionTree<Container>::statCount(const ColumnVector &stats) const{
// stats is assumed to be of size = stats_per_split
// for both regression and classification, the last element is the number
// of tuples landing on this node.
return static_cast<uint64_t>(stats.tail(1)(0)); // tail(n) returns a slice with last n elements
}
// -------------------------------------------------------------------------
/**
* @brief Return the weighted count of tuples accounted for in a 'stats' vector
*/
template <class Container>
inline
double
DecisionTree<Container>::statWeightedCount(const ColumnVector &stats) const{
// stats is assumed to be of size = stats_per_split
if (is_regression)
return stats(0);
else
return stats.head(n_y_labels).sum();
}
// -------------------------------------------------------------------------
/**
* @brief Return the number of tuples that landed on a given node
*/
template <class Container>
inline
uint64_t
DecisionTree<Container>::nodeCount(const Index node_index) const{
return statCount(predictions.row(node_index));
}
// -------------------------------------------------------------------------
/**
* @brief Return the number of tuples (normalized using weights) that landed on a given node
*/
template <class Container>
inline
double
DecisionTree<Container>::nodeWeightedCount(const Index node_index) const{
return statWeightedCount(predictions.row(node_index));
}
// -------------------------------------------------------------------------
/*
* Compute the misclassification for a prediction from a node in a
* classification tree.
* For regression, zero is returned.
* For classification, misclassification is the sum of weighted counts
* minus the maximum coefficient, i.e. the weighted count of tuples
* that are not in the majority class.
*
* @param node_index: Index of node for which to compute misclassification
*/
template <class Container>
inline
double
DecisionTree<Container>::computeMisclassification(Index node_index) const {
if (is_regression) {
return 0;
} else {
return predictions.row(node_index).head(n_y_labels).sum() -
predictions.row(node_index).head(n_y_labels).maxCoeff();
}
}
// -------------------------------------------------------------------------
/*
* Compute the risk for a node of the tree.
* For regression, risk is the variance of the response at that node.
* For classification, risk is the number of misclassifications at that node.
*
* @param node_index: Index of node for which to compute risk
*/
template <class Container>
inline
double
DecisionTree<Container>::computeRisk(const Index node_index) const {
if (is_regression) {
double wt_tot = predictions.row(node_index)(0); // sum of weights
double y_avg = predictions.row(node_index)(1); // weighted sum of responses
double y2_avg = predictions.row(node_index)(2); // weighted sum of squared responses
if (wt_tot <= 0)
return 0;
else
return (y2_avg - (y_avg * y_avg / wt_tot));
} else {
return computeMisclassification(node_index);
}
}
// -------------------------------------------------------------------------
/**
* @brief Return if a child node is pure
*/
template <class Container>
inline
bool
DecisionTree<Container>::isChildPure(const ColumnVector &stats) const{
// stats is assumed to be of size = stats_per_split
double epsilon = 1e-5;
if (is_regression){
// child is pure if variance is extremely small compared to mean
double mean = stats(1) / stats(0);
double variance = stats(2) / stats(0) - std::pow(mean, 2);
return variance < epsilon * mean * mean;
} else {
// child is pure if most are of same class
// return (statPredict(stats) / stats.head(n_y_labels).maxCoeff()).sum() < 100 * epsilon;
double total_count = stats.head(n_y_labels).sum();
double non_max_vals = total_count - stats.head(n_y_labels).maxCoeff();
return (non_max_vals/total_count) < 100*epsilon;
}
}
// -------------------------------------------------------------------------
template <class Container>
inline
bool
DecisionTree<Container>::shouldSplit(const uint64_t &total_count,
const uint64_t &true_count,
const uint64_t &false_count,
const uint16_t &min_split,
const uint16_t &min_bucket,
const uint16_t &max_depth) const {
// total_count != true_count + false_count if there are rows with NULL values
// Always want at least 1 tuple going into a child node. Hence the
// minimum value for min_bucket is 1
uint64_t thresh_min_bucket = (min_bucket == 0) ? 1u : min_bucket;
return (total_count >= min_split &&
true_count >= thresh_min_bucket &&
false_count >= thresh_min_bucket &&
tree_depth <= max_depth + 1);
}
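// Illustrative example: with min_split = 20 and min_bucket = 0
// (treated as 1), a node holding 25 tuples whose candidate split sends
// 25/0 to the children is rejected (false_count < 1), while a 24/1
// split is accepted provided tree_depth still permits another level.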
// ------------------------------------------------------------------------
template <class Container>
inline
uint16_t
DecisionTree<Container>::recomputeTreeDepth() const{
if (feature_indices.size() <= 1 || tree_depth <= 1)
return tree_depth;
for(uint16_t depth_counter = 2; depth_counter <= tree_depth; depth_counter++){
uint32_t n_leaf_nodes = static_cast<uint32_t>(pow(2.0, depth_counter - 1));
uint32_t leaf_start_index = n_leaf_nodes - 1;
bool all_non_existing = true;
for (uint32_t leaf_index=0; leaf_index < n_leaf_nodes; leaf_index++){
if (feature_indices(leaf_start_index + leaf_index) != NODE_NON_EXISTING){
all_non_existing = false;
break;
}
}
if (all_non_existing){
// The previous level was the right depth since current has all
// non-existing nodes. Return 1 less than current
return static_cast<uint16_t>(depth_counter - 1);
}
}
return tree_depth;
}
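// Layout sketch: level d (1-based) of the complete binary tree
// occupies the 2^(d-1) indices starting at 2^(d-1) - 1. E.g. for
// tree_depth = 3, level 3 is indices 3..6; if all four hold
// NODE_NON_EXISTING, the loop above reports an effective depth of 2.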
// -------------------------------------------------------------------------
/**
* @brief Display the decision tree in dot format
*/
template <class Container>
inline
string
DecisionTree<Container>::displayLeafNode(
Index id,
ArrayHandle<text*> &dep_levels,
const std::string & id_prefix,
bool verbose){
std::stringstream predict_str;
if (static_cast<bool>(is_regression)){
predict_str << predict_response(id);
}
else{
std::string dep_value = get_text(dep_levels, static_cast<int>(predict_response(id)));
predict_str << escape_quotes(dep_value);
}
std::stringstream display_str;
display_str << "\"" << id_prefix << id << "\" [label=\"" << predict_str.str();
if(verbose){
display_str << "\\n impurity = "<< impurity(predictions.row(id))
<< "\\n samples = " << statCount(predictions.row(id))
<< "\\n value = ";
if (is_regression)
display_str << statPredict(predictions.row(id));
else{
display_str << "[";
// NUM_PER_LINE: inserting new lines at fixed intervals
// avoids a really long 'value' line
const uint16_t NUM_PER_LINE = 10;
// note: last element of predictions is 'statCount' and
// can be ignored
const Index pred_size = predictions.row(id).size() - 1;
for (Index i = 0; i < pred_size; i += NUM_PER_LINE){
if (i + NUM_PER_LINE >= pred_size) {
// not overflowing the vector
display_str << predictions.row(id).segment(i, pred_size - i);
} else {
display_str << predictions.row(id).segment(i, NUM_PER_LINE) << "\n";
}
}
display_str << "]";
}
}
display_str << "\",shape=box]" << ";";
return display_str.str();
}
// -------------------------------------------------------------------------
/*
@brief Display the decision tree in dot format
*/
template <class Container>
inline
string
DecisionTree<Container>::displayInternalNode(
Index id,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dep_levels,
const std::string & id_prefix,
bool verbose
){
string feature_name;
std::stringstream label_str;
if (is_categorical(id) == 0) {
feature_name = get_text(con_features_str, feature_indices(id));
label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id);
} else {
feature_name = get_text(cat_features_str, feature_indices(id));
label_str << escape_quotes(feature_name) << " <= ";
// Text for all categorical levels is stored in a flat array
// (cat_levels_text); find the appropriate index for this node
size_t to_skip = 0;
for (Index i=0; i < feature_indices(id); i++)
to_skip += cat_n_levels[i];
const size_t index = to_skip + static_cast<size_t>(feature_thresholds(id));
label_str << get_text(cat_levels_text, index);
}
std::stringstream display_str;
display_str << "\"" << id_prefix << id << "\" [label=\"" << label_str.str();
if(verbose){
display_str << "\\n impurity = "<< impurity(predictions.row(id)) << "\\n samples = " << statCount(predictions.row(id));
display_str << "\\n value = ";
if (is_regression)
display_str << statPredict(predictions.row(id));
else{
display_str << "[";
// NUM_PER_LINE: inserting new lines at fixed intervals
// avoids a really long 'value' line
const uint16_t NUM_PER_LINE = 10;
// note: last element of predictions is 'statCount' and
// can be ignored
const Index pred_size = predictions.row(id).size() - 1;
for (Index i = 0; i < pred_size; i += NUM_PER_LINE){
if (i + NUM_PER_LINE > pred_size) {
// not overflowing the vector
display_str << predictions.row(id).segment(i, pred_size - i);
} else {
display_str << predictions.row(id).segment(i, NUM_PER_LINE) << "\n";
}
}
display_str << "]";
}
std::stringstream predict_str;
if (static_cast<bool>(is_regression)){
predict_str << predict_response(id);
}
else{
std::string dep_value = get_text(dep_levels, static_cast<int>(predict_response(id)));
predict_str << escape_quotes(dep_value);
}
display_str << "\\n class = " << predict_str.str();
}
display_str << "\", shape=ellipse]" << ";";
return display_str.str();
}
// -------------------------------------------------------------------------
/**
* @brief Display the decision tree in dot format
*/
template <class Container>
inline
string
DecisionTree<Container>::display(
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dependent_levels,
const std::string &id_prefix,
bool verbose) {
std::stringstream display_string;
if (feature_indices(0) == FINISHED_LEAF){
display_string << displayLeafNode(0, dependent_levels, id_prefix, verbose)
<< std::endl;
}
else{
for(Index index = 0; index < feature_indices.size() / 2; index++) {
if (feature_indices(index) != NODE_NON_EXISTING &&
feature_indices(index) != IN_PROCESS_LEAF &&
feature_indices(index) != FINISHED_LEAF) {
display_string << displayInternalNode(
index, cat_features_str, con_features_str,
cat_levels_text, cat_n_levels, dependent_levels, id_prefix, verbose) << std::endl;
// Display the children
Index tc = trueChild(index);
if (feature_indices(tc) != NODE_NON_EXISTING) {
display_string << "\"" << id_prefix << index << "\" -> "
<< "\"" << id_prefix << tc << "\"";
// edge going left is "true" node
display_string << "[label=\"yes\"];" << std::endl;
if (feature_indices(tc) == IN_PROCESS_LEAF ||
feature_indices(tc) == FINISHED_LEAF)
display_string
<< displayLeafNode(tc, dependent_levels, id_prefix, verbose)
<< std::endl;
}
Index fc = falseChild(index);
if (feature_indices(fc) != NODE_NON_EXISTING) {
display_string << "\"" << id_prefix << index << "\" -> "
<< "\"" << id_prefix << fc << "\"";
// edge going right is "false" node
display_string << "[label=\"no\"];" << std::endl;
if (feature_indices(fc) == IN_PROCESS_LEAF ||
feature_indices(fc) == FINISHED_LEAF)
display_string
<< displayLeafNode(fc, dependent_levels, id_prefix, verbose)
<< std::endl;
}
}
} // end of for loop
}
return display_string.str();
}
// -------------------------------------------------------------------------
template <class Container>
inline
string
DecisionTree<Container>::print_split(
bool is_cat,
bool is_reverse,
Index feat_index,
double feat_threshold,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels){
string feature_name;
std::stringstream label_str;
std::string compare;
if (!is_cat) {
if (!is_reverse)
compare = " <= ";
else
compare = " > ";
feature_name = get_text(con_features_str, feat_index);
label_str << feature_name << compare << feat_threshold;
} else {
Index start_threshold;
Index end_threshold;
if (!is_reverse){
start_threshold = 0;
end_threshold = static_cast<Index>(feat_threshold);
}
else{
start_threshold = static_cast<Index>(feat_threshold + 1);
end_threshold = cat_n_levels[feat_index] - 1;
}
feature_name = get_text(cat_features_str, feat_index);
label_str << feature_name << " in "
<< getCatLabels(feat_index, start_threshold, end_threshold,
cat_levels_text, cat_n_levels);
}
return label_str.str();
}
// -------------------------------------------------------------------------
template <class Container>
inline
string
DecisionTree<Container>::print(
Index current,
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels,
ArrayHandle<text*> &dep_levels,
uint16_t recursion_depth){
if (feature_indices(current) == NODE_NON_EXISTING){
return "";
}
std::stringstream print_string;
// print current node + prediction
print_string << "(" << current << ")";
print_string << "[";
if (is_regression){
print_string << nodeWeightedCount(current) << ", "
<< statPredict(predictions.row(current));
}
else{
print_string << predictions.row(current).head(n_y_labels);
}
print_string << "] ";
if (feature_indices(current) >= 0){
string label_str = print_split(static_cast<bool>(is_categorical(current)),
false,
feature_indices(current),
feature_thresholds(current),
cat_features_str,
con_features_str,
cat_levels_text,
cat_n_levels);
print_string << label_str << std::endl;
std::string indentation(recursion_depth * 3, ' ');
print_string
<< indentation
<< print(trueChild(current), cat_features_str,
con_features_str, cat_levels_text,
cat_n_levels, dep_levels, static_cast<uint16_t>(recursion_depth + 1));
print_string
<< indentation
<< print(falseChild(current), cat_features_str,
con_features_str, cat_levels_text,
cat_n_levels, dep_levels, static_cast<uint16_t>(recursion_depth + 1));
} else {
print_string << "*";
if (!is_regression){
std::string dep_value = get_text(dep_levels,
static_cast<int>(predict_response(current)));
print_string << " --> " << dep_value;
}
print_string << std::endl;
}
return print_string.str();
}
// -------------------------------------------------------------------------
template <class Container>
inline
string
DecisionTree<Container>::getCatLabels(Index cat_index,
Index start_value,
Index end_value,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels) {
const Index MAX_LABELS = 2;
size_t to_skip = 0;
for (Index i=0; i < cat_index; i++) {
to_skip += cat_n_levels[i];
}
std::stringstream cat_levels;
size_t index;
cat_levels << "{";
for (index = to_skip + start_value;
index < to_skip + end_value && index < cat_levels_text.size();
index++) {
cat_levels << get_text(cat_levels_text, index) << ",";
if (index > to_skip + start_value + MAX_LABELS){
cat_levels << " ... ";
break;
}
}
cat_levels << get_text(cat_levels_text, to_skip + end_value) << "}";
return cat_levels.str();
}
// -------------------------------------------------------------------------
template <class Container>
inline
int
DecisionTree<Container>::encodeIndex(const int &feature_index,
const int &is_categorical, const int &n_cat_features) const {
if (is_categorical != 0) {
return feature_index;
} else {
if (feature_index >= 0) {
return feature_index + n_cat_features;
} else {
return feature_index;
}
}
}
// -------------------------------------------------------------------------
template <class Container>
inline
void
DecisionTree<Container>::computeVariableImportance(
ColumnVector &cat_var_importance,
ColumnVector &con_var_importance){
// stats_per_split
uint16_t sps = is_regression ? REGRESS_N_STATS :
static_cast<uint16_t>(n_y_labels + 1);
// loop through for each internal node and check the primary split and any
// surrogate splits
for (Index node_index = 0;
node_index < feature_indices.size() / 2;
node_index++){
if (isInternalNode(node_index)){
ColumnVector combined_stats(sps * 2);
combined_stats << predictions.row(trueChild(node_index)).transpose(),
predictions.row(falseChild(node_index)).transpose();
double split_gain = impurityGain(combined_stats, sps);
// importance = impurity gain from the primary split +
// (impurity gain * adjusted agreement) for each
// surrogate split
// primary split contribution to importance
Index feat_index = feature_indices(node_index);
if(is_categorical(node_index)) {
assert(feat_index < cat_var_importance.size());
cat_var_importance(feat_index) += split_gain;
} else {
assert(feat_index < con_var_importance.size());
con_var_importance(feat_index) += split_gain;
}
// surrogate contribution to importance
if (max_n_surr > 0){
for (Index surr_count=0; surr_count < max_n_surr; surr_count++){
Index surr_lookup_index = node_index * max_n_surr + surr_count;
// surr_status == 0 implies non-existing surrogate
if (surr_status(surr_lookup_index) == 0)
break;
uint64_t total_count =
statCount(predictions.row(trueChild(node_index))) +
statCount(predictions.row(falseChild(node_index)));
// Adjusted agreement is defined as how much better the
// surrogate does compared to the majority count. This value
// is relative to the number of rows that the majority branch
// would not predict correctly (the minority count).
uint64_t node_count = nodeCount(node_index);
uint64_t maj_count = getMajorityCount(node_index);
uint64_t min_count = node_count - maj_count;
assert(min_count > 0);
double adj_agreement =
(surr_agreement(surr_lookup_index) - maj_count) / min_count;
Index surr_feat_index = surr_indices(surr_lookup_index);
if (std::abs(surr_status(surr_lookup_index)) == 1){
cat_var_importance[surr_feat_index] += split_gain *
adj_agreement;
} else {
con_var_importance[surr_feat_index] += split_gain *
adj_agreement;
}
}
}
}
}
}
// -------------------------------------------------------------------------
template <class Container>
inline
string
DecisionTree<Container>::surr_display(
ArrayHandle<text*> &cat_features_str,
ArrayHandle<text*> &con_features_str,
ArrayHandle<text*> &cat_levels_text,
ArrayHandle<int> &cat_n_levels){
if (max_n_surr <= 0 )
return "";
std::stringstream display_string;
std::string indentation(5, ' ');
for(Index curr_node=0; curr_node < feature_indices.size() / 2; curr_node++){
Index feat_index = feature_indices(curr_node);
if (feat_index != NODE_NON_EXISTING && feat_index != IN_PROCESS_LEAF &&
feat_index != FINISHED_LEAF) {
string feature_str = print_split(is_categorical(curr_node),
false,
feat_index,
feature_thresholds(curr_node),
cat_features_str,
con_features_str,
cat_levels_text,
cat_n_levels);
display_string << "(" << curr_node << ") ";
display_string << feature_str
<< std::endl;
Index surr_base = curr_node * max_n_surr;
for(Index i = 0;
i < max_n_surr && surr_indices(surr_base + i) >= 0;
i++){
Index curr_surr = surr_base + i;
if (surr_indices(curr_surr) >= 0){
bool is_cat = std::abs(surr_status(curr_surr)) == 1;
bool is_reverse = surr_status(curr_surr) < 0;
string surr_str = print_split(is_cat,
is_reverse,
surr_indices(curr_surr),
surr_thresholds(curr_surr),
cat_features_str,
con_features_str,
cat_levels_text,
cat_n_levels);
display_string << indentation;
display_string << i + 1 << ": ";
display_string << surr_str
<< " [common rows = " << surr_agreement(curr_surr)<< "]"
<< std::endl;
}
}
display_string << indentation
<< "[Majority branch = " << getMajorityCount(curr_node) << " ]"
<< std::endl
<< std::endl;
}
}
return display_string.str();
}
// ------------------------------------------------------------------------
// Definitions for class TreeAccumulator
// ------------------------------------------------------------------------
template <class Container, class DTree>
inline
TreeAccumulator<Container, DTree>::TreeAccumulator(
Init_type& inInitialization): Base(inInitialization){
this->initialize();
}
/**
* @brief Bind all elements of the state to the data in the stream
*
* The bind() is special in that even after running operator>>() on an element,
* there is no guarantee yet that the element can indeed be accessed. It is
* crucial to first check this.
*
* Provided that this method correctly lists all member variables, all other
* methods can rely on the fact that all variables are correctly
* initialized and accessible.
*/
template <class Container, class DTree>
inline
void
TreeAccumulator<Container, DTree>::bind(ByteStream_type& inStream) {
// update with actual parameters
inStream >> n_rows
>> terminated
>> n_bins
>> n_cat_features
>> n_con_features
>> total_n_cat_levels
>> n_leaf_nodes
>> n_reachable_leaf_nodes
>> stats_per_split
>> weights_as_rows ;
uint16_t n_bins_tmp = 0;
uint16_t n_cat = 0;
uint16_t n_con = 0;
uint32_t tot_levels = 0;
uint32_t n_leaves = 0;
uint32_t n_reachable_leaves = 0;
uint16_t n_stats = 0;
if (!n_rows.isNull()){
n_bins_tmp = n_bins;
n_cat = n_cat_features;
n_con = n_con_features;
tot_levels = total_n_cat_levels;
n_leaves = n_leaf_nodes;
n_reachable_leaves = n_reachable_leaf_nodes;
n_stats = stats_per_split;
}
inStream
>> cat_levels_cumsum.rebind(n_cat)
>> cat_stats.rebind(n_reachable_leaves, tot_levels * n_stats * 2)
>> con_stats.rebind(n_reachable_leaves, n_con * n_bins_tmp * n_stats * 2)
>> node_stats.rebind(n_reachable_leaves, n_stats)
>> stats_lookup.rebind(n_leaves);
}
// -------------------------------------------------------------------------
/**
* @brief Rebind all elements of the state when dimensionality elements are
* available
*
*/
template <class Container, class DTree>
inline
void
TreeAccumulator<Container, DTree>::rebind(
uint16_t in_n_bins, uint16_t in_n_cat_feat,
uint16_t in_n_con_feat, uint32_t in_n_total_levels,
uint16_t tree_depth, uint16_t in_n_stats,
bool in_weights_as_rows, uint32_t n_reachable_leaves) {
n_bins = in_n_bins;
n_cat_features = in_n_cat_feat;
n_con_features = in_n_con_feat;
total_n_cat_levels = in_n_total_levels;
weights_as_rows = in_weights_as_rows;
if (tree_depth > 0)
n_leaf_nodes = static_cast<uint32_t>(pow(2.0, tree_depth - 1));
else
n_leaf_nodes = 1;
if (n_reachable_leaves >= n_leaf_nodes)
n_reachable_leaf_nodes = n_leaf_nodes;
else
n_reachable_leaf_nodes = n_reachable_leaves;
stats_per_split = in_n_stats;
this->resize();
}
// -------------------------------------------------------------------------
/**
* @brief Update the accumulation state by feeding a tuple
*/
template <class Container, class DTree>
inline
TreeAccumulator<Container, DTree>&
TreeAccumulator<Container, DTree>::operator<<(const tuple_type& inTuple) {
tree_type dt = std::get<0>(inTuple);
const MappedIntegerVector& cat_features = std::get<1>(inTuple);
const MappedColumnVector& con_features = std::get<2>(inTuple);
const double& response = std::get<3>(inTuple);
const double& weight = std::get<4>(inTuple);
const MappedIntegerVector& cat_levels = std::get<5>(inTuple);
const MappedMatrix& con_splits = std::get<6>(inTuple);
// The following checks were introduced with MADLIB-138. It still seems
// useful to have clear error messages in case of infinite input values.
if (!terminated){
if (!std::isfinite(response)) {
warning("Decision tree response variable values are not finite.");
} else if ((cat_features.size() + con_features.size()) >
std::numeric_limits<uint16_t>::max()) {
warning("Number of independent variables cannot be larger than 65535.");
} else if (n_cat_features != static_cast<uint16_t>(cat_features.size())) {
warning("Inconsistent numbers of categorical independent variables.");
} else if (n_con_features != static_cast<uint16_t>(con_features.size())) {
warning("Inconsistent numbers of continuous independent variables.");
} else{
uint32_t n_non_leaf_nodes = static_cast<uint32_t>(n_leaf_nodes - 1);
Index dt_search_index = dt.search(cat_features, con_features);
if (dt.feature_indices(dt_search_index) != dt.FINISHED_LEAF &&
dt.feature_indices(dt_search_index) != dt.NODE_NON_EXISTING) {
Index row_index = dt_search_index - n_non_leaf_nodes;
assert(row_index >= 0);
// add this row into the stats for the node
updateNodeStats(static_cast<bool>(dt.is_regression), row_index,
response, weight);
// update stats for categorical feature values in the current row
for (Index i=0; i < n_cat_features; ++i){
for (int j=0; j < cat_levels(i); ++j){
if (!dt.isNull(cat_features(i), true)){
Index col_index = indexCatStats(
i, j, (cat_features(i) <= j));
updateStats(static_cast<bool>(dt.is_regression), true,
row_index, col_index, response, weight);
}
}
}
// update stats for continuous feature values in the current row
for (Index i=0; i < n_con_features; ++i){
for (Index j=0; j < n_bins; ++j){
if (!dt.isNull(con_features(i), false)){
Index col_index = indexConStats(i, j,
(con_features(i) <= con_splits(i, j)));
updateStats(static_cast<bool>(dt.is_regression), false,
row_index, col_index, response, weight);
}
}
}
}
n_rows++;
return *this;
}
// error case for current group
terminated = true;
}
return *this;
}
// -------------------------------------------------------------------------
/**
* @brief Update the accumulation state for surrogate statistics by feeding a tuple
*/
template <class Container, class DTree>
inline
TreeAccumulator<Container, DTree>&
TreeAccumulator<Container, DTree>::operator<<(const surr_tuple_type& inTuple) {
tree_type dt = std::get<0>(inTuple);
const MappedIntegerVector& cat_features = std::get<1>(inTuple);
const MappedColumnVector& con_features = std::get<2>(inTuple);
const MappedIntegerVector& cat_levels = std::get<3>(inTuple);
const MappedMatrix& con_splits = std::get<4>(inTuple);
const int dup_count = std::get<5>(inTuple);
if ((cat_features.size() + con_features.size()) >
std::numeric_limits<uint16_t>::max()) {
warning("Number of independent variables cannot be larger than 65535.");
} else if (n_cat_features != static_cast<uint16_t>(cat_features.size())) {
warning("Inconsistent numbers of categorical independent variables.");
} else if (n_con_features != static_cast<uint16_t>(con_features.size())) {
warning("Inconsistent numbers of continuous independent variables.");
} else{
// the accumulator is set up to train the second-to-last layer,
// hence n_leaf_nodes is the same as n_surr_nodes
uint32_t n_surr_nodes = n_leaf_nodes;
uint32_t n_non_surr_nodes = static_cast<uint32_t>(n_surr_nodes - 1);
Index dt_parent_index = dt.parentIndex(dt.search(cat_features, con_features));
Index primary_index = dt.feature_indices(dt_parent_index);
bool is_primary_cat = static_cast<bool>(dt.is_categorical(dt_parent_index));
double primary_val = is_primary_cat ? cat_features(primary_index) :
con_features(primary_index);
// Only capture statistics for rows that:
// 1. lead to leaf nodes in the last layer. Surrogates for other nodes
// have already been trained.
// 2. have non-null values for the primary split.
if (dt_parent_index >= n_non_surr_nodes &&
!dt.isNull(primary_val, is_primary_cat)) {
double primary_threshold = dt.feature_thresholds(dt_parent_index);
bool is_primary_true = (primary_val <= primary_threshold);
if (dt.feature_indices(dt_parent_index) >= 0){
Index row_index = dt_parent_index - n_non_surr_nodes;
assert(row_index >= 0 && row_index < stats_lookup.rows());
for (Index i=0; i < n_cat_features; ++i){
if (is_primary_cat && i == primary_index)
continue;
for (int j=0; j < cat_levels(i); ++j){
if (!dt.isNull(cat_features(i), true)){
// we don't capture stats when surrogate is NULL
bool is_surrogate_true = (cat_features(i) <= j);
Index col_index = indexCatStats(i, j, is_surrogate_true);
updateSurrStats(true,
is_primary_true == is_surrogate_true,
row_index,
col_index,
dup_count);
}
}
}
for (Index i=0; i < n_con_features; ++i){
if (!is_primary_cat && i == primary_index)
continue;
for (Index j=0; j < n_bins; ++j){
if (!dt.isNull(con_features(i), false)){
// we don't capture stats when surrogate is NULL
bool is_surrogate_true = (con_features(i) <= con_splits(i, j));
Index col_index = indexConStats(i, j, is_surrogate_true);
updateSurrStats(false,
is_primary_true == is_surrogate_true,
row_index,
col_index,
dup_count);
}
}
}
}
n_rows++;
}
}
return *this;
}
/**
* @brief Merge with another accumulation state
*/
template <class Container, class DTree>
template <class C, class DT>
inline
TreeAccumulator<Container, DTree>&
TreeAccumulator<Container, DTree>::operator<<(
const TreeAccumulator<C, DT>& inOther) {
// assuming that (*this) is not empty. This check needs to happen before
// this function is called.
if (!inOther.empty()) {
if ((n_bins != inOther.n_bins) ||
(n_cat_features != inOther.n_cat_features) ||
(n_con_features != inOther.n_con_features)) {
warning("Inconsistent states during merge.");
terminated = true;
} else {
cat_stats += inOther.cat_stats;
con_stats += inOther.con_stats;
node_stats += inOther.node_stats;
}
}
return *this;
}
// -------------------------------------------------------------------------
/**
* @brief Update the node statistics for given node
*/
template <class Container, class DTree>
inline
void
TreeAccumulator<Container, DTree>::updateNodeStats(bool is_regression,
Index node_index,
const double response,
const double weight) {
ColumnVector stats(stats_per_split);
stats.fill(0);
int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
if (is_regression){
double w_response = weight * response;
stats << weight, w_response, w_response * response, n_rows;
} else {
assert(response >= 0);
stats(static_cast<uint16_t>(response)) = weight;
stats.tail(1)(0) = n_rows;
}
assert(stats_lookup(node_index) >= 0);
node_stats.row(stats_lookup(node_index)) += stats;
}
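// Stats layout sketch (regression, weights_as_rows = false): a tuple
// with weight w = 2 and response y = 3 contributes
//   [w, w*y, w*y^2, 1] = [2, 6, 18, 1]
// to its node's stats row, so stats(1) / stats(0) recovers the
// weighted mean response and tail(1) the raw tuple count used by
// statCount().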
// -------------------------------------------------------------------------
/**
* @brief Update the leaf node statistics for current row in given feature/bin
*/
template <class Container, class DTree>
inline
void
TreeAccumulator<Container, DTree>::updateStats(bool is_regression,
bool is_cat,
Index row_index,
Index stats_index,
const double response,
const double weight) {
ColumnVector stats(stats_per_split);
stats.fill(0);
int n_rows = this->weights_as_rows ? static_cast<int>(weight) : 1;
if (is_regression){
double w_response = weight * response;
stats << weight, w_response, w_response * response, n_rows;
} else {
stats(static_cast<uint16_t>(response)) = weight;
stats.tail(1)(0) = n_rows;
}
Index stats_i = stats_lookup(row_index);
assert(stats_i >= 0);
if (is_cat) {
cat_stats.row(stats_i).segment(stats_index, stats_per_split) += stats;
} else {
con_stats.row(stats_i).segment(stats_index, stats_per_split) += stats;
}
}
// -------------------------------------------------------------------------
/**
* @brief Update the surrogate statistics for current row in given feature/bin
*/
template <class Container, class DTree>
inline
void
TreeAccumulator<Container, DTree>::updateSurrStats(
const bool is_cat, const bool surr_agrees,
Index row_index, Index stats_index, const int dup_count) {
// Note: the below works only if stats_per_split = 2
// 1st position for <= surrogate split and
// 2nd position for > split
ColumnVector stats(stats_per_split);
if (surr_agrees)
stats << dup_count, 0;
else
stats << 0, dup_count;
Index stats_i = stats_lookup(row_index);
assert(stats_i >= 0);
if (is_cat) {
cat_stats.row(stats_i).segment(stats_index, stats_per_split) += stats;
} else {
con_stats.row(stats_i).segment(stats_index, stats_per_split) += stats;
}
}
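// Illustrative example: with stats_per_split = 2, a row whose
// surrogate split agrees with the primary split adds
// [dup_count, 0] to that split's stats block, while a disagreeing
// row adds [0, dup_count].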
// -------------------------------------------------------------------------
template <class Container, class DTree>
inline
Index
TreeAccumulator<Container, DTree>::indexConStats(Index feature_index,
Index bin_index,
bool is_split_true) const {
assert(feature_index < n_con_features);
assert(bin_index < n_bins);
return computeSubIndex(feature_index * n_bins, bin_index, is_split_true);
}
// -------------------------------------------------------------------------
template <class Container, class DTree>
inline
Index
TreeAccumulator<Container, DTree>::indexCatStats(Index feature_index,
int cat_value,
bool is_split_true) const {
// cat_stats is a matrix
// size = (n_reachable_leaf_nodes) x
// (total_n_cat_levels * stats_per_split * 2)
assert(feature_index < n_cat_features);
unsigned int cat_cumsum_value = (feature_index == 0) ? 0 : cat_levels_cumsum(feature_index - 1);
return computeSubIndex(static_cast<Index>(cat_cumsum_value),
static_cast<Index>(cat_value),
is_split_true);
}
// -------------------------------------------------------------------------
template <class Container, class DTree>
inline
Index
TreeAccumulator<Container, DTree>::computeSubIndex(Index start_index,
Index relative_index,
bool is_split_true) const {
Index col_index = static_cast<Index>(stats_per_split * 2 *
(start_index + relative_index));
return is_split_true ? col_index : col_index + stats_per_split;
}
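// Worked example: with stats_per_split = 2 and n_bins = 4,
// indexConStats(1, 2, true) = 2 * 2 * (1*4 + 2) = 24, and the matching
// false-branch block starts at 24 + stats_per_split = 26. Each
// (feature, bin) pair thus owns a contiguous block of
// stats_per_split * 2 columns in con_stats.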
//-------------------------------------------------------------------------
} // namespace recursive_partitioning
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_RP_DT_IMPL_HPP)