| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded |
| # Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded |
| # R String " " Location to read the matrix R which for each feature in X contains the following information |
| # - R[,1]: column ids |
| # - R[,2]: start indices |
| # - R[,3]: end indices |
| # If R is not provided by default all variables are assumed to be scale |
| # bins Int 20 Number of equiheight bins per scale feature to choose thresholds |
| # depth Int 25 Maximum depth of the learned tree |
# num_leaf				Int		10		Number of samples at (or below) which splitting stops and a leaf node is added
# num_samples			Int		3000	Number of samples at (or below) which we switch to in-memory subtree building
| # impurity String "Gini" Impurity measure: entropy or Gini (the default) |
| # M String --- Location to write matrix M containing the learned tree |
| # O String " " Location to write the training accuracy; by default is standard output |
| # S_map String " " Location to write the mappings from scale feature ids to global feature ids |
| # C_map String " " Location to write the mappings from categorical feature ids to global feature ids |
| # fmt String "text" The output format of the model (matrix M), such as "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: |
| # M[1,j]: id of node j (in a complete binary tree) |
| # M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 |
| # M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical) |
| # that node j looks at if j is an internal node, otherwise 0 |
| # M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, |
| # otherwise the label that leaf node j is supposed to predict |
| # M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values |
| # stored in rows 6,7,... if j is categorical |
| # If j is a leaf node: number of misclassified samples reaching at node j |
| # M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale, |
| # otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j |
#							If j is a leaf node: 1 if j is impure and the number of samples at j > threshold, otherwise 0
| # ------------------------------------------------------------------------------------------- |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f decision-tree.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model |
| # bins=20 depth=25 num_leaf=10 num_samples=3000 impurity=Gini fmt=csv |
| |
| |
| # External function for binning |
| binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef) |
| implemented in (classname="org.apache.sysds.udf.lib.BinningWrapper",exectype="mem") |
| |
| |
# Default values of optional parameters; " " encodes "not provided" for file paths
fileR = ifdef ($R, " ");                # feature-format matrix R (optional)
fileO = ifdef ($O, " ");                # training accuracy output (optional)
fileS_map = ifdef ($S_map, " ");        # scale feature id mapping output (optional)
fileC_map = ifdef ($C_map, " ");        # categorical feature id mapping output (optional)
fileM = $M;                             # model output location (required)
num_bins = ifdef($bins, 20);            # no. of equi-height bins per scale feature
depth = ifdef($depth, 25);              # maximum depth of the learned tree
num_leaf = ifdef($num_leaf, 10);        # stop splitting at nodes with <= num_leaf samples
threshold = ifdef ($num_samples, 3000); # switch to in-memory (SMALL node) processing at <= threshold samples
imp = ifdef($impurity, "Gini");         # impurity measure: "entropy" or "Gini"
fmtO = ifdef($fmt, "text");             # output format for the model matrix M

X = read($X);        # feature matrix, recoded and dummy coded
Y_bin = read($Y);    # label matrix, dummy coded: one 0/1 indicator column per class
num_records = nrow (X);
num_classes = ncol (Y_bin);

# Sanity check on Y: a column summing to num_records is all ones, i.e. every
# sample carries that label. Exactly one such column => only one class present
# (nothing to learn); more than one => Y cannot be a valid dummy coding.
Y_bin_sum = sum (colSums (Y_bin) == num_records);
if (Y_bin_sum == 1) {
	stop ("Y contains only one class label. No model will be learned!");
} else if (Y_bin_sum > 1) {
	stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
}
| |
# Split the columns of X into scale (X_scale) and categorical (X_cat) parts
# based on the feature-format matrix R. A dummy-coded (categorical) feature
# spans more than one column of X, i.e., its start and end indices differ.
if (fileR != " ") {
	R = read (fileR);
	R = order (target = R, by = 2); # sort by start indices
	dummy_coded = (R[,2] != R[,3]); # 1 iff feature is categorical (multi-column)
	# R_scale / R_cat: [start, end] column indices of the scale resp. categorical features
	R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
	if (fileS_map != " ") {
		# positions (rows of R) of the scale features: scale feature id -> global feature id
		scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
		write (scale_feature_mapping, fileS_map, format = fmtO);
	}
	if (fileC_map != " ") {
		# positions (rows of R) of the categorical features: cat. feature id -> global feature id
		cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
		write (cat_feature_mapping, fileC_map, format = fmtO);
	}
	sum_dummy = sum (dummy_coded);
	if (sum_dummy == nrow (R)) { # all features categorical
		print ("All features categorical");
		num_cat_features = nrow (R_cat);
		num_scale_features = 0;
		X_cat = X;
		# no. of distinct values (dummy columns) per categorical feature
		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
		distinct_values_max = max (distinct_values);
		distinct_values_offset = cumsum (t (distinct_values)); # cumulative column offsets into X_cat
		distinct_values_overall = sum (distinct_values);       # total no. of dummy columns
	} else if (sum_dummy == 0) { # all features scale
		print ("All features scale");
		num_scale_features = ncol (X);
		num_cat_features = 0;
		X_scale = X;
		distinct_values_max = 1;
	} else { # some features scale some features categorical
		num_cat_features = nrow (R_cat);
		num_scale_features = nrow (R_scale);
		# no. of distinct values (dummy columns) per categorical feature
		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
		distinct_values_max = max (distinct_values);
		distinct_values_offset = cumsum (t (distinct_values)); # cumulative column offsets into X_cat
		distinct_values_overall = sum (distinct_values);       # total no. of dummy columns

		# Build a 0/1 indicator over the columns of X marking categorical columns:
		# scatter +1 at each categorical block's start column and -1 at the column
		# right after its end (suppressed when the block ends at the last column
		# of X), then a cumsum yields 1 for categorical and 0 for scale columns.
		W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
		W = matrix (W, rows = 2 * num_cat_features, cols = 1); # alternating +1/-1 weights
		if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
			W[2 * num_cat_features,] = 0; # last block ends at ncol(X): no -1 marker needed
		}

		last = (R_cat[,2] != ncol (X));
		R_cat1 = (R_cat[,2] + 1) * last;               # end index + 1 (0 when block ends at last column)
		R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1; # NOTE: R_cat[,2] now holds end+1 (except for the last-column case)
		R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);

		col_tab = table (R_cat_vec, 1, W, ncol (X), 1); # scatter the +1/-1 markers over the columns of X
		col_ind = cumsum (col_tab);                     # per-column indicator: 1 = categorical, 0 = scale

		# project X onto its categorical resp. scale columns via 0/1 selection matrices
		col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
		col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
		X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
		X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
	}
} else { # only scale features exist
	print ("All features scale");
	num_scale_features = ncol (X);
	num_cat_features = 0;
	X_scale = X;
	distinct_values_max = 1;
}
| |
# Equi-height binning of scale features: for each scale feature compute up to
# num_bins candidate split thresholds, then expand X_scale into the 0/1 matrix
# X_scale_ext with one column per (feature, threshold) pair, where a cell is 1
# iff the sample's feature value is < that threshold.
if (num_scale_features > 0) {

	print ("COMPUTING BINNING...");
	bin_size = max (as.integer (num_records / num_bins), 1); # samples per bin (at least 1)
	count_thresholds = matrix (0, rows = 1, cols = num_scale_features)  # no. of thresholds actually defined per feature
	thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features) # candidate thresholds per feature (column-wise)
	parfor(i1 in 1:num_scale_features) {
		# binning UDF expects the feature values in sorted order
		col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
		[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
		count_thresholds[,i1] = num_bins_defined;
		thresholds[,i1] = col_bins;
	}

	print ("PREPROCESSING SCALE FEATURE MATRIX...");
	min_num_bins = min (count_thresholds);
	max_num_bins = max (count_thresholds);
	total_num_bins = sum (count_thresholds); # no. of columns of X_scale_ext
	cum_count_thresholds = t (cumsum (t (count_thresholds))); # column offsets of each feature's threshold group

	X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
	parfor (i2 in 1:num_scale_features, check = 0) {
		Xi2 = X_scale[,i2];
		count_threshold = as.scalar (count_thresholds[,i2]);
		# first column of feature i2's threshold group in X_scale_ext
		offset_feature = 1;
		if (i2 > 1) {
			offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
		}

		ti2 = t(thresholds[1:count_threshold, i2]);
		# 0/1 indicators: X_scale[s,i2] < threshold, for all defined thresholds of feature i2
		X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
	}
}
| |
| |
##### INITIALIZATION
# Nodes are identified by their id in a complete binary tree (root id = 1).
# LARGE nodes (more than `threshold` samples) are processed level-wise over
# the full data; SMALL nodes (at most `threshold` samples) are handled by
# in-memory subtree building in the main loop below.
L = matrix (1, rows = num_records, cols = 1); # last visited node id for each training sample (all start at the root)

# ---- model ----
# leaf nodes
# NC_large[,1]: node id
# NC_large[,2]: class label
# NC_large[,3]: no. of misclassified samples
# NC_large[,4]: 1 if special leaf (impure and # samples at that leaf > threshold) or 0 otherwise
NC_large = matrix (0, rows = 4, cols = 1);
NC_small = matrix (0, rows = 4, cols = 1); # same schema as NC_large

# internal nodes
# Q_large[,1]: node id
Q_large = matrix (1, rows = 1, cols = 1); # the root is treated as a LARGE node initially
Q_small = matrix (0, rows = 1, cols = 1); # same schema as Q_large

# F_large[,1]: feature
# F_large[,2]: type (1 = scale, 2 = categorical)
# F_large[,3]: offset (1 for a scale split; size of the chosen value subset for a categorical split)
F_large = matrix (0, rows = 3, cols = 1);
F_small = matrix (0, rows = 3, cols = 1); # same schema as F_large

S_large = matrix (0, rows = 1, cols = 1); # split points for LARGE internal nodes
S_small = matrix (0, rows = 1, cols = 1); # split points for SMALL internal nodes

# temp matrix for tree traversal: ids of the LARGE nodes on the current level
# (the companion cur_nodes_small, built inside the loop, additionally carries
# the no. of samples per node in its second row)
cur_nodes_large = matrix (1, rows = 1, cols = 1); # start at the root


num_cur_nodes_large = 1; # no. of LARGE nodes on the current level
num_cur_nodes_small = 0; # no. of SMALL nodes on the current level
level = 0;
| |
| while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) { |
| |
| level = level + 1; |
| print (" --- start level " + level + " --- "); |
| |
| ##### PREPARE MODEL |
| if (num_cur_nodes_large > 0) { # LARGE nodes to process |
| cur_Q_large = matrix (0, rows = 1, cols = 2 * num_cur_nodes_large); |
| cur_NC_large = matrix (0, rows = 4, cols = 2 * num_cur_nodes_large); |
| cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large); |
| cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max); |
| cur_nodes_small = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large); |
| } |
| |
| ##### LOOP OVER LARGE NODES... |
| parfor (i6 in 1:num_cur_nodes_large, check = 0) { |
| |
| cur_node = as.scalar (cur_nodes_large[,i6]); |
| |
| # --- find best split --- |
| # samples that reach cur_node |
| Ix = (L[,1] == cur_node); |
| |
| cur_Y_bin = Y_bin * Ix; |
| label_counts_overall = colSums (cur_Y_bin); |
| label_sum_overall = sum (label_counts_overall); |
| label_dist_overall = label_counts_overall / label_sum_overall; |
| |
| if (imp == "entropy") { |
| label_dist_zero = (label_dist_overall == 0); |
| cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before |
| } else { # imp == "Gini" |
| cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before |
| } |
| best_scale_gain = 0; |
| best_cat_gain = 0; |
| if (num_scale_features > 0) { |
| |
| # main operation |
| label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext); |
| |
| # compute left and right label distribution |
| label_sum_left = rowSums (label_counts_left_scale); |
| label_dist_left = label_counts_left_scale / label_sum_left; |
| if (imp == "entropy") { |
| label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); |
| log_label_dist_left = log (label_dist_left); # / log (2) |
| impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left); |
| } else { # imp == "Gini" |
| impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left)); |
| } |
| # |
| label_counts_right_scale = - label_counts_left_scale + label_counts_overall; |
| label_sum_right = rowSums (label_counts_right_scale); |
| label_dist_right = label_counts_right_scale / label_sum_right; |
| if (imp == "entropy") { |
| label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); |
| log_label_dist_right = log (label_dist_right); # / log (2) |
| impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right); |
| } else { # imp == "Gini" |
| impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right)); |
| } |
| |
| I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); |
| |
| I_gain_scale = replace (target = I_gain_scale, pattern = NaN, replacement = 0); |
| |
| # determine best feature to split on and the split value |
| best_scale_gain = max (I_gain_scale); |
| max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale))); |
| p = (cum_count_thresholds < max_I_gain_ind_scale); |
| sum_cum_count_thresholds = sum (p); |
| best_scale_feature = sum_cum_count_thresholds + 1; |
| best_scale_split = max_I_gain_ind_scale; |
| if (best_scale_feature != 1) { |
| best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]); |
| } |
| } |
| |
| if (num_cat_features > 0){ |
| |
| # initialization |
| split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); |
| split_values = split_values_bin; |
| split_values_offset = matrix (0, rows = 1, cols = num_cat_features); |
| I_gains = split_values_offset; |
| impurities_left = split_values_offset; |
| impurities_right = split_values_offset; |
| best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes); |
| best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes); |
| |
| # main operation |
| label_counts = t (t (cur_Y_bin) %*% X_cat); |
| |
| parfor (i9 in 1:num_cat_features, check = 0){ |
| |
| start_ind = 1; |
| if (i9 > 1) { |
| start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]); |
| } |
| offset = as.scalar (distinct_values[1,i9]); |
| |
| cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),]; |
| |
| label_sum = rowSums (cur_label_counts); |
| label_dist = cur_label_counts / label_sum; |
| if (imp == "entropy") { |
| label_dist = replace (target = label_dist, pattern = 0, replacement = 1); |
| log_label_dist = log (label_dist); # / log(2) |
| impurity = - rowSums (label_dist * log_label_dist); |
| impurity = replace (target = impurity, pattern = NaN, replacement = 1/0); |
| } else { # imp == "Gini" |
| impurity = rowSums (label_dist * (1 - label_dist)); |
| } |
| |
| # sort cur feature by impurity |
| cur_distinct_values = seq (1, nrow (cur_label_counts)); |
| cur_distinct_values_impurity = cbind (cur_distinct_values, impurity); |
| cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE); |
| P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix |
| label_counts_sorted = P %*% cur_label_counts; |
| |
| # compute left and right label distribution |
| label_counts_left = cumsum (label_counts_sorted); |
| |
| label_sum_left = rowSums (label_counts_left); |
| label_dist_left = label_counts_left / label_sum_left; |
| label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1); |
| if (imp == "entropy") { |
| label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); |
| log_label_dist_left = log (label_dist_left); # / log(2) |
| impurity_left = - rowSums (label_dist_left * log_label_dist_left); |
| } else { # imp == "Gini" |
| impurity_left = rowSums (label_dist_left * (1 - label_dist_left)); |
| } |
| # |
| label_counts_right = - label_counts_left + label_counts_overall; |
| label_sum_right = rowSums (label_counts_right); |
| label_dist_right = label_counts_right / label_sum_right; |
| label_dist_right = replace (target = label_dist_right, pattern = NaN, replacement = 1); |
| if (imp == "entropy") { |
| label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); |
| log_label_dist_right = log (label_dist_right); # / log (2) |
| impurity_right = - rowSums (label_dist_right * log_label_dist_right); |
| } else { # imp == "Gini" |
| impurity_right = rowSums (label_dist_right * (1 - label_dist_right)); |
| } |
| I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right); |
| |
| Ix_label_sum_left_zero = (label_sum_left == 0); |
| Ix_label_sum_right_zero = (label_sum_right == 0); |
| Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero; |
| I_gain = I_gain * (1 - Ix_label_sum_zero); |
| |
| I_gain[nrow (I_gain),] = 0; # last entry invalid |
| |
| max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain))); |
| |
| split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]); |
| for (i10 in 1:max_I_gain_ind) { |
| ind = as.scalar (cur_feature_sorted[i10,1]); |
| if (ind == 1) { |
| split_values_bin[1,start_ind] = 1.0; |
| } else { |
| split_values_bin[1,(start_ind + ind - 1)] = 1.0; |
| } |
| } |
| split_values_offset[1,i9] = max_I_gain_ind; |
| |
| I_gains[1,i9] = max (I_gain); |
| |
| impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]); |
| impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]); |
| best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,]; |
| best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,]; |
| } |
| |
| # determine best feature to split on and the split values |
| best_cat_feature = as.scalar (rowIndexMax (I_gains)); |
| best_cat_gain = max (I_gains); |
| start_ind = 1; |
| if (best_cat_feature > 1) { |
| start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]); |
| } |
| offset = as.scalar (distinct_values[1,best_cat_feature]); |
| best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)]; |
| |
| } |
| |
| # compare best scale feature to best cat. feature and pick the best one |
| if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) { |
| |
| # --- update model --- |
| cur_F_large[1,i6] = best_scale_feature; |
| cur_F_large[2,i6] = 1; |
| cur_F_large[3,i6] = 1; |
| cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature]; |
| |
| left_child = 2 * (cur_node - 1) + 1 + 1; |
| right_child = 2 * (cur_node - 1) + 2 + 1; |
| |
| # samples going to the left subtree |
| Ix_left = X_scale_ext[, max_I_gain_ind_scale]; |
| |
| Ix_left = Ix * Ix_left; |
| Ix_right = Ix * (1 - Ix_left); |
| |
| L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child); |
| L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child); |
| |
| left_child_size = sum (Ix_left); |
| right_child_size = sum (Ix_right); |
| |
| # check if left or right child is a leaf |
| left_pure = FALSE; |
| right_pure = FALSE; |
| cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]); |
| cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]); |
| if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & |
| (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | |
| (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf |
| |
| cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| cur_label_counts_right = label_counts_overall - cur_label_counts_left; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | |
| (left_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | |
| (right_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } |
| |
| } else if (num_cat_features > 0 & best_cat_gain > 0) { |
| |
| # --- update model --- |
| cur_F_large[1,i6] = best_cat_feature; |
| offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]); |
| S_start_ind = (i6 - 1) * distinct_values_max + 1; |
| cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)]; |
| cur_F_large[3,i6] = offset_nonzero; |
| cur_F_large[2,i6] = 2; |
| |
| left_child = 2 * (cur_node - 1) + 1 + 1; |
| right_child = 2 * (cur_node - 1) + 2 + 1; |
| |
| # samples going to the left subtree |
| Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin); |
| Ix_left = (Ix_left >= 1); |
| |
| Ix_left = Ix * Ix_left; |
| Ix_right = Ix * (1 - Ix_left); |
| |
| L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child); |
| L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child); |
| |
| left_child_size = sum (Ix_left); |
| right_child_size = sum (Ix_right); |
| |
| # check if left or right child is a leaf |
| left_pure = FALSE; |
| right_pure = FALSE; |
| cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]); |
| cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]); |
| if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & |
| (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | |
| (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf |
| |
| cur_label_counts_left = best_label_counts_left[best_cat_feature,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| cur_label_counts_right = label_counts_overall - cur_label_counts_left; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | |
| (left_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_left = best_label_counts_left[best_cat_feature,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | |
| (right_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_right = best_label_counts_right[best_cat_feature,]; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } |
| } else { |
| |
| print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!"); |
| right_pure = TRUE; |
| left_pure = TRUE; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node; |
| class_label = as.scalar (rowIndexMax (label_counts_overall)); |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = class_label; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall); |
| cur_NC_large[4,(2 * (i6 - 1) + 1)] = 1; # special leaf |
| } |
| |
| # add nodes to Q |
| if (!left_pure) { |
| if (left_child_size > threshold) { |
| cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child; |
| } else { |
| # first row: ids of SMALL internal nodes, second row: no. of samples for these nodes |
| cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child; |
| cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size; |
| } |
| } |
| if (!right_pure) { |
| if (right_child_size > threshold) { |
| cur_Q_large[1,(2 * i6)] = right_child; |
| } else{ |
| # first row: ids of SMALL internal nodes, second row: no. of samples for these nodes |
| cur_nodes_small[1,(2 * i6)] = right_child; |
| cur_nodes_small[2,(2 * i6)] = right_child_size; |
| } |
| } |
| } |
| |
| ##### PREPARE MODEL FOR LARGE NODES |
| if (num_cur_nodes_large > 0) { |
| cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols"); |
| if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large, cur_Q_large); |
| cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols"); |
| if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large, cur_NC_large); |
| |
| cur_F_large = removeEmpty (target = cur_F_large, margin = "cols"); |
| if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large, cur_F_large); |
| cur_S_large = removeEmpty (target = cur_S_large, margin = "cols"); |
| if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large, cur_S_large); |
| |
| num_cur_nodes_large_pre = 2 * num_cur_nodes_large; |
| if (as.scalar (cur_Q_large[1,1]) == 0) { |
| num_cur_nodes_large = 0; |
| } else { |
| cur_nodes_large = cur_Q_large; |
| num_cur_nodes_large = ncol (cur_Q_large); |
| } |
| } |
| |
| ##### PREPARE MODEL FOR SMALL NODES |
| cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols"); |
| if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist |
| num_cur_nodes_small = ncol (cur_nodes_small_nonzero); |
| } |
| |
| if (num_cur_nodes_small > 0) { # SMALL nodes to process |
| reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small; |
| cur_Q_small = matrix (0, rows = 1, cols = reserve_len); |
| cur_NC_small = matrix (0, rows = 4, cols = reserve_len); |
| cur_F_small = matrix (0, rows = 3, cols = reserve_len); |
| cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature |
| } |
| |
| ##### LOOP OVER SMALL NODES... |
| parfor (i7 in 1:num_cur_nodes_small, check = 0) { |
| |
| cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]); |
| |
| # build dataset for SMALL node |
| Ix = (L[,1] == cur_node_small); |
| if (num_scale_features > 0) { |
| X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix); |
| } |
| if (num_cat_features > 0) { |
| X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix); |
| } |
| L_small = removeEmpty (target = L * Ix, margin = "rows"); |
| Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows"); |
| |
| # compute offset |
| start_ind_global = 1; |
| start_ind_S_global = 1; |
| if (i7 > 1) { |
| offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))); |
| start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]); |
| start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max); |
| } |
| |
| Q = matrix (cur_node_small, rows = 1, cols = 1); |
| NC = matrix (0, rows = 4, cols = 1); |
| F = matrix (0, rows = 3, cols = 1); |
| S = matrix (0, rows = 1, cols = 1); |
| |
| cur_nodes_ = matrix (cur_node_small, rows = 1, cols = ncol(cur_Q_small)); |
| |
| num_cur_nodes = 1; |
| level_ = level; |
| while (num_cur_nodes > 0 & level_ < depth) { |
| |
| level_ = level_ + 1; |
| cur_Q = matrix (0, rows = 1, cols = 2 * num_cur_nodes); |
| cur_NC = matrix (0, rows = 4, cols = 2 * num_cur_nodes); |
| cur_F = matrix (0, rows = 3, cols = num_cur_nodes); |
| cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max); |
| |
| parfor (i8 in 1:num_cur_nodes, check = 0) { |
| |
| cur_node = as.scalar (cur_nodes_[,i8]); |
| |
| # --- find best split --- |
| # samples that reach cur_node |
| Ix = (L_small[,1] == cur_node); |
| cur_Y_bin = Y_bin_small * Ix; |
| label_counts_overall = colSums (cur_Y_bin); |
| |
| label_sum_overall = sum (label_counts_overall); |
| label_dist_overall = label_counts_overall / label_sum_overall; |
| if (imp == "entropy") { |
| label_dist_zero = (label_dist_overall == 0); |
| cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); |
| } else { # imp == "Gini" |
| cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); |
| } |
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0) {

  # main operation: one matrix product counts, for every candidate threshold
  # column of X_scale_ext_small, the labels of samples going to the left child
  label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);

  # compute left and right label distribution
  label_sum_left = rowSums (label_counts_left_scale);
  label_dist_left = label_counts_left_scale / label_sum_left;
  if (imp == "entropy") {
    # replace 0 by 1 so that 0-probability classes contribute 0 * log(1) = 0
    label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
    log_label_dist_left = log (label_dist_left); # / log (2)
    impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
  } else { # imp == "Gini"
    impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
  }
  # right-child counts are the complement of the left-child counts
  label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
  label_sum_right = rowSums (label_counts_right_scale);
  label_dist_right = label_counts_right_scale / label_sum_right;
  if (imp == "entropy") {
    label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
    log_label_dist_right = log (label_dist_right); # log (2)
    impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
  } else { # imp == "Gini"
    impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
  }
  # information gain per candidate threshold (weighted impurity decrease)
  I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); # I_gain -> I_gain_scale

  # NaNs arise from empty left/right partitions (division by zero sums)
  I_gain_scale = replace (target = I_gain_scale, pattern = NaN, replacement = 0);

  # determine best feature to split on and the split value:
  # map the best global threshold index back to (feature, local threshold)
  # via the cumulative per-feature threshold counts
  best_scale_gain = max (I_gain_scale);
  max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale)));
  p = (cum_count_thresholds < max_I_gain_ind_scale);
  sum_cum_count_thresholds = sum (p);
  best_scale_feature = sum_cum_count_thresholds + 1;
  best_scale_split = max_I_gain_ind_scale;
  if (best_scale_feature != 1) {
    best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
  }
}
| |
if (num_cat_features > 0){

  # initialization of per-feature result buffers
  split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); # 0/1 mask over all category columns
  split_values = split_values_bin;                                         # category ids sent to the left child
  split_values_offset = matrix (0, rows = 1, cols = num_cat_features);     # no. of categories in each best split
  I_gains = split_values_offset;
  impurities_left = split_values_offset;
  impurities_right = split_values_offset;
  best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
  best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);

  # main operation: per-category class histograms over all cat. features at once
  label_counts = t (t (cur_Y_bin) %*% X_cat_small);

  parfor (i9 in 1:num_cat_features, check = 0){

    # column range of feature i9 inside the dummy-coded X_cat_small
    start_ind = 1;
    if (i9 > 1) {
      start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]);
    }
    offset = as.scalar (distinct_values[1,i9]); # no. of distinct categories of feature i9

    cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];

    # per-category impurity, used only to order the categories
    label_sum = rowSums (cur_label_counts);
    label_dist = cur_label_counts / label_sum;
    if (imp == "entropy") {
      label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
      log_label_dist = log (label_dist); # / log(2)
      impurity = - rowSums (label_dist * log_label_dist);
      # categories absent at this node (NaN) are pushed to the end
      impurity = replace (target = impurity, pattern = NaN, replacement = 1/0);
    } else { # imp == "Gini"
      impurity = rowSums (label_dist * (1 - label_dist));
    }
    # sort cur feature's categories by impurity; candidate splits are then
    # the prefixes of this ordering
    cur_distinct_values = seq (1, nrow (cur_label_counts));
    cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
    cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
    P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
    label_counts_sorted = P %*% cur_label_counts;

    # compute left and right label distribution: row k of the cumsum holds the
    # class counts when the first k sorted categories go left
    label_counts_left = cumsum (label_counts_sorted);

    label_sum_left = rowSums (label_counts_left);
    label_dist_left = label_counts_left / label_sum_left;
    label_dist_left = replace (target = label_dist_left, pattern = NaN, replacement = 1);
    if (imp == "entropy") {
      label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
      log_label_dist_left = log (label_dist_left); # / log(2)
      impurity_left = - rowSums (label_dist_left * log_label_dist_left);
    } else { # imp == "Gini"
      impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
    }
    # right-child counts are the complement of the left-child counts
    label_counts_right = - label_counts_left + label_counts_overall;
    label_sum_right = rowSums (label_counts_right);
    label_dist_right = label_counts_right / label_sum_right;
    label_dist_right = replace (target = label_dist_right, pattern = NaN, replacement = 1);
    if (imp == "entropy") {
      label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
      log_label_dist_right = log (label_dist_right); # / log (2)
      impurity_right = - rowSums (label_dist_right * log_label_dist_right);
    } else { # imp == "Gini"
      impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
    }
    I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);

    # zero the gain where both partitions are empty
    Ix_label_sum_left_zero = (label_sum_left == 0);
    Ix_label_sum_right_zero = (label_sum_right == 0);
    Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
    I_gain = I_gain * (1 - Ix_label_sum_zero);

    I_gain[nrow (I_gain),] = 0; # last entry invalid (all categories would go left)

    max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));

    # record the winning prefix: the category ids going left and a 0/1 mask
    # over this feature's dummy-coded columns
    split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
    for (i10 in 1:max_I_gain_ind) {
      ind = as.scalar (cur_feature_sorted[i10,1]);
      if (ind == 1) {
        split_values_bin[1,start_ind] = 1.0;
      } else {
        split_values_bin[1,(start_ind + ind - 1)] = 1.0;
      }
    }
    split_values_offset[1,i9] = max_I_gain_ind;

    I_gains[1,i9] = max (I_gain);

    impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]);
    impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]);
    best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,];
    best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,];
  }

  # determine best feature to split on and the split values
  best_cat_feature = as.scalar (rowIndexMax (I_gains));
  best_cat_gain = max (I_gains);
  start_ind = 1;
  if (best_cat_feature > 1) {
    start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
  }
  offset = as.scalar (distinct_values[1,best_cat_feature]);
  best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];

}
| |
# compare best scale feature to best cat. feature and pick the best one.
# FIX: the single-child leaf tests below previously compared the OUTER loop
# counter `level` against depth, while the both-children tests used the inner
# subtree counter `level_` (incremented at the top of this level); all four
# tests now consistently use `level_`.
if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {

  # --- update model ---
  cur_F[1,i8] = best_scale_feature; # feature id
  cur_F[2,i8] = 1;                  # feature type 1 = scale
  cur_F[3,i8] = 1;                  # a scale split has exactly one value (the threshold)
  cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature];

  # child ids in the implicit binary numbering (children of n are 2n and 2n+1)
  left_child = 2 * (cur_node - 1) + 1 + 1;
  right_child = 2 * (cur_node - 1) + 2 + 1;

  # samples going to the left subtree
  Ix_left = X_scale_ext_small[, max_I_gain_ind_scale];

  Ix_left = Ix * Ix_left;
  Ix_right = Ix * (1 - Ix_left);

  # reassign cur_node's samples to the child nodes
  L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
  L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);

  left_child_size = sum (Ix_left);
  right_child_size = sum (Ix_right);

  # check if left or right child is a leaf: too few samples, pure, or max depth
  left_pure = FALSE;
  right_pure = FALSE;
  cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]);
  cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]);
  if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
       (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf

    cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

    cur_label_counts_right = label_counts_overall - cur_label_counts_left;
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) { # only left child is leaf

    cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

  } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) { # only right child is leaf

    cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  }

} else if (num_cat_features > 0 & best_cat_gain > 0) {

  # --- update model ---
  cur_F[1,i8] = best_cat_feature;
  offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
  S_start_ind = (i8 - 1) * distinct_values_max + 1;
  cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
  cur_F[3,i8] = offset_nonzero; # no. of categories sent to the left child
  cur_F[2,i8] = 2;              # feature type 2 = categorical

  left_child = 2 * (cur_node - 1) + 1 + 1;
  right_child = 2 * (cur_node - 1) + 2 + 1;

  # samples going to the left subtree: samples whose category is in the
  # best split's 0/1 category mask
  Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
  Ix_left = (Ix_left >= 1);

  Ix_left = Ix * Ix_left;
  Ix_right = Ix * (1 - Ix_left);

  # reassign cur_node's samples to the child nodes
  L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
  L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);

  left_child_size = sum (Ix_left);
  right_child_size = sum (Ix_right);

  # check if left or right child is a leaf: too few samples, pure, or max depth
  left_pure = FALSE;
  right_pure = FALSE;
  cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
  cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
  if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
       (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf

    cur_label_counts_left = best_label_counts_left[best_cat_feature,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

    cur_label_counts_right = label_counts_overall - cur_label_counts_left;
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) { # only left child is leaf

    cur_label_counts_left = best_label_counts_left[best_cat_feature,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

  } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) { # only right child is leaf

    cur_label_counts_right = best_label_counts_right[best_cat_feature,];
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  }
} else {

  # no split with positive gain exists: declare cur_node itself a "special"
  # leaf (NC row 4 = 1) predicting the majority class
  print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
  right_pure = TRUE;
  left_pure = TRUE;
  cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
  class_label = as.scalar (rowIndexMax (label_counts_overall));
  cur_NC[2,(2 * (i8 - 1) + 1)] = class_label;
  cur_NC[3,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
  cur_NC[4,(2 * (i8 - 1) + 1)] = 1; # special leaf
}
| |
# add nodes to Q: enqueue only non-pure children for expansion at the next level
if (!left_pure) {
  cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child;
}
if (!right_pure) {
  cur_Q[1,(2 * i8)] = right_child;
}
| } |
| |
# drop unused queue slots and append this level's results to the
# per-subtree model accumulators
cur_Q = removeEmpty (target = cur_Q, margin = "cols");
Q = cbind (Q, cur_Q);
NC = cbind (NC, cur_NC);
F = cbind (F, cur_F);
S = cbind (S, cur_S);

num_cur_nodes_pre = 2 * num_cur_nodes;
if (as.scalar (cur_Q[1,1]) == 0) { # removeEmpty left one zero column => queue empty
  num_cur_nodes = 0;
} else {
  cur_nodes_[,1:ncol(cur_Q)] = cur_Q;
  num_cur_nodes = ncol (cur_Q);
}
| } |
| |
# copy the finished subtree model into the global small-node buffers at this
# subtree's column offsets (S uses its own offset since split-value widths vary)
cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
| } |
| |
##### PREPARE MODEL FOR SMALL NODES
if (num_cur_nodes_small > 0) { # small nodes already processed
  # each cur_*_small buffer is over-allocated; drop all-zero columns and, if
  # anything real remains (first entry nonzero), append it to the global model
  cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
  if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small, cur_Q_small);
  cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
  if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small, cur_NC_small);

  cur_F_small = removeEmpty (target = cur_F_small, margin = "cols");
  if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small, cur_F_small);
  cur_S_small = removeEmpty (target = cur_S_small, margin = "cols");
  if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small, cur_S_small);

  num_cur_nodes_small = 0; # reset
}

print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
| } |
| |
# training accuracy: row 3 of NC_large/NC_small holds the no. of
# misclassified samples per leaf
num_misclassified = sum (NC_large[3,]) + sum (NC_small[3,]);
accuracy = (1 - (num_misclassified / num_records)) * 100;
str_o = "no. of misclassified " + num_misclassified + " training accuracy " + accuracy + " %";
if (fileO != " ") { # the default " " means: print instead of writing to file
  ACCURACY = matrix (accuracy, rows = 1, cols = 1);
  write (ACCURACY, fileO, format = fmtO)
} else {
  print (str_o);
}
| |
| |
#### prepare model
print ("PREPARING MODEL...")
# each accumulator was seeded with a single zero column; strip that leading
# dummy column wherever real entries have been appended after it
# large nodes
if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
  Q_large = Q_large[,2:ncol (Q_large)];
}
if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
  NC_large = NC_large[,2:ncol (NC_large)];
}
if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
  S_large = S_large[,2:ncol (S_large)];
}
if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
  F_large = F_large[,2:ncol (F_large)];
}
# small nodes
if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
  Q_small = Q_small[,2:ncol (Q_small)];
}
if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
  NC_small = NC_small[,2:ncol (NC_small)];
}
if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
  S_small = S_small[,2:ncol (S_small)];
}
if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
  F_small = F_small[,2:ncol (F_small)];
}
| |
# check for special leaves and if there are any remove them from Q_large and Q_small;
# special leaves (NC row 4 == 1) are queued nodes that were later declared leaf
# because no gain-positive split existed, so they must not appear as internal nodes
special_large_leaves_ind = NC_large[4,];
if (sum (special_large_leaves_ind) > 0) {
  print ("PROCESSING SPECIAL LARGE LEAVES...");
  special_large_leaves = removeEmpty (target = NC_large[1,] * special_large_leaves_ind, margin = "cols");
  # 1 for queued node ids that are NOT special leaves
  large_internal_ind = 1 - colSums (outer (t (special_large_leaves), Q_large[1,], "=="));
  Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
  F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F (their F columns were never set, i.e. all-zero)
}

special_small_leaves_ind = NC_small[4,];
if (sum (special_small_leaves_ind) > 0) {
  print ("PROCESSING SPECIAL SMALL LEAVES...");
  special_small_leaves = removeEmpty (target = NC_small[1,] * special_small_leaves_ind, margin = "cols");
  small_internal_ind = 1 - colSums (outer (t (special_small_leaves), Q_small[1,], "=="));
  Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
  F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F (all-zero columns)
}
| |
# model corresponding to large internal nodes.
# M1 column layout: [node id; offset to left child (filled later); feature id;
# feature type; #split values; split values...]
#
# FIX: max_offset (max. no. of split values over ALL internal nodes, >= 1) is
# now computed unconditionally. Previously it was computed only inside the
# Q_large branch, so when only SMALL internal nodes existed max_offset stayed 1
# and M1_small was allocated with too few rows for categorical splits with
# more than one split value (out-of-bounds write below the matrix).
max_offset = max (max (max (F_large[3,]), max (F_small[3,])), 1);
no_large_internal_node = FALSE;
if (as.scalar (Q_large[1,1]) != 0) {
  print ("PROCESSING LARGE INTERNAL NODES...");
  num_large_internal = ncol (Q_large);
  M1_large = matrix (0, rows = 5 + max_offset, cols = num_large_internal);
  M1_large[1,] = Q_large;
  M1_large[3:5,] = F_large;
  # process S_large: scatter each node's split values (packed contiguously in
  # S_large) into rows 6..6+offset-1 of its M1 column
  cum_offsets_large = cumsum (t (F_large[3,]));
  parfor (it in 1:num_large_internal, check = 0) {
    start_ind = 1;
    if (it > 1) {
      start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
    }
    offset = as.scalar (F_large[3,it]);
    M1_large[6:(6 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
  }
} else {
  print ("No LARGE internal nodes available");
  no_large_internal_node = TRUE;
}
| |
# model corresponding to small internal nodes; same column layout as M1_large:
# [node id; offset to left child (filled later); feature id; feature type;
# #split values; split values...]
no_small_internal_node = FALSE;
if (as.scalar (Q_small[1,1]) != 0) {
  print ("PROCESSING SMALL INTERNAL NODES...");
  num_small_internal = ncol (Q_small);
  M1_small = matrix (0, rows = 5 + max_offset, cols = num_small_internal);
  M1_small[1,] = Q_small;
  M1_small[3:5,] = F_small;
  # process S_small: scatter each node's packed split values into its column
  cum_offsets_small = cumsum (t (F_small[3,]));
  parfor (it in 1:num_small_internal, check = 0) {
    start_ind = 1;
    if (it > 1) {
      start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
    }
    offset = as.scalar (F_small[3,it]);
    M1_small[6:(6 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
  }
} else {
  print ("No SMALL internal nodes available");
  no_small_internal_node = TRUE;
}
| |
# model corresponding to large leaf nodes; leaf columns carry
# [node id; 0; 0; class label; #misclassified; special-leaf flag; 0...]
no_large_leaf_node = FALSE;
if (as.scalar (NC_large[1,1]) != 0) {
  print ("PROCESSING LARGE LEAF NODES...");
  num_large_leaf = ncol (NC_large);
  M2_large = matrix (0, rows = 5 + max_offset, cols = num_large_leaf);
  M2_large[1,] = NC_large[1,];
  M2_large[4:6,] = NC_large[2:4,];
} else {
  print ("No LARGE leaf nodes available");
  no_large_leaf_node = TRUE;
}
| |
# model corresponding to small leaf nodes; same layout as M2_large
no_small_leaf_node = FALSE;
if (as.scalar (NC_small[1,1]) != 0) {
  print ("PROCESSING SMALL LEAF NODES...");
  num_small_leaf = ncol (NC_small);
  M2_small = matrix (0, rows = 5 + max_offset, cols = num_small_leaf);
  M2_small[1,] = NC_small[1,];
  M2_small[4:6,] = NC_small[2:4,];
} else {
  print ("No SMALL leaf nodes available");
  no_small_leaf_node = TRUE;
}
| |
# merge LARGE and SMALL internal-node models
# NOTE(review): if BOTH flags are true (tree with only a root leaf) this
# references the undefined M1_small -- confirm that case cannot occur here
if (no_large_internal_node) {
  M1 = M1_small;
} else if (no_small_internal_node) {
  M1 = M1_large;
} else {
  M1 = cbind (M1_large, M1_small);
}
| |
# merge LARGE and SMALL leaf-node models.
# FIX: the second branch previously tested no_small_internal_node (copy-paste
# from the internal-node merge above); when only SMALL leaves were missing it
# fell through to cbind and referenced the undefined M2_small. It now tests
# no_small_leaf_node, mirroring the M1 merge structure.
if (no_large_leaf_node) {
  M2 = M2_small;
} else if (no_small_leaf_node) {
  M2 = M2_large;
} else {
  M2 = cbind (M2_large, M2_small);
}
| |
# assemble the full model and sort its columns by node id (row 1)
M = cbind (M1, M2);
M = t (order (target = t (M), by = 1));
| |
# removing redundant subtrees: if two sibling leaves predict the same label,
# the parent's split is useless -- collapse the pair into the parent and
# repeat until no such pair remains
if (ncol (M) > 1) {
  print ("CHECKING FOR REDUNDANT SUBTREES...");
  red_leaf = TRUE;
  process_red_subtree = FALSE;
  invalid_node_ind = matrix (0, rows = 1, cols = ncol (M)); # 1 marks columns to drop
  while (red_leaf & ncol (M) > 1) {
    leaf_ind = (M[3,] == 0);                  # leaves carry no feature id
    labels = M[4,] * leaf_ind;                # leaf labels, 0 for internal nodes
    parent_ids = floor (M[1,] /2);            # parent of node n is floor(n/2)
    # columns are sorted by node id, so siblings occupy adjacent columns
    cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # siebling leaves with same label
    cond2 = (parent_ids[,1:(ncol (M)- 1)] == parent_ids[,2:ncol (M)]); # same parents
    red_leaf_ind = cond1 * cond2 * leaf_ind[,2:ncol (M)]; # marks the RIGHT leaf of each redundant pair

    if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
      red_leaf_ids = red_leaf_ind * M[1,2:ncol (M)];
      red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
      parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
        cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
        cur_parent_id = floor (cur_right_leaf_id / 2);
        cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id)));
        cur_parent_pos = as.scalar(rowIndexMax (M[1,] == cur_parent_id));
        # turn the parent into a leaf carrying the children's prediction
        M[2:nrow (M), cur_parent_pos] = M[2:nrow (M), cur_right_leaf_pos];
        # mark both children with -1 so they are not matched as leaves again
        M[3,cur_right_leaf_pos] = -1;
        M[3,cur_right_leaf_pos - 1] = -1; # left sibling sits in the preceding column
        # parent's misclassification count = sum over both children
        M[5,cur_parent_pos] = M[5,cur_parent_pos] + M[5,cur_right_leaf_pos - 1];
        invalid_node_ind[1,cur_right_leaf_pos] = 1;
        invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
      }
      process_red_subtree = TRUE;
    } else {
      red_leaf = FALSE;
    }
  }

  if (process_red_subtree) {
    print ("REMOVING REDUNDANT SUBTREES...");
    valid_node_ind = (invalid_node_ind == 0);
    M = removeEmpty (target = M * valid_node_ind, margin = "cols");
  }
}
| |
if (ncol (M) > 1) { # if internal nodes exist
  print ("COMPUTING OFFSETS TO THE LEFT CHILD FOR INTERNAL NODES...");
  internal_ind = (M[3,] > 0);                 # internal nodes carry a feature id
  internal_ids = internal_ind * M[1,];
  internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
  a1 = internal_ids_nonzero;       # internal node ids
  a2 = internal_ids_nonzero * 2;   # their left-child ids (child of n is 2n)
  pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") ); # column of each internal node in M
  pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") ); # column of its left child
  # row 2 of M := column distance from each internal node to its left child
  # (0 for leaves), scattered via a table() construction
  M[2,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
}
else {
  print ("The tree model contains only one leaf with label " + as.scalar (M[4,]));
}

write (M, fileM, format = fmtO);
| |
| |