| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded |
| # Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded |
| # R String " " Location to read the matrix R which for each feature in X contains the following information |
| # - R[,1]: column ids |
| # - R[,2]: start indices |
| # - R[,3]: end indices |
| # If R is not provided by default all variables are assumed to be scale |
| # bins Int 20 Number of equiheight bins per scale feature to choose thresholds |
| # depth Int 25 Maximum depth of the learned tree |
# num_leaf				Int		10		Number of samples at (or below) which splitting stops and a leaf node is added
# num_samples			Int		3000	Number of samples at (or below) which we switch to in-memory subtree building
| # impurity String "Gini" Impurity measure: entropy or Gini (the default) |
| # M String --- Location to write matrix M containing the learned tree |
| # O String " " Location to write the training accuracy; by default is standard output |
| # S_map String " " Location to write the mappings from scale feature ids to global feature ids |
| # C_map String " " Location to write the mappings from categorical feature ids to global feature ids |
| # fmt String "text" The output format of the model (matrix M), such as "text" or "csv" |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: |
| # M[1,j]: id of node j (in a complete binary tree) |
| # M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 |
| # M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical) |
| # that node j looks at if j is an internal node, otherwise 0 |
| # M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, |
| # otherwise the label that leaf node j is supposed to predict |
| # M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values |
| # stored in rows 6,7,... if j is categorical |
| # If j is a leaf node: number of misclassified samples reaching at node j |
| # M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale, |
| # otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j |
#							If j is a leaf node: 1 if j is impure and the number of samples at j > threshold, otherwise 0
| # ------------------------------------------------------------------------------------------- |
| # HOW TO INVOKE THIS SCRIPT - EXAMPLE: |
| # hadoop jar SystemDS.jar -f decision-tree.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model |
| # bins=20 depth=25 num_leaf=10 num_samples=3000 impurity=Gini fmt=csv |
| |
| |
| # External function for binning |
| binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef) |
| implemented in (classname="org.apache.sysds.udf.lib.BinningWrapper",exectype="mem") |
| |
| |
# Default values of optional parameters; " " encodes "not provided" for file paths
fileR = ifdef ($R, " ");                # feature-format matrix R (optional)
fileO = ifdef ($O, " ");                # training accuracy output (optional)
fileS_map = ifdef ($S_map, " ");        # scale feature id mapping output (optional)
fileC_map = ifdef ($C_map, " ");        # categorical feature id mapping output (optional)
fileM = $M;                             # model output location (required)
num_bins = ifdef($bins, 20);            # no. of equi-height bins per scale feature
depth = ifdef($depth, 25);              # maximum depth of the learned tree
num_leaf = ifdef($num_leaf, 10);        # stop splitting at nodes with <= num_leaf samples
threshold = ifdef ($num_samples, 3000); # switch to in-memory (SMALL node) processing at <= threshold samples
imp = ifdef($impurity, "Gini");         # impurity measure: "entropy" or "Gini"
fmtO = ifdef($fmt, "text");             # output format for the model matrix M

X = read($X);        # feature matrix, recoded and dummy coded
Y_bin = read($Y);    # label matrix, dummy coded: one 0/1 indicator column per class
num_records = nrow (X);
num_classes = ncol (Y_bin);

# Sanity check on Y: a column summing to num_records is all ones, i.e. every
# sample carries that label. Exactly one such column => only one class present
# (nothing to learn); more than one => Y cannot be a valid dummy coding.
Y_bin_sum = sum (colSums (Y_bin) == num_records);
if (Y_bin_sum == 1) {
	stop ("Y contains only one class label. No model will be learned!");
} else if (Y_bin_sum > 1) {
	stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
}
| |
# Split the columns of X into scale (X_scale) and categorical (X_cat) parts
# based on the feature-format matrix R. A dummy-coded (categorical) feature
# spans more than one column of X, i.e., its start and end indices differ.
if (fileR != " ") {
	R = read (fileR);
	R = order (target = R, by = 2); # sort by start indices
	dummy_coded = (R[,2] != R[,3]); # 1 iff feature is categorical (multi-column)
	# R_scale / R_cat: [start, end] column indices of the scale resp. categorical features
	R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
	if (fileS_map != " ") {
		# positions (rows of R) of the scale features: scale feature id -> global feature id
		scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
		write (scale_feature_mapping, fileS_map, format = fmtO);
	}
	if (fileC_map != " ") {
		# positions (rows of R) of the categorical features: cat. feature id -> global feature id
		cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
		write (cat_feature_mapping, fileC_map, format = fmtO);
	}
	sum_dummy = sum (dummy_coded);
	if (sum_dummy == nrow (R)) { # all features categorical
		print ("All features categorical");
		num_cat_features = nrow (R_cat);
		num_scale_features = 0;
		X_cat = X;
		# no. of distinct values (dummy columns) per categorical feature
		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
		distinct_values_max = max (distinct_values);
		distinct_values_offset = cumsum (t (distinct_values)); # cumulative column offsets into X_cat
		distinct_values_overall = sum (distinct_values);       # total no. of dummy columns
	} else if (sum_dummy == 0) { # all features scale
		print ("All features scale");
		num_scale_features = ncol (X);
		num_cat_features = 0;
		X_scale = X;
		distinct_values_max = 1;
	} else { # some features scale some features categorical
		num_cat_features = nrow (R_cat);
		num_scale_features = nrow (R_scale);
		# no. of distinct values (dummy columns) per categorical feature
		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
		distinct_values_max = max (distinct_values);
		distinct_values_offset = cumsum (t (distinct_values)); # cumulative column offsets into X_cat
		distinct_values_overall = sum (distinct_values);       # total no. of dummy columns

		# Build a 0/1 indicator over the columns of X marking categorical columns:
		# scatter +1 at each categorical block's start column and -1 at the column
		# right after its end (suppressed when the block ends at the last column
		# of X), then a cumsum yields 1 for categorical and 0 for scale columns.
		W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
		W = matrix (W, rows = 2 * num_cat_features, cols = 1); # alternating +1/-1 weights
		if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
			W[2 * num_cat_features,] = 0; # last block ends at ncol(X): no -1 marker needed
		}

		last = (R_cat[,2] != ncol (X));
		R_cat1 = (R_cat[,2] + 1) * last;               # end index + 1 (0 when block ends at last column)
		R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1; # NOTE: R_cat[,2] now holds end+1 (except for the last-column case)
		R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);

		col_tab = table (R_cat_vec, 1, W, ncol (X), 1); # scatter the +1/-1 markers over the columns of X
		col_ind = cumsum (col_tab);                     # per-column indicator: 1 = categorical, 0 = scale

		# project X onto its categorical resp. scale columns via 0/1 selection matrices
		col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
		col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
		X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
		X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
	}
} else { # only scale features exist
	print ("All features scale");
	num_scale_features = ncol (X);
	num_cat_features = 0;
	X_scale = X;
	distinct_values_max = 1;
}
| |
# Equi-height binning of scale features: for each scale feature compute up to
# num_bins candidate split thresholds, then expand X_scale into the 0/1 matrix
# X_scale_ext with one column per (feature, threshold) pair, where a cell is 1
# iff the sample's feature value is < that threshold.
if (num_scale_features > 0) {

	print ("COMPUTING BINNING...");
	bin_size = max (as.integer (num_records / num_bins), 1); # samples per bin (at least 1)
	count_thresholds = matrix (0, rows = 1, cols = num_scale_features)  # no. of thresholds actually defined per feature
	thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features) # candidate thresholds per feature (column-wise)
	parfor(i1 in 1:num_scale_features) {
		# binning UDF expects the feature values in sorted order
		col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
		[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
		count_thresholds[,i1] = num_bins_defined;
		thresholds[,i1] = col_bins;
	}

	print ("PREPROCESSING SCALE FEATURE MATRIX...");
	min_num_bins = min (count_thresholds);
	max_num_bins = max (count_thresholds);
	total_num_bins = sum (count_thresholds); # no. of columns of X_scale_ext
	cum_count_thresholds = t (cumsum (t (count_thresholds))); # column offsets of each feature's threshold group

	X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
	parfor (i2 in 1:num_scale_features, check = 0) {
		Xi2 = X_scale[,i2];
		count_threshold = as.scalar (count_thresholds[,i2]);
		# first column of feature i2's threshold group in X_scale_ext
		offset_feature = 1;
		if (i2 > 1) {
			offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
		}

		ti2 = t(thresholds[1:count_threshold, i2]);
		# 0/1 indicators: X_scale[s,i2] < threshold, for all defined thresholds of feature i2
		X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
	}
}
| |
| |
##### INITIALIZATION
# Nodes are identified by their id in a complete binary tree (root id = 1).
# LARGE nodes (more than `threshold` samples) are processed level-wise over
# the full data; SMALL nodes (at most `threshold` samples) are handled by
# in-memory subtree building in the main loop below.
L = matrix (1, rows = num_records, cols = 1); # last visited node id for each training sample (all start at the root)

# ---- model ----
# leaf nodes
# NC_large[,1]: node id
# NC_large[,2]: class label
# NC_large[,3]: no. of misclassified samples
# NC_large[,4]: 1 if special leaf (impure and # samples at that leaf > threshold) or 0 otherwise
NC_large = matrix (0, rows = 4, cols = 1);
NC_small = matrix (0, rows = 4, cols = 1); # same schema as NC_large

# internal nodes
# Q_large[,1]: node id
Q_large = matrix (1, rows = 1, cols = 1); # the root is treated as a LARGE node initially
Q_small = matrix (0, rows = 1, cols = 1); # same schema as Q_large

# F_large[,1]: feature
# F_large[,2]: type (1 = scale, 2 = categorical)
# F_large[,3]: offset (1 for a scale split; size of the chosen value subset for a categorical split)
F_large = matrix (0, rows = 3, cols = 1);
F_small = matrix (0, rows = 3, cols = 1); # same schema as F_large

S_large = matrix (0, rows = 1, cols = 1); # split points for LARGE internal nodes
S_small = matrix (0, rows = 1, cols = 1); # split points for SMALL internal nodes

# temp matrix for tree traversal: ids of the LARGE nodes on the current level
# (the companion cur_nodes_small, built inside the loop, additionally carries
# the no. of samples per node in its second row)
cur_nodes_large = matrix (1, rows = 1, cols = 1); # start at the root


num_cur_nodes_large = 1; # no. of LARGE nodes on the current level
num_cur_nodes_small = 0; # no. of SMALL nodes on the current level
level = 0;
| |
| while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) { |
| |
| level = level + 1; |
| print (" --- start level " + level + " --- "); |
| |
| ##### PREPARE MODEL |
| if (num_cur_nodes_large > 0) { # LARGE nodes to process |
| cur_Q_large = matrix (0, rows = 1, cols = 2 * num_cur_nodes_large); |
| cur_NC_large = matrix (0, rows = 4, cols = 2 * num_cur_nodes_large); |
| cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large); |
| cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max); |
| cur_nodes_small = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large); |
| } |
| |
| ##### LOOP OVER LARGE NODES... |
| parfor (i6 in 1:num_cur_nodes_large, check = 0) { |
| |
| cur_node = as.scalar (cur_nodes_large[,i6]); |
| |
| # --- find best split --- |
| # samples that reach cur_node |
| Ix = (L[,1] == cur_node); |
| |
| cur_Y_bin = Y_bin * Ix; |
| label_counts_overall = colSums (cur_Y_bin); |
| label_sum_overall = sum (label_counts_overall); |
| label_dist_overall = label_counts_overall / label_sum_overall; |
| |
| if (imp == "entropy") { |
| label_dist_zero = (label_dist_overall == 0); |
| cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before |
| } else { # imp == "Gini" |
| cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before |
| } |
| best_scale_gain = 0; |
| best_cat_gain = 0; |
| if (num_scale_features > 0) { |
| |
| # main operation |
| label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext); |
| |
| # compute left and right label distribution |
| label_sum_left = rowSums (label_counts_left_scale); |
| label_dist_left = label_counts_left_scale / label_sum_left; |
| if (imp == "entropy") { |
| label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); |
| log_label_dist_left = log (label_dist_left); # / log (2) |
| impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left); |
| } else { # imp == "Gini" |
| impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left)); |
| } |
| # |
| label_counts_right_scale = - label_counts_left_scale + label_counts_overall; |
| label_sum_right = rowSums (label_counts_right_scale); |
| label_dist_right = label_counts_right_scale / label_sum_right; |
| if (imp == "entropy") { |
| label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); |
| log_label_dist_right = log (label_dist_right); # / log (2) |
| impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right); |
| } else { # imp == "Gini" |
| impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right)); |
| } |
| |
| I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); |
| |
| I_gain_scale = replace (target = I_gain_scale, pattern = NaN, replacement = 0); |
| |
| # determine best feature to split on and the split value |
| best_scale_gain = max (I_gain_scale); |
| max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale))); |
| p = (cum_count_thresholds < max_I_gain_ind_scale); |
| sum_cum_count_thresholds = sum (p); |
| best_scale_feature = sum_cum_count_thresholds + 1; |
| best_scale_split = max_I_gain_ind_scale; |
| if (best_scale_feature != 1) { |
| best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]); |
| } |
| } |
| |
| if (num_cat_features > 0){ |
| |
| # initialization |
| split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); |
| split_values = split_values_bin; |
| split_values_offset = matrix (0, rows = 1, cols = num_cat_features); |
| I_gains = split_values_offset; |
| impurities_left = split_values_offset; |
| impurities_right = split_values_offset; |
| best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes); |
| best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes); |
| |
| # main operation |
| label_counts = t (t (cur_Y_bin) %*% X_cat); |
| |
| parfor (i9 in 1:num_cat_features, check = 0){ |
| |
| start_ind = 1; |
| if (i9 > 1) { |
| start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]); |
| } |
| offset = as.scalar (distinct_values[1,i9]); |
| |
| cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),]; |
| |
| label_sum = rowSums (cur_label_counts); |
| label_dist = cur_label_counts / label_sum; |
| if (imp == "entropy") { |
| label_dist = replace (target = label_dist, pattern = 0, replacement = 1); |
| log_label_dist = log (label_dist); # / log(2) |
| impurity = - rowSums (label_dist * log_label_dist); |
| impurity = replace (target = impurity, pattern = NaN, replacement = 1/0); |
| } else { # imp == "Gini" |
| impurity = rowSums (label_dist * (1 - label_dist)); |
| } |
| |
| # sort cur feature by impurity |
| cur_distinct_values = seq (1, nrow (cur_label_counts)); |
| cur_distinct_values_impurity = cbind (cur_distinct_values, impurity); |
| cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE); |
| P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix |
| label_counts_sorted = P %*% cur_label_counts; |
| |
| # compute left and right label distribution |
| label_counts_left = cumsum (label_counts_sorted); |
| |
| label_sum_left = rowSums (label_counts_left); |
| label_dist_left = label_counts_left / label_sum_left; |
| label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1); |
| if (imp == "entropy") { |
| label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); |
| log_label_dist_left = log (label_dist_left); # / log(2) |
| impurity_left = - rowSums (label_dist_left * log_label_dist_left); |
| } else { # imp == "Gini" |
| impurity_left = rowSums (label_dist_left * (1 - label_dist_left)); |
| } |
| # |
| label_counts_right = - label_counts_left + label_counts_overall; |
| label_sum_right = rowSums (label_counts_right); |
| label_dist_right = label_counts_right / label_sum_right; |
| label_dist_right = replace (target = label_dist_right, pattern = NaN, replacement = 1); |
| if (imp == "entropy") { |
| label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); |
| log_label_dist_right = log (label_dist_right); # / log (2) |
| impurity_right = - rowSums (label_dist_right * log_label_dist_right); |
| } else { # imp == "Gini" |
| impurity_right = rowSums (label_dist_right * (1 - label_dist_right)); |
| } |
| I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right); |
| |
| Ix_label_sum_left_zero = (label_sum_left == 0); |
| Ix_label_sum_right_zero = (label_sum_right == 0); |
| Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero; |
| I_gain = I_gain * (1 - Ix_label_sum_zero); |
| |
| I_gain[nrow (I_gain),] = 0; # last entry invalid |
| |
| max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain))); |
| |
| split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]); |
| for (i10 in 1:max_I_gain_ind) { |
| ind = as.scalar (cur_feature_sorted[i10,1]); |
| if (ind == 1) { |
| split_values_bin[1,start_ind] = 1.0; |
| } else { |
| split_values_bin[1,(start_ind + ind - 1)] = 1.0; |
| } |
| } |
| split_values_offset[1,i9] = max_I_gain_ind; |
| |
| I_gains[1,i9] = max (I_gain); |
| |
| impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]); |
| impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]); |
| best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,]; |
| best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,]; |
| } |
| |
| # determine best feature to split on and the split values |
| best_cat_feature = as.scalar (rowIndexMax (I_gains)); |
| best_cat_gain = max (I_gains); |
| start_ind = 1; |
| if (best_cat_feature > 1) { |
| start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]); |
| } |
| offset = as.scalar (distinct_values[1,best_cat_feature]); |
| best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)]; |
| |
| } |
| |
| # compare best scale feature to best cat. feature and pick the best one |
| if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) { |
| |
| # --- update model --- |
| cur_F_large[1,i6] = best_scale_feature; |
| cur_F_large[2,i6] = 1; |
| cur_F_large[3,i6] = 1; |
| cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature]; |
| |
| left_child = 2 * (cur_node - 1) + 1 + 1; |
| right_child = 2 * (cur_node - 1) + 2 + 1; |
| |
| # samples going to the left subtree |
| Ix_left = X_scale_ext[, max_I_gain_ind_scale]; |
| |
| Ix_left = Ix * Ix_left; |
| Ix_right = Ix * (1 - Ix_left); |
| |
| L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child); |
| L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child); |
| |
| left_child_size = sum (Ix_left); |
| right_child_size = sum (Ix_right); |
| |
| # check if left or right child is a leaf |
| left_pure = FALSE; |
| right_pure = FALSE; |
| cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]); |
| cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]); |
| if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & |
| (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | |
| (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf |
| |
| cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| cur_label_counts_right = label_counts_overall - cur_label_counts_left; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | |
| (left_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | |
| (right_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,]; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } |
| |
| } else if (num_cat_features > 0 & best_cat_gain > 0) { |
| |
| # --- update model --- |
| cur_F_large[1,i6] = best_cat_feature; |
| offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]); |
| S_start_ind = (i6 - 1) * distinct_values_max + 1; |
| cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)]; |
| cur_F_large[3,i6] = offset_nonzero; |
| cur_F_large[2,i6] = 2; |
| |
| left_child = 2 * (cur_node - 1) + 1 + 1; |
| right_child = 2 * (cur_node - 1) + 2 + 1; |
| |
| # samples going to the left subtree |
| Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin); |
| Ix_left = (Ix_left >= 1); |
| |
| Ix_left = Ix * Ix_left; |
| Ix_right = Ix * (1 - Ix_left); |
| |
| L[,1] = L[,1] * (1 - Ix_left) + (Ix_left * left_child); |
| L[,1] = L[,1] * (1 - Ix_right) + (Ix_right * right_child); |
| |
| left_child_size = sum (Ix_left); |
| right_child_size = sum (Ix_right); |
| |
| # check if left or right child is a leaf |
| left_pure = FALSE; |
| right_pure = FALSE; |
| cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]); |
| cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]); |
| if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & |
| (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | |
| (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf |
| |
| cur_label_counts_left = best_label_counts_left[best_cat_feature,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| cur_label_counts_right = label_counts_overall - cur_label_counts_left; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | |
| (left_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_left = best_label_counts_left[best_cat_feature,]; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label |
| left_pure = TRUE; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples |
| |
| } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | |
| (right_child_size <= threshold & (level == depth))) { |
| |
| cur_label_counts_right = best_label_counts_right[best_cat_feature,]; |
| cur_NC_large[1,(2 * i6)] = right_child; |
| cur_NC_large[2,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label |
| right_pure = TRUE; |
| cur_NC_large[3,(2 * i6)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples |
| |
| } |
| } else { |
| |
| print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!"); |
| right_pure = TRUE; |
| left_pure = TRUE; |
| cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node; |
| class_label = as.scalar (rowIndexMax (label_counts_overall)); |
| cur_NC_large[2,(2 * (i6 - 1) + 1)] = class_label; |
| cur_NC_large[3,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall); |
| cur_NC_large[4,(2 * (i6 - 1) + 1)] = 1; # special leaf |
| } |
| |
| # add nodes to Q |
| if (!left_pure) { |
| if (left_child_size > threshold) { |
| cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child; |
| } else { |
| # first row: ids of SMALL internal nodes, second row: no. of samples for these nodes |
| cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child; |
| cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size; |
| } |
| } |
| if (!right_pure) { |
| if (right_child_size > threshold) { |
| cur_Q_large[1,(2 * i6)] = right_child; |
| } else{ |
| # first row: ids of SMALL internal nodes, second row: no. of samples for these nodes |
| cur_nodes_small[1,(2 * i6)] = right_child; |
| cur_nodes_small[2,(2 * i6)] = right_child_size; |
| } |
| } |
| } |
| |
| ##### PREPARE MODEL FOR LARGE NODES |
| if (num_cur_nodes_large > 0) { |
| cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols"); |
| if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large, cur_Q_large); |
| cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols"); |
| if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large, cur_NC_large); |
| |
| cur_F_large = removeEmpty (target = cur_F_large, margin = "cols"); |
| if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large, cur_F_large); |
| cur_S_large = removeEmpty (target = cur_S_large, margin = "cols"); |
| if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large, cur_S_large); |
| |
| num_cur_nodes_large_pre = 2 * num_cur_nodes_large; |
| if (as.scalar (cur_Q_large[1,1]) == 0) { |
| num_cur_nodes_large = 0; |
| } else { |
| cur_nodes_large = cur_Q_large; |
| num_cur_nodes_large = ncol (cur_Q_large); |
| } |
| } |
| |
| ##### PREPARE MODEL FOR SMALL NODES |
| cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols"); |
| if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist |
| num_cur_nodes_small = ncol (cur_nodes_small_nonzero); |
| } |
| |
| if (num_cur_nodes_small > 0) { # SMALL nodes to process |
| reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small; |
| cur_Q_small = matrix (0, rows = 1, cols = reserve_len); |
| cur_NC_small = matrix (0, rows = 4, cols = reserve_len); |
| cur_F_small = matrix (0, rows = 3, cols = reserve_len); |
| cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature |
| } |
| |
| ##### LOOP OVER SMALL NODES... |
| parfor (i7 in 1:num_cur_nodes_small, check = 0) { |
| |
| cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]); |
| |
| # build dataset for SMALL node |
| Ix = (L[,1] == cur_node_small); |
| if (num_scale_features > 0) { |
| X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix); |
| } |
| if (num_cat_features > 0) { |
| X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix); |
| } |
| L_small = removeEmpty (target = L * Ix, margin = "rows"); |
| Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows"); |
| |
| # compute offset |
| start_ind_global = 1; |
| start_ind_S_global = 1; |
| if (i7 > 1) { |
| offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))); |
| start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]); |
| start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max); |
| } |
| |
| Q = matrix (cur_node_small, rows = 1, cols = 1); |
| NC = matrix (0, rows = 4, cols = 1); |
| F = matrix (0, rows = 3, cols = 1); |
| S = matrix (0, rows = 1, cols = 1); |
| |
| cur_nodes_ = matrix (cur_node_small, rows = 1, cols = ncol(cur_Q_small)); |
| |
| num_cur_nodes = 1; |
| level_ = level; |
| while (num_cur_nodes > 0 & level_ < depth) { |
| |
| level_ = level_ + 1; |
| cur_Q = matrix (0, rows = 1, cols = 2 * num_cur_nodes); |
| cur_NC = matrix (0, rows = 4, cols = 2 * num_cur_nodes); |
| cur_F = matrix (0, rows = 3, cols = num_cur_nodes); |
| cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max); |
| |
| parfor (i8 in 1:num_cur_nodes, check = 0) { |
| |
| cur_node = as.scalar (cur_nodes_[,i8]); |
| |
| # --- find best split --- |
| # samples that reach cur_node |
| Ix = (L_small[,1] == cur_node); |
| cur_Y_bin = Y_bin_small * Ix; |
| label_counts_overall = colSums (cur_Y_bin); |
| |
| label_sum_overall = sum (label_counts_overall); |
| label_dist_overall = label_counts_overall / label_sum_overall; |
| if (imp == "entropy") { |
| label_dist_zero = (label_dist_overall == 0); |
| cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); |
| } else { # imp == "Gini" |
| cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); |
| } |
best_scale_gain = 0;
best_cat_gain = 0;
if (num_scale_features > 0) {

  # main operation: one matrix product counts, for every candidate threshold
  # column of X_scale_ext_small, the labels of samples going to the left child
  label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);

  # compute left and right label distribution
  label_sum_left = rowSums (label_counts_left_scale);
  label_dist_left = label_counts_left_scale / label_sum_left;
  if (imp == "entropy") {
    # replace 0 by 1 so that 0-probability classes contribute 0 * log(1) = 0
    label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
    log_label_dist_left = log (label_dist_left); # / log (2)
    impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
  } else { # imp == "Gini"
    impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
  }
  # right-child counts are the complement of the left-child counts
  label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
  label_sum_right = rowSums (label_counts_right_scale);
  label_dist_right = label_counts_right_scale / label_sum_right;
  if (imp == "entropy") {
    label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
    log_label_dist_right = log (label_dist_right); # log (2)
    impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
  } else { # imp == "Gini"
    impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
  }
  # information gain per candidate threshold (weighted impurity decrease)
  I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); # I_gain -> I_gain_scale

  # NaNs arise from empty left/right partitions (division by zero sums)
  I_gain_scale = replace (target = I_gain_scale, pattern = NaN, replacement = 0);

  # determine best feature to split on and the split value:
  # map the best global threshold index back to (feature, local threshold)
  # via the cumulative per-feature threshold counts
  best_scale_gain = max (I_gain_scale);
  max_I_gain_ind_scale = as.scalar (rowIndexMax (t (I_gain_scale)));
  p = (cum_count_thresholds < max_I_gain_ind_scale);
  sum_cum_count_thresholds = sum (p);
  best_scale_feature = sum_cum_count_thresholds + 1;
  best_scale_split = max_I_gain_ind_scale;
  if (best_scale_feature != 1) {
    best_scale_split = best_scale_split - as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
  }
}
| |
if (num_cat_features > 0){

  # initialization of per-feature result buffers
  split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); # 0/1 mask over all category columns
  split_values = split_values_bin;                                         # category ids sent to the left child
  split_values_offset = matrix (0, rows = 1, cols = num_cat_features);     # no. of categories in each best split
  I_gains = split_values_offset;
  impurities_left = split_values_offset;
  impurities_right = split_values_offset;
  best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
  best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);

  # main operation: per-category class histograms over all cat. features at once
  label_counts = t (t (cur_Y_bin) %*% X_cat_small);

  parfor (i9 in 1:num_cat_features, check = 0){

    # column range of feature i9 inside the dummy-coded X_cat_small
    start_ind = 1;
    if (i9 > 1) {
      start_ind = start_ind + as.scalar (distinct_values_offset[(i9 - 1),]);
    }
    offset = as.scalar (distinct_values[1,i9]); # no. of distinct categories of feature i9

    cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];

    # per-category impurity, used only to order the categories
    label_sum = rowSums (cur_label_counts);
    label_dist = cur_label_counts / label_sum;
    if (imp == "entropy") {
      label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
      log_label_dist = log (label_dist); # / log(2)
      impurity = - rowSums (label_dist * log_label_dist);
      # categories absent at this node (NaN) are pushed to the end
      impurity = replace (target = impurity, pattern = NaN, replacement = 1/0);
    } else { # imp == "Gini"
      impurity = rowSums (label_dist * (1 - label_dist));
    }
    # sort cur feature's categories by impurity; candidate splits are then
    # the prefixes of this ordering
    cur_distinct_values = seq (1, nrow (cur_label_counts));
    cur_distinct_values_impurity = cbind (cur_distinct_values, impurity);
    cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
    P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
    label_counts_sorted = P %*% cur_label_counts;

    # compute left and right label distribution: row k of the cumsum holds the
    # class counts when the first k sorted categories go left
    label_counts_left = cumsum (label_counts_sorted);

    label_sum_left = rowSums (label_counts_left);
    label_dist_left = label_counts_left / label_sum_left;
    label_dist_left = replace (target = label_dist_left, pattern = NaN, replacement = 1);
    if (imp == "entropy") {
      label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
      log_label_dist_left = log (label_dist_left); # / log(2)
      impurity_left = - rowSums (label_dist_left * log_label_dist_left);
    } else { # imp == "Gini"
      impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
    }
    # right-child counts are the complement of the left-child counts
    label_counts_right = - label_counts_left + label_counts_overall;
    label_sum_right = rowSums (label_counts_right);
    label_dist_right = label_counts_right / label_sum_right;
    label_dist_right = replace (target = label_dist_right, pattern = NaN, replacement = 1);
    if (imp == "entropy") {
      label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
      log_label_dist_right = log (label_dist_right); # / log (2)
      impurity_right = - rowSums (label_dist_right * log_label_dist_right);
    } else { # imp == "Gini"
      impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
    }
    I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);

    # zero the gain where both partitions are empty
    Ix_label_sum_left_zero = (label_sum_left == 0);
    Ix_label_sum_right_zero = (label_sum_right == 0);
    Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
    I_gain = I_gain * (1 - Ix_label_sum_zero);

    I_gain[nrow (I_gain),] = 0; # last entry invalid (all categories would go left)

    max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));

    # record the winning prefix: the category ids going left and a 0/1 mask
    # over this feature's dummy-coded columns
    split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
    for (i10 in 1:max_I_gain_ind) {
      ind = as.scalar (cur_feature_sorted[i10,1]);
      if (ind == 1) {
        split_values_bin[1,start_ind] = 1.0;
      } else {
        split_values_bin[1,(start_ind + ind - 1)] = 1.0;
      }
    }
    split_values_offset[1,i9] = max_I_gain_ind;

    I_gains[1,i9] = max (I_gain);

    impurities_left[1,i9] = as.scalar (impurity_left[max_I_gain_ind,]);
    impurities_right[1,i9] = as.scalar (impurity_right[max_I_gain_ind,]);
    best_label_counts_left[i9,] = label_counts_left[max_I_gain_ind,];
    best_label_counts_right[i9,] = label_counts_right[max_I_gain_ind,];
  }

  # determine best feature to split on and the split values
  best_cat_feature = as.scalar (rowIndexMax (I_gains));
  best_cat_gain = max (I_gains);
  start_ind = 1;
  if (best_cat_feature > 1) {
    start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
  }
  offset = as.scalar (distinct_values[1,best_cat_feature]);
  best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];

}
| |
# compare best scale feature to best cat. feature and pick the best one.
# FIX: the single-child leaf tests below previously compared the OUTER loop
# counter `level` against depth, while the both-children tests used the inner
# subtree counter `level_` (incremented at the top of this level); all four
# tests now consistently use `level_`.
if (num_scale_features > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {

  # --- update model ---
  cur_F[1,i8] = best_scale_feature; # feature id
  cur_F[2,i8] = 1;                  # feature type 1 = scale
  cur_F[3,i8] = 1;                  # a scale split has exactly one value (the threshold)
  cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[best_scale_split, best_scale_feature];

  # child ids in the implicit binary numbering (children of n are 2n and 2n+1)
  left_child = 2 * (cur_node - 1) + 1 + 1;
  right_child = 2 * (cur_node - 1) + 2 + 1;

  # samples going to the left subtree
  Ix_left = X_scale_ext_small[, max_I_gain_ind_scale];

  Ix_left = Ix * Ix_left;
  Ix_right = Ix * (1 - Ix_left);

  # reassign cur_node's samples to the child nodes
  L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
  L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);

  left_child_size = sum (Ix_left);
  right_child_size = sum (Ix_right);

  # check if left or right child is a leaf: too few samples, pure, or max depth
  left_pure = FALSE;
  right_pure = FALSE;
  cur_impurity_left = as.scalar(impurity_left_scale[max_I_gain_ind_scale,]);
  cur_impurity_right = as.scalar(impurity_right_scale[max_I_gain_ind_scale,]);
  if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
       (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf

    cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

    cur_label_counts_right = label_counts_overall - cur_label_counts_left;
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) { # only left child is leaf

    cur_label_counts_left = label_counts_left_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

  } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) { # only right child is leaf

    cur_label_counts_right = label_counts_right_scale[max_I_gain_ind_scale,];
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  }

} else if (num_cat_features > 0 & best_cat_gain > 0) {

  # --- update model ---
  cur_F[1,i8] = best_cat_feature;
  offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
  S_start_ind = (i8 - 1) * distinct_values_max + 1;
  cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
  cur_F[3,i8] = offset_nonzero; # no. of categories sent to the left child
  cur_F[2,i8] = 2;              # feature type 2 = categorical

  left_child = 2 * (cur_node - 1) + 1 + 1;
  right_child = 2 * (cur_node - 1) + 2 + 1;

  # samples going to the left subtree: samples whose category is in the
  # best split's 0/1 category mask
  Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
  Ix_left = (Ix_left >= 1);

  Ix_left = Ix * Ix_left;
  Ix_right = Ix * (1 - Ix_left);

  # reassign cur_node's samples to the child nodes
  L_small[,1] = L_small[,1] * (1 - Ix_left) + (Ix_left * left_child);
  L_small[,1] = L_small[,1] * (1 - Ix_right) + (Ix_right * right_child);

  left_child_size = sum (Ix_left);
  right_child_size = sum (Ix_right);

  # check if left or right child is a leaf: too few samples, pure, or max depth
  left_pure = FALSE;
  right_pure = FALSE;
  cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
  cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
  if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) &
       (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) ) { # both left and right nodes are leaf

    cur_label_counts_left = best_label_counts_left[best_cat_feature,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

    cur_label_counts_right = label_counts_overall - cur_label_counts_left;
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level_ == depth)) { # only left child is leaf

    cur_label_counts_left = best_label_counts_left[best_cat_feature,];
    cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
    cur_NC[2,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label
    left_pure = TRUE;
    cur_NC[3,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); # no. of misclassified samples

  } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level_ == depth)) { # only right child is leaf

    cur_label_counts_right = best_label_counts_right[best_cat_feature,];
    cur_NC[1,(2 * i8)] = right_child;
    cur_NC[2,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label
    right_pure = TRUE;
    cur_NC[3,(2 * i8)] = right_child_size - max (cur_label_counts_right); # no. of misclassified samples

  }
} else {

  # no split with positive gain exists: declare cur_node itself a "special"
  # leaf (NC row 4 = 1) predicting the majority class
  print ("NUMBER OF SAMPLES AT NODE " + cur_node + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
  right_pure = TRUE;
  left_pure = TRUE;
  cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
  class_label = as.scalar (rowIndexMax (label_counts_overall));
  cur_NC[2,(2 * (i8 - 1) + 1)] = class_label;
  cur_NC[3,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
  cur_NC[4,(2 * (i8 - 1) + 1)] = 1; # special leaf
}
| |
# add nodes to Q: enqueue only non-pure children for expansion at the next level
if (!left_pure) {
  cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child;
}
if (!right_pure) {
  cur_Q[1,(2 * i8)] = right_child;
}
| } |
| |
# drop unused queue slots and append this level's results to the
# per-subtree model accumulators
cur_Q = removeEmpty (target = cur_Q, margin = "cols");
Q = cbind (Q, cur_Q);
NC = cbind (NC, cur_NC);
F = cbind (F, cur_F);
S = cbind (S, cur_S);

num_cur_nodes_pre = 2 * num_cur_nodes;
if (as.scalar (cur_Q[1,1]) == 0) { # removeEmpty left one zero column => queue empty
  num_cur_nodes = 0;
} else {
  cur_nodes_[,1:ncol(cur_Q)] = cur_Q;
  num_cur_nodes = ncol (cur_Q);
}
| } |
| |
# copy the finished subtree model into the global small-node buffers at this
# subtree's column offsets (S uses its own offset since split-value widths vary)
cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
| } |
| |
##### PREPARE MODEL FOR SMALL NODES
if (num_cur_nodes_small > 0) { # small nodes already processed
  # each cur_*_small buffer is over-allocated; drop all-zero columns and, if
  # anything real remains (first entry nonzero), append it to the global model
  cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
  if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small, cur_Q_small);
  cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
  if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small, cur_NC_small);

  cur_F_small = removeEmpty (target = cur_F_small, margin = "cols");
  if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small, cur_F_small);
  cur_S_small = removeEmpty (target = cur_S_small, margin = "cols");
  if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small, cur_S_small);

  num_cur_nodes_small = 0; # reset
}

print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
| } |
| |
# training accuracy: row 3 of NC_large/NC_small holds the no. of
# misclassified samples per leaf
num_misclassified = sum (NC_large[3,]) + sum (NC_small[3,]);
accuracy = (1 - (num_misclassified / num_records)) * 100;
str_o = "no. of misclassified " + num_misclassified + " training accuracy " + accuracy + " %";
if (fileO != " ") { # the default " " means: print instead of writing to file
  ACCURACY = matrix (accuracy, rows = 1, cols = 1);
  write (ACCURACY, fileO, format = fmtO)
} else {
  print (str_o);
}
| |
| |
#### prepare model
print ("PREPARING MODEL...")
# each accumulator was seeded with a single zero column; strip that leading
# dummy column wherever real entries have been appended after it
# large nodes
if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
  Q_large = Q_large[,2:ncol (Q_large)];
}
if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
  NC_large = NC_large[,2:ncol (NC_large)];
}
if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
  S_large = S_large[,2:ncol (S_large)];
}
if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
  F_large = F_large[,2:ncol (F_large)];
}
# small nodes
if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
  Q_small = Q_small[,2:ncol (Q_small)];
}
if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
  NC_small = NC_small[,2:ncol (NC_small)];
}
if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
  S_small = S_small[,2:ncol (S_small)];
}
if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
  F_small = F_small[,2:ncol (F_small)];
}
| |
# check for special leaves and if there are any remove them from Q_large and Q_small;
# special leaves (NC row 4 == 1) are queued nodes that were later declared leaf
# because no gain-positive split existed, so they must not appear as internal nodes
special_large_leaves_ind = NC_large[4,];
if (sum (special_large_leaves_ind) > 0) {
  print ("PROCESSING SPECIAL LARGE LEAVES...");
  special_large_leaves = removeEmpty (target = NC_large[1,] * special_large_leaves_ind, margin = "cols");
  # 1 for queued node ids that are NOT special leaves
  large_internal_ind = 1 - colSums (outer (t (special_large_leaves), Q_large[1,], "=="));
  Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
  F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F (their F columns were never set, i.e. all-zero)
}

special_small_leaves_ind = NC_small[4,];
if (sum (special_small_leaves_ind) > 0) {
  print ("PROCESSING SPECIAL SMALL LEAVES...");
  special_small_leaves = removeEmpty (target = NC_small[1,] * special_small_leaves_ind, margin = "cols");
  small_internal_ind = 1 - colSums (outer (t (special_small_leaves), Q_small[1,], "=="));
  Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
  F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F (all-zero columns)
}
| |
# model corresponding to large internal nodes.
# M1 column layout: [node id; offset to left child (filled later); feature id;
# feature type; #split values; split values...]
#
# FIX: max_offset (max. no. of split values over ALL internal nodes, >= 1) is
# now computed unconditionally. Previously it was computed only inside the
# Q_large branch, so when only SMALL internal nodes existed max_offset stayed 1
# and M1_small was allocated with too few rows for categorical splits with
# more than one split value (out-of-bounds write below the matrix).
max_offset = max (max (max (F_large[3,]), max (F_small[3,])), 1);
no_large_internal_node = FALSE;
if (as.scalar (Q_large[1,1]) != 0) {
  print ("PROCESSING LARGE INTERNAL NODES...");
  num_large_internal = ncol (Q_large);
  M1_large = matrix (0, rows = 5 + max_offset, cols = num_large_internal);
  M1_large[1,] = Q_large;
  M1_large[3:5,] = F_large;
  # process S_large: scatter each node's split values (packed contiguously in
  # S_large) into rows 6..6+offset-1 of its M1 column
  cum_offsets_large = cumsum (t (F_large[3,]));
  parfor (it in 1:num_large_internal, check = 0) {
    start_ind = 1;
    if (it > 1) {
      start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
    }
    offset = as.scalar (F_large[3,it]);
    M1_large[6:(6 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
  }
} else {
  print ("No LARGE internal nodes available");
  no_large_internal_node = TRUE;
}
| |
# model corresponding to small internal nodes; same column layout as M1_large:
# [node id; offset to left child (filled later); feature id; feature type;
# #split values; split values...]
no_small_internal_node = FALSE;
if (as.scalar (Q_small[1,1]) != 0) {
  print ("PROCESSING SMALL INTERNAL NODES...");
  num_small_internal = ncol (Q_small);
  M1_small = matrix (0, rows = 5 + max_offset, cols = num_small_internal);
  M1_small[1,] = Q_small;
  M1_small[3:5,] = F_small;
  # process S_small: scatter each node's packed split values into its column
  cum_offsets_small = cumsum (t (F_small[3,]));
  parfor (it in 1:num_small_internal, check = 0) {
    start_ind = 1;
    if (it > 1) {
      start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
    }
    offset = as.scalar (F_small[3,it]);
    M1_small[6:(6 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
  }
} else {
  print ("No SMALL internal nodes available");
  no_small_internal_node = TRUE;
}
| |
# model corresponding to large leaf nodes; leaf columns carry
# [node id; 0; 0; class label; #misclassified; special-leaf flag; 0...]
no_large_leaf_node = FALSE;
if (as.scalar (NC_large[1,1]) != 0) {
  print ("PROCESSING LARGE LEAF NODES...");
  num_large_leaf = ncol (NC_large);
  M2_large = matrix (0, rows = 5 + max_offset, cols = num_large_leaf);
  M2_large[1,] = NC_large[1,];
  M2_large[4:6,] = NC_large[2:4,];
} else {
  print ("No LARGE leaf nodes available");
  no_large_leaf_node = TRUE;
}
| |
# model corresponding to small leaf nodes; same layout as M2_large
no_small_leaf_node = FALSE;
if (as.scalar (NC_small[1,1]) != 0) {
  print ("PROCESSING SMALL LEAF NODES...");
  num_small_leaf = ncol (NC_small);
  M2_small = matrix (0, rows = 5 + max_offset, cols = num_small_leaf);
  M2_small[1,] = NC_small[1,];
  M2_small[4:6,] = NC_small[2:4,];
} else {
  print ("No SMALL leaf nodes available");
  no_small_leaf_node = TRUE;
}
| |
# merge LARGE and SMALL internal-node models
# NOTE(review): if BOTH flags are true (tree with only a root leaf) this
# references the undefined M1_small -- confirm that case cannot occur here
if (no_large_internal_node) {
  M1 = M1_small;
} else if (no_small_internal_node) {
  M1 = M1_large;
} else {
  M1 = cbind (M1_large, M1_small);
}
| |
# merge LARGE and SMALL leaf-node models.
# FIX: the second branch previously tested no_small_internal_node (copy-paste
# from the internal-node merge above); when only SMALL leaves were missing it
# fell through to cbind and referenced the undefined M2_small. It now tests
# no_small_leaf_node, mirroring the M1 merge structure.
if (no_large_leaf_node) {
  M2 = M2_small;
} else if (no_small_leaf_node) {
  M2 = M2_large;
} else {
  M2 = cbind (M2_large, M2_small);
}
| |
# assemble the full model and sort its columns by node id (row 1)
M = cbind (M1, M2);
M = t (order (target = t (M), by = 1));
| |
# removing redundant subtrees: if two sibling leaves predict the same label,
# the parent's split is useless -- collapse the pair into the parent and
# repeat until no such pair remains
if (ncol (M) > 1) {
  print ("CHECKING FOR REDUNDANT SUBTREES...");
  red_leaf = TRUE;
  process_red_subtree = FALSE;
  invalid_node_ind = matrix (0, rows = 1, cols = ncol (M)); # 1 marks columns to drop
  while (red_leaf & ncol (M) > 1) {
    leaf_ind = (M[3,] == 0);                  # leaves carry no feature id
    labels = M[4,] * leaf_ind;                # leaf labels, 0 for internal nodes
    parent_ids = floor (M[1,] /2);            # parent of node n is floor(n/2)
    # columns are sorted by node id, so siblings occupy adjacent columns
    cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # siebling leaves with same label
    cond2 = (parent_ids[,1:(ncol (M)- 1)] == parent_ids[,2:ncol (M)]); # same parents
    red_leaf_ind = cond1 * cond2 * leaf_ind[,2:ncol (M)]; # marks the RIGHT leaf of each redundant pair

    if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
      red_leaf_ids = red_leaf_ind * M[1,2:ncol (M)];
      red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
      parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
        cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
        cur_parent_id = floor (cur_right_leaf_id / 2);
        cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id)));
        cur_parent_pos = as.scalar(rowIndexMax (M[1,] == cur_parent_id));
        # turn the parent into a leaf carrying the children's prediction
        M[2:nrow (M), cur_parent_pos] = M[2:nrow (M), cur_right_leaf_pos];
        # mark both children with -1 so they are not matched as leaves again
        M[3,cur_right_leaf_pos] = -1;
        M[3,cur_right_leaf_pos - 1] = -1; # left sibling sits in the preceding column
        # parent's misclassification count = sum over both children
        M[5,cur_parent_pos] = M[5,cur_parent_pos] + M[5,cur_right_leaf_pos - 1];
        invalid_node_ind[1,cur_right_leaf_pos] = 1;
        invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
      }
      process_red_subtree = TRUE;
    } else {
      red_leaf = FALSE;
    }
  }

  if (process_red_subtree) {
    print ("REMOVING REDUNDANT SUBTREES...");
    valid_node_ind = (invalid_node_ind == 0);
    M = removeEmpty (target = M * valid_node_ind, margin = "cols");
  }
}
| |
if (ncol (M) > 1) { # if internal nodes exist
  print ("COMPUTING OFFSETS TO THE LEFT CHILD FOR INTERNAL NODES...");
  internal_ind = (M[3,] > 0);                 # internal nodes carry a feature id
  internal_ids = internal_ind * M[1,];
  internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
  a1 = internal_ids_nonzero;       # internal node ids
  a2 = internal_ids_nonzero * 2;   # their left-child ids (child of n is 2n)
  pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") ); # column of each internal node in M
  pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") ); # column of its left child
  # row 2 of M := column distance from each internal node to its left child
  # (0 for leaves), scattered via a table() construction
  M[2,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
}
else {
  print ("The tree model contains only one leaf with label " + as.scalar (M[4,]));
}

write (M, fileM, format = fmtO);
| |
| |