| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # |
| # THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Matrix[Double] --- Feature matrix X; note that X needs to be both recoded and dummy coded |
| # Y Matrix[Double] --- Label matrix Y; note that Y needs to be both recoded and dummy coded |
| # R Matrix[Double] " " Matrix R which for each feature in X contains the following information |
| # - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates |
| # a scalar feature vector, other positive Integers indicate the number of categories |
| # If R is not provided by default all variables are assumed to be scale |
| # bins Integer 20 Number of equiheight bins per scale feature to choose thresholds |
| # depth Integer 25 Maximum depth of the learned tree |
| # verbose Boolean FALSE boolean specifying if the algorithm should print information while executing |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
| # Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: |
| # M[1,j]: id of node j (in a complete binary tree) |
| # M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 |
| # M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical) |
| # that node j looks at if j is an internal node, otherwise 0 |
| # M[4,j]: Type of the feature that node j looks at if j is an internal node: holds the same information as R input vector |
| # M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values |
| # stored in rows 6,7,... if j is categorical |
| # If j is a leaf node: number of misclassified samples reaching at node j |
| # M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale, |
| # otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j |
| # If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 |
| # ------------------------------------------------------------------------------------------- |
| |
| m_decisionTree = function( |
| Matrix[Double] X, |
| Matrix[Double] Y, |
| Matrix[Double] R, |
| Integer bins = 10, |
| Integer depth = 20, |
| Boolean verbose = FALSE |
| ) return (Matrix[Double] M) { |
| if (verbose) { |
| print("Executing Decision Tree:") |
| } |
| node_queue = matrix(1, rows=1, cols=1) # Add first Node |
| impurity_queue = matrix(1, rows=1, cols=1) |
| use_cols_queue = matrix(1, rows=ncol(X), cols=1) # Add fist bool Vector with all cols <=> (use all cols) |
| use_rows_queue = matrix(1, rows=nrow(X), cols=1) # Add fist bool Vector with all rows <=> (use all rows) |
| queue_length = 1 |
| M = matrix(0, rows = 0, cols = 0) |
| while (queue_length > 0) { |
| [node_queue, node] = dataQueuePop(node_queue) |
| [use_rows_queue, use_rows_vector] = dataQueuePop(use_rows_queue) |
| [use_cols_queue, use_cols_vector] = dataQueuePop(use_cols_queue) |
| |
| available_rows = calcAvailable(use_rows_vector) |
| available_cols = calcAvailable(use_cols_vector) |
| [impurity_queue, parent_impurity] = dataQueuePop(impurity_queue) |
| create_child_nodes_flag = FALSE |
| if (verbose) { |
| print("Popped Node: " + as.scalar(node)) |
| print("Rows: " + toString(t(use_rows_vector))) |
| print("Cols: " + toString(t(use_cols_vector))) |
| print("Available Rows: " + available_rows) |
| print("Available Cols: " + available_cols) |
| print("Parent impurity: " + as.scalar(parent_impurity)) |
| } |
| |
| node_depth = calculateNodeDepth(node) |
| used_col = 0.0 |
| if (node_depth < depth & available_rows > 1 & available_cols > 0 & as.scalar(parent_impurity) > 0) { |
| [impurity, used_col, threshold, type] = calcBestSplittingCriteria(X, Y, R, use_rows_vector, use_cols_vector, bins) |
| create_child_nodes_flag = impurity < as.scalar(parent_impurity) |
| if (verbose) { |
| print("Current impurity: " + impurity) |
| print("Current threshold: "+ toString(t(threshold))) |
| } |
| } |
| if (verbose) { |
| |
| print("Current column: " + used_col) |
| print("Current type: " + type) |
| } |
| if (create_child_nodes_flag) { |
| [left, right] = calculateChildNodes(node) |
| node_queue = dataQueuePush(left, right, node_queue) |
| |
| [new_use_cols_vector, left_use_rows_vector, right_use_rows_vector] = splitData(X, use_rows_vector, use_cols_vector, used_col, threshold, type) |
| use_rows_queue = dataQueuePush(left_use_rows_vector, right_use_rows_vector, use_rows_queue) |
| use_cols_queue = dataQueuePush(new_use_cols_vector, new_use_cols_vector, use_cols_queue) |
| |
| impurity_queue = dataQueuePush(matrix(impurity, rows = 1, cols = 1), matrix(impurity, rows = 1, cols = 1), impurity_queue) |
| offset = dataQueueLength(node_queue) - 1 |
| M = outputMatrixBind(M, node, offset, used_col, R, threshold) |
| } else { |
| M = outputMatrixBind(M, node, 0.0, used_col, R, matrix(0, rows = 1, cols = 1)) |
| } |
| queue_length = dataQueueLength(node_queue)# -- user-defined function calls not supported in relational expressions |
| |
| if (verbose) { |
| print("New QueueLen: " + queue_length) |
| print("") |
| } |
| } |
| } |
| |
| dataQueueLength = function(Matrix[Double] queue) return (Double len) { |
| len = ncol(queue) |
| } |
| |
| dataQueuePop = function(Matrix[Double] queue) return (Matrix[Double] new_queue, Matrix[Double] node) { |
| node = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force the creation of a new object |
| node = matrix(node, rows=nrow(queue), cols=1) # reshape to force the creation of a new object |
| len = dataQueueLength(queue) |
| if (len < 2) { |
| new_queue = matrix(0,0,0) |
| } else { |
| new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1) |
| } |
| } |
| |
| dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue) return (Matrix[Double] new_queue) { |
| len = dataQueueLength(queue) |
| if(len <= 0) { |
| new_queue = cbind(left, right) |
| } else { |
| new_queue = cbind(queue, left, right) |
| } |
| } |
| |
| dataVectorLength = function(Matrix[Double] vector) return (Double len) { |
| len = nrow(vector) |
| } |
| |
| dataColVectorLength = function(Matrix[Double] vector) return (Double len) { |
| len = ncol(vector) |
| } |
| |
| dataVectorGet = function(Matrix[Double] vector, Double index) return (Double value) { |
| value = as.scalar(vector[index, 1]) |
| } |
| |
| dataVectorSet = function(Matrix[Double] vector, Double index, Double data) return (Matrix[Double] new_vector) { |
| vector[index, 1] = data |
| new_vector = vector |
| } |
| |
| calcAvailable = function(Matrix[Double] vector) return(Double available_elements){ |
| len = dataVectorLength(vector) |
| available_elements = 0.0 |
| for (index in 1:len) { |
| element = dataVectorGet(vector, index) |
| if(element > 0.0) { |
| available_elements = available_elements + 1.0 |
| } |
| } |
| } |
| |
| calculateNodeDepth = function(Matrix[Double] node) return(Double depth) { |
| depth = log(as.scalar(node), 2) + 1 |
| } |
| |
| calculateChildNodes = function(Matrix[Double] node) return(Matrix[Double] left, Matrix[Double] right) { |
| left = node * 2.0 |
| right = node * 2.0 + 1.0 |
| } |
| |
| getTypeOfCol = function(Matrix[Double] R, Double col) return(Double type) { # 1..scalar, 2..categorical |
| type = as.scalar(R[1, col]) |
| } |
| |
| extrapolateOrderedScalarFeatures = function( |
| Matrix[Double] X, |
| Matrix[Double] use_rows_vector, |
| Double col) return (Matrix[Double] feature_vector) { |
| feature_vector = matrix(1, rows = 1, cols = 1) |
| len = nrow(X) |
| first_time = TRUE |
| for(row in 1:len) { |
| use_feature = dataVectorGet(use_rows_vector, row) |
| if (use_feature != 0) { |
| if(first_time) { |
| feature_vector[1,1] = X[row, col] |
| first_time = FALSE |
| } else { |
| feature_vector = rbind(feature_vector, X[row, col]) |
| } |
| } |
| } |
| feature_vector = order(target=feature_vector, by=1, decreasing=FALSE, index.return=FALSE) |
| } |
| |
| calcPossibleThresholdsScalar = function( |
| Matrix[Double] X, |
| Matrix[Double] use_rows_vector, |
| Double col, |
| int bins) return (Matrix[Double] thresholds) { |
| ordered_features = extrapolateOrderedScalarFeatures(X, use_rows_vector, col) |
| ordered_features_len = dataVectorLength(ordered_features) |
| thresholds = matrix(1, rows = 1, cols = ordered_features_len - 1) |
| virtual_length = min(ordered_features_len, 20) |
| step_length = ordered_features_len / virtual_length |
| if (ordered_features_len > 1) { |
| for (index in 1:(virtual_length - 1)) { |
| real_index = index * step_length |
| mean = (dataVectorGet(ordered_features, real_index) + dataVectorGet(ordered_features, real_index + 1)) / 2 |
| thresholds[1, index] = mean |
| } |
| } |
| } |
| |
| calcPossibleThresholdsCategory = function(Double type) return (Matrix[Double] thresholds) { |
| numberThresholds = 2 ^ type |
| thresholds = matrix(-1, rows = type, cols = numberThresholds) |
| toggleFactor = numberThresholds / 2 |
| |
| for (index in 1:type) { |
| beginCols = 1 |
| endCols = toggleFactor |
| iterations = numberThresholds / toggleFactor / 2 |
| for (it in 1:iterations) { |
| category_val = type - index + 1 |
| thresholds[index, beginCols:endCols] = matrix(category_val, rows = 1, cols = toggleFactor) |
| endCols = endCols + 2 * toggleFactor |
| beginCols = beginCols + 2 * toggleFactor |
| } |
| toggleFactor = toggleFactor / 2 |
| iterations = numberThresholds / toggleFactor / 2 |
| } |
| ncol = ncol(thresholds) |
| if (ncol > 2.0) { |
| thresholds = cbind(thresholds[,2:ncol-2], thresholds[,ncol-1]) |
| } |
| } |
| |
| calcGiniImpurity = function(Double num_true, Double num_false) return (Double impurity) { |
| prop_true = num_true / (num_true + num_false) |
| prop_false = num_false / (num_true + num_false) |
| impurity = 1 - (prop_true ^ 2) - (prop_false ^ 2) |
| } |
| |
| calcImpurity = function( |
| Matrix[Double] X, |
| Matrix[Double] Y, |
| Matrix[Double] use_rows_vector, |
| Double col, |
| Double type, |
| int bins) return (Double impurity, Matrix[Double] threshold) { |
| |
| is_scalar_type = typeIsScalar(type) |
| if (is_scalar_type) { |
| possible_thresholds = calcPossibleThresholdsScalar(X, use_rows_vector, col, bins) |
| } else { |
| possible_thresholds = calcPossibleThresholdsCategory(type) |
| } |
| len_thresholds = ncol(possible_thresholds) |
| impurity = 1 |
| threshold = matrix(0, rows=1, cols=1) |
| for (index in 1:len_thresholds) { |
| [false_rows, true_rows] = splitRowsVector(X, use_rows_vector, col, possible_thresholds[, index], type) |
| num_true_positive = 0; num_false_positive = 0; num_true_negative = 0; num_false_negative = 0 |
| len = dataVectorLength(use_rows_vector) |
| for (c_row in 1:len) { |
| true_row_data = dataVectorGet(true_rows, c_row) |
| false_row_data = dataVectorGet(false_rows, c_row) |
| if (true_row_data != 0 & false_row_data == 0) { # IT'S POSITIVE! |
| if (as.scalar(Y[c_row, 1]) != 0) { |
| num_true_positive = num_true_positive + 1 |
| } else { |
| num_false_positive = num_false_positive + 1 |
| } |
| } else if (true_row_data == 0 & false_row_data != 0) { # IT'S NEGATIVE |
| if (as.scalar(Y[c_row, 1]) != 0.0) { |
| num_false_negative = num_false_negative + 1 |
| } else { |
| num_true_negative = num_true_negative + 1 |
| } |
| } |
| } |
| impurity_positive_branch = calcGiniImpurity(num_true_positive, num_false_positive) |
| impurity_negative_branch = calcGiniImpurity(num_true_negative, num_false_negative) |
| num_samples = num_true_positive + num_false_positive + num_true_negative + num_false_negative |
| num_negative = num_true_negative + num_false_negative |
| num_positive = num_true_positive + num_false_positive |
| c_impurity = num_positive / num_samples * impurity_positive_branch + num_negative / num_samples * impurity_negative_branch |
| if (c_impurity <= impurity) { |
| impurity = c_impurity |
| threshold = possible_thresholds[, index] |
| } |
| } |
| } |
| |
| calcBestSplittingCriteria = function( |
| Matrix[Double] X, |
| Matrix[Double] Y, |
| Matrix[Double] R, |
| Matrix[Double] use_rows_vector, |
| Matrix[Double] use_cols_vector, |
| int bins) return (Double impurity, Double used_col, Matrix[Double] threshold, Double type) { |
| |
| impurity = 1 |
| used_col = 1 |
| threshold = matrix(0, 1, 1) |
| type = 1 |
| # -- user-defined function calls not supported for iterable predicates |
| len = dataVectorLength(use_cols_vector) |
| for (c_col in 1:len) { |
| use_feature = dataVectorGet(use_cols_vector, c_col) |
| if (use_feature != 0) { |
| c_type = getTypeOfCol(R, c_col) |
| [c_impurity, c_threshold] = calcImpurity(X, Y, use_rows_vector, c_col, c_type, bins) |
| if(c_impurity <= impurity) { |
| impurity = c_impurity |
| used_col = c_col |
| threshold = c_threshold |
| type = c_type |
| } |
| } |
| } |
| } |
| |
| typeIsScalar = function(Double type) return(Boolean b) { |
| b = type == 1.0 |
| } |
| |
| splitRowsVector = function( |
| Matrix[Double] X, |
| Matrix[Double] use_rows_vector, |
| Double col, |
| Matrix[Double] threshold, |
| Double type |
| ) return (Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) { |
| type_is_scalar = typeIsScalar(type) |
| false_use_rows_vector = use_rows_vector |
| true_use_rows_vector = use_rows_vector |
| |
| if (type_is_scalar) { |
| scalar_threshold = as.scalar(threshold[1,1]) |
| len = dataVectorLength(use_rows_vector) |
| for (c_row in 1:len) { |
| row_enabled = dataVectorGet(use_rows_vector, c_row) |
| if (row_enabled != 0) { |
| if (as.scalar(X[c_row, col]) > scalar_threshold) { |
| false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0) |
| } else { |
| true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0) |
| } |
| } |
| } |
| } else { |
| len = dataVectorLength(use_rows_vector) |
| for (c_row in 1:len) { |
| row_enabled = dataVectorGet(use_rows_vector, c_row) |
| if (row_enabled != 0) { |
| categories_len = dataColVectorLength(threshold) |
| move_sample_to_true_set = FALSE |
| for (category_col_index in 1:categories_len) { |
| desired_category = as.scalar(X[c_row, col]) |
| if(desired_category != -1) { |
| category_of_threshold = threshold[type - desired_category + 1, category_col_index] |
| move_sample_to_true_set = as.scalar(X[c_row, col]) == as.scalar(category_of_threshold) |
| } else { |
| #Todo: has category -1 to be considered? |
| move_sample_to_true_set = TRUE |
| } |
| } |
| if (move_sample_to_true_set) { |
| false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0) |
| } else { |
| true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0) |
| } |
| } |
| } |
| } |
| } |
| |
| splitData = function( |
| Matrix[Double] X, |
| Matrix[Double] use_rows_vector, |
| Matrix[Double] use_cols_vector, |
| Double col, |
| Matrix[Double] threshold, |
| Double type |
| ) return (Matrix[Double] new_use_cols_vector, Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) { |
| new_use_cols_vector = dataVectorSet(use_cols_vector, col, 0.0) |
| [false_use_rows_vector, true_use_rows_vector] = splitRowsVector(X, use_rows_vector, col, threshold, type) |
| } |
| |
| outputMatrixBind = function( |
| Matrix[Double] M, |
| Matrix[Double] node, |
| Double offset, |
| Double used_col, |
| Matrix[Double] R, |
| Matrix[Double] threshold |
| ) return (Matrix[Double] new_M) { |
| col = matrix(0, rows = 5, cols = 1) |
| col[1, 1] = node[1, 1] |
| col[2, 1] = offset |
| col[3, 1] = used_col |
| if (used_col >= 1.0) { col[4, 1] = R[1, used_col] } |
| col[5, 1] = nrow(threshold) |
| col = rbind(col, threshold) |
| |
| if (ncol(M) == 0 & nrow(M) == 0) { |
| new_M = col |
| } else { |
| row_difference = nrow(M) - nrow(col) |
| if (row_difference < 0.0) { |
| buffer = matrix(-1, rows = -row_difference, cols = ncol(M)) |
| M = rbind(M, buffer) |
| } else if (row_difference > 0.0) { |
| buffer = matrix(-1, rows = row_difference, cols = 1) |
| col = rbind(col, buffer) |
| } |
| new_M = cbind(M, col) |
| } |
| } |