| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
# X               Matrix[Double] ---      Feature matrix X; note that X needs to be both recoded and dummy coded
# Y               Matrix[Double] ---      Label matrix Y; note that Y needs to be both recoded and dummy coded
# R               Matrix[Double] 1, 1xn   Matrix R; 1xn row vector which encodes the type of each feature in X:
#                                         - R[1,j] = 1: feature j is a scalar feature
#                                         - R[1,j] = 2: feature j is a categorical feature
#                                         e.g., R = [1, 2] states that feature 1 is scalar and feature 2 is categorical
#                                         If R is not provided, by default all features are assumed to be scalar (1)
# sml_type        Integer        1        Supervised machine learning type: 1 = Regression (default), 2 = Classification
# num_trees       Integer        7        Number of trees to be created in the xgboost model
# learning_rate   Double         0.3      Alias: eta. Shrinks the contribution of each new tree's predictions after every boosting step
# max_depth       Integer        6        Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit
# lambda          Double         0.0      L2 regularization term on weights. Increasing this value makes the model more conservative and reduces the number of leaves of a tree
| # --------------------------------------------------------------------------------------------- |
| |
| # --------------------------------------------------------------------------------------------- |
| # OUTPUT: |
# Matrix M where each column corresponds to a node in the learned trees (the first column holds the init prediction)
# and each row contains the following information:
# M[1,j]: id of node j (in a complete binary tree)
# M[2,j]: id of the tree to which node j belongs
# M[3,j]: Offset (no. of columns) to the left child of j if j is an internal node, otherwise 0
# M[4,j]: Index of the feature that node j splits on if j is an internal node, otherwise 0
# M[5,j]: Type of the feature that node j splits on if j is an internal node: 0 = leaf, 1 = scalar, 2 = categorical
# M[6,j]: If j is an internal node with a scalar split feature: the threshold the example's feature value is compared to;
#         0 if the split feature is categorical (the left child covers category 1, the right child category 0);
#         the output value of the leaf if j is a leaf node
| # ------------------------------------------------------------------------------------------- |
| |
| m_xgboost = function(Matrix[Double] X, Matrix[Double] y, |
    Matrix[Double] R = matrix(1,rows=1,cols=ncol(X)), Integer sml_type = 1, Integer num_trees = 7,
| Double learning_rate = 0.3, Integer max_depth = 6, Double lambda = 0.0) |
| return (Matrix[Double] M) |
| { |
  # validate the inputs
| assert(nrow(X) == nrow(y)) |
| assert(ncol(y) == 1) |
| assert(nrow(R) == 1) |
| |
| M = matrix(0,rows=6,cols=0) |
| # set the init prediction at first col in M |
| init_prediction_matrix = matrix(0,rows=nrow(M),cols=1) |
| init_prediction_matrix[6,1] = median(y) |
| M = cbind(M, init_prediction_matrix) |
| |
| current_prediction = matrix(median(y), rows=nrow(y), cols=1) |
| |
| tree_id = 1 |
| while(tree_id <= num_trees) { |
| if(sml_type == 1) # Regression |
| { |
| curr_M = buildOneTreeRegression(X, y, R, max_depth, current_prediction, tree_id, lambda) |
| } |
| else # classification |
| { |
| assert(sml_type == 2) |
| curr_M = buildOneTreeClassification(X, y, R, max_depth, current_prediction, tree_id, lambda) |
| } |
| |
    # current_prediction already accounts for all previous trees, so only the current tree's contribution is added
    current_prediction = calculateNewPredictions(X, sml_type, current_prediction, learning_rate, curr_M)
| |
| tree_id = tree_id + 1 |
| M = cbind(M, curr_M) # concat the new tree to the existing one (forest-ing) |
| } |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: nxn matrix, original input matrix |
| # INPUT: current_prediction: nx1 vector of the current prediction for my target features y (1st run is init prediction) |
| # INPUT: learning_rate: set by user |
| # INPUT: curr_M: The current M matrix with the current tree |
| # OUTPUT: new_prediction: x1 vector of new new_prediction for my target features y |
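#
# The update implemented below (a sketch of the math in the code's own terms):
#   regression:     new_prediction = current_prediction + learning_rate * output_value
#   classification: p_new = sigmoid(log(p / (1 - p)) + learning_rate * output_value),
#                   with sigmoid(x) = exp(x) / (1 + exp(x))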
| calculateNewPredictions = function(Matrix[Double] X, Integer sml_type, Matrix[Double] current_prediction, |
| Double learning_rate, Matrix[Double] curr_M) |
| return (Matrix[Double] new_prediction) |
| { |
| new_prediction = matrix(0, rows=nrow(current_prediction), cols=1) |
| start_node_current_tree = curr_M[,1] |
| |
  if(sml_type == 1) # Regression
  {
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
    {
      output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
      new_prediction[entry,] = current_prediction[entry,] + learning_rate * output_value
    }
  }
  else # Classification
  {
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
    {
      output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
      odds = as.scalar(current_prediction[entry,])
      if((1 - odds) == 0) # guard against division by zero in the odds ratio
        log_odds = 0
      else
        log_odds = log(odds / (1 - odds))
      x = log_odds + learning_rate * output_value
      new_prediction[entry,] = exp(x) / (1 + exp(x)) # sigmoid of the updated log-odds
    }
  }
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: row_vector: nx1 vector, one sample with n features |
| # INPUT: curr_M: The current M matrix with the current tree |
| # INPUT: current_node: the start node of the current tree |
| # OUTPUT: output_value: the prediction for the current sample at the current tree |
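#
# Navigation relies on the left-child offsets stored in the tree: from column j of curr_M,
# the left child sits at column j + M[3,j] and the right child at column j + M[3,j] + 1.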
| getOutputValueForEntry = function(Matrix[Double] row_vector, |
| Matrix[Double] curr_M, Matrix[Double] current_node) |
| return (Double output_value) |
| { |
  cur_node_index = 1 # track the column index; the node id (M[1,j]) differs from the column index in M
| |
| # iterate over one tree and find output value |
| while(as.scalar(current_node[5,]) != 0.0) # until leaf |
| { |
| used_feature = as.scalar(current_node[4,]) |
| feature_is_scalar = as.scalar(current_node[5,]) == 1 |
| if(feature_is_scalar) #SCALAR values |
| { |
| if(as.scalar(row_vector[,used_feature]) < as.scalar(current_node[6,])) # go left |
| { |
| cur_node_index = (cur_node_index + as.scalar(current_node[3,])) |
| current_node = curr_M[,cur_node_index] |
| } |
| else # go right |
| { |
| cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1 |
| current_node = curr_M[,cur_node_index] |
| } |
| } |
| else # CATEGORICAL values |
| { |
| assert(as.scalar(current_node[5,]) == 2) |
| if(as.scalar(row_vector[,used_feature]) == 1) # go left |
| { |
| cur_node_index = (cur_node_index + as.scalar(current_node[3,])) |
| current_node = curr_M[,cur_node_index] |
| } |
| else # go right |
| { |
| assert(as.scalar(row_vector[,used_feature]) == 0) |
| cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1 |
| current_node = curr_M[,cur_node_index] |
| } |
| } |
| } |
| output_value = as.scalar(current_node[6,]) |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: nxn matrix, original input matrix |
| # INPUT: y: nx1 vector of my target values |
| # INPUT: R The matrix R which for each feature in X contains the following information |
| # - R[,2]: 1 (scalar feature) |
| # - R[,1]: 2 (categorical feature) |
| # INPUT: max_depth: the max depth of a tree |
| # INPUT: prediction: nx1 vector, my current predictions for my target value y |
| # INPUT: tree_id: The current tree id, starting at 1 |
| # INPUT: lambda: the regularization parameter lambda |
| # OUTPUT: M: the current M matrix of this tree |
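#
# Note: the tree is built breadth-first. node_queue holds the ids of the nodes still to be
# expanded; curr_rows_queue holds, for each queued node, an nx1 0/1 indicator vector of the
# samples of X that reach that node.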
| buildOneTreeRegression = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R, Integer max_depth, |
| Matrix[Double] prediction, Double tree_id, Double lambda) |
| return (Matrix[Double] M) |
| { |
| sml_type = 1 # regression |
| |
| M = matrix(0,rows=6,cols=0) |
| node_queue = matrix(1, rows=1, cols=1) # Add first Node |
| curr_rows_queue = matrix(1, rows=nrow(X), cols=1) |
| node_queue_len = 1 |
  level = 0

  # breadth-first construction: pop a node, decide whether to split it, and push its children
| while (node_queue_len > 0) { |
| has_child = FALSE |
| [node_queue, node] = dataQueuePop(node_queue) |
| [curr_rows_queue, curr_rows_vector] = dataQueuePop(curr_rows_queue) |
| |
    level = log(as.scalar(node), 2) + 1 # level of the node in the complete binary tree
    available_rows = calcAvailableRows(curr_rows_vector) # no. of samples reaching this node (0 => leaf)
| |
| if(available_rows == 0) { |
| residual_matrix = matrix(0, rows=0, cols=0) |
| } |
| else { |
| [curr_X, curr_y, curr_X_full, curr_prediction] = updateMatrices(X, y, prediction, curr_rows_vector) |
      residual_matrix = calculateResiduals(curr_y, curr_prediction) # residuals are also needed for leaves to compute the output value
| } |
| |
    best_feature_index = 0.0

    if(available_rows > 1 & max_depth > level) # split only if enough samples and max depth not reached
| { |
| best_feature_index = findBestFeature(X=curr_X, y=curr_y, sml_type=sml_type) |
| type = getTypeOfFeature(R, best_feature_index) |
| |
| if(type == 1.0) # SCALAR |
| { |
| similarity_score = calculateSimilarityScore(residual_matrix, lambda) |
| [best_split_threshold, best_gain] = findBestSplit(sml_type, curr_X[,best_feature_index], similarity_score, curr_prediction, lambda) |
        has_child = best_gain > 0 # split only if the gain is positive, i.e., it improves on the current node
| } |
| else # CATEGORICAL |
| { |
| has_child = TRUE |
| } |
| } |
| |
| if (has_child) |
| { |
| [left, right] = calculateChildNodeIDs(node) |
| node_queue = dataQueuePush(left, right, node_queue) |
| |
| if (type == 1.0) # SCALAR |
| { |
| [left_row_vec, right_row_vec] = splitMatrixByValueLoop(curr_X_full[,best_feature_index], X[,best_feature_index], best_split_threshold) |
| curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue) |
| offset = ncol(node_queue) - 1 |
| M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, best_split_threshold, 0.0) |
| } |
| else # CATEGORICAL |
| { |
| [left_row_vec, right_row_vec] = splitMatrixByCategory(curr_X_full[,best_feature_index], X[,best_feature_index]) |
| curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue) |
| offset = ncol(node_queue) - 1 |
| M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, 0.0, 0.0) |
| } |
| } |
| else # has no child => must be leaf |
| { |
| output_value = calculateOutputValue(residual_matrix, lambda) |
      # leaf node: offset, feature index, and threshold are all 0; M[6,j] stores the output value
| M = addOutputRow(M, node, tree_id, R, 0.0, 0.0, 0.0, output_value) |
| } |
| node_queue_len = ncol(node_queue) |
| } |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: nxn matrix, original input matrix |
| # INPUT: y: nx1 vector of my target values |
| # INPUT: R The matrix R which for each feature in X contains the following information |
| # - R[,2]: 1 (scalar feature) |
| # - R[,1]: 2 (categorical feature) |
| # INPUT: max_depth: the max depth of a tree |
| # INPUT: prediction: nx1 vector, my current predictions for my target value y |
| # INPUT: tree_id: The current tree id, starting at 1 |
| # INPUT: lambda: the regularization parameter lambda |
| # OUTPUT: M: the current M matrix of this tree |
| buildOneTreeClassification = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R, Integer max_depth, |
| Matrix[Double] prediction, Double tree_id, Double lambda) |
| return (Matrix[Double] M) |
| { |
| sml_type = 2 # classification |
| |
| M = matrix(0,rows=6,cols=0) |
| node_queue = matrix(1, rows=1, cols=1) # Add first Node |
| curr_rows_queue = matrix(1, rows=nrow(X), cols=1) |
| node_queue_len = 1 |
  level = 0

  # breadth-first construction: pop a node, decide whether to split it, and push its children
| while (node_queue_len > 0) { |
| has_child = FALSE |
| [node_queue, node] = dataQueuePop(node_queue) |
| [curr_rows_queue, curr_rows_vector] = dataQueuePop(curr_rows_queue) |
| |
    level = log(as.scalar(node), 2) + 1 # level of the node in the complete binary tree
    available_rows = calcAvailableRows(curr_rows_vector) # no. of samples reaching this node (0 => leaf)
| |
| if(available_rows == 0) { |
| residual_matrix = matrix(0, rows=0, cols=0) |
| } |
| else { |
| [curr_X, curr_y, curr_X_full, curr_prediction] = updateMatrices(X, y, prediction, curr_rows_vector) |
      residual_matrix = calculateResiduals(curr_y, curr_prediction) # residuals are also needed for leaves to compute the output value
| } |
| |
    best_feature_index = 0.0
    done = FALSE
    if(available_rows > 0) { # curr_y is only defined if samples reach this node
      count = sum(curr_y)
      if(count == 0 | count == nrow(curr_y)) # node is pure (all labels identical)
        done = TRUE
    }
| |
    if(available_rows > 1 & max_depth > level & done == FALSE) # enough samples, max depth not reached, node not pure
| { |
| best_feature_index = findBestFeature(X=curr_X, y=curr_y, sml_type=sml_type) |
| type = getTypeOfFeature(R, best_feature_index) |
| |
| if(type == 1.0) # SCALAR |
| { |
| similarity_score = calculateSimilarityScoreClassification(residual_matrix, curr_prediction, lambda) |
| [best_split_threshold, best_gain] = findBestSplit(sml_type, curr_X[,best_feature_index], similarity_score, curr_prediction, lambda) |
        has_child = best_gain > 0 # split only if the gain is positive, i.e., it improves on the current node
| } |
| else # CATEGORICAL |
| { |
| has_child = TRUE |
| } |
| } |
| |
| if (has_child) |
| { |
| [left, right] = calculateChildNodeIDs(node) |
| node_queue = dataQueuePush(left, right, node_queue) |
| |
| if (type == 1.0) # SCALAR |
| { |
| [left_row_vec, right_row_vec] = splitMatrixByValueLoop(curr_X_full[,best_feature_index], X[,best_feature_index], best_split_threshold) |
| curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue) |
| offset = ncol(node_queue) - 1 |
| M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, best_split_threshold, 0.0) |
| } |
| else # CATEGORICAL |
| { |
| assert(type == 2.0) |
| [left_row_vec, right_row_vec] = splitMatrixByCategory(curr_X_full[,best_feature_index], X[,best_feature_index]) |
| curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue) |
| offset = ncol(node_queue) - 1 |
| M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, 0.0, 0.0) |
| } |
| } |
| else # has no child => must be leaf |
| { |
| output_value = calculateOutputValueClassification(residual_matrix, curr_prediction, lambda) |
      # leaf node: offset, feature index, and threshold are all 0; M[6,j] stores the output value
| M = addOutputRow(M, node, tree_id, R, 0.0, 0.0, 0.0, output_value) |
| } |
| node_queue_len = ncol(node_queue) |
| } |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: node: a 1xn matrix (node containing values) |
| # OUTPUT: left: the new left node id |
| # OUTPUT: right: the new right node id |
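#
# e.g., node id 3 gets the children 6 (left) and 7 (right), following the level-order
# numbering of a complete binary tree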
| calculateChildNodeIDs = function(Matrix[Double] node) |
| return(Matrix[Double] left, Matrix[Double] right) |
| { |
| left = node * 2.0 |
| right = node * 2.0 + 1.0 |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: M: the nxn output matrix |
| # INPUT: node: the current node |
| # INPUT: R The matrix R which for each feature in X contains the following information |
| # - R[,2]: 1 (scalar feature) |
| # - R[,1]: 2 (categorical feature) |
| # INPUT: offset: offset to the left child |
| # INPUT: used_col: which feature got used |
| # INPUT: |
| # INPUT: threshold: the value at the node we separated it, if leaf => the output value |
| # OUTPUT: new_M: nxn output Matrix |
| addOutputRow = function(Matrix[Double] M, Matrix[Double] node, Double tree_id, Matrix[Double] R, |
| Double offset, Double used_col, Double threshold, Double output_value) |
| return (Matrix[Double] new_M) |
| { |
| current_node = matrix(0, rows=6, cols=1) |
| current_node[1,1] = node[1,1] # node id |
| current_node[2,1] = tree_id |
| current_node[3,1] = offset # offset to left child |
| current_node[4,1] = used_col # features used |
| |
| if(used_col == 0.0) { # is a single value or max depth reached => leaf |
| current_node[5,1] = 0 # 0 if leaf node |
| assert(threshold == 0) |
| current_node[6,1] = output_value |
| } |
| else if (as.scalar(R[1, used_col]) == 2.0) { |
| current_node[5, 1] = 2.0 # categorical |
| assert(threshold == 0.0) # should be 0 at categorical |
| current_node[6,1] = threshold |
| } |
| else { |
| current_node[5,1] = 1.0 # 1 if it is scalar |
| current_node[6,1] = threshold # split value ( 0 if leaf or categorical) |
| } |
| |
| new_M = cbind(M, current_node) |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: R: Location to read the matrix R(nx1) which for each feature in X contains the following information |
| # - R[,2]: 1 (scalar feature) |
| # - R[,1]: 2 (categorical feature) |
| # INPUT: col: the col that is desired to know the type |
| # OUTPUT: [1.0,scalar] or [2.0,categorical] |
| #----------------------------------------------------------------------------------------------------------------------- |
| getTypeOfFeature = function(Matrix[Double] R, Double col) return(Double type) { |
| type = as.scalar(R[1, col]) |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: a nxn matrix (X Matrix from input) |
| # INPUT: y: a 1xn matrix (y Matrix from input) |
| # INPUT: prediction: a 1xn matrix with the current predictions for y (median at first tree) |
| # INPUT: vector: a 1xn matrix (contains 1 for relevant and 0 for non-relevant value) |
| # OUTPUT: new_X: the new X containing only the relevant rows |
| # OUTPUT: new_y: the new y containing only the relevant rows |
| # OUTPUT: new_X_full: the new X containing the relevant rows + 0 for non_relevant |
| # OUTPUT: new_prediction: the current prediction containing only the relevant rows |
| updateMatrices = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] prediction, Matrix[Double] vector) |
| return(Matrix[Double] new_X, Matrix[Double] new_y, Matrix[Double] new_X_full, Matrix[Double] new_prediction) { |
| |
| assert(ncol(vector) == 1) |
| assert(nrow(vector) == nrow(X)) |
| |
| new_X = matrix(0, rows = 0, cols = 0) |
| new_y = matrix(0, rows = 0, cols = 0) |
| nan_vec = matrix(NaN, rows=nrow(vector), cols=ncol(X)) |
| new_X_full = matrix(0, rows = 0, cols=0) |
| new_prediction = matrix(0, rows=0, cols=0) |
| for(i in 1:nrow(vector)) { |
| if(as.scalar(vector[i,]) == 1.0) # this row is relevant for this node |
| { |
| if (ncol(new_X) == 0 & ncol(new_X_full) == 0) { # first iteration |
| new_X = X[i,] |
| new_y = y[i,] |
| new_X_full = X[i,] |
| new_prediction = prediction[i,] |
      } else if (ncol(new_X) == 0 & ncol(new_X_full) != 0) { # first relevant row, after non-relevant rows were already seen
| new_X = X[i,] |
| new_y = y[i,] |
| new_X_full = rbind(new_X_full, X[i,]) |
| new_prediction = prediction[i,] |
| } else { # adding all others |
| new_X = rbind(new_X, X[i,]) |
| new_y = rbind(new_y, y[i,]) |
| new_X_full = rbind(new_X_full, X[i,]) |
| new_prediction = rbind(new_prediction, prediction[i,]) |
| } |
| } |
| else { # this row is NOT relevant for this node |
      assert(as.scalar(vector[i,]) == 0.0 | isNaN(as.scalar(vector[i,]))) # non-relevant rows are marked 0 or NaN
| if (ncol(new_X_full) == 0) # first row |
| new_X_full = nan_vec[i,] |
| else |
| new_X_full = rbind(new_X_full, nan_vec[i,]) # add a new row filled with NaNs |
| } |
| } |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: vector: a 1xn matrix (current datapoints which are interesting) |
| # OUTPUT: available_elements: nr. of available datapoints |
| calcAvailableRows = function(Matrix[Double] vector) |
| return(Double available_elements) |
| { |
| assert(ncol(vector) == 1) |
| len = nrow(vector) |
| available_elements = 0.0 |
| # TODO perf vectorize |
| for (index in 1:len) { |
| element = as.scalar(vector[index,]) |
| if(element > 0.0) |
| available_elements = available_elements + 1.0 |
| } |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: queue: a nxn matrix (queue containing values) |
| # OUTPUT: new_queue: the new queue after the first vector gets popped |
| # OUTPUT: node_vec: the popped vector (1xn) from the queue |
| dataQueuePop = function(Matrix[Double] queue) |
| return (Matrix[Double] new_queue, Matrix[Double] node_vec) |
| { |
| node_vec = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force the creation of a new object |
| node_vec = matrix(node_vec, rows=nrow(queue), cols=1) # reshape to force the creation of a new object |
| len = ncol(queue) |
| if (len < 2) |
| new_queue = matrix(0,0,0) |
| else |
| new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1) |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: left: a 1xn matrix |
| # INPUT: right: a 1xn matrix |
| # INPUT: queue: a nxn matrix (queue containing values) |
| # OUTPUT: new_queue: the new queue nxn matrix |
| dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue) |
| return (Matrix[Double] new_queue) |
| { |
| len = ncol(queue) |
| if(len <= 0) |
| new_queue = cbind(left, right) |
| else |
| new_queue = cbind(queue, left, right) |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: a nxn matrix (the current samples with all features we observe) |
| # INPUT: y: a 1xn matrix (the current y to all observed samples) |
| # OUTPUT: lowest_residuals_index: the feature index with the lowest residuals |
| findBestFeature = function(Matrix[Double] X, Matrix[Double] y, Integer sml_type) |
| return (Integer lowest_residuals_index) |
| { |
| lowest_residuals = 0 |
| lowest_residuals_index = 1 |
| |
| for(i in 1: ncol(X)) { |
| current_feature = X[,i] |
| |
| # TODO investigate if glm is necessary here |
| if(sml_type == 1) # Regression |
| weights = glm(X=current_feature, Y=y, dfam=1, verbose=FALSE) |
| else # Classification |
| weights = glm(X=current_feature, Y=y, dfam=2, verbose=FALSE) |
| |
| y_residual = y - current_feature %*% weights |
    res = sum(y_residual ^ 2) # sum of squared residuals (least squares)
| |
| if(i == 1 | res < lowest_residuals) { |
| lowest_residuals = res |
| lowest_residuals_index = i |
| } |
| } |
| } |
| |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: one_featureX: a 1xn matrix (one feature with all values) |
| # OUTPUT: best_split: the best split (highest gain indicates best splitting of datasets) |
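#
# Gain of a candidate split, as computed in the loop below:
#   gain = similarity(left) + similarity(right) - similarity(parent)
# Candidate thresholds are the midpoints between consecutive values of the sorted feature.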
| findBestSplit = function(Integer sml_type, Matrix[Double] one_featureX, Double sim_score_parent, Matrix[Double] predictions, Double lambda) |
| return (Double best_split, Double best_gain) |
| { |
| assert(ncol(one_featureX) == 1) |
| best_split = 0 |
| best_gain = 0 |
| best_sim_score_left = 0 |
| best_sim_score_right = 0 |
| |
| ordered_X = orderFeature(one_featureX) # order entries by feature value |
| |
| # iterate over all averages between 2 points |
| for(i in 1: (nrow(ordered_X)-1)) { |
| current_split = average(ordered_X[i,], ordered_X[i+1,]) |
| [left, right] = splitMatrixByValue(one_featureX, current_split) |
| |
| if(sml_type == 1) { # Regression |
| sim_score_left = calculateSimilarityScore(left, lambda) |
| sim_score_right = calculateSimilarityScore(right, lambda) |
| } |
| else { # Classification |
| sim_score_left = calculateSimilarityScoreClassification(left, predictions, lambda) |
| sim_score_right = calculateSimilarityScoreClassification(right, predictions, lambda) |
| } |
| current_gain = sim_score_left + sim_score_right - sim_score_parent |
| |
| if(current_gain > best_gain) { |
| best_gain = current_gain |
| best_split = current_split |
| best_sim_score_left = sim_score_left |
| best_sim_score_right = sim_score_right |
| } |
| } |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: X: a 1xn matrix (one feature with all values) |
| # value: the border to separate the values |
| # loop: boolean to check if used in loop and returns then other values |
| # OUTPUT: left: a 1xn matrix with all values smaller than "value" |
| # right: a 1xn matrix with all values larger than "value" |
| splitMatrixByValue = function(Matrix[Double] X, Double value) return (Matrix[Double] left, Matrix[Double] right) { |
| assert(ncol(X) == 1) |
| left = matrix(0, rows=0, cols=1) |
| right = matrix(0, rows=0, cols=1) |
| |
| # TODO perf vectorize |
| for(i in 1:nrow(X)) { |
| if(as.scalar(X[i,]) < value) |
| left = rbind(left,X[i,]) |
| else |
| right = rbind(right, X[i,]) |
| } |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: curr_X: a 1xn matrix (one feature with current values) |
| # X: a 1xn matrix the feature of the X-Matrix with all values |
| # value: the border to separate the values |
| # OUTPUT: left: a 1xn matrix with 1.0 if the value is smaller or 0.0 if it is bigger |
| # right: a 1xn matrix with 1.0 if the value is bigger or 0.0 if it is smaller |
| splitMatrixByValueLoop = function(Matrix[Double] curr_X, Matrix[Double] X, Double value) |
| return (Matrix[Double] left, Matrix[Double] right) |
| { |
| left = matrix(0, rows=nrow(X), cols=1) |
| right = matrix(0, rows=nrow(X), cols=1) |
| |
| # TODO perf vectorize |
  for(i in 1:nrow(X)) {
    if (as.scalar(X[i,]) == as.scalar(curr_X[i,])) { # row reaches this node (curr_X holds NaN otherwise)
      if (as.scalar(X[i,]) < value)
        left[i,1] = 1
      else
        right[i,1] = 1
    }
    # rows not used in this node keep the initial 0 in both left and right
  }
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: curr_X: a 1xn matrix (one feature with current values) |
| # X: a 1xn matrix the feature of the X-Matrix with all values |
| # OUTPUT: left: a 1xn matrix with 1.0 for if the value is true or 0.0 if it is false |
| # right: a 1xn matrix with 1.0 for if the value is false or 0.0 if it is true |
| splitMatrixByCategory = function(Matrix[Double] curr_X, Matrix[Double] X) |
| return (Matrix[Double] left, Matrix[Double] right) |
| { |
| assert(ncol(curr_X) == 1) |
| assert(ncol(X) == 1) |
| left = matrix(0, rows=nrow(X), cols=1) |
| right = matrix(0, rows=nrow(X), cols=1) |
| |
| for(i in 1:nrow(curr_X)) { |
| if (as.scalar(curr_X[i,]) == 1) # categorical true |
| left[i,1] = 1 |
| else if(as.scalar(curr_X[i,]) == 0) # categorical false |
| right[i,1] = 1 |
    else { # mark with NaN if the row is not used for the current samples
      left[i,1] = NaN
      right[i,1] = NaN
    }
| } |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: row_vector: a 1xn matrix (one feature with all residuals) |
| # OUTPUT: similarity_score: the similarity score of the residuals |
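#
# XGBoost similarity score for regression, as implemented below:
#   similarity = (sum of residuals)^2 / (number of residuals + lambda)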
calculateSimilarityScore = function (Matrix[Double] row_vector, Double lambda)
| return (Double similarity_score) |
| { |
| similarity_score = (sum(row_vector)^2) / (nrow(row_vector) + lambda); |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: row_vector: a 1xn matrix (one feature with all residuals) |
| # OUTPUT: similarity_score: the similarity score of the residuals |
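#
# XGBoost similarity score for classification, as implemented below:
#   similarity = (sum of residuals)^2 / (sum of p * (1 - p) + lambda)
# where p are the current predicted probabilities.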
calculateSimilarityScoreClassification = function (Matrix[Double] row_vector, Matrix[Double] predictions, Double lambda)
  return (Double similarity_score)
{
  numerator = (sum(row_vector)^2)
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  similarity_score = numerator / denominator;
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: residuals_vector: a 1xn matrix (one feature with all residuals) |
| # OUTPUT: similarity_score: the similarity score of the residuals |
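#
# Output value of a regression leaf, as implemented below:
#   output_value = sum of residuals / (number of residuals + lambda)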
calculateOutputValue = function (Matrix[Double] residuals_vector, Double lambda)
| return (Double output_value) |
| { |
| output_value = (sum(residuals_vector)) / (nrow(residuals_vector) + lambda); |
  if(isNaN(output_value)) # just in case we have a node with no sample inside (0/0 above yields NaN)
| output_value = 0.0 |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: residuals_vector: a 1xn matrix (one feature with all residuals) |
| # OUTPUT: similarity_score: the similarity score of the residuals |
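#
# Output value of a classification leaf, as implemented below:
#   output_value = sum of residuals / (sum of p * (1 - p) + lambda)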
calculateOutputValueClassification = function (Matrix[Double] residuals_vector, Matrix[Double] predictions, Double lambda)
  return (Double output_value)
{
  numerator = (sum(residuals_vector))
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  if(denominator == 0)
    output_value = 0
  else
    output_value = numerator / denominator
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: row_vector: a 1xn matrix (one feature with all rows) |
| # prediction: a 1xn matrix, the current prediction value of each row |
| # OUTPUT: the residuals from the "prediction" as 1xn matrix |
| calculateResiduals = function (matrix[Double] row_vector, matrix[Double] prediction) |
| return (matrix[Double] residuals_row) |
| { |
| residuals_row = row_vector - prediction; |
| } |
| |
| #----------------------------------------------------------------------------------------------------------------------- |
| # INPUT: row_vector: a 1xn matrix which has unordered values |
| # OUTPUT: row_vector_ordered: 1xn matrix were the values are in accessing order |
| orderFeature = function (matrix[double] row_vector) |
| return (matrix[double] row_vector_ordered) |
| { |
| row_vector_ordered = order(target=row_vector, by=1, decreasing=FALSE, index.return=FALSE); |
| } |
| |
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: x, y: two 1x1 matrices (neighboring values of the sorted feature)
# OUTPUT: avg: the scalar average of the two values, used as a split candidate
average = function(Matrix[Double] x, Matrix[Double] y) return (Double avg) {
| avg = as.scalar((x + y) / 2) |
| } |