#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME            TYPE              DEFAULT   MEANING
# ---------------------------------------------------------------------------------------------
# X               Matrix[Double]    ---       Feature matrix X; note that X needs to be both recoded and dummy coded
# Y               Matrix[Double]    ---       Label matrix Y; note that Y needs to be both recoded and dummy coded
# R               Matrix[Double]    1, 1xn    Matrix R; 1xn row vector which encodes the type of each feature in X:
#                                             - R[,j] == 1: feature j is a scale feature
#                                             - R[,j] == 2: feature j is a categorical feature
#                                             e.g. R = [1, 2] states that feature 1 is scale and feature 2 is categorical
#                                             If R is not provided, all features are assumed to be scale (1) by default
# sml_type        Integer           1         Supervised machine learning type: 1 = Regression (default), 2 = Classification
# num_trees       Integer           7         Number of trees to be created in the xgboost model
# learning_rate   Double            0.3       Alias: eta. Scales the contribution of the new tree's predictions after each boosting step
# max_depth       Integer           6         Maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit
# lambda          Double            0.0       L2 regularization term on weights. Increasing this value makes the model more conservative and reduces the number of leaves per tree
# ---------------------------------------------------------------------------------------------
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# Matrix M where each column corresponds to a node in the learned forest (the first column holds the initial prediction) and each row contains the following information:
#     M[1,j]: id of node j (in a complete binary tree)
#     M[2,j]: tree id to which node j belongs
#     M[3,j]: Offset (no. of columns) to the left child of j if j is an internal node, otherwise 0
#     M[4,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical)
#             that node j looks at if j is an internal node, otherwise 0
#     M[5,j]: Type of the feature that node j looks at if j is an internal node: 0 if leaf, 1 if scale, 2 if categorical
#     M[6,j]: If j is an internal node on a scale feature: the threshold the example's feature value is compared to
#             If j is an internal node on a categorical feature: 0
#             If j is a leaf node: the output (prediction) value of the leaf
# -------------------------------------------------------------------------------------------
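#
# EXAMPLE USAGE (a minimal, illustrative sketch only; the toy values below are made up and the
# call is shown commented out so that this script keeps containing only function definitions):
#
#   X = matrix("4.5 1.0 3.0 0.0 2.7 1.0 5.1 0.0", rows=4, cols=2)  # one scale, one dummy-coded categorical feature
#   y = matrix("1.0 2.0 1.5 3.0", rows=4, cols=1)
#   R = matrix("1 2", rows=1, cols=2)                              # feature 1: scale, feature 2: categorical
#   M = m_xgboost(X=X, y=y, R=R, sml_type=1, num_trees=3, learning_rate=0.3, max_depth=6, lambda=0.0)
# -------------------------------------------------------------------------------------------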
m_xgboost = function(Matrix[Double] X, Matrix[Double] y,
    Matrix[Double] R = matrix(1,rows=1,cols=ncol(X)), Integer sml_type = 1, Integer num_trees = 7,
Double learning_rate = 0.3, Integer max_depth = 6, Double lambda = 0.0)
return (Matrix[Double] M)
{
# test if input correct
assert(nrow(X) == nrow(y))
assert(ncol(y) == 1)
assert(nrow(R) == 1)
M = matrix(0,rows=6,cols=0)
# set the init prediction at first col in M
init_prediction_matrix = matrix(0,rows=nrow(M),cols=1)
init_prediction_matrix[6,1] = median(y)
M = cbind(M, init_prediction_matrix)
current_prediction = matrix(median(y), rows=nrow(y), cols=1)
tree_id = 1
while(tree_id <= num_trees) {
if(sml_type == 1) # Regression
{
curr_M = buildOneTreeRegression(X, y, R, max_depth, current_prediction, tree_id, lambda)
}
else # classification
{
assert(sml_type == 2)
curr_M = buildOneTreeClassification(X, y, R, max_depth, current_prediction, tree_id, lambda)
}
# in current prediction all previous trees are considered, so we only add the current tree to calculate new predictions
current_prediction = calculateNewPredictions(X, sml_type,current_prediction, learning_rate, curr_M)
tree_id = tree_id + 1
M = cbind(M, curr_M) # concat the new tree to the existing one (forest-ing)
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: nxm matrix, original input matrix
# INPUT: sml_type: supervised machine learning type: 1 = Regression, 2 = Classification
# INPUT: current_prediction: nx1 vector of the current predictions for the target values y (the init prediction on the first run)
# INPUT: learning_rate: set by the user
# INPUT: curr_M: the M matrix of the current tree
# OUTPUT: new_prediction: nx1 vector of new predictions for the target values y
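# Note (a short sketch of the update rule implemented below; leaf_output denotes the output value
# of the leaf the sample falls into):
#   Regression:     new_prediction = current_prediction + learning_rate * leaf_output
#   Classification: current_prediction holds a probability p; it is converted to log-odds,
#                   shifted by the scaled leaf output and mapped back through the sigmoid:
#                   new_prediction = sigmoid(log(p / (1 - p)) + learning_rate * leaf_output)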
calculateNewPredictions = function(Matrix[Double] X, Integer sml_type, Matrix[Double] current_prediction,
Double learning_rate, Matrix[Double] curr_M)
return (Matrix[Double] new_prediction)
{
new_prediction = matrix(0, rows=nrow(current_prediction), cols=1)
start_node_current_tree = curr_M[,1]
if(sml_type == 1) # Regression
{
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
{
output_values = matrix(0, rows=1, cols=0)
output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
output_values = cbind(output_values, as.matrix(output_value))
new_prediction[entry,] = current_prediction[entry,] + learning_rate * sum(output_values)
}
}
else # Classification
{
    for(entry in 1:nrow(X)) # go through each entry in X and calculate the new prediction
{
output_values = matrix(0, rows=1, cols=0)
output_value = getOutputValueForEntry(X[entry,], curr_M, start_node_current_tree)
output_values = cbind(output_values, as.matrix(output_value))
      p = as.scalar(current_prediction[entry,]) # the current prediction is a probability
      if((1 - p) == 0)
        log_odds = 0
      else
        log_odds = log(p / (1 - p))
      x = (log_odds + learning_rate * sum(output_values))
      new_prediction[entry,] = exp(x) / (1 + exp(x)) # sigmoid maps the score back to a probability
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: 1xm row vector, one sample with m features
# INPUT: curr_M: The current M matrix with the current tree
# INPUT: current_node: the start node of the current tree
# OUTPUT: output_value: the prediction for the current sample at the current tree
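# Note: M[3,j] holds the column offset from node j to its left child within the same tree, so an
# offset of k means the left child is located at column j+k and the right child at column j+k+1;
# this is exactly how the traversal below advances through curr_M.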
getOutputValueForEntry = function(Matrix[Double] row_vector,
Matrix[Double] curr_M, Matrix[Double] current_node)
return (Double output_value)
{
  cur_node_index = 1 # cannot use the node id directly because it differs from the column index in curr_M
# iterate over one tree and find output value
while(as.scalar(current_node[5,]) != 0.0) # until leaf
{
used_feature = as.scalar(current_node[4,])
feature_is_scalar = as.scalar(current_node[5,]) == 1
if(feature_is_scalar) #SCALAR values
{
if(as.scalar(row_vector[,used_feature]) < as.scalar(current_node[6,])) # go left
{
cur_node_index = (cur_node_index + as.scalar(current_node[3,]))
current_node = curr_M[,cur_node_index]
}
else # go right
{
cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1
current_node = curr_M[,cur_node_index]
}
}
else # CATEGORICAL values
{
assert(as.scalar(current_node[5,]) == 2)
if(as.scalar(row_vector[,used_feature]) == 1) # go left
{
cur_node_index = (cur_node_index + as.scalar(current_node[3,]))
current_node = curr_M[,cur_node_index]
}
else # go right
{
assert(as.scalar(row_vector[,used_feature]) == 0)
cur_node_index = cur_node_index + as.scalar(current_node[3,]) + 1
current_node = curr_M[,cur_node_index]
}
}
}
output_value = as.scalar(current_node[6,])
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: nxm matrix, original input matrix
# INPUT: y: nx1 vector of the target values
# INPUT: R: matrix which encodes the type of each feature in X:
#           - R[,j] == 1: feature j is a scale feature
#           - R[,j] == 2: feature j is a categorical feature
# INPUT: max_depth: the maximum depth of a tree
# INPUT: prediction: nx1 vector, the current predictions for the target values y
# INPUT: tree_id: the current tree id, starting at 1
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: M: the current M matrix of this tree
buildOneTreeRegression = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R, Integer max_depth,
Matrix[Double] prediction, Double tree_id, Double lambda)
return (Matrix[Double] M)
{
sml_type = 1 # regression
M = matrix(0,rows=6,cols=0)
node_queue = matrix(1, rows=1, cols=1) # Add first Node
curr_rows_queue = matrix(1, rows=nrow(X), cols=1)
node_queue_len = 1
depth = 0.0
level = 0
# adjust this here if we create more trees to create new prediction values
while (node_queue_len > 0) {
has_child = FALSE
[node_queue, node] = dataQueuePop(node_queue)
[curr_rows_queue, curr_rows_vector] = dataQueuePop(curr_rows_queue)
level = log(as.scalar(node), 2) + 1
available_rows = calcAvailableRows(curr_rows_vector) # check if leaf
if(available_rows == 0) {
residual_matrix = matrix(0, rows=0, cols=0)
}
else {
[curr_X, curr_y, curr_X_full, curr_prediction] = updateMatrices(X, y, prediction, curr_rows_vector)
      residual_matrix = calculateResiduals(curr_y, curr_prediction) # residuals are also needed at leaf nodes to compute their output value
}
best_feature_index = 0.00
done = FALSE
if(available_rows > 1 & max_depth > level & done == FALSE) # leaf check or max depth check
{
best_feature_index = findBestFeature(X=curr_X, y=curr_y, sml_type=sml_type)
type = getTypeOfFeature(R, best_feature_index)
if(type == 1.0) # SCALAR
{
similarity_score = calculateSimilarityScore(residual_matrix, lambda)
[best_split_threshold, best_gain] = findBestSplit(sml_type, curr_X[,best_feature_index], similarity_score, curr_prediction, lambda)
        has_child = best_gain > 0 # if the gain is not positive, the split is no better than the current node
}
else # CATEGORICAL
{
has_child = TRUE
}
}
if (has_child)
{
[left, right] = calculateChildNodeIDs(node)
node_queue = dataQueuePush(left, right, node_queue)
if (type == 1.0) # SCALAR
{
[left_row_vec, right_row_vec] = splitMatrixByValueLoop(curr_X_full[,best_feature_index], X[,best_feature_index], best_split_threshold)
curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
offset = ncol(node_queue) - 1
M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, best_split_threshold, 0.0)
}
else # CATEGORICAL
{
[left_row_vec, right_row_vec] = splitMatrixByCategory(curr_X_full[,best_feature_index], X[,best_feature_index])
curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
offset = ncol(node_queue) - 1
M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, 0.0, 0.0)
}
}
else # has no child => must be leaf
{
output_value = calculateOutputValue(residual_matrix, lambda)
# offset, best_feature_idx, threshold
M = addOutputRow(M, node, tree_id, R, 0.0, 0.0, 0.0, output_value)
}
node_queue_len = ncol(node_queue)
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: nxm matrix, original input matrix
# INPUT: y: nx1 vector of the target values
# INPUT: R: matrix which encodes the type of each feature in X:
#           - R[,j] == 1: feature j is a scale feature
#           - R[,j] == 2: feature j is a categorical feature
# INPUT: max_depth: the maximum depth of a tree
# INPUT: prediction: nx1 vector, the current predictions for the target values y
# INPUT: tree_id: the current tree id, starting at 1
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: M: the current M matrix of this tree
buildOneTreeClassification = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] R, Integer max_depth,
Matrix[Double] prediction, Double tree_id, Double lambda)
return (Matrix[Double] M)
{
sml_type = 2 # classification
M = matrix(0,rows=6,cols=0)
node_queue = matrix(1, rows=1, cols=1) # Add first Node
curr_rows_queue = matrix(1, rows=nrow(X), cols=1)
node_queue_len = 1
depth = 0.0
level = 0
# adjust this here if we create more trees to create new prediction values
while (node_queue_len > 0) {
has_child = FALSE
[node_queue, node] = dataQueuePop(node_queue)
[curr_rows_queue, curr_rows_vector] = dataQueuePop(curr_rows_queue)
level = log(as.scalar(node), 2) + 1
available_rows = calcAvailableRows(curr_rows_vector) # check if leaf
if(available_rows == 0) {
residual_matrix = matrix(0, rows=0, cols=0)
}
else {
[curr_X, curr_y, curr_X_full, curr_prediction] = updateMatrices(X, y, prediction, curr_rows_vector)
      residual_matrix = calculateResiduals(curr_y, curr_prediction) # residuals are also needed at leaf nodes to compute their output value
}
best_feature_index = 0.00
done = FALSE
count = sum(curr_y)
if(count == 0)
done = TRUE
if(count == nrow(curr_y))
done = TRUE
if(available_rows > 1 & max_depth > level & done == FALSE) # leaf check or max depth check
{
best_feature_index = findBestFeature(X=curr_X, y=curr_y, sml_type=sml_type)
type = getTypeOfFeature(R, best_feature_index)
if(type == 1.0) # SCALAR
{
similarity_score = calculateSimilarityScoreClassification(residual_matrix, curr_prediction, lambda)
[best_split_threshold, best_gain] = findBestSplit(sml_type, curr_X[,best_feature_index], similarity_score, curr_prediction, lambda)
        has_child = best_gain > 0 # if the gain is not positive, the split is no better than the current node
}
else # CATEGORICAL
{
has_child = TRUE
}
}
if (has_child)
{
[left, right] = calculateChildNodeIDs(node)
node_queue = dataQueuePush(left, right, node_queue)
if (type == 1.0) # SCALAR
{
[left_row_vec, right_row_vec] = splitMatrixByValueLoop(curr_X_full[,best_feature_index], X[,best_feature_index], best_split_threshold)
curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
offset = ncol(node_queue) - 1
M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, best_split_threshold, 0.0)
}
else # CATEGORICAL
{
assert(type == 2.0)
[left_row_vec, right_row_vec] = splitMatrixByCategory(curr_X_full[,best_feature_index], X[,best_feature_index])
curr_rows_queue = dataQueuePush(left_row_vec, right_row_vec, curr_rows_queue)
offset = ncol(node_queue) - 1
M = addOutputRow(M, node, tree_id, R, offset, best_feature_index, 0.0, 0.0)
}
}
else # has no child => must be leaf
{
output_value = calculateOutputValueClassification(residual_matrix, curr_prediction, lambda)
# offset, best_feature_idx, threshold
M = addOutputRow(M, node, tree_id, R, 0.0, 0.0, 0.0, output_value)
}
node_queue_len = ncol(node_queue)
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: node: a 1x1 matrix containing the current node id
# OUTPUT: left: a 1x1 matrix with the new left child node id
# OUTPUT: right: a 1x1 matrix with the new right child node id
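# Example: in a complete binary tree node id n has the children 2n (left) and 2n+1 (right),
# e.g. node 3 gets the children 6 and 7.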
calculateChildNodeIDs = function(Matrix[Double] node)
return(Matrix[Double] left, Matrix[Double] right)
{
left = node * 2.0
right = node * 2.0 + 1.0
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: M: the 6xn output matrix built so far
# INPUT: node: the current node (1x1 matrix containing the node id)
# INPUT: tree_id: the id of the tree the node belongs to
# INPUT: R: matrix which encodes the type of each feature in X:
#           - R[,j] == 1: feature j is a scale feature
#           - R[,j] == 2: feature j is a categorical feature
# INPUT: offset: offset (no. of columns) to the left child
# INPUT: used_col: the index of the feature used at this node (0 for leaves)
# INPUT: threshold: the split value for scale features (0 for leaves and categorical splits)
# INPUT: output_value: the output (prediction) value if the node is a leaf, otherwise 0
# OUTPUT: new_M: the 6x(n+1) output matrix with the new node appended as last column
addOutputRow = function(Matrix[Double] M, Matrix[Double] node, Double tree_id, Matrix[Double] R,
Double offset, Double used_col, Double threshold, Double output_value)
return (Matrix[Double] new_M)
{
current_node = matrix(0, rows=6, cols=1)
current_node[1,1] = node[1,1] # node id
current_node[2,1] = tree_id
current_node[3,1] = offset # offset to left child
current_node[4,1] = used_col # features used
if(used_col == 0.0) { # is a single value or max depth reached => leaf
current_node[5,1] = 0 # 0 if leaf node
assert(threshold == 0)
current_node[6,1] = output_value
}
else if (as.scalar(R[1, used_col]) == 2.0) {
current_node[5, 1] = 2.0 # categorical
assert(threshold == 0.0) # should be 0 at categorical
current_node[6,1] = threshold
}
else {
current_node[5,1] = 1.0 # 1 if it is scalar
current_node[6,1] = threshold # split value ( 0 if leaf or categorical)
}
new_M = cbind(M, current_node)
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: R: matrix which encodes the type of each feature in X:
#           - R[,j] == 1: feature j is a scale feature
#           - R[,j] == 2: feature j is a categorical feature
# INPUT: col: the feature column whose type is requested
# OUTPUT: type: 1.0 (scale feature) or 2.0 (categorical feature)
#-----------------------------------------------------------------------------------------------------------------------
getTypeOfFeature = function(Matrix[Double] R, Double col) return(Double type) {
type = as.scalar(R[1, col])
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nxm matrix (X matrix from input)
# INPUT: y: a nx1 matrix (y matrix from input)
# INPUT: prediction: a nx1 matrix with the current predictions for y (the median at the first tree)
# INPUT: vector: a nx1 indicator matrix (contains 1 for relevant and 0 for non-relevant rows)
# OUTPUT: new_X: the new X containing only the relevant rows
# OUTPUT: new_y: the new y containing only the relevant rows
# OUTPUT: new_X_full: the new X containing the relevant rows and NaN rows for non-relevant rows
# OUTPUT: new_prediction: the current prediction containing only the relevant rows
updateMatrices = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] prediction, Matrix[Double] vector)
return(Matrix[Double] new_X, Matrix[Double] new_y, Matrix[Double] new_X_full, Matrix[Double] new_prediction) {
assert(ncol(vector) == 1)
assert(nrow(vector) == nrow(X))
new_X = matrix(0, rows = 0, cols = 0)
new_y = matrix(0, rows = 0, cols = 0)
nan_vec = matrix(NaN, rows=nrow(vector), cols=ncol(X))
new_X_full = matrix(0, rows = 0, cols=0)
new_prediction = matrix(0, rows=0, cols=0)
for(i in 1:nrow(vector)) {
if(as.scalar(vector[i,]) == 1.0) # this row is relevant for this node
{
if (ncol(new_X) == 0 & ncol(new_X_full) == 0) { # first iteration
new_X = X[i,]
new_y = y[i,]
new_X_full = X[i,]
new_prediction = prediction[i,]
      } else if (ncol(new_X) == 0 & ncol(new_X_full) != 0) { # only non-relevant rows were encountered so far
new_X = X[i,]
new_y = y[i,]
new_X_full = rbind(new_X_full, X[i,])
new_prediction = prediction[i,]
} else { # adding all others
new_X = rbind(new_X, X[i,])
new_y = rbind(new_y, y[i,])
new_X_full = rbind(new_X_full, X[i,])
new_prediction = rbind(new_prediction, prediction[i,])
}
}
else { # this row is NOT relevant for this node
assert(as.scalar(vector[i,]) == 0.0 | as.scalar(vector[i,]) == 'NaN')
if (ncol(new_X_full) == 0) # first row
new_X_full = nan_vec[i,]
else
new_X_full = rbind(new_X_full, nan_vec[i,]) # add a new row filled with NaNs
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: vector: a nx1 indicator matrix (1 for datapoints belonging to the current node, 0 otherwise)
# OUTPUT: available_elements: the number of available datapoints
calcAvailableRows = function(Matrix[Double] vector)
return(Double available_elements)
{
assert(ncol(vector) == 1)
len = nrow(vector)
available_elements = 0.0
# TODO perf vectorize
for (index in 1:len) {
element = as.scalar(vector[index,])
if(element > 0.0)
available_elements = available_elements + 1.0
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: queue: a nxk matrix whose columns are the queued entries
# OUTPUT: new_queue: the new queue after the first column has been popped
# OUTPUT: node_vec: the popped column (nx1) from the queue
dataQueuePop = function(Matrix[Double] queue)
return (Matrix[Double] new_queue, Matrix[Double] node_vec)
{
node_vec = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force the creation of a new object
node_vec = matrix(node_vec, rows=nrow(queue), cols=1) # reshape to force the creation of a new object
len = ncol(queue)
if (len < 2)
new_queue = matrix(0,0,0)
else
new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1)
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: left: a nx1 matrix (entry for the left child)
# INPUT: right: a nx1 matrix (entry for the right child)
# INPUT: queue: a nxk matrix whose columns are the queued entries
# OUTPUT: new_queue: the new queue with left and right appended as last columns
dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue)
return (Matrix[Double] new_queue)
{
len = ncol(queue)
if(len <= 0)
new_queue = cbind(left, right)
else
new_queue = cbind(queue, left, right)
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nxm matrix (the current samples with all features we observe)
# INPUT: y: a nx1 matrix (the current target values of all observed samples)
# OUTPUT: lowest_residuals_index: the index of the feature with the lowest sum of squared residuals
findBestFeature = function(Matrix[Double] X, Matrix[Double] y, Integer sml_type)
return (Integer lowest_residuals_index)
{
lowest_residuals = 0
lowest_residuals_index = 1
for(i in 1: ncol(X)) {
current_feature = X[,i]
# TODO investigate if glm is necessary here
if(sml_type == 1) # Regression
weights = glm(X=current_feature, Y=y, dfam=1, verbose=FALSE)
else # Classification
weights = glm(X=current_feature, Y=y, dfam=2, verbose=FALSE)
y_residual = y - current_feature %*% weights
res = sum(y_residual ^ 2) # sum of least square calculation
if(i == 1 | res < lowest_residuals) {
lowest_residuals = res
lowest_residuals_index = i
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: sml_type: supervised machine learning type: 1 = Regression, 2 = Classification
# INPUT: one_featureX: a nx1 matrix (one feature with all current values)
# INPUT: sim_score_parent: the similarity score of the parent node's residuals
# INPUT: predictions: a nx1 matrix of the current predictions (used for the classification similarity score)
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: best_split: the best split threshold (the highest gain indicates the best split of the dataset)
# OUTPUT: best_gain: the gain achieved by the best split (0 if no split improves on the parent)
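# Note (a sketch of the split search below): every candidate threshold is the average of two
# adjacent sorted feature values; for each candidate the gain is computed as
#   gain = similarity(left partition) + similarity(right partition) - sim_score_parent
# and the candidate with the largest positive gain is returned (best_gain stays 0 if no split helps).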
findBestSplit = function(Integer sml_type, Matrix[Double] one_featureX, Double sim_score_parent, Matrix[Double] predictions, Double lambda)
return (Double best_split, Double best_gain)
{
assert(ncol(one_featureX) == 1)
best_split = 0
best_gain = 0
best_sim_score_left = 0
best_sim_score_right = 0
ordered_X = orderFeature(one_featureX) # order entries by feature value
# iterate over all averages between 2 points
for(i in 1: (nrow(ordered_X)-1)) {
current_split = average(ordered_X[i,], ordered_X[i+1,])
[left, right] = splitMatrixByValue(one_featureX, current_split)
if(sml_type == 1) { # Regression
sim_score_left = calculateSimilarityScore(left, lambda)
sim_score_right = calculateSimilarityScore(right, lambda)
}
else { # Classification
sim_score_left = calculateSimilarityScoreClassification(left, predictions, lambda)
sim_score_right = calculateSimilarityScoreClassification(right, predictions, lambda)
}
current_gain = sim_score_left + sim_score_right - sim_score_parent
if(current_gain > best_gain) {
best_gain = current_gain
best_split = current_split
best_sim_score_left = sim_score_left
best_sim_score_right = sim_score_right
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: X: a nx1 matrix (one feature with all values)
#        value: the border used to separate the values
# OUTPUT: left: a matrix with all values smaller than "value"
#         right: a matrix with all values greater than or equal to "value"
splitMatrixByValue = function(Matrix[Double] X, Double value) return (Matrix[Double] left, Matrix[Double] right) {
assert(ncol(X) == 1)
left = matrix(0, rows=0, cols=1)
right = matrix(0, rows=0, cols=1)
# TODO perf vectorize
for(i in 1:nrow(X)) {
if(as.scalar(X[i,]) < value)
left = rbind(left,X[i,])
else
right = rbind(right, X[i,])
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: curr_X: a nx1 matrix (one feature with the values of the current node's samples, NaN otherwise)
#        X: a nx1 matrix, the feature column of the X matrix with all values
#        value: the border used to separate the values
# OUTPUT: left: a nx1 indicator matrix with 1.0 if the value is smaller than "value", otherwise 0.0
#         right: a nx1 indicator matrix with 1.0 if the value is greater than or equal to "value", otherwise 0.0
splitMatrixByValueLoop = function(Matrix[Double] curr_X, Matrix[Double] X, Double value)
return (Matrix[Double] left, Matrix[Double] right)
{
left = matrix(0, rows=nrow(X), cols=1)
right = matrix(0, rows=nrow(X), cols=1)
# TODO perf vectorize
for(i in 1:nrow(X)) {
if (as.scalar(X[i,]) == as.scalar(curr_X[i,])) { # if same row in curr_X and X
if (as.scalar(X[i,]) < value)
left[i,1] = 1
else
right[i,1] = 1
}
else { # 0 in left and right if not used in this node
left[i,1] = 0
right[i,1] = 0
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: curr_X: a nx1 matrix (one categorical feature with the values of the current node's samples)
#        X: a nx1 matrix, the feature column of the X matrix with all values
# OUTPUT: left: a nx1 indicator matrix with 1.0 if the category value is 1 (true), otherwise 0.0
#         right: a nx1 indicator matrix with 1.0 if the category value is 0 (false), otherwise 0.0
splitMatrixByCategory = function(Matrix[Double] curr_X, Matrix[Double] X)
return (Matrix[Double] left, Matrix[Double] right)
{
assert(ncol(curr_X) == 1)
assert(ncol(X) == 1)
left = matrix(0, rows=nrow(X), cols=1)
right = matrix(0, rows=nrow(X), cols=1)
for(i in 1:nrow(curr_X)) {
if (as.scalar(curr_X[i,]) == 1) # categorical true
left[i,1] = 1
else if(as.scalar(curr_X[i,]) == 0) # categorical false
right[i,1] = 1
else { # replace with NaN if not used for the current samples
left[i,1] = 'NaN'
right[i,1] = 'NaN'
}
}
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix containing the residuals
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: similarity_score: the similarity score of the residuals
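# i.e. similarity_score = (sum(residuals))^2 / (number of residuals + lambda),
# the XGBoost similarity score for squared-error loss, where each sample contributes a hessian of 1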
calculateSimilarityScore = function (matrix[Double] row_vector, Double lambda)
return (Double similarity_score)
{
similarity_score = (sum(row_vector)^2) / (nrow(row_vector) + lambda);
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix containing the residuals
# INPUT: predictions: a nx1 matrix of the current predicted probabilities
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: similarity_score: the similarity score of the residuals
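# i.e. similarity_score = (sum(residuals))^2 / (sum(p * (1 - p)) + lambda),
# the classification variant where p * (1 - p) over the current predicted probabilities replaces
# the plain sample count in the denominator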
calculateSimilarityScoreClassification = function (matrix[Double] row_vector, matrix[Double] predictions, Double lambda)
return (Double similarity_score)
{
  numerator = (sum(row_vector)^2)
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  similarity_score = numerator / denominator;
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: residuals_vector: a nx1 matrix containing the residuals
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: output_value: the output (prediction) value computed from the residuals
calculateOutputValue = function (matrix[Double] residuals_vector, Double lambda)
return (Double output_value)
{
output_value = (sum(residuals_vector)) / (nrow(residuals_vector) + lambda);
if(output_value == 'NaN') # just in case we have a node with no sample inside
output_value = 0.0
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: residuals_vector: a nx1 matrix containing the residuals
# INPUT: predictions: a nx1 matrix of the current predicted probabilities
# INPUT: lambda: the regularization parameter lambda
# OUTPUT: output_value: the output (prediction) value computed from the residuals
calculateOutputValueClassification = function (matrix[Double] residuals_vector, matrix[Double] predictions, Double lambda)
return (Double output_value)
{
  numerator = (sum(residuals_vector))
  d = predictions * (1 - predictions)
  denominator = sum(d) + lambda
  if(denominator == 0)
    output_value = 0
  else
    output_value = numerator / denominator
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix containing the current target values
#        prediction: a nx1 matrix containing the current prediction for each row
# OUTPUT: residuals_row: the residuals (row_vector - prediction) as a nx1 matrix
calculateResiduals = function (matrix[Double] row_vector, matrix[Double] prediction)
return (matrix[Double] residuals_row)
{
residuals_row = row_vector - prediction;
}
#-----------------------------------------------------------------------------------------------------------------------
# INPUT: row_vector: a nx1 matrix with unordered values
# OUTPUT: row_vector_ordered: a nx1 matrix with the values in ascending order
orderFeature = function (matrix[double] row_vector)
return (matrix[double] row_vector_ordered)
{
row_vector_ordered = order(target=row_vector, by=1, decreasing=FALSE, index.return=FALSE);
}
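#-----------------------------------------------------------------------------------------------------------------------
# INPUT: x: a 1x1 matrix (one feature value)
#        y: a 1x1 matrix (the adjacent feature value)
# OUTPUT: avg: the midpoint (x + y) / 2, used as a candidate split threshold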
average = function(Matrix[Double] x, Matrix[Double] y) return (Double avg) {
avg = as.scalar((x + y) / 2)
}