scripts/builtin/decisionTree.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 #
 # THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES
 #
 # INPUT   PARAMETERS:
 # ---------------------------------------------------------------------------------------------
 # NAME    TYPE             DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X       Matrix[Double]   ---      Feature matrix X; note that X needs to be both recoded and dummy coded
 # Y       Matrix[Double]   ---      Label matrix Y; note that Y needs to be both recoded and dummy coded
 # R       Matrix[Double]   " "      Matrix R which for each feature in X contains the following information
 #                                     - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
 #                                     a scalar feature vector, other positive Integers indicate the number of categories
 #                                     If R is not provided by default all variables are assumed to be scale
 # bins    Integer          20       Number of equiheight bins per scale feature to choose thresholds
 # depth   Integer          25       Maximum depth of the learned tree
 # verbose Boolean          FALSE    boolean specifying if the algorithm should print information while executing
 # ---------------------------------------------------------------------------------------------
 # OUTPUT:
 # Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
 #  M[1,j]: id of node j (in a complete binary tree)
 #  M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
 #  M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical)
 #    that node j looks at if j is an internal node, otherwise 0
 #  M[4,j]: Type of the feature that node j looks at if j is an internal node: holds the same information as R input vector
 #  M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
 #    stored in rows 6,7,... if j is categorical
 #    If j is a leaf node: number of misclassified samples reaching at node j
 #  M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale,
 #     otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
 #     If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
 # -------------------------------------------------------------------------------------------

 m_decisionTree = function(
   Matrix[Double] X,
   Matrix[Double] Y,
   Matrix[Double] R,
   Integer bins = 10,
   Integer depth = 20,
   Boolean verbose = FALSE
 ) return (Matrix[Double] M) {
   if (verbose) {
     print("Executing Decision Tree:")
   }
   node_queue = matrix(1, rows=1, cols=1)     # Add first Node
   impurity_queue = matrix(1, rows=1, cols=1)
   use_cols_queue = matrix(1, rows=ncol(X), cols=1)   # Add fist bool Vector with all cols <=> (use all cols)
   use_rows_queue = matrix(1, rows=nrow(X), cols=1)   # Add fist bool Vector with all rows <=> (use all rows)
   queue_length = 1
   M = matrix(0, rows = 0, cols = 0)
   while (queue_length > 0) {
     [node_queue, node] = dataQueuePop(node_queue)
     [use_rows_queue, use_rows_vector] = dataQueuePop(use_rows_queue)
     [use_cols_queue, use_cols_vector] = dataQueuePop(use_cols_queue)

     available_rows = calcAvailable(use_rows_vector)
     available_cols = calcAvailable(use_cols_vector)
     [impurity_queue, parent_impurity] = dataQueuePop(impurity_queue)
     create_child_nodes_flag = FALSE
       if (verbose) {
         print("Popped Node:  " + as.scalar(node))
         print("Rows:     " + toString(t(use_rows_vector)))
         print("Cols:     " + toString(t(use_cols_vector)))
         print("Available Rows:   " + available_rows)
         print("Available Cols:   " + available_cols)
         print("Parent impurity:  " + as.scalar(parent_impurity))
       }

     node_depth = calculateNodeDepth(node)
     used_col = 0.0
     if (node_depth < depth & available_rows > 1 & available_cols > 0 & as.scalar(parent_impurity) > 0) {
       [impurity, used_col, threshold, type] = calcBestSplittingCriteria(X, Y, R, use_rows_vector, use_cols_vector, bins)
       create_child_nodes_flag = impurity < as.scalar(parent_impurity)
         if (verbose) {
       print("Current impurity:   " + impurity)
       print("Current threshold:  "+ toString(t(threshold)))
         }
     }
     if (verbose) {

      print("Current column:   " + used_col)
      print("Current type:   " + type)
     }
     if (create_child_nodes_flag) {
       [left, right] = calculateChildNodes(node)
       node_queue = dataQueuePush(left, right, node_queue)

       [new_use_cols_vector, left_use_rows_vector, right_use_rows_vector] = splitData(X, use_rows_vector, use_cols_vector, used_col, threshold, type)
       use_rows_queue = dataQueuePush(left_use_rows_vector, right_use_rows_vector, use_rows_queue)
       use_cols_queue = dataQueuePush(new_use_cols_vector, new_use_cols_vector, use_cols_queue)

       impurity_queue = dataQueuePush(matrix(impurity, rows = 1, cols = 1), matrix(impurity, rows = 1, cols = 1), impurity_queue)
       offset = dataQueueLength(node_queue) - 1
       M = outputMatrixBind(M, node, offset, used_col, R, threshold)
     } else {
       M = outputMatrixBind(M, node, 0.0, used_col, R, matrix(0, rows = 1, cols = 1))
     }
     queue_length = dataQueueLength(node_queue)# -- user-defined function calls not supported in relational expressions

     if (verbose) {
       print("New QueueLen:   " + queue_length)
       print("")
     }
   }
 }

 dataQueueLength = function(Matrix[Double] queue)  return (Double len) {
   len = ncol(queue)
 }

 dataQueuePop = function(Matrix[Double] queue)  return (Matrix[Double] new_queue, Matrix[Double] node) {
   node = matrix(queue[,1], rows=1, cols=nrow(queue))   # reshape to force the creation of a new object
   node = matrix(node, rows=nrow(queue), cols=1)    # reshape to force the creation of a new object
   len = dataQueueLength(queue)
   if (len < 2) {
     new_queue = matrix(0,0,0)
   } else {
     new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1)
   }
 }

 dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue)  return (Matrix[Double] new_queue) {
   len = dataQueueLength(queue)
   if(len <= 0) {
     new_queue = cbind(left, right)
   } else {
     new_queue = cbind(queue, left, right)
   }
 }

 dataVectorLength = function(Matrix[Double] vector)  return (Double len) {
   len = nrow(vector)
 }

 dataColVectorLength = function(Matrix[Double] vector)  return (Double len) {
   len = ncol(vector)
 }

 dataVectorGet = function(Matrix[Double] vector, Double index)  return (Double value) {
   value = as.scalar(vector[index, 1])
 }

 dataVectorSet = function(Matrix[Double] vector, Double index, Double data) return (Matrix[Double] new_vector) {
   vector[index, 1] = data
   new_vector = vector
 }

 calcAvailable = function(Matrix[Double] vector) return(Double available_elements){
   len = dataVectorLength(vector)
   available_elements = 0.0
   for (index in 1:len) {
     element = dataVectorGet(vector, index)
     if(element > 0.0) {
       available_elements = available_elements + 1.0
     }
   }
 }

 calculateNodeDepth = function(Matrix[Double] node)  return(Double depth) {
   depth = log(as.scalar(node), 2) + 1
 }

 calculateChildNodes = function(Matrix[Double] node)  return(Matrix[Double] left, Matrix[Double] right) {
   left = node * 2.0
   right = node * 2.0 + 1.0
 }

 getTypeOfCol = function(Matrix[Double] R, Double col)  return(Double type) {  # 1..scalar,  2..categorical
   type = as.scalar(R[1, col])
 }

 extrapolateOrderedScalarFeatures = function(
   Matrix[Double] X,
   Matrix[Double] use_rows_vector,
   Double col) return (Matrix[Double] feature_vector) {
   feature_vector = matrix(1, rows = 1, cols = 1)
   len = nrow(X)
   first_time = TRUE
   for(row in 1:len) {
     use_feature = dataVectorGet(use_rows_vector, row)
     if (use_feature != 0) {
       if(first_time) {
         feature_vector[1,1] = X[row, col]
         first_time = FALSE
       } else {
         feature_vector = rbind(feature_vector, X[row, col])
       }
     }
   }
   feature_vector = order(target=feature_vector, by=1, decreasing=FALSE, index.return=FALSE)
 }

 calcPossibleThresholdsScalar = function(
   Matrix[Double] X,
   Matrix[Double] use_rows_vector,
   Double col,
   int bins) return (Matrix[Double] thresholds) {
   ordered_features = extrapolateOrderedScalarFeatures(X, use_rows_vector, col)
   ordered_features_len = dataVectorLength(ordered_features)
   thresholds = matrix(1, rows = 1, cols = ordered_features_len - 1)
   virtual_length = min(ordered_features_len, 20)
   step_length = ordered_features_len / virtual_length
   if (ordered_features_len > 1) {
     for (index in 1:(virtual_length - 1)) {
       real_index = index * step_length
       mean = (dataVectorGet(ordered_features, real_index) + dataVectorGet(ordered_features, real_index + 1)) / 2
       thresholds[1, index] = mean
     }
   }
 }

 calcPossibleThresholdsCategory = function(Double type) return (Matrix[Double] thresholds) {
   numberThresholds = 2 ^ type
   thresholds = matrix(-1, rows = type, cols = numberThresholds)
   toggleFactor = numberThresholds / 2

   for (index in 1:type) {
     beginCols = 1
     endCols = toggleFactor
     iterations = numberThresholds / toggleFactor / 2
     for (it in 1:iterations) {
       category_val = type - index + 1
       thresholds[index, beginCols:endCols] = matrix(category_val, rows = 1, cols = toggleFactor)
       endCols = endCols + 2 * toggleFactor
       beginCols = beginCols + 2 * toggleFactor
     }
     toggleFactor = toggleFactor / 2
     iterations = numberThresholds / toggleFactor / 2
   }
   ncol = ncol(thresholds)
   if (ncol > 2.0) {
     thresholds = cbind(thresholds[,2:ncol-2], thresholds[,ncol-1])
   }
 }

 calcGiniImpurity = function(Double num_true, Double num_false) return (Double impurity) {
   prop_true = num_true / (num_true + num_false)
   prop_false = num_false / (num_true + num_false)
   impurity = 1 - (prop_true ^ 2) - (prop_false ^ 2)
 }

 calcImpurity = function(
   Matrix[Double] X,
   Matrix[Double] Y,
   Matrix[Double] use_rows_vector,
   Double col,
   Double type,
   int bins) return (Double impurity, Matrix[Double] threshold) {

   is_scalar_type = typeIsScalar(type)
   if (is_scalar_type) {
     possible_thresholds = calcPossibleThresholdsScalar(X, use_rows_vector, col, bins)
   } else {
     possible_thresholds = calcPossibleThresholdsCategory(type)
   }
   len_thresholds = ncol(possible_thresholds)
   impurity = 1
   threshold = matrix(0, rows=1, cols=1)
   for (index in 1:len_thresholds) {
     [false_rows, true_rows] = splitRowsVector(X, use_rows_vector, col, possible_thresholds[, index], type)
     num_true_positive = 0; num_false_positive = 0; num_true_negative = 0; num_false_negative = 0
     len = dataVectorLength(use_rows_vector)
     for (c_row in 1:len) {
       true_row_data = dataVectorGet(true_rows, c_row)
       false_row_data = dataVectorGet(false_rows, c_row)
       if (true_row_data != 0 & false_row_data == 0) { # IT'S POSITIVE!
         if (as.scalar(Y[c_row, 1]) != 0) {
           num_true_positive = num_true_positive + 1
         } else {
           num_false_positive = num_false_positive + 1
         }
       } else if (true_row_data == 0 & false_row_data != 0) { # IT'S NEGATIVE
         if (as.scalar(Y[c_row, 1]) != 0.0) {
           num_false_negative = num_false_negative + 1
         } else {
           num_true_negative = num_true_negative + 1
         }
       }
     }
     impurity_positive_branch = calcGiniImpurity(num_true_positive, num_false_positive)
     impurity_negative_branch = calcGiniImpurity(num_true_negative, num_false_negative)
     num_samples = num_true_positive + num_false_positive + num_true_negative + num_false_negative
     num_negative = num_true_negative + num_false_negative
     num_positive = num_true_positive + num_false_positive
     c_impurity = num_positive / num_samples * impurity_positive_branch + num_negative / num_samples * impurity_negative_branch
     if (c_impurity <= impurity) {
       impurity = c_impurity
       threshold = possible_thresholds[, index]
     }
   }
 }

 calcBestSplittingCriteria = function(
   Matrix[Double] X,
   Matrix[Double] Y,
   Matrix[Double] R,
   Matrix[Double] use_rows_vector,
   Matrix[Double] use_cols_vector,
   int bins)  return (Double impurity, Double used_col, Matrix[Double] threshold, Double type) {

   impurity = 1
   used_col = 1
   threshold = matrix(0, 1, 1)
   type = 1
   # -- user-defined function calls not supported for iterable predicates
   len = dataVectorLength(use_cols_vector)
   for (c_col in 1:len) {
     use_feature = dataVectorGet(use_cols_vector, c_col)
     if (use_feature != 0) {
       c_type = getTypeOfCol(R, c_col)
       [c_impurity, c_threshold] = calcImpurity(X, Y, use_rows_vector, c_col, c_type, bins)
       if(c_impurity <= impurity) {
         impurity = c_impurity
         used_col = c_col
         threshold = c_threshold
         type = c_type
       }
     }
   }
 }

 typeIsScalar = function(Double type) return(Boolean b) {
   b = type == 1.0
 }

 splitRowsVector = function(
   Matrix[Double] X,
   Matrix[Double] use_rows_vector,
   Double col,
   Matrix[Double] threshold,
   Double type
 ) return (Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) {
   type_is_scalar = typeIsScalar(type)
   false_use_rows_vector = use_rows_vector
   true_use_rows_vector = use_rows_vector

   if (type_is_scalar) {
     scalar_threshold = as.scalar(threshold[1,1])
     len = dataVectorLength(use_rows_vector)
     for (c_row in 1:len) {
       row_enabled = dataVectorGet(use_rows_vector, c_row)
       if (row_enabled != 0) {
       if (as.scalar(X[c_row, col]) > scalar_threshold) {
         false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0)
       } else {
         true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0)
       }
       }
     }
   } else {
     len = dataVectorLength(use_rows_vector)
     for (c_row in 1:len) {
       row_enabled = dataVectorGet(use_rows_vector, c_row)
       if (row_enabled != 0) {
         categories_len = dataColVectorLength(threshold)
         move_sample_to_true_set = FALSE
         for (category_col_index in 1:categories_len) {
           desired_category = as.scalar(X[c_row, col])
           if(desired_category != -1) {
             category_of_threshold = threshold[type - desired_category + 1, category_col_index]
             move_sample_to_true_set = as.scalar(X[c_row, col]) == as.scalar(category_of_threshold)
           } else {
             #Todo: has category -1 to be considered?
             move_sample_to_true_set = TRUE
           }
         }
         if (move_sample_to_true_set) {
           false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0)
         } else {
           true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0)
         }
       }
     }
   }
 }

 splitData = function(
   Matrix[Double] X,
   Matrix[Double] use_rows_vector,
   Matrix[Double] use_cols_vector,
   Double col,
   Matrix[Double] threshold,
   Double type
 ) return (Matrix[Double] new_use_cols_vector, Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) {
   new_use_cols_vector = dataVectorSet(use_cols_vector, col, 0.0)
   [false_use_rows_vector, true_use_rows_vector] = splitRowsVector(X, use_rows_vector, col, threshold, type)
 }

 outputMatrixBind = function(
   Matrix[Double] M,
   Matrix[Double] node,
   Double offset,
   Double used_col,
   Matrix[Double] R,
   Matrix[Double] threshold
 ) return (Matrix[Double] new_M) {
   col = matrix(0, rows = 5, cols = 1)
   col[1, 1] = node[1, 1]
   col[2, 1] = offset
   col[3, 1] = used_col
   if (used_col >= 1.0) { col[4, 1] = R[1, used_col] }
   col[5, 1] = nrow(threshold)
   col = rbind(col, threshold)

   if (ncol(M) == 0 & nrow(M) == 0) {
     new_M = col
   } else {
     row_difference = nrow(M) - nrow(col)
   if (row_difference < 0.0) {
     buffer = matrix(-1, rows = -row_difference, cols = ncol(M))
     M = rbind(M, buffer)
   } else if (row_difference > 0.0) {
     buffer = matrix(-1, rows = row_difference, cols = 1)
     col = rbind(col, buffer)
   }
     new_M = cbind(M, col)
   }
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	#
	# THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES
	#
	# INPUT PARAMETERS:
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X Matrix[Double] --- Feature matrix X; note that X needs to be both recoded and dummy coded
	# Y Matrix[Double] --- Label matrix Y; note that Y needs to be both recoded and dummy coded
	# R Matrix[Double] " " Matrix R which for each feature in X contains the following information
	# - R[1,]: Row Vector which indicates if feature vector is scalar or categorical. 1 indicates
	# a scalar feature vector, other positive Integers indicate the number of categories
	# If R is not provided by default all variables are assumed to be scale
	# bins Integer 20 Number of equiheight bins per scale feature to choose thresholds
	# depth Integer 25 Maximum depth of the learned tree
	# verbose Boolean FALSE boolean specifying if the algorithm should print information while executing
	# ---------------------------------------------------------------------------------------------
	# OUTPUT:
	# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
	# M[1,j]: id of node j (in a complete binary tree)
	# M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0
	# M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical)
	# that node j looks at if j is an internal node, otherwise 0
	# M[4,j]: Type of the feature that node j looks at if j is an internal node: holds the same information as R input vector
	# M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values
	# stored in rows 6,7,... if j is categorical
	# If j is a leaf node: number of misclassified samples reaching at node j
	# M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale,
	# otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j
	# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0
	# -------------------------------------------------------------------------------------------

	m_decisionTree = function(
	Matrix[Double] X,
	Matrix[Double] Y,
	Matrix[Double] R,
	Integer bins = 10,
	Integer depth = 20,
	Boolean verbose = FALSE
	) return (Matrix[Double] M) {
	if (verbose) {
	print("Executing Decision Tree:")
	}
	node_queue = matrix(1, rows=1, cols=1) # Add first Node
	impurity_queue = matrix(1, rows=1, cols=1)
	use_cols_queue = matrix(1, rows=ncol(X), cols=1) # Add fist bool Vector with all cols <=> (use all cols)
	use_rows_queue = matrix(1, rows=nrow(X), cols=1) # Add fist bool Vector with all rows <=> (use all rows)
	queue_length = 1
	M = matrix(0, rows = 0, cols = 0)
	while (queue_length > 0) {
	[node_queue, node] = dataQueuePop(node_queue)
	[use_rows_queue, use_rows_vector] = dataQueuePop(use_rows_queue)
	[use_cols_queue, use_cols_vector] = dataQueuePop(use_cols_queue)

	available_rows = calcAvailable(use_rows_vector)
	available_cols = calcAvailable(use_cols_vector)
	[impurity_queue, parent_impurity] = dataQueuePop(impurity_queue)
	create_child_nodes_flag = FALSE
	if (verbose) {
	print("Popped Node: " + as.scalar(node))
	print("Rows: " + toString(t(use_rows_vector)))
	print("Cols: " + toString(t(use_cols_vector)))
	print("Available Rows: " + available_rows)
	print("Available Cols: " + available_cols)
	print("Parent impurity: " + as.scalar(parent_impurity))
	}

	node_depth = calculateNodeDepth(node)
	used_col = 0.0
	if (node_depth < depth & available_rows > 1 & available_cols > 0 & as.scalar(parent_impurity) > 0) {
	[impurity, used_col, threshold, type] = calcBestSplittingCriteria(X, Y, R, use_rows_vector, use_cols_vector, bins)
	create_child_nodes_flag = impurity < as.scalar(parent_impurity)
	if (verbose) {
	print("Current impurity: " + impurity)
	print("Current threshold: "+ toString(t(threshold)))
	}
	}
	if (verbose) {

	print("Current column: " + used_col)
	print("Current type: " + type)
	}
	if (create_child_nodes_flag) {
	[left, right] = calculateChildNodes(node)
	node_queue = dataQueuePush(left, right, node_queue)

	[new_use_cols_vector, left_use_rows_vector, right_use_rows_vector] = splitData(X, use_rows_vector, use_cols_vector, used_col, threshold, type)
	use_rows_queue = dataQueuePush(left_use_rows_vector, right_use_rows_vector, use_rows_queue)
	use_cols_queue = dataQueuePush(new_use_cols_vector, new_use_cols_vector, use_cols_queue)

	impurity_queue = dataQueuePush(matrix(impurity, rows = 1, cols = 1), matrix(impurity, rows = 1, cols = 1), impurity_queue)
	offset = dataQueueLength(node_queue) - 1
	M = outputMatrixBind(M, node, offset, used_col, R, threshold)
	} else {
	M = outputMatrixBind(M, node, 0.0, used_col, R, matrix(0, rows = 1, cols = 1))
	}
	queue_length = dataQueueLength(node_queue)# -- user-defined function calls not supported in relational expressions

	if (verbose) {
	print("New QueueLen: " + queue_length)
	print("")
	}
	}
	}

	dataQueueLength = function(Matrix[Double] queue) return (Double len) {
	len = ncol(queue)
	}

	dataQueuePop = function(Matrix[Double] queue) return (Matrix[Double] new_queue, Matrix[Double] node) {
	node = matrix(queue[,1], rows=1, cols=nrow(queue)) # reshape to force the creation of a new object
	node = matrix(node, rows=nrow(queue), cols=1) # reshape to force the creation of a new object
	len = dataQueueLength(queue)
	if (len < 2) {
	new_queue = matrix(0,0,0)
	} else {
	new_queue = matrix(queue[,2:ncol(queue)], rows=nrow(queue), cols=ncol(queue)-1)
	}
	}

	dataQueuePush = function(Matrix[Double] left, Matrix[Double] right, Matrix[Double] queue) return (Matrix[Double] new_queue) {
	len = dataQueueLength(queue)
	if(len <= 0) {
	new_queue = cbind(left, right)
	} else {
	new_queue = cbind(queue, left, right)
	}
	}

	dataVectorLength = function(Matrix[Double] vector) return (Double len) {
	len = nrow(vector)
	}

	dataColVectorLength = function(Matrix[Double] vector) return (Double len) {
	len = ncol(vector)
	}

	dataVectorGet = function(Matrix[Double] vector, Double index) return (Double value) {
	value = as.scalar(vector[index, 1])
	}

	dataVectorSet = function(Matrix[Double] vector, Double index, Double data) return (Matrix[Double] new_vector) {
	vector[index, 1] = data
	new_vector = vector
	}

	calcAvailable = function(Matrix[Double] vector) return(Double available_elements){
	len = dataVectorLength(vector)
	available_elements = 0.0
	for (index in 1:len) {
	element = dataVectorGet(vector, index)
	if(element > 0.0) {
	available_elements = available_elements + 1.0
	}
	}
	}

	calculateNodeDepth = function(Matrix[Double] node) return(Double depth) {
	depth = log(as.scalar(node), 2) + 1
	}

	calculateChildNodes = function(Matrix[Double] node) return(Matrix[Double] left, Matrix[Double] right) {
	left = node * 2.0
	right = node * 2.0 + 1.0
	}

	getTypeOfCol = function(Matrix[Double] R, Double col) return(Double type) { # 1..scalar, 2..categorical
	type = as.scalar(R[1, col])
	}

	extrapolateOrderedScalarFeatures = function(
	Matrix[Double] X,
	Matrix[Double] use_rows_vector,
	Double col) return (Matrix[Double] feature_vector) {
	feature_vector = matrix(1, rows = 1, cols = 1)
	len = nrow(X)
	first_time = TRUE
	for(row in 1:len) {
	use_feature = dataVectorGet(use_rows_vector, row)
	if (use_feature != 0) {
	if(first_time) {
	feature_vector[1,1] = X[row, col]
	first_time = FALSE
	} else {
	feature_vector = rbind(feature_vector, X[row, col])
	}
	}
	}
	feature_vector = order(target=feature_vector, by=1, decreasing=FALSE, index.return=FALSE)
	}

	calcPossibleThresholdsScalar = function(
	Matrix[Double] X,
	Matrix[Double] use_rows_vector,
	Double col,
	int bins) return (Matrix[Double] thresholds) {
	ordered_features = extrapolateOrderedScalarFeatures(X, use_rows_vector, col)
	ordered_features_len = dataVectorLength(ordered_features)
	thresholds = matrix(1, rows = 1, cols = ordered_features_len - 1)
	virtual_length = min(ordered_features_len, 20)
	step_length = ordered_features_len / virtual_length
	if (ordered_features_len > 1) {
	for (index in 1:(virtual_length - 1)) {
	real_index = index * step_length
	mean = (dataVectorGet(ordered_features, real_index) + dataVectorGet(ordered_features, real_index + 1)) / 2
	thresholds[1, index] = mean
	}
	}
	}

	calcPossibleThresholdsCategory = function(Double type) return (Matrix[Double] thresholds) {
	numberThresholds = 2 ^ type
	thresholds = matrix(-1, rows = type, cols = numberThresholds)
	toggleFactor = numberThresholds / 2

	for (index in 1:type) {
	beginCols = 1
	endCols = toggleFactor
	iterations = numberThresholds / toggleFactor / 2
	for (it in 1:iterations) {
	category_val = type - index + 1
	thresholds[index, beginCols:endCols] = matrix(category_val, rows = 1, cols = toggleFactor)
	endCols = endCols + 2 * toggleFactor
	beginCols = beginCols + 2 * toggleFactor
	}
	toggleFactor = toggleFactor / 2
	iterations = numberThresholds / toggleFactor / 2
	}
	ncol = ncol(thresholds)
	if (ncol > 2.0) {
	thresholds = cbind(thresholds[,2:ncol-2], thresholds[,ncol-1])
	}
	}

	calcGiniImpurity = function(Double num_true, Double num_false) return (Double impurity) {
	prop_true = num_true / (num_true + num_false)
	prop_false = num_false / (num_true + num_false)
	impurity = 1 - (prop_true ^ 2) - (prop_false ^ 2)
	}

	calcImpurity = function(
	Matrix[Double] X,
	Matrix[Double] Y,
	Matrix[Double] use_rows_vector,
	Double col,
	Double type,
	int bins) return (Double impurity, Matrix[Double] threshold) {

	is_scalar_type = typeIsScalar(type)
	if (is_scalar_type) {
	possible_thresholds = calcPossibleThresholdsScalar(X, use_rows_vector, col, bins)
	} else {
	possible_thresholds = calcPossibleThresholdsCategory(type)
	}
	len_thresholds = ncol(possible_thresholds)
	impurity = 1
	threshold = matrix(0, rows=1, cols=1)
	for (index in 1:len_thresholds) {
	[false_rows, true_rows] = splitRowsVector(X, use_rows_vector, col, possible_thresholds[, index], type)
	num_true_positive = 0; num_false_positive = 0; num_true_negative = 0; num_false_negative = 0
	len = dataVectorLength(use_rows_vector)
	for (c_row in 1:len) {
	true_row_data = dataVectorGet(true_rows, c_row)
	false_row_data = dataVectorGet(false_rows, c_row)
	if (true_row_data != 0 & false_row_data == 0) { # IT'S POSITIVE!
	if (as.scalar(Y[c_row, 1]) != 0) {
	num_true_positive = num_true_positive + 1
	} else {
	num_false_positive = num_false_positive + 1
	}
	} else if (true_row_data == 0 & false_row_data != 0) { # IT'S NEGATIVE
	if (as.scalar(Y[c_row, 1]) != 0.0) {
	num_false_negative = num_false_negative + 1
	} else {
	num_true_negative = num_true_negative + 1
	}
	}
	}
	impurity_positive_branch = calcGiniImpurity(num_true_positive, num_false_positive)
	impurity_negative_branch = calcGiniImpurity(num_true_negative, num_false_negative)
	num_samples = num_true_positive + num_false_positive + num_true_negative + num_false_negative
	num_negative = num_true_negative + num_false_negative
	num_positive = num_true_positive + num_false_positive
	c_impurity = num_positive / num_samples * impurity_positive_branch + num_negative / num_samples * impurity_negative_branch
	if (c_impurity <= impurity) {
	impurity = c_impurity
	threshold = possible_thresholds[, index]
	}
	}
	}

	calcBestSplittingCriteria = function(
	Matrix[Double] X,
	Matrix[Double] Y,
	Matrix[Double] R,
	Matrix[Double] use_rows_vector,
	Matrix[Double] use_cols_vector,
	int bins) return (Double impurity, Double used_col, Matrix[Double] threshold, Double type) {

	impurity = 1
	used_col = 1
	threshold = matrix(0, 1, 1)
	type = 1
	# -- user-defined function calls not supported for iterable predicates
	len = dataVectorLength(use_cols_vector)
	for (c_col in 1:len) {
	use_feature = dataVectorGet(use_cols_vector, c_col)
	if (use_feature != 0) {
	c_type = getTypeOfCol(R, c_col)
	[c_impurity, c_threshold] = calcImpurity(X, Y, use_rows_vector, c_col, c_type, bins)
	if(c_impurity <= impurity) {
	impurity = c_impurity
	used_col = c_col
	threshold = c_threshold
	type = c_type
	}
	}
	}
	}

	typeIsScalar = function(Double type) return(Boolean b) {
	b = type == 1.0
	}

	splitRowsVector = function(
	Matrix[Double] X,
	Matrix[Double] use_rows_vector,
	Double col,
	Matrix[Double] threshold,
	Double type
	) return (Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) {
	type_is_scalar = typeIsScalar(type)
	false_use_rows_vector = use_rows_vector
	true_use_rows_vector = use_rows_vector

	if (type_is_scalar) {
	scalar_threshold = as.scalar(threshold[1,1])
	len = dataVectorLength(use_rows_vector)
	for (c_row in 1:len) {
	row_enabled = dataVectorGet(use_rows_vector, c_row)
	if (row_enabled != 0) {
	if (as.scalar(X[c_row, col]) > scalar_threshold) {
	false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0)
	} else {
	true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0)
	}
	}
	}
	} else {
	len = dataVectorLength(use_rows_vector)
	for (c_row in 1:len) {
	row_enabled = dataVectorGet(use_rows_vector, c_row)
	if (row_enabled != 0) {
	categories_len = dataColVectorLength(threshold)
	move_sample_to_true_set = FALSE
	for (category_col_index in 1:categories_len) {
	desired_category = as.scalar(X[c_row, col])
	if(desired_category != -1) {
	category_of_threshold = threshold[type - desired_category + 1, category_col_index]
	move_sample_to_true_set = as.scalar(X[c_row, col]) == as.scalar(category_of_threshold)
	} else {
	#Todo: has category -1 to be considered?
	move_sample_to_true_set = TRUE
	}
	}
	if (move_sample_to_true_set) {
	false_use_rows_vector = dataVectorSet(false_use_rows_vector, c_row, 0.0)
	} else {
	true_use_rows_vector = dataVectorSet(true_use_rows_vector, c_row, 0.0)
	}
	}
	}
	}
	}

	splitData = function(
	Matrix[Double] X,
	Matrix[Double] use_rows_vector,
	Matrix[Double] use_cols_vector,
	Double col,
	Matrix[Double] threshold,
	Double type
	) return (Matrix[Double] new_use_cols_vector, Matrix[Double] false_use_rows_vector, Matrix[Double] true_use_rows_vector) {
	new_use_cols_vector = dataVectorSet(use_cols_vector, col, 0.0)
	[false_use_rows_vector, true_use_rows_vector] = splitRowsVector(X, use_rows_vector, col, threshold, type)
	}

	outputMatrixBind = function(
	Matrix[Double] M,
	Matrix[Double] node,
	Double offset,
	Double used_col,
	Matrix[Double] R,
	Matrix[Double] threshold
	) return (Matrix[Double] new_M) {
	col = matrix(0, rows = 5, cols = 1)
	col[1, 1] = node[1, 1]
	col[2, 1] = offset
	col[3, 1] = used_col
	if (used_col >= 1.0) { col[4, 1] = R[1, used_col] }
	col[5, 1] = nrow(threshold)
	col = rbind(col, threshold)

	if (ncol(M) == 0 & nrow(M) == 0) {
	new_M = col
	} else {
	row_difference = nrow(M) - nrow(col)
	if (row_difference < 0.0) {
	buffer = matrix(-1, rows = -row_difference, cols = ncol(M))
	M = rbind(M, buffer)
	} else if (row_difference > 0.0) {
	buffer = matrix(-1, rows = row_difference, cols = 1)
	col = rbind(col, buffer)
	}
	new_M = cbind(M, col)
	}
	}