scripts/builtin/decisionTreePredict.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # This script implements random forest prediction for recoded and binned
 # categorical and numerical input features.
 # Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf).
 #
 # INPUT:
 # ------------------------------------------------------------------------------
 # X               Feature matrix in recoded/binned representation
 # y               Label matrix in recoded/binned representation,
 #                 optional for accuracy evaluation
 # ctypes          Row-Vector of column types [1 scale/ordinal, 2 categorical]
 # M               Matrix M holding the learned tree in linearized form
 #                 see decisionTree() for the detailed tree representation.
 # strategy        Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
 #                 referring to "Generic matrix multiplication",
 #                 "Tree traversal", and "Perfect tree traversal", respectively
 # verbose         Flag indicating verbose debug output
 # ------------------------------------------------------------------------------
 #
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # yhat            Label vector of predictions
 # ------------------------------------------------------------------------------

 m_decisionTreePredict = function(Matrix[Double] X, Matrix[Double] y = matrix(0,0,0),
     Matrix[Double] ctypes, Matrix[Double] M, String strategy="TT", Boolean verbose = FALSE)
   return (Matrix[Double] yhat)
 {
   if( strategy == "TT" )
     yhat = predict_TT(M, X);
   else if( strategy == "GEMM" )
     yhat = predict_GEMM(M, X);
   else {
     print ("No such strategy" + strategy)
     yhat = matrix("0", rows=0, cols=0)
   }
 }

 predict_TT = function (Matrix[Double] M, Matrix[Double] X)
   return (Matrix[Double] yhat)
 {
   # initialization of model tensors and parameters
   [N, N_L, N_R, N_F, N_T, C] = createTTNodeTensors(M)
   nr = nrow(X); n = ncol(N_L);
   tree_depth = ceiling(log(max(N)+1,2)) # max depth


   Ti = matrix(1, nr, 1); # current nodes (start at root)
   noChange = FALSE; i = 1;
   while( !noChange & i <= tree_depth) {
     P = table(seq(1,nr), Ti, nr, n);
     TF = P %*% t(N_F); # get node feature indexes
     Tv = rowSums(X * table(seq(1,nr),TF,nr,ncol(X))); # get feature values
     Tt = P %*% t(N_T); # get node thresholds
     TL = P %*% t(N_L); # get node left paths
     TR = P %*% t(N_R); # get node right paths
     # pick left or right path for each record separately
     Ti_new = ifelse(Tv <= Tt, TL, TR);
     noChange = (sum(Ti != Ti_new) == 0);
     i = i + 1;
     Ti = Ti_new;
   }

   # extract classes
   yhat = table(seq(1,nr), Ti, nr, n) %*%  C;
 }

 predict_GEMM = function (Matrix[Double] M, Matrix[Double] X)
   return (Matrix[Double] Y)
 {
   # initialization of model tensors and parameters
   [A, B, C, D, E] = createGEMMNodeTensors(M, ncol(X));

   # scoring pipline, evaluating all nodes in parallel
   Y = rowIndexMax(((((X %*% A) <= B) %*% C) == D) %*% E);
 }

 createTTNodeTensors = function( Matrix[Double] M )
   return (Matrix[Double] N, Matrix[Double] N_L, Matrix[Double] N_R,
           Matrix[Double] N_F, Matrix[Double] N_T, Matrix[Double] C)
 {
   # all tree nodes (inner and leaf nodes)
   M2 = matrix(M, rows=ncol(M)/2, cols=2);
   NID = seq(1, nrow(M2));
   nI = (M2[,1]!=0 | M2[,2]!=0)
   cnI = cumsum(nI);
   N = t(removeEmpty(target=NID, margin="rows", select=nI));
   n_nodes = ncol(N)

   # left/right child node ids, default self-id
   N_L = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID, NID), margin="rows", select=nI));
   N_R = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID+1, NID), margin="rows", select=nI));
   # recoding to node vector positions (removed non-existing nodes)
   N_L = t(table(seq(1,n_nodes), t(N_L), n_nodes, nrow(M2)) %*% cnI);
   N_R = t(table(seq(1,n_nodes), t(N_R), n_nodes, nrow(M2)) %*% cnI);

   # node feature IDs (positions) and threshold values
   N_F = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,1], 1), margin="rows", select=nI));
   N_T = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,2], 0), margin="rows", select=nI));

   C = removeEmpty(target=M2[,2], margin="rows", select=nI);
 }

 createGEMMNodeTensors = function( Matrix[Double] M, Int m )
   return (Matrix[Double] A, Matrix[Double] B, Matrix[Double] C,
   Matrix[Double] D, Matrix[Double] E)
 {
   M2 = matrix(M, rows=ncol(M)/2, cols=2)
   NID = seq(1, nrow(M2))

   # predicate map [#feat x #inodes] and values [1 x #inodes]
   is_inner = M2[,1]!=0
   I1 = removeEmpty(target=NID, margin="rows", select=is_inner)
   pivot = removeEmpty(target=M2[,1], margin="rows", select=is_inner)
   nin = nrow(I1)
   A = table(pivot, seq(1,nin), m, nin)
   B = t(removeEmpty(target=M2[,2], margin="rows", select=is_inner))

   # bucket paths [#inodes x #paths] and path sums
   is_leaf = (!is_inner & M2[,2]!=0)
   leaf_ids = t(removeEmpty(target=NID, margin="rows", select=is_leaf))
   last_leaf = as.scalar(leaf_ids[1,ncol(leaf_ids)])
   leaf_classes = removeEmpty(target=M2[,2], margin="rows", select=is_leaf)
   nl = ncol(leaf_ids)

   # iterate over each inner node and check for each leaf node if it is the left subtree (1), right subtree (-1) or not included (0)
   #              | i |
   #           /        \
   #        |2i|       |2i+1|
   #       /    \     /      \
   #     |4i| |4i+1| |4i+2| |4i+3|
   #
   # left_subtree_of_node(i) = { x | (2^j)*i <= x < (2^j)*i + 2^(j-1), for j elem {1, 2, 3, ...}} -> j is the level of tree
   # right_subtree_of_node(i) = { x | (2^j)*i + 2^(j-1) <= x < (2^j + 1)*i, for j elem {1, 2, 3, ...}}

   C = matrix(0, nin, nl)
   parfor(i in seq(1, nin)){
     boundary_left = 2*as.scalar(I1[i, 1]) # initialize the left boundary with the left child of the inner node
     out = matrix(0, 1, nl)
     step_size = 1

     # iterate each level of tree [log(max_node_id) iterations]
     while(boundary_left < last_leaf) {

       # left side
       subset_lower_bound = leaf_ids >= boundary_left
       boundary_right = boundary_left + step_size
       subset_upper_bound = leaf_ids < boundary_right
       ones = subset_lower_bound & subset_upper_bound
       out = out + ones

       # right side
       subset_lower_bound = !subset_upper_bound #reuse by inverting
       boundary_right = boundary_right + step_size
       subset_upper_bound = leaf_ids < boundary_right
       ones = subset_lower_bound & subset_upper_bound
       out = out - ones

       step_size = step_size*2 # with each level the amount of nodes in subtree level doubles
       boundary_left = boundary_left*2 # new left boundary is the left child of the previous left boundary
     }
     C[i,] = out
   }
   D = colSums(max(C, 0));

   # class map [#paths x #classes]
   E = table(seq(1,ncol(C)),leaf_classes)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# This script implements random forest prediction for recoded and binned
	# categorical and numerical input features.
	# Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf).
	#
	# INPUT:
	# ------------------------------------------------------------------------------
	# X Feature matrix in recoded/binned representation
	# y Label matrix in recoded/binned representation,
	# optional for accuracy evaluation
	# ctypes Row-Vector of column types [1 scale/ordinal, 2 categorical]
	# M Matrix M holding the learned tree in linearized form
	# see decisionTree() for the detailed tree representation.
	# strategy Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
	# referring to "Generic matrix multiplication",
	# "Tree traversal", and "Perfect tree traversal", respectively
	# verbose Flag indicating verbose debug output
	# ------------------------------------------------------------------------------
	#
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# yhat Label vector of predictions
	# ------------------------------------------------------------------------------

	m_decisionTreePredict = function(Matrix[Double] X, Matrix[Double] y = matrix(0,0,0),
	Matrix[Double] ctypes, Matrix[Double] M, String strategy="TT", Boolean verbose = FALSE)
	return (Matrix[Double] yhat)
	{
	if( strategy == "TT" )
	yhat = predict_TT(M, X);
	else if( strategy == "GEMM" )
	yhat = predict_GEMM(M, X);
	else {
	print ("No such strategy" + strategy)
	yhat = matrix("0", rows=0, cols=0)
	}
	}

	predict_TT = function (Matrix[Double] M, Matrix[Double] X)
	return (Matrix[Double] yhat)
	{
	# initialization of model tensors and parameters
	[N, N_L, N_R, N_F, N_T, C] = createTTNodeTensors(M)
	nr = nrow(X); n = ncol(N_L);
	tree_depth = ceiling(log(max(N)+1,2)) # max depth


	Ti = matrix(1, nr, 1); # current nodes (start at root)
	noChange = FALSE; i = 1;
	while( !noChange & i <= tree_depth) {
	P = table(seq(1,nr), Ti, nr, n);
	TF = P %*% t(N_F); # get node feature indexes
	Tv = rowSums(X * table(seq(1,nr),TF,nr,ncol(X))); # get feature values
	Tt = P %*% t(N_T); # get node thresholds
	TL = P %*% t(N_L); # get node left paths
	TR = P %*% t(N_R); # get node right paths
	# pick left or right path for each record separately
	Ti_new = ifelse(Tv <= Tt, TL, TR);
	noChange = (sum(Ti != Ti_new) == 0);
	i = i + 1;
	Ti = Ti_new;
	}

	# extract classes
	yhat = table(seq(1,nr), Ti, nr, n) %*% C;
	}

	predict_GEMM = function (Matrix[Double] M, Matrix[Double] X)
	return (Matrix[Double] Y)
	{
	# initialization of model tensors and parameters
	[A, B, C, D, E] = createGEMMNodeTensors(M, ncol(X));

	# scoring pipline, evaluating all nodes in parallel
	Y = rowIndexMax(((((X %% A) <= B) %% C) == D) %*% E);
	}

	createTTNodeTensors = function( Matrix[Double] M )
	return (Matrix[Double] N, Matrix[Double] N_L, Matrix[Double] N_R,
	Matrix[Double] N_F, Matrix[Double] N_T, Matrix[Double] C)
	{
	# all tree nodes (inner and leaf nodes)
	M2 = matrix(M, rows=ncol(M)/2, cols=2);
	NID = seq(1, nrow(M2));
	nI = (M2[,1]!=0 \| M2[,2]!=0)
	cnI = cumsum(nI);
	N = t(removeEmpty(target=NID, margin="rows", select=nI));
	n_nodes = ncol(N)

	# left/right child node ids, default self-id
	N_L = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID, NID), margin="rows", select=nI));
	N_R = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID+1, NID), margin="rows", select=nI));
	# recoding to node vector positions (removed non-existing nodes)
	N_L = t(table(seq(1,n_nodes), t(N_L), n_nodes, nrow(M2)) %*% cnI);
	N_R = t(table(seq(1,n_nodes), t(N_R), n_nodes, nrow(M2)) %*% cnI);

	# node feature IDs (positions) and threshold values
	N_F = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,1], 1), margin="rows", select=nI));
	N_T = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,2], 0), margin="rows", select=nI));

	C = removeEmpty(target=M2[,2], margin="rows", select=nI);
	}

	createGEMMNodeTensors = function( Matrix[Double] M, Int m )
	return (Matrix[Double] A, Matrix[Double] B, Matrix[Double] C,
	Matrix[Double] D, Matrix[Double] E)
	{
	M2 = matrix(M, rows=ncol(M)/2, cols=2)
	NID = seq(1, nrow(M2))

	# predicate map [#feat x #inodes] and values [1 x #inodes]
	is_inner = M2[,1]!=0
	I1 = removeEmpty(target=NID, margin="rows", select=is_inner)
	pivot = removeEmpty(target=M2[,1], margin="rows", select=is_inner)
	nin = nrow(I1)
	A = table(pivot, seq(1,nin), m, nin)
	B = t(removeEmpty(target=M2[,2], margin="rows", select=is_inner))

	# bucket paths [#inodes x #paths] and path sums
	is_leaf = (!is_inner & M2[,2]!=0)
	leaf_ids = t(removeEmpty(target=NID, margin="rows", select=is_leaf))
	last_leaf = as.scalar(leaf_ids[1,ncol(leaf_ids)])
	leaf_classes = removeEmpty(target=M2[,2], margin="rows", select=is_leaf)
	nl = ncol(leaf_ids)

	# iterate over each inner node and check for each leaf node if it is the left subtree (1), right subtree (-1) or not included (0)
	# \| i \|
	# / \
	# \|2i\| \|2i+1\|
	# / \ / \
	# \|4i\| \|4i+1\| \|4i+2\| \|4i+3\|
	#
	# left_subtree_of_node(i) = { x \| (2^j)i <= x < (2^j)i + 2^(j-1), for j elem {1, 2, 3, ...}} -> j is the level of tree
	# right_subtree_of_node(i) = { x \| (2^j)i + 2^(j-1) <= x < (2^j + 1)i, for j elem {1, 2, 3, ...}}

	C = matrix(0, nin, nl)
	parfor(i in seq(1, nin)){
	boundary_left = 2*as.scalar(I1[i, 1]) # initialize the left boundary with the left child of the inner node
	out = matrix(0, 1, nl)
	step_size = 1

	# iterate each level of tree [log(max_node_id) iterations]
	while(boundary_left < last_leaf) {

	# left side
	subset_lower_bound = leaf_ids >= boundary_left
	boundary_right = boundary_left + step_size
	subset_upper_bound = leaf_ids < boundary_right
	ones = subset_lower_bound & subset_upper_bound
	out = out + ones

	# right side
	subset_lower_bound = !subset_upper_bound #reuse by inverting
	boundary_right = boundary_right + step_size
	subset_upper_bound = leaf_ids < boundary_right
	ones = subset_lower_bound & subset_upper_bound
	out = out - ones

	step_size = step_size*2 # with each level the amount of nodes in subtree level doubles
	boundary_left = boundary_left*2 # new left boundary is the left child of the previous left boundary
	}
	C[i,] = out
	}
	D = colSums(max(C, 0));

	# class map [#paths x #classes]
	E = table(seq(1,ncol(C)),leaf_classes)
	}