#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements random forest prediction for recoded and binned
# categorical and numerical input features.
#
# INPUT:
# ------------------------------------------------------------------------------
# X        Feature matrix in recoded/binned representation
# y        Label matrix in recoded/binned representation,
#          optional for accuracy evaluation
# ctypes   Row-Vector of column types [1 scale/ordinal, 2 categorical]
#          of length ncol(X)+1, where the last entry is the label type
# M        Matrix M holding the learned trees (one tree per row),
#          see randomForest() for the detailed tree representation.
# verbose  Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# yhat     Label vector of predictions
# ------------------------------------------------------------------------------
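#
# Example (illustrative usage sketch, not part of this script; the read() calls,
# file names, and the all-categorical ctypes below are assumptions for
# demonstration, and inputs must already be recoded/binned, e.g., via
# transformencode):
#
#   X = read("X_encoded.csv");                    # recoded/binned features
#   y = read("y_encoded.csv");                    # recoded labels (optional)
#   ctypes = matrix(2, rows=1, cols=ncol(X)+1);   # column types incl. label column
#   M = randomForest(X=X, y=y, ctypes=ctypes);    # train the forest
#   yhat = randomForestPredict(X=X, y=y, ctypes=ctypes, M=M, verbose=TRUE);
#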
m_randomForestPredict = function(Matrix[Double] X, Matrix[Double] y = matrix(0,0,0),
Matrix[Double] ctypes, Matrix[Double] M, Boolean verbose = FALSE)
return(Matrix[Double] yhat)
{
t1 = time();
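# the last entry of ctypes is the label type: 2 (categorical) -> classification,
# otherwise regression; labels y are optional and only used for the summary stats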
classify = as.scalar(ctypes[1,ncol(X)+1]) == 2;
yExists = (nrow(X)==nrow(y));
if(verbose) {
print("randomForestPredict: called for batch of "+nrow(X)+" rows, model of "
+nrow(M)+" trees, and with labels-provided "+yExists+".");
}
# scoring of num_tree decision trees
Ytmp = matrix(0, rows=nrow(M), cols=nrow(X));
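# Ytmp collects the per-tree predictions (one row per tree, one column per row of X)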
parfor(i in 1:nrow(M)) {
if( verbose )
print("randomForest: start scoring tree "+i+"/"+nrow(M)+".");
# step 1: select the feature subset sampled during training (indicator in M[i,1:ncol(X)])
I2 = M[i, 1:ncol(X)];
Xi = removeEmpty(target=X, margin="cols", select=I2);
# step 2: score decision tree
t2 = time();
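# the model of tree i occupies the columns of M[i,] after the ncol(X) feature indicators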
ret = decisionTreePredict(X=Xi, M=M[i,ncol(X)+1:ncol(M)], ctypes=ctypes, strategy="TT");
Ytmp[i,1:nrow(ret)] = t(ret);
if( verbose )
print("-- ["+i+"] scored decision tree in "+(time()-t2)/1e9+" seconds.");
}
# label aggregation (majority voting / average)
yhat = matrix(0, nrow(X), 1);
if( classify ) {
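# classification: majority vote per row, counting the per-tree votes via a
# contingency table and picking the class with the most votes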
parfor(i in 1:nrow(X))
yhat[i,1] = rowIndexMax(t(table(Ytmp[,i],1)));
}
else {
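# regression: average the per-tree predictions for each row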
yhat = t(colSums(Ytmp)/nrow(M));
}
# summary statistics
if( yExists & verbose ) {
if( classify )
print("Accuracy (%): " + (sum(yhat == y) / nrow(y) * 100));
else
lmPredictStats(yhat, y, FALSE);
}
if(verbose) {
print("randomForest: scored batch of "+nrow(X)+" rows with "+nrow(M)+" trees in "+(time()-t1)/1e9+" seconds.");
}
}