scripts/algorithms/obsolete/naive-bayes-parfor.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Implements multinomial naive Bayes classifier with Laplace correction
 #
 # Example Usage:
 # hadoop jar SystemML.jar -f naive-bayes-parfor.dml -nvargs X=<Data> Y=<labels> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
 #

 # Defaults for the optional command-line arguments:
 #   laplace - additive (Laplace) smoothing constant, 1 when not supplied
 #   fmt     - format used when writing the two model files, "text" when not supplied
 cmdLine_laplace = ifdef($laplace, 1)
 cmdLine_fmt = ifdef($fmt, "text")

 # Read the feature matrix X. The multinomial model is meant for count-based
 # features, so any negative entry is rejected.
 D = read($X)
 min_feature_val = min(D)
 if(min_feature_val < 0) {
 	stop("Stopping due to invalid argument: Multinomial naive Bayes is meant for count-based feature values, minimum value in X is negative")
 }
 numRows = nrow(D)
 if(numRows < 2) {
 	stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
 }

 # Read the label vector Y.
 C = read($Y)

 # Y must supply exactly one label per row of X, in a single column. These
 # shape checks come first so that a size mismatch is reported as such: the
 # integrality test further down compares a count taken over Y against
 # nrow(X) and would otherwise blame "non-integral labels" for a wrong length.
 if(nrow(C) != numRows) {
 	stop("Stopping due to invalid argument: Label vector (Y) must have the same number of rows as X")
 }
 if(ncol(C) != 1) {
 	stop("Stopping due to invalid argument: Label vector (Y) must have exactly one column")
 }

 # Labels have to be recoded to start at 1 ...
 if(min(C) < 1) {
 	stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
 }
 # ... the largest label is taken as the number of classes ...
 numClasses = max(C)
 if(numClasses == 1) {
 	stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")
 }
 # ... and every label must be a whole number (remainder 0 when divided by 1).
 mod1 = C %% 1
 numIntegralLabels = sum(mod1 == 0)
 if(numIntegralLabels != numRows) {
 	stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
 }

 # The smoothing constant may be zero but never negative.
 laplace_correction = cmdLine_laplace
 if(laplace_correction < 0) {
 	stop("Stopping due to invalid argument: Laplacian correction (laplace) must be non-negative")
 }

 numFeatures = ncol(D)

 # Estimate the model: class-conditional feature distributions and class priors

 # Per-class feature totals, filled one feature column per parfor iteration:
 # entry (k, j) is the sum of feature j over all rows labelled k.
 numGroups = as.integer(numClasses)
 classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
 parfor (j in 1:numFeatures) {
   classFeatureCounts[,j] = aggregate(target=D[,j], groups=C, fn="sum", ngroups=numGroups)
 }

 # Denominator of the smoothed estimate: the total feature count of each
 # class plus laplace_correction once for each of the numFeatures features
 smoothingMass = numFeatures*laplace_correction
 classSums = rowSums(classFeatureCounts) + smoothingMass

 # Smoothed class-conditional probabilities. classSums is a column vector that
 # divides every column of the numerator; this replaces the explicit
 # replication classSums %*% matrix(1, rows=1, cols=numFeatures).
 smoothedCounts = classFeatureCounts + laplace_correction
 class_conditionals = smoothedCounts / classSums

 # Class priors: number of training rows per label divided by the row count
 rowsPerClass = aggregate(target=C, groups=C, fn="count", ngroups=numGroups)
 class_prior = rowsPerClass / numRows

 # Training-set accuracy.
 # A column of ones is appended to the data and the prior column is appended
 # to the conditionals, so one matrix product gives, for row i and class k,
 #   sum_j D[i,j] * log(class_conditionals[k,j]) + log(class_prior[k])
 biasCol = matrix(1, rows=numRows, cols=1)
 augmentedData = append(D, biasCol)
 logModel = log(append(class_conditionals, class_prior))
 classScores = augmentedData %*% t(logModel)

 # The predicted label is the column index of the largest score in each row
 predicted = rowIndexMax(classScores)
 numCorrect = sum(predicted == C)
 accuracyPct = numCorrect / numRows * 100

 # Report the accuracy on stdout and store the same text in the accuracy file
 accuracyMsg = "Training Accuracy (%): " + accuracyPct
 print(accuracyMsg)
 write(accuracyMsg, $accuracy)

 # Store the number of features as one extra trailing entry of the prior vector
 featureCountCell = matrix(numFeatures, rows=1, cols=1)
 priorAsRow = append(t(class_prior), featureCountCell)
 class_prior = t(priorAsRow)

 # Persist both parts of the model in the requested format
 write(class_prior, $prior, format=cmdLine_fmt)
 write(class_conditionals, $conditionals, format=cmdLine_fmt)
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Implements multinomial naive Bayes classifier with Laplace correction
	#
	# Example Usage:
	# hadoop jar SystemML.jar -f naive-bayes-parfor.dml -nvargs X=<Data> Y=<labels> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text"
	#

	# Defaults for the optional command-line arguments:
	#   laplace - additive (Laplace) smoothing constant, 1 when not supplied
	#   fmt     - format used when writing the two model files, "text" when not supplied
	cmdLine_laplace = ifdef($laplace, 1)
	cmdLine_fmt = ifdef($fmt, "text")

	# Read the feature matrix X. The multinomial model is meant for count-based
	# features, so any negative entry is rejected.
	D = read($X)
	min_feature_val = min(D)
	if(min_feature_val < 0) {
		stop("Stopping due to invalid argument: Multinomial naive Bayes is meant for count-based feature values, minimum value in X is negative")
	}
	numRows = nrow(D)
	if(numRows < 2) {
		stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows")
	}

	# Read the label vector Y.
	C = read($Y)

	# Y must supply exactly one label per row of X, in a single column. These
	# shape checks come first so that a size mismatch is reported as such: the
	# integrality test further down compares a count taken over Y against
	# nrow(X) and would otherwise blame "non-integral labels" for a wrong length.
	if(nrow(C) != numRows) {
		stop("Stopping due to invalid argument: Label vector (Y) must have the same number of rows as X")
	}
	if(ncol(C) != 1) {
		stop("Stopping due to invalid argument: Label vector (Y) must have exactly one column")
	}

	# Labels have to be recoded to start at 1 ...
	if(min(C) < 1) {
		stop("Stopping due to invalid argument: Label vector (Y) must be recoded")
	}
	# ... the largest label is taken as the number of classes ...
	numClasses = max(C)
	if(numClasses == 1) {
		stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier")
	}
	# ... and every label must be a whole number (remainder 0 when divided by 1).
	mod1 = C %% 1
	numIntegralLabels = sum(mod1 == 0)
	if(numIntegralLabels != numRows) {
		stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels")
	}

	# The smoothing constant may be zero but never negative.
	laplace_correction = cmdLine_laplace
	if(laplace_correction < 0) {
		stop("Stopping due to invalid argument: Laplacian correction (laplace) must be non-negative")
	}

	numFeatures = ncol(D)

	# Estimate the model: class-conditional feature distributions and class priors

	# Per-class feature totals, filled one feature column per parfor iteration:
	# entry (k, j) is the sum of feature j over all rows labelled k.
	numGroups = as.integer(numClasses)
	classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures)
	parfor (j in 1:numFeatures) {
		classFeatureCounts[,j] = aggregate(target=D[,j], groups=C, fn="sum", ngroups=numGroups)
	}

	# Denominator of the smoothed estimate: the total feature count of each
	# class plus laplace_correction once for each of the numFeatures features
	smoothingMass = numFeatures*laplace_correction
	classSums = rowSums(classFeatureCounts) + smoothingMass

	# Smoothed class-conditional probabilities. classSums is a column vector that
	# divides every column of the numerator; this replaces the explicit
	# replication classSums %*% matrix(1, rows=1, cols=numFeatures).
	smoothedCounts = classFeatureCounts + laplace_correction
	class_conditionals = smoothedCounts / classSums

	# Class priors: number of training rows per label divided by the row count
	rowsPerClass = aggregate(target=C, groups=C, fn="count", ngroups=numGroups)
	class_prior = rowsPerClass / numRows

	# Training-set accuracy.
	# A column of ones is appended to the data and the prior column is appended
	# to the conditionals, so one matrix product gives, for row i and class k,
	#   sum_j D[i,j] * log(class_conditionals[k,j]) + log(class_prior[k])
	biasCol = matrix(1, rows=numRows, cols=1)
	augmentedData = append(D, biasCol)
	logModel = log(append(class_conditionals, class_prior))
	classScores = augmentedData %*% t(logModel)

	# The predicted label is the column index of the largest score in each row
	predicted = rowIndexMax(classScores)
	numCorrect = sum(predicted == C)
	accuracyPct = numCorrect / numRows * 100

	# Report the accuracy on stdout and store the same text in the accuracy file
	accuracyMsg = "Training Accuracy (%): " + accuracyPct
	print(accuracyMsg)
	write(accuracyMsg, $accuracy)

	# Store the number of features as one extra trailing entry of the prior vector
	featureCountCell = matrix(numFeatures, rows=1, cols=1)
	priorAsRow = append(t(class_prior), featureCountCell)
	class_prior = t(priorAsRow)

	# Persist both parts of the model in the requested format
	write(class_prior, $prior, format=cmdLine_fmt)
	write(class_conditionals, $conditionals, format=cmdLine_fmt)