#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# 10K dataset w/ k-fold cross-validation
#
# Usage: hadoop jar SystemDS.jar -f CV_MultiClassSVM.sasha.dml \
#          -args $INPUT_DIR/X $INPUT_DIR/y num_folds intercept num_classes max_num_iterations tolerance regularization
#
#   $1 = X (feature matrix), $2 = y (label vector), $3 = num_folds, $4 = intercept (0/1),
#   $5 = num_classes, $6 = max_num_iterations, $7 = tolerance, $8 = regularization
#
# EXAMPLE: hadoop jar SystemDS.jar -f CV_MultiClassSVM.sasha.dml -args itau/svm/X_10k_10 itau/svm/y_10k 4 0 5 100 0.001 1.0
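# note: y must contain integer class labels in {1, ..., num_classes}; the
# one-against-the-rest training below selects each class via (Y == class)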
# synthetic inputs for testing (disabled):
#X = Rand(rows=1000000, cols=100);
#y = Rand(rows=1000000, cols=1);
X = read( $1 );
y = read( $2 );
m = nrow( X );
n = ncol( X );
k = $3; #number of folds
#parameters for model training
is_intercept = $4;
number_of_classes = $5;
max_num_iterations = $6;
tolerance = $7;
regularization_coeff = $8;
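#assign each row a random fold id in 1..k: round(0.5 + P*k) = floor(P*k) + 1
#maps uniform P in [0,1) to fold ids 1..k, so the k folds are approximately
#(not exactly) m/k rows each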
P = Rand(rows=m, cols=1, min=0.0, max=1.0, pdf = "uniform");
P = round(0.5+P*k);
ones = matrix(1, rows=1, cols=n);
stats = matrix(0, rows=k, cols=40); #k-folds x 40-stats
parfor( i in 1:k )
{
#prepare train/test fold projections
vPxi = (P == i);
mPxi = (vPxi %*% ones);
#nvPxi = (P != i);
#nmPxi = (nvPxi %*% ones); #note: inefficient for sparse data
#create train/test folds
Xi = X * mPxi; # Create the TEST set with 1/k of all the rows
yi = y * vPxi; # Create the labels for the TEST set
nXi = X - Xi; # Create the TRAINING set with (k-1)/k of the rows
nyi = y - yi; # Create the labels for the TRAINING set
Xyi = cbind(Xi,yi); #keep alignment on removeEmpty
Xyi = removeEmpty( target=Xyi, margin="rows" );
Xi = Xyi[ , 1:n];
yi = Xyi[ , n+1];
nXyi = cbind(nXi,nyi); #keep alignment on removeEmpty
nXyi = removeEmpty( target=nXyi, margin="rows" );
nXi = nXyi[ , 1:n];
nyi = nXyi[ , n+1];
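#at this point Xi/yi hold only the rows of fold i, and nXi/nyi the remaining rows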
#guard against an empty/all-zero test fold
check_Xi = sum(Xi)
if(check_Xi == 0) {
print("Fold " + i + ": Xi has no non-zeros, skipping scoring")
} else {
#train multiclass SVM model per fold, use the TRAINING set
wi = multiClassSVM (nXi, nyi, is_intercept, number_of_classes, max_num_iterations, tolerance, regularization_coeff);
#score multiclass SVM model per fold, use both the TRAINING and the TEST sets
score = scoreMultiClassSVM (nXi, nyi, wi, Xi, yi);
stats [i, ] = score;
}
}
# print statistics aggregated over all folds
z = printFoldStatistics( stats );
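# the per-fold statistics could also be persisted for later analysis, e.g. via a
# hypothetical ninth argument (not among this script's documented arguments):
#   write(stats, $9, format="text");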
################################################################################
multiClassSVM = function (Matrix[double] X, Matrix[double] Y, Integer intercept, Integer num_classes, Integer max_iterations, Double epsilon, Double lambda)
return (Matrix[double] ret)
{
# Implements multiclass C-SVM with squared slack variables, uses one-against-the-rest binary classifiers
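# per one-vs-rest class c, with labels mapped to y in {-1,+1}, this minimizes
#   J(w) = 1/2 * sum( max(0, 1 - y * (X %*% w))^2 ) + lambda/2 * sum(w^2)
# by nonlinear conjugate gradient with an exact Newton line search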
num_samples = nrow(X)
num_features = ncol(X)
if (intercept == 1) {
ones = matrix(1, rows=num_samples, cols=1);
X = cbind(X, ones);
}
iter_class = 1
Y_local = 2 * (Y == iter_class) - 1
w_class = matrix(0, rows=num_features, cols=1)
if (intercept == 1) {
zero_matrix = matrix(0, rows=1, cols=1);
w_class = t(cbind(t(w_class), zero_matrix));
}
g_old = t(X) %*% Y_local #gradient at the all-zeros initial w
s = g_old #initial CG search direction
iter = 0
continue = 1
while(continue == 1) {
# minimizing primal obj along direction s
step_sz = 0
Xd = X %*% s
wd = lambda * sum(w_class * s)
dd = lambda * sum(s * s)
continue1 = 1
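# Newton iterations on the scalar step size along direction s:
# g and h are the first and second derivatives of J(w + step_sz*s) w.r.t. step_sz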
while(continue1 == 1){
tmp_w = w_class + step_sz*s
out = 1 - Y_local * (X %*% tmp_w)
sv = (out > 0)
out = out * sv
g = wd + step_sz*dd - sum(out * Y_local * Xd)
h = dd + sum(Xd * sv * Xd)
step_sz = step_sz - g/h
if (g*g/h < 1e-10){
continue1 = 0
}
}
#update weights
w_class = w_class + step_sz*s
out = 1 - Y_local * (X %*% w_class)
sv = (out > 0)
out = sv * out
obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
g_new = t(X) %*% (out * Y_local) - lambda * w_class
tmp = sum(s * g_old)
train_acc = sum((Y_local*(X%*%w_class)) >= 0)/num_samples*100
print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
continue = 0
}
#non-linear CG direction update (Fletcher-Reeves: be = ||g_new||^2 / ||g_old||^2)
be = sum(g_new * g_new)/sum(g_old * g_old)
s = be * s + g_new
g_old = g_new
iter = iter + 1
}
w = w_class
iter_class = iter_class + 1
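#train the remaining classes 2..num_classes with the same CG procedure,
#appending each per-class weight vector as a new column of w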
while(iter_class <= num_classes){
Y_local = 2 * (Y == iter_class) - 1
#note: X already contains the intercept column here (appended before the
#first-class training above), so w_class must have exactly ncol(X) rows;
#appending another intercept weight, as done for the first class, would
#cause a dimension mismatch in X %*% w
w_class = matrix(0, rows=ncol(X), cols=1)
g_old = t(X) %*% Y_local
s = g_old
iter = 0
continue = 1
while(continue == 1) {
# minimizing primal obj along direction s
step_sz = 0
Xd = X %*% s
wd = lambda * sum(w_class * s)
dd = lambda * sum(s * s)
continue1 = 1
while(continue1 == 1){
tmp_w = w_class + step_sz*s
out = 1 - Y_local * (X %*% tmp_w)
sv = (out > 0)
out = out * sv
g = wd + step_sz*dd - sum(out * Y_local * Xd)
h = dd + sum(Xd * sv * Xd)
step_sz = step_sz - g/h
if (g*g/h < 1e-10){
continue1 = 0
}
}
#update weights
w_class = w_class + step_sz*s
out = 1 - Y_local * (X %*% w_class)
sv = (out > 0)
out = sv * out
obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class)
g_new = t(X) %*% (out * Y_local) - lambda * w_class
tmp = sum(s * g_old)
train_acc = sum(Y_local*(X%*%w_class) >= 0)/num_samples*100
print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc)
if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){
continue = 0
}
#non-linear CG direction update (Fletcher-Reeves)
be = sum(g_new * g_new)/sum(g_old * g_old)
s = be * s + g_new
g_old = g_new
iter = iter + 1
}
w = cbind(w, w_class)
iter_class = iter_class + 1
}
ret = w; #return
}
################################################################################
scoreMultiClassSVM = function (Matrix[double] X_train, Matrix[double] y_train, Matrix[double] W_train,
Matrix[double] X_test, Matrix[double] y_test)
return (Matrix[double] stats)
# X_train = training-set feature matrix
# y_train = training-set class label vector
# W_train = training-set SVM weights matrix (not a vector: one column per class); may have intercept weights as the last row
# X_test  = test-set feature matrix
# y_test  = test-set class label vector
{
one = matrix(1, rows=1, cols=1); #1x1 helper for casting scalar statistics into matrix cells
stats = matrix(0, rows=1, cols=40);
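# stats layout (1 x 40): [1,11]=train correct, [1,12]=train incorrect, [1,15]=train precision %;
# [1,31], [1,32], [1,35] hold the same measures for the test set; all other columns stay zero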
num_train_records = nrow (X_train);
num_test_records = nrow (X_test);
num_features = ncol (X_test);
num_classes = ncol (W_train);
b = matrix(0, rows=1, cols=num_classes);
if (nrow (W_train) > num_features) {
b = W_train [num_features + 1, ];
}
W = W_train [1:num_features, ];
ones_train = matrix(1, rows=num_train_records, cols=1);
ones_test = matrix(1, rows=num_test_records, cols=1);
scores_train = X_train %*% W + ones_train %*% b;
scores_test = X_test %*% W + ones_test %*% b;
y_train_pred = rowIndexMax (scores_train);
y_test_pred = rowIndexMax (scores_test);
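# rowIndexMax returns, per row, the column index of the largest score,
# which is the predicted class label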
correct_train= (y_train_pred == y_train);
correct_test = (y_test_pred == y_test);
# TRAINING DATA - COMPARE WITH ACTUAL LABELS:
# Count the correct and incorrect predictions
num_TP_train = sum (correct_train);
num_FP_train = sum (1 - correct_train);
stats [1, 11] = one*num_TP_train;
stats [1, 12] = one*num_FP_train;
# Compute the overall accuracy (reported as precision %)
precision_train = 100.0 * num_TP_train / (num_TP_train + num_FP_train);
stats [1, 15] = one*precision_train;
# TEST DATA - COMPARE WITH ACTUAL LABELS:
# Count the correct and incorrect predictions
num_TP_test = sum (correct_test);
num_FP_test = sum (1 - correct_test);
stats [1, 31] = one*num_TP_test;
stats [1, 32] = one*num_FP_test;
# Compute the overall accuracy (reported as precision %)
precision_test = 100.0 * num_TP_test / (num_TP_test + num_FP_test);
stats [1, 35] = one*precision_test;
}
printFoldStatistics = function (Matrix[double] stats ) return( Integer err )
{
stats_avg = round (colMeans(stats) * 10000.0) / 10000.0;
stats_max = round (colMaxs (stats) * 10000.0) / 10000.0;
stats_min = round (colMins (stats) * 10000.0) / 10000.0;
print ("Training Data, Label comparison statistics:");
print (" True Matches: Min = " + as.scalar (stats_min [1, 11]) + ", Avg = " + as.scalar (stats_avg [1, 11]) + ", Max = " + as.scalar (stats_max [1, 11]));
print (" False Matches: Min = " + as.scalar (stats_min [1, 12]) + ", Avg = " + as.scalar (stats_avg [1, 12]) + ", Max = " + as.scalar (stats_max [1, 12]));
print (" Precision %: Min = " + as.scalar (stats_min [1, 15]) + ", Avg = " + as.scalar (stats_avg [1, 15]) + ", Max = " + as.scalar (stats_max [1, 15]));
print (" ");
print(" ")
while(FALSE){}
print ("TEST Data, Label comparison statistics:");
print (" True Matches: Min = " + as.scalar (stats_min [1, 31]) + ", Avg = " + as.scalar (stats_avg [1, 31]) + ", Max = " + as.scalar (stats_max [1, 31]));
print (" False Matches: Min = " + as.scalar (stats_min [1, 32]) + ", Avg = " + as.scalar (stats_avg [1, 32]) + ", Max = " + as.scalar (stats_max [1, 32]));
print (" Precision %: Min = " + as.scalar (stats_min [1, 35]) + ", Avg = " + as.scalar (stats_avg [1, 35]) + ", Max = " + as.scalar (stats_max [1, 35]));
err = 0;
}