scripts/staging/slicing/tests/classification/test_adult.py - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 import sys

 import pandas as pd
 from sklearn.preprocessing import OneHotEncoder
 from sklearn import svm

 from slicing.base import slicer as slicer, union_slicer
 from sklearn.ensemble import RandomForestClassifier
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split


 def parse_args(argv):
     """Build the run configuration from the command-line vector.

     With no arguments beyond the program name the built-in defaults are
     used; otherwise seven positional values are expected, in this order:
     k, w, alpha, b_update, debug, loss_type, enumerator.

     Args:
         argv: The full argument list, program name first (``sys.argv``).

     Returns:
         Tuple ``(k, w, alpha, b_update, debug, loss_type, enumerator)``.

     Raises:
         SystemExit: If arguments are given but fewer than seven, or if
             enumerator is neither "join" nor "union".
     """
     if len(argv) <= 1:
         return 10, 0.5, 4, True, True, 1, "union"
     if len(argv) < 8:
         sys.exit("usage: test_adult.py <k> <w> <alpha> <b_update> <debug> <loss_type> <enumerator>")
     k = int(argv[1])
     # Accept a decimal comma as well as a decimal point.
     w = float(argv[2].replace(',', '.'))
     alpha = int(argv[3])
     # Flags arrive as text; only the literal "True" switches them on.
     # (Keeping the raw string would make "False" truthy.)
     b_update = argv[4] == "True"
     debug = argv[5] == "True"
     loss_type = int(argv[6])
     enumerator = argv[7]
     # Reject an unknown enumerator up front instead of training the model
     # and then silently skipping the slicing step.
     if enumerator not in ("join", "union"):
         sys.exit("enumerator must be 'join' or 'union', got: " + enumerator)
     return k, w, alpha, b_update, debug, loss_type, enumerator


 if __name__ == "__main__":
     k, w, alpha, b_update, debug, loss_type, enumerator = parse_args(sys.argv)

     dataset = pd.read_csv('/slicing/datasets/adult.csv')
     # The last column is the label, everything before it is a feature.
     x = dataset.iloc[:, :-1].values
     y = preprocessing.LabelEncoder().fit_transform(dataset.iloc[:, -1])

     # Coarsen the numeric columns into buckets before one-hot encoding.
     # NOTE(review): indexes 0, 2, 4, 10, 12 presumably are Age, fnlwgt,
     # EducationNum, CapitalGain and HoursPerWeek - verify against the CSV header.
     for row in x:
         row[0] = int(row[0] / 10)
         row[2] = int(row[2]) // 100000
         row[4] = int(row[4] / 5)
         row[10] = int(row[10] / 1000)
         row[12] = int(row[12] / 10)
     enc = OneHotEncoder(handle_unknown='ignore')
     x = enc.fit_transform(x).toarray()
     # get_feature_names() was removed in scikit-learn 1.2; prefer its
     # replacement and fall back only on versions that predate it.
     if hasattr(enc, "get_feature_names_out"):
         all_features = enc.get_feature_names_out()
     else:
         all_features = enc.get_feature_names()

     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
     # The slicer receives rows and labels as (position in test set, value) pairs.
     complete_x = list(enumerate(x_test))
     complete_y = list(enumerate(y_test))
     x_size = len(complete_x)
     clf = svm.SVC()
     clf.fit(x_train, y_train)

     # alpha is size significance coefficient
     # debug option is for returning debug info while creating slices and printing it
     # k is number of top-slices we want
     # w is a weight of error function significance (1 - w) is a size significance propagated into optimization function
     # loss_type = 0 (l2 in case of regression model)
     # loss_type = 1 (cross entropy in case of classification model)
     preds = clf.predict(x_test)
     predictions = list(enumerate(preds))
     # Overall loss is the share of misclassified test rows.
     mistakes = int((preds != y_test).sum())
     lossF = mistakes / x_size

     # enumerator <union>/<join> indicates an approach of next level slices combination process:
     # in case of <join> in order to create new node of current level slicer
     # combines only nodes of previous layer with each other
     # <union> case implementation is based on DPSize algorithm
     process = slicer.process if enumerator == "join" else union_slicer.process
     process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug, alpha=alpha, k=k,
             w=w, loss_type=loss_type, b_update=b_update)
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	import sys

	import pandas as pd
	from sklearn.preprocessing import OneHotEncoder
	from sklearn import svm

	from slicing.base import slicer as slicer, union_slicer
	from sklearn.ensemble import RandomForestClassifier
	from sklearn import preprocessing
	from sklearn.model_selection import train_test_split


	def parse_args(argv):
	    """Build the run configuration from the command-line vector.

	    With no arguments beyond the program name the built-in defaults are
	    used; otherwise seven positional values are expected, in this order:
	    k, w, alpha, b_update, debug, loss_type, enumerator.

	    Args:
	        argv: The full argument list, program name first (``sys.argv``).

	    Returns:
	        Tuple ``(k, w, alpha, b_update, debug, loss_type, enumerator)``.

	    Raises:
	        SystemExit: If arguments are given but fewer than seven, or if
	            enumerator is neither "join" nor "union".
	    """
	    if len(argv) <= 1:
	        return 10, 0.5, 4, True, True, 1, "union"
	    if len(argv) < 8:
	        sys.exit("usage: test_adult.py <k> <w> <alpha> <b_update> <debug> <loss_type> <enumerator>")
	    k = int(argv[1])
	    # Accept a decimal comma as well as a decimal point.
	    w = float(argv[2].replace(',', '.'))
	    alpha = int(argv[3])
	    # Flags arrive as text; only the literal "True" switches them on.
	    # (Keeping the raw string would make "False" truthy.)
	    b_update = argv[4] == "True"
	    debug = argv[5] == "True"
	    loss_type = int(argv[6])
	    enumerator = argv[7]
	    # Reject an unknown enumerator up front instead of training the model
	    # and then silently skipping the slicing step.
	    if enumerator not in ("join", "union"):
	        sys.exit("enumerator must be 'join' or 'union', got: " + enumerator)
	    return k, w, alpha, b_update, debug, loss_type, enumerator


	if __name__ == "__main__":
	    k, w, alpha, b_update, debug, loss_type, enumerator = parse_args(sys.argv)

	    dataset = pd.read_csv('/slicing/datasets/adult.csv')
	    # The last column is the label, everything before it is a feature.
	    x = dataset.iloc[:, :-1].values
	    y = preprocessing.LabelEncoder().fit_transform(dataset.iloc[:, -1])

	    # Coarsen the numeric columns into buckets before one-hot encoding.
	    # NOTE(review): indexes 0, 2, 4, 10, 12 presumably are Age, fnlwgt,
	    # EducationNum, CapitalGain and HoursPerWeek - verify against the CSV header.
	    for row in x:
	        row[0] = int(row[0] / 10)
	        row[2] = int(row[2]) // 100000
	        row[4] = int(row[4] / 5)
	        row[10] = int(row[10] / 1000)
	        row[12] = int(row[12] / 10)
	    enc = OneHotEncoder(handle_unknown='ignore')
	    x = enc.fit_transform(x).toarray()
	    # get_feature_names() was removed in scikit-learn 1.2; prefer its
	    # replacement and fall back only on versions that predate it.
	    if hasattr(enc, "get_feature_names_out"):
	        all_features = enc.get_feature_names_out()
	    else:
	        all_features = enc.get_feature_names()

	    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
	    # The slicer receives rows and labels as (position in test set, value) pairs.
	    complete_x = list(enumerate(x_test))
	    complete_y = list(enumerate(y_test))
	    x_size = len(complete_x)
	    clf = svm.SVC()
	    clf.fit(x_train, y_train)

	    # alpha is size significance coefficient
	    # debug option is for returning debug info while creating slices and printing it
	    # k is number of top-slices we want
	    # w is a weight of error function significance (1 - w) is a size significance propagated into optimization function
	    # loss_type = 0 (l2 in case of regression model)
	    # loss_type = 1 (cross entropy in case of classification model)
	    preds = clf.predict(x_test)
	    predictions = list(enumerate(preds))
	    # Overall loss is the share of misclassified test rows.
	    mistakes = int((preds != y_test).sum())
	    lossF = mistakes / x_size

	    # enumerator <union>/<join> indicates an approach of next level slices combination process:
	    # in case of <join> in order to create new node of current level slicer
	    # combines only nodes of previous layer with each other
	    # <union> case implementation is based on DPSize algorithm
	    process = slicer.process if enumerator == "join" else union_slicer.process
	    process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug, alpha=alpha, k=k,
	            w=w, loss_type=loss_type, b_update=b_update)