#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
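# Trains a random-forest classifier on the COMPAS test dataset and runs
# slice finding on its prediction errors, using either the join-based or
# the union-based (DPSize) enumerator from slicing.base.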
import sys

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from slicing.base import slicer, union_slicer

if __name__ == "__main__":
    args = sys.argv
    if len(args) > 1:
        k = int(args[1])
        w = float(args[2].replace(',', '.'))
        alpha = int(args[3])
        b_update = args[4] == "True"
        debug = args[5] == "True"
        loss_type = int(args[6])
        enumerator = args[7]
    else:
        k = 10
        w = 0.5
        alpha = 4
        b_update = True
        debug = True
        loss_type = 0
        enumerator = "union"
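    # Example invocation with the positional arguments in the order parsed
    # above (the script path is illustrative):
    #   python compas_test.py 10 0.5 4 True True 0 union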
    file_name = 'slicing/datasets/real/compas/compas-test.csv'
    dataset = pd.read_csv(file_name)
    attributes_amount = len(dataset.values[0])
    # the last column is the label
    y = dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    # all preceding columns are the features
    x = dataset.iloc[:, 0:attributes_amount - 1].values
    # one-hot encoding of categorical features
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x).toarray()
    complete_x = []
    complete_y = []
    counter = 0
    all_features = enc.get_feature_names()
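    # with array input, get_feature_names() returns names of the form
    # "x<column>_<category>"; newer scikit-learn versions use
    # get_feature_names_out() instead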
    # split off a held-out test set, then train a model on the train split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    # index the test examples so slices can refer to them by position
    for item in x_test:
        complete_x.append((counter, item))
        complete_y.append((counter, y_test[counter]))
        counter = counter + 1
    x_size = counter
    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    # ravel() flattens the (n, 1) label column into the 1-D array sklearn expects
    clf.fit(x_train, y_train.ravel())
    preds = clf.predict(x_test)
    predictions = []
    counter = 0
    mistakes = 0
    for pred in preds:
        predictions.append((counter, pred))
        if y_test[counter] != pred:
            mistakes = mistakes + 1
        counter = counter + 1
    lossF = mistakes / counter
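    # lossF is the overall misclassification rate on the test set; it is
    # passed to the slicers below as the model loss used in slice scoring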
    # alpha is the size-significance coefficient
    # debug enables printing of debug info while slices are created
    # k is the number of top slices to return
    # w is the weight of the error term in the optimization function;
    # (1 - w) is the weight of the size term
    # enumerator <union>/<join> selects how next-level slices are combined:
    #   <join>: a new node of the current level is created by combining
    #           nodes of the previous level with each other
    #   <union>: implementation based on the DPSize algorithm
    if enumerator == "join":
        slicer.process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug, alpha=alpha,
                       k=k, w=w, loss_type=loss_type, b_update=b_update)
    elif enumerator == "union":
        union_slicer.process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug,
                             alpha=alpha, k=k, w=w, loss_type=loss_type, b_update=b_update)