#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
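# Trains a random-forest classifier on the COMPAS test dataset and runs
# slice finding on its prediction errors, using either the join-based or
# the union-based (DPSize) enumerator from slicing.base.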
import sys

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from slicing.base import slicer, union_slicer

if __name__ == "__main__":
    args = sys.argv
    if len(args) > 1:
        k = int(args[1])
        w = float(args[2].replace(',', '.'))
        alpha = int(args[3])
        b_update = args[4] == "True"
        debug = args[5] == "True"
        loss_type = int(args[6])
        enumerator = args[7]
    else:
        k = 10
        w = 0.5
        alpha = 4
        b_update = True
        debug = True
        loss_type = 0
        enumerator = "union"
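    # Example invocation with the positional arguments in the order parsed
    # above (the script path is illustrative):
    #   python compas_test.py 10 0.5 4 True True 0 union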
    file_name = 'slicing/datasets/real/compas/compas-test.csv'
    dataset = pd.read_csv(file_name)
    attributes_amount = len(dataset.values[0])
    # the last column is the label
    y = dataset.iloc[:, attributes_amount - 1:attributes_amount].values
    # all preceding columns are the features
    x = dataset.iloc[:, 0:attributes_amount - 1].values
    # one-hot encoding of categorical features
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x).toarray()
    complete_x = []
    complete_y = []
    counter = 0
    all_features = enc.get_feature_names()
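    # with array input, get_feature_names() returns names of the form
    # "x<column>_<category>"; newer scikit-learn versions use
    # get_feature_names_out() instead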
    # split off a held-out test set, then train a model on the train split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
    # index the test examples so slices can refer to them by position
    for item in x_test:
        complete_x.append((counter, item))
        complete_y.append((counter, y_test[counter]))
        counter = counter + 1
    x_size = counter
    clf = RandomForestClassifier(n_jobs=2, random_state=0)
    # ravel() flattens the (n, 1) label column into the 1-D array sklearn expects
    clf.fit(x_train, y_train.ravel())
    preds = clf.predict(x_test)
    predictions = []
    counter = 0
    mistakes = 0
    for pred in preds:
        predictions.append((counter, pred))
        if y_test[counter] != pred:
            mistakes = mistakes + 1
        counter = counter + 1
    lossF = mistakes / counter
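    # lossF is the overall misclassification rate on the test set; it is
    # passed to the slicers below as the model loss used in slice scoring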
    # alpha is the size-significance coefficient
    # debug enables printing of debug info while slices are created
    # k is the number of top slices to return
    # w is the weight of the error term in the optimization function;
    # (1 - w) is the weight of the size term
    # enumerator <union>/<join> selects how next-level slices are combined:
    #   <join>: a new node of the current level is created by combining
    #           nodes of the previous level with each other
    #   <union>: implementation based on the DPSize algorithm
    if enumerator == "join":
        slicer.process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug, alpha=alpha,
                       k=k, w=w, loss_type=loss_type, b_update=b_update)
    elif enumerator == "union":
        union_slicer.process(all_features, complete_x, lossF, x_size, complete_y, predictions, debug=debug,
                             alpha=alpha, k=k, w=w, loss_type=loss_type, b_update=b_update)