# blob: cacb715391dfc6a376301ac8dd1169ab8f45533d
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
import sys
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from slicing.base import slicer as slicer, union_slicer
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
def parse_cli_args(args):
    """Turn the raw command line into the slicing run parameters.

    Expected invocation: ``<script> k w alpha b_update debug loss_type enumerator``.
    When no arguments follow the script name, built-in defaults are used.

    Parameters, as described by the original script comments:
      * ``k``          - number of top slices to return
      * ``w``          - weight of the error term; ``1 - w`` weights slice size
      * ``alpha``      - size significance coefficient
      * ``b_update``   - flag forwarded to the slicer (only the literal ``True`` enables it)
      * ``debug``      - print debug info while slices are created
      * ``loss_type``  - 0 for L2 (regression), 1 for cross entropy (classification)
      * ``enumerator`` - ``join`` or ``union`` (how next-level slices are combined)

    :param args: argument list in ``sys.argv`` layout (index 0 is the script name)
    :return: dict keyed by the parameter names listed above
    :raises ValueError: if some but not all arguments are given, if a numeric
        argument cannot be converted, or if ``enumerator`` is unknown
    """
    if len(args) <= 1:
        params = {
            "k": 10,
            "w": 0.5,
            "alpha": 4,
            "b_update": True,
            "debug": True,
            "loss_type": 1,
            "enumerator": "union",
        }
    else:
        if len(args) < 8:
            # Fail with a readable message instead of an IndexError further down.
            raise ValueError(
                "expected 7 arguments (k w alpha b_update debug loss_type enumerator), "
                "got %d" % (len(args) - 1))
        params = {
            "k": int(args[1]),
            # Accept a decimal comma as well as a decimal point.
            "w": float(args[2].replace(',', '.')),
            "alpha": int(args[3]),
            "b_update": args[4] == "True",
            # Parsed like b_update: a raw string would make "False" truthy.
            "debug": args[5] == "True",
            "loss_type": int(args[6]),
            "enumerator": args[7],
        }
    if params["enumerator"] not in ("join", "union"):
        # Reject early so the model is not trained only to run no slicer at all.
        raise ValueError(
            "enumerator must be 'join' or 'union', got %r" % (params["enumerator"],))
    return params


def bin_numeric_columns(rows):
    """Coarsen the numeric feature columns in place so one-hot encoding stays small.

    Column names below assume the adult.csv column order
    (Age, WorkClass, fnlwgt, Education, EducationNum, ..., CapitalGain,
    CapitalLoss, HoursPerWeek, NativeCountry) - verify against the dataset.

    :param rows: iterable of mutable, index-addressable rows (modified in place)
    """
    for row in rows:
        row[0] = int(row[0] / 10)        # Age -> decades
        row[2] = int(row[2]) // 100000   # fnlwgt -> buckets of 100000
        row[4] = int(row[4] / 5)         # EducationNum -> buckets of 5
        row[10] = int(row[10] / 1000)    # CapitalGain -> buckets of 1000
        row[12] = int(row[12] / 10)      # HoursPerWeek -> buckets of 10


def main(args):
    """Train an SVM classifier on the adult dataset and run the slice finder on its test predictions.

    :param args: argument list in ``sys.argv`` layout, see ``parse_cli_args``
    """
    params = parse_cli_args(args)

    dataset = pd.read_csv('/slicing/datasets/adult.csv')
    # The last column is the label, everything before it is a feature.
    x = dataset.iloc[:, :-1].values
    y = preprocessing.LabelEncoder().fit_transform(dataset.iloc[:, -1])

    bin_numeric_columns(x)
    enc = OneHotEncoder(handle_unknown='ignore')
    x = enc.fit_transform(x).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on releases that predate get_feature_names_out().
    if hasattr(enc, "get_feature_names_out"):
        all_features = enc.get_feature_names_out()
    else:
        all_features = enc.get_feature_names()

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    # The slicer receives (row index, value) pairs for the held-out rows.
    complete_x = list(enumerate(x_test))
    complete_y = list(enumerate(y_test))
    x_size = len(complete_x)

    clf = svm.SVC()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    predictions = list(enumerate(preds))
    # Overall loss passed to the slicer: share of misclassified test rows.
    mistakes = sum(1 for actual, pred in zip(y_test, preds) if actual != pred)
    lossF = mistakes / len(predictions)

    # enumerator <union>/<join> selects how slices of the next level are combined:
    #   <join>  - a new node of the current level is built only from pairs of
    #             nodes of the previous level
    #   <union> - implementation based on the DPSize algorithm
    if params["enumerator"] == "join":
        process = slicer.process
    else:
        process = union_slicer.process
    process(all_features, complete_x, lossF, x_size, complete_y, predictions,
            debug=params["debug"], alpha=params["alpha"], k=params["k"], w=params["w"],
            loss_type=params["loss_type"], b_update=params["b_update"])


if __name__ == "__main__":
    main(sys.argv)