blob: 8ec64d558489ef6b4aa076edcbe4852bf19d549f [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
from slicing.base.node import Node
from slicing.base.top_k import Topk
from slicing.base.slicer import opt_fun, union
import matplotlib.pyplot as plt
def check_attributes(left_node, right_node):
attr1 = list(map(lambda x: x[0].split("_")[0], left_node.attributes))
attr2 = list(map(lambda x: x[0].split("_")[0], right_node.attributes))
if set(attr1).intersection(set(attr2)):
return False
return True
def make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha, top_k):
all_nodes = {}
counter = 0
first_level = []
for feature in all_features:
new_node = Node(complete_x, loss, x_size, y_test, errors)
new_node.parents = [(feature, counter)]
new_node.attributes.append((feature, counter))
new_node.name = new_node.make_name()
new_id = len(all_nodes)
new_node.key = new_node.make_key(new_id)
all_nodes[new_node.key] = new_node
new_node.process_slice(loss_type)
# for first level nodes all bounds are strict as concrete metrics
new_node.s_upper = new_node.size
new_node.s_lower = 0
new_node.e_upper = new_node.loss
new_node.e_max_upper = new_node.e_max
new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
new_node.c_upper = new_node.score
first_level.append(new_node)
new_node.print_debug(top_k, 0)
# constraints for 1st level nodes to be problematic candidates
if new_node.score > 1 and new_node.size >= x_size / alpha:
# this method updates top k slices if needed
top_k.add_new_top_slice(new_node)
counter = counter + 1
return first_level, all_nodes
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
top_k = Topk(k)
# First level slices are enumerated in a "classic way" (getting data and not analyzing bounds
levels = []
first_level = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, w, alpha, top_k)
# double appending of first level nodes in order to enumerating second level in the same way as others
levels.append((first_level[0], len(all_features)))
candidates = []
pruned = []
indexes = []
indexes.append(1)
candidates.append(len(first_level[0]))
pruned.append(len(first_level[0]))
all_nodes = first_level[1]
# cur_lvl - index of current level, correlates with number of slice forming features
cur_lvl = 1 # level that is planned to be filled later
cur_lvl_nodes = first_level
# currently for debug
print("Level 1 had " + str(len(all_features)) + " candidates")
print()
print("Current topk are: ")
top_k.print_topk()
# DPSize algorithm approach of previous levels nodes combinations and updating bounds for those that already exist
while len(cur_lvl_nodes) > 0:
cur_lvl_nodes = []
count = 0
prev_lvl = levels[cur_lvl - 1]
for left in range(int(cur_lvl / 2) + 1):
right = cur_lvl - 1 - left
for node_i in range(len(levels[left][0])):
for node_j in range(len(levels[right][0])):
flag = check_attributes(levels[left][0][node_i], levels[right][0][node_j])
required_number = len(union(levels[left][0][node_i].attributes, levels[right][0][node_j].attributes))
if flag and required_number == cur_lvl + 1:
new_node = Node(complete_x, loss, x_size, y_test, errors)
parents_set = set(new_node.parents)
parents_set.add(levels[left][0][node_i])
parents_set.add(levels[right][0][node_j])
new_node.parents = list(parents_set)
parent1_attr = levels[left][0][node_i].attributes
parent2_attr = levels[right][0][node_j].attributes
new_node_attr = union(parent1_attr, parent2_attr)
new_node.attributes = new_node_attr
new_node.name = new_node.make_name()
new_id = len(all_nodes)
new_node.key = new_node.make_key(new_id)
if new_node.key[1] in all_nodes:
existing_item = all_nodes[new_node.key[1]]
parents_set = set(existing_item.parents)
existing_item.parents = parents_set
if b_update:
s_upper = new_node.calc_s_upper(cur_lvl)
s_lower = new_node.calc_s_lower(cur_lvl)
e_upper = new_node.calc_e_upper()
e_max_upper = new_node.calc_e_max_upper(cur_lvl)
new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
else:
new_node.calc_bounds(cur_lvl, w)
all_nodes[new_node.key[1]] = new_node
# check if concrete data should be extracted or not (only for those that have score upper
# big enough and if size of subset is big enough
to_slice = new_node.check_bounds(x_size, alpha, top_k.min_score)
if to_slice:
new_node.process_slice(loss_type)
new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
# we decide to add node to current level nodes (in order to make new combinations
# on the next one or not basing on its score value
if new_node.check_constraint(top_k, x_size, alpha, top_k.min_score) and new_node.key \
not in top_k.keys:
top_k.add_new_top_slice(new_node)
cur_lvl_nodes.append(new_node)
if debug:
new_node.print_debug(top_k, cur_lvl)
count = count + levels[left][1] * levels[right][1]
cur_lvl = cur_lvl + 1
indexes.append(cur_lvl)
print("Level " + str(cur_lvl) + " had " + str(count) +
" candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
levels.append((cur_lvl_nodes, count))
candidates.append(count)
pruned.append(len(cur_lvl_nodes))
top_k.print_topk()
plt.plot(indexes, candidates, 'r--',
indexes, pruned, 'g--')
plt.xlabel('Level')
plt.ylabel('Number of slices')
plt.show()
print("Program stopped at level " + str(cur_lvl))
print()
print("Selected slices are: ")
top_k.print_topk()
print("candidates:")
print(candidates)
print(">>>>>>>>>")
print("pruned:")
print(pruned)
return top_k