#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

import matplotlib.pyplot as plt

from slicing.base.node import Node
from slicing.base.top_k import Topk


# Optimization (scoring) function:
#   fi - error of the subset, si - its size
#   f  - error of the complete set, x_size - its size
#   w  - "significance" weight of the error term; (1 - w) weights the size term
def opt_fun(fi, si, f, x_size, w):
    formula = w * (fi / f) + (1 - w) * (si / x_size)
    return float(formula)
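
# Worked example for opt_fun (hypothetical numbers): with w = 0.5, a subset with
# error fi = 2.0 and size si = 10, taken from a complete set with f = 4.0 and
# x_size = 100, scores 0.5 * (2.0/4.0) + 0.5 * (10/100) = 0.25 + 0.05 = 0.3.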


# slice_name_nonsense decides whether a combination of two nodes on the current
# level is valid or impossible (despite the name, it returns True for valid
# pairs): the number of attributes the nodes share must relate to the current
# level as commons == cur_lvl - 1.
# Valid combination: node ABC + node BCD (on the 4th level) - the three-attribute
# nodes share two attributes.
# Invalid combination: node ABC + node CDE (on the 4th level) - the resulting
# node ABCDE is absurd for the 4th level.
def slice_name_nonsense(node_i, node_j, cur_lvl):
    attr1 = list(map(lambda x: x[0].split("_")[0], node_i.attributes))
    key1 = node_i.key[0]
    attr2 = list(map(lambda x: x[0].split("_")[0], node_j.attributes))
    key2 = node_j.key[0]
    commons = len(set(attr1) & set(attr2))
    # key1 < key2 ensures each unordered pair of parents is enumerated only once
    return commons == cur_lvl - 1 and key1 < key2
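
# For example, with cur_lvl = 3, parents with attribute prefixes {a, b, c} and
# {b, c, d} share two prefixes (commons == 2 == cur_lvl - 1), so they may be
# joined into the four-attribute node {a, b, c, d} on the next level.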


def union(lst1, lst2):
    final_list = sorted(set(list(lst1) + list(lst2)))
    return final_list
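
# For example, union([("a_1", 0)], [("b_2", 1), ("a_1", 0)]) returns the
# deduplicated, sorted list [("a_1", 0), ("b_2", 1)].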


def make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type, top_k, alpha, w):
    first_level = {}
    counter = 0
    all_nodes = {}
    # First-level slices are enumerated the "classic" way (materializing the data
    # instead of analyzing bounds)
    for feature in all_features:
        new_node = Node(complete_x, loss, x_size, y_test, errors)
        new_node.parents = [(feature, counter)]
        new_node.attributes.append((feature, counter))
        new_node.name = new_node.make_name()
        new_id = len(all_nodes)
        new_node.key = new_node.make_key(new_id)
        all_nodes[new_node.key] = new_node
        new_node.process_slice(loss_type)
        new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
        new_node.c_upper = new_node.score
        first_level[new_node.key] = new_node
        new_node.print_debug(top_k, 0)
        # constraint check for 1st-level nodes to qualify as problematic candidates
        if new_node.check_constraint(top_k, x_size, alpha, 1):
            # this method updates the top-k slices if needed
            top_k.add_new_top_slice(new_node)
        counter = counter + 1
    return first_level, all_nodes


def join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug, alpha, w, loss_type, b_update,
              cur_lvl, all_nodes, top_k, cur_lvl_nodes):
    for node_j in prev_lvl:
        flag = slice_name_nonsense(prev_lvl[node_i], prev_lvl[node_j], cur_lvl)
        required_number = len(union(prev_lvl[node_i].attributes, prev_lvl[node_j].attributes))
        if flag and required_number == cur_lvl + 1:
            new_node = Node(complete_x, loss, x_size, y_test, errors)
            parents_set = set(new_node.parents)
            parents_set.add(prev_lvl[node_i])
            parents_set.add(prev_lvl[node_j])
            new_node.parents = list(parents_set)
            parent1_attr = prev_lvl[node_i].attributes
            parent2_attr = prev_lvl[node_j].attributes
            new_node_attr = union(parent1_attr, parent2_attr)
            new_node.attributes = new_node_attr
            new_node.name = new_node.make_name()
            new_id = len(all_nodes)
            new_node.key = new_node.make_key(new_id)
            if new_node.key[1] in cur_lvl_nodes:
                # the same slice was already built from another pair of parents:
                # carry the existing parents over so the lineage survives when the
                # node is overwritten below
                existing_item = cur_lvl_nodes[new_node.key[1]]
                new_node.parents = list(set(existing_item.parents) | set(new_node.parents))
            if b_update:
                s_upper = new_node.calc_s_upper(cur_lvl)
                s_lower = new_node.calc_s_lower(cur_lvl)
                e_upper = new_node.calc_e_upper()
                e_max_upper = new_node.calc_e_max_upper(cur_lvl)
                new_node.update_bounds(s_upper, s_lower, e_upper, e_max_upper, w)
            else:
                new_node.calc_bounds(cur_lvl, w)
            all_nodes[new_node.key[0]] = new_node
            # extract the concrete data only for nodes whose score upper bound and
            # subset size are big enough
            to_slice = new_node.check_bounds(x_size, alpha, top_k.min_score)
            if to_slice:
                new_node.process_slice(loss_type)
                new_node.score = opt_fun(new_node.loss, new_node.size, loss, x_size, w)
                # based on its score, decide whether the node enters the top-k and
                # joins the current level (to form new combinations on the next one)
                if new_node.check_constraint(top_k, x_size, alpha, top_k.min_score) \
                        and new_node.key not in top_k.keys:
                    top_k.add_new_top_slice(new_node)
                cur_lvl_nodes[new_node.key[1]] = new_node
            if debug:
                new_node.print_debug(top_k, cur_lvl)
    return cur_lvl_nodes, all_nodes


# alpha - size-significance coefficient (required by the optimization function)
# debug - if True, print debug info to the console while creating slices
# k - number of top slices to return (an upper bound: if fewer subsets qualify,
#     all of them are printed)
# w - weight of the error term in the optimization function; (1 - w) is the
#     weight of the size term
# loss_type = 0 for a regression model
# loss_type = 1 for a classification model
def process(all_features, complete_x, loss, x_size, y_test, errors, debug, alpha, k, w, loss_type, b_update):
    levels = []
    top_k = Topk(k)
    first_level, all_nodes = make_first_level(all_features, complete_x, loss, x_size, y_test, errors, loss_type,
                                              top_k, alpha, w)
    candidates = []
    pruned = []
    indexes = [1]
    candidates.append(len(first_level))
    pruned.append(len(first_level))
    levels.append(first_level)
    # cur_lvl - index of the current level; it matches the number of slice-forming features
    cur_lvl = 1  # level already filled after the first init iteration
    print()
    print("Current topk are: ")
    top_k.print_topk()
    # combine each candidate of the previous level with every other one until no
    # new pairs can be formed (a single remaining node cannot make a pair)
    while len(levels[cur_lvl - 1]) > 1:
        cur_lvl_nodes = {}
        prev_lvl = levels[cur_lvl - 1]
        level_candidates = len(prev_lvl) * (len(prev_lvl) - 1)
        candidates.append(level_candidates)
        for node_i in prev_lvl:
            cur_lvl_nodes, all_nodes = join_enum(node_i, prev_lvl, complete_x, loss, x_size, y_test, errors, debug,
                                                 alpha, w, loss_type, b_update, cur_lvl, all_nodes, top_k,
                                                 cur_lvl_nodes)
        cur_lvl = cur_lvl + 1
        indexes.append(cur_lvl)
        levels.append(cur_lvl_nodes)
        print("Level " + str(cur_lvl) + " had " + str(level_candidates) +
              " candidates but after pruning only " + str(len(cur_lvl_nodes)) + " go to the next level")
        pruned.append(len(cur_lvl_nodes))
        print()
        print("Current topk are: ")
        top_k.print_topk()
    # plot how many candidates each level produced vs. how many survived pruning
    plt.plot(indexes, candidates, 'r--',
             indexes, pruned, 'g--')
    plt.xlabel('Level')
    plt.ylabel('Number of slices')
    plt.show()
    print("Program stopped at level " + str(cur_lvl))
    print()
    print("Selected slices are: ")
    top_k.print_topk()
    print("candidates:")
    print(candidates)
    print(">>>>>>>>>")
    print("pruned:")
    print(pruned)
    return top_k
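

# A minimal, hypothetical invocation sketch (not part of the original module):
# the expected shapes of complete_x, y_test and errors depend on the Node
# implementation, so every value below is a placeholder.
#
#     features = ["age", "income", "zip"]  # assumed feature names
#     top = process(all_features=features, complete_x=X, loss=total_loss,
#                   x_size=len(X), y_test=y, errors=per_row_errors, debug=True,
#                   alpha=4, k=10, w=0.5, loss_type=0, b_update=True)
#     top.print_topk()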