from collections import namedtuple, defaultdict
from itertools import combinations
import numpy as np
import pysubgroup_mod as ps
from tqdm import tqdm
from copy import copy
import itertools


class GpGrowth:
    """Pattern-growth style miner that works on a prefix tree of selector indices.

    Every data row is inserted into a tree as the sequence of indices of the
    selectors covering it.  Tree nodes carry mergeable statistics that are
    produced and combined by callbacks taken from ``task.qf``
    (``gp_get_stats``, ``gp_get_null_vector``, ``gp_merge``).  The tree is
    then mined either bottom up (``mode='b_u'``) or top down (``mode='t_d'``).
    """

    def __init__(self, mode='b_u'):
        # Tree node: cls = selector index (-1 for the root), id = unique key,
        # parent = parent node or None, children = {cls: node}, stats = statistics.
        self.GP_node = namedtuple('GP_node', ['cls', 'id', 'parent', 'children', 'stats'])
        self.minSupp = 10
        self.tqdm = tqdm
        self.depth = 0
        self.mode = mode  # specify either b_u (bottom up) or t_d (top down)
        # Task of the current execute()/to_file() call; used by helpers
        # instead of relying on a module-level global.
        self._task = None
        # Future: There also is the option of a stable mode which never creates the prefix trees

    def _resolve_task(self, task=None):
        """Return ``task`` if given, else the task stored by execute()/to_file().

        Raises:
            RuntimeError: if no task is available from either source.
        """
        if task is None:
            task = getattr(self, '_task', None)
        if task is None:
            raise RuntimeError('no task available: call execute() / to_file() or pass task explicitly')
        return task

    def prepare_selectors(self, search_space, task=None):
        """Bind the statistics callbacks of ``task.qf`` and build the cover matrix.

        Args:
            search_space: iterable of selectors providing ``covers(data)``.
            task: optional task; defaults to the one stored by
                execute()/to_file().  The data is read from ``task.data``
                (assumes the task object exposes its data set under that
                attribute, as pysubgroup's SubgroupDiscoveryTask does).

        Returns:
            ``(selectors_sorted, arrs)``: the selectors covering more than
            ``self.minSupp`` rows, sorted descending, and an array with one
            row per data instance and one column per kept selector.
        """
        task = self._resolve_task(task)
        data = task.data
        self.get_stats = task.qf.gp_get_stats
        self.get_null_vector = task.qf.gp_get_null_vector
        self.merge = task.qf.gp_merge

        candidates = []
        for selector in search_space:
            cov_arr = selector.covers(data)
            candidates.append((np.count_nonzero(cov_arr), selector, cov_arr))
        # NOTE(review): this filter is strict (>) while check_constraints uses >=.
        candidates = [entry for entry in candidates if entry[0] > self.minSupp]
        ordered = sorted(candidates, reverse=True)
        selectors_sorted = [selector for _, selector, _ in ordered]
        arrs = np.vstack([arr for _, _, arr in ordered]).T
        return selectors_sorted, arrs

    def nodes_to_cls_nodes(self, nodes):
        """Group ``nodes`` by their ``cls`` field into a defaultdict(list)."""
        cls_nodes = defaultdict(list)
        for node in nodes:
            cls_nodes[node.cls].append(node)
        return cls_nodes

    def execute(self, task):
        """Build the tree for ``task``, mine it and evaluate the found patterns.

        Returns:
            list of ``(quality, ps.Conjunction)`` tuples, see
            calculate_quality_function_for_patterns.

        Raises:
            RuntimeError: if ``self.mode`` is neither 'b_u' nor 't_d'
                (only reachable when asserts are disabled).
        """
        assert(self.mode in ('b_u', 't_d'))
        self._task = task
        task.qf.calculate_constant_statistics(task)
        self.depth = task.depth
        selectors_sorted, arrs = self.prepare_selectors(task.search_space, task)
        self.requires_cover_arr = task.qf.gp_requires_cover_arr

        # Create tree
        root = self.GP_node(-1, -1, None, {}, self.get_null_vector())
        nodes = []
        for row_index, row in self.tqdm(enumerate(arrs), 'creating tree', total=len(arrs)):
            self.normal_insert(root, nodes, self.get_stats(row_index), np.nonzero(row)[0])
        nodes.append(root)

        # mine tree
        cls_nodes = self.nodes_to_cls_nodes(nodes)
        if self.mode == 'b_u':
            patterns = self.recurse(cls_nodes, [])
        elif self.mode == 't_d':
            patterns = self.recurse_top_down(cls_nodes, root)
        else:
            raise RuntimeError('mode needs to be either b_u or t_d')

        # compute quality functions
        return self.calculate_quality_function_for_patterns(patterns, selectors_sorted, arrs, task)

    def calculate_quality_function_for_patterns(self, patterns, selectors_sorted, arrs, task=None):
        """Evaluate ``task.qf`` for every non-empty pattern.

        Args:
            patterns: iterable of ``(indices, gp_params)`` where ``indices``
                are positions into ``selectors_sorted`` / columns of ``arrs``.
            selectors_sorted: selectors as returned by prepare_selectors.
            arrs: cover matrix as returned by prepare_selectors.
            task: optional task; defaults to the one stored by execute().

        Returns:
            list of ``(quality, ps.Conjunction)`` tuples.
        """
        task = self._resolve_task(task)
        out = []
        for indices, gp_params in self.tqdm(patterns, 'computing quality function',):
            if len(indices) > 0:
                selectors = [selectors_sorted[i] for i in indices]
                sg = ps.Conjunction(selectors)
                if self.requires_cover_arr:
                    # arrs has one column per selector, so the conjunction
                    # covers a row iff all selected columns are set in it.
                    cover_arr = np.all(arrs[:, list(indices)], axis=1)
                    statistics = task.qf.gp_get_params(cover_arr, gp_params)
                else:
                    statistics = task.qf.gp_get_params(None, gp_params)
                qual2 = task.qf.evaluate(sg, statistics)
                out.append((qual2, sg))
        return out

    def normal_insert(self, root, nodes, new_stats, classes):
        """Insert the path ``classes`` below ``root`` and merge ``new_stats`` along it.

        Missing nodes are created (and appended to ``nodes``); ``new_stats``
        is merged into every node on the path including ``root``.

        Returns:
            the last node of the path (``root`` if ``classes`` is empty).
        """
        node = root
        for cls in classes:
            if cls not in node.children:
                new_child = self.GP_node(cls, len(nodes), node, {}, self.get_null_vector())
                nodes.append(new_child)
                node.children[cls] = new_child
            self.merge(node.stats, new_stats)
            node = node.children[cls]
        self.merge(node.stats, new_stats)
        return node

    def insert_into_tree(self, root, nodes, new_stats, classes, max_depth):
        """Creates a tree of a maximum depth = depth.

        NOTE(review): this method is not called anywhere in this file and
        looks unfinished -- the "prefix" insert passes the full ``classes``
        and ``classes.index(prefix)`` looks up a tuple.  Logic is left as
        found; confirm the intent before using it.
        """
        if len(classes) <= max_depth:
            self.normal_insert(root, nodes, new_stats, classes)
            return
        for prefix in combinations(classes, max_depth - 1):
            node = self.normal_insert(root, nodes, new_stats, classes)
            # do normal insert for prefix
            index_for_remaining = classes.index(prefix) + 1
            for cls in classes[index_for_remaining:]:
                if cls not in node.children:
                    new_child = self.GP_node(cls, len(nodes), node, {}, self.get_null_vector())
                    nodes.append(new_child)
                    node.children[cls] = new_child
                self.merge(node.stats, new_stats)

    def check_constraints(self, node):
        """Return True if the statistics ``node`` have ``'size'`` >= ``self.minSupp``."""
        return node['size'] >= self.minSupp

    def recurse(self, cls_nodes, prefix, is_single_path=False):
        """Bottom-up mining of the (conditional) tree given as ``cls_nodes``.

        Args:
            cls_nodes: mapping cls -> list of nodes; key -1 holds the root.
            prefix: selector indices the current tree is conditioned on.
            is_single_path: if True, all combinations of the remaining
                classes are emitted with the root statistics.

        Returns:
            list of ``(indices, stats)`` tuples.

        Raises:
            RuntimeError: if ``cls_nodes`` is empty.
        """
        if len(cls_nodes) == 0:
            raise RuntimeError
        results = []
        results.append((prefix, cls_nodes[-1][0].stats))
        if len(prefix) >= self.depth:
            return results

        stats_dict = self.get_stats_for_class(cls_nodes)
        if is_single_path:
            root_stats = cls_nodes[-1][0].stats
            del stats_dict[-1]
            all_combinations = ps.powerset(stats_dict.keys(), max_length=self.depth - len(prefix))
            for comb in all_combinations:
                results.append((prefix + comb, root_stats))
        else:
            for cls, nodes in cls_nodes.items():
                if cls < 0 or not self.check_constraints(stats_dict[cls]):
                    continue
                if len(prefix) == (self.depth - 1):
                    # One slot left: no need to build the conditional tree.
                    results.append(((*prefix, cls), stats_dict[cls]))
                else:
                    is_single_path_now = len(nodes) == 1
                    new_tree = self.create_new_tree_from_nodes(nodes)
                    if len(new_tree) > 0:
                        results.extend(self.recurse(new_tree, (*prefix, cls), is_single_path_now))
        return results

    def get_prefixes_top_down(self, alpha, max_length):
        """Return the leading sub-sequences of ``alpha`` with at most ``max_length`` items.

        An empty ``alpha`` yields ``[()]``.
        """
        if len(alpha) == 0:
            return [()]
        if len(alpha) == 1 or max_length == 1:
            return [(alpha[0],)]
        prefixes = [(alpha[0],)]
        prefixes.extend([(alpha[0], *suffix)
                         for suffix in self.get_prefixes_top_down(alpha[1:], max_length - 1)])
        return prefixes

    def recurse_top_down(self, cls_nodes, root, depth_in=0):
        """Top-down mining starting at ``root``.

        Follows the chain of single children below ``root``, collects the
        classes on it as prefixes and combines them with the suffixes mined
        from the per-class trees.

        Returns:
            list of ``(indices, stats)`` tuples.
        """
        alpha = []
        curr_depth = depth_in
        while True:
            if root.cls != -1:
                alpha.append(root.cls)
            if len(root.children) == 1 and curr_depth <= self.depth:
                curr_depth += 1
                root = next(iter(root.children.values()))
            else:
                break
        prefixes = self.get_prefixes_top_down(alpha, max_length=self.depth - depth_in + 1)

        # Bug: If we have a longer path that branches, e.g. consider the tree
        # built from the items A - B - C and A - B - D with
        # depth - depth_in == 2: then prefixes = [(A), (A, B)] but the sets
        # (A, C) and (A, D) are also valid.
        # Basically, if we have prefixes of different length this does not work properly.
        if len(root.children) == 0 or curr_depth >= self.depth:
            results = []
            stats_dict = self.get_stats_for_class(cls_nodes)
            for prefix in prefixes:
                cls = max(prefix)
                if self.check_constraints(stats_dict[cls]):
                    results.append((prefix, stats_dict[cls]))
            return results

        suffixes = [((), root.stats)]
        stats_dict = self.get_stats_for_class(cls_nodes)
        for cls in cls_nodes:
            if cls >= 0 and cls not in alpha:
                if self.check_constraints(stats_dict[cls]):
                    # Future: There is also the possibility to compute the stats_dict of the prefix tree
                    # without creating the prefix tree first.
                    # This might be useful if curr_depth == self.depth - 2
                    # as we need not recreate the tree.
                    if curr_depth == (self.depth - 1):
                        suffixes.append(((cls,), stats_dict[cls]))
                    else:
                        new_root, nodes = self.get_top_down_tree_for_class(cls_nodes, cls)
                        if len(nodes) > 0:
                            new_cls_nodes = self.nodes_to_cls_nodes(nodes)
                            print(" " * curr_depth, cls, curr_depth, len(new_cls_nodes))
                            suffixes.extend(self.recurse_top_down(new_cls_nodes, new_root, curr_depth + 1))
        return [((*pre, *(suf[0])), suf[1]) for pre, suf in itertools.product(prefixes, suffixes)]

    def remove_infrequent_class(self, nodes, cls_nodes, stats_dict):
        """Merge every node of a class failing check_constraints into its parent."""
        # returns cleaned tree
        infrequent_classes = []
        for cls in cls_nodes:
            if not self.check_constraints(stats_dict[cls]):
                infrequent_classes.append(cls)
        infrequent_classes = sorted(infrequent_classes, reverse=True)
        for cls in infrequent_classes:
            for node_to_remove in cls_nodes[cls]:
                self.merge_trees_top_down(nodes, node_to_remove.parent, node_to_remove)

    def get_top_down_tree_for_class(self, cls_nodes, cls):
        """Build one tree from all subtrees rooted at nodes of class ``cls``.

        Returns:
            ``(base_root, nodes)``; ``base_root`` is None and ``nodes`` is
            empty if there is no node of that class.
        """
        # Future: Can eventually also remove infrequent nodes already during tree creation
        base_root = None
        nodes = []
        if len(cls_nodes[cls]) > 0:
            base_root = self.create_copy_of_tree_top_down(cls_nodes[cls][0], nodes)
            for other_root in cls_nodes[cls][1:]:
                self.merge_trees_top_down(nodes, base_root, other_root)
        return base_root, nodes

    def create_copy_of_tree_top_down(self, root, nodes=None, parent=None):
        """Recursively copy the subtree at ``root``.

        New nodes get ``len(nodes)`` as id and are appended to ``nodes``;
        statistics are copied via ``stats.copy()``.  The copy is *not*
        registered in ``parent.children`` -- that is up to the caller.

        Returns:
            the copied root node.
        """
        if nodes is None:
            nodes = []
        children = {}
        new_root = self.GP_node(root.cls, len(nodes), parent, children, root.stats.copy())
        nodes.append(new_root)
        for child_cls, child in root.children.items():
            children[child_cls] = self.create_copy_of_tree_top_down(child, nodes, new_root)
        return new_root

    def merge_trees_top_down(self, nodes, mutable_root, other_root):
        """Merge the tree at ``other_root`` into the tree at ``mutable_root`` in place."""
        self.merge(mutable_root.stats, other_root.stats)
        for cls, other_child in other_root.children.items():
            if cls not in mutable_root.children:
                # Register the copied subtree as a child; otherwise it would
                # be unreachable from mutable_root and copied again by the
                # next merge that contains the same class.
                mutable_root.children[cls] = self.create_copy_of_tree_top_down(
                    other_child, nodes, mutable_root)
            else:
                self.merge_trees_top_down(nodes, mutable_root.children[cls], other_child)

    def get_stats_for_class(self, cls_nodes):
        """Return ``{cls: merged statistics of all nodes of that class}``."""
        out = {}
        for key, nodes in cls_nodes.items():
            s = self.get_null_vector()
            for node in nodes:
                self.merge(s, node.stats)
            out[key] = s
        return out

    def create_new_tree_from_nodes(self, nodes):
        """Build the conditional tree for ``nodes`` from their paths to the root.

        For each node the path of its ancestors (the node itself excluded)
        is copied, weighted with the node's statistics.

        Returns:
            defaultdict(list) mapping cls -> copied nodes.
        """
        new_nodes = {}
        for node in nodes:
            nodes_upwards = self.get_nodes_upwards(node)
            self.create_copy_of_path(nodes_upwards[1:], new_nodes, node.stats)
        #self.remove_infrequent_nodes(new_nodes)
        cls_nodes = defaultdict(list)
        for new_node in new_nodes.values():
            cls_nodes[new_node.cls].append(new_node)
        return cls_nodes

    def remove_infrequent_nodes(self, new_nodes):
        """Delete all entries of ``new_nodes`` whose ``stats["size"]`` is below ``self.minSupp``."""
        for key in list(new_nodes.keys()):
            if new_nodes[key].stats["size"] < self.minSupp:
                del new_nodes[key]

    def create_copy_of_path(self, nodes, new_nodes, stats):
        """Copy a leaf-to-root path into ``new_nodes``, merging ``stats`` into it.

        Args:
            nodes: path ordered from the deepest node up to the root
                (as returned by get_nodes_upwards).
            new_nodes: dict ``node.id -> copied node`` shared between calls,
                so that paths with common ancestors share their copies.
            stats: statistics to attribute to every node on the path.
        """
        parent = None
        for node in reversed(nodes):
            if node.id not in new_nodes:
                new_node = self.GP_node(node.cls, node.id, parent, {}, stats.copy())
                new_nodes[node.id] = new_node
            else:
                new_node = new_nodes[node.id]
                self.merge(new_node.stats, stats)
            if parent is not None:
                parent.children[new_node.cls] = new_node
            parent = new_node

    def get_nodes_upwards(self, node):
        """Return ``[node, parent, grandparent, ..., root]``."""
        ref = node
        path = []
        while True:
            path.append(ref)
            ref = ref.parent
            if ref is None:
                break
        return path

    def to_file(self, task, path):
        """Write one line per data row to ``path``.

        Each line holds the space-separated indices of the selectors
        covering the row, followed by ``task.qf.gp_to_str`` of the row's
        statistics.
        """
        self._task = task
        task.qf.calculate_constant_statistics(task)
        self.depth = task.depth
        selectors_sorted, arrs = self.prepare_selectors(task.search_space, task)

        with open(path, 'w') as f:
            for row_index, row in self.tqdm(enumerate(arrs), 'creating tree', total=len(arrs)):
                f.write(" ".join(map(str, np.nonzero(row)[0])) + " "
                        + task.qf.gp_to_str(self.get_stats(row_index)) + "\r\n")


if __name__ == '__main__':
    from pysubgroup.tests.DataSets import get_credit_data
    from pysubgroup import model_target
    data = get_credit_data()
    #warnings.filterwarnings("error")
    print(data.columns)
    searchSpace_Nominal = ps.create_nominal_selectors(data, ignore=['duration', 'credit_amount'])
    searchSpace_Numeric = ps.create_numeric_selectors(data, ignore=['duration', 'credit_amount'])
    searchSpace = searchSpace_Nominal + searchSpace_Numeric
    target = ps.FITarget()
    #QF=model_target.EMM_Likelihood(model_target.PolyRegression_ModelClass(x_name='duration', y_name='credit_amount'))
    QF = ps.CountQF()
    task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=200, depth=4, qf=QF)
    GpGrowth(mode='b_u').to_file(task, 'E:/tmp/gp_credit.txt')

    import time
    start_time = time.time()
    gp = GpGrowth(mode='b_u').execute(task)
    print("--- %s seconds ---" % (time.time() - start_time))
    #gp = [(qual, sg) for qual, sg in gp if sg.depth <= task.depth]
    gp = sorted(gp)
    # NOTE(review): the script stops here; the comparison against SimpleDFS
    # below is currently unreachable.
    quit()

    start_time = time.time()
    dfs1 = ps.SimpleDFS().execute(task)
    print("--- %s seconds ---" % (time.time() - start_time))
    dfs = [(qual, sg.subgroup_description) for qual, sg in dfs1]
    dfs = sorted(dfs, reverse=True)
    gp = sorted(gp, reverse=True)

    def better_sorted(l):
        """Group entries whose keys differ by less than 1e-11 and sort within each group."""
        the_dict = defaultdict(list)
        prev_key = l[0][0]
        for key, val in l:
            if abs(prev_key - key) < 10**-11:
                the_dict[prev_key].append(val)
            else:
                the_dict[key].append(val)
                prev_key = key
        print(len(the_dict))
        result = []
        for key, vals in the_dict.items():
            for val in sorted(vals):
                result.append((key, val))
        return result

    dfs = better_sorted(dfs)
    gp = better_sorted(gp)
    gp = gp[:task.result_set_size]
    for i, (l, r) in enumerate(zip(gp, dfs)):
        print(i)
        print('gp:', l)
        print('df:', r)
        assert(abs(l[0] - r[0]) < 10 ** -7)
        assert(l[1] == r[1])