gp_growth.py 14.9 KB
Newer Older
aarongitrepos's avatar
All  
aarongitrepos committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
from collections import  namedtuple, defaultdict
from itertools import combinations
import numpy as np
import pysubgroup_mod as ps
from tqdm import tqdm
from copy import copy
import itertools
class GpGrowth:

    def __init__(self, mode='b_u' ):
        self.GP_node = namedtuple('GP_node', ['cls', 'id', 'parent', 'children', 'stats'])
        self.minSupp = 10
        self.tqdm = tqdm
        self.depth = 0
        self.mode = mode  #specify eihther b_u (bottom up) or t_d (top down)
        # Future: There also is the option of a stable mode which never creates the prefix trees

    def prepare_selectors(self, search_space):
        
        self.get_stats = task.qf.gp_get_stats
        self.get_null_vector = task.qf.gp_get_null_vector
        self.merge = task.qf.gp_merge
        l = []
        for selector in search_space:
            cov_arr = selector.covers(data)
            l.append((np.count_nonzero(cov_arr), selector, cov_arr))
        l = [(size, selector, arr) for size, selector, arr in l if size > self.minSupp]

        s = sorted(l, reverse=True)
        selectors_sorted = [selector for size, selector, arr in s]
        arrs = np.vstack([arr for size, selector, arr in s]).T
        return selectors_sorted, arrs

    def nodes_to_cls_nodes(self, nodes):
        cls_nodes = defaultdict(list)
        for node in nodes:
            cls_nodes[node.cls].append(node)
        return cls_nodes


    def execute(self, task):
        assert(self.mode in ('b_u', 't_d'))
        task.qf.calculate_constant_statistics(task)
        self.depth = task.depth
        selectors_sorted, arrs = self.prepare_selectors(task.search_space)
        self.requires_cover_arr = task.qf.gp_requires_cover_arr

        # Create tree
        root = self.GP_node(-1, -1, None, {}, self.get_null_vector())
        nodes = []
        for row_index, row in self.tqdm(enumerate(arrs), 'creating tree', total=len(arrs)):
            self.normal_insert(root, nodes, self.get_stats(row_index), np.nonzero(row)[0])
        nodes.append(root)

        # mine tree
        cls_nodes = self.nodes_to_cls_nodes(nodes)
        if self.mode == 'b_u':
            patterns = self.recurse(cls_nodes, [])
        elif self.mode == 't_d':
            patterns = self.recurse_top_down(cls_nodes, root)
        else:
            raise RuntimeError('mode needs to be either b_u or t_d')

        # compute quality functions
        return self.calculate_quality_function_for_patterns(patterns, selectors_sorted, arrs)

    def calculate_quality_function_for_patterns(self, patterns, selectors_sorted, arrs):
        out = []
        for indices, gp_params in self.tqdm(patterns, 'computing quality function',):
            if len(indices) > 0:
                selectors = [selectors_sorted[i] for i in indices]
                #print(selectors, stats)
                sg = ps.Conjunction(selectors)
                if self.requires_cover_arr:
                    statistics = task.qf.gp_get_params(np.all([arrs[i] for i in indices]), gp_params)
                else:
                    statistics = task.qf.gp_get_params(None, gp_params)
                #qual1 = task.qf.evaluate(sg, task.qf.calculate_statistics(sg, task.data))
                qual2 = task.qf.evaluate(sg, statistics)
                out.append((qual2, sg))
        return out

    def normal_insert(self, root, nodes, new_stats, classes):
        node = root
        for cls in classes:
            if cls not in node.children:
                new_child = self.GP_node(cls, len(nodes), node, {}, self.get_null_vector())
                nodes.append(new_child)
                node.children[cls] = new_child
            self.merge(node.stats, new_stats)
            node = node.children[cls]
        self.merge(node.stats, new_stats)
        return node

    def insert_into_tree(self, root, nodes, new_stats, classes, max_depth):
        ''' Creates a tree of a maximum depth = depth
        '''
        if len(classes) <= max_depth:
            self.normal_insert(root, nodes, new_stats, classes)
            return
        for prefix in combinations(classes, max_depth -1):
            node = self.normal_insert(root, nodes, new_stats, classes)
            # do normal insert for prefix
            index_for_remaining = classes.index(prefix) + 1
            for cls in classes[index_for_remaining:]:
                if cls not in node.children:
                    new_child = self.GP_node(cls, len(nodes), node, {}, self.get_null_vector())
                    nodes.append(new_child)
                    node.children[cls] = new_child
                    self.merge(node.stats, new_stats)


    def check_constraints(self, node):
        #return node[0] >= self.minSupp
        return node['size'] >= self.minSupp

    def recurse(self, cls_nodes, prefix, is_single_path=False):
        if len(cls_nodes) == 0:
            raise RuntimeError
        results = []

        results.append((prefix, cls_nodes[-1][0].stats))
        if len(prefix) >= self.depth:
            return results
        
        stats_dict = self.get_stats_for_class(cls_nodes)
        if is_single_path:
            root_stats = cls_nodes[-1][0].stats
            del stats_dict[-1]
            all_combinations = ps.powerset(stats_dict.keys(), max_length=self.depth - len(prefix))
            
            for comb in all_combinations:
                results.append((prefix+comb, root_stats))
        else:
            for cls, nodes in cls_nodes.items():
                if cls >= 0:
                    if self.check_constraints(stats_dict[cls]):
                        if len(prefix) == (self.depth - 1):
                            results.append(((*prefix, cls), stats_dict[cls]))
                        else:
                            is_single_path_now = len(nodes) == 1
                            new_tree = self.create_new_tree_from_nodes(nodes)
                            if len(new_tree) > 0:
                                results.extend(self.recurse(new_tree, (*prefix, cls), is_single_path_now))
        return results

    def get_prefixes_top_down(self, alpha, max_length):
        if len(alpha) == 0:
            return [()]
        if len(alpha) == 1 or max_length == 1:
            return [(alpha[0],)]
        prefixes = [(alpha[0],)]
        prefixes.extend([(alpha[0], *suffix) for suffix in self.get_prefixes_top_down(alpha[1:], max_length-1)])
        return prefixes


    def recurse_top_down(self, cls_nodes, root, depth_in=0):

        alpha = []
        curr_depth = depth_in
        while True:
            if root.cls == -1:
               pass
            else:
                alpha.append(root.cls)
            if len(root.children) == 1 and curr_depth <= self.depth:
                curr_depth += 1
                root = next(iter(root.children.values()))
            else:
                break
        prefixes = self.get_prefixes_top_down(alpha, max_length=self.depth - depth_in + 1)

        # Bug: If we have a longer path that branches. eg. consider the tree from items A - B - C and A - B - D
        # and depth - depth_in == 2 then prefixes = [(A), (A, B)] but the sets
        # (A, C) and (A, D) are also valid
        # basically if we have prefixes of diffrent length this does not work properly
        if len(root.children) == 0 or curr_depth >= self.depth:
            results = []
            stats_dict = self.get_stats_for_class(cls_nodes)
            for prefix in prefixes:
                cls = max(prefix)
                if self.check_constraints(stats_dict[cls]):
                    results.append((prefix, stats_dict[cls]))
            return results
        else:
            suffixes = [((), root.stats)]
            stats_dict = self.get_stats_for_class(cls_nodes)
            for cls in cls_nodes:
                if cls >= 0 and cls not in alpha:
                    if self.check_constraints(stats_dict[cls]):
                        # Future: There is also the possibility to compute the stats_dict of the prefix tree
                        # without creating the prefix tree first
                        # This might be useful if curr_depth == self.depth - 2
                        # as we need not recreate the tree

                        if curr_depth == (self.depth - 1):
                            suffixes.append(((cls,), stats_dict[cls]))
                        else:
                            new_root, nodes = self.get_top_down_tree_for_class(cls_nodes, cls)
                            if len(nodes) > 0:
                                new_cls_nodes = self.nodes_to_cls_nodes(nodes)
                                print("  " * curr_depth, cls, curr_depth, len(new_cls_nodes))
                                suffixes.extend(self.recurse_top_down(new_cls_nodes, new_root, curr_depth+1))

        return [((*pre, *(suf[0])), suf[1]) for pre, suf in itertools.product(prefixes, suffixes)]

    def remove_infrequent_class(self, nodes, cls_nodes, stats_dict):
        # returns cleaned tree

        infrequent_classes = []
        for cls in cls_nodes:
            if not self.check_constraints(stats_dict[cls]):
                infrequent_classes.append(cls)
        infrequent_classes = sorted(infrequent_classes, reverse=True)
        for cls in infrequent_classes:
            for node_to_remove in cls_nodes[cls]:
                self.merge_trees_top_down(nodes, node_to_remove.parent, node_to_remove)



    def get_top_down_tree_for_class(self, cls_nodes, cls):
        # Future: Can eventually also remove infrequent nodes already during tree creation
        base_root = None
        nodes = []
        if len(cls_nodes[cls]) > 0:
            base_root = self.create_copy_of_tree_top_down(cls_nodes[cls][0], nodes)
            for other_root in cls_nodes[cls][1:]:
                self.merge_trees_top_down(nodes, base_root, other_root)
        return base_root, nodes

    def create_copy_of_tree_top_down(self, root, nodes=None, parent=None):
        if nodes is None:
            nodes = []
        #if len(nodes) == 0:
        #    root_cls = -1
        children = {}
        new_root = self.GP_node(root.cls, len(nodes), parent, children, root.stats.copy())
        nodes.append(new_root)
        for child_cls, child in root.children.items():
            new_child = self.create_copy_of_tree_top_down(child, nodes, new_root)
            children[child_cls] = new_child
        return new_root

    def merge_trees_top_down(self, nodes, mutable_root, other_root):
        self.merge(mutable_root.stats, other_root.stats)
        for cls in other_root.children:
            if cls not in mutable_root.children:
                self.create_copy_of_tree_top_down(other_root.children[cls], nodes, mutable_root)
            else:
                self.merge_trees_top_down(nodes, mutable_root.children[cls], other_root.children[cls])
        

    def get_stats_for_class(self, cls_nodes):
        out = {}
        for key, nodes in cls_nodes.items():
            s = self.get_null_vector()
            for node in nodes:
                self.merge(s, node.stats)
            out[key] = s
        return out


    def create_new_tree_from_nodes(self, nodes):
        new_nodes = {}
        for node in nodes:
            nodes_upwards = self.get_nodes_upwards(node)
            self.create_copy_of_path(nodes_upwards[1:], new_nodes, node.stats)

        #self.remove_infrequent_nodes(new_nodes)
        cls_nodes = defaultdict(list)
        for new_node in new_nodes.values():
            cls_nodes[new_node.cls].append(new_node)
        return cls_nodes

    def remove_infrequent_nodes(self, new_nodes):
        keys = list(new_nodes.keys())
        for key in keys:
            node = new_nodes[key]
            if node.stats["size"] < self.minSupp:
                del new_nodes[key]

    def create_copy_of_path(self, nodes, new_nodes, stats):
        parent = None
        for node in reversed(nodes):
            if node.id not in new_nodes:
                new_node = self.GP_node(node.cls, node.id, parent, {}, stats.copy())
                new_nodes[node.id] = new_node
            else:
                new_node = new_nodes[node.id]
                self.merge(new_node.stats, stats)
            if parent is not None:
                parent.children[new_node.cls] = new_node
            parent = new_node

    def get_nodes_upwards(self, node):
        ref = node
        path = []
        while True:
            path.append(ref)
            ref = ref.parent
            if ref is None:
                break
        return path

    def to_file(self, task, path):
        task.qf.calculate_constant_statistics(task)
        self.depth = task.depth
        selectors_sorted, arrs = self.prepare_selectors(task.search_space)

        # Create tree
        root = self.GP_node(-1, -1, None, {}, self.get_null_vector())
        nodes = []
        with open(path, 'w') as f:
            for row_index, row in self.tqdm(enumerate(arrs), 'creating tree', total=len(arrs)):
                #print(np.nonzero(row)[0])
                f.write(" ".join(map(str, np.nonzero(row)[0])) + " "+ task.qf.gp_to_str(self.get_stats(row_index))+"\r\n")
        

if __name__ == '__main__':
    from pysubgroup.tests.DataSets import get_credit_data
    from pysubgroup import model_target

    data = get_credit_data()
    #warnings.filterwarnings("error")
    print(data.columns)
    searchSpace_Nominal = ps.create_nominal_selectors(data, ignore=['duration', 'credit_amount'])
    searchSpace_Numeric = ps.create_numeric_selectors(data, ignore=['duration', 'credit_amount'])
    searchSpace = searchSpace_Nominal + searchSpace_Numeric
    target = ps.FITarget()
    #QF=model_target.EMM_Likelihood(model_target.PolyRegression_ModelClass(x_name='duration', y_name='credit_amount'))
    QF=ps.CountQF()
    task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=200, depth=4, qf=QF)
    GpGrowth(mode='b_u').to_file(task,'E:/tmp/gp_credit.txt')

    import time
    start_time = time.time()
    gp = GpGrowth(mode='b_u').execute(task)
    print("--- %s seconds ---" % (time.time() - start_time))
    #gp = [(qual, sg) for qual, sg in gp if sg.depth <= task.depth]
    gp = sorted(gp)
    quit()

    start_time = time.time()
    dfs1 = ps.SimpleDFS().execute(task)
    print("--- %s seconds ---" % (time.time() - start_time))
    dfs = [(qual, sg.subgroup_description) for qual, sg in dfs1]
    dfs = sorted(dfs, reverse=True)
    gp = sorted(gp, reverse=True)

    def better_sorted(l):
        the_dict=defaultdict(list)
        prev_key=l[0][0]
        for key, val in l:
            
            if abs(prev_key-key)<10**-11:
                the_dict[prev_key].append(val)
            else:
                the_dict[key].append(val)
                prev_key = key
        print(len(the_dict))
        result = []
        for key, vals in the_dict.items():
            for val in sorted(vals):
                result.append((key, val))
        return result
    dfs = better_sorted(dfs)
    gp = better_sorted(gp)
    gp = gp[:task.result_set_size]

    for i, (l, r) in enumerate(zip(gp, dfs)):
        print(i)
        print('gp:', l)
        print('df:', r)
        assert(abs(l[0]-r[0]) < 10 ** -7)
        assert(l[1] == r[1])