Commit c4385bc7 authored by aarongitrepos

*/__pycache__/
import os, sys
import pandas as pd
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)
sys.path.append(currentdir)
import pysubgroup_mod as ps
import argparse
import numpy as np
def eliminate_reps(elem,l):
l.remove(elem)
for li in l:
if elem in li:
#if set(elem.split(" AND ")).issubset(li.split(" AND ")):
#if ', '.join(map(str,elem)) in ', '.join(map(str,li)):
return False
return True
def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="default",depth=5,list_ignore=[],list_conds=[]):
"""
Parameters
----------
dataname: string
The name of the dataset, located in the datasets directory, to be used for the analysis.
class_column: string
A column of the dataset that will be used as target.
class_value: string or int
A value of class_column. It corresponds to the condition that has to be met, i.e., class_column == class_value.
If class_column holds integers, the value is converted to int internally.
mode_parameter: string, optional (default="default")
This parameter controls the behaviour of the search. The InfoGained algorithm requires either the
"dynamic" or the "maximum" option. For the other algorithms the parameter keeps the value "default".
(possible values: dynamic, maximum, default)
depth: int, optional (default=5)
This parameter indicates the maximum number of variables that can be added to a rule.
list_ignore: list of strings, optional (default=[])
List containing the column names that will not be used during the search.
list_conds: list of strings, optional (default=[])
List containing the column names that must appear in the rules. It only works with the InfoGained algorithm.
"""
df = pd.read_csv(currentdir+"/datasets/"+dataname+".csv",index_col=[0])
if class_column not in df.columns:
print("No class column")
exit(0)
if df[class_column].dtype in [np.int16, np.int32, np.int64]:
class_value = int(class_value)
if class_value not in df[class_column].unique():
print("The class_value specified is not an option")
exit(0)
target = ps.BinaryTarget (class_column, class_value)
searchspace = ps.create_selectors(df, ignore=list_ignore)
mode_parameter = {'dynamic' : 0, 'maximum': 1, "default":2}[mode_parameter]
task = ps.SubgroupDiscoveryTask (
df,
target,
searchspace,
mode=mode_parameter,
depth=depth,
filter_vars = list_conds,
qf=ps.WRAccQF())
result, result_cut = ps.InfoGainedSearch().execute(task)
#df_result = result.to_dataframe()
df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
df_result_cut.drop_duplicates(inplace=True)
df_result_cut.reset_index(drop=True,inplace=True)
#df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
df_result_cut["target"] = [class_value] * df_result_cut.shape[0]
#route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
""" dir_type = "max"
if mode_parameter == 0:
dir_type = "threshold"
route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """
route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
df_result_cut.to_csv(route, encoding="UTF-8",index=True)
if __name__ == "__main__":
#list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
# Cancer_stage, FirstTreatment
parser=argparse.ArgumentParser()
parser.add_argument('--dataname', type=str, required=True)
parser.add_argument('--class_column', type=str, required=True)
parser.add_argument('--class_value', type=str, required=True)
parser.add_argument('--mode', type=str, choices=["dynamic","maximum","default"], default="default")
parser.add_argument('--depth', type=int, required=True)
parser.add_argument("--list_ignore", nargs="*", type=str, default=[])
parser.add_argument("--list_conds", nargs="*", type=str, default=[])
args = parser.parse_args()
info_gained_algorithm(args.dataname,args.class_column,args.class_value,args.mode,args.depth,args.list_ignore,args.list_conds)
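# Programmatic usage sketch (hypothetical, kept as a comment so it never runs on import):
# the dataset name "example", the target column "class", its value "1" and the column
# names "id" and "age" are placeholders and assume a file datasets/example.csv exists
# with those columns.
#
#   info_gained_algorithm("example", "class", "1",
#                         mode_parameter="dynamic", depth=3,
#                         list_ignore=["id"], list_conds=["age"])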
from pysubgroup_mod.subgroup_description import *
from pysubgroup_mod.algorithms import *
from pysubgroup_mod.measures import *
from pysubgroup_mod.utils import *
from pysubgroup_mod.binary_target import *
from pysubgroup_mod.numeric_target import *
from pysubgroup_mod.fi_target import *
from pysubgroup_mod.visualization import *
from pysubgroup_mod.refinement_operator import *
from pysubgroup_mod.representations import *
from pysubgroup_mod.constraints import *
import pysubgroup_mod as ps
class MinSupportConstraint:
def __init__(self, min_support):
self.min_support = min_support
@property
def is_monotone(self):
return True
def is_satisfied(self, subgroup, statistics=None, data=None):
if hasattr(statistics, 'size'):
return statistics.size >= self.min_support
elif hasattr(statistics, 'size_sg'):
return statistics.size_sg >= self.min_support
else:
return ps.get_size(subgroup, len(data), data) >= self.min_support
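# Usage sketch (hypothetical, kept as a comment): it assumes EqualitySelector is re-exported
# by pysubgroup_mod as in upstream pysubgroup, and uses a toy DataFrame with placeholder columns.
#
#   import pandas as pd
#   df = pd.DataFrame({"age": [25, 40, 60, 35, 52],
#                      "smoker": ["yes", "no", "yes", "yes", "no"]})
#   constraint = MinSupportConstraint(min_support=2)
#   sg = ps.EqualitySelector("smoker", "yes")
#   # With no cached statistics, is_satisfied falls back to counting the cover via ps.get_size.
#   constraint.is_satisfied(sg, statistics=None, data=df)   # True: the subgroup covers 3 rows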
'''
Created on 29.09.2017
@author: lemmerfn
'''
from collections import namedtuple
from functools import total_ordering
import pysubgroup_mod as ps
@total_ordering
class FITarget:
statistic_types = ('size_sg', 'size_dataset')
def __repr__(self):
return "T: Frequent Itemsets"
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __lt__(self, other):
return str(self) < str(other)
def get_attributes(self):
return []
def get_base_statistics(self, subgroup, data):
_, size = ps.get_cover_array_and_size(subgroup, len(data), data)
return size
def calculate_statistics(self, subgroup_description, data, cached_statistics=None):
if cached_statistics is None or not isinstance(cached_statistics, dict):
statistics = dict()
elif all(k in cached_statistics for k in FITarget.statistic_types):
return cached_statistics
else:
statistics = cached_statistics
_, size = ps.get_cover_array_and_size(subgroup_description, len(data), data)
statistics['size_sg'] = size
statistics['size_dataset'] = len(data)
return statistics
class SimpleCountQF(ps.AbstractInterestingnessMeasure):
tpl = namedtuple('CountQF_parameters', ('subgroup_size'))
def __init__(self):
self.required_stat_attrs = ('subgroup_size',)
self.has_constant_statistics = True
self.size_dataset = None
def calculate_constant_statistics(self, data, target):
self.size_dataset = len(data)
def calculate_statistics(self, subgroup_description, target, data, statistics=None):
_, size = ps.get_cover_array_and_size(subgroup_description, self.size_dataset, data)
return SimpleCountQF.tpl(size)
def gp_get_stats(self, _):
return {"subgroup_size" : 1}
def gp_get_null_vector(self):
return {"subgroup_size":0}
def gp_merge(self, l, r):
l["subgroup_size"] += r["subgroup_size"]
def gp_get_params(self, _cover_arr, v):
return SimpleCountQF.tpl(v['subgroup_size'])
def gp_to_str(self, stats):
return str(stats['subgroup_size'])
@property
def gp_requires_cover_arr(self):
return False
class CountQF(SimpleCountQF, ps.BoundedInterestingnessMeasure):
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size
def optimistic_estimate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size
class AreaQF(SimpleCountQF):
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size * subgroup.depth
'''
Created on 28.04.2016
@author: lemmerfn
'''
from abc import ABC, abstractmethod
from collections import namedtuple
from itertools import combinations
import numpy as np
import pysubgroup_mod as ps
class AbstractInterestingnessMeasure(ABC):
# pylint: disable=no-member
def ensure_statistics(self, subgroup, target, data, statistics=None):
if not self.has_constant_statistics:
self.calculate_constant_statistics(data, target)
if any(not hasattr(statistics, attr) for attr in self.required_stat_attrs):
if getattr(subgroup, 'statistics', False):
return subgroup.statistics
else:
return self.calculate_statistics(subgroup, target, data, statistics)
return statistics
# pylint: enable=no-member
#def optimistic_estimate_from_dataset(self, data, subgroup, weighting_attribute=None): #pylint: disable=unused-argument
# return float("inf")
class BoundedInterestingnessMeasure(AbstractInterestingnessMeasure):
pass
#@abstractmethod
#def optimistic_estimate_from_dataset(self, data, subgroup, weighting_attribute=None):
# pass
#####
# FIX ME: This is currently not working anymore
#####
class CombinedInterestingnessMeasure(BoundedInterestingnessMeasure):
def __init__(self, measures, weights=None):
self.measures = measures
if weights is None:
weights = [1] * len(measures)
assert len(weights) == len(measures)
self.weights = weights
def calculate_constant_statistics(self, data, target):
pass
def calculate_statistics(self, subgroup, target, data, cached_statistics=None):
pass
def evaluate(self, subgroup, target, data, statistics=None):
#FIX USE of constant statistics
return np.dot([m.evaluate(subgroup, target, data, None) for m in self.measures], self.weights)
def optimistic_estimate(self, subgroup, target, data, statistics=None):
# FIX USE of constant statistics
return np.dot([m.optimistic_estimate(subgroup, target, data, None) for m in self.measures], self.weights)
def evaluate_from_statistics(self, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup):
return np.dot([m.evaluate_from_statistics(instances_dataset, positives_dataset, instances_subgroup, positives_subgroup) for m in self.measures], self.weights)
#def optimistic_estimate_from_statistics(self, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup):
# return np.dot(
# [m.evaluate_from_statistics(instances_dataset, positives_dataset, instances_subgroup, positives_subgroup) for m in self.measures],
# self.weights)
##########
# Filter
##########
def unique_attributes(result_set, data):
result = []
used_attributes = []
for (q, sg) in result_set:
atts = sg.subgroup_description.get_attributes()
if atts not in used_attributes or all([ps.is_categorical_attribute(data, x) for x in atts]):
result.append((q, sg))
used_attributes.append(atts)
return result
def minimum_statistic_filter(result_set, statistic, minimum, data):
result = []
for (q, sg) in result_set:
if len(sg.statistics) == 0:
sg.calculate_statistics(data)
if sg.statistics[statistic] >= minimum:
result.append((q, sg))
return result
def minimum_quality_filter(result_set, minimum):
result = []
for (q, sg) in result_set:
if q >= minimum:
result.append((q, sg))
return result
def maximum_statistic_filter(result_set, statistic, maximum):
result = []
for (q, sg) in result_set:
if sg.statistics[statistic] <= maximum:
result.append((q, sg))
return result
def overlap_filter(result_set, data, similarity_level=0.9):
result = []
result_sgs = []
for (q, sg) in result_set:
if not overlaps_list(sg, result_sgs, data, similarity_level):
result_sgs.append(sg)
result.append((q, sg))
return result
def overlaps_list(sg, list_of_sgs, data, similarity_level=0.9):
for anotherSG in list_of_sgs:
if ps.overlap(sg, anotherSG, data) > similarity_level:
return True
return False
class CountCallsInterestingMeasure(BoundedInterestingnessMeasure):
def __init__(self, qf):
self.qf = qf
self.calls = 0
def calculate_statistics(self, sg, target, data, statistics=None):
self.calls += 1
return self.qf.calculate_statistics(sg, target, data, statistics)
def __getattr__(self, name):
return getattr(self.qf, name)
def __hasattr__(self, name):
return hasattr(self.qf, name)
#####
# GeneralizationAware Interestingness Measures
#####
class GeneralizationAwareQF(AbstractInterestingnessMeasure):
ga_tuple = namedtuple('ga_tuple', ['subgroup_quality', 'generalisation_quality'])
def __init__(self, qf):
self.qf = qf
# this cache maps the representation of descriptions to tuples
# the first entry is the quality and the second one is
# the largest quality of all its predecessors
self.cache = {}
self.has_constant_statistics = False
self.required_stat_attrs = ['subgroup_quality', 'generalisation_quality']
self.q0 = 0
def calculate_constant_statistics(self, data, target):
self.cache = {}
self.qf.calculate_constant_statistics(data, target)
self.q0 = self.qf.evaluate(slice(None), target, data)
self.has_constant_statistics = self.qf.has_constant_statistics
def calculate_statistics(self, subgroup, target, data, statistics=None):
sg_repr = repr(subgroup)
if sg_repr in self.cache:
return GeneralizationAwareQF.ga_tuple(*self.cache[sg_repr])
else:
(q_sg, q_prev) = self.get_qual_and_previous_qual(subgroup, target, data)
self.cache[sg_repr] = (q_sg, q_prev)
return GeneralizationAwareQF.ga_tuple(q_sg, q_prev)
def get_qual_and_previous_qual(self, subgroup, target, data):
q_subgroup = self.qf.evaluate(subgroup, target, data)
max_q = 0
selectors = subgroup.selectors
if len(selectors) > 0:
# compute quality of all generalizations
generalizations = combinations(selectors, len(selectors)-1)
for sels in generalizations:
sgd = ps.Conjunction(list(sels))
(q_sg, q_prev) = self.calculate_statistics(sgd, target, data)
max_q = max(max_q, q_sg, q_prev)
return (q_subgroup, max_q)
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_quality - statistics.generalisation_quality
#####
# GeneralizationAware Interestingness Measures
#####
class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure):
ga_tuple = namedtuple('ga_stats_tuple', ['subgroup_stats', 'generalisation_stats'])
def __init__(self, qf):
self.qf = qf
# this cache maps the representation of descriptions to tuples
# the first entry is the quality and the second one is
# the largest quality of all its predecessors
self.cache = {}
self.has_constant_statistics = False
self.required_stat_attrs = GeneralizationAwareQF_stats.ga_tuple._fields
self.stats0 = None
def calculate_constant_statistics(self, data, target):
self.cache = {}
self.qf.calculate_constant_statistics(data, target)
self.stats0 = self.qf.calculate_statistics(slice(None), target, data)
self.has_constant_statistics = self.qf.has_constant_statistics
def calculate_statistics(self, subgroup, target, data, statistics=None):
sg_repr = repr(subgroup)
if sg_repr in self.cache:
return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr])
else:
(stats_sg, stats_prev) = self.get_stats_and_previous_stats(subgroup, target, data)
self.cache[sg_repr] = (stats_sg, stats_prev)
return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev)
def get_stats_and_previous_stats(self, subgroup, target, data):
stats_subgroup = self.qf.calculate_statistics(subgroup, target, data)
max_stats = self.stats0
selectors = subgroup.selectors
if len(selectors) > 0:
# compute quality of all generalizations
generalizations = combinations(selectors, len(selectors)-1)
for sels in generalizations:
sgd = ps.Conjunction(list(sels))
(stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data)
max_stats = self.get_max(max_stats, stats_sg, stats_prev)
return (stats_subgroup, max_stats)
def evaluate(self, subgroup, statistics_or_data=None):
raise NotImplementedError
def get_max(self, *args):
raise NotImplementedError
from collections import namedtuple
from scipy.stats import norm
import numpy as np
import pysubgroup_mod as ps
beta_tuple = namedtuple('beta_tuple', ['beta', 'size'])
class EMM_Likelihood(ps.AbstractInterestingnessMeasure):
tpl = namedtuple('EMM_Likelihood', ['model_params', 'subgroup_likelihood', 'inverse_likelihood', 'size'])
def __init__(self, model):
self.model = model
self.has_constant_statistics = False
self.required_stat_attrs = EMM_Likelihood.tpl._fields
self.data_size = None
def calculate_constant_statistics(self, task):
self.model.calculate_constant_statistics(task)
self.data_size = len(task.data)
self.has_constant_statistics = True
def calculate_statistics(self, subgroup, data=None):
cover_arr, sg_size = ps.get_cover_array_and_size(subgroup, self.data_size, data)
params = self.model.fit(cover_arr, data)
return self.get_tuple(sg_size, params, cover_arr)
def get_tuple(self, sg_size, params, cover_arr):
#numeric stability?
all_likelihood = self.model.likelihood(params, np.ones(self.data_size, dtype=bool))
sg_likelihood_sum = np.sum(all_likelihood[cover_arr])
total_likelihood_sum = np.sum(all_likelihood)
dataset_average = np.nan
if (self.data_size - sg_size) > 0:
dataset_average = (total_likelihood_sum - sg_likelihood_sum)/(self.data_size - sg_size)
sg_average = np.nan
if sg_size > 0:
sg_average = sg_likelihood_sum/sg_size
return EMM_Likelihood.tpl(params, sg_average, dataset_average, sg_size)
def evaluate(self, subgroup, statistics=None):
statistics = self.ensure_statistics(subgroup, statistics)
#numeric stability?
return statistics.subgroup_likelihood - statistics.inverse_likelihood
def gp_get_params(self, cover_arr, v):
params = self.model.gp_get_params(v)
sg_size = params.size
return self.get_tuple(sg_size, params, cover_arr)
def supports_weights(self):
return False
def is_applicable(self, _):
return True
def __getattr__(self, name):
return getattr(self.model, name)
class PolyRegression_ModelClass:
def __init__(self, x_name='x', y_name='y', degree=1):
self.x_name = x_name
self.y_name = y_name
if degree != 1:
raise ValueError('Currently only degree == 1 is supported')
self.degree = degree
self.x = None
self.y = None
self.has_constant_statistics = True
super().__init__()
def calculate_constant_statistics(self, task):
data = task.data
self.x = data[self.x_name].to_numpy()
self.y = data[self.y_name].to_numpy()
self.has_constant_statistics = True
@staticmethod
def gp_merge(u, v):
v0 = v[0]
u0 = u[0]
if v0 == 0 or u0 == 0:
d = 0
else:
d = v0 * u0/(v0 + u0)*(v[1]/v0 - u[1]/u0)*(v[2]/v0 - u[2]/u0)
u += v
u[3] += d
def gp_get_null_vector(self):
return np.zeros(5)
def gp_get_stats(self, row_index):
x = self.x[row_index]
return np.array([1, x, self.y[row_index], 0, x*x])
def gp_get_params(self, v):
size = v[0]
if size < self.degree:
return beta_tuple(np.full(self.degree + 1, np.nan), size)
v1 = v[1]
slope = v[0] * v[3] / (v[0]*v[4] - v1 * v1)
intercept = v[2]/v[0] - slope * v[1]/v[0]
return beta_tuple(np.array([slope, intercept]), v[0])
def fit(self, subgroup, data=None):
cover_arr, size = ps.get_cover_array_and_size(subgroup, len(self.x), data)
if size <= self.degree + 1:
return beta_tuple(np.full(self.degree + 1, np.nan), size)
return beta_tuple(np.polyfit(self.x[cover_arr], self.y[cover_arr], deg=self.degree), size)
def likelihood(self, stats, sg):
if any(np.isnan(stats.beta)):
return np.full(self.x[sg].shape, np.nan)
return norm.pdf(np.polyval(stats.beta, self.x[sg]) - self.y[sg])
def loglikelihood(self, stats, sg):
return norm.logpdf(np.polyval(stats.beta, self.x[sg]) - self.y[sg])
import pysubgroup_mod as ps
from collections import defaultdict
from itertools import chain
class RefinementOperator:
pass
class StaticSpecializationOperator:
def __init__(self, selectors):
search_space_dict = defaultdict(list)
for selector in selectors:
search_space_dict[selector.attribute_name].append(selector)
self.search_space = list(search_space_dict.values())
self.search_space_index = {key: i for i, key in enumerate(search_space_dict.keys())}
def refinements(self, subgroup):
if subgroup.depth > 0:
index_of_last = self.search_space_index[subgroup._selectors[-1].attribute_name]
new_selectors = chain.from_iterable(self.search_space[index_of_last + 1:])
else:
new_selectors = chain.from_iterable(self.search_space)
return (subgroup & sel for sel in new_selectors)
class StaticGeneralizationOperator:
def __init__(self, selectors):
self.search_space = selectors
def refinements(self, sG):
index_of_last_selector = min(self.search_space.index(sG._selectors[-1]), len(self.search_space) - 1)
new_selectors = self.search_space[index_of_last_selector + 1:]
return (sG | sel for sel in new_selectors)
import numpy as np
import pysubgroup_mod as ps
class RepresentationBase():
def __init__(self, new_conjunction, selectors_to_patch):
self._new_conjunction = new_conjunction
self.previous_conjunction = None
self.selectors_to_patch = selectors_to_patch
def patch_all_selectors(self):
for sel in self.selectors_to_patch:
self.patch_selector(sel)
def patch_selector(self, sel):
raise NotImplementedError
def patch_classes(self):
pass
def undo_patch_classes(self):
pass
def __enter__(self):
self.patch_classes()
self.patch_all_selectors()
return self
def __exit__(self, * args):
self.undo_patch_classes()
class BitSet_Conjunction(ps.Conjunction):
n_instances = 0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return a list of all '1's
if not self._selectors:
return np.full(BitSet_Conjunction.n_instances, True, dtype=bool)
# non-empty description
return np.all([sel.representation for sel in self._selectors], axis=0)
@property
def size_sg(self):
return np.count_nonzero(self.representation)
def append_and(self, to_append):
super().append_and(to_append)
self.representation = np.logical_and(self.representation, to_append.representation)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class BitSet_Disjunction(ps.Disjunction):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return a list of all '0's (an empty disjunction covers no instance)
if not self._selectors:
return np.full(BitSet_Conjunction.n_instances, False, dtype=bool)
# non-empty description
return np.any([sel.representation for sel in self._selectors], axis=0)
@property
def size_sg(self):
return np.count_nonzero(self.representation)
def append_or(self, to_append):
super().append_or(to_append)
self.representation = np.logical_or(self.representation, to_append.representation)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class BitSetRepresentation(RepresentationBase):
Conjunction = BitSet_Conjunction
Disjunction = BitSet_Disjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(BitSet_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = sel.covers(self.df)
sel.size_sg = np.count_nonzero(sel.representation)
def patch_classes(self):
BitSet_Conjunction.n_instances = len(self.df)
super().patch_classes()
class Set_Conjunction(ps.Conjunction):
all_set = set()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
self.arr_for_interface = np.array(list(self.representation), dtype=int)
def compute_representation(self):
# empty description ==> return the set of all instance indices
if not self._selectors:
return Set_Conjunction.all_set
# non-empty description
return set.intersection(*[sel.representation for sel in self._selectors])
@property
def size_sg(self):
return len(self.representation)
#def __copy__(self):
# tmp = super().__copy__()
# tmp.representation = self.representation.copy()
# return tmp
def append_and(self, to_append):
super().append_and(to_append)
self.representation = self.representation.intersection(to_append.representation)
self.arr_for_interface = np.array(list(self.representation), dtype=int)
@property
def __array_interface__(self):
return self.arr_for_interface.__array_interface__ # pylint: disable=no-member
class SetRepresentation(RepresentationBase):
Conjunction = Set_Conjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(Set_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = set(*np.nonzero(sel.covers(self.df)))
sel.size_sg = len(sel.representation)
def patch_classes(self):
Set_Conjunction.all_set = set(self.df.index)
super().patch_classes()
class NumpySet_Conjunction(ps.Conjunction):
all_set = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return the array of all instance indices
if not self._selectors:
return NumpySet_Conjunction.all_set
start = self._selectors[0].representation
for sel in self._selectors[1:]:
start = np.intersect1d(start, sel.representation, assume_unique=True)
return start
@property
def size_sg(self):
return len(self.representation)
#def __copy__(self):
# tmp = super().__copy__()
# tmp.representation = self.representation.copy()
# return tmp
def append_and(self, to_append):
super().append_and(to_append)
#self._selectors.append(to_append)
self.representation = np.intersect1d(self.representation, to_append.representation, True)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class NumpySetRepresentation(RepresentationBase):
Conjunction = NumpySet_Conjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(NumpySet_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = np.nonzero(sel.covers(self.df))[0]
sel.size_sg = len(sel.representation)
def patch_classes(self):
NumpySet_Conjunction.all_set = np.arange(len(self.df))
super().patch_classes()
from functools import partial
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from matplotlib import pyplot as plt
import pysubgroup_mod as ps
def plot_sgbars(result_df, _, ylabel="target share", title="Discovered Subgroups", dynamic_widths=False, _suffix=""):
shares_sg = result_df["target_share_sg"]
shares_compl = result_df["target_share_complement"]
sg_relative_sizes = result_df["relative_size_sg"]
x = np.arange(len(result_df))
base_width = 0.8
if dynamic_widths:
width_sg = 0.02 + base_width * sg_relative_sizes
width_compl = base_width - width_sg
else:
width_sg = base_width / 2
width_compl = base_width / 2
fig, ax = plt.subplots()
rects1 = ax.bar(x, shares_sg, width_sg, align='edge')
rects2 = ax.bar(x + width_sg, shares_compl, width_compl, align='edge', color='#61b76f')
ax.set_ylabel(ylabel)
ax.set_title(title)
ax.set_xticks(x + base_width / 2)
ax.set_xticklabels(result_df.index, rotation=90)
ax.legend((rects1[0], rects2[0]), ('subgroup', 'complement'))
fig.set_size_inches(12, len(result_df))
return fig
def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
instances_dataset = len(data)
positives_dataset = np.max(result_df['positives_dataset'])
negatives_dataset = instances_dataset - positives_dataset
xlist = np.linspace(0.01, 0.99, 100)
ylist = np.linspace(0.01, 0.99, 100)
X, Y = np.meshgrid(xlist, ylist)
f = np.vectorize(partial(qf.evaluate, instances_dataset, positives_dataset), otypes=[float])
Z = f(X * negatives_dataset + Y * positives_dataset, Y * positives_dataset)
max_val = np.max([np.max(Z), -np.min(Z)])
fig, ax = plt.subplots()
cm = plt.cm.get_cmap("bwr")
plt.contourf(X, Y, Z, levels, cmap=cm, vmin=-max_val, vmax=max_val)
for i, sg in result_df.iterrows():
rel_positives_sg = sg['positives_sg'] / positives_dataset
rel_negatives_sg = (sg['size_sg'] - sg['positives_sg']) / negatives_dataset
ax.plot(rel_negatives_sg, rel_positives_sg, 'ro', color='black')
if annotate:
label_margin = 0.01
ax.annotate(str(i), (rel_negatives_sg + label_margin, rel_positives_sg + label_margin))
# plt.colorbar(cp)
plt.title('Discovered subgroups')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
return fig
def plot_npspace(result_df, data, annotate=True, fixed_limits=False):
fig, ax = plt.subplots()
for i, sg in result_df.iterrows():
target_share_sg = sg['target_share_sg']
size_sg = sg['size_sg']
ax.plot(size_sg, target_share_sg, 'ro', color='black')
if annotate:
ax.annotate(str(i), (size_sg + 5, target_share_sg + 0.001))
if fixed_limits:
plt.xlim((0, len(data)))
plt.ylim((0, 1))
plt.title('Discovered subgroups')
plt.xlabel('Size of Subgroup')
plt.ylabel('Target Share Subgroup')
return fig
def plot_distribution_numeric(sg, data, bins):
fig, _ = plt.subplots()
target_values_sg = data[sg.covers(data)][sg.target.get_attributes()].values
target_values_data = data[sg.target.get_attributes()].values
plt.hist(target_values_sg, bins, alpha=0.5, label=str(sg.subgroup_description), density=True)
plt.hist(target_values_data, bins, alpha=0.5, label="Overall Data", density=True)
plt.legend(loc='upper right')
return fig
def compare_distributions_numeric(sgs, data, bins):
fig, _ = plt.subplots()
for sg in sgs:
target_values_sg = data[sg.covers(data)][sg.target.get_attributes()].values
plt.hist(target_values_sg, bins, alpha=0.3, label=str(sg.subgroup_description), density=True)
plt.legend(loc='upper right')
return fig
def similarity_sgs(sgd_results, data, color=True):
sgs = [x[1] for x in sgd_results]
#sgNames = [str(sg.subgroup_description) for sg in sgs]
dists = [[ps.overlap(sg, sg2, data) for sg2 in sgs] for sg in sgs]
dist_df = pd.DataFrame(dists)
if color:
dist_df = dist_df.style.background_gradient()
return dist_df
def similarity_dendrogram(result, data):
fig, _ = plt.subplots()
dist_df = similarity_sgs(result, data, color=False)
mat = 1 - dist_df.values
dists = squareform(mat)
linkage_matrix = linkage(dists, "single")
dendrogram(linkage_matrix, labels=dist_df.index)
return fig
def supportSetVisualization(result, in_order=True, drop_empty=True):
df = result.task.data
n_items = len(result.task.data)
n_SGDs = len(result.results)
covs = np.zeros((n_items, n_SGDs), dtype=bool)
for i, (_, r, _) in enumerate(result.to_subgroups):
covs[:, i] = r.covers(df)
img_arr = covs.copy()
sort_inds_x = np.argsort(np.sum(covs, axis=1))[::-1]
img_arr = img_arr[sort_inds_x, :]
if not in_order:
sort_inds_y = np.argsort(np.sum(covs, axis=0))
img_arr = img_arr[:, sort_inds_y]
if drop_empty:
keep_entities = np.sum(img_arr, axis=1) > 0
print("Discarding {} entities that are not covered".format(n_items - np.count_nonzero(keep_entities)))
img_arr = img_arr[keep_entities, :]
return img_arr.T
IGSD
This repository contains the material referring to the paper: "". It contains:
1. datasets: Directory in which the datasets to be used by the algorithm are stored.
2. results: Directory in which the algorithm will store the results produced.
3. pysubgroup_mod: The project code.
1. IGSD Project Scripts
Contains the scripts of IGSD and of other algorithms such as BeamSearch, DFS, BestFirstSearch, etc. Moreover, main.py is the
principal script file, which launches the chosen algorithm.
The main.py file requires several arguments, so the following command line will execute it (an example invocation is given after the parameter list below):
py main.py --dataname <FILE> --class_column <CLASS_COLUMN> --class_value <CLASS_VALUE> --mode <MODE> --depth <DEPTH> --list_ignore <LIST_IGNORE> --list_conds <LIST_CONDS>
With:
- <FILE>: The name of the dataset input file.
- <CLASS_COLUMN>: The attribute (column) used as target (the studied class).
- <CLASS_VALUE>: The value of <CLASS_COLUMN> that we want to analyze.
- <MODE>: The mode that IGSD employs to perform the analysis when the IG threshold is calculated (dynamic, maximum). If another algorithm is employed, the default value is used.
- <DEPTH>: The number of attributes that the algorithms will consider.
- <LIST_IGNORE>: A list with the attributes (columns) of the dataset that the user does not want to be considered in the analysis.
- <LIST_CONDS>: A list with the attributes (columns) of the dataset that the user wants to be present in the obtained patterns.
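For instance, a hypothetical invocation on a dataset stored as datasets/example.csv could look as follows (the file name, column names and class value are placeholders that must match the actual dataset):

py main.py --dataname example --class_column class --class_value 1 --mode dynamic --depth 3 --list_ignore id --list_conds age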