Commit c4385bc7 authored by aarongitrepos

*/__pycache__/
import os, sys
import pandas as pd
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)
sys.path.append(currentdir)
import pysubgroup_mod as ps
import argparse
import numpy as np
def eliminate_reps(elem,l):
l.remove(elem)
for li in l:
if elem in li:
#if set(elem.split(" AND ")).issubset(li.split(" AND ")):
#if ', '.join(map(str,elem)) in ', '.join(map(str,li)):
return False
return True
def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="default",depth=5,list_ignore=[],list_conds=[]):
"""
Parameters
----------
dataname: string
The name of the dataset, located in the datasets directory, to be used for the analysis.
class_column: string
A column of the dataset that will be used as target.
class_value: string or int
A value of class_column. It corresponds to the condition that has to be met, i.e., class_column == class_value.
If class_column holds integers, the value is converted to int internally.
mode_parameter: string, optional (default="default")
This parameter controls the behaviour of the search. The InfoGained algorithm requires either the
"dynamic" or the "maximum" option. For the other algorithms the parameter keeps the value "default".
(possible values: dynamic, maximum, default)
depth: int, optional (default=5)
This parameter indicates the maximum number of variables that can be added to a rule.
list_ignore: list of strings, optional (default=[])
List containing the column names that will not be used during the search.
list_conds: list of strings, optional (default=[])
List containing the column names that must appear in the rules. It only works with the InfoGained algorithm.
"""
df = pd.read_csv(currentdir+"/datasets/"+dataname+".csv",index_col=[0])
if class_column not in df.columns:
print("No class column")
exit(0)
if df[class_column].dtype in [np.int16, np.int32, np.int64]:
class_value = int(class_value)
if class_value not in df[class_column].unique():
print("The class_value specified is not an option")
exit(0)
target = ps.BinaryTarget (class_column, class_value)
searchspace = ps.create_selectors(df, ignore=list_ignore)
mode_parameter = {'dynamic' : 0, 'maximum': 1, "default":2}[mode_parameter]
task = ps.SubgroupDiscoveryTask (
df,
target,
searchspace,
mode=mode_parameter,
depth=depth,
filter_vars = list_conds,
qf=ps.WRAccQF())
result, result_cut = ps.InfoGainedSearch().execute(task)
#df_result = result.to_dataframe()
df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
df_result_cut.drop_duplicates(inplace=True)
df_result_cut.reset_index(drop=True,inplace=True)
#df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
df_result_cut["target"] = [class_value] * df_result_cut.shape[0]
#route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
""" dir_type = "max"
if mode_parameter == 0:
dir_type = "threshold"
route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """
route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
df_result_cut.to_csv(route, encoding="UTF-8",index=True)
if __name__ == "__main__":
#list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
# Cancer_stage, FirstTreatment
parser=argparse.ArgumentParser()
parser.add_argument('--dataname', type=str, required=True)
parser.add_argument('--class_column', type=str, required=True)
parser.add_argument('--class_value', type=str, required=True)
parser.add_argument('--mode', type=str, choices=["dynamic","maximum","default"], default="default")
parser.add_argument('--depth', type=int, required=True)
parser.add_argument("--list_ignore", nargs="*", type=str, default=[])
parser.add_argument("--list_conds", nargs="*", type=str, default=[])
args = parser.parse_args()
info_gained_algorithm(args.dataname,args.class_column,args.class_value,args.mode,args.depth,args.list_ignore,args.list_conds)
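# Programmatic usage sketch (hypothetical, kept as a comment so it never runs on import):
# the dataset name "example", the target column "class", its value "1" and the column
# names "id" and "age" are placeholders and assume a file datasets/example.csv exists
# with those columns.
#
#   info_gained_algorithm("example", "class", "1",
#                         mode_parameter="dynamic", depth=3,
#                         list_ignore=["id"], list_conds=["age"])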
from pysubgroup_mod.subgroup_description import *
from pysubgroup_mod.algorithms import *
from pysubgroup_mod.measures import *
from pysubgroup_mod.utils import *
from pysubgroup_mod.binary_target import *
from pysubgroup_mod.numeric_target import *
from pysubgroup_mod.fi_target import *
from pysubgroup_mod.visualization import *
from pysubgroup_mod.refinement_operator import *
from pysubgroup_mod.representations import *
from pysubgroup_mod.constraints import *
import pysubgroup_mod as ps
class MinSupportConstraint:
def __init__(self, min_support):
self.min_support = min_support
@property
def is_monotone(self):
return True
def is_satisfied(self, subgroup, statistics=None, data=None):
if hasattr(statistics, 'size'):
return statistics.size >= self.min_support
elif hasattr(statistics, 'size_sg'):
return statistics.size_sg >= self.min_support
else:
return ps.get_size(subgroup, len(data), data) >= self.min_support
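# Usage sketch (hypothetical, kept as a comment): it assumes EqualitySelector is re-exported
# by pysubgroup_mod as in upstream pysubgroup, and uses a toy DataFrame with placeholder columns.
#
#   import pandas as pd
#   df = pd.DataFrame({"age": [25, 40, 60, 35, 52],
#                      "smoker": ["yes", "no", "yes", "yes", "no"]})
#   constraint = MinSupportConstraint(min_support=2)
#   sg = ps.EqualitySelector("smoker", "yes")
#   # With no cached statistics, is_satisfied falls back to counting the cover via ps.get_size.
#   constraint.is_satisfied(sg, statistics=None, data=df)   # True: the subgroup covers 3 rows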
'''
Created on 29.09.2017
@author: lemmerfn
'''
from collections import namedtuple
from functools import total_ordering
import pysubgroup_mod as ps
@total_ordering
class FITarget:
statistic_types = ('size_sg', 'size_dataset')
def __repr__(self):
return "T: Frequent Itemsets"
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __lt__(self, other):
return str(self) < str(other)
def get_attributes(self):
return []
def get_base_statistics(self, subgroup, data):
_, size = ps.get_cover_array_and_size(subgroup, len(data), data)
return size
def calculate_statistics(self, subgroup_description, data, cached_statistics=None):
if cached_statistics is None or not isinstance(cached_statistics, dict):
statistics = dict()
elif all(k in cached_statistics for k in FITarget.statistic_types):
return cached_statistics
else:
statistics = cached_statistics
_, size = ps.get_cover_array_and_size(subgroup_description, len(data), data)
statistics['size_sg'] = size
statistics['size_dataset'] = len(data)
return statistics
class SimpleCountQF(ps.AbstractInterestingnessMeasure):
tpl = namedtuple('CountQF_parameters', ('subgroup_size'))
def __init__(self):
self.required_stat_attrs = ('subgroup_size',)
self.has_constant_statistics = True
self.size_dataset = None
def calculate_constant_statistics(self, data, target):
self.size_dataset = len(data)
def calculate_statistics(self, subgroup_description, target, data, statistics=None):
_, size = ps.get_cover_array_and_size(subgroup_description, self.size_dataset, data)
return SimpleCountQF.tpl(size)
def gp_get_stats(self, _):
return {"subgroup_size" : 1}
def gp_get_null_vector(self):
return {"subgroup_size":0}
def gp_merge(self, l, r):
l["subgroup_size"] += r["subgroup_size"]
def gp_get_params(self, _cover_arr, v):
return SimpleCountQF.tpl(v['subgroup_size'])
def gp_to_str(self, stats):
return str(stats['subgroup_size'])
@property
def gp_requires_cover_arr(self):
return False
class CountQF(SimpleCountQF, ps.BoundedInterestingnessMeasure):
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size
def optimistic_estimate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size
class AreaQF(SimpleCountQF):
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_size * subgroup.depth
'''
Created on 28.04.2016
@author: lemmerfn
'''
from abc import ABC, abstractmethod
from collections import namedtuple
from itertools import combinations
import numpy as np
import pysubgroup_mod as ps
class AbstractInterestingnessMeasure(ABC):
# pylint: disable=no-member
def ensure_statistics(self, subgroup, target, data, statistics=None):
if not self.has_constant_statistics:
self.calculate_constant_statistics(data, target)
if any(not hasattr(statistics, attr) for attr in self.required_stat_attrs):
if getattr(subgroup, 'statistics', False):
return subgroup.statistics
else:
return self.calculate_statistics(subgroup, target, data, statistics)
return statistics
# pylint: enable=no-member
#def optimistic_estimate_from_dataset(self, data, subgroup, weighting_attribute=None): #pylint: disable=unused-argument
# return float("inf")
class BoundedInterestingnessMeasure(AbstractInterestingnessMeasure):
pass
#@abstractmethod
#def optimistic_estimate_from_dataset(self, data, subgroup, weighting_attribute=None):
# pass
#####
# FIX ME: This is currently not working anymore
#####
class CombinedInterestingnessMeasure(BoundedInterestingnessMeasure):
def __init__(self, measures, weights=None):
self.measures = measures
if weights is None:
weights = [1] * len(measures)
assert len(weights) == len(measures)
self.weights = weights
def calculate_constant_statistics(self, data, target):
pass
def calculate_statistics(self, subgroup, target, data, cached_statistics=None):
pass
def evaluate(self, subgroup, target, data, statistics=None):
#FIX USE of constant statistics
return np.dot([m.evaluate(subgroup, target, data, None) for m in self.measures], self.weights)
def optimistic_estimate(self, subgroup, target, data, statistics=None):
# FIX USE of constant statistics
return np.dot([m.optimistic_estimate(subgroup, target, data, None) for m in self.measures], self.weights)
def evaluate_from_statistics(self, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup):
return np.dot([m.evaluate_from_statistics(instances_dataset, positives_dataset, instances_subgroup, positives_subgroup) for m in self.measures], self.weights)
#def optimistic_estimate_from_statistics(self, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup):
# return np.dot(
# [m.evaluate_from_statistics(instances_dataset, positives_dataset, instances_subgroup, positives_subgroup) for m in self.measures],
# self.weights)
##########
# Filter
##########
def unique_attributes(result_set, data):
result = []
used_attributes = []
for (q, sg) in result_set:
atts = sg.subgroup_description.get_attributes()
if atts not in used_attributes or all([ps.is_categorical_attribute(data, x) for x in atts]):
result.append((q, sg))
used_attributes.append(atts)
return result
def minimum_statistic_filter(result_set, statistic, minimum, data):
result = []
for (q, sg) in result_set:
if len(sg.statistics) == 0:
sg.calculate_statistics(data)
if sg.statistics[statistic] >= minimum:
result.append((q, sg))
return result
def minimum_quality_filter(result_set, minimum):
result = []
for (q, sg) in result_set:
if q >= minimum:
result.append((q, sg))
return result
def maximum_statistic_filter(result_set, statistic, maximum):
result = []
for (q, sg) in result_set:
if sg.statistics[statistic] <= maximum:
result.append((q, sg))
return result
def overlap_filter(result_set, data, similarity_level=0.9):
result = []
result_sgs = []
for (q, sg) in result_set:
if not overlaps_list(sg, result_sgs, data, similarity_level):
result_sgs.append(sg)
result.append((q, sg))
return result
def overlaps_list(sg, list_of_sgs, data, similarity_level=0.9):
for anotherSG in list_of_sgs:
if ps.overlap(sg, anotherSG, data) > similarity_level:
return True
return False
class CountCallsInterestingMeasure(BoundedInterestingnessMeasure):
def __init__(self, qf):
self.qf = qf
self.calls = 0
def calculate_statistics(self, sg, target, data, statistics=None):
self.calls += 1
return self.qf.calculate_statistics(sg, target, data, statistics)
def __getattr__(self, name):
return getattr(self.qf, name)
def __hasattr__(self, name):
return hasattr(self.qf, name)
#####
# GeneralizationAware Interestingness Measures
#####
class GeneralizationAwareQF(AbstractInterestingnessMeasure):
ga_tuple = namedtuple('ga_tuple', ['subgroup_quality', 'generalisation_quality'])
def __init__(self, qf):
self.qf = qf
# this cache maps the representation of descriptions to tuples
# the first entry is the quality and the second one is
# the largest quality of all its predecessors
self.cache = {}
self.has_constant_statistics = False
self.required_stat_attrs = ['subgroup_quality', 'generalisation_quality']
self.q0 = 0
def calculate_constant_statistics(self, data, target):
self.cache = {}
self.qf.calculate_constant_statistics(data, target)
self.q0 = self.qf.evaluate(slice(None), target, data)
self.has_constant_statistics = self.qf.has_constant_statistics
def calculate_statistics(self, subgroup, target, data, statistics=None):
sg_repr = repr(subgroup)
if sg_repr in self.cache:
return GeneralizationAwareQF.ga_tuple(*self.cache[sg_repr])
else:
(q_sg, q_prev) = self.get_qual_and_previous_qual(subgroup, target, data)
self.cache[sg_repr] = (q_sg, q_prev)
return GeneralizationAwareQF.ga_tuple(q_sg, q_prev)
def get_qual_and_previous_qual(self, subgroup, target, data):
q_subgroup = self.qf.evaluate(subgroup, target, data)
max_q = 0
selectors = subgroup.selectors
if len(selectors) > 0:
# compute quality of all generalizations
generalizations = combinations(selectors, len(selectors)-1)
for sels in generalizations:
sgd = ps.Conjunction(list(sels))
(q_sg, q_prev) = self.calculate_statistics(sgd, target, data)
max_q = max(max_q, q_sg, q_prev)
return (q_subgroup, max_q)
def evaluate(self, subgroup, target, data, statistics=None):
statistics = self.ensure_statistics(subgroup, target, data, statistics)
return statistics.subgroup_quality - statistics.generalisation_quality
#####
# GeneralizationAware Interestingness Measures
#####
class GeneralizationAwareQF_stats(AbstractInterestingnessMeasure):
ga_tuple = namedtuple('ga_stats_tuple', ['subgroup_stats', 'generalisation_stats'])
def __init__(self, qf):
self.qf = qf
# this cache maps the representation of descriptions to tuples
# the first entry is the quality and the second one is
# the largest quality of all its predecessors
self.cache = {}
self.has_constant_statistics = False
self.required_stat_attrs = GeneralizationAwareQF_stats.ga_tuple._fields
self.stats0 = None
def calculate_constant_statistics(self, data, target):
self.cache = {}
self.qf.calculate_constant_statistics(data, target)
self.stats0 = self.qf.calculate_statistics(slice(None), target, data)
self.has_constant_statistics = self.qf.has_constant_statistics
def calculate_statistics(self, subgroup, target, data, statistics=None):
sg_repr = repr(subgroup)
if sg_repr in self.cache:
return GeneralizationAwareQF_stats.ga_tuple(*self.cache[sg_repr])
else:
(stats_sg, stats_prev) = self.get_stats_and_previous_stats(subgroup, target, data)
self.cache[sg_repr] = (stats_sg, stats_prev)
return GeneralizationAwareQF_stats.ga_tuple(stats_sg, stats_prev)
def get_stats_and_previous_stats(self, subgroup, target, data):
stats_subgroup = self.qf.calculate_statistics(subgroup, target, data)
max_stats = self.stats0
selectors = subgroup.selectors
if len(selectors) > 0:
# compute quality of all generalizations
generalizations = combinations(selectors, len(selectors)-1)
for sels in generalizations:
sgd = ps.Conjunction(list(sels))
(stats_sg, stats_prev) = self.calculate_statistics(sgd, target, data)
max_stats = self.get_max(max_stats, stats_sg, stats_prev)
return (stats_subgroup, max_stats)
def evaluate(self, subgroup, statistics_or_data=None):
raise NotImplementedError
def get_max(self, *args):
raise NotImplementedError
from collections import namedtuple
from scipy.stats import norm
import numpy as np
import pysubgroup_mod as ps
beta_tuple = namedtuple('beta_tuple', ['beta', 'size'])
class EMM_Likelihood(ps.AbstractInterestingnessMeasure):
tpl = namedtuple('EMM_Likelihood', ['model_params', 'subgroup_likelihood', 'inverse_likelihood', 'size'])
def __init__(self, model):
self.model = model
self.has_constant_statistics = False
self.required_stat_attrs = EMM_Likelihood.tpl._fields
self.data_size = None
def calculate_constant_statistics(self, task):
self.model.calculate_constant_statistics(task)
self.data_size = len(task.data)
self.has_constant_statistics = True
def calculate_statistics(self, subgroup, data=None):
cover_arr, sg_size = ps.get_cover_array_and_size(subgroup, self.data_size, data)
params = self.model.fit(cover_arr, data)
return self.get_tuple(sg_size, params, cover_arr)
def get_tuple(self, sg_size, params, cover_arr):
#numeric stability?
all_likelihood = self.model.likelihood(params, np.ones(self.data_size, dtype=bool))
sg_likelihood_sum = np.sum(all_likelihood[cover_arr])
total_likelihood_sum = np.sum(all_likelihood)
dataset_average = np.nan
if (self.data_size - sg_size) > 0:
dataset_average = (total_likelihood_sum - sg_likelihood_sum)/(self.data_size - sg_size)
sg_average = np.nan
if sg_size > 0:
sg_average = sg_likelihood_sum/sg_size
return EMM_Likelihood.tpl(params, sg_average, dataset_average, sg_size)
def evaluate(self, subgroup, statistics=None):
statistics = self.ensure_statistics(subgroup, statistics)
#numeric stability?
return statistics.subgroup_likelihood - statistics.inverse_likelihood
def gp_get_params(self, cover_arr, v):
params = self.model.gp_get_params(v)
sg_size = params.size
return self.get_tuple(sg_size, params, cover_arr)
def supports_weights(self):
return False
def is_applicable(self, _):
return True
def __getattr__(self, name):
return getattr(self.model, name)
class PolyRegression_ModelClass:
def __init__(self, x_name='x', y_name='y', degree=1):
self.x_name = x_name
self.y_name = y_name
if degree != 1:
raise ValueError('Currently only degree == 1 is supported')
self.degree = degree
self.x = None
self.y = None
self.has_constant_statistics = True
super().__init__()
def calculate_constant_statistics(self, task):
data = task.data
self.x = data[self.x_name].to_numpy()
self.y = data[self.y_name].to_numpy()
self.has_constant_statistics = True
@staticmethod
def gp_merge(u, v):
v0 = v[0]
u0 = u[0]
if v0 == 0 or u0 == 0:
d = 0
else:
d = v0 * u0/(v0 + u0)*(v[1]/v0 - u[1]/u0)*(v[2]/v0 - u[2]/u0)
u += v
u[3] += d
def gp_get_null_vector(self):
return np.zeros(5)
def gp_get_stats(self, row_index):
x = self.x[row_index]
return np.array([1, x, self.y[row_index], 0, x*x])
def gp_get_params(self, v):
size = v[0]
if size < self.degree:
return beta_tuple(np.full(self.degree + 1, np.nan), size)
v1 = v[1]
slope = v[0] * v[3] / (v[0]*v[4] - v1 * v1)
intercept = v[2]/v[0] - slope * v[1]/v[0]
return beta_tuple(np.array([slope, intercept]), v[0])
def fit(self, subgroup, data=None):
cover_arr, size = ps.get_cover_array_and_size(subgroup, len(self.x), data)
if size <= self.degree + 1:
return beta_tuple(np.full(self.degree + 1, np.nan), size)
return beta_tuple(np.polyfit(self.x[cover_arr], self.y[cover_arr], deg=self.degree), size)
def likelihood(self, stats, sg):
if any(np.isnan(stats.beta)):
return np.full(self.x[sg].shape, np.nan)
return norm.pdf(np.polyval(stats.beta, self.x[sg]) - self.y[sg])
def loglikelihood(self, stats, sg):
return norm.logpdf(np.polyval(stats.beta, self.x[sg]) - self.y[sg])
import pysubgroup_mod as ps
from collections import defaultdict
from itertools import chain
class RefinementOperator:
pass
class StaticSpecializationOperator:
def __init__(self, selectors):
search_space_dict = defaultdict(list)
for selector in selectors:
search_space_dict[selector.attribute_name].append(selector)
self.search_space = list(search_space_dict.values())
self.search_space_index = {key: i for i, key in enumerate(search_space_dict.keys())}
def refinements(self, subgroup):
if subgroup.depth > 0:
index_of_last = self.search_space_index[subgroup._selectors[-1].attribute_name]
new_selectors = chain.from_iterable(self.search_space[index_of_last + 1:])
else:
new_selectors = chain.from_iterable(self.search_space)
return (subgroup & sel for sel in new_selectors)
class StaticGeneralizationOperator:
def __init__(self, selectors):
self.search_space = selectors
def refinements(self, sG):
index_of_last_selector = min(self.search_space.index(sG._selectors[-1]), len(self.search_space) - 1)
new_selectors = self.search_space[index_of_last_selector + 1:]
return (sG | sel for sel in new_selectors)
import numpy as np
import pysubgroup_mod as ps
class RepresentationBase():
def __init__(self, new_conjunction, selectors_to_patch):
self._new_conjunction = new_conjunction
self.previous_conjunction = None
self.selectors_to_patch = selectors_to_patch
def patch_all_selectors(self):
for sel in self.selectors_to_patch:
self.patch_selector(sel)
def patch_selector(self, sel):
raise NotImplementedError
def patch_classes(self):
pass
def undo_patch_classes(self):
pass
def __enter__(self):
self.patch_classes()
self.patch_all_selectors()
return self
def __exit__(self, * args):
self.undo_patch_classes()
class BitSet_Conjunction(ps.Conjunction):
n_instances = 0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return a list of all '1's
if not self._selectors:
return np.full(BitSet_Conjunction.n_instances, True, dtype=bool)
# non-empty description
return np.all([sel.representation for sel in self._selectors], axis=0)
@property
def size_sg(self):
return np.count_nonzero(self.representation)
def append_and(self, to_append):
super().append_and(to_append)
self.representation = np.logical_and(self.representation, to_append.representation)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class BitSet_Disjunction(ps.Disjunction):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return a list of all '0's (an empty disjunction covers no instance)
if not self._selectors:
return np.full(BitSet_Conjunction.n_instances, False, dtype=bool)
# non-empty description
return np.any([sel.representation for sel in self._selectors], axis=0)
@property
def size_sg(self):
return np.count_nonzero(self.representation)
def append_or(self, to_append):
super().append_or(to_append)
self.representation = np.logical_or(self.representation, to_append.representation)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class BitSetRepresentation(RepresentationBase):
Conjunction = BitSet_Conjunction
Disjunction = BitSet_Disjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(BitSet_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = sel.covers(self.df)
sel.size_sg = np.count_nonzero(sel.representation)
def patch_classes(self):
BitSet_Conjunction.n_instances = len(self.df)
super().patch_classes()
class Set_Conjunction(ps.Conjunction):
all_set = set()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
self.arr_for_interface = np.array(list(self.representation), dtype=int)
def compute_representation(self):
# empty description ==> return the set of all instance indices
if not self._selectors:
return Set_Conjunction.all_set
# non-empty description
return set.intersection(*[sel.representation for sel in self._selectors])
@property
def size_sg(self):
return len(self.representation)
#def __copy__(self):
# tmp = super().__copy__()
# tmp.representation = self.representation.copy()
# return tmp
def append_and(self, to_append):
super().append_and(to_append)
self.representation = self.representation.intersection(to_append.representation)
self.arr_for_interface = np.array(list(self.representation), dtype=int)
@property
def __array_interface__(self):
return self.arr_for_interface.__array_interface__ # pylint: disable=no-member
class SetRepresentation(RepresentationBase):
Conjunction = Set_Conjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(Set_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = set(*np.nonzero(sel.covers(self.df)))
sel.size_sg = len(sel.representation)
def patch_classes(self):
Set_Conjunction.all_set = set(self.df.index)
super().patch_classes()
class NumpySet_Conjunction(ps.Conjunction):
all_set = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.representation = self.compute_representation()
def compute_representation(self):
# empty description ==> return the array of all instance indices
if not self._selectors:
return NumpySet_Conjunction.all_set
start = self._selectors[0].representation
for sel in self._selectors[1:]:
start = np.intersect1d(start, sel.representation, assume_unique=True)
return start
@property
def size_sg(self):
return len(self.representation)
#def __copy__(self):
# tmp = super().__copy__()
# tmp.representation = self.representation.copy()
# return tmp
def append_and(self, to_append):
super().append_and(to_append)
#self._selectors.append(to_append)
self.representation = np.intersect1d(self.representation, to_append.representation, True)
@property
def __array_interface__(self):
return self.representation.__array_interface__
class NumpySetRepresentation(RepresentationBase):
Conjunction = NumpySet_Conjunction
def __init__(self, df, selectors_to_patch):
self.df = df
super().__init__(NumpySet_Conjunction, selectors_to_patch)
def patch_selector(self, sel):
sel.representation = np.nonzero(sel.covers(self.df))[0]
sel.size_sg = len(sel.representation)
def patch_classes(self):
NumpySet_Conjunction.all_set = np.arange(len(self.df))
super().patch_classes()
from functools import partial
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from matplotlib import pyplot as plt
import pysubgroup_mod as ps
def plot_sgbars(result_df, _, ylabel="target share", title="Discovered Subgroups", dynamic_widths=False, _suffix=""):
shares_sg = result_df["target_share_sg"]
shares_compl = result_df["target_share_complement"]
sg_relative_sizes = result_df["relative_size_sg"]
x = np.arange(len(result_df))
base_width = 0.8
if dynamic_widths:
width_sg = 0.02 + base_width * sg_relative_sizes
width_compl = base_width - width_sg
else:
width_sg = base_width / 2
width_compl = base_width / 2
fig, ax = plt.subplots()
rects1 = ax.bar(x, shares_sg, width_sg, align='edge')
rects2 = ax.bar(x + width_sg, shares_compl, width_compl, align='edge', color='#61b76f')
ax.set_ylabel(ylabel)
ax.set_title(title)
ax.set_xticks(x + base_width / 2)
ax.set_xticklabels(result_df.index, rotation=90)
ax.legend((rects1[0], rects2[0]), ('subgroup', 'complement'))
fig.set_size_inches(12, len(result_df))
return fig
def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
instances_dataset = len(data)
positives_dataset = np.max(result_df['positives_dataset'])
negatives_dataset = instances_dataset - positives_dataset
xlist = np.linspace(0.01, 0.99, 100)
ylist = np.linspace(0.01, 0.99, 100)
X, Y = np.meshgrid(xlist, ylist)
f = np.vectorize(partial(qf.evaluate, instances_dataset, positives_dataset), otypes=[float])
Z = f(X * negatives_dataset + Y * positives_dataset, Y * positives_dataset)
max_val = np.max([np.max(Z), -np.min(Z)])
fig, ax = plt.subplots()
cm = plt.cm.get_cmap("bwr")
plt.contourf(X, Y, Z, levels, cmap=cm, vmin=-max_val, vmax=max_val)
for i, sg in result_df.iterrows():
rel_positives_sg = sg['positives_sg'] / positives_dataset
rel_negatives_sg = (sg['size_sg'] - sg['positives_sg']) / negatives_dataset
ax.plot(rel_negatives_sg, rel_positives_sg, 'ro', color='black')
if annotate:
label_margin = 0.01
ax.annotate(str(i), (rel_negatives_sg + label_margin, rel_positives_sg + label_margin))
# plt.colorbar(cp)
plt.title('Discovered subgroups')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
return fig
def plot_npspace(result_df, data, annotate=True, fixed_limits=False):
fig, ax = plt.subplots()
for i, sg in result_df.iterrows():
target_share_sg = sg['target_share_sg']
size_sg = sg['size_sg']
ax.plot(size_sg, target_share_sg, 'ro', color='black')
if annotate:
ax.annotate(str(i), (size_sg + 5, target_share_sg + 0.001))
if fixed_limits:
plt.xlim((0, len(data)))
plt.ylim((0, 1))
plt.title('Discovered subgroups')
plt.xlabel('Size of Subgroup')
plt.ylabel('Target Share Subgroup')
return fig
def plot_distribution_numeric(sg, data, bins):
fig, _ = plt.subplots()
target_values_sg = data[sg.covers(data)][sg.target.get_attributes()].values
target_values_data = data[sg.target.get_attributes()].values
plt.hist(target_values_sg, bins, alpha=0.5, label=str(sg.subgroup_description), density=True)
plt.hist(target_values_data, bins, alpha=0.5, label="Overall Data", density=True)
plt.legend(loc='upper right')
return fig
def compare_distributions_numeric(sgs, data, bins):
fig, _ = plt.subplots()
for sg in sgs:
target_values_sg = data[sg.covers(data)][sg.target.get_attributes()].values
plt.hist(target_values_sg, bins, alpha=0.3, label=str(sg.subgroup_description), density=True)
plt.legend(loc='upper right')
return fig
def similarity_sgs(sgd_results, data, color=True):
sgs = [x[1] for x in sgd_results]
#sgNames = [str(sg.subgroup_description) for sg in sgs]
dists = [[ps.overlap(sg, sg2, data) for sg2 in sgs] for sg in sgs]
dist_df = pd.DataFrame(dists)
if color:
dist_df = dist_df.style.background_gradient()
return dist_df
def similarity_dendrogram(result, data):
fig, _ = plt.subplots()
dist_df = similarity_sgs(result, data, color=False)
mat = 1 - dist_df.values
dists = squareform(mat)
linkage_matrix = linkage(dists, "single")
dendrogram(linkage_matrix, labels=dist_df.index)
return fig
def supportSetVisualization(result, in_order=True, drop_empty=True):
df = result.task.data
n_items = len(result.task.data)
n_SGDs = len(result.results)
covs = np.zeros((n_items, n_SGDs), dtype=bool)
for i, (_, r, _) in enumerate(result.to_subgroups):
covs[:, i] = r.covers(df)
img_arr = covs.copy()
sort_inds_x = np.argsort(np.sum(covs, axis=1))[::-1]
img_arr = img_arr[sort_inds_x, :]
if not in_order:
sort_inds_y = np.argsort(np.sum(covs, axis=0))
img_arr = img_arr[:, sort_inds_y]
if drop_empty:
keep_entities = np.sum(img_arr, axis=1) > 0
print("Discarding {} entities that are not covered".format(n_items - np.count_nonzero(keep_entities)))
img_arr = img_arr[keep_entities, :]
return img_arr.T
IGSD
This repository contains the material referring to the paper: "". It contains:
1. datasets: Directory in which the datasets to be used by the algorithm are stored.
2. results: Directory in which the algorithm will store the results produced.
3. pysubgroup_mod: The project code.
1. IGSD Project Scripts
Contains the scripts of IGSD and of other algorithms such as BeamSearch, DFS, BestFirstSearch, etc. Moreover, main.py is the
principal script file, which launches the chosen algorithm.
The main.py file requires several arguments, so the following command line will execute it (an example invocation is given after the parameter list below):
py main.py --dataname <FILE> --class_column <CLASS_COLUMN> --class_value <CLASS_VALUE> --mode <MODE> --depth <DEPTH> --list_ignore <LIST_IGNORE> --list_conds <LIST_CONDS>
With:
- <FILE>: The name of the dataset input file.
- <CLASS_COLUMN>: The attribute (column) used as target (the studied class).
- <CLASS_VALUE>: The value of <CLASS_COLUMN> that we want to analyze.
- <MODE>: The mode that IGSD employs to perform the analysis when the IG threshold is calculated (dynamic, maximum). If another algorithm is employed, the default value is used.
- <DEPTH>: The number of attributes that the algorithms will consider.
- <LIST_IGNORE>: A list with the attributes (columns) of the dataset that the user does not want to be considered in the analysis.
- <LIST_CONDS>: A list with the attributes (columns) of the dataset that the user wants to be present in the obtained patterns.
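For instance, a hypothetical invocation on a dataset stored as datasets/example.csv could look as follows (the file name, column names and class value are placeholders that must match the actual dataset):

py main.py --dataname example --class_column class --class_value 1 --mode dynamic --depth 3 --list_ignore id --list_conds age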