changes

c79e3a1d · AARON GARCIA MUÑIZ · e43c1474 · c79e3a1d · c79e3a1d · c79e3a1d
Commit c79e3a1d authored Jun 13, 2024 by AARON GARCIA MUÑIZ
21 changed files
--- a/main.py
+++ b/main.py
@@ -64,31 +64,31 @@ def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="defa
        searchspace,
        mode=mode_parameter, 
        depth=depth,
-        filter_vars = list_conds, 
+        filter_vars = list_conds,
+        min_quality = 0,
        qf=ps.WRAccQF())
+    
+    result = ps.BeamSearch().execute(task)
+    df_result = result.to_dataframe(mode=mode_parameter)

-    result, result_cut = ps.InfoGainedSearch().execute(task)
-    #df_result = result.to_dataframe()
-    df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
+    # result, result_cut = ps.BeamSearch().execute(task)
+    # #df_result = result.to_dataframe()
+    # df_result_cut = result_cut.to_dataframe(mode=mode_parameter)

-    df_result_cut.drop_duplicates(inplace=True)
-    df_result_cut.reset_index(drop=True,inplace=True)
+    # df_result_cut.drop_duplicates(inplace=True)
+    # df_result_cut.reset_index(drop=True,inplace=True)

-    #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
-    df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
-    df_result_cut["target"] = [class_value] * df_result_cut.shape[0]
-    #route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
-    """ dir_type = "max"
+    # #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
+    # df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
+    df_result["target"] = [class_value] * df_result.shape[0]
+    route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
+    dir_type = "max"
    if mode_parameter == 0:
        dir_type = "threshold"
-    route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """
-    route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
-    df_result_cut.to_csv(route, encoding="UTF-8",index=True)
+    df_result.to_csv(route, encoding="UTF-8",index=True)


 if __name__ == "__main__":
-    #list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
-    # Cancer_stage, FirstTreatment

    parser=argparse.ArgumentParser()


--- a/pysubgroup_mod/__pycache__/__init__.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/__init__.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/algorithms.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/algorithms.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc
+++ b/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc
--- a/pysubgroup_mod/__pycache__/binary_target.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/binary_target.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc
+++ b/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc
--- a/pysubgroup_mod/__pycache__/constraints.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/constraints.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/fi_target.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/fi_target.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/measures.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/measures.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/numeric_target.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/numeric_target.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/refinement_operator.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/refinement_operator.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc
+++ b/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc
--- a/pysubgroup_mod/__pycache__/representations.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/representations.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/subgroup_description.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/subgroup_description.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/utils.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/utils.cpython-38.pyc
--- a/pysubgroup_mod/__pycache__/utils.cpython-39.pyc
+++ b/pysubgroup_mod/__pycache__/utils.cpython-39.pyc
--- a/pysubgroup_mod/__pycache__/visualization.cpython-38.pyc
+++ b/pysubgroup_mod/__pycache__/visualization.cpython-38.pyc
--- a/pysubgroup_mod/algorithms.py
+++ b/pysubgroup_mod/algorithms.py
@@ -311,8 +311,8 @@ class BeamSearch:
                            new_selectors.append(sel)
                            sg = ps.Conjunction(new_selectors,task.mode)
                            statistics = task.qf.calculate_statistics(sg, task.target, task.data)
-                            quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics)
-                            #p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
+                            #quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics)
+                            quality = task.qf.evaluate(sg, task.target, task.data, statistics)                            
                            ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics)
            depth += 1
 # TODO make sure there is no bug here
@@ -342,7 +342,7 @@ class InfoGainedSearch:
        depth = 0
        start = time()
        while beam != last_beam and depth < task.depth:
-            #print(depth)
+            print(depth)
            last_beam = beam.copy()
            beam.clear() # List used to save all the candidates of iteration n.
            smt = False # Parameter used to control if there are something in beam list
@@ -351,7 +351,7 @@ class InfoGainedSearch:
                for sel in task.search_space:
                    # Generate a sg using the parents labels + possible labels.
                    new_selectors = list(last_sg.selectors)
-                    if sel not in new_selectors: # A sg can not contain 2 same selectors.
+                    if sel not in new_selectors: # A sg can not contain 2 same labels.
                        new_selectors.append(sel) # New sg generated
                        sg = ps.Conjunction(new_selectors,task.mode)                          
                        statistics = task.qf.calculate_statistics(sg, task.target, task.data) ## Calculate some stats
@@ -365,6 +365,8 @@ class InfoGainedSearch:
                            # If there are no elements in beam yet, add the element. Otherwise, check whether the
                            # newly generated sg is already in the list (i.e. the same sg with its labels in a different order).
                            if smt is False or sorted(new_selectors) not in [sorted(elem[1]._selectors) for elem in beam]:
+                                # If wracc stat < 0, then the generated sg is not taken into account.
+                                #if quality >= 0:
                                p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
                                aux_beam.append((quality_l.copy() + [quality], sg, stats_l.copy() + [statistics], info_l.copy() + [info_gain], odd_l.copy() + [odd_v],sel_idx, pvalue_l.copy() + [p_value]))  
                # After adding all the candidates that satisfy the conditions
@@ -384,6 +386,7 @@ class InfoGainedSearch:
            if time()-start>task.timeout:
                break 
            depth += 1
+# TODO make sure there is no bug here
        
        beam_cut = [] # Final gropus cutted.
        for elem in beam:

--- a/pysubgroup_mod/binary_target.py
+++ b/pysubgroup_mod/binary_target.py
@@ -223,7 +223,9 @@ class StandardQF(SimplePositivesQF, ps.BoundedInterestingnessMeasure):
    def standard_qf(subg,a, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup, measures):
        
        if not hasattr(instances_subgroup, '__array_interface__') and (instances_subgroup == 0):
-            return np.nan, np.nan, np.nan
+            if measures is True:
+                return np.nan, np.nan, np.nan
+            return np.nan
        p_subgroup = np.divide(positives_subgroup, instances_subgroup)
        #if instances_subgroup == 0:
        #    return 0

--- a/pysubgroup_mod/utils.py
+++ b/pysubgroup_mod/utils.py
+'''
+Created on 02.05.2016

+@author: lemmerfn
+'''
 import itertools
 from functools import partial
 from heapq import heappush, heappop
@@ -9,13 +13,12 @@ import pandas as pd
 import pysubgroup_mod as ps
 from math import sqrt

-# Function that calculate entrophy
+## Added
 def calculate_entriopia(x):
    if x in [0.0,1.0]:
        return 0
    return -x*math.log(x,2) - (1-x)*math.log(1-x,2)

-# Function that calculate information gained for a subgroup
 def calculate_info_gained(ID,IS,PD,PS):
    a = ID - IS
    b = PD - PS
@@ -28,7 +31,6 @@ def calculate_info_gained(ID,IS,PD,PS):
        p3 = b / a
    return calculate_entriopia(p1) - (x1)*calculate_entriopia(p2) - (x2)*calculate_entriopia(p3)

-# Function that calculate odd value for a subgroup
 def calculate_odd_value(ID,IS,PD,PS):
    b = IS - PS
    c = PD - PS
@@ -39,7 +41,6 @@ def calculate_odd_value(ID,IS,PD,PS):
        odd_value = (PS*d) / (b*c)
    return odd_value

-# Function that calculate and optimal threshold based on standard deviation for a given values list.
 def threshold(info_list,depth,mode):
    if len(np.unique(info_list)) == 1:
        return list(info_list)[0]
@@ -98,18 +99,19 @@ def best_complex(elem,mode,filter_vars):
    for idx, cand in enumerate(x_filter[1:],start=1):
        if cand[1][0] > return_cand[1][0]: # If candidate upgrades odd range, it is selected as new return_cand
            return_cand = cand
-        # If candidate has an odd range lower than return_cand odd range, the algorithm stops.
-        # Also, if candidate is not consecutive and his odd range does not improve the return_cand odd range, the algorithm stops.
-        elif (cand[1][0] == return_cand[1][0] and cand[3] > x_filter[idx-1][3] + 1) or (cand[1][0] < return_cand[1][0]):
+        elif (cand[1][0] == return_cand[1][0] and cand[3] == x_filter[idx-1][3] + 1):
+            return_cand = cand
+        else:
            break
        #if return_cand[1][1] == 100:
-        if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand.
-            break
+        """ if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand.
+            break """
    index = group_labels.index(return_cand[2])
    sg = ps.Conjunction(selectors[:index+1],mode)
    tup = (elem[0][index],sg,elem[2][index],elem[3][index],elem[4][index],elem[5],elem[6][index])
    return tup

+#def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
 def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
    if quality > task.min_quality:
        p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
@@ -317,7 +319,7 @@ class SubgroupDiscoveryResult:
                row.append(stat)
            if mode != 2:
                row.append("pvalue")
-                table.append(row)
+            table.append(row)
        if mode !=2:
            for (q, sg, stats,_,_,_,p_value) in self.results:
                stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
@@ -329,13 +331,14 @@ class SubgroupDiscoveryResult:
                row.append(str(p_value))
                table.append(row)
        else:
-            for (q, sg, stats) in self.results:
+            for (q, _ ,sg, stats) in self.results:
                stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
                row = [str(q), str(sg)]
                if include_target:
                    row.append(str(self.task.target))
                for stat in statistics_to_show:
                    row.append(str(stats[stat]))
+                table.append(row)
        return table

    def to_dataframe(self, statistics_to_show=None, autoround=False, include_target=False, mode=2):

--- a/results/P4-Lucat_target_num_1.csv
+++ b/results/P4-Lucat_target_num_1.csv
+,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift,target
+0,0.23568820806202717,Prog_Rec=='No progression/relapse' AND ToxBin=='NoTox',248.0,652.0,248.0,248.0,404.0,0.3803680981595092,0.6196319018404908,1.0,0.0,1.0,0.0,0.3803680981595092,2.629032258064516,1
+1,0.21060258195641535,Prog_Rec=='No progression/relapse' AND boolenProg=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
+2,0.21060258195641535,Prog_Rec=='No progression/relapse',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
+3,0.21060258195641535,Prog_Rec=='No progression/relapse' AND booleanTox=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
+4,0.2044487937069517,Prog_Rec=='No progression/relapse' AND marmolre=='['No']',270.0,652.0,236.0,248.0,382.0,0.41411042944785276,0.5858895705521472,0.9516129032258065,0.04838709677419355,0.8740740740740741,0.031413612565445025,0.3803680981595092,2.2979689366786142,1
+5,0.1843690014678761,FirstTreatment=='Curative surgery' AND Prog_Rec=='No progression/relapse',194.0,652.0,194.0,248.0,458.0,0.29754601226993865,0.7024539877300614,0.782258064516129,0.21774193548387097,1.0,0.11790393013100436,0.3803680981595092,2.629032258064516,1
+6,0.17533591779893862,PDL1=='PDL1_Negative' AND Prog_Rec=='No progression/relapse',220.0,652.0,198.0,248.0,432.0,0.3374233128834356,0.6625766871165644,0.7983870967741935,0.20161290322580644,0.9,0.11574074074074074,0.3803680981595092,2.3661290322580646,1
+7,0.15319545334788665,Gender=='Male' AND Prog_Rec=='No progression/relapse',208.0,652.0,179.0,248.0,444.0,0.31901840490797545,0.6809815950920245,0.7217741935483871,0.2782258064516129,0.8605769230769231,0.1554054054054054,0.3803680981595092,2.2624844913151363,1
+8,0.12073280891264254,Age_range=='[64 - 90]' AND Prog_Rec=='No progression/relapse',169.0,652.0,143.0,248.0,483.0,0.25920245398773006,0.74079754601227,0.5766129032258065,0.42338709677419356,0.8461538461538461,0.21739130434782608,0.3803680981595092,2.224565756823821,1
+9,0.11786292295532388,FirstTreatment=='Curative surgery',308.0,652.0,194.0,248.0,344.0,0.4723926380368098,0.5276073619631901,0.782258064516129,0.21774193548387097,0.6298701298701299,0.1569767441860465,0.3803680981595092,1.6559488898198576,1