diff --git a/main.py b/main.py index a15805a3d90d750a9465ff286a1fa3434021210a..563099dbdb7834d96fc7831d22a92df86414dc40 100644 --- a/main.py +++ b/main.py @@ -64,31 +64,31 @@ def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="defa searchspace, mode=mode_parameter, depth=depth, - filter_vars = list_conds, + filter_vars = list_conds, + min_quality = 0, qf=ps.WRAccQF()) + + result = ps.BeamSearch().execute(task) + df_result = result.to_dataframe(mode=mode_parameter) - result, result_cut = ps.InfoGainedSearch().execute(task) - #df_result = result.to_dataframe() - df_result_cut = result_cut.to_dataframe(mode=mode_parameter) + # result, result_cut = ps.BeamSearch().execute(task) + # #df_result = result.to_dataframe() + # df_result_cut = result_cut.to_dataframe(mode=mode_parameter) - df_result_cut.drop_duplicates(inplace=True) - df_result_cut.reset_index(drop=True,inplace=True) + # df_result_cut.drop_duplicates(inplace=True) + # df_result_cut.reset_index(drop=True,inplace=True) - #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)] - df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)] - df_result_cut["target"] = [class_value] * df_result_cut.shape[0] - #route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" - """ dir_type = "max" + # #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)] + # df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)] + df_result["target"] = [class_value] * df_result.shape[0] + route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" + dir_type = "max" if mode_parameter == 0: dir_type = "threshold" - route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """ - route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" - df_result_cut.to_csv(route, encoding="UTF-8",index=True) + df_result.to_csv(route, encoding="UTF-8",index=True) if __name__ == "__main__": - #list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"] - # Cancer_stage, FirstTreatment parser=argparse.ArgumentParser() diff --git a/pysubgroup_mod/__pycache__/__init__.cpython-38.pyc b/pysubgroup_mod/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de5d2e7f24179bcd6923b733b0119811198f178a Binary files /dev/null and b/pysubgroup_mod/__pycache__/__init__.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/algorithms.cpython-38.pyc b/pysubgroup_mod/__pycache__/algorithms.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec69260b6aa851c51c16d090e09aca00aa80822d Binary files /dev/null and b/pysubgroup_mod/__pycache__/algorithms.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc b/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc index 79b4c571d60c64b6f04f1460f080a2999b32fb04..ffea386f1eb888be9063081c43477ce653e7cdb7 100644 Binary files a/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc and b/pysubgroup_mod/__pycache__/algorithms.cpython-39.pyc differ diff --git a/pysubgroup_mod/__pycache__/binary_target.cpython-38.pyc b/pysubgroup_mod/__pycache__/binary_target.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..171b00e0d7e16894886eee2af36bb6594476acd6 Binary files /dev/null and b/pysubgroup_mod/__pycache__/binary_target.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc b/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc index 7b500e87e559b72615174c9af1f2238ea38049da..014e2cbf07cd7cfd1ee1bc5b27b48122aed162da 100644 Binary files a/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc and b/pysubgroup_mod/__pycache__/binary_target.cpython-39.pyc differ diff --git a/pysubgroup_mod/__pycache__/constraints.cpython-38.pyc b/pysubgroup_mod/__pycache__/constraints.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9608210754308a234f832406996cf8433da32beb Binary files /dev/null and b/pysubgroup_mod/__pycache__/constraints.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/fi_target.cpython-38.pyc b/pysubgroup_mod/__pycache__/fi_target.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5b0b89f02c50056b5a89cc77c49944afca2d0fe Binary files /dev/null and b/pysubgroup_mod/__pycache__/fi_target.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/measures.cpython-38.pyc b/pysubgroup_mod/__pycache__/measures.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e41d91a42922af1f0d2bf78218ecaf706503ab9 Binary files /dev/null and b/pysubgroup_mod/__pycache__/measures.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/numeric_target.cpython-38.pyc b/pysubgroup_mod/__pycache__/numeric_target.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f86f98fa5efc70f2695f38976fc8ca3fc1a9ac85 Binary files /dev/null and b/pysubgroup_mod/__pycache__/numeric_target.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/refinement_operator.cpython-38.pyc b/pysubgroup_mod/__pycache__/refinement_operator.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05fdcd7a6f5e89d7bee13306adf5415ec296bcf4 Binary files /dev/null and b/pysubgroup_mod/__pycache__/refinement_operator.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc b/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc index 08da8c15e95da1a608cae8c72b1646b40dfdad6f..0e02a968c0262e5a0c0f3716966cf05ad95df364 100644 Binary files a/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc and b/pysubgroup_mod/__pycache__/refinement_operator.cpython-39.pyc differ diff --git a/pysubgroup_mod/__pycache__/representations.cpython-38.pyc b/pysubgroup_mod/__pycache__/representations.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6f727469b0189ec12f72d4e14cc79cc6c77c932 Binary files /dev/null and b/pysubgroup_mod/__pycache__/representations.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/subgroup_description.cpython-38.pyc b/pysubgroup_mod/__pycache__/subgroup_description.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7994406904e8f3e6c2de6ed2063ed97473883a63 Binary files /dev/null and b/pysubgroup_mod/__pycache__/subgroup_description.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/utils.cpython-38.pyc b/pysubgroup_mod/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fad36fc4660854926001f200838876bc1e1dbf6 Binary files /dev/null and b/pysubgroup_mod/__pycache__/utils.cpython-38.pyc differ diff --git a/pysubgroup_mod/__pycache__/utils.cpython-39.pyc b/pysubgroup_mod/__pycache__/utils.cpython-39.pyc index 28dfdbbda542020d2a79db62ebc4965bb3dab6fa..28ef6fd5749ed6f7b13f22fc4310a970d42ce4b3 100644 Binary files a/pysubgroup_mod/__pycache__/utils.cpython-39.pyc and b/pysubgroup_mod/__pycache__/utils.cpython-39.pyc differ diff --git a/pysubgroup_mod/__pycache__/visualization.cpython-38.pyc b/pysubgroup_mod/__pycache__/visualization.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a582c4619f84385b973079e731cd478dca785c1 Binary files /dev/null and b/pysubgroup_mod/__pycache__/visualization.cpython-38.pyc differ diff --git a/pysubgroup_mod/algorithms.py b/pysubgroup_mod/algorithms.py index b2c6dce0d018b555d5cee23964cba8539dc1cf07..cbae281bab9dae697e4668aa3de3a5c5068302d9 100644 --- a/pysubgroup_mod/algorithms.py +++ b/pysubgroup_mod/algorithms.py @@ -311,8 +311,8 @@ class BeamSearch: new_selectors.append(sel) sg = ps.Conjunction(new_selectors,task.mode) statistics = task.qf.calculate_statistics(sg, task.target, task.data) - quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics) - #p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. + #quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics) + quality = task.qf.evaluate(sg, task.target, task.data, statistics) ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics) depth += 1 # TODO make sure there is no bug here @@ -342,7 +342,7 @@ class InfoGainedSearch: depth = 0 start = time() while beam != last_beam and depth < task.depth: - #print(depth) + print(depth) last_beam = beam.copy() beam.clear() # List used to save all the candidates of iteration n. smt = False # Parameter used to control if there are something in beam list @@ -351,7 +351,7 @@ class InfoGainedSearch: for sel in task.search_space: # Generate a sg using the parents labels + possible labels. new_selectors = list(last_sg.selectors) - if sel not in new_selectors: # A sg can not contain 2 same selectors. + if sel not in new_selectors: # A sg can not contain 2 same labels. new_selectors.append(sel) # New sg generated sg = ps.Conjunction(new_selectors,task.mode) statistics = task.qf.calculate_statistics(sg, task.target, task.data) ## Calculate some stats @@ -365,6 +365,8 @@ class InfoGainedSearch: # If there are not elements in beam, we will add the element. In case there are something in list, check # if new generated sg is already in list (it means that new sg is in the list but has different labels order). if smt is False or sorted(new_selectors) not in [sorted(elem[1]._selectors) for elem in beam]: + # If wracc stat < 0, then generated sg is not take into account. + #if quality >= 0: p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. aux_beam.append((quality_l.copy() + [quality], sg, stats_l.copy() + [statistics], info_l.copy() + [info_gain], odd_l.copy() + [odd_v],sel_idx, pvalue_l.copy() + [p_value])) # After adding all the candidates that satisfy the conditions @@ -384,6 +386,7 @@ class InfoGainedSearch: if time()-start>task.timeout: break depth += 1 +# TODO make sure there is no bug here beam_cut = [] # Final gropus cutted. for elem in beam: diff --git a/pysubgroup_mod/binary_target.py b/pysubgroup_mod/binary_target.py index 1fab1225b9ffe0d47e465d7c3dbcbb1cf1dd03d9..890707e1dd58359004af393724c5496222d54b64 100644 --- a/pysubgroup_mod/binary_target.py +++ b/pysubgroup_mod/binary_target.py @@ -223,7 +223,9 @@ class StandardQF(SimplePositivesQF, ps.BoundedInterestingnessMeasure): def standard_qf(subg,a, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup, measures): if not hasattr(instances_subgroup, '__array_interface__') and (instances_subgroup == 0): - return np.nan, np.nan, np.nan + if measures is True: + return np.nan, np.nan, np.nan + return np.nan p_subgroup = np.divide(positives_subgroup, instances_subgroup) #if instances_subgroup == 0: # return 0 diff --git a/pysubgroup_mod/utils.py b/pysubgroup_mod/utils.py index 19ff3fc5ff1ff1e022569ab2fda158d7ea5c0587..84e4a292f59764e9412153f2c0ca927429fd446d 100644 --- a/pysubgroup_mod/utils.py +++ b/pysubgroup_mod/utils.py @@ -1,4 +1,8 @@ +''' +Created on 02.05.2016 +@author: lemmerfn +''' import itertools from functools import partial from heapq import heappush, heappop @@ -9,13 +13,12 @@ import pandas as pd import pysubgroup_mod as ps from math import sqrt -# Function that calculate entrophy +## Added def calculate_entriopia(x): if x in [0.0,1.0]: return 0 return -x*math.log(x,2) - (1-x)*math.log(1-x,2) -# Function that calculate information gained for a subgroup def calculate_info_gained(ID,IS,PD,PS): a = ID - IS b = PD - PS @@ -28,7 +31,6 @@ def calculate_info_gained(ID,IS,PD,PS): p3 = b / a return calculate_entriopia(p1) - (x1)*calculate_entriopia(p2) - (x2)*calculate_entriopia(p3) -# Function that calculate odd value for a subgroup def calculate_odd_value(ID,IS,PD,PS): b = IS - PS c = PD - PS @@ -39,7 +41,6 @@ def calculate_odd_value(ID,IS,PD,PS): odd_value = (PS*d) / (b*c) return odd_value -# Function that calculate and optimal threshold based on standard deviation for a given values list. def threshold(info_list,depth,mode): if len(np.unique(info_list)) == 1: return list(info_list)[0] @@ -98,18 +99,19 @@ def best_complex(elem,mode,filter_vars): for idx, cand in enumerate(x_filter[1:],start=1): if cand[1][0] > return_cand[1][0]: # If candidate upgrades odd range, it is selected as new return_cand return_cand = cand - # If candidate has an odd range lower than return_cand odd range, the algorithm stops. - # Also, if candidate is not consecutive and his odd range does not improve the return_cand odd range, the algorithm stops. - elif (cand[1][0] == return_cand[1][0] and cand[3] > x_filter[idx-1][3] + 1) or (cand[1][0] < return_cand[1][0]): + elif (cand[1][0] == return_cand[1][0] and cand[3] == x_filter[idx-1][3] + 1): + return_cand = cand + else: break #if return_cand[1][1] == 100: - if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand. - break + """ if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand. + break """ index = group_labels.index(return_cand[2]) sg = ps.Conjunction(selectors[:index+1],mode) tup = (elem[0][index],sg,elem[2][index],elem[3][index],elem[4][index],elem[5],elem[6][index]) return tup +#def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None): def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None): if quality > task.min_quality: p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. @@ -317,7 +319,7 @@ class SubgroupDiscoveryResult: row.append(stat) if mode != 2: row.append("pvalue") - table.append(row) + table.append(row) if mode !=2: for (q, sg, stats,_,_,_,p_value) in self.results: stats = self.task.target.calculate_statistics(sg, self.task.data, stats) @@ -329,13 +331,14 @@ class SubgroupDiscoveryResult: row.append(str(p_value)) table.append(row) else: - for (q, sg, stats) in self.results: + for (q, _ ,sg, stats) in self.results: stats = self.task.target.calculate_statistics(sg, self.task.data, stats) row = [str(q), str(sg)] if include_target: row.append(str(self.task.target)) for stat in statistics_to_show: row.append(str(stats[stat])) + table.append(row) return table def to_dataframe(self, statistics_to_show=None, autoround=False, include_target=False, mode=2): diff --git a/results/P4-Lucat_target_num_1.csv b/results/P4-Lucat_target_num_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..711cd6a3fe33edeaf37e698adad615bda876e1b3 --- /dev/null +++ b/results/P4-Lucat_target_num_1.csv @@ -0,0 +1,11 @@ +,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift,target +0,0.23568820806202717,Prog_Rec=='No progression/relapse' AND ToxBin=='NoTox',248.0,652.0,248.0,248.0,404.0,0.3803680981595092,0.6196319018404908,1.0,0.0,1.0,0.0,0.3803680981595092,2.629032258064516,1 +1,0.21060258195641535,Prog_Rec=='No progression/relapse' AND boolenProg=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1 +2,0.21060258195641535,Prog_Rec=='No progression/relapse',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1 +3,0.21060258195641535,Prog_Rec=='No progression/relapse' AND booleanTox=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1 +4,0.2044487937069517,Prog_Rec=='No progression/relapse' AND marmolre=='['No']',270.0,652.0,236.0,248.0,382.0,0.41411042944785276,0.5858895705521472,0.9516129032258065,0.04838709677419355,0.8740740740740741,0.031413612565445025,0.3803680981595092,2.2979689366786142,1 +5,0.1843690014678761,FirstTreatment=='Curative surgery' AND Prog_Rec=='No progression/relapse',194.0,652.0,194.0,248.0,458.0,0.29754601226993865,0.7024539877300614,0.782258064516129,0.21774193548387097,1.0,0.11790393013100436,0.3803680981595092,2.629032258064516,1 +6,0.17533591779893862,PDL1=='PDL1_Negative' AND Prog_Rec=='No progression/relapse',220.0,652.0,198.0,248.0,432.0,0.3374233128834356,0.6625766871165644,0.7983870967741935,0.20161290322580644,0.9,0.11574074074074074,0.3803680981595092,2.3661290322580646,1 +7,0.15319545334788665,Gender=='Male' AND Prog_Rec=='No progression/relapse',208.0,652.0,179.0,248.0,444.0,0.31901840490797545,0.6809815950920245,0.7217741935483871,0.2782258064516129,0.8605769230769231,0.1554054054054054,0.3803680981595092,2.2624844913151363,1 +8,0.12073280891264254,Age_range=='[64 - 90]' AND Prog_Rec=='No progression/relapse',169.0,652.0,143.0,248.0,483.0,0.25920245398773006,0.74079754601227,0.5766129032258065,0.42338709677419356,0.8461538461538461,0.21739130434782608,0.3803680981595092,2.224565756823821,1 +9,0.11786292295532388,FirstTreatment=='Curative surgery',308.0,652.0,194.0,248.0,344.0,0.4723926380368098,0.5276073619631901,0.782258064516129,0.21774193548387097,0.6298701298701299,0.1569767441860465,0.3803680981595092,1.6559488898198576,1