Commit c79e3a1d authored by AARON GARCIA MUÑIZ's avatar AARON GARCIA MUÑIZ

changes

parent e43c1474
...@@ -64,31 +64,31 @@ def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="defa ...@@ -64,31 +64,31 @@ def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="defa
searchspace, searchspace,
mode=mode_parameter, mode=mode_parameter,
depth=depth, depth=depth,
filter_vars = list_conds, filter_vars = list_conds,
min_quality = 0,
qf=ps.WRAccQF()) qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
df_result = result.to_dataframe(mode=mode_parameter)
result, result_cut = ps.InfoGainedSearch().execute(task) # result, result_cut = ps.BeamSearch().execute(task)
#df_result = result.to_dataframe() # #df_result = result.to_dataframe()
df_result_cut = result_cut.to_dataframe(mode=mode_parameter) # df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
df_result_cut.drop_duplicates(inplace=True) # df_result_cut.drop_duplicates(inplace=True)
df_result_cut.reset_index(drop=True,inplace=True) # df_result_cut.reset_index(drop=True,inplace=True)
#df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)] # #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)] # df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
df_result_cut["target"] = [class_value] * df_result_cut.shape[0] df_result["target"] = [class_value] * df_result.shape[0]
#route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
""" dir_type = "max" dir_type = "max"
if mode_parameter == 0: if mode_parameter == 0:
dir_type = "threshold" dir_type = "threshold"
route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """ df_result.to_csv(route, encoding="UTF-8",index=True)
route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
df_result_cut.to_csv(route, encoding="UTF-8",index=True)
if __name__ == "__main__": if __name__ == "__main__":
#list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
# Cancer_stage, FirstTreatment
parser=argparse.ArgumentParser() parser=argparse.ArgumentParser()
......
...@@ -311,8 +311,8 @@ class BeamSearch: ...@@ -311,8 +311,8 @@ class BeamSearch:
new_selectors.append(sel) new_selectors.append(sel)
sg = ps.Conjunction(new_selectors,task.mode) sg = ps.Conjunction(new_selectors,task.mode)
statistics = task.qf.calculate_statistics(sg, task.target, task.data) statistics = task.qf.calculate_statistics(sg, task.target, task.data)
quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics) #quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics)
#p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. quality = task.qf.evaluate(sg, task.target, task.data, statistics)
ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics) ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics)
depth += 1 depth += 1
# TODO make sure there is no bug here # TODO make sure there is no bug here
...@@ -342,7 +342,7 @@ class InfoGainedSearch: ...@@ -342,7 +342,7 @@ class InfoGainedSearch:
depth = 0 depth = 0
start = time() start = time()
while beam != last_beam and depth < task.depth: while beam != last_beam and depth < task.depth:
#print(depth) print(depth)
last_beam = beam.copy() last_beam = beam.copy()
beam.clear() # List used to save all the candidates of iteration n. beam.clear() # List used to save all the candidates of iteration n.
smt = False # Parameter used to control if there are something in beam list smt = False # Parameter used to control if there are something in beam list
...@@ -351,7 +351,7 @@ class InfoGainedSearch: ...@@ -351,7 +351,7 @@ class InfoGainedSearch:
for sel in task.search_space: for sel in task.search_space:
# Generate a sg using the parents labels + possible labels. # Generate a sg using the parents labels + possible labels.
new_selectors = list(last_sg.selectors) new_selectors = list(last_sg.selectors)
if sel not in new_selectors: # A sg can not contain 2 same selectors. if sel not in new_selectors: # A sg can not contain 2 same labels.
new_selectors.append(sel) # New sg generated new_selectors.append(sel) # New sg generated
sg = ps.Conjunction(new_selectors,task.mode) sg = ps.Conjunction(new_selectors,task.mode)
statistics = task.qf.calculate_statistics(sg, task.target, task.data) ## Calculate some stats statistics = task.qf.calculate_statistics(sg, task.target, task.data) ## Calculate some stats
...@@ -365,6 +365,8 @@ class InfoGainedSearch: ...@@ -365,6 +365,8 @@ class InfoGainedSearch:
# If there are not elements in beam, we will add the element. In case there are something in list, check # If there are not elements in beam, we will add the element. In case there are something in list, check
# if new generated sg is already in list (it means that new sg is in the list but has different labels order). # if new generated sg is already in list (it means that new sg is in the list but has different labels order).
if smt is False or sorted(new_selectors) not in [sorted(elem[1]._selectors) for elem in beam]: if smt is False or sorted(new_selectors) not in [sorted(elem[1]._selectors) for elem in beam]:
# If wracc stat < 0, then generated sg is not take into account.
#if quality >= 0:
p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
aux_beam.append((quality_l.copy() + [quality], sg, stats_l.copy() + [statistics], info_l.copy() + [info_gain], odd_l.copy() + [odd_v],sel_idx, pvalue_l.copy() + [p_value])) aux_beam.append((quality_l.copy() + [quality], sg, stats_l.copy() + [statistics], info_l.copy() + [info_gain], odd_l.copy() + [odd_v],sel_idx, pvalue_l.copy() + [p_value]))
# After adding all the candidates that satisfy the conditions # After adding all the candidates that satisfy the conditions
...@@ -384,6 +386,7 @@ class InfoGainedSearch: ...@@ -384,6 +386,7 @@ class InfoGainedSearch:
if time()-start>task.timeout: if time()-start>task.timeout:
break break
depth += 1 depth += 1
# TODO make sure there is no bug here
beam_cut = [] # Final gropus cutted. beam_cut = [] # Final gropus cutted.
for elem in beam: for elem in beam:
......
...@@ -223,7 +223,9 @@ class StandardQF(SimplePositivesQF, ps.BoundedInterestingnessMeasure): ...@@ -223,7 +223,9 @@ class StandardQF(SimplePositivesQF, ps.BoundedInterestingnessMeasure):
def standard_qf(subg,a, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup, measures): def standard_qf(subg,a, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup, measures):
if not hasattr(instances_subgroup, '__array_interface__') and (instances_subgroup == 0): if not hasattr(instances_subgroup, '__array_interface__') and (instances_subgroup == 0):
return np.nan, np.nan, np.nan if measures is True:
return np.nan, np.nan, np.nan
return np.nan
p_subgroup = np.divide(positives_subgroup, instances_subgroup) p_subgroup = np.divide(positives_subgroup, instances_subgroup)
#if instances_subgroup == 0: #if instances_subgroup == 0:
# return 0 # return 0
......
'''
Created on 02.05.2016
@author: lemmerfn
'''
import itertools import itertools
from functools import partial from functools import partial
from heapq import heappush, heappop from heapq import heappush, heappop
...@@ -9,13 +13,12 @@ import pandas as pd ...@@ -9,13 +13,12 @@ import pandas as pd
import pysubgroup_mod as ps import pysubgroup_mod as ps
from math import sqrt from math import sqrt
# Function that calculate entrophy ## Added
def calculate_entriopia(x): def calculate_entriopia(x):
if x in [0.0,1.0]: if x in [0.0,1.0]:
return 0 return 0
return -x*math.log(x,2) - (1-x)*math.log(1-x,2) return -x*math.log(x,2) - (1-x)*math.log(1-x,2)
# Function that calculate information gained for a subgroup
def calculate_info_gained(ID,IS,PD,PS): def calculate_info_gained(ID,IS,PD,PS):
a = ID - IS a = ID - IS
b = PD - PS b = PD - PS
...@@ -28,7 +31,6 @@ def calculate_info_gained(ID,IS,PD,PS): ...@@ -28,7 +31,6 @@ def calculate_info_gained(ID,IS,PD,PS):
p3 = b / a p3 = b / a
return calculate_entriopia(p1) - (x1)*calculate_entriopia(p2) - (x2)*calculate_entriopia(p3) return calculate_entriopia(p1) - (x1)*calculate_entriopia(p2) - (x2)*calculate_entriopia(p3)
# Function that calculate odd value for a subgroup
def calculate_odd_value(ID,IS,PD,PS): def calculate_odd_value(ID,IS,PD,PS):
b = IS - PS b = IS - PS
c = PD - PS c = PD - PS
...@@ -39,7 +41,6 @@ def calculate_odd_value(ID,IS,PD,PS): ...@@ -39,7 +41,6 @@ def calculate_odd_value(ID,IS,PD,PS):
odd_value = (PS*d) / (b*c) odd_value = (PS*d) / (b*c)
return odd_value return odd_value
# Function that calculate and optimal threshold based on standard deviation for a given values list.
def threshold(info_list,depth,mode): def threshold(info_list,depth,mode):
if len(np.unique(info_list)) == 1: if len(np.unique(info_list)) == 1:
return list(info_list)[0] return list(info_list)[0]
...@@ -98,18 +99,19 @@ def best_complex(elem,mode,filter_vars): ...@@ -98,18 +99,19 @@ def best_complex(elem,mode,filter_vars):
for idx, cand in enumerate(x_filter[1:],start=1): for idx, cand in enumerate(x_filter[1:],start=1):
if cand[1][0] > return_cand[1][0]: # If candidate upgrades odd range, it is selected as new return_cand if cand[1][0] > return_cand[1][0]: # If candidate upgrades odd range, it is selected as new return_cand
return_cand = cand return_cand = cand
# If candidate has an odd range lower than return_cand odd range, the algorithm stops. elif (cand[1][0] == return_cand[1][0] and cand[3] == x_filter[idx-1][3] + 1):
# Also, if candidate is not consecutive and his odd range does not improve the return_cand odd range, the algorithm stops. return_cand = cand
elif (cand[1][0] == return_cand[1][0] and cand[3] > x_filter[idx-1][3] + 1) or (cand[1][0] < return_cand[1][0]): else:
break break
#if return_cand[1][1] == 100: #if return_cand[1][1] == 100:
if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand. """ if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand.
break break """
index = group_labels.index(return_cand[2]) index = group_labels.index(return_cand[2])
sg = ps.Conjunction(selectors[:index+1],mode) sg = ps.Conjunction(selectors[:index+1],mode)
tup = (elem[0][index],sg,elem[2][index],elem[3][index],elem[4][index],elem[5],elem[6][index]) tup = (elem[0][index],sg,elem[2][index],elem[3][index],elem[4][index],elem[5],elem[6][index])
return tup return tup
#def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None): def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
if quality > task.min_quality: if quality > task.min_quality:
p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat. p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
...@@ -317,7 +319,7 @@ class SubgroupDiscoveryResult: ...@@ -317,7 +319,7 @@ class SubgroupDiscoveryResult:
row.append(stat) row.append(stat)
if mode != 2: if mode != 2:
row.append("pvalue") row.append("pvalue")
table.append(row) table.append(row)
if mode !=2: if mode !=2:
for (q, sg, stats,_,_,_,p_value) in self.results: for (q, sg, stats,_,_,_,p_value) in self.results:
stats = self.task.target.calculate_statistics(sg, self.task.data, stats) stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
...@@ -329,13 +331,14 @@ class SubgroupDiscoveryResult: ...@@ -329,13 +331,14 @@ class SubgroupDiscoveryResult:
row.append(str(p_value)) row.append(str(p_value))
table.append(row) table.append(row)
else: else:
for (q, sg, stats) in self.results: for (q, _ ,sg, stats) in self.results:
stats = self.task.target.calculate_statistics(sg, self.task.data, stats) stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
row = [str(q), str(sg)] row = [str(q), str(sg)]
if include_target: if include_target:
row.append(str(self.task.target)) row.append(str(self.task.target))
for stat in statistics_to_show: for stat in statistics_to_show:
row.append(str(stats[stat])) row.append(str(stats[stat]))
table.append(row)
return table return table
def to_dataframe(self, statistics_to_show=None, autoround=False, include_target=False, mode=2): def to_dataframe(self, statistics_to_show=None, autoround=False, include_target=False, mode=2):
......
,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift,target
0,0.23568820806202717,Prog_Rec=='No progression/relapse' AND ToxBin=='NoTox',248.0,652.0,248.0,248.0,404.0,0.3803680981595092,0.6196319018404908,1.0,0.0,1.0,0.0,0.3803680981595092,2.629032258064516,1
1,0.21060258195641535,Prog_Rec=='No progression/relapse' AND boolenProg=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
2,0.21060258195641535,Prog_Rec=='No progression/relapse',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
3,0.21060258195641535,Prog_Rec=='No progression/relapse' AND booleanTox=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
4,0.2044487937069517,Prog_Rec=='No progression/relapse' AND marmolre=='['No']',270.0,652.0,236.0,248.0,382.0,0.41411042944785276,0.5858895705521472,0.9516129032258065,0.04838709677419355,0.8740740740740741,0.031413612565445025,0.3803680981595092,2.2979689366786142,1
5,0.1843690014678761,FirstTreatment=='Curative surgery' AND Prog_Rec=='No progression/relapse',194.0,652.0,194.0,248.0,458.0,0.29754601226993865,0.7024539877300614,0.782258064516129,0.21774193548387097,1.0,0.11790393013100436,0.3803680981595092,2.629032258064516,1
6,0.17533591779893862,PDL1=='PDL1_Negative' AND Prog_Rec=='No progression/relapse',220.0,652.0,198.0,248.0,432.0,0.3374233128834356,0.6625766871165644,0.7983870967741935,0.20161290322580644,0.9,0.11574074074074074,0.3803680981595092,2.3661290322580646,1
7,0.15319545334788665,Gender=='Male' AND Prog_Rec=='No progression/relapse',208.0,652.0,179.0,248.0,444.0,0.31901840490797545,0.6809815950920245,0.7217741935483871,0.2782258064516129,0.8605769230769231,0.1554054054054054,0.3803680981595092,2.2624844913151363,1
8,0.12073280891264254,Age_range=='[64 - 90]' AND Prog_Rec=='No progression/relapse',169.0,652.0,143.0,248.0,483.0,0.25920245398773006,0.74079754601227,0.5766129032258065,0.42338709677419356,0.8461538461538461,0.21739130434782608,0.3803680981595092,2.224565756823821,1
9,0.11786292295532388,FirstTreatment=='Curative surgery',308.0,652.0,194.0,248.0,344.0,0.4723926380368098,0.5276073619631901,0.782258064516129,0.21774193548387097,0.6298701298701299,0.1569767441860465,0.3803680981595092,1.6559488898198576,1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment