Commit c79e3a1d authored by AARON GARCIA MUÑIZ's avatar AARON GARCIA MUÑIZ

changes

parent e43c1474
......@@ -64,31 +64,31 @@ def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="defa
searchspace,
mode=mode_parameter,
depth=depth,
filter_vars = list_conds,
filter_vars = list_conds,
min_quality = 0,
qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
df_result = result.to_dataframe(mode=mode_parameter)
result, result_cut = ps.InfoGainedSearch().execute(task)
#df_result = result.to_dataframe()
df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
# result, result_cut = ps.BeamSearch().execute(task)
# #df_result = result.to_dataframe()
# df_result_cut = result_cut.to_dataframe(mode=mode_parameter)
df_result_cut.drop_duplicates(inplace=True)
df_result_cut.reset_index(drop=True,inplace=True)
# df_result_cut.drop_duplicates(inplace=True)
# df_result_cut.reset_index(drop=True,inplace=True)
#df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
df_result_cut["target"] = [class_value] * df_result_cut.shape[0]
#route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
""" dir_type = "max"
# #df_result = df_result[df_result.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result["subgroup"])), axis=1)]
# df_result_cut = df_result_cut[df_result_cut.apply(lambda row: eliminate_reps(row['subgroup'],list(df_result_cut["subgroup"])), axis=1)]
df_result["target"] = [class_value] * df_result.shape[0]
route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
dir_type = "max"
if mode_parameter == 0:
dir_type = "threshold"
route = parentdir+"/datasets_compared/"+dataname+"/InfoGained/"+dir_type+"/"+dataname+"_"+class_column+"_"+str(class_value)+".csv" """
route = currentdir+"/results/"+dataname+"_"+class_column+"_"+str(class_value)+".csv"
df_result_cut.to_csv(route, encoding="UTF-8",index=True)
df_result.to_csv(route, encoding="UTF-8",index=True)
if __name__ == "__main__":
#list_ignore=['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
# Cancer_stage, FirstTreatment
parser=argparse.ArgumentParser()
......
......@@ -311,8 +311,8 @@ class BeamSearch:
new_selectors.append(sel)
sg = ps.Conjunction(new_selectors,task.mode)
statistics = task.qf.calculate_statistics(sg, task.target, task.data)
quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics)
#p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
#quality,_,_ = task.qf.evaluate(sg, task.target, task.data, statistics)
quality = task.qf.evaluate(sg, task.target, task.data, statistics)
ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics)
depth += 1
# TODO make sure there is no bug here
......@@ -342,7 +342,7 @@ class InfoGainedSearch:
depth = 0
start = time()
while beam != last_beam and depth < task.depth:
#print(depth)
print(depth)
last_beam = beam.copy()
beam.clear() # List used to save all the candidates of iteration n.
smt = False # Parameter used to control if there are something in beam list
......@@ -351,7 +351,7 @@ class InfoGainedSearch:
for sel in task.search_space:
# Generate a sg using the parents labels + possible labels.
new_selectors = list(last_sg.selectors)
if sel not in new_selectors: # A sg can not contain 2 same selectors.
if sel not in new_selectors: # A sg can not contain 2 same labels.
new_selectors.append(sel) # New sg generated
sg = ps.Conjunction(new_selectors,task.mode)
statistics = task.qf.calculate_statistics(sg, task.target, task.data) ## Calculate some stats
......@@ -365,6 +365,8 @@ class InfoGainedSearch:
# If there are not elements in beam, we will add the element. In case there are something in list, check
# if new generated sg is already in list (it means that new sg is in the list but has different labels order).
if smt is False or sorted(new_selectors) not in [sorted(elem[1]._selectors) for elem in beam]:
# If the WRAcc stat is < 0, the generated sg is not taken into account.
#if quality >= 0:
p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
aux_beam.append((quality_l.copy() + [quality], sg, stats_l.copy() + [statistics], info_l.copy() + [info_gain], odd_l.copy() + [odd_v],sel_idx, pvalue_l.copy() + [p_value]))
# After adding all the candidates that satisfy the conditions
......@@ -384,6 +386,7 @@ class InfoGainedSearch:
if time()-start>task.timeout:
break
depth += 1
# TODO make sure there is no bug here
beam_cut = [] # Final gropus cutted.
for elem in beam:
......
......@@ -223,7 +223,9 @@ class StandardQF(SimplePositivesQF, ps.BoundedInterestingnessMeasure):
def standard_qf(subg,a, instances_dataset, positives_dataset, instances_subgroup, positives_subgroup, measures):
if not hasattr(instances_subgroup, '__array_interface__') and (instances_subgroup == 0):
return np.nan, np.nan, np.nan
if measures is True:
return np.nan, np.nan, np.nan
return np.nan
p_subgroup = np.divide(positives_subgroup, instances_subgroup)
#if instances_subgroup == 0:
# return 0
......
'''
Created on 02.05.2016
@author: lemmerfn
'''
import itertools
from functools import partial
from heapq import heappush, heappop
......@@ -9,13 +13,12 @@ import pandas as pd
import pysubgroup_mod as ps
from math import sqrt
# Function that calculates the entropy
## Added
def calculate_entriopia(x):
    """Return the binary (base-2) entropy of the proportion ``x``.

    Computes ``-x*log2(x) - (1-x)*log2(1-x)``. The two boundary values
    ``0.0`` and ``1.0`` are short-circuited to ``0`` because the logarithm
    is not defined at zero.

    :param x: proportion to evaluate; no range check is performed here.
    :return: ``0`` for the boundary values, otherwise the entropy as a float.
    """
    # Boundary proportions carry no uncertainty; also avoids math.log(0).
    if x == 0.0 or x == 1.0:
        return 0
    complement = 1 - x
    return -x * math.log(x, 2) - complement * math.log(complement, 2)
# Function that calculates the information gain of a subgroup
def calculate_info_gained(ID,IS,PD,PS):
a = ID - IS
b = PD - PS
......@@ -28,7 +31,6 @@ def calculate_info_gained(ID,IS,PD,PS):
p3 = b / a
return calculate_entriopia(p1) - (x1)*calculate_entriopia(p2) - (x2)*calculate_entriopia(p3)
# Function that calculates the odd value for a subgroup
def calculate_odd_value(ID,IS,PD,PS):
b = IS - PS
c = PD - PS
......@@ -39,7 +41,6 @@ def calculate_odd_value(ID,IS,PD,PS):
odd_value = (PS*d) / (b*c)
return odd_value
# Function that calculates an optimal threshold based on the standard deviation of a given list of values.
def threshold(info_list,depth,mode):
if len(np.unique(info_list)) == 1:
return list(info_list)[0]
......@@ -98,18 +99,19 @@ def best_complex(elem,mode,filter_vars):
for idx, cand in enumerate(x_filter[1:],start=1):
if cand[1][0] > return_cand[1][0]: # If candidate upgrades odd range, it is selected as new return_cand
return_cand = cand
# If candidate has an odd range lower than return_cand odd range, the algorithm stops.
# Also, if candidate is not consecutive and his odd range does not improve the return_cand odd range, the algorithm stops.
elif (cand[1][0] == return_cand[1][0] and cand[3] > x_filter[idx-1][3] + 1) or (cand[1][0] < return_cand[1][0]):
elif (cand[1][0] == return_cand[1][0] and cand[3] == x_filter[idx-1][3] + 1):
return_cand = cand
else:
break
#if return_cand[1][1] == 100:
if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand.
break
""" if return_cand[1][0] == 4: # Also, if return_cand has maximum odd range (odd value > 6.71), algorithm stops and returns return_cand.
break """
index = group_labels.index(return_cand[2])
sg = ps.Conjunction(selectors[:index+1],mode)
tup = (elem[0][index],sg,elem[2][index],elem[3][index],elem[4][index],elem[5],elem[6][index])
return tup
#def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
def add_if_required(result, sg, quality, task, check_for_duplicates=False, statistics=None):
if quality > task.min_quality:
p_value = ps.ChiSquaredQF(stat="p").evaluate(sg, task.target, task.data, statistics) # Calculate pvalue stat.
......@@ -317,7 +319,7 @@ class SubgroupDiscoveryResult:
row.append(stat)
if mode != 2:
row.append("pvalue")
table.append(row)
table.append(row)
if mode !=2:
for (q, sg, stats,_,_,_,p_value) in self.results:
stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
......@@ -329,13 +331,14 @@ class SubgroupDiscoveryResult:
row.append(str(p_value))
table.append(row)
else:
for (q, sg, stats) in self.results:
for (q, _ ,sg, stats) in self.results:
stats = self.task.target.calculate_statistics(sg, self.task.data, stats)
row = [str(q), str(sg)]
if include_target:
row.append(str(self.task.target))
for stat in statistics_to_show:
row.append(str(stats[stat]))
table.append(row)
return table
def to_dataframe(self, statistics_to_show=None, autoround=False, include_target=False, mode=2):
......
,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift,target
0,0.23568820806202717,Prog_Rec=='No progression/relapse' AND ToxBin=='NoTox',248.0,652.0,248.0,248.0,404.0,0.3803680981595092,0.6196319018404908,1.0,0.0,1.0,0.0,0.3803680981595092,2.629032258064516,1
1,0.21060258195641535,Prog_Rec=='No progression/relapse' AND boolenProg=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
2,0.21060258195641535,Prog_Rec=='No progression/relapse',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
3,0.21060258195641535,Prog_Rec=='No progression/relapse' AND booleanTox=='[]',291.0,652.0,248.0,248.0,361.0,0.44631901840490795,0.553680981595092,1.0,0.0,0.852233676975945,0.0,0.3803680981595092,2.2405498281786937,1
4,0.2044487937069517,Prog_Rec=='No progression/relapse' AND marmolre=='['No']',270.0,652.0,236.0,248.0,382.0,0.41411042944785276,0.5858895705521472,0.9516129032258065,0.04838709677419355,0.8740740740740741,0.031413612565445025,0.3803680981595092,2.2979689366786142,1
5,0.1843690014678761,FirstTreatment=='Curative surgery' AND Prog_Rec=='No progression/relapse',194.0,652.0,194.0,248.0,458.0,0.29754601226993865,0.7024539877300614,0.782258064516129,0.21774193548387097,1.0,0.11790393013100436,0.3803680981595092,2.629032258064516,1
6,0.17533591779893862,PDL1=='PDL1_Negative' AND Prog_Rec=='No progression/relapse',220.0,652.0,198.0,248.0,432.0,0.3374233128834356,0.6625766871165644,0.7983870967741935,0.20161290322580644,0.9,0.11574074074074074,0.3803680981595092,2.3661290322580646,1
7,0.15319545334788665,Gender=='Male' AND Prog_Rec=='No progression/relapse',208.0,652.0,179.0,248.0,444.0,0.31901840490797545,0.6809815950920245,0.7217741935483871,0.2782258064516129,0.8605769230769231,0.1554054054054054,0.3803680981595092,2.2624844913151363,1
8,0.12073280891264254,Age_range=='[64 - 90]' AND Prog_Rec=='No progression/relapse',169.0,652.0,143.0,248.0,483.0,0.25920245398773006,0.74079754601227,0.5766129032258065,0.42338709677419356,0.8461538461538461,0.21739130434782608,0.3803680981595092,2.224565756823821,1
9,0.11786292295532388,FirstTreatment=='Curative surgery',308.0,652.0,194.0,248.0,344.0,0.4723926380368098,0.5276073619631901,0.782258064516129,0.21774193548387097,0.6298701298701299,0.1569767441860465,0.3803680981595092,1.6559488898198576,1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment