main.py 4.46 KB
Newer Older
aarongitrepos's avatar
All  
aarongitrepos committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
import os, sys
import pandas as pd
# Directory that holds this script (symlinks resolved) and the directory above it.
currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
# Put both directories on the import search path, parent first.
# NOTE(review): presumably required so that the pysubgroup_mod import below
# resolves -- confirm where that package actually lives.
sys.path.extend([parentdir, currentdir])
import pysubgroup_mod as ps
import argparse
import numpy as np


def eliminate_reps(elem,l):
    """
    Tell whether ``elem`` is NOT contained in any other entry of ``l``.

    Parameters
    ----------
    elem: object
        The entry under test. It has to be present in ``l``.

    l: list
        All candidate entries, ``elem`` included. The list is left unmodified.

    Returns
    -------
    bool
        False when ``elem in other`` is true for at least one other entry of ``l``,
        True otherwise.

    Raises
    ------
    ValueError
        If ``elem`` is not an element of ``l``.
    """
    # Work on a copy so the caller's list is not altered. Only the first
    # occurrence of elem is discarded; any further equal entry still counts
    # as a repetition.
    others = list(l)
    others.remove(elem)
    # NOTE: when the entries are plain strings, ``in`` is a raw substring test
    # (e.g. "A==1" is found inside "A==10").
    return not any(elem in other for other in others)

def info_gained_algorithm(dataname,class_column,class_value,mode_parameter="default",depth=5,list_ignore=None,list_conds=None):
    """
    Run the InfoGained subgroup search on a dataset and save the resulting rules as CSV.

    The dataset is read from ``<currentdir>/datasets/<dataname>.csv`` (first column used
    as index) and the rules are written to
    ``<currentdir>/results/<dataname>_<class_column>_<class_value>.csv``.

    Parameters
    ----------
    dataname: string
        The name of the dataset located in the datasets directory to be used for analysis.

    class_column: string
        A column of the dataset that will be used as target.

    class_value: string or int
        A value of the class_column. It corresponds to the condition that has to be met,
        i.e. class_column==class_value. When class_column has an integer dtype the value
        is converted with int() before it is looked up.

    mode_parameter: string, optional (default=default)
        This parameter controls the behaviour of the search. For the InfoGained algorithm
        it is necessary to use the dynamic or maximum options. For other algorithms, the
        parameter has the value "default".
        (possible values: dynamic, maximum, default; passed on as 0, 1 and 2 respectively)

    depth: int, optional (default=5)
        This parameter indicates the number of variables that will be added to rules.

    list_ignore: list of strings, optional (default=None)
        List containing the column names that will not be used in search activity.
        None means that no column is ignored.

    list_conds: list of strings, optional (default=None)
        List containing the column names that are necessary to appear in rules. It will
        only work with the InfoGained algorithm. None means no required column.

    Returns
    -------
    None
        The outcome is the CSV file written to the results directory.

    Raises
    ------
    SystemExit
        With status 1, after printing a message, when class_column is not a column of the
        dataset or class_value is not one of its values.
    KeyError
        If mode_parameter is not one of the possible values.
    """
    # None stands for "no columns"; copying keeps the caller's lists out of reach.
    list_ignore = [] if list_ignore is None else list(list_ignore)
    list_conds = [] if list_conds is None else list(list_conds)

    df = pd.read_csv(os.path.join(currentdir, "datasets", dataname + ".csv"), index_col=[0])

    if class_column not in df.columns:
        print("No class column")
        sys.exit(1)
    # The command line delivers class_value as text, so it has to be converted
    # before it can match the values of an integer column (of any width).
    if pd.api.types.is_integer_dtype(df[class_column].dtype):
        try:
            class_value = int(class_value)
        except (TypeError, ValueError):
            # Something that is not an integer can never match an integer column.
            print("The class_value specified is not an option")
            sys.exit(1)
    if class_value not in df[class_column].unique():
        print("The class_value specified is not an option")
        sys.exit(1)

    target = ps.BinaryTarget(class_column, class_value)
    searchspace = ps.create_selectors(df, ignore=list_ignore)
    mode_code = {'dynamic' : 0, 'maximum': 1, "default":2}[mode_parameter]
    task = ps.SubgroupDiscoveryTask(
        df,
        target,
        searchspace,
        mode=mode_code,
        depth=depth,
        filter_vars=list_conds,
        qf=ps.WRAccQF())

    # execute() returns two result objects; only the second one is exported.
    _, result_cut = ps.InfoGainedSearch().execute(task)
    df_result_cut = result_cut.to_dataframe(mode=mode_code)
    df_result_cut = df_result_cut.drop_duplicates().reset_index(drop=True)

    # Keep only the rows whose subgroup is not contained in another row's subgroup.
    # Every call receives its own copy of the list, so the outcome does not depend
    # on whether eliminate_reps modifies its argument.
    subgroups = list(df_result_cut["subgroup"])
    keep = pd.Series(
        [eliminate_reps(subgroup, list(subgroups)) for subgroup in subgroups],
        index=df_result_cut.index,
        dtype=bool,
    )
    # .copy() detaches the filtered rows, so adding a column below is well defined.
    df_result_cut = df_result_cut.loc[keep].copy()
    df_result_cut["target"] = class_value

    route = os.path.join(
        currentdir,
        "results",
        "{}_{}_{}.csv".format(dataname, class_column, class_value),
    )
    # Create the output directory when missing, so the finished search is not
    # lost on the final write.
    os.makedirs(os.path.dirname(route), exist_ok=True)
    df_result_cut.to_csv(route, encoding="UTF-8", index=True)


if __name__ == "__main__":
    # Command-line entry point: each option is handed to the matching parameter
    # of info_gained_algorithm.
    #
    # Commented-out sample value for list_ignore left by the author:
    #   ['Prog_Rec', 'ToxBin', 'boolenProg', 'booleanTox', 'NoProg-Tox', 'SiProg-Tox',"orgfam","target","target_num"]
    # Further column names noted next to it: Cancer_stage, FirstTreatment

    cli = argparse.ArgumentParser()

    # Mandatory dataset / target description.
    cli.add_argument('--dataname', required=True, type=str)
    cli.add_argument('--class_column', required=True, type=str)
    cli.add_argument('--class_value', required=True, type=str)
    # Search behaviour.
    cli.add_argument('--mode', default="default", choices=["dynamic","maximum","default"], type=str)
    cli.add_argument('--depth', required=True, type=int)
    # Optional column lists; each accepts zero or more names.
    cli.add_argument("--list_ignore", default=[], type=str, nargs="*")
    cli.add_argument("--list_conds", default=[], type=str, nargs="*")

    options = cli.parse_args()

    info_gained_algorithm(
        dataname=options.dataname,
        class_column=options.class_column,
        class_value=options.class_value,
        mode_parameter=options.mode,
        depth=options.depth,
        list_ignore=options.list_ignore,
        list_conds=options.list_conds,
    )