# FSS.py
# Committed by Lucia Prieto.
import numpy as np
from math import log, e
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif


def entropy(data, base=e):
    """Return the Shannon entropy of the values in *data*.

    The empirical probability of each distinct value is its count divided
    by ``len(data)``; the entropy is ``-sum(p * log(p))`` expressed in the
    requested logarithm base.

    Args:
        data: One-dimensional sequence of labels/values (anything
            ``np.unique`` can sort).
        base: Logarithm base. Defaults to ``e`` (result in nats); pass
            ``2`` for bits.

    Returns:
        The entropy as a float. ``0.0`` when *data* has at most one
        element or only one distinct value.

    Raises:
        ValueError: If *base* is not a positive number different from 1.
    """
    if base <= 0 or base == 1:
        raise ValueError("base must be positive and different from 1")

    n_labels = len(data)
    if n_labels <= 1:
        return 0.0

    _, counts = np.unique(data, return_counts=True)
    # A single distinct value carries no information.
    if counts.size <= 1:
        return 0.0

    # Counts returned by np.unique are always > 0, so log() is safe here.
    probs = counts / n_labels
    nats = -np.sum(probs * np.log(probs))

    # Change of base: log_b(x) = ln(x) / ln(b); skipped for the default.
    if base != e:
        nats /= log(base)
    return float(nats)


if __name__ == '__main__':
    # Script entry point. For each feature table:
    #   1. rank every feature by gain ratio (mutual information with the
    #      label column divided by the feature's own entropy),
    #   2. plot the ranking and keep the top-ranked features,
    #   3. plot their correlation matrix and drop the first member of every
    #      highly correlated pair,
    #   4. save the reduced table.
    feats = ["data/converted/feats.csv", "data/converted/featsCluster.csv"]
    figs = ["data/FSS/gainRatio.svg", "data/FSS/gainRatio_cluster.svg"]
    figs2 = ["data/FSS/corrMatrix.svg", "data/FSS/corrMatrix_cluster.svg"]
    saves = ["data/FSS/featsGR.csv", "data/FSS/featsGR_cluster.csv"]

    column = "Dropout_1"
    top_k = 20             # number of best-ranked features to keep
    corr_threshold = 0.75  # |correlation| above which a pair is redundant

    y = pd.read_csv("data/converted/labels.csv")
    for feat, fig, fig2, save in zip(feats, figs, figs2, saves):

        xF = pd.read_csv(feat)

        print(xF.shape)

        # Gain ratio = mutual information / feature entropy.
        entropyV = np.array([entropy(xF[col].tolist()) for col in xF],
                            dtype=float)
        mi = mutual_info_classif(xF, y[column], random_state=1)
        # Zero-entropy (constant) features would give NaN/inf here, and
        # np.argsort orders those last, so they would be picked as the
        # "best" features. Give them a gain ratio of 0 instead.
        gr = np.divide(mi, entropyV,
                       out=np.zeros_like(mi, dtype=float),
                       where=entropyV > 0)

        indices = np.argsort(gr)  # ascending: best features are at the end

        names = xF.columns
        plt.figure(figsize=(25, 30))
        plt.title(column)
        plt.barh(range(len(indices)), gr[indices], color='g', align='center')
        plt.yticks(range(len(indices)), [names[i] for i in indices])
        plt.xlabel('Gain Ratio')
        plt.savefig(fig, format='svg', dpi=1200)
        plt.close()  # release the figure before creating the next one

        namesKeep = names[indices][-top_k:]
        xF1 = xF[namesKeep]

        # Correlation Between Variables
        correlation_matrix = xF1.corr()
        plt.figure(figsize=(25, 25))
        sns.heatmap(correlation_matrix, annot=True)
        plt.savefig(fig2, format='svg', dpi=1200)
        plt.close()

        # Visit each unordered pair once (i < j); for a highly correlated
        # pair the first column of the pair is marked for removal.
        kept = xF1.columns
        corr = []
        for i in range(len(kept)):
            for j in range(i + 1, len(kept)):
                corr_1 = np.abs(xF1[kept[i]].corr(xF1[kept[j]]))
                if corr_1 > corr_threshold:
                    print(kept[i], "is highly  correlated  with", kept[j], "->", corr_1)
                    # A column can clash with several others; list it once.
                    if kept[i] not in corr:
                        corr.append(kept[i])

        xF2 = xF1.drop(columns=corr)
        print(xF2.shape)
        xF2.to_csv(save, index=False)