# Provenance: extracted from git patch 20e6a30cc749922be9e30aaf8f897233a13419eb
# ("[PATCH] Add FSS.py", Lucia Prieto, Mon, 23 Oct 2023 16:00:57 +0000),
# which created code/FSS.py.
"""Feature subset selection: rank features by gain ratio, then prune
highly correlated ones.

For every feature table the script
  1. scores each column with gain ratio (mutual information with the label
     divided by the column's own entropy),
  2. plots the ranking,
  3. keeps the TOP_K best-ranked columns,
  4. plots their correlation matrix, and
  5. drops every kept column that is strongly correlated with a
     better-ranked one before writing the result to CSV.
"""
from math import log, e

import numpy as np
import pandas as pd

# Number of best-ranked features kept after the gain-ratio ranking.
TOP_K = 20
# Absolute Pearson correlation above which two features count as redundant.
CORRELATION_THRESHOLD = 0.75


def entropy(data):
    """Return the Shannon entropy (in nats) of the values in *data*.

    The probability of each value is its relative frequency among the
    exact distinct values found by ``np.unique``.

    Args:
        data: Sequence of labels / feature values.

    Returns:
        0 when *data* has at most one element or only one distinct value,
        otherwise the entropy as a float.
    """
    n_labels = len(data)
    if n_labels <= 1:
        return 0

    _, counts = np.unique(data, return_counts=True)
    if len(counts) <= 1:
        return 0

    ent = 0.
    for prob in counts / n_labels:
        # Base e -> the result is expressed in nats.
        ent -= prob * log(prob, e)
    return ent


def gain_ratio(mutual_info, entropies):
    """Return ``mutual_info / entropies`` element-wise, with 0 where entropy is 0.

    A feature with zero entropy is constant and carries no information.
    A plain division would yield ``inf``/``nan`` for it, and because
    ``np.argsort`` places those values last, such features would end up
    ranked as the *best* ones.

    Args:
        mutual_info: Mutual information of each feature with the target.
        entropies: Entropy of each feature, same length as *mutual_info*.

    Returns:
        ``np.ndarray`` of finite gain ratios.
    """
    mutual_info = np.asarray(mutual_info, dtype=float)
    entropies = np.asarray(entropies, dtype=float)
    ratio = np.zeros_like(mutual_info)
    # Positions where the mask is False keep the 0.0 already stored in `ratio`.
    np.divide(mutual_info, entropies, out=ratio, where=entropies > 0)
    return ratio


def highly_correlated_columns(correlation_matrix,
                              threshold=CORRELATION_THRESHOLD):
    """List the columns that are redundant with a later column.

    Walks the upper triangle of *correlation_matrix*; whenever the absolute
    correlation of a pair exceeds *threshold* the pair is printed and the
    earlier column of the pair is marked for removal.

    Args:
        correlation_matrix: Square ``DataFrame`` as returned by
            ``DataFrame.corr()``.
        threshold: Absolute correlation above which a pair is redundant.

    Returns:
        Column names to drop, without duplicates, in column order.
    """
    names = list(correlation_matrix.columns)
    strength = np.abs(correlation_matrix.to_numpy())
    redundant = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            # NaN correlations (e.g. constant columns) compare False here.
            if strength[i, j] > threshold:
                print(first, "is highly correlated with", names[j],
                      "->", strength[i, j])
                if first not in redundant:
                    redundant.append(first)
    return redundant


def main():
    """Run gain-ratio ranking and correlation pruning for each feature table."""
    # Imported here so the numeric helpers above can be imported without
    # the plotting / scikit-learn stack being installed.
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.feature_selection import mutual_info_classif

    feats = ["data/converted/feats.csv", "data/converted/featsCluster.csv"]
    figs = ["data/FSS/gainRatio.svg", "data/FSS/gainRatio_cluster.svg"]
    figs2 = ["data/FSS/corrMatrix.svg", "data/FSS/corrMatrix_cluster.svg"]
    saves = ["data/FSS/featsGR.csv", "data/FSS/featsGR_cluster.csv"]

    column = "Dropout_1"
    y = pd.read_csv("data/converted/labels.csv")

    for feat, fig, fig2, save in zip(feats, figs, figs2, saves):
        xF = pd.read_csv(feat)
        print(xF.shape)

        # Gain ratio = mutual information with the label / feature entropy.
        entropyV = [entropy(xF[col].tolist()) for col in xF]
        mutual_info = mutual_info_classif(xF, y[column], random_state=1)
        gr = gain_ratio(mutual_info, entropyV)

        # Ascending order: the best features are at the end.
        indices = np.argsort(gr)
        names = xF.columns

        plt.figure(figsize=(25, 30))
        plt.title(column)
        plt.barh(range(len(indices)), gr[indices], color='g', align='center')
        plt.yticks(range(len(indices)), [names[i] for i in indices])
        plt.xlabel('Gain Ratio')
        plt.savefig(fig, format='svg', dpi=1200)
        plt.close()  # release the figure; otherwise it stays open per loop

        namesKeep = names[indices][-TOP_K:]
        xF1 = xF[namesKeep]

        # Correlation between the retained variables.
        correlation_matrix = xF1.corr()
        plt.figure(figsize=(25, 25))
        sns.heatmap(correlation_matrix, annot=True)
        plt.savefig(fig2, format='svg', dpi=1200)
        plt.close()

        # Columns are ordered by ascending gain ratio, so the column dropped
        # from a correlated pair is the lower-ranked one.
        corr = highly_correlated_columns(correlation_matrix)

        xF2 = xF1.drop(columns=corr)
        print(xF2.shape)
        xF2.to_csv(save, index=False)


if __name__ == '__main__':
    main()