"""Gain-ratio feature selection followed by a pairwise-correlation filter.

For every feature table the script:

1. scores each column by gain ratio (mutual information with the target
   divided by the column's own entropy) and saves a bar chart of the scores;
2. keeps the ``TOP_K`` highest-scoring columns and saves a heat map of their
   correlation matrix;
3. drops one column from every pair whose absolute correlation exceeds
   ``CORR_THRESHOLD`` and writes the remaining columns to CSV.
"""
import numpy as np
from math import log, e
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif

# Number of best-ranked features kept after the gain-ratio ranking.
TOP_K = 20
# Absolute correlation above which two kept features count as redundant.
CORR_THRESHOLD = 0.75


def entropy(data, base=e):
    """Return the Shannon entropy of the empirical distribution of *data*.

    Args:
        data: Sequence of observed values; equal values form one class.
        base: Logarithm base (defaults to ``e``, i.e. the result is in nats).

    Returns:
        The entropy as a float. It is 0 when *data* has at most one element
        or contains a single distinct value.
    """
    n_labels = len(data)
    if n_labels <= 1:
        return 0.0

    _, counts = np.unique(data, return_counts=True)
    # Every count returned by np.unique is >= 1, so the number of classes is
    # simply the number of distinct values.
    if len(counts) <= 1:
        return 0.0

    probs = counts / n_labels
    return float(-sum(p * log(p, base) for p in probs))


def _gain_ratio(features, target, random_state=1):
    """Return the gain ratio of every column of *features* w.r.t. *target*.

    Columns with zero entropy (constant columns) get a gain ratio of 0.
    Dividing by their entropy would yield NaN/inf, which ``np.argsort``
    places last, so they would wrongly be ranked as the best features.
    """
    entropies = np.array(
        [entropy(features[col].tolist()) for col in features], dtype=float
    )
    info_gain = np.asarray(
        mutual_info_classif(features, target, random_state=random_state),
        dtype=float,
    )
    ratio = np.zeros_like(info_gain)
    # Entries where the condition is False keep the 0 already stored in `ratio`.
    np.divide(info_gain, entropies, out=ratio, where=entropies > 0)
    return ratio


def _plot_gain_ratio(names, ratios, order, title, path):
    """Save a horizontal bar chart of *ratios* arranged by *order* to *path*."""
    figure = plt.figure(figsize=(25, 30))
    plt.title(title)
    positions = range(len(order))
    plt.barh(positions, ratios[order], color='g', align='center')
    plt.yticks(positions, [names[i] for i in order])
    plt.xlabel('Gain Ratio')
    plt.savefig(path, format='svg', dpi=1200)
    # Release the figure; otherwise every loop iteration keeps one alive.
    plt.close(figure)


def _plot_correlation(correlation_matrix, path):
    """Save an annotated heat map of *correlation_matrix* to *path*."""
    figure = plt.figure(figsize=(25, 25))
    sns.heatmap(correlation_matrix, annot=True)
    plt.savefig(path, format='svg', dpi=1200)
    plt.close(figure)


def _correlated_columns(correlation_matrix, threshold=CORR_THRESHOLD):
    """Return the columns to drop because of a highly correlated partner.

    For every pair ``(i, j)`` with ``i < j`` whose absolute correlation is
    strictly greater than *threshold*, the pair is reported on stdout and
    column ``i`` (the earlier one) is marked for removal. Pairs whose
    correlation is NaN never exceed the threshold and are ignored.
    """
    columns = correlation_matrix.columns
    abs_corr = correlation_matrix.abs().to_numpy()
    # k=1 restricts the mask to the strict upper triangle, i.e. i < j.
    # np.nonzero walks it in row-major order (i ascending, then j ascending).
    rows, cols = np.nonzero(np.triu(abs_corr > threshold, k=1))

    to_drop = []
    for i, j in zip(rows, cols):
        print(columns[i], "is highly correlated with", columns[j],
              "->", abs_corr[i, j])
        if columns[i] not in to_drop:
            to_drop.append(columns[i])
    return to_drop


def main():
    """Run the selection pipeline for each configured feature table."""
    feats = ["data/converted/feats.csv", "data/converted/featsCluster.csv"]
    figs = ["data/FSS/gainRatio.svg", "data/FSS/gainRatio_cluster.svg"]
    figs2 = ["data/FSS/corrMatrix.svg", "data/FSS/corrMatrix_cluster.svg"]
    saves = ["data/FSS/featsGR.csv", "data/FSS/featsGR_cluster.csv"]
    column = "Dropout_1"

    y = pd.read_csv("data/converted/labels.csv")

    for feat, fig, fig2, save in zip(feats, figs, figs2, saves):
        xF = pd.read_csv(feat)
        print(xF.shape)

        # Gain ratio = mutual information / feature entropy.
        gr = _gain_ratio(xF, y[column], random_state=1)
        indices = np.argsort(gr)  # ascending: best features come last
        names = xF.columns
        _plot_gain_ratio(names, gr, indices, column, fig)

        # Keep the TOP_K best features, still ordered by ascending gain ratio.
        namesKeep = names[indices][-TOP_K:]
        xF1 = xF[namesKeep]

        # Correlation between the kept variables.
        correlation_matrix = xF1.corr()
        _plot_correlation(correlation_matrix, fig2)

        # Because xF1 is ordered by ascending gain ratio, the column dropped
        # from a correlated pair is the one with the lower gain ratio.
        xF2 = xF1.drop(columns=_correlated_columns(correlation_matrix))
        print(xF2.shape)
        xF2.to_csv(save, index=False)


if __name__ == '__main__':
    main()