Commit 20e6a30c authored by Lucia Prieto

Add FSS.py

parent 811d1c8a
import numpy as np
from math import log, e
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
def entropy(data, base=e):
    """Return the Shannon entropy of the empirical distribution of *data*.

    Every distinct value in *data* is treated as one class; its probability
    is its relative frequency.

    Parameters
    ----------
    data : sequence
        Observed values (e.g. a list of labels or one feature column).
    base : float, optional
        Base of the logarithm. Defaults to ``e`` (result in nats); pass 2
        for bits.

    Returns
    -------
    float
        ``-sum(p * log(p, base))`` over the class probabilities ``p``.
        0.0 when *data* has at most one element or only one distinct value.
    """
    n_labels = len(data)
    if n_labels <= 1:
        return 0.0

    # np.unique only reports values that occur, so every count is >= 1 and
    # the number of classes is simply the number of counts.
    _, counts = np.unique(data, return_counts=True)
    if counts.size <= 1:
        return 0.0

    probs = counts / n_labels
    return -sum(p * log(p, base) for p in probs)
def _gain_ratio(features, target):
    """Return the gain ratio of each column of *features* w.r.t. *target*.

    The gain ratio is the mutual information between a column and *target*
    (from ``mutual_info_classif``) divided by the column's own entropy.
    Columns whose entropy is 0 (``entropy`` returns 0 for a single distinct
    value) get a gain ratio of 0 instead of the NaN/inf a plain division
    would produce; NaN sorts last under ``np.argsort`` and would otherwise
    push such columns into the "best features" slice.
    """
    entropies = np.array(
        [entropy(features[col].tolist()) for col in features], dtype=float
    )
    info = mutual_info_classif(features, target, random_state=1)
    ratio = np.zeros(len(entropies), dtype=float)
    # Only divide where the denominator is positive; other slots keep 0.
    np.divide(info, entropies, out=ratio, where=entropies > 0)
    return ratio


def _drop_correlated(frame, corr_matrix, threshold):
    """Return *frame* without the weaker column of each highly correlated pair.

    *corr_matrix* is the correlation matrix of *frame* (``frame.corr()``).
    For every pair of columns (i, j) with i before j whose absolute
    correlation exceeds *threshold*, the pair is printed and column i is
    dropped. The caller passes columns sorted by ascending gain ratio, so
    the earlier column of a pair is the one with the lower gain ratio.
    """
    abs_corr = corr_matrix.abs()
    cols = frame.columns
    to_drop = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            strength = abs_corr.at[cols[i], cols[j]]
            if strength > threshold:
                print(cols[i], "is highly correlated with", cols[j], "->", strength)
                to_drop.append(cols[i])
    return frame.drop(columns=to_drop)


if __name__ == '__main__':
    # Parallel lists: input features, gain-ratio plot, correlation plot,
    # and output CSV for the plain and the clustered feature sets.
    feats = ["data/converted/feats.csv", "data/converted/featsCluster.csv"]
    figs = ["data/FSS/gainRatio.svg", "data/FSS/gainRatio_cluster.svg"]
    figs2 = ["data/FSS/corrMatrix.svg", "data/FSS/corrMatrix_cluster.svg"]
    saves = ["data/FSS/featsGR.csv", "data/FSS/featsGR_cluster.csv"]
    column = "Dropout_1"  # label column used as the classification target
    top_k = 20  # number of best-ranked features to keep
    corr_threshold = 0.75  # absolute correlation above which a feature is dropped

    y = pd.read_csv("data/converted/labels.csv")
    for feat, fig, fig2, save in zip(feats, figs, figs2, saves):
        xF = pd.read_csv(feat)
        print(xF.shape)

        # Rank features by gain ratio (argsort is ascending: best last).
        gr = _gain_ratio(xF, y[column])
        indices = np.argsort(gr)
        names = xF.columns

        gr_figure = plt.figure(figsize=(25, 30))
        plt.title(column)
        plt.barh(range(len(indices)), gr[indices], color='g', align='center')
        plt.yticks(range(len(indices)), [names[i] for i in indices])
        plt.xlabel('Gain Ratio')
        plt.savefig(fig, format='svg', dpi=1200)
        # Close the figure so it is not kept alive across loop iterations.
        plt.close(gr_figure)

        # Keep the top_k features, still ordered by ascending gain ratio.
        xF1 = xF[names[indices][-top_k:]]

        # Correlation between the retained features.
        correlation_matrix = xF1.corr()
        corr_figure = plt.figure(figsize=(25, 25))
        sns.heatmap(correlation_matrix, annot=True)
        plt.savefig(fig2, format='svg', dpi=1200)
        plt.close(corr_figure)

        xF2 = _drop_correlated(xF1, correlation_matrix, corr_threshold)
        print(xF2.shape)
        xF2.to_csv(save, index=False)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment