# CV Metrics Distribution
# Author: Joaquín Torres Bravo
"""
    Plotting the distribution of the metrics obtained from CV via boxplots.
"""

# Libraries
# --------------------------------------------------------------------------------------------------------
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
# --------------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    # Script entry point.
    # For every (group, method) sheet of the CV metrics workbook, build one figure
    # with a boxplot panel per metric (one box per model) and save it as an SVG.
    # Raises KeyError when an expected '<model>_<metric>' row is absent from a sheet.

    metric_names = ['F1', 'PREC', 'REC', 'ACC', 'NREC', 'TN', 'FN', 'FP', 'TP', 'AUROC', 'AUPRC']
    model_names_simple = ['DT', 'RF', 'Bagging', 'AB', 'XGB', 'LR', 'SVM', 'MLP']
    model_names_cs = ['DT', 'RF', 'Bagging', 'AB', 'LR', 'SVM'] # Cost-sensitive learning
    # Metrics that take values in the interval [0, 1]; their y-axis is pinned to that range
    unit_interval_metrics = {'F1', 'PREC', 'REC', 'ACC', 'AUROC', 'AUPRC'}

    metrics_path = './output_cv_metrics/metrics.xlsx'
    output_dir = Path('./output/cv_metrics/distributions')
    # savefig does not create missing folders, so make sure the destination exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Distribution of cv metrics
    # --------------------------------------------------------------------------------------------------------
    # Open the workbook once instead of re-parsing the file for every sheet
    with pd.ExcelFile(metrics_path) as workbook:
        for group in ['pre', 'post']:
            for method in ['_ORIG', '_ORIG_CW', '_OVER', '_UNDER']:
                sheet_name = group + method
                # Read CV metrics sheet for current group and method
                df = pd.read_excel(workbook, sheet_name=sheet_name)
                # Model names based on cost-sensitive training or not
                model_names = model_names_cs if method == '_ORIG_CW' else model_names_simple
                # Create figure for current sheet, one row per metric
                fig, axes = plt.subplots(len(metric_names), 1, figsize=(10, 10 * len(metric_names)))
                # For each metric, paired with its own axis
                for ax, metric_name in zip(axes, metric_names):
                    # List to store the metric array for each model
                    metric_data = []
                    for model_name in model_names:
                        # Rows are identified by '<model>_<metric>' in the first (unnamed) column
                        row_name = f'{model_name}_{metric_name}'
                        matches = df.loc[df['Unnamed: 0'] == row_name]
                        if matches.empty:
                            # Fail with the offending row/sheet instead of an opaque positional IndexError
                            raise KeyError(
                                f"Row '{row_name}' not found in sheet '{sheet_name}' of {metrics_path}"
                            )
                        # Drop the label column, keep the remaining values of the first matching row
                        metric_data.append(matches.iloc[0, 1:].to_numpy())
                    # Plot boxplot for the current metric across all models
                    # NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
                    # kept as-is for compatibility with the version this script was written for.
                    ax.boxplot(metric_data, labels=model_names)
                    ax.set_title(f'{metric_name} for {group}{method}')
                    # Set y-axis limits for metrics that take values in the interval [0, 1]
                    if metric_name in unit_interval_metrics:
                        ax.set_ylim(0, 1)
                # Lay out this specific figure rather than relying on the implicit current figure
                fig.tight_layout()
                fig.savefig(output_dir / f'{sheet_name}.svg', format='svg', dpi=600)
                # Release the figure so memory does not accumulate across sheets
                plt.close(fig)

    print("Succesful distribution plots generation")
    # --------------------------------------------------------------------------------------------------------