<div style="width: 100%; clear: both;">
<div style="float: left; width: 50%;">
<img src="https://www.upm.es/sfs/Rectorado/Gabinete%20del%20Rector/Logos/UPM/Logotipo%20con%20Leyenda/LOGOTIPO%20leyenda%20color%20PNG.png", align="left">
</div>
<div style="float: right; width: 50%;">
<p style="margin: 0; padding-top: 100px; text-align:right;">May 2022</p>
<p style="margin: 0; text-align:right;">ML Depresión</p>
<p style="margin: 0; text-align:right; padding-button: 100px;">Lucía Prieto Santamaría</p>
</div>
</div>
<div style="width:100%;">&nbsp;</div>

# ML models for Twitter Depresion datasets sampling after split and with tunning of hyperparameters

Import libraries for data processing and visualizing 

In [1]:
import pandas as pd
import numpy as np
import itertools


#from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn import metrics

import matplotlib.pyplot as plt

Import dataframes

| Dataset | NUM vs AVG-PERC | Time variables | PCA |
|:-:|:-:|:-:|:-:|
| **df0** | Both | Original | No |
|  |  |  |  |
| **df1** | NUM | Original | No |
| **df2** | PERC/AVG | Original | No |
| **df3** | NUM | Grouped | No |
| **df4** | PERC/AVG | Grouped | No |
| **df5** | NUM | Original | Yes |
| **df6** | PERC/AVG | Original | Yes |
| **df7** | NUM | Grouped | Yes |
| **df8** | PERC/AVG | Grouped | Yes |

In [2]:
df0 = pd.read_csv('datasets/d0.csv', sep=';')
df1 = pd.read_csv('datasets/d1.csv', sep=';')
df2 = pd.read_csv('datasets/d2.csv', sep=';')
df3 = pd.read_csv('datasets/d3.csv', sep=';')
df4 = pd.read_csv('datasets/d4.csv', sep=';')
df5 = pd.read_csv('datasets/d5.csv', sep=';')
df6 = pd.read_csv('datasets/d6.csv', sep=';')
df7 = pd.read_csv('datasets/d7.csv', sep=';')
df8 = pd.read_csv('datasets/d8.csv', sep=';')

In [3]:
datasets = {'d_original': df0,
            'd_num_to': df1,
            'd_perc_to': df2,
            'd_num_tg': df3,
            'd_perc_tg': df4,
            'd_num_to_pca': df5,
            'd_perc_to_pca': df6,
            'd_num_tg_pca': df7,
            'd_perc_tg_pca': df8}

In [4]:
for d in datasets.keys():
    datasets[d].loc[datasets[d]['GROUP'] == 'DEPRESSIVE','GROUP'] = 1
    datasets[d].loc[datasets[d]['GROUP'] == 'CONTROL','GROUP'] = 0
    datasets[d]['GROUP'] = datasets[d]['GROUP'].astype('int')

In [5]:
data = {'d_original': dict(),
        'd_num_to': dict(),
        'd_perc_to': dict(),
        'd_num_tg': dict(),
        'd_perc_tg': dict(),
        'd_num_to_pca': dict(),
        'd_perc_to_pca': dict(),
        'd_num_tg_pca': dict(),
        'd_perc_tg_pca': dict()}

for d in data.keys():
    data[d]['data'], data[d]['target'] = datasets[d].drop(datasets[d].columns[-1], axis=1).to_numpy(), datasets[d].GROUP.to_numpy()
    data[d]['X_train'], data[d]['X_test'], data[d]['y_train'], data[d]['y_test'] = train_test_split(data[d]['data'], 
                                                                                                    data[d]['target'], 
                                                                                                    test_size = 0.10, 
                                                                                                    stratify=data[d]['target'])

### Tunning hyperparameters

In [6]:
R = 12345

Sampling after splitting

In [7]:
oversample_smote = SMOTE(sampling_strategy = 'minority', 
                         random_state=R)

oversample_adasyn = ADASYN(sampling_strategy = 'minority', 
                           random_state=R)

In [8]:
params_grid = [{
                'sampling':[None, oversample_smote, oversample_adasyn],
                'estimator':[SVC()],
                'estimator__C': [100, 10, 1.0, 0.1, 0.001],
                'estimator__gamma': [0.001, 0.0001],
                'estimator__kernel': ['poly', 'rbf', 'sigmoid'],
                },
                {
                'sampling':[None, oversample_smote, oversample_adasyn],
                'estimator': [DecisionTreeClassifier()],
                'estimator__max_depth': [1,2,3,4,5],
                'estimator__max_features': [None, "auto", "sqrt", "log2"],
                },
                {
                 'sampling':[None, oversample_smote, oversample_adasyn],
                 'estimator': [RandomForestClassifier()],
                 'estimator__n_estimators':[100, 150, 200],
                 'estimator__criterion':["gini", "entropy"],
                 'estimator__max_depth' : [3, 4, 5]},
                {
                 'sampling':[None, oversample_smote, oversample_adasyn],
                 'estimator': [AdaBoostClassifier()],
                 'estimator__n_estimators':[20, 30, 40],
                 'estimator__learning_rate':[0.1, 0.5, 1]},
                {
                'sampling':[None, oversample_smote, oversample_adasyn],
                'estimator': [BaggingClassifier()],
                'estimator__n_estimators':[10, 100, 1000],
                'estimator__max_samples' : [0.05, 0.1, 0.2, 0.5]},
                {
                'sampling':[None, oversample_smote, oversample_adasyn],
                'estimator': [XGBClassifier(random_state=R, verbosity = 0)],
                'estimator__max_depth':[2, 3, 5, 7, 10],
                'estimator__n_estimators':[10, 100, 500],
                },
                {
                 'sampling':[None, oversample_smote, oversample_adasyn],
                 'estimator': [MLPClassifier(max_iter=100)],
                 'estimator__solver': ['sgd', 'adam'],
                 'estimator__learning_rate': ['constant','adaptive'],
                 'estimator__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
                 'estimator__activation': ['tanh', 'relu'],
                 'estimator__alpha': [0.0001, 0.05],
                 'estimator__early_stopping': [True, False]
                },

              ]

### Grid search CV

In [9]:
scoring_metrics = ['accuracy', 
                   'recall', 
                   'precision',
                   'f1', 
                   'roc_auc']

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [11]:
for d in data.keys():
    
    print('Tunning hyperparameters for: ', d)
    
    if d in ['d_num_to_pca', 'd_perc_to_pca', 'd_num_tg_pca', 'd_perc_tg_pca']:
        pipe = Pipeline(steps=[
                ('sampling', None),
                ('estimator', SVC())])
        grid = GridSearchCV(pipe, 
                            params_grid, 
                            scoring=scoring_metrics, 
                            cv=10,
                            n_jobs=-1,
                            refit='accuracy')
        grid.fit(data[d]['X_train'], data[d]['y_train'])
        data[d]['cv_results'] = pd.DataFrame(grid.cv_results_)
   
    else:
        pipe = Pipeline(steps=[
                ('sampling', None),
                ('scaler', StandardScaler()),
                ('estimator', SVC())])
        grid = GridSearchCV(pipe, 
                            params_grid, 
                            scoring=scoring_metrics, 
                            cv=10,
                            n_jobs=-1,
                            refit='accuracy')
        grid.fit(data[d]['X_train'], data[d]['y_train'])
        data[d]['cv_results'] = pd.DataFrame(grid.cv_results_)

Tunning hyperparameters for:  d_original
Tunning hyperparameters for:  d_num_to
Tunning hyperparameters for:  d_perc_to
Tunning hyperparameters for:  d_num_tg
Tunning hyperparameters for:  d_perc_tg
Tunning hyperparameters for:  d_num_to_pca
Tunning hyperparameters for:  d_perc_to_pca
Tunning hyperparameters for:  d_num_tg_pca
Tunning hyperparameters for:  d_perc_tg_pca


In [12]:
results = dict()

for d in data.keys():
    results[d] = data[d]['cv_results']
    
for d, df in results.items():
    df['dataset'] = d

results_final_df = pd.concat(sorted(results.values(), key=lambda df: df['dataset'][0]), ignore_index=True)

In [13]:
results_final_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__C,param_estimator__gamma,param_estimator__kernel,param_sampling,param_estimator__max_depth,...,split4_test_roc_auc,split5_test_roc_auc,split6_test_roc_auc,split7_test_roc_auc,split8_test_roc_auc,split9_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,dataset
0,0.007182,0.000647,0.006731,0.000907,"SVC(C=100, gamma=0.001)",100,0.001,poly,,,...,1.000000,0.883333,0.915625,0.800000,0.931250,0.865625,0.896291,0.050239,373,d_num_tg
1,0.049600,0.014785,0.009935,0.000351,"SVC(C=100, gamma=0.001)",100,0.001,poly,"SMOTE(random_state=12345, sampling_strategy='m...",,...,0.875000,0.797222,0.762500,0.896875,0.934375,0.696875,0.836834,0.094692,429,d_num_tg
2,0.053334,0.022372,0.010235,0.000578,"SVC(C=100, gamma=0.001)",100,0.001,poly,"ADASYN(random_state=12345, sampling_strategy='...",,...,0.756098,0.833333,0.740625,0.906250,0.878125,0.675000,0.817663,0.087314,441,d_num_tg
3,0.009854,0.007383,0.008447,0.003213,"SVC(C=100, gamma=0.001)",100,0.001,rbf,,,...,0.984756,1.000000,0.828125,1.000000,0.996875,0.915625,0.966441,0.051869,67,d_num_tg
4,0.025010,0.004858,0.012894,0.002616,"SVC(C=100, gamma=0.001)",100,0.001,rbf,"SMOTE(random_state=12345, sampling_strategy='m...",,...,0.987805,0.994444,0.865625,0.971875,1.000000,0.912500,0.963774,0.040993,85,d_num_tg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,0.563792,0.081506,0.009973,0.004184,"MLPClassifier(alpha=0.05, hidden_layer_sizes=(...",,,,"SMOTE(random_state=12345, sampling_strategy='m...",,...,0.884146,0.991667,0.959375,0.978125,0.993750,0.928125,0.952177,0.031330,335,d_perc_to_pca
5396,0.521605,0.040524,0.007580,0.000798,"MLPClassifier(alpha=0.05, hidden_layer_sizes=(...",,,,"ADASYN(random_state=12345, sampling_strategy='...",,...,0.905488,0.944444,0.968750,0.959375,0.965625,0.900000,0.932661,0.026477,434,d_perc_to_pca
5397,0.372205,0.017851,0.006583,0.000662,"MLPClassifier(alpha=0.05, hidden_layer_sizes=(...",,,,,,...,0.911585,0.997222,0.993750,0.959375,0.990625,0.962500,0.977542,0.025811,19,d_perc_to_pca
5398,0.668791,0.079895,0.008411,0.002076,"MLPClassifier(alpha=0.05, hidden_layer_sizes=(...",,,,"SMOTE(random_state=12345, sampling_strategy='m...",,...,0.942073,0.997222,0.990625,0.956250,0.990625,0.965625,0.978754,0.018297,9,d_perc_to_pca


In [14]:
results_final_df.to_csv('results/20220322/all/hyperp_sampling_after.tsv', index=False, sep='\t')