Source code for ugtm.ugtm_crossvalidate

"""Cross-validation support for GTC and GTR models (also SVM and PCA).
"""
# Authors: Helena A. Gaspar <hagax8@gmail.com>
# License: MIT

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.svm import SVR
import scipy.stats as st
from scipy.stats import t
from . import ugtm_predictions
from . import ugtm_preprocess



[docs]
def crossvalidateGTC(data, labels, k=16, m=4, s=-1.0, regul=1.0,
                     n_neighbors=1, niter=200,
                     representation="modes",
                     doPCA=False, n_components=-1,
                     missing=False, missing_strategy='median',
                     random_state=1234, predict_mode="bayes",
                     prior="estimated",
                     n_folds=5, n_repetitions=10):
    r"""Cross-validate GTC model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    k : int, optional (default = 16)
        If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2.
        k is the sqrt of the number of GTM nodes.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: k = 25 means the GTM will be discretized into a 25x25 grid.
    m : int, optional (default = 4)
        If m is set to 0, m is computed as sqrt(k).
        (generally good rule of thumb).
        m is the qrt of the number of RBF centers.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid.
    s : float, optional (default = -1)
        RBF width factor. Default (-1) is to try different values.
        Parameter to tune width of RBF functions.
        Impacts manifold flexibility.
    regul : float, optional (default = -1)
        Regularization coefficient. Default (-1) is to try different values.
        Impacts manifold flexibility.
    n_neighbors : int, optional (default = 1)
        Number of neighbors for kNN algorithm (number of nearest nodes).
        At the moment, n_neighbors for GTC is always equal to 1.
    niter : int, optional (default = 200)
        Number of iterations for EM algorithm.
    representation : {"modes", "means"}
        2D GTM representation for the test set, used for kNN algorithms:
        "modes" for position with max. responsibility,
        "means" for average position (usual GTM representation)
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    predict_mode : {"bayes", "knn"}, optional
        Choose between nearest node algorithm
        ("knn", output of :func:`~ugtm.ugtm_predictions.predictNN`)
        or GTM Bayes classifier
        ("bayes", output of :func:`~ugtm.ugtm_predictions.predictBayes`).
        NB: the kNN algorithm is limited to only 1 nearest node at the moment
        (n_neighbors = 1).
    prior : {"estimated", "equiprobable"}, optional
        Type of prior used to build GTM class map
        (:func:`~ugtm.ugtm_landscape.classMap`).
        Choose "estimated" to account for class imbalance.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "regul = regularization, s = RBF width factor")
    print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: %s" % (nClasses))
    print("")
    print("model\tparameters=k:m:s:regul\t"
          "recall with CI\tprecision with CI\tF1-score with CI")
    print("")
    if k == 0:
        k = int(np.sqrt(5*np.sqrt(data.shape[0])))+2
    if m == 0:
        m = int(np.sqrt(k))
    if n_components == -1 and doPCA:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%% of "
              "the variance in whole data set = %s\n" %
              n_components)
    if regul < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [regul]
    if s <= 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = -9999
    nummodel = 0
    savemodel = ""
    for s in svec:
        for regul in lvec:
            modelstring = str(k)+':'+str(m)+":"+str(s)+":"+str(regul)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTC(train=train,
                                                      labels=labels[train_index],
                                                      test=test, k=k,
                                                      m=m, s=s, regul=regul,
                                                      n_neighbors=n_neighbors,
                                                      niter=niter,
                                                      representation=representation,
                                                      doPCA=doPCA,
                                                      n_components=n_components,
                                                      random_state=random_state,
                                                      missing=missing,
                                                      missing_strategy=missing_strategy,
                                                      predict_mode=predict_mode,
                                                      prior=prior)
                    y_true = np.append(y_true, labels[test_index])
                    y_pred = np.append(y_pred, prediction)
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(
                    y_true, y_pred, average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if(j == 0):
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(
                precisionvec), st.sem(precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t._ppf((1+0.95)/2., len(recallvec)-1)
            hprecision = seprecision * \
                t._ppf((1+0.95)/2., len(precisionvec)-1)
            hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1)
            if(meanf1 > savemean):
                savemean = meanf1
                savemodel = "Model "+str(nummodel)
            for i in range(0, nClasses):
                meanclass[i] = np.mean(recallclassvec[:, i])
                seclass[i] = st.sem(recallclassvec[:, i])
                meanf1class[i] = np.mean(f1classvec[:, i])
                sef1class[i] = st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
                hprecisionclass[i] = seprecisionclass[i] \
                    * t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1)
                hf1class[i] = sef1class[i] * \
                    t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (nummodel, modelstring, mean, h,
                     meanprecision, hprecision, meanf1, hf1))
            for i in range(nClasses):
                print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                      % (uniqClasses[i], modelstring, meanclass[i],
                         hclass[i], meanprecisionclass[i],
                         hprecisionclass[i], meanf1class[i], hf1class[i]))
            print('')

    print('')
    print("########best GTC model##########")
    print(savemodel)
    print("")




[docs]
def crossvalidateGTR(data, labels, k=16, m=4, s=-1, regul=-1,
                     n_neighbors=1, niter=200, representation="modes",
                     doPCA=False, n_components=-1,
                     missing=False, missing_strategy='median',
                     random_state=1234, n_folds=5, n_repetitions=10):
    r"""Cross-validate GTR model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    k : int, optional (default = 16)
        If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2.
        k is the sqrt of the number of GTM nodes.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: k = 25 means the GTM will be discretized into a 25x25 grid.
    m : int, optional (default = 4)
        If m is set to 0, m is computed as sqrt(k).
        (generally good rule of thumb).
        m is the qrt of the number of RBF centers.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid.
    s : float, optional (default = -1)
        RBF width factor. Default (-1) is to try different values.
        Parameter to tune width of RBF functions.
        Impacts manifold flexibility.
    regul : float, optional (default = -1)
        Regularization coefficient. Default (-1) is to try different values.
        Impacts manifold flexibility.
    n_neighbors : int, optional (default = 1)
        Number of neighbors for kNN algorithm (number of nearest nodes).
    niter : int, optional (default = 200)
        Number of iterations for EM algorithm.
    representation : {"modes", "means"}
        2D GTM representation for the test set, used for kNN algorithms:
        "modes" for position with max. responsibility,
        "means" for average position (usual GTM representation)
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    print("")
    print("k = sqrt(grid size), m = sqrt(radial basis function grid size), "
          "regul = regularization, s = RBF width factor")
    print("")
    if k == 0:
        k = int(np.sqrt(5*np.sqrt(data.shape[0])))+2
    if m == 0:
        m = int(np.sqrt(k))
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%% of the variance = %s\n"
              % n_components)
    if regul < 0.0:
        lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    else:
        lvec = [regul]
    if s <= 0.0:
        svec = [0.25, 0.5, 1.0, 1.50, 2.0]
    else:
        svec = [s]
    savemean = 999999999
    saveh = 0.0
    modelvec = ""
    savemeanr2 = 0.0
    savehr2 = 0.0
    print("k:m:s:regul\tRMSE with CI\tR2 with CI\t")
    for s in svec:
        for regul in lvec:
            modelstring = str(s)+":"+str(regul)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    prediction = ugtm_predictions.GTR(train=train,
                                                      labels=labels[train_index],
                                                      test=test, k=k,
                                                      m=m, s=s, regul=regul,
                                                      n_neighbors=n_neighbors,
                                                      niter=niter,
                                                      representation=representation,
                                                      doPCA=doPCA,
                                                      n_components=n_components,
                                                      random_state=random_state,
                                                      missing=missing,
                                                      missing_strategy=missing_strategy)
                    y_pred = np.append(y_pred, prediction)
                    y_true = np.append(y_true, labels[test_index])
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t._ppf((1.0+0.95)/2., len(rmsevec)-1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t._ppf((1.0+0.95)/2., len(r2vec)-1)
            if(mean < savemean):
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (str(k)+':'+str(m)+':'+modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best GTR model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (str(k)+':'+str(m)+':'+modelvec,
             savemean, saveh, savemeanr2, savehr2))
    print("")




[docs]
def crossvalidatePCAC(data, labels, n_neighbors=1, maxneighbours=11,
                      doPCA=False, n_components=-1, missing=False,
                      missing_strategy='median', random_state=1234,
                      n_folds=5, n_repetitions=10):
    r"""Cross-validate PCA kNN classification model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    n_neighbors : int, optional (default = 1)
        Number of neighbors for kNN algorithm (number of nearest nodes).
    max_neighbors : int, optional (default = 11)
        The function crossvalidates kNN models with k between n_neighbors
        and max_neighbors.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components "
              "explaining 80%% of the variance = %s\n"
              % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=k_for_kNN\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]

    savemean = -9999
    nummodel = 0
    savemodel = ""
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        recallvec = []
        precisionvec = []
        f1vec = []
        recallclassvec = np.array([])
        precisionclassvec = np.array([])
        f1classvec = np.array([])
        meanclass = np.zeros(nClasses)
        meanprecisionclass = np.zeros(nClasses)
        meanf1class = np.zeros(nClasses)
        seclass = np.zeros(nClasses)
        seprecisionclass = np.zeros(nClasses)
        sef1class = np.zeros(nClasses)
        hclass = np.zeros(nClasses)
        hprecisionclass = np.zeros(nClasses)
        hf1class = np.zeros(nClasses)
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(train,
                                                             test, doPCA,
                                                             n_components,
                                                             missing,
                                                             missing_strategy)
                y_pred = np.append(y_pred, ugtm_predictions.predictNNSimple(
                    processed.train,
                    processed.test,
                    labels[train_index],
                    c,
                    "classification"))
                y_true = np.append(y_true, labels[test_index])
            recall = recall_score(y_true, y_pred, average='weighted')
            precision = precision_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            recallvec = np.append(recallvec, recall)
            precisionvec = np.append(precisionvec, precision)
            f1vec = np.append(f1vec, f1)
            recallclass = recall_score(y_true, y_pred, average=None)
            precisionclass = precision_score(y_true, y_pred, average=None)
            f1class = f1_score(y_true, y_pred, average=None)
            if(j == 0):
                recallclassvec = recallclass
                precisionclassvec = precisionclass
                f1classvec = f1class
            else:
                recallclassvec = np.vstack([recallclassvec, recallclass])
                precisionclassvec = np.vstack(
                    [precisionclassvec, precisionclass])
                f1classvec = np.vstack([f1classvec, f1class])
        mean, se = np.mean(recallvec), st.sem(recallvec)
        meanprecision, seprecision = np.mean(
            precisionvec), st.sem(precisionvec)
        meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
        h = se * t._ppf((1+0.95)/2., len(recallvec)-1)
        hprecision = seprecision * t._ppf((1+0.95)/2., len(precisionvec)-1)
        hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1)
        if(meanf1 > savemean):
            savemean = meanf1
            savemodel = "Model "+str(nummodel)
        for i in range(0, nClasses):
            meanclass[i] = np.mean(recallclassvec[:, i])
            seclass[i] = st.sem(recallclassvec[:, i])
            meanf1class[i] = np.mean(f1classvec[:, i])
            sef1class[i] = st.sem(f1classvec[:, i])
            meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
            seprecisionclass[i] = st.sem(precisionclassvec[:, i])
            hclass[i] = seclass[i] * \
                t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
            hprecisionclass[i] = seprecisionclass[i] \
                * t._ppf((1+0.95)/2.,
                         len(precisionclassvec[:, i])-1)
            hf1class[i] = sef1class[i] * \
                t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)

        print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
              % (nummodel, modelstring,
                 mean, h, meanprecision, hprecision, meanf1, hf1))
        for i in range(nClasses):
            print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                      meanprecisionclass[i], hprecisionclass[i],
                      meanf1class[i], hf1class[i]))
        print('')
    print('')
    print("########best nearest neighbors model##########")
    print(savemodel)
    print("")




[docs]
def crossvalidatePCAR(data, labels, n_neighbors=1,
                      maxneighbours=11, doPCA=False,
                      n_components=-1, missing=False,
                      missing_strategy='median',
                      random_state=1234,
                      n_folds=5, n_repetitions=10):
    r"""Cross-validate PCA kNN regression model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    n_neighbors : int, optional (default = 1)
        Number of neighbors for kNN algorithm (number of nearest nodes).
    max_neighbors : int, optional (default = 11)
        The function crossvalidates kNN models with k between n_neighbors
        and max_neighbors.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%% of the variance = %s\n"
              % n_components)
    print("")
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    if n_neighbors <= 0:
        Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32)
    else:
        Kvec = [n_neighbors]

    modelvec = ""
    savemean = 99999
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    nummodel = 0
    print("k = number of nearest neighbours\tRMSE with CI\tR2 with CI\t")
    for c in Kvec:
        nummodel += 1
        modelstring = str(c)
        rmsevec = []
        r2vec = []
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(train, test,
                                                             doPCA,
                                                             n_components,
                                                             missing,
                                                             missing_strategy)
                y_pred = np.append(y_pred, ugtm_predictions.predictNNSimple(
                    processed.train,
                    processed.test,
                    labels[train_index],
                    c, "regression"))
                y_true = np.append(y_true, labels[test_index])
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)
            rmsevec = np.append(rmsevec, rmse)
            r2vec = np.append(r2vec, r2)
        mean, se = np.mean(rmsevec), st.sem(rmsevec)
        h = se * t._ppf((1+0.95)/2., len(rmsevec)-1)
        meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
        hr2 = ser2 * t._ppf((1+0.95)/2., len(r2vec)-1)
        if(mean < savemean):
            savemean = mean
            saveh = h
            modelvec = modelstring
            savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
            savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1)
        print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
              % (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best nearest neighbors model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")




[docs]
def crossvalidateSVC(data, labels, C=1.0, doPCA=False, n_components=-1,
                     missing=False,
                     missing_strategy='median',
                     random_state=1234, n_folds=5, n_repetitions=10):
    r"""Cross-validate SVC model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    C : float, optional (default = 1.0)
        SVM regularization parameter.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """

    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%% of the variance = %s\n"
              % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=C\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    if C < 0.0:
        Cvec = np.power(2, np.arange(
            start=-5, stop=15, step=1, dtype=float))
    else:
        Cvec = [C]
    savemean = -9999
    nummodel = 0
    savemodel = ""
    for C in Cvec:
        modelstring = str(C)
        nummodel += 1
        recallvec = []
        precisionvec = []
        f1vec = []
        recallclassvec = np.array([])
        precisionclassvec = np.array([])
        f1classvec = np.array([])
        meanclass = np.zeros(nClasses)
        meanprecisionclass = np.zeros(nClasses)
        meanf1class = np.zeros(nClasses)
        seclass = np.zeros(nClasses)
        seprecisionclass = np.zeros(nClasses)
        sef1class = np.zeros(nClasses)
        hclass = np.zeros(nClasses)
        hprecisionclass = np.zeros(nClasses)
        hf1class = np.zeros(nClasses)
        for j in range(n_repetitions):
            ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
            y_true = []
            y_pred = []
            for train_index, test_index in ss.split(data):
                train = np.copy(data[train_index])
                test = np.copy(data[test_index])
                processed = ugtm_preprocess.processTrainTest(train, test,
                                                             doPCA,
                                                             n_components,
                                                             missing,
                                                             missing_strategy)
                clf = SVC(kernel='linear', C=C)
                clf.fit(processed.train, labels[train_index])
                y_pred = np.append(y_pred, clf.predict(processed.test))
                y_true = np.append(y_true, labels[test_index])
            recall = recall_score(y_true, y_pred, average='weighted')
            precision = precision_score(y_true, y_pred, average='weighted')
            f1 = f1_score(y_true, y_pred, average='weighted')
            recallvec = np.append(recallvec, recall)
            precisionvec = np.append(precisionvec, precision)
            f1vec = np.append(f1vec, f1)
            recallclass = recall_score(y_true, y_pred, average=None)
            precisionclass = precision_score(y_true, y_pred, average=None)
            f1class = f1_score(y_true, y_pred, average=None)
            if(j == 0):
                recallclassvec = recallclass
                precisionclassvec = precisionclass
                f1classvec = f1class
            else:
                recallclassvec = np.vstack([recallclassvec, recallclass])
                precisionclassvec = np.vstack(
                    [precisionclassvec, precisionclass])
                f1classvec = np.vstack([f1classvec, f1class])
        mean, se = np.mean(recallvec), st.sem(recallvec)
        meanprecision, seprecision = np.mean(
            precisionvec), st.sem(precisionvec)
        meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
        h = se * t._ppf((1+0.95)/2., len(recallvec)-1)
        hprecision = seprecision * t._ppf((1+0.95)/2., len(precisionvec)-1)
        hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1)
        if(meanf1 > savemean):
            savemean = meanf1
            savemodel = "Model "+str(nummodel)
        for i in range(0, nClasses):
            meanclass[i] = np.mean(recallclassvec[:, i])
            seclass[i] = st.sem(recallclassvec[:, i])
            sef1class[i] = st.sem(f1classvec[:, i])
            meanf1class[i] = np.mean(f1classvec[:, i])
            meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
            seprecisionclass[i] = st.sem(precisionclassvec[:, i])
            hclass[i] = seclass[i] * \
                t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
            hprecisionclass[i] = seprecisionclass[i] * \
                t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1)
            hf1class[i] = sef1class[i] * \
                t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)
        print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
              % (nummodel, modelstring,
                 mean, h, meanprecision, hprecision, meanf1, hf1))
        for i in range(nClasses):
            print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (uniqClasses[i], modelstring, meanclass[i], hclass[i],
                     meanprecisionclass[i], hprecisionclass[i],
                     meanf1class[i], hf1class[i]))
        print('')
    print('')
    print("########best linear SVM model##########")
    print(savemodel)
    print("")




[docs]
def crossvalidateSVR(data, labels,
                     C=-1, epsilon=-1,
                     doPCA=False,
                     n_components=-1, missing=False,
                     missing_strategy='median', random_state=1234,
                     n_folds=5, n_repetitions=10):
    r"""Cross-validate SVR model with linear kernel.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    C : float, optional (default = -1)
        SVM regularization parameter.
        If (C = -1), different values are tested.
    epsilon : float, optional (default = -1)
        SVM tolerance parameter.
        If (epsilon = -1), different values are tested.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    if C < 0.0:
        Cvec = np.power(2, np.arange(
            start=-5, stop=15, step=1, dtype=float))
    else:
        Cvec = [C]
    if epsilon < 0.0:
        EpsVec = [0, 0.01, 0.1, 0.5, 1, 2, 4]
    else:
        EpsVec = [epsilon]
    modelvec = ""
    savemean = 99999
    saveh = 0.0
    savemeanr2 = 0.0
    savehr2 = 0.0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%%"
              "of the variance = %s\n"
              % n_components)
    print("C:epsilon\tRMSE with CI\tR2 with CI\t")
    for C in Cvec:
        for eps in EpsVec:
            modelstring = str(C)+":"+str(eps)
            rmsevec = []
            r2vec = []
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(train, test,
                                                                 doPCA,
                                                                 n_components,
                                                                 missing,
                                                                 missing_strategy)
                    clf = SVR(kernel='linear', C=C, epsilon=eps)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                r2 = r2_score(y_true, y_pred)
                rmsevec = np.append(rmsevec, rmse)
                r2vec = np.append(r2vec, r2)
            mean, se = np.mean(rmsevec), st.sem(rmsevec)
            h = se * t._ppf((1+0.95)/2., len(rmsevec)-1)
            meanr2, ser2 = np.mean(r2vec), st.sem(r2vec)
            hr2 = ser2 * t._ppf((1+0.95)/2., len(r2vec)-1)
            if(mean < savemean):
                savemean = mean
                saveh = h
                modelvec = modelstring
                savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec)
                savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1)
            print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (modelstring, mean, h, meanr2, hr2))
    print('')
    print("########best linear SVM model##########")
    print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f"
          % (modelvec, savemean, saveh, savemeanr2, savehr2))
    print("")




[docs]
def crossvalidateSVCrbf(data, labels,  C=1, gamma=1, doPCA=False,
                        n_components=-1, missing=False,
                        missing_strategy='median',
                        random_state=1234, n_folds=5,
                        n_repetitions=10):
    r"""Cross-validate SVC model with RBF kernel.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Train set data matrix.
    labels : array of shape (n_individuals, 1)
        Labels for train set.
    C : float, optional (default = 1)
        SVM regularization parameter.
    gamma : float, optional (default = 1)
        RBF parameter.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str, optional (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int, optional (default = 1234)
        Random state.
    n_folds : int, optional (default = 5)
        Number of CV folds.
    n_repetitions : int, optional (default = 10)
        Number of CV iterations.
    """
    if C < 0.0:
        Cvec = np.power(2, np.arange(
            start=-5, stop=15, step=1, dtype=float))
    else:
        Cvec = [C]
    if gamma < 0.0:
        gvec = np.power(2.0, np.arange(
            start=-15, stop=3, step=1, dtype=float))
    else:
        gvec = [gamma]
#    modelvec = ""
    savemean = -9999.0
#    saveh = 0.0
    nummodel = 0
    if n_components == -1 and doPCA is True:
        pca = PCA(random_state=random_state)
        pca.fit(data)
        n_components = np.searchsorted(
            pca.explained_variance_ratio_.cumsum(), 0.8)+1
        print("Used number of components explaining 80%% "
              "of the variance = %s\n"
              % n_components)
    uniqClasses, labels = np.unique(labels, return_inverse=True)
    nClasses = len(uniqClasses)
    print("Classes: ", uniqClasses)
    print("nClasses: ", nClasses)
    print("")
    print("model\tparameters=C:gamma\trecall with CI\t"
          "precision with CI\tF1-score with CI")
    print("")
    for C in Cvec:
        for g in gvec:
            modelstring = str(C)+"-"+str(g)
            nummodel += 1
            recallvec = []
            precisionvec = []
            f1vec = []
            recallclassvec = np.array([])
            precisionclassvec = np.array([])
            f1classvec = np.array([])
            meanclass = np.zeros(nClasses)
            meanprecisionclass = np.zeros(nClasses)
            meanf1class = np.zeros(nClasses)
            seclass = np.zeros(nClasses)
            seprecisionclass = np.zeros(nClasses)
            sef1class = np.zeros(nClasses)
            hclass = np.zeros(nClasses)
            hprecisionclass = np.zeros(nClasses)
            hf1class = np.zeros(nClasses)
            for j in range(n_repetitions):
                ss = KFold(n_splits=n_folds, shuffle=True, random_state=j)
                y_true = []
                y_pred = []
                for train_index, test_index in ss.split(data):
                    train = np.copy(data[train_index])
                    test = np.copy(data[test_index])
                    processed = ugtm_preprocess.processTrainTest(train, test,
                                                                 doPCA,
                                                                 n_components,
                                                                 missing,
                                                                 missing_strategy)
                    clf = SVC(kernel='rbf', C=C, gamma=g)
                    clf.fit(processed.train, labels[train_index])
                    y_pred = np.append(y_pred, clf.predict(processed.test))
                    y_true = np.append(y_true, labels[test_index])
                recall = recall_score(y_true, y_pred, average='weighted')
                precision = precision_score(
                    y_true, y_pred, average='weighted')
                f1 = f1_score(y_true, y_pred, average='weighted')
                recallvec = np.append(recallvec, recall)
                precisionvec = np.append(precisionvec, precision)
                f1vec = np.append(f1vec, f1)
                recallclass = recall_score(y_true, y_pred, average=None)
                precisionclass = precision_score(y_true, y_pred, average=None)
                f1class = f1_score(y_true, y_pred, average=None)
                if(j == 0):
                    recallclassvec = recallclass
                    precisionclassvec = precisionclass
                    f1classvec = f1class
                else:
                    recallclassvec = np.vstack([recallclassvec, recallclass])
                    precisionclassvec = np.vstack(
                        [precisionclassvec, precisionclass])
                    f1classvec = np.vstack([f1classvec, f1class])
            mean, se = np.mean(recallvec), st.sem(recallvec)
            meanprecision, seprecision = np.mean(
                precisionvec), st.sem(precisionvec)
            meanf1, sef1 = np.mean(f1vec), st.sem(f1vec)
            h = se * t._ppf((1+0.95)/2., len(recallvec)-1)
            hprecision = seprecision * \
                t._ppf((1+0.95)/2., len(precisionvec)-1)
            hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1)
            if(meanf1 > savemean):
                savemean = meanf1
#                saveh = hf1
#                modelvec = modelstring
                savemodel = "Model "+str(nummodel)
            for i in range(0, nClasses):
                meanclass[i], seclass[i] = np.mean(recallclassvec[:, i]), \
                    st.sem(recallclassvec[:, i])
                meanf1class[i], sef1class[i] = np.mean(f1classvec[:, i]), \
                    st.sem(f1classvec[:, i])
                meanprecisionclass[i] = np.mean(precisionclassvec[:, i])
                seprecisionclass[i] = st.sem(precisionclassvec[:, i])
                hclass[i] = seclass[i] * \
                    t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1)
                hprecisionclass[i] = seprecisionclass[i] * \
                    t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1)
                hf1class[i] = sef1class[i] * \
                    t._ppf((1+0.95)/2., len(f1classvec[:, i])-1)
            print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                  % (nummodel, modelstring, mean, h,
                     meanprecision, hprecision, meanf1, hf1))
            for i in range(nClasses):
                print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f"
                      % (uniqClasses[i], modelstring, meanclass[i],
                         hclass[i], meanprecisionclass[i],
                         hprecisionclass[i], meanf1class[i],
                         hf1class[i]))
            print("")
    print("")

    print("########best RBF SVM model##########")
    print(savemodel)
    print("")




[docs]
def whichExperiment(data, labels, args, discrete=False):
    if discrete is True and args.model == 'GTM':
        decide = 'crossvalidateGTC'
    elif discrete is False and args.model == 'GTM':
        decide = 'crossvalidateGTR'
    elif discrete is True and args.model == 'SVM':
        decide = 'crossvalidateSVC'
    elif discrete is False and args.model == 'SVM':
        decide = 'crossvalidateSVR'
    elif discrete is True and args.model == 'SVMrbf':
        decide = 'crossvalidateSVCrbf'
    elif discrete is True and args.model == 'PCA':
        decide = 'crossvalidatePCAC'
    elif discrete is False and args.model == 'PCA':
        decide = 'crossvalidatePCAR'
    elif discrete is True and args.model == 'compare':
        decide = 'comparecrossvalidateC'
    elif discrete is False and args.model == 'compare':
        decide = 'comparecrossvalidateR'
    else:
        decide = ''
        exit
    if decide == 'crossvalidateGTC':
        crossvalidateGTC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         n_neighbors=args.n_neighbors, representation=args.representation,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, k=args.grid_size,
                         m=args.rbf_grid_size, predict_mode=args.predict_mode,
                         prior=args.prior, regul=args.regularization,
                         s=args.rbf_width_factor)
    elif decide == 'crossvalidateGTR':
        crossvalidateGTR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         n_neighbors=args.n_neighbors, representation=args.representation,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size,
                         regul=args.regularization, s=args.rbf_width_factor)
    elif decide == 'crossvalidateSVC':
        crossvalidateSVC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, C=args.svm_margin)
    elif decide == 'crossvalidateSVCrbf':
        crossvalidateSVCrbf(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                            missing=args.missing, missing_strategy=args.missing_strategy,
                            random_state=args.random_state, C=args.svm_margin, gamma=args.svm_gamma)
    elif decide == 'crossvalidateSVR':
        crossvalidateSVR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, C=args.svm_margin, epsilon=args.svm_epsilon)
    elif decide == 'crossvalidatePCAC':
        crossvalidatePCAC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                          missing=args.missing, missing_strategy=args.missing_strategy,
                          random_state=args.random_state, n_neighbors=args.n_neighbors)
    elif decide == 'crossvalidatePCAR':
        crossvalidatePCAR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                          missing=args.missing, missing_strategy=args.missing_strategy,
                          random_state=args.random_state, n_neighbors=args.n_neighbors)
    elif decide == 'comparecrossvalidateC':
        crossvalidateSVC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, C=args.svm_margin)
        crossvalidateGTC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         representation=args.representation, missing=args.missing,
                         missing_strategy=args.missing_strategy, random_state=args.random_state,
                         k=args.grid_size, m=args.rbf_grid_size, predict_mode=args.predict_mode,
                         prior=args.prior, regul=args.regularization, s=args.rbf_width_factor)
    elif decide == 'comparecrossvalidateR':
        crossvalidateSVR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state,
                         C=args.svm_margin, epsilon=args.svm_epsilon)
        crossvalidateGTR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components,
                         n_neighbors=args.n_neighbors, representation=args.representation,
                         missing=args.missing, missing_strategy=args.missing_strategy,
                         random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size,
                         regul=args.regularization, s=args.rbf_width_factor)
    else:
        print("Could not determine which experiment to conduct.")