Source code for ugtm.ugtm_crossvalidate

"""Cross-validation support for GTC and GTR models (also SVM and PCA).
"""
# Authors: Helena A. Gaspar <hagax8@gmail.com>
# License: MIT

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.svm import SVR
import scipy.stats as st
from scipy.stats import t
from . import ugtm_predictions
from . import ugtm_preprocess


[docs] def crossvalidateGTC(data, labels, k=16, m=4, s=-1.0, regul=1.0, n_neighbors=1, niter=200, representation="modes", doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, predict_mode="bayes", prior="estimated", n_folds=5, n_repetitions=10): r"""Cross-validate GTC model. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. k : int, optional (default = 16) If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2. k is the sqrt of the number of GTM nodes. One of four GTM hyperparameters (k, m, s, regul). Ex: k = 25 means the GTM will be discretized into a 25x25 grid. m : int, optional (default = 4) If m is set to 0, m is computed as sqrt(k). (generally good rule of thumb). m is the qrt of the number of RBF centers. One of four GTM hyperparameters (k, m, s, regul). Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid. s : float, optional (default = -1) RBF width factor. Default (-1) is to try different values. Parameter to tune width of RBF functions. Impacts manifold flexibility. regul : float, optional (default = -1) Regularization coefficient. Default (-1) is to try different values. Impacts manifold flexibility. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). At the moment, n_neighbors for GTC is always equal to 1. niter : int, optional (default = 200) Number of iterations for EM algorithm. representation : {"modes", "means"} 2D GTM representation for the test set, used for kNN algorithms: "modes" for position with max. responsibility, "means" for average position (usual GTM representation) doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. predict_mode : {"bayes", "knn"}, optional Choose between nearest node algorithm ("knn", output of :func:`~ugtm.ugtm_predictions.predictNN`) or GTM Bayes classifier ("bayes", output of :func:`~ugtm.ugtm_predictions.predictBayes`). NB: the kNN algorithm is limited to only 1 nearest node at the moment (n_neighbors = 1). prior : {"estimated", "equiprobable"}, optional Type of prior used to build GTM class map (:func:`~ugtm.ugtm_landscape.classMap`). Choose "estimated" to account for class imbalance. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ print("") print("k = sqrt(grid size), m = sqrt(radial basis function grid size), " "regul = regularization, s = RBF width factor") print("") uniqClasses, labels = np.unique(labels, return_inverse=True) nClasses = len(uniqClasses) print("Classes: ", uniqClasses) print("nClasses: %s" % (nClasses)) print("") print("model\tparameters=k:m:s:regul\t" "recall with CI\tprecision with CI\tF1-score with CI") print("") if k == 0: k = int(np.sqrt(5*np.sqrt(data.shape[0])))+2 if m == 0: m = int(np.sqrt(k)) if n_components == -1 and doPCA: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%% of " "the variance in whole data set = %s\n" % n_components) if regul < 0.0: lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100] else: lvec = [regul] if s <= 0.0: svec = [0.25, 0.5, 1.0, 1.50, 2.0] else: svec = [s] savemean = -9999 nummodel = 0 savemodel = "" for s in svec: for regul in lvec: modelstring = str(k)+':'+str(m)+":"+str(s)+":"+str(regul) nummodel += 1 recallvec = [] precisionvec = [] f1vec = [] recallclassvec = np.array([]) precisionclassvec = np.array([]) f1classvec = np.array([]) meanclass = np.zeros(nClasses) meanprecisionclass = np.zeros(nClasses) meanf1class = np.zeros(nClasses) seclass = np.zeros(nClasses) seprecisionclass = np.zeros(nClasses) sef1class = np.zeros(nClasses) hclass = np.zeros(nClasses) hprecisionclass = np.zeros(nClasses) hf1class = np.zeros(nClasses) for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) prediction = ugtm_predictions.GTC(train=train, labels=labels[train_index], test=test, k=k, m=m, s=s, regul=regul, n_neighbors=n_neighbors, niter=niter, representation=representation, doPCA=doPCA, n_components=n_components, random_state=random_state, missing=missing, missing_strategy=missing_strategy, predict_mode=predict_mode, prior=prior) y_true = np.append(y_true, labels[test_index]) y_pred = np.append(y_pred, prediction) recall = recall_score(y_true, y_pred, average='weighted') precision = precision_score( y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') recallvec = np.append(recallvec, recall) precisionvec = np.append(precisionvec, precision) f1vec = np.append(f1vec, f1) recallclass = recall_score(y_true, y_pred, average=None) precisionclass = precision_score(y_true, y_pred, average=None) f1class = f1_score(y_true, y_pred, average=None) if(j == 0): recallclassvec = recallclass precisionclassvec = precisionclass f1classvec = f1class else: recallclassvec = np.vstack([recallclassvec, recallclass]) precisionclassvec = np.vstack( [precisionclassvec, precisionclass]) f1classvec = np.vstack([f1classvec, f1class]) mean, se = np.mean(recallvec), st.sem(recallvec) meanprecision, seprecision = np.mean( precisionvec), st.sem(precisionvec) meanf1, sef1 = np.mean(f1vec), st.sem(f1vec) h = se * t._ppf((1+0.95)/2., len(recallvec)-1) hprecision = seprecision * \ t._ppf((1+0.95)/2., len(precisionvec)-1) hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1) if(meanf1 > savemean): savemean = meanf1 savemodel = "Model "+str(nummodel) for i in range(0, nClasses): meanclass[i] = np.mean(recallclassvec[:, i]) seclass[i] = st.sem(recallclassvec[:, i]) meanf1class[i] = np.mean(f1classvec[:, i]) sef1class[i] = st.sem(f1classvec[:, i]) meanprecisionclass[i] = np.mean(precisionclassvec[:, i]) seprecisionclass[i] = st.sem(precisionclassvec[:, i]) hclass[i] = seclass[i] * \ t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1) hprecisionclass[i] = seprecisionclass[i] \ * t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1) hf1class[i] = sef1class[i] * \ t._ppf((1+0.95)/2., len(f1classvec[:, i])-1) print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (nummodel, modelstring, mean, h, meanprecision, hprecision, meanf1, hf1)) for i in range(nClasses): print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (uniqClasses[i], modelstring, meanclass[i], hclass[i], meanprecisionclass[i], hprecisionclass[i], meanf1class[i], hf1class[i])) print('') print('') print("########best GTC model##########") print(savemodel) print("")
[docs] def crossvalidateGTR(data, labels, k=16, m=4, s=-1, regul=-1, n_neighbors=1, niter=200, representation="modes", doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate GTR model. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. k : int, optional (default = 16) If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2. k is the sqrt of the number of GTM nodes. One of four GTM hyperparameters (k, m, s, regul). Ex: k = 25 means the GTM will be discretized into a 25x25 grid. m : int, optional (default = 4) If m is set to 0, m is computed as sqrt(k). (generally good rule of thumb). m is the qrt of the number of RBF centers. One of four GTM hyperparameters (k, m, s, regul). Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid. s : float, optional (default = -1) RBF width factor. Default (-1) is to try different values. Parameter to tune width of RBF functions. Impacts manifold flexibility. regul : float, optional (default = -1) Regularization coefficient. Default (-1) is to try different values. Impacts manifold flexibility. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). niter : int, optional (default = 200) Number of iterations for EM algorithm. representation : {"modes", "means"} 2D GTM representation for the test set, used for kNN algorithms: "modes" for position with max. responsibility, "means" for average position (usual GTM representation) doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ print("") print("k = sqrt(grid size), m = sqrt(radial basis function grid size), " "regul = regularization, s = RBF width factor") print("") if k == 0: k = int(np.sqrt(5*np.sqrt(data.shape[0])))+2 if m == 0: m = int(np.sqrt(k)) if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%% of the variance = %s\n" % n_components) if regul < 0.0: lvec = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100] else: lvec = [regul] if s <= 0.0: svec = [0.25, 0.5, 1.0, 1.50, 2.0] else: svec = [s] savemean = 999999999 saveh = 0.0 modelvec = "" savemeanr2 = 0.0 savehr2 = 0.0 print("k:m:s:regul\tRMSE with CI\tR2 with CI\t") for s in svec: for regul in lvec: modelstring = str(s)+":"+str(regul) rmsevec = [] r2vec = [] for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) prediction = ugtm_predictions.GTR(train=train, labels=labels[train_index], test=test, k=k, m=m, s=s, regul=regul, n_neighbors=n_neighbors, niter=niter, representation=representation, doPCA=doPCA, n_components=n_components, random_state=random_state, missing=missing, missing_strategy=missing_strategy) y_pred = np.append(y_pred, prediction) y_true = np.append(y_true, labels[test_index]) rmse = np.sqrt(mean_squared_error(y_true, y_pred)) r2 = r2_score(y_true, y_pred) rmsevec = np.append(rmsevec, rmse) r2vec = np.append(r2vec, r2) mean, se = np.mean(rmsevec), st.sem(rmsevec) h = se * t._ppf((1.0+0.95)/2., len(rmsevec)-1) meanr2, ser2 = np.mean(r2vec), st.sem(r2vec) hr2 = ser2 * t._ppf((1.0+0.95)/2., len(r2vec)-1) if(mean < savemean): savemean = mean saveh = h modelvec = modelstring savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec) savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1) print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (str(k)+':'+str(m)+':'+modelstring, mean, h, meanr2, hr2)) print('') print("########best GTR model##########") print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (str(k)+':'+str(m)+':'+modelvec, savemean, saveh, savemeanr2, savehr2)) print("")
[docs] def crossvalidatePCAC(data, labels, n_neighbors=1, maxneighbours=11, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate PCA kNN classification model. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). max_neighbors : int, optional (default = 11) The function crossvalidates kNN models with k between n_neighbors and max_neighbors. doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components " "explaining 80%% of the variance = %s\n" % n_components) uniqClasses, labels = np.unique(labels, return_inverse=True) nClasses = len(uniqClasses) print("Classes: ", uniqClasses) print("nClasses: ", nClasses) print("") print("model\tparameters=k_for_kNN\trecall with CI\t" "precision with CI\tF1-score with CI") print("") if n_neighbors <= 0: Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32) else: Kvec = [n_neighbors] savemean = -9999 nummodel = 0 savemodel = "" for c in Kvec: nummodel += 1 modelstring = str(c) recallvec = [] precisionvec = [] f1vec = [] recallclassvec = np.array([]) precisionclassvec = np.array([]) f1classvec = np.array([]) meanclass = np.zeros(nClasses) meanprecisionclass = np.zeros(nClasses) meanf1class = np.zeros(nClasses) seclass = np.zeros(nClasses) seprecisionclass = np.zeros(nClasses) sef1class = np.zeros(nClasses) hclass = np.zeros(nClasses) hprecisionclass = np.zeros(nClasses) hf1class = np.zeros(nClasses) for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) y_pred = np.append(y_pred, ugtm_predictions.predictNNSimple( processed.train, processed.test, labels[train_index], c, "classification")) y_true = np.append(y_true, labels[test_index]) recall = recall_score(y_true, y_pred, average='weighted') precision = precision_score(y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') recallvec = np.append(recallvec, recall) precisionvec = np.append(precisionvec, precision) f1vec = np.append(f1vec, f1) recallclass = recall_score(y_true, y_pred, average=None) precisionclass = precision_score(y_true, y_pred, average=None) f1class = f1_score(y_true, y_pred, average=None) if(j == 0): recallclassvec = recallclass precisionclassvec = precisionclass f1classvec = f1class else: recallclassvec = np.vstack([recallclassvec, recallclass]) precisionclassvec = np.vstack( [precisionclassvec, precisionclass]) f1classvec = np.vstack([f1classvec, f1class]) mean, se = np.mean(recallvec), st.sem(recallvec) meanprecision, seprecision = np.mean( precisionvec), st.sem(precisionvec) meanf1, sef1 = np.mean(f1vec), st.sem(f1vec) h = se * t._ppf((1+0.95)/2., len(recallvec)-1) hprecision = seprecision * t._ppf((1+0.95)/2., len(precisionvec)-1) hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1) if(meanf1 > savemean): savemean = meanf1 savemodel = "Model "+str(nummodel) for i in range(0, nClasses): meanclass[i] = np.mean(recallclassvec[:, i]) seclass[i] = st.sem(recallclassvec[:, i]) meanf1class[i] = np.mean(f1classvec[:, i]) sef1class[i] = st.sem(f1classvec[:, i]) meanprecisionclass[i] = np.mean(precisionclassvec[:, i]) seprecisionclass[i] = st.sem(precisionclassvec[:, i]) hclass[i] = seclass[i] * \ t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1) hprecisionclass[i] = seprecisionclass[i] \ * t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1) hf1class[i] = sef1class[i] * \ t._ppf((1+0.95)/2., len(f1classvec[:, i])-1) print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (nummodel, modelstring, mean, h, meanprecision, hprecision, meanf1, hf1)) for i in range(nClasses): print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (uniqClasses[i], modelstring, meanclass[i], hclass[i], meanprecisionclass[i], hprecisionclass[i], meanf1class[i], hf1class[i])) print('') print('') print("########best nearest neighbors model##########") print(savemodel) print("")
[docs] def crossvalidatePCAR(data, labels, n_neighbors=1, maxneighbours=11, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate PCA kNN regression model. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). max_neighbors : int, optional (default = 11) The function crossvalidates kNN models with k between n_neighbors and max_neighbors. doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%% of the variance = %s\n" % n_components) print("") uniqClasses, labels = np.unique(labels, return_inverse=True) if n_neighbors <= 0: Kvec = np.arange(start=1, stop=maxneighbours, step=1, dtype=np.int32) else: Kvec = [n_neighbors] modelvec = "" savemean = 99999 saveh = 0.0 savemeanr2 = 0.0 savehr2 = 0.0 nummodel = 0 print("k = number of nearest neighbours\tRMSE with CI\tR2 with CI\t") for c in Kvec: nummodel += 1 modelstring = str(c) rmsevec = [] r2vec = [] for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) y_pred = np.append(y_pred, ugtm_predictions.predictNNSimple( processed.train, processed.test, labels[train_index], c, "regression")) y_true = np.append(y_true, labels[test_index]) rmse = np.sqrt(mean_squared_error(y_true, y_pred)) r2 = r2_score(y_true, y_pred) rmsevec = np.append(rmsevec, rmse) r2vec = np.append(r2vec, r2) mean, se = np.mean(rmsevec), st.sem(rmsevec) h = se * t._ppf((1+0.95)/2., len(rmsevec)-1) meanr2, ser2 = np.mean(r2vec), st.sem(r2vec) hr2 = ser2 * t._ppf((1+0.95)/2., len(r2vec)-1) if(mean < savemean): savemean = mean saveh = h modelvec = modelstring savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec) savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1) print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (modelstring, mean, h, meanr2, hr2)) print('') print("########best nearest neighbors model##########") print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (modelvec, savemean, saveh, savemeanr2, savehr2)) print("")
[docs] def crossvalidateSVC(data, labels, C=1.0, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate SVC model. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. C : float, optional (default = 1.0) SVM regularization parameter. doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%% of the variance = %s\n" % n_components) uniqClasses, labels = np.unique(labels, return_inverse=True) nClasses = len(uniqClasses) print("Classes: ", uniqClasses) print("nClasses: ", nClasses) print("") print("model\tparameters=C\trecall with CI\t" "precision with CI\tF1-score with CI") print("") if C < 0.0: Cvec = np.power(2, np.arange( start=-5, stop=15, step=1, dtype=float)) else: Cvec = [C] savemean = -9999 nummodel = 0 savemodel = "" for C in Cvec: modelstring = str(C) nummodel += 1 recallvec = [] precisionvec = [] f1vec = [] recallclassvec = np.array([]) precisionclassvec = np.array([]) f1classvec = np.array([]) meanclass = np.zeros(nClasses) meanprecisionclass = np.zeros(nClasses) meanf1class = np.zeros(nClasses) seclass = np.zeros(nClasses) seprecisionclass = np.zeros(nClasses) sef1class = np.zeros(nClasses) hclass = np.zeros(nClasses) hprecisionclass = np.zeros(nClasses) hf1class = np.zeros(nClasses) for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) clf = SVC(kernel='linear', C=C) clf.fit(processed.train, labels[train_index]) y_pred = np.append(y_pred, clf.predict(processed.test)) y_true = np.append(y_true, labels[test_index]) recall = recall_score(y_true, y_pred, average='weighted') precision = precision_score(y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') recallvec = np.append(recallvec, recall) precisionvec = np.append(precisionvec, precision) f1vec = np.append(f1vec, f1) recallclass = recall_score(y_true, y_pred, average=None) precisionclass = precision_score(y_true, y_pred, average=None) f1class = f1_score(y_true, y_pred, average=None) if(j == 0): recallclassvec = recallclass precisionclassvec = precisionclass f1classvec = f1class else: recallclassvec = np.vstack([recallclassvec, recallclass]) precisionclassvec = np.vstack( [precisionclassvec, precisionclass]) f1classvec = np.vstack([f1classvec, f1class]) mean, se = np.mean(recallvec), st.sem(recallvec) meanprecision, seprecision = np.mean( precisionvec), st.sem(precisionvec) meanf1, sef1 = np.mean(f1vec), st.sem(f1vec) h = se * t._ppf((1+0.95)/2., len(recallvec)-1) hprecision = seprecision * t._ppf((1+0.95)/2., len(precisionvec)-1) hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1) if(meanf1 > savemean): savemean = meanf1 savemodel = "Model "+str(nummodel) for i in range(0, nClasses): meanclass[i] = np.mean(recallclassvec[:, i]) seclass[i] = st.sem(recallclassvec[:, i]) sef1class[i] = st.sem(f1classvec[:, i]) meanf1class[i] = np.mean(f1classvec[:, i]) meanprecisionclass[i] = np.mean(precisionclassvec[:, i]) seprecisionclass[i] = st.sem(precisionclassvec[:, i]) hclass[i] = seclass[i] * \ t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1) hprecisionclass[i] = seprecisionclass[i] * \ t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1) hf1class[i] = sef1class[i] * \ t._ppf((1+0.95)/2., len(f1classvec[:, i])-1) print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (nummodel, modelstring, mean, h, meanprecision, hprecision, meanf1, hf1)) for i in range(nClasses): print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (uniqClasses[i], modelstring, meanclass[i], hclass[i], meanprecisionclass[i], hprecisionclass[i], meanf1class[i], hf1class[i])) print('') print('') print("########best linear SVM model##########") print(savemodel) print("")
[docs] def crossvalidateSVR(data, labels, C=-1, epsilon=-1, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate SVR model with linear kernel. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. C : float, optional (default = -1) SVM regularization parameter. If (C = -1), different values are tested. epsilon : float, optional (default = -1) SVM tolerance parameter. If (epsilon = -1), different values are tested. doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ if C < 0.0: Cvec = np.power(2, np.arange( start=-5, stop=15, step=1, dtype=float)) else: Cvec = [C] if epsilon < 0.0: EpsVec = [0, 0.01, 0.1, 0.5, 1, 2, 4] else: EpsVec = [epsilon] modelvec = "" savemean = 99999 saveh = 0.0 savemeanr2 = 0.0 savehr2 = 0.0 if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%%" "of the variance = %s\n" % n_components) print("C:epsilon\tRMSE with CI\tR2 with CI\t") for C in Cvec: for eps in EpsVec: modelstring = str(C)+":"+str(eps) rmsevec = [] r2vec = [] for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) clf = SVR(kernel='linear', C=C, epsilon=eps) clf.fit(processed.train, labels[train_index]) y_pred = np.append(y_pred, clf.predict(processed.test)) y_true = np.append(y_true, labels[test_index]) rmse = np.sqrt(mean_squared_error(y_true, y_pred)) r2 = r2_score(y_true, y_pred) rmsevec = np.append(rmsevec, rmse) r2vec = np.append(r2vec, r2) mean, se = np.mean(rmsevec), st.sem(rmsevec) h = se * t._ppf((1+0.95)/2., len(rmsevec)-1) meanr2, ser2 = np.mean(r2vec), st.sem(r2vec) hr2 = ser2 * t._ppf((1+0.95)/2., len(r2vec)-1) if(mean < savemean): savemean = mean saveh = h modelvec = modelstring savemeanr2, saveser2 = np.mean(r2vec), st.sem(r2vec) savehr2 = saveser2 * t._ppf((1+0.95)/2., len(r2vec)-1) print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (modelstring, mean, h, meanr2, hr2)) print('') print("########best linear SVM model##########") print("%s\t%.4f +/- %.4f\t%.4f +/- %.4f" % (modelvec, savemean, saveh, savemeanr2, savehr2)) print("")
[docs] def crossvalidateSVCrbf(data, labels, C=1, gamma=1, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, n_folds=5, n_repetitions=10): r"""Cross-validate SVC model with RBF kernel. Parameters ========== data : array of shape (n_individuals, n_dimensions) Train set data matrix. labels : array of shape (n_individuals, 1) Labels for train set. C : float, optional (default = 1) SVM regularization parameter. gamma : float, optional (default = 1) RBF parameter. doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. n_folds : int, optional (default = 5) Number of CV folds. n_repetitions : int, optional (default = 10) Number of CV iterations. """ if C < 0.0: Cvec = np.power(2, np.arange( start=-5, stop=15, step=1, dtype=float)) else: Cvec = [C] if gamma < 0.0: gvec = np.power(2.0, np.arange( start=-15, stop=3, step=1, dtype=float)) else: gvec = [gamma] # modelvec = "" savemean = -9999.0 # saveh = 0.0 nummodel = 0 if n_components == -1 and doPCA is True: pca = PCA(random_state=random_state) pca.fit(data) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used number of components explaining 80%% " "of the variance = %s\n" % n_components) uniqClasses, labels = np.unique(labels, return_inverse=True) nClasses = len(uniqClasses) print("Classes: ", uniqClasses) print("nClasses: ", nClasses) print("") print("model\tparameters=C:gamma\trecall with CI\t" "precision with CI\tF1-score with CI") print("") for C in Cvec: for g in gvec: modelstring = str(C)+"-"+str(g) nummodel += 1 recallvec = [] precisionvec = [] f1vec = [] recallclassvec = np.array([]) precisionclassvec = np.array([]) f1classvec = np.array([]) meanclass = np.zeros(nClasses) meanprecisionclass = np.zeros(nClasses) meanf1class = np.zeros(nClasses) seclass = np.zeros(nClasses) seprecisionclass = np.zeros(nClasses) sef1class = np.zeros(nClasses) hclass = np.zeros(nClasses) hprecisionclass = np.zeros(nClasses) hf1class = np.zeros(nClasses) for j in range(n_repetitions): ss = KFold(n_splits=n_folds, shuffle=True, random_state=j) y_true = [] y_pred = [] for train_index, test_index in ss.split(data): train = np.copy(data[train_index]) test = np.copy(data[test_index]) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) clf = SVC(kernel='rbf', C=C, gamma=g) clf.fit(processed.train, labels[train_index]) y_pred = np.append(y_pred, clf.predict(processed.test)) y_true = np.append(y_true, labels[test_index]) recall = recall_score(y_true, y_pred, average='weighted') precision = precision_score( y_true, y_pred, average='weighted') f1 = f1_score(y_true, y_pred, average='weighted') recallvec = np.append(recallvec, recall) precisionvec = np.append(precisionvec, precision) f1vec = np.append(f1vec, f1) recallclass = recall_score(y_true, y_pred, average=None) precisionclass = precision_score(y_true, y_pred, average=None) f1class = f1_score(y_true, y_pred, average=None) if(j == 0): recallclassvec = recallclass precisionclassvec = precisionclass f1classvec = f1class else: recallclassvec = np.vstack([recallclassvec, recallclass]) precisionclassvec = np.vstack( [precisionclassvec, precisionclass]) f1classvec = np.vstack([f1classvec, f1class]) mean, se = np.mean(recallvec), st.sem(recallvec) meanprecision, seprecision = np.mean( precisionvec), st.sem(precisionvec) meanf1, sef1 = np.mean(f1vec), st.sem(f1vec) h = se * t._ppf((1+0.95)/2., len(recallvec)-1) hprecision = seprecision * \ t._ppf((1+0.95)/2., len(precisionvec)-1) hf1 = sef1 * t._ppf((1+0.95)/2., len(f1vec)-1) if(meanf1 > savemean): savemean = meanf1 # saveh = hf1 # modelvec = modelstring savemodel = "Model "+str(nummodel) for i in range(0, nClasses): meanclass[i], seclass[i] = np.mean(recallclassvec[:, i]), \ st.sem(recallclassvec[:, i]) meanf1class[i], sef1class[i] = np.mean(f1classvec[:, i]), \ st.sem(f1classvec[:, i]) meanprecisionclass[i] = np.mean(precisionclassvec[:, i]) seprecisionclass[i] = st.sem(precisionclassvec[:, i]) hclass[i] = seclass[i] * \ t._ppf((1+0.95)/2., len(recallclassvec[:, i])-1) hprecisionclass[i] = seprecisionclass[i] * \ t._ppf((1+0.95)/2., len(precisionclassvec[:, i])-1) hf1class[i] = sef1class[i] * \ t._ppf((1+0.95)/2., len(f1classvec[:, i])-1) print("Model %s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (nummodel, modelstring, mean, h, meanprecision, hprecision, meanf1, hf1)) for i in range(nClasses): print("Class=%s\t%s\t%.4f +/- %.4f\t%.4f +/- %.4f\t%.4f +/- %.4f" % (uniqClasses[i], modelstring, meanclass[i], hclass[i], meanprecisionclass[i], hprecisionclass[i], meanf1class[i], hf1class[i])) print("") print("") print("########best RBF SVM model##########") print(savemodel) print("")
[docs] def whichExperiment(data, labels, args, discrete=False): if discrete is True and args.model == 'GTM': decide = 'crossvalidateGTC' elif discrete is False and args.model == 'GTM': decide = 'crossvalidateGTR' elif discrete is True and args.model == 'SVM': decide = 'crossvalidateSVC' elif discrete is False and args.model == 'SVM': decide = 'crossvalidateSVR' elif discrete is True and args.model == 'SVMrbf': decide = 'crossvalidateSVCrbf' elif discrete is True and args.model == 'PCA': decide = 'crossvalidatePCAC' elif discrete is False and args.model == 'PCA': decide = 'crossvalidatePCAR' elif discrete is True and args.model == 'compare': decide = 'comparecrossvalidateC' elif discrete is False and args.model == 'compare': decide = 'comparecrossvalidateR' else: decide = '' exit if decide == 'crossvalidateGTC': crossvalidateGTC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, n_neighbors=args.n_neighbors, representation=args.representation, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size, predict_mode=args.predict_mode, prior=args.prior, regul=args.regularization, s=args.rbf_width_factor) elif decide == 'crossvalidateGTR': crossvalidateGTR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, n_neighbors=args.n_neighbors, representation=args.representation, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size, regul=args.regularization, s=args.rbf_width_factor) elif decide == 'crossvalidateSVC': crossvalidateSVC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, C=args.svm_margin) elif decide == 'crossvalidateSVCrbf': crossvalidateSVCrbf(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, C=args.svm_margin, gamma=args.svm_gamma) elif decide == 'crossvalidateSVR': crossvalidateSVR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, C=args.svm_margin, epsilon=args.svm_epsilon) elif decide == 'crossvalidatePCAC': crossvalidatePCAC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, n_neighbors=args.n_neighbors) elif decide == 'crossvalidatePCAR': crossvalidatePCAR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, n_neighbors=args.n_neighbors) elif decide == 'comparecrossvalidateC': crossvalidateSVC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, C=args.svm_margin) crossvalidateGTC(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, representation=args.representation, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size, predict_mode=args.predict_mode, prior=args.prior, regul=args.regularization, s=args.rbf_width_factor) elif decide == 'comparecrossvalidateR': crossvalidateSVR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, C=args.svm_margin, epsilon=args.svm_epsilon) crossvalidateGTR(data=data, labels=labels, doPCA=args.pca, n_components=args.n_components, n_neighbors=args.n_neighbors, representation=args.representation, missing=args.missing, missing_strategy=args.missing_strategy, random_state=args.random_state, k=args.grid_size, m=args.rbf_grid_size, regul=args.regularization, s=args.rbf_width_factor) else: print("Could not determine which experiment to conduct.")