Source code for ugtm.ugtm_predictions

"""GTC (GTM classification) and GTR (GTM regression)
"""
# Authors: Helena A. Gaspar <hagax8@gmail.com>
# License: MIT

from __future__ import print_function
import numpy as np
from sklearn import neighbors
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from .ugtm_gtm import initialize
from .ugtm_gtm import optimize
from .ugtm_gtm import projection
from . import ugtm_landscape
from . import ugtm_preprocess


[docs] def predictNN(optimizedModel, labels, new_data, modeltype="regression", n_neighbors=1, representation="modes", prior="estimated"): r"""GTM nearest node(s) classification or regression. Parameters ========== optimizedModel : instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` Optimized GTM model built using a training set of shape (n_individuals, n_dimensions) labels : array of shape (n_individuals, 1) Labels (discrete or continuous) associated with training set new_data : array of shape (n_test, n_dimensions) New data matrix (test set). modeltype : {'classification', 'regression'}, optional Choice between classification and regression. n_neighbors : int, optional (default = 1) Number of nodes to take into account in kNN algorithm. NB: for classification, n_neighbors is always equal to 1. representation : {'modes', 'means'}, optional Defines GTM representation type: mean or mode of responsibilities. prior : {'estimated', 'equiprobable'}, optional Only used for classification. Sets priors (Bayes' theorem) in :func:`~ugtm.ugtm_landscape.classMap`. Returns ======= array of shape (n_test, 1) Predicted outcome. Notes ===== This function implements classification or regression based on nearest GTM nodes: 1. If (modeltype == 'classification'), generate GTM class map (:func:`~ugtm.ugtm_landscape.classMap`); if (modeltype == 'regression'), generate GTM landscape (:func:`~ugtm.ugtm_landscape.landscape`) 2. Project new data (:func:`~ugtm.ugtm_gtm.projection`) on optimized GTM model (:class:`~ugtm.ugtm_classes.OptimizedGTM`) 3. Depending on provided parameters, choose means or modes as GTM coordinates for the new data 4. Find the nodes closest to the new data GTM coordinates (sklearn function kneighbors) 5. Retrieve predicted outcomes corresponding to nodes on class map (classification task) or landscape (regression task) 6. If (modeltype == 'classification'), the predicted outcome is the outcome of the nearest node on the class map; if (modeltype == 'regression'), the predicted outcome is the average outcome of the k nearest nodes (k = n_neighbors), weighted by inverse squared distances (weights=1/((dist)**2)) """ if modeltype == 'regression': activityModel = ugtm_landscape.landscape(optimizedModel, labels) elif modeltype == 'classification': n_neighbors = 1 activityModel = ugtm_landscape.classMap(optimizedModel, labels, prior).activityModel projected = projection(optimizedModel, new_data) neighborModel = NearestNeighbors( n_neighbors=n_neighbors, metric='euclidean') fitted = neighborModel.fit(optimizedModel.matX) if representation == 'means': rep = projected.matMeans elif representation == 'modes': rep = projected.matModes if modeltype == 'regression' and n_neighbors > 1: dist, nnID = fitted.kneighbors(rep, return_distance=True) dist[dist <= 0] = np.finfo(float).tiny predicted = np.average( activityModel[nnID], axis=1, weights=1/((dist)**2)) else: nnID = fitted.kneighbors(rep, return_distance=False) predicted = activityModel[nnID] return np.squeeze(predicted).astype(int)
[docs] def predictNNSimple(train, test, labels, n_neighbors=1, modeltype='regression'): r"""Nearest neighbor(s) classification or regression. Parameters ========== train : array of shape (n_train, n_dimensions) Train set data matrix. test : array of shape (n_test, n_dimensions) Test set data matrix. labels : array of shape (n_train, 1) Labels (discrete or continuous) for the training set. n_neighbors : int, optional (default = 1) Number of nodes to take into account in kNN algorithm. modeltype : {'classification', 'regression'}, optional Choice between classification and regression. Returns ======= array of shape (n_test, 1) Predicted outcome. Notes ===== This function implements classification or regression based on classical kNN algorithm. """ if modeltype == 'regression' and n_neighbors > 1: neighborModel = NearestNeighbors( n_neighbors=n_neighbors, metric='euclidean') fitted = neighborModel.fit(train) dist, nnID = fitted.kneighbors(test, return_distance=True) dist[dist <= 0] = np.finfo(float).tiny predicted = np.average(labels[nnID], axis=1, weights=1/((dist)**2)) else: clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance') clf.fit(train, labels) predicted = clf.predict(test) return np.squeeze(predicted).astype(int)
[docs] def predictBayes(optimizedModel, labels, new_data, prior="estimated"): r""" Bayesian GTM classifier (GTC Bayes). Parameters ========== optimizedModel : instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` Optimized GTM model built using a training set of shape (n_individuals, n_dimensions) labels : array of shape (n_individuals, 1) Labels (discrete or continuous) associated with training set new_data : array of shape (n_test, n_dimensions) New data matrix (test set). prior : {'estimated', 'equiprobable'}, optional Only used for classification. Sets priors (Bayes' theorem) in :func:`~ugtm.ugtm_landscape.classMap`. Returns ======= array of shape (n_test, 1) Predicted outcome. Notes ===== This function computes GTM class predictions by using posterior probabilities of classes weighted by responsibilities. Similar to maximum a posterior (MAP) estimator. 1. generate GTM class map (:func:`~ugtm.ugtm_landscape.classMap`); 2. Project new data (:func:`~ugtm.ugtm_gtm.projection`) on optimized GTM model (:class:`~ugtm.ugtm_classes.OptimizedGTM`) 3. Projected data responsibilities R are used as weights to find outcome :math:`C_{max}` for each tested instance: :math:`C_{max} = \operatorname*{arg\,max}_C \sum_k{R_{ki} P(C|k)}` """ activityModel = ugtm_landscape.classMap(optimizedModel, labels, prior).nodeClassP projected = projection(optimizedModel, new_data).matR predicted = np.argmax(np.dot(projected, activityModel), axis=1) return predicted
[docs] def advancedGTC(train, labels, test, n_neighbors=1, representation="modes", niter=200, k=16, m=4, regul=0.1, s=0.3, doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, predict_mode="bayes", prior="estimated"): r"""Run GTC (GTM classification): advanced Bayes Parameters ========== train : array of shape (n_train, n_dimensions) Train set data matrix. labels : array of shape (n_train, 1) Labels for train set. test : array of shape (n_test, n_dimensions) Test set data matrix. k : int, optional (default = 16) If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2. k is the sqrt of the number of GTM nodes. One of four GTM hyperparameters (k, m, s, regul). Ex: k = 25 means the GTM will be discretized into a 25x25 grid. m : int, optional (default = 4) If m is set to 0, m is computed as sqrt(k). (generally good rule of thumb). m is the qrt of the number of RBF centers. One of four GTM hyperparameters (k, m, s, regul). Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid. s : float, optional (default = 0.3) RBF width factor. One of four GTM hyperparameters (k, m, s, regul). Parameter to tune width of RBF functions. Impacts manifold flexibility. regul : float, optional (default = 0.1) One of four GTM hyperparameters (k, m, s, regul). Regularization coefficient. Impacts manifold flexibility. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). At the moment, n_neighbors is always equal to 1. niter : int, optional (default = 200) Number of iterations for EM algorithm. representation : {"modes", "means"} 2D GTM representation for the test set, used for kNN algorithms: "modes" for position with max. responsibility, "means" for average position (usual GTM representation) doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. predict_mode : {"bayes"}, optional At the moment, only the GTM Bayes classifier is available; ("bayes", output of :func:`~ugtm.ugtm_predictions.advancedPredictBayes`). prior : {"estimated", "equiprobable"}, optional Type of prior used to build GTM class map (:func:`~ugtm.ugtm_landscape.classMap`). Choose "estimated" to account for class imbalance. Returns ======= a dict The output is a dictionary defined as follows: 1. output["optimizedModel"]: original training set GTM model, instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` 2. output["indiv_projections"]: test set GTM model, instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` 3. output["indiv_probabilities"]: class probabilities for each individual (= dot product between test responsibility matrix and posterior class probabilities) 4. output["indiv_predictions"]: class prediction for each individual (argmax of output["indiv_probabilities"]) 5. output["group_projections"]: average responsibility vector for the entire test set 6. output["group_probabilities"]: posterior class probabilities for the entire test set (dot product between output["group_projections"] and posterior class probabilities) 7. output["uniqClasses"]: classes Notes ===== The GTM nearest node classifier (predict_mode = "knn", :func:`~ugtm.ugtm_predictions.predictNN`): 1. A GTM class map (GTM colored by class) is built using the training set (:func:`~ugtm.ugtm_landscape.classMap`); the class map is discretized into nodes, and each node has a class label 2. The test set is projected onto the GTM map 3. A 2D GTM representation is chosen for the test set (representation = modes or means) 4. Nearest node on the GTM map is found for each test set individual 5. The predicted label for each individual is the label of its nearest node on the GTM map The GTM Bayes classifier (predict_mode = "bayes", :func:`~ugtm.ugtm_predictions.predictBayes`): 1. A GTM class map (GTM colored by class) is built using the training set (:func:`~ugtm.ugtm_landscape.classMap`); the class map is discretized into nodes, and each node has posterior class probabilities 2. The test set is projected onto the GTM map 3. The GTM representation for each individual is its responsibility vector (posterior probability distribution on the map) 4. The probabilities of belonging to each class for a specific individual are computed as an average of posterior class probabilities (array of shape (n_nodes_n,classes)), weighted by the individual's responsibilities on the GTM map (array of shape (1, n_nodes)) """ if k == 0: k = int(np.sqrt(5*np.sqrt(train.shape[0])))+2 if m == 0: m = int(np.sqrt(k)) if n_components == -1 and doPCA: pca = PCA(random_state=random_state) pca.fit(train) n_components = np.searchsorted( pca.explained_variance_ratio_.cumsum(), 0.8)+1 print("Used n_components explaining 80%% of the variance = %s\n" % n_components) if regul < 0.0: regul = 0.1 if s <= 0.0: s = 0.3 processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) initialModel = initialize(processed.train, k, m, s, random_state=random_state) optimizedModel = optimize(processed.train, initialModel, regul, niter, 0) prediction = advancedPredictBayes( optimizedModel, labels, processed.test, prior) return prediction
[docs] def advancedPredictBayes(optimizedModel, labels, new_data, prior="estimated"): r"""Bayesian GTM classifier: complete model Parameters ========== optimizedModel : instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` Optimized GTM model built using a training set of shape (n_individuals, n_dimensions) labels : array of shape (n_individuals, 1) Labels (discrete or continuous) associated with training set new_data : array of shape (n_test, n_dimensions) New data matrix (test set). prior : {'estimated', 'equiprobable'}, optional Only used for classification. Sets priors (Bayes' theorem) in :func:`~ugtm.ugtm_landscape.classMap`. Returns ======= a dict The output is a dictionary defined as follows: 1. output["optimizedModel"]: original training set GTM model, instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` 2. output["indiv_projections"]: test set GTM model, instance of :class:`~ugtm.ugtm_classes.OptimizedGTM` 3. output["indiv_probabilities"]: class probabilities for each individual (= dot product between test responsibility matrix and posterior class probabilities) 4. output["indiv_predictions"]: class prediction for each individual (argmax of output["indiv_probabilities"]) 5. output["group_projections"]: average responsibility vector for the entire test set 6. output["group_probabilities"]: posterior class probabilities for the entire test set (dot product between output["group_projections"] and posterior class probabilities) 7. output["uniqClasses"]: classes Notes ===== This function computes GTM class predictions by using posterior probabilities of classes weighted by responsibilities. 1. generate GTM class map (:func:`~ugtm.ugtm_landscape.classMap`); 2. Project new data (:func:`~ugtm.ugtm_gtm.projection`) on optimized GTM model (:class:`~ugtm.ugtm_classes.OptimizedGTM`) 3. Projected data responsibilities R are used as weights to find outcome :math:`C_{max}` for each tested instance: :math:`C_{max} = \operatorname*{arg\,max}_C \sum_k{R_{ki} P(C|k)}` The algorithm is the same as in :func:`~ugtm.ugtm_predicions.predictBayes`, but this function returns a complete output including original training set optimized GTM model, test set GTM model, individual class probabilities for each individual, class prediction for each individual, group projections (average position of the whole test set on the map), class probabilities for the whole test set, and classes used to build the classification model. """ predicted = {} cl = ugtm_landscape.classMap(optimizedModel, labels, prior) activityModel = cl.nodeClassP projected = projection(optimizedModel, new_data) predicted["optimizedModel"] = optimizedModel predicted["indiv_projections"] = projected predicted["indiv_probabilities"] = np.dot(projected.matR, activityModel) predicted["indiv_predictions"] = np.argmax( predicted["indiv_probabilities"], axis=1) predicted["group_projections"] = np.mean(projected.matR, axis=0) predicted["group_probabilities"] = np.dot( predicted["group_projections"], activityModel) predicted["uniqClasses"] = cl.uniqClasses return predicted
[docs] def printClassPredictions(prediction, output): r"""Print output of :func:`~ugtm.ugtm_predictions.advancedPredictBayes`. Parameters ========== prediction : dict Output of :func:`~ugtm.ugtm_predictions.advancedPredictBayes`. With following keys: "optimizedModel": :class:`~ugtm.ugtm_classes.OptimizedGTM`, "indiv_projections": :class:`~ugtm.ugtm_classes.OptimizedGTM`, "indiv_probabilities": array of shape (n_individuals, n_classes), "indiv_predictions": array of shape (n_individuals, 1), "group_projections": array of shape (n_nodes, 1), "group_probabilities": array of shape (n_probabilities, 1), "uniqClasses": array of shape(n_classes, 1) output : str Output path to write class prediction model (prediction dictionary). Returns ======= CSV files 1. output_indiv_probabilities.csv 2. output_indiv_predictions.csv 3. output_group_probabilities.csv """ string = "Classes_in_this_order:" count = 0 grouproba = prediction["group_probabilities"] for i in range(len(prediction["uniqClasses"])): string += str(count)+"="+str(prediction["uniqClasses"][i])+";" count = count + 1 predvec = [prediction["uniqClasses"][j] for j in prediction["indiv_predictions"]] np.savetxt(fname=output+"_indiv_probabilities.csv", X=prediction["indiv_probabilities"], delimiter=",", header=string, fmt='%.2f') np.savetxt(fname=output+"_indiv_predictions.csv", X=prediction["indiv_predictions"], delimiter=",", header=string, fmt='%s') np.savetxt(fname=output+"_indiv_predictions_label.csv", X=predvec, delimiter=",", header=string, fmt='%s') np.savetxt(fname=output+"_group_probabilities.csv", X=grouproba.reshape(1, grouproba.shape[0]), delimiter=",", header=string, fmt='%.2f') print("Wrote to disk:") print("%s: individual probabilities" % (output+"_indiv_probabilities.csv")) print("%s: individual predictions" % (output+"_indiv_predictions.csv")) print("%s: group probabilities" % (output+"_group_probabilities.csv")) print("")
[docs] def GTC(train, labels, test, k=16, m=4, s=0.3, regul=0.1, n_neighbors=1, niter=200, representation="modes", doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234, predict_mode="bayes", prior="estimated"): r"""Run GTC (GTM classification): Bayes or nearest node algorithm. Parameters ========== train : array of shape (n_train, n_dimensions) Train set data matrix. labels : array of shape (n_train, 1) Labels for train set. test : array of shape (n_test, n_dimensions) Test set data matrix. k : int, optional (default = 16) If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2. k is the sqrt of the number of GTM nodes. One of four GTM hyperparameters (k, m, s, regul). Ex: k = 25 means the GTM will be discretized into a 25x25 grid. m : int, optional (default = 4) If m is set to 0, m is computed as sqrt(k). (generally good rule of thumb). m is the qrt of the number of RBF centers. One of four GTM hyperparameters (k, m, s, regul). Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid. s : float, optional (default = 0.3) RBF width factor. One of four GTM hyperparameters (k, m, s, regul). Parameter to tune width of RBF functions. Impacts manifold flexibility. regul : float, optional (default = 0.1) One of four GTM hyperparameters (k, m, s, regul). Regularization coefficient. Impacts manifold flexibility. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). At the moment, n_neighbors is always equal to 1. niter : int, optional (default = 200) Number of iterations for EM algorithm. representation : {"modes", "means"} 2D GTM representation for the test set, used for kNN algorithms: "modes" for position with max. responsibility, "means" for average position (usual GTM representation) doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. predict_mode : {"bayes", "knn"}, optional Choose between nearest node algorithm ("knn", output of :func:`~ugtm.ugtm_predictions.predictNN`) or GTM Bayes classifier ("bayes", output of :func:`~ugtm.ugtm_predictions.predictBayes`). NB: the kNN algorithm is limited to only 1 nearest node at the moment (n_neighbors = 1). prior : {"estimated", "equiprobable"}, optional Type of prior used to build GTM class map (:func:`~ugtm.ugtm_landscape.classMap`). Choose "estimated" to account for class imbalance. Returns ======= array of shape (n_test, 1) Predicted class for test set individuals. Notes ===== The GTM nearest node classifier (predict_mode = "knn", :func:`~ugtm.ugtm_predictions.predictNN`): 1. A GTM class map (GTM colored by class) is built using the training set (:func:`~ugtm.ugtm_landscape.classMap`); the class map is discretized into nodes, and each node has a class label 2. The test set is projected onto the GTM map 3. A 2D GTM representation is chosen for the test set (representation = modes or means) 4. Nearest node on the GTM map is found for each test set individual 5. The predicted label for each individual is the label of its nearest node on the GTM map The GTM Bayes classifier (predict_mode = "bayes", :func:`~ugtm.ugtm_predictions.predictBayes`): 1. A GTM class map (GTM colored by class) is built using the training set (:func:`~ugtm.ugtm_landscape.classMap`); the class map is discretized into nodes, and each node has posterior class probabilities 2. The test set is projected onto the GTM map 3. The GTM representation for each individual is its responsibility vector (posterior probability distribution on the map) 4. The probabilities of belonging to each class for a specific individual are computed as an average of posterior class probabilities (array of shape (n_nodes_n,classes)), weighted by the individual's responsibilities on the GTM map (array of shape (1, n_nodes)) """ if k == 0: k = int(np.sqrt(5*np.sqrt(train.shape[0])))+2 if m == 0: m = int(np.sqrt(k)) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components, missing, missing_strategy) initialModel = initialize(processed.train, k, m, s, random_state=random_state) optimizedModel = optimize(processed.train, initialModel, regul, niter, 0) if predict_mode == "knn": prediction = predictNN(optimizedModel, labels, processed.test, "classification", n_neighbors, representation, prior) elif predict_mode == "bayes": prediction = predictBayes( optimizedModel, labels, processed.test, prior) return prediction
[docs] def GTR(train, labels, test, k=16, m=4, s=0.3, regul=0.1, n_neighbors=1, niter=200, representation="modes", doPCA=False, n_components=-1, missing=False, missing_strategy='median', random_state=1234): r"""Run GTR (GTM nearest node(s) regression). Parameters ========== train : array of shape (n_train, n_dimensions) Train set data matrix. labels : array of shape (n_train, 1) Labels for train set. test : array of shape (n_test, n_dimensions) Test set data matrix. k : int, optional (default = 16) If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2. k is the sqrt of the number of GTM nodes. One of four GTM hyperparameters (k, m, s, regul). Ex: k = 25 means the GTM will be discretized into a 25x25 grid. m : int, optional (default = 4) If m is set to 0, m is computed as sqrt(k). (generally good rule of thumb). m is the qrt of the number of RBF centers. One of four GTM hyperparameters (k, m, s, regul). Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid. s : float, optional (default = 0.3) RBF width factor. One of four GTM hyperparameters (k, m, s, regul). Parameter to tune width of RBF functions. Impacts manifold flexibility. regul : float, optional (default = 0.1) One of four GTM hyperparameters (k, m, s, regul). Regularization coefficient. Impacts manifold flexibility. n_neighbors : int, optional (default = 1) Number of neighbors for kNN algorithm (number of nearest nodes). niter : int, optional (default = 200) Number of iterations for EM algorithm. representation : {"modes", "means"} 2D GTM representation for the test set, used for kNN algorithms: "modes" for position with max. responsibility, "means" for average position (usual GTM representation) doPCA : bool, optional (default = False) Apply PCA pre-processing. n_components : int, optional (default = -1) Number of components for PCA pre-processing. If set to -1, keep principal components accounting for 80% of data variance. missing : bool, optional (default = True) Replace missing values (calls scikit-learn functions). missing_strategy : str, optional (default = 'median') Scikit-learn missing data strategy. random_state : int, optional (default = 1234) Random state. Returns ======= array of shape (n_test, 1) Predicted class for test set individuals. Notes ===== The GTM nearest node(s) regression (:func:`~ugtm.ugtm_predictions.predictNN`): 1. A GTM landscape (GTM colored by activity) is built using the training set (:func:`~ugtm.ugtm_landscape.landscape`); the landscape is discretized into nodes, and each node has an estimated activity value 2. The test set is projected onto the GTM map 3. A 2D GTM representation is chosen for the test set (representation = modes or means) 4. Nearest node(s) on the GTM map is found for each test set individual 5. The predicted activity for each individual is a weighted average of nearest node activities. """ if k == 0: k = int(np.sqrt(5*np.sqrt(train.shape[0])))+2 if m == 0: m = int(np.sqrt(k)) processed = ugtm_preprocess.processTrainTest(train, test, doPCA, n_components) initialModel = initialize(processed.train, k, m, s, random_state=random_state) optimizedModel = optimize(processed.train, initialModel, regul, niter, 0) prediction = predictNN(optimizedModel=optimizedModel, labels=labels, new_data=processed.test, modeltype="regression", n_neighbors=n_neighbors, representation=representation) return prediction