Source code for ugtm.ugtm_kgtm

"""Functions to initialize and optimize kernel GTM models.
"""
# Authors: Helena A. Gaspar <hagax8@gmail.com>
# License: MIT

from __future__ import print_function
import numpy as np
from sklearn.decomposition import PCA
from . import ugtm_preprocess
from . import ugtm_classes
from . import ugtm_core



[docs]
def initializeKernel(data, k, m, s, maxdim, random_state=1234):
    r"""Initializes a kernel GTM (kGTM) model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Data matrix.
    k : int
        Sqrt of the number of GTM nodes.
        Ex: k = 25 means the GTM will be discretized into a 25x25 grid.
    m : int
        Sqrt of the number of RBF centers.
        Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid.
    s : float
        RBF width factor.
        Parameter to tune width of RBF functions.
        Impacts manifold flexibility.
    maxdim : int
        Max boundary for internal dimensionality estimation.
    random_state : int, optional (default = 1234)
        Random state.

    Returns
    =======
    instance of :class:`~ugtm.ugtm_classes.InitialGTM`
        Initial GTM model (not optimized).
    """
    n_individuals = data.shape[0]
    n_nodes = k*k
    n_rbf_centers = m*m
    x = np.linspace(-1, 1, k)
    matX = np.transpose(np.meshgrid(x, x)).reshape(k*k, 2)
    x = np.linspace(-1, 1, m)
    matM = np.transpose(np.meshgrid(x, x)).reshape(m*m, 2)
    if m == 1:
        matM = np.array([[0.0, 0.0]])
    if k == 1:
        matX = np.array([[0.0, 0.0]])
    rbfWidth = ugtm_core.computeWidth(matM, n_rbf_centers, s)
    matPhiMPlusOne = ugtm_core.createPhiMatrix(
        matX, matM, n_nodes, n_rbf_centers, rbfWidth)
    pca = PCA(random_state=random_state)
    pca.fit(data)
    matW = (pca.components_.T * np.sqrt(pca.explained_variance_)
            )[:, 0:n_rbf_centers+1]
    n_dimensions = np.searchsorted(
        pca.explained_variance_ratio_.cumsum(), 0.995)+1
    if n_dimensions > maxdim:
        n_dimensions = maxdim
    matD = ugtm_core.KERNELcreateDistanceMatrix(data, matW, matPhiMPlusOne)
    betaInv = ugtm_core.initBetaInvRandom(matD, n_nodes, n_individuals,
                                          n_dimensions)
    matY = ugtm_core.createYMatrixInit(data, matW, matPhiMPlusOne)
    return ugtm_classes.InitialGTM(matX, matM, n_nodes, n_rbf_centers,
                                   rbfWidth,
                                   matPhiMPlusOne, matW,
                                   matY, betaInv, n_dimensions)




[docs]
def optimizeKernel(data, initialModel, regul, niter, verbose=True):
    r"""Optimizes a kGTM model.

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Data matrix.
    initialModel : instance of :class:`~ugtm.ugtm_classes.InitialGTM`
        Initial kGTM model.
        The initial model is separate from the optimized model
        so that different data sets can be potentially used for initialization
        and optimization.
    regul : float
        Regularization coefficient.
    niter : int
        Number of iterations for EM algorithm.
    verbose : bool, optional (default = True)
        Verbose mode (outputs loglikelihood values during EM algorithm).

    Returns
    =======
    instance of :class:`~ugtm.ugtm_classes.OptimizedGTM`
        Optimized kGTM model.
    """
    matD = ugtm_core.KERNELcreateDistanceMatrix(
        data, initialModel.matW, initialModel.matPhiMPlusOne)
    matY = initialModel.matY
    betaInv = initialModel.betaInv
    i = 1
    diff = 1000
    converged = 0
    while (i < (niter+1)) and (converged < 4):
        # expectation
        matP = ugtm_core.createPMatrix(matD, betaInv,
                                       initialModel.n_dimensions)
        matR = ugtm_core.createRMatrix(matP)
        # maximization
        matG = ugtm_core.createGMatrix(matR)
        matW = ugtm_core.optimLMatrix(
            matR, initialModel.matPhiMPlusOne, matG, betaInv, regul)
        matY = ugtm_core.createYMatrix(matW, initialModel.matPhiMPlusOne)
        matD = ugtm_core.KERNELcreateDistanceMatrix(
            data, matW, initialModel.matPhiMPlusOne)
        betaInv = ugtm_core.optimBetaInv(matR, matD, initialModel.n_dimensions)
        # objective function
        if i == 1:
            loglike = ugtm_core.computelogLikelihood(
                matP, betaInv, initialModel.n_dimensions)
        else:
            loglikebefore = loglike
            loglike = ugtm_core.computelogLikelihood(
                matP, betaInv, initialModel.n_dimensions)
            diff = abs(loglikebefore-loglike)
        if verbose is True:
            print("Iter ", i, " Err: ", loglike)
        if diff <= 0.0001:
            converged += 1
        else:
            converged = 0
        i += 1
    if verbose is True:
        if converged >= 3:
            print("Converged: ", loglike)
    if converged >= 3:
        has_converged = True
    else:
        has_converged = False
    matY = ugtm_core.createYMatrix(matW, initialModel.matPhiMPlusOne)
    matMeans = ugtm_core.meanPoint(matR, initialModel.matX)
    matModes = ugtm_core.modePoint(matR, initialModel.matX)
    return ugtm_classes.OptimizedGTM(matW, matY, matP.T, matR.T,
                                     betaInv, matMeans,
                                     matModes,
                                     initialModel.matX,
                                     initialModel.n_dimensions,
                                     has_converged)




[docs]
def runkGTM(data,  k=16, m=4, s=0.3, regul=0.1, maxdim=100,
            doPCA=False, doKernel=False, kernel="linear",
            n_components=-1,
            missing=True, missing_strategy="median",
            random_state=1234, niter=200,
            verbose=False):
    r"""Run kGTM algorithm (wrapper for initialize + optimize).

    Parameters
    ==========
    data : array of shape (n_individuals, n_dimensions)
        Data matrix.
    k : int, optional (default = 16)
        If k is set to 0, k is computed as sqrt(5*sqrt(n_individuals))+2.
        k is the sqrt of the number of GTM nodes.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: k = 25 means the GTM will be discretized into a 25x25 grid.
    m : int, optional (default = 4)
        If m is set to 0, m is computed as sqrt(k).
        m is the qrt of the number of RBF centers.
        One of four GTM hyperparameters (k, m, s, regul).
        Ex: m = 5 means the RBF functions will be arranged on a 5x5 grid.
    s : float, optional (default = 0.3)
        RBF width factor.
        Parameter to tune width of RBF functions.
        Impacts manifold flexibility.
    regul : float, optional (default = 0.1)
        One of four GTM hyperparameters (k, m, s, regul).
        Regularization coefficient.
    maxdim : int
        Max boundary for internal dimensionality estimation.
        Internal dimensionality is estimated as number of principal components
        accounting for 99.5% of data variance. If this value is higher than
        maxdim, it is replaced by maxim.
    doPCA : bool, optional (default = False)
        Apply PCA pre-processing.
    doKernel : bool, optional (default = False)
        If doKernel is False, the data is supposed to be a kernel already.
        If doKernel is True, a kernel will be computed from the data.
    kernel : scikit-learn kernel (default = "linear")
    n_components : int, optional (default = -1)
        Number of components for PCA pre-processing.
        If set to -1, keep principal components
        accounting for 80% of data variance.
    missing : bool, optional (default = True)
        Replace missing values (calls scikit-learn functions).
    missing_strategy : str (default = 'median')
        Scikit-learn missing data strategy.
    random_state : int (default = 1234)
        Random state.
    niter : int, optional (default = 200)
        Number of iterations for EM algorithm.
    verbose : bool, optional (default = False)
        Verbose mode (outputs loglikelihood values during EM algorithm).

    Returns
    =======
    instance of :class:`~ugtm.ugtm_classes.OptimizedGTM`
        Optimized kGTM model.
    """

    if k == 0:
        k = int(np.sqrt(5*np.sqrt(data.shape[0])))+2
    if m == 0:
        m = int(np.sqrt(k))

    data = ugtm_preprocess.pcaPreprocess(data=data, doPCA=doPCA,
                                         n_components=n_components,
                                         missing=missing,
                                         missing_strategy=missing_strategy,
                                         random_state=random_state)
    if doKernel or data.shape[0] != data.shape[1]:
        data = ugtm_preprocess.chooseKernel(data, kernel)
    initialModel = initializeKernel(data, k, m, s, random_state)
    optimizedModel = optimizeKernel(
        data, initialModel, regul, niter, verbose=verbose)
    return optimizedModel