Source code for GPy.kern.src.mlp

# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
import numpy as np
from paramz.caching import Cache_this
four_over_tau = 2./np.pi

class MLP(Kern):
    """
    Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

    .. math::

          k(x,y) = \\sigma^{2}\\frac{2}{\\pi}  \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\\sigma^2`
    :type variance: float
    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\\sigma^2_w`
    :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
    :param bias_variance: the variance of the prior over bias parameters :math:`\\sigma^2_b`
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
    :type ARD: Boolean
    :rtype: Kernpart object
    """
    def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=1., ARD=False, active_dims=None, name='mlp'):
        super(MLP, self).__init__(input_dim, active_dims, name)
        self.variance = Param('variance', variance, Logexp())
        self.ARD = ARD
        if ARD:
            wv = np.empty((input_dim,))
            wv[:] = weight_variance
            weight_variance = wv
        self.weight_variance = Param('weight_variance', weight_variance, Logexp())
        self.bias_variance = Param('bias_variance', bias_variance, Logexp())
        self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
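A minimal usage sketch (not part of the module source), assuming a standard GPy installation where this class is exposed as GPy.kern.MLP:

import numpy as np
import GPy

X = np.random.randn(5, 2)                                   # 5 points in 2 dimensions
k = GPy.kern.MLP(input_dim=2, variance=1.,
                 weight_variance=2., bias_variance=0.5)     # isotropic (ARD=False by default)
K = k.K(X)                                                  # 5 x 5 covariance matrix
print(K.shape, np.allclose(K, K.T))                         # (5, 5) True -- symmetric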
    @Cache_this(limit=3, ignore_args=())
    def K(self, X, X2=None):
        """Compute the covariance matrix between X and X2 (or X with itself if X2 is None)."""
        if X2 is None:
            X_denom = np.sqrt(self._comp_prod(X)+1.)
            X2_denom = X_denom
            X2 = X
        else:
            X_denom = np.sqrt(self._comp_prod(X)+1.)
            X2_denom = np.sqrt(self._comp_prod(X2)+1.)
        XTX = self._comp_prod(X, X2)/X_denom[:, None]/X2_denom[None, :]
        return self.variance*four_over_tau*np.arcsin(XTX)
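K normalises the weighted cross products by the per-point denominators before applying arcsin, which is exactly the formula in the class docstring. As an illustrative check (not part of the source, and assuming the GPy.kern.MLP entry point), the same matrix can be rebuilt directly in NumPy:

import numpy as np
import GPy

X = np.random.randn(6, 3)
k = GPy.kern.MLP(input_dim=3, variance=0.7, weight_variance=2.0, bias_variance=0.5)

inner = 2.0 * X.dot(X.T) + 0.5                              # sigma_w^2 x^T y + sigma_b^2
denom = np.sqrt(2.0 * np.square(X).sum(1) + 0.5 + 1.0)      # sqrt(sigma_w^2 x^T x + sigma_b^2 + 1)
K_direct = 0.7 * (2.0 / np.pi) * np.arcsin(inner / denom[:, None] / denom[None, :])

print(np.allclose(k.K(X), K_direct))                        # expected: True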
    @Cache_this(limit=3, ignore_args=())
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix for X."""
        X_prod = self._comp_prod(X)
        return self.variance*four_over_tau*np.arcsin(X_prod/(X_prod+1.))
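On the diagonal the numerator and denominator share the same X_prod term, so the argument of arcsin reduces to X_prod/(X_prod+1). A quick consistency sketch (illustrative only, not part of the source) compares Kdiag against the diagonal of the full matrix:

import numpy as np
import GPy

X = np.random.randn(10, 4)
k = GPy.kern.MLP(input_dim=4)
print(np.allclose(k.Kdiag(X), np.diag(k.K(X))))             # expected: True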
    def update_gradients_full(self, dL_dK, X, X2=None):
        """Derivative of the covariance with respect to the parameters."""
        dvar, dw, db = self._comp_grads(dL_dK, X, X2)[:3]
        self.variance.gradient = dvar
        self.weight_variance.gradient = dw
        self.bias_variance.gradient = db
    def update_gradients_diag(self, dL_dKdiag, X):
        """Derivative of the diagonal of the covariance with respect to the parameters."""
        dvar, dw, db = self._comp_grads_diag(dL_dKdiag, X)[:3]
        self.variance.gradient = dvar
        self.weight_variance.gradient = dw
        self.bias_variance.gradient = db
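Both updates follow the usual GPy convention: given dL_dK (or dL_dKdiag), they store dL/dtheta on each parameter's .gradient attribute. A hedged finite-difference sketch (not part of the source; the surrogate objective L = sum(A * K) and the helper L_of_variance are illustrative names) checks the variance gradient:

import numpy as np
import GPy

X = np.random.randn(8, 2)
A = np.random.randn(8, 8)
A = 0.5 * (A + A.T)              # plays the role of dL/dK for the surrogate objective L = sum(A * K)
k = GPy.kern.MLP(input_dim=2, variance=0.9, weight_variance=1.5, bias_variance=0.3)

k.update_gradients_full(A, X)    # stores dL/dvariance, dL/dweight_variance, dL/dbias_variance

def L_of_variance(v):            # hypothetical helper: rebuild the kernel with a perturbed variance
    kk = GPy.kern.MLP(input_dim=2, variance=v, weight_variance=1.5, bias_variance=0.3)
    return np.sum(A * kk.K(X))

eps = 1e-6
numeric = (L_of_variance(0.9 + eps) - L_of_variance(0.9 - eps)) / (2 * eps)
print(k.variance.gradient, numeric)   # the two values should agree closely

The same comparison, across all parameters at once, is what GPy's model-level checkgrad() utility automates once the kernel is embedded in a model.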
    def gradients_X(self, dL_dK, X, X2):
        """Derivative of the covariance matrix with respect to X."""
        return self._comp_grads(dL_dK, X, X2)[3]
    def gradients_X_X2(self, dL_dK, X, X2):
        """Derivatives of the covariance matrix with respect to X and X2, returned as a pair."""
        return self._comp_grads(dL_dK, X, X2)[3:]
    def gradients_X_diag(self, dL_dKdiag, X):
        """Gradient of the diagonal of the covariance with respect to X."""
        return self._comp_grads_diag(dL_dKdiag, X)[3]
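gradients_X returns dL/dX under the same convention, with dL_dK playing the role of the upstream gradient. A single-entry finite-difference sketch (illustrative only, using an arbitrary matrix A as dL_dK) is enough to sanity-check it:

import numpy as np
import GPy

X = np.random.randn(5, 3)
X2 = np.random.randn(4, 3)
A = np.random.randn(5, 4)        # stands in for dL/dK
k = GPy.kern.MLP(input_dim=3)

dX = k.gradients_X(A, X, X2)     # same shape as X

eps = 1e-6
Xp, Xm = X.copy(), X.copy()
Xp[2, 1] += eps
Xm[2, 1] -= eps
numeric = (np.sum(A * k.K(Xp, X2)) - np.sum(A * k.K(Xm, X2))) / (2 * eps)
print(dX[2, 1], numeric)         # should agree closely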
    @Cache_this(limit=3, ignore_args=())
    def _comp_prod(self, X, X2=None):
        # Weighted products under the priors: per-point sigma_w^2 x^T x + sigma_b^2 if X2 is None,
        # otherwise the full cross-product matrix sigma_w^2 X X2^T + sigma_b^2.
        if X2 is None:
            return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
        else:
            return (X*self.weight_variance).dot(X2.T)+self.bias_variance

    @Cache_this(limit=3, ignore_args=(1,))
    def _comp_grads(self, dL_dK, X, X2=None):
        var, w, b = self.variance, self.weight_variance, self.bias_variance
        K = self.K(X, X2)
        dvar = (dL_dK*K).sum()/var
        X_prod = self._comp_prod(X)
        X2_prod = self._comp_prod(X2) if X2 is not None else X_prod
        XTX = self._comp_prod(X, X2) if X2 is not None else self._comp_prod(X, X)
        # dL_dK scaled by the derivative of the arcsin link, d asin(u)/du = 1/sqrt(1-u^2),
        # with the normalising denominators folded into the square root.
        common = var*four_over_tau/np.sqrt((X_prod[:, None]+1.)*(X2_prod[None, :]+1.)-np.square(XTX))*dL_dK
        if self.ARD:
            if X2 is not None:
                XX2 = X[:, None, :]*X2[None, :, :]
                XX = np.square(X)
                X2X2 = np.square(X2)
                Q = self.weight_variance.shape[0]
                common_XTX = common*XTX
                dw = np.dot(common.flat, XX2.reshape(-1, Q)) - ((common_XTX.sum(1)/(X_prod+1.)).T.dot(XX)+(common_XTX.sum(0)/(X2_prod+1.)).dot(X2X2))/2
            else:
                XX2 = X[:, None, :]*X[None, :, :]
                XX = np.square(X)
                Q = self.weight_variance.shape[0]
                common_XTX = common*XTX
                dw = np.dot(common.flat, XX2.reshape(-1, Q)) - ((common_XTX.sum(0)+common_XTX.sum(1))/(X_prod+1.)).dot(XX)/2
        else:
            dw = (common*((XTX-b)/w-XTX*(((X_prod-b)/(w*(X_prod+1.)))[:, None]+((X2_prod-b)/(w*(X2_prod+1.)))[None, :])/2.)).sum()
        db = (common*(1.-XTX*(1./(X_prod[:, None]+1.)+1./(X2_prod[None, :]+1.))/2.)).sum()
        if X2 is None:
            common = common+common.T
            dX = common.dot(X)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:, None]*X*w
            dX2 = dX
        else:
            dX = common.dot(X2)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:, None]*X*w
            dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:, None]*X2*w
        return dvar, dw, db, dX, dX2

    @Cache_this(limit=3, ignore_args=(1,))
    def _comp_grads_diag(self, dL_dKdiag, X):
        var, w, b = self.variance, self.weight_variance, self.bias_variance
        K = self.Kdiag(X)
        dvar = (dL_dKdiag*K).sum()/var
        X_prod = self._comp_prod(X)
        common = var*four_over_tau/(np.sqrt(1-np.square(X_prod/(X_prod+1)))*np.square(X_prod+1))*dL_dKdiag
        if self.ARD:
            XX = np.square(X)
            dw = np.dot(common, XX)
        else:
            dw = (common*(X_prod-b)).sum()/w
        db = common.sum()
        dX = common[:, None]*X*w*2
        return dvar, dw, db, dX
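The ARD branches rely on self.weight_variance broadcasting over the input columns, so the only user-visible difference is the parameter shape. A short sketch (illustrative only, assuming the GPy.kern.MLP entry point) contrasts the isotropic and ARD parameterisations:

import numpy as np
import GPy

k_iso = GPy.kern.MLP(input_dim=3, ARD=False)
k_ard = GPy.kern.MLP(input_dim=3, ARD=True, weight_variance=[1., 2., 3.])

print(k_iso.weight_variance.shape)   # (1,) -- a single shared weight variance
print(k_ard.weight_variance.shape)   # (3,) -- one weight variance per input dimension

X = np.random.randn(5, 3)
print(k_ard.K(X).shape)              # (5, 5) -- ARD changes the parameterisation, not the output shape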