# Source code for GPy.likelihoods.student_t

# Copyright (c) 2012-2014 Ricardo Andrade, Alan Saul

import numpy as np
import scipy as sp
from scipy import stats, special
from scipy import stats, integrate
from scipy.special import gammaln, gamma
from scipy.special import psi as digamma

from paramz.transformations import Logexp

from . import link_functions
from .likelihood import Likelihood
from ..core.parameterization import Param

class StudentT(Likelihood):
    """
    Student T likelihood

    For nomenclature see Bayesian Data Analysis 2003 p576

    .. math::
        p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - f_{i})^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}

    """
    def __init__(self, gp_link=None, deg_free=5, sigma2=2):
        # NOTE(review): the `def`/`super()` lines were destroyed by the doc
        # extraction; reconstructed from the GPy upstream source — confirm
        # against the repository.
        if gp_link is None:
            gp_link = link_functions.Identity()
        super(StudentT, self).__init__(gp_link, name='Student_T')

        # sigma2 is not a noise parameter, it is a squared scale.
        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
        self.v = Param('deg_free', float(deg_free), Logexp())
        # Registration order matters: update_gradients indexes grads by it
        # (sigma2 first, then v).
        self.link_parameter(self.sigma2)
        self.link_parameter(self.v)
        #self.v.constrain_fixed()

        self.log_concave = False

"""
Pull out the gradients, be careful as the order must match the order
in which the parameters are added
"""

"""

.. math::
p(y_{i}|\\lambda(f_{i})) = \\frac{\\Gamma\\left(\\frac{v+1}{2}\\right)}{\\Gamma\\left(\\frac{v}{2}\\right)\\sqrt{v\\pi\\sigma^{2}}}\\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)^{\\frac{-v+1}{2}}

:param y: data
:type y: Nx1 array
:returns: likelihood evaluated for this point
:rtype: float
"""
#Careful gamma(big_number) is infinity!
objective = ((np.exp(gammaln((self.v + 1)*0.5) - gammaln(self.v * 0.5))
/ (np.sqrt(self.v * np.pi * self.sigma2)))
* ((1 + (1./float(self.v))*((e**2)/float(self.sigma2)))**(-0.5*(self.v + 1)))
)
return np.prod(objective)

"""

.. math::
\\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)

:param y: data
:type y: Nx1 array
:returns: likelihood evaluated for this point
:rtype: float

"""
#FIXME:
#Why does np.log(1 + (1/self.v)*((y-inv_link_f)**2)/self.sigma2) suppress the divide by zero?!
#But np.log(1 + (1/float(self.v))*((y-inv_link_f)**2)/self.sigma2) throws it correctly
#print - 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
objective = (+ gammaln((self.v + 1) * 0.5)
- gammaln(self.v * 0.5)
- 0.5*np.log(self.sigma2 * self.v * np.pi)
- 0.5*(self.v + 1)*np.log(1 + (1/np.float(self.v))*((e**2)/self.sigma2))
)
return objective

"""

.. math::
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}

:param y: data
:type y: Nx1 array
:returns: gradient of likelihood evaluated at points
:rtype: Nx1 array

"""
grad = ((self.v + 1) * e) / (self.v * self.sigma2 + (e**2))

"""
The hessian will be 0 unless i == j

.. math::
\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}

:param y: data
:type y: Nx1 array
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
:rtype: Nx1 array

.. Note::
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
"""
hess = ((self.v + 1)*(e**2 - self.v*self.sigma2)) / ((self.sigma2*self.v + e**2)**2)
return hess

"""

.. math::
\\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}

:param y: data
:type y: Nx1 array
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
d3lik_dlink3 = ( -(2*(self.v + 1)*(-e)*(e**2 - 3*self.v*self.sigma2)) /
((e**2 + self.sigma2*self.v)**3)
)

"""
Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)

.. math::
\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}

:param y: data
:type y: Nx1 array
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: float
"""
e2 = np.square(e)
dlogpdf_dvar = self.v*(e2 - self.sigma2)/(2*self.sigma2*(self.sigma2*self.v + e2))
return dlogpdf_dvar

"""
Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)

.. math::
\\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}

:param y: data
:type y: Nx1 array
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: Nx1 array
"""

"""

.. math::
\\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}

:param y: data
:type y: Nx1 array
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
:rtype: Nx1 array
"""
d2logpdf_dlink2_dvar = ( (self.v*(self.v+1)*(self.sigma2*self.v - 3*(e**2)))
/ ((self.sigma2*self.v + (e**2))**3)
)

def dlogpdf_link_dr(self, inv_link_f, y, Y_metadata=None):
    """
    Gradient of the log likelihood w.r.t. the degrees-of-freedom
    parameter v (deg_free).

    :param inv_link_f: latent variables passed through the inverse link
    :type inv_link_f: Nx1 array
    :param y: data
    :type y: Nx1 array
    :param Y_metadata: metadata (unused here)
    :returns: derivative of log likelihood evaluated at points f w.r.t deg_free
    :rtype: Nx1 array
    """
    e = y - inv_link_f
    e2 = np.square(e)
    # float() works on the size-1 Params and on plain scalars alike; the
    # original `float(self.v[:])` required slicing support for no benefit.
    df = float(self.v)
    s2 = float(self.sigma2)
    # d/dv of: gammaln((v+1)/2) - gammaln(v/2) - 0.5*log(v*pi*s2)
    dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
    # d/dv of the -(v+1)/2 * log(1 + e^2/(v*s2)) term, split in two parts.
    dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
    dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
    return dlogpdf_dv

def dlogpdf_dlink_dr(self, inv_link_f, y, Y_metadata=None):
    """
    Derivative of dlogpdf_dlink w.r.t. the degrees-of-freedom parameter v:
    d/dv of (v+1)e/(e^2 + sigma^2 v) = e(e^2 - sigma^2)/(e^2 + sigma^2 v)^2.

    :param inv_link_f: latent variables passed through the inverse link
    :type inv_link_f: Nx1 array
    :param y: data
    :type y: Nx1 array
    :param Y_metadata: metadata (unused here)
    :returns: derivative of the link gradient w.r.t deg_free
    :rtype: Nx1 array
    """
    e = y - inv_link_f
    e2 = np.square(e)
    df = float(self.v)
    s2 = float(self.sigma2)
    dlogpdf_df_dv = e*(e2 - s2)/(e2 + s2*df)**2
    return dlogpdf_df_dv

def d2logpdf_dlink2_dr(self, inv_link_f, y, Y_metadata=None):
    """
    Derivative of the hessian (d2logpdf_dlink2) w.r.t. the
    degrees-of-freedom parameter v, via the quotient rule on
    (v+1)(e^2 - v sigma^2)/(e^2 + v sigma^2)^2.

    :param inv_link_f: latent variables passed through the inverse link
    :type inv_link_f: Nx1 array
    :param y: data
    :type y: Nx1 array
    :param Y_metadata: metadata (unused here)
    :returns: derivative of the hessian diagonal w.r.t deg_free
    :rtype: Nx1 array
    """
    e = y - inv_link_f
    e2 = np.square(e)
    df = float(self.v)
    s2 = float(self.sigma2)
    e2_s2v = e**2 + s2*df
    # First term: derivative of the numerator; second: of the denominator.
    d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
    return d2logpdf_df2_dv

def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
    """
    Stack the gradients w.r.t. both likelihood parameters (sigma2, then v)
    in the same order they are linked in __init__.

    NOTE(review): only the `return` line survived extraction; the two
    delegating calls are reconstructed from GPy upstream — confirm.

    :param f: latent variables passed through the inverse link
    :type f: Nx1 array
    :param y: data
    :type y: Nx1 array
    :returns: stacked (dL/dsigma2, dL/dv) gradients
    :rtype: 2xNx1 array
    """
    dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
    dlogpdf_dv = self.dlogpdf_link_dr(f, y, Y_metadata=Y_metadata)
    return np.array((dlogpdf_dvar, dlogpdf_dv))

def predictive_mean(self, mu, sigma, Y_metadata=None):
    """
    Predictive mean: the latent posterior is pushed through the link.
    Only true if link is monotonic, which it is. (The original inline
    comment noted it confused mean and median.)

    :param mu: latent mean
    :param sigma: latent standard deviation (unused here)
    :returns: predictive mean
    """
    return self.gp_link.transf(mu)

def predictive_variance(self, mu, variance, predictive_mean=None, Y_metadata=None):
    """
    Predictive variance of the Student-t observation model.

    :param mu: latent mean
    :param variance: latent variance
    :param predictive_mean: optional precomputed predictive mean
    :returns: predictive variance, or NaNs where it is undefined
    """
    if self.deg_free <= 2.:
        # The Student-t variance does not exist for degrees of freedom <= 2.
        return np.empty(mu.shape)*np.nan
    else:
        return super(StudentT, self).predictive_variance(mu, variance, predictive_mean, Y_metadata)

def conditional_mean(self, gp):
    """
    Mean of y given the latent variable gp.

    NOTE(review): the body was lost in extraction; reconstructed from GPy
    upstream (the latent value pushed through the link) — confirm.
    """
    return self.gp_link.transf(gp)
def conditional_variance(self, gp):
    """
    Variance of y given the latent variable gp: v/(v-2) for a Student-t
    (independent of gp, which is accepted only for interface uniformity).
    """
    return self.deg_free/(self.deg_free - 2.)

"""
Returns a set of samples of observations based on a given value of the latent variable.

:param gp: latent variable
"""
orig_shape = gp.shape
gp = gp.flatten()
#FIXME: Very slow as we are computing a new random variable per input!
#Can't get it to sample all at the same time
#student_t_samples = np.array([stats.t.rvs(self.v, self.gp_link.transf(gpj),scale=np.sqrt(self.sigma2), size=1) for gpj in gp])
dfs = np.ones_like(gp)*self.v
scales = np.ones_like(gp)*np.sqrt(self.sigma2)