# -*- coding: utf-8 -*-
'''These loss functions are available for neural network models.'''
import numpy as np
import theano.tensor as TT
from . import util
class Loss(util.Registrar(str('Base'), (), {})):
r'''A loss function base class.
Parameters
----------
target : int or Theano variable
If this is an integer, it specifies the number of dimensions required to
store the target values for computing the loss. If it is a Theano
variable, this variable will be used directly to access target values.
weight : float, optional
The importance of this loss for the model being trained. Defaults to 1.
weighted : bool, optional
If True, a floating-point array of weights with the same dimensions as
``target`` will be required to compute the "weighted" loss. Defaults
to False.
output_name : str, optional
Name of the network output to tap for computing the loss. Defaults to
'out:out', the name of the default output of the last layer in a linear
network.
Attributes
----------
weight : float
The importance of this loss for the model being trained.
output_name : str
Name of the network output to tap for computing the loss.
'''
def __init__(self, target, weight=1., weighted=False, output_name='out'):
self.weight = weight
self._target = (util.FLOAT_CONTAINERS[target]('target')
if isinstance(target, int) else target)
self._weights = None
if weighted:
self._weights = util.FLOAT_CONTAINERS[self._target.ndim]('weights')
self.output_name = output_name
if ':' not in self.output_name:
self.output_name += ':out'
@property
def variables(self):
'''A list of Theano variables used in this loss.'''
result = [self._target]
if self._weights is not None:
result.append(self._weights)
return result
def log(self):
'''Log some diagnostic info about this loss.'''
util.log('using loss: {0.weight} * {0.__class__.__name__} '
'(output {0.output_name})', self)
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
raise NotImplementedError
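# Illustrative sketch (an addition, not part of the original module): the
# "weighted" form of every loss below replaces a plain mean with a weighted
# average, sum(w * per_element_loss) / sum(w). The helper name is hypothetical.
def _weighted_mean_example(per_element_loss, weights=None):
    '''Average a per-element loss, optionally weighting each element.'''
    per_element_loss = np.asarray(per_element_loss, dtype=float)
    if weights is None:
        return per_element_loss.mean()
    weights = np.asarray(weights, dtype=float)
    return (weights * per_element_loss).sum() / weights.sum()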
class MeanSquaredError(Loss):
r'''Mean-squared-error (MSE) loss function.
Notes
-----
The mean squared error (MSE) loss computes the mean of the squared
difference between the output of a computation graph
:math:`x = (x_1, \dots, x_d)` and its expected target value
:math:`t = (t_1, \dots, t_d)`. Mathematically,
.. math::
\begin{eqnarray*}
\mathcal{L}(x, t) &=& \frac{1}{d} \|x - t\|_2^2 \\
&=& \frac{1}{d} \sum_{i=1}^d (x_i - t_i)^2
\end{eqnarray*}
Whereas some MSE computations return the sum over dimensions, the MSE here
is computed as an average over the dimensionality of the data.
For cases where :math:`x` and :math:`t` are matrices, the MSE computes the
average over corresponding rows in :math:`x` and :math:`t`:
.. math::
\mathcal{L}(X, T) = \frac{1}{dm} \sum_{j=1}^m \sum_{i=1}^d (x_{ji} - t_{ji})^2
'''
__extra_registration_keys__ = ['MSE']
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
err = outputs[self.output_name] - self._target
if self._weights is not None:
return (self._weights * err * err).sum() / self._weights.sum()
return (err * err).mean()
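# Illustrative NumPy sketch (an addition, not part of the original module): the
# MSE above averages squared error over every element of the output array, so
# for a (batch, dim) matrix it divides by batch * dim, matching
# ``(err * err).mean()``. Useful for checking the Theano graph on arrays.
def _mse_reference(output, target):
    '''Reference mean-squared error over all array elements.'''
    err = np.asarray(output, dtype=float) - np.asarray(target, dtype=float)
    return (err * err).mean()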
class MeanAbsoluteError(Loss):
r'''Mean-absolute-error (MAE) loss function.
Notes
-----
The mean absolute error (MAE) loss computes the mean absolute difference between the
output of a computation graph :math:`x = (x_1, \dots, x_d)` and its expected
target value :math:`t = (t_1, \dots, t_d)`. Mathematically,
.. math::
\begin{eqnarray*}
\mathcal{L}(x, t) &=& \frac{1}{d} \|x - t\|_1 \\
&=& \frac{1}{d} \sum_{i=1}^d |x_i - t_i|
\end{eqnarray*}
Whereas some MAE computations return the sum over dimensions, the MAE here
is computed as an average over the dimensionality of the data.
For cases where :math:`x` and :math:`t` are matrices, the MAE computes the
average over corresponding rows in :math:`x` and :math:`t`.
'''
__extra_registration_keys__ = ['MAE']
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
err = outputs[self.output_name] - self._target
if self._weights is not None:
return abs(self._weights * err).sum() / self._weights.sum()
return abs(err).mean()
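# Illustrative NumPy sketch (an addition, not part of the original module): the
# MAE averages |x - t| over all elements, mirroring ``abs(err).mean()`` above.
def _mae_reference(output, target):
    '''Reference mean-absolute error over all array elements.'''
    err = np.asarray(output, dtype=float) - np.asarray(target, dtype=float)
    return np.abs(err).mean()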
class GaussianLogLikelihood(Loss):
r'''Gaussian Log Likelihood (GLL) loss function.
Parameters
----------
mean_name : str, optional
Name of the network graph output to use for the mean of the Gaussian
distribution. Defaults to 'mean'.
covar_name : str, optional
Name of the network graph output to use for the diagonal covariance of
the Gaussian distribution. Defaults to 'covar'.
covar_eps : float, optional
Small positive constant added to the (absolute) diagonal covariance
values to keep the precision finite. Defaults to 1e-3.
Notes
-----
This loss computes the negative log-likelihood of the observed target data
:math:`y` under a Gaussian distribution, where the neural network computes
the mean :math:`\mu` and the diagonal of the covariance :math:`\Sigma` as a
function of its input :math:`x`. The loss is given by:
.. math::
\mathcal{L}(x, y) = -\log p(y) = -\log p\left(y|\mu(x),\Sigma(x)\right)
where
.. math::
p(y) = p(y|\mu,\Sigma) = \frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}
\exp\left\{-\frac{1}{2}(y-\mu)^\top\Sigma^{-1}(y-\mu) \right\}
is the Gaussian density function.
The log density :math:`\log p(y)` can be parameterized more conveniently
[Gu08]_ as:
.. math::
\log p(y|\eta,\Lambda) = a + \eta^\top y - \frac{1}{2} y^\top \Lambda y
where :math:`\Lambda = \Sigma^{-1}` is the precision,
:math:`\eta = \Lambda\mu` is the covariance-skewed mean, and
:math:`a=-\frac{1}{2}\left(n\log 2\pi-\log|\Lambda|+\eta^\top\Lambda^{-1}\eta\right)`
contains all constant terms. (These terms are all computed as a function of
the input, :math:`x`.)
This implementation of the Gaussian log-likelihood loss approximates
:math:`\Sigma` using only its diagonal. This makes the precision easy to
compute because
.. math::
\Sigma^{-1} = \Lambda =
\mbox{diag}(\frac{1}{\sigma_1}, \dots, \frac{1}{\sigma_n})
is just the matrix containing the multiplicative inverse of the diagonal
covariance values. Similarly, the log-determinant of the precision is just
the sum of the logs of the diagonal terms:
.. math::
\log|\Lambda|=\sum_{i=1}^n\log\lambda_i=-\sum_{i=1}^n\log\sigma_i.
The log-likelihood is computed separately for each input-output pair in a
batch, and the overall likelihood is the mean of these individual values.
Weighted targets unfortunately do not work with this loss at the moment.
References
----------
.. [Gu08] Multivariate Gaussian Distribution.
https://www.cs.cmu.edu/~epxing/Class/10701-08s/recitation/gaussian.pdf
'''
__extra_registration_keys__ = ['GLL']
def __init__(self, mean_name='mean', covar_name='covar', covar_eps=1e-3, **kwargs):
self.mean_name = mean_name
if ':' not in self.mean_name:
self.mean_name += ':out'
self.covar_name = covar_name
if ':' not in self.covar_name:
self.covar_name += ':out'
self.covar_eps = covar_eps
super(GaussianLogLikelihood, self).__init__(**kwargs)
def log(self):
'''Log some diagnostic info about this loss.'''
util.log('using loss: {0.weight} * {0.__class__.__name__} '
'(mean {0.mean_name}, covar {0.covar_name})', self)
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
# this code is going to look weird to people who are used to seeing
# implementations of the gaussian likelihood function. our mean, covar,
# and self._target arrays are all of shape (batch-size, dims). each of
# these arrays codes an independent input/output pair for the loss, but
# they're stacked together in matrices for computational efficiency.
#
# what's worse, the covariance is encoded as a vector of the diagonal
# elements, again one per input/output pair in the batch.
#
# the upshot of this is that many operations written traditionally as
# three dot products (vector-matrix-vector, e.g., x^T \Lambda x) are
# here written as three elementwise array products (x * prec * x),
# followed by a sum across the last dimension. this has the added
# benefit that it will be way faster than dot products, but it looks
# strange in the code below.
mean = outputs[self.mean_name]
covar = outputs[self.covar_name]
prec = 1 / (abs(covar) + self.covar_eps) # prevent nans!
eta = mean * prec
logpi = TT.cast(mean.shape[-1] * np.log(2 * np.pi), 'float32')
logdet = TT.log(prec).sum(axis=-1)  # log|Lambda| = sum of log-precisions
const = logpi - logdet + (mean * eta).sum(axis=-1)  # adds mu^T Lambda mu
squared = (self._target * prec * self._target).sum(axis=-1)
nll = 0.5 * (const + squared) - (eta * self._target).sum(axis=-1)
return nll.mean()
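# Illustrative NumPy sketch (an addition, not part of the original module): the
# negative log-likelihood of targets under a diagonal Gaussian, written with
# the same elementwise trick as the method above -- every vector-matrix-vector
# product collapses to an elementwise product and a sum because the covariance
# is diagonal. ``covar_eps`` plays the same stabilizing role as in the class.
def _diagonal_gaussian_nll_reference(mean, covar, target, covar_eps=1e-3):
    '''Reference mean NLL of ``target`` under a diagonal Gaussian.'''
    mean = np.asarray(mean, dtype=float)
    target = np.asarray(target, dtype=float)
    prec = 1 / (np.abs(np.asarray(covar, dtype=float)) + covar_eps)
    logdet = np.log(prec).sum(axis=-1)          # log|Lambda| = sum_i log(prec_i)
    logpi = mean.shape[-1] * np.log(2 * np.pi)  # n log(2 pi)
    quad = (prec * (target - mean) ** 2).sum(axis=-1)  # (y - mu)^T Lambda (y - mu)
    nll = 0.5 * (logpi - logdet + quad)
    return nll.mean()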
class MaximumMeanDiscrepancy(Loss):
r'''Maximum Mean Discrepancy (MMD) loss function.
Parameters
----------
kernel : callable or numeric, optional
A kernel function to call for computing pairwise kernel values. If this
is a callable, it should take two Theano arrays as arguments and return
a Theano array. If it is a numeric value, the kernel will be a Gaussian
with the given value as the bandwidth parameter. Defaults to 1.
Notes
-----
This loss computes the discrepancy between a predicted distribution
(generated by a network) and an observed distribution (of data within a
mini-batch). The loss is given by:
.. math::
\mathcal{L}(x, y) = \| \sum_{j=1}^N \phi(y_j) - \sum_{i=1}^N \phi(x_i) \|_2^2
This can be expanded to
.. math::
\mathcal{L}(x, y) = \sum_{j=1}^N \sum_{j'=1}^N \phi(y_j)^\top \phi(y_{j'})
- 2 \sum_{j=1}^N \sum_{i=1}^N \phi(y_j)^\top \phi(x_i)
+ \sum_{i=1}^N \sum_{i'=1}^N \phi(x_i)^\top \phi(x_{i'})
and then the kernel trick can be applied,
.. math::
\mathcal{L}(x, y) = \sum_{j=1}^N \sum_{j'=1}^N k(y_j, y_{j'})
- 2 \sum_{j=1}^N \sum_{i=1}^N k(y_j, x_i)
+ \sum_{i=1}^N \sum_{i'=1}^N k(x_i, x_{i'})
By default the loss here uses the Gaussian kernel
.. math::
k(x, x') = \exp(-(x-x')^2/\sigma)
where :math:`\sigma` is a scalar bandwidth parameter. However, other kernels
can be provided when constructing the loss.
References
----------
.. [Gre07] A. Gretton, K. M. Borgwardt, M. Rasch, B. Scholkopf, & A. J.
Smola (NIPS 2007) "A Kernel Method for the Two-Sample-Problem."
http://papers.nips.cc/paper/3110-a-kernel-method-for-the-two-sample-problem.pdf
.. [Li15] Y. Li, K. Swersky, & R. Zemel (ICML 2015) "Generative Moment
Matching Networks." http://jmlr.org/proceedings/papers/v37/li15.pdf
'''
__extra_registration_keys__ = ['MMD']
@staticmethod
def gaussian(bw):
def kernel(x, y):
# this dimshuffle lets us compute squared euclidean distance with a
# broadcasted subtraction, a square, and a sum.
r = x.dimshuffle(0, 'x', *tuple(range(1, x.ndim)))
return TT.exp(TT.sqr(r - y).sum(axis=-1) / -bw)
return kernel
def __init__(self, kernel=1, **kwargs):
super(MaximumMeanDiscrepancy, self).__init__(**kwargs)
if isinstance(kernel, (int, float)):
kernel = MaximumMeanDiscrepancy.gaussian(kernel)
self.kernel = kernel
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
output = outputs[self.output_name]
xx = self.kernel(self._target, self._target)
xy = self.kernel(self._target, output)
yy = self.kernel(output, output)
return xx.mean() - 2 * xy.mean() + yy.mean()
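# Illustrative NumPy sketch (an addition, not part of the original module): a
# biased MMD^2 estimate with the Gaussian kernel, using the same broadcasting
# trick as ``gaussian`` above -- inserting a new axis turns pairwise squared
# distances into a subtraction, a square, and a sum over the feature axis.
def _mmd_reference(x, y, bw=1.0):
    '''Reference MMD^2 between sample matrices ``x`` and ``y`` (rows are samples).'''
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    def kernel(a, b):
        # (N, 1, d) - (M, d) broadcasts to (N, M, d); summing the last axis
        # gives the (N, M) matrix of squared euclidean distances.
        sq = ((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=-1)
        return np.exp(-sq / bw)

    return kernel(x, x).mean() - 2 * kernel(x, y).mean() + kernel(y, y).mean()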
class KullbackLeiblerDivergence(Loss):
r'''The KL divergence loss is computed over probability distributions.
Notes
-----
The KL divergence loss is intended to optimize models that generate
probability distributions. If the outputs :math:`x_i` of a model represent a
normalized probability distribution (over the output variables), and the
targets :math:`t_i` represent a normalized target distribution (over the
output variables), then the KL divergence is given by:
.. math::
\mathcal{L}(x, t) = \frac{1}{d} \sum_{i=1}^d t_i \log \frac{t_i}{x_i}
Here the KL divergence is computed as a mean value over the output variables
in the model.
'''
__extra_registration_keys__ = ['KL', 'KLD']
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
output = outputs[self.output_name]
eps = 1e-8
t = TT.clip(self._target, eps, 1 - eps)
kl = t * TT.log(t / TT.clip(output, eps, 1 - eps))
if self._weights is not None:
return abs(self._weights * kl).sum() / self._weights.sum()
return abs(kl).mean()
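# Illustrative NumPy sketch (an addition, not part of the original module): the
# textbook elementwise KL divergence between clipped target and output
# distributions, averaged over all entries as above (the class additionally
# takes the absolute value of each term before averaging).
def _kl_reference(output, target, eps=1e-8):
    '''Reference mean KL divergence of ``output`` from ``target``.'''
    t = np.clip(np.asarray(target, dtype=float), eps, 1 - eps)
    x = np.clip(np.asarray(output, dtype=float), eps, 1 - eps)
    return (t * np.log(t / x)).mean()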
class CrossEntropy(Loss):
r'''Cross-entropy (XE) loss function for classifiers.
Parameters
----------
target : int
Number of dimensions required to store the target values for computing
the loss.
weight : float, optional
The importance of this loss for the model being trained. Defaults to 1.
weighted : bool, optional
If True, a floating-point array of weights with the same dimensions as
``target`` will be required to compute the "weighted" loss. Defaults
to False.
output_name : str, optional
Name of the network output to tap for computing the loss. Defaults to
'out:out', the name of the default output of the last layer in a linear
network.
Attributes
----------
weight : float
The importance of this loss for the model being trained.
output_name : str
Name of the network output to tap for computing the loss.
Notes
-----
The cross-entropy between a "true" distribution over discrete classes
:math:`p(t)` and a "model" distribution over predicted classes :math:`q(x)`
is the expected number of bits needed to encode samples drawn from the true
distribution using a code that is optimal for the model distribution.
Mathematically, this loss computes:
.. math::
\mathcal{L}(x, t) = - \sum_{k=1}^K p(t=k) \log q(x=k)
The loss value is similar to the KL divergence between :math:`p` and
:math:`q`, but it is specifically aimed at classification models. When using
this loss, targets are assumed to be integers in the half-open interval
:math:`[0, K)`; internally, the loss is computed by first taking the log of
the model distribution and then summing up only the entries in the resulting
array corresponding to the true class.
'''
__extra_registration_keys__ = ['XE']
def __init__(self, target, weight=1., weighted=False, output_name='out'):
super(CrossEntropy, self).__init__(
target, weight=weight, weighted=weighted, output_name=output_name)
self._target = util.INT_CONTAINERS[target]('target')
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
output = outputs[self.output_name]
k = output.shape[-1]
n = TT.prod(output.shape) // k
prob = output.reshape((n, k))[TT.arange(n), self._target.reshape((n, ))]
nlp = -TT.log(TT.clip(prob, 1e-8, 1))
if self._weights is not None:
return (self._weights.reshape((n, )) * nlp).sum() / self._weights.sum()
return nlp.mean()
def accuracy(self, outputs):
'''Build a Theano expression for computing the accuracy of graph output.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
acc : Theano expression
A Theano expression representing the accuracy of the output compared
to the target data.
'''
output = outputs[self.output_name]
predict = TT.argmax(output, axis=-1)
correct = TT.eq(predict, self._target)
acc = correct.mean()
if self._weights is not None:
acc = (self._weights * correct).sum() / self._weights.sum()
return acc
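# Illustrative NumPy sketch (an addition, not part of the original module):
# cross-entropy for integer class targets, computed as in the class above by
# indexing the predicted probability of the true class and averaging its
# negative log.
def _cross_entropy_reference(probs, labels, eps=1e-8):
    '''Reference cross-entropy; ``probs`` is (n, k), ``labels`` is (n,) ints.'''
    probs = np.asarray(probs, dtype=float)
    labels = np.asarray(labels, dtype=int)
    true_prob = probs[np.arange(len(labels)), labels]
    return -np.log(np.clip(true_prob, eps, 1)).mean()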
class Hinge(CrossEntropy):
r'''Hinge loss function for classifiers.
Notes
-----
The hinge loss as implemented here computes the maximum difference between
the prediction :math:`q(x=k)` for a class :math:`k` and the prediction
:math:`q(x=t)` for the correct class :math:`t`:
.. math::
\mathcal{L}(x, t) = \max(0, \max_k q(x=k) - q(x=t))
This loss is zero whenever the prediction for the correct class is the
largest over classes, and increases linearly when the prediction for an
incorrect class is the largest.
'''
__extra_registration_keys__ = []
def __call__(self, outputs):
'''Construct the computation graph for this loss function.
Parameters
----------
outputs : dict of Theano expressions
A dictionary mapping network output names to Theano expressions
representing the outputs of a computation graph.
Returns
-------
loss : Theano expression
The values of the loss given the network output.
'''
output = outputs[self.output_name]
k = output.shape[-1]
n = TT.prod(output.shape) // k
output = output.reshape((n, k))
true = output[TT.arange(n), self._target.reshape((n, ))]
err = TT.maximum(0, (output - true[:, None]).max(axis=-1))
if self._weights is not None:
return (self._weights.reshape((n, )) * err).sum() / self._weights.sum()
return err.mean()
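# Illustrative NumPy sketch (an addition, not part of the original module): the
# multiclass hinge loss above is zero when the true class has the largest
# score, and otherwise grows linearly with the margin by which the best wrong
# score beats the true score.
def _hinge_reference(scores, labels):
    '''Reference hinge loss; ``scores`` is (n, k), ``labels`` is (n,) ints.'''
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels, dtype=int)
    true = scores[np.arange(len(labels)), labels]
    return np.maximum(0, (scores - true[:, None]).max(axis=-1)).mean()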