Source code for theanets.losses

# -*- coding: utf-8 -*-

'''These loss functions are available for neural network models.'''

import numpy as np
import theano.tensor as TT

from . import util


class Loss(util.Registrar(str('Base'), (), {})):
    r'''A loss function base class.

    Parameters
    ----------
    target : int or Theano variable
        If this is an integer, it specifies the number of dimensions required
        to store the target values for computing the loss. If it is a Theano
        variable, this variable will be used directly to access target values.
    weight : float, optional
        The importance of this loss for the model being trained. Defaults
        to 1.
    weighted : bool, optional
        If True, a floating-point array of weights with the same dimensions
        as ``target`` will be required to compute the "weighted" loss.
        Defaults to False.
    output_name : str, optional
        Name of the network output to tap for computing the loss. Defaults to
        'out:out', the name of the default output of the last layer in a
        linear network.

    Attributes
    ----------
    weight : float
        The importance of this loss for the model being trained.
    output_name : str
        Name of the network output to tap for computing the loss.
    '''

    def __init__(self, target, weight=1., weighted=False, output_name='out'):
        self.weight = weight
        self._target = (util.FLOAT_CONTAINERS[target]('target')
                        if isinstance(target, int) else target)
        self._weights = None
        if weighted:
            self._weights = util.FLOAT_CONTAINERS[self._target.ndim]('weights')
        self.output_name = output_name
        if ':' not in self.output_name:
            self.output_name += ':out'

    @property
    def variables(self):
        '''A list of Theano variables used in this loss.'''
        result = [self._target]
        if self._weights is not None:
            result.append(self._weights)
        return result

    def log(self):
        '''Log some diagnostic info about this loss.'''
        util.log('using loss: {0.weight} * {0.__class__.__name__} '
                 '(output {0.output_name})', self)

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        raise NotImplementedError
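

# Illustration (editor's sketch, not part of theanets): a minimal custom loss
# built on the base class above. A subclass only needs to implement
# ``__call__``, tapping the network output named by ``output_name`` from the
# ``outputs`` dict and honoring the optional ``weights`` array, exactly as the
# built-in losses below do. The class name is hypothetical.
class _ExampleMeanError(Loss):
    '''Sketch: mean signed error between network output and target.'''

    __extra_registration_keys__ = []

    def __call__(self, outputs):
        err = outputs[self.output_name] - self._target
        if self._weights is not None:
            return (self._weights * err).sum() / self._weights.sum()
        return err.mean()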


class MeanSquaredError(Loss):
    r'''Mean-squared-error (MSE) loss function.

    Notes
    -----
    The mean squared error (MSE) loss computes the mean of the squared
    difference between the output of a computation graph
    :math:`x = (x_1, \dots, x_d)` and its expected target value
    :math:`t = (t_1, \dots, t_d)`. Mathematically,

    .. math::
       \begin{eqnarray*}
       \mathcal{L}(x, t) &=& \frac{1}{d} \|x - t\|_2^2 \\
                         &=& \frac{1}{d} \sum_{i=1}^d (x_i - t_i)^2
       \end{eqnarray*}

    Whereas some MSE computations return the sum over dimensions, the MSE
    here is computed as an average over the dimensionality of the data. For
    cases where :math:`x` and :math:`t` are matrices, the MSE computes the
    average over corresponding rows in :math:`x` and :math:`t`:

    .. math::
       \mathcal{L}(X, T) = \frac{1}{dm} \sum_{j=1}^m \sum_{i=1}^d
          (x_{ji} - t_{ji})^2
    '''

    __extra_registration_keys__ = ['MSE']

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        err = outputs[self.output_name] - self._target
        if self._weights is not None:
            return (self._weights * err * err).sum() / self._weights.sum()
        return (err * err).mean()
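

# Illustration (editor's sketch, not part of theanets): a small NumPy check
# that the docstring's MSE formula -- an average over all elements rather
# than a sum -- agrees with the mean of squared differences computed by the
# graph above. All values here are hypothetical.
def _check_mse_formula():
    x = np.array([[1.0, 2.0], [3.0, 4.0]])  # network outputs
    t = np.array([[1.5, 2.0], [2.0, 4.5]])  # target values
    m, d = x.shape
    by_formula = ((x - t) ** 2).sum() / (d * m)
    assert np.allclose(by_formula, ((x - t) ** 2).mean())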


class MeanAbsoluteError(Loss):
    r'''Mean-absolute-error (MAE) loss function.

    Notes
    -----
    The mean absolute error (MAE) loss computes the mean absolute difference
    between the output of a computation graph :math:`x = (x_1, \dots, x_d)`
    and its expected target value :math:`t = (t_1, \dots, t_d)`.
    Mathematically,

    .. math::
       \begin{eqnarray*}
       \mathcal{L}(x, t) &=& \frac{1}{d} \|x - t\|_1 \\
                         &=& \frac{1}{d} \sum_{i=1}^d |x_i - t_i|
       \end{eqnarray*}

    Whereas some MAE computations return the sum over dimensions, the MAE
    here is computed as an average over the dimensionality of the data. For
    cases where :math:`x` and :math:`t` are matrices, the MAE computes the
    average over corresponding rows in :math:`x` and :math:`t`.
    '''

    __extra_registration_keys__ = ['MAE']

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        err = outputs[self.output_name] - self._target
        if self._weights is not None:
            return abs(self._weights * err).sum() / self._weights.sum()
        return abs(err).mean()
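

# Illustration (editor's sketch, not part of theanets): how the ``weighted``
# branch shared by the losses above behaves -- a weighted sum of errors
# normalized by the sum of the weights, so zero-weight entries drop out of
# the loss entirely. Values are hypothetical.
def _check_weighted_mae():
    x = np.array([1.0, 2.0, 3.0])
    t = np.array([0.0, 2.0, 5.0])
    w = np.array([1.0, 1.0, 0.0])  # mask out the last element
    weighted = abs(w * (x - t)).sum() / w.sum()
    assert np.allclose(weighted, abs(x - t)[:2].mean())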


class GaussianLogLikelihood(Loss):
    r'''Gaussian Log Likelihood (GLL) loss function.

    Parameters
    ----------
    mean_name : str
        Name of the network graph output to use for the mean of the Gaussian
        distribution.
    covar_name : str
        Name of the network graph output to use for the diagonal covariance
        of the Gaussian distribution.
    covar_eps : float, optional
        Small positive constant added to the (absolute) diagonal covariance
        values so that the precision remains finite. Defaults to 1e-3.

    Notes
    -----
    This loss computes the negative log-likelihood of the observed target
    data :math:`y` under a Gaussian distribution, where the neural network
    computes the mean :math:`\mu` and the diagonal of the covariance
    :math:`\Sigma` as a function of its input :math:`x`. The loss is given
    by:

    .. math::
       \mathcal{L}(x, y) = -\log p(y) = -\log p\left(y|\mu(x),\Sigma(x)\right)

    where

    .. math::
       p(y) = p(y|\mu,\Sigma) = \frac{1}{(2\pi)^{n/2}|\Sigma|^{1/2}}
          \exp\left\{-\frac{1}{2}(y-\mu)^\top\Sigma^{-1}(y-\mu)\right\}

    is the Gaussian density function. The log density :math:`\log p(y)` can
    be parameterized more conveniently [Gu08]_ as:

    .. math::
       \log p(y|\eta,\Lambda) = a + \eta^\top y - \frac{1}{2} y^\top \Lambda y

    where :math:`\Lambda = \Sigma^{-1}` is the precision, :math:`\eta =
    \Lambda\mu` is the covariance-skewed mean, and
    :math:`a = -\frac{1}{2}\left(n\log 2\pi - \log|\Lambda| +
    \eta^\top\Lambda^{-1}\eta\right)` contains all constant terms. (These
    terms are all computed as a function of the input, :math:`x`.)

    This implementation of the Gaussian log-likelihood loss approximates
    :math:`\Sigma` using only its diagonal. This makes the precision easy to
    compute because

    .. math::
       \Sigma^{-1} = \Lambda =
          \mbox{diag}\left(\frac{1}{\sigma_1}, \dots, \frac{1}{\sigma_n}\right)

    is just the matrix containing the multiplicative inverses of the diagonal
    covariance values. Similarly, the log-determinant of the precision is
    just the sum of the logs of the diagonal terms:

    .. math::
       \log|\Lambda| = \sum_{i=1}^n \log\lambda_i = -\sum_{i=1}^n \log\sigma_i.

    The log-likelihood is computed separately for each input-output pair in a
    batch, and the overall loss is the mean of the individual negative
    log-likelihoods.

    Weighted targets unfortunately do not work with this loss at the moment.

    References
    ----------
    .. [Gu08] Multivariate Gaussian Distribution.
       https://www.cs.cmu.edu/~epxing/Class/10701-08s/recitation/gaussian.pdf
    '''

    __extra_registration_keys__ = ['GLL']

    def __init__(self, mean_name='mean', covar_name='covar', covar_eps=1e-3,
                 **kwargs):
        self.mean_name = mean_name
        if ':' not in self.mean_name:
            self.mean_name += ':out'
        self.covar_name = covar_name
        if ':' not in self.covar_name:
            self.covar_name += ':out'
        self.covar_eps = covar_eps
        super(GaussianLogLikelihood, self).__init__(**kwargs)

    def log(self):
        '''Log some diagnostic info about this loss.'''
        util.log('using loss: {0.weight} * {0.__class__.__name__} '
                 '(mean {0.mean_name}, covar {0.covar_name})', self)

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        # This code is going to look weird to people who are used to seeing
        # implementations of the Gaussian likelihood function. Our mean,
        # covar, and self._target arrays are all of shape (batch-size, dims).
        # Each of these arrays codes an independent input/output pair for the
        # loss, but they're stacked together in matrices for computational
        # efficiency.
        #
        # What's worse, the covariance is encoded as a vector of the diagonal
        # elements, again one per input/output pair in the batch.
        #
        # The upshot of this is that operations written traditionally as dot
        # products (vector-matrix-vector, e.g., x^T \Lambda x) are written
        # here as elementwise array products (x * prec * x) followed by a sum
        # across the last dimension. This has the added benefit of being much
        # faster than dot products, but it looks strange in the code below.
        mean = outputs[self.mean_name]
        covar = outputs[self.covar_name]
        prec = 1 / (abs(covar) + self.covar_eps)  # prevent nans!
        eta = mean * prec
        logpi = TT.cast(mean.shape[-1] * np.log(2 * np.pi), 'float32')
        # log|Lambda| is the sum of the logs of the diagonal precision values.
        logdet = TT.log(prec).sum(axis=-1)
        # eta * mean = mu^T Lambda mu, the constant quadratic term in a.
        const = logpi - logdet + (eta * mean).sum(axis=-1)
        squared = (self._target * prec * self._target).sum(axis=-1)
        nll = 0.5 * (const + squared) - (eta * self._target).sum(axis=-1)
        return nll.mean()
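

# Illustration (editor's sketch, not part of theanets): a NumPy version of the
# elementwise diagonal-Gaussian negative log-likelihood used above, checked
# against the per-dimension Gaussian log-density written in the usual way.
# Arrays are (batch-size, dims), as in the Theano code; values are
# hypothetical.
def _check_diagonal_gaussian_nll():
    mean = np.array([[0.0, 1.0], [2.0, -1.0]])
    covar = np.array([[1.0, 0.5], [2.0, 1.0]])  # diagonal covariance entries
    target = np.array([[0.5, 1.0], [1.0, -2.0]])
    prec = 1 / covar
    elementwise = (0.5 * (mean.shape[-1] * np.log(2 * np.pi)
                          - np.log(prec).sum(axis=-1)
                          + (mean * prec * mean).sum(axis=-1)
                          + (target * prec * target).sum(axis=-1))
                   - (mean * prec * target).sum(axis=-1))
    direct = 0.5 * (np.log(2 * np.pi * covar)
                    + (target - mean) ** 2 / covar).sum(axis=-1)
    assert np.allclose(elementwise, direct)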


class MaximumMeanDiscrepancy(Loss):
    r'''Maximum Mean Discrepancy (MMD) loss function.

    Parameters
    ----------
    kernel : callable or numeric, optional
        A kernel function to call for computing pairwise kernel values. If
        this is a callable, it should take two Theano arrays as arguments and
        return a Theano array. If it is a numeric value, the kernel will be a
        Gaussian with the given value as the bandwidth parameter. Defaults
        to 1.

    Notes
    -----
    This loss computes the discrepancy between a predicted distribution
    (generated by a network) and an observed distribution (of data within a
    mini-batch). The loss is given by:

    .. math::
       \mathcal{L}(x, y) = \| \sum_{j=1}^N \phi(y_j) -
          \sum_{i=1}^N \phi(x_i) \|_2^2

    This can be expanded to

    .. math::
       \mathcal{L}(x, y) =
          \sum_{j=1}^N \sum_{j'=1}^N \phi(y_j)^\top \phi(y_{j'}) -
          2 \sum_{j=1}^N \sum_{i=1}^N \phi(y_j)^\top \phi(x_i) +
          \sum_{i=1}^N \sum_{i'=1}^N \phi(x_i)^\top \phi(x_{i'})

    and then the kernel trick can be applied,

    .. math::
       \mathcal{L}(x, y) =
          \sum_{j=1}^N \sum_{j'=1}^N k(y_j, y_{j'}) -
          2 \sum_{j=1}^N \sum_{i=1}^N k(y_j, x_i) +
          \sum_{i=1}^N \sum_{i'=1}^N k(x_i, x_{i'})

    By default the loss here uses the Gaussian kernel

    .. math::
       k(x, x') = \exp(-(x-x')^2/\sigma)

    where :math:`\sigma` is a scalar bandwidth parameter. However, other
    kernels can be provided when constructing the loss.

    References
    ----------
    .. [Gre07] A. Gretton, K. M. Borgwardt, M. Rasch, B. Scholkopf, & A. J.
       Smola (NIPS 2007) "A Kernel Method for the Two-Sample-Problem."
       http://papers.nips.cc/paper/3110-a-kernel-method-for-the-two-sample-problem.pdf

    .. [Li15] Y. Li, K. Swersky, & R. Zemel (ICML 2015) "Generative Moment
       Matching Networks." http://jmlr.org/proceedings/papers/v37/li15.pdf
    '''

    __extra_registration_keys__ = ['MMD']

    @staticmethod
    def gaussian(bw):
        def kernel(x, y):
            # This dimshuffle lets us compute squared euclidean distance with
            # a broadcasted subtraction, a square, and a sum.
            r = x.dimshuffle(0, 'x', *tuple(range(1, x.ndim)))
            return TT.exp(TT.sqr(r - y).sum(axis=-1) / -bw)
        return kernel

    def __init__(self, kernel=1, **kwargs):
        super(MaximumMeanDiscrepancy, self).__init__(**kwargs)
        if isinstance(kernel, (int, float)):
            kernel = MaximumMeanDiscrepancy.gaussian(kernel)
        self.kernel = kernel

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        output = outputs[self.output_name]
        xx = self.kernel(self._target, self._target)
        xy = self.kernel(self._target, output)
        yy = self.kernel(output, output)
        return xx.mean() - 2 * xy.mean() + yy.mean()
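

# Illustration (editor's sketch, not part of theanets): a NumPy version of the
# kernel-trick expansion above, using the same Gaussian kernel and the same
# biased estimate (means of the three kernel matrices). The squared-MMD
# estimate is a squared RKHS norm, so it is never negative. Data here are
# hypothetical random samples.
def _check_mmd_estimate(bw=1.0):
    rng = np.random.RandomState(13)
    x = rng.randn(5, 3)  # samples standing in for the target distribution
    y = rng.randn(5, 3)  # samples standing in for the network output

    def k(a, b):
        sq = ((a[:, None, :] - b[None, :, :]) ** 2).sum(axis=-1)
        return np.exp(sq / -bw)

    mmd = k(x, x).mean() - 2 * k(x, y).mean() + k(y, y).mean()
    assert mmd >= -1e-12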


class KullbackLeiblerDivergence(Loss):
    r'''The KL divergence loss is computed over probability distributions.

    Notes
    -----
    The KL divergence loss is intended to optimize models that generate
    probability distributions. If the outputs :math:`x_i` of a model
    represent a normalized probability distribution (over the output
    variables), and the targets :math:`t_i` represent a normalized target
    distribution (over the output variables), then the KL divergence is given
    by:

    .. math::
       \mathcal{L}(x, t) = \frac{1}{d} \sum_{i=1}^d t_i \log \frac{t_i}{x_i}

    Here the KL divergence is computed as a mean value over the output
    variables in the model.
    '''

    __extra_registration_keys__ = ['KL', 'KLD']

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        output = outputs[self.output_name]
        eps = 1e-8
        t = TT.clip(self._target, eps, 1 - eps)
        kl = t * TT.log(t / TT.clip(output, eps, 1 - eps))
        if self._weights is not None:
            return abs(self._weights * kl).sum() / self._weights.sum()
        return abs(kl).mean()
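

# Illustration (editor's sketch, not part of theanets): the KL loss above is
# zero when the model reproduces the target distribution exactly and positive
# otherwise; a quick NumPy check of both properties using the docstring's
# formula. Distributions are hypothetical.
def _check_kl_divergence():
    def kl(p, q):
        return (p * np.log(p / q)).mean()

    t = np.array([0.2, 0.3, 0.5])    # target distribution
    x = np.array([0.25, 0.25, 0.5])  # model distribution
    assert np.allclose(kl(t, t), 0)
    assert kl(t, x) > 0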


class CrossEntropy(Loss):
    r'''Cross-entropy (XE) loss function for classifiers.

    Parameters
    ----------
    target : int
        Number of dimensions required to store the target values for
        computing the loss.
    weight : float, optional
        The importance of this loss for the model being trained. Defaults
        to 1.
    weighted : bool, optional
        If True, a floating-point array of weights with the same dimensions
        as ``target`` will be required to compute the "weighted" loss.
        Defaults to False.
    output_name : str, optional
        Name of the network output to tap for computing the loss. Defaults to
        'out:out', the name of the default output of the last layer in a
        linear network.

    Attributes
    ----------
    weight : float
        The importance of this loss for the model being trained.
    output_name : str
        Name of the network output to tap for computing the loss.

    Notes
    -----
    The cross-entropy between a "true" distribution over discrete classes
    :math:`p(t)` and a "model" distribution over predicted classes
    :math:`q(x)` is the expected number of bits needed to store the model
    distribution, under the expectation of the true distribution.
    Mathematically, this loss computes:

    .. math::
       \mathcal{L}(x, t) = - \sum_{k=1}^K p(t=k) \log q(x=k)

    The loss value is similar to the KL divergence between :math:`p` and
    :math:`q`, but it is specifically aimed at classification models. When
    using this loss, targets are assumed to be integers in the half-open
    interval :math:`[0, k)`; internally, the loss is computed by first taking
    the log of the model distribution and then summing up only the entries in
    the resulting array corresponding to the true class.
    '''

    __extra_registration_keys__ = ['XE']

    def __init__(self, target, weight=1., weighted=False, output_name='out'):
        super(CrossEntropy, self).__init__(
            target, weight=weight, weighted=weighted, output_name=output_name)
        self._target = util.INT_CONTAINERS[target]('target')

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        output = outputs[self.output_name]
        k = output.shape[-1]
        n = TT.prod(output.shape) // k
        prob = output.reshape((n, k))[TT.arange(n), self._target.reshape((n, ))]
        nlp = -TT.log(TT.clip(prob, 1e-8, 1))
        if self._weights is not None:
            return (self._weights.reshape((n, )) * nlp).sum() / self._weights.sum()
        return nlp.mean()

    def accuracy(self, outputs):
        '''Build a Theano expression for computing the accuracy of graph
        output.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        acc : Theano expression
            A Theano expression representing the accuracy of the output
            compared to the target data.
        '''
        output = outputs[self.output_name]
        predict = TT.argmax(output, axis=-1)
        correct = TT.eq(predict, self._target)
        acc = correct.mean()
        if self._weights is not None:
            acc = (self._weights * correct).sum() / self._weights.sum()
        return acc
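

# Illustration (editor's sketch, not part of theanets): the fancy-indexing
# trick in ``CrossEntropy.__call__`` picks out the probability of the true
# class for each example; a NumPy check against the equivalent one-hot
# formulation of the cross-entropy. Values are hypothetical.
def _check_cross_entropy_indexing():
    output = np.array([[0.7, 0.2, 0.1],   # per-class probabilities
                       [0.1, 0.1, 0.8]])
    target = np.array([0, 2])             # integer class labels in [0, k)
    n, k = output.shape
    picked = -np.log(output[np.arange(n), target])
    onehot = np.eye(k)[target]
    assert np.allclose(picked, -(onehot * np.log(output)).sum(axis=-1))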


class Hinge(CrossEntropy):
    r'''Hinge loss function for classifiers.

    Notes
    -----
    The hinge loss as implemented here computes the maximum difference
    between the prediction :math:`q(x=k)` for a class :math:`k` and the
    prediction :math:`q(x=t)` for the correct class :math:`t`:

    .. math::
       \mathcal{L}(x, t) = \max(0, \max_k q(x=k) - q(x=t))

    This loss is zero whenever the prediction for the correct class is the
    largest over classes, and increases linearly when the prediction for an
    incorrect class is the largest.
    '''

    __extra_registration_keys__ = []

    def __call__(self, outputs):
        '''Construct the computation graph for this loss function.

        Parameters
        ----------
        outputs : dict of Theano expressions
            A dictionary mapping network output names to Theano expressions
            representing the outputs of a computation graph.

        Returns
        -------
        loss : Theano expression
            The values of the loss given the network output.
        '''
        output = outputs[self.output_name]
        k = output.shape[-1]
        n = TT.prod(output.shape) // k
        output = output.reshape((n, k))
        true = output[TT.arange(n), self._target.reshape((n, ))]
        err = TT.maximum(0, (output - true[:, None]).max(axis=-1))
        if self._weights is not None:
            return (self._weights.reshape((n, )) * err).sum() / self._weights.sum()
        return err.mean()
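

# Illustration (editor's sketch, not part of theanets): the hinge loss above
# is zero exactly when the correct class already has the largest score, and
# otherwise grows linearly with the margin by which the best wrong class
# wins. Scores and labels are hypothetical.
def _check_hinge():
    scores = np.array([[2.0, 1.0, 0.5],   # correct class 0 wins: loss 0
                       [0.5, 2.0, 1.0]])  # class 1 beats class 0 by 1.5
    target = np.array([0, 0])
    n = len(target)
    true = scores[np.arange(n), target]
    err = np.maximum(0, (scores - true[:, None]).max(axis=-1))
    assert np.allclose(err, [0.0, 1.5])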