# coding: utf-8
# pylint: disable=too-many-arguments, no-member, protected-access, too-many-locals
""" losses for training neural networks """
from __future__ import absolute_import
from .base import numeric_types, string_types, __version__
from . import symbol
from . import registry
from . import metric as _metric
def _apply_weight(loss, weight=None, sample_weight=None):
"""Apply weighting to loss.
Parameters
----------
loss : Symbol
the loss to be weighted.
weight : float or None
global scalar weight for loss
sample_weight : Symbol or None
per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch sample_weight should have shape (64, 1)
Returns
-------
loss : Symbol
weighted loss
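
    Example (an illustrative sketch; ``pred``, ``label`` and ``sw`` are
    hypothetical variables, not part of the API)::

        pred = symbol.Variable('pred')
        label = symbol.Variable('label')
        loss = symbol.square(pred - label)   # e.g. shape (64, 10)
        sw = symbol.Variable('sw')           # shape (64, 1)
        loss = _apply_weight(loss, weight=0.5, sample_weight=sw)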
"""
assert len(loss.list_outputs()) == 1, "loss symbol must have a single output"
if sample_weight is not None:
assert isinstance(sample_weight, symbol.Symbol), "sample_weight must be a Symbol"
loss = symbol.broadcast_mul(loss, sample_weight)
if weight is not None:
assert isinstance(weight, numeric_types), "weight must be a number"
loss *= weight
return loss
def _parse_metric(metric, output, label):
"""Create metric and set output/label names"""
if metric is None:
return None
if isinstance(metric, string_types):
metric = _metric.create(metric)
if isinstance(label, symbol.Symbol):
metric.label_names = [i for i in label.list_arguments()
if i not in output.list_arguments()]
else:
metric.label_names = list(label)
    assert len(metric.label_names) == 1, "too many labels %s" % str(metric.label_names)
    assert len(output.list_outputs()) == 1, "too many outputs %s" % str(output.list_outputs())
metric.output_names = [output.name + '_out_output']
return metric
class BaseLoss(object):
"""Base class for all loss layers.
Parameters
----------
losses : Symbol
a symbol whose output is the loss. Can be a scalar value
or an array. If loss is an array, the sum of its elements
will be the final loss.
outputs : Symbol
output of the model when predicting.
label_names : list of str
names of label variables. labels are used for training
and scoring but not for predicting output.
name : str
name of this loss
metric : EvalMetric or None
metric for training and scoring. If None, only the loss
values are displayed.
output_head_grad : bool
whether output needs head gradient for backward.
loss_head_grad : bool
whether loss needs head gradients for backward.
Returns
-------
loss : BaseLoss
created loss
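
    Example (illustrative; losses are normally created with a factory
    function such as ``l2_loss`` below rather than by calling ``BaseLoss``
    directly)::

        l = l2_loss(net, symbol.Variable('label'))
        l.loss_symbol     # symbols used for training and scoring
        l.output_symbol   # symbols used for prediction
        l.metric          # composite metric that tracks the loss values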
"""
def __init__(self, losses, outputs, label_names, name, metric=None,
output_head_grad=False, loss_head_grad=False, **kwargs):
        if losses is None:
            # Recreating from a serialized config (see get_config): the
            # grouped symbol stores the loss symbols first, then the outputs.
            sym = list(symbol.load_json(kwargs['__group_sym__']))
            num = kwargs['__num_losses__']
            losses = symbol.Group(sym[:num])
            outputs = symbol.Group(sym[num:])
            if metric is not None:
                metric = _metric.create(**metric)
losses = symbol.Group(list(losses))
outputs = symbol.Group(list(outputs))
self._kwargs = kwargs
self._kwargs.update({
'loss': self.__class__.__name__,
'losses': None,
'outputs': None,
'label_names': label_names,
'name': name,
'metric': metric.get_config() if metric is not None else None,
'output_head_grad': output_head_grad,
'loss_head_grad': loss_head_grad,
'__group_sym__': symbol.Group(list(losses)+list(outputs)).tojson(),
'__num_losses__': len(list(losses)),
'__type__': 'loss',
'__version__': __version__})
        # Unless head gradients are supplied externally, wrap each loss with
        # make_loss and block gradient flow through the prediction outputs.
        if not loss_head_grad:
self._loss_symbol = symbol.Group([symbol.make_loss(x, name=x.name+'_loss')
for x in losses])
else:
self._loss_symbol = losses
if not output_head_grad:
self._output_symbol = symbol.Group([symbol.stop_gradient(x, name=x.name+'_out')
for x in outputs])
else:
self._output_symbol = outputs
self._label_names = list(label_names) if label_names else []
self._name = name
composite = _metric.CompositeEvalMetric()
if metric is not None:
composite.add(metric)
for i in self._loss_symbol:
composite.add(_metric.Loss(name=i.name, output_names=i.list_outputs(), label_names=[]))
self._metric = composite
    @property
    def name(self):
        """Name of loss."""
        return self._name

    @property
    def label_names(self):
        """Names of label variables used to compute loss."""
        return self._label_names

    @property
    def loss_symbol(self):
        """Loss symbol used for training and scoring."""
        return self._loss_symbol

    @property
    def output_symbol(self):
        """Output symbol used for prediction."""
        return self._output_symbol

    @property
    def metric(self):
        """Metric for evaluation."""
        return self._metric

    def get_config(self):
        """Get configs for serialization."""
        return self._kwargs.copy()
# pylint: disable=invalid-name
register = registry.get_register_func(BaseLoss, 'loss')
create = registry.get_create_func(BaseLoss, 'loss')
register(BaseLoss)
# pylint: enable=invalid-name
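
# Example (an illustrative sketch): a loss serializes through its config,
# so an equivalent loss can be rebuilt with the module-level create function:
#
#     config = l.get_config()
#     restored = create(**config)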
def custom_loss(loss, output, label_names, extra_outputs=(),
weight=None, sample_weight=None, name='loss',
metric=None, **kwargs):
"""User defined custom loss. For example, the following
custom loss has the same effect as l2_loss::
data = mx.sym.Variable('data')
label = mx.sym.Variable('label')
output = mx.sym.FullyConnected(data, num_hidden=1)
L = mx.sym.square(output-label)/2
loss = mx.sym.custom_loss(L, output, label_names=['label'])
Parameters
----------
loss : Symbol
a symbol whose output is the loss. Can be a scalar value
or an array. If loss is an array, the sum of its elements
will be the final loss.
output : Symbol
output of the model when predicting.
label_names : list of str
names of label variables. labels are used for training
and scoring but not for predicting output.
    extra_outputs : list of Symbol
        extra outputs for prediction but not used for evaluating
        metric. Normally used when you want to analyze internal
        feature maps.
weight : float or None
global scalar weight for loss
sample_weight : Symbol or None
per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, sample_weight should have shape (64, 1)
name : str
name of this loss
metric : EvalMetric or None
metric for training and scoring. If None, only the loss
values are displayed.
output_head_grad : bool
whether output needs head gradient for backward.
loss_head_grad : bool
whether loss needs head gradients for backward.

    Returns
    -------
    loss : BaseLoss
        created loss
"""
label_names = list(label_names)
metric = _parse_metric(metric, output, label_names)
output = list(output) + list(extra_outputs)
    if sample_weight is not None:
        # variables used only by sample_weight must also be fed as labels
        label_names += [i for i in sample_weight.list_arguments()
                        if i not in loss.list_arguments()]
loss = _apply_weight(loss, weight=weight, sample_weight=sample_weight)
loss._set_attr(name=name)
return BaseLoss(loss, output, label_names, name, **kwargs)
def multi_loss(losses, extra_outputs=(), name='multi'):
"""Combine multiple losses. The final loss is the sum
of all losses.
Parameters
----------
losses : list of BaseLoss
a list of individual losses with no extra outputs.
    extra_outputs : list of Symbol
        extra outputs for prediction but not used for evaluating
        metric. Normally used when you want to analyze internal
        feature maps.
name : str
name of combined loss

    Returns
    -------
    loss : BaseLoss
        created loss
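
    Example (an illustrative sketch; ``reg_out`` and ``cls_out`` stand for
    two single-output networks)::

        reg = l2_loss(reg_out, symbol.Variable('reg_label'))
        cls = softmax_cross_entropy_loss(cls_out, symbol.Variable('cls_label'))
        combined = multi_loss([reg, cls], name='multitask')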
"""
loss = symbol.Group(sum([list(i.loss_symbol) for i in losses], []))
output = symbol.Group(sum([list(i.output_symbol) for i in losses], []) + list(extra_outputs))
    label_names = []
    for i in losses:
        for label_name in i.label_names:
            if label_name not in label_names:
                label_names.append(label_name)
    ret = BaseLoss(loss, output, label_names, name,
output_head_grad=True, loss_head_grad=True)
del ret.metric.metrics[:]
for i in losses:
ret.metric.add(i.metric)
return ret
def l2_loss(output, label, extra_outputs=(), weight=1.,
sample_weight=None, metric='mse', name='l2',
**kwargs):
"""Calculate the mean squared error between output and label:
.. math::
L = \\frac{1}{2}\\sum_i \\Vert {output}_i - {label}_i \\Vert^2.
output and label can have arbitrary shape as long as they have the same
number of elements.
Parameters
----------
output : Symbol
output of the network
label : Symbol
target to compare output against
    extra_outputs : list of Symbol
        extra outputs for prediction but not used for evaluating
        metric. Normally used when you want to analyze internal
        feature maps.
weight : float or None
global scalar weight for loss
sample_weight : Symbol or None
per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, sample_weight should have shape (64, 1)
metric : EvalMetric or None
metric for training and scoring. If None, only the loss
values are displayed.
name : str
name of this loss
output_head_grad : bool
whether output needs head gradient for backward.
loss_head_grad : bool
whether loss needs head gradients for backward.

    Returns
    -------
    loss : BaseLoss
        created loss
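
    Example (an illustrative sketch; ``net`` stands for any single-output
    symbol)::

        net = symbol.FullyConnected(symbol.Variable('data'), num_hidden=1)
        loss = l2_loss(net, symbol.Variable('label'))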
"""
metric = _parse_metric(metric, output, label)
outputs = [output] + list(extra_outputs)
output = symbol.reshape(output, shape=(-1,))
label = symbol.reshape(label, shape=(-1,))
loss = symbol.square(output - label)
    loss = _apply_weight(loss, weight/2. if weight is not None else None,
                         sample_weight)
loss._set_attr(name=name)
label_names = [x for x in loss.list_arguments()
if x not in output.list_arguments()]
return BaseLoss(loss, outputs, label_names, name, metric=metric, **kwargs)
def l1_loss(output, label, extra_outputs=(), name='l1',
weight=None, sample_weight=None, metric='mae',
**kwargs):
"""Calculate the mean absolute error between output and label:
.. math::
L = \\frac{1}{2}\\sum_i \\vert {output}_i - {label}_i \\vert.
output and label can have arbitrary shape as long as they have the same
number of elements.
Parameters
----------
output : Symbol
output of the network
label : Symbol
target to compare output against
    extra_outputs : list of Symbol
        extra outputs for prediction but not used for evaluating
        metric. Normally used when you want to analyze internal
        feature maps.
weight : float or None
global scalar weight for loss
sample_weight : Symbol or None
per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, sample_weight should have shape (64, 1)
metric : EvalMetric or None
metric for training and scoring. If None, only the loss
values are displayed.
name : str
name of this loss
output_head_grad : bool
whether output needs head gradient for backward.
loss_head_grad : bool
whether loss needs head gradients for backward.

    Returns
    -------
    loss : BaseLoss
        created loss
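
    Example (an illustrative sketch of per-sample weighting; ``net`` and
    ``sw`` are hypothetical, with ``sw`` of shape (batch_size, 1))::

        sw = symbol.Variable('sw')
        loss = l1_loss(net, symbol.Variable('label'), sample_weight=sw)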
"""
metric = _parse_metric(metric, output, label)
outputs = [output] + list(extra_outputs)
output = symbol.reshape(output, shape=(-1,))
label = symbol.reshape(label, shape=(-1,))
loss = symbol.abs(output - label)
loss = _apply_weight(loss, weight, sample_weight)
loss._set_attr(name=name)
label_names = [x for x in loss.list_arguments()
if x not in output.list_arguments()]
return BaseLoss(loss, outputs, label_names, name, metric=metric, **kwargs)
def softmax_cross_entropy_loss(output, label, sparse_label=True, axis=1,
extra_outputs=(), name='ce', weight=None,
sample_weight=None, metric='acc', **kwargs):
"""Compute the softmax cross entropy loss.
If sparse_label is True, label should contain integer category indicators:
.. math::
p = {softmax}({output})
L = -\\sum_i {log}(p_{i,{label}_i})
If sparse_label is False, label should cantain probability distribution
with the same shape as output:
.. math::
p = {softmax}({output})
L = -\\sum_i \\sum_j {label}_j {log}(p_{ij})
Parameters
----------
output : Symbol
output of the network
label : Symbol
target to compare output against
    sparse_label : bool
        whether label contains integer category indicators (sparse)
        or a full probability distribution over categories.
axis : int
The axis to sum over when computing softmax and entropy
    extra_outputs : list of Symbol
        extra outputs for prediction but not used for evaluating
        metric. Normally used when you want to analyze internal
        feature maps.
weight : float or None
global scalar weight for loss
sample_weight : Symbol or None
per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, sample_weight should have shape (64, 1)
metric : EvalMetric or None
metric for training and scoring. If None, only the loss
values are displayed.
name : str
name of this loss
output_head_grad : bool
whether output needs head gradient for backward.
loss_head_grad : bool
whether loss needs head gradients for backward.

    Returns
    -------
    loss : BaseLoss
        created loss
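
    Example (an illustrative sketch; with the default sparse_label=True the
    label holds integer class indices of shape (batch_size,))::

        net = symbol.FullyConnected(symbol.Variable('data'), num_hidden=10)
        loss = softmax_cross_entropy_loss(net, symbol.Variable('label'))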
"""
metric = _parse_metric(metric, output, label)
outputs = [output] + list(extra_outputs)
    # TODO(Eric): make a log_softmax op
    # Compute log-softmax manually: subtract the max (with its gradient
    # blocked) for numerical stability, then subtract the log of the
    # partition function.
    basis = symbol.max(output, axis=axis, keepdims=True)
    basis = symbol.stop_gradient(basis)
    output = symbol.broadcast_sub(output, basis)
    norm = symbol.log(symbol.sum(symbol.exp(output), axis=axis, keepdims=True))
    output = symbol.broadcast_sub(output, norm)
if sparse_label:
loss = -symbol.pick(output, label, axis=axis, keepdims=False)
else:
loss = -symbol.sum(output*label, axis=axis, keepdims=False)
loss = _apply_weight(loss, weight, sample_weight)
loss._set_attr(name=name)
label_names = [x for x in loss.list_arguments()
if x not in output.list_arguments()]
return BaseLoss(loss, outputs, label_names, name, metric=metric, **kwargs)