# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
use strict;
use warnings;
package AI::MXNet::Gluon::Loss;
use AI::MXNet::NS;
use AI::MXNet::Gluon::Block;
use AI::MXNet::Function::Parameters;
=head1 NAME
AI::MXNet::Gluon::Loss - Base class for loss.
=cut
=head2 DESCRIPTION
Base class for loss.
Parameters
----------
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
=cut
=head2 _apply_weighting
Apply weighting to loss.
Parameters
----------
loss : Symbol
The loss to be weighted.
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch separately, `sample_weight` should have
shape (64, 1).
Returns
-------
loss : Symbol
Weighted loss
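For example, a minimal NDArray sketch of these semantics (the shapes, values,
and the direct call to this private helper are illustrative assumptions):

    use AI::MXNet qw(mx);
    my $loss          = mx->nd->ones([4, 10]);               # per-element loss, shape (4, 10)
    my $sample_weight = mx->nd->array([[1], [0], [1], [2]]); # per-sample weights, shape (4, 1)
    # each row of $loss is scaled by its row's weight via broadcast_mul,
    # then the global scalar weight multiplies the result
    my $weighted = AI::MXNet::Gluon::Loss->_apply_weighting(
        'AI::MXNet::NDArray', $loss, 0.5, $sample_weight
    );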
=cut
method _apply_weighting(Str $F, GluonInput $loss, Maybe[Num] $weight=, Maybe[GluonInput] $sample_weight=)
{
if(defined $sample_weight)
{
$loss = $F->broadcast_mul($loss, $sample_weight);
}
if(defined $weight)
{
$loss = $loss * $weight;
}
return $loss;
}
# Reshapes x to the same shape as y
method _reshape_like(GluonClass $F, GluonInput $x, GluonInput $y)
{
if($F eq 'AI::MXNet::NDArray')
{
return $x->reshape($y->shape);
}
else
{
return $F->reshape_like($x, $y);
}
}
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::HybridBlock';
has 'weight' => (is => 'rw', isa => 'Num');
has 'batch_axis' => (is => 'rw', isa => 'Int', default => 0);
use overload '""' => sub {
my $self = shift;
sprintf(
"%s(batch_axis=%s, w=%s)",
$self->_class_name,
$self->batch_axis,
$self->weight
);
};
method hybrid_forward($F, $x, @args)
{
confess('NotImplementedError');
}
package AI::MXNet::Gluon::L2Loss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
=head1 NAME
AI::MXNet::Gluon::L2Loss
=cut
=head1 DESCRIPTION
Calculates the mean squared error between output and label:
.. math::
L = \frac{1}{2}\sum_i \vert {pred}_i - {label}_i \vert^2.
Output and label can have arbitrary shape as long as they have the same
number of elements.
Parameters
----------
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
batch_axis : int, default 0
The axis that represents mini-batch.
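A minimal usage sketch (values are made up; assumes the loss object is
invoked via the overloaded call operator, like other Gluon blocks):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::L2Loss->new;
    my $pred  = mx->nd->array([[0.5, 1.5], [2.0, 0.0]]);
    my $label = mx->nd->array([[1.0, 1.0], [2.0, 1.0]]);
    # one value per sample: mean squared error scaled by weight/2
    my $loss  = $loss_fn->($pred, $label);   # shape (2,)
    print $loss->aspdl;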
=cut
has '+weight' => (default => 1);
has '+batch_axis' => (default => 0);
method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss = $F->square($pred - $label);
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight/2, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::L1Loss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has '+weight' => (default => 1);
has '+batch_axis' => (default => 0);
=head1 NAME
AI::MXNet::Gluon::L1Loss
=cut
=head1 DESCRIPTION
Calculates the mean absolute error between output and label:
.. math::
L = \sum_i \vert {pred}_i - {label}_i \vert.
Output and label must have the same shape.
Parameters
----------
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
batch_axis : int, default 0
The axis that represents mini-batch.
=cut
method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss = $F->abs($pred - $label);
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'from_sigmoid' => (is => 'ro', isa => 'Bool', default => 0);
has '+batch_axis' => (default => 0);
=head1 NAME
AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss
=cut
=head1 DESCRIPTION
The cross-entropy loss for binary classification. (alias: SigmoidBCELoss)
BCE loss is useful when training logistic regression.
.. math::
loss(o, t) = - 1/n \sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i]))
Parameters
----------
from_sigmoid : bool, default is `False`
Whether the input is the output of a sigmoid. Setting this to False makes
the loss compute the sigmoid and the BCE together, which is more numerically
stable through the log-sum-exp trick.
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
batch_axis : int, default 0
The axis that represents mini-batch.
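A minimal sketch with raw scores (illustrative values; `from_sigmoid => 0`
is the default, so the sigmoid is fused into the loss):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss->new;
    my $pred  = mx->nd->array([[2.0, -1.0, 0.5]]);   # logits, not probabilities
    my $label = mx->nd->array([[1.0,  0.0, 1.0]]);
    my $loss  = $loss_fn->($pred, $label);           # shape (1,)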
=cut
method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss;
if(not $self->from_sigmoid)
{
$loss = $F->relu($pred) - $pred * $label + $F->Activation(-$F->abs($pred), act_type=>'softrelu');
}
else
{
$loss = -($F->log($pred+1e-12)*$label + $F->log(1-$pred+1e-12)*(1-$label));
}
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::SigmoidBCELoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss';
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::SoftmaxCrossEntropyLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
=head1 NAME
AI::MXNet::Gluon::SoftmaxCrossEntropyLoss
=cut
=head1 DESCRIPTION
Computes the softmax cross entropy loss. (alias: SoftmaxCELoss)
If `sparse_label` is `True`, label should contain integer category indicators:
.. math::
p = {softmax}({output})
L = -\sum_i {log}(p_{i,{label}_i})
Label's shape should be output's shape without the `axis` dimension, i.e. for
`output.shape` = (1,2,3,4) and axis = 2, `label.shape` should be (1,2,4).
If `sparse_label` is `False`, label should contain a probability distribution
with the same shape as output:
.. math::
p = {softmax}({output})
L = -\sum_i \sum_j {label}_j {log}(p_{ij})
Parameters
----------
axis : int, default -1
The axis to sum over when computing softmax and entropy.
sparse_label : bool, default True
Whether label is an integer array instead of a probability distribution.
from_logits : bool, default False
Whether input is a log probability (usually from log_softmax) instead
of unnormalized numbers.
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
batch_axis : int, default 0
The axis that represents mini-batch.
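A minimal sketch with sparse labels (illustrative values):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::SoftmaxCrossEntropyLoss->new;    # sparse_label => 1
    my $pred  = mx->nd->array([[1.0, 2.0, 0.5], [0.1, 0.2, 3.0]]);   # unnormalized scores
    my $label = mx->nd->array([1, 2]);                               # class indices
    my $loss  = $loss_fn->($pred, $label);                           # shape (2,)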
=cut
has 'axis' => (is => 'ro', isa => 'Int', default => -1);
has '+batch_axis' => (default => 0);
has 'sparse_label' => (is => 'ro', isa => 'Bool', default => 1);
has 'from_logits' => (is => 'ro', isa => 'Bool', default => 0);
method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
{
if(not $self->from_logits)
{
$pred = $F->log_softmax($pred, axis => $self->axis);
}
my $loss;
if($self->sparse_label)
{
$loss = -$F->pick($pred, $label, axis=>$self->axis, keepdims => 1);
}
else
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
$loss = -$F->sum($pred*$label, axis => $self->axis, keepdims => 1);
}
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::SoftmaxCELoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::SoftmaxCrossEntropyLoss';
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::KLDivLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has '+batch_axis' => (default => 0);
has 'axis' => (is => 'ro', isa => 'Int', default => -1);
has 'from_logits' => (is => 'ro', isa => 'Bool', default => 1);
=head1 NAME
AI::MXNet::Gluon::KLDivLoss
=cut
=head1 DESCRIPTION
The Kullback-Leibler divergence loss.
KL divergence is a useful distance measure for continuous distributions
and is often useful when performing direct regression over the space of
(discretely sampled) continuous output distributions.
.. _Kullback-Leibler divergence:
https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
.. math::
L = \frac{1}{n} \sum_i ({label}_i * (\log({label}_i) - {output}_i))
Label's shape should be the same as output's.
Parameters
----------
from_logits : bool, default is `True`
Whether the input is a log probability (usually from log_softmax) instead
of unnormalized numbers.
weight : float or None
Global scalar weight for loss.
axis : int, default -1
The dimension along which to compute softmax. Only used when `from_logits`
is False.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
batch_axis : int, default 0
The axis that represents mini-batch.
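A minimal sketch (illustrative values; with the default `from_logits => 1`
the prediction must already be log-probabilities):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::KLDivLoss->new;
    my $pred  = mx->nd->log_softmax(mx->nd->array([[1.0, 2.0, 3.0]]));
    my $label = mx->nd->array([[0.1, 0.2, 0.7]]);   # target distribution
    my $loss  = $loss_fn->($pred, $label);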
=cut
method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
{
if(not $self->from_logits)
{
$pred = $F->log_softmax($pred, axis => $self->axis);
}
my $loss = $label * ($F->log($label+1e-12) - $pred);
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::CTCLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'layout' => (is => 'rw', isa => 'Str', default => 'NTC');
has 'label_layout' => (is => 'rw', isa => 'Str', default => 'NT');
=head1 NAME
AI::MXNet::Gluon::CTCLoss
=cut
=head1 DESCRIPTION
Connectionist Temporal Classification Loss.
See `"Connectionist Temporal Classification: Labelling Unsegmented
Sequence Data with Recurrent Neural Networks"
<http://www.cs.toronto.edu/~graves/icml_2006.pdf>`_ paper for more information.
Parameters
----------
layout : str, default 'NTC'
Layout of the output sequence activation vector.
label_layout : str, default 'NT'
Layout of the labels.
weight : float or None
Global scalar weight for loss.
sample_weight : Symbol or None
Per sample weighting. Must be broadcastable to
the same shape as loss. For example, if loss has
shape (64, 10) and you want to weight each sample
in the batch, `sample_weight` should have shape (64, 1).
This should be used as the fifth argument when calling this loss.
Input shapes:
`data` is an activation tensor (i.e. before softmax).
Its shape depends on `layout`. For `layout='TNC'`, this
input has shape `(sequence_length, batch_size, alphabet_size)`
Note that the last dimension with index `alphabet_size-1` is reserved for the
special blank character.
`label` is the label index matrix with zero-indexed labels.
Its shape depends on `label_layout`. For `label_layout='TN'`, this
input has shape `(label_sequence_length, batch_size)`. Padding mask of value ``-1``
is available for dealing with unaligned label lengths.
When `label_lengths` is specified, label lengths are directly used and padding mask
is not allowed in the label.
When `label_lengths` is not specified, the first occurrence of ``-1``
in each sample marks the end of the label sequence of that sample.
For example, suppose the vocabulary is `[a, b, c]`, and in one batch we have three
sequences 'ba', 'cbb', and 'abac'. We can index the labels as `{'a': 0, 'b': 1, 'c': 2}`.
The alphabet size should be 4, and we reserve the channel index 3 for the blank
label in the data tensor. The padding mask value for extra length is -1, so the resulting `label`
tensor should be padded to be::
[[1, 0, -1, -1], [2, 1, 1, -1], [0, 1, 0, 2]]
`data_lengths` is optional and defaults to None.
When specified, it represents the actual lengths of data.
The shape should be (batch_size,).
If None, the data lengths are treated as being equal to the max sequence length.
This should be used as the third argument when calling this loss.
`label_lengths` is optional and defaults to None.
When specified, it represents the actual lengths of labels.
The shape should be (batch_size,).
If None, the label lengths are derived from the first occurrence of
the value specified by `padding_mask`.
This should be used as the fourth argument when calling this loss.
Output shape:
The CTC loss output has the shape (batch_size,).
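A minimal sketch using the vocabulary example above (the activations are
dummy values; real inputs would come from a network):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::CTCLoss->new;   # layout 'NTC', label_layout 'NT'
    # batch of 3 sequences, 10 time steps, alphabet size 4 (index 3 = blank)
    my $data  = mx->nd->ones([3, 10, 4]);
    my $label = mx->nd->array([[1, 0, -1, -1],
                               [2, 1, 1, -1],
                               [0, 1, 0, 2]]);      # -1 pads unaligned label lengths
    my $loss  = $loss_fn->($data, $label);          # shape (3,)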
=cut
use AI::MXNet::Base;
sub BUILD
{
my $self = shift;
assert(
(grep { $_ eq $self->layout } ('NTC', 'TNC')),
"Only 'NTC' and 'TNC' layouts for output are supported. Got: ${\ $self->layout }"
);
assert(
(grep { $_ eq $self->label_layout } ('NT', 'TN')),
"Only 'NT' and 'TN' layouts for label are supported. Got: ${\ $self->label_layout }"
);
$self->batch_axis(index($self->label_layout, 'N'));
}
method hybrid_forward(
GluonClass $F, GluonInput $data, GluonInput $label,
Maybe[GluonInput] $data_lengths=, Maybe[GluonInput] $label_lengths=, Maybe[GluonInput] $sample_weight=
)
{
if($self->layout eq 'NTC')
{
$data = $F->swapaxes($data, dim1 => 0, dim2 => 1);
}
if($self->batch_axis == 1)
{
$label = $F->swapaxes($label, dim1 => 0, dim2 => 1);
}
my $loss = $F->contrib->CTCLoss(
$data, $label,
(defined $data_lengths ? $data_lengths : ()),
(defined $label_lengths ? $label_lengths : ()),
use_data_lengths => defined $data_lengths ? 1 : 0,
use_label_lengths => defined $label_lengths ? 1 : 0,
blank_label=>'last'
);
return $self->_apply_weighting($F, $loss, $self->weight, $sample_weight);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::HuberLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'rho' => (is => 'rw', isa => 'Num', default => 1);
=head1 NAME
AI::MXNet::Gluon::HuberLoss
=cut
=head1 DESCRIPTION
Calculates the smoothed L1 loss, which is equal to the L1 loss when the absolute
error exceeds rho and to the L2 loss otherwise. Also called the SmoothL1 loss.
.. math::
L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({pred}_i - {label}_i)^2 &
\text{ if } |{pred}_i - {label}_i| < {rho} \\
|{pred}_i - {label}_i| - \frac{{rho}}{2} &
\text{ otherwise }
\end{cases}
`pred` and `label` can have arbitrary shape as long as they have the same
number of elements.
Parameters
----------
rho : float, default 1
Threshold for trimmed mean estimator.
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
Inputs:
- **pred**: prediction tensor with arbitrary shape
- **label**: target tensor with the same size as pred.
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as pred. For example, if pred has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: loss tensor with shape (batch_size,). Dimensions other than
batch_axis are averaged out.
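A minimal sketch showing both branches of the loss (illustrative values):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::HuberLoss->new(rho => 1);
    my $pred  = mx->nd->array([[0.0, 3.0]]);
    my $label = mx->nd->array([[0.5, 0.0]]);
    # |0.0 - 0.5| < rho -> quadratic branch; |3.0 - 0.0| >= rho -> linear branch
    my $loss  = $loss_fn->($pred, $label);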
=cut
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss = $F->abs($pred - $label);
$loss = $F->where(
$loss > $self->rho, $loss - 0.5 * $self->rho,
(0.5/$self->rho) * $F->square($loss)
);
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::HingeLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'margin' => (is => 'rw', isa => 'Num', default => 1);
=head1 NAME
AI::MXNet::Gluon::HingeLoss
=cut
=head1 DESCRIPTION
Calculates the hinge loss function often used in SVMs:
.. math::
L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)
where `pred` is the classifier prediction and `label` is the target tensor
containing values -1 or 1. `pred` and `label` must have the same number of
elements.
Parameters
----------
margin : float
The margin in hinge loss. Defaults to 1.0
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
Inputs:
- **pred**: prediction tensor with arbitrary shape.
- **label**: truth tensor with values -1 or 1. Must have the same size
as pred.
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as pred. For example, if pred has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: loss tensor with shape (batch_size,). Dimensions other than
batch_axis are averaged out.
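A minimal sketch (illustrative values; labels must be -1 or 1):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::HingeLoss->new(margin => 1);
    my $pred  = mx->nd->array([[0.9], [-0.3]]);   # raw classifier scores
    my $label = mx->nd->array([[1.0], [1.0]]);
    my $loss  = $loss_fn->($pred, $label);        # per-sample mean of max(0, margin - pred*label)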
=cut
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss = $F->relu($self->margin - $pred * $label);
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::SquaredHingeLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'margin' => (is => 'rw', isa => 'Num', default => 1);
=head1 NAME
AI::MXNet::Gluon::SquaredHingeLoss
=cut
=head1 DESCRIPTION
Calculates the soft-margin loss function used in SVMs:
.. math::
L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)^2
where `pred` is the classifier prediction and `label` is the target tensor
containing values -1 or 1. `pred` and `label` can have arbitrary shape as
long as they have the same number of elements.
Parameters
----------
margin : float
The margin in hinge loss. Defaults to 1.0
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
Inputs:
- **pred**: prediction tensor with arbitrary shape
- **label**: truth tensor with values -1 or 1. Must have the same size
as pred.
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as pred. For example, if pred has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: loss tensor with shape (batch_size,). Dimensions other than
batch_axis are averaged out.
=cut
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
my $loss = $F->square($F->relu($self->margin - $pred * $label));
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::LogisticLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'label_format' => (is => 'rw', isa => 'Str', default => 'signed');
=head1 NAME
AI::MXNet::Gluon::LogisticLoss
=cut
=head1 DESCRIPTION
Calculates the logistic loss (for binary classification only):
.. math::
L = \sum_i \log(1 + \exp(- {pred}_i \cdot {label}_i))
where `pred` is the classifier prediction and `label` is the target tensor
containing values -1 or 1 (0 or 1 if `label_format` is binary).
`pred` and `label` can have arbitrary shape as long as they have the same number of elements.
Parameters
----------
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
label_format : str, default 'signed'
Can be either 'signed' or 'binary'. If the label_format is 'signed', all label values should
be either -1 or 1. If the label_format is 'binary', all label values should be either
0 or 1.
Inputs:
- **pred**: prediction tensor with arbitrary shape.
- **label**: truth tensor with values -1/1 (label_format is 'signed')
or 0/1 (label_format is 'binary'). Must have the same size as pred.
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as pred. For example, if pred has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: loss tensor with shape (batch_size,). Dimensions other than
batch_axis are averaged out.
=cut
sub BUILD
{
my $self = shift;
if(not ($self->label_format eq 'signed' or $self->label_format eq 'binary'))
{
confess(sprintf("label_format can only be signed or binary, received %s", $self->label_format));
}
}
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
)
{
$label = __PACKAGE__->_reshape_like($F, $label, $pred);
if($self->label_format eq 'signed')
{
$label = ($label + 1) / 2; # Transform label to be either 0 or 1
}
# Use a stable formula in computation
my $loss = $F->relu($pred) - $pred * $label + $F->Activation(-$F->abs($pred), act_type=>'softrelu');
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::TripletLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'margin' => (is => 'rw', isa => 'Num', default => 1);
=head1 NAME
AI::MXNet::Gluon::TripletLoss
=cut
=head1 DESCRIPTION
Calculates triplet loss given three input tensors and a positive margin.
Triplet loss measures the relative similarity between a prediction, a positive
example and a negative example:
.. math::
L = \sum_i \max(\Vert {pred}_i - {pos_i} \Vert_2^2 -
\Vert {pred}_i - {neg_i} \Vert_2^2 + {margin}, 0)
`pred`, `positive` and `negative` can have arbitrary shape as long as they
have the same number of elements.
Parameters
----------
margin : float
Margin of separation between correct and incorrect pair.
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
Inputs:
- **pred**: prediction tensor with arbitrary shape
- **positive**: positive example tensor with arbitrary shape. Must have
the same size as pred.
- **negative**: negative example tensor with arbitrary shape. Must have
the same size as pred.
Outputs:
- **loss**: loss tensor with shape (batch_size,).
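A minimal sketch with one anchor, one positive and one negative embedding
(illustrative values):

    use AI::MXNet qw(mx);
    my $loss_fn  = AI::MXNet::Gluon::TripletLoss->new(margin => 1);
    my $pred     = mx->nd->array([[0.0, 0.0]]);   # anchor
    my $positive = mx->nd->array([[0.1, 0.0]]);   # should stay close to the anchor
    my $negative = mx->nd->array([[2.0, 2.0]]);   # should stay far from the anchor
    my $loss     = $loss_fn->($pred, $positive, $negative);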
=cut
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $positive, GluonInput $negative, Maybe[GluonInput] $sample_weight=
)
{
$positive = __PACKAGE__->_reshape_like($F, $positive, $pred);
$negative = __PACKAGE__->_reshape_like($F, $negative, $pred);
my $loss = $F->sum($F->square($pred-$positive) - $F->square($pred-$negative),
axis=>$self->batch_axis, exclude=>1);
$loss = $F->relu($loss + $self->margin);
return __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::PoissonNLLLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'from_logits' => (is => 'ro', isa => 'Bool', default => 1);
has 'compute_full' => (is => 'ro', isa => 'Bool', default => 0);
=head1 NAME
AI::MXNet::Gluon::PoissonNLLLoss
=cut
=head1 DESCRIPTION
For a target (Random Variable) in a Poisson distribution, the function calculates the Negative
Log likelihood loss.
PoissonNLLLoss measures the loss accrued from a Poisson regression prediction made by the model.
.. math::
L = \text{pred} - \text{target} * \log(\text{pred}) + \log(\text{target!})
`pred`, `target` can have arbitrary shape as long as they have the same number of elements.
Parameters
----------
from_logits : boolean, default True
indicating whether the log of the predicted value has already been computed. If True, the loss is
computed as :math:`\exp(\text{pred}) - \text{target} * \text{pred}`, and if False, then the loss is
computed as :math:`\text{pred} - \text{target} * \log(\text{pred}+\text{epsilon})`.
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
compute_full: boolean, default False
Indicates whether to add an approximation (Stirling factor) for the factorial term in the formula for the loss.
The Stirling factor is:
:math:`\text{target} * \log(\text{target}) - \text{target} + 0.5 * \log(2 * \pi * \text{target})`
epsilon: float, default 1e-08
This is to avoid calculating log(0) which is not defined.
Inputs:
- **pred**: Predicted value
- **target**: Random variable(count or number) which belongs to a Poisson distribution.
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as pred. For example, if pred has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: Average loss (shape=(1,1)) of the loss tensor with shape (batch_size,).
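A minimal sketch (illustrative values; with the default `from_logits => 1`
the prediction is interpreted as the log of the Poisson rate):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::PoissonNLLLoss->new;
    my $pred   = mx->nd->array([[0.5, 1.0]]);   # log-rate predictions
    my $target = mx->nd->array([[1.0, 2.0]]);   # observed counts
    my $loss   = $loss_fn->($pred, $target);    # mean loss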
=cut
method hybrid_forward(
GluonClass $F, GluonInput $pred, GluonInput $target,
Maybe[GluonInput] $sample_weight=, Maybe[Num] $epsilon=1e-08
)
{
$target = __PACKAGE__->_reshape_like($F, $target, $pred);
my $loss;
if($self->from_logits)
{
$loss = $F->exp($pred) - $target * $pred;
}
else
{
$loss = $pred - $target * $F->log($pred + $epsilon);
if($self->compute_full)
{
# Stirling approximation of log(target!), applied only where target > 1
my $stirling_factor = $target * $F->log($target) - $target + 0.5 * $F->log(2 * 3.1415926 * $target);
$stirling_factor *= ($target > 1);
$loss += $stirling_factor;
}
}
# apply weighting in both branches so sample_weight is honored when from_logits is true
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $F->mean($loss);
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
package AI::MXNet::Gluon::CosineEmbeddingLoss;
use AI::MXNet::Gluon::Mouse;
extends 'AI::MXNet::Gluon::Loss';
has 'margin' => (is => 'rw', isa => 'Num', default => 0);
=head1 NAME
AI::MXNet::Gluon::CosineEmbeddingLoss
=cut
=head1 DESCRIPTION
For a target label 1 or -1, vectors input1 and input2, the function computes the cosine distance
between the vectors. This can be interpreted as how similar/dissimilar two input vectors are.
.. math::
L = \sum_i \begin{cases} 1 - {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = 1\\
{cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = -1 \end{cases}\\
cos\_sim({input1}_i, {input2}_i) = \frac{{input1}_i \cdot {input2}_i}{\Vert {input1}_i \Vert \cdot \Vert {input2}_i \Vert}
`input1`, `input2` can have arbitrary shape as long as they have the same number of elements.
Parameters
----------
weight : float or None
Global scalar weight for loss.
batch_axis : int, default 0
The axis that represents mini-batch.
margin : float
Margin of separation between correct and incorrect pair.
Inputs:
- **input1**: a tensor with arbitrary shape
- **input2**: another tensor with the same shape as input1 to which input1 is
compared for similarity and loss calculation
- **label**: A 1-D tensor indicating for each pair input1 and input2, target label is 1 or -1
- **sample_weight**: element-wise weighting tensor. Must be broadcastable
to the same shape as input1. For example, if input1 has shape (64, 10)
and you want to weigh each sample in the batch separately,
sample_weight should have shape (64, 1).
Outputs:
- **loss**: The loss tensor with shape (batch_size,).
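A minimal sketch with one similar and one dissimilar pair (illustrative
values):

    use AI::MXNet qw(mx);
    my $loss_fn = AI::MXNet::Gluon::CosineEmbeddingLoss->new;
    my $input1 = mx->nd->array([[1.0, 0.0], [0.0, 1.0]]);
    my $input2 = mx->nd->array([[1.0, 0.0], [1.0, 0.0]]);
    my $label  = mx->nd->array([1, -1]);   # 1: should be similar, -1: dissimilar
    my $loss   = $loss_fn->($input1, $input2, $label);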
=cut
method hybrid_forward(
GluonClass $F, GluonInput $input1, GluonInput $input2, GluonInput $label, Maybe[GluonInput] $sample_weight=
)
{
$input1 = __PACKAGE__->_reshape_like($F, $input1, $input2);
$label = $label->reshape([-1, 1]);
my $cos_sim = $self->_cosine_similarity($F, $input1, $input2);
my $y_1 = $label == 1;
my $y_minus_1 = $label == -1;
my $cos_sim_a = (1 - $cos_sim) * $y_1;
my $z_array;
if($F eq 'AI::MXNet::NDArray')
{
$z_array = $F->array([0]);
}
else
{
$z_array = $F->zeros([1, 1]);
}
my $cos_sim_b = $F->broadcast_maximum($z_array, $y_minus_1 * ($cos_sim - $self->margin), { axis=>1 });
my $loss = $cos_sim_a + $cos_sim_b;
$loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
return $loss;
}
method _cosine_similarity($F, $x, $y, $axis=-1)
{
my $x_norm = $F->norm($x, axis=>$axis)->reshape([-1, 1]);
my $y_norm = $F->norm($y, axis=>$axis)->reshape([-1, 1]);
my $x_dot_y = $F->sum($x*$y, axis=>$axis)->reshape([-1, 1]);
my $eps_arr;
if($F eq 'AI::MXNet::NDArray')
{
$eps_arr = $F->array([1e-12]);
}
else
{
$eps_arr = $F->full([1, 1], 1e-12);
}
return ($x_dot_y / $F->broadcast_maximum($x_norm * $y_norm, $eps_arr));
}
__PACKAGE__->register('AI::MXNet::Gluon::Loss');
1;