| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.spark.mllib.optimization |
| |
| import org.apache.spark.annotation.DeveloperApi |
| import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} |
| import org.apache.spark.mllib.linalg.BLAS.{axpy, dot, scal} |
| import org.apache.spark.mllib.util.MLUtils |
| |
| /** |
| * :: DeveloperApi :: |
| * Class used to compute the gradient for a loss function, given a single data point. |
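 *
 * A minimal usage sketch (the `gradient` instance, the `examples` sequence of
 * (features, label) pairs, and the `weights` vector below are hypothetical placeholders,
 * not part of this API):
 *
 * {{{
 *   // Accumulate every example's gradient into one vector and sum the losses.
 *   val cumGradient = Vectors.zeros(weights.size)
 *   var totalLoss = 0.0
 *   for ((features, label) <- examples) {
 *     totalLoss += gradient.compute(features, label, weights, cumGradient)
 *   }
 * }}}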
| */ |
| @DeveloperApi |
| abstract class Gradient extends Serializable { |
| /** |
| * Compute the gradient and loss given the features of a single data point. |
| * |
| * @param data features for one data point |
| * @param label label for this data point |
| * @param weights weights/coefficients corresponding to features |
| * |
| * @return (gradient: Vector, loss: Double) |
| */ |
| def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = { |
| val gradient = Vectors.zeros(weights.size) |
| val loss = compute(data, label, weights, gradient) |
| (gradient, loss) |
| } |
| |
| /** |
| * Compute the gradient and loss given the features of a single data point, |
| * add the gradient to a provided vector to avoid creating new objects, and return loss. |
| * |
| * @param data features for one data point |
| * @param label label for this data point |
| * @param weights weights/coefficients corresponding to features |
| * @param cumGradient the computed gradient will be added to this vector |
| * |
| * @return loss |
| */ |
| def compute(data: Vector, label: Double, weights: Vector, cumGradient: Vector): Double |
| } |
| |
| /** |
| * :: DeveloperApi :: |
| * Compute gradient and loss for a multinomial logistic loss function, as used |
| * in multi-class classification (it is also used in binary logistic regression). |
| * |
| * In `The Elements of Statistical Learning: Data Mining, Inference, and Prediction, 2nd Edition` |
| * by Trevor Hastie, Robert Tibshirani, and Jerome Friedman, which can be downloaded from |
| * http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula of |
 * the multinomial logistic regression model. A simple calculation shows that
| * |
| * <p><blockquote> |
| * $$ |
| * P(y=0|x, w) = 1 / (1 + \sum_i^{K-1} \exp(x w_i))\\ |
 * P(y=1|x, w) = \exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i))\\
 * ...\\
 * P(y=K-1|x, w) = \exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i))\\
| * $$ |
| * </blockquote></p> |
| * |
 * for a multiclass classification problem with K classes.
| * |
 * The model weights $w = (w_1, w_2, ..., w_{K-1})^T$ become a matrix of dimension
 * (K-1) * (N+1) if the intercepts are added, or (K-1) * N if they are not.
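 *
 * In this implementation the matrix is flattened into a single weights vector in row-major
 * order: the i-th row (i = 0, ..., K-2), holding $w_{i+1}$, occupies positions
 * [i * numFeatures, (i + 1) * numFeatures), which is why `compute` below indexes the weights
 * as weightsArray(i * dataSize + index).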
| * |
 * As a result, the loss of the objective function for a single data instance can be written as
| * <p><blockquote> |
| * $$ |
| * \begin{align} |
| * l(w, x) &= -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w) \\ |
| * &= log(1 + \sum_i^{K-1}\exp(x w_i)) - (1-\alpha(y)) x w_{y-1} \\ |
| * &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} |
| * \end{align} |
| * $$ |
| * </blockquote></p> |
| * |
 * where $\alpha(i) = 1$ if $i == 0$, and
 * $\alpha(i) = 0$ if $i \ne 0$,
| * $margins_i = x w_i$. |
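 *
 * For instance, with K = 3 classes, margins $(m_1, m_2)$ and label $y = 2$, the loss is
 * $-log P(y=2|x, w) = log(1 + \exp(m_1) + \exp(m_2)) - m_2$.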
| * |
| * For optimization, we have to calculate the first derivative of the loss function, and |
| * a simple calculation shows that |
| * |
| * <p><blockquote> |
| * $$ |
| * \begin{align} |
| * \frac{\partial l(w, x)}{\partial w_{ij}} &= |
 * (\exp(x w_i) / (1 + \sum_k^{K-1} \exp(x w_k)) - (1-\alpha(y)) \delta_{y, i+1}) * x_j \\
| * &= multiplier_i * x_j |
| * \end{align} |
| * $$ |
| * </blockquote></p> |
| * |
| * where $\delta_{i, j} = 1$ if $i == j$, |
| * $\delta_{i, j} = 0$ if $i != j$, and |
| * multiplier = |
 * $\exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1}$
| * |
 * If any margin is larger than 709.78, the numerical computation of the multiplier and the
 * loss function will suffer from arithmetic overflow. This issue arises when there are
 * outliers in the data that are far away from the hyperplane, and it causes training to fail
 * once infinity / infinity is introduced. Note that this is only a concern when
 * max(margins) > 0.
| * |
 * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can
 * easily be rewritten into the following equivalent, numerically stable formulas.
| * |
| * <p><blockquote> |
| * $$ |
| * \begin{align} |
| * l(w, x) &= log(1 + \sum_i^{K-1}\exp(margins_i)) - (1-\alpha(y)) margins_{y-1} \\ |
| * &= log(\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin)) + maxMargin |
| * - (1-\alpha(y)) margins_{y-1} \\ |
| * &= log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1} |
| * \end{align} |
| * $$ |
| * </blockquote></p> |
 *
| * where sum = $\exp(-maxMargin) + \sum_i^{K-1}\exp(margins_i - maxMargin) - 1$. |
| * |
 * Note that each term $(margins_i - maxMargin)$ in the $\exp$ is no larger than zero; as a
 * result, overflow will not happen with this formula.
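 *
 * As a small standalone illustration (plain Scala, with made-up margins and label y = 0, so
 * the $(1-\alpha(y))$ term vanishes), the naive formula overflows while the rewritten one
 * stays finite:
 *
 * {{{
 *   val margins = Array(800.0, 1.0)
 *   val naive = math.log(1.0 + margins.map(math.exp).sum)    // log(Infinity) = Infinity
 *   val maxMargin = margins.max                               // 800.0
 *   val sum = math.exp(-maxMargin) +
 *     margins.map(m => math.exp(m - maxMargin)).sum - 1.0     // finite, close to 0.0
 *   val stable = math.log1p(sum) + maxMargin                  // ~800.0, no overflow
 * }}}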
| * |
 * A similar trick can be applied to the multiplier:
| * |
| * <p><blockquote> |
| * $$ |
| * \begin{align} |
| * multiplier |
 * &= \exp(margins_i) /
 *    (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1} \\
 * &= \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)) \delta_{y, i+1}
| * \end{align} |
| * $$ |
| * </blockquote></p> |
| * |
 * where each term in the $\exp$ is also no larger than zero, so overflow is not a concern.
| * |
| * For the detailed mathematical derivation, see the reference at |
| * http://www.slideshare.net/dbtsai/2014-0620-mlor-36132297 |
| * |
 * @param numClasses the number of possible outcomes for the classification problem in
 *                   multinomial logistic regression. By default, numClasses is 2, i.e.,
 *                   binary logistic regression.
| */ |
| @DeveloperApi |
| class LogisticGradient(numClasses: Int) extends Gradient { |
| |
| def this() = this(2) |
| |
| override def compute( |
| data: Vector, |
| label: Double, |
| weights: Vector, |
| cumGradient: Vector): Double = { |
| val dataSize = data.size |
| |
// (weights.size / dataSize + 1) is the number of classes
| require(weights.size % dataSize == 0 && numClasses == weights.size / dataSize + 1) |
| numClasses match { |
| case 2 => |
/**
 * For Binary Logistic Regression.
 *
 * Although the multinomial loss and gradient calculation is more general and could also be
 * used in the binary case, we implement a specialized binary version for performance
 * reasons.
 */
| val margin = -1.0 * dot(data, weights) |
| val multiplier = (1.0 / (1.0 + math.exp(margin))) - label |
| axpy(multiplier, data, cumGradient) |
| if (label > 0) { |
| // The following is equivalent to log(1 + exp(margin)) but more numerically stable. |
| MLUtils.log1pExp(margin) |
| } else { |
| MLUtils.log1pExp(margin) - margin |
| } |
| case _ => |
| /** |
| * For Multinomial Logistic Regression. |
| */ |
| val weightsArray = weights match { |
| case dv: DenseVector => dv.values |
| case _ => |
| throw new IllegalArgumentException( |
| s"weights only supports dense vector but got type ${weights.getClass}.") |
| } |
| val cumGradientArray = cumGradient match { |
| case dv: DenseVector => dv.values |
| case _ => |
| throw new IllegalArgumentException( |
| s"cumGradient only supports dense vector but got type ${cumGradient.getClass}.") |
| } |
| |
| // marginY is margins(label - 1) in the formula. |
| var marginY = 0.0 |
| var maxMargin = Double.NegativeInfinity |
| var maxMarginIndex = 0 |
| |
| val margins = Array.tabulate(numClasses - 1) { i => |
| var margin = 0.0 |
| data.foreachActive { (index, value) => |
| if (value != 0.0) margin += value * weightsArray((i * dataSize) + index) |
| } |
| if (i == label.toInt - 1) marginY = margin |
| if (margin > maxMargin) { |
| maxMargin = margin |
| maxMarginIndex = i |
| } |
| margin |
| } |
| |
/**
 * When maxMargin > 0, the original formula will cause overflow, as discussed in the class
 * comment above.
 * We address this by subtracting maxMargin from all the margins, so it is guaranteed that
 * all of the new margins will be no larger than zero, preventing arithmetic overflow.
 */
| val sum = { |
| var temp = 0.0 |
| if (maxMargin > 0) { |
| for (i <- 0 until numClasses - 1) { |
| margins(i) -= maxMargin |
| if (i == maxMarginIndex) { |
| temp += math.exp(-maxMargin) |
| } else { |
| temp += math.exp(margins(i)) |
| } |
| } |
| } else { |
| for (i <- 0 until numClasses - 1) { |
| temp += math.exp(margins(i)) |
| } |
| } |
| temp |
| } |
| |
| for (i <- 0 until numClasses - 1) { |
| val multiplier = math.exp(margins(i)) / (sum + 1.0) - { |
| if (label != 0.0 && label == i + 1) 1.0 else 0.0 |
| } |
| data.foreachActive { (index, value) => |
| if (value != 0.0) cumGradientArray(i * dataSize + index) += multiplier * value |
| } |
| } |
| |
| val loss = if (label > 0.0) math.log1p(sum) - marginY else math.log1p(sum) |
| |
| if (maxMargin > 0) { |
| loss + maxMargin |
| } else { |
| loss |
| } |
| } |
| } |
| } |
| |
| /** |
| * :: DeveloperApi :: |
 * Compute gradient and loss for a least-squares loss function, as used in linear regression.
 * This is correct for the averaged least squares loss function (mean squared error)
 * L = 1/(2n) ||A weights - y||^2
 * See also the documentation for the precise formulation.
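 *
 * A small worked example of the per-point computation (the numbers are illustrative only):
 *
 * {{{
 *   val lsq = new LeastSquaresGradient()
 *   val (grad, loss) = lsq.compute(Vectors.dense(1.0, 2.0), 5.0, Vectors.dense(1.0, 1.0))
 *   // prediction = 1*1 + 2*1 = 3, diff = 3 - 5 = -2,
 *   // loss = (-2)^2 / 2 = 2.0, grad = diff * data = (-2.0, -4.0)
 * }}}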
| */ |
| @DeveloperApi |
| class LeastSquaresGradient extends Gradient { |
| override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = { |
| val diff = dot(data, weights) - label |
| val loss = diff * diff / 2.0 |
| val gradient = data.copy |
| scal(diff, gradient) |
| (gradient, loss) |
| } |
| |
| override def compute( |
| data: Vector, |
| label: Double, |
| weights: Vector, |
| cumGradient: Vector): Double = { |
| val diff = dot(data, weights) - label |
| axpy(diff, data, cumGradient) |
| diff * diff / 2.0 |
| } |
| } |
| |
| /** |
| * :: DeveloperApi :: |
| * Compute gradient and loss for a Hinge loss function, as used in SVM binary classification. |
| * See also the documentation for the precise formulation. |
| * NOTE: This assumes that the labels are {0,1} |
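 *
 * A small worked example of the per-point computation (the numbers are illustrative only):
 *
 * {{{
 *   val hinge = new HingeGradient()
 *   // Label 1.0 is rescaled to +1; dot = 0.5, so the margin condition 1.0 > 0.5 holds:
 *   val (grad, loss) = hinge.compute(Vectors.dense(0.5, 1.0), 1.0, Vectors.dense(1.0, 0.0))
 *   // loss = 1 - 1 * 0.5 = 0.5, grad = -1 * data = (-0.5, -1.0)
 * }}}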
| */ |
| @DeveloperApi |
| class HingeGradient extends Gradient { |
| override def compute(data: Vector, label: Double, weights: Vector): (Vector, Double) = { |
| val dotProduct = dot(data, weights) |
| // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x))) |
| // Therefore the gradient is -(2y - 1)*x |
| val labelScaled = 2 * label - 1.0 |
| if (1.0 > labelScaled * dotProduct) { |
| val gradient = data.copy |
| scal(-labelScaled, gradient) |
| (gradient, 1.0 - labelScaled * dotProduct) |
| } else { |
| (Vectors.sparse(weights.size, Array.empty, Array.empty), 0.0) |
| } |
| } |
| |
| override def compute( |
| data: Vector, |
| label: Double, |
| weights: Vector, |
| cumGradient: Vector): Double = { |
| val dotProduct = dot(data, weights) |
| // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x))) |
| // Therefore the gradient is -(2y - 1)*x |
| val labelScaled = 2 * label - 1.0 |
| if (1.0 > labelScaled * dotProduct) { |
| axpy(-labelScaled, data, cumGradient) |
| 1.0 - labelScaled * dotProduct |
| } else { |
| 0.0 |
| } |
| } |
| } |