| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.ml.maxent.quasinewton; |
| |
| import opennlp.tools.ml.ArrayMath; |
| import opennlp.tools.ml.maxent.quasinewton.LineSearch.LineSearchResult; |
| |
| /** |
| * Implementation of L-BFGS which supports L1-, L2-regularization |
| * and Elastic Net for solving convex optimization problems. <p> |
| * Usage example: |
| * <blockquote><pre> |
| * // Quadratic function f(x) = (x-1)^2 + 10 |
| * // f obtains its minimum value 10 at x = 1 |
| * Function f = new Function() { |
| * |
| * {@literal @}Override |
| * public int getDimension() { |
| * return 1; |
| * } |
| * |
| * {@literal @}Override |
| * public double valueAt(double[] x) { |
| * return StrictMath.pow(x[0]-1, 2) + 10; |
| * } |
| * |
| * {@literal @}Override |
| * public double[] gradientAt(double[] x) { |
| * return new double[] { 2*(x[0]-1) }; |
| * } |
| * |
| * }; |
| * |
| * QNMinimizer minimizer = new QNMinimizer(); |
| * double[] x = minimizer.minimize(f); |
| * double min = f.valueAt(x); |
| * </pre></blockquote> |
| */ |
| public class QNMinimizer { |
| |
| // Function change rate tolerance |
| public static final double CONVERGE_TOLERANCE = 1e-4; |
| |
| // Relative gradient norm tolerance |
| public static final double REL_GRAD_NORM_TOL = 1e-4; |
| |
| // Initial step size |
| public static final double INITIAL_STEP_SIZE = 1.0; |
| |
| // Minimum step size |
| public static final double MIN_STEP_SIZE = 1e-10; |
| |
| // Default L1-cost |
| public static final double L1COST_DEFAULT = 0; |
| |
| // Default L2-cost |
| public static final double L2COST_DEFAULT = 0; |
| |
| // Default number of iterations |
| public static final int NUM_ITERATIONS_DEFAULT = 100; |
| |
| // Default number of Hessian updates to store |
| public static final int M_DEFAULT = 15; |
| |
| // Default maximum number of function evaluations |
| public static final int MAX_FCT_EVAL_DEFAULT = 30000; |
| |
| // L1-regularization cost |
| private double l1Cost; |
| |
| // L2-regularization cost |
| private double l2Cost; |
| |
| // Maximum number of iterations |
| private int iterations; |
| |
| // Number of Hessian updates to store |
| private int m; |
| |
| // Maximum number of function evaluations |
| private int maxFctEval; |
| |
| // Verbose output |
| private boolean verbose; |
| |
| // Objective function's dimension |
| private int dimension; |
| |
| // Hessian updates |
| private UpdateInfo updateInfo; |
| |
| // For evaluating quality of training parameters. |
| // This is optional and can be omitted. |
| private Evaluator evaluator; |
| |
| public QNMinimizer() { |
| this(L1COST_DEFAULT, L2COST_DEFAULT); |
| } |
| |
| public QNMinimizer(double l1Cost, double l2Cost) { |
| this(l1Cost, l2Cost, NUM_ITERATIONS_DEFAULT); |
| } |
| |
| public QNMinimizer(double l1Cost, double l2Cost, int iterations) { |
| this(l1Cost, l2Cost, iterations, M_DEFAULT, MAX_FCT_EVAL_DEFAULT); |
| } |
| |
| public QNMinimizer(double l1Cost, double l2Cost, |
| int iterations, int m, int maxFctEval) { |
| this(l1Cost, l2Cost, iterations, m, maxFctEval, true); |
| } |
| |
| /** |
| * Constructor |
| * @param l1Cost L1-regularization cost |
| * @param l2Cost L2-regularization cost |
| * @param iterations maximum number of iterations |
| * @param m number of Hessian updates to store |
| * @param maxFctEval maximum number of function evaluations |
| * @param verbose verbose output |
| */ |
| public QNMinimizer(double l1Cost, double l2Cost, int iterations, |
| int m, int maxFctEval, boolean verbose) |
| { |
| // Check arguments |
| if (l1Cost < 0 || l2Cost < 0) |
| throw new IllegalArgumentException( |
| "L1-cost and L2-cost must not be less than zero"); |
| |
| if (iterations <= 0) |
| throw new IllegalArgumentException( |
| "Number of iterations must be larger than zero"); |
| |
| if (m <= 0) |
| throw new IllegalArgumentException( |
| "Number of Hessian updates must be larger than zero"); |
| |
| if (maxFctEval <= 0) |
| throw new IllegalArgumentException( |
| "Maximum number of function evaluations must be larger than zero"); |
| |
| this.l1Cost = l1Cost; |
| this.l2Cost = l2Cost; |
| this.iterations = iterations; |
| this.m = m; |
| this.maxFctEval = maxFctEval; |
| this.verbose = verbose; |
| } |
| |
| public Evaluator getEvaluator() { |
| return evaluator; |
| } |
| |
| public void setEvaluator(Evaluator evaluator) { |
| this.evaluator = evaluator; |
| } |
| |
| /** |
| * Find the parameters that minimize the objective function |
| * @param function objective function |
| * @return minimizing parameters |
| */ |
| public double[] minimize(Function function) { |
| |
| Function l2RegFunction = new L2RegFunction(function, l2Cost); |
| this.dimension = l2RegFunction.getDimension(); |
| this.updateInfo = new UpdateInfo(this.m, this.dimension); |
| |
| // Current point is at the origin |
| double[] currPoint = new double[dimension]; |
| |
| double currValue = l2RegFunction.valueAt(currPoint); |
| |
| // Gradient at the current point |
| double[] currGrad = new double[dimension]; |
| System.arraycopy(l2RegFunction.gradientAt(currPoint), 0, |
| currGrad, 0, dimension); |
| |
| // Pseudo-gradient - only use when L1-regularization is enabled |
| double[] pseudoGrad = null; |
| if (l1Cost > 0) { |
| currValue += l1Cost * ArrayMath.l1norm(currPoint); |
| pseudoGrad = new double[dimension]; |
| computePseudoGrad(currPoint, currGrad, pseudoGrad); |
| } |
| |
| LineSearchResult lsr; |
| if (l1Cost > 0) { |
| lsr = LineSearchResult.getInitialObjectForL1( |
| currValue, currGrad, pseudoGrad, currPoint); |
| } else { |
| lsr = LineSearchResult.getInitialObject( |
| currValue, currGrad, currPoint); |
| } |
| |
| if (verbose) { |
| display("\nSolving convex optimization problem."); |
| display("\nObjective function has " + dimension + " variable(s)."); |
| display("\n\nPerforming " + iterations + " iterations with " + |
| "L1Cost=" + l1Cost + " and L2Cost=" + l2Cost + "\n"); |
| } |
| |
| double[] direction = new double[dimension]; |
| long startTime = System.currentTimeMillis(); |
| |
| // Initial step size for the 1st iteration |
| double initialStepSize = l1Cost > 0 ? |
| ArrayMath.invL2norm(lsr.getPseudoGradAtNext()) : |
| ArrayMath.invL2norm(lsr.getGradAtNext()); |
| |
| for (int iter = 1; iter <= iterations; iter++) { |
| // Find direction |
| if (l1Cost > 0) { |
| System.arraycopy(lsr.getPseudoGradAtNext(), 0, direction, 0, direction.length); |
| } else { |
| System.arraycopy(lsr.getGradAtNext(), 0, direction, 0, direction.length); |
| } |
| computeDirection(direction); |
| |
| // Line search |
| if (l1Cost > 0) { |
| // Constrain the search direction |
| pseudoGrad = lsr.getPseudoGradAtNext(); |
| for (int i = 0; i < dimension; i++) { |
| if (direction[i] * pseudoGrad[i] >= 0) { |
| direction[i] = 0; |
| } |
| } |
| LineSearch.doConstrainedLineSearch(l2RegFunction, direction, lsr, l1Cost, initialStepSize); |
| computePseudoGrad(lsr.getNextPoint(), lsr.getGradAtNext(), pseudoGrad); |
| lsr.setPseudoGradAtNext(pseudoGrad); |
| } |
| else { |
| LineSearch.doLineSearch(l2RegFunction, direction, lsr, initialStepSize); |
| } |
| |
| // Save Hessian updates |
| updateInfo.update(lsr); |
| |
| if (verbose) { |
| if (iter < 10) |
| display(" " + iter + ": "); |
| else if (iter < 100) |
| display(" " + iter + ": "); |
| else |
| display(iter + ": "); |
| |
| if (evaluator != null) { |
| display("\t" + lsr.getValueAtNext() + "\t" + lsr.getFuncChangeRate() |
| + "\t" + evaluator.evaluate(lsr.getNextPoint()) + "\n"); |
| } else { |
| display("\t " + lsr.getValueAtNext() + |
| "\t" + lsr.getFuncChangeRate() + "\n"); |
| } |
| } |
| if (isConverged(lsr)) |
| break; |
| |
| initialStepSize = INITIAL_STEP_SIZE; |
| } |
| |
| // Undo L2-shrinkage if Elastic Net is used (since |
| // in that case, the shrinkage is done twice) |
| if (l1Cost > 0 && l2Cost > 0) { |
| double[] x = lsr.getNextPoint(); |
| for (int i = 0; i < dimension; i++) { |
| x[i] = StrictMath.sqrt(1 + l2Cost) * x[i]; |
| } |
| } |
| |
| long endTime = System.currentTimeMillis(); |
| long duration = endTime - startTime; |
| display("Running time: " + (duration / 1000.) + "s\n"); |
| |
| // Release memory |
| this.updateInfo = null; |
| System.gc(); |
| |
| // Avoid returning the reference to LineSearchResult's member so that GC can |
| // collect memory occupied by lsr after this function completes (is it necessary?) |
| double[] parameters = new double[dimension]; |
| System.arraycopy(lsr.getNextPoint(), 0, parameters, 0, dimension); |
| |
| return parameters; |
| } |
| |
| /** |
| * Pseudo-gradient for L1-regularization (see equation 4 in the paper |
| * "Scalable Training of L1-Regularized Log-Linear Models", Andrew et al. 2007) |
| * |
| * @param x current point |
| * @param g gradient at x |
| * @param pg pseudo-gradient at x which is to be computed |
| */ |
| private void computePseudoGrad(double[] x, double[] g, double[] pg) { |
| for (int i = 0; i < dimension; i++) { |
| if (x[i] < 0) { |
| pg[i] = g[i] - l1Cost; |
| } |
| else if (x[i] > 0) { |
| pg[i] = g[i] + l1Cost; |
| } |
| else { |
| if (g[i] < -l1Cost) { |
| // right partial derivative |
| pg[i] = g[i] + l1Cost; |
| } |
| else if (g[i] > l1Cost) { |
| // left partial derivative |
| pg[i] = g[i] - l1Cost; |
| } |
| else { |
| pg[i] = 0; |
| } |
| } |
| } |
| } |
| |
| /** |
| * L-BFGS two-loop recursion (see Nocedal & Wright 2006, Numerical Optimization, p. 178) |
| */ |
| private void computeDirection(double[] direction) { |
| |
| // Implemented two-loop Hessian update method. |
| int k = updateInfo.kCounter; |
| double[] rho = updateInfo.rho; |
| double[] alpha = updateInfo.alpha; // just to avoid recreating alpha |
| double[][] S = updateInfo.S; |
| double[][] Y = updateInfo.Y; |
| |
| // First loop |
| for (int i = k - 1; i >= 0; i--) { |
| alpha[i] = rho[i] * ArrayMath.innerProduct(S[i], direction); |
| for (int j = 0; j < dimension; j++) { |
| direction[j] = direction[j] - alpha[i] * Y[i][j]; |
| } |
| } |
| |
| // Second loop |
| for (int i = 0; i < k; i++) { |
| double beta = rho[i] * ArrayMath.innerProduct(Y[i], direction); |
| for (int j = 0; j < dimension; j++) { |
| direction[j] = direction[j] + S[i][j] * (alpha[i] - beta); |
| } |
| } |
| |
| for (int i = 0; i < dimension; i++) { |
| direction[i] = -direction[i]; |
| } |
| } |
| |
| private boolean isConverged(LineSearchResult lsr) { |
| |
| // Check function's change rate |
| if (lsr.getFuncChangeRate() < CONVERGE_TOLERANCE) { |
| if (verbose) |
| display("Function change rate is smaller than the threshold " |
| + CONVERGE_TOLERANCE + ".\nTraining will stop.\n\n"); |
| return true; |
| } |
| |
| // Check gradient's norm using the criteria: ||g(x)|| / max(1, ||x||) < threshold |
| double xNorm = StrictMath.max(1, ArrayMath.l2norm(lsr.getNextPoint())); |
| double gradNorm = l1Cost > 0 ? |
| ArrayMath.l2norm(lsr.getPseudoGradAtNext()) : ArrayMath.l2norm(lsr.getGradAtNext()); |
| if (gradNorm / xNorm < REL_GRAD_NORM_TOL) { |
| if (verbose) |
| display("Relative L2-norm of the gradient is smaller than the threshold " |
| + REL_GRAD_NORM_TOL + ".\nTraining will stop.\n\n"); |
| return true; |
| } |
| |
| // Check step size |
| if (lsr.getStepSize() < MIN_STEP_SIZE) { |
| if (verbose) |
| display("Step size is smaller than the minimum step size " |
| + MIN_STEP_SIZE + ".\nTraining will stop.\n\n"); |
| return true; |
| } |
| |
| // Check number of function evaluations |
| if (lsr.getFctEvalCount() > this.maxFctEval) { |
| if (verbose) |
| display("Maximum number of function evaluations has exceeded the threshold " |
| + this.maxFctEval + ".\nTraining will stop.\n\n"); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Shorthand for System.out.print |
| */ |
| private void display(String s) { |
| System.out.print(s); |
| } |
| |
| /** |
| * Class to store vectors for Hessian approximation update. |
| */ |
| private class UpdateInfo { |
| private double[][] S; |
| private double[][] Y; |
| private double[] rho; |
| private double[] alpha; |
| private int m; |
| |
| private int kCounter; |
| |
| // Constructor |
| UpdateInfo(int numCorrection, int dimension) { |
| this.m = numCorrection; |
| this.kCounter = 0; |
| S = new double[this.m][dimension]; |
| Y = new double[this.m][dimension]; |
| rho = new double[this.m]; |
| alpha = new double[this.m]; |
| } |
| |
| public void update(LineSearchResult lsr) { |
| double[] currPoint = lsr.getCurrPoint(); |
| double[] gradAtCurr = lsr.getGradAtCurr(); |
| double[] nextPoint = lsr.getNextPoint(); |
| double[] gradAtNext = lsr.getGradAtNext(); |
| |
| // Inner product of S_k and Y_k |
| double SYk = 0.0; |
| |
| // Add new ones. |
| if (kCounter < m) { |
| for (int j = 0; j < dimension; j++) { |
| S[kCounter][j] = nextPoint[j] - currPoint[j]; |
| Y[kCounter][j] = gradAtNext[j] - gradAtCurr[j]; |
| SYk += S[kCounter][j] * Y[kCounter][j]; |
| } |
| rho[kCounter] = 1.0 / SYk; |
| } |
| else { |
| // Discard oldest vectors and add new ones. |
| for (int i = 0; i < m - 1; i++) { |
| S[i] = S[i + 1]; |
| Y[i] = Y[i + 1]; |
| rho[i] = rho[i + 1]; |
| } |
| for (int j = 0; j < dimension; j++) { |
| S[m - 1][j] = nextPoint[j] - currPoint[j]; |
| Y[m - 1][j] = gradAtNext[j] - gradAtCurr[j]; |
| SYk += S[m - 1][j] * Y[m - 1][j]; |
| } |
| rho[m - 1] = 1.0 / SYk; |
| } |
| |
| if (kCounter < m) |
| kCounter++; |
| } |
| } |
| |
| /** |
| * L2-regularized objective function |
| */ |
| public static class L2RegFunction implements Function { |
| private Function f; |
| private double l2Cost; |
| |
| public L2RegFunction(Function f, double l2Cost) { |
| this.f = f; |
| this.l2Cost = l2Cost; |
| } |
| |
| @Override |
| public int getDimension() { |
| return f.getDimension(); |
| } |
| |
| @Override |
| public double valueAt(double[] x) { |
| checkDimension(x); |
| double value = f.valueAt(x); |
| if (l2Cost > 0) { |
| value += l2Cost * ArrayMath.innerProduct(x, x); |
| } |
| return value; |
| } |
| |
| @Override |
| public double[] gradientAt(double[] x) { |
| checkDimension(x); |
| double[] gradient = f.gradientAt(x); |
| if (l2Cost > 0) { |
| for (int i = 0; i < x.length; i++) { |
| gradient[i] += 2 * l2Cost * x[i]; |
| } |
| } |
| return gradient; |
| } |
| |
| private void checkDimension(double[] x) { |
| if (x.length != getDimension()) |
| throw new IllegalArgumentException( |
| "x's dimension is not the same as function's dimension"); |
| } |
| } |
| |
| /** |
| * Evaluate quality of training parameters. For example, |
| * it can be used to report model's training accuracy when |
| * we train a Maximum Entropy classifier. |
| */ |
| public interface Evaluator { |
| /** |
| * Measure quality of the training parameters |
| * @param parameters |
| * @return evaluated result |
| */ |
| double evaluate(double[] parameters); |
| } |
| } |