blob: a563fda3d4bea20319a0c45bbc5eec188e8ca88b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math4.ml.clustering;
import org.apache.commons.math4.exception.NumberIsTooSmallException;
import org.apache.commons.math4.ml.distance.DistanceMeasure;
import org.apache.commons.math4.util.MathUtils;
import org.apache.commons.math4.util.Pair;
import org.apache.commons.rng.UniformRandomProvider;
import org.apache.commons.rng.sampling.ListSampler;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* Clustering algorithm <a href="https://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf">
* based on KMeans</a>.
*
* @param <T> Type of the points to cluster.
*/
public class MiniBatchKMeansClusterer<T extends Clusterable>
extends KMeansPlusPlusClusterer<T> {
/** Batch data size in iteration. */
private final int batchSize;
/** Iteration count of initialize the centers. */
private final int initIterations;
/** Data size of batch to initialize the centers. */
private final int initBatchSize;
/** Maximum number of iterations during which no improvement is occuring. */
private final int maxNoImprovementTimes;
/**
* Build a clusterer.
*
* @param k Number of clusters to split the data into.
* @param maxIterations Maximum number of iterations to run the algorithm for all the points,
* The actual number of iterationswill be smaller than {@code maxIterations * size / batchSize},
* where {@code size} is the number of points to cluster.
* Disabled if negative.
* @param batchSize Batch size for training iterations.
* @param initIterations Number of iterations allowed in order to find out the best initial centers.
* @param initBatchSize Batch size for initializing the clusters centers.
* A value of {@code 3 * batchSize} should be suitable in most cases.
* @param maxNoImprovementTimes Maximum number of iterations during which no improvement is occuring.
* A value of 10 is suitable in most cases.
* @param measure Distance measure.
* @param random Random generator.
* @param emptyStrategy Strategy for handling empty clusters that may appear during algorithm iterations.
*/
public MiniBatchKMeansClusterer(final int k,
final int maxIterations,
final int batchSize,
final int initIterations,
final int initBatchSize,
final int maxNoImprovementTimes,
final DistanceMeasure measure,
final UniformRandomProvider random,
final EmptyClusterStrategy emptyStrategy) {
super(k, maxIterations, measure, random, emptyStrategy);
if (batchSize < 1) {
throw new NumberIsTooSmallException(batchSize, 1, true);
}
if (initIterations < 1) {
throw new NumberIsTooSmallException(initIterations, 1, true);
}
if (initBatchSize < 1) {
throw new NumberIsTooSmallException(initBatchSize, 1, true);
}
if (maxNoImprovementTimes < 1) {
throw new NumberIsTooSmallException(maxNoImprovementTimes, 1, true);
}
this.batchSize = batchSize;
this.initIterations = initIterations;
this.initBatchSize = initBatchSize;
this.maxNoImprovementTimes = maxNoImprovementTimes;
}
/**
* Runs the MiniBatch K-means clustering algorithm.
*
* @param points Points to cluster (cannot be {@code null}).
* @return the clusters.
* @throws org.apache.commons.math4.exception.MathIllegalArgumentException
* if the number of points is smaller than the number of clusters.
*/
@Override
public List<CentroidCluster<T>> cluster(final Collection<T> points) {
// Sanity check.
MathUtils.checkNotNull(points);
if (points.size() < getNumberOfClusters()) {
throw new NumberIsTooSmallException(points.size(), getNumberOfClusters(), false);
}
final int pointSize = points.size();
final int batchCount = pointSize / batchSize + (pointSize % batchSize > 0 ? 1 : 0);
final int max = getMaxIterations() < 0 ?
Integer.MAX_VALUE :
getMaxIterations() * batchCount;
final List<T> pointList = new ArrayList<>(points);
List<CentroidCluster<T>> clusters = initialCenters(pointList);
final ImprovementEvaluator evaluator = new ImprovementEvaluator(batchSize,
maxNoImprovementTimes);
for (int i = 0; i < max; i++) {
clearClustersPoints(clusters);
final List<T> batchPoints = ListSampler.sample(getRandomGenerator(), pointList, batchSize);
// Training step.
final Pair<Double, List<CentroidCluster<T>>> pair = step(batchPoints, clusters);
final double squareDistance = pair.getFirst();
clusters = pair.getSecond();
// Check whether the training can finished early.
if (evaluator.converge(squareDistance, pointSize)) {
break;
}
}
// Add every mini batch points to their nearest cluster.
clearClustersPoints(clusters);
for (final T point : points) {
addToNearestCentroidCluster(point, clusters);
}
return clusters;
}
/**
* Helper method.
*
* @param clusters Clusters to clear.
*/
private void clearClustersPoints(final List<CentroidCluster<T>> clusters) {
for (CentroidCluster<T> cluster : clusters) {
cluster.getPoints().clear();
}
}
/**
* Mini batch iteration step.
*
* @param batchPoints Points selected for this batch.
* @param clusters Centers of the clusters.
* @return the squared distance of all the batch points to the nearest center.
*/
private Pair<Double, List<CentroidCluster<T>>> step(final List<T> batchPoints,
final List<CentroidCluster<T>> clusters) {
// Add every mini batch points to their nearest cluster.
for (final T point : batchPoints) {
addToNearestCentroidCluster(point, clusters);
}
final List<CentroidCluster<T>> newClusters = adjustClustersCenters(clusters);
// Add every mini batch points to their nearest cluster again.
double squareDistance = 0.0;
for (T point : batchPoints) {
final double d = addToNearestCentroidCluster(point, newClusters);
squareDistance += d * d;
}
return new Pair<>(squareDistance, newClusters);
}
/**
* Initializes the clusters centers.
*
* @param points Points used to initialize the centers.
* @return clusters with their center initialized.
*/
private List<CentroidCluster<T>> initialCenters(final List<T> points) {
final List<T> validPoints = initBatchSize < points.size() ?
ListSampler.sample(getRandomGenerator(), points, initBatchSize) :
new ArrayList<>(points);
double nearestSquareDistance = Double.POSITIVE_INFINITY;
List<CentroidCluster<T>> bestCenters = null;
for (int i = 0; i < initIterations; i++) {
final List<T> initialPoints = (initBatchSize < points.size()) ?
ListSampler.sample(getRandomGenerator(), points, initBatchSize) :
new ArrayList<>(points);
final List<CentroidCluster<T>> clusters = chooseInitialCenters(initialPoints);
final Pair<Double, List<CentroidCluster<T>>> pair = step(validPoints, clusters);
final double squareDistance = pair.getFirst();
final List<CentroidCluster<T>> newClusters = pair.getSecond();
//Find out a best centers that has the nearest total square distance.
if (squareDistance < nearestSquareDistance) {
nearestSquareDistance = squareDistance;
bestCenters = newClusters;
}
}
return bestCenters;
}
/**
* Adds a point to the cluster whose center is closest.
*
* @param point Point to add.
* @param clusters Clusters.
* @return the distance between point and the closest center.
*/
private double addToNearestCentroidCluster(final T point,
final List<CentroidCluster<T>> clusters) {
double minDistance = Double.POSITIVE_INFINITY;
CentroidCluster<T> closestCentroidCluster = null;
// Find cluster closest to the point.
for (CentroidCluster<T> centroidCluster : clusters) {
final double distance = distance(point, centroidCluster.getCenter());
if (distance < minDistance) {
minDistance = distance;
closestCentroidCluster = centroidCluster;
}
}
MathUtils.checkNotNull(closestCentroidCluster);
closestCentroidCluster.addPoint(point);
return minDistance;
}
/**
* Stopping criterion.
* The evaluator checks whether improvement occurred during the
* {@link #maxNoImprovementTimes allowed number of successive iterations}.
*/
private static class ImprovementEvaluator {
/** Batch size. */
private final int batchSize;
/** Maximum number of iterations during which no improvement is occuring. */
private final int maxNoImprovementTimes;
/**
* <a href="https://en.wikipedia.org/wiki/Moving_average">
* Exponentially Weighted Average</a> of the squared
* diff to monitor the convergence while discarding
* minibatch-local stochastic variability.
*/
private double ewaInertia = Double.NaN;
/** Minimum value of {@link #ewaInertia} during iteration. */
private double ewaInertiaMin = Double.POSITIVE_INFINITY;
/** Number of iteration during which {@link #ewaInertia} did not improve. */
private int noImprovementTimes = 0;
/**
* @param batchSize Number of elements for each batch iteration.
* @param maxNoImprovementTimes Maximum number of iterations during
* which no improvement is occuring.
*/
private ImprovementEvaluator(int batchSize,
int maxNoImprovementTimes) {
this.batchSize = batchSize;
this.maxNoImprovementTimes = maxNoImprovementTimes;
}
/**
* Stopping criterion.
*
* @param squareDistance Total square distance from the batch points
* to their nearest center.
* @param pointSize Number of data points.
* @return {@code true} if no improvement was made after the allowed
* number of iterations, {@code false} otherwise.
*/
public boolean converge(final double squareDistance,
final int pointSize) {
final double batchInertia = squareDistance / batchSize;
if (Double.isNaN(ewaInertia)) {
ewaInertia = batchInertia;
} else {
final double alpha = Math.min(batchSize * 2 / (pointSize + 1), 1);
ewaInertia = ewaInertia * (1 - alpha) + batchInertia * alpha;
}
if (ewaInertia < ewaInertiaMin) {
// Improved.
noImprovementTimes = 0;
ewaInertiaMin = ewaInertia;
} else {
// No improvement.
++noImprovementTimes;
}
return noImprovementTimes >= maxNoImprovementTimes;
}
}
}