| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.commons.math4.legacy.ml.clustering; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.List; |
| |
| import org.apache.commons.rng.UniformRandomProvider; |
| import org.apache.commons.math4.legacy.exception.NumberIsTooSmallException; |
| import org.apache.commons.math4.legacy.ml.distance.DistanceMeasure; |
| import org.apache.commons.math4.legacy.stat.descriptive.moment.VectorialMean; |
| |
| /** |
| * Implementation of k-means++ algorithm. |
| * It is based on |
| * <blockquote> |
| * Elkan, Charles. |
| * "Using the triangle inequality to accelerate k-means." |
| * ICML. Vol. 3. 2003. |
| * </blockquote> |
| * |
| * <p> |
| * Algorithm uses triangle inequality to speed up computation, by reducing |
| * the amount of distances calculations. Towards the last iterations of |
| * the algorithm, points which already assigned to some cluster are unlikely |
| * to move to a new cluster; updates of cluster centers are also usually |
| * relatively small. |
| * Triangle inequality is thus used to determine the cases where distance |
| * computation could be skipped since center move only a little, without |
| * affecting points partitioning. |
| * |
| * <p> |
| * For initial centers seeding, we apply the algorithm described in |
| * <blockquote> |
| * Arthur, David, and Sergei Vassilvitskii. |
| * "k-means++: The advantages of careful seeding." |
| * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. |
| * Society for Industrial and Applied Mathematics, 2007. |
| * </blockquote> |
| * |
| * @param <T> Type of the points to cluster. |
| */ |
| public class ElkanKMeansPlusPlusClusterer<T extends Clusterable> |
| extends KMeansPlusPlusClusterer<T> { |
| |
| /** |
| * @param k Clustering parameter. |
| */ |
| public ElkanKMeansPlusPlusClusterer(int k) { |
| super(k); |
| } |
| |
| /** |
| * @param k Clustering parameter. |
| * @param maxIterations Allowed number of iterations. |
| * @param measure Distance measure. |
| * @param random Random generator. |
| */ |
| public ElkanKMeansPlusPlusClusterer(int k, |
| int maxIterations, |
| DistanceMeasure measure, |
| UniformRandomProvider random) { |
| super(k, maxIterations, measure, random); |
| } |
| |
| /** |
| * @param k Clustering parameter. |
| * @param maxIterations Allowed number of iterations. |
| * @param measure Distance measure. |
| * @param random Random generator. |
| * @param emptyStrategy Strategy for handling empty clusters that |
| * may appear during algorithm progress. |
| */ |
| public ElkanKMeansPlusPlusClusterer(int k, |
| int maxIterations, |
| DistanceMeasure measure, |
| UniformRandomProvider random, |
| EmptyClusterStrategy emptyStrategy) { |
| super(k, maxIterations, measure, random, emptyStrategy); |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| public List<CentroidCluster<T>> cluster(final Collection<T> points) { |
| final int k = getNumberOfClusters(); |
| |
| // Number of clusters has to be smaller or equal the number of data points. |
| if (points.size() < k) { |
| throw new NumberIsTooSmallException(points.size(), k, false); |
| } |
| |
| final List<T> pointsList = new ArrayList<>(points); |
| final int n = points.size(); |
| final int dim = pointsList.get(0).getPoint().length; |
| |
| // Keep minimum intra cluster distance, e.g. for given cluster c s[c] is |
| // the distance to the closest cluster c' or s[c] = 1/2 * min_{c'!=c} dist(c', c) |
| final double[] s = new double[k]; |
| Arrays.fill(s, Double.MAX_VALUE); |
| // Store the matrix of distances between all cluster centers, e.g. dcc[c1][c2] = distance(c1, c2) |
| final double[][] dcc = new double[k][k]; |
| |
| // For each point keeps the upper bound distance to the cluster center. |
| final double[] u = new double[n]; |
| Arrays.fill(u, Double.MAX_VALUE); |
| |
| // For each point and for each cluster keeps the lower bound for the distance between the point and cluster |
| final double[][] l = new double[n][k]; |
| |
| // Seed initial set of cluster centers. |
| final double[][] centers = seed(pointsList); |
| |
| // Points partitioning induced by cluster centers, e.g. for point xi the value of partitions[xi] indicates |
| // the cluster or index of the cluster center which is closest to xi. partitions[xi] = min_{c} distance(xi, c). |
| final int[] partitions = partitionPoints(pointsList, centers, u, l); |
| |
| final double[] deltas = new double[k]; |
| VectorialMean[] means = new VectorialMean[k]; |
| for (int it = 0, max = getMaxIterations(); |
| it < max; |
| it++) { |
| int changes = 0; |
| // Step I. |
| // Compute inter-cluster distances. |
| updateIntraCentersDistances(centers, dcc, s); |
| |
| for (int xi = 0; xi < n; xi++) { |
| boolean r = true; |
| |
| // Step II. |
| if (u[xi] <= s[partitions[xi]]) { |
| continue; |
| } |
| |
| for (int c = 0; c < k; c++) { |
| // Check condition III. |
| if (isSkipNext(partitions, u, l, dcc, xi, c)) { |
| continue; |
| } |
| |
| final double[] x = pointsList.get(xi).getPoint(); |
| |
| // III(a) |
| if (r) { |
| u[xi] = distance(x, centers[partitions[xi]]); |
| l[xi][partitions[xi]] = u[xi]; |
| r = false; |
| } |
| // III(b) |
| if (u[xi] > l[xi][c] || u[xi] > dcc[partitions[xi]][c]) { |
| l[xi][c] = distance(x, centers[c]); |
| if (l[xi][c] < u[xi]) { |
| partitions[xi] = c; |
| u[xi] = l[xi][c]; |
| ++changes; |
| } |
| } |
| } |
| } |
| |
| // Stopping criterion. |
| if (changes == 0 && |
| it != 0) { // First iteration needed (to update bounds). |
| break; |
| } |
| |
| // Step IV. |
| Arrays.fill(means, null); |
| for (int i = 0; i < n; i++) { |
| if (means[partitions[i]] == null) { |
| means[partitions[i]] = new VectorialMean(dim); |
| } |
| means[partitions[i]].increment(pointsList.get(i).getPoint()); |
| } |
| |
| for (int i = 0; i < k; i++) { |
| deltas[i] = distance(centers[i], means[i].getResult()); |
| centers[i] = means[i].getResult(); |
| } |
| |
| updateBounds(partitions, u, l, deltas); |
| } |
| |
| return buildResults(pointsList, partitions, centers); |
| } |
| |
| /** |
| * kmeans++ seeding which provides guarantee of resulting with log(k) approximation |
| * for final clustering results |
| * <p> |
| * Arthur, David, and Sergei Vassilvitskii. "k-means++: The advantages of careful seeding." |
| * Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. |
| * Society for Industrial and Applied Mathematics, 2007. |
| * |
| * @param points input data points |
| * @return an array of initial clusters centers |
| * |
| */ |
| private double[][] seed(final List<T> points) { |
| final int k = getNumberOfClusters(); |
| final UniformRandomProvider random = getRandomGenerator(); |
| |
| final double[][] result = new double[k][]; |
| final int n = points.size(); |
| final int pointIndex = random.nextInt(n); |
| |
| final double[] minDistances = new double[n]; |
| |
| int idx = 0; |
| result[idx] = points.get(pointIndex).getPoint(); |
| |
| double sumSqDist = 0; |
| |
| for (int i = 0; i < n; i++) { |
| final double d = distance(result[idx], points.get(i).getPoint()); |
| minDistances[i] = d * d; |
| sumSqDist += minDistances[i]; |
| } |
| |
| while (++idx < k) { |
| final double p = sumSqDist * random.nextDouble(); |
| int next = 0; |
| for (double cdf = 0; cdf < p; next++) { |
| cdf += minDistances[next]; |
| } |
| |
| result[idx] = points.get(next - 1).getPoint(); |
| for (int i = 0; i < n; i++) { |
| final double d = distance(result[idx], points.get(i).getPoint()); |
| sumSqDist -= minDistances[i]; |
| minDistances[i] = Math.min(minDistances[i], d * d); |
| sumSqDist += minDistances[i]; |
| } |
| } |
| |
| return result; |
| } |
| |
| |
| /** |
| * Once initial centers are chosen, we can actually go through data points and assign points to the |
| * cluster based on the distance between initial centers and points. |
| * |
| * @param pointsList data points list |
| * @param centers current clusters centers |
| * @param u points upper bounds |
| * @param l lower bounds for points to clusters centers |
| * |
| * @return initial assignment of points into clusters |
| */ |
| private int[] partitionPoints(List<T> pointsList, |
| double[][] centers, |
| double[] u, |
| double[][] l) { |
| final int k = getNumberOfClusters(); |
| final int n = pointsList.size(); |
| // Points assignments vector. |
| final int[] assignments = new int[n]; |
| Arrays.fill(assignments, -1); |
| // Need to assign points to the clusters for the first time and intitialize the lower bound l(x, c) |
| for (int i = 0; i < n; i++) { |
| final double[] x = pointsList.get(i).getPoint(); |
| for (int j = 0; j < k; j++) { |
| l[i][j] = distance(x, centers[j]); // l(x, c) = d(x, c) |
| if (u[i] > l[i][j]) { |
| u[i] = l[i][j]; // u(x) = min_c d(x, c) |
| assignments[i] = j; // c(x) = argmin_c d(x, c) |
| } |
| } |
| } |
| return assignments; |
| } |
| |
| /** |
| * Updated distances between clusters centers and for each cluster |
| * pick the closest neighbour and keep distance to it. |
| * |
| * @param centers cluster centers |
| * @param dcc matrix of distance between clusters centers, e.g. |
| * {@code dcc[i][j] = distance(centers[i], centers[j])} |
| * @param s For a given cluster, {@code s[si]} holds distance value |
| * to the closest cluster center. |
| */ |
| private void updateIntraCentersDistances(double[][] centers, |
| double[][] dcc, |
| double[] s) { |
| final int k = getNumberOfClusters(); |
| for (int i = 0; i < k; i++) { |
| // Since distance(xi, xj) == distance(xj, xi), we need to update |
| // only upper or lower triangle of the distances matrix and mirror |
| // to the lower of upper triangle accordingly, trace has to be all |
| // zeros, since distance(xi, xi) == 0. |
| for (int j = i + 1; j < k; j++) { |
| dcc[i][j] = 0.5 * distance(centers[i], centers[j]); |
| dcc[j][i] = dcc[i][j]; |
| if (dcc[i][j] < s[i]) { |
| s[i] = dcc[i][j]; |
| } |
| if (dcc[j][i] < s[j]) { |
| s[j] = dcc[j][i]; |
| } |
| } |
| } |
| } |
| |
| /** |
| * For given points and and cluster, check condition (3) of Elkan algorithm. |
| * |
| * <ul> |
| * <li>c is not the cluster xi assigned to</li> |
| * <li>{@code u[xi] > l[xi][x]} upper bound for point xi is greater than |
| * lower bound between xi and some cluster c</li> |
| * <li>{@code u[xi] > 1/2 * d(c(xi), c)} upper bound is greater than |
| * distance between center of xi's cluster and c</li> |
| * </ul> |
| * |
| * @param partitions current partition of points into clusters |
| * @param u upper bounds for points |
| * @param l lower bounds for distance between cluster centers and points |
| * @param dcc matrix of distance between clusters centers |
| * @param xi index of the point |
| * @param c index of the cluster |
| * @return true if conditions above satisfied false otherwise |
| */ |
| private static boolean isSkipNext(int[] partitions, |
| double[] u, |
| double[][] l, |
| double[][] dcc, |
| int xi, |
| int c) { |
| return c == partitions[xi] || |
| u[xi] <= l[xi][c] || |
| u[xi] <= dcc[partitions[xi]][c]; |
| } |
| |
| /** |
| * Once kmeans iterations have been converged and no more movements, we can build up the final |
| * resulted list of cluster centroids ({@link CentroidCluster}) and assign input points based |
| * on the converged partitioning. |
| * |
| * @param pointsList list of data points |
| * @param partitions current partition of points into clusters |
| * @param centers cluster centers |
| * @return cluster partitioning |
| */ |
| private List<CentroidCluster<T>> buildResults(List<T> pointsList, |
| int[] partitions, |
| double[][] centers) { |
| final int k = getNumberOfClusters(); |
| final List<CentroidCluster<T>> result = new ArrayList<>(); |
| for (int i = 0; i < k; i++) { |
| final CentroidCluster<T> cluster = new CentroidCluster<>(new DoublePoint(centers[i])); |
| result.add(cluster); |
| } |
| for (int i = 0; i < pointsList.size(); i++) { |
| result.get(partitions[i]).addPoint(pointsList.get(i)); |
| } |
| return result; |
| } |
| |
| /** |
| * Based on the distance that cluster center has moved we need to update our upper and lower bound. |
| * Worst case assumption, the center of the assigned to given cluster moves away from the point, while |
| * centers of over clusters become closer. |
| * |
| * @param partitions current points assiments to the clusters |
| * @param u points upper bounds |
| * @param l lower bounds for distances between point and corresponding cluster |
| * @param deltas the movement delta for each cluster center |
| */ |
| private void updateBounds(int[] partitions, |
| double[] u, |
| double[][] l, |
| double[] deltas) { |
| final int k = getNumberOfClusters(); |
| for (int i = 0; i < partitions.length; i++) { |
| u[i] += deltas[partitions[i]]; |
| for (int j = 0; j < k; j++) { |
| l[i][j] = Math.max(0, l[i][j] - deltas[j]); |
| } |
| } |
| } |
| |
| /** |
| * @param a Coordinates. |
| * @param b Coordinates. |
| * @return the distance between {@code a} and {@code b}. |
| */ |
| private double distance(final double[] a, |
| final double[] b) { |
| return getDistanceMeasure().compute(a, b); |
| } |
| } |