| package com.yahoo.labs.samoa.moa.clusterers; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2014 - 2015 Apache Software Foundation |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import com.yahoo.labs.samoa.moa.cluster.CFCluster; |
| import com.yahoo.labs.samoa.moa.cluster.Cluster; |
| import com.yahoo.labs.samoa.moa.cluster.Clustering; |
| import com.yahoo.labs.samoa.moa.cluster.SphereCluster; |
| |
| /** |
| * A kMeans implementation for microclusterings. For now it only uses the real centers of the groundtruthclustering for |
| * implementation. There should also be an option to use random centers. TODO: random centers TODO: Create a macro |
| * clustering interface to make different macro clustering algorithms available to micro clustering algorithms like |
| * clustream, denstream and clustree |
| * |
| */ |
| public class KMeans { |
| |
| /** |
| * This kMeans implementation clusters a big number of microclusters into a smaller amount of macro clusters. To make |
| * it comparable to other algorithms it uses the real centers of the ground truth macro clustering to have the best |
| * possible initialization. The quality of resulting macro clustering yields an upper bound for kMeans on the |
| * underlying microclustering. |
| * |
| * @param centers |
| * of the ground truth clustering |
| * @param data |
| * list of microclusters |
| * @return |
| */ |
| public static Clustering kMeans(Cluster[] centers, List<? extends Cluster> data) { |
| int k = centers.length; |
| |
| int dimensions = centers[0].getCenter().length; |
| |
| ArrayList<ArrayList<Cluster>> clustering = |
| new ArrayList<ArrayList<Cluster>>(); |
| for (int i = 0; i < k; i++) { |
| clustering.add(new ArrayList<Cluster>()); |
| } |
| |
| int repetitions = 100; |
| while (repetitions-- >= 0) { |
| // Assign points to clusters |
| for (Cluster point : data) { |
| double minDistance = distance(point.getCenter(), centers[0].getCenter()); |
| int closestCluster = 0; |
| for (int i = 1; i < k; i++) { |
| double distance = distance(point.getCenter(), centers[i].getCenter()); |
| if (distance < minDistance) { |
| closestCluster = i; |
| minDistance = distance; |
| } |
| } |
| |
| clustering.get(closestCluster).add(point); |
| } |
| |
| // Calculate new centers and clear clustering lists |
| SphereCluster[] newCenters = new SphereCluster[centers.length]; |
| for (int i = 0; i < k; i++) { |
| newCenters[i] = calculateCenter(clustering.get(i), dimensions); |
| clustering.get(i).clear(); |
| } |
| centers = newCenters; |
| } |
| |
| return new Clustering(centers); |
| } |
| |
| private static double distance(double[] pointA, double[] pointB) { |
| double distance = 0.0; |
| for (int i = 0; i < pointA.length; i++) { |
| double d = pointA[i] - pointB[i]; |
| distance += d * d; |
| } |
| return Math.sqrt(distance); |
| } |
| |
| private static SphereCluster calculateCenter(ArrayList<Cluster> cluster, int dimensions) { |
| double[] res = new double[dimensions]; |
| for (int i = 0; i < res.length; i++) { |
| res[i] = 0.0; |
| } |
| |
| if (cluster.size() == 0) { |
| return new SphereCluster(res, 0.0); |
| } |
| |
| for (Cluster point : cluster) { |
| double[] center = point.getCenter(); |
| for (int i = 0; i < res.length; i++) { |
| res[i] += center[i]; |
| } |
| } |
| |
| // Normalize |
| for (int i = 0; i < res.length; i++) { |
| res[i] /= cluster.size(); |
| } |
| |
| // Calculate radius |
| double radius = 0.0; |
| for (Cluster point : cluster) { |
| double dist = distance(res, point.getCenter()); |
| if (dist > radius) { |
| radius = dist; |
| } |
| } |
| |
| return new SphereCluster(res, radius); |
| } |
| |
| public static Clustering gaussianMeans(Clustering gtClustering, Clustering clustering) { |
| ArrayList<CFCluster> microclusters = new ArrayList<CFCluster>(); |
| for (int i = 0; i < clustering.size(); i++) { |
| if (clustering.get(i) instanceof CFCluster) { |
| microclusters.add((CFCluster) clustering.get(i)); |
| } |
| else { |
| System.out.println("Unsupported Cluster Type:" + clustering.get(i).getClass() |
| + ". Cluster needs to extend moa.cluster.CFCluster"); |
| } |
| } |
| Cluster[] centers = new Cluster[gtClustering.size()]; |
| for (int i = 0; i < centers.length; i++) { |
| centers[i] = gtClustering.get(i); |
| |
| } |
| |
| int k = centers.length; |
| if (microclusters.size() < k) { |
| return new Clustering(new Cluster[0]); |
| } |
| |
| Clustering kMeansResult = kMeans(centers, microclusters); |
| |
| k = kMeansResult.size(); |
| CFCluster[] res = new CFCluster[k]; |
| |
| for (CFCluster microcluster : microclusters) { |
| // Find closest kMeans cluster |
| double minDistance = Double.MAX_VALUE; |
| int closestCluster = 0; |
| for (int i = 0; i < k; i++) { |
| double distance = distance(kMeansResult.get(i).getCenter(), microcluster.getCenter()); |
| if (distance < minDistance) { |
| closestCluster = i; |
| minDistance = distance; |
| } |
| } |
| |
| // Add to cluster |
| if (res[closestCluster] == null) { |
| res[closestCluster] = (CFCluster) microcluster.copy(); |
| } else { |
| res[closestCluster].add(microcluster); |
| } |
| } |
| |
| // Clean up res |
| int count = 0; |
| for (int i = 0; i < res.length; i++) { |
| if (res[i] != null) |
| ++count; |
| } |
| |
| CFCluster[] cleaned = new CFCluster[count]; |
| count = 0; |
| for (int i = 0; i < res.length; i++) { |
| if (res[i] != null) |
| cleaned[count++] = res[i]; |
| } |
| |
| return new Clustering(cleaned); |
| } |
| |
| } |