| |
| package com.yahoo.labs.samoa.moa.clusterers; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2010 RWTH Aachen University, Germany |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import com.yahoo.labs.samoa.moa.cluster.CFCluster; |
| import com.yahoo.labs.samoa.moa.cluster.Cluster; |
| import com.yahoo.labs.samoa.moa.cluster.Clustering; |
| import com.yahoo.labs.samoa.moa.cluster.SphereCluster; |
| |
| /** |
| * A kMeans implementation for microclusterings. For now it only uses the real centers of the |
| * groundtruthclustering for implementation. There should also be an option to use random |
| * centers. |
| * TODO: random centers |
| * TODO: Create a macro clustering interface to make different macro clustering algorithms available |
| * to micro clustering algorithms like clustream, denstream and clustree |
| * |
| */ |
| public class KMeans { |
| |
| /** |
| * This kMeans implementation clusters a big number of microclusters |
| * into a smaller amount of macro clusters. To make it comparable to other |
| * algorithms it uses the real centers of the ground truth macro clustering |
| * to have the best possible initialization. The quality of resulting |
| * macro clustering yields an upper bound for kMeans on the underlying |
| * microclustering. |
| * |
| * @param centers of the ground truth clustering |
| * @param data list of microclusters |
| * @return |
| */ |
| public static Clustering kMeans(Cluster[] centers, List<? extends Cluster> data ) { |
| int k = centers.length; |
| |
| int dimensions = centers[0].getCenter().length; |
| |
| ArrayList<ArrayList<Cluster>> clustering = |
| new ArrayList<ArrayList<Cluster>>(); |
| for ( int i = 0; i < k; i++ ) { |
| clustering.add( new ArrayList<Cluster>() ); |
| } |
| |
| int repetitions = 100; |
| while ( repetitions-- >= 0 ) { |
| // Assign points to clusters |
| for ( Cluster point : data ) { |
| double minDistance = distance( point.getCenter(), centers[0].getCenter() ); |
| int closestCluster = 0; |
| for ( int i = 1; i < k; i++ ) { |
| double distance = distance( point.getCenter(), centers[i].getCenter() ); |
| if ( distance < minDistance ) { |
| closestCluster = i; |
| minDistance = distance; |
| } |
| } |
| |
| clustering.get( closestCluster ).add( point ); |
| } |
| |
| // Calculate new centers and clear clustering lists |
| SphereCluster[] newCenters = new SphereCluster[centers.length]; |
| for ( int i = 0; i < k; i++ ) { |
| newCenters[i] = calculateCenter( clustering.get( i ), dimensions ); |
| clustering.get( i ).clear(); |
| } |
| centers = newCenters; |
| } |
| |
| return new Clustering( centers ); |
| } |
| |
| private static double distance(double[] pointA, double [] pointB){ |
| double distance = 0.0; |
| for (int i = 0; i < pointA.length; i++) { |
| double d = pointA[i] - pointB[i]; |
| distance += d * d; |
| } |
| return Math.sqrt(distance); |
| } |
| |
| |
| private static SphereCluster calculateCenter( ArrayList<Cluster> cluster, int dimensions ) { |
| double[] res = new double[dimensions]; |
| for ( int i = 0; i < res.length; i++ ) { |
| res[i] = 0.0; |
| } |
| |
| if ( cluster.size() == 0 ) { |
| return new SphereCluster( res, 0.0 ); |
| } |
| |
| for ( Cluster point : cluster ) { |
| double [] center = point.getCenter(); |
| for (int i = 0; i < res.length; i++) { |
| res[i] += center[i]; |
| } |
| } |
| |
| // Normalize |
| for ( int i = 0; i < res.length; i++ ) { |
| res[i] /= cluster.size(); |
| } |
| |
| // Calculate radius |
| double radius = 0.0; |
| for ( Cluster point : cluster ) { |
| double dist = distance( res, point.getCenter() ); |
| if ( dist > radius ) { |
| radius = dist; |
| } |
| } |
| |
| return new SphereCluster( res, radius ); |
| } |
| |
| public static Clustering gaussianMeans(Clustering gtClustering, Clustering clustering) { |
| ArrayList<CFCluster> microclusters = new ArrayList<CFCluster>(); |
| for (int i = 0; i < clustering.size(); i++) { |
| if (clustering.get(i) instanceof CFCluster) { |
| microclusters.add((CFCluster)clustering.get(i)); |
| } |
| else{ |
| System.out.println("Unsupported Cluster Type:"+clustering.get(i).getClass() |
| +". Cluster needs to extend moa.cluster.CFCluster"); |
| } |
| } |
| Cluster[] centers = new Cluster[gtClustering.size()]; |
| for (int i = 0; i < centers.length; i++) { |
| centers[i] = gtClustering.get(i); |
| |
| } |
| |
| int k = centers.length; |
| if ( microclusters.size() < k ) { |
| return new Clustering( new Cluster[0]); |
| } |
| |
| Clustering kMeansResult = kMeans( centers, microclusters ); |
| |
| k = kMeansResult.size(); |
| CFCluster[] res = new CFCluster[ k ]; |
| |
| for ( CFCluster microcluster : microclusters) { |
| // Find closest kMeans cluster |
| double minDistance = Double.MAX_VALUE; |
| int closestCluster = 0; |
| for ( int i = 0; i < k; i++ ) { |
| double distance = distance( kMeansResult.get(i).getCenter(), microcluster.getCenter() ); |
| if ( distance < minDistance ) { |
| closestCluster = i; |
| minDistance = distance; |
| } |
| } |
| |
| // Add to cluster |
| if ( res[closestCluster] == null ) { |
| res[closestCluster] = (CFCluster)microcluster.copy(); |
| } else { |
| res[closestCluster].add(microcluster); |
| } |
| } |
| |
| // Clean up res |
| int count = 0; |
| for ( int i = 0; i < res.length; i++ ) { |
| if ( res[i] != null ) |
| ++count; |
| } |
| |
| CFCluster[] cleaned = new CFCluster[count]; |
| count = 0; |
| for ( int i = 0; i < res.length; i++ ) { |
| if ( res[i] != null ) |
| cleaned[count++] = res[i]; |
| } |
| |
| return new Clustering( cleaned ); |
| } |
| |
| } |