blob: 8eeaeba9578fa302b33ea2849ceed692d35e7d3c [file] [log] [blame]
package org.apache.samoa.moa.clusterers;
/*
* #%L
* SAMOA
* %%
* Copyright (C) 2014 - 2015 Apache Software Foundation
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.util.ArrayList;
import java.util.List;
import org.apache.samoa.moa.cluster.CFCluster;
import org.apache.samoa.moa.cluster.Cluster;
import org.apache.samoa.moa.cluster.Clustering;
import org.apache.samoa.moa.cluster.SphereCluster;
/**
* A kMeans implementation for microclusterings. For now it only uses the real centers of the groundtruthclustering for
* implementation. There should also be an option to use random centers. TODO: random centers TODO: Create a macro
* clustering interface to make different macro clustering algorithms available to micro clustering algorithms like
* clustream, denstream and clustree
*
*/
public class KMeans {
/**
* This kMeans implementation clusters a big number of microclusters into a smaller amount of macro clusters. To make
* it comparable to other algorithms it uses the real centers of the ground truth macro clustering to have the best
* possible initialization. The quality of resulting macro clustering yields an upper bound for kMeans on the
* underlying microclustering.
*
* @param centers
* of the ground truth clustering
* @param data
* list of microclusters
* @return
*/
public static Clustering kMeans(Cluster[] centers, List<? extends Cluster> data) {
int k = centers.length;
int dimensions = centers[0].getCenter().length;
ArrayList<ArrayList<Cluster>> clustering =
new ArrayList<ArrayList<Cluster>>();
for (int i = 0; i < k; i++) {
clustering.add(new ArrayList<Cluster>());
}
int repetitions = 100;
while (repetitions-- >= 0) {
// Assign points to clusters
for (Cluster point : data) {
double minDistance = distance(point.getCenter(), centers[0].getCenter());
int closestCluster = 0;
for (int i = 1; i < k; i++) {
double distance = distance(point.getCenter(), centers[i].getCenter());
if (distance < minDistance) {
closestCluster = i;
minDistance = distance;
}
}
clustering.get(closestCluster).add(point);
}
// Calculate new centers and clear clustering lists
SphereCluster[] newCenters = new SphereCluster[centers.length];
for (int i = 0; i < k; i++) {
newCenters[i] = calculateCenter(clustering.get(i), dimensions);
clustering.get(i).clear();
}
centers = newCenters;
}
return new Clustering(centers);
}
private static double distance(double[] pointA, double[] pointB) {
double distance = 0.0;
for (int i = 0; i < pointA.length; i++) {
double d = pointA[i] - pointB[i];
distance += d * d;
}
return Math.sqrt(distance);
}
private static SphereCluster calculateCenter(ArrayList<Cluster> cluster, int dimensions) {
double[] res = new double[dimensions];
for (int i = 0; i < res.length; i++) {
res[i] = 0.0;
}
if (cluster.size() == 0) {
return new SphereCluster(res, 0.0);
}
for (Cluster point : cluster) {
double[] center = point.getCenter();
for (int i = 0; i < res.length; i++) {
res[i] += center[i];
}
}
// Normalize
for (int i = 0; i < res.length; i++) {
res[i] /= cluster.size();
}
// Calculate radius
double radius = 0.0;
for (Cluster point : cluster) {
double dist = distance(res, point.getCenter());
if (dist > radius) {
radius = dist;
}
}
return new SphereCluster(res, radius);
}
public static Clustering gaussianMeans(Clustering gtClustering, Clustering clustering) {
ArrayList<CFCluster> microclusters = new ArrayList<CFCluster>();
for (int i = 0; i < clustering.size(); i++) {
if (clustering.get(i) instanceof CFCluster) {
microclusters.add((CFCluster) clustering.get(i));
}
else {
System.out.println("Unsupported Cluster Type:" + clustering.get(i).getClass()
+ ". Cluster needs to extend moa.cluster.CFCluster");
}
}
Cluster[] centers = new Cluster[gtClustering.size()];
for (int i = 0; i < centers.length; i++) {
centers[i] = gtClustering.get(i);
}
int k = centers.length;
if (microclusters.size() < k) {
return new Clustering(new Cluster[0]);
}
Clustering kMeansResult = kMeans(centers, microclusters);
k = kMeansResult.size();
CFCluster[] res = new CFCluster[k];
for (CFCluster microcluster : microclusters) {
// Find closest kMeans cluster
double minDistance = Double.MAX_VALUE;
int closestCluster = 0;
for (int i = 0; i < k; i++) {
double distance = distance(kMeansResult.get(i).getCenter(), microcluster.getCenter());
if (distance < minDistance) {
closestCluster = i;
minDistance = distance;
}
}
// Add to cluster
if (res[closestCluster] == null) {
res[closestCluster] = (CFCluster) microcluster.copy();
} else {
res[closestCluster].add(microcluster);
}
}
// Clean up res
int count = 0;
for (int i = 0; i < res.length; i++) {
if (res[i] != null)
++count;
}
CFCluster[] cleaned = new CFCluster[count];
count = 0;
for (int i = 0; i < res.length; i++) {
if (res[i] != null)
cleaned[count++] = res[i];
}
return new Clustering(cleaned);
}
}