| package com.yahoo.labs.samoa.moa.cluster; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2014 - 2015 Apache Software Foundation |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import com.yahoo.labs.samoa.moa.AbstractMOAObject; |
| import com.yahoo.labs.samoa.moa.core.AutoExpandVector; |
| import com.yahoo.labs.samoa.moa.core.DataPoint; |
| import com.yahoo.labs.samoa.instances.Attribute; |
| import com.yahoo.labs.samoa.instances.Instance; |
| |
| public class Clustering extends AbstractMOAObject { |
| |
| private AutoExpandVector<Cluster> clusters; |
| |
| public Clustering() { |
| this.clusters = new AutoExpandVector<Cluster>(); |
| } |
| |
| public Clustering(Cluster[] clusters) { |
| this.clusters = new AutoExpandVector<Cluster>(); |
| for (int i = 0; i < clusters.length; i++) { |
| this.clusters.add(clusters[i]); |
| } |
| } |
| |
| public Clustering(List<? extends Instance> points) { |
| HashMap<Integer, Integer> labelMap = classValues(points); |
| int dim = points.get(0).dataset().numAttributes() - 1; |
| |
| int numClasses = labelMap.size(); |
| int noiseLabel; |
| |
| Attribute classLabel = points.get(0).dataset().classAttribute(); |
| int lastLabelIndex = classLabel.numValues() - 1; |
| if (classLabel.value(lastLabelIndex) == "noise") { |
| noiseLabel = lastLabelIndex; |
| } else { |
| noiseLabel = -1; |
| } |
| |
| ArrayList<Instance>[] sorted_points = (ArrayList<Instance>[]) new ArrayList[numClasses]; |
| for (int i = 0; i < numClasses; i++) { |
| sorted_points[i] = new ArrayList<Instance>(); |
| } |
| for (Instance point : points) { |
| int clusterid = (int) point.classValue(); |
| if (clusterid == noiseLabel) |
| continue; |
| sorted_points[labelMap.get(clusterid)].add((Instance) point); |
| } |
| this.clusters = new AutoExpandVector<Cluster>(); |
| for (int i = 0; i < numClasses; i++) { |
| if (sorted_points[i].size() > 0) { |
| SphereCluster s = new SphereCluster(sorted_points[i], dim); |
| s.setId(sorted_points[i].get(0).classValue()); |
| s.setGroundTruth(sorted_points[i].get(0).classValue()); |
| clusters.add(s); |
| } |
| } |
| } |
| |
| public Clustering(ArrayList<DataPoint> points, double overlapThreshold, int initMinPoints) { |
| HashMap<Integer, Integer> labelMap = Clustering.classValues(points); |
| int dim = points.get(0).dataset().numAttributes() - 1; |
| |
| int numClasses = labelMap.size(); |
| int num = 0; |
| |
| ArrayList<DataPoint>[] sorted_points = (ArrayList<DataPoint>[]) new ArrayList[numClasses]; |
| for (int i = 0; i < numClasses; i++) { |
| sorted_points[i] = new ArrayList<DataPoint>(); |
| } |
| for (DataPoint point : points) { |
| int clusterid = (int) point.classValue(); |
| if (clusterid == -1) |
| continue; |
| sorted_points[labelMap.get(clusterid)].add(point); |
| num++; |
| } |
| |
| clusters = new AutoExpandVector<Cluster>(); |
| int microID = 0; |
| for (int i = 0; i < numClasses; i++) { |
| ArrayList<SphereCluster> microByClass = new ArrayList<SphereCluster>(); |
| ArrayList<DataPoint> pointInCluster = new ArrayList<DataPoint>(); |
| ArrayList<ArrayList<Instance>> pointInMicroClusters = new ArrayList(); |
| |
| pointInCluster.addAll(sorted_points[i]); |
| while (pointInCluster.size() > 0) { |
| ArrayList<Instance> micro_points = new ArrayList<Instance>(); |
| for (int j = 0; j < initMinPoints && !pointInCluster.isEmpty(); j++) { |
| micro_points.add((Instance) pointInCluster.get(0)); |
| pointInCluster.remove(0); |
| } |
| if (micro_points.size() > 0) { |
| SphereCluster s = new SphereCluster(micro_points, dim); |
| for (int c = 0; c < microByClass.size(); c++) { |
| if (((SphereCluster) microByClass.get(c)).overlapRadiusDegree(s) > overlapThreshold) { |
| micro_points.addAll(pointInMicroClusters.get(c)); |
| s = new SphereCluster(micro_points, dim); |
| pointInMicroClusters.remove(c); |
| microByClass.remove(c); |
| // System.out.println("Removing redundant cluster based on radius overlap"+c); |
| } |
| } |
| for (int j = 0; j < pointInCluster.size(); j++) { |
| Instance instance = pointInCluster.get(j); |
| if (s.getInclusionProbability(instance) > 0.8) { |
| pointInCluster.remove(j); |
| micro_points.add(instance); |
| } |
| } |
| s.setWeight(micro_points.size()); |
| microByClass.add(s); |
| pointInMicroClusters.add(micro_points); |
| microID++; |
| } |
| } |
| // |
| boolean changed = true; |
| while (changed) { |
| changed = false; |
| for (int c = 0; c < microByClass.size(); c++) { |
| for (int c1 = c + 1; c1 < microByClass.size(); c1++) { |
| double overlap = microByClass.get(c).overlapRadiusDegree(microByClass.get(c1)); |
| // System.out.println("Overlap C"+(clustering.size()+c)+" ->C"+(clustering.size()+c1)+": "+overlap); |
| if (overlap > overlapThreshold) { |
| pointInMicroClusters.get(c).addAll(pointInMicroClusters.get(c1)); |
| SphereCluster s = new SphereCluster(pointInMicroClusters.get(c), dim); |
| microByClass.set(c, s); |
| pointInMicroClusters.remove(c1); |
| microByClass.remove(c1); |
| changed = true; |
| break; |
| } |
| } |
| } |
| } |
| for (int j = 0; j < microByClass.size(); j++) { |
| microByClass.get(j).setGroundTruth(sorted_points[i].get(0).classValue()); |
| clusters.add(microByClass.get(j)); |
| } |
| |
| } |
| for (int j = 0; j < clusters.size(); j++) { |
| clusters.get(j).setId(j); |
| } |
| |
| } |
| |
| /** |
| * @param points |
| * @return an array with the min and max class label value |
| */ |
| public static HashMap<Integer, Integer> classValues(List<? extends Instance> points) { |
| HashMap<Integer, Integer> classes = new HashMap<Integer, Integer>(); |
| int workcluster = 0; |
| boolean hasnoise = false; |
| for (int i = 0; i < points.size(); i++) { |
| int label = (int) points.get(i).classValue(); |
| if (label == -1) { |
| hasnoise = true; |
| } |
| else { |
| if (!classes.containsKey(label)) { |
| classes.put(label, workcluster); |
| workcluster++; |
| } |
| } |
| } |
| if (hasnoise) |
| classes.put(-1, workcluster); |
| return classes; |
| } |
| |
| public Clustering(AutoExpandVector<Cluster> clusters) { |
| this.clusters = clusters; |
| } |
| |
| /** |
| * add a cluster to the clustering |
| */ |
| public void add(Cluster cluster) { |
| clusters.add(cluster); |
| } |
| |
| /** |
| * remove a cluster from the clustering |
| */ |
| public void remove(int index) { |
| if (index < clusters.size()) { |
| clusters.remove(index); |
| } |
| } |
| |
| /** |
| * remove a cluster from the clustering |
| */ |
| public Cluster get(int index) { |
| if (index < clusters.size()) { |
| return clusters.get(index); |
| } |
| return null; |
| } |
| |
| /** |
| * @return the <code>Clustering</code> as an AutoExpandVector |
| */ |
| public AutoExpandVector<Cluster> getClustering() { |
| return clusters; |
| } |
| |
| /** |
| * @return A deepcopy of the <code>Clustering</code> as an AutoExpandVector |
| */ |
| public AutoExpandVector<Cluster> getClusteringCopy() { |
| return (AutoExpandVector<Cluster>) clusters.copy(); |
| } |
| |
| /** |
| * @return the number of clusters |
| */ |
| public int size() { |
| return clusters.size(); |
| } |
| |
| /** |
| * @return the number of dimensions of this clustering |
| */ |
| public int dimension() { |
| assert (clusters.size() != 0); |
| return clusters.get(0).getCenter().length; |
| } |
| |
| @Override |
| public void getDescription(StringBuilder sb, int i) { |
| sb.append("Clustering Object"); |
| } |
| |
| public double getMaxInclusionProbability(Instance point) { |
| double maxInclusion = 0.0; |
| for (int i = 0; i < clusters.size(); i++) { |
| maxInclusion = Math.max(clusters.get(i).getInclusionProbability(point), |
| maxInclusion); |
| } |
| return maxInclusion; |
| } |
| |
| } |