| package com.yahoo.labs.samoa.evaluation.measures; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2011 RWTH Aachen University, Germany |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import com.yahoo.labs.samoa.evaluation.measures.CMM_GTAnalysis.CMMPoint; |
| import com.yahoo.labs.samoa.moa.cluster.Cluster; |
| import com.yahoo.labs.samoa.moa.cluster.Clustering; |
| import com.yahoo.labs.samoa.moa.cluster.SphereCluster; |
| import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; |
| import com.yahoo.labs.samoa.moa.core.DataPoint; |
| import java.util.ArrayList; |
| |
| /** |
| * [CMM.java] |
| * |
| * CMM: Main class |
| * |
| * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 |
| * |
| * @author Timm jansen |
| * Data Management and Data Exploration Group, RWTH Aachen University |
| */ |
| |
| public class CMM extends MeasureCollection{ |
| |
| private static final long serialVersionUID = 1L; |
| |
| /** |
| * found clustering |
| */ |
| private Clustering clustering; |
| |
| /** |
| * the ground truth analysis |
| */ |
| private CMM_GTAnalysis gtAnalysis; |
| |
| /** |
| * number of points within the horizon |
| */ |
| private int numPoints; |
| |
| /** |
| * number of clusters in the found clustering |
| */ |
| private int numFClusters; |
| |
| /** |
| * number of cluster in the adjusted groundtruth clustering that |
| * was calculated through the groundtruth analysis |
| */ |
| private int numGT0Classes; |
| |
| /** |
| * match found clusters to GT clusters |
| */ |
| private int matchMap[]; |
| |
| /** |
| * pointInclusionProbFC[p][C] contains the probability of point p |
| * being included in cluster C |
| */ |
| private double[][] pointInclusionProbFC; |
| |
| /** |
| * threshold that defines when a point is being considered belonging to a cluster |
| */ |
| private double pointInclusionProbThreshold = 0.5; |
| |
| /** |
| * parameterize the error weight of missed points (default 1) |
| */ |
| private double lamdaMissed = 1; |
| |
| |
| /** |
| * enable/disable debug mode |
| */ |
| public boolean debug = false; |
| |
| |
| /** |
| * enable/disable class merge (main feature of ground truth analysis) |
| */ |
| public boolean enableClassMerge = true; |
| |
| /** |
| * enable/disable model error |
| * when enabled errors that are caused by the underling cluster model will not be counted |
| */ |
| public boolean enableModelError = true; |
| |
| |
| @Override |
| protected String[] getNames() { |
| String[] names = {"CMM","CMM Basic","CMM Missed","CMM Misplaced","CMM Noise", |
| "CA Seperability", "CA Noise", "CA Modell"}; |
| return names; |
| } |
| |
| @Override |
| protected boolean[] getDefaultEnabled() { |
| boolean [] defaults = {false, false, false, false, false, false, false, false}; |
| return defaults; |
| } |
| |
| |
| @Override |
| public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{ |
| this.clustering = clustering; |
| |
| numPoints = points.size(); |
| numFClusters = clustering.size(); |
| |
| gtAnalysis = new CMM_GTAnalysis(trueClustering, points, enableClassMerge); |
| |
| numGT0Classes = gtAnalysis.getNumberOfGT0Classes(); |
| |
| addValue("CA Seperability",gtAnalysis.getClassSeparability()); |
| addValue("CA Noise",gtAnalysis.getNoiseSeparability()); |
| addValue("CA Modell",gtAnalysis.getModelQuality()); |
| |
| /* init the matching and point distances */ |
| calculateMatching(); |
| |
| /* calculate the actual error */ |
| calculateError(); |
| } |
| |
| |
| /** |
| * calculates the CMM specific matching between found clusters and ground truth clusters |
| */ |
| private void calculateMatching(){ |
| |
| /** |
| * found cluster frequencies |
| */ |
| int[][] mapFC = new int[numFClusters][numGT0Classes]; |
| |
| /** |
| * ground truth cluster frequencies |
| */ |
| int[][] mapGT = new int[numGT0Classes][numGT0Classes]; |
| int [] sumsFC = new int[numFClusters]; |
| |
| //calculate fuzzy mapping from |
| pointInclusionProbFC = new double[numPoints][numFClusters]; |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = gtAnalysis.getPoint(p); |
| //found cluster frequencies |
| for (int fc = 0; fc < numFClusters; fc++) { |
| Cluster cl = clustering.get(fc); |
| pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp); |
| if (pointInclusionProbFC[p][fc] >= pointInclusionProbThreshold) { |
| //make sure we don't count points twice that are contained in two merged clusters |
| if(cmdp.isNoise()) continue; |
| mapFC[fc][cmdp.workclass()]++; |
| sumsFC[fc]++; |
| } |
| } |
| |
| //ground truth cluster frequencies |
| if(!cmdp.isNoise()){ |
| for(int hc = 0; hc < numGT0Classes;hc++){ |
| if(hc == cmdp.workclass()){ |
| mapGT[hc][hc]++; |
| } |
| else{ |
| if(gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1){ |
| mapGT[hc][cmdp.workclass()]++; |
| } |
| } |
| } |
| } |
| } |
| |
| //assign each found cluster to a hidden cluster |
| matchMap = new int[numFClusters]; |
| for (int fc = 0; fc < numFClusters; fc++) { |
| int matchIndex = -1; |
| //check if we only have one entry anyway |
| for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { |
| if(mapFC[fc][hc0]!=0){ |
| if(matchIndex == -1) |
| matchIndex = hc0; |
| else{ |
| matchIndex = -1; |
| break; |
| } |
| } |
| } |
| |
| //more then one entry, so look for most similar frequency profile |
| int minDiff = Integer.MAX_VALUE; |
| if(sumsFC[fc]!=0 && matchIndex == -1){ |
| ArrayList<Integer> fitCandidates = new ArrayList<Integer>(); |
| for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { |
| int errDiff = 0; |
| for (int hc1 = 0; hc1 < numGT0Classes; hc1++) { |
| //fc profile doesn't fit into current hc profile |
| double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1]; |
| if(freq_diff > 0){ |
| errDiff+= freq_diff; |
| } |
| } |
| if(errDiff == 0){ |
| fitCandidates.add(hc0); |
| } |
| if(errDiff < minDiff){ |
| minDiff = errDiff; |
| matchIndex = hc0; |
| } |
| if(debug){ |
| //System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff); |
| } |
| } |
| //if we have a fitting profile overwrite the min error choice |
| //if we have multiple fit candidates, use majority vote of corresponding classes |
| if(fitCandidates.size()!=0){ |
| int bestGTfit = fitCandidates.get(0); |
| for(int i = 1; i < fitCandidates.size(); i++){ |
| int GTfit = fitCandidates.get(i); |
| if(mapFC[fc][GTfit] > mapFC[fc][bestGTfit]) |
| bestGTfit=fitCandidates.get(i); |
| } |
| matchIndex = bestGTfit; |
| } |
| } |
| |
| matchMap[fc] = matchIndex; |
| int realMatch = -1; |
| if(matchIndex==-1){ |
| if(debug) |
| System.out.println("No cluster match: needs to be implemented?"); |
| } |
| else{ |
| realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel(); |
| } |
| clustering.get(fc).setMeasureValue("CMM Match", "C"+realMatch); |
| clustering.get(fc).setMeasureValue("CMM Workclass", "C"+matchMap[fc]); |
| } |
| |
| //print matching table |
| if(debug){ |
| for (int i = 0; i < numFClusters; i++) { |
| System.out.print("C"+((int)clustering.get(i).getId()) + " N:"+((int)clustering.get(i).getWeight())+" | "); |
| for (int j = 0; j < numGT0Classes; j++) { |
| System.out.print(mapFC[i][j] + " "); |
| } |
| System.out.print(" = "+sumsFC[i] + " | "); |
| String match = "-"; |
| if (matchMap[i]!=-1) { |
| match = Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel()); |
| } |
| System.out.println(" --> " + match + "(work:"+matchMap[i]+")"); |
| } |
| } |
| } |
| |
| |
| /** |
| * Calculate the actual error values |
| */ |
| private void calculateError(){ |
| int totalErrorCount = 0; |
| int totalRedundancy = 0; |
| int trueCoverage = 0; |
| int totalCoverage = 0; |
| |
| int numNoise = 0; |
| double errorNoise = 0; |
| double errorNoiseMax = 0; |
| |
| double errorMissed = 0; |
| double errorMissedMax = 0; |
| |
| double errorMisplaced = 0; |
| double errorMisplacedMax = 0; |
| |
| double totalError = 0.0; |
| double totalErrorMax = 0.0; |
| |
| /** mainly iterate over all points and find the right error value for the point. |
| * within the same run calculate various other stuff like coverage etc... |
| */ |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = gtAnalysis.getPoint(p); |
| double weight = cmdp.weight(); |
| //noise counter |
| if(cmdp.isNoise()){ |
| numNoise++; |
| //this is always 1 |
| errorNoiseMax+=cmdp.connectivity*weight; |
| } |
| else{ |
| errorMissedMax+=cmdp.connectivity*weight; |
| errorMisplacedMax+=cmdp.connectivity*weight; |
| } |
| //sum up maxError as the individual errors are the quality weighted between 0-1 |
| totalErrorMax+=cmdp.connectivity*weight; |
| |
| |
| double err = 0; |
| int coverage = 0; |
| |
| //check every FCluster |
| for (int c = 0; c < numFClusters; c++) { |
| //contained in cluster c? |
| if(pointInclusionProbFC[p][c] >= pointInclusionProbThreshold){ |
| coverage++; |
| |
| if(!cmdp.isNoise()){ |
| //PLACED CORRECTLY |
| if(matchMap[c] == cmdp.workclass()){ |
| } |
| //MISPLACED |
| else{ |
| double errvalue = misplacedError(cmdp, c); |
| if(errvalue > err) |
| err = errvalue; |
| } |
| } |
| else{ |
| //NOISE |
| double errvalue = noiseError(cmdp, c); |
| if(errvalue > err) err = errvalue; |
| } |
| } |
| } |
| //not in any cluster |
| if(coverage == 0){ |
| //MISSED |
| if(!cmdp.isNoise()){ |
| err = missedError(cmdp,true); |
| errorMissed+= weight*err; |
| } |
| //NOISE |
| else{ |
| } |
| } |
| else{ |
| if(!cmdp.isNoise()){ |
| errorMisplaced+= err*weight; |
| } |
| else{ |
| errorNoise+= err*weight; |
| } |
| } |
| |
| /* processing of other evaluation values */ |
| totalError+= err*weight; |
| if(err!=0)totalErrorCount++; |
| if(coverage>0) totalCoverage++; //points covered by clustering (incl. noise) |
| if(coverage>0 && !cmdp.isNoise()) trueCoverage++; //points covered by clustering, don't count noise |
| if(coverage>1) totalRedundancy++; //include noise |
| |
| cmdp.p.setMeasureValue("CMM",err); |
| cmdp.p.setMeasureValue("Redundancy", coverage); |
| } |
| |
| addValue("CMM", (totalErrorMax!=0)?1-totalError/totalErrorMax:1); |
| addValue("CMM Missed", (errorMissedMax!=0)?1-errorMissed/errorMissedMax:1); |
| addValue("CMM Misplaced", (errorMisplacedMax!=0)?1-errorMisplaced/errorMisplacedMax:1); |
| addValue("CMM Noise", (errorNoiseMax!=0)?1-errorNoise/errorNoiseMax:1); |
| addValue("CMM Basic", 1-((double)totalErrorCount/(double)numPoints)); |
| |
| if(debug){ |
| System.out.println("-------------"); |
| } |
| } |
| |
| |
| private double noiseError(CMMPoint cmdp, int assignedClusterID){ |
| int gtAssignedID = matchMap[assignedClusterID]; |
| double error; |
| |
| //Cluster wasn't matched, so just contains noise |
| //TODO: Noiscluster? |
| //also happens when we decrease the radius and there is only a noise point in the center |
| if(gtAssignedID==-1){ |
| error = 1; |
| cmdp.p.setMeasureValue("CMM Type","noise - cluster"); |
| } |
| else{ |
| if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ |
| //set to MIN_ERROR so we can still track the error |
| error = 0.00001; |
| cmdp.p.setMeasureValue("CMM Type","noise - byModel"); |
| } |
| else{ |
| error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); |
| cmdp.p.setMeasureValue("CMM Type","noise"); |
| } |
| } |
| |
| return error; |
| } |
| |
| private double missedError(CMMPoint cmdp, boolean useHullDistance){ |
| cmdp.p.setMeasureValue("CMM Type","missed"); |
| if(!useHullDistance){ |
| return cmdp.connectivity; |
| } |
| else{ |
| //main idea: look at relative distance of missed point to cluster |
| double minHullDist = 1; |
| for (int fc = 0; fc < numFClusters; fc++){ |
| //if fc is mappend onto the class of the point, check it for its hulldist |
| if(matchMap[fc]!=-1 && matchMap[fc] == cmdp.workclass()){ |
| if(clustering.get(fc) instanceof SphereCluster){ |
| SphereCluster sc = (SphereCluster)clustering.get(fc); |
| double distanceFC = sc.getCenterDistance(cmdp); |
| double radius = sc.getRadius(); |
| double hullDist = (distanceFC-radius)/(distanceFC+radius); |
| if(hullDist < minHullDist) |
| minHullDist = hullDist; |
| } |
| else{ |
| double min = 1; |
| double max = 1; |
| |
| //TODO: distance for random shape |
| //generate X points from the cluster with clustering.get(fc).sample(null) |
| //and find Min and Max values |
| |
| double hullDist = min/max; |
| if(hullDist < minHullDist) |
| minHullDist = hullDist; |
| } |
| } |
| } |
| |
| //use distance as weight |
| if(minHullDist>1) minHullDist = 1; |
| |
| double weight = (1-Math.exp(-lamdaMissed*minHullDist)); |
| cmdp.p.setMeasureValue("HullDistWeight",weight); |
| |
| return weight*cmdp.connectivity; |
| } |
| } |
| |
| |
| private double misplacedError(CMMPoint cmdp, int assignedClusterID){ |
| double weight = 0; |
| |
| int gtAssignedID = matchMap[assignedClusterID]; |
| //TODO take care of noise cluster? |
| if(gtAssignedID ==-1){ |
| System.out.println("Point "+cmdp.getTimestamp()+" from gtcluster "+cmdp.trueClass+" assigned to noise cluster "+assignedClusterID); |
| return 1; |
| } |
| |
| if(gtAssignedID == cmdp.workclass()) |
| return 0; |
| else{ |
| //assigned and real GT0 cluster are not connected, but does the model have the |
| //chance of separating this point after all? |
| if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ |
| weight = 0; |
| cmdp.p.setMeasureValue("CMM Type","missplaced - byModel"); |
| } |
| else{ |
| //point was mapped onto wrong cluster (assigned), so check how far away |
| //the nearest point is within the wrongly assigned cluster |
| weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); |
| } |
| } |
| double err_value; |
| //set to MIN_ERROR so we can still track the error |
| if(weight == 0){ |
| err_value= 0.00001; |
| } |
| else{ |
| err_value = weight*cmdp.connectivity; |
| cmdp.p.setMeasureValue("CMM Type","missplaced"); |
| } |
| |
| return err_value; |
| } |
| |
| public String getParameterString(){ |
| String para = gtAnalysis.getParameterString(); |
| para+="lambdaMissed="+lamdaMissed+";"; |
| return para; |
| } |
| |
| } |
| |
| |