| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.samoa.evaluation.measures; |
| |
| import java.util.ArrayList; |
| |
| import org.apache.samoa.evaluation.measures.CMM_GTAnalysis.CMMPoint; |
| import org.apache.samoa.moa.cluster.Cluster; |
| import org.apache.samoa.moa.cluster.Clustering; |
| import org.apache.samoa.moa.cluster.SphereCluster; |
| import org.apache.samoa.moa.core.DataPoint; |
| import org.apache.samoa.moa.evaluation.MeasureCollection; |
| |
| /** |
| * [CMM.java] |
| * |
| * CMM: Main class |
| * |
| * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 |
| * |
| * @author Timm jansen Data Management and Data Exploration Group, RWTH Aachen University |
| */ |
| |
| public class CMM extends MeasureCollection { |
| |
| private static final long serialVersionUID = 1L; |
| |
| /** |
| * found clustering |
| */ |
| private Clustering clustering; |
| |
| /** |
| * the ground truth analysis |
| */ |
| private CMM_GTAnalysis gtAnalysis; |
| |
| /** |
| * number of points within the horizon |
| */ |
| private int numPoints; |
| |
| /** |
| * number of clusters in the found clustering |
| */ |
| private int numFClusters; |
| |
| /** |
| * number of cluster in the adjusted groundtruth clustering that was calculated through the groundtruth analysis |
| */ |
| private int numGT0Classes; |
| |
| /** |
| * match found clusters to GT clusters |
| */ |
| private int matchMap[]; |
| |
| /** |
| * pointInclusionProbFC[p][C] contains the probability of point p being included in cluster C |
| */ |
| private double[][] pointInclusionProbFC; |
| |
| /** |
| * threshold that defines when a point is being considered belonging to a cluster |
| */ |
| private double pointInclusionProbThreshold = 0.5; |
| |
| /** |
| * parameterize the error weight of missed points (default 1) |
| */ |
| private double lamdaMissed = 1; |
| |
| /** |
| * enable/disable debug mode |
| */ |
| public boolean debug = false; |
| |
| /** |
| * enable/disable class merge (main feature of ground truth analysis) |
| */ |
| public boolean enableClassMerge = true; |
| |
| /** |
| * enable/disable model error when enabled errors that are caused by the underling cluster model will not be counted |
| */ |
| public boolean enableModelError = true; |
| |
| @Override |
| protected String[] getNames() { |
| String[] names = { "CMM", "CMM Basic", "CMM Missed", "CMM Misplaced", "CMM Noise", |
| "CA Seperability", "CA Noise", "CA Modell" }; |
| return names; |
| } |
| |
| @Override |
| protected boolean[] getDefaultEnabled() { |
| boolean[] defaults = { false, false, false, false, false, false, false, false }; |
| return defaults; |
| } |
| |
| @Override |
| public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) |
| throws Exception { |
| this.clustering = clustering; |
| |
| numPoints = points.size(); |
| numFClusters = clustering.size(); |
| |
| gtAnalysis = new CMM_GTAnalysis(trueClustering, points, enableClassMerge); |
| |
| numGT0Classes = gtAnalysis.getNumberOfGT0Classes(); |
| |
| addValue("CA Seperability", gtAnalysis.getClassSeparability()); |
| addValue("CA Noise", gtAnalysis.getNoiseSeparability()); |
| addValue("CA Modell", gtAnalysis.getModelQuality()); |
| |
| /* init the matching and point distances */ |
| calculateMatching(); |
| |
| /* calculate the actual error */ |
| calculateError(); |
| } |
| |
| /** |
| * calculates the CMM specific matching between found clusters and ground truth clusters |
| */ |
| private void calculateMatching() { |
| |
| /** |
| * found cluster frequencies |
| */ |
| int[][] mapFC = new int[numFClusters][numGT0Classes]; |
| |
| /** |
| * ground truth cluster frequencies |
| */ |
| int[][] mapGT = new int[numGT0Classes][numGT0Classes]; |
| int[] sumsFC = new int[numFClusters]; |
| |
| // calculate fuzzy mapping from |
| pointInclusionProbFC = new double[numPoints][numFClusters]; |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = gtAnalysis.getPoint(p); |
| // found cluster frequencies |
| for (int fc = 0; fc < numFClusters; fc++) { |
| Cluster cl = clustering.get(fc); |
| pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp); |
| if (pointInclusionProbFC[p][fc] >= pointInclusionProbThreshold) { |
| // make sure we don't count points twice that are contained in two |
| // merged clusters |
| if (cmdp.isNoise()) |
| continue; |
| mapFC[fc][cmdp.workclass()]++; |
| sumsFC[fc]++; |
| } |
| } |
| |
| // ground truth cluster frequencies |
| if (!cmdp.isNoise()) { |
| for (int hc = 0; hc < numGT0Classes; hc++) { |
| if (hc == cmdp.workclass()) { |
| mapGT[hc][hc]++; |
| } |
| else { |
| if (gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1) { |
| mapGT[hc][cmdp.workclass()]++; |
| } |
| } |
| } |
| } |
| } |
| |
| // assign each found cluster to a hidden cluster |
| matchMap = new int[numFClusters]; |
| for (int fc = 0; fc < numFClusters; fc++) { |
| int matchIndex = -1; |
| // check if we only have one entry anyway |
| for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { |
| if (mapFC[fc][hc0] != 0) { |
| if (matchIndex == -1) |
| matchIndex = hc0; |
| else { |
| matchIndex = -1; |
| break; |
| } |
| } |
| } |
| |
| // more then one entry, so look for most similar frequency profile |
| int minDiff = Integer.MAX_VALUE; |
| if (sumsFC[fc] != 0 && matchIndex == -1) { |
| ArrayList<Integer> fitCandidates = new ArrayList<Integer>(); |
| for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { |
| int errDiff = 0; |
| for (int hc1 = 0; hc1 < numGT0Classes; hc1++) { |
| // fc profile doesn't fit into current hc profile |
| double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1]; |
| if (freq_diff > 0) { |
| errDiff += freq_diff; |
| } |
| } |
| if (errDiff == 0) { |
| fitCandidates.add(hc0); |
| } |
| if (errDiff < minDiff) { |
| minDiff = errDiff; |
| matchIndex = hc0; |
| } |
| if (debug) { |
| // System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff); |
| } |
| } |
| // if we have a fitting profile overwrite the min error choice |
| // if we have multiple fit candidates, use majority vote of |
| // corresponding classes |
| if (fitCandidates.size() != 0) { |
| int bestGTfit = fitCandidates.get(0); |
| for (int i = 1; i < fitCandidates.size(); i++) { |
| int GTfit = fitCandidates.get(i); |
| if (mapFC[fc][GTfit] > mapFC[fc][bestGTfit]) |
| bestGTfit = fitCandidates.get(i); |
| } |
| matchIndex = bestGTfit; |
| } |
| } |
| |
| matchMap[fc] = matchIndex; |
| int realMatch = -1; |
| if (matchIndex == -1) { |
| if (debug) |
| System.out.println("No cluster match: needs to be implemented?"); |
| } |
| else { |
| realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel(); |
| } |
| clustering.get(fc).setMeasureValue("CMM Match", "C" + realMatch); |
| clustering.get(fc).setMeasureValue("CMM Workclass", "C" + matchMap[fc]); |
| } |
| |
| // print matching table |
| if (debug) { |
| for (int i = 0; i < numFClusters; i++) { |
| System.out.print("C" + ((int) clustering.get(i).getId()) + " N:" + ((int) clustering.get(i).getWeight()) |
| + " | "); |
| for (int j = 0; j < numGT0Classes; j++) { |
| System.out.print(mapFC[i][j] + " "); |
| } |
| System.out.print(" = " + sumsFC[i] + " | "); |
| String match = "-"; |
| if (matchMap[i] != -1) { |
| match = Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel()); |
| } |
| System.out.println(" --> " + match + "(work:" + matchMap[i] + ")"); |
| } |
| } |
| } |
| |
| /** |
| * Calculate the actual error values |
| */ |
| private void calculateError() { |
| int totalErrorCount = 0; |
| int totalRedundancy = 0; |
| int trueCoverage = 0; |
| int totalCoverage = 0; |
| |
| int numNoise = 0; |
| double errorNoise = 0; |
| double errorNoiseMax = 0; |
| |
| double errorMissed = 0; |
| double errorMissedMax = 0; |
| |
| double errorMisplaced = 0; |
| double errorMisplacedMax = 0; |
| |
| double totalError = 0.0; |
| double totalErrorMax = 0.0; |
| |
| /** |
| * mainly iterate over all points and find the right error value for the point. within the same run calculate |
| * various other stuff like coverage etc... |
| */ |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = gtAnalysis.getPoint(p); |
| double weight = cmdp.weight(); |
| // noise counter |
| if (cmdp.isNoise()) { |
| numNoise++; |
| // this is always 1 |
| errorNoiseMax += cmdp.connectivity * weight; |
| } |
| else { |
| errorMissedMax += cmdp.connectivity * weight; |
| errorMisplacedMax += cmdp.connectivity * weight; |
| } |
| // sum up maxError as the individual errors are the quality weighted |
| // between 0-1 |
| totalErrorMax += cmdp.connectivity * weight; |
| |
| double err = 0; |
| int coverage = 0; |
| |
| // check every FCluster |
| for (int c = 0; c < numFClusters; c++) { |
| // contained in cluster c? |
| if (pointInclusionProbFC[p][c] >= pointInclusionProbThreshold) { |
| coverage++; |
| |
| if (!cmdp.isNoise()) { |
| // PLACED CORRECTLY |
| if (matchMap[c] == cmdp.workclass()) { |
| } |
| // MISPLACED |
| else { |
| double errvalue = misplacedError(cmdp, c); |
| if (errvalue > err) |
| err = errvalue; |
| } |
| } |
| else { |
| // NOISE |
| double errvalue = noiseError(cmdp, c); |
| if (errvalue > err) |
| err = errvalue; |
| } |
| } |
| } |
| // not in any cluster |
| if (coverage == 0) { |
| // MISSED |
| if (!cmdp.isNoise()) { |
| err = missedError(cmdp, true); |
| errorMissed += weight * err; |
| } |
| // NOISE |
| else { |
| } |
| } |
| else { |
| if (!cmdp.isNoise()) { |
| errorMisplaced += err * weight; |
| } |
| else { |
| errorNoise += err * weight; |
| } |
| } |
| |
| /* processing of other evaluation values */ |
| totalError += err * weight; |
| if (err != 0) |
| totalErrorCount++; |
| if (coverage > 0) |
| totalCoverage++; // points covered by clustering (incl. noise) |
| if (coverage > 0 && !cmdp.isNoise()) |
| trueCoverage++; // points covered by clustering, don't count noise |
| if (coverage > 1) |
| totalRedundancy++; // include noise |
| |
| cmdp.p.setMeasureValue("CMM", err); |
| cmdp.p.setMeasureValue("Redundancy", coverage); |
| } |
| |
| addValue("CMM", (totalErrorMax != 0) ? 1 - totalError / totalErrorMax : 1); |
| addValue("CMM Missed", (errorMissedMax != 0) ? 1 - errorMissed / errorMissedMax : 1); |
| addValue("CMM Misplaced", (errorMisplacedMax != 0) ? 1 - errorMisplaced / errorMisplacedMax : 1); |
| addValue("CMM Noise", (errorNoiseMax != 0) ? 1 - errorNoise / errorNoiseMax : 1); |
| addValue("CMM Basic", 1 - ((double) totalErrorCount / (double) numPoints)); |
| |
| if (debug) { |
| System.out.println("-------------"); |
| } |
| } |
| |
| private double noiseError(CMMPoint cmdp, int assignedClusterID) { |
| int gtAssignedID = matchMap[assignedClusterID]; |
| double error; |
| |
| // Cluster wasn't matched, so just contains noise |
| // TODO: Noiscluster? |
| // also happens when we decrease the radius and there is only a noise point |
| // in the center |
| if (gtAssignedID == -1) { |
| error = 1; |
| cmdp.p.setMeasureValue("CMM Type", "noise - cluster"); |
| } |
| else { |
| if (enableModelError |
| && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold) { |
| // set to MIN_ERROR so we can still track the error |
| error = 0.00001; |
| cmdp.p.setMeasureValue("CMM Type", "noise - byModel"); |
| } |
| else { |
| error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); |
| cmdp.p.setMeasureValue("CMM Type", "noise"); |
| } |
| } |
| |
| return error; |
| } |
| |
| private double missedError(CMMPoint cmdp, boolean useHullDistance) { |
| cmdp.p.setMeasureValue("CMM Type", "missed"); |
| if (!useHullDistance) { |
| return cmdp.connectivity; |
| } |
| else { |
| // main idea: look at relative distance of missed point to cluster |
| double minHullDist = 1; |
| for (int fc = 0; fc < numFClusters; fc++) { |
| // if fc is mappend onto the class of the point, check it for its |
| // hulldist |
| if (matchMap[fc] != -1 && matchMap[fc] == cmdp.workclass()) { |
| if (clustering.get(fc) instanceof SphereCluster) { |
| SphereCluster sc = (SphereCluster) clustering.get(fc); |
| double distanceFC = sc.getCenterDistance(cmdp); |
| double radius = sc.getRadius(); |
| double hullDist = (distanceFC - radius) / (distanceFC + radius); |
| if (hullDist < minHullDist) |
| minHullDist = hullDist; |
| } |
| else { |
| double min = 1; |
| double max = 1; |
| |
| // TODO: distance for random shape |
| // generate X points from the cluster with |
| // clustering.get(fc).sample(null) |
| // and find Min and Max values |
| |
| double hullDist = min / max; |
| if (hullDist < minHullDist) |
| minHullDist = hullDist; |
| } |
| } |
| } |
| |
| // use distance as weight |
| if (minHullDist > 1) |
| minHullDist = 1; |
| |
| double weight = (1 - Math.exp(-lamdaMissed * minHullDist)); |
| cmdp.p.setMeasureValue("HullDistWeight", weight); |
| |
| return weight * cmdp.connectivity; |
| } |
| } |
| |
| private double misplacedError(CMMPoint cmdp, int assignedClusterID) { |
| double weight = 0; |
| |
| int gtAssignedID = matchMap[assignedClusterID]; |
| // TODO take care of noise cluster? |
| if (gtAssignedID == -1) { |
| System.out.println("Point " + cmdp.getTimestamp() + " from gtcluster " + cmdp.trueClass |
| + " assigned to noise cluster " + assignedClusterID); |
| return 1; |
| } |
| |
| if (gtAssignedID == cmdp.workclass()) |
| return 0; |
| else { |
| // assigned and real GT0 cluster are not connected, but does the model |
| // have the |
| // chance of separating this point after all? |
| if (enableModelError |
| && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold) { |
| weight = 0; |
| cmdp.p.setMeasureValue("CMM Type", "missplaced - byModel"); |
| } |
| else { |
| // point was mapped onto wrong cluster (assigned), so check how far away |
| // the nearest point is within the wrongly assigned cluster |
| weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); |
| } |
| } |
| double err_value; |
| // set to MIN_ERROR so we can still track the error |
| if (weight == 0) { |
| err_value = 0.00001; |
| } |
| else { |
| err_value = weight * cmdp.connectivity; |
| cmdp.p.setMeasureValue("CMM Type", "missplaced"); |
| } |
| |
| return err_value; |
| } |
| |
| public String getParameterString() { |
| String para = gtAnalysis.getParameterString(); |
| para += "lambdaMissed=" + lamdaMissed + ";"; |
| return para; |
| } |
| |
| } |