| package com.yahoo.labs.samoa.evaluation.measures; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2011 RWTH Aachen University, Germany |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import com.yahoo.labs.samoa.instances.Instance; |
| import com.yahoo.labs.samoa.moa.cluster.Clustering; |
| import com.yahoo.labs.samoa.moa.core.AutoExpandVector; |
| import com.yahoo.labs.samoa.moa.core.DataPoint; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| |
| /** |
| * [CMM_GTAnalysis.java] |
| * |
| * CMM: Ground truth analysis |
| * |
| * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 |
| * |
| * @author Timm jansen Data Management and Data Exploration Group, RWTH Aachen University |
| */ |
| |
| /* |
| * TODO: - try to avoid calcualting the radius multiple times - avoid the full |
| * distance map? - knn functionality in clusters - noise error |
| */ |
| public class CMM_GTAnalysis { |
| |
| /** |
| * the given ground truth clustering |
| */ |
| private Clustering gtClustering; |
| |
| /** |
| * list of given points within the horizon |
| */ |
| private ArrayList<CMMPoint> cmmpoints; |
| |
| /** |
| * the newly calculate ground truth clustering |
| */ |
| private ArrayList<GTCluster> gt0Clusters; |
| |
| /** |
| * IDs of noise points |
| */ |
| private ArrayList<Integer> noise; |
| |
| /** |
| * total number of points |
| */ |
| private int numPoints; |
| |
| /** |
| * number of clusters of the original ground truth |
| */ |
| private int numGTClusters; |
| |
| /** |
| * number of classes of the original ground truth, in case of a micro clustering ground truth this differs from |
| * numGTClusters |
| */ |
| private int numGTClasses; |
| |
| /** |
| * number of classes after we are done with the analysis |
| */ |
| private int numGT0Classes; |
| |
| /** |
| * number of dimensions |
| */ |
| private int numDims; |
| |
| /** |
| * mapping between true cluster ID/class label of the original ground truth and the internal cluster ID/working class |
| * label. |
| * |
| * different original cluster IDs might map to the same new cluster ID due to merging of two clusters |
| */ |
| private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; |
| |
| /** |
| * log of how clusters have been merged (for debugging) |
| */ |
| private int[] mergeMap; |
| |
| /** |
| * number of non-noise points that will create an error due to the underlying clustering model (e.g. point being |
| * covered by two clusters representing different classes) |
| */ |
| private int noiseErrorByModel; |
| |
| /** |
| * number of noise points that will create an error due to the underlying clustering model (e.g. noise point being |
| * covered by a cluster) |
| */ |
| private int pointErrorByModel; |
| |
| /** |
| * CMM debug mode |
| */ |
| private boolean debug = false; |
| |
| /******* CMM parameter ***********/ |
| |
| /** |
| * defines how many nearest neighbors will be used |
| */ |
| private int knnNeighbourhood = 2; |
| |
| /** |
| * the threshold which defines when ground truth clusters will be merged. set to 1 to disable merging |
| */ |
| private double tauConnection = 0.5; |
| |
| /** |
| * experimental (default: disabled) separate k for points to cluster and cluster to cluster |
| */ |
| private double clusterConnectionMaxPoints = knnNeighbourhood; |
| |
| /** |
| * experimental (default: disabled) use exponential connectivity function to model different behavior: closer points |
| * will have a stronger connection compared to the linear function. Use ConnRefXValue and ConnX to better parameterize |
| * lambda, which controls the decay of the connectivity |
| */ |
| private boolean useExpConnectivity = false; |
| private double lambdaConnRefXValue = 0.01; |
| private double lambdaConnX = 4; |
| private double lamdaConn; |
| |
| /******************************************/ |
| |
| /** |
| * Wrapper class for data points to store CMM relevant attributes |
| * |
| */ |
| protected class CMMPoint extends DataPoint { |
| /** |
| * Reference to original point |
| */ |
| protected DataPoint p = null; |
| |
| /** |
| * point ID |
| */ |
| protected int pID = 0; |
| |
| /** |
| * true class label |
| */ |
| protected int trueClass = -1; |
| |
| /** |
| * the connectivity of the point to its cluster |
| */ |
| protected double connectivity = 1.0; |
| |
| /** |
| * knn distnace within own cluster |
| */ |
| protected double knnInCluster = 0.0; |
| |
| /** |
| * knn indices (for debugging only) |
| */ |
| protected ArrayList<Integer> knnIndices; |
| |
| public CMMPoint(DataPoint point, int id) { |
| // make a copy, but keep reference |
| super(point, point.getTimestamp()); |
| p = point; |
| pID = id; |
| trueClass = (int) point.classValue(); |
| } |
| |
| /** |
| * Retruns the current working label of the cluster the point belongs to. The label can change due to merging of |
| * clusters. |
| * |
| * @return the current working class label |
| */ |
| protected int workclass() { |
| if (trueClass == -1) |
| return -1; |
| else |
| return mapTrueLabelToWorkLabel.get(trueClass); |
| } |
| } |
| |
| /** |
| * Main class to model the new clusters that will be the output of the cluster analysis |
| * |
| */ |
| protected class GTCluster { |
| /** points that are per definition in the cluster */ |
| private ArrayList<Integer> points = new ArrayList<Integer>(); |
| |
| /** |
| * a new GT cluster consists of one or more "old" GT clusters. Connected/overlapping clusters cannot be merged |
| * directly because of the underlying cluster model. E.g. for merging two spherical clusters the new cluster sphere |
| * can cover a lot more space then two separate smaller spheres. To keep the original coverage we need to keep the |
| * orignal clusters and merge them on an abstract level. |
| */ |
| private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); |
| |
| /** current work class (changes when merging) */ |
| private int workclass; |
| |
| /** original work class */ |
| private final int orgWorkClass; |
| |
| /** original class label */ |
| private final int label; |
| |
| /** clusters that have been merged into this cluster (debugging) */ |
| private ArrayList<Integer> mergedWorkLabels = null; |
| |
| /** average knn distance of all points in the cluster */ |
| private double knnMeanAvg = 0; |
| |
| /** average deviation of knn distance of all points */ |
| private double knnDevAvg = 0; |
| |
| /** connectivity of the cluster to all other clusters */ |
| private ArrayList<Double> connections = new ArrayList<Double>(); |
| |
| private GTCluster(int workclass, int label, int gtClusteringID) { |
| this.orgWorkClass = workclass; |
| this.workclass = workclass; |
| this.label = label; |
| this.clusterRepresentations.add(gtClusteringID); |
| } |
| |
| /** |
| * The original class label the cluster represents |
| * |
| * @return original class label |
| */ |
| protected int getLabel() { |
| return label; |
| } |
| |
| /** |
| * Calculate the probability of the point being covered through the cluster |
| * |
| * @param point |
| * to calculate the probability for |
| * @return probability of the point being covered through the cluster |
| */ |
| protected double getInclusionProbability(CMMPoint point) { |
| double prob = Double.MIN_VALUE; |
| // check all cluster representatives for coverage |
| for (int c = 0; c < clusterRepresentations.size(); c++) { |
| double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); |
| if (tmp_prob > prob) |
| prob = tmp_prob; |
| } |
| return prob; |
| } |
| |
| /** |
| * calculate knn distances of points within own cluster + average knn distance and average knn distance deviation of |
| * all points |
| */ |
| private void calculateKnn() { |
| for (int p0 : points) { |
| CMMPoint cmdp = cmmpoints.get(p0); |
| if (!cmdp.isNoise()) { |
| AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); |
| |
| // calculate nearest neighbours |
| getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist, knnPointIndex); |
| |
| // TODO: What to do if we have less then k neighbours? |
| double avgKnn = 0; |
| for (int i = 0; i < knnDist.size(); i++) { |
| avgKnn += knnDist.get(i); |
| } |
| if (knnDist.size() != 0) |
| avgKnn /= knnDist.size(); |
| cmdp.knnInCluster = avgKnn; |
| cmdp.knnIndices = knnPointIndex; |
| cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); |
| |
| knnMeanAvg += avgKnn; |
| knnDevAvg += Math.pow(avgKnn, 2); |
| } |
| } |
| knnMeanAvg = knnMeanAvg / (double) points.size(); |
| knnDevAvg = knnDevAvg / (double) points.size(); |
| |
| double variance = knnDevAvg - Math.pow(knnMeanAvg, 2.0); |
| // Due to numerical errors, small negative values can occur. |
| if (variance <= 0.0) |
| variance = 1e-50; |
| knnDevAvg = Math.sqrt(variance); |
| |
| } |
| |
| /** |
| * Calculate the connection of a cluster to this cluster |
| * |
| * @param otherCid |
| * cluster id of the other cluster |
| * @param initial |
| * flag for initial run |
| */ |
| private void calculateClusterConnection(int otherCid, boolean initial) { |
| double avgConnection = 0; |
| if (workclass == otherCid) { |
| avgConnection = 1; |
| } |
| else { |
| AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); |
| |
| for (int p : points) { |
| CMMPoint cmdp = cmmpoints.get(p); |
| double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); |
| double connection = cmdp.connectivity * con_p_Cj; |
| if (initial) { |
| cmdp.p.setMeasureValue("Connection to C" + otherCid, con_p_Cj); |
| } |
| |
| // connection |
| if (kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size() - 1)) { |
| int index = 0; |
| while (index < kmax.size() && connection < kmax.get(index)) { |
| index++; |
| } |
| kmax.add(index, connection); |
| kmaxIndexes.add(index, p); |
| if (kmax.size() > clusterConnectionMaxPoints) { |
| kmax.remove(kmax.size() - 1); |
| kmaxIndexes.add(kmaxIndexes.size() - 1); |
| } |
| } |
| } |
| // connection |
| for (int k = 0; k < kmax.size(); k++) { |
| avgConnection += kmax.get(k); |
| } |
| avgConnection /= kmax.size(); |
| } |
| |
| if (otherCid < connections.size()) { |
| connections.set(otherCid, avgConnection); |
| } |
| else if (connections.size() == otherCid) { |
| connections.add(avgConnection); |
| } |
| else |
| System.out.println("Something is going really wrong with the connection listing!" + knnNeighbourhood + " " |
| + tauConnection); |
| } |
| |
| /** |
| * Merge a cluster into this cluster |
| * |
| * @param mergeID |
| * the ID of the cluster to be merged |
| */ |
| private void mergeCluster(int mergeID) { |
| if (mergeID < gt0Clusters.size()) { |
| // track merging (debugging) |
| for (int i = 0; i < numGTClasses; i++) { |
| if (mergeMap[i] == mergeID) |
| mergeMap[i] = workclass; |
| if (mergeMap[i] > mergeID) |
| mergeMap[i]--; |
| } |
| GTCluster gtcMerge = gt0Clusters.get(mergeID); |
| if (debug) |
| System.out.println("Merging C" + gtcMerge.workclass + " into C" + workclass + |
| " with Con " + connections.get(mergeID) + " / " + gtcMerge.connections.get(workclass)); |
| |
| // update mapTrueLabelToWorkLabel |
| mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); |
| Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); |
| while (iterator.hasNext()) { |
| Integer key = (Integer) iterator.next(); |
| // update pointer of already merged cluster |
| int value = mapTrueLabelToWorkLabel.get(key); |
| if (value == mergeID) |
| mapTrueLabelToWorkLabel.put(key, workclass); |
| if (value > mergeID) |
| mapTrueLabelToWorkLabel.put(key, value - 1); |
| } |
| |
| // merge points from B into A |
| points.addAll(gtcMerge.points); |
| clusterRepresentations.addAll(gtcMerge.clusterRepresentations); |
| if (mergedWorkLabels == null) { |
| mergedWorkLabels = new ArrayList<Integer>(); |
| } |
| mergedWorkLabels.add(gtcMerge.orgWorkClass); |
| if (gtcMerge.mergedWorkLabels != null) |
| mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); |
| |
| gt0Clusters.remove(mergeID); |
| |
| // update workclass labels |
| for (int c = mergeID; c < gt0Clusters.size(); c++) { |
| gt0Clusters.get(c).workclass = c; |
| } |
| |
| // update knn distances |
| calculateKnn(); |
| for (int c = 0; c < gt0Clusters.size(); c++) { |
| gt0Clusters.get(c).connections.remove(mergeID); |
| |
| // recalculate connection from other clusters to the new merged one |
| gt0Clusters.get(c).calculateClusterConnection(workclass, false); |
| // and from new merged one to other clusters |
| gt0Clusters.get(workclass).calculateClusterConnection(c, false); |
| } |
| } |
| else { |
| System.out.println("Merge indices are not valid"); |
| } |
| } |
| } |
| |
| /** |
| * @param trueClustering |
| * the ground truth clustering |
| * @param points |
| * data points |
| * @param enableClassMerge |
| * allow class merging (should be set to true on default) |
| */ |
| public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge) { |
| if (debug) |
| System.out.println("GT Analysis Debug Output"); |
| |
| noiseErrorByModel = 0; |
| pointErrorByModel = 0; |
| if (!enableClassMerge) { |
| tauConnection = 1.0; |
| } |
| |
| lamdaConn = -Math.log(lambdaConnRefXValue) / Math.log(2) / lambdaConnX; |
| |
| this.gtClustering = trueClustering; |
| |
| numPoints = points.size(); |
| numDims = points.get(0).numAttributes() - 1; |
| numGTClusters = gtClustering.size(); |
| |
| // init mappings between work and true labels |
| mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); |
| |
| // set up base of new clustering |
| gt0Clusters = new ArrayList<GTCluster>(); |
| int numWorkClasses = 0; |
| // create label to worklabel mapping as real labels can be just a set of |
| // unordered integers |
| for (int i = 0; i < numGTClusters; i++) { |
| int label = (int) gtClustering.get(i).getGroundTruth(); |
| if (!mapTrueLabelToWorkLabel.containsKey(label)) { |
| gt0Clusters.add(new GTCluster(numWorkClasses, label, i)); |
| mapTrueLabelToWorkLabel.put(label, numWorkClasses); |
| numWorkClasses++; |
| } |
| else { |
| gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); |
| } |
| } |
| numGTClasses = numWorkClasses; |
| |
| mergeMap = new int[numGTClasses]; |
| for (int i = 0; i < numGTClasses; i++) { |
| mergeMap[i] = i; |
| } |
| |
| // create cmd point wrapper instances |
| cmmpoints = new ArrayList<CMMPoint>(); |
| for (int p = 0; p < points.size(); p++) { |
| CMMPoint cmdp = new CMMPoint(points.get(p), p); |
| cmmpoints.add(cmdp); |
| } |
| |
| // split points up into their GTClusters and Noise (according to class |
| // labels) |
| noise = new ArrayList<Integer>(); |
| for (int p = 0; p < numPoints; p++) { |
| if (cmmpoints.get(p).isNoise()) { |
| noise.add(p); |
| } |
| else { |
| gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); |
| } |
| } |
| |
| // calculate initial knnMean and knnDev |
| for (GTCluster gtc : gt0Clusters) { |
| gtc.calculateKnn(); |
| } |
| |
| // calculate cluster connections |
| calculateGTClusterConnections(); |
| |
| // calculate point connections with own clusters |
| calculateGTPointQualities(); |
| |
| if (debug) |
| System.out.println("GT Analysis Debug End"); |
| |
| } |
| |
| /** |
| * Calculate the connection of a point to a cluster |
| * |
| * @param cmmp |
| * the point to calculate the connection for |
| * @param clusterID |
| * the corresponding cluster |
| * @return the connection value |
| */ |
| // TODO: Cache the connection value for a point to the different clusters??? |
| protected double getConnectionValue(CMMPoint cmmp, int clusterID) { |
| AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); |
| |
| // calculate the knn distance of the point to the cluster |
| getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); |
| |
| // TODO: What to do if we have less then k neighbors? |
| double avgDist = 0; |
| for (int i = 0; i < knnDist.size(); i++) { |
| avgDist += knnDist.get(i); |
| } |
| // what to do if we only have a single point??? |
| if (knnDist.size() != 0) |
| avgDist /= knnDist.size(); |
| else |
| return 0; |
| |
| // get the upper knn distance of the cluster |
| double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; |
| |
| /* |
| * calculate the connectivity based on knn distance of the point within the |
| * cluster and the upper knn distance of the cluster |
| */ |
| if (avgDist < upperKnn) { |
| return 1; |
| } |
| else { |
| // value that should be reached at upperKnn distance |
| // Choose connection formula |
| double conn; |
| if (useExpConnectivity) |
| conn = Math.pow(2, -lamdaConn * (avgDist - upperKnn) / upperKnn); |
| else |
| conn = upperKnn / avgDist; |
| |
| if (Double.isNaN(conn)) |
| System.out.println("Connectivity NaN at " + cmmp.p.getTimestamp()); |
| |
| return conn; |
| } |
| } |
| |
| /** |
| * @param cmmp |
| * point to calculate knn distance for |
| * @param k |
| * number of nearest neighbors to look for |
| * @param pointIDs |
| * list of point IDs to check |
| * @param knnDist |
| * sorted list of smallest knn distances (can already be filled to make updates possible) |
| * @param knnPointIndex |
| * list of corresponding knn indices |
| */ |
| private void getKnnInCluster(CMMPoint cmmp, int k, |
| ArrayList<Integer> pointIDs, |
| AutoExpandVector<Double> knnDist, |
| AutoExpandVector<Integer> knnPointIndex) { |
| |
| // iterate over every point in the choosen cluster, cal distance and insert |
| // into list |
| for (int p1 = 0; p1 < pointIDs.size(); p1++) { |
| int pid = pointIDs.get(p1); |
| if (cmmp.pID == pid) |
| continue; |
| double dist = distance(cmmp, cmmpoints.get(pid)); |
| if (knnDist.size() < k || dist < knnDist.get(knnDist.size() - 1)) { |
| int index = 0; |
| while (index < knnDist.size() && dist > knnDist.get(index)) { |
| index++; |
| } |
| knnDist.add(index, dist); |
| knnPointIndex.add(index, pid); |
| if (knnDist.size() > k) { |
| knnDist.remove(knnDist.size() - 1); |
| knnPointIndex.remove(knnPointIndex.size() - 1); |
| } |
| } |
| } |
| } |
| |
| /** |
| * calculate initial connectivities |
| */ |
| private void calculateGTPointQualities() { |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = cmmpoints.get(p); |
| if (!cmdp.isNoise()) { |
| cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); |
| cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); |
| } |
| } |
| } |
| |
| /** |
| * Calculate connections between clusters and merge clusters accordingly as long as connections exceed threshold |
| */ |
| private void calculateGTClusterConnections() { |
| for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { |
| for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { |
| gt0Clusters.get(c0).calculateClusterConnection(c1, true); |
| } |
| } |
| |
| boolean changedConnection = true; |
| while (changedConnection) { |
| if (debug) { |
| System.out.println("Cluster Connection"); |
| for (int c = 0; c < gt0Clusters.size(); c++) { |
| System.out.print("C" + gt0Clusters.get(c).label + " --> "); |
| for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { |
| System.out.print(" C" + gt0Clusters.get(c1).label + ": " + gt0Clusters.get(c).connections.get(c1)); |
| } |
| System.out.println(""); |
| } |
| System.out.println(""); |
| } |
| |
| double max = 0; |
| int maxIndexI = -1; |
| int maxIndexJ = -1; |
| |
| changedConnection = false; |
| for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { |
| for (int c1 = c0 + 1; c1 < gt0Clusters.size(); c1++) { |
| if (c0 == c1) |
| continue; |
| double min = Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); |
| if (min > max) { |
| max = min; |
| maxIndexI = c0; |
| maxIndexJ = c1; |
| } |
| } |
| } |
| if (maxIndexI != -1 && max > tauConnection) { |
| gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); |
| if (debug) |
| System.out.println("Merging " + maxIndexI + " and " + maxIndexJ + " because of connection " + max); |
| |
| changedConnection = true; |
| } |
| } |
| numGT0Classes = gt0Clusters.size(); |
| } |
| |
| /** |
| * Calculates how well the original clusters are separable. Small values indicate bad separability, values close to 1 |
| * indicate good separability |
| * |
| * @return index of seperability |
| */ |
| public double getClassSeparability() { |
| // int totalConn = numGTClasses*(numGTClasses-1)/2; |
| // int mergedConn = 0; |
| // for(GTCluster gt : gt0Clusters){ |
| // int merged = gt.clusterRepresentations.size(); |
| // if(merged > 1) |
| // mergedConn+=merged * (merged-1)/2; |
| // } |
| // if(totalConn == 0) |
| // return 0; |
| // else |
| // return 1-mergedConn/(double)totalConn; |
| return numGT0Classes / (double) numGTClasses; |
| |
| } |
| |
| /** |
| * Calculates how well noise is separable from the given clusters Small values indicate bad separability, values close |
| * to 1 indicate good separability |
| * |
| * @return index of noise separability |
| */ |
| public double getNoiseSeparability() { |
| if (noise.isEmpty()) |
| return 1; |
| |
| double connectivity = 0; |
| for (int p : noise) { |
| CMMPoint npoint = cmmpoints.get(p); |
| double maxConnection = 0; |
| |
| // TODO: some kind of pruning possible. what about weighting? |
| for (int c = 0; c < gt0Clusters.size(); c++) { |
| double connection = getConnectionValue(npoint, c); |
| if (connection > maxConnection) |
| maxConnection = connection; |
| } |
| connectivity += maxConnection; |
| npoint.p.setMeasureValue("MaxConnection", maxConnection); |
| } |
| |
| return 1 - (connectivity / noise.size()); |
| } |
| |
| /** |
| * Calculates the relative number of errors being caused by the underlying cluster model |
| * |
| * @return quality of the model |
| */ |
| public double getModelQuality() { |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = cmmpoints.get(p); |
| for (int hc = 0; hc < numGTClusters; hc++) { |
| if (gtClustering.get(hc).getGroundTruth() != cmdp.trueClass) { |
| if (gtClustering.get(hc).getInclusionProbability(cmdp) >= 1) { |
| if (!cmdp.isNoise()) |
| pointErrorByModel++; |
| else |
| noiseErrorByModel++; |
| break; |
| } |
| } |
| } |
| } |
| if (debug) |
| System.out.println("Error by model: noise " + noiseErrorByModel + " point " + pointErrorByModel); |
| |
| return 1 - ((pointErrorByModel + noiseErrorByModel) / (double) numPoints); |
| } |
| |
| /** |
| * Get CMM internal point |
| * |
| * @param index |
| * of the point |
| * @return cmm point |
| */ |
| protected CMMPoint getPoint(int index) { |
| return cmmpoints.get(index); |
| } |
| |
| /** |
| * Return cluster |
| * |
| * @param index |
| * of the cluster to return |
| * @return cluster |
| */ |
| protected GTCluster getGT0Cluster(int index) { |
| return gt0Clusters.get(index); |
| } |
| |
| /** |
| * Number of classes/clusters of the new clustering |
| * |
| * @return number of new clusters |
| */ |
| protected int getNumberOfGT0Classes() { |
| return numGT0Classes; |
| } |
| |
| /** |
| * Calculates Euclidian distance |
| * |
| * @param inst1 |
| * point as double array |
| * @param inst2 |
| * point as double array |
| * @return euclidian distance |
| */ |
| private double distance(Instance inst1, Instance inst2) { |
| return distance(inst1, inst2.toDoubleArray()); |
| |
| } |
| |
| /** |
| * Calculates Euclidian distance |
| * |
| * @param inst1 |
| * point as an instance |
| * @param inst2 |
| * point as double array |
| * @return euclidian distance |
| */ |
| private double distance(Instance inst1, double[] inst2) { |
| double distance = 0.0; |
| for (int i = 0; i < numDims; i++) { |
| double d = inst1.value(i) - inst2[i]; |
| distance += d * d; |
| } |
| return Math.sqrt(distance); |
| } |
| |
| /** |
| * String with main CMM parameters |
| * |
| * @return main CMM parameter |
| */ |
| public String getParameterString() { |
| String para = ""; |
| para += "k=" + knnNeighbourhood + ";"; |
| if (useExpConnectivity) { |
| para += "lambdaConnX=" + lambdaConnX + ";"; |
| para += "lambdaConn=" + lamdaConn + ";"; |
| para += "lambdaConnRef=" + lambdaConnRefXValue + ";"; |
| } |
| para += "m=" + clusterConnectionMaxPoints + ";"; |
| para += "tauConn=" + tauConnection + ";"; |
| |
| return para; |
| } |
| } |