| package com.yahoo.labs.samoa.evaluation.measures; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2011 RWTH Aachen University, Germany |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import com.yahoo.labs.samoa.instances.Instance; |
| import com.yahoo.labs.samoa.moa.cluster.Clustering; |
| import com.yahoo.labs.samoa.moa.core.AutoExpandVector; |
| import com.yahoo.labs.samoa.moa.core.DataPoint; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| |
| /** |
| * [CMM_GTAnalysis.java] |
| * |
| * CMM: Ground truth analysis |
| * |
| * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 |
| * |
| * @author Timm jansen |
| * Data Management and Data Exploration Group, RWTH Aachen University |
| */ |
| |
| /* |
| * TODO: |
| * - try to avoid calcualting the radius multiple times |
| * - avoid the full distance map? |
| * - knn functionality in clusters |
| * - noise error |
| */ |
| public class CMM_GTAnalysis{ |
| |
| /** |
| * the given ground truth clustering |
| */ |
| private Clustering gtClustering; |
| |
| /** |
| * list of given points within the horizon |
| */ |
| private ArrayList<CMMPoint> cmmpoints; |
| |
| /** |
| * the newly calculate ground truth clustering |
| */ |
| private ArrayList<GTCluster> gt0Clusters; |
| |
| /** |
| * IDs of noise points |
| */ |
| private ArrayList<Integer> noise; |
| |
| /** |
| * total number of points |
| */ |
| private int numPoints; |
| |
| /** |
| * number of clusters of the original ground truth |
| */ |
| private int numGTClusters; |
| |
| /** |
| * number of classes of the original ground truth, in case of a |
| * micro clustering ground truth this differs from numGTClusters |
| */ |
| private int numGTClasses; |
| |
| /** |
| * number of classes after we are done with the analysis |
| */ |
| private int numGT0Classes; |
| |
| /** |
| * number of dimensions |
| */ |
| private int numDims; |
| |
| /** |
| * mapping between true cluster ID/class label of the original ground truth |
| * and the internal cluster ID/working class label. |
| * |
| * different original cluster IDs might map to the same new cluster ID due to merging of two clusters |
| */ |
| private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; |
| |
| /** |
| * log of how clusters have been merged (for debugging) |
| */ |
| private int[] mergeMap; |
| |
| /** |
| * number of non-noise points that will create an error due to the underlying clustering model |
| * (e.g. point being covered by two clusters representing different classes) |
| */ |
| private int noiseErrorByModel; |
| |
| /** |
| * number of noise points that will create an error due to the underlying clustering model |
| * (e.g. noise point being covered by a cluster) |
| */ |
| private int pointErrorByModel; |
| |
| /** |
| * CMM debug mode |
| */ |
| private boolean debug = false; |
| |
| |
| /******* CMM parameter ***********/ |
| |
| /** |
| * defines how many nearest neighbors will be used |
| */ |
| private int knnNeighbourhood = 2; |
| |
| /** |
| * the threshold which defines when ground truth clusters will be merged. |
| * set to 1 to disable merging |
| */ |
| private double tauConnection = 0.5; |
| |
| /** |
| * experimental (default: disabled) |
| * separate k for points to cluster and cluster to cluster |
| */ |
| private double clusterConnectionMaxPoints = knnNeighbourhood; |
| |
| /** |
| * experimental (default: disabled) |
| * use exponential connectivity function to model different behavior: |
| * closer points will have a stronger connection compared to the linear function. |
| * Use ConnRefXValue and ConnX to better parameterize lambda, which controls |
| * the decay of the connectivity |
| */ |
| private boolean useExpConnectivity = false; |
| private double lambdaConnRefXValue = 0.01; |
| private double lambdaConnX = 4; |
| private double lamdaConn; |
| |
| |
| /******************************************/ |
| |
| |
| /** |
| * Wrapper class for data points to store CMM relevant attributes |
| * |
| */ |
| protected class CMMPoint extends DataPoint{ |
| /** |
| * Reference to original point |
| */ |
| protected DataPoint p = null; |
| |
| /** |
| * point ID |
| */ |
| protected int pID = 0; |
| |
| |
| /** |
| * true class label |
| */ |
| protected int trueClass = -1; |
| |
| |
| /** |
| * the connectivity of the point to its cluster |
| */ |
| protected double connectivity = 1.0; |
| |
| |
| /** |
| * knn distnace within own cluster |
| */ |
| protected double knnInCluster = 0.0; |
| |
| |
| /** |
| * knn indices (for debugging only) |
| */ |
| protected ArrayList<Integer> knnIndices; |
| |
| public CMMPoint(DataPoint point, int id) { |
| //make a copy, but keep reference |
| super(point,point.getTimestamp()); |
| p = point; |
| pID = id; |
| trueClass = (int)point.classValue(); |
| } |
| |
| |
| /** |
| * Retruns the current working label of the cluster the point belongs to. |
| * The label can change due to merging of clusters. |
| * |
| * @return the current working class label |
| */ |
| protected int workclass(){ |
| if(trueClass == -1 ) |
| return -1; |
| else |
| return mapTrueLabelToWorkLabel.get(trueClass); |
| } |
| } |
| |
| |
| |
| /** |
| * Main class to model the new clusters that will be the output of the cluster analysis |
| * |
| */ |
| protected class GTCluster{ |
| /** points that are per definition in the cluster */ |
| private ArrayList<Integer> points = new ArrayList<Integer>(); |
| |
| /** a new GT cluster consists of one or more "old" GT clusters. |
| * Connected/overlapping clusters cannot be merged directly because of the |
| * underlying cluster model. E.g. for merging two spherical clusters the new |
| * cluster sphere can cover a lot more space then two separate smaller spheres. |
| * To keep the original coverage we need to keep the orignal clusters and merge |
| * them on an abstract level. */ |
| private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); |
| |
| /** current work class (changes when merging) */ |
| private int workclass; |
| |
| /** original work class */ |
| private final int orgWorkClass; |
| |
| /** original class label*/ |
| private final int label; |
| |
| /** clusters that have been merged into this cluster (debugging)*/ |
| private ArrayList<Integer> mergedWorkLabels = null; |
| |
| /** average knn distance of all points in the cluster*/ |
| private double knnMeanAvg = 0; |
| |
| /** average deviation of knn distance of all points*/ |
| private double knnDevAvg = 0; |
| |
| /** connectivity of the cluster to all other clusters */ |
| private ArrayList<Double> connections = new ArrayList<Double>(); |
| |
| |
| private GTCluster(int workclass, int label, int gtClusteringID) { |
| this.orgWorkClass = workclass; |
| this.workclass = workclass; |
| this.label = label; |
| this.clusterRepresentations.add(gtClusteringID); |
| } |
| |
| |
| /** |
| * The original class label the cluster represents |
| * @return original class label |
| */ |
| protected int getLabel(){ |
| return label; |
| } |
| |
| /** |
| * Calculate the probability of the point being covered through the cluster |
| * @param point to calculate the probability for |
| * @return probability of the point being covered through the cluster |
| */ |
| protected double getInclusionProbability(CMMPoint point){ |
| double prob = Double.MIN_VALUE; |
| //check all cluster representatives for coverage |
| for (int c = 0; c < clusterRepresentations.size(); c++) { |
| double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); |
| if(tmp_prob > prob) prob = tmp_prob; |
| } |
| return prob; |
| } |
| |
| |
| /** |
| * calculate knn distances of points within own cluster |
| * + average knn distance and average knn distance deviation of all points |
| */ |
| private void calculateKnn(){ |
| for (int p0 : points) { |
| CMMPoint cmdp = cmmpoints.get(p0); |
| if(!cmdp.isNoise()){ |
| AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); |
| |
| //calculate nearest neighbours |
| getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist,knnPointIndex); |
| |
| //TODO: What to do if we have less then k neighbours? |
| double avgKnn = 0; |
| for (int i = 0; i < knnDist.size(); i++) { |
| avgKnn+= knnDist.get(i); |
| } |
| if(knnDist.size()!=0) |
| avgKnn/=knnDist.size(); |
| cmdp.knnInCluster = avgKnn; |
| cmdp.knnIndices = knnPointIndex; |
| cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); |
| |
| knnMeanAvg+=avgKnn; |
| knnDevAvg+=Math.pow(avgKnn,2); |
| } |
| } |
| knnMeanAvg=knnMeanAvg/(double)points.size(); |
| knnDevAvg=knnDevAvg/(double)points.size(); |
| |
| double variance = knnDevAvg-Math.pow(knnMeanAvg,2.0); |
| // Due to numerical errors, small negative values can occur. |
| if (variance <= 0.0) variance = 1e-50; |
| knnDevAvg = Math.sqrt(variance); |
| |
| } |
| |
| |
| /** |
| * Calculate the connection of a cluster to this cluster |
| * @param otherCid cluster id of the other cluster |
| * @param initial flag for initial run |
| */ |
| private void calculateClusterConnection(int otherCid, boolean initial){ |
| double avgConnection = 0; |
| if(workclass==otherCid){ |
| avgConnection = 1; |
| } |
| else{ |
| AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); |
| |
| for(int p : points){ |
| CMMPoint cmdp = cmmpoints.get(p); |
| double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); |
| double connection = cmdp.connectivity * con_p_Cj; |
| if(initial){ |
| cmdp.p.setMeasureValue("Connection to C"+otherCid, con_p_Cj); |
| } |
| |
| //connection |
| if(kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size()-1)){ |
| int index = 0; |
| while(index < kmax.size() && connection < kmax.get(index)) { |
| index++; |
| } |
| kmax.add(index, connection); |
| kmaxIndexes.add(index, p); |
| if(kmax.size() > clusterConnectionMaxPoints){ |
| kmax.remove(kmax.size()-1); |
| kmaxIndexes.add(kmaxIndexes.size()-1); |
| } |
| } |
| } |
| //connection |
| for (int k = 0; k < kmax.size(); k++) { |
| avgConnection+= kmax.get(k); |
| } |
| avgConnection/=kmax.size(); |
| } |
| |
| if(otherCid<connections.size()){ |
| connections.set(otherCid, avgConnection); |
| } |
| else |
| if(connections.size() == otherCid){ |
| connections.add(avgConnection); |
| } |
| else |
| System.out.println("Something is going really wrong with the connection listing!"+knnNeighbourhood+" "+tauConnection); |
| } |
| |
| |
| /** |
| * Merge a cluster into this cluster |
| * @param mergeID the ID of the cluster to be merged |
| */ |
| private void mergeCluster(int mergeID){ |
| if(mergeID < gt0Clusters.size()){ |
| //track merging (debugging) |
| for (int i = 0; i < numGTClasses; i++) { |
| if(mergeMap[i]==mergeID) |
| mergeMap[i]=workclass; |
| if(mergeMap[i]>mergeID) |
| mergeMap[i]--; |
| } |
| GTCluster gtcMerge = gt0Clusters.get(mergeID); |
| if(debug) |
| System.out.println("Merging C"+gtcMerge.workclass+" into C"+workclass+ |
| " with Con "+connections.get(mergeID)+" / "+gtcMerge.connections.get(workclass)); |
| |
| |
| //update mapTrueLabelToWorkLabel |
| mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); |
| Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); |
| while (iterator.hasNext()) { |
| Integer key = (Integer)iterator.next(); |
| //update pointer of already merged cluster |
| int value = mapTrueLabelToWorkLabel.get(key); |
| if(value == mergeID) |
| mapTrueLabelToWorkLabel.put(key, workclass); |
| if(value > mergeID) |
| mapTrueLabelToWorkLabel.put(key, value-1); |
| } |
| |
| //merge points from B into A |
| points.addAll(gtcMerge.points); |
| clusterRepresentations.addAll(gtcMerge.clusterRepresentations); |
| if(mergedWorkLabels==null){ |
| mergedWorkLabels = new ArrayList<Integer>(); |
| } |
| mergedWorkLabels.add(gtcMerge.orgWorkClass); |
| if(gtcMerge.mergedWorkLabels!=null) |
| mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); |
| |
| gt0Clusters.remove(mergeID); |
| |
| //update workclass labels |
| for(int c=mergeID; c < gt0Clusters.size(); c++){ |
| gt0Clusters.get(c).workclass = c; |
| } |
| |
| //update knn distances |
| calculateKnn(); |
| for(int c=0; c < gt0Clusters.size(); c++){ |
| gt0Clusters.get(c).connections.remove(mergeID); |
| |
| //recalculate connection from other clusters to the new merged one |
| gt0Clusters.get(c).calculateClusterConnection(workclass,false); |
| //and from new merged one to other clusters |
| gt0Clusters.get(workclass).calculateClusterConnection(c,false); |
| } |
| } |
| else{ |
| System.out.println("Merge indices are not valid"); |
| } |
| } |
| } |
| |
| |
| /** |
| * @param trueClustering the ground truth clustering |
| * @param points data points |
| * @param enableClassMerge allow class merging (should be set to true on default) |
| */ |
| public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge){ |
| if(debug) |
| System.out.println("GT Analysis Debug Output"); |
| |
| noiseErrorByModel = 0; |
| pointErrorByModel = 0; |
| if(!enableClassMerge){ |
| tauConnection = 1.0; |
| } |
| |
| lamdaConn = -Math.log(lambdaConnRefXValue)/Math.log(2)/lambdaConnX; |
| |
| this.gtClustering = trueClustering; |
| |
| numPoints = points.size(); |
| numDims = points.get(0).numAttributes()-1; |
| numGTClusters = gtClustering.size(); |
| |
| //init mappings between work and true labels |
| mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); |
| |
| //set up base of new clustering |
| gt0Clusters = new ArrayList<GTCluster>(); |
| int numWorkClasses = 0; |
| //create label to worklabel mapping as real labels can be just a set of unordered integers |
| for (int i = 0; i < numGTClusters; i++) { |
| int label = (int)gtClustering.get(i).getGroundTruth(); |
| if(!mapTrueLabelToWorkLabel.containsKey(label)){ |
| gt0Clusters.add(new GTCluster(numWorkClasses,label,i)); |
| mapTrueLabelToWorkLabel.put(label,numWorkClasses); |
| numWorkClasses++; |
| } |
| else{ |
| gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); |
| } |
| } |
| numGTClasses = numWorkClasses; |
| |
| mergeMap = new int[numGTClasses]; |
| for (int i = 0; i < numGTClasses; i++) { |
| mergeMap[i]=i; |
| } |
| |
| //create cmd point wrapper instances |
| cmmpoints = new ArrayList<CMMPoint>(); |
| for (int p = 0; p < points.size(); p++) { |
| CMMPoint cmdp = new CMMPoint(points.get(p), p); |
| cmmpoints.add(cmdp); |
| } |
| |
| |
| //split points up into their GTClusters and Noise (according to class labels) |
| noise = new ArrayList<Integer>(); |
| for (int p = 0; p < numPoints; p++) { |
| if(cmmpoints.get(p).isNoise()){ |
| noise.add(p); |
| } |
| else{ |
| gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); |
| } |
| } |
| |
| //calculate initial knnMean and knnDev |
| for (GTCluster gtc : gt0Clusters) { |
| gtc.calculateKnn(); |
| } |
| |
| //calculate cluster connections |
| calculateGTClusterConnections(); |
| |
| //calculate point connections with own clusters |
| calculateGTPointQualities(); |
| |
| if(debug) |
| System.out.println("GT Analysis Debug End"); |
| |
| } |
| |
| /** |
| * Calculate the connection of a point to a cluster |
| * |
| * @param cmmp the point to calculate the connection for |
| * @param clusterID the corresponding cluster |
| * @return the connection value |
| */ |
| //TODO: Cache the connection value for a point to the different clusters??? |
| protected double getConnectionValue(CMMPoint cmmp, int clusterID){ |
| AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); |
| AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); |
| |
| //calculate the knn distance of the point to the cluster |
| getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); |
| |
| //TODO: What to do if we have less then k neighbors? |
| double avgDist = 0; |
| for (int i = 0; i < knnDist.size(); i++) { |
| avgDist+= knnDist.get(i); |
| } |
| //what to do if we only have a single point??? |
| if(knnDist.size()!=0) |
| avgDist/=knnDist.size(); |
| else |
| return 0; |
| |
| //get the upper knn distance of the cluster |
| double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; |
| |
| /* calculate the connectivity based on knn distance of the point within the cluster |
| and the upper knn distance of the cluster*/ |
| if(avgDist < upperKnn){ |
| return 1; |
| } |
| else{ |
| //value that should be reached at upperKnn distance |
| //Choose connection formula |
| double conn; |
| if(useExpConnectivity) |
| conn = Math.pow(2,-lamdaConn*(avgDist-upperKnn)/upperKnn); |
| else |
| conn = upperKnn/avgDist; |
| |
| if(Double.isNaN(conn)) |
| System.out.println("Connectivity NaN at "+cmmp.p.getTimestamp()); |
| |
| return conn; |
| } |
| } |
| |
| |
| /** |
| * @param cmmp point to calculate knn distance for |
| * @param k number of nearest neighbors to look for |
| * @param pointIDs list of point IDs to check |
| * @param knnDist sorted list of smallest knn distances (can already be filled to make updates possible) |
| * @param knnPointIndex list of corresponding knn indices |
| */ |
| private void getKnnInCluster(CMMPoint cmmp, int k, |
| ArrayList<Integer> pointIDs, |
| AutoExpandVector<Double> knnDist, |
| AutoExpandVector<Integer> knnPointIndex) { |
| |
| //iterate over every point in the choosen cluster, cal distance and insert into list |
| for (int p1 = 0; p1 < pointIDs.size(); p1++) { |
| int pid = pointIDs.get(p1); |
| if(cmmp.pID == pid) continue; |
| double dist = distance(cmmp,cmmpoints.get(pid)); |
| if(knnDist.size() < k || dist < knnDist.get(knnDist.size()-1)){ |
| int index = 0; |
| while(index < knnDist.size() && dist > knnDist.get(index)) { |
| index++; |
| } |
| knnDist.add(index, dist); |
| knnPointIndex.add(index,pid); |
| if(knnDist.size() > k){ |
| knnDist.remove(knnDist.size()-1); |
| knnPointIndex.remove(knnPointIndex.size()-1); |
| } |
| } |
| } |
| } |
| |
| |
| |
| /** |
| * calculate initial connectivities |
| */ |
| private void calculateGTPointQualities(){ |
| for (int p = 0; p < numPoints; p++) { |
| CMMPoint cmdp = cmmpoints.get(p); |
| if(!cmdp.isNoise()){ |
| cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); |
| cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); |
| } |
| } |
| } |
| |
| |
| |
| /** |
| * Calculate connections between clusters and merge clusters accordingly as |
| * long as connections exceed threshold |
| */ |
| private void calculateGTClusterConnections(){ |
| for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { |
| for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { |
| gt0Clusters.get(c0).calculateClusterConnection(c1, true); |
| } |
| } |
| |
| boolean changedConnection = true; |
| while(changedConnection){ |
| if(debug){ |
| System.out.println("Cluster Connection"); |
| for (int c = 0; c < gt0Clusters.size(); c++) { |
| System.out.print("C"+gt0Clusters.get(c).label+" --> "); |
| for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { |
| System.out.print(" C"+gt0Clusters.get(c1).label+": "+gt0Clusters.get(c).connections.get(c1)); |
| } |
| System.out.println(""); |
| } |
| System.out.println(""); |
| } |
| |
| double max = 0; |
| int maxIndexI = -1; |
| int maxIndexJ = -1; |
| |
| changedConnection = false; |
| for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { |
| for (int c1 = c0+1; c1 < gt0Clusters.size(); c1++) { |
| if(c0==c1) continue; |
| double min =Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); |
| if(min > max){ |
| max = min; |
| maxIndexI = c0; |
| maxIndexJ = c1; |
| } |
| } |
| } |
| if(maxIndexI!=-1 && max > tauConnection){ |
| gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); |
| if(debug) |
| System.out.println("Merging "+maxIndexI+" and "+maxIndexJ+" because of connection "+max); |
| |
| changedConnection = true; |
| } |
| } |
| numGT0Classes = gt0Clusters.size(); |
| } |
| |
| |
| /** |
| * Calculates how well the original clusters are separable. |
| * Small values indicate bad separability, values close to 1 indicate good separability |
| * @return index of seperability |
| */ |
| public double getClassSeparability(){ |
| // int totalConn = numGTClasses*(numGTClasses-1)/2; |
| // int mergedConn = 0; |
| // for(GTCluster gt : gt0Clusters){ |
| // int merged = gt.clusterRepresentations.size(); |
| // if(merged > 1) |
| // mergedConn+=merged * (merged-1)/2; |
| // } |
| // if(totalConn == 0) |
| // return 0; |
| // else |
| // return 1-mergedConn/(double)totalConn; |
| return numGT0Classes/(double)numGTClasses; |
| |
| } |
| |
| |
| /** |
| * Calculates how well noise is separable from the given clusters |
| * Small values indicate bad separability, values close to 1 indicate good separability |
| * @return index of noise separability |
| */ |
| public double getNoiseSeparability(){ |
| if(noise.isEmpty()) |
| return 1; |
| |
| double connectivity = 0; |
| for(int p : noise){ |
| CMMPoint npoint = cmmpoints.get(p); |
| double maxConnection = 0; |
| |
| //TODO: some kind of pruning possible. what about weighting? |
| for (int c = 0; c < gt0Clusters.size(); c++) { |
| double connection = getConnectionValue(npoint, c); |
| if(connection > maxConnection) |
| maxConnection = connection; |
| } |
| connectivity+=maxConnection; |
| npoint.p.setMeasureValue("MaxConnection", maxConnection); |
| } |
| |
| return 1-(connectivity / noise.size()); |
| } |
| |
| |
| /** |
| * Calculates the relative number of errors being caused by the underlying cluster model |
| * @return quality of the model |
| */ |
| public double getModelQuality(){ |
| for(int p = 0; p < numPoints; p++){ |
| CMMPoint cmdp = cmmpoints.get(p); |
| for(int hc = 0; hc < numGTClusters;hc++){ |
| if(gtClustering.get(hc).getGroundTruth() != cmdp.trueClass){ |
| if(gtClustering.get(hc).getInclusionProbability(cmdp) >= 1){ |
| if(!cmdp.isNoise()) |
| pointErrorByModel++; |
| else |
| noiseErrorByModel++; |
| break; |
| } |
| } |
| } |
| } |
| if(debug) |
| System.out.println("Error by model: noise "+noiseErrorByModel+" point "+pointErrorByModel); |
| |
| return 1-((pointErrorByModel + noiseErrorByModel)/(double) numPoints); |
| } |
| |
| |
| /** |
| * Get CMM internal point |
| * @param index of the point |
| * @return cmm point |
| */ |
| protected CMMPoint getPoint(int index){ |
| return cmmpoints.get(index); |
| } |
| |
| |
| /** |
| * Return cluster |
| * @param index of the cluster to return |
| * @return cluster |
| */ |
| protected GTCluster getGT0Cluster(int index){ |
| return gt0Clusters.get(index); |
| } |
| |
| /** |
| * Number of classes/clusters of the new clustering |
| * @return number of new clusters |
| */ |
| protected int getNumberOfGT0Classes() { |
| return numGT0Classes; |
| } |
| |
| /** |
| * Calculates Euclidian distance |
| * @param inst1 point as double array |
| * @param inst2 point as double array |
| * @return euclidian distance |
| */ |
| private double distance(Instance inst1, Instance inst2){ |
| return distance(inst1, inst2.toDoubleArray()); |
| |
| } |
| |
| /** |
| * Calculates Euclidian distance |
| * @param inst1 point as an instance |
| * @param inst2 point as double array |
| * @return euclidian distance |
| */ |
| private double distance(Instance inst1, double[] inst2){ |
| double distance = 0.0; |
| for (int i = 0; i < numDims; i++) { |
| double d = inst1.value(i) - inst2[i]; |
| distance += d * d; |
| } |
| return Math.sqrt(distance); |
| } |
| |
| /** |
| * String with main CMM parameters |
| * @return main CMM parameter |
| */ |
| public String getParameterString(){ |
| String para = ""; |
| para+="k="+knnNeighbourhood+";"; |
| if(useExpConnectivity){ |
| para+="lambdaConnX="+lambdaConnX+";"; |
| para+="lambdaConn="+lamdaConn+";"; |
| para+="lambdaConnRef="+lambdaConnRefXValue+";"; |
| } |
| para+="m="+clusterConnectionMaxPoints+";"; |
| para+="tauConn="+tauConnection+";"; |
| |
| return para; |
| } |
| } |
| |
| |