blob: 39488aef3f156ac44a01036d24167ed0b0acbd89 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.text.similarity;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
* Measures the Cosine similarity of two vectors of an inner product space and
* compares the angle between them.
*
* <p>
* For further explanation about the Cosine Similarity, refer to
* http://en.wikipedia.org/wiki/Cosine_similarity.
* </p>
*
* @since 1.0
*/
public class CosineSimilarity {
/**
* Calculates the cosine similarity for two given vectors.
*
* @param leftVector left vector
* @param rightVector right vector
* @return cosine similarity between the two vectors
*/
public Double cosineSimilarity(final Map<CharSequence, Integer> leftVector,
final Map<CharSequence, Integer> rightVector) {
if (leftVector == null || rightVector == null) {
throw new IllegalArgumentException("Vectors must not be null");
}
final Set<CharSequence> intersection = getIntersection(leftVector, rightVector);
final double dotProduct = dot(leftVector, rightVector, intersection);
double d1 = 0.0d;
for (final Integer value : leftVector.values()) {
d1 += Math.pow(value, 2);
}
double d2 = 0.0d;
for (final Integer value : rightVector.values()) {
d2 += Math.pow(value, 2);
}
double cosineSimilarity;
if (d1 <= 0.0 || d2 <= 0.0) {
cosineSimilarity = 0.0;
} else {
cosineSimilarity = dotProduct / (Math.sqrt(d1) * Math.sqrt(d2));
}
return cosineSimilarity;
}
/**
* Returns a set with strings common to the two given maps.
*
* @param leftVector left vector map
* @param rightVector right vector map
* @return common strings
*/
private Set<CharSequence> getIntersection(final Map<CharSequence, Integer> leftVector,
final Map<CharSequence, Integer> rightVector) {
final Set<CharSequence> intersection = new HashSet<>(leftVector.keySet());
intersection.retainAll(rightVector.keySet());
return intersection;
}
/**
* Computes the dot product of two vectors. It ignores remaining elements. It means
* that if a vector is longer than other, then a smaller part of it will be used to compute
* the dot product.
*
* @param leftVector left vector
* @param rightVector right vector
* @param intersection common elements
* @return the dot product
*/
private double dot(final Map<CharSequence, Integer> leftVector, final Map<CharSequence, Integer> rightVector,
final Set<CharSequence> intersection) {
long dotProduct = 0;
for (final CharSequence key : intersection) {
dotProduct += leftVector.get(key) * rightVector.get(key);
}
return dotProduct;
}
}