// blob: 44541430ef70a2772e8d58b97dcabed759affb07 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
 * Language profile based on ngram counts.
 * <p>
 * A profile records how many times each ngram of a fixed length has been
 * added to it, together with the total number of occurrences. Two profiles
 * with the same ngram length can be compared with
 * {@link #distance(LanguageProfile)}.
 *
 * @since Apache Tika 0.5
 */
public class LanguageProfile {

    /** Ngram length used by the constructors that do not take a length. */
    public static final int DEFAULT_NGRAM_LENGTH = 3;

    /** The exact length every ngram added to this profile must have. */
    private final int length;

    /**
     * The ngrams that make up this profile.
     */
    private final Map<String, Counter> ngrams =
            new HashMap<String, Counter>();

    /**
     * The sum of all ngram counts in this profile.
     * Used to calculate relative ngram frequency.
     */
    private long count = 0;

    /**
     * Mutable occurrence counter stored as the map value for one ngram.
     * Declared static so that the (potentially many) counters do not each
     * carry a hidden reference to the enclosing profile.
     */
    private static final class Counter {
        private long count = 0;

        @Override
        public String toString() {
            return Long.toString(count);
        }
    }

    /**
     * Creates an empty profile for ngrams of the given length.
     *
     * @param length the length of the ngrams this profile accepts
     */
    public LanguageProfile(int length) {
        this.length = length;
    }

    /**
     * Creates an empty profile for ngrams of
     * {@link #DEFAULT_NGRAM_LENGTH} characters.
     */
    public LanguageProfile() {
        this(DEFAULT_NGRAM_LENGTH);
    }

    /**
     * Creates a profile for ngrams of the given length and feeds the given
     * text through a {@link ProfilingWriter} attached to this profile.
     *
     * @param content the text to profile
     * @param length the length of the ngrams this profile accepts
     */
    public LanguageProfile(String content, int length) {
        this(length);

        // NOTE(review): the writer is presumably what splits the text into
        // ngrams and calls add() on this profile - confirm in ProfilingWriter.
        ProfilingWriter writer = new ProfilingWriter(this);
        char[] ch = content.toCharArray();
        writer.write(ch, 0, ch.length);
    }

    /**
     * Creates a profile of the given text using ngrams of
     * {@link #DEFAULT_NGRAM_LENGTH} characters.
     *
     * @param content the text to profile
     */
    public LanguageProfile(String content) {
        this(content, DEFAULT_NGRAM_LENGTH);
    }

    /**
     * Returns the sum of all ngram counts in this profile.
     *
     * @return total number of ngram occurrences added so far
     */
    public long getCount() {
        return count;
    }

    /**
     * Returns the number of occurrences recorded for the given ngram.
     *
     * @param ngram the ngram to look up
     * @return the recorded count, or 0 if the ngram has never been added
     */
    public long getCount(String ngram) {
        Counter counter = ngrams.get(ngram);
        if (counter != null) {
            return counter.count;
        } else {
            return 0;
        }
    }

    /**
     * Adds a single occurrence of the given ngram to this profile.
     *
     * @param ngram the ngram
     * @throws IllegalArgumentException if the ngram does not have the
     *         length this profile was created with
     */
    public void add(String ngram) {
        add(ngram, 1);
    }

    /**
     * Adds multiple occurrences of the given ngram to this profile.
     *
     * @param ngram the ngram
     * @param count number of occurrences to add
     * @throws IllegalArgumentException if the ngram does not have the
     *         length this profile was created with
     */
    public void add(String ngram, long count) {
        if (length != ngram.length()) {
            throw new IllegalArgumentException(
                    "Unable to add an ngram of incorrect length: "
                    + ngram.length() + " != " + length);
        }

        Counter counter = ngrams.get(ngram);
        if (counter == null) {
            counter = new Counter();
            ngrams.put(ngram, counter);
        }
        // Keep the per-ngram count and the profile-wide total in step.
        counter.count += count;
        this.count += count;
    }

    /**
     * Calculates the geometric distance between this and the given
     * other language profile.
     * <p>
     * Each profile is treated as a vector of relative ngram frequencies
     * (ngram count divided by the profile's total count); the result is the
     * square root of the summed squared frequency differences over every
     * ngram that occurs in either profile.
     *
     * @param that the other language profile
     * @return distance between the profiles
     * @throws IllegalArgumentException if the two profiles were created
     *         with different ngram lengths
     */
    public double distance(LanguageProfile that) {
        if (length != that.length) {
            throw new IllegalArgumentException(
                    "Unable to calculate distance of language profiles"
                    + " with different ngram lengths: "
                    + that.length + " != " + length);
        }

        double sumOfSquares = 0.0;
        // An empty profile has a total of 0; use 1 as the divisor instead
        // so that all of its frequencies come out as 0 rather than NaN.
        double thisCount = Math.max(this.count, 1.0);
        double thatCount = Math.max(that.count, 1.0);

        // Union of the ngrams seen by either profile.
        Set<String> allNgrams = new HashSet<String>();
        allNgrams.addAll(this.ngrams.keySet());
        allNgrams.addAll(that.ngrams.keySet());
        for (String ngram : allNgrams) {
            double thisFrequency = this.getCount(ngram) / thisCount;
            double thatFrequency = that.getCount(ngram) / thatCount;
            double difference = thisFrequency - thatFrequency;
            sumOfSquares += difference * difference;
        }

        return Math.sqrt(sumOfSquares);
    }

    /**
     * Returns the string form of the underlying ngram-to-count map.
     *
     * @return string representation of the recorded ngram counts
     */
    @Override
    public String toString() {
        return ngrams.toString();
    }
}