apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java - opennlp-sandbox - Git at Google

 /*
  * Copyright 2013 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.BaseLink;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
 import opennlp.tools.ngram.NGramGenerator;
 import opennlp.tools.util.Span;

 /**
  *
  * Generates scores for string comparisons.
  */
 public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {

   @Override
   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
     for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
       for (BaseLink link : linkedSpan.getLinkedEntries()) {
         Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
         link.getScoreMap().put("dice", dice);
         Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
         link.getScoreMap().put("levenshtein", ld);
       }
     }


   }

   /**
    * Generates a score based on an overlap of nGrams between two strings using
    * the DiceCoefficient technique.
    *
    * @param s1     first string
    * @param s2     second string
    * @param nGrams number of chars in each gram
    * @return
    */
   public double getDiceCoefficient(String s1, String s2, int nGrams) {
     if (s1.equals("") || s1.equals("")) {
       return 0d;
     }
     List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
     List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");

     Set<String> overlap = new HashSet<String>(s1Grams);
     overlap.retainAll(s2Grams);
     double totcombigrams = overlap.size();

     return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
   }

   private int minimum(int a, int b, int c) {
     return Math.min(Math.min(a, b), c);
   }

   public int getLevenshteinDistance(CharSequence str1,
           CharSequence str2) {
     int[][] distance = new int[str1.length() + 1][str2.length() + 1];

     for (int i = 0; i <= str1.length(); i++) {
       distance[i][0] = i;
     }
     for (int j = 1; j <= str2.length(); j++) {
       distance[0][j] = j;
     }

     for (int i = 1; i <= str1.length(); i++) {
       for (int j = 1; j <= str2.length(); j++) {
         distance[i][j] = minimum(
                 distance[i - 1][j] + 1,
                 distance[i][j - 1] + 1,
                 distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
       }
     }

     return distance[str1.length()][str2.length()];
   }
 }
	/*
	* Copyright 2013 The Apache Software Foundation.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;

	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;
	import opennlp.tools.entitylinker.EntityLinkerProperties;
	import opennlp.tools.entitylinker.domain.BaseLink;
	import opennlp.tools.entitylinker.domain.LinkedSpan;
	import opennlp.tools.ngram.NGramGenerator;
	import opennlp.tools.util.Span;

	/**
	*
	* Generates scores for string comparisons.
	*/
	public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {

	@Override
	public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
	for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
	for (BaseLink link : linkedSpan.getLinkedEntries()) {
	Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
	link.getScoreMap().put("dice", dice);
	Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
	link.getScoreMap().put("levenshtein", ld);
	}
	}


	}

	/**
	* Generates a score based on an overlap of nGrams between two strings using
	* the DiceCoefficient technique.
	*
	* @param s1 first string
	* @param s2 second string
	* @param nGrams number of chars in each gram
	* @return
	*/
	public double getDiceCoefficient(String s1, String s2, int nGrams) {
	if (s1.equals("") \|\| s1.equals("")) {
	return 0d;
	}
	List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
	List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");

	Set<String> overlap = new HashSet<String>(s1Grams);
	overlap.retainAll(s2Grams);
	double totcombigrams = overlap.size();

	return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
	}

	private int minimum(int a, int b, int c) {
	return Math.min(Math.min(a, b), c);
	}

	public int getLevenshteinDistance(CharSequence str1,
	CharSequence str2) {
	int[][] distance = new int[str1.length() + 1][str2.length() + 1];

	for (int i = 0; i <= str1.length(); i++) {
	distance[i][0] = i;
	}
	for (int j = 1; j <= str2.length(); j++) {
	distance[0][j] = j;
	}

	for (int i = 1; i <= str1.length(); i++) {
	for (int j = 1; j <= str2.length(); j++) {
	distance[i][j] = minimum(
	distance[i - 1][j] + 1,
	distance[i][j - 1] + 1,
	distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
	}
	}

	return distance[str1.length()][str2.length()];
	}
	}