| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.apps.object_dedup; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.regex.Pattern; |
| |
| import opennlp.tools.similarity.apps.BingQueryRunner; |
| import opennlp.tools.similarity.apps.HitBase; |
| import opennlp.tools.similarity.apps.utils.LevensteinDistanceFinder; |
| import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; |
| import opennlp.tools.similarity.apps.utils.Utils; |
| import opennlp.tools.textsimilarity.TextProcessor; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /* This is a template class for deduplicator */ |
| |
| public class SimilarityAccessorBase { |
| private static final Logger LOG = LoggerFactory.getLogger(SimilarityAccessorBase.class); |
| |
| public static final int MAX_EV_TO_RECOMM = 6; |
| |
| private List<String> namesBothSides; |
| |
| private static final String[] ENGLISH_PREPOSITIONS = new String[] { "a", "aboard", "about", "above", "absent", |
| "across", "after", "against", "along", "alongside", "among", "around", "as", "at", "before", "behind", "below", |
| "beneath", "between", "beyond", "but", "by", "despite", "down", "during", "except", "excluding", "failing", |
| "following", "for", "from", "in", "including", "inside", "into", "like", "near", "next", "of", "off", "on", |
| "onto", "only", "opposite", "out", "outside", "over", "pace", "past", "per", "since", "than", "through", "and", |
| "thru", "till", "to", "toward", "under", "up", "upon", "versus", "with", "within", "you", "must", "know", |
| "when" }; |
| |
| private static final List<String> COMMON_WORDS_IN_EVENT_TITLES = Arrays.asList("community", "party", "film", |
| "music", "exhibition", "kareoke", "guitar", "quartet", "reggae", "r&b", "band", "dj ", "piano", "pray", |
| "worship", "god", "training", "class", "development", "training", "class", "course", "our", "comedy", ",fun", |
| "musical", "group", "alliance", "session", "feeding", "introduction", "school", "conversation", "learning", |
| "nursery", "unity", "trivia", "chat", "conference", "tuition", "technology", "teen", "communication", |
| "reception", "management", "beginner", "beginning", "collabora", "reuninon", "political", "course", "age", |
| "ages", "through", "grade", "networking", "workshop", "demonstration", "tuning", "program", "summit", |
| "convention", "day", "night", "one", "two", "outfest", "three", "online", "writing", "seminar", "coach", |
| ",expo", "advanced", "beginner", "intermediate", "earn", "free", "ii", "iii", "skills", "skill", "artist", |
| "summer", "winter", "autumn", "spring", "camp", "vacation", "miscrosoft", "kid", "child", "kids", "children", |
| "every", "everyone", "dancer", "dancers", "senior", "seniors", "basic", "elementary", "outfest", "2008", |
| "2009", "2010", "2011", "2012", "monday", "tuesday", "wednesday", "thirsday", "friday", "saturday", "sunday", |
| "mondays", "tuesdays", "wednesdays", "thirsdays", "fridays", "saturdays", "sundays", "men" // ? |
| ); |
| |
| private final BingQueryRunner webSearch = new BingQueryRunner(); |
| |
| private final StringDistanceMeasurer stringDistanceMeasurer = new StringDistanceMeasurer(); |
| |
| |
| public SimilarityAccessorBase() { |
| } |
| |
| |
| public void init() |
| { |
| namesBothSides = getWordsThatShouldBeOnBothSidesEvents(); |
| } |
| |
| protected List<String> removeDollarWordAndNonAlphaFromList(List<String> list) |
| { |
| List<String> result = new ArrayList<>(); |
| Pattern p = Pattern.compile("^\\$(\\d{1,3}(\\,\\d{3})*|(\\d+))(\\.\\d{2})?$"); |
| for (String w : list) |
| { |
| if (!(p.matcher(w).find()) && StringUtils.isAlphanumeric(w) && (w.length() >= 3 || !StringUtils.isAlpha(w))) |
| result.add(w); |
| } |
| return result; |
| } |
| |
| |
| public List<String> getWordsThatShouldBeOnBothSidesEvents() |
| { |
| /* |
| names.addAll(Arrays.asList(new String[] { "woman", "man", "women", "men", "womans", "mans", "womens", "mens", |
| "boy", "girl", "boys", "girls", "men's", "women's", "woman's", "ice", // for disney |
| "flight", "intermediate", "advanced", "beginner", |
| // "tour", TODO special consideration |
| "helicopter", "sexual", "junior", "jr" })); |
| */ |
| return null; |
| |
| } |
| |
| protected Boolean applySemanticNameSimilarityRule(Object es1, Object es2) { |
| |
| //TODO check attributes of objects |
| /* |
| if (!(es1.getVenueName().endsWith(es2.getVenueName()) || es2.getVenueName().endsWith(es1.getVenueName()))) |
| return false; |
| if (Math.abs(es1.getStarttime().getTime() - es2.getStarttime().getTime()) > 100000) |
| return false; |
| */ |
| |
| return true; |
| |
| } |
| |
| // this rule extract "OF" part and treats it as a whole expression |
| protected void applySubPhraseExtractionRule(List<String> name1Tokens, List<String> name2Tokens) |
| { |
| if (name1Tokens.indexOf("of") > 0 && name2Tokens.indexOf("of") > 0) |
| { |
| name1Tokens = extractMainNounPhrase(name1Tokens); |
| name2Tokens = extractMainNounPhrase(name2Tokens); |
| } |
| } |
| |
| private Boolean attemptShortTitlesSimilarityInWebSpace(String name1, String name2) |
| { |
| |
| // first delimeter processing |
| String name1v = name1.replace("'", "").replace("-", " "); |
| String name2v = name2.replace("'", "").replace("-", " "); |
| String name1vv = name1.replace("'", ""); |
| String name2vv = name2.replace("'", ""); |
| String name1vvv = name1.replace("-", " "); |
| String name2vvv = name2.replace("-", " "); |
| |
| if (name1.startsWith(name2) || name1vv.startsWith(name2) || name1.startsWith(name2v) |
| || name1.startsWith(name2vv) || name1.startsWith(name2vvv) || name1v.startsWith(name2v) |
| || name1v.startsWith(name2vv) || name2.startsWith(name1) || name2vv.startsWith(name1) |
| || name2.startsWith(name1v) || name2vvv.startsWith(name1vv) || name2.startsWith(name1vvv) |
| || name2v.startsWith(name1v) || name2v.startsWith(name1vv) || name1.endsWith(name2) |
| || name1vv.endsWith(name2) || name1.endsWith(name2v) || name1.endsWith(name2vv) || name1.endsWith(name2vvv) |
| || name1v.endsWith(name2v) || name1v.endsWith(name2vv) || name2.endsWith(name1) || name2vv.endsWith(name1) |
| || name2.endsWith(name1v) || name1vvv.endsWith(name2vv) || name2.endsWith(name1vvv) |
| || name2v.endsWith(name1v) || name2v.endsWith(name1vv)) |
| { |
| LOG.info("Found fuzzy substring of name1 and name2"); |
| return true; |
| } |
| if (name1.length() > 12 && name2.length() > 12) |
| return false; |
| |
| return areNamesSemanticallyCloseInWebSearchSpace(name1, name2, 0.8f, false).isDecision(); |
| |
| } |
| |
| public Boolean applyBothSidesRuleEvent(String name1, String name2) |
| { |
| List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); |
| List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); |
| // get unique names |
| List<String> name1TokensC = new ArrayList<>(name1Tokens), name2TokensC = new ArrayList<>( |
| name2Tokens); |
| name1TokensC.removeAll(name2Tokens); |
| name2TokensC.removeAll(name1Tokens); |
| // get all unique names |
| name1TokensC.addAll(name2TokensC); |
| |
| name1TokensC.retainAll(namesBothSides); |
| name1Tokens.retainAll(name2Tokens); |
| |
| if ((name1TokensC.size() > 0 && name1Tokens.size() < 3) || (name1TokensC.size() > 1 && name1Tokens.size() < 5)) |
| { // 'mens == men; case !(name1TokensC.size()==2 && (name1TokensC.get(0).indexOf(name1TokensC.get(1))>-1 || |
| // name1TokensC.get(1).indexOf(name1TokensC.get(0))>-1 ))){ |
| LOG.info("Found required common word present on one side and not on the other: " + name1TokensC.toString() |
| + " and less than 3 keywords overlap (or >1 common words and less than 5 overl"); |
| return false; |
| } |
| else |
| return true; |
| } |
| |
| protected List<String> tokenizeAndStem(String input) |
| { |
| |
| List<String> results = new ArrayList<>(); |
| List<String> toks = TextProcessor.fastTokenize(input.toLowerCase(), false); |
| for (String word : toks) |
| { |
| try |
| { |
| if (word.equals("theatre")) |
| word = "theater"; |
| results.add(word); |
| } |
| catch (Exception e) |
| { |
| results.add(word); |
| } |
| } |
| return results; |
| } |
| |
| protected List<String> stemList(List<String> toks) |
| { |
| |
| List<String> results = new ArrayList<>(); |
| for (String word : toks) |
| { |
| try |
| { |
| if (word.equals("theatre")) |
| word = "theater"; |
| results.add(word); |
| } |
| catch (Exception e) |
| { |
| results.add(word); |
| } |
| } |
| return results; |
| } |
| |
| public List<String> removeVenuePart(ArrayList<String> toks) |
| { |
| List<String> results = new ArrayList<>(); |
| boolean bVenuePart = false; |
| for (String word : toks) |
| { |
| // beginning of venue part |
| if (word.equals("at") || word.equals("@")) |
| bVenuePart = true; |
| // end of venue part |
| if (!StringUtils.isAlphanumeric(word) || word.startsWith("<punc")) |
| bVenuePart = false; |
| |
| if (!bVenuePart && !word.startsWith("<punc")) |
| results.add(word); |
| |
| } |
| return results; |
| } |
| |
| protected boolean isCapitalized(String lookup) |
| { |
| String[] titleWords = lookup.split(" "); |
| int count = 0; |
| for (String word : titleWords) |
| { |
| if (word.length() < 2) // '-', '|', ':' |
| break; |
| |
| if (word.equals(word.toLowerCase()) && (!Arrays.asList(ENGLISH_PREPOSITIONS).contains(word)) |
| && word.length() > 3 && StringUtils.isAlphanumeric(word)) |
| continue; // was return false; |
| if (count > 3) |
| break; |
| count++; |
| } |
| return true; |
| } |
| |
| protected List<String> extractMainNounPhrase(List<String> name1Tokens) |
| { |
| List<String> results = new ArrayList<>(); |
| int ofPos = name1Tokens.indexOf("of"); |
| List<String> ofList = name1Tokens.subList(ofPos + 1, name1Tokens.size() - 1); |
| // now iterate till next preposition towards the end of noun phrase |
| for (String preposCand : ofList) |
| { |
| if (Arrays.asList(ENGLISH_PREPOSITIONS).contains(preposCand)) |
| break; |
| results.add(preposCand); |
| } |
| return results; |
| |
| } |
| |
| public boolean verifyEventAttributesPost(List<String> name1Tokens, List<String> name2Tokens) |
| { |
| String[] attributeNamesPost = { "age", "ages", "game", "games", "grade", "grades", "level", "levels", "vs", |
| "vs.", "versus", "pottery", "competition", "contest", "skill", "skills", "day", "only", "basic", "class", |
| "completed", |
| // "tour", ? |
| "advanced", "beginner", "intermediate", "flight", "workshop", "latin", "adobe", "ballet", "dinner", |
| "breakfast", "lunch", "summer", // "canyon" |
| "tfestival", "festival", "mfestival" }; |
| try |
| { |
| for (String attr : attributeNamesPost) |
| { |
| |
| int agePos1 = name1Tokens.indexOf(attr); |
| int agePos2 = name2Tokens.indexOf(attr); |
| if (agePos1 > -1 && agePos2 > -1 && agePos1 < name1Tokens.size() - 1 |
| && agePos2 < name2Tokens.size() - 1) |
| { |
| double dist = LevensteinDistanceFinder.levensteinDistance(name1Tokens.get(agePos1 + 1), |
| name2Tokens.get(agePos2 + 1), 1, 10, 1, 10); |
| if (!name1Tokens.get(agePos1 + 1).equalsIgnoreCase(name2Tokens.get(agePos2 + 1)) |
| && (dist > 2.99 || name1Tokens.get(agePos1 + 1).length() < 4)) |
| { |
| LOG.info("Found disagreement in the attrib value for " + attr + " value = " |
| + name1Tokens.get(agePos1 + 1) + " <=> " + name2Tokens.get(agePos2 + 1)); |
| return false; |
| } |
| } |
| } |
| } |
| catch (Exception e) |
| { |
| e.printStackTrace(); |
| } |
| return true; |
| } |
| |
| public boolean verifyEventAttributesPre(List<String> name1Tokens, List<String> name2Tokens) |
| { |
| |
| String[] attributeNamesPre = { "hour", "vs", "vs.", "versus", "pottery", "program", "day", "only", |
| // dance styles followed by a param |
| "swing", "rumba", "samba", "doble", |
| "violence", // |
| // "level", |
| "class", "classes", "kid", "kids", "test", "west", "summer_camp", "session", "tfestival", "festival", |
| "mfestival" }; |
| try |
| { |
| for (String attr : attributeNamesPre) |
| { |
| int agePos1 = name1Tokens.indexOf(attr); |
| int agePos2 = name2Tokens.indexOf(attr); |
| if (agePos1 > 0 && agePos2 > 0) |
| { // not the first word is attr name |
| if (!name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 1)) |
| && (agePos1 < 2 || !name1Tokens.get(agePos1 - 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 1))) |
| && |
| // ((agePos1<2 && agePos2 <2) || !name1Tokens.get(agePos1 - |
| // 2).equalsIgnoreCase(name2Tokens.get(agePos2 - 2 ))) && |
| (agePos2 < 2 || !name1Tokens.get(agePos1 - 1).equalsIgnoreCase(name2Tokens.get(agePos2 - 2))) |
| |
| ) |
| { |
| LOG.info("Found disagreement in the attrib value for " + attr + " value = " |
| + name1Tokens.get(agePos1 - 1) + " and " + name2Tokens.get(agePos2 - 1)); |
| return false; |
| } |
| } |
| } |
| } |
| catch (Exception e) |
| { |
| e.printStackTrace(); |
| } |
| return true; |
| } |
| |
| protected boolean bDifferentGroupOneSubnameOfAnother(String name1, String name2) |
| { |
| // first check a special case that both name1 and name2 are DIFFERENT groups at last.fm |
| Map<String, Integer> map1 = null; //LastFM_APIManager.extractTagsForArtist(name1); |
| Map<String, Integer> map2 = null; //LastFM_APIManager.extractTagsForArtist(name2); |
| if (map1 != null && map2 != null && map1.size() > 0 && map2.size() > 0) |
| map1.entrySet().removeAll(map2.entrySet()); |
| if (map1.size() > 0) // same or subset of tags => different groups |
| return true; |
| |
| return false; |
| } |
| |
| public boolean applyBothSidesRule(String name1, String name2) |
| { |
| List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); |
| List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); |
| // get unique names |
| List<String> name1TokensC = new ArrayList<>(name1Tokens), name2TokensC = new ArrayList<>( |
| name2Tokens); |
| name1TokensC.removeAll(name2Tokens); |
| name2TokensC.removeAll(name1Tokens); |
| // get all unique names |
| name1TokensC.addAll(name2TokensC); |
| |
| name1TokensC.retainAll(namesBothSides); |
| if (name1TokensC.size() > 0) |
| return false; |
| else |
| return true; |
| } |
| |
| private boolean succeededMenWomenSportsRule(String name1, String name2) |
| { |
| List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); |
| List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); |
| if (name1Tokens.contains("men") || name2Tokens.contains("men") || name1Tokens.contains("women") |
| || name2Tokens.contains("women") || name1Tokens.contains("disney") || name2Tokens.contains("disney")) |
| { // all words should be the |
| // same |
| name1Tokens.removeAll(name2Tokens); |
| name1Tokens.removeAll(Arrays.asList(ENGLISH_PREPOSITIONS)); |
| name1Tokens.removeAll(Arrays.asList(COMMON_WORDS_IN_EVENT_TITLES)); |
| if (name1Tokens.size() < 1) |
| return true; |
| |
| return false; |
| } |
| else |
| return true; |
| |
| } |
| |
| private boolean succeededSpecialGroupsSymphoniesRule(String name1, String name2) |
| { |
| List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), false); |
| List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), false); |
| if (name1Tokens.contains("orchestra") || name2Tokens.contains("symphony") || name2Tokens.contains("orchestra") |
| || name1Tokens.contains("symphony") || name2Tokens.contains("band") || name1Tokens.contains("band") |
| || name2Tokens.contains("trio") || name1Tokens.contains("trio") || name1Tokens.contains("soleil") |
| || name2Tokens.contains("soleil") || name1Tokens.contains("disney") || name2Tokens.contains("disney") |
| || name1Tokens.contains("lang") || name2Tokens.contains("lang")) // special group 'lang lang' |
| { // all words should be the |
| // same |
| List<String> name1TokensClone = new ArrayList<>(name1Tokens); |
| name1Tokens.removeAll(name2Tokens); |
| name2Tokens.removeAll(name1TokensClone); |
| name1Tokens.addAll(name2Tokens); |
| name1Tokens.removeAll(Arrays.asList(ENGLISH_PREPOSITIONS)); |
| // name1Tokens.removeAll(Arrays.asList(this.commonWordsInEventTitles)); |
| if (name1Tokens.size() < 1) |
| return true; |
| |
| return false; |
| } |
| else |
| return true; |
| |
| } |
| |
| public int getAttemptedNameMerge(String name1, String name2) |
| { |
| name1 = name1.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); |
| // suspected word merge if higher case is in the middle of word |
| name2 = name2.replaceAll("[a-z][A-Z]", "$0&$0").replaceAll(".&.", " "); |
| |
| name1 = name1.toLowerCase(); |
| name2 = name2.toLowerCase(); |
| if (name1.equals(name2) || name1.startsWith(name2) || name2.startsWith(name1) || name1.endsWith(name2) |
| || name1.endsWith(name2) || name1.contains(name2) || name1.contains(name2)) // ?? |
| return 2; |
| String name2r = name2.replace(" ", ""); |
| if (name1.equals(name2r) || name1.startsWith(name2r) || name1.startsWith(name2r) || name1.endsWith(name2r) |
| || name1.endsWith(name2r)) |
| return 1; |
| String name1r = name1.replace(" ", ""); |
| if (name1r.equals(name2r) || name1r.startsWith(name2r) || name1r.startsWith(name2) || name1r.endsWith(name2r) |
| || name1r.endsWith(name2r) || name2r.equals(name1r) || name2r.startsWith(name1r) |
| || name2r.startsWith(name1) || name2r.endsWith(name1r) || name2r.endsWith(name2) |
| |
| ) |
| return 1; |
| |
| if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.95) |
| return 2; |
| if (stringDistanceMeasurer.measureStringDistance(name1, name2) > 0.70) |
| return 1; |
| return 0; |
| } |
| |
| private String normalizeGenderAndOtherAttributes(String name1) |
| { |
| name1 = Utils.convertToASCII(name1.replace("/", " ").replace("w/", "with ")).replace('!', ' ').toLowerCase(); |
| |
| name1 = name1.replace("woman", "women").replace("womans", "women").replace("womens", "women") |
| .replace("women's", "women").replace("woman's", "women"); |
| name1 = name1.replace(" man ", " men ").replace(" mans ", " men ").replace(" men's ", " men ") |
| .replace(" man's ", " men ").replace(" mens ", " men ").replace("summer camp", "summer_camp") |
| .replace("gaea theatre festival", "tfestival"); // need regexp for this |
| return name1; |
| } |
| |
| /* |
| * Main semantic similarity function which applies boundary cases rule and focus on web mining rule The main |
| * criteria for a commonality between titles: to form an entity, searchable on the web |
| */ |
| public DedupResult areNamesSemanticallyCloseWebMineCommonPart(String name1, String name2, String venue) |
| { |
| // normalize gender |
| name1 = normalizeGenderAndOtherAttributes(name1); |
| name2 = normalizeGenderAndOtherAttributes(name2); |
| |
| Boolean bShortTitlesSimilarInWebSpace = attemptShortTitlesSimilarityInWebSpace(name1, name2); |
| if (bShortTitlesSimilarInWebSpace) |
| return new DedupResult("Accepted as short title by web mining", 2, true); |
| |
| StringBuilder reason = new StringBuilder(); |
| List<String> venueToks = removeVenuePart(TextProcessor.fastTokenize(venue.toLowerCase(), false)); |
| |
| LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); |
| // convert titles into token lists |
| List<String> name1Tokens = removeVenuePart(TextProcessor.fastTokenize(name1.toLowerCase(), true)); |
| List<String> name2Tokens = removeVenuePart(TextProcessor.fastTokenize(name2.toLowerCase(), true)); |
| // applySubPhraseExtractionRule() |
| boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) |
| && verifyEventAttributesPre(name1Tokens, name2Tokens); |
| if (!bSameAttrib) |
| { |
| LOG.info("similar events but different attributes"); |
| return new DedupResult("similar events but different attributes", 0, false); |
| } |
| |
| boolean bothSodesSuccess = applyBothSidesRuleEvent(name1, name2); |
| if (!bothSodesSuccess) |
| { |
| return new DedupResult("Failed common words test for sports", 0, false); |
| } |
| |
| float dist = (float) LevensteinDistanceFinder.levensteinDistance(name1, name2, 1, 10, 1, 10); |
| if (dist < 5.1) |
| { |
| LOG.info("Found low LevensteinDistance for name1 and name2"); |
| return new DedupResult("Found low LevensteinDistance", 2, true); |
| } |
| |
| int nameMergeScore = getAttemptedNameMerge(name1, name2); |
| if (nameMergeScore > 0) |
| { |
| LOG.info("Found low NameMerge Distance for name1 and name2"); |
| return new DedupResult("Found low NameMerge Distance", 2, true); |
| } |
| |
| // todo take into account order |
| // form common sub-list of tokens |
| name1Tokens.retainAll(name2Tokens); |
| name1Tokens.removeAll(venueToks); |
| |
| name1Tokens.removeAll(COMMON_WORDS_IN_EVENT_TITLES); |
| name1Tokens.removeAll(Arrays.asList(ENGLISH_PREPOSITIONS)); |
| name1Tokens = removeDollarWordAndNonAlphaFromList(name1Tokens); |
| // todo : to use full string measure |
| // boundary case: too many words => just do counts |
| float commonPortion = (float) name1Tokens.size() / (float) name2Tokens.size(); |
| if (commonPortion > 0.8 || name1Tokens.size() >= 4) |
| { // after typical |
| // title words |
| // are revomed 4 |
| // looks OK |
| LOG.info("Accepted since substantial common part"); |
| return new DedupResult("Accepted since substantial common part", Math.max((int) (commonPortion * 5.0), 2), |
| true); |
| } |
| // boundary case: no overlap |
| if (name1Tokens.size() < 1) |
| { |
| LOG.info("Rejected since nothing in common"); |
| return new DedupResult("Rejected since nothing in common", 0, false); |
| } |
| // get from list of tokens back to words to get search expression |
| String entityExpression = name1Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') |
| .replace(" ", " ").trim(); |
| /* |
| * // now try name merge reduced strings String entityExpression1 = name1TokensC.toString().replace('[', |
| * ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); String entityExpression2 = |
| * name2Tokens.toString().replace('[', ' ').replace(']', ' ').replace(',', ' ') .replace(" ", " ").trim(); |
| * |
| * nameMergeScore = getAttemptedNameMerge(entityExpression1, entityExpression2); if (nameMergeScore>0){ |
| * LOG.info("Found low NameMerge Distance for REDUCED name1 and name2"); return new |
| * DedupResult("Found low NameMerge Distance REDUCED", 2, true); |
| * |
| * } |
| */ |
| |
| // Before doing web mining, make sure overlap between titles is NOT a |
| // set of common english words (use the vocabulary) |
| // if all words are common, then NOT an entity |
| if (name1Tokens.size() < 2) |
| { |
| boolean bCommonEnglishWord = false; |
| for (String word : name1Tokens) |
| { |
| // if (stopList.isCommonWord(word) /*&& mostFrequent1000Words.isMostFrequent1000Word(word)*/) |
| // bCommonEnglishWord = true; |
| } |
| |
| if (bCommonEnglishWord) |
| { |
| LOG.info("Rejected common entity: common word = " + entityExpression); |
| return new DedupResult("Rejected since common entity is common English word = " + entityExpression, 0, |
| false); |
| } |
| } |
| // accept common expression |
| LOG.info("Formed common entity = " + entityExpression); |
| reason.append("Formed common entity = ").append(entityExpression).append("\n"); |
| // now go to the web / bing api with this common expression |
| List<HitBase> searchResult = webSearch.runSearch(entityExpression); |
| float entityScore = 0f; |
| if (searchResult != null) |
| { |
| int count = 0; |
| for (HitBase item : searchResult) |
| { |
| String lookup = item.getTitle(); |
| LOG.info("Bing hit title = '" + lookup + "'"); |
| reason.append("Bing hit title = '").append(lookup).append("'\n"); |
| if (count > 4) |
| break; |
| count++; |
| // if occurrence is not capitalized then rejected, do not take |
| // into account in score |
| if (!isCapitalized(lookup)) |
| { |
| LOG.info("Rejected hit title since not capitalized"); |
| reason.append("Rejected hit title since not capitalized\n"); |
| continue; |
| } |
| |
| /* |
| * if (lookup.indexOf('-')>0 ){ lookup = lookup.split("-")[0]; } |
| */ |
| // now compute overlap between what found on the web for hit's |
| // title and the common expression between events |
| List<String> lookupTokens = tokenizeAndStem(lookup); |
| lookupTokens.retainAll(stemList(name1Tokens)); |
| if (lookupTokens.size() >= name1Tokens.size()) |
| // increment score if found hit title is acceptable |
| entityScore += 1.0; |
| else |
| { |
| LOG.info("Found hit title " + lookupTokens + " does not cover comonality expr = " + name1Tokens); |
| entityScore += 0.25; |
| |
| } |
| |
| } |
| } |
| return new DedupResult(reason.toString(), (int) entityScore, entityScore > 1.0); |
| } |
| |
| public DedupResult areNamesSemanticallyCloseInWebSearchSpace(String name1, String name2, Float thresh, boolean bStem) |
| { |
| |
| if (thresh == null || thresh == 0f) |
| { |
| thresh = 0.8f; |
| } |
| |
| // normalize gender |
| name1 = normalizeGenderAndOtherAttributes(name1); |
| name2 = normalizeGenderAndOtherAttributes(name2); |
| |
| StringBuilder reason = new StringBuilder(); |
| |
| boolean bSportsOrOrchestra = !succeededMenWomenSportsRule(name1, name2); |
| if (bSportsOrOrchestra) |
| return new DedupResult("Sports rule: different teams or teams of different venues", 0, false); |
| |
| bSportsOrOrchestra = !succeededSpecialGroupsSymphoniesRule(name1, name2); |
| if (bSportsOrOrchestra) |
| return new DedupResult("SpecialGroupsSymphoniesRule: different circus/band", 0, false); |
| |
| LOG.info("\nComputing similarity between name = '" + name1 + "' and name = '" + name2 + "'"); |
| |
| List<String> name1Tokens = TextProcessor.fastTokenize(name1.toLowerCase(), true); |
| List<String> name2Tokens = TextProcessor.fastTokenize(name2.toLowerCase(), true); |
| boolean bSameAttrib = verifyEventAttributesPost(name1Tokens, name2Tokens) |
| && verifyEventAttributesPre(name1Tokens, name2Tokens); |
| if (!bSameAttrib) |
| { |
| LOG.info("similar events but different attributes"); |
| return new DedupResult("similar events but different attributes", 0, false); |
| } |
| |
| List<HitBase> searchResult1 = webSearch.runSearch(name1); |
| List<HitBase> searchResult2 = webSearch.runSearch(name2); |
| int score = 0; |
| if (searchResult1 != null && searchResult2 != null) |
| { |
| for (HitBase item1 : searchResult1) |
| { |
| if (item1.getUrl().contains("myspace") || item1.getUrl().contains("wiki")) |
| continue; |
| for (HitBase item2 : searchResult2) |
| { |
| String lookup1 = item1.getTitle().replace("Facebook", "").replace("LinkedIn", "") |
| .replace("MySpace", ""); |
| String lookup2 = item2.getTitle().replace("Facebook", "").replace("LinkedIn", "") |
| .replace("MySpace", ""); |
| double d; |
| if (bStem) |
| d = stringDistanceMeasurer.measureStringDistance(lookup1, lookup2); |
| else |
| d = stringDistanceMeasurer.measureStringDistanceNoStemming(lookup1, lookup2); |
| if (d > thresh) // 0.8) |
| { |
| |
| reason.append("Found common search result title for group names '").append(lookup1).append(" < > ").append(lookup2).append(" sim = ").append(d).append("\n"); |
| LOG.info(("Found common search result title for group names '" + lookup1 + " < > " + lookup2 |
| + " sim = " + d)); |
| score++; |
| } |
| |
| } |
| } |
| } |
| |
| boolean bothSidesSuccess = applyBothSidesRule(name1, name2); |
| if (!bothSidesSuccess) |
| { |
| score = 1; |
| reason.append("Failed common words test for sports"); |
| } |
| if (score > 0) |
| { |
| boolean bDifferentGroup = bDifferentGroupOneSubnameOfAnother(name1, name2); |
| if (bDifferentGroup) |
| { |
| score = 1; |
| reason.append("Failed common words test for sports"); |
| } |
| } |
| return new DedupResult(reason.toString(), score, score > 1); |
| } |
| } |