| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitycomention.impl; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| import java.util.Set; |
| import java.util.SortedMap; |
| import java.util.TreeMap; |
| |
| import org.apache.clerezza.rdf.core.Language; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.PlainLiteral; |
| import org.apache.clerezza.rdf.core.Resource; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; |
| import org.apache.stanbol.commons.indexedgraph.IndexedMGraph; |
| import org.apache.stanbol.enhancer.engines.entitylinking.Entity; |
| import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher; |
| import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer; |
| import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| /** |
| * EntitySearch implementation that does hold Entity information of mentioned |
| * Entities in memory. |
| * @author Rupert Westenthaler |
| * |
| */ |
| public class InMemoryEntityIndex implements EntitySearcher { |
| |
| private final Logger log = LoggerFactory.getLogger(InMemoryEntityIndex.class); |
| |
| protected final LabelTokenizer tokenizer; |
| //Holds Entity data |
| private SortedMap<String,Collection<Entity>> index = new TreeMap<String,Collection<Entity>>(String.CASE_INSENSITIVE_ORDER); |
| private Map<UriRef,Entity> entities = new HashMap<UriRef,Entity>(); |
| private Set<String> indexLanguages; |
| protected String language; |
| protected UriRef nameField; |
| |
| |
| public InMemoryEntityIndex(LabelTokenizer tokenizer, UriRef nameField, String...languages) { |
| this.indexLanguages = languages == null || languages.length < 1 ? |
| Collections.singleton((String)null) : |
| new HashSet<String>(Arrays.asList(languages)); |
| this.language = languages == null || languages.length < 1 ? null : |
| languages[0]; |
| this.tokenizer = tokenizer; |
| this.nameField = nameField; |
| } |
| |
| |
| public void addEntity(Entity entity){ |
| if(log.isDebugEnabled()){ |
| log.debug(" > register {}",entity); |
| } |
| entities.put(entity.getUri(), entity); |
| Iterator<PlainLiteral> labels = entity.getText(nameField); |
| while(labels.hasNext()){ |
| PlainLiteral label = labels.next(); |
| String lang = label.getLanguage() == null ? null : label.getLanguage().toString(); |
| if(indexLanguages.contains(lang)){ |
| for(String token : tokenizer.tokenize(label.getLexicalForm(),null)){ |
| token = token.toLowerCase(Locale.ROOT); |
| Collection<Entity> values = index.get(token); |
| if(values == null){ |
| values = new ArrayList<Entity>(); |
| index.put(token, values); |
| } |
| values.add(entity); |
| } |
| } //else ignore labels in other languages |
| } |
| |
| } |
| |
| @Override |
| public Entity get(UriRef id, Set<UriRef> includeFields, String...languages) throws IllegalStateException { |
| return entities.get(id); |
| } |
| |
| @Override |
| public Collection<? extends Entity> lookup(UriRef field, |
| Set<UriRef> includeFields, |
| List<String> search, String[] languages, |
| Integer numResults, Integer offset) throws IllegalStateException { |
| //this assumes that |
| assert nameField.equals(field); //the nameField is the field |
| assert Arrays.asList(languages).contains(language); //the parsed languages include the language |
| //NOTES: |
| // We can ignore the following parameters |
| // * includeFields: as we will return the Entities as added to the index |
| |
| //The Syntax requires to |
| // * AND over the tokenized elements of the search List |
| // * OR over the elements in the search |
| // * Elements that do match more search elements need to be ranked first |
| Map<Entity, int[]> results = new HashMap<Entity,int[]>(); |
| for(String qe : search){ |
| Set<Entity> qeResult = join(tokenizer.tokenize(qe, language)); |
| for(Entity e : qeResult){ |
| int[] count = results.get(e); |
| if(count != null){ |
| count[0] = count[0]+qe.length(); |
| } else { |
| results.put(e, new int[]{qe.length()}); |
| } |
| } |
| } |
| @SuppressWarnings("unchecked") //TODO how to create generic arrays |
| Entry<Entity,int[]>[] resultArray = results.entrySet().toArray(new Entry[results.size()]); |
| int index; |
| if(offset != null && offset.intValue() > 0){ |
| index = offset.intValue(); |
| } else { |
| index = 0; |
| } |
| if(index >= resultArray.length){ //no more results |
| return Collections.emptyList(); |
| } |
| //final ranking |
| Arrays.sort(resultArray, RESULT_SCORE_COMPARATOR); |
| List<Entity> resultList = new ArrayList<Entity>(Math.min(numResults+3, (resultArray.length-index))); |
| int lastScore = -1; |
| boolean done = false; |
| //start at the parsed offset |
| for(; index < resultArray.length && !done; index++){ |
| if(index < numResults){ |
| resultList.add(resultArray[index].getKey()); |
| if(index == (numResults - 1)){ //memorize the score of the last included |
| lastScore = resultArray[index].getValue()[0]; |
| } |
| } else if (lastScore == resultArray[index].getValue()[0]){ |
| //include additional results with the same score |
| resultList.add(resultArray[index].getKey()); |
| } else { //cut of |
| done = true; |
| } |
| } |
| return resultList; |
| } |
| |
| private static final Comparator<Collection<?>> COLLECTION_SIZE_COMPARATOR = new Comparator<Collection<?>>() { |
| |
| @Override |
| public int compare(Collection<?> c1, Collection<?> c2) { |
| return c1 == null && c2 == null ? 0 : |
| c1 == null ? -1 : c2 == null ? 1 : //null values last |
| c2.size() - c1.size(); //lowest size first |
| } |
| |
| }; |
| private static final Comparator<Entry<Entity,int[]>> RESULT_SCORE_COMPARATOR = new Comparator<Entry<Entity,int[]>>() { |
| |
| @Override |
| public int compare(Entry<Entity,int[]> e1, Entry<Entity,int[]> e2) { |
| return e1 == null && e2 == null ? 0 : |
| e1 == null ? -1 : e2 == null ? 1 : //null values last |
| e1.getValue()[0] - e2.getValue()[0]; //highest score first! |
| }}; |
| |
| /** |
| * Searches for Elements that do contain all the parsed Query Tokens |
| * @param queryTokens the query tokens. MUST NOT be NULL, empty or contain |
| * any NULL or empty string as element |
| * @return matching entities or an empty Set if none. |
| */ |
| private Set<Entity> join(String...queryTokens) { |
| @SuppressWarnings("unchecked") //TODO: how to create a generic typed array |
| Collection<Entity>[] tokenResults = new Collection[queryTokens.length]; |
| for(int i=0;i<queryTokens.length;i++){ |
| Collection<Entity> tokenResult = index.get(queryTokens[i].toLowerCase(Locale.ROOT)); |
| if(tokenResult == null || tokenResult.isEmpty()){ |
| return Collections.emptySet(); |
| } |
| tokenResults[i] = tokenResult; |
| } |
| Set<Entity> join = new HashSet<Entity>(tokenResults[0]); |
| if(tokenResults.length == 1){ |
| return join; |
| } //else we need to join the single results |
| |
| //we want to join the shortest results first |
| Arrays.sort(tokenResults,COLLECTION_SIZE_COMPARATOR); |
| for(int i = 1; i < tokenResults.length && !join.isEmpty(); i++){ |
| Set<Entity> old = join; |
| join = new HashSet<Entity>(); //new set to add all elements |
| for(Iterator<Entity> it = tokenResults[i].iterator(); it.hasNext() && !old.isEmpty();){ |
| Entity e = it.next(); |
| if(old.remove(e)){ |
| join.add(e); |
| } |
| } |
| } |
| return join; |
| } |
| |
| |
| @Override |
| public boolean supportsOfflineMode() { |
| return true; |
| } |
| |
| |
| @Override |
| public Integer getLimit() { |
| return null; |
| } |
| |
| @Override |
| public Map<UriRef,Collection<Resource>> getOriginInformation() { |
| return Collections.emptyMap(); |
| } |
| } |