| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.entitylinking.impl; |
| |
| import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION; |
| |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig; |
| import org.apache.stanbol.enhancer.nlp.model.Chunk; |
| import org.apache.stanbol.enhancer.nlp.model.Section; |
| import org.apache.stanbol.enhancer.nlp.model.Span; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
/**
 * Collects the {@link TokenData} for all {@link Token}s enclosed by a
 * {@link Section} (typically a sentence) and applies the token level
 * linking rules configured via the {@link LanguageProcessingConfig}.
 * As a result every token is classified as <i>linkable</i> (may trigger an
 * entity search) and/or <i>matchable</i> (may take part in label matching).
 * Overlapping processable {@link Chunk}s encountered while iterating over
 * the enclosed spans are merged into a single active chunk.
 */
public class SectionData {

    private static final Logger log = LoggerFactory.getLogger(SectionData.class);

    /**
     * The section
     */
    public final Section section;
    /**
     * Holds the {@link Token}s of the current {@link #section}
     * to allow fast index based access.
     */
    private List<TokenData> tokens = new ArrayList<TokenData>(64);
    /**
     * If a linkable token is present in this section
     */
    private boolean hasLinkableToken = false;

    /**
     * Creates the token data for all spans of the parsed types enclosed by
     * the parsed section and applies the token level linking rules.
     * @param tpc the language specific text processing configuration used to
     * decide if tokens/chunks are linkable, matchable and/or processable
     * @param section the section (e.g. a sentence) whose enclosed spans are
     * processed
     * @param enclosedSpanTypes the {@link SpanTypeEnum span types} to request
     * from {@link Section#getEnclosed(Set)}; only {@link SpanTypeEnum#Chunk}
     * and {@link SpanTypeEnum#Token} are acted on here
     * @param isUnicaseLanguage if the language does not distinguish upper and
     * lower case; deactivates the upper-case based rules for tokens with
     * unknown POS tags (see STANBOL-1049)
     */
    public SectionData(LanguageProcessingConfig tpc, Section section,
            Set<SpanTypeEnum> enclosedSpanTypes, boolean isUnicaseLanguage){
        this.section = section;
        Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
        //the currently open processable chunk (null if no chunk is open);
        //tokens created while a chunk is open are associated with it
        ChunkData activeChunk = null;
        while(enclosed.hasNext()){
            Span span = enclosed.next();
            if(span.getStart() >= span.getEnd()){ //save guard against empty spans
                //NOTE(review): empty spans are only logged, not skipped - the
                //span is still processed by the code below
                log.warn("Detected Empty Span {} in section {}: '{}'",
                    new Object[]{span,section, section.getSpan()});
            }
            if(span.getType() == SpanTypeEnum.Chunk){
                ChunkData chunkData = new ChunkData(tpc,(Chunk)span);
                if(chunkData.isProcessable()){
                    if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
                        if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
                            log.info("   - merge overlapping and processable Chunks {} <-> {}",
                                activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
                            //remember the chunk with the biggest end char seen so
                            //far; the active chunk is closed based on its end
                            activeChunk.merged = (Chunk)span; //set this one as last merged
                        } //ignore completely covered chunks
                    } else { // a new Chunk starts
                        activeChunk = chunkData;
                        //index of the first token of this chunk in {@link #tokens}
                        activeChunk.startToken = tokens.size();
                        if(log.isDebugEnabled()){
                            log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
                                new Object []{
                                    activeChunk.chunk.getType(),
                                    activeChunk.startToken,
                                    activeChunk.chunk.getSpan()
                                });
                        }
                    }
                } //else ignore chunks that are not processable
            } else if(span.getType() == SpanTypeEnum.Token){
                TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,activeChunk);
                if(log.isDebugEnabled()){
                    log.debug("  > {}: {} {}(pos:{}) chunk: '{}'",
                        new Object[]{tokenData.index,tokenData.token,
                            tokenData.morpho != null ? ("(lemma: "+tokenData.morpho.getLemma()+") ") : "",
                            tokenData.token.getAnnotations(POS_ANNOTATION),
                            tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none"});
                }
                if(!tokenData.hasAlphaNumeric){
                    //tokens without a single alpha-numeric char (punctuation,
                    //symbols, ...) are never linked nor matched
                    tokenData.isLinkable = false;
                    tokenData.isMatchable = false;
                } else {
                    // (1) apply basic rules for linkable/processable tokens
                    //determine if the token should be linked/matched
                    //isLinkablePos is a Boolean (may be null if the POS tag
                    //did not allow a decision); null counts as not linkable here
                    tokenData.isLinkable = tokenData.isLinkablePos != null ? tokenData.isLinkablePos : false;
                    //matchabel := linkable OR has matchablePos
                    tokenData.isMatchable = tokenData.isLinkable ||
                            (tokenData.isMatchablePos != null && tokenData.isMatchablePos);
                    
                    //(2) for non linkable tokens check for upper case rules
                    //the first token of a sentence (index == 0) and tokens
                    //following a sub-sentence start are excluded, as their
                    //capitalisation carries no information
                    if(!tokenData.isLinkable && tokenData.upperCase && 
                            tokenData.index > 0 && //not a sentence or sub-sentence start
                            !tokens.get(tokenData.index-1).isSubSentenceStart){
                        //We have an upper case token!
                        if(tpc.isLinkUpperCaseTokens()){
                            if(tokenData.isMatchable) { //convert matchable to
                                tokenData.isLinkable = true; //linkable
                                tokenData.isMatchable = true; //(already true; kept for symmetry)
                            } else { // and other tokens to
                                tokenData.isMatchable = true; //matchable
                            }
                        } else {
                            //finally we need to convert other Tokens to matchable
                            //if MatchUpperCaseTokens is active
                            if(!tokenData.isMatchable && tpc.isMatchUpperCaseTokens()){
                                tokenData.isMatchable = true;
                            }
                        }
                    } //else not an upper case token
                    
                    //(3) Unknown POS tag Rules (see STANBOL-1049)
                    //only applied if the POS tag left linkable and/or matchable
                    //undecided (null) and the token is not already linkable
                    if(!tokenData.isLinkable && (tokenData.isLinkablePos == null || 
                            tokenData.isMatchablePos == null)){
                        if(isUnicaseLanguage || !tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
                            if(tokenData.isLinkablePos == null && tokenData.hasSearchableLength){
                                tokenData.isLinkable = true;
                                tokenData.isMatchable = true;
                            } //else no need to change the state
                        } else { //non unicase language and link only upper case tokens enabled
                            if(tokenData.upperCase && // upper case token
                                    tokenData.index > 0 && //not a sentence or sub-sentence start
                                    !tokens.get(tokenData.index-1).isSubSentenceStart){
                                if(tokenData.hasSearchableLength && tokenData.isLinkablePos == null){
                                    tokenData.isLinkable = true;
                                    tokenData.isMatchable = true;
                                } else if(tokenData.isMatchablePos == null){
                                    tokenData.isMatchable = true;
                                }
                            } else if(tokenData.hasSearchableLength && //lower case and long token
                                    tokenData.isMatchablePos == null){
                                tokenData.isMatchable = true;
                            } //else lower case and short word
                        }
                    } //else already linkable or POS tag present
                }
                log.debug("    - {}",tokenData);
                //add the token to the list
                tokens.add(tokenData);
                if(!hasLinkableToken){
                    hasLinkableToken = tokenData.isLinkable;
                }
                if(activeChunk != null){
                    if (tokenData.isLinkable){
                        //ignore matchableCount in Chunks with linkable Tokens
                        //(-10 is a sentinel that keeps the count below the
                        //multi-matchable threshold regardless of later increments)
                        activeChunk.matchableCount = -10; //by setting the count to -10
                    } else if(tokenData.isMatchable){
                        activeChunk.matchableCount++;
                    } 
                    if (span.getEnd() >= activeChunk.getEndChar()){
                        //this is the last token in the current chunk
                        activeChunk.endToken = tokens.size()-1;
                        log.debug(" << end Chunk@pos: {}", activeChunk.endToken);
                        if(tpc.isLinkMultiMatchableTokensInChunk() && 
                                activeChunk.getMatchableCount() > 1 ){
                            log.debug("  - multi-matchable Chunk:");
                            //mark the last of two immediate following matchable
                            //tokens as processable
                            //scan backwards so the LAST token of each matchable
                            //pair is the one converted to linkable
                            for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){
                                TokenData ct = tokens.get(i);
                                TokenData pt = tokens.get(i-1); //preceding token
                                if(ct.isMatchable && pt.isMatchable){
                                    if(!ct.isLinkable) { //if not already processable
                                        log.debug("     > convert Token {}: {} (pos:{}) from matchable to processable",
                                            new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
                                        ct.isLinkable = true;
                                        if(!hasLinkableToken){
                                            hasLinkableToken = true;
                                        }
                                    }
                                    i--;//mark both (ct & pt) as processed
                                }
                            }
                        }
                        //close the chunk; following tokens are chunk-less until
                        //the next processable chunk span is encountered
                        activeChunk = null;
                    }
                }
            }
        }
    }

    /**
     * Getter for the {@link TokenData} of all tokens enclosed by the
     * {@link #section} in processing order (the internal, mutable list -
     * callers are expected not to modify it).
     * @return the tokens of this section
     */
    public List<TokenData> getTokens() {
        return tokens;
    }
    
    /**
     * If at least one of the {@link #getTokens() tokens} of this section
     * is linkable.
     * @return <code>true</code> if this section contains a linkable token
     */
    public boolean hasLinkableToken() {
        return hasLinkableToken;
    }
}