| /* |
| * Copyright (c) 2012 Sebastian Schaffert |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.stanbol.enhancer.engines.smartcn.impl; |
| |
| import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage; |
| import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.Map; |
| |
| import org.apache.clerezza.commons.rdf.IRI; |
| import org.apache.commons.io.input.CharSequenceReader; |
| import org.apache.felix.scr.annotations.Activate; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.ConfigurationPolicy; |
| import org.apache.felix.scr.annotations.Deactivate; |
| import org.apache.felix.scr.annotations.Properties; |
| import org.apache.felix.scr.annotations.Property; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; |
| import org.apache.lucene.analysis.cn.smart.WordTokenFilter; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.stanbol.enhancer.nlp.NlpProcessingRole; |
| import org.apache.stanbol.enhancer.nlp.NlpServiceProperties; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedText; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory; |
| import org.apache.stanbol.enhancer.nlp.model.Sentence; |
| import org.apache.stanbol.enhancer.nlp.model.Token; |
| import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.apache.stanbol.enhancer.servicesapi.ContentItem; |
| import org.apache.stanbol.enhancer.servicesapi.EngineException; |
| import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; |
| import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; |
| import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; |
| import org.osgi.framework.Constants; |
| import org.osgi.service.cm.ConfigurationException; |
| import org.osgi.service.component.ComponentContext; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Sentence detection and word tokenizer for Chinese based on the Solr/Lucene |
| * smartcn analysers. |
| * |
| * @author Rupert Westenthaler |
| */ |
| |
| @Component(immediate = true, metatype = true, |
| policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration |
| @Service |
| @Properties(value={ |
| @Property(name= EnhancementEngine.PROPERTY_NAME,value="smartcn-token"), |
| @Property(name=Constants.SERVICE_RANKING,intValue=0) //give the default instance a ranking < 0 |
| }) |
| public class SmartcnTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties { |
| |
| private static final Map<String,Object> SERVICE_PROPERTIES; |
| static { |
| Map<String,Object> props = new HashMap<String,Object>(); |
| props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, |
| ServiceProperties.ORDERING_NLP_TOKENIZING); |
| props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE, |
| NlpProcessingRole.Tokenizing); |
| SERVICE_PROPERTIES = Collections.unmodifiableMap(props); |
| } |
| |
| |
| private static Logger log = LoggerFactory.getLogger(SmartcnTokenizerEngine.class); |
| |
| @Reference |
| private AnalysedTextFactory analysedTextFactory; |
| |
| /** |
| * Indicate if this engine can enhance supplied ContentItem, and if it |
| * suggests enhancing it synchronously or asynchronously. The |
| * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is |
| * just a suggestion from the engine. |
| * <p/> |
| * Returns ENHANCE_ASYNC in case there is a text/plain content part and a tagger for the language identified for |
| * the content item, CANNOT_ENHANCE otherwise. |
| * |
| * @throws org.apache.stanbol.enhancer.servicesapi.EngineException |
| * if the introspecting process of the content item |
| * fails |
| */ |
| @Override |
| public int canEnhance(ContentItem ci) throws EngineException { |
| // check if content is present |
| Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false); |
| if(entry == null || entry.getValue() == null) { |
| return CANNOT_ENHANCE; |
| } |
| |
| String language = getLanguage(this,ci,false); |
| if("zh".equals(language) || (language != null && language.startsWith("zh-"))) { |
| log.trace(" > can enhance ContentItem {} with language {}",ci,language); |
| return ENHANCE_ASYNC; |
| } else { |
| return CANNOT_ENHANCE; |
| } |
| } |
| |
| /** |
| * Compute enhancements for supplied ContentItem. The results of the process |
| * are expected to be stored in the metadata of the content item. |
| * <p/> |
| * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of |
| * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}. |
| * <p/> |
| * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and |
| * stores it as a new part in the content item. The metadata is not changed. |
| * |
| * @throws org.apache.stanbol.enhancer.servicesapi.EngineException |
| * if the underlying process failed to work as |
| * expected |
| */ |
| @Override |
| public void computeEnhancements(ContentItem ci) throws EngineException { |
| final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci); |
| |
| String language = getLanguage(this,ci,false); |
| if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) { |
| throw new IllegalStateException("The detected language is NOT 'zh'! " |
| + "As this is also checked within the #canEnhance(..) method this " |
| + "indicates an Bug in the used EnhancementJobManager implementation. " |
| + "Please report this on the dev@apache.stanbol.org or create an " |
| + "JIRA issue about this."); |
| } |
| if(!at.getSentences().hasNext()) { //no sentences ... use this engine to detect |
| //first the sentences |
| TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText())); |
| try { |
| while(sentences.incrementToken()){ |
| OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class); |
| Sentence s = at.addSentence(offset.startOffset(), offset.endOffset()); |
| if(log.isTraceEnabled()) { |
| log.trace("detected {}:{}",s,s.getSpan()); |
| } |
| } |
| } catch (IOException e) { |
| String message = String.format("IOException while reading from " |
| +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri()); |
| log.error(message,e); |
| throw new EngineException(this, ci, message, e); |
| } |
| } |
| //now the tokens |
| TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at)); |
| try { |
| tokens.reset(); |
| while(tokens.incrementToken()){ |
| OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class); |
| Token t = at.addToken(offset.startOffset(), offset.endOffset()); |
| log.trace("detected {}",t); |
| } |
| } catch (IOException e) { |
| String message = String.format("IOException while reading from " |
| +"CharSequenceReader of AnalyzedText for ContentItem %s",ci.getUri()); |
| log.error(message,e); |
| throw new EngineException(this, ci, message, e); |
| } |
| } |
| |
| @Override |
| public Map<String,Object> getServiceProperties() { |
| return SERVICE_PROPERTIES; |
| } |
| /** |
| * Activate and read the properties. Configures and initialises a POSTagger for each language configured in |
| * CONFIG_LANGUAGES. |
| * |
| * @param ce the {@link org.osgi.service.component.ComponentContext} |
| */ |
| @Activate |
| protected void activate(ComponentContext ce) throws ConfigurationException { |
| log.info("activating smartcn tokenizing engine"); |
| super.activate(ce); |
| } |
| |
| @Deactivate |
| protected void deactivate(ComponentContext context) { |
| super.deactivate(context); |
| } |
| |
| /** |
| * This is an internal helper class that avoids to execute sentences |
| * using the {@link SentenceTokenizer} twice. |
| * @author Rupert Westenthaler |
| * |
| */ |
| protected final class AnalyzedTextSentenceTokenizer extends Tokenizer { |
| private final AnalysedText at; |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
| private Iterator<Sentence> sentences; |
| private Sentence sentence = null; |
| |
| protected AnalyzedTextSentenceTokenizer(AnalysedText at) { |
| super(new StringReader(at.getText().toString())); |
| this.at = at; |
| sentences = at.getSentences(); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if(sentences.hasNext()){ |
| sentence = sentences.next(); |
| termAtt.setEmpty().append(sentence.getSpan()); |
| offsetAtt.setOffset(sentence.getStart(),sentence.getEnd()); |
| typeAtt.setType("sentence"); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| @Override |
| public void end() throws IOException { |
| // set final offset |
| offsetAtt.setOffset(at.getEnd(), at.getEnd()); |
| } |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| sentences = at.getSentences(); |
| termAtt.setEmpty(); |
| offsetAtt.setOffset(0, 0); |
| typeAtt.setType(null); |
| } |
| } |
| |
| } |