blob: 9086b72bd54631e10b3372b106d5e147f62e0625 [file] [log] [blame]
/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.smartcn.impl;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Sentence detection and word tokenizer for Chinese based on the Solr/Lucene
 * smartcn analysers.
 * <p>
 * The engine only accepts content items whose detected language is
 * {@code zh} (or a {@code zh-*} variant). Sentence detection is only performed
 * if the {@link AnalysedText} does not already contain sentences; word
 * tokenization is always performed.
 *
 * @author Rupert Westenthaler
 */
@Component(immediate = true, metatype = true,
    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
    @Property(name= EnhancementEngine.PROPERTY_NAME,value="smartcn-token"),
    @Property(name=Constants.SERVICE_RANKING,intValue=0) //ranking of the default instance
})
public class SmartcnTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {

    /**
     * Read-only service properties announcing the engine ordering
     * ({@link ServiceProperties#ORDERING_NLP_TOKENIZING}) and the NLP
     * processing role ({@link NlpProcessingRole#Tokenizing}) of this engine.
     */
    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_TOKENIZING);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.Tokenizing);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }

    private static final Logger log = LoggerFactory.getLogger(SmartcnTokenizerEngine.class);

    /** Factory used to create the {@link AnalysedText} content part. */
    @Reference
    private AnalysedTextFactory analysedTextFactory;

    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part and the
     * language of the content item is Chinese, CANNOT_ENHANCE otherwise.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the introspecting process of the content item fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if (entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }
        String language = getLanguage(this, ci, false);
        if (isChinese(language)) {
            log.trace(" > can enhance ContentItem {} with language {}", ci, language);
            return ENHANCE_ASYNC;
        } else {
            return CANNOT_ENHANCE;
        }
    }

    /**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method adds {@link Sentence}s (if not yet present) and {@link Token}s
     * to the {@link AnalysedText} content part of the content item. The RDF
     * metadata of the content item is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
        String language = getLanguage(this, ci, false);
        if (!isChinese(language)) {
            //this engine was called for a non-Chinese content item. As this is
            //checked in #canEnhance(..) this indicates a bug in the caller.
            throw new IllegalStateException("The detected language is NOT 'zh'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates an Bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org or create an "
                + "JIRA issue about this.");
        }
        if (!at.getSentences().hasNext()) {
            //no sentences present yet ... use this engine to detect them first
            detectSentences(at, ci);
        }
        //now the tokens
        tokenizeSentences(at, ci);
    }

    /**
     * Detects sentences in the text of the parsed {@link AnalysedText} by using
     * the smartcn {@link SentenceTokenizer} and adds them as {@link Sentence}
     * spans.
     *
     * @param at the analysed text to add the sentences to
     * @param ci the content item (only used for error reporting)
     * @throws EngineException if reading from the text fails
     */
    private void detectSentences(AnalysedText at, ContentItem ci) throws EngineException {
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            //TokenStream workflow contract: reset() MUST be called before the
            //first incrementToken() and end() after the last one.
            sentences.reset();
            //attributes are per-stream singletons - fetch once before the loop
            OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
            while (sentences.incrementToken()) {
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
            sentences.end();
        } catch (IOException e) {
            throw newEngineException(ci, e);
        } finally {
            closeQuietly(sentences);
        }
    }

    /**
     * Tokenizes the sentences of the parsed {@link AnalysedText} by using the
     * smartcn {@link WordTokenFilter} and adds the results as {@link Token}
     * spans.
     *
     * @param at the analysed text to add the tokens to
     * @param ci the content item (only used for error reporting)
     * @throws EngineException if reading from the text fails
     */
    private void tokenizeSentences(AnalysedText at, ContentItem ci) throws EngineException {
        TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
        try {
            tokens.reset();
            //attributes are per-stream singletons - fetch once before the loop
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            while (tokens.incrementToken()) {
                Token t = at.addToken(offset.startOffset(), offset.endOffset());
                log.trace("detected {}", t);
            }
            tokens.end();
        } catch (IOException e) {
            throw newEngineException(ci, e);
        } finally {
            closeQuietly(tokens);
        }
    }

    /**
     * Tests whether the parsed language code denotes Chinese ({@code zh} or a
     * {@code zh-*} variant). {@code null} safe.
     */
    private static boolean isChinese(String language) {
        return "zh".equals(language) || (language != null && language.startsWith("zh-"));
    }

    /**
     * Logs and wraps an {@link IOException} thrown while reading the text of
     * the parsed content item in an {@link EngineException}.
     */
    private EngineException newEngineException(ContentItem ci, IOException e) {
        String message = String.format("IOException while reading from "
            + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        return new EngineException(this, ci, message, e);
    }

    /**
     * Closes the parsed {@link TokenStream} (and its underlying Reader),
     * logging - but not propagating - possible {@link IOException}s.
     */
    private static void closeQuietly(TokenStream stream) {
        try {
            stream.close();
        } catch (IOException e) {
            log.warn("Unable to close TokenStream", e);
        }
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Activates this engine by parsing the configuration to the super
     * implementation.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     * @throws ConfigurationException if the parsed configuration is invalid
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating smartcn tokenizing engine");
        super.activate(ce);
    }

    @Deactivate
    protected void deactivate(ComponentContext context) {
        super.deactivate(context);
    }

    /**
     * Internal helper that emits the {@link Sentence}s already present in the
     * {@link AnalysedText} as sentence-typed Lucene tokens. This avoids running
     * the {@link SentenceTokenizer} a second time when feeding the
     * {@link WordTokenFilter}.
     *
     * @author Rupert Westenthaler
     */
    protected final class AnalyzedTextSentenceTokenizer extends Tokenizer {

        private final AnalysedText at;
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
        private Iterator<Sentence> sentences;
        private Sentence sentence = null;

        protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
            super(new StringReader(at.getText().toString()));
            this.at = at;
            sentences = at.getSentences();
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (sentences.hasNext()) {
                sentence = sentences.next();
                termAtt.setEmpty().append(sentence.getSpan());
                offsetAtt.setOffset(sentence.getStart(), sentence.getEnd());
                typeAtt.setType("sentence");
                return true;
            } else {
                return false;
            }
        }

        @Override
        public void end() throws IOException {
            super.end(); //TokenStream contract: end() implementations must call super.end()
            // set final offset
            offsetAtt.setOffset(at.getEnd(), at.getEnd());
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            sentences = at.getSentences();
            termAtt.setEmpty();
            offsetAtt.setOffset(0, 0);
            typeAtt.setType(null);
        }
    }
}