/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.paoding.token;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedActionException;
import java.security.PrivilegedExceptionAction;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Component(immediate = true, metatype = true,
policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
@Property(name= EnhancementEngine.PROPERTY_NAME,value="paoding-token"),
@Property(name=Constants.SERVICE_RANKING,intValue=0) //the default instance uses the default service ranking
})
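/**
* {@link EnhancementEngine} that tokenizes Chinese ("zh" and "zh-*") text
* by using the Paoding analyzer. Detected tokens are added to the
* {@link AnalysedText} content part of the processed ContentItem.
*/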
public class PaodingTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
private Logger log = LoggerFactory.getLogger(PaodingTokenizerEngine.class);
/*
* Analyzer configuration constants (currently not referenced by this engine)
*/
private static final String LUCENE_VERSION = Version.LUCENE_36.toString();
private static final Map<String,String> CHAR_FILTER_FACTORY_CONFIG = new HashMap<String,String>();
private static final Map<String,String> TOKENIZER_FACTORY_CONFIG = new HashMap<String,String>();
static {
CHAR_FILTER_FACTORY_CONFIG.put("luceneMatchVersion", LUCENE_VERSION);
CHAR_FILTER_FACTORY_CONFIG.put("mapping", "gosen-mapping-japanese.txt");
TOKENIZER_FACTORY_CONFIG.put("luceneMatchVersion", LUCENE_VERSION);
}
/**
* Service Properties of this Engine
*/
private static final Map<String,Object> SERVICE_PROPERTIES;
static {
Map<String,Object> props = new HashMap<String,Object>();
props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
ServiceProperties.ORDERING_NLP_TOKENIZING);
props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
NlpProcessingRole.Tokenizing);
SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
}
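/**
* Factory used to create (or retrieve) the {@link AnalysedText} content part
* of processed ContentItems
*/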
@Reference
protected AnalysedTextFactory analysedTextFactory;
@Override
protected void activate(ComponentContext ctx) throws ConfigurationException {
super.activate(ctx);
//no engine specific configuration required; the base class handles the standard engine properties
}
@Override
protected void deactivate(ComponentContext ctx) {
super.deactivate(ctx);
}
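/**
* This engine can only enhance ContentItems that provide a plain text version
* of their content and have Chinese ("zh" or "zh-*") as language.
*/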
@Override
public int canEnhance(ContentItem ci) throws EngineException {
// check if content is present
Map.Entry<UriRef,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
if(entry == null || entry.getValue() == null) {
return CANNOT_ENHANCE;
}
String language = getLanguage(this,ci,false);
if("zh".equals(language) || (language != null && language.startsWith("zh-"))) {
log.trace(" > can enhance ContentItem {} with language {}",ci,language);
return ENHANCE_ASYNC;
} else {
return CANNOT_ENHANCE;
}
}
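/**
* Tokenizes the plain text content by using the {@link PaodingAnalyzer} and
* adds the resulting tokens to the {@link AnalysedText}. Character spans
* skipped between tokens by the analyzer (e.g. stop words and punctuation)
* are added as tokens as well.
*/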
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
String language = getLanguage(this,ci,false);
if(!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
throw new IllegalStateException("The detected language is NOT 'zh'! "
+ "As this is also checked within the #canEnhance(..) method this "
+ "indicates a bug in the used EnhancementJobManager implementation. "
+ "Please report this on the dev@stanbol.apache.org mailing list or "
+ "create a JIRA issue for it.");
}
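//create the PaodingAnalyzer within a privileged block, as its constructor
//loads the Paoding configuration and dictionary resources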
PaodingAnalyzer pa;
try {
pa = AccessController.doPrivileged(new PrivilegedExceptionAction<PaodingAnalyzer>() {
public PaodingAnalyzer run() throws Exception {
return new PaodingAnalyzer();
}
});
} catch (PrivilegedActionException pae){
Exception e = pae.getException();
log.error("Unable to initialise PoadingAnalyzer",e);
throw new EngineException("Unable to initialise PoadingAnalyzer",e);
}
TokenStream ts = pa.tokenStream("dummy", new CharSequenceReader(at.getText()));
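//end offset of the last added token (used to add tokens for spans skipped by the analyzer)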
int lastEnd = 0;
try {
ts.reset();
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
while(ts.incrementToken()){
//preserve all chars of the analysed text
if(offset.startOffset() > lastEnd){ //add token for stopword
at.addToken(lastEnd,offset.startOffset());
}
at.addToken(offset.startOffset(), offset.endOffset());
lastEnd = offset.endOffset();
}
} catch (IOException e) {
log.warn("IOException while reading the parsed Text",e);
throw new EngineException("IOException while reading the parsed Text",e);
}
}
@Override
public Map<String,Object> getServiceProperties() {
return SERVICE_PROPERTIES;
}
}