enhancement-engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.langdetect;

 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.DCTERMS_LINGUISTIC_SYSTEM;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;

 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.Chain;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.Language;

 /**
  * {@link LanguageDetectionEnhancementEngine} provides functionality to enhance document
  * with their language.
  *
  * @author Walter Kasper, DFKI
  */
 @Component(immediate = true, metatype = true, inherit=true)
 @Service
 @Properties(value={
     @Property(name=EnhancementEngine.PROPERTY_NAME,value="langdetect")
 })
 public class LanguageDetectionEnhancementEngine
         extends AbstractEnhancementEngine<LangDetectException,RuntimeException>
         implements EnhancementEngine, ServiceProperties {

     /**
      * a configurable value of the text segment length to check
      */
     @Property(intValue=LanguageDetectionEnhancementEngine.PROBE_LENGTH_DEFAULT)
     public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langdetect.probe-length";

     /**
      * a configurable value of the maximum number of suggested languages
      */
     @Property(intValue=LanguageDetectionEnhancementEngine.DEFAULT_MAX_SUGGESTED_LANGUAGES)
     public static final String MAX_SUGGESTED_PROP = "org.apache.stanbol.enhancer.engines.langdetect.max-suggested";

     /**
      * The default value for the Execution of this Engine (
      * {@link ServiceProperties#ORDERING_NLP_LANGAUGE_DETECTION})<p>
      * NOTE: this information is used by the default and weighed {@link Chain}
      * implementation to determine the processing order of
      * {@link EnhancementEngine}s. Other {@link Chain} implementation do not
      * use this information.
      */
     public static final Integer defaultOrder = ServiceProperties.ORDERING_NLP_LANGAUGE_DETECTION;

     /**
      * This contains the only MIME type directly supported by this enhancement engine.
      */
     private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
     /**
      * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
      */
     private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);

     /**
      * This contains the logger.
      */
     private static final Logger log = LoggerFactory.getLogger(LanguageDetectionEnhancementEngine.class);

     /*
      * NOTE: Checked the Documentation: The tool already supports the taking
      * of several shorter samples randomly distributed over the parsed text
      * to imrpove results and reduce noise. See
      * http://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion
      * "Each detected language differs for the same document" for a hint.
      */
     private static final int PROBE_LENGTH_DEFAULT = -1;

     /**
      * Default value for the maximum number of suggested Languages
      */
     private static final int DEFAULT_MAX_SUGGESTED_LANGUAGES = 3;

     /**
      * How much text should be used for testing: If the value is 0 or smaller,
      * the complete text will be used. Otherwise a text probe of the given length
      * is taken from the middle of the text. The default length is 1000.
      */
     private int probeLength = PROBE_LENGTH_DEFAULT;

     private int maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;

     /**
      * The literal factory
      */
     private final LiteralFactory literalFactory = LiteralFactory.getInstance();


     private LanguageIdentifier languageIdentifier;

     /**
      * Initialize the language identifier model and load the prop length bound if
      * provided as a property.
      *
      * @param ce
      *            the {@link ComponentContext}
      */
     protected void activate(ComponentContext ce) throws ConfigurationException, LangDetectException {
         super.activate(ce);
         if (ce != null) {
             @SuppressWarnings("unchecked")
             Dictionary<String, String> properties = ce.getProperties();
             Object value = properties.get(PROBE_LENGTH_PROP);
             if(value instanceof Number){
                 probeLength = ((Number)value).intValue();
             } else if(value != null){
                 try {
                     probeLength = Integer.parseInt(value.toString());
                 } catch (NumberFormatException e) {
                     throw new ConfigurationException(PROBE_LENGTH_PROP,
                         "The parsed 'proble length' MUST be a valid Integer", e);
                 }
             } else {
                 probeLength = PROBE_LENGTH_DEFAULT;
             }
             value = properties.get(MAX_SUGGESTED_PROP);
             if(value instanceof Number){
                 maxSuggestedLanguages = ((Number)value).intValue();
             } else if(value != null){
                 try {
                     maxSuggestedLanguages = Integer.parseInt(value.toString());
                 } catch (NumberFormatException e) {
                     throw new ConfigurationException(MAX_SUGGESTED_PROP,
                         "The parsed number of the maximum suggested lanugages "
                         + "MUST BE a valid Integer", e);
                 }
             }
             if(maxSuggestedLanguages < 1){
                 maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
             }
         }
         languageIdentifier = new LanguageIdentifier();
     }

     protected void deactivate(ComponentContext ce) {
         super.deactivate(ce);
         this.languageIdentifier = null;
         this.maxSuggestedLanguages = -1;
         this.probeLength = -1;
     }

     public int canEnhance(ContentItem ci) throws EngineException {
         if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){
             return ENHANCE_ASYNC; //Langid now supports async processing
         } else {
             return CANNOT_ENHANCE;
         }
     }

     public void computeEnhancements(ContentItem ci) throws EngineException {
         Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
         if(contentPart == null){
             throw new IllegalStateException("No ContentPart with Mimetype '"
                     + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
                     + ": This is also checked in the canEnhance method! -> This "
                     + "indicated an Bug in the implementation of the "
                     + "EnhancementJobManager!");
         }
         String text = "";
         try {
             text = ContentItemHelper.getText(contentPart.getValue());
         } catch (IOException e) {
             throw new InvalidContentException(this, ci, e);
         }
         //do not call trim() on long texts to check if the text is empty
         if (text.length() < 50  && text.trim().length() == 0) {
             log.info("No text contained in ContentPart {} of ContentItem {}",
                 contentPart.getKey(),ci.getUri());
             return;
         }

         // truncate text to some piece from the middle if probeLength > 0
         int checkLength = probeLength;
         if (checkLength > 0 && text.length() > checkLength) {
             text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
         }
         List<Language> languages = null;
         try {
             languages = languageIdentifier.getLanguages(text);
             log.debug("language identified: {}",languages);
         } catch (LangDetectException e) {
             StringBuilder msg = new StringBuilder("Could not identify language of text: ");
             if(text.length() < 200){
                 msg.append(text);
             } else {
                 msg.append(text.subSequence(0, 199)).append("...");
             }
             throw new EngineException(this, ci, msg.toString(), e);
         }

         // add language to metadata
         if (languages != null) {
             MGraph g = ci.getMetadata();
             ci.getLock().writeLock().lock();
             try {
                 for(int i=0;i<maxSuggestedLanguages && i<languages.size();i++){
                     // add a hypothesis
                     Language hypothesis = languages.get(i);
                     UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                     g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                     g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                     g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
                     g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
                         literalFactory.createTypedLiteral(hypothesis.prob)));
                 }
             } finally {
                 ci.getLock().writeLock().unlock();
             }
         }
     }

     public int getProbeLength() {
         return probeLength;
     }

     public void setProbeLength(int probeLength) {
         this.probeLength = probeLength;
     }

     public Map<String, Object> getServiceProperties() {
         return Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.langdetect;

	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
	import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.DCTERMS_LINGUISTIC_SYSTEM;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Dictionary;
	import java.util.List;
	import java.util.Map;
	import java.util.Map.Entry;
	import java.util.Set;

	import org.apache.clerezza.rdf.core.LiteralFactory;
	import org.apache.clerezza.rdf.core.MGraph;
	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
	import org.apache.clerezza.rdf.core.impl.TripleImpl;
	import org.apache.commons.io.IOUtils;
	import org.apache.felix.scr.annotations.Component;
	import org.apache.felix.scr.annotations.Properties;
	import org.apache.felix.scr.annotations.Property;
	import org.apache.felix.scr.annotations.Service;
	import org.apache.stanbol.enhancer.servicesapi.Blob;
	import org.apache.stanbol.enhancer.servicesapi.Chain;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
	import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
	import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
	import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
	import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
	import org.osgi.service.cm.ConfigurationException;
	import org.osgi.service.component.ComponentContext;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import com.cybozu.labs.langdetect.LangDetectException;
	import com.cybozu.labs.langdetect.Language;

	/**
	* {@link LanguageDetectionEnhancementEngine} provides functionality to enhance document
	* with their language.
	*
	* @author Walter Kasper, DFKI
	*/
	@Component(immediate = true, metatype = true, inherit=true)
	@Service
	@Properties(value={
	@Property(name=EnhancementEngine.PROPERTY_NAME,value="langdetect")
	})
	public class LanguageDetectionEnhancementEngine
	extends AbstractEnhancementEngine<LangDetectException,RuntimeException>
	implements EnhancementEngine, ServiceProperties {

	/**
	* a configurable value of the text segment length to check
	*/
	@Property(intValue=LanguageDetectionEnhancementEngine.PROBE_LENGTH_DEFAULT)
	public static final String PROBE_LENGTH_PROP = "org.apache.stanbol.enhancer.engines.langdetect.probe-length";

	/**
	* a configurable value of the maximum number of suggested languages
	*/
	@Property(intValue=LanguageDetectionEnhancementEngine.DEFAULT_MAX_SUGGESTED_LANGUAGES)
	public static final String MAX_SUGGESTED_PROP = "org.apache.stanbol.enhancer.engines.langdetect.max-suggested";

	/**
	* The default value for the Execution of this Engine (
	* {@link ServiceProperties#ORDERING_NLP_LANGAUGE_DETECTION})<p>
	* NOTE: this information is used by the default and weighed {@link Chain}
	* implementation to determine the processing order of
	* {@link EnhancementEngine}s. Other {@link Chain} implementation do not
	* use this information.
	*/
	public static final Integer defaultOrder = ServiceProperties.ORDERING_NLP_LANGAUGE_DETECTION;

	/**
	* This contains the only MIME type directly supported by this enhancement engine.
	*/
	private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
	/**
	* Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
	*/
	private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);

	/**
	* This contains the logger.
	*/
	private static final Logger log = LoggerFactory.getLogger(LanguageDetectionEnhancementEngine.class);

	/*
	* NOTE: Checked the Documentation: The tool already supports the taking
	* of several shorter samples randomly distributed over the parsed text
	* to imrpove results and reduce noise. See
	* http://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion
	* "Each detected language differs for the same document" for a hint.
	*/
	private static final int PROBE_LENGTH_DEFAULT = -1;

	/**
	* Default value for the maximum number of suggested Languages
	*/
	private static final int DEFAULT_MAX_SUGGESTED_LANGUAGES = 3;

	/**
	* How much text should be used for testing: If the value is 0 or smaller,
	* the complete text will be used. Otherwise a text probe of the given length
	* is taken from the middle of the text. The default length is 1000.
	*/
	private int probeLength = PROBE_LENGTH_DEFAULT;

	private int maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;

	/**
	* The literal factory
	*/
	private final LiteralFactory literalFactory = LiteralFactory.getInstance();


	private LanguageIdentifier languageIdentifier;

	/**
	* Initialize the language identifier model and load the prop length bound if
	* provided as a property.
	*
	* @param ce
	* the {@link ComponentContext}
	*/
	protected void activate(ComponentContext ce) throws ConfigurationException, LangDetectException {
	super.activate(ce);
	if (ce != null) {
	@SuppressWarnings("unchecked")
	Dictionary<String, String> properties = ce.getProperties();
	Object value = properties.get(PROBE_LENGTH_PROP);
	if(value instanceof Number){
	probeLength = ((Number)value).intValue();
	} else if(value != null){
	try {
	probeLength = Integer.parseInt(value.toString());
	} catch (NumberFormatException e) {
	throw new ConfigurationException(PROBE_LENGTH_PROP,
	"The parsed 'proble length' MUST be a valid Integer", e);
	}
	} else {
	probeLength = PROBE_LENGTH_DEFAULT;
	}
	value = properties.get(MAX_SUGGESTED_PROP);
	if(value instanceof Number){
	maxSuggestedLanguages = ((Number)value).intValue();
	} else if(value != null){
	try {
	maxSuggestedLanguages = Integer.parseInt(value.toString());
	} catch (NumberFormatException e) {
	throw new ConfigurationException(MAX_SUGGESTED_PROP,
	"The parsed number of the maximum suggested lanugages "
	+ "MUST BE a valid Integer", e);
	}
	}
	if(maxSuggestedLanguages < 1){
	maxSuggestedLanguages = DEFAULT_MAX_SUGGESTED_LANGUAGES;
	}
	}
	languageIdentifier = new LanguageIdentifier();
	}

	protected void deactivate(ComponentContext ce) {
	super.deactivate(ce);
	this.languageIdentifier = null;
	this.maxSuggestedLanguages = -1;
	this.probeLength = -1;
	}

	public int canEnhance(ContentItem ci) throws EngineException {
	if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){
	return ENHANCE_ASYNC; //Langid now supports async processing
	} else {
	return CANNOT_ENHANCE;
	}
	}

	public void computeEnhancements(ContentItem ci) throws EngineException {
	Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
	if(contentPart == null){
	throw new IllegalStateException("No ContentPart with Mimetype '"
	+ TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
	+ ": This is also checked in the canEnhance method! -> This "
	+ "indicated an Bug in the implementation of the "
	+ "EnhancementJobManager!");
	}
	String text = "";
	try {
	text = ContentItemHelper.getText(contentPart.getValue());
	} catch (IOException e) {
	throw new InvalidContentException(this, ci, e);
	}
	//do not call trim() on long texts to check if the text is empty
	if (text.length() < 50 && text.trim().length() == 0) {
	log.info("No text contained in ContentPart {} of ContentItem {}",
	contentPart.getKey(),ci.getUri());
	return;
	}

	// truncate text to some piece from the middle if probeLength > 0
	int checkLength = probeLength;
	if (checkLength > 0 && text.length() > checkLength) {
	text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
	}
	List<Language> languages = null;
	try {
	languages = languageIdentifier.getLanguages(text);
	log.debug("language identified: {}",languages);
	} catch (LangDetectException e) {
	StringBuilder msg = new StringBuilder("Could not identify language of text: ");
	if(text.length() < 200){
	msg.append(text);
	} else {
	msg.append(text.subSequence(0, 199)).append("...");
	}
	throw new EngineException(this, ci, msg.toString(), e);
	}

	// add language to metadata
	if (languages != null) {
	MGraph g = ci.getMetadata();
	ci.getLock().writeLock().lock();
	try {
	for(int i=0;i<maxSuggestedLanguages && i<languages.size();i++){
	// add a hypothesis
	Language hypothesis = languages.get(i);
	UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
	g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
	g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
	g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
	g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
	literalFactory.createTypedLiteral(hypothesis.prob)));
	}
	} finally {
	ci.getLock().writeLock().unlock();
	}
	}
	}

	public int getProbeLength() {
	return probeLength;
	}

	public void setProbeLength(int probeLength) {
	this.probeLength = probeLength;
	}

	public Map<String, Object> getServiceProperties() {
	return Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
	}

	}