// blob: a839dc844960f65cc9469719eef6cac4598d2c6d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
import static org.apache.stanbol.enhancer.engines.celi.utils.Utils.getSelectionContext;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import java.io.IOException;
import java.net.URL;
import java.util.Collections;
import java.util.Dictionary;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Vector;
import javax.xml.soap.SOAPException;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
import org.apache.stanbol.enhancer.engines.celi.CeliConstants;
import org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures;
import org.apache.stanbol.enhancer.engines.celi.CeliTagSetRegistry;
import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.morpho.Case;
import org.apache.stanbol.enhancer.nlp.morpho.Gender;
import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
import org.apache.stanbol.enhancer.nlp.morpho.Person;
import org.apache.stanbol.enhancer.nlp.morpho.Tense;
import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMood;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Enhancement engine that sends the plain text content of a {@link ContentItem}
 * to the CELI lemmatizer / morphological analyser web service and writes the
 * result into the metadata graph of the ContentItem.
 * <p>
 * Depending on the {@link #MORPHOLOGICAL_ANALYSIS} configuration the engine
 * either creates a single text enhancement carrying the lemmatized form of the
 * whole text ({@link #hasLemmaForm}), or one text annotation per reading
 * returned by the morphological analyser.
 * <p>
 * Only <code>text/plain</code> content in one of the languages listed in
 * {@link #supportedLangs} is processed.
 */
@Component(immediate = true, metatype = true)
@Service
@Properties(value = {
    @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiLemmatizer"),
    @Property(name = CeliConstants.CELI_LICENSE),
    @Property(name = CeliConstants.CELI_TEST_ACCOUNT, boolValue = false),
    @Property(name = CeliConstants.CELI_CONNECTION_TIMEOUT, intValue = CeliConstants.DEFAULT_CONECTION_TIMEOUT)
})
public class CeliLemmatizerEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {

    // TODO: check if it is OK to define new properties in the FISE namespace
    /**
     * Property used to attach the lemmatized form of the text to a text enhancement.
     */
    public static final IRI hasLemmaForm = new IRI("http://fise.iks-project.eu/ontology/hasLemmaForm");

    /**
     * This ensures that no connections to external services are made if Stanbol is started in offline mode
     * as the OnlineMode service will only be available if OfflineMode is deactivated.
     */
    @SuppressWarnings("unused")
    @Reference
    private OnlineMode onlineMode;

    /**
     * Language codes accepted by this engine. Wrapped as an unmodifiable list so
     * the shared static state can not be altered after class initialisation.
     */
    private static final List<String> supportedLangs;
    static {
        List<String> langs = new Vector<String>();
        langs.add("it");
        langs.add("da");
        langs.add("de");
        langs.add("ru");
        langs.add("ro");
        supportedLangs = Collections.unmodifiableList(langs);
    }

    /**
     * Typed literal holding the class name of the CELI language identifier engine.
     * NOTE(review): this constant is not referenced anywhere in this class; it looks
     * like it was carried over from the language identification engine. It is kept
     * because it is part of the public API.
     */
    public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine");

    /**
     * The default value for the Execution of this Engine. Currently set to
     * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
     */
    public static final Integer defaultOrder = ServiceProperties.ORDERING_CONTENT_EXTRACTION;

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * This contains the only MIME type directly supported by this enhancement engine.
     */
    private static final String TEXT_PLAIN_MIMETYPE = "text/plain";

    /**
     * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
     */
    private static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);

    /**
     * Configuration key for the URL of the CELI morphological analyser web service.
     */
    @Property(value = "http://linguagrid.org/LSGrid/ws/morpho-analyser")
    public static final String SERVICE_URL = "org.apache.stanbol.enhancer.engines.celi.lemmatizer.url";

    /**
     * Configuration key of the flag that switches between plain lemmatization
     * (<code>false</code>, the default) and the complete morphological analysis.
     */
    @Property(boolValue = false)
    public static final String MORPHOLOGICAL_ANALYSIS = "org.apache.stanbol.enhancer.engines.celi.lemmatizer.morphoAnalysis";

    private String licenseKey;
    private URL serviceURL;
    private boolean completeMorphoAnalysis;
    private LemmatizerClientHTTP client;

    /**
     * Reads the component configuration and creates the HTTP client used to call
     * the CELI service.
     *
     * @param ctx the component context providing the configuration properties
     * @throws IOException if the configured service URL can not be parsed
     * @throws ConfigurationException if no service URL is configured
     */
    @Override
    @Activate
    protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
        super.activate(ctx);
        Dictionary<String, Object> properties = ctx.getProperties();
        this.licenseKey = Utils.getLicenseKey(properties, ctx.getBundleContext());

        // Do not blindly cast: convert whatever value is configured to its string form.
        Object urlValue = properties.get(SERVICE_URL);
        String url = urlValue == null ? "" : urlValue.toString().trim();
        if (url.isEmpty()) {
            throw new ConfigurationException(SERVICE_URL, String.format(
                "%s : please configure the URL of the CELI Web Service (e.g. by "
                    + "using the 'Configuration' tab of the Apache Felix Web Console).",
                getClass().getSimpleName()));
        }
        this.serviceURL = new URL(url);

        // The flag may be provided as a Boolean or as its String form (e.g. "true").
        // A plain (Boolean) cast would fail on the latter and silently disable the
        // feature, so parse the string representation; a missing value means false.
        Object morphoValue = properties.get(MORPHOLOGICAL_ANALYSIS);
        this.completeMorphoAnalysis = morphoValue != null && Boolean.parseBoolean(morphoValue.toString());

        int conTimeout = Utils.getConnectionTimeout(properties, ctx.getBundleContext());
        this.client = new LemmatizerClientHTTP(this.serviceURL, this.licenseKey, conTimeout);
    }

    /**
     * Deactivates this component by delegating to the super class.
     *
     * @param ce the component context
     */
    @Override
    @Deactivate
    protected void deactivate(ComponentContext ce) {
        super.deactivate(ce);
    }

    /**
     * Checks whether the ContentItem has a known, supported language and a
     * <code>text/plain</code> content part.
     *
     * @param ci the ContentItem to check
     * @return {@link EnhancementEngine#ENHANCE_ASYNC} if the item can be processed,
     *         {@link EnhancementEngine#CANNOT_ENHANCE} otherwise
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        String language = EnhancementEngineHelper.getLanguage(ci);
        if (language == null) {
            log.warn("Unable to enhance ContentItem {} because language of the Content is unknown. "
                    + "Please check that a language identification engine is active in this EnhancementChain.",
                ci.getUri());
            // an unknown language can never be a supported one
            return CANNOT_ENHANCE;
        }
        if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null && isLangSupported(language)) {
            return ENHANCE_ASYNC;
        }
        return CANNOT_ENHANCE;
    }

    /**
     * Extracts the plain text of the ContentItem and adds either the lemmatization
     * or the morphological analysis results to its metadata.
     *
     * @param ci the ContentItem to enhance
     * @throws EngineException if the text can not be read or the CELI service call fails
     * @throws IllegalStateException if the language is not supported or no
     *         <code>text/plain</code> content part is present (both are expected to be
     *         excluded by {@link #canEnhance(ContentItem)})
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        String language = EnhancementEngineHelper.getLanguage(ci);
        if (!isLangSupported(language)) {
            throw new IllegalStateException("Call to computeEnhancement with unsupported language '" + language
                    + "' for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! "
                    + "-> This indicates a Bug in the implementation of the EnhancementJobManager!");
        }
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
                    + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! "
                    + "-> This indicates a Bug in the implementation of the EnhancementJobManager!");
        }
        String text;
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        }
        if (text.trim().length() == 0) {
            log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
            return;
        }
        Graph graph = ci.getMetadata();
        if (this.completeMorphoAnalysis) {
            addMorphoAnalysisEnhancement(ci, text, language, graph);
        } else {
            addLemmatizationEnhancement(ci, text, language, graph);
        }
    }

    /**
     * Calls the morphological analysis of the CELI service and creates one text
     * annotation for every reading of every returned lexical entry.
     *
     * @param ci the ContentItem being enhanced (used for locking and annotation creation)
     * @param text the text to analyse
     * @param language the language of the text
     * @param g the metadata graph the triples are added to
     * @throws EngineException if the service call fails
     */
    private void addMorphoAnalysisEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
        Language lang = new Language(language); // clerezza language for PlainLiterals
        List<LexicalEntry> terms;
        try {
            // the remote call is done before acquiring the write lock
            terms = this.client.performMorfologicalAnalysis(text, language);
        } catch (IOException e) {
            throw new EngineException("Error while calling the CELI Lemmatizer service (configured URL: " + serviceURL + ")!", e);
        } catch (SOAPException e) {
            throw new EngineException("Error while encoding/decoding the request/response to the CELI lemmatizer service!", e);
        }
        // get a write lock before writing the enhancements
        ci.getLock().writeLock().lock();
        try {
            LiteralFactory literalFactory = LiteralFactory.getInstance();
            for (LexicalEntry le : terms) {
                String wordForm = le.getWordForm();
                List<CeliMorphoFeatures> mFeatures = convertLexicalEntryToMorphFeatures(le, language);
                for (CeliMorphoFeatures feat : mFeatures) {
                    // Create a text annotation for each interpretation produced by the morphological analyzer
                    IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(wordForm, lang)));
                    // position information is only written when the entry carries usable offsets
                    if (le.from >= 0 && le.to > 0) {
                        g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(le.from)));
                        g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(le.to)));
                        g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                            new PlainLiteralImpl(getSelectionContext(text, wordForm, le.from), lang)));
                    }
                    g.addAll(feat.featuresAsTriples(textAnnotation, lang));
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }

    /**
     * Calls the lemmatization of the CELI service and stores the lemmatized text
     * as {@link #hasLemmaForm} of a newly created text enhancement.
     *
     * @param ci the ContentItem being enhanced (used for locking and annotation creation)
     * @param text the text to lemmatize
     * @param language the language of the text
     * @param g the metadata graph the triple is added to
     * @throws EngineException if the service call fails
     */
    private void addLemmatizationEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
        Language lang = new Language(language); // clerezza language for PlainLiterals
        String lemmatizedContents;
        try {
            // the remote call is done before acquiring the write lock
            lemmatizedContents = this.client.lemmatizeContents(text, language);
        } catch (IOException e) {
            throw new EngineException("Error while calling the CELI Lemmatizer service (configured URL: " + serviceURL + ")!", e);
        } catch (SOAPException e) {
            throw new EngineException("Error while encoding/decoding the request/response to the CELI lemmatizer service!", e);
        }
        // get a write lock before writing the enhancements
        ci.getLock().writeLock().lock();
        try {
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            g.add(new TripleImpl(textEnhancement, CeliLemmatizerEnhancementEngine.hasLemmaForm, new PlainLiteralImpl(lemmatizedContents, lang)));
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }

    /**
     * Converts the readings of a lexical entry to morphological features.
     * Readings for which {@link CeliMorphoFeatures#parseFrom} returns
     * <code>null</code> are skipped.
     *
     * @param le the lexical entry
     * @param lang the language of the analysed text
     * @return the parsed features; empty if the entry has no usable readings
     */
    private List<CeliMorphoFeatures> convertLexicalEntryToMorphFeatures(LexicalEntry le, String lang) {
        List<CeliMorphoFeatures> result = new Vector<CeliMorphoFeatures>();
        for (Reading r : le.termReadings) {
            CeliMorphoFeatures morphoFeature = CeliMorphoFeatures.parseFrom(r, lang);
            if (morphoFeature != null) {
                result.add(morphoFeature);
            }
        }
        return result;
    }

    /**
     * @param language the language code to check (may be <code>null</code>)
     * @return <code>true</code> if the language is contained in {@link #supportedLangs}
     */
    private boolean isLangSupported(String language) {
        return supportedLangs.contains(language);
    }

    /**
     * @return an unmodifiable map holding the ordering of this engine
     *         ({@link #defaultOrder}) under the key
     *         {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING}
     */
    @Override
    public Map<String, Object> getServiceProperties() {
        return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
    }
}