enhancement-engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/CeliMorphoFeatures.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.celi;

 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;

 import java.util.Collection;
 import java.util.List;
 import java.util.Map.Entry;
 import java.util.Vector;

 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine;
 import org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.Reading;
 import org.apache.stanbol.enhancer.nlp.model.Token;
 import org.apache.stanbol.enhancer.nlp.morpho.Case;
 import org.apache.stanbol.enhancer.nlp.morpho.CaseTag;
 import org.apache.stanbol.enhancer.nlp.morpho.Definitness;
 import org.apache.stanbol.enhancer.nlp.morpho.GenderTag;
 import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
 import org.apache.stanbol.enhancer.nlp.morpho.NumberTag;
 import org.apache.stanbol.enhancer.nlp.morpho.Person;
 import org.apache.stanbol.enhancer.nlp.morpho.Tense;
 import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
 import org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.PosTag;

 /**
  * Represents a morphological interpretation of a {@link Token word}. Words might have different interpretations (typically depending on the POS) so this Tag allows to add information about all possible interpretations to a single word. This is
  * needed if no POS information are present or if POS tags are ambiguous or of low confidence.
  * <p>
  * <b>TODO</b>s:
  * <ul>
  * <li>I would like to have {@link Case}, {@link Tense}, ... as own Annotations. However AFAIK those are all grouped to a single interpretation of the Token (driven by the POS tag).</li>
  * <li>Maybe add a possibility to add unmapped information as <code>Map&lt;String,List&lt;String&gt;&gt;</code>
  * </ul>
  *
  * @author Alessio Bosca
  *
  */
 public class CeliMorphoFeatures extends MorphoFeatures{

     private static CeliTagSetRegistry tagRegistry = CeliTagSetRegistry.getInstance();

     public static final UriRef HAS_NUMBER = new UriRef("http://purl.org/olia/olia.owl#hasNumber");
     public static final UriRef HAS_GENDER = new UriRef("http://purl.org/olia/olia.owl#hasGender");
     public static final UriRef HAS_PERSON = new UriRef("http://purl.org/olia/olia.owl#hasPerson");
     public static final UriRef HAS_CASE = new UriRef("http://purl.org/olia/olia.owl#hasCase");
     public static final UriRef HAS_DEFINITENESS = new UriRef("http://purl.org/olia/olia.owl#hasDefiniteness");
     public static final UriRef HAS_MOOD = new UriRef("http://purl.org/olia/olia.owl#hasMood");
     public static final UriRef HAS_TENSE = new UriRef("http://purl.org/olia/olia.owl#hasTense");

     public static CeliMorphoFeatures parseFrom(Reading reading, String lang){
         if(reading == null){
             return null;
         }
         CeliMorphoFeatures morphoFeature = new CeliMorphoFeatures(reading.getLemma());
         //parse the key,value pairs of the reading using the language as context
         for (Entry<String, List<String>> entry : reading.getLexicalFeatures().entrySet()) {
             String feature = entry.getKey();
             for (String value : entry.getValue()) {
                 if (feature.equals("POS")) {
                     morphoFeature.addPos(tagRegistry.getPosTag(lang,value));
                 } else if (feature.equals("CASE")) {
                     morphoFeature.addCase(tagRegistry.getCaseTag(lang,value));
                 } else if (feature.equals("GENDER")) {
                     morphoFeature.addGender(tagRegistry.getGenderTag(lang,value));
                 } else if (feature.equals("NUMBER")) {
                     morphoFeature.addNumber(tagRegistry.getNumber(lang,value));
                 } else if (feature.equals("PERSON")) {
                     morphoFeature.addPerson(tagRegistry.getPerson(lang,value));
                 } else if (feature.equals("VERB_FORM") || feature.equals("VFORM")) {
                     morphoFeature.addVerbForm(tagRegistry.getVerbMoodTag(lang,value));
                 } else if (feature.equals("TENSE") || feature.equals("VERB_TENSE")) {
                     morphoFeature.addTense(tagRegistry.getTenseTag(lang,value));
                 }
             }
         }
         return morphoFeature;
     }
     /**
      * Use {@link #parseFrom(Reading, String)} to instantiate
      * @param lemma
      */
     private CeliMorphoFeatures(String lemma) {
 	    super(lemma);
 	}

 	public Collection<? extends Triple> featuresAsTriples(UriRef textAnnotation, Language lang) {
 		Collection<TripleImpl> result = new Vector<TripleImpl>();
 		result.add(new TripleImpl(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm,
 		    new PlainLiteralImpl(getLemma(), lang)));
 		for(PosTag pos: getPosList()){
 		    if(pos.isMapped()){
 		        for(LexicalCategory cat : pos.getCategories()){
 		            result.add(new TripleImpl(textAnnotation, RDF_TYPE, cat.getUri()));
 		        }
 		    }
 		}
 		for(NumberTag num : getNumberList()){
 		    if(num.getNumber() != null){
 		        result.add(new TripleImpl(textAnnotation, HAS_NUMBER, num.getNumber().getUri()));
 		    }
 		}
 		for(Person pers : getPersonList()){
 	        result.add(new TripleImpl(textAnnotation, HAS_PERSON, pers.getUri()));
 		}
 		for(GenderTag gender : getGenderList()){
 		    if(gender.getGender() != null){
 		        result.add(new TripleImpl(textAnnotation, HAS_GENDER, gender.getGender().getUri()));
 		    }
 		}
 		for(Definitness def : getDefinitnessList()){
 	        result.add(new TripleImpl(textAnnotation, HAS_DEFINITENESS, def.getUri()));
 		}
 		for(CaseTag caseFeat : getCaseList()){
 		    if(caseFeat.getCase() != null){
 		        result.add(new TripleImpl(textAnnotation, HAS_CASE, caseFeat.getCase().getUri()));
 		    }
 		}
 		for (VerbMoodTag vf : getVerbMoodList()){
 		    if(vf.getVerbForm() != null){
 		        result.add(new TripleImpl(textAnnotation, HAS_MOOD, vf.getVerbForm().getUri()));
 		    }
 		}
 		for(TenseTag tense : getTenseList()){
 		    if(tense.getTense() != null){
 		        result.add(new TripleImpl(textAnnotation, HAS_TENSE, tense.getTense().getUri()));
 		    }
 		}
 		return result;
 	}
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.celi;

	import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;

	import java.util.Collection;
	import java.util.List;
	import java.util.Map.Entry;
	import java.util.Vector;

	import org.apache.clerezza.rdf.core.Language;
	import org.apache.clerezza.rdf.core.Triple;
	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
	import org.apache.clerezza.rdf.core.impl.TripleImpl;
	import org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine;
	import org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.Reading;
	import org.apache.stanbol.enhancer.nlp.model.Token;
	import org.apache.stanbol.enhancer.nlp.morpho.Case;
	import org.apache.stanbol.enhancer.nlp.morpho.CaseTag;
	import org.apache.stanbol.enhancer.nlp.morpho.Definitness;
	import org.apache.stanbol.enhancer.nlp.morpho.GenderTag;
	import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
	import org.apache.stanbol.enhancer.nlp.morpho.NumberTag;
	import org.apache.stanbol.enhancer.nlp.morpho.Person;
	import org.apache.stanbol.enhancer.nlp.morpho.Tense;
	import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
	import org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag;
	import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
	import org.apache.stanbol.enhancer.nlp.pos.PosTag;

	/**
	* Represents a morphological interpretation of a {@link Token word}. Words might have different interpretations (typically depending on the POS) so this Tag allows to add information about all possible interpretations to a single word. This is
	* needed if no POS information are present or if POS tags are ambiguous or of low confidence.
	* <p>
	* <b>TODO</b>s:
	* <ul>
	* <li>I would like to have {@link Case}, {@link Tense}, ... as own Annotations. However AFAIK those are all grouped to a single interpretation of the Token (driven by the POS tag).</li>
	* <li>Maybe add a possibility to add unmapped information as <code>Map<String,List<String>></code>
	* </ul>
	*
	* @author Alessio Bosca
	*
	*/
	public class CeliMorphoFeatures extends MorphoFeatures{

	private static CeliTagSetRegistry tagRegistry = CeliTagSetRegistry.getInstance();

	public static final UriRef HAS_NUMBER = new UriRef("http://purl.org/olia/olia.owl#hasNumber");
	public static final UriRef HAS_GENDER = new UriRef("http://purl.org/olia/olia.owl#hasGender");
	public static final UriRef HAS_PERSON = new UriRef("http://purl.org/olia/olia.owl#hasPerson");
	public static final UriRef HAS_CASE = new UriRef("http://purl.org/olia/olia.owl#hasCase");
	public static final UriRef HAS_DEFINITENESS = new UriRef("http://purl.org/olia/olia.owl#hasDefiniteness");
	public static final UriRef HAS_MOOD = new UriRef("http://purl.org/olia/olia.owl#hasMood");
	public static final UriRef HAS_TENSE = new UriRef("http://purl.org/olia/olia.owl#hasTense");

	public static CeliMorphoFeatures parseFrom(Reading reading, String lang){
	if(reading == null){
	return null;
	}
	CeliMorphoFeatures morphoFeature = new CeliMorphoFeatures(reading.getLemma());
	//parse the key,value pairs of the reading using the language as context
	for (Entry<String, List<String>> entry : reading.getLexicalFeatures().entrySet()) {
	String feature = entry.getKey();
	for (String value : entry.getValue()) {
	if (feature.equals("POS")) {
	morphoFeature.addPos(tagRegistry.getPosTag(lang,value));
	} else if (feature.equals("CASE")) {
	morphoFeature.addCase(tagRegistry.getCaseTag(lang,value));
	} else if (feature.equals("GENDER")) {
	morphoFeature.addGender(tagRegistry.getGenderTag(lang,value));
	} else if (feature.equals("NUMBER")) {
	morphoFeature.addNumber(tagRegistry.getNumber(lang,value));
	} else if (feature.equals("PERSON")) {
	morphoFeature.addPerson(tagRegistry.getPerson(lang,value));
	} else if (feature.equals("VERB_FORM") \|\| feature.equals("VFORM")) {
	morphoFeature.addVerbForm(tagRegistry.getVerbMoodTag(lang,value));
	} else if (feature.equals("TENSE") \|\| feature.equals("VERB_TENSE")) {
	morphoFeature.addTense(tagRegistry.getTenseTag(lang,value));
	}
	}
	}
	return morphoFeature;
	}
	/**
	* Use {@link #parseFrom(Reading, String)} to instantiate
	* @param lemma
	*/
	private CeliMorphoFeatures(String lemma) {
	super(lemma);
	}

	public Collection<? extends Triple> featuresAsTriples(UriRef textAnnotation, Language lang) {
	Collection<TripleImpl> result = new Vector<TripleImpl>();
	result.add(new TripleImpl(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm,
	new PlainLiteralImpl(getLemma(), lang)));
	for(PosTag pos: getPosList()){
	if(pos.isMapped()){
	for(LexicalCategory cat : pos.getCategories()){
	result.add(new TripleImpl(textAnnotation, RDF_TYPE, cat.getUri()));
	}
	}
	}
	for(NumberTag num : getNumberList()){
	if(num.getNumber() != null){
	result.add(new TripleImpl(textAnnotation, HAS_NUMBER, num.getNumber().getUri()));
	}
	}
	for(Person pers : getPersonList()){
	result.add(new TripleImpl(textAnnotation, HAS_PERSON, pers.getUri()));
	}
	for(GenderTag gender : getGenderList()){
	if(gender.getGender() != null){
	result.add(new TripleImpl(textAnnotation, HAS_GENDER, gender.getGender().getUri()));
	}
	}
	for(Definitness def : getDefinitnessList()){
	result.add(new TripleImpl(textAnnotation, HAS_DEFINITENESS, def.getUri()));
	}
	for(CaseTag caseFeat : getCaseList()){
	if(caseFeat.getCase() != null){
	result.add(new TripleImpl(textAnnotation, HAS_CASE, caseFeat.getCase().getUri()));
	}
	}
	for (VerbMoodTag vf : getVerbMoodList()){
	if(vf.getVerbForm() != null){
	result.add(new TripleImpl(textAnnotation, HAS_MOOD, vf.getVerbForm().getUri()));
	}
	}
	for(TenseTag tense : getTenseList()){
	if(tense.getTense() != null){
	result.add(new TripleImpl(textAnnotation, HAS_TENSE, tense.getTense().getUri()));
	}
	}
	return result;
	}
	}