blob: f6309ecf9d41906e0453e7cbb685e644047e389d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.opennlp.pos.model;
import java.util.HashMap;
import java.util.Map;
import org.apache.stanbol.commons.opennlp.PosTagsCollectionEnum;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.pos.olia.English;
import org.apache.stanbol.enhancer.nlp.pos.olia.German;
import org.apache.stanbol.enhancer.nlp.pos.olia.Spanish;
/**
* {@link TagSet}s for known <a herf="http://opennlp.apache.org/">OpenNLP</a>
* POS models.<p>
* When available this refers to models defined by the
* <a herf="http://nlp2rdf.lod2.eu/olia/">OLIA</a> Ontologies. Other TagSets
* are - for now - directly defined in this class.
* <p>
* Specifications in this class are based on {@link PosTagsCollectionEnum}.
* Links/defines to the POS {@link TagSet}s used by
*
* @author Rupert Westenthaler
*
*/
public final class PosTagSetRegistry {
private static PosTagSetRegistry instance = new PosTagSetRegistry();
private PosTagSetRegistry(){}
private final Map<String, TagSet<PosTag>> models = new HashMap<String,TagSet<PosTag>>();
public static PosTagSetRegistry getInstance(){
return instance;
}
private void add(TagSet<PosTag> model) {
for(String lang : model.getLanguages()){
if(models.put(lang, model) != null){
throw new IllegalStateException("Multiple Models for Language '"
+ lang+"'! This is an error in the static confituration of "
+ "this class. Please report this to the stanbol-dev mailing"
+ "list!");
}
}
}
/**
* Getter for the {@link TagSet} by language. If no {@link TagSet}
* is available for an Language this will return <code>null</code>
* @param language the language
* @return the AnnotationModel or <code>null</code> if non is defined
*/
public TagSet<PosTag> getTagSet(String language){
return models.get(language);
}
/**
* Links to the Penn Treebank model as defined by the
* <a herf="http://nlp2rdf.lod2.eu/olia/">OLIA</a> Ontology.
* @see English#PENN_TREEBANK
*/
public static final TagSet<PosTag> ENGLISH = English.PENN_TREEBANK;
static { //adds the English model to the getInstance()
getInstance().add(ENGLISH);
}
/**
* Links to the STTS model as defined by the
* <a herf="http://nlp2rdf.lod2.eu/olia/">OLIA</a> Ontology.
* @see German#STTS
*/
public static final TagSet<PosTag> GERMAN = German.STTS;
static { //adds the English model to the getInstance()
getInstance().add(GERMAN);
}
/**
* Links to the PAROLE model as defined by the
* <a herf="http://nlp2rdf.lod2.eu/olia/">OLIA</a> Ontology.
* @see Spanish#PAROLE
*/
public static final TagSet<PosTag> SPANISH = Spanish.PAROLE;
static { //adds the Spanish model to the getInstance()
getInstance().add(SPANISH);
}
/**
* POS types representing Nouns for Danish based on the PAROLE Tagset as
* described by <a href="http://korpus.dsl.dk/paroledoc_en.pdf">this paper</a>
*/
public static final TagSet<PosTag> DANISH = new TagSet<PosTag>("PAROLE Danish","da");
static {
DANISH.addTag(new PosTag("N",LexicalCategory.Noun));
DANISH.addTag(new PosTag("NP",Pos.ProperNoun));
DANISH.addTag(new PosTag("NC",Pos.CommonNoun));
DANISH.addTag(new PosTag("AC",Pos.CardinalNumber)); //numbers
DANISH.addTag(new PosTag("AO",Pos.OrdinalNumber)); //numbers
DANISH.addTag(new PosTag("AN",LexicalCategory.Adjective));
DANISH.addTag(new PosTag("XX",Pos.Typo)); //non-words (incl typos ..)
DANISH.addTag(new PosTag("XF",Pos.Foreign)); //foreign word
DANISH.addTag(new PosTag("XR",Pos.Symbol)); //symbol letters
DANISH.addTag(new PosTag("XA",LexicalCategory.Noun)); //abbreviations
DANISH.addTag(new PosTag("XS",Pos.Abbreviation)); //abbreviations
DANISH.addTag(new PosTag("V",LexicalCategory.Verb));
DANISH.addTag(new PosTag("VA", Pos.MainVerb));
DANISH.addTag(new PosTag("VAD", Pos.MainVerb, Pos.IndicativeVerb));
DANISH.addTag(new PosTag("VAF", Pos.MainVerb, Pos.Infinitive));
DANISH.addTag(new PosTag("VAG", Pos.MainVerb, Pos.Gerund));
DANISH.addTag(new PosTag("VAPR", Pos.MainVerb, Pos.PresentParticiple));
DANISH.addTag(new PosTag("VAPA", Pos.MainVerb, Pos.PastParticiple));
DANISH.addTag(new PosTag("VE", LexicalCategory.Verb)); //TODO MedialVerb is missing
DANISH.addTag(new PosTag("VED",Pos.IndicativeVerb)); //TODO MedialVerb is missing
DANISH.addTag(new PosTag("VEF",Pos.Infinitive)); //TODO MedialVerb is missing
DANISH.addTag(new PosTag("XP",LexicalCategory.Punctuation));
DANISH.addTag(new PosTag("CC",Pos.CoordinatingConjunction));
DANISH.addTag(new PosTag("SC",Pos.SubordinatingConjunction));
DANISH.addTag(new PosTag("U")); //unmarked for degree
DANISH.addTag(new PosTag("SP",Pos.Preposition));
DANISH.addTag(new PosTag("R",Pos.AdjectivalAdverb));
DANISH.addTag(new PosTag("RG",LexicalCategory.Adverb));
DANISH.addTag(new PosTag("PD",Pos.DemonstrativePronoun));
DANISH.addTag(new PosTag("PI",Pos.IndefinitePronoun));
DANISH.addTag(new PosTag("PT",Pos.InterrogativePronoun,Pos.RelativePronoun));
DANISH.addTag(new PosTag("PP",Pos.PersonalPronoun)); //unsure
DANISH.addTag(new PosTag("PO",Pos.PossessivePronoun)); //unsure
DANISH.addTag(new PosTag("PC",Pos.ReciprocalPronoun)); //unsure
DANISH.addTag(new PosTag("U=",LexicalCategory.Unique)); //unsure
DANISH.addTag(new PosTag("I=",LexicalCategory.Interjection)); //unsure
getInstance().add(DANISH);
}
/**
* POS tags for the Portuguese POS model of OpenNLP based the
* <a href="http://beta.visl.sdu.dk/visl/pt/symbolset-floresta.html">PALAVRAS tag set</a>
* <p>
*/
public static final TagSet<PosTag> PORTUGUESE = new TagSet<PosTag>("PALAVRAS Portuguese","pt");
static {
PORTUGUESE.addTag(new PosTag("n",Pos.CommonNoun));
PORTUGUESE.addTag(new PosTag("prop",Pos.ProperNoun));
PORTUGUESE.addTag(new PosTag("adj",LexicalCategory.Adjective));
PORTUGUESE.addTag(new PosTag("v-fin",Pos.FiniteVerb));
PORTUGUESE.addTag(new PosTag("v-inf",Pos.Infinitive));
PORTUGUESE.addTag(new PosTag("v-pcp",Pos.Participle));
PORTUGUESE.addTag(new PosTag("v-ger",Pos.Gerund));
PORTUGUESE.addTag(new PosTag("art",Pos.Article));
PORTUGUESE.addTag(new PosTag("pron",Pos.Pronoun));
PORTUGUESE.addTag(new PosTag("pron-pers",Pos.PersonalPronoun));
PORTUGUESE.addTag(new PosTag("pron-det",Pos.DeterminalPronoun));
PORTUGUESE.addTag(new PosTag("pron-indp",Pos.Pronoun)); //TODO: missing independent pronoun
PORTUGUESE.addTag(new PosTag("adv",LexicalCategory.Adverb));
PORTUGUESE.addTag(new PosTag("num",Pos.Numeral));
PORTUGUESE.addTag(new PosTag("prp",Pos.Preposition));
PORTUGUESE.addTag(new PosTag("in",LexicalCategory.Interjection));
PORTUGUESE.addTag(new PosTag("conj-s",Pos.SubordinatingConjunction));
PORTUGUESE.addTag(new PosTag("conj-c",Pos.CoordinatingConjunction));
PORTUGUESE.addTag(new PosTag("punc",LexicalCategory.Punctuation)); //missing on the webpage ^
getInstance().add(PORTUGUESE);
}
/**
* POS tags used by the Dutch POS model of OpenNLP for Dutch.<p>
* Source: J.T. Berghmans, "WOTAN: Een automatische grammatikale tagger
* voor het Nederlands", doctoral dissertation, Department of language &
* Speech, Nijmegen University (renamed to Radboud University),
* december 1994.<p>
* <b>NOTE:</b> This {@link TagSet} DOES NOT distinquish beteen Proper- and
* Common- Nouns!<p>
*/
public static final TagSet<PosTag> DUTCH = new TagSet<PosTag>("WOTAN Dutch","nl");
static {
DUTCH.addTag(new PosTag("Adj",LexicalCategory.Adjective));
DUTCH.addTag(new PosTag("Adv",LexicalCategory.Adverb));
DUTCH.addTag(new PosTag("Art",Pos.Article));
DUTCH.addTag(new PosTag("Conj",LexicalCategory.Conjuction));
DUTCH.addTag(new PosTag("Int",LexicalCategory.Interjection));
DUTCH.addTag(new PosTag("N",LexicalCategory.Noun));
DUTCH.addTag(new PosTag("Num",Pos.Numeral));
DUTCH.addTag(new PosTag("Misc"));
DUTCH.addTag(new PosTag("Prep",Pos.Preposition));
DUTCH.addTag(new PosTag("Pron",Pos.Pronoun));
DUTCH.addTag(new PosTag("Punc",LexicalCategory.Punctuation));
DUTCH.addTag(new PosTag("V",LexicalCategory.Verb));
getInstance().add(DUTCH);
}
/**
* POS tags used by the Swedish POS model of OpenNLP for Swedish based on the
* <a href="http://w3.msi.vxu.se/users/nivre/research/MAMBAlex.html">
* Lexical categories in MAMBA</a>
* Most of the <i>'interesting'</i> {@link Pos} mappings would be defined
* as "Features" of MABAS.
*/
public static final TagSet<PosTag> SWEDISH = new TagSet<PosTag>("MAMBA Swedish","sv");
static {
SWEDISH.addTag(new PosTag("PN",Pos.ProperNoun));
SWEDISH.addTag(new PosTag("MN",Pos.CommonNoun)); //TODO: missing Meta-Nouns
SWEDISH.addTag(new PosTag("AN",Pos.CommonNoun)); //TODO: missing Adjectival noun
SWEDISH.addTag(new PosTag("VN",Pos.VerbalNoun));
SWEDISH.addTag(new PosTag("NN",Pos.CommonNoun));
SWEDISH.addTag(new PosTag("PO",Pos.Pronoun));
SWEDISH.addTag(new PosTag("EN",Pos.IndefiniteArticle, Pos.Numeral));
SWEDISH.addTag(new PosTag("RO",Pos.Numeral));
SWEDISH.addTag(new PosTag("AJ",LexicalCategory.Adjective));
SWEDISH.addTag(new PosTag("AV",LexicalCategory.Verb)); //"vara" (be)
SWEDISH.addTag(new PosTag("BV",LexicalCategory.Verb)); //"bli(va)" (become)
SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb)); //"ha(va)" (have)
SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb)); //"vilja" (want)
SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb)); //"kunna" (can)
SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb)); //"måste" (must)
SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb)); // locution "komma att" (periphrastic future)
SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb)); //"skola" (will, shall)
SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb)); //"göra" (do, make)
SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb)); //få" (get)
SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb)); //all other verbs
SWEDISH.addTag(new PosTag("TP",Pos.PastParticiple)); //PerfectParticle
SWEDISH.addTag(new PosTag("SP",Pos.PresentParticiple));
SWEDISH.addTag(new PosTag("AB",LexicalCategory.Adverb));
SWEDISH.addTag(new PosTag("PR",Pos.Preposition));
SWEDISH.addTag(new PosTag("IM",Pos.Infinitive));
SWEDISH.addTag(new PosTag("++",Pos.CoordinatingConjunction));
SWEDISH.addTag(new PosTag("UK",Pos.SubordinatingConjunction));
SWEDISH.addTag(new PosTag("IK",Pos.Comma));
SWEDISH.addTag(new PosTag("IP",Pos.Point));
SWEDISH.addTag(new PosTag("I?",Pos.QuestionMark));
SWEDISH.addTag(new PosTag("IU",Pos.ExclamativePoint));
SWEDISH.addTag(new PosTag("IQ",Pos.Colon));
SWEDISH.addTag(new PosTag("IS",Pos.SemiColon));
SWEDISH.addTag(new PosTag("IT",Pos.Hyphen));
SWEDISH.addTag(new PosTag("IR",Pos.ParentheticalPunctuation));
SWEDISH.addTag(new PosTag("IC",LexicalCategory.Punctuation));
SWEDISH.addTag(new PosTag("PU",Pos.ListMarker));
SWEDISH.addTag(new PosTag("IG",LexicalCategory.Punctuation));
SWEDISH.addTag(new PosTag("YY",Pos.Interjection));
SWEDISH.addTag(new PosTag("ID",LexicalCategory.Noun));
SWEDISH.addTag(new PosTag("XX"));
getInstance().add(SWEDISH);
}
}