blob: 9adf5bddcac6fc144502035e7075b6298a07f8c4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.tokenize;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.formats.ResourceAsStreamFactory;
import opennlp.tools.tokenize.DummyTokenizerFactory.DummyContextGenerator;
import opennlp.tools.tokenize.DummyTokenizerFactory.DummyDictionary;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
* Tests for the {@link TokenizerFactory} class.
*/
public class TokenizerFactoryTest {
private static final Locale LOCALE_DUTCH = new Locale("nl");
private static final Locale LOCALE_POLISH = new Locale("pl");
private static final Locale LOCALE_PORTUGUESE = new Locale("pt");
private static final Locale LOCALE_SPANISH = new Locale("es");
private static ObjectStream<TokenSample> createSampleStream() throws IOException {
InputStreamFactory in = new ResourceAsStreamFactory(
TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
return new TokenSampleStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
}
private static TokenizerModel train(TokenizerFactory factory)
throws IOException {
return TokenizerME.train(createSampleStream(), factory, TrainingParameters.defaultParams());
}
private static Dictionary loadAbbDictionary(Locale loc) throws IOException {
final String abbrevDict;
if (loc.equals(LOCALE_DUTCH)) {
abbrevDict = "opennlp/tools/lang/abb_NL.xml";
} else if (loc.equals(Locale.GERMAN)) {
abbrevDict = "opennlp/tools/lang/abb_DE.xml";
} else if (loc.equals(Locale.FRENCH)) {
abbrevDict = "opennlp/tools/lang/abb_FR.xml";
} else if (loc.equals(Locale.ITALIAN)) {
abbrevDict = "opennlp/tools/lang/abb_IT.xml";
} else if (loc.equals(LOCALE_POLISH)) {
abbrevDict = "opennlp/tools/lang/abb_PL.xml";
} else if (loc.equals(LOCALE_PORTUGUESE)) {
abbrevDict = "opennlp/tools/lang/abb_PT.xml";
} else if (loc.equals(LOCALE_SPANISH)) {
abbrevDict = "opennlp/tools/lang/abb_ES.xml";
} else {
abbrevDict = "opennlp/tools/lang/abb_EN.xml";
}
return new Dictionary(TokenizerFactoryTest.class.getClassLoader()
.getResourceAsStream(abbrevDict));
}
@Test
void testDefault() throws IOException {
Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
final String lang = "eng";
TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
TokenizerFactory factory = model.getFactory();
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
ByteArrayOutputStream out = new ByteArrayOutputStream();
model.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
TokenizerModel fromSerialized = new TokenizerModel(in);
factory = fromSerialized.getFactory();
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
}
@Test
void testNullDict() throws IOException {
Dictionary dic = null;
final String lang = "eng";
TokenizerModel model = train(new TokenizerFactory(lang, dic, false, null));
TokenizerFactory factory = model.getFactory();
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
ByteArrayOutputStream out = new ByteArrayOutputStream();
model.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
TokenizerModel fromSerialized = new TokenizerModel(in);
factory = fromSerialized.getFactory();
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
}
@Test
void testCustomPatternAndAlphaOpt() throws IOException {
Dictionary dic = null;
final String lang = "spa";
String pattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$";
TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
Pattern.compile(pattern)));
TokenizerFactory factory = model.getFactory();
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
ByteArrayOutputStream out = new ByteArrayOutputStream();
model.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
TokenizerModel fromSerialized = new TokenizerModel(in);
factory = fromSerialized.getFactory();
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DefaultTokenContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
}
void checkCustomPatternForTokenizerME(String lang, String pattern, String sentence,
int expectedNumTokens) throws IOException {
Locale loc = Locale.ENGLISH;
if ("dut".equals(lang) || "nld".equals(lang)) {
loc = LOCALE_DUTCH;
} else if ("deu".equals(lang)) {
loc = Locale.GERMAN;
} else if ("fra".equals(lang)) {
loc = Locale.FRENCH;
} else if ("ita".equals(lang)) {
loc = Locale.ITALIAN;
} else if ("pol".equals(lang)) {
loc = LOCALE_POLISH;
} else if ("por".equals(lang)) {
loc = LOCALE_PORTUGUESE;
} else if ("spa".equals(lang)) {
loc = LOCALE_SPANISH;
}
TokenizerModel model = train(new TokenizerFactory(lang, loadAbbDictionary(loc), true,
Pattern.compile(pattern)));
TokenizerME tokenizer = new TokenizerME(model);
String[] tokens = tokenizer.tokenize(sentence);
Assertions.assertEquals(expectedNumTokens, tokens.length);
String[] sentSplit = sentence
.replaceAll("'", " '")
.replaceAll(",", " ,")
.split(" ");
for (int i = 0; i < sentSplit.length; i++) {
String sElement = sentSplit[i];
if (i == sentSplit.length - 1) {
sElement = sElement.replace(".", ""); // compensate for sentence ending
}
Assertions.assertEquals(sElement, tokens[i]);
}
}
// For language specific patterns see: opennlp.tools.tokenize.lang.Factory
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsDeu() throws IOException {
String lang = "deu";
String pattern = "^[A-Za-z0-9äéöüÄÉÖÜß]+$";
String sentence = "Ich wähle den auf S. 183 ff. mitgeteilten Traum von der botanischen Monographie.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 14);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsDut() throws IOException {
String lang = "dut";
String pattern = "^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$";
String sentence = "Ik kies voor de droom van de botanische monografie die " +
"op p. 183 en volgende wordt beschreven.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 18);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsFra() throws IOException {
String lang = "fra";
String pattern = "^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$";
String sentence = "Je choisis le rêve de la monographie botanique communiqué à la p. 205.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 14);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsPol() throws IOException {
String lang = "pol";
String pattern = "^[A-Za-z0-9żźćńółęąśŻŹĆĄŚĘŁÓŃ]+$";
String sentence = "W szkicu autobiograficznym pt. moje życie i psychoanaliza Freud pisze, że " +
"jego przodkowie żyli przez wiele lat w Kolonii.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 21);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsPor() throws IOException {
String lang = "por";
String pattern = "^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$";
String sentence = "O povo pernambucano, tradicionalmente inimigo dos imperadores, " +
"lembrava-se do tempo em que o Sr. D. Pedro de Alcantara dava-se ao luxo " +
"de visitar o norte.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 28);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsSpa() throws IOException {
String lang = "spa";
String pattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$";
String sentence = "Elegiremos el de la monografía botánica expuesto antes del " +
"capítulo V en pág. 448 del presente volumen.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 18);
}
@Test
void testCustomPatternForTokenizerMEPor() throws IOException {
String lang = "por";
String pattern = "^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$";
String sentence = "Na floresta mágica a raposa dança com unicórnios felizes.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 10);
}
@Test
void testCustomPatternForTokenizerMESpa() throws IOException {
String lang = "spa";
String pattern = "^[0-9a-záéíóúüýñA-ZÁÉÍÓÚÝÑ]+$";
String sentence = "En el verano los niños juegan en el parque y sus risas crean alegría.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 15);
}
@Test
void testCustomPatternForTokenizerMECat() throws IOException {
String lang = "cat";
String pattern = "^[0-9a-zàèéíïòóúüçA-ZÀÈÉÍÏÒÓÚÜÇ]+$";
String sentence = "Als xiuxiuejants avets l'os blau neda amb cignes i s'ho passen bé.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 15);
}
@Test
void testCustomPatternForTokenizerMEIta() throws IOException {
String lang = "ita";
String pattern = "^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$";
String sentence = "Cosa fare di domenica per migliorare il tuo lunedì.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 10);
}
@Test
void testCustomPatternForTokenizerMEWithAbbreviationsIta() throws IOException {
String lang = "ita";
String pattern = "^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$";
String sentence = "La chiesa fu costruita fra il 1258 ed il 1308 ca. come chiesa " +
"del convento degli Agostiniani.";
checkCustomPatternForTokenizerME(lang, pattern, sentence, 18);
}
@Test
void testContractionsIta() throws IOException {
Dictionary dic = null;
String lang = "ita";
String pattern = "^[0-9a-zàèéìîíòóùüA-ZÀÈÉÌÎÍÒÓÙÜ]+$";
TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
Pattern.compile(pattern)));
TokenizerME tokenizer = new TokenizerME(model);
String sentence = "La contrazione di \"dove è\" è \"dov'è\".";
String[] tokens = tokenizer.tokenize(sentence);
Assertions.assertEquals(11, tokens.length);
String[] sentSplit = sentence.replaceAll("\\.", " .")
.replaceAll("'", " '").replaceAll("([^ ])\"", "$1 \"").split(" ");
for (int i = 0; i < sentSplit.length; i++) {
Assertions.assertEquals(sentSplit[i], tokens[i]);
}
}
@Test
void testContractionsEng() throws IOException {
Dictionary dic = null;
String lang = "eng";
String pattern = "^[A-Za-z0-9]+$";
TokenizerModel model = train(new TokenizerFactory(lang, dic, true,
Pattern.compile(pattern)));
TokenizerME tokenizer = new TokenizerME(model);
String sentence = "The cat wasn't in the house and the dog wasn't either.";
String[] tokens = tokenizer.tokenize(sentence);
Assertions.assertEquals(14, tokens.length);
String[] sentSplit = sentence.replaceAll("\\.", " .")
.replaceAll("'", " '").split(" ");
for (int i = 0; i < sentSplit.length; i++) {
Assertions.assertEquals(sentSplit[i], tokens[i]);
}
}
@Test
void testDummyFactory() throws IOException {
Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
final String lang = "eng";
String pattern = "^[0-9A-Za-z]+$";
TokenizerModel model = train(new DummyTokenizerFactory(lang, dic, true,
Pattern.compile(pattern)));
TokenizerFactory factory = model.getFactory();
Assertions.assertInstanceOf(DummyDictionary.class, factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DummyContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
ByteArrayOutputStream out = new ByteArrayOutputStream();
model.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
TokenizerModel fromSerialized = new TokenizerModel(in);
factory = fromSerialized.getFactory();
Assertions.assertInstanceOf(DummyDictionary.class, factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DummyContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
}
@Test
void testCreateDummyFactory() throws IOException {
Dictionary dic = loadAbbDictionary(Locale.ENGLISH);
final String lang = "eng";
String pattern = "^[0-9A-Za-z]+$";
TokenizerFactory factory = TokenizerFactory.create(
DummyTokenizerFactory.class.getCanonicalName(), lang, dic, true,
Pattern.compile(pattern));
Assertions.assertInstanceOf(DummyDictionary.class, factory.getAbbreviationDictionary());
Assertions.assertInstanceOf(DummyContextGenerator.class, factory.getContextGenerator());
Assertions.assertEquals(pattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertTrue(factory.isUseAlphaNumericOptimization());
}
}