/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.email;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.EmojiTokenizationTestUnicode_11_0;
import org.apache.lucene.analysis.standard.WordBreakTestUnicode_9_0_0;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
// LUCENE-5440: extremely slow tokenization of text matching email <local-part> (before the '@')
@Nightly
public void testLongEMAILatomText() throws Exception {
    // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~] (note: '+-/' is a range, so ',' and '.' are included)
char[] emailAtomChars =
"!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~"
.toCharArray();
StringBuilder builder = new StringBuilder();
int numChars = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
for (int i = 0; i < numChars; ++i) {
builder.append(emailAtomChars[random().nextInt(emailAtomChars.length)]);
}
int tokenCount = 0;
UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer();
String text = builder.toString();
ts.setReader(new StringReader(text));
ts.reset();
while (ts.incrementToken()) {
tokenCount++;
}
ts.end();
ts.close();
assertTrue(tokenCount > 0);
tokenCount = 0;
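    // Second pass: re-tokenize the same text with a smaller, random max token length to
    // exercise the tokenizer's buffer handling (same LUCENE-5440 concern as above).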
int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
ts.setMaxTokenLength(newBufferSize);
ts.setReader(new StringReader(text));
ts.reset();
while (ts.incrementToken()) {
tokenCount++;
}
ts.end();
ts.close();
assertTrue(tokenCount > 0);
}
public void testHugeDoc() throws IOException {
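    // A long whitespace prefix places the real tokens near the tokenizer's internal
    // read-buffer boundary.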
StringBuilder sb = new StringBuilder();
    char[] whitespace = new char[4094];
Arrays.fill(whitespace, ' ');
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
tokenizer.setReader(new StringReader(input));
BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] {"testing", "1234"});
}
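  /**
   * Analyzers under test: {@code a} passes all tokens through, while {@code urlAnalyzer} and
   * {@code emailAnalyzer} keep only {@code <URL>}- and {@code <EMAIL>}-typed tokens,
   * respectively.
   */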
private Analyzer a, urlAnalyzer, emailAnalyzer;
@Override
public void setUp() throws Exception {
super.setUp();
a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
return new TokenStreamComponents(tokenizer);
}
};
urlAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
tokenizer.setMaxTokenLength(
UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT); // Tokenize arbitrary length URLs
TokenFilter filter = new URLFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
emailAnalyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
TokenFilter filter = new EmailFilter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
};
}
@Override
public void tearDown() throws Exception {
IOUtils.close(a, urlAnalyzer, emailAnalyzer);
super.tearDown();
}
/** Passes through tokens with type "<URL>" and blocks all other types. */
private static class URLFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public URLFilter(TokenStream in) {
super(in);
}
@Override
    public final boolean incrementToken() throws IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
        if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL].equals(typeAtt.type())) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
/** Passes through tokens with type "<EMAIL>" and blocks all other types. */
private static class EmailFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
public EmailFilter(TokenStream in) {
super(in);
}
@Override
    public final boolean incrementToken() throws IOException {
boolean isTokenAvailable = false;
while (input.incrementToken()) {
        if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL].equals(typeAtt.type())) {
isTokenAvailable = true;
break;
}
}
return isTokenAvailable;
}
}
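  /**
   * Illustrative sketch only (not part of the original test): URLFilter and EmailFilter differ
   * only in the token type they pass through, so the same technique can be written once as a
   * filter parameterized by the desired type. The name TokenTypeFilter is hypothetical.
   */
  private static class TokenTypeFilter extends TokenFilter {
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final String passType;

    TokenTypeFilter(TokenStream in, String passType) {
      super(in);
      this.passType = passType;
    }

    @Override
    public final boolean incrementToken() throws IOException {
      // Consume tokens from the wrapped stream until one with the desired type is found.
      while (input.incrementToken()) {
        if (passType.equals(typeAtt.type())) {
          return true;
        }
      }
      return false;
    }
  }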
public void testArmenian() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] {
"Վիքիպեդիայի",
"13",
"միլիոն",
"հոդվածները",
"4,600",
"հայերեն",
"վիքիպեդիայում",
"գրվել",
"են",
"կամավորների",
"կողմից",
"ու",
"համարյա",
"բոլոր",
"հոդվածները",
"կարող",
"է",
"խմբագրել",
"ցանկաց",
"մարդ",
"ով",
"կարող",
"է",
"բացել",
"Վիքիպեդիայի",
"կայքը"
});
}
public void testAmharic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
new String[] {
"ዊኪፔድያ",
"የባለ",
"ብዙ",
"ቋንቋ",
"የተሟላ",
"ትክክለኛና",
"ነጻ",
"መዝገበ",
"ዕውቀት",
"ኢንሳይክሎፒዲያ",
"ነው",
"ማንኛውም"
});
}
public void testArabic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] {
"الفيلم",
"الوثائقي",
"الأول",
"عن",
"ويكيبيديا",
"يسمى",
"الحقيقة",
"بالأرقام",
"قصة",
"ويكيبيديا",
"بالإنجليزية",
"Truth",
"in",
"Numbers",
"The",
"Wikipedia",
"Story",
"سيتم",
"إطلاقه",
"في",
"2008"
});
}
public void testAramaic() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] {
"ܘܝܩܝܦܕܝܐ",
"ܐܢܓܠܝܐ",
"Wikipedia",
"ܗܘ",
"ܐܝܢܣܩܠܘܦܕܝܐ",
"ܚܐܪܬܐ",
"ܕܐܢܛܪܢܛ",
"ܒܠܫܢ̈ܐ",
"ܣܓܝܐ̈ܐ",
"ܫܡܗ",
"ܐܬܐ",
"ܡܢ",
"ܡ̈ܠܬܐ",
"ܕ",
"ܘܝܩܝ",
"ܘ",
"ܐܝܢܣܩܠܘܦܕܝܐ"
});
}
public void testBengali() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
new String[] {
"এই",
"বিশ্বকোষ",
"পরিচালনা",
"করে",
"উইকিমিডিয়া",
"ফাউন্ডেশন",
"একটি",
"অলাভজনক",
"সংস্থা",
"উইকিপিডিয়ার",
"শুরু",
"১৫",
"জানুয়ারি",
"২০০১",
"সালে",
"এখন",
"পর্যন্ত",
"২০০টিরও",
"বেশী",
"ভাষায়",
"উইকিপিডিয়া",
"রয়েছে"
});
}
public void testFarsi() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
new String[] {
"ویکی",
"پدیای",
"انگلیسی",
"در",
"تاریخ",
"۲۵",
"دی",
"۱۳۷۹",
"به",
"صورت",
"مکملی",
"برای",
"دانشنامهٔ",
"تخصصی",
"نوپدیا",
"نوشته",
"شد"
});
}
public void testGreek() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] {
"Γράφεται",
"σε",
"συνεργασία",
"από",
"εθελοντές",
"με",
"το",
"λογισμικό",
"wiki",
"κάτι",
"που",
"σημαίνει",
"ότι",
"άρθρα",
"μπορεί",
"να",
"προστεθούν",
"ή",
"να",
"αλλάξουν",
"από",
"τον",
"καθένα"
});
}
public void testThai() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
new String[] {"การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔"});
}
public void testLao() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"ສາທາລະນະລັດ ປະຊາທິປະໄຕ ປະຊາຊົນລາວ",
new String[] {"ສາທາລະນະລັດ", "ປະຊາທິປະໄຕ", "ປະຊາຊົນລາວ"});
}
public void testTibetan() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] {
"སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག",
"མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར",
"ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ"
});
}
  /*
   * For Chinese, tokenize per character (these can later form bigrams or whatever)
   */
public void testChinese() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "我是中国人。 1234 Tests ", new String[] {"我", "是", "中", "国", "人", "1234", "Tests"});
}
public void testEmpty() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, ".", new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(a, " ", new String[] {});
}
  /* Test various Jira issues this analyzer is related to */
public void testLUCENE1545() throws Exception {
    /*
     * Standard analyzer does not correctly tokenize the combining character U+0364 COMBINING LATIN SMALL LETTER E.
     * The word "moͤchte" is incorrectly tokenized into "mo" "chte"; the combining character is lost.
     * The expected result is a single token "moͤchte".
     */
BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] {"moͤchte"});
}
/* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception {
// alphanumeric tokens
BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[] {"B2B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[] {"2B"});
}
public void testDelimitersSA() throws Exception {
// other delimiters: "-", "/", ","
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "some-dashed-phrase", new String[] {"some", "dashed", "phrase"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "dogs,chase,cats", new String[] {"dogs", "chase", "cats"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[] {"ac", "dc"});
}
public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[] {"O'Reilly"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[] {"you're"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[] {"she's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[] {"Jim's"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[] {"don't"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[] {"O'Reilly's"});
}
public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] {"21.35"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] {"R2D2", "C3PO"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] {"216.239.63.104"});
}
public void testTextWithNumbersSA() throws Exception {
// numbers
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "David has 5000 bones", new String[] {"David", "has", "5000", "bones"});
}
public void testVariousTextSA() throws Exception {
// various
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "C embedded developers wanted", new String[] {"C", "embedded", "developers", "wanted"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "foo bar FOO BAR", new String[] {"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "foo bar . FOO <> BAR", new String[] {"foo", "bar", "FOO", "BAR"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[] {"QUOTED", "word"});
}
public void testKoreanSA() throws Exception {
// Korean words
BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[] {"안녕하세요", "한글입니다"});
}
public void testOffsets() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20});
}
public void testTypes() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"David has 5000 bones",
new String[] {"David", "has", "5000", "bones"},
new String[] {"<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>"});
}
public void testWikiURLs() throws Exception {
Reader reader = null;
String luceneResourcesWikiPage;
try {
reader =
new InputStreamReader(
getClass().getResourceAsStream("LuceneResourcesWikiPage.html"),
StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
luceneResourcesWikiPage = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != luceneResourcesWikiPage && luceneResourcesWikiPage.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<>();
bufferedReader =
new BufferedReader(
new InputStreamReader(
getClass().getResourceAsStream("LuceneResourcesWikiPageURLs.txt"),
StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo(urlAnalyzer, luceneResourcesWikiPage, urls);
}
public void testEmails() throws Exception {
Reader reader = null;
String randomTextWithEmails;
try {
reader =
new InputStreamReader(
getClass().getResourceAsStream("random.text.with.email.addresses.txt"),
StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithEmails = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithEmails && randomTextWithEmails.length() > 0);
BufferedReader bufferedReader = null;
String[] emails;
try {
List<String> emailList = new ArrayList<>();
bufferedReader =
new BufferedReader(
new InputStreamReader(
getClass()
.getResourceAsStream(
"email.addresses.from.random.text.with.email.addresses.txt"),
StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
emailList.add(line);
}
}
emails = emailList.toArray(new String[emailList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != emails && emails.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo(emailAnalyzer, randomTextWithEmails, emails);
}
public void testMailtoSchemeEmails() throws Exception {
// See LUCENE-3880
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"mailto:test@example.org",
new String[] {"mailto", "test@example.org"},
new String[] {"<ALPHANUM>", "<EMAIL>"});
// TODO: Support full mailto: scheme URIs. See RFC 6068: http://tools.ietf.org/html/rfc6068
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"mailto:personA@example.com,personB@example.com?cc=personC@example.com"
+ "&subject=Subjectivity&body=Corpusivity%20or%20something%20like%20that",
new String[] {
"mailto",
"personA@example.com",
// TODO: recognize ',' address delimiter. Also, see examples of ';' delimiter use at:
// http://www.mailto.co.uk/
",personB@example.com",
"?cc=personC@example.com", // TODO: split field keys/values
"subject",
"Subjectivity",
"body",
"Corpusivity",
"20or",
"20something",
"20like",
"20that"
}, // TODO: Hex decoding + re-tokenization
new String[] {
"<ALPHANUM>",
"<EMAIL>",
"<EMAIL>",
"<EMAIL>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>"
});
}
public void testURLs() throws Exception {
Reader reader = null;
String randomTextWithURLs;
try {
reader =
new InputStreamReader(
getClass().getResourceAsStream("random.text.with.urls.txt"), StandardCharsets.UTF_8);
StringBuilder builder = new StringBuilder();
char[] buffer = new char[1024];
int numCharsRead;
while (-1 != (numCharsRead = reader.read(buffer))) {
builder.append(buffer, 0, numCharsRead);
}
randomTextWithURLs = builder.toString();
} finally {
if (null != reader) {
reader.close();
}
}
assertTrue(null != randomTextWithURLs && randomTextWithURLs.length() > 0);
BufferedReader bufferedReader = null;
String[] urls;
try {
List<String> urlList = new ArrayList<>();
bufferedReader =
new BufferedReader(
new InputStreamReader(
getClass().getResourceAsStream("urls.from.random.text.with.urls.txt"),
StandardCharsets.UTF_8));
String line;
while (null != (line = bufferedReader.readLine())) {
line = line.trim();
if (line.length() > 0) {
urlList.add(line);
}
}
urls = urlList.toArray(new String[urlList.size()]);
} finally {
if (null != bufferedReader) {
bufferedReader.close();
}
}
assertTrue(null != urls && urls.length > 0);
BaseTokenStreamTestCase.assertAnalyzesTo(urlAnalyzer, randomTextWithURLs, urls);
}
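  /**
   * Illustrative sketch only (not part of the original test): testWikiURLs, testEmails, and
   * testURLs above repeat the same read-resource-into-String loop, which could be factored
   * into a helper like this one. The name readResource is hypothetical.
   */
  private String readResource(String name) throws IOException {
    try (Reader reader =
        new InputStreamReader(getClass().getResourceAsStream(name), StandardCharsets.UTF_8)) {
      StringBuilder builder = new StringBuilder();
      char[] buffer = new char[1024];
      int numCharsRead;
      while (-1 != (numCharsRead = reader.read(buffer))) {
        builder.append(buffer, 0, numCharsRead);
      }
      return builder.toString();
    }
  }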
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_9_0_0 wordBreakTest = new WordBreakTestUnicode_9_0_0();
wordBreakTest.test(a);
}
public void testSupplementary() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"𩬅艱鍟䇹愯瀛",
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] {
"<IDEOGRAPHIC>",
"<IDEOGRAPHIC>",
"<IDEOGRAPHIC>",
"<IDEOGRAPHIC>",
"<IDEOGRAPHIC>",
"<IDEOGRAPHIC>"
});
}
public void testKorean() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "훈민정음", new String[] {"훈민정음"}, new String[] {"<HANGUL>"});
}
public void testJapanese() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"仮名遣い カタカナ",
new String[] {"仮", "名", "遣", "い", "カタカナ"},
new String[] {
"<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>"
});
}
public void testCombiningMarks() throws Exception {
checkOneTerm(a, "ざ", "ざ"); // hiragana
checkOneTerm(a, "ザ", "ザ"); // katakana
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/**
* Multiple consecutive chars in \p{Word_Break = MidLetter}, \p{Word_Break = MidNumLet}, and/or
* \p{Word_Break = MidNum} should trigger a token split.
*/
public void testMid() throws Exception {
// ':' is in \p{WB:MidLetter}, which should trigger a split unless there is a Letter char on
// both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B", new String[] {"A:B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A::B", new String[] {"A", "B"});
// '.' is in \p{WB:MidNumLet}, which should trigger a split unless there is a Letter or Numeric
// char on both sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2", new String[] {"1.2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B", new String[] {"A.B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1..2", new String[] {"1", "2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A..B", new String[] {"A", "B"});
// ',' is in \p{WB:MidNum}, which should trigger a split unless there is a Numeric char on both
// sides
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2", new String[] {"1,2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,,2", new String[] {"1", "2"});
// Mixed consecutive \p{WB:MidLetter} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.:B", new String[] {"A", "B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:.B", new String[] {"A", "B"});
// Mixed consecutive \p{WB:MidNum} and \p{WB:MidNumLet} should trigger a split
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,.2", new String[] {"1", "2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.,2", new String[] {"1", "2"});
// '_' is in \p{WB:ExtendNumLet}
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A:B", new String[] {"A:B_A:B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A:B_A::B", new String[] {"A:B_A", "B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1.2", new String[] {"1.2_1.2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A.B", new String[] {"A.B_A.B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1.2_1..2", new String[] {"1.2_1", "2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "A.B_A..B", new String[] {"A.B_A", "B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,2", new String[] {"1,2_1,2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "1,2_1,,2", new String[] {"1,2_1", "2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A.:B", new String[] {"C_A", "B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "C_A:.B", new String[] {"C_A", "B"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1,.2", new String[] {"3_1", "2"});
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3_1.,2", new String[] {"3_1", "2"});
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"💩 💩💩",
new String[] {"💩", "💩", "💩"},
new String[] {"<EMOJI>", "<EMOJI>", "<EMOJI>"});
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "👩‍❤️‍👩", new String[] {"👩‍❤️‍👩"}, new String[] {"<EMOJI>"});
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "👨🏼‍⚕️", new String[] {"👨🏼‍⚕️"}, new String[] {"<EMOJI>"});
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "🇺🇸🇺🇸", new String[] {"🇺🇸", "🇺🇸"}, new String[] {"<EMOJI>", "<EMOJI>"});
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "#️⃣", new String[] {"#️⃣"}, new String[] {"<EMOJI>"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"3️⃣",
new String[] {
"3️⃣",
},
new String[] {"<EMOJI>"});
// text presentation sequences
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] {}, new String[] {});
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"3\uFE0E", // \uFE0E is included in \p{WB:Extend}
new String[] {
"3\uFE0E",
},
new String[] {"<NUM>"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
new String[] {
"\u2B55",
},
new String[] {"<EMOJI>"});
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"\u2B55\uFE0E\u200D\u2B55\uFE0E",
new String[] {"\u2B55", "\u200D\u2B55"},
new String[] {"<EMOJI>", "<EMOJI>"});
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(
a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿", new String[] {"🏴󠁧󠁢󠁥󠁮󠁧󠁿"}, new String[] {"<EMOJI>"});
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"poo💩poo",
new String[] {"poo", "💩", "poo"},
new String[] {"<ALPHANUM>", "<EMOJI>", "<ALPHANUM>"});
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(
a,
"💩中國💩",
new String[] {"💩", "中", "國", "💩"},
new String[] {"<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>"});
}
public void testUnicodeEmojiTests() throws Exception {
EmojiTokenizationTestUnicode_11_0 emojiTest = new EmojiTokenizationTestUnicode_11_0();
emojiTest.test(a);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, a, 3 * RANDOM_MULTIPLIER, 8192);
}
public void testExampleURLs() throws Exception {
String[] TLDs = {
"aaa",
"ac",
"ai",
"aarp",
"abarth",
"abb",
"abbott",
"abbvie",
"abc",
"able",
"abogado",
"abudhabi",
"academy",
"accenture",
"accountant",
"accountants",
"aco",
"active",
"actor",
"ad",
"adac",
"ads",
"adult",
"ae",
"aeg",
"aero",
"aetna",
"af",
"afamilycompany",
"afl",
"africa",
"ag",
"agakhan",
"agency",
"aig",
"aigo",
"airbus",
"airforce",
"airtel",
"akdn",
"al",
"alfaromeo",
"alibaba",
"alipay",
"allfinanz",
"allstate",
"ally",
"alsace",
"alstom",
"am",
"americanexpress",
"americanfamily",
"amex",
"amfam",
"amica",
"amsterdam",
"analytics",
"android",
"anquan",
"anz",
"ao",
"aol",
"apartments",
"app",
"apple",
"aq",
"aquarelle",
"ar",
"arab",
"aramco",
"archi",
"army",
"arpa",
"art",
"arte",
"as",
"asda",
"asia",
"associates",
"at",
"athleta",
"attorney",
"au",
"auction",
"audi",
"audible",
"audio",
"auspost",
"author",
"auto",
"autos",
"avianca",
"aw",
"aws",
"ax",
"axa",
"az",
"azure",
"ba",
"baby",
"baidu",
"banamex",
"bananarepublic",
"band",
"bank",
"bar",
"barcelona",
"barclaycard",
"barclays",
"barefoot",
"bargains",
"baseball",
"basketball",
"bauhaus",
"bayern",
"bb",
"bbc",
"bbt",
"bbva",
"bcg",
"bcn",
"bd",
"be",
"beats",
"beauty",
"beer",
"bentley",
"berlin",
"best",
"bestbuy",
"bet",
"bf",
"bg",
"bh",
"bharti",
"bi",
"bible",
"bid",
"bike",
"bing",
"bingo",
"bio",
"biz",
"bj",
"black",
"blackfriday",
"blanco",
"blockbuster",
"blog",
"bloomberg",
"blue",
"bm",
"bms",
"bmw",
"bn",
"bnl",
"bnpparibas",
"bo",
"boats",
"boehringer",
"bofa",
"bom",
"bond",
"boo",
"book",
"booking",
"bosch",
"bostik",
"boston",
"bot",
"boutique",
"box",
"br",
"bradesco",
"bridgestone",
"broadway",
"broker",
"brother",
"brussels",
"bs",
"bt",
"budapest",
"bugatti",
"build",
"builders",
"business",
"buy",
"buzz",
"bv",
"bw",
"by",
"bz",
"bzh",
"ca",
"cab",
"cafe",
"cal",
"call",
"calvinklein",
"cam",
"camera",
"camp",
"cancerresearch",
"canon",
"capetown",
"capital",
"capitalone",
"car",
"caravan",
"cards",
"care",
"career",
"careers",
"cars",
"cartier",
"casa",
"case",
"caseih",
"cash",
"casino",
"cat",
"catering",
"catholic",
"cba",
"cbn",
"cbre",
"cbs",
"cc",
"cd",
"ceb",
"center",
"ceo",
"cern",
"cf",
"cfa",
"cfd",
"cg",
"ch",
"chanel",
"channel",
"chase",
"chat",
"cheap",
"chintai",
"christmas",
"chrome",
"chrysler",
"church",
"ci",
"cipriani",
"circle",
"cisco",
"citadel",
"citi",
"citic",
"city",
"cityeats",
"ck",
"cl",
"claims",
"cleaning",
"click",
"clinic",
"clinique",
"clothing",
"cloud",
"club",
"clubmed",
"cm",
"cn",
"co",
"coach",
"codes",
"coffee",
"college",
"cologne",
"com",
"comcast",
"commbank",
"community",
"company",
"compare",
"computer",
"comsec",
"condos",
"construction",
"consulting",
"contact",
"contractors",
"cooking",
"cookingchannel",
"cool",
"coop",
"corsica",
"country",
"coupon",
"coupons",
"courses",
"cr",
"credit",
"creditcard",
"creditunion",
"cricket",
"crown",
"crs",
"cruise",
"cruises",
"csc",
"cu",
"cuisinella",
"cv",
"cw",
"cx",
"cy",
"cymru",
"cyou",
"cz",
"dabur",
"dad",
"dance",
"data",
"date",
"dating",
"datsun",
"day",
"dclk",
"dds",
"de",
"deal",
"dealer",
"deals",
"degree",
"delivery",
"dell",
"deloitte",
"delta",
"democrat",
"dental",
"dentist",
"desi",
"design",
"dev",
"dhl",
"diamonds",
"diet",
"digital",
"direct",
"directory",
"discount",
"discover",
"dish",
"diy",
"dj",
"dk",
"dm",
"dnp",
"do",
"docs",
"doctor",
"dodge",
"dog",
"doha",
"domains",
"dot",
"download",
"drive",
"dtv",
"dubai",
"duck",
"dunlop",
"duns",
"dupont",
"durban",
"dvag",
"dvr",
"dz",
"earth",
"eat",
"ec",
"eco",
"edeka",
"edu",
"education",
"ee",
"eg",
"email",
"emerck",
"energy",
"engineer",
"engineering",
"enterprises",
"epost",
"epson",
"equipment",
"er",
"ericsson",
"erni",
"es",
"esq",
"estate",
"esurance",
"et",
"etisalat",
"eu",
"eurovision",
"eus",
"events",
"everbank",
"exchange",
"expert",
"exposed",
"express",
"extraspace",
"fage",
"fail",
"fairwinds",
"faith",
"family",
"fan",
"fans",
"farm",
"farmers",
"fashion",
"fast",
"fedex",
"feedback",
"ferrari",
"ferrero",
"fi",
"fiat",
"fidelity",
"fido",
"film",
"final",
"finance",
"financial",
"fire",
"firestone",
"firmdale",
"fish",
"fishing",
"fit",
"fitness",
"fj",
"fk",
"flickr",
"flights",
"flir",
"florist",
"flowers",
"fly",
"fm",
"fo",
"foo",
"food",
"foodnetwork",
"football",
"ford",
"forex",
"forsale",
"forum",
"foundation",
"fox",
"fr",
"free",
"fresenius",
"frl",
"frogans",
"frontdoor",
"frontier",
"ftr",
"fujitsu",
"fujixerox",
"fun",
"fund",
"furniture",
"futbol",
"fyi",
"ga",
"gal",
"gallery",
"gallo",
"gallup",
"game",
"games",
"gap",
"garden",
"gb",
"gbiz",
"gd",
"gdn",
"ge",
"gea",
"gent",
"genting",
"george",
"gf",
"gg",
"ggee",
"gh",
"gi",
"gift",
"gifts",
"gives",
"giving",
"gl",
"glade",
"glass",
"gle",
"global",
"globo",
"gm",
"gmail",
"gmbh",
"gmo",
"gmx",
"gn",
"godaddy",
"gold",
"goldpoint",
"golf",
"goo",
"goodhands",
"goodyear",
"goog",
"google",
"gop",
"got",
"gov",
"gp",
"gq",
"gr",
"grainger",
"graphics",
"gratis",
"green",
"gripe",
"grocery",
"group",
"gs",
"gt",
"gu",
"guardian",
"gucci",
"guge",
"guide",
"guitars",
"guru",
"gw",
"gy",
"hair",
"hamburg",
"hangout",
"haus",
"hbo",
"hdfc",
"hdfcbank",
"health",
"healthcare",
"help",
"helsinki",
"here",
"hermes",
"hgtv",
"hiphop",
"hisamitsu",
"hitachi",
"hiv",
"hk",
"hkt",
"hm",
"hn",
"hockey",
"holdings",
"holiday",
"homedepot",
"homegoods",
"homes",
"homesense",
"honda",
"honeywell",
"horse",
"hospital",
"host",
"hosting",
"hot",
"hoteles",
"hotels",
"hotmail",
"house",
"how",
"hr",
"hsbc",
"ht",
"hu",
"hughes",
"hyatt",
"hyundai",
"ibm",
"icbc",
"ice",
"icu",
"id",
"ie",
"ieee",
"ifm",
"ikano",
"il",
"im",
"imamat",
"imdb",
"immo",
"immobilien",
"in",
"industries",
"infiniti",
"info",
"ing",
"ink",
"institute",
"insurance",
"insure",
"int",
"intel",
"international",
"intuit",
"investments",
"io",
"ipiranga",
"iq",
"ir",
"irish",
"is",
"iselect",
"ismaili",
"ist",
"istanbul",
"it",
"itau",
"itv",
"iveco",
"iwc",
"jaguar",
"java",
"jcb",
"jcp",
"je",
"jeep",
"jetzt",
"jewelry",
"jio",
"jlc",
"jll",
"jm",
"jmp",
"jnj",
"jo",
"jobs",
"joburg",
"jot",
"joy",
"jp",
"jpmorgan",
"jprs",
"juegos",
"juniper",
"kaufen",
"kddi",
"ke",
"kerryhotels",
"kerrylogistics",
"kerryproperties",
"kfh",
"kg",
"kh",
"ki",
"kia",
"kim",
"kinder",
"kindle",
"kitchen",
"kiwi",
"km",
"kn",
"koeln",
"komatsu",
"kosher",
"kp",
"kpmg",
"kpn",
"kr",
"krd",
"kred",
"kuokgroup",
"kw",
"ky",
"kyoto",
"kz",
"la",
"lacaixa",
"ladbrokes",
"lamborghini",
"lamer",
"lancaster",
"lancia",
"lancome",
"land",
"landrover",
"lanxess",
"lasalle",
"lat",
"latino",
"latrobe",
"law",
"lawyer",
"lb",
"lc",
"lds",
"lease",
"leclerc",
"lefrak",
"legal",
"lego",
"lexus",
"lgbt",
"li",
"liaison",
"lidl",
"life",
"lifeinsurance",
"lifestyle",
"lighting",
"like",
"lilly",
"limited",
"limo",
"lincoln",
"linde",
"link",
"lipsy",
"live",
"living",
"lixil",
"lk",
"llc",
"loan",
"loans",
"locker",
"locus",
"loft",
"lol",
"london",
"lotte",
"lotto",
"love",
"lpl",
"lplfinancial",
"lr",
"ls",
"lt",
"ltd",
"ltda",
"lu",
"lundbeck",
"lupin",
"luxe",
"luxury",
"lv",
"ly",
"ma",
"macys",
"madrid",
"maif",
"maison",
"makeup",
"man",
"management",
"mango",
"map",
"market",
"marketing",
"markets",
"marriott",
"marshalls",
"maserati",
"mattel",
"mba",
"mc",
"mckinsey",
"md",
"me",
"med",
"media",
"meet",
"melbourne",
"meme",
"memorial",
"men",
"menu",
"merckmsd",
"metlife",
"mg",
"mh",
"miami",
"microsoft",
"mil",
"mini",
"mint",
"mit",
"mitsubishi",
"mk",
"ml",
"mlb",
"mls",
"mm",
"mma",
"mn",
"mo",
"mobi",
"mobile",
"mobily",
"moda",
"moe",
"moi",
"mom",
"monash",
"money",
"monster",
"mopar",
"mormon",
"mortgage",
"moscow",
"moto",
"motorcycles",
"mov",
"movie",
"movistar",
"mp",
"mq",
"mr",
"ms",
"msd",
"mt",
"mtn",
"mtr",
"mu",
"museum",
"mutual",
"mv",
"mw",
"mx",
"my",
"mz",
"na",
"nab",
"nadex",
"nagoya",
"name",
"nationwide",
"natura",
"navy",
"nba",
"nc",
"ne",
"nec",
"net",
"netbank",
"netflix",
"network",
"neustar",
"new",
"newholland",
"news",
"next",
"nextdirect",
"nexus",
"nf",
"nfl",
"ng",
"ngo",
"nhk",
"ni",
"nico",
"nike",
"nikon",
"ninja",
"nissan",
"nissay",
"nl",
"no",
"nokia",
"northwesternmutual",
"norton",
"now",
"nowruz",
"nowtv",
"np",
"nr",
"nra",
"nrw",
"ntt",
"nu",
"nyc",
"nz",
"obi",
"observer",
"off",
"office",
"okinawa",
"olayan",
"olayangroup",
"oldnavy",
"ollo",
"om",
"omega",
"one",
"ong",
"onl",
"online",
"onyourside",
"ooo",
"open",
"oracle",
"orange",
"org",
"organic",
"origins",
"osaka",
"otsuka",
"ott",
"ovh",
"pa",
"page",
"panasonic",
"panerai",
"paris",
"pars",
"partners",
"parts",
"party",
"passagens",
"pay",
"pccw",
"pe",
"pet",
"pf",
"pfizer",
"pg",
"ph",
"pharmacy",
"phd",
"philips",
"phone",
"photo",
"photography",
"photos",
"physio",
"piaget",
"pics",
"pictet",
"pictures",
"pid",
"pin",
"ping",
"pink",
"pioneer",
"pizza",
"pk",
"pl",
"place",
"play",
"playstation",
"plumbing",
"plus",
"pm",
"pn",
"pnc",
"pohl",
"poker",
"politie",
"porn",
"post",
"pr",
"pramerica",
"praxi",
"press",
"prime",
"pro",
"prod",
"productions",
"prof",
"progressive",
"promo",
"properties",
"property",
"protection",
"pru",
"prudential",
"ps",
"pt",
"pub",
"pw",
"pwc",
"py",
"qa",
"qpon",
"quebec",
"quest",
"qvc",
"racing",
"radio",
"raid",
"re",
"read",
"realestate",
"realtor",
"realty",
"recipes",
"red",
"redstone",
"redumbrella",
"rehab",
"reise",
"reisen",
"reit",
"reliance",
"ren",
"rent",
"rentals",
"repair",
"report",
"republican",
"rest",
"restaurant",
"review",
"reviews",
"rexroth",
"rich",
"richardli",
"ricoh",
"rightathome",
"ril",
"rio",
"rip",
"rmit",
"ro",
"rocher",
"rocks",
"rodeo",
"rogers",
"room",
"rs",
"rsvp",
"ru",
"rugby",
"ruhr",
"run",
"rw",
"rwe",
"ryukyu",
"sa",
"saarland",
"safe",
"safety",
"sakura",
"sale",
"salon",
"samsclub",
"samsung",
"sandvik",
"sandvikcoromant",
"sanofi",
"sap",
"sarl",
"sas",
"save",
"saxo",
"sb",
"sbi",
"sbs",
"sc",
"sca",
"scb",
"schaeffler",
"schmidt",
"scholarships",
"school",
"schule",
"schwarz",
"science",
"scjohnson",
"scor",
"scot",
"sd",
"se",
"search",
"seat",
"secure",
"security",
"seek",
"select",
"sener",
"services",
"ses",
"seven",
"sew",
"sex",
"sexy",
"sfr",
"sg",
"sh",
"shangrila",
"sharp",
"shaw",
"shell",
"shia",
"shiksha",
"shoes",
"shop",
"shopping",
"shouji",
"show",
"showtime",
"shriram",
"si",
"silk",
"sina",
"singles",
"site",
"sj",
"sk",
"ski",
"skin",
"sky",
"skype",
"sl",
"sling",
"sm",
"smart",
"smile",
"sn",
"sncf",
"so",
"soccer",
"social",
"softbank",
"software",
"sohu",
"solar",
"solutions",
"song",
"sony",
"soy",
"space",
"spiegel",
"sport",
"spot",
"spreadbetting",
"sr",
"srl",
"srt",
"st",
"stada",
"staples",
"star",
"starhub",
"statebank",
"statefarm",
"statoil",
"stc",
"stcgroup",
"stockholm",
"storage",
"store",
"stream",
"studio",
"study",
"style",
"su",
"sucks",
"supplies",
"supply",
"support",
"surf",
"surgery",
"suzuki",
"sv",
"swatch",
"swiftcover",
"swiss",
"sx",
"sy",
"sydney",
"symantec",
"systems",
"sz",
"tab",
"taipei",
"talk",
"taobao",
"target",
"tatamotors",
"tatar",
"tattoo",
"tax",
"taxi",
"tc",
"tci",
"td",
"tdk",
"team",
"tech",
"technology",
"tel",
"telecity",
"telefonica",
"temasek",
"tennis",
"teva",
"tf",
"tg",
"th",
"thd",
"theater",
"theatre",
"tiaa",
"tickets",
"tienda",
"tiffany",
"tips",
"tires",
"tirol",
"tj",
"tjmaxx",
"tjx",
"tk",
"tkmaxx",
"tl",
"tm",
"tmall",
"tn",
"to",
"today",
"tokyo",
"tools",
"top",
"toray",
"toshiba",
"total",
"tours",
"town",
"toyota",
"toys",
"tr",
"trade",
"trading",
"training",
"travel",
"travelchannel",
"travelers",
"travelersinsurance",
"trust",
"trv",
"tt",
"tube",
"tui",
"tunes",
"tushu",
"tv",
"tvs",
"tw",
"tz",
"ua",
"ubank",
"ubs",
"uconnect",
"ug",
"uk",
"unicom",
"university",
"uno",
"uol",
"ups",
"us",
"uy",
"uz",
"va",
"vacations",
"vana",
"vanguard",
"vc",
"ve",
"vegas",
"ventures",
"verisign",
"versicherung",
"vet",
"vg",
"vi",
"viajes",
"video",
"vig",
"viking",
"villas",
"vin",
"vip",
"virgin",
"visa",
"vision",
"vista",
"vistaprint",
"viva",
"vivo",
"vlaanderen",
"vn",
"vodka",
"volkswagen",
"volvo",
"vote",
"voting",
"voto",
"voyage",
"vu",
"vuelos",
"wales",
"walmart",
"walter",
"wang",
"wanggou",
"warman",
"watch",
"watches",
"weather",
"weatherchannel",
"webcam",
"weber",
"website",
"wed",
"wedding",
"weibo",
"weir",
"wf",
"whoswho",
"wien",
"wiki",
"williamhill",
"win",
"windows",
"wine",
"winners",
"wme",
"wolterskluwer",
"woodside",
"work",
"works",
"world",
"wow",
"ws",
"wtc",
"wtf",
"xbox",
"xerox",
"xfinity",
"xihuan",
"xin",
"xn--11b4c3d",
"xn--1ck2e1b",
"xn--1qqw23a",
"xn--2scrj9c",
"xn--30rr7y",
"xn--3bst00m",
"xn--3ds443g",
"xn--3e0b707e",
"xn--3hcrj9c",
"xn--3oq18vl8pn36a",
"xn--3pxu8k",
"xn--42c2d9a",
"xn--45br5cyl",
"xn--45brj9c",
"xn--45q11c",
"xn--4gbrim",
"xn--54b7fta0cc",
"xn--55qw42g",
"xn--55qx5d",
"xn--5su34j936bgsg",
"xn--5tzm5g",
"xn--6frz82g",
"xn--6qq986b3xl",
"xn--80adxhks",
"xn--80ao21a",
"xn--80aqecdr1a",
"xn--80asehdb",
"xn--80aswg",
"xn--8y0a063a",
"xn--90a3ac",
"xn--90ae",
"xn--90ais",
"xn--9dbq2a",
"xn--9et52u",
"xn--9krt00a",
"xn--b4w605ferd",
"xn--bck1b9a5dre4c",
"xn--c1avg",
"xn--c2br7g",
"xn--cck2b3b",
"xn--cg4bki",
"xn--clchc0ea0b2g2a9gcd",
"xn--czr694b",
"xn--czrs0t",
"xn--czru2d",
"xn--d1acj3b",
"xn--d1alf",
"xn--e1a4c",
"xn--eckvdtc9d",
"xn--efvy88h",
"xn--estv75g",
"xn--fct429k",
"xn--fhbei",
"xn--fiq228c5hs",
"xn--fiq64b",
"xn--fiqs8s",
"xn--fiqz9s",
"xn--fjq720a",
"xn--flw351e",
"xn--fpcrj9c3d",
"xn--fzc2c9e2c",
"xn--fzys8d69uvgm",
"xn--g2xx48c",
"xn--gckr3f0f",
"xn--gecrj9c",
"xn--gk3at1e",
"xn--h2breg3eve",
"xn--h2brj9c",
"xn--h2brj9c8c",
"xn--hxt814e",
"xn--i1b6b1a6a2e",
"xn--imr513n",
"xn--io0a7i",
"xn--j1aef",
"xn--j1amh",
"xn--j6w193g",
"xn--jlq61u9w7b",
"xn--jvr189m",
"xn--kcrx77d1x4a",
"xn--kprw13d",
"xn--kpry57d",
"xn--kpu716f",
"xn--kput3i",
"xn--l1acc",
"xn--lgbbat1ad8j",
"xn--mgb9awbf",
"xn--mgba3a3ejt",
"xn--mgba3a4f16a",
"xn--mgba7c0bbn0a",
"xn--mgbaakc7dvf",
"xn--mgbaam7a8h",
"xn--mgbab2bd",
"xn--mgbai9azgqp6j",
"xn--mgbayh7gpa",
"xn--mgbb9fbpob",
"xn--mgbbh1a",
"xn--mgbbh1a71e",
"xn--mgbc0a9azcg",
"xn--mgbca7dzdo",
"xn--mgberp4a5d4ar",
"xn--mgbgu82a",
"xn--mgbi4ecexp",
"xn--mgbpl2fh",
"xn--mgbt3dhd",
"xn--mgbtx2b",
"xn--mgbx4cd0ab",
"xn--mix891f",
"xn--mk1bu44c",
"xn--mxtq1m",
"xn--ngbc5azd",
"xn--ngbe9e0a",
"xn--ngbrx",
"xn--node",
"xn--nqv7f",
"xn--nqv7fs00ema",
"xn--nyqy26a",
"xn--o3cw4h",
"xn--ogbpf8fl",
"xn--otu796d",
"xn--p1acf",
"xn--p1ai",
"xn--pbt977c",
"xn--pgbs0dh",
"xn--pssy2u",
"xn--q9jyb4c",
"xn--qcka1pmc",
"xn--qxam",
"xn--rhqv96g",
"xn--rovu88b",
"xn--rvc1e0am3e",
"xn--s9brj9c",
"xn--ses554g",
"xn--t60b56a",
"xn--tckwe",
"xn--tiq49xqyj",
"xn--unup4y",
"xn--vermgensberater-ctb",
"xn--vermgensberatung-pwb",
"xn--vhquv",
"xn--vuq861b",
"xn--w4r85el8fhu5dnra",
"xn--w4rs40l",
"xn--wgbh1c",
"xn--wgbl6a",
"xn--xhq521b",
"xn--xkc2al3hye2a",
"xn--xkc2dl3a5ee0h",
"xn--y9a3aq",
"xn--yfro4i67o",
"xn--ygbi2ammx",
"xn--zfr164b",
"xperia",
"xxx",
"xyz",
"yachts",
"yahoo",
"yamaxun",
"yandex",
"ye",
"yodobashi",
"yoga",
"yokohama",
"you",
"youtube",
"yt",
"yun",
"za",
"zappos",
"zara",
"zero",
"zip",
"zippo",
"zm",
"zone",
"zuerich",
"zw"
};
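    // A tokenizer-only analyzer: no type filtering, so the <URL> assertions below test the
    // tokenizer's output directly.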
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new UAX29URLEmailTokenizer(newAttributeFactory()));
}
};
for (String tld : TLDs) {
String URL = "example." + tld;
BaseTokenStreamTestCase.assertAnalyzesTo(
analyzer, URL, new String[] {URL}, new String[] {"<URL>"});
    }
    analyzer.close();
  }
}