/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.wikipedia;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
/** Basic tests for {@link WikipediaTokenizer} */
public class TestWikipediaTokenizer extends BaseTokenStreamTestCase {
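/** Shared input containing an internal link, an external link, and a category link. */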
protected static final String LINK_PHRASES =
"click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
public void testSimple() throws Exception {
String text = "This is a [[Category:foo]]";
WikipediaTokenizer tf =
new WikipediaTokenizer(
newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(text));
assertTokenStreamContents(
tf,
new String[] {"This", "is", "a", "foo"},
new int[] {0, 5, 8, 21},
new int[] {4, 7, 9, 24},
new String[] {"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY},
new int[] {
1, 1, 1, 1,
},
text.length());
}
public void testHandwritten() throws Exception {
// make sure each token is assigned exactly one type
String test =
"[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] "
+ "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] "
+ "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
+ " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. "
+ "==heading== ===sub head=== followed by some text [[Category:blah| ]] "
+ "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed."
+ "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
+ " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
+ " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
WikipediaTokenizer tf =
new WikipediaTokenizer(
newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(
tf,
new String[] { // expected terms
"link",
"This",
"is",
"a",
"foo",
"Category",
"This",
"is",
"a",
"linked",
"bar",
"none",
"withstanding",
"Category",
"This",
"is",
"parens",
"This",
"is",
"a",
"link",
"This",
"is",
"an",
"external",
"URL",
"http://lucene.apache.org",
"Here",
"is",
"italics",
"and",
"more",
"italics",
"bold",
"and",
"five",
"quotes",
"This",
"is",
"a",
"link",
"display",
"info",
"This",
"is",
"a",
"period",
"Here",
"is",
"3.25",
"and",
"here",
"is",
"3.50",
"Here's",
"Johnny",
"heading",
"sub",
"head",
"followed",
"by",
"some",
"text",
"blah",
"ital",
"cat",
"here",
"is",
"some",
"that",
"is",
"italics",
"foo",
"but",
"is",
"never",
"closed",
"same",
"foo",
"goes",
"for",
"this",
"and2",
"foo",
"and",
"this",
"http://foo.boo.com/test/test/",
"Test",
"Test",
"http://foo.boo.com/test/test/test.html",
"Test",
"Test",
"http://foo.boo.com/test/test/test.html?g=b&c=d",
"Test",
"Test",
"Citation",
"martian",
"code"
},
new String[] { // expected types
INTERNAL_LINK,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
CATEGORY,
CATEGORY,
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
INTERNAL_LINK,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
EXTERNAL_LINK_URL,
"<ALPHANUM>",
"<ALPHANUM>",
ITALICS,
"<ALPHANUM>",
ITALICS,
ITALICS,
BOLD,
"<ALPHANUM>",
BOLD_ITALICS,
BOLD_ITALICS,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
INTERNAL_LINK,
INTERNAL_LINK,
INTERNAL_LINK,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<NUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<NUM>",
"<APOSTROPHE>",
"<ALPHANUM>",
HEADING,
SUB_HEADING,
SUB_HEADING,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
CATEGORY,
CATEGORY,
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
ITALICS,
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
BOLD,
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
"<ALPHANUM>",
BOLD_ITALICS,
CATEGORY,
"<ALPHANUM>",
"<ALPHANUM>",
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
EXTERNAL_LINK,
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
EXTERNAL_LINK,
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
EXTERNAL_LINK,
CITATION,
"<ALPHANUM>",
"<ALPHANUM>"
});
}
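/** Internal links, an external link, and a category link in TOKENS_ONLY mode. */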
public void testLinkPhrases() throws Exception {
WikipediaTokenizer tf =
new WikipediaTokenizer(
newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
}
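/**
 * Asserts the expected terms and position increments for {@link #LINK_PHRASES}; the anchor text
 * "here" following the external URL has a position increment of 0.
 */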
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
assertTokenStreamContents(
tf,
new String[] {
"click",
"link",
"here",
"again",
"click",
"http://lucene.apache.org",
"here",
"again",
"a",
"b",
"c",
"d"
},
new int[] {1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1}); // position increments
}
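/** External links with fragments and query strings: each URL is emitted as an EXTERNAL_LINK_URL token and its anchor text as an EXTERNAL_LINK token. */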
public void testLinks() throws Exception {
String test =
"[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf =
new WikipediaTokenizer(
newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
tf.setReader(new StringReader(test));
assertTokenStreamContents(
tf,
new String[] {
"http://lucene.apache.org/java/docs/index.html#news",
"here",
"http://lucene.apache.org/java/docs/index.html?b=c",
"here",
"https://lucene.apache.org/java/docs/index.html?b=c",
"here"
},
new String[] {
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
EXTERNAL_LINK_URL,
EXTERNAL_LINK,
});
}
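/** LUCENE-1133: TOKENS_ONLY output must be unaffected by the untokenized-types set, while UNTOKENIZED_ONLY emits each marked span (category, italics) as a single token. */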
public void testLucene1133() throws Exception {
Set<String> untoks = new HashSet<>();
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
// should be exactly the same, regardless of untoks
WikipediaTokenizer tf =
new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
tf.setReader(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
String test =
"[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
tf = new WikipediaTokenizer(WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
tf.setReader(new StringReader(test));
assertTokenStreamContents(
tf,
new String[] {
"a b c d",
"e f g",
"link",
"here",
"link",
"there",
"italics here",
"something",
"more italics",
"h i j"
},
new int[] {11, 32, 42, 47, 56, 61, 71, 86, 98, 124}, // start offsets
new int[] {18, 37, 46, 51, 60, 66, 83, 95, 110, 133}, // end offsets
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); // position increments
}
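/** BOTH mode: each untokenized span is followed by its constituent tokens, the first of which has a position increment of 0. */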
public void testBoth() throws Exception {
Set<String> untoks = new HashSet<>();
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
String test =
"[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
// should output all the individual tokens plus the untokenized spans; each untokenized
// span is emitted before its constituent tokens
WikipediaTokenizer tf =
new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
assertTokenStreamContents(
tf,
new String[] {
"a b c d",
"a",
"b",
"c",
"d",
"e f g",
"e",
"f",
"g",
"link",
"here",
"link",
"there",
"italics here",
"italics",
"here",
"something",
"more italics",
"more",
"italics",
"h i j",
"h",
"i",
"j"
},
new int[] { // start offsets
11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124,
128, 132
},
new int[] { // end offsets
18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133,
125, 129, 133
},
new int[] {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1}); // position increments
// now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
tf.setReader(new StringReader(test));
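// only the untokenized spans carry UNTOKENIZED_TOKEN_FLAG; constituent and plain tokens have no flags set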
int[] expectedFlags =
new int[] {
UNTOKENIZED_TOKEN_FLAG,
0,
0,
0,
0,
UNTOKENIZED_TOKEN_FLAG,
0,
0,
0,
0,
0,
0,
0,
UNTOKENIZED_TOKEN_FLAG,
0,
0,
0,
UNTOKENIZED_TOKEN_FLAG,
0,
0,
UNTOKENIZED_TOKEN_FLAG,
0,
0,
0
};
FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
tf.reset();
for (int i = 0; i < expectedFlags.length; i++) {
assertTrue(tf.incrementToken());
assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
}
assertFalse(tf.incrementToken());
tf.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new WikipediaTokenizer(
newAttributeFactory(),
WikipediaTokenizer.TOKENS_ONLY,
Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
// TODO: properly support positionLengthAttribute
checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER, 20, false, false);
a.close();
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
new WikipediaTokenizer(
newAttributeFactory(),
WikipediaTokenizer.TOKENS_ONLY,
Collections.<String>emptySet());
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
// TODO: properly support positionLengthAttribute
checkRandomData(random, a, 10 * RANDOM_MULTIPLIER, 8192, false, false);
a.close();
}
}