blob: f4cbd541d2b562aa8292023d30d2c65a258a7293 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
/**
* Create an index with random unicode terms
* Generates random regexps, and validates against a simple impl.
*/
public class TestRegexpRandom2 extends LuceneTestCase {
protected IndexSearcher searcher1;
protected IndexSearcher searcher2;
private IndexReader reader;
private Directory dir;
protected String fieldName;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
fieldName = random().nextBoolean() ? "field" : ""; // sometimes use an empty string as field name
RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
.setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));
Document doc = new Document();
Field field = newStringField(fieldName, "", Field.Store.NO);
doc.add(field);
Field dvField = new SortedDocValuesField(fieldName, new BytesRef());
doc.add(dvField);
List<String> terms = new ArrayList<>();
int num = atLeast(200);
for (int i = 0; i < num; i++) {
String s = TestUtil.randomUnicodeString(random());
field.setStringValue(s);
dvField.setBytesValue(new BytesRef(s));
terms.add(s);
writer.addDocument(doc);
}
if (VERBOSE) {
// utf16 order
Collections.sort(terms);
System.out.println("UTF16 order:");
for(String s : terms) {
System.out.println(" " + UnicodeUtil.toHexString(s));
}
}
reader = writer.getReader();
searcher1 = newSearcher(reader);
searcher2 = newSearcher(reader);
writer.close();
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
super.tearDown();
}
/** a stupid regexp query that just blasts thru the terms */
private static class DumbRegexpQuery extends MultiTermQuery {
private final Automaton automaton;
DumbRegexpQuery(Term term, int flags) {
super(term.field());
RegExp re = new RegExp(term.text(), flags);
automaton = re.toAutomaton();
}
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
return new SimpleAutomatonTermsEnum(terms.iterator());
}
private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
CharsRefBuilder utf16 = new CharsRefBuilder();
private SimpleAutomatonTermsEnum(TermsEnum tenum) {
super(tenum);
setInitialSeekTerm(new BytesRef(""));
}
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
utf16.copyUTF8Bytes(term.bytes, term.offset, term.length);
return runAutomaton.run(utf16.chars(), 0, utf16.length()) ?
AcceptStatus.YES : AcceptStatus.NO;
}
}
@Override
public String toString(String field) {
return field.toString() + automaton.toString();
}
@Override
public void visit(QueryVisitor visitor) {
}
@Override
public boolean equals(Object obj) {
if (super.equals(obj) == false) {
return false;
}
final DumbRegexpQuery that = (DumbRegexpQuery) obj;
return automaton.equals(that.automaton);
}
}
/** test a bunch of random regular expressions */
public void testRegexps() throws Exception {
int num = atLeast(200);
for (int i = 0; i < num; i++) {
String reg = AutomatonTestUtil.randomRegexp(random());
if (VERBOSE) {
System.out.println("TEST: regexp='" + reg + "'");
}
assertSame(reg);
}
}
/** check that the # of hits is the same as from a very
* simple regexpquery implementation.
*/
protected void assertSame(String regexp) throws IOException {
RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
TopDocs smartDocs = searcher1.search(smart, 25);
TopDocs dumbDocs = searcher2.search(dumb, 25);
CheckHits.checkEqual(smart, smartDocs.scoreDocs, dumbDocs.scoreDocs);
}
}