lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search;


 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.index.FilteredTermsEnum;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;

 /**
  * Randomized cross-check of {@link RegexpQuery}: indexes random unicode terms,
  * then runs randomly generated regexps through both {@link RegexpQuery} and a
  * brute-force reference query and compares the hits.
  */
 public class TestRegexpRandom2 extends LuceneTestCase {
   protected IndexSearcher searcher1;
   protected IndexSearcher searcher2;
   private IndexReader reader;
   private Directory dir;
   protected String fieldName;

   /**
    * Builds the index: {@code atLeast(200)} documents, each carrying one random
    * unicode string both as a string field and as a sorted doc value, then opens
    * two searchers over the same reader.
    */
   @Override
   public void setUp() throws Exception {
     super.setUp();
     dir = newDirectory();
     // sometimes use an empty string as field name
     fieldName = random().nextBoolean() ? "field" : "";
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
         newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
         .setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));

     // One Document instance is reused; only the two field values change per iteration.
     Document doc = new Document();
     Field termField = newStringField(fieldName, "", Field.Store.NO);
     doc.add(termField);
     Field docValuesField = new SortedDocValuesField(fieldName, new BytesRef());
     doc.add(docValuesField);

     List<String> indexedTerms = new ArrayList<>();
     int docCount = atLeast(200);
     int remaining = docCount;
     while (remaining-- > 0) {
       String value = TestUtil.randomUnicodeString(random());
       termField.setStringValue(value);
       docValuesField.setBytesValue(new BytesRef(value));
       indexedTerms.add(value);
       writer.addDocument(doc);
     }

     if (VERBOSE) {
       printSortedTerms(indexedTerms);
     }

     reader = writer.getReader();
     searcher1 = newSearcher(reader);
     searcher2 = newSearcher(reader);
     writer.close();
   }

   /**
    * Sorts {@code terms} in place (natural String order, i.e. UTF-16) and prints
    * each one as a hex string.
    */
   private static void printSortedTerms(List<String> terms) {
     // utf16 order
     Collections.sort(terms);
     System.out.println("UTF16 order:");
     for (int idx = 0; idx < terms.size(); idx++) {
       System.out.println("  " + UnicodeUtil.toHexString(terms.get(idx)));
     }
   }

   /** Closes the reader, then the directory, then delegates to the superclass. */
   @Override
   public void tearDown() throws Exception {
     reader.close();
     dir.close();
     super.tearDown();
   }

   /**
    * Reference implementation: a deliberately naive regexp query that walks the
    * field's terms and tests each one against the regexp's automaton.
    */
   private static class DumbRegexpQuery extends MultiTermQuery {
     private final Automaton automaton;

     /**
      * @param term  supplies the field name and the regexp text
      * @param flags syntax flags passed through to {@link RegExp}
      */
     DumbRegexpQuery(Term term, int flags) {
       super(term.field());
       automaton = new RegExp(term.text(), flags).toAutomaton();
     }

     @Override
     protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
       TermsEnum allTerms = terms.iterator();
       return new SimpleAutomatonTermsEnum(allTerms);
     }

     /** Filters the wrapped enum down to the terms accepted by the automaton. */
     private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
       private final CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
       // scratch buffer holding the UTF-16 form of the term currently being tested
       private final CharsRefBuilder utf16 = new CharsRefBuilder();

       private SimpleAutomatonTermsEnum(TermsEnum tenum) {
         super(tenum);
         // initial seek target is the empty term
         setInitialSeekTerm(new BytesRef(""));
       }

       @Override
       protected AcceptStatus accept(BytesRef term) throws IOException {
         // the automaton runs over chars, so convert the UTF-8 term bytes first
         utf16.copyUTF8Bytes(term.bytes, term.offset, term.length);
         if (runAutomaton.run(utf16.chars(), 0, utf16.length())) {
           return AcceptStatus.YES;
         }
         return AcceptStatus.NO;
       }
     }

     @Override
     public String toString(String field) {
       String prefix = field.toString();
       return prefix + automaton.toString();
     }

     @Override
     public void visit(QueryVisitor visitor) {
       // no-op: nothing is reported to the visitor
     }

     @Override
     public boolean equals(Object obj) {
       // the cast is only reached once super.equals(obj) has accepted obj
       return super.equals(obj)
           && automaton.equals(((DumbRegexpQuery) obj).automaton);
     }
   }

   /** Runs {@code atLeast(200)} random regular expressions through {@link #assertSame(String)}. */
   public void testRegexps() throws Exception {
     for (int remaining = atLeast(200); remaining > 0; remaining--) {
       String reg = AutomatonTestUtil.randomRegexp(random());
       if (VERBOSE) {
         System.out.println("TEST: regexp='" + reg + "'");
       }
       assertSame(reg);
     }
   }

   /**
    * Executes {@code regexp} both as a {@link RegexpQuery} and as the brute-force
    * {@link DumbRegexpQuery} (top 25 hits each) and passes both hit lists to
    * {@link CheckHits#checkEqual} for comparison.
    */
   protected void assertSame(String regexp) throws IOException {
     final int topN = 25;
     RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
     DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);

     TopDocs smartHits = searcher1.search(smart, topN);
     TopDocs dumbHits = searcher2.search(dumb, topN);

     CheckHits.checkEqual(smart, smartHits.scoreDocs, dumbHits.scoreDocs);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search;


	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;

	import org.apache.lucene.analysis.MockAnalyzer;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.SortedDocValuesField;
	import org.apache.lucene.index.FilteredTermsEnum;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.RandomIndexWriter;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.AttributeSource;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.CharsRefBuilder;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.TestUtil;
	import org.apache.lucene.util.UnicodeUtil;
	import org.apache.lucene.util.automaton.Automaton;
	import org.apache.lucene.util.automaton.AutomatonTestUtil;
	import org.apache.lucene.util.automaton.CharacterRunAutomaton;
	import org.apache.lucene.util.automaton.RegExp;

	/**
	 * Randomized cross-check of {@link RegexpQuery}: indexes random unicode terms,
	 * then runs randomly generated regexps through both {@link RegexpQuery} and a
	 * brute-force reference query and compares the hits.
	 */
	public class TestRegexpRandom2 extends LuceneTestCase {
	  protected IndexSearcher searcher1;
	  protected IndexSearcher searcher2;
	  private IndexReader reader;
	  private Directory dir;
	  protected String fieldName;

	  /**
	   * Builds the index: {@code atLeast(200)} documents, each carrying one random
	   * unicode string both as a string field and as a sorted doc value, then opens
	   * two searchers over the same reader.
	   */
	  @Override
	  public void setUp() throws Exception {
	    super.setUp();
	    dir = newDirectory();
	    // sometimes use an empty string as field name
	    fieldName = random().nextBoolean() ? "field" : "";
	    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
	        newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))
	        .setMaxBufferedDocs(TestUtil.nextInt(random(), 50, 1000)));

	    // One Document instance is reused; only the two field values change per iteration.
	    Document doc = new Document();
	    Field termField = newStringField(fieldName, "", Field.Store.NO);
	    doc.add(termField);
	    Field docValuesField = new SortedDocValuesField(fieldName, new BytesRef());
	    doc.add(docValuesField);

	    List<String> indexedTerms = new ArrayList<>();
	    int docCount = atLeast(200);
	    int remaining = docCount;
	    while (remaining-- > 0) {
	      String value = TestUtil.randomUnicodeString(random());
	      termField.setStringValue(value);
	      docValuesField.setBytesValue(new BytesRef(value));
	      indexedTerms.add(value);
	      writer.addDocument(doc);
	    }

	    if (VERBOSE) {
	      printSortedTerms(indexedTerms);
	    }

	    reader = writer.getReader();
	    searcher1 = newSearcher(reader);
	    searcher2 = newSearcher(reader);
	    writer.close();
	  }

	  /**
	   * Sorts {@code terms} in place (natural String order, i.e. UTF-16) and prints
	   * each one as a hex string.
	   */
	  private static void printSortedTerms(List<String> terms) {
	    // utf16 order
	    Collections.sort(terms);
	    System.out.println("UTF16 order:");
	    for (int idx = 0; idx < terms.size(); idx++) {
	      System.out.println(" " + UnicodeUtil.toHexString(terms.get(idx)));
	    }
	  }

	  /** Closes the reader, then the directory, then delegates to the superclass. */
	  @Override
	  public void tearDown() throws Exception {
	    reader.close();
	    dir.close();
	    super.tearDown();
	  }

	  /**
	   * Reference implementation: a deliberately naive regexp query that walks the
	   * field's terms and tests each one against the regexp's automaton.
	   */
	  private static class DumbRegexpQuery extends MultiTermQuery {
	    private final Automaton automaton;

	    /**
	     * @param term  supplies the field name and the regexp text
	     * @param flags syntax flags passed through to {@link RegExp}
	     */
	    DumbRegexpQuery(Term term, int flags) {
	      super(term.field());
	      automaton = new RegExp(term.text(), flags).toAutomaton();
	    }

	    @Override
	    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
	      TermsEnum allTerms = terms.iterator();
	      return new SimpleAutomatonTermsEnum(allTerms);
	    }

	    /** Filters the wrapped enum down to the terms accepted by the automaton. */
	    private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
	      private final CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
	      // scratch buffer holding the UTF-16 form of the term currently being tested
	      private final CharsRefBuilder utf16 = new CharsRefBuilder();

	      private SimpleAutomatonTermsEnum(TermsEnum tenum) {
	        super(tenum);
	        // initial seek target is the empty term
	        setInitialSeekTerm(new BytesRef(""));
	      }

	      @Override
	      protected AcceptStatus accept(BytesRef term) throws IOException {
	        // the automaton runs over chars, so convert the UTF-8 term bytes first
	        utf16.copyUTF8Bytes(term.bytes, term.offset, term.length);
	        if (runAutomaton.run(utf16.chars(), 0, utf16.length())) {
	          return AcceptStatus.YES;
	        }
	        return AcceptStatus.NO;
	      }
	    }

	    @Override
	    public String toString(String field) {
	      String prefix = field.toString();
	      return prefix + automaton.toString();
	    }

	    @Override
	    public void visit(QueryVisitor visitor) {
	      // no-op: nothing is reported to the visitor
	    }

	    @Override
	    public boolean equals(Object obj) {
	      // the cast is only reached once super.equals(obj) has accepted obj
	      return super.equals(obj)
	          && automaton.equals(((DumbRegexpQuery) obj).automaton);
	    }
	  }

	  /** Runs {@code atLeast(200)} random regular expressions through {@link #assertSame(String)}. */
	  public void testRegexps() throws Exception {
	    for (int remaining = atLeast(200); remaining > 0; remaining--) {
	      String reg = AutomatonTestUtil.randomRegexp(random());
	      if (VERBOSE) {
	        System.out.println("TEST: regexp='" + reg + "'");
	      }
	      assertSame(reg);
	    }
	  }

	  /**
	   * Executes {@code regexp} both as a {@link RegexpQuery} and as the brute-force
	   * {@link DumbRegexpQuery} (top 25 hits each) and passes both hit lists to
	   * {@link CheckHits#checkEqual} for comparison.
	   */
	  protected void assertSame(String regexp) throws IOException {
	    final int topN = 25;
	    RegexpQuery smart = new RegexpQuery(new Term(fieldName, regexp), RegExp.NONE);
	    DumbRegexpQuery dumb = new DumbRegexpQuery(new Term(fieldName, regexp), RegExp.NONE);

	    TopDocs smartHits = searcher1.search(smart, topN);
	    TopDocs dumbHits = searcher2.search(dumb, topN);

	    CheckHits.checkEqual(smart, smartHits.scoreDocs, dumbHits.scoreDocs);
	  }
	}