// lucene/test-framework/src/java/org/apache/lucene/analysis/CollationTestBase.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis;

 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;

 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;

 /**
  * Base test class for testing Unicode collation.
  *
  * <p>The {@code testFarsi*} methods index one document whose {@code content} field holds the
  * text U+0633 U+0627 U+0628 and check which {@link TermRangeQuery} ranges match it. Plain
  * Unicode order would place U+0633 inside [ U+062F - U+0698 ], but Farsi orders U+0698 before
  * U+0633, so with a Farsi collator (or an Arabic one when Farsi is not supported) the first
  * range must not match while the second must.
  *
  * <p>{@link #assertThreadSafe(Analyzer)} checks that an analyzer produces the same term bytes
  * when it is used from several threads at once.
  */
 public abstract class CollationTestBase extends LuceneTestCase {

   /** Text indexed into the {@code content} field by every Farsi range test. */
   private static final String FARSI_CONTENT = "\u0633\u0627\u0628";

   // Un-collated end points of the two ranges described in the class comment.
   // NOTE(review): these fields are not read in this class; presumably subclasses turn them
   // into the BytesRef bounds handed to the testFarsi* methods -- confirm against subclasses.
   protected String firstRangeBeginningOriginal = "\u062F";
   protected String firstRangeEndOriginal = "\u0698";

   protected String secondRangeBeginningOriginal = "\u0633";
   protected String secondRangeEndOriginal = "\u0638";

   /**
    * Checks range filtering: a {@code body:body} term query combined with a
    * {@link TermRangeQuery} on {@code content} added as an {@link Occur#FILTER} clause must
    * return no hit for the first range and exactly one hit for the second range.
    *
    * @param analyzer analyzer used to index the test document
    * @param firstBeg inclusive lower bound of the range expected to exclude the document
    * @param firstEnd inclusive upper bound of the range expected to exclude the document
    * @param secondBeg inclusive lower bound of the range expected to include the document
    * @param secondEnd inclusive upper bound of the range expected to include the document
    * @throws Exception if indexing or searching fails
    */
   public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
                                             BytesRef firstEnd, BytesRef secondBeg,
                                             BytesRef secondEnd) throws Exception {
     // try-with-resources: reader and directory are released even if an assertion fails.
     try (Directory dir = newDirectory()) {
       indexFarsiDocument(dir, analyzer, true);
       try (IndexReader reader = DirectoryReader.open(dir)) {
         IndexSearcher searcher = new IndexSearcher(reader);
         Query query = new TermQuery(new Term("body", "body"));

         // The single indexed term must NOT pass a range filter over the first range when a
         // Farsi collator (or an Arabic one when Farsi is not supported) produced the keys.
         BooleanQuery.Builder bq = new BooleanQuery.Builder();
         bq.add(query, Occur.MUST);
         bq.add(new TermRangeQuery("content", firstBeg, firstEnd, true, true), Occur.FILTER);
         ScoreDoc[] result = searcher.search(bq.build(), 1).scoreDocs;
         assertEquals("The index Term should not be included.", 0, result.length);

         bq = new BooleanQuery.Builder();
         bq.add(query, Occur.MUST);
         bq.add(new TermRangeQuery("content", secondBeg, secondEnd, true, true), Occur.FILTER);
         result = searcher.search(bq.build(), 1).scoreDocs;
         assertEquals("The index Term should be included.", 1, result.length);
       }
     }
   }

   /**
    * Checks that a bare {@link TermRangeQuery} on {@code content} returns no hit for the first
    * range and exactly one hit for the second range, searching with a plain
    * {@link IndexSearcher}.
    *
    * @param analyzer analyzer used to index the test document
    * @param firstBeg inclusive lower bound of the range expected to exclude the document
    * @param firstEnd inclusive upper bound of the range expected to exclude the document
    * @param secondBeg inclusive lower bound of the range expected to include the document
    * @param secondEnd inclusive upper bound of the range expected to include the document
    * @throws Exception if indexing or searching fails
    */
   public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
                                             BytesRef firstEnd, BytesRef secondBeg,
                                             BytesRef secondEnd) throws Exception {
     try (Directory dir = newDirectory()) {
       // This variant indexes only the "content" field.
       indexFarsiDocument(dir, analyzer, false);
       try (IndexReader reader = DirectoryReader.open(dir)) {
         IndexSearcher searcher = new IndexSearcher(reader);
         assertRangeQueryHits(searcher, firstBeg, firstEnd, secondBeg, secondEnd);
       }
     }
   }

   /**
    * Same range checks as {@link #testFarsiRangeQueryCollating}, but the document also carries
    * the {@code body} field and the searcher is obtained from {@code newSearcher(reader)}.
    *
    * @param analyzer analyzer used to index the test document
    * @param firstBeg inclusive lower bound of the range expected to exclude the document
    * @param firstEnd inclusive upper bound of the range expected to exclude the document
    * @param secondBeg inclusive lower bound of the range expected to include the document
    * @param secondEnd inclusive upper bound of the range expected to include the document
    * @throws Exception if indexing or searching fails
    */
   public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
       BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
     try (Directory farsiIndex = newDirectory()) {
       indexFarsiDocument(farsiIndex, analyzer, true);
       try (IndexReader reader = DirectoryReader.open(farsiIndex)) {
         IndexSearcher search = newSearcher(reader);
         assertRangeQueryHits(search, firstBeg, firstEnd, secondBeg, secondEnd);
       }
     }
   }

   /**
    * Writes the single test document into {@code dir} and closes the writer before returning,
    * so a reader can be opened on the directory right afterwards.
    *
    * @param withBodyField whether to also add the un-analyzed {@code body:body} field
    */
   private static void indexFarsiDocument(Directory dir, Analyzer analyzer,
                                          boolean withBodyField) throws IOException {
     try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
       Document doc = new Document();
       doc.add(new TextField("content", FARSI_CONTENT, Field.Store.YES));
       if (withBodyField) {
         doc.add(new StringField("body", "body", Field.Store.YES));
       }
       writer.addDocument(doc);
     }
   }

   /** Asserts that the first range matches no document and the second matches exactly one. */
   private void assertRangeQueryHits(IndexSearcher searcher, BytesRef firstBeg,
                                     BytesRef firstEnd, BytesRef secondBeg,
                                     BytesRef secondEnd) throws IOException {
     Query excluding = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
     ScoreDoc[] hits = searcher.search(excluding, 1000).scoreDocs;
     assertEquals("The index Term should not be included.", 0, hits.length);

     Query including = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
     hits = searcher.search(including, 1000).scoreDocs;
     assertEquals("The index Term should be included.", 1, hits.length);
   }

   // Make sure the documents returned by the search match the expected list:
   // the "tracer" field values of all hits, concatenated in result order.
   // Copied from TestSort.java
   // NOTE(review): no caller is visible in this class.
   private void assertMatches(IndexSearcher searcher, Query query, Sort sort,
                              String expectedResult) throws IOException {
     ScoreDoc[] result = searcher.search(query, 1000, sort).scoreDocs;
     StringBuilder buff = new StringBuilder(10);
     for (ScoreDoc hit : result) {
       Document doc = searcher.doc(hit.doc);
       for (IndexableField field : doc.getFields("tracer")) {
         buff.append(field.stringValue());
       }
     }
     assertEquals(expectedResult, buff.toString());
   }

   /**
    * Checks that {@code analyzer} yields the same term bytes under concurrent use.
    *
    * <p>First builds a map from 100 random strings to the bytes of the single token the
    * analyzer produces for each, on the calling thread. Then 3 to 5 threads re-analyze every
    * key and compare the bytes with the stored value.
    *
    * @param analyzer analyzer under test; must emit exactly one token per input string
    * @throws AssertionError if any worker thread observed a mismatch or failed; the worker's
    *     throwable is attached as the cause
    * @throws Exception if analysis fails on the calling thread or a join is interrupted
    */
   public void assertThreadSafe(final Analyzer analyzer) throws Exception {
     int numTestPoints = 100;
     int numThreads = TestUtil.nextInt(random(), 3, 5);
     final Map<String,BytesRef> map = new HashMap<>();

     // create a map<String,SortKey> up front.
     // then with multiple threads, generate sort keys for all the keys in the map
     // and ensure they are the same as the ones we produced in serial fashion.

     for (int i = 0; i < numTestPoints; i++) {
       String term = TestUtil.randomSimpleString(random());
       map.put(term, singleTermBytes(analyzer, term));
     }

     // One slot per worker. Each worker writes only its own slot and the slots are read only
     // after join(), so no additional synchronization is needed.
     final Throwable[] failures = new Throwable[numThreads];
     Thread[] threads = new Thread[numThreads];
     for (int i = 0; i < numThreads; i++) {
       final int slot = i;
       threads[i] = new Thread() {
         @Override
         public void run() {
           try {
             for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
               assertEquals(mapping.getValue(), singleTermBytes(analyzer, mapping.getKey()));
             }
           } catch (Throwable t) {
             // Remember the failure so the calling thread can report it.
             failures[slot] = t;
           }
         }
       };
     }
     for (Thread thread : threads) {
       thread.start();
     }
     for (Thread thread : threads) {
       thread.join();
     }
     // Surface worker failures in the calling thread instead of leaving them to whatever
     // uncaught-exception handling the test runner provides.
     for (Throwable failure : failures) {
       if (failure != null) {
         throw new AssertionError("analysis failed in a worker thread", failure);
       }
     }
   }

   /** Analyzes {@code term}, asserts it yields exactly one token, and returns a copy of its bytes. */
   private static BytesRef singleTermBytes(Analyzer analyzer, String term) throws IOException {
     try (TokenStream ts = analyzer.tokenStream("fake", term)) {
       TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
       ts.reset();
       assertTrue(ts.incrementToken());
       // ensure we make a copy of the actual bytes too
       BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
       assertFalse(ts.incrementToken());
       ts.end();
       return bytes;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis;

	import java.io.IOException;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.document.TextField;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.index.IndexableField;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.BooleanClause.Occur;
	import org.apache.lucene.search.BooleanQuery;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.Sort;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.search.TermRangeQuery;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.TestUtil;

	/**
	 * Base test class for testing Unicode collation.
	 *
	 * <p>The {@code testFarsi*} methods index one document whose {@code content} field holds the
	 * text U+0633 U+0627 U+0628 and check which {@link TermRangeQuery} ranges match it. Plain
	 * Unicode order would place U+0633 inside [ U+062F - U+0698 ], but Farsi orders U+0698 before
	 * U+0633, so with a Farsi collator (or an Arabic one when Farsi is not supported) the first
	 * range must not match while the second must.
	 *
	 * <p>{@link #assertThreadSafe(Analyzer)} checks that an analyzer produces the same term bytes
	 * when it is used from several threads at once.
	 */
	public abstract class CollationTestBase extends LuceneTestCase {

	  /** Text indexed into the {@code content} field by every Farsi range test. */
	  private static final String FARSI_CONTENT = "\u0633\u0627\u0628";

	  // Un-collated end points of the two ranges described in the class comment.
	  // NOTE(review): these fields are not read in this class; presumably subclasses turn them
	  // into the BytesRef bounds handed to the testFarsi* methods -- confirm against subclasses.
	  protected String firstRangeBeginningOriginal = "\u062F";
	  protected String firstRangeEndOriginal = "\u0698";

	  protected String secondRangeBeginningOriginal = "\u0633";
	  protected String secondRangeEndOriginal = "\u0638";

	  /**
	   * Checks range filtering: a {@code body:body} term query combined with a
	   * {@link TermRangeQuery} on {@code content} added as an {@link Occur#FILTER} clause must
	   * return no hit for the first range and exactly one hit for the second range.
	   *
	   * @param analyzer analyzer used to index the test document
	   * @param firstBeg inclusive lower bound of the range expected to exclude the document
	   * @param firstEnd inclusive upper bound of the range expected to exclude the document
	   * @param secondBeg inclusive lower bound of the range expected to include the document
	   * @param secondEnd inclusive upper bound of the range expected to include the document
	   * @throws Exception if indexing or searching fails
	   */
	  public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
	                                            BytesRef firstEnd, BytesRef secondBeg,
	                                            BytesRef secondEnd) throws Exception {
	    // try-with-resources: reader and directory are released even if an assertion fails.
	    try (Directory dir = newDirectory()) {
	      indexFarsiDocument(dir, analyzer, true);
	      try (IndexReader reader = DirectoryReader.open(dir)) {
	        IndexSearcher searcher = new IndexSearcher(reader);
	        Query query = new TermQuery(new Term("body", "body"));

	        // The single indexed term must NOT pass a range filter over the first range when a
	        // Farsi collator (or an Arabic one when Farsi is not supported) produced the keys.
	        BooleanQuery.Builder bq = new BooleanQuery.Builder();
	        bq.add(query, Occur.MUST);
	        bq.add(new TermRangeQuery("content", firstBeg, firstEnd, true, true), Occur.FILTER);
	        ScoreDoc[] result = searcher.search(bq.build(), 1).scoreDocs;
	        assertEquals("The index Term should not be included.", 0, result.length);

	        bq = new BooleanQuery.Builder();
	        bq.add(query, Occur.MUST);
	        bq.add(new TermRangeQuery("content", secondBeg, secondEnd, true, true), Occur.FILTER);
	        result = searcher.search(bq.build(), 1).scoreDocs;
	        assertEquals("The index Term should be included.", 1, result.length);
	      }
	    }
	  }

	  /**
	   * Checks that a bare {@link TermRangeQuery} on {@code content} returns no hit for the first
	   * range and exactly one hit for the second range, searching with a plain
	   * {@link IndexSearcher}.
	   *
	   * @param analyzer analyzer used to index the test document
	   * @param firstBeg inclusive lower bound of the range expected to exclude the document
	   * @param firstEnd inclusive upper bound of the range expected to exclude the document
	   * @param secondBeg inclusive lower bound of the range expected to include the document
	   * @param secondEnd inclusive upper bound of the range expected to include the document
	   * @throws Exception if indexing or searching fails
	   */
	  public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
	                                            BytesRef firstEnd, BytesRef secondBeg,
	                                            BytesRef secondEnd) throws Exception {
	    try (Directory dir = newDirectory()) {
	      // This variant indexes only the "content" field.
	      indexFarsiDocument(dir, analyzer, false);
	      try (IndexReader reader = DirectoryReader.open(dir)) {
	        IndexSearcher searcher = new IndexSearcher(reader);
	        assertRangeQueryHits(searcher, firstBeg, firstEnd, secondBeg, secondEnd);
	      }
	    }
	  }

	  /**
	   * Same range checks as {@link #testFarsiRangeQueryCollating}, but the document also carries
	   * the {@code body} field and the searcher is obtained from {@code newSearcher(reader)}.
	   *
	   * @param analyzer analyzer used to index the test document
	   * @param firstBeg inclusive lower bound of the range expected to exclude the document
	   * @param firstEnd inclusive upper bound of the range expected to exclude the document
	   * @param secondBeg inclusive lower bound of the range expected to include the document
	   * @param secondEnd inclusive upper bound of the range expected to include the document
	   * @throws Exception if indexing or searching fails
	   */
	  public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
	      BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
	    try (Directory farsiIndex = newDirectory()) {
	      indexFarsiDocument(farsiIndex, analyzer, true);
	      try (IndexReader reader = DirectoryReader.open(farsiIndex)) {
	        IndexSearcher search = newSearcher(reader);
	        assertRangeQueryHits(search, firstBeg, firstEnd, secondBeg, secondEnd);
	      }
	    }
	  }

	  /**
	   * Writes the single test document into {@code dir} and closes the writer before returning,
	   * so a reader can be opened on the directory right afterwards.
	   *
	   * @param withBodyField whether to also add the un-analyzed {@code body:body} field
	   */
	  private static void indexFarsiDocument(Directory dir, Analyzer analyzer,
	                                         boolean withBodyField) throws IOException {
	    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
	      Document doc = new Document();
	      doc.add(new TextField("content", FARSI_CONTENT, Field.Store.YES));
	      if (withBodyField) {
	        doc.add(new StringField("body", "body", Field.Store.YES));
	      }
	      writer.addDocument(doc);
	    }
	  }

	  /** Asserts that the first range matches no document and the second matches exactly one. */
	  private void assertRangeQueryHits(IndexSearcher searcher, BytesRef firstBeg,
	                                    BytesRef firstEnd, BytesRef secondBeg,
	                                    BytesRef secondEnd) throws IOException {
	    Query excluding = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
	    ScoreDoc[] hits = searcher.search(excluding, 1000).scoreDocs;
	    assertEquals("The index Term should not be included.", 0, hits.length);

	    Query including = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
	    hits = searcher.search(including, 1000).scoreDocs;
	    assertEquals("The index Term should be included.", 1, hits.length);
	  }

	  // Make sure the documents returned by the search match the expected list:
	  // the "tracer" field values of all hits, concatenated in result order.
	  // Copied from TestSort.java
	  // NOTE(review): no caller is visible in this class.
	  private void assertMatches(IndexSearcher searcher, Query query, Sort sort,
	                             String expectedResult) throws IOException {
	    ScoreDoc[] result = searcher.search(query, 1000, sort).scoreDocs;
	    StringBuilder buff = new StringBuilder(10);
	    for (ScoreDoc hit : result) {
	      Document doc = searcher.doc(hit.doc);
	      for (IndexableField field : doc.getFields("tracer")) {
	        buff.append(field.stringValue());
	      }
	    }
	    assertEquals(expectedResult, buff.toString());
	  }

	  /**
	   * Checks that {@code analyzer} yields the same term bytes under concurrent use.
	   *
	   * <p>First builds a map from 100 random strings to the bytes of the single token the
	   * analyzer produces for each, on the calling thread. Then 3 to 5 threads re-analyze every
	   * key and compare the bytes with the stored value.
	   *
	   * @param analyzer analyzer under test; must emit exactly one token per input string
	   * @throws AssertionError if any worker thread observed a mismatch or failed; the worker's
	   *     throwable is attached as the cause
	   * @throws Exception if analysis fails on the calling thread or a join is interrupted
	   */
	  public void assertThreadSafe(final Analyzer analyzer) throws Exception {
	    int numTestPoints = 100;
	    int numThreads = TestUtil.nextInt(random(), 3, 5);
	    final Map<String,BytesRef> map = new HashMap<>();

	    // create a map<String,SortKey> up front.
	    // then with multiple threads, generate sort keys for all the keys in the map
	    // and ensure they are the same as the ones we produced in serial fashion.

	    for (int i = 0; i < numTestPoints; i++) {
	      String term = TestUtil.randomSimpleString(random());
	      map.put(term, singleTermBytes(analyzer, term));
	    }

	    // One slot per worker. Each worker writes only its own slot and the slots are read only
	    // after join(), so no additional synchronization is needed.
	    final Throwable[] failures = new Throwable[numThreads];
	    Thread[] threads = new Thread[numThreads];
	    for (int i = 0; i < numThreads; i++) {
	      final int slot = i;
	      threads[i] = new Thread() {
	        @Override
	        public void run() {
	          try {
	            for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
	              assertEquals(mapping.getValue(), singleTermBytes(analyzer, mapping.getKey()));
	            }
	          } catch (Throwable t) {
	            // Remember the failure so the calling thread can report it.
	            failures[slot] = t;
	          }
	        }
	      };
	    }
	    for (Thread thread : threads) {
	      thread.start();
	    }
	    for (Thread thread : threads) {
	      thread.join();
	    }
	    // Surface worker failures in the calling thread instead of leaving them to whatever
	    // uncaught-exception handling the test runner provides.
	    for (Throwable failure : failures) {
	      if (failure != null) {
	        throw new AssertionError("analysis failed in a worker thread", failure);
	      }
	    }
	  }

	  /** Analyzes {@code term}, asserts it yields exactly one token, and returns a copy of its bytes. */
	  private static BytesRef singleTermBytes(Analyzer analyzer, String term) throws IOException {
	    try (TokenStream ts = analyzer.tokenStream("fake", term)) {
	      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
	      ts.reset();
	      assertTrue(ts.incrementToken());
	      // ensure we make a copy of the actual bytes too
	      BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
	      assertFalse(ts.incrementToken());
	      ts.end();
	      return bytes;
	    }
	  }
	}