| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.IndexableField; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.ScoreDoc; |
| import org.apache.lucene.search.Sort; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TermRangeQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| |
| /** |
| * Base test class for testing Unicode collation. |
| */ |
| public abstract class CollationTestBase extends LuceneTestCase { |
| |
| protected String firstRangeBeginningOriginal = "\u062F"; |
| protected String firstRangeEndOriginal = "\u0698"; |
| |
| protected String secondRangeBeginningOriginal = "\u0633"; |
| protected String secondRangeEndOriginal = "\u0638"; |
| |
| public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg, |
| BytesRef firstEnd, BytesRef secondBeg, |
| BytesRef secondEnd) throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer)); |
| Document doc = new Document(); |
| doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES)); |
| doc.add(new StringField("body", "body", Field.Store.YES)); |
| writer.addDocument(doc); |
| writer.close(); |
| IndexReader reader = DirectoryReader.open(dir); |
| IndexSearcher searcher = new IndexSearcher(reader); |
| Query query = new TermQuery(new Term("body","body")); |
| |
| // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi |
| // orders the U+0698 character before the U+0633 character, so the single |
| // index Term below should NOT be returned by a TermRangeFilter with a Farsi |
| // Collator (or an Arabic one for the case when Farsi searcher not |
| // supported). |
| BooleanQuery.Builder bq = new BooleanQuery.Builder(); |
| bq.add(query, Occur.MUST); |
| bq.add(new TermRangeQuery("content", firstBeg, firstEnd, true, true), Occur.FILTER); |
| ScoreDoc[] result = searcher.search(bq.build(), 1).scoreDocs; |
| assertEquals("The index Term should not be included.", 0, result.length); |
| |
| bq = new BooleanQuery.Builder(); |
| bq.add(query, Occur.MUST); |
| bq.add(new TermRangeQuery("content", secondBeg, secondEnd, true, true), Occur.FILTER); |
| result = searcher.search(bq.build(), 1).scoreDocs; |
| assertEquals("The index Term should be included.", 1, result.length); |
| |
| reader.close(); |
| dir.close(); |
| } |
| |
| public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg, |
| BytesRef firstEnd, BytesRef secondBeg, |
| BytesRef secondEnd) throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer)); |
| Document doc = new Document(); |
| |
| // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi |
| // orders the U+0698 character before the U+0633 character, so the single |
| // index Term below should NOT be returned by a TermRangeQuery with a Farsi |
| // Collator (or an Arabic one for the case when Farsi is not supported). |
| doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES)); |
| writer.addDocument(doc); |
| writer.close(); |
| IndexReader reader = DirectoryReader.open(dir); |
| IndexSearcher searcher = new IndexSearcher(reader); |
| |
| Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true); |
| ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals("The index Term should not be included.", 0, hits.length); |
| |
| query = new TermRangeQuery("content", secondBeg, secondEnd, true, true); |
| hits = searcher.search(query, 1000).scoreDocs; |
| assertEquals("The index Term should be included.", 1, hits.length); |
| reader.close(); |
| dir.close(); |
| } |
| |
| public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg, |
| BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception { |
| |
| Directory farsiIndex = newDirectory(); |
| IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(analyzer)); |
| Document doc = new Document(); |
| doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES)); |
| doc.add(new StringField("body", "body", Field.Store.YES)); |
| writer.addDocument(doc); |
| writer.close(); |
| |
| IndexReader reader = DirectoryReader.open(farsiIndex); |
| IndexSearcher search = newSearcher(reader); |
| |
| // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi |
| // orders the U+0698 character before the U+0633 character, so the single |
| // index Term below should NOT be returned by a TermRangeQuery |
| // with a Farsi Collator (or an Arabic one for the case when Farsi is |
| // not supported). |
| Query csrq |
| = new TermRangeQuery("content", firstBeg, firstEnd, true, true); |
| ScoreDoc[] result = search.search(csrq, 1000).scoreDocs; |
| assertEquals("The index Term should not be included.", 0, result.length); |
| |
| csrq = new TermRangeQuery |
| ("content", secondBeg, secondEnd, true, true); |
| result = search.search(csrq, 1000).scoreDocs; |
| assertEquals("The index Term should be included.", 1, result.length); |
| reader.close(); |
| farsiIndex.close(); |
| } |
| |
| // Make sure the documents returned by the search match the expected list |
| // Copied from TestSort.java |
| private void assertMatches(IndexSearcher searcher, Query query, Sort sort, |
| String expectedResult) throws IOException { |
| ScoreDoc[] result = searcher.search(query, 1000, sort).scoreDocs; |
| StringBuilder buff = new StringBuilder(10); |
| int n = result.length; |
| for (int i = 0 ; i < n ; ++i) { |
| Document doc = searcher.doc(result[i].doc); |
| IndexableField[] v = doc.getFields("tracer"); |
| for (int j = 0 ; j < v.length ; ++j) { |
| buff.append(v[j].stringValue()); |
| } |
| } |
| assertEquals(expectedResult, buff.toString()); |
| } |
| |
| public void assertThreadSafe(final Analyzer analyzer) throws Exception { |
| int numTestPoints = 100; |
| int numThreads = TestUtil.nextInt(random(), 3, 5); |
| final HashMap<String,BytesRef> map = new HashMap<>(); |
| |
| // create a map<String,SortKey> up front. |
| // then with multiple threads, generate sort keys for all the keys in the map |
| // and ensure they are the same as the ones we produced in serial fashion. |
| |
| for (int i = 0; i < numTestPoints; i++) { |
| String term = TestUtil.randomSimpleString(random()); |
| try (TokenStream ts = analyzer.tokenStream("fake", term)) { |
| TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); |
| ts.reset(); |
| assertTrue(ts.incrementToken()); |
| // ensure we make a copy of the actual bytes too |
| map.put(term, BytesRef.deepCopyOf(termAtt.getBytesRef())); |
| assertFalse(ts.incrementToken()); |
| ts.end(); |
| } |
| } |
| |
| Thread threads[] = new Thread[numThreads]; |
| for (int i = 0; i < numThreads; i++) { |
| threads[i] = new Thread() { |
| @Override |
| public void run() { |
| try { |
| for (Map.Entry<String,BytesRef> mapping : map.entrySet()) { |
| String term = mapping.getKey(); |
| BytesRef expected = mapping.getValue(); |
| try (TokenStream ts = analyzer.tokenStream("fake", term)) { |
| TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); |
| ts.reset(); |
| assertTrue(ts.incrementToken()); |
| assertEquals(expected, termAtt.getBytesRef()); |
| assertFalse(ts.incrementToken()); |
| ts.end(); |
| } |
| } |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| }; |
| } |
| for (int i = 0; i < numThreads; i++) { |
| threads[i].start(); |
| } |
| for (int i = 0; i < numThreads; i++) { |
| threads[i].join(); |
| } |
| } |
| } |