blob: 4efc2a9cd3ff40215587c4a8238dde94dee8c9a4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
* Base test class for testing Unicode collation.
*/
public abstract class CollationTestBase extends LuceneTestCase {

  // Farsi range endpoints (raw, un-collated originals); subclasses convert
  // these to collation keys before calling the test methods below.
  protected String firstRangeBeginningOriginal = "\u062F";
  protected String firstRangeEndOriginal = "\u0698";
  protected String secondRangeBeginningOriginal = "\u0633";
  protected String secondRangeEndOriginal = "\u0638";

  /**
   * Verifies that a collating {@link TermRangeQuery} used as a FILTER clause
   * matches terms according to collation order rather than Unicode code-point
   * order: the first range must exclude the single indexed Farsi term, the
   * second range must include it.
   *
   * @param analyzer  analyzer that indexes terms as collation keys
   * @param firstBeg  collation key beginning the range that must NOT match
   * @param firstEnd  collation key ending the range that must NOT match
   * @param secondBeg collation key beginning the range that must match
   * @param secondEnd collation key ending the range that must match
   */
  public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
                                            BytesRef firstEnd, BytesRef secondBeg,
                                            BytesRef secondEnd) throws Exception {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
    doc.add(new StringField("body", "body", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new TermQuery(new Term("body", "body"));

    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // orders the U+0698 character before the U+0633 character, so the single
    // index Term below should NOT be returned by a TermRangeFilter with a Farsi
    // Collator (or an Arabic one for the case when Farsi searcher not
    // supported).
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(query, Occur.MUST);
    bq.add(new TermRangeQuery("content", firstBeg, firstEnd, true, true), Occur.FILTER);
    ScoreDoc[] result = searcher.search(bq.build(), 1).scoreDocs;
    assertEquals("The index Term should not be included.", 0, result.length);

    bq = new BooleanQuery.Builder();
    bq.add(query, Occur.MUST);
    bq.add(new TermRangeQuery("content", secondBeg, secondEnd, true, true), Occur.FILTER);
    result = searcher.search(bq.build(), 1).scoreDocs;
    assertEquals("The index Term should be included.", 1, result.length);

    reader.close();
    dir.close();
  }

  /**
   * Same scenario as {@link #testFarsiRangeFilterCollating}, but with the
   * collating {@link TermRangeQuery} run directly as the top-level query
   * instead of as a filter clause.
   *
   * @param analyzer  analyzer that indexes terms as collation keys
   * @param firstBeg  collation key beginning the range that must NOT match
   * @param firstEnd  collation key ending the range that must NOT match
   * @param secondBeg collation key beginning the range that must match
   * @param secondEnd collation key ending the range that must match
   */
  public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
                                           BytesRef firstEnd, BytesRef secondBeg,
                                           BytesRef secondEnd) throws Exception {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    Document doc = new Document();

    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // orders the U+0698 character before the U+0633 character, so the single
    // index Term below should NOT be returned by a TermRangeQuery with a Farsi
    // Collator (or an Arabic one for the case when Farsi is not supported).
    doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    Query query = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("The index Term should not be included.", 0, hits.length);

    query = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("The index Term should be included.", 1, hits.length);

    reader.close();
    dir.close();
  }

  /**
   * Same scenario again, run through a test-framework searcher obtained via
   * {@code newSearcher} (which may wrap the searcher with extra checks).
   *
   * @param analyzer  analyzer that indexes terms as collation keys
   * @param firstBeg  collation key beginning the range that must NOT match
   * @param firstEnd  collation key ending the range that must NOT match
   * @param secondBeg collation key beginning the range that must match
   * @param secondEnd collation key ending the range that must match
   */
  public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
      BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
    Directory farsiIndex = newDirectory();
    IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(new TextField("content", "\u0633\u0627\u0628", Field.Store.YES));
    doc.add(new StringField("body", "body", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(farsiIndex);
    IndexSearcher search = newSearcher(reader);

    // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
    // orders the U+0698 character before the U+0633 character, so the single
    // index Term below should NOT be returned by a TermRangeQuery
    // with a Farsi Collator (or an Arabic one for the case when Farsi is
    // not supported).
    Query csrq = new TermRangeQuery("content", firstBeg, firstEnd, true, true);
    ScoreDoc[] result = search.search(csrq, 1000).scoreDocs;
    assertEquals("The index Term should not be included.", 0, result.length);

    csrq = new TermRangeQuery("content", secondBeg, secondEnd, true, true);
    result = search.search(csrq, 1000).scoreDocs;
    assertEquals("The index Term should be included.", 1, result.length);

    reader.close();
    farsiIndex.close();
  }

  // Make sure the documents returned by the search match the expected list.
  // Copied from TestSort.java: concatenates the "tracer" field values of all
  // hits, in hit order, and compares against the expected string.
  private void assertMatches(IndexSearcher searcher, Query query, Sort sort,
                             String expectedResult) throws IOException {
    ScoreDoc[] result = searcher.search(query, 1000, sort).scoreDocs;
    StringBuilder buff = new StringBuilder(10);
    int n = result.length;
    for (int i = 0; i < n; ++i) {
      Document doc = searcher.doc(result[i].doc);
      IndexableField[] v = doc.getFields("tracer");
      for (int j = 0; j < v.length; ++j) {
        buff.append(v[j].stringValue());
      }
    }
    assertEquals(expectedResult, buff.toString());
  }

  /**
   * Asserts that {@code analyzer} produces identical token bytes when used
   * concurrently from several threads: sort keys are generated serially up
   * front, then re-generated from 3-5 threads and compared against the serial
   * results. Any failure inside a worker thread is recorded and rethrown on
   * the calling thread, so it cannot be lost to the default
   * uncaught-exception handler.
   *
   * @param analyzer the analyzer whose thread safety is being checked
   */
  public void assertThreadSafe(final Analyzer analyzer) throws Exception {
    int numTestPoints = 100;
    int numThreads = TestUtil.nextInt(random(), 3, 5);
    final HashMap<String,BytesRef> map = new HashMap<>();

    // create a map<String,SortKey> up front.
    // then with multiple threads, generate sort keys for all the keys in the map
    // and ensure they are the same as the ones we produced in serial fashion.
    for (int i = 0; i < numTestPoints; i++) {
      String term = TestUtil.randomSimpleString(random());
      try (TokenStream ts = analyzer.tokenStream("fake", term)) {
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        ts.reset();
        assertTrue(ts.incrementToken());
        // ensure we make a copy of the actual bytes too
        map.put(term, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        assertFalse(ts.incrementToken());
        ts.end();
      }
    }

    Thread[] threads = new Thread[numThreads];
    // An exception thrown from Thread.run() is only reported to the
    // uncaught-exception handler; join() does not propagate it. Record each
    // worker's failure here and rethrow after join() so the test really fails.
    final Throwable[] failures = new Throwable[numThreads];
    for (int i = 0; i < numThreads; i++) {
      final int threadId = i;
      threads[i] = new Thread() {
        @Override
        public void run() {
          try {
            for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
              String term = mapping.getKey();
              BytesRef expected = mapping.getValue();
              try (TokenStream ts = analyzer.tokenStream("fake", term)) {
                TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
                ts.reset();
                assertTrue(ts.incrementToken());
                assertEquals(expected, termAtt.getBytesRef());
                assertFalse(ts.incrementToken());
                ts.end();
              }
            }
          } catch (Throwable t) {
            // Captures IOException as well as AssertionError from the asserts.
            failures[threadId] = t;
          }
        }
      };
    }
    for (int i = 0; i < numThreads; i++) {
      threads[i].start();
    }
    for (int i = 0; i < numThreads; i++) {
      // join() establishes a happens-before edge, so reading failures[i]
      // afterwards needs no extra synchronization.
      threads[i].join();
    }
    for (int i = 0; i < numThreads; i++) {
      if (failures[i] != null) {
        throw new AssertionError("analyzer is not thread safe: thread " + i + " failed",
                                 failures[i]);
      }
    }
  }
}