/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.search.vectorhighlight; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.*; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BoostQuery; |
| import org.apache.lucene.search.DisjunctionMaxQuery; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| public abstract class AbstractTestCase extends LuceneTestCase { |
| |
| protected static final String F = "f"; |
| protected static final String F1 = "f1"; |
| protected static final String F2 = "f2"; |
| protected Directory dir; |
| protected Analyzer analyzerW; |
| protected Analyzer analyzerB; |
| protected Analyzer analyzerK; |
| protected IndexReader reader; |
| |
| protected static final String[] shortMVValues = { |
| "", |
| "", |
| "a b c", |
| "", // empty data in multi valued field |
| "d e" |
| }; |
| |
| protected static final String[] longMVValues = { |
| "Followings are the examples of customizable parameters and actual examples of customization:", |
| "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically" |
| }; |
| |
| // test data for LUCENE-1448 bug |
| protected static final String[] biMVValues = { |
| "\nLucene/Solr does not require such additional hardware.", |
| "\nWhen you talk about processing speed, the" |
| }; |
| |
| protected static final String[] strMVValues = { |
| "abc", |
| "defg", |
| "hijkl" |
| }; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| analyzerW = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); |
| analyzerB = new BigramAnalyzer(); |
| analyzerK = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); |
| dir = newDirectory(); |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| if( reader != null ){ |
| reader.close(); |
| reader = null; |
| } |
| dir.close(); |
| super.tearDown(); |
| } |
| |
| protected Query tq( String text ){ |
| return tq( 1F, text ); |
| } |
| |
| protected Query tq( float boost, String text ){ |
| return tq( boost, F, text ); |
| } |
| |
| protected Query tq( String field, String text ){ |
| return tq( 1F, field, text ); |
| } |
| |
| protected Query tq( float boost, String field, String text ){ |
| Query query = new TermQuery( new Term( field, text ) ); |
| if (boost != 1f) { |
| query = new BoostQuery( query, boost ); |
| } |
| return query; |
| } |
| |
| protected Query pqF( String... texts ){ |
| return pqF( 1F, texts ); |
| } |
| |
| protected Query pqF( float boost, String... texts ){ |
| return pqF( boost, 0, texts ); |
| } |
| |
| protected Query pqF( float boost, int slop, String... texts ){ |
| return pq( boost, slop, F, texts ); |
| } |
| |
| protected Query pq( String field, String... texts ){ |
| return pq( 1F, 0, field, texts ); |
| } |
| |
| protected Query pq( float boost, String field, String... texts ){ |
| return pq( boost, 0, field, texts ); |
| } |
| |
| protected Query pq( float boost, int slop, String field, String... texts ){ |
| Query query = new PhraseQuery(slop, field, texts); |
| if (boost != 1f) { |
| query = new BoostQuery(query, boost); |
| } |
| return query; |
| } |
| |
| protected Query dmq( Query... queries ){ |
| return dmq( 0.0F, queries ); |
| } |
| |
| protected Query dmq( float tieBreakerMultiplier, Query... queries ){ |
| return new DisjunctionMaxQuery(Arrays.asList(queries), tieBreakerMultiplier); |
| } |
| |
| protected void assertCollectionQueries( Collection<Query> actual, Query... expected ){ |
| assertEquals( expected.length, actual.size() ); |
| for( Query query : expected ){ |
| assertTrue( actual.contains( query ) ); |
| } |
| } |
| |
| protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException { |
| List<BytesRef> bytesRefs = new ArrayList<>(); |
| |
| try (TokenStream tokenStream = analyzer.tokenStream(field, text)) { |
| TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class); |
| |
| tokenStream.reset(); |
| |
| while (tokenStream.incrementToken()) { |
| bytesRefs.add(BytesRef.deepCopyOf(termAttribute.getBytesRef())); |
| } |
| |
| tokenStream.end(); |
| } |
| |
| return bytesRefs; |
| } |
| |
| protected PhraseQuery toPhraseQuery(List<BytesRef> bytesRefs, String field) { |
| return new PhraseQuery(field, bytesRefs.toArray(new BytesRef[0])); |
| } |
| |
| static final class BigramAnalyzer extends Analyzer { |
| @Override |
| public TokenStreamComponents createComponents(String fieldName) { |
| return new TokenStreamComponents(new BasicNGramTokenizer()); |
| } |
| } |
| |
| static final class BasicNGramTokenizer extends Tokenizer { |
| |
| public static final int DEFAULT_N_SIZE = 2; |
| public static final String DEFAULT_DELIMITERS = " \t\n.,"; |
| private final int n; |
| private final String delimiters; |
| private int startTerm; |
| private int lenTerm; |
| private int startOffset; |
| private int nextStartOffset; |
| private int ch; |
| private String snippet; |
| private StringBuilder snippetBuffer; |
| private static final int BUFFER_SIZE = 4096; |
| private char[] charBuffer; |
| private int charBufferIndex; |
| private int charBufferLen; |
| |
| public BasicNGramTokenizer( ){ |
| this( DEFAULT_N_SIZE ); |
| } |
| |
| public BasicNGramTokenizer( int n ){ |
| this( n, DEFAULT_DELIMITERS ); |
| } |
| |
| public BasicNGramTokenizer( String delimiters ){ |
| this( DEFAULT_N_SIZE, delimiters ); |
| } |
| |
| public BasicNGramTokenizer(int n, String delimiters ){ |
| super(); |
| this.n = n; |
| this.delimiters = delimiters; |
| startTerm = 0; |
| nextStartOffset = 0; |
| snippet = null; |
| snippetBuffer = new StringBuilder(); |
| charBuffer = new char[BUFFER_SIZE]; |
| charBufferIndex = BUFFER_SIZE; |
| charBufferLen = 0; |
| ch = 0; |
| } |
| |
| CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| @Override |
| public boolean incrementToken() throws IOException { |
| if( !getNextPartialSnippet() ) |
| return false; |
| clearAttributes(); |
| termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm); |
| offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm)); |
| return true; |
| } |
| |
| private int getFinalOffset() { |
| return nextStartOffset; |
| } |
| |
| @Override |
| public final void end() throws IOException { |
| super.end(); |
| offsetAtt.setOffset(getFinalOffset(),getFinalOffset()); |
| } |
| |
| protected boolean getNextPartialSnippet() throws IOException { |
| if( snippet != null && snippet.length() >= startTerm + 1 + n ){ |
| startTerm++; |
| startOffset++; |
| lenTerm = n; |
| return true; |
| } |
| return getNextSnippet(); |
| } |
| |
| protected boolean getNextSnippet() throws IOException { |
| startTerm = 0; |
| startOffset = nextStartOffset; |
| snippetBuffer.delete( 0, snippetBuffer.length() ); |
| while( true ){ |
| if( ch != -1 ) |
| ch = readCharFromBuffer(); |
| if( ch == -1 ) break; |
| else if( !isDelimiter( ch ) ) |
| snippetBuffer.append( (char)ch ); |
| else if( snippetBuffer.length() > 0 ) |
| break; |
| else |
| startOffset++; |
| } |
| if( snippetBuffer.length() == 0 ) |
| return false; |
| snippet = snippetBuffer.toString(); |
| lenTerm = snippet.length() >= n ? n : snippet.length(); |
| return true; |
| } |
| |
| protected int readCharFromBuffer() throws IOException { |
| if( charBufferIndex >= charBufferLen ){ |
| charBufferLen = input.read( charBuffer ); |
| if( charBufferLen == -1 ){ |
| return -1; |
| } |
| charBufferIndex = 0; |
| } |
| int c = charBuffer[charBufferIndex++]; |
| nextStartOffset++; |
| return c; |
| } |
| |
| protected boolean isDelimiter( int c ){ |
| return delimiters.indexOf( c ) >= 0; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| startTerm = 0; |
| nextStartOffset = 0; |
| snippet = null; |
| snippetBuffer.setLength( 0 ); |
| charBufferIndex = BUFFER_SIZE; |
| charBufferLen = 0; |
| ch = 0; |
| } |
| } |
| |
| protected void make1d1fIndex( String value ) throws Exception { |
| make1dmfIndex( value ); |
| } |
| |
| protected void make1d1fIndexB( String value ) throws Exception { |
| make1dmfIndexB( value ); |
| } |
| |
| protected void make1dmfIndex( String... values ) throws Exception { |
| make1dmfIndex( analyzerW, values ); |
| } |
| |
| protected void make1dmfIndexB( String... values ) throws Exception { |
| make1dmfIndex( analyzerB, values ); |
| } |
| |
| // make 1 doc with multi valued field |
| protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception { |
| IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer).setOpenMode(OpenMode.CREATE)); |
| Document doc = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_STORED); |
| customType.setStoreTermVectors(true); |
| customType.setStoreTermVectorOffsets(true); |
| customType.setStoreTermVectorPositions(true); |
| for( String value: values ) { |
| doc.add( new Field( F, value, customType) ); |
| } |
| writer.addDocument( doc ); |
| writer.close(); |
| if (reader != null) reader.close(); |
| reader = DirectoryReader.open(dir); |
| } |
| |
| // make 1 doc with multi valued & not analyzed field |
| protected void make1dmfIndexNA( String... values ) throws Exception { |
| IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzerK).setOpenMode(OpenMode.CREATE)); |
| Document doc = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_STORED); |
| customType.setStoreTermVectors(true); |
| customType.setStoreTermVectorOffsets(true); |
| customType.setStoreTermVectorPositions(true); |
| for( String value: values ) { |
| doc.add( new Field( F, value, customType)); |
| //doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); |
| } |
| writer.addDocument( doc ); |
| writer.close(); |
| if (reader != null) reader.close(); |
| reader = DirectoryReader.open(dir); |
| } |
| |
| protected void makeIndexShortMV() throws Exception { |
| |
| // 0 |
| // "" |
| // 1 |
| // "" |
| |
| // 234567 |
| // "a b c" |
| // 0 1 2 |
| |
| // 8 |
| // "" |
| |
| // 111 |
| // 9012 |
| // "d e" |
| // 3 4 |
| make1dmfIndex( shortMVValues ); |
| } |
| |
| protected void makeIndexLongMV() throws Exception { |
| // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999 |
| // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 |
| // Followings are the examples of customizable parameters and actual examples of customization: |
| // 0 1 2 3 4 5 6 7 8 9 10 11 |
| |
| // 1 2 |
| // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122 |
| // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901 |
| // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically |
| // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34 |
| |
| make1dmfIndex( longMVValues ); |
| } |
| |
| protected void makeIndexLongMVB() throws Exception { |
| // "*" ... LF |
| |
| // 1111111111222222222233333333334444444444555555 |
| // 01234567890123456789012345678901234567890123456789012345 |
| // *Lucene/Solr does not require such additional hardware. |
| // Lu 0 do 10 re 15 su 21 na 31 |
| // uc 1 oe 11 eq 16 uc 22 al 32 |
| // ce 2 es 12 qu 17 ch 23 ha 33 |
| // en 3 no 13 ui 18 ad 24 ar 34 |
| // ne 4 ot 14 ir 19 dd 25 rd 35 |
| // e/ 5 re 20 di 26 dw 36 |
| // /S 6 it 27 wa 37 |
| // So 7 ti 28 ar 38 |
| // ol 8 io 29 re 39 |
| // lr 9 on 30 |
| |
| // 5555666666666677777777778888888888999999999 |
| // 6789012345678901234567890123456789012345678 |
| // *When you talk about processing speed, the |
| // Wh 40 ab 48 es 56 th 65 |
| // he 41 bo 49 ss 57 he 66 |
| // en 42 ou 50 si 58 |
| // yo 43 ut 51 in 59 |
| // ou 44 pr 52 ng 60 |
| // ta 45 ro 53 sp 61 |
| // al 46 oc 54 pe 62 |
| // lk 47 ce 55 ee 63 |
| // ed 64 |
| |
| make1dmfIndexB( biMVValues ); |
| } |
| |
| protected void makeIndexStrMV() throws Exception { |
| |
| // 0123 |
| // "abc" |
| |
| // 34567 |
| // "defg" |
| |
| // 111 |
| // 789012 |
| // "hijkl" |
| make1dmfIndexNA( strMVValues ); |
| } |
| } |