| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| |
| using System; |
| using System.Collections.Generic; |
| using System.Text; |
| |
| |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.Tokenattributes; |
| using Lucene.Net.Search; |
| using Lucene.Net.Documents; |
| using Lucene.Net.QueryParsers; |
| using Lucene.Net.Store; |
| using Lucene.Net.Index; |
| using Lucene.Net.Util; |
| |
| using NUnit.Framework; |
| |
| namespace Lucene.Net.Search.Vectorhighlight |
| { |
| public abstract class AbstractTestCase |
| { |
| |
        // Default field names used by the query/index helpers below.
        protected String F = "f";
        protected String F1 = "f1";
        protected String F2 = "f2";
        // In-memory index directory created in SetUp; populated by the MakeIndex* helpers.
        protected Directory dir;
        // Analyzers: W = whitespace, B = character bigram (see BigramAnalyzer), K = keyword.
        protected Analyzer analyzerW;
        protected Analyzer analyzerB;
        protected Analyzer analyzerK;
        // Reader over dir; opened by the MakeIndex* helpers and closed in TearDown.
        protected IndexReader reader;
        // Query parsers bound to field F: paW uses analyzerW, paB uses analyzerB.
        protected QueryParser paW;
        protected QueryParser paB;
| |
        // Multi-valued field data: short values, including an empty one.
        protected static String[] shortMVValues = {
            "a b c",
            "",          // empty data in multi valued field
            "d e"
        };

        // Multi-valued field data: long sentences (see MakeIndexLongMV offset map).
        protected static String[] longMVValues = {
            "Followings are the examples of customizable parameters and actual examples of customization:",
            "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
        };

        // test data for LUCENE-1448 bug (leading newlines; indexed with the bigram analyzer)
        protected static String[] biMVValues = {
            "\nLucene/Solr does not require such additional hardware.",
            "\nWhen you talk about processing speed, the"
        };

        // Multi-valued field data indexed NOT_ANALYZED (see MakeIndexStrMV).
        protected static String[] strMVValues = {
            "abc",
            "defg",
            "hijkl"
        };
| |
| [SetUp] |
| public void SetUp() |
| { |
| analyzerW = new WhitespaceAnalyzer(); |
| analyzerB = new BigramAnalyzer(); |
| analyzerK = new KeywordAnalyzer(); |
| paW = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerW); |
| paB = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerB); |
| dir = new RAMDirectory(); |
| } |
| |
| [TearDown] |
| public void TearDown() |
| { |
| if (reader != null) |
| { |
| reader.Close(); |
| reader = null; |
| } |
| } |
| |
| protected Query Tq(String text) |
| { |
| return Tq(1F, text); |
| } |
| |
| protected Query Tq(float boost, String text) |
| { |
| return Tq(boost, F, text); |
| } |
| |
| protected Query Tq(String field, String text) |
| { |
| return Tq(1F, field, text); |
| } |
| |
| protected Query Tq(float boost, String field, String text) |
| { |
| Query query = new TermQuery(new Term(field, text)); |
| query.Boost = boost; |
| return query; |
| } |
| |
| protected Query Preq(String text) |
| { |
| return Preq(1F, text); |
| } |
| |
| protected Query Preq(float boost, String text) |
| { |
| return Preq(boost, F, text); |
| } |
| |
| protected Query Preq(String field, String text) |
| { |
| return Preq(1F, field, text); |
| } |
| |
| protected Query Preq(float boost, String field, String text) |
| { |
| Query query = new PrefixQuery(new Term(field, text)); |
| query.Boost = boost; |
| return query; |
| } |
| |
| protected Query PqF(params String[] texts) |
| { |
| return PqF(1F, texts); |
| } |
| |
| //protected Query pqF(String[] texts) |
| //{ |
| // return pqF(1F, texts); |
| //} |
| |
| protected Query PqF(float boost, params String[] texts) |
| { |
| return pqF(boost, 0, texts); |
| } |
| |
| protected Query pqF(float boost, int slop, params String[] texts) |
| { |
| return Pq(boost, slop, F, texts); |
| } |
| |
| protected Query Pq(String field, params String[] texts) |
| { |
| return Pq(1F, 0, field, texts); |
| } |
| |
| protected Query Pq(float boost, String field, params String[] texts) |
| { |
| return Pq(boost, 0, field, texts); |
| } |
| |
| protected Query Pq(float boost, int slop, String field, params String[] texts) |
| { |
| PhraseQuery query = new PhraseQuery(); |
| foreach (String text in texts) |
| { |
| query.Add(new Term(field, text)); |
| } |
| query.Boost = boost; |
| query.Slop = slop; |
| return query; |
| } |
| |
| protected Query Dmq(params Query[] queries) |
| { |
| return Dmq(0.0F, queries); |
| } |
| |
| protected Query Dmq(float tieBreakerMultiplier, params Query[] queries) |
| { |
| DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier); |
| foreach (Query q in queries) |
| { |
| query.Add(q); |
| } |
| return query; |
| } |
| |
| protected void AssertCollectionQueries(Dictionary<Query, Query> actual, params Query[] expected) |
| { |
| |
| Assert.AreEqual(expected.Length, actual.Count); |
| foreach (Query query in expected) |
| { |
| Assert.IsTrue(actual.ContainsKey(query)); |
| } |
| } |
| |
        // Analyzer that tokenizes text into character bigrams via BasicNGramTokenizer
        // (default n = 2, default delimiters).
        class BigramAnalyzer : Analyzer
        {
            public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
            {
                return new BasicNGramTokenizer(reader);
            }
        }
| |
        // Simple character n-gram tokenizer used by BigramAnalyzer: splits the input
        // into "snippets" at delimiter characters, then emits every n-character
        // window of each snippet as a token, tracking character offsets as it goes.
        class BasicNGramTokenizer : Tokenizer
        {

            public static int DEFAULT_N_SIZE = 2;
            public static String DEFAULT_DELIMITERS = " \t\n.,";
            private int n;                          // gram size
            private String delimiters;              // characters that end a snippet
            private int startTerm;                  // start of current gram within `snippet`
            private int lenTerm;                    // length of current gram (may be < n for short snippets)
            private int startOffset;                // absolute start offset of current gram in the input
            private int nextStartOffset;            // absolute offset of the next unread character
            private int ch;                         // last character read, or -1 at end of input
            private String snippet;                 // current delimiter-free run of characters
            private StringBuilder snippetBuffer;    // reused to build each snippet
            private static int BUFFER_SIZE = 4096;
            private char[] charBuffer;              // read-ahead buffer over the underlying reader
            private int charBufferIndex;            // next position to consume from charBuffer
            private int charBufferLen;              // number of valid chars in charBuffer

            public BasicNGramTokenizer(System.IO.TextReader inReader): this(inReader, DEFAULT_N_SIZE)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, int n): this(inReader, n, DEFAULT_DELIMITERS)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, String delimiters) : this(inReader, DEFAULT_N_SIZE, delimiters)
            {
            }

            public BasicNGramTokenizer(System.IO.TextReader inReader, int n, String delimiters) : base(inReader)
            {
                this.n = n;
                this.delimiters = delimiters;
                startTerm = 0;
                nextStartOffset = 0;
                snippet = null;
                snippetBuffer = new StringBuilder();
                charBuffer = new char[BUFFER_SIZE];
                // Index == BUFFER_SIZE with len 0 forces the first ReadCharFromBuffer to fill the buffer.
                charBufferIndex = BUFFER_SIZE;
                charBufferLen = 0;
                ch = 0;

                Init();
            }

            // Registers the term and offset attributes this stream populates.
            void Init()
            {
                termAtt = AddAttribute<ITermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            ITermAttribute termAtt = null;
            IOffsetAttribute offsetAtt = null;

            // Advances to the next gram; fills the term buffer and offsets, or returns false at end of input.
            public override bool IncrementToken()
            {
                if (!GetNextPartialSnippet())
                    return false;
                ClearAttributes();
                termAtt.SetTermBuffer(snippet, startTerm, lenTerm);
                offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(startOffset + lenTerm));
                return true;
            }

            private int GetFinalOffset()
            {
                return nextStartOffset;
            }

            public override void End()
            {
                // Report the end-of-stream offset once tokenization is complete.
                offsetAtt.SetOffset(GetFinalOffset(), GetFinalOffset());
            }

            // Slides the n-gram window one character forward within the current snippet,
            // or fetches the next snippet when the window has reached the snippet's end.
            protected bool GetNextPartialSnippet()
            {
                if (snippet != null && snippet.Length >= startTerm + 1 + n)
                {
                    startTerm++;
                    startOffset++;
                    lenTerm = n;
                    return true;
                }
                return GetNextSnippet();
            }

            // Reads characters until the next delimiter, collecting them into snippetBuffer.
            // Returns false when no non-delimiter characters remain.
            protected bool GetNextSnippet()
            {
                startTerm = 0;
                startOffset = nextStartOffset;
                snippetBuffer.Remove(0, snippetBuffer.Length);
                while (true)
                {
                    if (ch != -1)
                        ch = ReadCharFromBuffer();
                    if (ch == -1) break;
                    else if (!IsDelimiter(ch))
                        snippetBuffer.Append((char)ch);
                    else if (snippetBuffer.Length > 0)
                        break;          // delimiter terminates a non-empty snippet
                    else
                        startOffset++;  // leading delimiter: shift the snippet's start offset
                }
                if (snippetBuffer.Length == 0)
                    return false;
                snippet = snippetBuffer.ToString();
                // A snippet shorter than n still yields one (shorter) token.
                lenTerm = snippet.Length >= n ? n : snippet.Length;
                return true;
            }

            // Returns the next character from the buffered reader, or -1 at end of input.
            // Also advances nextStartOffset, which tracks absolute position in the input.
            protected int ReadCharFromBuffer()
            {
                if (charBufferIndex >= charBufferLen)
                {
                    charBufferLen = input.Read(charBuffer,0,charBuffer.Length);
                    if (charBufferLen <= 0)
                    {
                        return -1;
                    }
                    charBufferIndex = 0;
                }
                int c = (int)charBuffer[charBufferIndex++];
                nextStartOffset++;
                return c;
            }

            protected bool IsDelimiter(int c)
            {
                return delimiters.IndexOf(Convert.ToChar(c) ) >= 0;
            }
        }
| |
        // Indexes one document with a single value in field F (whitespace analyzer).
        protected void Make1d1fIndex(String value)
        {
            Make1dmfIndex( value );
        }

        // Indexes one document with a single value in field F (bigram analyzer).
        protected void Make1d1fIndexB(String value)
        {
            Make1dmfIndexB( value );
        }

        // Indexes one document with a multi-valued field F (whitespace analyzer).
        protected void Make1dmfIndex(params String[] values)
        {
            Make1dmfIndex(analyzerW, values);
        }

        // Indexes one document with a multi-valued field F (bigram analyzer).
        protected void Make1dmfIndexB(params String[] values)
        {
            Make1dmfIndex(analyzerB, values);
        }
| |
| // make 1 doc with multi valued field |
| protected void Make1dmfIndex(Analyzer analyzer, params String[] values) |
| { |
| IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); |
| Document doc = new Document(); |
| foreach (String value in values) |
| doc.Add(new Field(F, value, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); |
| writer.AddDocument(doc); |
| writer.Close(); |
| |
| reader = IndexReader.Open(dir,true); |
| } |
| |
| // make 1 doc with multi valued & not analyzed field |
| protected void Make1dmfIndexNA(String[] values) |
| { |
| IndexWriter writer = new IndexWriter(dir, analyzerK, true, IndexWriter.MaxFieldLength.LIMITED); |
| Document doc = new Document(); |
| foreach (String value in values) |
| doc.Add(new Field(F, value, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); |
| writer.AddDocument(doc); |
| writer.Close(); |
| |
| reader = IndexReader.Open(dir, true); |
| } |
| |
        // Indexes shortMVValues (whitespace analyzer). The maps below show character
        // offsets (top rows) and token positions (bottom rows) across the field values.
        protected void MakeIndexShortMV()
        {

            //  012345
            // "a b c"
            //  0 1 2

            // ""

            //  6789
            // "d e"
            //  3 4
            Make1dmfIndex(shortMVValues);
        }

        // Indexes longMVValues (whitespace analyzer). Offsets continue across the two
        // values; parenthesized position numbers mark repeated terms.
        protected void MakeIndexLongMV()
        {
            //           11111111112222222222333333333344444444445555555555666666666677777777778888888888999
            // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
            // Followings are the examples of customizable parameters and actual examples of customization:
            // 0          1   2   3        4  5            6          7   8      9        10 11

            //  1                                                                                                   2
            //  999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
            //  345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
            // (The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically)
            //  12  13   14     15      16  17   18  19 20    21       22   23  24     25      26   27   28   29  30  31  32   33      34

            Make1dmfIndex(longMVValues);
        }

        // Indexes biMVValues with the bigram analyzer. The maps below list each bigram
        // token and its position; "*" stands for the leading LF in each value.
        protected void MakeIndexLongMVB()
        {
            // "*" ... LF

            //           1111111111222222222233333333334444444444555555
            // 01234567890123456789012345678901234567890123456789012345
            // *Lucene/Solr does not require such additional hardware.
            //  Lu 0        do 10    re 15   su 21  na 31
            //   uc 1        oe 11    eq 16   uc 22  al 32
            //    ce 2        es 12    qu 17   ch 23  ha 33
            //     en 3         no 13   ui 18    ad 24  ar 34
            //      ne 4         ot 14   ir 19    dd 25   rd 35
            //       e/ 5                 re 20    di 26    dw 36
            //        /S 6                          it 27    wa 37
            //         So 7                          ti 28    ar 38
            //          ol 8                          io 29    re 39
            //           lr 9                          on 30

            // 5555666666666677777777778888888888999999999
            // 6789012345678901234567890123456789012345678
            // *When you talk about processing speed, the
            //  Wh 40   ab 48     es 56         th 65
            //   he 41   bo 49     ss 57         he 66
            //    en 42   ou 50     si 58
            //      yo 43  ut 51     in 59
            //       ou 44   pr 52    ng 60
            //         ta 45  ro 53     sp 61
            //          al 46  oc 54     pe 62
            //           lk 47  ce 55     ee 63
            //                             ed 64

            Make1dmfIndexB(biMVValues);
        }

        // Indexes strMVValues NOT_ANALYZED (keyword analyzer); each value is one term,
        // with offsets running continuously across values.
        protected void MakeIndexStrMV()
        {

            //  0123
            // "abc"

            //  34567
            // "defg"

            //      111
            //  789012
            // "hijkl"
            Make1dmfIndexNA(strMVValues);
        }
| } |
| } |