blob: 935bca1e65723637bb537bf035f981c0983957e5 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Search;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Util;
using NUnit.Framework;
namespace Lucene.Net.Search.Vectorhighlight
{
/// <summary>
/// Base class for FastVectorHighlighter tests. Provides shared analyzers,
/// query parsers, an in-memory index, query-building helpers and canned
/// multi-valued test data.
/// </summary>
public abstract class AbstractTestCase
{
    // Default field name used by most helpers, plus two auxiliary field names.
    protected String F = "f";
    protected String F1 = "f1";
    protected String F2 = "f2";
    protected Directory dir;        // in-memory index directory (created in SetUp)
    protected Analyzer analyzerW;   // whitespace tokenization
    protected Analyzer analyzerB;   // character-bigram tokenization (BigramAnalyzer below)
    protected Analyzer analyzerK;   // keyword: whole value as a single token
    protected IndexReader reader;   // reader over the index built by the Make*Index helpers
    protected QueryParser paW;      // parser over field F using analyzerW
    protected QueryParser paB;      // parser over field F using analyzerB

    // Short multi-valued field data; includes an empty value on purpose.
    protected static String[] shortMVValues = {
        "a b c",
        "", // empty data in multi valued field
        "d e"
    };

    // Longer multi-valued data; token/offset maps are in MakeIndexLongMV.
    protected static String[] longMVValues = {
        "Followings are the examples of customizable parameters and actual examples of customization:",
        "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically"
    };

    // test data for LUCENE-1448 bug
    protected static String[] biMVValues = {
        "\nLucene/Solr does not require such additional hardware.",
        "\nWhen you talk about processing speed, the"
    };

    // Multi-valued data for the not-analyzed (keyword) index helper.
    protected static String[] strMVValues = {
        "abc",
        "defg",
        "hijkl"
    };
[SetUp]
public void SetUp()
{
analyzerW = new WhitespaceAnalyzer();
analyzerB = new BigramAnalyzer();
analyzerK = new KeywordAnalyzer();
paW = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerW);
paB = new QueryParser(Util.Version.LUCENE_CURRENT, F, analyzerB);
dir = new RAMDirectory();
}
[TearDown]
public void TearDown()
{
if (reader != null)
{
reader.Close();
reader = null;
}
}
protected Query Tq(String text)
{
return Tq(1F, text);
}
protected Query Tq(float boost, String text)
{
return Tq(boost, F, text);
}
protected Query Tq(String field, String text)
{
return Tq(1F, field, text);
}
protected Query Tq(float boost, String field, String text)
{
Query query = new TermQuery(new Term(field, text));
query.Boost = boost;
return query;
}
protected Query Preq(String text)
{
return Preq(1F, text);
}
protected Query Preq(float boost, String text)
{
return Preq(boost, F, text);
}
protected Query Preq(String field, String text)
{
return Preq(1F, field, text);
}
protected Query Preq(float boost, String field, String text)
{
Query query = new PrefixQuery(new Term(field, text));
query.Boost = boost;
return query;
}
protected Query PqF(params String[] texts)
{
return PqF(1F, texts);
}
//protected Query pqF(String[] texts)
//{
// return pqF(1F, texts);
//}
protected Query PqF(float boost, params String[] texts)
{
return pqF(boost, 0, texts);
}
protected Query pqF(float boost, int slop, params String[] texts)
{
return Pq(boost, slop, F, texts);
}
protected Query Pq(String field, params String[] texts)
{
return Pq(1F, 0, field, texts);
}
protected Query Pq(float boost, String field, params String[] texts)
{
return Pq(boost, 0, field, texts);
}
protected Query Pq(float boost, int slop, String field, params String[] texts)
{
PhraseQuery query = new PhraseQuery();
foreach (String text in texts)
{
query.Add(new Term(field, text));
}
query.Boost = boost;
query.Slop = slop;
return query;
}
protected Query Dmq(params Query[] queries)
{
return Dmq(0.0F, queries);
}
protected Query Dmq(float tieBreakerMultiplier, params Query[] queries)
{
DisjunctionMaxQuery query = new DisjunctionMaxQuery(tieBreakerMultiplier);
foreach (Query q in queries)
{
query.Add(q);
}
return query;
}
protected void AssertCollectionQueries(Dictionary<Query, Query> actual, params Query[] expected)
{
Assert.AreEqual(expected.Length, actual.Count);
foreach (Query query in expected)
{
Assert.IsTrue(actual.ContainsKey(query));
}
}
class BigramAnalyzer : Analyzer
{
public override TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
{
return new BasicNGramTokenizer(reader);
}
}
    /// <summary>
    /// Overlapping character n-gram tokenizer (default n = 2). The input is
    /// split at delimiter characters into "snippets", and every n-character
    /// window of each snippet is emitted as a token with character offsets.
    /// </summary>
    class BasicNGramTokenizer : Tokenizer
    {
        public static int DEFAULT_N_SIZE = 2;
        public static String DEFAULT_DELIMITERS = " \t\n.,";
        private int n;              // gram length
        private String delimiters;  // characters that terminate a snippet
        private int startTerm;      // start index of the current gram within the snippet
        private int lenTerm;        // length of the current gram (can be < n for short snippets)
        private int startOffset;    // absolute start offset of the current gram in the input
        private int nextStartOffset; // absolute offset of the next unread character
        private int ch;             // last character read, or -1 once the input is exhausted
        private String snippet;     // current delimiter-free run of characters
        private StringBuilder snippetBuffer;
        private static int BUFFER_SIZE = 4096;
        private char[] charBuffer;  // read-ahead buffer over the underlying reader
        private int charBufferIndex; // next position to consume from charBuffer
        private int charBufferLen;  // number of valid characters in charBuffer
        /// <summary>Tokenizer with default gram size and delimiters.</summary>
        public BasicNGramTokenizer(System.IO.TextReader inReader): this(inReader, DEFAULT_N_SIZE)
        {
        }

        /// <summary>Tokenizer with the given gram size and default delimiters.</summary>
        public BasicNGramTokenizer(System.IO.TextReader inReader, int n): this(inReader, n, DEFAULT_DELIMITERS)
        {
        }

        /// <summary>Tokenizer with default gram size and the given delimiters.</summary>
        public BasicNGramTokenizer(System.IO.TextReader inReader, String delimiters) : this(inReader, DEFAULT_N_SIZE, delimiters)
        {
        }

        /// <summary>
        /// Primary constructor: records gram size and delimiter set, resets all
        /// scanning state and registers the term/offset attributes.
        /// </summary>
        public BasicNGramTokenizer(System.IO.TextReader inReader, int n, String delimiters) : base(inReader)
        {
            this.n = n;
            this.delimiters = delimiters;
            startTerm = 0;
            nextStartOffset = 0;
            snippet = null;
            snippetBuffer = new StringBuilder();
            charBuffer = new char[BUFFER_SIZE];
            charBufferIndex = BUFFER_SIZE; // forces a refill on the first read
            charBufferLen = 0;
            ch = 0;
            Init();
        }

        void Init()
        {
            // Attributes must be registered before the first IncrementToken call.
            termAtt = AddAttribute<ITermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        ITermAttribute termAtt = null;
        IOffsetAttribute offsetAtt = null;
        /// <summary>
        /// Advances to the next gram and fills the term/offset attributes.
        /// Returns false when the input is exhausted.
        /// </summary>
        public override bool IncrementToken()
        {
            if (!GetNextPartialSnippet())
                return false;
            ClearAttributes();
            // Term text is the [startTerm, startTerm + lenTerm) window of the snippet.
            termAtt.SetTermBuffer(snippet, startTerm, lenTerm);
            offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(startOffset + lenTerm));
            return true;
        }
        private int GetFinalOffset()
        {
            // nextStartOffset equals the total number of characters consumed so far.
            return nextStartOffset;
        }

        public override void End()
        {
            // Report the final offset once the token stream is exhausted.
            offsetAtt.SetOffset(GetFinalOffset(), GetFinalOffset());
        }
protected bool GetNextPartialSnippet()
{
if (snippet != null && snippet.Length >= startTerm + 1 + n)
{
startTerm++;
startOffset++;
lenTerm = n;
return true;
}
return GetNextSnippet();
}
        /// <summary>
        /// Reads the next delimiter-free run of characters into <c>snippet</c>.
        /// Returns false when no non-delimiter characters remain.
        /// </summary>
        protected bool GetNextSnippet()
        {
            startTerm = 0;
            startOffset = nextStartOffset;
            snippetBuffer.Remove(0, snippetBuffer.Length);
            while (true)
            {
                if (ch != -1)
                    ch = ReadCharFromBuffer();
                if (ch == -1) break;                // end of input
                else if (!IsDelimiter(ch))
                    snippetBuffer.Append((char)ch); // grow the current snippet
                else if (snippetBuffer.Length > 0)
                    break;                          // a delimiter ends a non-empty snippet
                else
                    startOffset++;                  // skip leading delimiters
            }
            if (snippetBuffer.Length == 0)
                return false;
            snippet = snippetBuffer.ToString();
            // A snippet shorter than n yields one gram of the snippet's full length.
            lenTerm = snippet.Length >= n ? n : snippet.Length;
            return true;
        }
        /// <summary>
        /// Returns the next character from the buffered reader, or -1 at end
        /// of input, and advances <c>nextStartOffset</c>.
        /// </summary>
        protected int ReadCharFromBuffer()
        {
            if (charBufferIndex >= charBufferLen)
            {
                // Refill; TextReader.Read returns 0 (not -1) at end of stream,
                // so <= 0 is the correct end-of-input test here.
                charBufferLen = input.Read(charBuffer,0,charBuffer.Length);
                if (charBufferLen <= 0)
                {
                    return -1;
                }
                charBufferIndex = 0;
            }
            int c = (int)charBuffer[charBufferIndex++];
            nextStartOffset++;
            return c;
        }
protected bool IsDelimiter(int c)
{
return delimiters.IndexOf(Convert.ToChar(c) ) >= 0;
}
}
protected void Make1d1fIndex(String value)
{
Make1dmfIndex( value );
}
protected void Make1d1fIndexB(String value)
{
Make1dmfIndexB( value );
}
protected void Make1dmfIndex(params String[] values)
{
Make1dmfIndex(analyzerW, values);
}
protected void Make1dmfIndexB(params String[] values)
{
Make1dmfIndex(analyzerB, values);
}
// make 1 doc with multi valued field
protected void Make1dmfIndex(Analyzer analyzer, params String[] values)
{
IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
foreach (String value in values)
doc.Add(new Field(F, value, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc);
writer.Close();
reader = IndexReader.Open(dir,true);
}
// make 1 doc with multi valued & not analyzed field
protected void Make1dmfIndexNA(String[] values)
{
IndexWriter writer = new IndexWriter(dir, analyzerK, true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
foreach (String value in values)
doc.Add(new Field(F, value, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc);
writer.Close();
reader = IndexReader.Open(dir, true);
}
    /// <summary>
    /// Indexes shortMVValues with the whitespace analyzer. The comment maps
    /// show character offsets (top rows) and token positions (bottom rows).
    /// </summary>
    protected void MakeIndexShortMV()
    {
        // 012345
        // "a b c"
        // 0 1 2

        // ""

        // 6789
        // "d e"
        // 3 4
        Make1dmfIndex(shortMVValues);
    }
    /// <summary>
    /// Indexes longMVValues with the whitespace analyzer. The comment maps
    /// show character offsets (top rows) and token positions (bottom rows);
    /// parenthesized positions are duplicate terms.
    /// </summary>
    protected void MakeIndexLongMV()
    {
        // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999
        // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
        // Followings are the examples of customizable parameters and actual examples of customization:
        // 0 1 2 3 4 5 6 7 8 9 10 11

        // 1 2
        // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122
        // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901
        // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically
        // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34
        Make1dmfIndex(longMVValues);
    }
    /// <summary>
    /// Indexes biMVValues (LUCENE-1448 data) with the bigram analyzer. The
    /// comment maps list each bigram and its token position; "*" marks the
    /// leading LF in each value.
    /// </summary>
    protected void MakeIndexLongMVB()
    {
        // "*" [] LF

        // 1111111111222222222233333333334444444444555555
        // 01234567890123456789012345678901234567890123456789012345
        // *Lucene/Solr does not require such additional hardware.
        // Lu 0 do 10 re 15 su 21 na 31
        // uc 1 oe 11 eq 16 uc 22 al 32
        // ce 2 es 12 qu 17 ch 23 ha 33
        // en 3 no 13 ui 18 ad 24 ar 34
        // ne 4 ot 14 ir 19 dd 25 rd 35
        // e/ 5 re 20 di 26 dw 36
        // /S 6 it 27 wa 37
        // So 7 ti 28 ar 38
        // ol 8 io 29 re 39
        // lr 9 on 30

        // 5555666666666677777777778888888888999999999
        // 6789012345678901234567890123456789012345678
        // *When you talk about processing speed, the
        // Wh 40 ab 48 es 56 th 65
        // he 41 bo 49 ss 57 he 66
        // en 42 ou 50 si 58
        // yo 43 ut 51 in 59
        // ou 44 pr 52 ng 60
        // ta 45 ro 53 sp 61
        // al 46 oc 54 pe 62
        // lk 47 ce 55 ee 63
        // ed 64
        Make1dmfIndexB(biMVValues);
    }
    /// <summary>
    /// Indexes strMVValues with the keyword analyzer (each value is one
    /// token). The comment maps show the character offsets of each value.
    /// </summary>
    protected void MakeIndexStrMV()
    {
        // 0123
        // "abc"

        // 34567
        // "defg"

        // 111
        // 789012
        // "hijkl"
        Make1dmfIndexNA(strMVValues);
    }
}
}