blob: c2406637bd389fcb69642b21875c038aa640bd32 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import java.text.BreakIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.QueryBuilder;
public class LengthGoalBreakIteratorTest extends LuceneTestCase {
private static final String FIELD = "body";
private static final float[] ALIGNS = {0.f, 0.5f, 1.f};
// We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is
// not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence.
private final Analyzer analyzer =
new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase
// We do a '.' BreakIterator and test varying the length goal.
// 0 1
// 01234567890123456789
static final String CONTENT = "Aa bb. Cc dd. Ee ff";
static final String CONTENT2 = "Aa bb Cc dd X Ee ff Gg hh.";
static final String CONTENT3 = "Aa bbcc ddxyzee ffgg hh.";
public void testFragmentAlignmentConstructor() throws IOException {
BreakIterator baseBI = new CustomSeparatorBreakIterator('.');
// test fragmentAlignment validation
float[] valid_aligns = {0.f, 0.3333f, 0.5f, 0.99f, 1.f};
for (float alignment : valid_aligns) {
LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
}
float[] invalid_aligns = {-0.01f, -1.f, 1.5f, Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY};
for (float alignment : invalid_aligns) {
expectThrows(IllegalArgumentException.class, () -> {
LengthGoalBreakIterator.createClosestToLength(baseBI, 50, alignment);
});
}
// test backwards compatibility constructors
String backwardCompString = LengthGoalBreakIterator.createClosestToLength(baseBI, 50).toString();
assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
backwardCompString = LengthGoalBreakIterator.createMinLength(baseBI, 50).toString();
assertTrue(backwardCompString, backwardCompString.contains("fragAlign=0.0"));
}
public void testTargetLen() throws IOException {
// "goal" means target length goal to find closest break
// at first word:
Query query = query("aa");
assertEquals("almost two sent A",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 7, 0.f));
assertEquals("almost two sent B",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 15, 0.5f));
assertEquals("almost two sent C",
"<b>Aa</b> bb.", highlightClosestToLen(CONTENT, query, 64, 1.f));
assertEquals("barely two sent A",
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 8, 0.f));
assertEquals("barely two sent B",
"<b>Aa</b> bb. Cc dd.", highlightClosestToLen(CONTENT, query, 16, 0.5f));
assertEquals("long goal A",
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 14 + random().nextInt(20), 0.f));
assertEquals("long goal B",
"<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(CONTENT, query, 28 + random().nextInt(20), 0.5f));
// at some word not at start of passage
query = query("dd");
for (float align : ALIGNS) {
// alignment is not meaningful if fragsize is shorter than or closer to match-fragment boundaries
assertEquals("short goal " + align,
" Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, random().nextInt(4), align));
}
// preceding/following inclusion by alignment parameter
assertEquals("barely two sent A",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.f));
assertEquals("barely two sent B",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 11, 0.5f));
assertEquals("barely two sent C",
"Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 11, 1.f));
assertEquals("long goal A",
" Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.f));
assertEquals("long goal B",
"Aa bb. Cc <b>dd</b>. Ee ff", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 0.5f));
assertEquals("long goal C",
"Aa bb. Cc <b>dd</b>.", highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), 1.f));
query = query("ddxyzee");
assertEquals("test fragment search from the middle of the match; almost including",
"<b>ddxyzee</b> ", highlightClosestToLen(CONTENT3, query, 7, 0.5f, 1, ' '));
assertEquals("test fragment search from the middle of the match; barely including",
"bbcc <b>ddxyzee</b> ffgg ", highlightClosestToLen(CONTENT3, query, 14, 0.5f, 1, ' '));
}
public void testMinLen() throws IOException {
// minLen mode is simpler than targetLen... just test a few cases
Query query = query("dd");
assertEquals("almost two sent A",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 0, 0.f));
assertEquals("almost two sent B",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 1, 0.5f));
assertEquals("almost two sent C",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 5, 1.f));
assertEquals("barely two sent A",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 1, 0.f));
assertEquals("barely two sent B",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 2, 0.5f));
assertEquals("barely two sent C",
"Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 7, 1.f));
assertEquals("barely two sent D/a",
" Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 2, 0.55f));
assertEquals("barely two sent D/b",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 3, 0.55f));
assertEquals("barely two sent E/a",
" Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.5f));
assertEquals("barely two sent E/b",
"Aa bb. Cc <b>dd</b>. Ee ff", highlightMinLen(CONTENT, query, 10, 0.7f));
assertEquals("barely two sent E/c",
"Aa bb. Cc <b>dd</b>.", highlightMinLen(CONTENT, query, 9, 0.9f));
query = query("ddxyzee");
assertEquals("test fragment search from the middle of the match; almost including",
"<b>ddxyzee</b> ", highlightMinLen(CONTENT3, query, 7, 0.5f, ' '));
assertEquals("test fragment search from the middle of the match; barely including",
"bbcc <b>ddxyzee</b> ffgg ", highlightMinLen(CONTENT3, query, 8, 0.5f, ' '));
}
public void testMinLenPrecision() throws IOException {
Query query = query("x");
assertEquals("test absolute minimal length",
"<b>X</b> ", highlightMinLen(CONTENT2, query, 1, 0.5f, ' '));
assertEquals("test slightly above minimal length",
"dd <b>X</b> Ee ", highlightMinLen(CONTENT2, query, 4, 0.5f, ' '));
}
public void testDefaultSummaryTargetLen() throws IOException {
Query query = query("zz");
for (float align : ALIGNS) { // alignment is not used for creating default-summary
assertEquals("Aa bb.",
highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), align));
assertEquals("Aa bb. Cc dd.",
highlightClosestToLen(CONTENT, query, 12 + random().nextInt(4), align));
assertEquals("Aa bb. Cc dd. Ee ff",
highlightClosestToLen(CONTENT, query, 17 + random().nextInt(20), align));
}
assertEquals("Aa bb. Cc dd.",
highlightClosestToLen(CONTENT, query, 6 + random().nextInt(4), 0.f, 2));
}
private Query query(String qStr) {
return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr);
}
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
return highlightClosestToLen(content, query, lengthGoal, fragAlign, 1);
}
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages) throws IOException {
return highlightClosestToLen(content, query, lengthGoal, fragAlign, maxPassages, '.');
}
private String highlightClosestToLen(String content, Query query, int lengthGoal, float fragAlign, int maxPassages, char separator) throws IOException {
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
return highlighter.highlightWithoutSearcher(FIELD, query, content, maxPassages).toString();
}
private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign) throws IOException {
return highlightMinLen(content, query, lengthGoal, fragAlign, '.');
}
private String highlightMinLen(String content, Query query, int lengthGoal, float fragAlign, char separator) throws IOException {
// differs from above only by "createMinLength"
UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator(separator), lengthGoal, fragAlign));
return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
}
}