blob: 747e00bd609daf4dcd6c08cb2265ff2cc7f118af [file] [log] [blame]
Index: contrib/benchmark/conf/highlight-vs-vector-highlight.alg
===================================================================
--- contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0)
+++ contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0)
@@ -0,0 +1,78 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=true
+doc.term.vector.offsets=true
+doc.term.vector.positions=true
+log.step=2000
+
+docs.dir=reuters-out
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml
+
+# Use LUCENE-1770 WikipediaQueryMaker
+query.maker=org.apache.lucene.benchmark.byTask.feeds.WikipediaQueryMaker
+#query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# tasks at this depth or shallower print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+ CreateIndex
+ { "MAddDocs" AddDoc } : 20000
+ Optimize
+ CloseIndex
+}
+{
+ OpenReader
+ { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
+ CloseReader
+}
+{
+ "Rounds"
+
+ ResetSystemSoft
+
+ OpenReader
+ { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
+ CloseReader
+
+ ResetSystemSoft
+
+ OpenReader
+ { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
+ CloseReader
+
+ RepSumByPref Search
+
+ NewRound
+} : 4
+
+RepSumByNameRound
+RepSumByName
Index: contrib/benchmark/conf/vector-highlight-profile.alg
===================================================================
--- contrib/benchmark/conf/vector-highlight-profile.alg (revision 0)
+++ contrib/benchmark/conf/vector-highlight-profile.alg (revision 0)
@@ -0,0 +1,68 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# Multi-valued params (e.g. flush:32:32) start with a column name, are iterated by NewRound, and are added to reports.
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=true
+doc.term.vector.offsets=true
+doc.term.vector.positions=true
+log.step=2000
+
+docs.dir=reuters-out
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# tasks at this depth or shallower print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+ CreateIndex
+ { "MAddDocs" AddDoc } : 20000
+ Optimize
+ CloseIndex
+ }
+{ "Rounds"
+
+ ResetSystemSoft
+
+
+ OpenReader
+ { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
+
+ CloseReader
+
+ RepSumByPref MAddDocs
+
+ NewRound
+
+} : 4
+
+RepSumByNameRound
+RepSumByName
+RepSumByPrefRound MAddDocs
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0)
@@ -0,0 +1,141 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
+import org.apache.lucene.search.vectorhighlight.FieldQuery;
+
+import java.util.Set;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Collections;
+
+/**
+ * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
+ *
+ * <p>Note: This task reuses the reader if it is already open.
+ * Otherwise a reader is opened at start and closed at the end.
+ * </p>
+ *
+ * <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],fragSize[&lt;int&gt;],fields[name1;name2;...]</p>
+ * <ul>
+ * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
+ * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
+ * <li>maxFrags - The maximum number of fragments to score by the highlighter. Default is 2</li>
+ * <li>fragSize - The length of fragments. Default is 100</li>
+ * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
+ * </ul>
+ * Example:
+ * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
+ * </pre>
+ *
+ * Fields must be stored, and term vector offsets and positions must both be enabled, for this task to work.
+ *
+ * <p>Other side effects: counts additional 1 (record) for each traversed hit,
+ * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
+ */
+public class SearchTravRetVectorHighlightTask extends SearchTravTask {
+
+ protected int numToHighlight = Integer.MAX_VALUE; // overridden by highlight[..]; MAX_VALUE means no cap from this task
+ protected int maxFrags = 2; // overridden by maxFrags[..]
+ protected int fragSize = 100; // overridden by fragSize[..]; handed to getBestFragments
+ protected Set paramFields = Collections.EMPTY_SET; // replaced by a fresh HashSet when fields[..] is given
+ protected FastVectorHighlighter highlighter; // (re)created on every getBenchmarkHighlighter call
+
+ public SearchTravRetVectorHighlightTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ public void setup() throws Exception {
+ super.setup();
+ //fail fast unless docs are stored and term vector offsets and positions are all enabled in the config
+ PerfRunData data = getRunData();
+ if (data.getConfig().get("doc.stored", false) == false){
+ throw new Exception("doc.stored must be set to true");
+ }
+ if (data.getConfig().get("doc.term.vector.offsets", false) == false){
+ throw new Exception("doc.term.vector.offsets must be set to true");
+ }
+ if (data.getConfig().get("doc.term.vector.positions", false) == false){
+ throw new Exception("doc.term.vector.positions must be set to true");
+ }
+ }
+
+ public boolean withRetrieve() {
+ return true;
+ }
+
+ public int numToHighlight() {
+ return numToHighlight;
+ }
+
+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
+ highlighter = new FastVectorHighlighter( false, false ); // NOTE(review): flags are presumably phraseHighlight/fieldMatch -- confirm against the FastVectorHighlighter API
+ final FieldQuery fq = highlighter.getFieldQuery( q ); // built once per query and shared by every doHighlight call below
+ return new BenchmarkHighlighter(){
+ public int doHighlight(IndexReader reader, int doc, String field,
+ Document document, Analyzer analyzer, String text) throws Exception {
+ String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); // document, analyzer and text are not used by this implementation
+ return fragments != null ? fragments.length : 0;
+ }
+ };
+ }
+
+ public int maxNumFragments() {
+ return maxFrags;
+ }
+
+ protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
+ Collection result = super.getFieldsToHighlight(document);
+ //if stored is false, then result will be empty, in which case just get all the param fields. NOTE(review): an empty paramFields (no fields[..] param) also takes the else branch and returns the empty set -- confirm against the class javadoc
+ if (paramFields.isEmpty() == false && result.isEmpty() == false) {
+ result.retainAll(paramFields);
+ } else {
+ result = paramFields;
+ }
+ return result;
+ }
+
+ public void setParams(String params) {
+ String [] splits = params.split(","); // params are comma separated, hence the ';' separator inside fields[..]
+ for (int i = 0; i < splits.length; i++) {
+ if (splits[i].startsWith("size[") == true){
+ traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); // length() - 1 drops the last character (expected to be ']'); value is parsed as float, then truncated
+ } else if (splits[i].startsWith("highlight[") == true){
+ numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
+ } else if (splits[i].startsWith("maxFrags[") == true){
+ maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
+ } else if (splits[i].startsWith("fragSize[") == true){
+ fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
+ } else if (splits[i].startsWith("fields[") == true){
+ paramFields = new HashSet();
+ String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
+ String [] fieldSplits = fieldNames.split(";");
+ for (int j = 0; j < fieldSplits.length; j++) {
+ paramFields.add(fieldSplits[j]);
+ }
+
+ } // any other param prefix falls through here and is ignored
+ }
+ }
+}
\ No newline at end of file
Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision HeadURL
Name: svn:eol-style
+ native
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (revision 799981)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (working copy)
@@ -17,9 +17,20 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.TextFragment;
+import org.apache.lucene.search.highlight.TokenSources;
+import java.io.IOException;
import java.util.Set;
import java.util.Collection;
import java.util.HashSet;
@@ -57,7 +68,7 @@
protected boolean mergeContiguous;
protected int maxFrags = 2;
protected Set paramFields = Collections.EMPTY_SET;
-
+ protected Highlighter highlighter;
public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData);
@@ -79,7 +90,22 @@
public int numToHighlight() {
return numToHighlight;
}
+
+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
+ highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
+ return new BenchmarkHighlighter(){
+ public int doHighlight(IndexReader reader, int doc, String field,
+ Document document, Analyzer analyzer, String text) throws Exception {
+ TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
+ TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
+ return frag != null ? frag.length : 0;
+ }
+ };
+ }
+ /**
+ * @deprecated Use {@link #getBenchmarkHighlighter(Query)}, whose highlighter applies mergeContiguous itself
+ */
public boolean isMergeContiguousFragments() {
return mergeContiguous;
}
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0)
@@ -0,0 +1,27 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+
+public abstract class BenchmarkHighlighter { // highlighting strategy that a task supplies to ReadTask through getBenchmarkHighlighter(Query)
+ public abstract int doHighlight( IndexReader reader, int doc, String field, // invoked once per field to highlight for each highlighted hit
+ Document document, Analyzer analyzer, String text ) throws Exception ; // the returned int is added to the task's count; the implementations in this patch return the number of fragments
+}
Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
___________________________________________________________________
Name: svn:keywords
+ Date Author Id Revision HeadURL
Name: svn:eol-style
+ native
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 799981)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy)
@@ -121,13 +121,12 @@
boolean retrieve = withRetrieve();
int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
Analyzer analyzer = getRunData().getAnalyzer();
- Highlighter highlighter = null;
+ BenchmarkHighlighter highlighter = null;
int maxFrags = 1;
if (numHighlight > 0) {
- highlighter = getHighlighter(q);
+ highlighter = getBenchmarkHighlighter(q);
maxFrags = maxNumFragments();
}
- boolean merge = isMergeContiguousFragments();
for (int m = 0; m < traversalSize; m++) {
int id = scoreDocs[m].doc;
res++;
@@ -139,8 +138,7 @@
for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
String field = (String) iterator.next();
String text = document.get(field);
- TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
- res += doHighlight(ts, text, highlighter, merge, maxFrags);
+ res += highlighter.doHighlight(ir, id, field, document, analyzer, text);
}
}
}
@@ -241,9 +239,16 @@
return 0;
}
+ /**
+ * @deprecated Use {@link #getBenchmarkHighlighter(Query)}
+ */
protected Highlighter getHighlighter(Query q){
return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
}
+
+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
+ return null;
+ }
/**
*
@@ -256,11 +261,15 @@
/**
*
* @return true if the highlighter should merge contiguous fragments
+ * @deprecated Use {@link #getBenchmarkHighlighter(Query)}
*/
public boolean isMergeContiguousFragments(){
return false;
}
+ /**
+ * @deprecated Use {@link #getBenchmarkHighlighter(Query)} and {@link BenchmarkHighlighter#doHighlight}
+ */
protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException, InvalidTokenOffsetsException {
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
return frag != null ? frag.length : 0;
Index: contrib/benchmark/build.xml
===================================================================
--- contrib/benchmark/build.xml (revision 799981)
+++ contrib/benchmark/build.xml (working copy)
@@ -105,6 +105,7 @@
<pathelement path="${common.dir}/build/classes/java"/>
<pathelement path="${common.dir}/build/classes/demo"/>
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
+ <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
<fileset dir="lib">
<include name="**/*.jar"/>
</fileset>
@@ -148,8 +149,13 @@
<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
</subant>
</target>
+ <target name="compile-vector-highlighter">
+ <subant target="compile">
+ <fileset dir="${common.dir}/contrib/fast-vector-highlighter" includes="build.xml"/>
+ </subant>
+ </target>
- <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>
+ <target name="init" depends="common.init,compile-demo,compile-highlighter,compile-vector-highlighter,check-files"/>
<!-- make sure online collections (reuters) are first downloaded -->
<target name="test" depends="init,get-files">