| Index: contrib/benchmark/conf/highlight-vs-vector-highlight.alg |
| =================================================================== |
| --- contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0) |
| +++ contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0) |
| @@ -0,0 +1,78 @@ |
| +#/** |
| +# * Licensed to the Apache Software Foundation (ASF) under one or more |
| +# * contributor license agreements. See the NOTICE file distributed with |
| +# * this work for additional information regarding copyright ownership. |
| +# * The ASF licenses this file to You under the Apache License, Version 2.0 |
| +# * (the "License"); you may not use this file except in compliance with |
| +# * the License. You may obtain a copy of the License at |
| +# * |
| +# * http://www.apache.org/licenses/LICENSE-2.0 |
| +# * |
| +# * Unless required by applicable law or agreed to in writing, software |
| +# * distributed under the License is distributed on an "AS IS" BASIS, |
| +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +# * See the License for the specific language governing permissions and |
| +# * limitations under the License. |
| +# */ |
| +# ------------------------------------------------------------------------------------- |
| + |
| +ram.flush.mb=flush:32:32 |
| +compound=cmpnd:true:false |
| + |
| +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer |
| +directory=FSDirectory |
| + |
| +doc.stored=true |
| +doc.tokenized=true |
| +doc.term.vector=true |
| +doc.term.vector.offsets=true |
| +doc.term.vector.positions=true |
| +log.step=2000 |
| + |
| +docs.dir=reuters-out |
| + |
| +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource |
| +docs.file=temp/enwiki-20070527-pages-articles.xml |
| + |
| +# Use LUCENE-1770 WikipediaQueryMaker |
| +query.maker=org.apache.lucene.benchmark.byTask.feeds.WikipediaQueryMaker |
| +#query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker |
| + |
| +# task at this depth or less would print when they start |
| +task.max.depth.log=2 |
| + |
| +log.queries=true |
| +# ------------------------------------------------------------------------------------- |
| +{ "Populate" |
| + CreateIndex |
| + { "MAddDocs" AddDoc } : 20000 |
| + Optimize |
| + CloseIndex |
| +} |
| +{ |
| + OpenReader |
| + { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100 |
| + CloseReader |
| +} |
| +{ |
| + "Rounds" |
| + |
| + ResetSystemSoft |
| + |
| + OpenReader |
| + { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200 |
| + CloseReader |
| + |
| + ResetSystemSoft |
| + |
| + OpenReader |
| + { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200 |
| + CloseReader |
| + |
| + RepSumByPref Search |
| + |
| + NewRound |
| +} : 4 |
| + |
| +RepSumByNameRound |
| +RepSumByName |
| Index: contrib/benchmark/conf/vector-highlight-profile.alg |
| =================================================================== |
| --- contrib/benchmark/conf/vector-highlight-profile.alg (revision 0) |
| +++ contrib/benchmark/conf/vector-highlight-profile.alg (revision 0) |
| @@ -0,0 +1,68 @@ |
| +#/** |
| +# * Licensed to the Apache Software Foundation (ASF) under one or more |
| +# * contributor license agreements. See the NOTICE file distributed with |
| +# * this work for additional information regarding copyright ownership. |
| +# * The ASF licenses this file to You under the Apache License, Version 2.0 |
| +# * (the "License"); you may not use this file except in compliance with |
| +# * the License. You may obtain a copy of the License at |
| +# * |
| +# * http://www.apache.org/licenses/LICENSE-2.0 |
| +# * |
| +# * Unless required by applicable law or agreed to in writing, software |
| +# * distributed under the License is distributed on an "AS IS" BASIS, |
| +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| +# * See the License for the specific language governing permissions and |
| +# * limitations under the License. |
| +# */ |
| +# ------------------------------------------------------------------------------------- |
| +# multi val params are iterated by NewRound's, added to reports, start with column name. |
| + |
| +ram.flush.mb=flush:32:32 |
| +compound=cmpnd:true:false |
| + |
| +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer |
| +directory=FSDirectory |
| + |
| +doc.stored=true |
| +doc.tokenized=true |
| +doc.term.vector=true |
| +doc.term.vector.offsets=true |
| +doc.term.vector.positions=true |
| +log.step=2000 |
| + |
| +docs.dir=reuters-out |
| + |
| +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource |
| + |
| +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker |
| + |
| +# task at this depth or less would print when they start |
| +task.max.depth.log=2 |
| + |
| +log.queries=true |
| +# ------------------------------------------------------------------------------------- |
| +{ "Populate" |
| + CreateIndex |
| + { "MAddDocs" AddDoc } : 20000 |
| + Optimize |
| + CloseIndex |
| + } |
| +{ "Rounds" |
| + |
| + ResetSystemSoft |
| + |
| + |
| + OpenReader |
| + { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000 |
| + |
| + CloseReader |
| + |
| + RepSumByPref MAddDocs |
| + |
| + NewRound |
| + |
| +} : 4 |
| + |
| +RepSumByNameRound |
| +RepSumByName |
| +RepSumByPrefRound MAddDocs |
| Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java |
| =================================================================== |
| --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0) |
| +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0) |
| @@ -0,0 +1,141 @@ |
| +package org.apache.lucene.benchmark.byTask.tasks; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.benchmark.byTask.PerfRunData; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; |
| +import org.apache.lucene.search.vectorhighlight.FieldQuery; |
| + |
| +import java.util.Set; |
| +import java.util.Collection; |
| +import java.util.HashSet; |
| +import java.util.Collections; |
| + |
| +/** |
| + * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter. |
| + * |
| + * <p>Note: This task reuses the reader if it is already open. |
| + * Otherwise a reader is opened at start and closed at the end. |
| + * </p> |
| + * |
| + * <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p> |
| + * <ul> |
| + * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li> |
| + * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li> |
| + * <li>maxFrags - The maximum number of fragments to score by the highlighter</li> |
| + * <li>fragSize - The length of fragments</li> |
| + * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li> |
| + * </ul> |
| + * Example: |
| + * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000 |
| + * </pre> |
| + * |
| + * Fields must be stored and term vector offsets and positions in order must be true for this task to work. |
| + * |
| + * <p>Other side effects: counts additional 1 (record) for each traversed hit, |
| + * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p> |
| + */ |
| +public class SearchTravRetVectorHighlightTask extends SearchTravTask { |
| + |
| + protected int numToHighlight = Integer.MAX_VALUE; |
| + protected int maxFrags = 2; |
| + protected int fragSize = 100; |
| + protected Set paramFields = Collections.EMPTY_SET; |
| + protected FastVectorHighlighter highlighter; |
| + |
| + public SearchTravRetVectorHighlightTask(PerfRunData runData) { |
| + super(runData); |
| + } |
| + |
| + public void setup() throws Exception { |
| + super.setup(); |
| + //check to make sure either the doc is being stored |
| + PerfRunData data = getRunData(); |
| + if (data.getConfig().get("doc.stored", false) == false){ |
| + throw new Exception("doc.stored must be set to true"); |
| + } |
| + if (data.getConfig().get("doc.term.vector.offsets", false) == false){ |
| + throw new Exception("doc.term.vector.offsets must be set to true"); |
| + } |
| + if (data.getConfig().get("doc.term.vector.positions", false) == false){ |
| + throw new Exception("doc.term.vector.positions must be set to true"); |
| + } |
| + } |
| + |
| + public boolean withRetrieve() { |
| + return true; |
| + } |
| + |
| + public int numToHighlight() { |
| + return numToHighlight; |
| + } |
| + |
| + protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ |
| + highlighter = new FastVectorHighlighter( false, false ); |
| + final FieldQuery fq = highlighter.getFieldQuery( q ); |
| + return new BenchmarkHighlighter(){ |
| + public int doHighlight(IndexReader reader, int doc, String field, |
| + Document document, Analyzer analyzer, String text) throws Exception { |
| + String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); |
| + return fragments != null ? fragments.length : 0; |
| + } |
| + }; |
| + } |
| + |
| + public int maxNumFragments() { |
| + return maxFrags; |
| + } |
| + |
| + protected Collection/*<String>*/ getFieldsToHighlight(Document document) { |
| + Collection result = super.getFieldsToHighlight(document); |
| + //if stored is false, then result will be empty, in which case just get all the param fields |
| + if (paramFields.isEmpty() == false && result.isEmpty() == false) { |
| + result.retainAll(paramFields); |
| + } else { |
| + result = paramFields; |
| + } |
| + return result; |
| + } |
| + |
| + public void setParams(String params) { |
| + String [] splits = params.split(","); |
| + for (int i = 0; i < splits.length; i++) { |
| + if (splits[i].startsWith("size[") == true){ |
| + traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); |
| + } else if (splits[i].startsWith("highlight[") == true){ |
| + numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); |
| + } else if (splits[i].startsWith("maxFrags[") == true){ |
| + maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); |
| + } else if (splits[i].startsWith("fragSize[") == true){ |
| + fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1)); |
| + } else if (splits[i].startsWith("fields[") == true){ |
| + paramFields = new HashSet(); |
| + String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); |
| + String [] fieldSplits = fieldNames.split(";"); |
| + for (int j = 0; j < fieldSplits.length; j++) { |
| + paramFields.add(fieldSplits[j]); |
| + } |
| + |
| + } |
| + } |
| + } |
| +} |
| \ No newline at end of file |
| |
| Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java |
| ___________________________________________________________________ |
| Name: svn:keywords |
| + Date Author Id Revision HeadURL |
| Name: svn:eol-style |
| + native |
| |
| Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java |
| =================================================================== |
| --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (revision 799981) |
| +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (working copy) |
| @@ -17,9 +17,20 @@ |
| * limitations under the License. |
| */ |
| |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.benchmark.byTask.PerfRunData; |
| import org.apache.lucene.document.Document; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.highlight.Highlighter; |
| +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; |
| +import org.apache.lucene.search.highlight.QueryScorer; |
| +import org.apache.lucene.search.highlight.SimpleHTMLFormatter; |
| +import org.apache.lucene.search.highlight.TextFragment; |
| +import org.apache.lucene.search.highlight.TokenSources; |
| |
| +import java.io.IOException; |
| import java.util.Set; |
| import java.util.Collection; |
| import java.util.HashSet; |
| @@ -57,7 +68,7 @@ |
| protected boolean mergeContiguous; |
| protected int maxFrags = 2; |
| protected Set paramFields = Collections.EMPTY_SET; |
| - |
| + protected Highlighter highlighter; |
| |
| public SearchTravRetHighlightTask(PerfRunData runData) { |
| super(runData); |
| @@ -79,7 +90,22 @@ |
| public int numToHighlight() { |
| return numToHighlight; |
| } |
| + |
| + protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ |
| + highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); |
| + return new BenchmarkHighlighter(){ |
| + public int doHighlight(IndexReader reader, int doc, String field, |
| + Document document, Analyzer analyzer, String text) throws Exception { |
| + TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer); |
| + TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); |
| + return frag != null ? frag.length : 0; |
| + } |
| + }; |
| + } |
| |
| + /** |
| + * @deprecated |
| + */ |
| public boolean isMergeContiguousFragments() { |
| return mergeContiguous; |
| } |
| Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java |
| =================================================================== |
| --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0) |
| +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0) |
| @@ -0,0 +1,27 @@ |
| +package org.apache.lucene.benchmark.byTask.tasks; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.index.IndexReader; |
| + |
| +public abstract class BenchmarkHighlighter { |
| + public abstract int doHighlight( IndexReader reader, int doc, String field, |
| + Document document, Analyzer analyzer, String text ) throws Exception ; |
| +} |
| |
| Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java |
| ___________________________________________________________________ |
| Name: svn:keywords |
| + Date Author Id Revision HeadURL |
| Name: svn:eol-style |
| + native |
| |
| Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java |
| =================================================================== |
| --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 799981) |
| +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy) |
| @@ -121,13 +121,12 @@ |
| boolean retrieve = withRetrieve(); |
| int numHighlight = Math.min(numToHighlight(), scoreDocs.length); |
| Analyzer analyzer = getRunData().getAnalyzer(); |
| - Highlighter highlighter = null; |
| + BenchmarkHighlighter highlighter = null; |
| int maxFrags = 1; |
| if (numHighlight > 0) { |
| - highlighter = getHighlighter(q); |
| + highlighter = getBenchmarkHighlighter(q); |
| maxFrags = maxNumFragments(); |
| } |
| - boolean merge = isMergeContiguousFragments(); |
| for (int m = 0; m < traversalSize; m++) { |
| int id = scoreDocs[m].doc; |
| res++; |
| @@ -139,8 +138,7 @@ |
| for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) { |
| String field = (String) iterator.next(); |
| String text = document.get(field); |
| - TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer); |
| - res += doHighlight(ts, text, highlighter, merge, maxFrags); |
| + res += highlighter.doHighlight(ir, id, field, document, analyzer, text); |
| } |
| } |
| } |
| @@ -241,9 +239,16 @@ |
| return 0; |
| } |
| |
| + /** |
| + * @deprecated Use {@link #getBenchmarkHighlighter(Query)} |
| + */ |
| protected Highlighter getHighlighter(Query q){ |
| return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); |
| } |
| + |
| + protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ |
| + return null; |
| + } |
| |
| /** |
| * |
| @@ -256,11 +261,15 @@ |
| /** |
| * |
| * @return true if the highlighter should merge contiguous fragments |
| + * @deprecated |
| */ |
| public boolean isMergeContiguousFragments(){ |
| return false; |
| } |
| |
| + /** |
| + * @deprecated |
| + */ |
| protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException, InvalidTokenOffsetsException { |
| TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments); |
| return frag != null ? frag.length : 0; |
| Index: contrib/benchmark/build.xml |
| =================================================================== |
| --- contrib/benchmark/build.xml (revision 799981) |
| +++ contrib/benchmark/build.xml (working copy) |
| @@ -105,6 +105,7 @@ |
| <pathelement path="${common.dir}/build/classes/java"/> |
| <pathelement path="${common.dir}/build/classes/demo"/> |
| <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/> |
| + <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/> |
| <fileset dir="lib"> |
| <include name="**/*.jar"/> |
| </fileset> |
| @@ -148,8 +149,13 @@ |
| <fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/> |
| </subant> |
| </target> |
| + <target name="compile-vector-highlighter"> |
| + <subant target="compile"> |
| + <fileset dir="${common.dir}/contrib/fast-vector-highlighter" includes="build.xml"/> |
| + </subant> |
| + </target> |
| |
| - <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/> |
| + <target name="init" depends="common.init,compile-demo,compile-highlighter,compile-vector-highlighter,check-files"/> |
| |
| <!-- make sure online collections (reuters) are first downloaded --> |
| <target name="test" depends="init,get-files"> |