docs/attachments/LUCENE-1773/LUCENE-1773.patch - lucene-jira-archive - Git at Google

 Index: contrib/benchmark/conf/highlight-vs-vector-highlight.alg
 ===================================================================
 --- contrib/benchmark/conf/highlight-vs-vector-highlight.alg	(revision 0)
 +++ contrib/benchmark/conf/highlight-vs-vector-highlight.alg	(revision 0)
 @@ -0,0 +1,78 @@
 +#/**
 +# * Licensed to the Apache Software Foundation (ASF) under one or more
 +# * contributor license agreements.  See the NOTICE file distributed with
 +# * this work for additional information regarding copyright ownership.
 +# * The ASF licenses this file to You under the Apache License, Version 2.0
 +# * (the "License"); you may not use this file except in compliance with
 +# * the License.  You may obtain a copy of the License at
 +# *
 +# *     http://www.apache.org/licenses/LICENSE-2.0
 +# *
 +# * Unless required by applicable law or agreed to in writing, software
 +# * distributed under the License is distributed on an "AS IS" BASIS,
 +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# * See the License for the specific language governing permissions and
 +# * limitations under the License.
 +# */
 +# -------------------------------------------------------------------------------------
 +
 +ram.flush.mb=flush:32:32
 +compound=cmpnd:true:false
 +
 +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 +directory=FSDirectory
 +
 +doc.stored=true
 +doc.tokenized=true
 +doc.term.vector=true
 +doc.term.vector.offsets=true
 +doc.term.vector.positions=true
 +log.step=2000
 +
 +docs.dir=reuters-out
 +
 +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
 +docs.file=temp/enwiki-20070527-pages-articles.xml
 +
 +# Use LUCENE-1770 WikipediaQueryMaker
 +query.maker=org.apache.lucene.benchmark.byTask.feeds.WikipediaQueryMaker
 +#query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 +
 +# tasks at this depth or less print a message when they start
 +task.max.depth.log=2
 +
 +log.queries=true
 +# -------------------------------------------------------------------------------------
 +{ "Populate"
 +        CreateIndex
 +        { "MAddDocs" AddDoc } : 20000
 +        Optimize
 +        CloseIndex
 +}
 +{
 +        OpenReader
 +          { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
 +        CloseReader
 +}
 +{
 +	"Rounds"
 +
 +        ResetSystemSoft
 +
 +        OpenReader
 +          { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
 +        CloseReader
 +
 +        ResetSystemSoft
 +
 +        OpenReader
 +          { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
 +        CloseReader
 +
 +        RepSumByPref Search
 +
 +        NewRound
 +} : 4
 +
 +RepSumByNameRound
 +RepSumByName
 Index: contrib/benchmark/conf/vector-highlight-profile.alg
 ===================================================================
 --- contrib/benchmark/conf/vector-highlight-profile.alg	(revision 0)
 +++ contrib/benchmark/conf/vector-highlight-profile.alg	(revision 0)
 @@ -0,0 +1,68 @@
 +#/**
 +# * Licensed to the Apache Software Foundation (ASF) under one or more
 +# * contributor license agreements.  See the NOTICE file distributed with
 +# * this work for additional information regarding copyright ownership.
 +# * The ASF licenses this file to You under the Apache License, Version 2.0
 +# * (the "License"); you may not use this file except in compliance with
 +# * the License.  You may obtain a copy of the License at
 +# *
 +# *     http://www.apache.org/licenses/LICENSE-2.0
 +# *
 +# * Unless required by applicable law or agreed to in writing, software
 +# * distributed under the License is distributed on an "AS IS" BASIS,
 +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# * See the License for the specific language governing permissions and
 +# * limitations under the License.
 +# */
 +# -------------------------------------------------------------------------------------
 +# multi val params are iterated by NewRound's, added to reports, start with column name.
 +
 +ram.flush.mb=flush:32:32
 +compound=cmpnd:true:false
 +
 +analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 +directory=FSDirectory
 +
 +doc.stored=true
 +doc.tokenized=true
 +doc.term.vector=true
 +doc.term.vector.offsets=true
 +doc.term.vector.positions=true
 +log.step=2000
 +
 +docs.dir=reuters-out
 +
 +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 +
 +query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 +
 +# tasks at this depth or less print a message when they start
 +task.max.depth.log=2
 +
 +log.queries=true
 +# -------------------------------------------------------------------------------------
 +{ "Populate"
 +        CreateIndex
 +        { "MAddDocs" AddDoc } : 20000
 +        Optimize
 +        CloseIndex
 +    }
 +{ "Rounds"
 +
 +    ResetSystemSoft
 +
 +
 +    OpenReader
 +      { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
 +
 +    CloseReader
 +
 +    RepSumByPref MAddDocs
 +
 +    NewRound
 +
 +} : 4
 +
 +RepSumByNameRound
 +RepSumByName
 +RepSumByPrefRound MAddDocs
 Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
 ===================================================================
 --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java	(revision 0)
 +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java	(revision 0)
 @@ -0,0 +1,141 @@
 +package org.apache.lucene.benchmark.byTask.tasks;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import org.apache.lucene.analysis.Analyzer;
 +import org.apache.lucene.benchmark.byTask.PerfRunData;
 +import org.apache.lucene.document.Document;
 +import org.apache.lucene.index.IndexReader;
 +import org.apache.lucene.search.Query;
 +import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
 +import org.apache.lucene.search.vectorhighlight.FieldQuery;
 +
 +import java.util.Set;
 +import java.util.Collection;
 +import java.util.HashSet;
 +import java.util.Collections;
 +
 +/**
 + * Search and Traverse and Retrieve docs task.  Highlight the fields in the retrieved documents by using FastVectorHighlighter.
 + *
 + * <p>Note: This task reuses the reader if it is already open.
 + * Otherwise a reader is opened at start and closed at the end.
 + * </p>
 + *
 + * <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],fragSize[&lt;int&gt;],fields[name1;name2;...]</p>
 + * <ul>
 + * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
 + * <li>highlight - The number of the hits to highlight.  Will always be less than or equal to traversal size.  Default is Integer.MAX_VALUE (i.e. hits.length())</li>
 + * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
 + * <li>fragSize - The length of fragments</li>
 + * <li>fields - The fields to highlight.  If not specified all fields will be highlighted (or at least attempted)</li>
 + * </ul>
 + * Example:
 + * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
 + * </pre>
 + *
 + * For this task to work, fields must be stored, and term vector offsets and positions must both be enabled.
 + *
 + * <p>Other side effects: counts additional 1 (record) for each traversed hit,
 + * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
 + */
 +public class SearchTravRetVectorHighlightTask extends SearchTravTask {
 +
 +  protected int numToHighlight = Integer.MAX_VALUE;
 +  protected int maxFrags = 2;
 +  protected int fragSize = 100;
 +  protected Set paramFields = Collections.EMPTY_SET;
 +  protected FastVectorHighlighter highlighter;
 +
 +  public SearchTravRetVectorHighlightTask(PerfRunData runData) {
 +    super(runData);
 +  }
 +
 +  public void setup() throws Exception {
 +    super.setup();
 +    //check that docs are stored and that term vector offsets and positions are enabled
 +    PerfRunData data = getRunData();
 +    if (data.getConfig().get("doc.stored", false) == false){
 +      throw new Exception("doc.stored must be set to true");
 +    }
 +    if (data.getConfig().get("doc.term.vector.offsets", false) == false){
 +      throw new Exception("doc.term.vector.offsets must be set to true");
 +    }
 +    if (data.getConfig().get("doc.term.vector.positions", false) == false){
 +      throw new Exception("doc.term.vector.positions must be set to true");
 +    }
 +  }
 +
 +  public boolean withRetrieve() {
 +    return true;
 +  }
 +
 +  public int numToHighlight() {
 +    return numToHighlight;
 +  }
 +
 +  protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
 +    highlighter = new FastVectorHighlighter( false, false );
 +    final FieldQuery fq = highlighter.getFieldQuery( q );
 +    return new BenchmarkHighlighter(){
 +      public int doHighlight(IndexReader reader, int doc, String field,
 +          Document document, Analyzer analyzer, String text) throws Exception {
 +        String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
 +        return fragments != null ? fragments.length : 0;
 +      }
 +    };
 +  }
 +
 +  public int maxNumFragments() {
 +    return maxFrags;
 +  }
 +
 +  protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
 +    Collection result = super.getFieldsToHighlight(document);
 +    //if stored is false, then result will be empty, in which case just get all the param fields
 +    if (paramFields.isEmpty() == false && result.isEmpty() == false) {
 +      result.retainAll(paramFields);
 +    } else {
 +      result = paramFields;
 +    }
 +    return result;
 +  }
 +
 +  public void setParams(String params) {
 +    String [] splits = params.split(",");
 +    for (int i = 0; i < splits.length; i++) {
 +      if (splits[i].startsWith("size[") == true){
 +        traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
 +      } else if (splits[i].startsWith("highlight[") == true){
 +        numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
 +      } else if (splits[i].startsWith("maxFrags[") == true){
 +        maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
 +      } else if (splits[i].startsWith("fragSize[") == true){
 +        fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
 +      } else if (splits[i].startsWith("fields[") == true){
 +        paramFields = new HashSet();
 +        String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
 +        String [] fieldSplits = fieldNames.split(";");
 +        for (int j = 0; j < fieldSplits.length; j++) {
 +          paramFields.add(fieldSplits[j]);
 +        }
 +
 +      }
 +    }
 +  }
 +}
 \ No newline at end of file

 Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
 ___________________________________________________________________
 Name: svn:keywords
    + Date Author Id Revision HeadURL
 Name: svn:eol-style
    + native

 Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
 ===================================================================
 --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java	(revision 799981)
 +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java	(working copy)
 @@ -17,9 +17,20 @@
   * limitations under the License.
   */

 +import org.apache.lucene.analysis.Analyzer;
 +import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.benchmark.byTask.PerfRunData;
  import org.apache.lucene.document.Document;
 +import org.apache.lucene.index.IndexReader;
 +import org.apache.lucene.search.Query;
 +import org.apache.lucene.search.highlight.Highlighter;
 +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
 +import org.apache.lucene.search.highlight.QueryScorer;
 +import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 +import org.apache.lucene.search.highlight.TextFragment;
 +import org.apache.lucene.search.highlight.TokenSources;

 +import java.io.IOException;
  import java.util.Set;
  import java.util.Collection;
  import java.util.HashSet;
 @@ -57,7 +68,7 @@
    protected boolean mergeContiguous;
    protected int maxFrags = 2;
    protected Set paramFields = Collections.EMPTY_SET;
 -
 +  protected Highlighter highlighter;

    public SearchTravRetHighlightTask(PerfRunData runData) {
      super(runData);
 @@ -79,7 +90,22 @@
    public int numToHighlight() {
      return numToHighlight;
    }
 +
 +  protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
 +    highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
 +    return new BenchmarkHighlighter(){
 +      public int doHighlight(IndexReader reader, int doc, String field,
 +          Document document, Analyzer analyzer, String text) throws Exception {
 +        TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
 +        TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
 +        return frag != null ? frag.length : 0;
 +      }
 +    };
 +  }

 +  /**
 +   * @deprecated The merge setting is now applied inside {@link #getBenchmarkHighlighter(Query)}
 +   */
    public boolean isMergeContiguousFragments() {
      return mergeContiguous;
    }
 Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
 ===================================================================
 --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java	(revision 0)
 +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java	(revision 0)
 @@ -0,0 +1,27 @@
 +package org.apache.lucene.benchmark.byTask.tasks;
 +
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +
 +import org.apache.lucene.analysis.Analyzer;
 +import org.apache.lucene.document.Document;
 +import org.apache.lucene.index.IndexReader;
 +
 +public abstract class BenchmarkHighlighter {
 +  public abstract int doHighlight( IndexReader reader, int doc, String field,
 +      Document document, Analyzer analyzer, String text ) throws Exception ;
 +}

 Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
 ___________________________________________________________________
 Name: svn:keywords
    + Date Author Id Revision HeadURL
 Name: svn:eol-style
    + native

 Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
 ===================================================================
 --- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java	(revision 799981)
 +++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java	(working copy)
 @@ -121,13 +121,12 @@
              boolean retrieve = withRetrieve();
              int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
              Analyzer analyzer = getRunData().getAnalyzer();
 -            Highlighter highlighter = null;
 +            BenchmarkHighlighter highlighter = null;
              int maxFrags = 1;
              if (numHighlight > 0) {
 -              highlighter = getHighlighter(q);
 +              highlighter = getBenchmarkHighlighter(q);
                maxFrags = maxNumFragments();
              }
 -            boolean merge = isMergeContiguousFragments();
              for (int m = 0; m < traversalSize; m++) {
                int id = scoreDocs[m].doc;
                res++;
 @@ -139,8 +138,7 @@
                    for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
                      String field = (String) iterator.next();
                      String text = document.get(field);
 -                    TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
 -                    res += doHighlight(ts, text, highlighter, merge, maxFrags);
 +                    res += highlighter.doHighlight(ir, id, field, document, analyzer, text);
                    }
                  }
                }
 @@ -241,9 +239,16 @@
      return 0;
    }

 +  /**
 +   * @deprecated Use {@link #getBenchmarkHighlighter(Query)}
 +   */
    protected Highlighter getHighlighter(Query q){
      return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
    }
 +
 +  protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
 +    return null;
 +  }

    /**
     *
 @@ -256,11 +261,15 @@
    /**
     *
     * @return true if the highlighter should merge contiguous fragments
 +   * @deprecated Merging is now handled by the highlighter returned from {@link #getBenchmarkHighlighter(Query)}
     */
    public boolean isMergeContiguousFragments(){
      return false;
    }

 +  /**
 +   * @deprecated Use {@link #getBenchmarkHighlighter(Query)} and {@link BenchmarkHighlighter#doHighlight}
 +   */
    protected int doHighlight(TokenStream ts, String text,  Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException, InvalidTokenOffsetsException {
      TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
      return frag != null ? frag.length : 0;
 Index: contrib/benchmark/build.xml
 ===================================================================
 --- contrib/benchmark/build.xml	(revision 799981)
 +++ contrib/benchmark/build.xml	(working copy)
 @@ -105,6 +105,7 @@
          <pathelement path="${common.dir}/build/classes/java"/>
          <pathelement path="${common.dir}/build/classes/demo"/>
          <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
 +        <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
      	<fileset dir="lib">
      		<include name="**/*.jar"/>
      	</fileset>
 @@ -148,8 +149,13 @@
           <fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
        </subant>
      </target>
 +    <target name="compile-vector-highlighter">
 +      <subant target="compile">
 +         <fileset dir="${common.dir}/contrib/fast-vector-highlighter" includes="build.xml"/>
 +      </subant>
 +    </target>

 -    <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>
 +    <target name="init" depends="common.init,compile-demo,compile-highlighter,compile-vector-highlighter,check-files"/>

      <!-- make sure online collections (reuters) are first downloaded -->
      <target name="test" depends="init,get-files">
	Index: contrib/benchmark/conf/highlight-vs-vector-highlight.alg
	===================================================================
	--- contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0)
	+++ contrib/benchmark/conf/highlight-vs-vector-highlight.alg (revision 0)
	@@ -0,0 +1,78 @@
	+#/**
	+# * Licensed to the Apache Software Foundation (ASF) under one or more
	+# * contributor license agreements. See the NOTICE file distributed with
	+# * this work for additional information regarding copyright ownership.
	+# * The ASF licenses this file to You under the Apache License, Version 2.0
	+# * (the "License"); you may not use this file except in compliance with
	+# * the License. You may obtain a copy of the License at
	+# *
	+# * http://www.apache.org/licenses/LICENSE-2.0
	+# *
	+# * Unless required by applicable law or agreed to in writing, software
	+# * distributed under the License is distributed on an "AS IS" BASIS,
	+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# * See the License for the specific language governing permissions and
	+# * limitations under the License.
	+# */
	+# -------------------------------------------------------------------------------------
	+
	+ram.flush.mb=flush:32:32
	+compound=cmpnd:true:false
	+
	+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
	+directory=FSDirectory
	+
	+doc.stored=true
	+doc.tokenized=true
	+doc.term.vector=true
	+doc.term.vector.offsets=true
	+doc.term.vector.positions=true
	+log.step=2000
	+
	+docs.dir=reuters-out
	+
	+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
	+docs.file=temp/enwiki-20070527-pages-articles.xml
	+
	+# Use LUCENE-1770 WikipediaQueryMaker
	+query.maker=org.apache.lucene.benchmark.byTask.feeds.WikipediaQueryMaker
	+#query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
	+
	+# tasks at this depth or less print a message when they start
	+task.max.depth.log=2
	+
	+log.queries=true
	+# -------------------------------------------------------------------------------------
	+{ "Populate"
	+ CreateIndex
	+ { "MAddDocs" AddDoc } : 20000
	+ Optimize
	+ CloseIndex
	+}
	+{
	+ OpenReader
	+ { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
	+ CloseReader
	+}
	+{
	+ "Rounds"
	+
	+ ResetSystemSoft
	+
	+ OpenReader
	+ { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
	+ CloseReader
	+
	+ ResetSystemSoft
	+
	+ OpenReader
	+ { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
	+ CloseReader
	+
	+ RepSumByPref Search
	+
	+ NewRound
	+} : 4
	+
	+RepSumByNameRound
	+RepSumByName
	Index: contrib/benchmark/conf/vector-highlight-profile.alg
	===================================================================
	--- contrib/benchmark/conf/vector-highlight-profile.alg (revision 0)
	+++ contrib/benchmark/conf/vector-highlight-profile.alg (revision 0)
	@@ -0,0 +1,68 @@
	+#/**
	+# * Licensed to the Apache Software Foundation (ASF) under one or more
	+# * contributor license agreements. See the NOTICE file distributed with
	+# * this work for additional information regarding copyright ownership.
	+# * The ASF licenses this file to You under the Apache License, Version 2.0
	+# * (the "License"); you may not use this file except in compliance with
	+# * the License. You may obtain a copy of the License at
	+# *
	+# * http://www.apache.org/licenses/LICENSE-2.0
	+# *
	+# * Unless required by applicable law or agreed to in writing, software
	+# * distributed under the License is distributed on an "AS IS" BASIS,
	+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# * See the License for the specific language governing permissions and
	+# * limitations under the License.
	+# */
	+# -------------------------------------------------------------------------------------
	+# multi val params are iterated by NewRound's, added to reports, start with column name.
	+
	+ram.flush.mb=flush:32:32
	+compound=cmpnd:true:false
	+
	+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
	+directory=FSDirectory
	+
	+doc.stored=true
	+doc.tokenized=true
	+doc.term.vector=true
	+doc.term.vector.offsets=true
	+doc.term.vector.positions=true
	+log.step=2000
	+
	+docs.dir=reuters-out
	+
	+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
	+
	+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
	+
	+# tasks at this depth or less print a message when they start
	+task.max.depth.log=2
	+
	+log.queries=true
	+# -------------------------------------------------------------------------------------
	+{ "Populate"
	+ CreateIndex
	+ { "MAddDocs" AddDoc } : 20000
	+ Optimize
	+ CloseIndex
	+ }
	+{ "Rounds"
	+
	+ ResetSystemSoft
	+
	+
	+ OpenReader
	+ { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
	+
	+ CloseReader
	+
	+ RepSumByPref MAddDocs
	+
	+ NewRound
	+
	+} : 4
	+
	+RepSumByNameRound
	+RepSumByName
	+RepSumByPrefRound MAddDocs
	Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
	===================================================================
	--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0)
	+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java (revision 0)
	@@ -0,0 +1,141 @@
	+package org.apache.lucene.benchmark.byTask.tasks;
	+
	+/**
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+
	+import org.apache.lucene.analysis.Analyzer;
	+import org.apache.lucene.benchmark.byTask.PerfRunData;
	+import org.apache.lucene.document.Document;
	+import org.apache.lucene.index.IndexReader;
	+import org.apache.lucene.search.Query;
	+import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
	+import org.apache.lucene.search.vectorhighlight.FieldQuery;
	+
	+import java.util.Set;
	+import java.util.Collection;
	+import java.util.HashSet;
	+import java.util.Collections;
	+
	+/**
	+ * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
	+ *
	+ * <p>Note: This task reuses the reader if it is already open.
	+ * Otherwise a reader is opened at start and closed at the end.
	+ * </p>
	+ *
	+ * <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],fragSize[&lt;int&gt;],fields[name1;name2;...]</p>
	+ * <ul>
	+ * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
	+ * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
	+ * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
	+ * <li>fragSize - The length of fragments</li>
	+ * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
	+ * </ul>
	+ * Example:
	+ * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
	+ * </pre>
	+ *
	+ * For this task to work, fields must be stored, and term vector offsets and positions must both be enabled.
	+ *
	+ * <p>Other side effects: counts additional 1 (record) for each traversed hit,
	+ * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
	+ */
	+public class SearchTravRetVectorHighlightTask extends SearchTravTask {
	+
	+ protected int numToHighlight = Integer.MAX_VALUE;
	+ protected int maxFrags = 2;
	+ protected int fragSize = 100;
	+ protected Set paramFields = Collections.EMPTY_SET;
	+ protected FastVectorHighlighter highlighter;
	+
	+ public SearchTravRetVectorHighlightTask(PerfRunData runData) {
	+ super(runData);
	+ }
	+
	+ public void setup() throws Exception {
	+ super.setup();
	+ //check that docs are stored and that term vector offsets and positions are enabled
	+ PerfRunData data = getRunData();
	+ if (data.getConfig().get("doc.stored", false) == false){
	+ throw new Exception("doc.stored must be set to true");
	+ }
	+ if (data.getConfig().get("doc.term.vector.offsets", false) == false){
	+ throw new Exception("doc.term.vector.offsets must be set to true");
	+ }
	+ if (data.getConfig().get("doc.term.vector.positions", false) == false){
	+ throw new Exception("doc.term.vector.positions must be set to true");
	+ }
	+ }
	+
	+ public boolean withRetrieve() {
	+ return true;
	+ }
	+
	+ public int numToHighlight() {
	+ return numToHighlight;
	+ }
	+
	+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
	+ highlighter = new FastVectorHighlighter( false, false );
	+ final FieldQuery fq = highlighter.getFieldQuery( q );
	+ return new BenchmarkHighlighter(){
	+ public int doHighlight(IndexReader reader, int doc, String field,
	+ Document document, Analyzer analyzer, String text) throws Exception {
	+ String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
	+ return fragments != null ? fragments.length : 0;
	+ }
	+ };
	+ }
	+
	+ public int maxNumFragments() {
	+ return maxFrags;
	+ }
	+
	+ protected Collection/*<String>*/ getFieldsToHighlight(Document document) {
	+ Collection result = super.getFieldsToHighlight(document);
	+ //if stored is false, then result will be empty, in which case just get all the param fields
	+ if (paramFields.isEmpty() == false && result.isEmpty() == false) {
	+ result.retainAll(paramFields);
	+ } else {
	+ result = paramFields;
	+ }
	+ return result;
	+ }
	+
	+ public void setParams(String params) {
	+ String [] splits = params.split(",");
	+ for (int i = 0; i < splits.length; i++) {
	+ if (splits[i].startsWith("size[") == true){
	+ traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
	+ } else if (splits[i].startsWith("highlight[") == true){
	+ numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
	+ } else if (splits[i].startsWith("maxFrags[") == true){
	+ maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
	+ } else if (splits[i].startsWith("fragSize[") == true){
	+ fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
	+ } else if (splits[i].startsWith("fields[") == true){
	+ paramFields = new HashSet();
	+ String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
	+ String [] fieldSplits = fieldNames.split(";");
	+ for (int j = 0; j < fieldSplits.length; j++) {
	+ paramFields.add(fieldSplits[j]);
	+ }
	+
	+ }
	+ }
	+ }
	+}
	\ No newline at end of file

	Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
	___________________________________________________________________
	Name: svn:keywords
	+ Date Author Id Revision HeadURL
	Name: svn:eol-style
	+ native

	Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
	===================================================================
	--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (revision 799981)
	+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (working copy)
	@@ -17,9 +17,20 @@
	* limitations under the License.
	*/

	+import org.apache.lucene.analysis.Analyzer;
	+import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.benchmark.byTask.PerfRunData;
	import org.apache.lucene.document.Document;
	+import org.apache.lucene.index.IndexReader;
	+import org.apache.lucene.search.Query;
	+import org.apache.lucene.search.highlight.Highlighter;
	+import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
	+import org.apache.lucene.search.highlight.QueryScorer;
	+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
	+import org.apache.lucene.search.highlight.TextFragment;
	+import org.apache.lucene.search.highlight.TokenSources;

	+import java.io.IOException;
	import java.util.Set;
	import java.util.Collection;
	import java.util.HashSet;
	@@ -57,7 +68,7 @@
	protected boolean mergeContiguous;
	protected int maxFrags = 2;
	protected Set paramFields = Collections.EMPTY_SET;
	-
	+ protected Highlighter highlighter;

	public SearchTravRetHighlightTask(PerfRunData runData) {
	super(runData);
	@@ -79,7 +90,22 @@
	public int numToHighlight() {
	return numToHighlight;
	}
	+
	+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
	+ highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
	+ return new BenchmarkHighlighter(){
	+ public int doHighlight(IndexReader reader, int doc, String field,
	+ Document document, Analyzer analyzer, String text) throws Exception {
	+ TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
	+ TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
	+ return frag != null ? frag.length : 0;
	+ }
	+ };
	+ }

	+ /**
	+ * @deprecated
	+ */
	public boolean isMergeContiguousFragments() {
	return mergeContiguous;
	}
	Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
	===================================================================
	--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0)
	+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java (revision 0)
	@@ -0,0 +1,27 @@
	+package org.apache.lucene.benchmark.byTask.tasks;
	+
	+/**
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+
	+import org.apache.lucene.analysis.Analyzer;
	+import org.apache.lucene.document.Document;
	+import org.apache.lucene.index.IndexReader;
	+
	+public abstract class BenchmarkHighlighter {
	+ public abstract int doHighlight( IndexReader reader, int doc, String field,
	+ Document document, Analyzer analyzer, String text ) throws Exception ;
	+}

	Property changes on: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
	___________________________________________________________________
	Name: svn:keywords
	+ Date Author Id Revision HeadURL
	Name: svn:eol-style
	+ native

	Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
	===================================================================
	--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (revision 799981)
	+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (working copy)
	@@ -121,13 +121,12 @@
	boolean retrieve = withRetrieve();
	int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
	Analyzer analyzer = getRunData().getAnalyzer();
	- Highlighter highlighter = null;
	+ BenchmarkHighlighter highlighter = null;
	int maxFrags = 1;
	if (numHighlight > 0) {
	- highlighter = getHighlighter(q);
	+ highlighter = getBenchmarkHighlighter(q);
	maxFrags = maxNumFragments();
	}
	- boolean merge = isMergeContiguousFragments();
	for (int m = 0; m < traversalSize; m++) {
	int id = scoreDocs[m].doc;
	res++;
	@@ -139,8 +138,7 @@
	for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) {
	String field = (String) iterator.next();
	String text = document.get(field);
	- TokenStream ts = TokenSources.getAnyTokenStream(ir, id, field, document, analyzer);
	- res += doHighlight(ts, text, highlighter, merge, maxFrags);
	+ res += highlighter.doHighlight(ir, id, field, document, analyzer, text);
	}
	}
	}
	@@ -241,9 +239,16 @@
	return 0;
	}

	+ /**
	+ * @deprecated Use {@link #getBenchmarkHighlighter(Query)}
	+ */
	protected Highlighter getHighlighter(Query q){
	return new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
	}
	+
	+ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
	+ return null;
	+ }

	/**
	*
	@@ -256,11 +261,15 @@
	/**
	*
	* @return true if the highlighter should merge contiguous fragments
	+ * @deprecated
	*/
	public boolean isMergeContiguousFragments(){
	return false;
	}

	+ /**
	+ * @deprecated
	+ */
	protected int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException, InvalidTokenOffsetsException {
	TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFragments);
	return frag != null ? frag.length : 0;
	Index: contrib/benchmark/build.xml
	===================================================================
	--- contrib/benchmark/build.xml (revision 799981)
	+++ contrib/benchmark/build.xml (working copy)
	@@ -105,6 +105,7 @@
	<pathelement path="${common.dir}/build/classes/java"/>
	<pathelement path="${common.dir}/build/classes/demo"/>
	<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
	+ <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
	<fileset dir="lib">
	<include name="**/*.jar"/>
	</fileset>
	@@ -148,8 +149,13 @@
	<fileset dir="${common.dir}/contrib/highlighter" includes="build.xml"/>
	</subant>
	</target>
	+ <target name="compile-vector-highlighter">
	+ <subant target="compile">
	+ <fileset dir="${common.dir}/contrib/fast-vector-highlighter" includes="build.xml"/>
	+ </subant>
	+ </target>

	- <target name="init" depends="common.init,compile-demo, compile-highlighter,check-files"/>
	+ <target name="init" depends="common.init,compile-demo,compile-highlighter,compile-vector-highlighter,check-files"/>

	<!-- make sure online collections (reuters) are first downloaded -->
	<target name="test" depends="init,get-files">