// Source: lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.uhighlight;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.FilteringTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.spans.SpanQuery;


 /**
  * Offset strategy that runs an {@link Analyzer} over the document content and feeds the resulting
  * tokens into a {@link MemoryIndex}, from which the offsets are subsequently read.
  *
  * <p>The same {@link MemoryIndex} instance is reset and re-populated on every call to
  * {@link #getOffsetsEnum(LeafReader, int, String)}.
  * NOTE(review): because of that shared state, instances are presumably not meant to be used
  * concurrently -- confirm against callers.
  *
  * @lucene.internal
  */
 public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {

   /** Scratch index; reset and re-filled for each document that is highlighted. */
   private final MemoryIndex memoryIndex;
   /** Reader over {@link #memoryIndex}, obtained once in the constructor and kept for every call. */
   private final LeafReader memIndexLeafReader;
   /** Token pre-filter applied before indexing; {@code null} means "index every token". */
   private final CharArrayMatcher preMemIndexFilterAutomaton;

   /**
    * Creates the strategy together with its backing {@link MemoryIndex}.
    *
    * @param components highlighting components; supply the phrase helper, terms and automata
    * @param analyzer   analyzer handed to the superclass
    */
   public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
     super(components, analyzer);
     // 1st arg: true==store offsets; 2nd arg: payloads might be needed for position sensitive queries
     memoryIndex = new MemoryIndex(true, components.getPhraseHelper().hasPositionSensitivity());
     // the reader appears to be re-usable, so grab it only once
     memIndexLeafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
     // pre-filter for the MemoryIndex
     preMemIndexFilterAutomaton = buildCombinedAutomaton(components);
   }

   /**
    * Builds a single {@link CharArrayMatcher} accepting any term the query might match.
    *
    * <p>The matcher combines, in this order: a matcher over the query terms (only when there is at
    * least one term), the automata of the components, and the automata extracted from every span
    * query of the phrase helper.
    *
    * @param components source of the terms, automata, phrase helper and field matcher
    * @return {@code null} when either the terms or the automata are {@code null}; the sole matcher
    *         when exactly one was collected; otherwise an aggregate that matches as soon as any
    *         collected matcher does (so it never matches when none were collected)
    */
   private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
     // Unless both pieces are available we don't know enough about the query to filter confidently
     final boolean queryFullyKnown = components.getTerms() != null && components.getAutomata() != null;
     if (!queryFullyKnown) {
       return null;
     }

     final List<CharArrayMatcher> matchers = new ArrayList<>();
     if (components.getTerms().length != 0) {
       matchers.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
     }
     Collections.addAll(matchers, components.getAutomata());
     for (SpanQuery query : components.getPhraseHelper().getSpanQueries()) {
       Collections.addAll(
           matchers,
           MultiTermHighlighting.extractAutomata(query, components.getFieldMatcher(), true)); // true==lookInSpan
     }

     // The list is never modified past this point, so its size can be captured once.
     final int matcherCount = matchers.size();
     if (matcherCount == 1) {
       return matchers.get(0);
     }

     //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
     //  could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented

     // Aggregate matcher: true as soon as one delegate matches
     return (chars, offset, length) -> {
       int idx = 0; // indexed access instead of foreach to avoid an Iterator allocation
       while (idx < matcherCount) {
         if (matchers.get(idx).match(chars, offset, length)) {
           return true;
         }
         idx++;
       }
       return false;
     };
   }

   /**
    * Analyzes {@code content}, indexes the (optionally pre-filtered) tokens into the memory index
    * and creates an {@link OffsetsEnum} on top of it.
    *
    * @param reader  reader of the real index, or {@code null}; when {@code null} the memory index
    *                reader is used directly with document 0, otherwise it is combined with
    *                {@code reader} through an {@link OverlaySingleDocTermsLeafReader}
    * @param docId   document id, used only when {@code reader} is not {@code null}
    * @param content text to analyze
    * @return the offsets enum produced by {@code createOffsetsEnumFromReader}
    * @throws IOException declared by the overridden method and the index operations used here
    */
   @Override
   public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException {
     // note: no LimitTokenOffsetFilter needed since content is already truncated to maxLength
     TokenStream tokens = tokenStream(content);
     if (preMemIndexFilterAutomaton != null) {
       // keep only the tokens the query could be interested in
       tokens = newKeepWordFilter(tokens, preMemIndexFilterAutomaton);
     }

     memoryIndex.reset();
     memoryIndex.addField(getField(), tokens); // note: calls reset() & close() on the token stream

     if (reader == null) {
       return createOffsetsEnumFromReader(memIndexLeafReader, 0);
     }
     final OverlaySingleDocTermsLeafReader overlay =
         new OverlaySingleDocTermsLeafReader(reader, memIndexLeafReader, getField(), docId);
     return createOffsetsEnumFromReader(overlay, docId);
   }

   /**
    * Wraps {@code tokenStream} in a filter that accepts only tokens whose term text is matched by
    * {@code matcher}.
    *
    * @param tokenStream stream to filter
    * @param matcher     decides, per token, whether the token is accepted
    * @return the filtering stream
    */
   private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
                                                         final CharArrayMatcher matcher) {
     // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
     return new FilteringTokenFilter(tokenStream) {
       private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

       @Override
       protected boolean accept() throws IOException {
         final char[] termBuffer = termAtt.buffer();
         final int termLength = termAtt.length();
         return matcher.match(termBuffer, 0, termLength);
       }
     };
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.uhighlight;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.FilteringTokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.index.LeafReader;
	import org.apache.lucene.index.memory.MemoryIndex;
	import org.apache.lucene.search.spans.SpanQuery;


	/**
	* Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
	*
	* @lucene.internal
	*/
	public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {

	private final MemoryIndex memoryIndex;
	private final LeafReader memIndexLeafReader;
	private final CharArrayMatcher preMemIndexFilterAutomaton;

	public MemoryIndexOffsetStrategy(UHComponents components, Analyzer analyzer) {
	super(components, analyzer);
	boolean storePayloads = components.getPhraseHelper().hasPositionSensitivity(); // might be needed
	memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
	memIndexLeafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
	// preFilter for MemoryIndex
	preMemIndexFilterAutomaton = buildCombinedAutomaton(components);
	}

	/**
	* Build one {@link CharArrayMatcher} matching any term the query might match.
	*/
	private static CharArrayMatcher buildCombinedAutomaton(UHComponents components) {
	// We don't know enough about the query to do this confidently
	if (components.getTerms() == null \|\| components.getAutomata() == null) {
	return null;
	}

	List<CharArrayMatcher> allAutomata = new ArrayList<>();
	if (components.getTerms().length > 0) {
	allAutomata.add(CharArrayMatcher.fromTerms(Arrays.asList(components.getTerms())));
	}
	Collections.addAll(allAutomata, components.getAutomata());
	for (SpanQuery spanQuery : components.getPhraseHelper().getSpanQueries()) {
	Collections.addAll(allAutomata,
	MultiTermHighlighting.extractAutomata(spanQuery, components.getFieldMatcher(), true));//true==lookInSpan
	}

	if (allAutomata.size() == 1) {
	return allAutomata.get(0);
	}

	//TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
	// could union them all. But it's not exposed, and sometimes the automaton is byte (not char) oriented

	// Return an aggregate CharArrayMatcher of others
	return (chars, offset, length) -> {
	for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
	if (allAutomata.get(i).match(chars, offset, length)) {
	return true;
	}
	}
	return false;
	};
	}

	@Override
	public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content) throws IOException {
	// note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
	TokenStream tokenStream = tokenStream(content);

	// Filter the tokenStream to applicable terms
	if (preMemIndexFilterAutomaton != null) {
	tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
	}
	memoryIndex.reset();
	memoryIndex.addField(getField(), tokenStream);//note: calls tokenStream.reset() & close()

	if (reader == null) {
	return createOffsetsEnumFromReader(memIndexLeafReader, 0);
	} else {
	return createOffsetsEnumFromReader(
	new OverlaySingleDocTermsLeafReader(
	reader,
	memIndexLeafReader,
	getField(),
	docId),
	docId);
	}
	}

	private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
	final CharArrayMatcher matcher) {
	// it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
	return new FilteringTokenFilter(tokenStream) {
	final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);

	@Override
	protected boolean accept() throws IOException {
	return matcher.match(charAtt.buffer(), 0, charAtt.length());
	}
	};
	}

	}