lucene/monitor/src/java/org/apache/lucene/monitor/RegexpQueryHandler.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.monitor;

 import java.util.Collections;
 import java.util.Set;
 import java.util.function.BiConsumer;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.RegexpQuery;
 import org.apache.lucene.util.BytesRef;

 /**
  * A query handler implementation that matches Regexp queries by indexing regex
  * terms by their longest static substring, and generates ngrams from Document
  * tokens to match them.
  * <p>
  * This implementation will filter out more wildcard queries than TermFilteredPresearcher,
  * at the expense of longer document build times.  Which one is more performant will depend
  * on the type and number of queries registered in the Monitor, and the size of documents
  * to be monitored.  Profiling is recommended.
  */
 public class RegexpQueryHandler implements CustomQueryHandler {

   /**
    * The default suffix with which to mark ngrams
    */
   public static final String DEFAULT_NGRAM_SUFFIX = "XX";

   /**
    * The default maximum length of an input token before ANYTOKENS are generated
    */
   public static final int DEFAULT_MAX_TOKEN_SIZE = 30;

   /**
    * The default token to emit if a term is longer than MAX_TOKEN_SIZE
    */
   public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__";

   private final String ngramSuffix;

   private final String wildcardToken;
   private final BytesRef wildcardTokenBytes;

   private final int maxTokenSize;

   private final Set<String> excludedFields;

   /**
    * Creates a new RegexpQueryHandler
    *
    * @param ngramSuffix    the suffix with which to mark ngrams
    * @param maxTokenSize   the maximum length of an input token before WILDCARD tokens are generated
    * @param wildcardToken  the token to emit if a token is longer than maxTokenSize in length
    * @param excludedFields a Set of fields to ignore when generating ngrams
    */
   public RegexpQueryHandler(String ngramSuffix, int maxTokenSize, String wildcardToken, Set<String> excludedFields) {
     this.ngramSuffix = ngramSuffix;
     this.maxTokenSize = maxTokenSize;
     this.wildcardTokenBytes = new BytesRef(wildcardToken);
     this.wildcardToken = wildcardToken;
     this.excludedFields = excludedFields == null ? Collections.emptySet() : excludedFields;
   }

   /**
    * Creates a new RegexpQueryHandler using default settings
    */
   public RegexpQueryHandler() {
     this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null);
   }

   /**
    * Creates a new RegexpQueryHandler with a maximum token size
    *
    * @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
    */
   public RegexpQueryHandler(int maxTokenSize) {
     this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null);
   }

   @Override
   public TokenStream wrapTermStream(String field, TokenStream ts) {
     if (excludedFields.contains(field))
       return ts;
     return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize);
   }

   @Override
   public QueryTree handleQuery(Query q, TermWeightor termWeightor) {
     if (q instanceof RegexpQuery == false) {
       return null;
     }
     RegexpQuery query = (RegexpQuery) q;
     String regexp = parseOutRegexp(query.toString(""));
     String selected = selectLongestSubstring(regexp);
     Term term = new Term(query.getField(), selected + ngramSuffix);
     double weight = termWeightor.applyAsDouble(term);
     return new QueryTree() {
       @Override
       public double weight() {
         return weight;
       }

       @Override
       public void collectTerms(BiConsumer<String, BytesRef> termCollector) {
         termCollector.accept(term.field(), term.bytes());
         termCollector.accept(term.field(), wildcardTokenBytes);
       }

       @Override
       public boolean advancePhase(double minWeight) {
         return false;
       }

       @Override
       public String toString(int depth) {
         return space(depth) + "WILDCARD_NGRAM[" + term.toString() + "]^" + weight;
       }
     };
   }

   private static String parseOutRegexp(String rep) {
     int fieldSepPos = rep.indexOf(":");
     int firstSlash = rep.indexOf("/", fieldSepPos);
     int lastSlash = rep.lastIndexOf("/");
     return rep.substring(firstSlash + 1, lastSlash);
   }

   private static String selectLongestSubstring(String regexp) {
     String selected = "";
     for (String substr : regexp.split("\\.|\\*|.\\?")) {
       if (substr.length() > selected.length()) {
         selected = substr;
       }
     }
     return selected;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.monitor;

	import java.util.Collections;
	import java.util.Set;
	import java.util.function.BiConsumer;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.RegexpQuery;
	import org.apache.lucene.util.BytesRef;

	/**
	* A query handler implementation that matches Regexp queries by indexing regex
	* terms by their longest static substring, and generates ngrams from Document
	* tokens to match them.
	* <p>
	* This implementation will filter out more wildcard queries than TermFilteredPresearcher,
	* at the expense of longer document build times. Which one is more performant will depend
	* on the type and number of queries registered in the Monitor, and the size of documents
	* to be monitored. Profiling is recommended.
	*/
	public class RegexpQueryHandler implements CustomQueryHandler {

	/**
	* The default suffix with which to mark ngrams
	*/
	public static final String DEFAULT_NGRAM_SUFFIX = "XX";

	/**
	* The default maximum length of an input token before ANYTOKENS are generated
	*/
	public static final int DEFAULT_MAX_TOKEN_SIZE = 30;

	/**
	* The default token to emit if a term is longer than MAX_TOKEN_SIZE
	*/
	public static final String DEFAULT_WILDCARD_TOKEN = "__WILDCARD__";

	private final String ngramSuffix;

	private final String wildcardToken;
	private final BytesRef wildcardTokenBytes;

	private final int maxTokenSize;

	private final Set<String> excludedFields;

	/**
	* Creates a new RegexpQueryHandler
	*
	* @param ngramSuffix the suffix with which to mark ngrams
	* @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
	* @param wildcardToken the token to emit if a token is longer than maxTokenSize in length
	* @param excludedFields a Set of fields to ignore when generating ngrams
	*/
	public RegexpQueryHandler(String ngramSuffix, int maxTokenSize, String wildcardToken, Set<String> excludedFields) {
	this.ngramSuffix = ngramSuffix;
	this.maxTokenSize = maxTokenSize;
	this.wildcardTokenBytes = new BytesRef(wildcardToken);
	this.wildcardToken = wildcardToken;
	this.excludedFields = excludedFields == null ? Collections.emptySet() : excludedFields;
	}

	/**
	* Creates a new RegexpQueryHandler using default settings
	*/
	public RegexpQueryHandler() {
	this(DEFAULT_NGRAM_SUFFIX, DEFAULT_MAX_TOKEN_SIZE, DEFAULT_WILDCARD_TOKEN, null);
	}

	/**
	* Creates a new RegexpQueryHandler with a maximum token size
	*
	* @param maxTokenSize the maximum length of an input token before WILDCARD tokens are generated
	*/
	public RegexpQueryHandler(int maxTokenSize) {
	this(DEFAULT_NGRAM_SUFFIX, maxTokenSize, DEFAULT_WILDCARD_TOKEN, null);
	}

	@Override
	public TokenStream wrapTermStream(String field, TokenStream ts) {
	if (excludedFields.contains(field))
	return ts;
	return new SuffixingNGramTokenFilter(ts, ngramSuffix, wildcardToken, maxTokenSize);
	}

	@Override
	public QueryTree handleQuery(Query q, TermWeightor termWeightor) {
	if (q instanceof RegexpQuery == false) {
	return null;
	}
	RegexpQuery query = (RegexpQuery) q;
	String regexp = parseOutRegexp(query.toString(""));
	String selected = selectLongestSubstring(regexp);
	Term term = new Term(query.getField(), selected + ngramSuffix);
	double weight = termWeightor.applyAsDouble(term);
	return new QueryTree() {
	@Override
	public double weight() {
	return weight;
	}

	@Override
	public void collectTerms(BiConsumer<String, BytesRef> termCollector) {
	termCollector.accept(term.field(), term.bytes());
	termCollector.accept(term.field(), wildcardTokenBytes);
	}

	@Override
	public boolean advancePhase(double minWeight) {
	return false;
	}

	@Override
	public String toString(int depth) {
	return space(depth) + "WILDCARD_NGRAM[" + term.toString() + "]^" + weight;
	}
	};
	}

	private static String parseOutRegexp(String rep) {
	int fieldSepPos = rep.indexOf(":");
	int firstSlash = rep.indexOf("/", fieldSepPos);
	int lastSlash = rep.lastIndexOf("/");
	return rep.substring(firstSlash + 1, lastSlash);
	}

	private static String selectLongestSubstring(String regexp) {
	String selected = "";
	for (String substr : regexp.split("\\.\|\\*\|.\\?")) {
	if (substr.length() > selected.length()) {
	selected = substr;
	}
	}
	return selected;
	}
	}