| diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java |
| index 4ee3826..bdbffef 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java |
| +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java |
| @@ -61,6 +61,9 @@ public final class FieldReader extends Terms implements Accountable { |
| final BlockTreeTermsReader parent; |
| |
| final FST<BytesRef> index; |
| + |
| + final WildcardHelper wildcardHelper; |
| + |
| //private boolean DEBUG; |
| |
| FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, |
| @@ -99,8 +102,14 @@ public final class FieldReader extends Terms implements Accountable { |
| w.close(); |
| } |
| */ |
| + if (Boolean.parseBoolean(System.getProperty("solr.suffixArray.enable"))) { |
| + wildcardHelper = new WildcardHelper(this, this::iterator); |
| + } else { |
| + wildcardHelper = null; |
| + } |
| } else { |
| index = null; |
| + wildcardHelper = null; |
| } |
| } |
| |
| @@ -185,12 +194,22 @@ public final class FieldReader extends Terms implements Accountable { |
| if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { |
| throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); |
| } |
| + final String wildcardText = compiled.wildcardText; |
| + if (wildcardText != null && wildcardText.startsWith("*") && wildcardHelper != null && wildcardHelper.isReady()) { |
| + String[] parts = wildcardText.split("[*?]"); |
| + for (final String part : parts) { |
| + if (part.length() > 2) { |
| + return wildcardHelper.getTermsEnum(wildcardText, part); |
| + } |
| + } |
| + } |
| return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm, compiled.sinkState); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| - return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0); |
| + return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0) |
| + + (wildcardHelper != null ? wildcardHelper.ramBytesUsed() : 0); |
| } |
| |
| @Override |
| diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java |
| new file mode 100644 |
| index 0000000..5f0b5dc |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java |
| @@ -0,0 +1,169 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.codecs.blocktree; |
| + |
| +/** |
| + * Sorts the suffixes of the byte array and allows selecting the range of suffixes that start with a given prefix. |
| + */ |
| +public class SuffixArrayBytes { |
| + |
| + private final byte[] text; |
| + |
| + /** |
| + * index[i] = j means text.substring(j) is ith largest suffix |
| + */ |
| + private final int[] index; |
| + |
| + /** |
| + * number of suffixes, {@link #index} length |
| + */ |
| + private final int n; |
| + |
| + /** |
| + * Creates sorted suffix array for the given bytes array. |
| + * @param text the input bytes array that ends with {@link WildcardHelper#SAFETY_PAD_SIZE} {@link Byte#MIN_VALUE} values |
| + * @param index values of this array are starting position of suffixes in {@code text} |
| + */ |
| + public SuffixArrayBytes(final byte[] text, final int[] index) { |
| + this.text = text; |
| + this.index = index; |
| + this.n = index.length; |
| + sort(0, n - 1, 0); |
| + } |
| + |
| + /** |
| + * @return bytes array that represents input text for suffix array construction |
| + */ |
| + public byte[] getText() { |
| + return text; |
| + } |
| + |
| + /** |
| + * 3-way radix quicksort based on <a href="http://algs4.cs.princeton.edu/63suffix/SuffixArrayX.java.html">http://algs4.cs.princeton.edu/63suffix/SuffixArrayX.java.html</a> |
| + * @param lo first index |
| + * @param hi last index |
| + * @param d first symbol where suffixes can differ |
| + */ |
| + private void sort(final int lo, final int hi, final int d) { |
| + if (hi <= lo) { |
| + return; |
| + } |
| + |
| + int lt = lo, gt = hi; |
| + //index[lo] - probe suffix |
| + final byte probeSuffixPositionD = text[index[lo] + d]; |
| + int i = lo + 1; |
| + while (i <= gt) { |
| + byte currentSuffixPositionD = text[index[i] + d]; |
| + if (currentSuffixPositionD < probeSuffixPositionD) { |
| + swapSuffixes(lt++, i++); |
| + } else if (currentSuffixPositionD > probeSuffixPositionD) { |
| + swapSuffixes(i, gt--); |
| + } else { |
| + i++; |
| + } |
| + } |
| + |
| + // index[lo]..index[lt-1] - suffixes before probe |
| + sort(lo, lt-1, d); |
| + // index[lt]..index[gt] - suffixes with the same d symbol as in probe |
| + sort(lt, gt, d+1); |
| + // index[gt+1]..index[hi] - suffixes after probe |
| + sort(gt+1, hi, d); |
| + } |
| + |
| + private void swapSuffixes(final int i, final int j) { |
| + final int swap = index[i]; |
| + index[i] = index[j]; |
| + index[j] = swap; |
| + } |
| + |
| + /** |
| + * @param i an integer between 0 and <em>n</em>-1 |
| + * @return the starting position of the <em>i</em>th smallest suffix. |
| + */ |
| + public int index(final int i) { |
| + return index[i]; |
| + } |
| + |
| + /** |
| + * Finds range of suffixes with given prefix. |
| + * @param prefix part of the wildcard query without * and ? converted to bytes |
| + * @return the range of suffixes starting with {@code prefix}, range[0] is the least suffix not less than {@code prefix}, |
| + * {@code range[1]} - is the least non-matching index after {@code range[0]}. If there are no matching suffixes, {@code range[0] == range[1]}; |
| + */ |
| + public int[] getSuffixesWithPrefix(final byte[] prefix) { |
| + int lessThanPrefix = -1; |
| + int moreThanPrefix = n; |
| + while (lessThanPrefix + 1 < moreThanPrefix) { |
| + int mid = lessThanPrefix + (moreThanPrefix - lessThanPrefix) / 2; |
| + if (suffixIsAfterPrefix(index[mid], prefix)) { |
| + moreThanPrefix = mid; |
| + } else { |
| + lessThanPrefix = mid; |
| + } |
| + } |
| + if (moreThanPrefix == n || !suffixStartsWithPrefix(index[moreThanPrefix], prefix)) { |
| + return new int[] {moreThanPrefix, moreThanPrefix}; |
| + } |
| + int maxValid = moreThanPrefix; |
| + int minInvalid = n; |
| + while (maxValid + 1 < minInvalid) { |
| + int mid = maxValid + (minInvalid - maxValid) / 2; |
| + if (suffixStartsWithPrefix(index[mid], prefix)) { |
| + maxValid = mid; |
| + } else { |
| + minInvalid = mid; |
| + } |
| + } |
| + return new int[]{moreThanPrefix, minInvalid}; |
| + } |
| + |
| + /** |
| + * Checks that suffix is after {@code prefix} bytes. |
| + * @param suffixStart start of the suffix |
| + * @param prefix for comparison |
| + * @return suffix is after {@code prefix} |
| + */ |
| + private boolean suffixIsAfterPrefix(final int suffixStart, final byte[] prefix) { |
| + for (int j = 0; j < prefix.length; j++) { |
| + byte prefixByte = prefix[j]; |
| + byte suffixByte = text[suffixStart + j]; |
| + if (prefixByte < suffixByte) { |
| + return true; |
| + } else if (prefixByte > suffixByte) { |
| + return false; |
| + } |
| + } |
| + return true; |
| + } |
| + |
| + /** |
| + * Checks that suffix starts with {@code prefix} bytes. |
| + * @param suffixStart start of the suffix |
| + * @param prefix prefix |
| + * @return suffix starts with {@code prefix} |
| + */ |
| + private boolean suffixStartsWithPrefix(final int suffixStart, final byte[] prefix) { |
| + for (int j = 0; j < prefix.length; j++) { |
| + if (prefix[j] != text[suffixStart + j]) { |
| + return false; |
| + } |
| + } |
| + return true; |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java |
| new file mode 100644 |
| index 0000000..d87428f |
| --- /dev/null |
| +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java |
| @@ -0,0 +1,365 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.codecs.blocktree; |
| + |
| +import org.apache.lucene.codecs.BlockTermState; |
| +import org.apache.lucene.index.PostingsEnum; |
| +import org.apache.lucene.index.TermState; |
| +import org.apache.lucene.index.TermsEnum; |
| +import org.apache.lucene.util.Accountable; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.BytesRefIterator; |
| +import org.apache.lucene.util.RamUsageEstimator; |
| + |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| +import java.util.Collection; |
| +import java.util.Collections; |
| +import java.util.HashSet; |
| +import java.util.List; |
| +import java.util.Set; |
| +import java.util.concurrent.ExecutorService; |
| +import java.util.concurrent.LinkedBlockingQueue; |
| +import java.util.concurrent.ThreadPoolExecutor; |
| +import java.util.concurrent.TimeUnit; |
| +import java.util.regex.Pattern; |
| + |
| +/** |
| + * Finds terms matched to given wildcard using suffix array. |
| + */ |
| +public class WildcardHelper implements Accountable { |
| + |
| + /** |
| + * Number of {@link Byte#MIN_VALUE} values added to the end of the list of all words to avoid array bounds check while comparing suffixes. |
| + */ |
| + public static final int SAFETY_PAD_SIZE = 8; |
| + |
| + /** Suffix array created asynchronously. */ |
| + private volatile SuffixArrayBytes suffixArray; |
| + |
| + /** |
| + * List of all words represented as bytes from {@link org.apache.lucene.util.BytesRef} with |
| + * {@link #SAFETY_PAD_SIZE} {@link Byte#MIN_VALUE} values at the end. |
| + */ |
| + private final byte[] allWords; |
| + |
| + /** Starts of words in {@link #allWords}, last element is total length of all words in bytes. */ |
| + private final int[] wordStarts; |
| + |
| + /** Original {@link FieldReader} where we add suffix array support for better performance. */ |
| + private final FieldReader fieldReader; |
| + |
| + /** Service that allows us to use several threads for suffix arrays sorting. */ |
| + private final static ExecutorService suffixArrayInitializationService; |
| + |
| + static { |
| + int initializationThreadsCount = 5; |
| + |
| + try { |
| + String value = System.getProperty("solr.suffixArray.initialization_treads_count"); |
| + if (value != null) { |
| + initializationThreadsCount = Integer.parseInt(value); |
| + //it should be at least 1 thread |
| + if (initializationThreadsCount < 1) { |
| + initializationThreadsCount = 5; |
| + } |
| + } |
| + } catch (Throwable ignored) { |
| + } |
| + |
| + //we should not create more threads than initializationThreadsCount and we need to free all of them after initialization is finished |
| + suffixArrayInitializationService = new ThreadPoolExecutor(0, initializationThreadsCount, |
| + 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>()); |
| + } |
| + |
| + /** Valid suffix starts in {@link #allWords} */ |
| + private final int[] suffixes; |
| + |
| + /** |
| + * @param fieldReader original {@link FieldReader} we are trying to improve |
| + * @param bytesRefIteratorProvider provider for lists of words (will be called 2 times) |
| + */ |
| + public WildcardHelper(final FieldReader fieldReader, final BytesRefIteratorProvider bytesRefIteratorProvider) throws IOException { |
| + this.fieldReader = fieldReader; |
| + BytesRefIterator it = bytesRefIteratorProvider.iterator(); |
| + int totalBytes = 0; |
| + int totalCharacters = 0; |
| + final List<Integer> suffixArrayWordStartsList = new ArrayList<>(); |
| + BytesRef ref; |
| + while ((ref = it.next()) != null) { |
| + suffixArrayWordStartsList.add(totalBytes); |
| + totalBytes += ref.length; |
| + final String word = ref.utf8ToString(); |
| + totalCharacters += word.length(); |
| + for (int i = 0; i < word.length(); i++) { |
| + if (isSecondSymbolInSurrogatePair(word, i)) { |
| + totalCharacters--; |
| + } |
| + } |
| + } |
| + suffixArrayWordStartsList.add(totalBytes); |
| + wordStarts = suffixArrayWordStartsList.stream().mapToInt(i -> i).toArray(); |
| + |
| + final int bytesCount = wordStarts[wordStarts.length - 1]; |
| + allWords = new byte[bytesCount + SAFETY_PAD_SIZE]; |
| + for (int i = bytesCount; i < allWords.length; i++) { |
| + allWords[i] = Byte.MIN_VALUE; |
| + } |
| + |
| + suffixes = new int[totalCharacters]; |
| + |
| + it = bytesRefIteratorProvider.iterator(); |
| + int curCharacter = 0; |
| + int curByte = 0; |
| + while ((ref = it.next()) != null) { |
| + System.arraycopy(ref.bytes, ref.offset, allWords, curByte, ref.length); |
| + String word = ref.utf8ToString(); |
| + curByte += ref.length; |
| + for (int i = 0; i < word.length(); i++) { |
| + if (isSecondSymbolInSurrogatePair(word, i)) { |
| + continue; |
| + } |
| + int suffixLength = new BytesRef(word.substring(i)).length; |
| + suffixes[curCharacter] = curByte - suffixLength; |
| + curCharacter++; |
| + } |
| + } |
| + |
| + final Runnable suffixArrayInitializer = () -> { |
| + suffixArray = new SuffixArrayBytes(allWords, suffixes); |
| + }; |
| + |
| + suffixArrayInitializationService.submit(suffixArrayInitializer); |
| + } |
| + |
| + /** |
| + * @return {@code true} if the suffix array has already been created and we are ready to process requests |
| + */ |
| + public boolean isReady() { |
| + return suffixArray != null; |
| + } |
| + |
| + public long ramBytesUsed() { |
| + return RamUsageEstimator.shallowSizeOfInstance(WildcardHelper.class) + |
| + allWords.length + 4 * (wordStarts.length + suffixes.length) |
| + + RamUsageEstimator.shallowSizeOfInstance(SuffixArrayBytes.class); |
| + } |
| + |
| + /** |
| + * Fast search for terms that matches wildcard query. |
| + * @param wildcardQuery original query |
| + * @param hint substring of original query without * and ? |
| + * @return TermsEnum with matched terms |
| + */ |
| + public TermsEnum getTermsEnum(final String wildcardQuery, final String hint) throws IOException { |
| + final Collection<BytesRef> matchingWords = getMatchingWords(wildcardQuery, hint); |
| + int wordsFound = matchingWords.size(); |
| + final List<Integer> frequencies = new ArrayList<>(wordsFound); |
| + final List<TermState> states = new ArrayList<>(wordsFound); |
| + final SegmentTermsEnum termsEnum = new SegmentTermsEnum(fieldReader); |
| + for (BytesRef ref : matchingWords) { |
| + termsEnum.seekExact(ref); |
| + frequencies.add(termsEnum.docFreq()); |
| + states.add(termsEnum.termState()); |
| + } |
| + return new ListTermsEnum(matchingWords, frequencies, states, fieldReader); |
| + } |
| + |
| + /** |
| + * Gets terms that matches wildcard query using suffix array. |
| + * @param wildcardQuery original query |
| + * @param hint substring of original query without * and ? |
| + * @return collection of matched terms |
| + */ |
| + public Collection<BytesRef> getMatchingWords(final String wildcardQuery, final String hint) { |
| + final byte[] prefix = convertStringToBytes(hint); |
| + final int prefixLength = prefix.length; |
| + final CheckWord checkWord; |
| + if (wildcardQuery.equals("*" + hint + "*")) { |
| + checkWord = (wordStart, suffix, wordEnd) -> true; |
| + } else if (wildcardQuery.equals("*" + hint)) { |
| + checkWord = (wordStart, suffix, wordEnd) -> suffix + prefixLength == wordEnd; |
| + } else { |
| + final Pattern regex = wildcardToPattern(wildcardQuery); |
| + checkWord = (wordStart, suffix, wordEnd) -> regex.matcher(new BytesRef(allWords, wordStart, wordEnd - wordStart).utf8ToString()).matches(); |
| + } |
| + |
| + final int[] range = suffixArray.getSuffixesWithPrefix(prefix); |
| + final Set<Integer> usedWordIndexes = new HashSet<>(); |
| + final List<BytesRef> simpleFilter = new ArrayList<>(range[1] - range[0]); |
| + for (int pos = range[0]; pos < range[1]; pos++) { |
| + final int suffix = suffixArray.index(pos); |
| + final int wordIndex = getWordIndex(suffix); |
| + final int wordEnd = wordStarts[wordIndex + 1]; |
| + if (wordEnd >= suffix + prefixLength) { |
| + final int wordStart = wordStarts[wordIndex]; |
| + if (checkWord.accept(wordStart, suffix, wordEnd)) { |
| + if (usedWordIndexes.add(wordIndex)) { |
| + simpleFilter.add(new BytesRef(allWords, wordStart, wordEnd - wordStart)); |
| + } |
| + } |
| + } |
| + } |
| + Collections.sort(simpleFilter); |
| + return simpleFilter; |
| + } |
| + |
| + /** |
| + * Converts {@code String} to {@code byte[]} representation from {@link BytesRef} |
| + */ |
| + public static byte[] convertStringToBytes(String hint) { |
| + BytesRef ref = new BytesRef(hint); |
| + byte[] prefix = new byte[ref.length]; |
| + System.arraycopy(ref.bytes, ref.offset, prefix, 0, ref.length); |
| + return prefix; |
| + } |
| + |
| + /** |
| + * Converts wildcard query to {@link java.util.regex.Pattern} |
| + */ |
| + public static Pattern wildcardToPattern(String wildcardQuery) { |
| + String regex = wildcardQuery.replaceAll("\\\\", "\\\\\\\\").replaceAll("\\.", "\\\\.").replaceAll("\\*", "\\.*").replaceAll("\\?", "\\."); |
| + return Pattern.compile(regex); |
| + } |
| + |
| + /** |
| + * Finds word that matches to position in {@link #allWords}. |
| + * @param pos index of byte in {@link #allWords} |
| + * @return index that points to word start in {@link #wordStarts} |
| + */ |
| + public int getWordIndex(final int pos) { |
| + int start = 0; |
| + int finish = wordStarts.length - 1; |
| + while (wordStarts[start + 1] <= pos) { |
| + start++; |
| + int middle = (start + finish)/2; |
| + int middlePos = wordStarts[middle]; |
| + if (middlePos > pos) { |
| + finish = middle; |
| + } else { |
| + start = middle; |
| + } |
| + } |
| + return start; |
| + } |
| + |
| + /** |
| + * Surrogate pair represented in {@link BytesRef} by 4 bytes, but single {@code char} from |
| + * the surrogate pair represented by 3 special bytes. It means that suffix starting with the 2nd |
| + * {@code char} in surrogate pair makes no sense. |
| + * @return {@code true} if the char at {@code charPosition} is the 2nd char in a surrogate pair |
| + */ |
| + private boolean isSecondSymbolInSurrogatePair(String word, int charPosition) { |
| + int utf32 = word.charAt(charPosition); |
| + //Code values from org.apache.lucene.util.UnicodeUtil.UTF16toUTF8() method |
| + return utf32 >= 0xDC00 && utf32 <= 0xDFFF; |
| + } |
| + |
| + /** |
| + * Allows replacing {@link java.util.regex.Pattern} usage with trivial checks for queries like *abc and *abc*. |
| + */ |
| + private interface CheckWord { |
| + boolean accept(final int wordStart, final int suffix, final int wordEnd); |
| + } |
| + |
| + /** |
| + * This interface allows to create {@code WildcardHelper} in unit tests. |
| + */ |
| + public interface BytesRefIteratorProvider { |
| + BytesRefIterator iterator() throws IOException; |
| + } |
| + |
| + /** |
| + * Simple {@link TermsEnum} implementation when we have all matched terms already. |
| + */ |
| + public static class ListTermsEnum extends TermsEnum { |
| + private int ord = -1; |
| + private final List<BytesRef> terms = new ArrayList<>(); |
| + private final List<Integer> frequencies; |
| + private final List<TermState> states; |
| + private final FieldReader fieldReader; |
| + |
| + public ListTermsEnum(final Collection<BytesRef> terms, final List<Integer> frequencies, final List<TermState> states, |
| + final FieldReader fieldReader) { |
| + this.terms.addAll(terms); |
| + this.frequencies = frequencies; |
| + this.states = states; |
| + this.fieldReader = fieldReader; |
| + } |
| + |
| + @Override |
| + public SeekStatus seekCeil(final BytesRef text) throws IOException { |
| + for (int i = 0; i < terms.size(); i++) { |
| + final BytesRef bytesRef = terms.get(i); |
| + if (bytesRef.equals(text)) { |
| + ord = i; |
| + return SeekStatus.FOUND; |
| + } |
| + if (bytesRef.compareTo(text) < 0) { |
| + ord = i; |
| + return SeekStatus.NOT_FOUND; |
| + } |
| + } |
| + ord = -1; |
| + return SeekStatus.END; |
| + } |
| + |
| + @Override |
| + public void seekExact(final long ord) throws IOException { |
| + this.ord = (int)ord; |
| + } |
| + |
| + @Override |
| + public BytesRef term() throws IOException { |
| + return terms.get(ord); |
| + } |
| + |
| + @Override |
| + public long ord() throws IOException { |
| + return ord; |
| + } |
| + |
| + @Override |
| + public int docFreq() throws IOException { |
| + return frequencies.get(ord); |
| + } |
| + |
| + @Override |
| + public long totalTermFreq() throws IOException { |
| + return fieldReader.getSumTotalTermFreq(); |
| + } |
| + |
| + @Override |
| + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { |
| + return fieldReader.parent.postingsReader.postings(fieldReader.fieldInfo, (BlockTermState)termState(), reuse, flags); |
| + } |
| + |
| + @Override |
| + public BytesRef next() throws IOException { |
| + ord++; |
| + if (ord >= terms.size()) { |
| + return null; |
| + } |
| + return term(); |
| + } |
| + |
| + @Override |
| + public TermState termState() throws IOException { |
| + return states.get(ord); |
| + } |
| + } |
| +} |
| diff --git a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java |
| index b775dca..d6b8a94 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java |
| +++ b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java |
| @@ -100,8 +100,11 @@ public class WildcardQuery extends AutomatonQuery { |
| } |
| i += length; |
| } |
| - |
| - return Operations.concatenate(automata); |
| + |
| + Automaton automaton = Operations.concatenate(automata); |
| + automaton.setWildcardText(wildcardText); |
| + return automaton; |
| + |
| } |
| |
| /** |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java |
| index e4a5bd9..1852d72 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java |
| @@ -83,6 +83,9 @@ public class Automaton implements Accountable { |
| /** True if no state has two transitions leaving with the same label. */ |
| private boolean deterministic = true; |
| |
| + /** Original wildcard query. */ |
| + private String wildcardText = null; |
| + |
| /** Sole constructor; creates an automaton with no states. */ |
| public Automaton() { |
| this(2, 2); |
| @@ -322,6 +325,20 @@ public class Automaton implements Accountable { |
| return deterministic; |
| } |
| |
| + /** |
| + * @return original wildcard query |
| + */ |
| + public String getWildcardText() { |
| + return wildcardText; |
| + } |
| + |
| + /** |
| + * Sets original wildcard query. |
| + */ |
| + public void setWildcardText(String wildcardText) { |
| + this.wildcardText = wildcardText; |
| + } |
| + |
| /** Finishes the current state; call this once you are done adding |
| * transitions for a state. This is automatically called if you |
| * start adding transitions to a new source state, but for the last |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java |
| index bd00a70..d868dfd 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java |
| @@ -93,6 +93,9 @@ public class CompiledAutomaton { |
| /** Which state, if any, accepts all suffixes, else -1. */ |
| public final int sinkState; |
| |
| + /** Original wildcard query */ |
| + public final String wildcardText; |
| + |
| /** Create this, passing simplify=true and finite=null, so that we try |
| * to simplify the automaton and determine if it is finite. */ |
| public CompiledAutomaton(Automaton automaton) { |
| @@ -149,6 +152,8 @@ public class CompiledAutomaton { |
| automaton.createState(); |
| } |
| |
| + wildcardText = automaton.getWildcardText(); |
| + |
| if (simplify) { |
| |
| // Test whether the automaton is a "simple" form and |
| diff --git a/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java |
| new file mode 100644 |
| index 0000000..03d7e6b |
| --- /dev/null |
| +++ b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java |
| @@ -0,0 +1,124 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.codecs.blocktree; |
| + |
| +import com.carrotsearch.randomizedtesting.RandomizedContext; |
| +import com.carrotsearch.randomizedtesting.RandomizedRunner; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.junit.Test; |
| +import org.junit.runner.RunWith; |
| + |
| +import java.util.Random; |
| + |
| +/** |
| + * Tests for {@link SuffixArrayBytes}. |
| + */ |
| +@RunWith(RandomizedRunner.class) |
| +public class TestSuffixArrayBytes extends LuceneTestCase { |
| + |
| + @Test |
| + public void testSuffixArraySortingEnglish() throws Exception { |
| + final Random rnd = RandomizedContext.current().getRandom(); |
| + int capacity = 10000 + rnd.nextInt(10000); |
| + String text = getText(rnd, capacity, "qwertyuiopasdfghjklzxcvbnm"); |
| + SuffixArrayBytes sa = getSuffixArray(text); |
| + |
| + for (int i = 0; i < capacity - 1; i++) { |
| + String s1 = text.substring(sa.index(i)); |
| + String s2 = text.substring(sa.index(i + 1)); |
| + assertTrue("Suffix " + i + " should be before suffix " + (i+1), s1.compareTo(s2) < 0); |
| + } |
| + } |
| + |
| + private SuffixArrayBytes getSuffixArray(String text) { |
| + byte[] textBytes = WildcardHelper.convertStringToBytes(text); |
| + for (int i = textBytes.length - WildcardHelper.SAFETY_PAD_SIZE; i < textBytes.length; i++) { |
| + textBytes[i] = Byte.MIN_VALUE; |
| + } |
| + |
| + int[] suffixes = new int[text.length() - WildcardHelper.SAFETY_PAD_SIZE]; |
| + for (int i = 0; i < suffixes.length; i++) { |
| + suffixes[i] = textBytes.length - new BytesRef(text.substring(i)).length; |
| + } |
| + |
| + return new SuffixArrayBytes(textBytes, suffixes); |
| + } |
| + |
| + @Test |
| + public void testSuffixArrayRange() throws Exception { |
| + final Random rnd = RandomizedContext.current().getRandom(); |
| + int mainTextLength = 10000 + rnd.nextInt(10000); |
| + String text = getText(rnd, mainTextLength, "qwertyuiopasdfghjklzxcvbnmйцукенгшщзхъфывапролджэячсмитьбю"); |
| + SuffixArrayBytes sa = getSuffixArray(text); |
| + byte[] bytes = sa.getText(); |
| + for (int i = 0; i < mainTextLength - 1; i++) { |
| + int start1 = sa.index(i); |
| + int start2 = sa.index(i + 1); |
| + int diff = 0; |
| + while (true) { |
| + if (bytes[start1 + diff] < bytes[start2 + diff]) { |
| + break; |
| + } |
| + if (bytes[start1 + diff] > bytes[start2 + diff]) { |
| + assertTrue("Suffix " + i + "greater than " + (i+1), false); |
| + } |
| + diff++; |
| + } |
| + } |
| + |
| + int endSuffixLength = 1 + rnd.nextInt(50); |
| + int endSuffixStart = mainTextLength - endSuffixLength; |
| + byte[] query = WildcardHelper.convertStringToBytes(text.substring(endSuffixStart, mainTextLength)); |
| + assertEquals("End suffix should be found at the end", bytes.length - query.length - WildcardHelper.SAFETY_PAD_SIZE, sa.index(sa.getSuffixesWithPrefix(query)[0])); |
| + for (int i = 1; i < 1000; i++) { |
| + int beginIndex = rnd.nextInt(text.length() - 1000); |
| + String suffix = text.substring(beginIndex, beginIndex + 1 + rnd.nextInt(100)); |
| + int[] range = sa.getSuffixesWithPrefix(WildcardHelper.convertStringToBytes(suffix)); |
| + for (int pos = range[0]; pos < range[1]; pos++) { |
| + assertTrue("Suffix should be found in text", getSuffix(sa, sa.index(pos)).startsWith(suffix)); |
| + } |
| + if (range[0] > 0) { |
| + int suffixStart = sa.index(range[0] - 1); |
| + assertFalse("Suffix should not be found before range at " + (range[0] - 1), getSuffix(sa, suffixStart).startsWith(suffix)); |
| + } |
| + if (range[1] < mainTextLength) { |
| + int suffixStart = sa.index(range[1]); |
| + assertFalse("Suffix should not be found after range at " + range[1], getSuffix(sa, suffixStart).startsWith(suffix)); |
| + } |
| + |
| + } |
| + assertEquals("{127} suffix should be positioned at the end", sa.getSuffixesWithPrefix(new byte[]{127, 127, 127, 127})[0], mainTextLength); |
| + assertEquals("{-128} suffix should be positioned at 0", sa.getSuffixesWithPrefix(new byte[]{-128, -128, -128,-128})[0], 0); |
| + } |
| + |
| + private String getText(Random rnd, int mainTextLength, String letters) { |
| + StringBuilder sb = new StringBuilder(mainTextLength); |
| + for (int i = 0; i < mainTextLength; i++) { |
| + sb.append(letters.charAt(rnd.nextInt(letters.length()))); |
| + } |
| + for (int i = 0; i < WildcardHelper.SAFETY_PAD_SIZE; i++) { |
| + sb.append("0"); |
| + } |
| + return sb.toString(); |
| + } |
| + |
| + private String getSuffix(final SuffixArrayBytes sa, final int suffixStart) { |
| + final byte[] textBytes = sa.getText(); |
| + return new BytesRef(textBytes, suffixStart, textBytes.length - suffixStart - WildcardHelper.SAFETY_PAD_SIZE).utf8ToString(); |
| + } |
| +} |
| diff --git a/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java |
| new file mode 100644 |
| index 0000000..ec4c4b8 |
| --- /dev/null |
| +++ b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java |
| @@ -0,0 +1,156 @@ |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| +package org.apache.lucene.codecs.blocktree; |
| + |
| +import com.carrotsearch.randomizedtesting.RandomizedContext; |
| +import com.carrotsearch.randomizedtesting.RandomizedRunner; |
| +import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.BytesRefIterator; |
| +import org.apache.lucene.util.LuceneTestCase; |
| +import org.junit.Test; |
| +import org.junit.runner.RunWith; |
| + |
| +import java.io.IOException; |
| +import java.util.ArrayList; |
| +import java.util.Collection; |
| +import java.util.Collections; |
| +import java.util.HashSet; |
| +import java.util.List; |
| +import java.util.Random; |
| +import java.util.Set; |
| +import java.util.regex.Pattern; |
| +import java.util.stream.Collectors; |
| + |
| +/** |
| + * Tests for {@link WildcardHelper}. |
| + */ |
| +@RunWith(RandomizedRunner.class) |
| +public class TestWildcardHelper extends LuceneTestCase { |
| + |
| + @Test |
| + public void testSuffixArrayIntegration() throws Exception { |
| + final Random rnd = RandomizedContext.current().getRandom(); |
| + int capacity = 1000000 + rnd.nextInt(1000000); |
| + String letters = "qwertyuiopasdfghjklzxcvbnmйцукенгшщзхъфывапролджэячсмитьбю"; |
| + final List<String> words = new ArrayList<>(); |
| + final Set<String> allWordsSet = new HashSet<>(); |
| + StringBuilder wordBuilder = new StringBuilder(); |
| + for (int i = 0; i < capacity; i++) { |
| + char ch = letters.charAt(rnd.nextInt(letters.length())); |
| + wordBuilder.append(ch); |
| + if (rnd.nextInt(10) == 1) { |
| + String word = wordBuilder.toString(); |
| + if (allWordsSet.add(word)) { |
| + words.add(word); |
| + wordBuilder = new StringBuilder(); |
| + } |
| + } |
| + } |
| + |
| + WildcardHelper wildcardHelper = new WildcardHelper(null, () -> |
| + new BytesRefIterator() { |
| + private int cur = -1; |
| + |
| + @Override |
| + public BytesRef next() throws IOException { |
| + cur++; |
| + return cur >= words.size() ? null : new BytesRef(words.get(cur)); |
| + } |
| + }); |
| + |
| + int pos = 0; |
| + for (int i = 0; i < words.size(); i++) { |
| + String word = words.get(i); |
| + int len = new BytesRef(word).length; |
| + for (int j = 0; j < len; j++) { |
| + assertEquals(pos + " points to wrong word", i, wildcardHelper.getWordIndex(pos)); |
| + pos++; |
| + } |
| + } |
| + while (!wildcardHelper.isReady()) { |
| + Thread.sleep(1000); |
| + } |
| + String substring = ""; |
| + for (int i = 0; i < 6; i++) { |
| + char ch = letters.charAt(rnd.nextInt(letters.length())); |
| + substring += ch; |
| + Set<String> wordsWithSubstringSet = new HashSet<>(); |
| + Set<String> wordsEndsWithSubstringSet = new HashSet<>(); |
| + Set<String> wordsEndsWithSubstringQSet = new HashSet<>(); |
| + for (String word : words) { |
| + if (word.contains(substring)) { |
| + wordsWithSubstringSet.add(word); |
| + } |
| + if (word.endsWith(substring)) { |
| + wordsEndsWithSubstringSet.add(word); |
| + } |
| + if (word.substring(0, word.length() -1).endsWith(substring)) { |
| + wordsEndsWithSubstringQSet.add(word); |
| + } |
| + } |
| + testCollection(wildcardHelper, "*" + substring + "*", wordsWithSubstringSet, substring); |
| + testCollection(wildcardHelper, "*" + substring, wordsEndsWithSubstringSet, substring); |
| + testCollection(wildcardHelper, "*" + substring + "?", wordsEndsWithSubstringQSet, substring); |
| + } |
| + String wildcard = "*" + substring.substring(0, 2) + "?" + substring.charAt(3); |
| + Pattern pattern = WildcardHelper.wildcardToPattern(wildcard); |
| + Set<String> matchedWords = words.stream().filter(word -> pattern.matcher(word).matches()).collect(Collectors.toSet()); |
| + testCollection(wildcardHelper, wildcard, matchedWords, substring.substring(0, 2)); |
| + } |
| + |
| + @Test |
| + public void testWildcardToPattern() throws Exception { |
| + testPattern("g*ks", new String[] {"geeks", "g//.ks", "gks"}, new String[]{"gek", "eks", "geekst"}); |
| + testPattern("ge?ks*", new String[] {"geeksforgeeks", "geeks", "geoks76"}, new String[]{"geks", "geeeks", "ogeeks123"}); |
| + testPattern("g*k", new String[] {"geek", "gk", "gkkkk"}, new String[]{"sgk", "geeks", "get"}); |
| + testPattern("*pqrs", new String[] {"pqrs", "pqrpqrs", "pqropqrs"}, new String[]{"pqr", "opqrst", "pqrst"}); |
| + testPattern("abc*bcd", new String[] {"abcdhghgbcd", "abcbcd", "abcabcabcbcd"}, new String[]{"abcbcdbcdabc", "abqwertcbcd", "abcbabcabacd"}); |
| + testPattern("abc*c?d", new String[] {"abccod", "abccccd", "abcdcad"}, new String[]{"abccd", "cdabccd", "abc0cppd"}); |
| + testPattern("*c*d", new String[] {"abcd", "qwecd", "coood"}, new String[]{"cdcdcdt", "aaaaaaad", "ddddcccc"}); |
| + testPattern("*?c*d", new String[] {"abcd", "cccd", "qwcdcdcod"}, new String[]{"cod", "cd", "qcdwertcdq"}); |
| + testPattern("\\test.", new String[] {"\\test."}, new String[] {"test.", "\\test", "\\tes.", "\\test"}); |
| + } |
| + |
| + private void testPattern(String query, String[] positive, String[] negative) { |
| + Pattern pattern = WildcardHelper.wildcardToPattern(query); |
| + for (String word : positive) { |
| + assertTrue(word + " should match " + query, pattern.matcher(word).matches()); |
| + } |
| + for (String word : negative) { |
| + assertFalse(word + " shouldn't match " + query, pattern.matcher(word).matches()); |
| + } |
| + } |
| + |
| + private void testCollection(WildcardHelper wildcardHelper, String wildcardText, Set<String> matchingWordsSet, String substring) throws IOException { |
| + List<String> matchingWords = new ArrayList<>(matchingWordsSet); |
| + Collections.sort(matchingWords); |
| + Collection<BytesRef> bytesRefs = wildcardHelper.getMatchingWords(wildcardText, substring); |
| + int pos = 0; |
| + for (BytesRef bytesRef : bytesRefs) { |
| + assertEquals("Mismatch in list of words for " + wildcardText + " at position " + pos, bytesRef.utf8ToString(), matchingWords.get(pos)); |
| + pos++; |
| + } |
| + WildcardHelper.ListTermsEnum listTermsEnum = new WildcardHelper.ListTermsEnum(bytesRefs, null, null, null); |
| + BytesRef term; |
| + pos = 0; |
| + while ((term = listTermsEnum.next()) != null) { |
| + assertEquals("ListTermsEnum contains wrong word", matchingWords.get(pos), term.utf8ToString()); |
| + pos++; |
| + } |
| + assertEquals("ListTermsEnum is missing some words", matchingWords.size(), pos); |
| + } |
| +} |