blob: f173e2b680923fdc09debb804cb059d8a8217bcd [file] [log] [blame]
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
index 4ee3826..bdbffef 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java
@@ -61,6 +61,9 @@ public final class FieldReader extends Terms implements Accountable {
final BlockTreeTermsReader parent;
final FST<BytesRef> index;
+
+ final WildcardHelper wildcardHelper;
+
//private boolean DEBUG;
FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount,
@@ -99,8 +102,14 @@ public final class FieldReader extends Terms implements Accountable {
w.close();
}
*/
+ if (Boolean.parseBoolean(System.getProperty("solr.suffixArray.enable"))) {
+ wildcardHelper = new WildcardHelper(this, this::iterator);
+ } else {
+ wildcardHelper = null;
+ }
} else {
index = null;
+ wildcardHelper = null;
}
}
@@ -185,12 +194,22 @@ public final class FieldReader extends Terms implements Accountable {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
+ final String wildcardText = compiled.wildcardText;
+ if (wildcardText != null && wildcardText.startsWith("*") && wildcardHelper != null && wildcardHelper.isReady()) {
+ String[] parts = wildcardText.split("[*?]");
+ for (final String part : parts) {
+ if (part.length() > 2) {
+ return wildcardHelper.getTermsEnum(wildcardText, part);
+ }
+ }
+ }
return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm, compiled.sinkState);
}
@Override
public long ramBytesUsed() {
- return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0);
+ return BASE_RAM_BYTES_USED + ((index!=null)? index.ramBytesUsed() : 0)
+ + (wildcardHelper != null ? wildcardHelper.ramBytesUsed() : 0);
}
@Override
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java
new file mode 100644
index 0000000..5f0b5dc
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SuffixArrayBytes.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.blocktree;
+
+/**
+ * Sorts the suffixes of the byte array and allows selecting the range of suffixes that start with a given prefix.
+ */
+public class SuffixArrayBytes {
+
+ private final byte[] text;
+
+ /**
+ * index[i] = j means text.substring(j) is ith largest suffix
+ */
+ private final int[] index;
+
+ /**
+ * number of suffixes, {@link #index} length
+ */
+ private final int n;
+
+ /**
+ * Creates sorted suffix array for the given bytes array.
+ * @param text the input bytes array that ends with {@link WildcardHelper#SAFETY_PAD_SIZE} {@link Byte#MIN_VALUE} values
+ * @param index values of this array are starting position of suffixes in {@code text}
+ */
+ public SuffixArrayBytes(final byte[] text, final int[] index) {
+ this.text = text;
+ this.index = index;
+ this.n = index.length;
+ sort(0, n - 1, 0);
+ }
+
+ /**
+ * @return bytes array that represents input text for suffix array construction
+ */
+ public byte[] getText() {
+ return text;
+ }
+
+ /**
+ * 3-way radix quicksort based on <a href="http://algs4.cs.princeton.edu/63suffix/SuffixArrayX.java.html">http://algs4.cs.princeton.edu/63suffix/SuffixArrayX.java.html</a>
+ * @param lo first index
+ * @param hi last index
+ * @param d first symbol where suffixes can differ
+ */
+ private void sort(final int lo, final int hi, final int d) {
+ if (hi <= lo) {
+ return;
+ }
+
+ int lt = lo, gt = hi;
+ //index[lo] - probe suffix
+ final byte probeSuffixPositionD = text[index[lo] + d];
+ int i = lo + 1;
+ while (i <= gt) {
+ byte currentSuffixPositionD = text[index[i] + d];
+ if (currentSuffixPositionD < probeSuffixPositionD) {
+ swapSuffixes(lt++, i++);
+ } else if (currentSuffixPositionD > probeSuffixPositionD) {
+ swapSuffixes(i, gt--);
+ } else {
+ i++;
+ }
+ }
+
+ // index[lo]..index[lt-1] - suffixes before probe
+ sort(lo, lt-1, d);
+ // index[lt]..index[gt] - suffixes with the same d symbol as in probe
+ sort(lt, gt, d+1);
+ // index[gt+1]..index[hi] - suffixes after probe
+ sort(gt+1, hi, d);
+ }
+
+ private void swapSuffixes(final int i, final int j) {
+ final int swap = index[i];
+ index[i] = index[j];
+ index[j] = swap;
+ }
+
+ /**
+ * @param i an integer between 0 and <em>n</em>-1
+ * @return the starting position of the <em>i</em>th smallest suffix.
+ */
+ public int index(final int i) {
+ return index[i];
+ }
+
+ /**
+ * Finds range of suffixes with given prefix.
+ * @param prefix part of the wildcard query without * and ? converted to bytes
+ * @return the range of suffixes starting with {@code prefix}, range[0] is the least suffix not less than {@code prefix},
+ * {@code range[1]} - is the least not-matched after {@code range[0]}. If there are no matching suffixes, {@code range[0] == range[1]};
+ */
+ public int[] getSuffixesWithPrefix(final byte[] prefix) {
+ int lessThanPrefix = -1;
+ int moreThanPrefix = n;
+ while (lessThanPrefix + 1 < moreThanPrefix) {
+ int mid = lessThanPrefix + (moreThanPrefix - lessThanPrefix) / 2;
+ if (suffixIsAfterPrefix(index[mid], prefix)) {
+ moreThanPrefix = mid;
+ } else {
+ lessThanPrefix = mid;
+ }
+ }
+ if (moreThanPrefix == n || !suffixStartsWithPrefix(index[moreThanPrefix], prefix)) {
+ return new int[] {moreThanPrefix, moreThanPrefix};
+ }
+ int maxValid = moreThanPrefix;
+ int minInvalid = n;
+ while (maxValid + 1 < minInvalid) {
+ int mid = maxValid + (minInvalid - maxValid) / 2;
+ if (suffixStartsWithPrefix(index[mid], prefix)) {
+ maxValid = mid;
+ } else {
+ minInvalid = mid;
+ }
+ }
+ return new int[]{moreThanPrefix, minInvalid};
+ }
+
+ /**
+ * Checks that suffix is after {@code prefix} bytes.
+ * @param suffixStart start of the suffix
+ * @param prefix for comparison
+ * @return suffix is after {@code prefix}
+ */
+ private boolean suffixIsAfterPrefix(final int suffixStart, final byte[] prefix) {
+ for (int j = 0; j < prefix.length; j++) {
+ byte prefixByte = prefix[j];
+ byte suffixByte = text[suffixStart + j];
+ if (prefixByte < suffixByte) {
+ return true;
+ } else if (prefixByte > suffixByte) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Checks that suffix starts with {@code prefix} bytes.
+ * @param suffixStart start of the suffix
+ * @param prefix prefix
+ * @return suffix starts with {@code prefix}
+ */
+ private boolean suffixStartsWithPrefix(final int suffixStart, final byte[] prefix) {
+ for (int j = 0; j < prefix.length; j++) {
+ if (prefix[j] != text[suffixStart + j]) {
+ return false;
+ }
+ }
+ return true;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java
new file mode 100644
index 0000000..d87428f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/WildcardHelper.java
@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.blocktree;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Pattern;
+
+/**
+ * Finds terms matched to given wildcard using suffix array.
+ */
+public class WildcardHelper implements Accountable {
+
+ /**
+ * Number of {@link Byte#MIN_VALUE} values added to the end of the list of all words to avoid array bounds check while comparing suffixes.
+ */
+ public static final int SAFETY_PAD_SIZE = 8;
+
+ /** Suffix array created asynchronously. */
+ private volatile SuffixArrayBytes suffixArray;
+
+ /**
+ * List of all words represented as bytes from {@link org.apache.lucene.util.BytesRef} with
+ * {@link #SAFETY_PAD_SIZE} {@link Byte#MIN_VALUE} values at the end.
+ */
+ private final byte[] allWords;
+
+ /** Starts of words in {@link #allWords}, last element is total length of all words in bytes. */
+ private final int[] wordStarts;
+
+ /** Original {@link FieldReader} where we add suffix array support for better performance. */
+ private final FieldReader fieldReader;
+
+ /** Service that allows us to use several threads for suffix arrays sorting. */
+ private final static ExecutorService suffixArrayInitializationService;
+
+ static {
+ int initializationThreadsCount = 5;
+
+ try {
+ String value = System.getProperty("solr.suffixArray.initialization_treads_count");
+ if (value != null) {
+ initializationThreadsCount = Integer.parseInt(value);
+ //it should be at least 1 thread
+ if (initializationThreadsCount < 1) {
+ initializationThreadsCount = 5;
+ }
+ }
+ } catch (Throwable ignored) {
+ }
+
+ //we should not create more threads than initializationThreadsCount and we need to free all of them after initialization is finished
+ suffixArrayInitializationService = new ThreadPoolExecutor(0, initializationThreadsCount,
+ 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());
+ }
+
+ /** Valid suffix starts in {@link #allWords} */
+ private final int[] suffixes;
+
+ /**
+ * @param fieldReader original {@link FieldReader} we are trying to improve
+ * @param bytesRefIteratorProvider provider for lists of words (will be called 2 times)
+ */
+ public WildcardHelper(final FieldReader fieldReader, final BytesRefIteratorProvider bytesRefIteratorProvider) throws IOException {
+ this.fieldReader = fieldReader;
+ BytesRefIterator it = bytesRefIteratorProvider.iterator();
+ int totalBytes = 0;
+ int totalCharacters = 0;
+ final List<Integer> suffixArrayWordStartsList = new ArrayList<>();
+ BytesRef ref;
+ while ((ref = it.next()) != null) {
+ suffixArrayWordStartsList.add(totalBytes);
+ totalBytes += ref.length;
+ final String word = ref.utf8ToString();
+ totalCharacters += word.length();
+ for (int i = 0; i < word.length(); i++) {
+ if (isSecondSymbolInSurrogatePair(word, i)) {
+ totalCharacters--;
+ }
+ }
+ }
+ suffixArrayWordStartsList.add(totalBytes);
+ wordStarts = suffixArrayWordStartsList.stream().mapToInt(i -> i).toArray();
+
+ final int bytesCount = wordStarts[wordStarts.length - 1];
+ allWords = new byte[bytesCount + SAFETY_PAD_SIZE];
+ for (int i = bytesCount; i < allWords.length; i++) {
+ allWords[i] = Byte.MIN_VALUE;
+ }
+
+ suffixes = new int[totalCharacters];
+
+ it = bytesRefIteratorProvider.iterator();
+ int curCharacter = 0;
+ int curByte = 0;
+ while ((ref = it.next()) != null) {
+ System.arraycopy(ref.bytes, ref.offset, allWords, curByte, ref.length);
+ String word = ref.utf8ToString();
+ curByte += ref.length;
+ for (int i = 0; i < word.length(); i++) {
+ if (isSecondSymbolInSurrogatePair(word, i)) {
+ continue;
+ }
+ int suffixLength = new BytesRef(word.substring(i)).length;
+ suffixes[curCharacter] = curByte - suffixLength;
+ curCharacter++;
+ }
+ }
+
+ final Runnable suffixArrayInitializer = () -> {
+ suffixArray = new SuffixArrayBytes(allWords, suffixes);
+ };
+
+ suffixArrayInitializationService.submit(suffixArrayInitializer);
+ }
+
+ /**
+ * @return whether the suffix array is already created and we are ready to process requests
+ */
+ public boolean isReady() {
+ return suffixArray != null;
+ }
+
+ public long ramBytesUsed() {
+ return RamUsageEstimator.shallowSizeOfInstance(WildcardHelper.class) +
+ allWords.length + 4 * (wordStarts.length + suffixes.length)
+ + RamUsageEstimator.shallowSizeOfInstance(SuffixArrayBytes.class);
+ }
+
+ /**
+ * Fast search for terms that match the wildcard query.
+ * @param wildcardQuery original query
+ * @param hint substring of original query without * and ?
+ * @return TermsEnum with matched terms
+ */
+ public TermsEnum getTermsEnum(final String wildcardQuery, final String hint) throws IOException {
+ final Collection<BytesRef> matchingWords = getMatchingWords(wildcardQuery, hint);
+ int wordsFound = matchingWords.size();
+ final List<Integer> frequencies = new ArrayList<>(wordsFound);
+ final List<TermState> states = new ArrayList<>(wordsFound);
+ final SegmentTermsEnum termsEnum = new SegmentTermsEnum(fieldReader);
+ for (BytesRef ref : matchingWords) {
+ termsEnum.seekExact(ref);
+ frequencies.add(termsEnum.docFreq());
+ states.add(termsEnum.termState());
+ }
+ return new ListTermsEnum(matchingWords, frequencies, states, fieldReader);
+ }
+
+ /**
+ * Gets terms that match the wildcard query using the suffix array.
+ * @param wildcardQuery original query
+ * @param hint substring of original query without * and ?
+ * @return collection of matched terms
+ */
+ public Collection<BytesRef> getMatchingWords(final String wildcardQuery, final String hint) {
+ final byte[] prefix = convertStringToBytes(hint);
+ final int prefixLength = prefix.length;
+ final CheckWord checkWord;
+ if (wildcardQuery.equals("*" + hint + "*")) {
+ checkWord = (wordStart, suffix, wordEnd) -> true;
+ } else if (wildcardQuery.equals("*" + hint)) {
+ checkWord = (wordStart, suffix, wordEnd) -> suffix + prefixLength == wordEnd;
+ } else {
+ final Pattern regex = wildcardToPattern(wildcardQuery);
+ checkWord = (wordStart, suffix, wordEnd) -> regex.matcher(new BytesRef(allWords, wordStart, wordEnd - wordStart).utf8ToString()).matches();
+ }
+
+ final int[] range = suffixArray.getSuffixesWithPrefix(prefix);
+ final Set<Integer> usedWordIndexes = new HashSet<>();
+ final List<BytesRef> simpleFilter = new ArrayList<>(range[1] - range[0]);
+ for (int pos = range[0]; pos < range[1]; pos++) {
+ final int suffix = suffixArray.index(pos);
+ final int wordIndex = getWordIndex(suffix);
+ final int wordEnd = wordStarts[wordIndex + 1];
+ if (wordEnd >= suffix + prefixLength) {
+ final int wordStart = wordStarts[wordIndex];
+ if (checkWord.accept(wordStart, suffix, wordEnd)) {
+ if (usedWordIndexes.add(wordIndex)) {
+ simpleFilter.add(new BytesRef(allWords, wordStart, wordEnd - wordStart));
+ }
+ }
+ }
+ }
+ Collections.sort(simpleFilter);
+ return simpleFilter;
+ }
+
+ /**
+ * Converts {@code String} to {@code byte[]} representation from {@link BytesRef}
+ */
+ public static byte[] convertStringToBytes(String hint) {
+ BytesRef ref = new BytesRef(hint);
+ byte[] prefix = new byte[ref.length];
+ System.arraycopy(ref.bytes, ref.offset, prefix, 0, ref.length);
+ return prefix;
+ }
+
+ /**
+ * Converts wildcard query to {@link java.util.regex.Pattern}
+ */
+ public static Pattern wildcardToPattern(String wildcardQuery) {
+ String regex = wildcardQuery.replaceAll("\\\\", "\\\\\\\\").replaceAll("\\.", "\\\\.").replaceAll("\\*", "\\.*").replaceAll("\\?", "\\.");
+ return Pattern.compile(regex);
+ }
+
+ /**
+ * Finds word that matches to position in {@link #allWords}.
+ * @param pos index of byte in {@link #allWords}
+ * @return index that points to word start in {@link #wordStarts}
+ */
+ public int getWordIndex(final int pos) {
+ int start = 0;
+ int finish = wordStarts.length - 1;
+ while (wordStarts[start + 1] <= pos) {
+ start++;
+ int middle = (start + finish)/2;
+ int middlePos = wordStarts[middle];
+ if (middlePos > pos) {
+ finish = middle;
+ } else {
+ start = middle;
+ }
+ }
+ return start;
+ }
+
+ /**
+ * Surrogate pair represented in {@link BytesRef} by 4 bytes, but single {@code char} from
+ * the surrogate pair represented by 3 special bytes. It means that suffix starting with the 2nd
+ * {@code char} in surrogate pair makes no sense.
+ * @return char at charPosition is the 2nd char in surrogate pair
+ */
+ private boolean isSecondSymbolInSurrogatePair(String word, int charPosition) {
+ int utf32 = word.charAt(charPosition);
+ //Code values from org.apache.lucene.util.UnicodeUtil.UTF16toUTF8() method
+ return utf32 >= 0xDC00 && utf32 <= 0xDFFF;
+ }
+
+ /**
+ * Allows replacing {@link java.util.regex.Pattern} usage with trivial checks for queries like *abc and *abc*.
+ */
+ private interface CheckWord {
+ boolean accept(final int wordStart, final int suffix, final int wordEnd);
+ }
+
+ /**
+ * This interface allows creating a {@code WildcardHelper} in unit tests.
+ */
+ public interface BytesRefIteratorProvider {
+ BytesRefIterator iterator() throws IOException;
+ }
+
+ /**
+ * Simple {@link TermsEnum} implementation when we have all matched terms already.
+ */
+ public static class ListTermsEnum extends TermsEnum {
+ private int ord = -1;
+ private final List<BytesRef> terms = new ArrayList<>();
+ private final List<Integer> frequencies;
+ private final List<TermState> states;
+ private final FieldReader fieldReader;
+
+ public ListTermsEnum(final Collection<BytesRef> terms, final List<Integer> frequencies, final List<TermState> states,
+ final FieldReader fieldReader) {
+ this.terms.addAll(terms);
+ this.frequencies = frequencies;
+ this.states = states;
+ this.fieldReader = fieldReader;
+ }
+
+ @Override
+ public SeekStatus seekCeil(final BytesRef text) throws IOException {
+ for (int i = 0; i < terms.size(); i++) {
+ final BytesRef bytesRef = terms.get(i);
+ if (bytesRef.equals(text)) {
+ ord = i;
+ return SeekStatus.FOUND;
+ }
+ if (bytesRef.compareTo(text) < 0) {
+ ord = i;
+ return SeekStatus.NOT_FOUND;
+ }
+ }
+ ord = -1;
+ return SeekStatus.END;
+ }
+
+ @Override
+ public void seekExact(final long ord) throws IOException {
+ this.ord = (int)ord;
+ }
+
+ @Override
+ public BytesRef term() throws IOException {
+ return terms.get(ord);
+ }
+
+ @Override
+ public long ord() throws IOException {
+ return ord;
+ }
+
+ @Override
+ public int docFreq() throws IOException {
+ return frequencies.get(ord);
+ }
+
+ @Override
+ public long totalTermFreq() throws IOException {
+ return fieldReader.getSumTotalTermFreq();
+ }
+
+ @Override
+ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+ return fieldReader.parent.postingsReader.postings(fieldReader.fieldInfo, (BlockTermState)termState(), reuse, flags);
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ ord++;
+ if (ord >= terms.size()) {
+ return null;
+ }
+ return term();
+ }
+
+ @Override
+ public TermState termState() throws IOException {
+ return states.get(ord);
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
index b775dca..d6b8a94 100644
--- a/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
@@ -100,8 +100,11 @@ public class WildcardQuery extends AutomatonQuery {
}
i += length;
}
-
- return Operations.concatenate(automata);
+
+ Automaton automaton = Operations.concatenate(automata);
+ automaton.setWildcardText(wildcardText);
+ return automaton;
+
}
/**
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
index e4a5bd9..1852d72 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java
@@ -83,6 +83,9 @@ public class Automaton implements Accountable {
/** True if no state has two transitions leaving with the same label. */
private boolean deterministic = true;
+ /** Original wildcard query. */
+ private String wildcardText = null;
+
/** Sole constructor; creates an automaton with no states. */
public Automaton() {
this(2, 2);
@@ -322,6 +325,20 @@ public class Automaton implements Accountable {
return deterministic;
}
+ /**
+ * @return original wildcard query
+ */
+ public String getWildcardText() {
+ return wildcardText;
+ }
+
+ /**
+ * Sets original wildcard query.
+ */
+ public void setWildcardText(String wildcardText) {
+ this.wildcardText = wildcardText;
+ }
+
/** Finishes the current state; call this once you are done adding
* transitions for a state. This is automatically called if you
* start adding transitions to a new source state, but for the last
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
index bd00a70..d868dfd 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CompiledAutomaton.java
@@ -93,6 +93,9 @@ public class CompiledAutomaton {
/** Which state, if any, accepts all suffixes, else -1. */
public final int sinkState;
+ /** Original wildcard query */
+ public final String wildcardText;
+
/** Create this, passing simplify=true and finite=null, so that we try
* to simplify the automaton and determine if it is finite. */
public CompiledAutomaton(Automaton automaton) {
@@ -149,6 +152,8 @@ public class CompiledAutomaton {
automaton.createState();
}
+ wildcardText = automaton.getWildcardText();
+
if (simplify) {
// Test whether the automaton is a "simple" form and
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java
new file mode 100644
index 0000000..03d7e6b
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestSuffixArrayBytes.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.blocktree;
+
+import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import java.util.Random;
+
+/**
+ * Tests for {@link SuffixArrayBytes}.
+ */
+@RunWith(RandomizedRunner.class)
+public class TestSuffixArrayBytes extends LuceneTestCase {
+
+ @Test
+ public void testSuffixArraySortingEnglish() throws Exception {
+ final Random rnd = RandomizedContext.current().getRandom();
+ int capacity = 10000 + rnd.nextInt(10000);
+ String text = getText(rnd, capacity, "qwertyuiopasdfghjklzxcvbnm");
+ SuffixArrayBytes sa = getSuffixArray(text);
+
+ for (int i = 0; i < capacity - 1; i++) {
+ String s1 = text.substring(sa.index(i));
+ String s2 = text.substring(sa.index(i + 1));
+ assertTrue("Suffix " + i + " should be before suffix " + (i+1), s1.compareTo(s2) < 0);
+ }
+ }
+
+ private SuffixArrayBytes getSuffixArray(String text) {
+ byte[] textBytes = WildcardHelper.convertStringToBytes(text);
+ for (int i = textBytes.length - WildcardHelper.SAFETY_PAD_SIZE; i < textBytes.length; i++) {
+ textBytes[i] = Byte.MIN_VALUE;
+ }
+
+ int[] suffixes = new int[text.length() - WildcardHelper.SAFETY_PAD_SIZE];
+ for (int i = 0; i < suffixes.length; i++) {
+ suffixes[i] = textBytes.length - new BytesRef(text.substring(i)).length;
+ }
+
+ return new SuffixArrayBytes(textBytes, suffixes);
+ }
+
+ @Test
+ public void testSuffixArrayRange() throws Exception {
+ final Random rnd = RandomizedContext.current().getRandom();
+ int mainTextLength = 10000 + rnd.nextInt(10000);
+ String text = getText(rnd, mainTextLength, "qwertyuiopasdfghjklzxcvbnmйцукенгшщзхъфывапролджэячсмитьбю");
+ SuffixArrayBytes sa = getSuffixArray(text);
+ byte[] bytes = sa.getText();
+ for (int i = 0; i < mainTextLength - 1; i++) {
+ int start1 = sa.index(i);
+ int start2 = sa.index(i + 1);
+ int diff = 0;
+ while (true) {
+ if (bytes[start1 + diff] < bytes[start2 + diff]) {
+ break;
+ }
+ if (bytes[start1 + diff] > bytes[start2 + diff]) {
+ assertTrue("Suffix " + i + "greater than " + (i+1), false);
+ }
+ diff++;
+ }
+ }
+
+ int endSuffixLength = 1 + rnd.nextInt(50);
+ int endSuffixStart = mainTextLength - endSuffixLength;
+ byte[] query = WildcardHelper.convertStringToBytes(text.substring(endSuffixStart, mainTextLength));
+ assertEquals("End suffix should be found at the end", bytes.length - query.length - WildcardHelper.SAFETY_PAD_SIZE, sa.index(sa.getSuffixesWithPrefix(query)[0]));
+ for (int i = 1; i < 1000; i++) {
+ int beginIndex = rnd.nextInt(text.length() - 1000);
+ String suffix = text.substring(beginIndex, beginIndex + 1 + rnd.nextInt(100));
+ int[] range = sa.getSuffixesWithPrefix(WildcardHelper.convertStringToBytes(suffix));
+ for (int pos = range[0]; pos < range[1]; pos++) {
+ assertTrue("Suffix should be found in text", getSuffix(sa, sa.index(pos)).startsWith(suffix));
+ }
+ if (range[0] > 0) {
+ int suffixStart = sa.index(range[0] - 1);
+ assertFalse("Suffix should not be found before range at " + (range[0] - 1), getSuffix(sa, suffixStart).startsWith(suffix));
+ }
+ if (range[1] < mainTextLength) {
+ int suffixStart = sa.index(range[1]);
+ assertFalse("Suffix should not be found after range at " + range[1], getSuffix(sa, suffixStart).startsWith(suffix));
+ }
+
+ }
+ assertEquals("{127} suffix should be positioned at the end", sa.getSuffixesWithPrefix(new byte[]{127, 127, 127, 127})[0], mainTextLength);
+ assertEquals("{-128} suffix should be positioned at 0", sa.getSuffixesWithPrefix(new byte[]{-128, -128, -128,-128})[0], 0);
+ }
+
+ private String getText(Random rnd, int mainTextLength, String letters) {
+ StringBuilder sb = new StringBuilder(mainTextLength);
+ for (int i = 0; i < mainTextLength; i++) {
+ sb.append(letters.charAt(rnd.nextInt(letters.length())));
+ }
+ for (int i = 0; i < WildcardHelper.SAFETY_PAD_SIZE; i++) {
+ sb.append("0");
+ }
+ return sb.toString();
+ }
+
+ private String getSuffix(final SuffixArrayBytes sa, final int suffixStart) {
+ final byte[] textBytes = sa.getText();
+ return new BytesRef(textBytes, suffixStart, textBytes.length - suffixStart - WildcardHelper.SAFETY_PAD_SIZE).utf8ToString();
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java
new file mode 100644
index 0000000..ec4c4b8
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/codecs/blocktree/TestWildcardHelper.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.blocktree;
+
+import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Tests for {@link WildcardHelper}.
+ */
+@RunWith(RandomizedRunner.class)
+public class TestWildcardHelper extends LuceneTestCase {
+
+ @Test
+ public void testSuffixArrayIntegration() throws Exception {
+ final Random rnd = RandomizedContext.current().getRandom();
+ int capacity = 1000000 + rnd.nextInt(1000000);
+ String letters = "qwertyuiopasdfghjklzxcvbnmйцукенгшщзхъфывапролджэячсмитьбю";
+ final List<String> words = new ArrayList<>();
+ final Set<String> allWordsSet = new HashSet<>();
+ StringBuilder wordBuilder = new StringBuilder();
+ for (int i = 0; i < capacity; i++) {
+ char ch = letters.charAt(rnd.nextInt(letters.length()));
+ wordBuilder.append(ch);
+ if (rnd.nextInt(10) == 1) {
+ String word = wordBuilder.toString();
+ if (allWordsSet.add(word)) {
+ words.add(word);
+ wordBuilder = new StringBuilder();
+ }
+ }
+ }
+
+ WildcardHelper wildcardHelper = new WildcardHelper(null, () ->
+ new BytesRefIterator() {
+ private int cur = -1;
+
+ @Override
+ public BytesRef next() throws IOException {
+ cur++;
+ return cur >= words.size() ? null : new BytesRef(words.get(cur));
+ }
+ });
+
+ int pos = 0;
+ for (int i = 0; i < words.size(); i++) {
+ String word = words.get(i);
+ int len = new BytesRef(word).length;
+ for (int j = 0; j < len; j++) {
+ assertEquals(pos + " points to wrong word", i, wildcardHelper.getWordIndex(pos));
+ pos++;
+ }
+ }
+ while (!wildcardHelper.isReady()) {
+ Thread.sleep(1000);
+ }
+ String substring = "";
+ for (int i = 0; i < 6; i++) {
+ char ch = letters.charAt(rnd.nextInt(letters.length()));
+ substring += ch;
+ Set<String> wordsWithSubstringSet = new HashSet<>();
+ Set<String> wordsEndsWithSubstringSet = new HashSet<>();
+ Set<String> wordsEndsWithSubstringQSet = new HashSet<>();
+ for (String word : words) {
+ if (word.contains(substring)) {
+ wordsWithSubstringSet.add(word);
+ }
+ if (word.endsWith(substring)) {
+ wordsEndsWithSubstringSet.add(word);
+ }
+ if (word.substring(0, word.length() -1).endsWith(substring)) {
+ wordsEndsWithSubstringQSet.add(word);
+ }
+ }
+ testCollection(wildcardHelper, "*" + substring + "*", wordsWithSubstringSet, substring);
+ testCollection(wildcardHelper, "*" + substring, wordsEndsWithSubstringSet, substring);
+ testCollection(wildcardHelper, "*" + substring + "?", wordsEndsWithSubstringQSet, substring);
+ }
+ String wildcard = "*" + substring.substring(0, 2) + "?" + substring.charAt(3);
+ Pattern pattern = WildcardHelper.wildcardToPattern(wildcard);
+ Set<String> matchedWords = words.stream().filter(word -> pattern.matcher(word).matches()).collect(Collectors.toSet());
+ testCollection(wildcardHelper, wildcard, matchedWords, substring.substring(0, 2));
+ }
+
+ @Test
+ public void testWildcardToPattern() throws Exception {
+ testPattern("g*ks", new String[] {"geeks", "g//.ks", "gks"}, new String[]{"gek", "eks", "geekst"});
+ testPattern("ge?ks*", new String[] {"geeksforgeeks", "geeks", "geoks76"}, new String[]{"geks", "geeeks", "ogeeks123"});
+ testPattern("g*k", new String[] {"geek", "gk", "gkkkk"}, new String[]{"sgk", "geeks", "get"});
+ testPattern("*pqrs", new String[] {"pqrs", "pqrpqrs", "pqropqrs"}, new String[]{"pqr", "opqrst", "pqrst"});
+ testPattern("abc*bcd", new String[] {"abcdhghgbcd", "abcbcd", "abcabcabcbcd"}, new String[]{"abcbcdbcdabc", "abqwertcbcd", "abcbabcabacd"});
+ testPattern("abc*c?d", new String[] {"abccod", "abccccd", "abcdcad"}, new String[]{"abccd", "cdabccd", "abc0cppd"});
+ testPattern("*c*d", new String[] {"abcd", "qwecd", "coood"}, new String[]{"cdcdcdt", "aaaaaaad", "ddddcccc"});
+ testPattern("*?c*d", new String[] {"abcd", "cccd", "qwcdcdcod"}, new String[]{"cod", "cd", "qcdwertcdq"});
+ testPattern("\\test.", new String[] {"\\test."}, new String[] {"test.", "\\test", "\\tes.", "\\test"});
+ }
+
+ private void testPattern(String query, String[] positive, String[] negative) {
+ Pattern pattern = WildcardHelper.wildcardToPattern(query);
+ for (String word : positive) {
+ assertTrue(word + " should match " + query, pattern.matcher(word).matches());
+ }
+ for (String word : negative) {
+ assertFalse(word + " shouldn't match " + query, pattern.matcher(word).matches());
+ }
+ }
+
+ private void testCollection(WildcardHelper wildcardHelper, String wildcardText, Set<String> matchingWordsSet, String substring) throws IOException {
+ List<String> matchingWords = new ArrayList<>(matchingWordsSet);
+ Collections.sort(matchingWords);
+ Collection<BytesRef> bytesRefs = wildcardHelper.getMatchingWords(wildcardText, substring);
+ int pos = 0;
+ for (BytesRef bytesRef : bytesRefs) {
+ assertEquals("Mismatch in list of words for " + wildcardText + " at position " + pos, bytesRef.utf8ToString(), matchingWords.get(pos));
+ pos++;
+ }
+ WildcardHelper.ListTermsEnum listTermsEnum = new WildcardHelper.ListTermsEnum(bytesRefs, null, null, null);
+ BytesRef term;
+ pos = 0;
+ while ((term = listTermsEnum.next()) != null) {
+ assertEquals("ListTermsEnum contains wrong word", matchingWords.get(pos), term.utf8ToString());
+ pos++;
+ }
+ assertEquals("ListTermsEnum is missing some words", matchingWords.size(), pos);
+ }
+}