| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.hunspell; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.function.BiConsumer; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.ByteArrayDataOutput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.IntSequenceOutputs; |
| |
| /** |
| * A data structure for memory-efficient word storage and fast lookup/enumeration. Each dictionary |
| * entry is stored as: |
| * |
| * <ol> |
| * <li>the last character |
| * <li>pointer to a similar entry for the prefix (all characters except the last one) |
| * <li>value data: a list of ints representing word flags and morphological data, and a pointer to |
| * hash collisions, if any |
| * </ol> |
| * |
| * There's only one entry for each prefix, so it's like a trie/{@link |
| * org.apache.lucene.util.fst.FST}, but a reversed one: each node points to a single previous node |
| * instead of several following ones. For example, "abc" and "abd" point to the same prefix entry |
| * "ab" which points to "a" which points to 0.<br> |
| * <br> |
| * The entries are stored in a contiguous byte array, identified by their offsets, using {@link |
| * DataOutput#writeVInt} ()} VINT} format for compression. |
| */ |
| class WordStorage { |
| private static final int OFFSET_BITS = 25; |
| private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1; |
| private static final int COLLISION_MASK = 0x40; |
| private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1; |
| |
| /** |
| * A map from word's hash (modulo array's length) into an int containing: |
| * |
| * <ul> |
| * <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this |
| * hash |
| * <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating |
| * whether there are other entries with the same hash, and the length of the entry in chars, |
| * or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits) |
| * </ul> |
| */ |
| private final int[] hashTable; |
| |
| /** |
| * An array of word entries: |
| * |
| * <ul> |
| * <li>VINT: the word's last character |
| * <li>VINT: a delta pointer to the entry for the same word without the last character. |
| * Precisely, it's the difference of this entry's start and the prefix's entry start. 0 for |
| * single-character entries |
| * <li>(Optional, for hash-colliding entries only) |
| * <ul> |
| * <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with |
| * the same hash |
| * <li>VINT: (delta) pointer to the previous entry |
| * </ul> |
| * <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link |
| * #lookupWord}, preceded by its length |
| * </ul> |
| */ |
| private final byte[] wordData; |
| |
| private WordStorage(int[] hashTable, byte[] wordData) { |
| this.hashTable = hashTable; |
| this.wordData = wordData; |
| } |
| |
| IntsRef lookupWord(char[] word, int offset, int length) { |
| assert length > 0; |
| |
| int hash = Math.abs(CharsRef.stringHashCode(word, offset, length) % hashTable.length); |
| int entryCode = hashTable[hash]; |
| if (entryCode == 0) { |
| return null; |
| } |
| |
| int pos = entryCode & OFFSET_MASK; |
| int mask = entryCode >>> OFFSET_BITS; |
| |
| char lastChar = word[offset + length - 1]; |
| ByteArrayDataInput in = new ByteArrayDataInput(wordData); |
| while (true) { |
| in.setPosition(pos); |
| char c = (char) in.readVInt(); |
| int prevPos = pos - in.readVInt(); |
| |
| boolean last = !hasCollision(mask); |
| boolean mightMatch = c == lastChar && hasLength(mask, length); |
| |
| if (!last) { |
| mask = in.readByte(); |
| pos -= in.readVInt(); |
| } |
| |
| if (mightMatch) { |
| int beforeForms = in.getPosition(); |
| if (isSameString(word, offset, length - 1, prevPos, in)) { |
| in.setPosition(beforeForms); |
| int formLength = in.readVInt(); |
| IntsRef forms = new IntsRef(formLength); |
| readForms(forms, in, formLength); |
| return forms; |
| } |
| } |
| |
| if (last) { |
| return null; |
| } |
| } |
| } |
| |
| private static boolean hasCollision(int mask) { |
| return (mask & COLLISION_MASK) != 0; |
| } |
| |
| /** |
| * Calls the processor for every dictionary entry with length between minLength and maxLength, |
| * both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can |
| * be modified in any way, but may not be saved for later by the processor |
| */ |
| void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) { |
| assert minLength <= maxLength; |
| CharsRef chars = new CharsRef(maxLength); |
| IntsRef forms = new IntsRef(); |
| ByteArrayDataInput in = new ByteArrayDataInput(wordData); |
| for (int entryCode : hashTable) { |
| int pos = entryCode & OFFSET_MASK; |
| int mask = entryCode >>> OFFSET_BITS; |
| |
| while (pos != 0) { |
| int wordStart = maxLength - 1; |
| |
| in.setPosition(pos); |
| chars.chars[wordStart] = (char) in.readVInt(); |
| int prevPos = pos - in.readVInt(); |
| |
| boolean last = !hasCollision(mask); |
| boolean mightMatch = hasLengthInRange(mask, minLength, maxLength); |
| |
| if (!last) { |
| mask = in.readByte(); |
| pos -= in.readVInt(); |
| } |
| |
| if (mightMatch) { |
| int dataLength = in.readVInt(); |
| if (forms.ints.length < dataLength) { |
| forms.ints = new int[dataLength]; |
| } |
| readForms(forms, in, dataLength); |
| while (prevPos != 0 && wordStart > 0) { |
| in.setPosition(prevPos); |
| chars.chars[--wordStart] = (char) in.readVInt(); |
| prevPos -= in.readVInt(); |
| } |
| |
| if (prevPos == 0) { |
| chars.offset = wordStart; |
| chars.length = maxLength - wordStart; |
| processor.accept(chars, forms); |
| } |
| } |
| |
| if (last) { |
| break; |
| } |
| } |
| } |
| } |
| |
| private boolean hasLength(int mask, int length) { |
| int lenCode = mask & MAX_STORED_LENGTH; |
| return lenCode == MAX_STORED_LENGTH ? length >= MAX_STORED_LENGTH : lenCode == length; |
| } |
| |
| private static boolean hasLengthInRange(int mask, int minLength, int maxLength) { |
| int lenCode = mask & MAX_STORED_LENGTH; |
| if (lenCode == MAX_STORED_LENGTH) { |
| return maxLength >= MAX_STORED_LENGTH; |
| } |
| return lenCode >= minLength && lenCode <= maxLength; |
| } |
| |
| private boolean isSameString( |
| char[] word, int offset, int length, int dataPos, ByteArrayDataInput in) { |
| for (int i = length - 1; i >= 0; i--) { |
| in.setPosition(dataPos); |
| char c = (char) in.readVInt(); |
| if (c != word[i + offset]) { |
| return false; |
| } |
| dataPos -= in.readVInt(); |
| if (dataPos == 0) { |
| return i == 0; |
| } |
| } |
| return length == 0 && dataPos == 0; |
| } |
| |
| private void readForms(IntsRef forms, ByteArrayDataInput in, int length) { |
| for (int i = 0; i < length; i++) { |
| forms.ints[i] = in.readVInt(); |
| } |
| forms.length = length; |
| } |
| |
| static class Builder { |
| private final boolean hasCustomMorphData; |
| private final int[] hashTable; |
| private byte[] wordData; |
| private final int[] chainLengths; |
| |
| private final IntsRefBuilder currentOrds = new IntsRefBuilder(); |
| private final List<char[]> group = new ArrayList<>(); |
| private final List<Integer> morphDataIDs = new ArrayList<>(); |
| private String currentEntry = null; |
| private final int wordCount; |
| private final FlagEnumerator flagEnumerator; |
| |
| private final ByteArrayDataOutput dataWriter; |
| private int commonPrefixLength, commonPrefixPos; |
| private int actualWords; |
| |
| /** |
| * @param wordCount an approximate number of the words in the resulting dictionary, used to |
| * pre-size the hash table. This argument can be a bit larger than the actual word count, |
| * but not smaller. |
| */ |
| Builder(int wordCount, boolean hasCustomMorphData, FlagEnumerator flagEnumerator) { |
| this.wordCount = wordCount; |
| this.flagEnumerator = flagEnumerator; |
| this.hasCustomMorphData = hasCustomMorphData; |
| |
| hashTable = new int[wordCount]; |
| wordData = new byte[wordCount * 6]; |
| |
| dataWriter = |
| new ByteArrayDataOutput(wordData) { |
| @Override |
| public void writeByte(byte b) { |
| int pos = getPosition(); |
| if (pos == wordData.length) { |
| wordData = ArrayUtil.grow(wordData); |
| reset(wordData, pos, wordData.length - pos); |
| } |
| super.writeByte(b); |
| } |
| }; |
| dataWriter.writeByte((byte) 0); // zero index is root, contains nothing |
| chainLengths = new int[hashTable.length]; |
| } |
| |
| /** |
| * Add a dictionary entry. This method should be called for entries sorted non-descending by |
| * {@link String#compareTo} rules. |
| */ |
| void add(String entry, char[] flags, int morphDataID) throws IOException { |
| if (!entry.equals(currentEntry)) { |
| if (currentEntry != null) { |
| if (entry.compareTo(currentEntry) < 0) { |
| throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); |
| } |
| int pos = flushGroup(); |
| |
| commonPrefixLength = GeneratingSuggester.commonPrefix(currentEntry, entry); |
| ByteArrayDataInput in = new ByteArrayDataInput(wordData); |
| in.setPosition(pos); |
| for (int i = currentEntry.length() - 1; i >= commonPrefixLength; i--) { |
| char c = (char) in.readVInt(); |
| assert c == currentEntry.charAt(i); |
| pos -= in.readVInt(); |
| in.setPosition(pos); |
| } |
| commonPrefixPos = pos; |
| } |
| currentEntry = entry; |
| } |
| |
| group.add(flags); |
| if (hasCustomMorphData) { |
| morphDataIDs.add(morphDataID); |
| } |
| } |
| |
| private int flushGroup() throws IOException { |
| if (++actualWords > wordCount) { |
| throw new RuntimeException("Don't add more words than wordCount!"); |
| } |
| |
| currentOrds.clear(); |
| boolean hasNonHidden = false; |
| for (char[] flags : group) { |
| if (!hasHiddenFlag(flags)) { |
| hasNonHidden = true; |
| break; |
| } |
| } |
| |
| for (int i = 0; i < group.size(); i++) { |
| char[] flags = group.get(i); |
| if (hasNonHidden && hasHiddenFlag(flags)) { |
| continue; |
| } |
| |
| currentOrds.append(flagEnumerator.add(flags)); |
| if (hasCustomMorphData) { |
| currentOrds.append(morphDataIDs.get(i)); |
| } |
| } |
| |
| // write the non-leaf entries for chars after the shared prefix, except the last one |
| int lastPos = commonPrefixPos; |
| for (int i = commonPrefixLength; i < currentEntry.length() - 1; i++) { |
| int pos = dataWriter.getPosition(); |
| dataWriter.writeVInt(currentEntry.charAt(i)); |
| dataWriter.writeVInt(pos - lastPos); |
| lastPos = pos; |
| } |
| |
| int pos = dataWriter.getPosition(); |
| if (pos >= 1 << OFFSET_BITS) { |
| throw new RuntimeException( |
| "Too much word data, please report this to dev@lucene.apache.org"); |
| } |
| int hash = Math.abs(currentEntry.hashCode() % hashTable.length); |
| int prevCode = hashTable[hash]; |
| |
| int mask = |
| (prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH); |
| hashTable[hash] = (mask << OFFSET_BITS) | pos; |
| |
| if (++chainLengths[hash] > 20) { |
| throw new RuntimeException( |
| "Too many collisions, please report this to dev@lucene.apache.org"); |
| } |
| |
| // write the leaf entry for the last character |
| dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1)); |
| dataWriter.writeVInt(pos - lastPos); |
| if (prevCode != 0) { |
| dataWriter.writeByte((byte) (prevCode >>> OFFSET_BITS)); |
| dataWriter.writeVInt(pos - (prevCode & OFFSET_MASK)); |
| } |
| IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter); |
| |
| group.clear(); |
| morphDataIDs.clear(); |
| return pos; |
| } |
| |
| private static boolean hasHiddenFlag(char[] flags) { |
| for (char flag : flags) { |
| if (flag == Dictionary.HIDDEN_FLAG) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| WordStorage build() throws IOException { |
| assert !group.isEmpty() : "build() should be only called once"; |
| flushGroup(); |
| return new WordStorage( |
| hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition())); |
| } |
| } |
| } |