| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.hunspell; |
| |
| import static org.apache.lucene.analysis.hunspell.AffixKind.*; |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedReader; |
| import java.io.ByteArrayInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.LineNumberReader; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CodingErrorAction; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeMap; |
| import java.util.stream.Collectors; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.OfflineSorter; |
| import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; |
| import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FSTCompiler; |
| import org.apache.lucene.util.fst.IntSequenceOutputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */ |
| public class Dictionary { |
| // Derived from woorm/LibreOffice dictionaries. |
| // See TestAllDictionaries.testMaxPrologueNeeded. |
| static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024; |
| |
| static final char[] NOFLAGS = new char[0]; |
| |
| static final char FLAG_UNSET = (char) 0; |
| private static final int DEFAULT_FLAGS = 65510; |
| static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell |
| |
| static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; |
| CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET); |
| |
| FST<IntsRef> prefixes; |
| FST<IntsRef> suffixes; |
| Breaks breaks = Breaks.DEFAULT; |
| |
| /** |
| * All condition checks used by prefixes and suffixes. these are typically re-used across many |
| * affix stripping rules. so these are deduplicated, to save RAM. |
| */ |
| ArrayList<AffixCondition> patterns = new ArrayList<>(); |
| |
| /** |
| * The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list |
| * for flagLookup. |
| */ |
| FST<IntsRef> words; |
| |
| /** A Bloom filter over {@link #words} to avoid unnecessary expensive FST traversals */ |
| FixedBitSet wordHashes; |
| |
| /** |
| * The list of unique flagsets (wordforms). theoretically huge, but practically small (for Polish |
| * this is 756), otherwise humans wouldn't be able to deal with it either. |
| */ |
| final FlagEnumerator.Lookup flagLookup; |
| |
| // the list of unique strip affixes. |
| char[] stripData; |
| int[] stripOffsets; |
| |
| String wordChars = ""; |
| |
| // 4 chars per affix, each char representing an unsigned 2-byte integer |
| char[] affixData = new char[32]; |
| private int currentAffix = 0; |
| |
| // offsets in affixData |
| static final int AFFIX_FLAG = 0; |
| static final int AFFIX_STRIP_ORD = 1; |
| private static final int AFFIX_CONDITION = 2; |
| static final int AFFIX_APPEND = 3; |
| |
| // Default flag parsing strategy |
| FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); |
| |
| // AF entries |
| private String[] aliases; |
| private int aliasCount = 0; |
| |
| // AM entries |
| private String[] morphAliases; |
| private int morphAliasCount = 0; |
| |
| final List<String> morphData = new ArrayList<>(Collections.singletonList("")); // empty data at 0 |
| |
| /** |
| * we set this during sorting, so we know to add an extra int (index in {@link #morphData}) to FST |
| * output |
| */ |
| boolean hasCustomMorphData; |
| |
| boolean ignoreCase; |
| boolean checkSharpS; |
| boolean complexPrefixes; |
| |
| /** |
| * All flags used in affix continuation classes. If an outer affix's flag isn't here, there's no |
| * need to do 2-level affix stripping with it. |
| */ |
| private char[] secondStagePrefixFlags, secondStageSuffixFlags; |
| |
| char circumfix; |
| char keepcase, forceUCase; |
| char needaffix; |
| char forbiddenword; |
| char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundFlag; |
| char compoundPermit, compoundForbid; |
| boolean checkCompoundCase, checkCompoundDup, checkCompoundRep; |
| boolean checkCompoundTriple, simplifiedTriple; |
| int compoundMin = 3, compoundMax = Integer.MAX_VALUE; |
| List<CompoundRule> compoundRules; // nullable |
| List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>(); |
| |
| // ignored characters (dictionary, affix, inputs) |
| private char[] ignore; |
| |
| String tryChars = ""; |
| String[] neighborKeyGroups = {"qwertyuiop", "asdfghjkl", "zxcvbnm"}; |
| boolean enableSplitSuggestions = true; |
| List<RepEntry> repTable = new ArrayList<>(); |
| List<List<String>> mapTable = new ArrayList<>(); |
| int maxDiff = 5; |
| int maxNGramSuggestions = 4; |
| boolean onlyMaxDiff; |
| char noSuggest, subStandard; |
| ConvTable iconv, oconv; |
| |
| // true if we can strip suffixes "down to nothing" |
| boolean fullStrip; |
| |
| // language declaration of the dictionary |
| String language; |
| // true if case algorithms should use alternate (Turkish/Azeri) mapping |
| private boolean alternateCasing; |
| |
| /** |
| * Creates a new Dictionary containing the information read from the provided InputStreams to |
| * hunspell affix and dictionary files. You have to close the provided InputStreams yourself. |
| * |
| * <p>This is a convenience overload for a single dictionary file: it delegates to the |
| * list-based constructor with a one-element list and {@code ignoreCase} set to {@code false}. |
| * |
| * @param tempDir Directory to use for offline sorting |
| * @param tempFileNamePrefix prefix to use to generate temp file names |
| * @param affix InputStream for reading the hunspell affix file (won't be closed). |
| * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). |
| * @throws IOException Can be thrown while reading from the InputStreams |
| * @throws ParseException Can be thrown if the content of the files does not meet expected formats |
| */ |
| public Dictionary( |
| Directory tempDir, String tempFileNamePrefix, InputStream affix, InputStream dictionary) |
| throws IOException, ParseException { |
| this(tempDir, tempFileNamePrefix, affix, Collections.singletonList(dictionary), false); |
| } |
| |
/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to
 * hunspell affix and dictionary files. You have to close the provided InputStreams yourself.
 *
 * <p>The affix file is read in two passes: the first pass scans a prefetched prologue of at most
 * {@link #MAX_PROLOGUE_SCAN_WINDOW} bytes for the SET and FLAG directives, the second pass parses
 * the whole file with the decoder selected by the first pass. The dictionary files are then
 * merged, sorted offline in {@code tempDir} and compiled into the word FST.
 *
 * @param tempDir Directory to use for offline sorting
 * @param tempFileNamePrefix prefix to use to generate temp file names
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @param ignoreCase value stored in the {@code ignoreCase} field before any parsing starts; how
 *     it is applied is decided by code outside this constructor
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(
    Directory tempDir,
    String tempFileNamePrefix,
    InputStream affix,
    List<InputStream> dictionaries,
    boolean ignoreCase)
    throws IOException, ParseException {
  this.ignoreCase = ignoreCase;

  try (BufferedInputStream affixStream =
      new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
        @Override
        public void close() {
          // TODO: maybe we should consume and close it? Why does it need to stay open?
          // Don't close the affix stream as per javadoc.
        }
      }) {
    // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
    // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
    // any such exotic examples.
    Charset streamCharset;
    if (maybeConsume(affixStream, BOM_UTF8)) {
      streamCharset = StandardCharsets.UTF_8;
    } else {
      streamCharset = DEFAULT_CHARSET;
    }

    /*
     * pass 1: look for encoding & flag. This is simple but works. We just prefetch
     * a large enough chunk of the input and scan through it. The buffered data will
     * be subsequently reused anyway so nothing is wasted.
     */
    affixStream.mark(MAX_PROLOGUE_SCAN_WINDOW);
    byte[] prologue = affixStream.readNBytes(MAX_PROLOGUE_SCAN_WINDOW - 1);
    affixStream.reset();
    readConfig(new ByteArrayInputStream(prologue), streamCharset);

    // pass 2: parse affixes
    FlagEnumerator flagEnumerator = new FlagEnumerator();
    readAffixFile(affixStream, decoder, flagEnumerator);

    // read dictionary entries
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    int wordCount = mergeDictionaries(dictionaries, decoder, unsorted);

    // Size the Bloom filter at the largest power of two not exceeding 10 bits per word. The
    // arithmetic is done in long and clamped to [1, 2^30]: a zero word count must not produce an
    // empty bit set (lookupWord takes the hash modulo the bit set length), and a huge word count
    // must not overflow int into a negative size.
    long filterBits = Long.highestOneBit(Math.max(1L, wordCount * 10L));
    wordHashes = new FixedBitSet((int) Math.min(filterBits, 1L << 30));

    String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
    words = readSortedDictionaries(tempDir, sortedFile, flagEnumerator);
    flagLookup = flagEnumerator.finish();
    aliases = null; // no longer needed
    morphAliases = null; // no longer needed
  }
}
| |
| int formStep() { |
| return hasCustomMorphData ? 2 : 1; |
| } |
| |
| /** Looks up Hunspell word forms from the dictionary */ |
| IntsRef lookupWord(char[] word, int offset, int length) { |
| int hash = CharsRef.stringHashCode(word, offset, length); |
| if (!wordHashes.get(Math.abs(hash) % wordHashes.length())) { |
| return null; |
| } |
| |
| return lookup(words, word, offset, length); |
| } |
| |
| // only for testing |
| IntsRef lookupPrefix(char[] word) { |
| return lookup(prefixes, word, 0, word.length); |
| } |
| |
| // only for testing |
| IntsRef lookupSuffix(char[] word) { |
| return lookup(suffixes, word, 0, word.length); |
| } |
| |
| IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) { |
| if (fst == null) { |
| return null; |
| } |
| final FST.BytesReader bytesReader = fst.getBytesReader(); |
| final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>()); |
| // Accumulate output as we go |
| IntsRef output = fst.outputs.getNoOutput(); |
| |
| int l = offset + length; |
| for (int i = offset, cp; i < l; i += Character.charCount(cp)) { |
| cp = Character.codePointAt(word, i, l); |
| output = nextArc(fst, arc, bytesReader, output, cp); |
| if (output == null) { |
| return null; |
| } |
| } |
| return nextArc(fst, arc, bytesReader, output, FST.END_LABEL); |
| } |
| |
| static IntsRef nextArc( |
| FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) { |
| try { |
| if (fst.findTargetArc(ch, arc, arc, reader) == null) { |
| return null; |
| } |
| } catch (IOException bogus) { |
| throw new RuntimeException(bogus); |
| } |
| return fst.outputs.add(output, arc.output()); |
| } |
| |
| /** |
| * Reads the affix file through the provided InputStream, building up the prefix and suffix maps |
| * |
| * @param affixStream InputStream to read the content of the affix file from |
| * @param decoder CharsetDecoder to decode the content of the file |
| * @throws IOException Can be thrown while reading from the InputStream |
| */ |
| private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, FlagEnumerator flags) |
| throws IOException, ParseException { |
| TreeMap<String, List<Integer>> prefixes = new TreeMap<>(); |
| TreeMap<String, List<Integer>> suffixes = new TreeMap<>(); |
| Set<Character> prefixContFlags = new HashSet<>(); |
| Set<Character> suffixContFlags = new HashSet<>(); |
| Map<String, Integer> seenPatterns = new HashMap<>(); |
| |
| // zero condition -> 0 ord |
| seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0); |
| patterns.add(null); |
| |
| // zero strip -> 0 ord |
| Map<String, Integer> seenStrips = new LinkedHashMap<>(); |
| seenStrips.put("", 0); |
| |
| LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); |
| String line; |
| while ((line = reader.readLine()) != null) { |
| // ignore any BOM marker on first line |
| if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) { |
| line = line.substring(1); |
| } |
| line = line.trim(); |
| if (line.isEmpty()) continue; |
| |
| String firstWord = line.split("\\s")[0]; |
| // TODO: convert to a switch? |
| if ("AF".equals(firstWord)) { |
| parseAlias(line); |
| } else if ("AM".equals(firstWord)) { |
| parseMorphAlias(line); |
| } else if ("PFX".equals(firstWord)) { |
| parseAffix( |
| prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags); |
| } else if ("SFX".equals(firstWord)) { |
| parseAffix( |
| suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags); |
| } else if (line.equals("COMPLEXPREFIXES")) { |
| complexPrefixes = |
| true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix |
| } else if ("CIRCUMFIX".equals(firstWord)) { |
| circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("KEEPCASE".equals(firstWord)) { |
| keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("FORCEUCASE".equals(firstWord)) { |
| forceUCase = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) { |
| needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("ONLYINCOMPOUND".equals(firstWord)) { |
| onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("CHECKSHARPS".equals(firstWord)) { |
| checkSharpS = true; |
| } else if ("IGNORE".equals(firstWord)) { |
| ignore = singleArgument(reader, line).toCharArray(); |
| Arrays.sort(ignore); |
| } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) { |
| int num = parseNum(reader, line); |
| ConvTable res = parseConversions(reader, num); |
| if (line.startsWith("I")) { |
| iconv = res; |
| } else { |
| oconv = res; |
| } |
| } else if ("FULLSTRIP".equals(firstWord)) { |
| fullStrip = true; |
| } else if ("LANG".equals(firstWord)) { |
| language = singleArgument(reader, line); |
| this.alternateCasing = hasLanguage("tr", "az"); |
| } else if ("BREAK".equals(firstWord)) { |
| breaks = parseBreaks(reader, line); |
| } else if ("WORDCHARS".equals(firstWord)) { |
| wordChars = firstArgument(reader, line); |
| } else if ("TRY".equals(firstWord)) { |
| tryChars = firstArgument(reader, line); |
| } else if ("REP".equals(firstWord)) { |
| int count = parseNum(reader, line); |
| for (int i = 0; i < count; i++) { |
| String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE); |
| repTable.add(new RepEntry(parts[1], parts[2])); |
| } |
| } else if ("MAP".equals(firstWord)) { |
| int count = parseNum(reader, line); |
| for (int i = 0; i < count; i++) { |
| mapTable.add(parseMapEntry(reader, reader.readLine())); |
| } |
| } else if ("KEY".equals(firstWord)) { |
| neighborKeyGroups = singleArgument(reader, line).split("\\|"); |
| } else if ("NOSPLITSUGS".equals(firstWord)) { |
| enableSplitSuggestions = false; |
| } else if ("MAXNGRAMSUGS".equals(firstWord)) { |
| maxNGramSuggestions = Integer.parseInt(singleArgument(reader, line)); |
| } else if ("MAXDIFF".equals(firstWord)) { |
| int i = Integer.parseInt(singleArgument(reader, line)); |
| if (i < 0 || i > 10) { |
| throw new ParseException("MAXDIFF should be between 0 and 10", reader.getLineNumber()); |
| } |
| maxDiff = i; |
| } else if ("ONLYMAXDIFF".equals(firstWord)) { |
| onlyMaxDiff = true; |
| } else if ("FORBIDDENWORD".equals(firstWord)) { |
| forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("NOSUGGEST".equals(firstWord)) { |
| noSuggest = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("SUBSTANDARD".equals(firstWord)) { |
| subStandard = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDMIN".equals(firstWord)) { |
| compoundMin = Math.max(1, parseNum(reader, line)); |
| } else if ("COMPOUNDWORDMAX".equals(firstWord)) { |
| compoundMax = Math.max(1, parseNum(reader, line)); |
| } else if ("COMPOUNDRULE".equals(firstWord)) { |
| compoundRules = parseCompoundRules(reader, parseNum(reader, line)); |
| } else if ("COMPOUNDFLAG".equals(firstWord)) { |
| compoundFlag = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDBEGIN".equals(firstWord)) { |
| compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDMIDDLE".equals(firstWord)) { |
| compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDEND".equals(firstWord)) { |
| compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) { |
| compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("COMPOUNDFORBIDFLAG".equals(firstWord)) { |
| compoundForbid = flagParsingStrategy.parseFlag(singleArgument(reader, line)); |
| } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) { |
| checkCompoundCase = true; |
| } else if ("CHECKCOMPOUNDDUP".equals(firstWord)) { |
| checkCompoundDup = true; |
| } else if ("CHECKCOMPOUNDREP".equals(firstWord)) { |
| checkCompoundRep = true; |
| } else if ("CHECKCOMPOUNDTRIPLE".equals(firstWord)) { |
| checkCompoundTriple = true; |
| } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) { |
| simplifiedTriple = true; |
| } else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) { |
| int count = parseNum(reader, line); |
| for (int i = 0; i < count; i++) { |
| checkCompoundPatterns.add( |
| new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this)); |
| } |
| } else if ("SET".equals(firstWord)) { |
| checkCriticalDirectiveSame( |
| "SET", reader, decoder.charset(), getDecoder(singleArgument(reader, line)).charset()); |
| } else if ("FLAG".equals(firstWord)) { |
| FlagParsingStrategy strategy = getFlagParsingStrategy(line, decoder.charset()); |
| checkCriticalDirectiveSame( |
| "FLAG", reader, flagParsingStrategy.getClass(), strategy.getClass()); |
| } |
| } |
| |
| this.prefixes = affixFST(prefixes); |
| this.suffixes = affixFST(suffixes); |
| secondStagePrefixFlags = toSortedCharArray(prefixContFlags); |
| secondStageSuffixFlags = toSortedCharArray(suffixContFlags); |
| |
| int totalChars = 0; |
| for (String strip : seenStrips.keySet()) { |
| totalChars += strip.length(); |
| } |
| stripData = new char[totalChars]; |
| stripOffsets = new int[seenStrips.size() + 1]; |
| int currentOffset = 0; |
| int currentIndex = 0; |
| for (String strip : seenStrips.keySet()) { |
| stripOffsets[currentIndex++] = currentOffset; |
| strip.getChars(0, strip.length(), stripData, currentOffset); |
| currentOffset += strip.length(); |
| } |
| assert currentIndex == seenStrips.size(); |
| stripOffsets[currentIndex] = currentOffset; |
| } |
| |
| private void checkCriticalDirectiveSame( |
| String directive, LineNumberReader reader, Object expected, Object actual) |
| throws ParseException { |
| if (!expected.equals(actual)) { |
| throw new ParseException( |
| directive |
| + " directive should occur at most once, and in the first " |
| + MAX_PROLOGUE_SCAN_WINDOW |
| + " bytes of the *.aff file", |
| reader.getLineNumber()); |
| } |
| } |
| |
| private List<String> parseMapEntry(LineNumberReader reader, String line) throws ParseException { |
| String unparsed = firstArgument(reader, line); |
| List<String> mapEntry = new ArrayList<>(); |
| for (int j = 0; j < unparsed.length(); j++) { |
| if (unparsed.charAt(j) == '(') { |
| int closing = unparsed.indexOf(')', j); |
| if (closing < 0) { |
| throw new ParseException("Unclosed parenthesis: " + line, reader.getLineNumber()); |
| } |
| |
| mapEntry.add(unparsed.substring(j + 1, closing)); |
| j = closing; |
| } else { |
| mapEntry.add(String.valueOf(unparsed.charAt(j))); |
| } |
| } |
| return mapEntry; |
| } |
| |
| boolean hasLanguage(String... langCodes) { |
| if (language == null) return false; |
| String langCode = extractLanguageCode(language); |
| for (String code : langCodes) { |
| if (langCode.equals(code)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * @param root a string to look up in the dictionary. No case conversion or affix removal is |
| * performed. To get the possible roots of any word, you may call {@link |
| * Hunspell#getRoots(String)} |
| * @return the dictionary entries for the given root, or {@code null} if there's none |
| */ |
| public DictEntries lookupEntries(String root) { |
| IntsRef forms = lookupWord(root.toCharArray(), 0, root.length()); |
| if (forms == null) return null; |
| |
| return new DictEntries() { |
| @Override |
| public int size() { |
| return forms.length / (hasCustomMorphData ? 2 : 1); |
| } |
| |
| @Override |
| public String getMorphologicalData(int entryIndex) { |
| if (!hasCustomMorphData) return ""; |
| return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]); |
| } |
| |
| @Override |
| public List<String> getMorphologicalValues(int entryIndex, String key) { |
| assert key.length() == 3; |
| assert key.charAt(2) == ':'; |
| |
| String fields = getMorphologicalData(entryIndex); |
| if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList(); |
| |
| return Arrays.stream(fields.split(" ")) |
| .filter(s -> s.startsWith(key)) |
| .map(s -> s.substring(3)) |
| .collect(Collectors.toList()); |
| } |
| }; |
| } |
| |
| static String extractLanguageCode(String isoCode) { |
| int underscore = isoCode.indexOf("_"); |
| return underscore < 0 ? isoCode : isoCode.substring(0, underscore); |
| } |
| |
| private int parseNum(LineNumberReader reader, String line) throws ParseException { |
| return Integer.parseInt(splitBySpace(reader, line, 2, Integer.MAX_VALUE)[1]); |
| } |
| |
| private String singleArgument(LineNumberReader reader, String line) throws ParseException { |
| return splitBySpace(reader, line, 2)[1]; |
| } |
| |
| private String firstArgument(LineNumberReader reader, String line) throws ParseException { |
| return splitBySpace(reader, line, 2, Integer.MAX_VALUE)[1]; |
| } |
| |
| private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts) |
| throws ParseException { |
| return splitBySpace(reader, line, expectedParts, expectedParts); |
| } |
| |
| private String[] splitBySpace(LineNumberReader reader, String line, int minParts, int maxParts) |
| throws ParseException { |
| String[] parts = line.split("\\s+"); |
| if (parts.length < minParts || parts.length > maxParts && !parts[maxParts].startsWith("#")) { |
| throw new ParseException("Invalid syntax: " + line, reader.getLineNumber()); |
| } |
| return parts; |
| } |
| |
| private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num) |
| throws IOException, ParseException { |
| List<CompoundRule> compoundRules = new ArrayList<>(); |
| for (int i = 0; i < num; i++) { |
| compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this)); |
| } |
| return compoundRules; |
| } |
| |
| private Breaks parseBreaks(LineNumberReader reader, String line) |
| throws IOException, ParseException { |
| Set<String> starting = new LinkedHashSet<>(); |
| Set<String> ending = new LinkedHashSet<>(); |
| Set<String> middle = new LinkedHashSet<>(); |
| int num = parseNum(reader, line); |
| for (int i = 0; i < num; i++) { |
| String breakStr = singleArgument(reader, reader.readLine()); |
| if (breakStr.startsWith("^")) { |
| starting.add(breakStr.substring(1)); |
| } else if (breakStr.endsWith("$")) { |
| ending.add(breakStr.substring(0, breakStr.length() - 1)); |
| } else { |
| middle.add(breakStr); |
| } |
| } |
| return new Breaks(starting, ending, middle); |
| } |
| |
| private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException { |
| IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); |
| FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) { |
| Util.toUTF32(entry.getKey(), scratch); |
| List<Integer> entries = entry.getValue(); |
| IntsRef output = new IntsRef(entries.size()); |
| for (Integer c : entries) { |
| output.ints[output.length++] = c; |
| } |
| fstCompiler.add(scratch.get(), output); |
| } |
| return fstCompiler.compile(); |
| } |
| |
| /** |
| * Parses a specific affix rule putting the result into the provided affix map |
| * |
| * @param affixes Map where the result of the parsing will be put |
| * @param header Header line of the affix rule |
| * @param reader BufferedReader to read the content of the rule from |
| * @param seenPatterns map from condition -> index of patterns, for deduplication. |
| * @throws IOException Can be thrown while reading the rule |
| */ |
| private void parseAffix( |
| TreeMap<String, List<Integer>> affixes, |
| Set<Character> secondStageFlags, |
| String header, |
| LineNumberReader reader, |
| AffixKind kind, |
| Map<String, Integer> seenPatterns, |
| Map<String, Integer> seenStrips, |
| FlagEnumerator flags) |
| throws IOException, ParseException { |
| |
| StringBuilder sb = new StringBuilder(); |
| String[] args = header.split("\\s+"); |
| |
| boolean crossProduct = args[2].equals("Y"); |
| |
| int numLines; |
| try { |
| numLines = Integer.parseInt(args[3]); |
| } catch (NumberFormatException e) { |
| return; |
| } |
| affixData = ArrayUtil.grow(affixData, currentAffix * 4 + numLines * 4); |
| |
| for (int i = 0; i < numLines; i++) { |
| String line = reader.readLine(); |
| // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] |
| String[] ruleArgs = splitBySpace(reader, line, 4, Integer.MAX_VALUE); |
| |
| char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); |
| String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; |
| String affixArg = ruleArgs[3]; |
| char[] appendFlags = null; |
| |
| // first: parse continuation classes out of affix |
| int flagSep = affixArg.lastIndexOf('/'); |
| if (flagSep != -1) { |
| String flagPart = affixArg.substring(flagSep + 1); |
| affixArg = affixArg.substring(0, flagSep); |
| |
| if (aliasCount > 0) { |
| flagPart = getAliasValue(Integer.parseInt(flagPart)); |
| } |
| |
| appendFlags = flagParsingStrategy.parseFlags(flagPart); |
| for (char appendFlag : appendFlags) { |
| secondStageFlags.add(appendFlag); |
| } |
| } |
| // zero affix -> empty string |
| if ("0".equals(affixArg)) { |
| affixArg = ""; |
| } |
| |
| String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; |
| String key = AffixCondition.uniqueKey(kind, strip, condition); |
| |
| // deduplicate patterns |
| Integer patternIndex = seenPatterns.get(key); |
| if (patternIndex == null) { |
| patternIndex = patterns.size(); |
| if (patternIndex > Short.MAX_VALUE) { |
| throw new UnsupportedOperationException( |
| "Too many patterns, please report this to dev@lucene.apache.org"); |
| } |
| seenPatterns.put(key, patternIndex); |
| patterns.add(AffixCondition.compile(kind, strip, condition, line)); |
| } |
| |
| Integer stripOrd = seenStrips.get(strip); |
| if (stripOrd == null) { |
| stripOrd = seenStrips.size(); |
| seenStrips.put(strip, stripOrd); |
| if (stripOrd > Character.MAX_VALUE) { |
| throw new UnsupportedOperationException( |
| "Too many unique strips, please report this to dev@lucene.apache.org"); |
| } |
| } |
| |
| if (appendFlags == null) { |
| appendFlags = NOFLAGS; |
| } |
| |
| int appendFlagsOrd = flags.add(appendFlags); |
| if (appendFlagsOrd < 0) { |
| // already exists in our hash |
| appendFlagsOrd = (-appendFlagsOrd) - 1; |
| } else if (appendFlagsOrd > Short.MAX_VALUE) { |
| // this limit is probably flexible, but it's a good sanity check too |
| throw new UnsupportedOperationException( |
| "Too many unique append flags, please report this to dev@lucene.apache.org"); |
| } |
| |
| int dataStart = currentAffix * 4; |
| affixData[dataStart + AFFIX_FLAG] = flag; |
| affixData[dataStart + AFFIX_STRIP_ORD] = (char) stripOrd.intValue(); |
| // encode crossProduct into patternIndex |
| int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0); |
| affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd; |
| affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd; |
| |
| if (needsInputCleaning(affixArg)) { |
| affixArg = cleanInput(affixArg, sb).toString(); |
| } |
| |
| if (kind == SUFFIX) { |
| affixArg = new StringBuilder(affixArg).reverse().toString(); |
| } |
| |
| affixes.computeIfAbsent(affixArg, __ -> new ArrayList<>()).add(currentAffix); |
| currentAffix++; |
| } |
| } |
| |
| char affixData(int affixIndex, int offset) { |
| return affixData[affixIndex * 4 + offset]; |
| } |
| |
| boolean isCrossProduct(int affix) { |
| return (affixData(affix, AFFIX_CONDITION) & 1) == 1; |
| } |
| |
| int getAffixCondition(int affix) { |
| return affixData(affix, AFFIX_CONDITION) >>> 1; |
| } |
| |
| private ConvTable parseConversions(LineNumberReader reader, int num) |
| throws IOException, ParseException { |
| TreeMap<String, String> mappings = new TreeMap<>(); |
| |
| for (int i = 0; i < num; i++) { |
| String[] parts = splitBySpace(reader, reader.readLine(), 3); |
| if (mappings.put(parts[1], parts[2]) != null) { |
| throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); |
| } |
| } |
| |
| return new ConvTable(mappings); |
| } |
| |
| private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf}; |
| |
| /** Parses the encoding and flag format specified in the provided InputStream */ |
| private void readConfig(InputStream stream, Charset streamCharset) |
| throws IOException, ParseException { |
| LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset)); |
| String line; |
| String flagLine = null; |
| boolean charsetFound = false; |
| boolean flagFound = false; |
| while ((line = reader.readLine()) != null) { |
| if (line.isBlank()) continue; |
| |
| String firstWord = line.split("\\s")[0]; |
| if ("SET".equals(firstWord)) { |
| decoder = getDecoder(singleArgument(reader, line)); |
| charsetFound = true; |
| } else if ("FLAG".equals(firstWord)) { |
| // Preserve the flag line for parsing later since we need the decoder's charset |
| // and just in case they come out of order. |
| flagLine = line; |
| flagFound = true; |
| } else { |
| continue; |
| } |
| |
| if (charsetFound && flagFound) { |
| break; |
| } |
| } |
| |
| if (flagFound) { |
| flagParsingStrategy = getFlagParsingStrategy(flagLine, decoder.charset()); |
| } |
| } |
| |
| /** |
| * Consume the provided byte sequence in full, if present. Otherwise leave the input stream |
| * intact. |
| * |
| * @return {@code true} if the sequence matched and has been consumed. |
| */ |
| @SuppressWarnings("SameParameterValue") |
| private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException { |
| stream.mark(bytes.length); |
| for (byte b : bytes) { |
| int nextByte = stream.read(); |
| if (nextByte != (b & 0xff)) { // covers EOF (-1) as well. |
| stream.reset(); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
/**
 * Encoding names that {@code getDecoder} replaces by the mapped name before calling {@code
 * Charset.forName}.
 */
static final Map<String, String> CHARSET_ALIASES =
    Map.of("microsoft-cp1251", "windows-1251", "TIS620-2533", "TIS-620");
| |
| /** |
| * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think |
| * ISCII-DEVANAGARI and MICROSOFT-CP1251 etc are allowed... |
| * |
| * @param encoding Encoding to retrieve the CharsetDecoder for |
| * @return CharSetDecoder for the given encoding |
| */ |
| private CharsetDecoder getDecoder(String encoding) { |
| if ("ISO8859-14".equals(encoding)) { |
| return new ISO8859_14Decoder(); |
| } |
| String canon = CHARSET_ALIASES.get(encoding); |
| if (canon != null) { |
| encoding = canon; |
| } |
| return replacingDecoder(Charset.forName(encoding)); |
| } |
| |
| private static CharsetDecoder replacingDecoder(Charset charset) { |
| return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE); |
| } |
| |
| /** |
| * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken |
| * from the affix file |
| * |
| * @param flagLine Line containing the flag information |
| * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG |
| * definition |
| */ |
| static FlagParsingStrategy getFlagParsingStrategy(String flagLine, Charset charset) { |
| String[] parts = flagLine.split("\\s+"); |
| if (parts.length != 2) { |
| throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine); |
| } |
| String flagType = parts[1]; |
| |
| if ("num".equals(flagType)) { |
| return new NumFlagParsingStrategy(); |
| } else if ("UTF-8".equals(flagType)) { |
| if (DEFAULT_CHARSET.equals(charset)) { |
| return new DefaultAsUtf8FlagParsingStrategy(); |
| } |
| return new SimpleFlagParsingStrategy(); |
| } else if ("long".equals(flagType)) { |
| return new DoubleASCIIFlagParsingStrategy(); |
| } |
| |
| throw new IllegalArgumentException("Unknown flag type: " + flagType); |
| } |
| |
// Control characters that unescapeEntry writes into an entry: FLAG_SEPARATOR replaces an
// unescaped '/', MORPH_SEPARATOR is appended where the entry ends and morph data may begin.
private static final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
private static final char MORPH_SEPARATOR =
    0x1e; // separator for boundary of entry (may be followed by morph data)
| |
| private String unescapeEntry(String entry) { |
| StringBuilder sb = new StringBuilder(); |
| int end = morphBoundary(entry); |
| for (int i = 0; i < end; i++) { |
| char ch = entry.charAt(i); |
| if (ch == '\\' && i + 1 < entry.length()) { |
| sb.append(entry.charAt(i + 1)); |
| i++; |
| } else if (ch == '/' && i > 0) { |
| sb.append(FLAG_SEPARATOR); |
| } else if (!shouldSkipEscapedChar(ch)) { |
| sb.append(ch); |
| } |
| } |
| sb.append(MORPH_SEPARATOR); |
| if (end < entry.length()) { |
| for (int i = end; i < entry.length(); i++) { |
| char c = entry.charAt(i); |
| if (!shouldSkipEscapedChar(c)) { |
| sb.append(c); |
| } |
| } |
| } |
| return sb.toString(); |
| } |
| |
| private static boolean shouldSkipEscapedChar(char ch) { |
| return ch == FLAG_SEPARATOR |
| || ch == MORPH_SEPARATOR; // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!! |
| } |
| |
| private static int morphBoundary(String line) { |
| int end = indexOfSpaceOrTab(line, 0); |
| if (end == -1) { |
| return line.length(); |
| } |
| while (end >= 0 && end < line.length()) { |
| if (line.charAt(end) == '\t' |
| || end > 0 |
| && end + 3 < line.length() |
| && Character.isLetter(line.charAt(end + 1)) |
| && Character.isLetter(line.charAt(end + 2)) |
| && line.charAt(end + 3) == ':') { |
| break; |
| } |
| end = indexOfSpaceOrTab(line, end + 1); |
| } |
| if (end == -1) { |
| return line.length(); |
| } |
| return end; |
| } |
| |
| static int indexOfSpaceOrTab(String text, int start) { |
| int pos1 = text.indexOf('\t', start); |
| int pos2 = text.indexOf(' ', start); |
| if (pos1 >= 0 && pos2 >= 0) { |
| return Math.min(pos1, pos2); |
| } else { |
| return Math.max(pos1, pos2); |
| } |
| } |
| |
/**
 * Reads every dictionary stream with the given decoder and writes its unescaped entries to
 * {@code output}, followed by a codec footer.
 *
 * <p>As a side effect, {@code hasCustomMorphData} is set as soon as an entry carries a
 * morphological field that does not start with {@code ph:}.
 *
 * @param dictionaries streams to read; the readers wrapped around them are not closed here
 * @param decoder decoder used to turn each stream into text
 * @param output destination of the entries
 * @return the sum of the counts returned by {@code writeNormalizedWordEntry}
 */
private int mergeDictionaries(
    List<InputStream> dictionaries, CharsetDecoder decoder, IndexOutput output)
    throws IOException {
  // scratch buffer shared by all writeNormalizedWordEntry calls
  StringBuilder sb = new StringBuilder();
  int wordCount = 0;
  try (ByteSequencesWriter writer = new ByteSequencesWriter(output)) {
    for (InputStream dictionary : dictionaries) {
      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
      lines.readLine(); // first line is number of entries (approximately, sometimes)

      String line;
      while ((line = lines.readLine()) != null) {
        // wild and unpredictable code comment rules
        if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '\t') {
          continue;
        }
        // from here on the line contains MORPH_SEPARATOR (unescapeEntry always appends one)
        line = unescapeEntry(line);
        // if we haven't seen any custom morphological data, try to parse one
        if (!hasCustomMorphData) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0 && morphStart < line.length()) {
            String data = line.substring(morphStart + 1);
            // "ph:" fields do not count as custom morphological data
            hasCustomMorphData =
                splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
          }
        }

        wordCount += writeNormalizedWordEntry(sb, writer, line);
      }
    }
    // the footer is written while the writer is still open
    CodecUtil.writeFooter(output);
  }
  return wordCount;
}
| |
/**
 * Writes one unescaped dictionary line, cleaning its word part if needed, and possibly a second
 * hidden capitalized variant of it.
 *
 * @param reuse scratch buffer; its previous content is discarded when it is used
 * @param writer destination of the UTF-8 encoded entries
 * @param line entry as produced by {@code unescapeEntry}
 * @return the number of word entries written
 */
private int writeNormalizedWordEntry(StringBuilder reuse, ByteSequencesWriter writer, String line)
    throws IOException {
  int flagSep = line.indexOf(FLAG_SEPARATOR);
  int morphSep = line.indexOf(MORPH_SEPARATOR);
  assert morphSep > 0;
  assert morphSep > flagSep;
  // the word ends at the flag separator if there is one, otherwise at the morph separator
  int sep = flagSep < 0 ? morphSep : flagSep;
  // nothing before the separator: no word, nothing is written
  if (sep == 0) return 0;

  CharSequence toWrite;
  String beforeSep = line.substring(0, sep);
  if (needsInputCleaning(beforeSep)) {
    // cleanInput resets reuse and fills it with the cleaned word; the rest of the line
    // (separator included) is then appended unchanged
    cleanInput(beforeSep, reuse);
    reuse.append(line, sep, line.length());
    toWrite = reuse;
  } else {
    toWrite = line;
  }

  String written = toWrite.toString();
  // cleaning may have changed the word length; the part after the word did not change,
  // so the new separator position is measured from the end
  sep = written.length() - (line.length() - sep);
  writer.write(written.getBytes(StandardCharsets.UTF_8));

  WordCase wordCase = WordCase.caseOf(written, sep);
  // MIXED words, and UPPER words when flagSep > 0, get an additional hidden capitalized entry
  if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
    addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
    return 2;
  }
  return 1;
}
| |
| private void addHiddenCapitalizedWord( |
| StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep) |
| throws IOException { |
| reuse.setLength(0); |
| reuse.append(Character.toUpperCase(word.charAt(0))); |
| for (int i = 1; i < word.length(); i++) { |
| reuse.append(caseFold(word.charAt(i))); |
| } |
| reuse.append(FLAG_SEPARATOR); |
| reuse.append(HIDDEN_FLAG); |
| reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); |
| writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); |
| } |
| |
| String toLowerCase(String word) { |
| char[] chars = new char[word.length()]; |
| for (int i = 0; i < word.length(); i++) { |
| chars[i] = caseFold(word.charAt(i)); |
| } |
| return new String(chars); |
| } |
| |
| String toTitleCase(String word) { |
| char[] chars = new char[word.length()]; |
| chars[0] = Character.toUpperCase(word.charAt(0)); |
| for (int i = 1; i < word.length(); i++) { |
| chars[i] = caseFold(word.charAt(i)); |
| } |
| return new String(chars); |
| } |
| |
/**
 * Sorts the file written to {@code unsorted} with an {@code OfflineSorter} and removes the
 * unsorted file afterwards.
 *
 * @param tempDir directory holding the unsorted file; also passed to the sorter
 * @param tempFileNamePrefix prefix passed to the sorter
 * @param unsorted output whose name identifies the file to sort
 * @return the value returned by {@code OfflineSorter.sort}
 */
private String sortWordsOffline(
    Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
  OfflineSorter sorter =
      new OfflineSorter(
          tempDir,
          tempFileNamePrefix,
          new Comparator<>() {
            // reused views over the compared rows, so no bytes are copied per comparison
            final BytesRef scratch1 = new BytesRef();
            final BytesRef scratch2 = new BytesRef();

            // Points scratch at the bytes of o and cuts it off at the last FLAG_SEPARATOR or
            // MORPH_SEPARATOR byte, if there is one.
            private void initScratch(BytesRef o, BytesRef scratch) {
              scratch.bytes = o.bytes;
              scratch.offset = o.offset;
              scratch.length = o.length;

              // scan backwards and stop at the first separator met
              for (int i = scratch.length - 1; i >= 0; i--) {
                if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
                    || scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
                  scratch.length = i;
                  break;
                }
              }
            }

            @Override
            public int compare(BytesRef o1, BytesRef o2) {
              initScratch(o1, scratch1);
              initScratch(o2, scratch2);

              // compare the truncated rows first
              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
              } else {
                return cmp;
              }
            }
          });

  String sorted;
  boolean success = false;
  try {
    sorted = sorter.sort(unsorted.getName());
    success = true;
  } finally {
    // the unsorted file is removed either way; after a failed sort the helper that ignores
    // exceptions is used instead of the plain delete
    if (success) {
      tempDir.deleteFile(unsorted.getName());
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
    }
  }
  return sorted;
}
| |
/**
 * Reads the sorted entry file, groups rows belonging to the same entry and compiles the result
 * into an FST. The sorted file is removed afterwards.
 *
 * <p>Side effects visible in this method: a bit is set in {@code wordHashes} for every entry,
 * and {@code readMorphFields} / {@code addMorphFields} are called for rows with text after the
 * morph separator.
 *
 * @param tempDir directory holding the sorted file
 * @param sorted name of the sorted file
 * @param flags enumerator handed to the {@code EntryGrouper}
 * @return the result of compiling the grouper's FST
 */
private FST<IntsRef> readSortedDictionaries(
    Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
  boolean success = false;

  // morph data string -> id, filled by addMorphFields
  Map<String, Integer> morphIndices = new HashMap<>();

  EntryGrouper grouper = new EntryGrouper(flags);

  try (ByteSequencesReader reader =
      new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }

      String line = scratch.utf8ToString();
      String entry;
      char[] wordForm;
      int end;

      int flagSep = line.indexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        // no flags: the entry runs up to the morph separator
        wordForm = NOFLAGS;
        end = line.indexOf(MORPH_SEPARATOR);
        entry = line.substring(0, end);
      } else {
        end = line.indexOf(MORPH_SEPARATOR);
        // a HIDDEN_FLAG right after the flag separator marks a hidden row; it is not part of
        // the flag text handed to the parsing strategy
        boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
        String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
        if (aliasCount > 0 && !flagPart.isEmpty()) {
          // with flag aliases defined, the flag text is an alias number
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        wordForm = flagParsingStrategy.parseFlags(flagPart);
        if (hidden) {
          // re-attach the hidden marker as the last flag
          wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
          wordForm[wordForm.length - 1] = HIDDEN_FLAG;
        }
        entry = line.substring(0, flagSep);
      }

      int morphDataID = 0;
      if (end + 1 < line.length()) {
        List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
        if (!morphFields.isEmpty()) {
          // sorted so that the same fields in a different order share one id
          morphFields.sort(Comparator.naturalOrder());
          morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
        }
      }

      // NOTE(review): Math.abs(Integer.MIN_VALUE) is negative, so an entry with that hash code
      // would yield a negative index here. Any fix must keep this formula in sync with the
      // code that reads wordHashes — confirm before changing.
      wordHashes.set(Math.abs(entry.hashCode()) % wordHashes.length());
      grouper.add(entry, wordForm, morphDataID);
    }

    // finalize last entry
    grouper.flushGroup();
    success = true;
    return grouper.words.compile();
  } finally {
    // the sorted file is removed either way; after a failure the helper that ignores
    // exceptions is used instead of the plain delete
    if (success) {
      tempDir.deleteFile(sorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
    }
  }
}
| |
| private List<String> readMorphFields(String word, String unparsed) { |
| List<String> morphFields = null; |
| for (String datum : splitMorphData(unparsed)) { |
| if (datum.startsWith("ph:")) { |
| addPhoneticRepEntries(word, datum.substring(3)); |
| } else { |
| if (morphFields == null) morphFields = new ArrayList<>(1); |
| morphFields.add(datum); |
| } |
| } |
| return morphFields == null ? Collections.emptyList() : morphFields; |
| } |
| |
| private int addMorphFields(Map<String, Integer> indices, String morphFields) { |
| Integer alreadyCached = indices.get(morphFields); |
| if (alreadyCached != null) { |
| return alreadyCached; |
| } |
| |
| int index = morphData.size(); |
| indices.put(morphFields, index); |
| morphData.add(morphFields); |
| return index; |
| } |
| |
| private void addPhoneticRepEntries(String word, String ph) { |
| // e.g. "pretty ph:prity ph:priti->pretti" to suggest both prity->pretty and pritier->prettiest |
| int arrow = ph.indexOf("->"); |
| String pattern; |
| String replacement; |
| if (arrow > 0) { |
| pattern = ph.substring(0, arrow); |
| replacement = ph.substring(arrow + 2); |
| } else { |
| pattern = ph; |
| replacement = word; |
| } |
| |
| // when the ph: field ends with *, strip last character of pattern and replacement |
| // e.g., "pretty ph:prity*" results in "prit->prett" replacement instead of "prity->pretty", |
| // to get both prity->pretty and pritiest->prettiest suggestions. |
| if (pattern.endsWith("*") && pattern.length() > 2 && replacement.length() > 1) { |
| pattern = pattern.substring(0, pattern.length() - 2); |
| replacement = replacement.substring(0, replacement.length() - 1); |
| } |
| |
| // capitalize lowercase pattern for capitalized words to support |
| // good suggestions also for capitalized misspellings, |
| // e.g. Wednesday ph:wendsay results in wendsay -> Wednesday and Wendsay -> Wednesday. |
| if (WordCase.caseOf(word) == WordCase.TITLE && WordCase.caseOf(pattern) == WordCase.LOWER) { |
| // add also lowercase word in the case of German or |
| // Hungarian to support lowercase suggestions lowercased by |
| // compound word generation or derivational suffixes |
| // for example by adjectival suffix "-i" of geographical names in Hungarian: |
| // Massachusetts ph:messzecsuzec |
| // messzecsuzeci -> massachusettsi (adjective) |
| // For lowercasing by conditional PFX rules, see e.g. germancompounding test |
| if (hasLanguage("de", "hu")) { |
| repTable.add(new RepEntry(pattern, toLowerCase(replacement))); |
| } |
| repTable.add(new RepEntry(toTitleCase(pattern), replacement)); |
| } |
| repTable.add(new RepEntry(pattern, replacement)); |
| } |
| |
| boolean isDotICaseChangeDisallowed(char[] word) { |
| return word[0] == 'İ' && !alternateCasing; |
| } |
| |
| private class EntryGrouper { |
| final FSTCompiler<IntsRef> words = |
| new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton()); |
| private final List<char[]> group = new ArrayList<>(); |
| private final List<Integer> morphDataIDs = new ArrayList<>(); |
| private final IntsRefBuilder scratchInts = new IntsRefBuilder(); |
| private String currentEntry = null; |
| private final FlagEnumerator flagEnumerator; |
| |
| EntryGrouper(FlagEnumerator flagEnumerator) { |
| this.flagEnumerator = flagEnumerator; |
| } |
| |
| void add(String entry, char[] flags, int morphDataID) throws IOException { |
| if (!entry.equals(currentEntry)) { |
| if (currentEntry != null) { |
| if (entry.compareTo(currentEntry) < 0) { |
| throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); |
| } |
| flushGroup(); |
| } |
| currentEntry = entry; |
| } |
| |
| group.add(flags); |
| if (hasCustomMorphData) { |
| morphDataIDs.add(morphDataID); |
| } |
| } |
| |
| void flushGroup() throws IOException { |
| IntsRefBuilder currentOrds = new IntsRefBuilder(); |
| |
| boolean hasNonHidden = false; |
| for (char[] flags : group) { |
| if (!hasHiddenFlag(flags)) { |
| hasNonHidden = true; |
| break; |
| } |
| } |
| |
| for (int i = 0; i < group.size(); i++) { |
| char[] flags = group.get(i); |
| if (hasNonHidden && hasHiddenFlag(flags)) { |
| continue; |
| } |
| |
| currentOrds.append(flagEnumerator.add(flags)); |
| if (hasCustomMorphData) { |
| currentOrds.append(morphDataIDs.get(i)); |
| } |
| } |
| |
| Util.toUTF32(currentEntry, scratchInts); |
| words.add(scratchInts.get(), currentOrds.get()); |
| |
| group.clear(); |
| morphDataIDs.clear(); |
| } |
| } |
| |
| private static boolean hasHiddenFlag(char[] flags) { |
| for (char flag : flags) { |
| if (flag == HIDDEN_FLAG) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private void parseAlias(String line) { |
| String[] ruleArgs = line.split("\\s+"); |
| if (aliases == null) { |
| // first line should be the aliases count |
| final int count = Integer.parseInt(ruleArgs[1]); |
| aliases = new String[count]; |
| } else { |
| // an alias can map to no flags |
| String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1]; |
| aliases[aliasCount++] = aliasValue; |
| } |
| } |
| |
| private String getAliasValue(int id) { |
| try { |
| return aliases[id - 1]; |
| } catch (IndexOutOfBoundsException ex) { |
| throw new IllegalArgumentException("Bad flag alias number:" + id, ex); |
| } |
| } |
| |
| private void parseMorphAlias(String line) { |
| if (morphAliases == null) { |
| // first line should be the aliases count |
| final int count = Integer.parseInt(line.substring(3)); |
| morphAliases = new String[count]; |
| } else { |
| String arg = line.substring(2); // leave the space |
| morphAliases[morphAliasCount++] = arg; |
| } |
| } |
| |
| private List<String> splitMorphData(String morphData) { |
| // first see if it's an alias |
| if (morphAliasCount > 0) { |
| try { |
| int alias = Integer.parseInt(morphData.trim()); |
| morphData = morphAliases[alias - 1]; |
| } catch (NumberFormatException ignored) { |
| } |
| } |
| if (morphData.isBlank()) { |
| return Collections.emptyList(); |
| } |
| return Arrays.stream(morphData.split("\\s+")) |
| .filter( |
| s -> |
| s.length() > 3 |
| && Character.isLetter(s.charAt(0)) |
| && Character.isLetter(s.charAt(1)) |
| && s.charAt(2) == ':') |
| .collect(Collectors.toList()); |
| } |
| |
| boolean hasFlag(IntsRef forms, char flag) { |
| int formStep = formStep(); |
| for (int i = 0; i < forms.length; i += formStep) { |
| if (hasFlag(forms.ints[forms.offset + i], flag)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** Abstraction of the process of parsing flags taken from the affix and dic files */ |
| abstract static class FlagParsingStrategy { |
| // we don't check the flag count, as Hunspell accepts longer sequences |
| // https://github.com/hunspell/hunspell/issues/707 |
| static final boolean checkFlags = false; |
| |
| /** |
| * Parses the given String into a single flag |
| * |
| * @param rawFlag String to parse into a flag |
| * @return Parsed flag |
| */ |
| char parseFlag(String rawFlag) { |
| char[] flags = parseFlags(rawFlag); |
| if (checkFlags && flags.length != 1) { |
| throw new IllegalArgumentException("expected only one flag, got: " + rawFlag); |
| } |
| return flags[0]; |
| } |
| |
| /** |
| * Parses the given String into multiple flags |
| * |
| * @param rawFlags String to parse into flags |
| * @return Parsed flags |
| */ |
| abstract char[] parseFlags(String rawFlags); |
| } |
| |
| /** |
| * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a |
| * individual flags. Can be used with both the ASCII and UTF-8 flag types. |
| */ |
| private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| return rawFlags.toCharArray(); |
| } |
| } |
| |
| /** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */ |
| private static class DefaultAsUtf8FlagParsingStrategy extends FlagParsingStrategy { |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray(); |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its |
| * numerical form. In the case of multiple flags, each number is separated by a comma. |
| */ |
| private static class NumFlagParsingStrategy extends FlagParsingStrategy { |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| StringBuilder result = new StringBuilder(); |
| StringBuilder group = new StringBuilder(); |
| for (int i = 0; i <= rawFlags.length(); i++) { |
| if (i == rawFlags.length() || rawFlags.charAt(i) == ',') { |
| if (group.length() > 0) { // ignoring empty flags (this happens in danish, for example) |
| int flag = Integer.parseInt(group, 0, group.length(), 10); |
| if (flag >= DEFAULT_FLAGS) { |
| // accept 0 due to https://github.com/hunspell/hunspell/issues/708 |
| throw new IllegalArgumentException( |
| "Num flags should be between 0 and " + DEFAULT_FLAGS + ", found " + flag); |
| } |
| result.append((char) flag); |
| group.setLength(0); |
| } |
| } else if (rawFlags.charAt(i) >= '0' && rawFlags.charAt(i) <= '9') { |
| group.append(rawFlags.charAt(i)); |
| } |
| } |
| |
| return result.toString().toCharArray(); |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII |
| * characters whose codes must be combined into a single character. |
| */ |
| private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { |
| |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| if (checkFlags && rawFlags.length() % 2 == 1) { |
| throw new IllegalArgumentException( |
| "Invalid flags (should be even number of characters): " + rawFlags); |
| } |
| |
| char[] flags = new char[rawFlags.length() / 2]; |
| for (int i = 0; i < flags.length; i++) { |
| char f1 = rawFlags.charAt(i * 2); |
| char f2 = rawFlags.charAt(i * 2 + 1); |
| if (f1 >= 256 || f2 >= 256) { |
| throw new IllegalArgumentException( |
| "Invalid flags (LONG flags must be double ASCII): " + rawFlags); |
| } |
| flags[i] = (char) (f1 << 8 | f2); |
| } |
| return flags; |
| } |
| } |
| |
| boolean hasFlag(int entryId, char flag) { |
| return flagLookup.hasFlag(entryId, flag); |
| } |
| |
| boolean mayNeedInputCleaning() { |
| return ignoreCase || ignore != null || iconv != null; |
| } |
| |
| boolean needsInputCleaning(CharSequence input) { |
| if (mayNeedInputCleaning()) { |
| for (int i = 0; i < input.length(); i++) { |
| char ch = input.charAt(i); |
| if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0 |
| || ignoreCase && caseFold(ch) != ch |
| || iconv != null && iconv.mightReplaceChar(ch)) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| CharSequence cleanInput(CharSequence input, StringBuilder reuse) { |
| reuse.setLength(0); |
| |
| for (int i = 0; i < input.length(); i++) { |
| char ch = input.charAt(i); |
| |
| if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) { |
| continue; |
| } |
| |
| if (ignoreCase && iconv == null) { |
| // if we have no input conversion mappings, do this on-the-fly |
| ch = caseFold(ch); |
| } |
| |
| reuse.append(ch); |
| } |
| |
| if (iconv != null) { |
| iconv.applyMappings(reuse); |
| if (ignoreCase) { |
| for (int i = 0; i < reuse.length(); i++) { |
| reuse.setCharAt(i, caseFold(reuse.charAt(i))); |
| } |
| } |
| } |
| |
| return reuse; |
| } |
| |
| private static char[] toSortedCharArray(Set<Character> set) { |
| char[] chars = new char[set.size()]; |
| int i = 0; |
| for (Character c : set) { |
| chars[i++] = c; |
| } |
| Arrays.sort(chars); |
| return chars; |
| } |
| |
| boolean isSecondStagePrefix(char flag) { |
| return Arrays.binarySearch(secondStagePrefixFlags, flag) >= 0; |
| } |
| |
| boolean isSecondStageSuffix(char flag) { |
| return Arrays.binarySearch(secondStageSuffixFlags, flag) >= 0; |
| } |
| |
| /** folds single character (according to LANG if present) */ |
| char caseFold(char c) { |
| if (alternateCasing) { |
| if (c == 'I') { |
| return 'ı'; |
| } else if (c == 'İ') { |
| return 'i'; |
| } else { |
| return Character.toLowerCase(c); |
| } |
| } else { |
| return Character.toLowerCase(c); |
| } |
| } |
| |
| /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */ |
| public boolean getIgnoreCase() { |
| return ignoreCase; |
| } |
| |
| /** |
| * Returns the default temporary directory pointed to by {@code java.io.tmpdir}. If not accessible |
| * or not available, an IOException is thrown. |
| */ |
| static Path getDefaultTempDir() throws IOException { |
| String tmpDir = System.getProperty("java.io.tmpdir"); |
| if (tmpDir == null) { |
| throw new IOException("No temporary path (java.io.tmpdir)?"); |
| } |
| Path tmpPath = Paths.get(tmpDir); |
| if (!Files.isWritable(tmpPath)) { |
| throw new IOException( |
| "Temporary path not present or writeable?: " + tmpPath.toAbsolutePath()); |
| } |
| return tmpPath; |
| } |
| |
| /** Possible word breaks according to BREAK directives */ |
| static class Breaks { |
| private static final Set<String> MINUS = Collections.singleton("-"); |
| static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS); |
| final String[] starting, ending, middle; |
| |
| Breaks(Collection<String> starting, Collection<String> ending, Collection<String> middle) { |
| this.starting = starting.toArray(new String[0]); |
| this.ending = ending.toArray(new String[0]); |
| this.middle = middle.toArray(new String[0]); |
| } |
| |
| boolean isNotEmpty() { |
| return middle.length > 0 || starting.length > 0 || ending.length > 0; |
| } |
| } |
| } |