| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.hunspell; |
| |
| |
| import java.io.BufferedInputStream; |
| import java.io.BufferedOutputStream; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.LineNumberReader; |
| import java.io.OutputStream; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CodingErrorAction; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.TreeMap; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.store.ByteArrayDataOutput; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.BytesRefHash; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; |
| import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; |
| import org.apache.lucene.util.OfflineSorter; |
| import org.apache.lucene.util.automaton.CharacterRunAutomaton; |
| import org.apache.lucene.util.automaton.RegExp; |
| import org.apache.lucene.util.fst.Builder; |
| import org.apache.lucene.util.fst.CharSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.IntSequenceOutputs; |
| import org.apache.lucene.util.fst.Outputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** |
| * In-memory structure for the dictionary (.dic) and affix (.aff) |
| * data of a hunspell dictionary. |
| */ |
| public class Dictionary { |
| |
  // shared immutable empty flag set, used when an entry carries no flags
  static final char[] NOFLAGS = new char[0];

  // directive keys recognized while parsing the affix (.aff) file
  private static final String ALIAS_KEY = "AF";
  private static final String MORPH_ALIAS_KEY = "AM";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";
  private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
  private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
  private static final String IGNORE_KEY = "IGNORE";
  private static final String ICONV_KEY = "ICONV";
  private static final String OCONV_KEY = "OCONV";
  private static final String FULLSTRIP_KEY = "FULLSTRIP";
  private static final String LANG_KEY = "LANG";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  // PSEUDOROOT is handled identically to NEEDAFFIX (see readAffixFile)
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
  private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";

  // values of the FLAG directive, selecting a FlagParsingStrategy
  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  // String.format templates turning an affix condition into a regex
  // TODO: really for suffixes we should reverse the automaton and run them backwards
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  // affix text -> list of affix ordinals (suffix keys are stored reversed; see parseAffix)
  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;

  // all condition checks used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
  ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();

  // the entries in the .dic file, mapping to their set of flags.
  // the fst output is the ordinal list for flagLookup
  FST<IntsRef> words;
  // the list of unique flagsets (wordforms). theoretically huge, but practically
  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
  BytesRefHash flagLookup = new BytesRefHash();

  // the list of unique strip affixes, concatenated; stripOffsets[i]..stripOffsets[i+1]
  // delimits strip ordinal i inside stripData (built at the end of readAffixFile)
  char[] stripData;
  int[] stripOffsets;

  // 8 bytes per affix: four shorts (flag, stripOrd, patternOrd|crossProduct, appendFlagsOrd)
  byte[] affixData = new byte[64];
  // ordinal of the next affix rule to be written into affixData
  private int currentAffix = 0;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

  // AF entries
  private String[] aliases;
  private int aliasCount = 0;

  // AM entries
  private String[] morphAliases;
  private int morphAliasCount = 0;

  // st: morphological entries (either directly, or aliased from AM)
  private String[] stemExceptions = new String[8];
  private int stemExceptionCount = 0;
  // we set this during sorting, so we know to add an extra FST output.
  // when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
  boolean hasStemExceptions;

  private final Path tempPath = getDefaultTempDir(); // TODO: make this configurable?

  boolean ignoreCase;
  boolean complexPrefixes;
  boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping

  int circumfix = -1; // circumfix flag, or -1 if one is not defined
  int keepcase = -1; // keepcase flag, or -1 if one is not defined
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined

  // ignored characters (dictionary, affix, inputs), kept sorted for binary search
  private char[] ignore;

  // FSTs used for ICONV/OCONV, output ord pointing to replacement text
  FST<CharsRef> iconv;
  FST<CharsRef> oconv;

  boolean needsInputCleaning;
  boolean needsOutputCleaning;

  // true if we can strip suffixes "down to nothing"
  boolean fullStrip;

  // language declaration of the dictionary
  String language;
  // true if case algorithms should use alternate (Turkish/Azeri) mapping
  boolean alternateCasing;
| |
  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param tempDir Directory to use for offline sorting
   * @param tempFileNamePrefix prefix to use to generate temp file names
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix, InputStream dictionary) throws IOException, ParseException {
    // delegate to the multi-dictionary constructor with case-sensitive matching
    this(tempDir, tempFileNamePrefix, affix, Collections.singletonList(dictionary), false);
  }
| |
  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param tempDir Directory to use for offline sorting
   * @param tempFileNamePrefix prefix to use to generate temp file names
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @param ignoreCase if true, input cleaning will fold case when looking up entries
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // the affix stream must be read twice (pass 1: encoding, pass 2: parse),
    // so spool it to a temp file first
    Path aff = Files.createTempFile(tempPath, "affix", "aff");
    OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    boolean success = false;
    try {
      // copy contents of affix stream to temp file
      final byte [] buffer = new byte [1024 * 8];
      int len;
      while ((len = affix.read(buffer)) > 0) {
        out.write(buffer, 0, len);
      }
      out.close();

      // pass 1: get encoding
      aff1 = new BufferedInputStream(Files.newInputStream(aff));
      String encoding = getDictionaryEncoding(aff1);

      // pass 2: parse affixes
      CharsetDecoder decoder = getJavaEncoding(encoding);
      aff2 = new BufferedInputStream(Files.newInputStream(aff));
      readAffixFile(aff2, decoder);

      // read dictionary entries
      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
      Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
      readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
      words = b.finish();
      aliases = null; // no longer needed
      morphAliases = null; // no longer needed
      success = true;
    } finally {
      // closing out here is a no-op on the success path (already closed above),
      // but releases it if we failed during the copy loop
      IOUtils.closeWhileHandlingException(out, aff1, aff2);
      if (success) {
        Files.delete(aff);
      } else {
        // don't mask the original exception with a cleanup failure
        IOUtils.deleteFilesIgnoringExceptions(aff);
      }
    }
  }
| |
  /**
   * Looks up Hunspell word forms from the dictionary
   *
   * @return accumulated FST output (flag-set ordinals) for the word, or null if absent
   */
  IntsRef lookupWord(char word[], int offset, int length) {
    return lookup(words, word, offset, length);
  }

  // only for testing
  IntsRef lookupPrefix(char word[], int offset, int length) {
    return lookup(prefixes, word, offset, length);
  }

  // only for testing
  IntsRef lookupSuffix(char word[], int offset, int length) {
    return lookup(suffixes, word, offset, length);
  }
| |
  /**
   * Walks the FST over the code points of {@code word[offset..offset+length)} and
   * returns the accumulated output, or null if the FST is null or does not contain
   * the word.
   */
  IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
    if (fst == null) {
      return null;
    }
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
    // Accumulate output as we go
    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
    IntsRef output = NO_OUTPUT;

    int l = offset + length;
    try {
      // iterate by full code points (surrogate pairs advance i by 2)
      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
        cp = Character.codePointAt(word, i, l);
        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
          return null; // no arc for this code point -> word not in FST
        } else if (arc.output != NO_OUTPUT) {
          // reference comparison against the no-output sentinel is intentional
          output = fst.outputs.add(output, arc.output);
        }
      }
      // follow the final arc; absence means the word is only a prefix of entries
      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        return fst.outputs.add(output, arc.output);
      } else {
        return output;
      }
    } catch (IOException bogus) {
      // the FST is in memory here, so IOException is not actually possible
      throw new RuntimeException(bogus);
    }
  }
| |
  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException if a directive line is malformed
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
    TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
    TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
    // condition regex -> index into this.patterns, for deduplication
    Map<String,Integer> seenPatterns = new HashMap<>();

    // zero condition -> 0 ord
    seenPatterns.put(".*", 0);
    patterns.add(null);

    // zero strip -> 0 ord (LinkedHashMap: iteration order = ordinal order, relied on below)
    Map<String,Integer> seenStrips = new LinkedHashMap<>();
    seenStrips.put("", 0);

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      // ignore any BOM marker on first line
      if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
        line = line.substring(1);
      }
      // NOTE: directives are matched with startsWith, so order matters for keys
      // sharing a prefix (e.g. NEEDAFFIX vs PSEUDOROOT are handled together)
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(MORPH_ALIAS_KEY)) {
        parseMorphAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
      } else if (line.startsWith(SUFFIX_KEY)) {
        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
      } else if (line.equals(COMPLEXPREFIXES_KEY)) {
        complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
      } else if (line.startsWith(CIRCUMFIX_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
        }
        circumfix = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(KEEPCASE_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
        }
        keepcase = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
        // PSEUDOROOT is the legacy spelling of NEEDAFFIX; both set the same flag
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
        }
        needaffix = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
        }
        onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(IGNORE_KEY)) {
        String parts[] = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
        }
        ignore = parts[1].toCharArray();
        Arrays.sort(ignore); // kept sorted so lookups can binary-search
        needsInputCleaning = true;
      } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
        String parts[] = line.split("\\s+");
        String type = parts[0];
        if (parts.length != 2) {
          throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
        }
        // the directive line declares how many mapping lines follow
        int num = Integer.parseInt(parts[1]);
        FST<CharsRef> res = parseConversions(reader, num);
        if (type.equals("ICONV")) {
          iconv = res;
          needsInputCleaning |= iconv != null;
        } else {
          oconv = res;
          needsOutputCleaning |= oconv != null;
        }
      } else if (line.startsWith(FULLSTRIP_KEY)) {
        fullStrip = true;
      } else if (line.startsWith(LANG_KEY)) {
        language = line.substring(LANG_KEY.length()).trim();
        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
      }
    }

    this.prefixes = affixFST(prefixes);
    this.suffixes = affixFST(suffixes);

    // flatten the deduplicated strip strings into stripData/stripOffsets;
    // LinkedHashMap iteration order matches the ordinals assigned in parseAffix
    int totalChars = 0;
    for (String strip : seenStrips.keySet()) {
      totalChars += strip.length();
    }
    stripData = new char[totalChars];
    stripOffsets = new int[seenStrips.size()+1];
    int currentOffset = 0;
    int currentIndex = 0;
    for (String strip : seenStrips.keySet()) {
      stripOffsets[currentIndex++] = currentOffset;
      strip.getChars(0, strip.length(), stripData, currentOffset);
      currentOffset += strip.length();
    }
    assert currentIndex == seenStrips.size();
    stripOffsets[currentIndex] = currentOffset; // sentinel: end of last strip
  }
| |
| private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { |
| IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); |
| Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { |
| Util.toUTF32(entry.getKey(), scratch); |
| List<Integer> entries = entry.getValue(); |
| IntsRef output = new IntsRef(entries.size()); |
| for (Integer c : entries) { |
| output.ints[output.length++] = c; |
| } |
| builder.add(scratch.get(), output); |
| } |
| return builder.finish(); |
| } |
| |
| static String escapeDash(String re) { |
| // we have to be careful, even though dash doesn't have a special meaning, |
| // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it |
| StringBuilder escaped = new StringBuilder(); |
| for (int i = 0; i < re.length(); i++) { |
| char c = re.charAt(i); |
| if (c == '-') { |
| escaped.append("\\-"); |
| } else { |
| escaped.append(c); |
| if (c == '\\' && i + 1 < re.length()) { |
| escaped.append(re.charAt(i+1)); |
| i++; |
| } |
| } |
| } |
| return escaped.toString(); |
| } |
| |
  /**
   * Parses a specific affix rule putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @param seenStrips map from strip string -> strip ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String,List<Integer>> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern,
                          Map<String,Integer> seenPatterns,
                          Map<String,Integer> seenStrips) throws IOException, ParseException {

    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    // identity comparison is deliberate: callers pass one of the two pattern constants
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

    // each rule is encoded as 8 bytes (4 shorts) at offset currentAffix*8 in affixData
    int numLines = Integer.parseInt(args[3]);
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++) {
      assert affixWriter.getPosition() == currentAffix << 3;
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
      // condition is optional
      if (ruleArgs.length < 4) {
        throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
      }

      char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
      String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; // "0" means nothing is stripped
      String affixArg = ruleArgs[3];
      char appendFlags[] = null;

      // first: parse continuation classes out of affix (the part after '/')
      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);
        affixArg = affixArg.substring(0, flagSep);

        // with AF aliases, the flag part is a numeric alias index
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        appendFlags = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        twoStageAffix = true;
      }
      // zero affix -> empty string
      if ("0".equals(affixArg)) {
        affixArg = "";
      }

      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
      // at least the gascon affix file has this issue
      if (condition.startsWith("[") && condition.indexOf(']') == -1) {
        condition = condition + "]";
      }
      // "dash hasn't got special meaning" (we must escape it)
      if (condition.indexOf('-') >= 0) {
        condition = escapeDash(condition);
      }

      final String regex;
      if (".".equals(condition)) {
        regex = ".*"; // Zero condition is indicated by dot
      } else if (condition.equals(strip)) {
        regex = ".*"; // TODO: optimize this better:
                      // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                      // but this is complicated...
      } else {
        regex = String.format(Locale.ROOT, conditionPattern, condition);
      }

      // deduplicate patterns
      Integer patternIndex = seenPatterns.get(regex);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        // the index is later encoded as a short (shifted left once), hence the limit
        if (patternIndex > Short.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
        }
        seenPatterns.put(regex, patternIndex);
        CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
        patterns.add(pattern);
      }

      // deduplicate strip strings; ordinal is the insertion order into seenStrips
      Integer stripOrd = seenStrips.get(strip);
      if (stripOrd == null) {
        stripOrd = seenStrips.size();
        seenStrips.put(strip, stripOrd);
        if (stripOrd > Character.MAX_VALUE) {
          throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
        }
      }

      if (appendFlags == null) {
        appendFlags = NOFLAGS;
      }

      // deduplicate the continuation-flag set through flagLookup
      encodeFlags(scratch, appendFlags);
      int appendFlagsOrd = flagLookup.add(scratch.get());
      if (appendFlagsOrd < 0) {
        // already exists in our hash
        appendFlagsOrd = (-appendFlagsOrd)-1;
      } else if (appendFlagsOrd > Short.MAX_VALUE) {
        // this limit is probably flexible, but it's a good sanity check too
        throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
      }

      // fixed 8-byte rule record: flag, stripOrd, patternOrd|crossProduct, appendFlagsOrd
      affixWriter.writeShort((short)flag);
      affixWriter.writeShort((short)stripOrd.intValue());
      // encode crossProduct into patternIndex
      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
      affixWriter.writeShort((short)patternOrd);
      affixWriter.writeShort((short)appendFlagsOrd);

      if (needsInputCleaning) {
        CharSequence cleaned = cleanInput(affixArg, sb);
        affixArg = cleaned.toString();
      }

      // suffixes are looked up against reversed input, so store the key reversed
      if (isSuffix) {
        affixArg = new StringBuilder(affixArg).reverse().toString();
      }

      List<Integer> list = affixes.get(affixArg);
      if (list == null) {
        list = new ArrayList<>();
        affixes.put(affixArg, list);
      }
      list.add(currentAffix);
      currentAffix++;
    }
  }
| |
  /**
   * Reads {@code num} ICONV/OCONV mapping lines ("ICONV from to") and builds an
   * FST from source text to replacement text.
   *
   * @throws ParseException if a mapping line does not have exactly three fields
   * @throws IllegalStateException if the same source text is mapped twice
   */
  private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    // TreeMap: FST builder requires keys in sorted order
    Map<String,String> mappings = new TreeMap<>();

    for (int i = 0; i < num; i++) {
      String line = reader.readLine();
      String parts[] = line.split("\\s+");
      if (parts.length != 3) {
        throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
      }
      if (mappings.put(parts[1], parts[2]) != null) {
        throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
      }
    }

    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String,String> entry : mappings.entrySet()) {
      Util.toUTF16(entry.getKey(), scratchInts);
      builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
    }

    return builder.finish();
  }
| |
  /** pattern accepts optional BOM + SET + any whitespace (BOM given as its raw UTF-8 bytes,
   *  since this is matched against bytes read as chars before the encoding is known) */
  final static Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");

  /**
   * Parses the encoding specified in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
   */
  static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      // read one line byte-by-byte (the stream's charset is not yet known)
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      // skip empty lines, comment lines, and whitespace-only lines
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0
         ) {
        if (ch < 0) {
          // EOF reached without finding a SET line
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      Matcher matcher = ENCODING_PATTERN.matcher(encoding);
      if (matcher.find()) {
        int last = matcher.end();
        return encoding.substring(last).trim();
      }
      // NOTE(review): a non-comment line that does not match SET is silently
      // skipped and the loop continues to the next line
    }
  }
| |
  // maps nonstandard charset names found in affix files to names the JDK understands
  static final Map<String,String> CHARSET_ALIASES;
  static {
    Map<String,String> m = new HashMap<>();
    m.put("microsoft-cp1251", "windows-1251");
    m.put("TIS620-2533", "TIS-620");
    CHARSET_ALIASES = Collections.unmodifiableMap(m);
  }
| |
  /**
   * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and
   * MICROSOFT-CP1251 etc are allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    // ISO8859-14 gets a hand-written decoder (presumably not available from the JDK)
    if ("ISO8859-14".equals(encoding)) {
      return new ISO8859_14Decoder();
    }
    // translate known nonstandard names to their canonical JDK equivalents
    String canon = CHARSET_ALIASES.get(encoding);
    if (canon != null) {
      encoding = canon;
    }
    Charset charset = Charset.forName(encoding);
    // REPLACE rather than fail: real-world dictionaries contain malformed bytes
    return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
  }
| |
| /** |
| * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file |
| * |
| * @param flagLine Line containing the flag information |
| * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition |
| */ |
| static FlagParsingStrategy getFlagParsingStrategy(String flagLine) { |
| String parts[] = flagLine.split("\\s+"); |
| if (parts.length != 2) { |
| throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine); |
| } |
| String flagType = parts[1]; |
| |
| if (NUM_FLAG_TYPE.equals(flagType)) { |
| return new NumFlagParsingStrategy(); |
| } else if (UTF8_FLAG_TYPE.equals(flagType)) { |
| return new SimpleFlagParsingStrategy(); |
| } else if (LONG_FLAG_TYPE.equals(flagType)) { |
| return new DoubleASCIIFlagParsingStrategy(); |
| } |
| |
| throw new IllegalArgumentException("Unknown flag type: " + flagType); |
| } |
| |
  // internal sentinel characters substituted during unescapeEntry; chosen from the
  // ASCII control range so they cannot collide with legitimate dictionary text
  final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
  final char MORPH_SEPARATOR = 0x1e; // separator for boundary of entry (may be followed by morph data)
| |
  /**
   * Normalizes one raw .dic line: resolves backslash escapes, replaces the '/' flag
   * delimiter with {@code FLAG_SEPARATOR}, inserts {@code MORPH_SEPARATOR} at the
   * morphology boundary, and drops any literal separator characters from the input.
   */
  String unescapeEntry(String entry) {
    StringBuilder sb = new StringBuilder();
    int end = morphBoundary(entry); // word+flags part ends here, morph data follows
    for (int i = 0; i < end; i++) {
      char ch = entry.charAt(i);
      if (ch == '\\' && i+1 < entry.length()) {
        // backslash escapes the next character verbatim
        sb.append(entry.charAt(i+1));
        i++;
      } else if (ch == '/') {
        sb.append(FLAG_SEPARATOR);
      } else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) {
        // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
      } else {
        sb.append(ch);
      }
    }
    sb.append(MORPH_SEPARATOR);
    if (end < entry.length()) {
      // copy the morphological fields, again stripping stray separator chars
      for (int i = end; i < entry.length(); i++) {
        char c = entry.charAt(i);
        if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) {
          // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
        } else {
          sb.append(c);
        }
      }
    }
    return sb.toString();
  }
| |
| static int morphBoundary(String line) { |
| int end = indexOfSpaceOrTab(line, 0); |
| if (end == -1) { |
| return line.length(); |
| } |
| while (end >= 0 && end < line.length()) { |
| if (line.charAt(end) == '\t' || |
| end+3 < line.length() && |
| Character.isLetter(line.charAt(end+1)) && |
| Character.isLetter(line.charAt(end+2)) && |
| line.charAt(end+3) == ':') { |
| break; |
| } |
| end = indexOfSpaceOrTab(line, end+1); |
| } |
| if (end == -1) { |
| return line.length(); |
| } |
| return end; |
| } |
| |
| static int indexOfSpaceOrTab(String text, int start) { |
| int pos1 = text.indexOf('\t', start); |
| int pos2 = text.indexOf(' ', start); |
| if (pos1 >= 0 && pos2 >= 0) { |
| return Math.min(pos1, pos2); |
| } else { |
| return Math.max(pos1, pos2); |
| } |
| } |
| |
| /** |
| * Reads the dictionary file through the provided InputStreams, building up the words map |
| * |
| * @param dictionaries InputStreams to read the dictionary file through |
| * @param decoder CharsetDecoder used to decode the contents of the file |
| * @throws IOException Can be thrown while reading from the file |
| */ |
  private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();

    StringBuilder sb = new StringBuilder();

    // Pass 1: copy every entry line from all dictionaries into one unsorted temp file.
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
      for (InputStream dictionary : dictionaries) {
        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
        String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

        while ((line = lines.readLine()) != null) {
          // wild and unpredictable code comment rules
          if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
            continue;
          }
          line = unescapeEntry(line);
          // if we havent seen any stem exceptions, try to parse one
          if (hasStemExceptions == false) {
            int morphStart = line.indexOf(MORPH_SEPARATOR);
            if (morphStart >= 0 && morphStart < line.length()) {
              hasStemExceptions = parseStemException(line.substring(morphStart+1)) != null;
            }
          }
          if (needsInputCleaning) {
            // clean only the word part of the line; the flag/morph suffix is kept verbatim
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
              flagSep = line.indexOf(MORPH_SEPARATOR);
            }
            if (flagSep == -1) {
              CharSequence cleansed = cleanInput(line, sb);
              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
            } else {
              String text = line.substring(0, flagSep);
              CharSequence cleansed = cleanInput(text, sb);
              if (cleansed != sb) {
                sb.setLength(0);
                sb.append(cleansed);
              }
              // re-attach the untouched flag/morph suffix
              sb.append(line.substring(flagSep));
              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
            }
          } else {
            writer.write(line.getBytes(StandardCharsets.UTF_8));
          }
        }
      }
      CodecUtil.writeFooter(unsorted);
    }

    // Sort rows by their word part (text before the last flag/morph separator byte),
    // tie-breaking on the whole row, so that duplicate stems become adjacent.
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
      BytesRef scratch1 = new BytesRef();
      BytesRef scratch2 = new BytesRef();

      @Override
      public int compare(BytesRef o1, BytesRef o2) {
        scratch1.bytes = o1.bytes;
        scratch1.offset = o1.offset;
        scratch1.length = o1.length;

        // truncate scratch1 to just the word part
        for (int i = scratch1.length - 1; i >= 0; i--) {
          if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
            scratch1.length = i;
            break;
          }
        }

        scratch2.bytes = o2.bytes;
        scratch2.offset = o2.offset;
        scratch2.length = o2.length;

        // truncate scratch2 to just the word part
        for (int i = scratch2.length - 1; i >= 0; i--) {
          if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
            scratch2.length = i;
            break;
          }
        }

        int cmp = scratch1.compareTo(scratch2);
        if (cmp == 0) {
          // tie break on whole row
          return o1.compareTo(o2);
        } else {
          return cmp;
        }
      }
    });

    // Sort into a new temp file; delete the unsorted input whether or not sorting succeeded.
    String sorted;
    boolean success = false;
    try {
      sorted = sorter.sort(unsorted.getName());
      success = true;
    } finally {
      if (success) {
        tempDir.deleteFile(unsorted.getName());
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
      }
    }

    boolean success2 = false;

    // Pass 2: read the sorted rows back and merge all forms of each stem into a
    // single entry of the words FST (stem -> list of flag ordinals).
    try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently

      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();

      while (true) {
        BytesRef scratch = reader.next();
        if (scratch == null) {
          break;
        }

        String line = scratch.utf8ToString();
        String entry;
        char wordForm[];
        int end;

        int flagSep = line.indexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
          // no flag separator: stem carries no flags
          wordForm = NOFLAGS;
          end = line.indexOf(MORPH_SEPARATOR);
          entry = line.substring(0, end);
        } else {
          end = line.indexOf(MORPH_SEPARATOR);
          String flagPart = line.substring(flagSep + 1, end);
          if (aliasCount > 0) {
            // flag part is an alias number (AF table); resolve it to the real flags
            flagPart = getAliasValue(Integer.parseInt(flagPart));
          }

          wordForm = flagParsingStrategy.parseFlags(flagPart);
          Arrays.sort(wordForm); // sorted so hasFlag() can binary-search
          entry = line.substring(0, flagSep);
        }
        // we possibly have morphological data
        int stemExceptionID = 0;
        if (hasStemExceptions && end+1 < line.length()) {
          String stemException = parseStemException(line.substring(end+1));
          if (stemException != null) {
            stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount+1);
            stemExceptionID = stemExceptionCount+1; // we use '0' to indicate no exception for the form
            stemExceptions[stemExceptionCount++] = stemException;
          }
        }

        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
          throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
          // deduplicate encoded flag sets through the flagLookup hash
          encodeFlags(flagsScratch, wordForm);
          int ord = flagLookup.add(flagsScratch.get());
          if (ord < 0) {
            // already exists in our hash
            ord = (-ord)-1;
          }
          // finalize current entry, and switch "current" if necessary
          if (cmp > 0 && currentEntry != null) {
            Util.toUTF32(currentEntry, scratchInts);
            words.add(scratchInts.get(), currentOrds.get());
          }
          // swap current
          if (cmp > 0 || currentEntry == null) {
            currentEntry = entry;
            currentOrds = new IntsRefBuilder(); // must be this way
          }
          // each form contributes (flag ord, stem exception id) pairs when stem
          // exceptions are in play, otherwise just the flag ord
          if (hasStemExceptions) {
            currentOrds.append(ord);
            currentOrds.append(stemExceptionID);
          } else {
            currentOrds.append(ord);
          }
        }
      }

      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());
      success2 = true;
    } finally {
      if (success2) {
        tempDir.deleteFile(sorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
      }
    }
  }
| |
| static char[] decodeFlags(BytesRef b) { |
| if (b.length == 0) { |
| return CharsRef.EMPTY_CHARS; |
| } |
| int len = b.length >>> 1; |
| char flags[] = new char[len]; |
| int upto = 0; |
| int end = b.offset + b.length; |
| for (int i = b.offset; i < end; i += 2) { |
| flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff)); |
| } |
| return flags; |
| } |
| |
| static void encodeFlags(BytesRefBuilder b, char flags[]) { |
| int len = flags.length << 1; |
| b.grow(len); |
| b.clear(); |
| for (int i = 0; i < flags.length; i++) { |
| int flag = flags[i]; |
| b.append((byte) ((flag >> 8) & 0xff)); |
| b.append((byte) (flag & 0xff)); |
| } |
| } |
| |
| private void parseAlias(String line) { |
| String ruleArgs[] = line.split("\\s+"); |
| if (aliases == null) { |
| //first line should be the aliases count |
| final int count = Integer.parseInt(ruleArgs[1]); |
| aliases = new String[count]; |
| } else { |
| // an alias can map to no flags |
| String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1]; |
| aliases[aliasCount++] = aliasValue; |
| } |
| } |
| |
| private String getAliasValue(int id) { |
| try { |
| return aliases[id - 1]; |
| } catch (IndexOutOfBoundsException ex) { |
| throw new IllegalArgumentException("Bad flag alias number:" + id, ex); |
| } |
| } |
| |
| String getStemException(int id) { |
| return stemExceptions[id-1]; |
| } |
| |
| private void parseMorphAlias(String line) { |
| if (morphAliases == null) { |
| //first line should be the aliases count |
| final int count = Integer.parseInt(line.substring(3)); |
| morphAliases = new String[count]; |
| } else { |
| String arg = line.substring(2); // leave the space |
| morphAliases[morphAliasCount++] = arg; |
| } |
| } |
| |
| private String parseStemException(String morphData) { |
| // first see if it's an alias |
| if (morphAliasCount > 0) { |
| try { |
| int alias = Integer.parseInt(morphData.trim()); |
| morphData = morphAliases[alias-1]; |
| } catch (NumberFormatException e) { |
| // fine |
| } |
| } |
| // try to parse morph entry |
| int index = morphData.indexOf(" st:"); |
| if (index < 0) { |
| index = morphData.indexOf("\tst:"); |
| } |
| if (index >= 0) { |
| int endIndex = indexOfSpaceOrTab(morphData, index+1); |
| if (endIndex < 0) { |
| endIndex = morphData.length(); |
| } |
| return morphData.substring(index+4, endIndex); |
| } |
| return null; |
| } |
| |
| /** |
| * Abstraction of the process of parsing flags taken from the affix and dic files |
| */ |
| static abstract class FlagParsingStrategy { |
| |
| /** |
| * Parses the given String into a single flag |
| * |
| * @param rawFlag String to parse into a flag |
| * @return Parsed flag |
| */ |
| char parseFlag(String rawFlag) { |
| char flags[] = parseFlags(rawFlag); |
| if (flags.length != 1) { |
| throw new IllegalArgumentException("expected only one flag, got: " + rawFlag); |
| } |
| return flags[0]; |
| } |
| |
| /** |
| * Parses the given String into multiple flags |
| * |
| * @param rawFlags String to parse into flags |
| * @return Parsed flags |
| */ |
| abstract char[] parseFlags(String rawFlags); |
| } |
| |
| /** |
| * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. |
| * Can be used with both the ASCII and UTF-8 flag types. |
| */ |
| private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| return rawFlags.toCharArray(); |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case |
| * of multiple flags, each number is separated by a comma. |
| */ |
| private static class NumFlagParsingStrategy extends FlagParsingStrategy { |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| String[] rawFlagParts = rawFlags.trim().split(","); |
| char[] flags = new char[rawFlagParts.length]; |
| int upto = 0; |
| |
| for (int i = 0; i < rawFlagParts.length; i++) { |
| // note, removing the trailing X/leading I for nepali... what is the rule here?! |
| String replacement = rawFlagParts[i].replaceAll("[^0-9]", ""); |
| // note, ignoring empty flags (this happens in danish, for example) |
| if (replacement.isEmpty()) { |
| continue; |
| } |
| flags[upto++] = (char) Integer.parseInt(replacement); |
| } |
| |
| if (upto < flags.length) { |
| flags = ArrayUtil.copyOfSubArray(flags, 0, upto); |
| } |
| return flags; |
| } |
| } |
| |
| /** |
| * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes |
| * must be combined into a single character. |
| */ |
| private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { |
| |
| @Override |
| public char[] parseFlags(String rawFlags) { |
| if (rawFlags.length() == 0) { |
| return new char[0]; |
| } |
| |
| StringBuilder builder = new StringBuilder(); |
| if (rawFlags.length() % 2 == 1) { |
| throw new IllegalArgumentException("Invalid flags (should be even number of characters): " + rawFlags); |
| } |
| for (int i = 0; i < rawFlags.length(); i+=2) { |
| char f1 = rawFlags.charAt(i); |
| char f2 = rawFlags.charAt(i+1); |
| if (f1 >= 256 || f2 >= 256) { |
| throw new IllegalArgumentException("Invalid flags (LONG flags must be double ASCII): " + rawFlags); |
| } |
| char combined = (char) (f1 << 8 | f2); |
| builder.append(combined); |
| } |
| |
| char flags[] = new char[builder.length()]; |
| builder.getChars(0, builder.length(), flags, 0); |
| return flags; |
| } |
| } |
| |
| static boolean hasFlag(char flags[], char flag) { |
| return Arrays.binarySearch(flags, flag) >= 0; |
| } |
| |
| CharSequence cleanInput(CharSequence input, StringBuilder reuse) { |
| reuse.setLength(0); |
| |
| for (int i = 0; i < input.length(); i++) { |
| char ch = input.charAt(i); |
| |
| if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) { |
| continue; |
| } |
| |
| if (ignoreCase && iconv == null) { |
| // if we have no input conversion mappings, do this on-the-fly |
| ch = caseFold(ch); |
| } |
| |
| reuse.append(ch); |
| } |
| |
| if (iconv != null) { |
| try { |
| applyMappings(iconv, reuse); |
| } catch (IOException bogus) { |
| throw new RuntimeException(bogus); |
| } |
| if (ignoreCase) { |
| for (int i = 0; i < reuse.length(); i++) { |
| reuse.setCharAt(i, caseFold(reuse.charAt(i))); |
| } |
| } |
| } |
| |
| return reuse; |
| } |
| |
| /** folds single character (according to LANG if present) */ |
| char caseFold(char c) { |
| if (alternateCasing) { |
| if (c == 'I') { |
| return 'ı'; |
| } else if (c == 'İ') { |
| return 'i'; |
| } else { |
| return Character.toLowerCase(c); |
| } |
| } else { |
| return Character.toLowerCase(c); |
| } |
| } |
| |
  // TODO: this could be more efficient!
  /**
   * Rewrites {@code sb} in place by applying, at every position, the longest
   * conversion mapping stored in {@code fst} (input char sequence -> replacement chars).
   *
   * @param fst conversion mappings to apply
   * @param sb buffer to rewrite; modified in place
   * @throws IOException Can be thrown while traversing the FST
   */
  static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();

    // temporary stuff
    final FST.Arc<CharsRef> arc = new FST.Arc<>();
    int longestMatch;       // index of the last char of the longest match starting at i, or -1
    CharsRef longestOutput; // replacement text for that longest match

    for (int i = 0; i < sb.length(); i++) {
      arc.copyFrom(firstArc);
      CharsRef output = NO_OUTPUT;
      longestMatch = -1;
      longestOutput = null;

      // walk the FST as long as the buffer matches, remembering the last final state seen
      for (int j = i; j < sb.length(); j++) {
        char ch = sb.charAt(j);
        if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
          break;
        } else {
          output = fst.outputs.add(output, arc.output);
        }
        if (arc.isFinal()) {
          longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
          longestMatch = j;
        }
      }

      // splice in the replacement, then continue scanning after it
      if (longestMatch >= 0) {
        sb.delete(i, longestMatch+1);
        sb.insert(i, longestOutput);
        i += (longestOutput.length - 1);
      }
    }
  }
| |
| /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */ |
| public boolean getIgnoreCase() { |
| return ignoreCase; |
| } |
| |
| private static Path DEFAULT_TEMP_DIR; |
| |
| /** Used by test framework */ |
| public static void setDefaultTempDir(Path tempDir) { |
| DEFAULT_TEMP_DIR = tempDir; |
| } |
| |
| /** |
| * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible |
| * or not available, an IOException is thrown |
| */ |
| synchronized static Path getDefaultTempDir() throws IOException { |
| if (DEFAULT_TEMP_DIR == null) { |
| // Lazy init |
| String tempDirPath = System.getProperty("java.io.tmpdir"); |
| if (tempDirPath == null) { |
| throw new IOException("Java has no temporary folder property (java.io.tmpdir)?"); |
| } |
| Path tempDirectory = Paths.get(tempDirPath); |
| if (Files.isWritable(tempDirectory) == false) { |
| throw new IOException("Java's temporary folder not present or writeable?: " |
| + tempDirectory.toAbsolutePath()); |
| } |
| DEFAULT_TEMP_DIR = tempDirectory; |
| } |
| |
| return DEFAULT_TEMP_DIR; |
| } |
| } |