| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.synonym; |
| |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.text.ParseException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.store.ByteArrayDataOutput; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.BytesRefHash; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.CharsRefBuilder; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.ByteSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Util; |
| |
| /** |
| * A map of synonyms, keys and values are phrases. |
| * @lucene.experimental |
| */ |
| public class SynonymMap { |
| /** for multiword support, you must separate words with this separator */ |
| public static final char WORD_SEPARATOR = 0; |
| /** map<input word, list<ord>> */ |
| public final FST<BytesRef> fst; |
| /** map<ord, outputword> */ |
| public final BytesRefHash words; |
| /** maxHorizontalContext: maximum context we need on the tokenstream */ |
| public final int maxHorizontalContext; |
| |
| public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) { |
| this.fst = fst; |
| this.words = words; |
| this.maxHorizontalContext = maxHorizontalContext; |
| } |
| |
| /** |
| * Builds an FSTSynonymMap. |
| * <p> |
| * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap |
| * @lucene.experimental |
| */ |
| public static class Builder { |
| private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<>(); |
| private final BytesRefHash words = new BytesRefHash(); |
| private final BytesRefBuilder utf8Scratch = new BytesRefBuilder(); |
| private int maxHorizontalContext; |
| private final boolean dedup; |
| |
| /** Default constructor, passes {@code dedup=true}. */ |
| public Builder() { |
| this(true); |
| } |
| |
| /** If dedup is true then identical rules (same input, |
| * same output) will be added only once. */ |
| public Builder(boolean dedup) { |
| this.dedup = dedup; |
| } |
| |
| private static class MapEntry { |
| boolean includeOrig; |
| // we could sort for better sharing ultimately, but it could confuse people |
| ArrayList<Integer> ords = new ArrayList<>(); |
| } |
| |
| /** Sugar: just joins the provided terms with {@link |
| * SynonymMap#WORD_SEPARATOR}. reuse and its chars |
| * must not be null. */ |
| public static CharsRef join(String[] words, CharsRefBuilder reuse) { |
| int upto = 0; |
| char[] buffer = reuse.chars(); |
| for (String word : words) { |
| final int wordLen = word.length(); |
| final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR |
| if (needed > buffer.length) { |
| reuse.grow(needed); |
| buffer = reuse.chars(); |
| } |
| if (upto > 0) { |
| buffer[upto++] = SynonymMap.WORD_SEPARATOR; |
| } |
| |
| word.getChars(0, wordLen, buffer, upto); |
| upto += wordLen; |
| } |
| reuse.setLength(upto); |
| return reuse.get(); |
| } |
| |
| /** only used for asserting! */ |
| private boolean hasHoles(CharsRef chars) { |
| final int end = chars.offset + chars.length; |
| for(int idx=chars.offset+1;idx<end;idx++) { |
| if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) { |
| return true; |
| } |
| } |
| if (chars.chars[chars.offset] == '\u0000') { |
| return true; |
| } |
| if (chars.chars[chars.offset + chars.length - 1] == '\u0000') { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| // NOTE: while it's tempting to make this public, since |
| // caller's parser likely knows the |
| // numInput/numOutputWords, sneaky exceptions, much later |
| // on, will result if these values are wrong; so we always |
| // recompute ourselves to be safe: |
| private void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) { |
| // first convert to UTF-8 |
| if (numInputWords <= 0) { |
| throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")"); |
| } |
| if (input.length <= 0) { |
| throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")"); |
| } |
| if (numOutputWords <= 0) { |
| throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); |
| } |
| if (output.length <= 0) { |
| throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")"); |
| } |
| |
| assert !hasHoles(input): "input has holes: " + input; |
| assert !hasHoles(output): "output has holes: " + output; |
| |
| //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); |
| utf8Scratch.copyChars(output.chars, output.offset, output.length); |
| // lookup in hash |
| int ord = words.add(utf8Scratch.get()); |
| if (ord < 0) { |
| // already exists in our hash |
| ord = (-ord)-1; |
| //System.out.println(" output=" + output + " old ord=" + ord); |
| } else { |
| //System.out.println(" output=" + output + " new ord=" + ord); |
| } |
| |
| MapEntry e = workingSet.get(input); |
| if (e == null) { |
| e = new MapEntry(); |
| workingSet.put(CharsRef.deepCopyOf(input), e); // make a copy, since we will keep around in our map |
| } |
| |
| e.ords.add(ord); |
| e.includeOrig |= includeOrig; |
| maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); |
| maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); |
| } |
| |
| private int countWords(CharsRef chars) { |
| int wordCount = 1; |
| int upto = chars.offset; |
| final int limit = chars.offset + chars.length; |
| while(upto < limit) { |
| if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) { |
| wordCount++; |
| } |
| } |
| return wordCount; |
| } |
| |
| /** |
| * Add a phrase->phrase synonym mapping. |
| * Phrases are character sequences where words are |
| * separated with character zero (U+0000). Empty words |
| * (two U+0000s in a row) are not allowed in the input nor |
| * the output! |
| * |
| * @param input input phrase |
| * @param output output phrase |
| * @param includeOrig true if the original should be included |
| */ |
| public void add(CharsRef input, CharsRef output, boolean includeOrig) { |
| add(input, countWords(input), output, countWords(output), includeOrig); |
| } |
| |
| /** |
| * Builds an {@link SynonymMap} and returns it. |
| */ |
| public SynonymMap build() throws IOException { |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); |
| // TODO: are we using the best sharing options? |
| org.apache.lucene.util.fst.Builder<BytesRef> builder = |
| new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); |
| |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); |
| |
| final Set<Integer> dedupSet; |
| |
| if (dedup) { |
| dedupSet = new HashSet<>(); |
| } else { |
| dedupSet = null; |
| } |
| |
| final byte[] spare = new byte[5]; |
| |
| Set<CharsRef> keys = workingSet.keySet(); |
| CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); |
| Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); |
| |
| final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); |
| |
| //System.out.println("fmap.build"); |
| for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { |
| CharsRef input = sortedKeys[keyIdx]; |
| MapEntry output = workingSet.get(input); |
| |
| int numEntries = output.ords.size(); |
| // output size, assume the worst case |
| int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry |
| |
| scratch.grow(estimatedSize); |
| scratchOutput.reset(scratch.bytes()); |
| |
| // now write our output data: |
| int count = 0; |
| for (int i = 0; i < numEntries; i++) { |
| if (dedupSet != null) { |
| // box once |
| final Integer ent = output.ords.get(i); |
| if (dedupSet.contains(ent)) { |
| continue; |
| } |
| dedupSet.add(ent); |
| } |
| scratchOutput.writeVInt(output.ords.get(i)); |
| count++; |
| } |
| |
| final int pos = scratchOutput.getPosition(); |
| scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1)); |
| final int pos2 = scratchOutput.getPosition(); |
| final int vIntLen = pos2-pos; |
| |
| // Move the count + includeOrig to the front of the byte[]: |
| System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen); |
| System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos); |
| System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen); |
| |
| if (dedupSet != null) { |
| dedupSet.clear(); |
| } |
| |
| scratch.setLength(scratchOutput.getPosition()); |
| //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); |
| builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); |
| } |
| |
| FST<BytesRef> fst = builder.finish(); |
| return new SynonymMap(fst, words, maxHorizontalContext); |
| } |
| } |
| |
| /** |
| * Abstraction for parsing synonym files. |
| * |
| * @lucene.experimental |
| */ |
| public static abstract class Parser extends Builder { |
| |
| private final Analyzer analyzer; |
| |
| public Parser(boolean dedup, Analyzer analyzer) { |
| super(dedup); |
| this.analyzer = analyzer; |
| } |
| |
| /** |
| * Parse the given input, adding synonyms to the inherited {@link Builder}. |
| * @param in The input to parse |
| */ |
| public abstract void parse(Reader in) throws IOException, ParseException; |
| |
| /** Sugar: analyzes the text with the analyzer and |
| * separates by {@link SynonymMap#WORD_SEPARATOR}. |
| * reuse and its chars must not be null. */ |
| public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException { |
| try (TokenStream ts = analyzer.tokenStream("", text)) { |
| CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); |
| ts.reset(); |
| reuse.clear(); |
| while (ts.incrementToken()) { |
| int length = termAtt.length(); |
| if (length == 0) { |
| throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); |
| } |
| if (posIncAtt.getPositionIncrement() != 1) { |
| throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + |
| ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); |
| } |
| reuse.grow(reuse.length() + length + 1); /* current + word + separator */ |
| int end = reuse.length(); |
| if (reuse.length() > 0) { |
| reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR); |
| reuse.setLength(reuse.length() + 1); |
| } |
| System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); |
| reuse.setLength(reuse.length() + length); |
| } |
| ts.end(); |
| } |
| if (reuse.length() == 0) { |
| throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); |
| } |
| return reuse.get(); |
| } |
| } |
| |
| } |