// blob: fa236d33c3440640152b7b6a95e822a87e39af87 [file] [log] [blame]
package org.apache.lucene.analysis.stages;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
// nocommit does not do keepOrig
// nocommit should we ... allow recursing on ourselves? ie
// so sometimes an output could be parsed against an input
// rule?
/** Synonym filter, that improves on existing one: it can
* handle any graph input, and it can create new positions
* (e.g., dns -> domain name service). */
public class SynonymFilterStage extends Stage {
// We change the term:
private final TermAttribute termAttIn;
private final TermAttribute termAttOut;
// We change the to/from:
private final ArcAttribute arcAttIn;
private final ArcAttribute arcAttOut;
// We set offsets:
private final OffsetAttribute offsetAttIn;
private final OffsetAttribute offsetAttOut;
// Used only inside addMatch
private final BytesRef scratchBytes = new BytesRef();
private final CharsRef scratchChars = new CharsRef();
private final LinkedList<OutputToken> pendingOutputs = new LinkedList<OutputToken>();
private final List<PartialMatch> pending = new ArrayList<PartialMatch>();
private final SynonymMap synonyms;
private final FST.Arc<BytesRef> scratchArc;
private final FST<BytesRef> fst;
private final boolean ignoreCase;
private final FST.BytesReader fstReader;
/** Used to decode outputs */
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
private static class PartialMatch {
final int fromNode;
final int toNode;
final int startOffset;
final FST.Arc<BytesRef> fstNode;
final BytesRef output;
public PartialMatch(int startOffset, int fromNode, int toNode, FST.Arc<BytesRef> fstNode, BytesRef output) {
this.startOffset = startOffset;
this.fromNode = fromNode;
this.toNode = toNode;
this.fstNode = fstNode;
this.output = output;
}
}
private static class OutputToken {
final String text;
final int startOffset;
final int endOffset;
final int fromNode;
final int toNode;
public OutputToken(String text, int startOffset, int endOffset, int fromNode, int toNode) {
this.text = text;
this.startOffset = startOffset;
this.endOffset = endOffset;
this.fromNode = fromNode;
this.toNode = toNode;
}
}
public SynonymFilterStage(Stage prevStage, SynonymMap synonyms, boolean ignoreCase) {
super(prevStage);
termAttIn = prevStage.get(TermAttribute.class);
termAttOut = create(TermAttribute.class);
arcAttIn = prevStage.get(ArcAttribute.class);
arcAttOut = create(ArcAttribute.class);
offsetAttIn = prevStage.get(OffsetAttribute.class);
offsetAttOut = create(OffsetAttribute.class);
this.synonyms = synonyms;
this.ignoreCase = ignoreCase;
fst = synonyms.fst;
fstReader = fst.getBytesReader();
if (fst == null) {
throw new IllegalArgumentException("fst must be non-null");
}
scratchArc = new FST.Arc<BytesRef>();
}
// nocommit we need reset! make test that fails if only
// partial consume
@Override
public void reset(Reader reader) {
super.reset(reader);
pending.clear();
pendingOutputs.clear();
}
/** Extends all pending paths, and starts new paths,
* matching the current token. */
private void matchOne(PartialMatch match) throws IOException {
BytesRef output;
if (match != null) {
scratchArc.copyFrom(match.fstNode);
output = match.output;
} else {
// Start a new match here:
fst.getFirstArc(scratchArc);
output = fst.outputs.getNoOutput();
}
int bufferLen = termAttIn.get().length();
int bufUpto = 0;
while(bufUpto < bufferLen) {
final int codePoint = Character.codePointAt(termAttIn.get(), bufUpto);
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
return;
}
// Accum the output
output = fst.outputs.add(output, scratchArc.output);
bufUpto += Character.charCount(codePoint);
}
// Entire token matched; now see if this is a final
// state:
if (scratchArc.isFinal()) {
// A match!
addMatch(match, fst.outputs.add(output, scratchArc.nextFinalOutput));
}
// See if the FST wants to continue matching (ie, needs to
// see the next input token):
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) != null) {
// More matching is possible -- accum the output (if
// any) of the WORD_SEP arc and add a new
// PartialMatch:
int startOffset;
int fromNode;
if (match == null) {
startOffset = offsetAttIn.startOffset();
fromNode = arcAttIn.from();
} else {
startOffset = match.startOffset;
fromNode = match.fromNode;
}
// Reserve the right to use this node as a future fromNode:
nodes.mayChange(fromNode);
//System.out.println(" incr mayChange node=" + fromNode);
//System.out.println(" add pending to node=" + arcAttIn.to());
pending.add(new PartialMatch(startOffset,
fromNode,
arcAttIn.to(),
new FST.Arc<BytesRef>().copyFrom(scratchArc),
fst.outputs.add(output, scratchArc.output)));
}
}
/** Records a full match; on the next next() we will feed
* output tokens from it. */
private void addMatch(PartialMatch match, BytesRef bytes) {
//System.out.println(" add full match!");
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
final int code = bytesReader.readVInt();
final boolean keepOrig = (code & 0x1) == 0;
if (!keepOrig) {
throw new IllegalArgumentException("this SynonymFilter requires keepOrig = true");
}
final int count = code >>> 1;
for (int outputIDX=0;outputIDX<count;outputIDX++) {
synonyms.words.get(bytesReader.readVInt(), scratchBytes);
UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
assert scratchChars.offset == 0;
int lastStart = 0;
int lastNode = match.fromNode;
for (int chIDX=0;chIDX<=scratchChars.length;chIDX++) {
if (chIDX == scratchChars.length || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
final int outputLen = chIDX - lastStart;
// Caller is not allowed to have empty string in
// the output:
assert outputLen > 0: "output contains empty string: " + scratchChars;
int toNode;
if (chIDX == scratchChars.length) {
toNode = arcAttIn.to();
} else {
toNode = nodes.newNode();
lastNode = toNode;
lastStart = 1+chIDX;
}
// These offsets make sense for "domain name service ->
// DNS", but for "DNS -> domain name service" it's a
// little weird because each of the 3 tokens will have
// the same offsets ... but I don't see how we can do
// any better?
//System.out.println(" add output=" + new String(scratchChars.chars, lastStart, outputLen));
pendingOutputs.add(new OutputToken(new String(scratchChars.chars, lastStart, outputLen),
match.startOffset, offsetAttIn.endOffset(),
lastNode, toNode));
}
}
}
}
/** Update all matches for the current token. */
private void match() throws IOException {
int from = arcAttIn.from();
// First extend any existing partial matches:
int end = pending.size();
for(int i=0;i<end;i++) {
PartialMatch match = pending.get(i);
//System.out.println(" try to extend match ending @ node=" + match.toNode + " vs from=" + from);
if (match.toNode == from) {
matchOne(match);
}
}
// Then start any new matches:
matchOne(null);
// Prune any dead partial matches:
int upto = 0;
for(int i=0;i<pending.size();i++) {
PartialMatch match = pending.get(i);
int to = match.toNode;
if (to != arcAttIn.from() && to != arcAttIn.to() && !nodes.getCanChange(to)) {
// Prune this path
//System.out.println(" prune path @ node=" + match.toNode);
//System.out.println(" decr node=" + match.fromNode);
nodes.wontChange(match.fromNode);
} else {
if (upto < i) {
pending.set(upto, pending.get(i));
}
upto++;
}
}
pending.subList(upto, pending.size()).clear();
}
private boolean insertOutputToken() {
OutputToken token = pendingOutputs.pollFirst();
if (token != null) {
//System.out.println(" insert output token! text=" + token.text + " offset=" + token.startOffset + "/" + token.endOffset);
// We still have outputs pending from previous match:
termAttOut.set(token.text);
offsetAttOut.setOffset(token.startOffset, token.endOffset);
arcAttOut.set(token.fromNode, token.toNode);
return true;
} else {
return false;
}
}
public boolean next() throws IOException {
//System.out.println("\nSYN.next");
if (insertOutputToken()) {
return true;
}
//System.out.println(" input.next()");
if (prevStage.next()) {
// Extend matches for this new token:
//System.out.println(" got token=" + termAttIn + " from=" + arcAttIn.from() + " to=" + arcAttIn.to());
match();
// nocommit copy?
termAttOut.set(termAttIn.get());
offsetAttOut.setOffset(offsetAttIn.startOffset(),
offsetAttIn.endOffset());
arcAttOut.set(arcAttIn.from(),
arcAttIn.to());
return true;
} else {
// Prune any remaining partial matches:
for(int i=0;i<pending.size();i++) {
nodes.wontChange(pending.get(i).fromNode);
}
pending.clear();
return false;
}
}
}