lucene/analysis/common/src/java/org/apache/lucene/analysis/stages/InsertDeletedPunctuationStage.java - lucene-solr - Git at Google

 package org.apache.lucene.analysis.stages;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
 import org.apache.lucene.analysis.stages.attributes.DeletedAttribute;
 import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
 import org.apache.lucene.analysis.stages.attributes.TermAttribute;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.FixedBitSet;

 import java.io.IOException;
 import java.io.Reader;

 /** Uses a CharFilter to detect when punctuation occurs in the
  *  input in between two tokens, and then as a Stage it will
  *  re-insert [deleted] tokens when it notices the tokenizer
  *  had deleted the punctuation.  E.g. this can be used to
  *  prevent synonyms/phrases from matching across punctuation. */

 public class InsertDeletedPunctuationStage extends Stage {

   // Attributes read from the previous stage (looked up in the constructor):
   private final DeletedAttribute delAttIn;
   private final ArcAttribute arcAttIn;
   private final TermAttribute termAttIn;
   private final OffsetAttribute offsetAttIn;

   // Attributes this stage creates and fills in for each token it returns:
   private final ArcAttribute arcAttOut;
   private final DeletedAttribute delAttOut;
   private final TermAttribute termAttOut;
   private final OffsetAttribute offsetAttOut;

   /** Term text set on every inserted punctuation token. */
   private final String punctToken;

   // Per-input state; all of it is re-initialized in reset(Reader).

   /** Char filter wrapping the current input; records where punctuation was read. */
   private FindPunctuationCharFilter charFilter;

   /** True when a punctuation token was just returned and the real input token
    *  that followed the gap is still waiting to be copied out. */
   private boolean lastPunct;

   /** End offset of the most recent token pulled from the previous stage. */
   private int lastEndOffset;

   /** Number of punctuation tokens inserted so far; added to incoming arc
    *  nodes so that outgoing node numbers account for the inserted nodes. */
   private int nodeOffset;

   /**
    * @param prevStage stage whose tokens are consumed; its attributes are looked up here
    * @param punctToken term text to set on each inserted (deleted) punctuation token
    */
   public InsertDeletedPunctuationStage(Stage prevStage, String punctToken) {
     super(prevStage);
     this.punctToken = punctToken;

     delAttIn = prevStage.get(DeletedAttribute.class);
     offsetAttIn = prevStage.get(OffsetAttribute.class);
     arcAttIn = prevStage.get(ArcAttribute.class);
     termAttIn = prevStage.get(TermAttribute.class);

     delAttOut = create(DeletedAttribute.class);
     offsetAttOut = create(OffsetAttribute.class);
     arcAttOut = create(ArcAttribute.class);
     termAttOut = create(TermAttribute.class);
   }

   /** Pass-through char filter that remembers, by character position, which
    *  of the characters read through it were punctuation. */
   private static class FindPunctuationCharFilter extends CharFilter {

     /** Bit i is set if the i-th character read was punctuation.  The set
      *  is only grown when a punctuation char falls past its current length,
      *  so positions at or beyond {@code wasPunct.length()} are never punctuation. */
     FixedBitSet wasPunct = new FixedBitSet(128);

     /** Count of characters seen so far by {@link #read(char[], int, int)}. */
     private int pos;

     public FindPunctuationCharFilter(Reader input) {
       super(input);
     }

     /** Offsets are returned unchanged: this filter does not alter the text. */
     @Override
     protected int correct(int offset) {
       return offset;
     }

     /** Delegates the read to the wrapped input and records the position of
      *  every punctuation character that was read.
      *
      *  @return whatever the wrapped input's read returned */
     @Override
     public int read(char[] buffer, int offset, int length) throws IOException {
       final int count = input.read(buffer, offset, length);
       // A negative count (end of stream, per the Reader contract) skips the loop:
       for (int i = 0; i < count; i++) {
         if (isPunct(buffer[offset + i])) {
           if (wasPunct.length() <= pos) {
             // Grow to an oversized copy so that pos is a valid index:
             final int nextSize = ArrayUtil.oversize(pos + 1, 1);
             final FixedBitSet nextBits = new FixedBitSet(nextSize);
             nextBits.or(wasPunct);
             wasPunct = nextBits;
           }
           wasPunct.set(pos);
         }
         pos++;
       }

       return count;
     }

     /** Returns true for the characters treated as punctuation: . , : ; */
     protected boolean isPunct(char ch) {
       // TODO: use proper Character.isXXX apis:
       return ch == '.' || ch == ',' || ch == ':' || ch == ';';
     }
   }

   /** Wraps the input in a fresh punctuation-recording char filter, passes
    *  that to the superclass reset, and clears all per-input state. */
   @Override
   public void reset(Reader input) {
     // nocommit this is iffy?  if an earlier stage also
     // wraps, then, we are different offsets
     charFilter = new FindPunctuationCharFilter(input);
     super.reset(charFilter);
     lastEndOffset = 0;
     lastPunct = false;
     nodeOffset = 0;
   }

   /** Advances to the next output token.  When punctuation was recorded in
    *  the gap between the previous input token and the current one, this call
    *  returns an inserted token (marked deleted, with term {@code punctToken})
    *  and the following call returns the current input token itself.
    *
    *  @return false once the previous stage has no more tokens */
   @Override
   public boolean next() throws IOException {
     if (lastPunct) {
       // The inserted punctuation token went out on the previous call; the
       // input attributes still hold the real token, so return it now:
       copyToken();
       lastPunct = false;
       return true;
     }

     if (!prevStage.next()) {
       return false;
     }

     final int startOffset = offsetAttIn.startOffset();
     // lastPunct is known to be false here, so this is a plain assignment:
     lastPunct = gapHasPunctuation(lastEndOffset, startOffset);

     if (lastPunct) {
       // We insert a new node and token here:

       // nocommit this (single int nodeOffset) is too simplistic?
       final int fromNode = arcAttIn.from() + nodeOffset;
       arcAttOut.set(fromNode, fromNode + 1);
       delAttOut.set(true);
       offsetAttOut.setOffset(lastEndOffset, startOffset);
       // nocommit: should we copy over the actual punct chars...?
       termAttOut.set(punctToken);
       // Every later node is shifted by the node we just inserted:
       nodeOffset++;
     } else {
       copyToken();
     }
     lastEndOffset = offsetAttIn.endOffset();
     return true;
   }

   /** Returns true if any character position in [from, to) was recorded as
    *  punctuation by the char filter. */
   private boolean gapHasPunctuation(int from, int to) {
     final FixedBitSet bits = charFilter.wasPunct;
     // The bit set is only grown when punctuation is seen, so it can be
     // shorter than the text already tokenized; offsets at or past its
     // length are known to be non-punctuation and must not be passed to get().
     final int limit = Math.min(to, bits.length());
     for (int i = from; i < limit; i++) {
       if (bits.get(i)) {
         return true;
       }
     }
     return false;
   }

   /** Copies the current input token to the output attributes, shifting both
    *  arc nodes by {@code nodeOffset}. */
   private void copyToken() {
     // The previous stage may not have provided a DeletedAttribute; treat
     // that as "not deleted":
     delAttOut.set(delAttIn != null && delAttIn.deleted());
     termAttOut.set(termAttIn.get());
     offsetAttOut.setOffset(offsetAttIn.startOffset(), offsetAttIn.endOffset());
     arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.to() + nodeOffset);
   }
 }
	package org.apache.lucene.analysis.stages;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import org.apache.lucene.analysis.CharFilter;
	import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
	import org.apache.lucene.analysis.stages.attributes.DeletedAttribute;
	import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
	import org.apache.lucene.analysis.stages.attributes.TermAttribute;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.FixedBitSet;

	import java.io.IOException;
	import java.io.Reader;

	/** Uses a CharFilter to detect when punctuation occurs in the
	* input in between two tokens, and then as a Stage it will
	* re-insert [deleted] tokens when it notices the tokenizer
	* had deleted the punctuation. E.g. this can be used to
	* prevent synonyms/phrases from matching across punctuation. */

	public class InsertDeletedPunctuationStage extends Stage {

	private final DeletedAttribute delAttIn;
	private final ArcAttribute arcAttIn;
	private final TermAttribute termAttIn;
	private final OffsetAttribute offsetAttIn;

	private final ArcAttribute arcAttOut;
	private final DeletedAttribute delAttOut;
	private final TermAttribute termAttOut;
	private final OffsetAttribute offsetAttOut;

	private final String punctToken;

	public InsertDeletedPunctuationStage(Stage prevStage, String punctToken) {
	super(prevStage);
	this.punctToken = punctToken;

	delAttIn = prevStage.get(DeletedAttribute.class);
	offsetAttIn = prevStage.get(OffsetAttribute.class);
	arcAttIn = prevStage.get(ArcAttribute.class);
	termAttIn = prevStage.get(TermAttribute.class);

	delAttOut = create(DeletedAttribute.class);
	offsetAttOut = create(OffsetAttribute.class);
	arcAttOut = create(ArcAttribute.class);
	termAttOut = create(TermAttribute.class);
	}

	private static class FindPunctuationCharFilter extends CharFilter {
	FixedBitSet wasPunct = new FixedBitSet(128);
	private int pos;

	public FindPunctuationCharFilter(Reader input) {
	super(input);
	}

	@Override
	protected int correct(int offset) {
	return offset;
	}

	@Override
	public int read(char[] buffer, int offset, int length) throws IOException {
	int count = input.read(buffer, offset, length);
	for(int i=0;i<count;i++) {
	if (isPunct(buffer[offset+i])) {
	if (wasPunct.length() <= pos) {
	int nextSize = ArrayUtil.oversize(pos+1, 1);
	FixedBitSet nextBits = new FixedBitSet(nextSize);
	nextBits.or(wasPunct);
	wasPunct = nextBits;
	}
	wasPunct.set(pos);
	}
	pos++;
	}

	return count;
	}

	protected boolean isPunct(char ch) {
	// TODO: use proper Character.isXXX apis:
	return ch == '.' \|\| ch == ',' \|\| ch == ':' \|\| ch == ';';
	}
	}

	@Override
	public void reset(Reader input) {
	// nocommit this is iffy? if an earlier stage also
	// wraps, then, we are different offsets
	charFilter = new FindPunctuationCharFilter(input);
	super.reset(charFilter);
	lastEndOffset = 0;
	lastPunct = false;
	nodeOffset = 0;
	}

	private FindPunctuationCharFilter charFilter;
	private boolean lastPunct;
	private int lastEndOffset;
	private int nodeOffset;

	@Override
	public boolean next() throws IOException {
	if (lastPunct) {
	// Return previously buffered token:
	copyToken();
	lastPunct = false;
	return true;
	}

	if (prevStage.next()) {
	int startOffset = offsetAttIn.startOffset();
	assert startOffset <= charFilter.wasPunct.length();
	for(int i=lastEndOffset;i<startOffset;i++) {
	if (charFilter.wasPunct.get(i)) {
	// The gap between the end of the last token,
	// and this token, had punctuation:
	lastPunct = true;
	break;
	}
	}

	if (lastPunct) {
	// We insert a new node and token here:

	// nocommit this (single int nodeOffset) is too simplistic?
	arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.from() + nodeOffset + 1);
	delAttOut.set(true);
	offsetAttOut.setOffset(lastEndOffset, startOffset);
	// nocommit: should we copy over the actual punct chars...?
	termAttOut.set(punctToken);
	nodeOffset++;
	} else {
	copyToken();
	}
	lastEndOffset = offsetAttIn.endOffset();
	return true;
	} else {
	return false;
	}
	}

	private void copyToken() {
	if (delAttIn != null) {
	delAttOut.set(delAttIn.deleted());
	} else {
	delAttOut.set(false);
	}
	termAttOut.set(termAttIn.get());
	offsetAttOut.setOffset(offsetAttIn.startOffset(), offsetAttIn.endOffset());
	arcAttOut.set(arcAttIn.from()+nodeOffset, arcAttIn.to() + nodeOffset);
	}
	}