lucene/analysis/common/src/java/org/apache/lucene/analysis/util/SegmentingTokenizerBase.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.util;

 import java.io.IOException;
 import java.io.Reader;
 import java.text.BreakIterator;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.AttributeFactory;

 /**
  * Breaks text into sentences with a {@link BreakIterator} and allows subclasses to decompose these
  * sentences into words.
  *
  * <p>This can be used by subclasses that need sentence context for tokenization purposes, such as
  * CJK segmenters.
  *
  * <p>Additionally it can be used by subclasses that want to mark sentence boundaries (with a custom
  * attribute, extra token, position increment, etc) for downstream processing.
  *
  * @lucene.experimental
  */
 public abstract class SegmentingTokenizerBase extends Tokenizer {
   /** Capacity, in chars, of the internal read buffer. */
   protected static final int BUFFERMAX = 1024;

   /** Window of input text that is currently being segmented. */
   protected final char[] buffer = new char[BUFFERMAX];

   /** Number of chars in {@link #buffer} that actually hold text. */
   private int length = 0;

   /** Prefix of {@link #buffer} that may be segmented safely, i.e. up to a safe end point. */
   private int usableLength = 0;

   /** Sum of the usable lengths of all earlier buffers of this reader; used for offsetAtt. */
   protected int offset = 0;

   private final BreakIterator iterator;
   private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

   /**
    * Creates a new SegmentingTokenizerBase that segments sentences with the given BreakIterator
    * and uses the default token attribute factory.
    *
    * <p>A BreakIterator must never be shared between different TokenStreams; always hand a newly
    * created or cloned instance to this constructor.
    *
    * @param iterator the sentence BreakIterator owned by this tokenizer
    */
   public SegmentingTokenizerBase(BreakIterator iterator) {
     this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, iterator);
   }

   /**
    * Creates a new SegmentingTokenizerBase with an explicit AttributeFactory.
    *
    * @param factory the factory passed on to the {@link Tokenizer} constructor
    * @param iterator the sentence BreakIterator owned by this tokenizer
    */
   public SegmentingTokenizerBase(AttributeFactory factory, BreakIterator iterator) {
     super(factory);
     this.iterator = iterator;
   }

   /**
    * Advances to the next word: first within the current sentence, then in the following
    * sentences of the buffer, refilling the buffer whenever it runs dry.
    *
    * @return {@code true} if a word was produced, {@code false} once a refill yields no text
    */
   @Override
   public final boolean incrementToken() throws IOException {
     // Fast path: the sentence already handed to the subclass still has a word.
     if (length != 0 && incrementWord()) {
       return true;
     }

     // Otherwise keep pulling sentences, loading more text each time the buffer is used up.
     while (!incrementSentence()) {
       refill();
       if (length <= 0) {
         // no more chars to read
         return false;
       }
     }
     return true;
   }

   /** Points the sentence iterator at an empty range and zeroes all buffer bookkeeping. */
   @Override
   public void reset() throws IOException {
     super.reset();
     wrapper.setText(buffer, 0, 0);
     iterator.setText(wrapper);
     offset = 0;
     usableLength = 0;
     length = 0;
   }

   /** Sets the final offset to the corrected position just past the text held in the buffer. */
   @Override
   public final void end() throws IOException {
     super.end();
     final int rawEnd;
     if (length < 0) {
       rawEnd = offset;
     } else {
       rawEnd = offset + length;
     }
     final int finalOffset = correctOffset(rawEnd);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }

   /**
    * Scans the buffer backwards for the last unambiguous break position.
    *
    * @return the index just past the last safe-end char, or {@code -1} if the buffer has none
    */
   private int findSafeEnd() {
     int pos = length;
     while (pos > 0) {
       if (isSafeEnd(buffer[pos - 1])) {
         return pos;
       }
       pos--;
     }
     return -1;
   }

   /**
    * Tells whether a char is an unambiguous break position for sentence tokenization.
    *
    * @param ch the char to test
    * @return {@code true} for U+000A, U+000D, U+0085, U+2028 and U+2029, {@code false} otherwise
    */
   protected boolean isSafeEnd(char ch) {
     return ch == 0x000A || ch == 0x000D || ch == 0x0085 || ch == 0x2028 || ch == 0x2029;
   }

   /**
    * Loads more text into the buffer: advances the accumulated offset, moves the unprocessed tail
    * to the front, reads as much as fits behind it, and sets usableLength to the last unambiguous
    * break position.
    */
   private void refill() throws IOException {
     offset += usableLength;

     // Slide the text behind the previous safe end point to the start of the buffer.
     final int carried = length - usableLength;
     System.arraycopy(buffer, usableLength, buffer, 0, carried);

     final int wanted = buffer.length - carried;
     final int got = read(input, buffer, carried, wanted);
     if (got < 0) {
       length = carried;
     } else {
       length = got + carried;
     }

     if (got < wanted) {
       // The reader has been emptied, so everything that is left gets processed.
       usableLength = length;
     } else {
       // More data is still to come; stop at a safe place. If there is none, the buffer holds
       // more than BUFFERMAX chars of text without breaks and tokens may get truncated.
       final int safeEnd = findSafeEnd();
       usableLength = safeEnd < 0 ? length : safeEnd;
     }

     wrapper.setText(buffer, 0, Math.max(0, usableLength));
     iterator.setText(wrapper);
   }

   // TODO: refactor to a shared readFully somewhere
   // (NGramTokenizer does this too):
   /**
    * commons-io's readFully, but without bugs if offset != 0: keeps reading until {@code length}
    * chars have been stored or the reader reports end of input.
    *
    * @param input the reader to drain
    * @param buffer the destination array
    * @param offset the index in {@code buffer} at which the first char is stored
    * @param length the number of chars requested
    * @return the number of chars actually stored
    */
   private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
     assert length >= 0 : "length must not be negative: " + length;

     int filled = 0;
     while (filled < length) {
       final int count = input.read(buffer, offset + filled, length - filled);
       if (count == -1) {
         // EOF
         break;
       }
       filled += count;
     }
     return filled;
   }

   /**
    * Steps through the sentences of the usable buffer until one of them yields a word.
    *
    * @return {@code true} if there is a token from the buffer, {@code false} if it is exhausted
    */
   private boolean incrementSentence() throws IOException {
     if (length == 0) {
       // we must refill the buffer
       return false;
     }

     int sentenceStart = iterator.current();
     while (sentenceStart != BreakIterator.DONE) {
       // find the next set of boundaries
       final int sentenceEnd = iterator.next();
       if (sentenceEnd == BreakIterator.DONE) {
         break; // BreakIterator exhausted
       }

       setNextSentence(sentenceStart, sentenceEnd);
       if (incrementWord()) {
         return true;
       }
       sentenceStart = iterator.current();
     }
     return false; // BreakIterator exhausted
   }

   /**
    * Provides the next input sentence for analysis.
    *
    * @param sentenceStart boundary reported by the sentence iterator where the sentence begins
    * @param sentenceEnd boundary reported by the sentence iterator where the sentence ends
    */
   protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

   /** Returns true if another word is available */
   protected abstract boolean incrementWord();
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.util;

	import java.io.IOException;
	import java.io.Reader;
	import java.text.BreakIterator;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.util.AttributeFactory;

	/**
	* Breaks text into sentences with a {@link BreakIterator} and allows subclasses to decompose these
	* sentences into words.
	*
	* <p>This can be used by subclasses that need sentence context for tokenization purposes, such as
	* CJK segmenters.
	*
	* <p>Additionally it can be used by subclasses that want to mark sentence boundaries (with a custom
	* attribute, extra token, position increment, etc) for downstream processing.
	*
	* @lucene.experimental
	*/
	public abstract class SegmentingTokenizerBase extends Tokenizer {
	protected static final int BUFFERMAX = 1024;
	protected final char buffer[] = new char[BUFFERMAX];
	/** true length of text in the buffer */
	private int length = 0;
	/** length in buffer that can be evaluated safely, up to a safe end point */
	private int usableLength = 0;
	/** accumulated offset of previous buffers for this reader, for offsetAtt */
	protected int offset = 0;

	private final BreakIterator iterator;
	private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

	/**
	* Construct a new SegmenterBase, using the provided BreakIterator for sentence segmentation.
	*
	* <p>Note that you should never share BreakIterators across different TokenStreams, instead a
	* newly created or cloned one should always be provided to this constructor.
	*/
	public SegmentingTokenizerBase(BreakIterator iterator) {
	this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, iterator);
	}

	/** Construct a new SegmenterBase, also supplying the AttributeFactory */
	public SegmentingTokenizerBase(AttributeFactory factory, BreakIterator iterator) {
	super(factory);
	this.iterator = iterator;
	}

	@Override
	public final boolean incrementToken() throws IOException {
	if (length == 0 \|\| !incrementWord()) {
	while (!incrementSentence()) {
	refill();
	if (length <= 0) // no more bytes to read;
	return false;
	}
	}

	return true;
	}

	@Override
	public void reset() throws IOException {
	super.reset();
	wrapper.setText(buffer, 0, 0);
	iterator.setText(wrapper);
	length = usableLength = offset = 0;
	}

	@Override
	public final void end() throws IOException {
	super.end();
	final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
	offsetAtt.setOffset(finalOffset, finalOffset);
	}

	/** Returns the last unambiguous break position in the text. */
	private int findSafeEnd() {
	for (int i = length - 1; i >= 0; i--) if (isSafeEnd(buffer[i])) return i + 1;
	return -1;
	}

	/** For sentence tokenization, these are the unambiguous break positions. */
	protected boolean isSafeEnd(char ch) {
	switch (ch) {
	case 0x000D:
	case 0x000A:
	case 0x0085:
	case 0x2028:
	case 0x2029:
	return true;
	default:
	return false;
	}
	}

	/**
	* Refill the buffer, accumulating the offset and setting usableLength to the last unambiguous
	* break position
	*/
	private void refill() throws IOException {
	offset += usableLength;
	int leftover = length - usableLength;
	System.arraycopy(buffer, usableLength, buffer, 0, leftover);
	int requested = buffer.length - leftover;
	int returned = read(input, buffer, leftover, requested);
	length = returned < 0 ? leftover : returned + leftover;
	if (returned < requested) /* reader has been emptied, process the rest */ usableLength = length;
	else {
	/* still more data to be read, find a safe-stopping place */
	usableLength = findSafeEnd();
	if (usableLength < 0) usableLength = length; /*
	* more than IOBUFFER of text without breaks,
	* gonna possibly truncate tokens
	*/
	}

	wrapper.setText(buffer, 0, Math.max(0, usableLength));
	iterator.setText(wrapper);
	}

	// TODO: refactor to a shared readFully somewhere
	// (NGramTokenizer does this too):
	/** commons-io's readFully, but without bugs if offset != 0 */
	private static int read(Reader input, char[] buffer, int offset, int length) throws IOException {
	assert length >= 0 : "length must not be negative: " + length;

	int remaining = length;
	while (remaining > 0) {
	int location = length - remaining;
	int count = input.read(buffer, offset + location, remaining);
	if (-1 == count) { // EOF
	break;
	}
	remaining -= count;
	}
	return length - remaining;
	}

	/** return true if there is a token from the buffer, or null if it is exhausted. */
	private boolean incrementSentence() throws IOException {
	if (length == 0) // we must refill the buffer
	return false;

	while (true) {
	int start = iterator.current();

	if (start == BreakIterator.DONE) return false; // BreakIterator exhausted

	// find the next set of boundaries
	int end = iterator.next();

	if (end == BreakIterator.DONE) return false; // BreakIterator exhausted

	setNextSentence(start, end);
	if (incrementWord()) {
	return true;
	}
	}
	}

	/** Provides the next input sentence for analysis */
	protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

	/** Returns true if another word is available */
	protected abstract boolean incrementWord();
	}