#if FEATURE_BREAKITERATOR
using ICU4N.Text;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Diagnostics;
using System;
using System.Diagnostics;
using System.IO;
namespace Lucene.Net.Analysis.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Breaks text into sentences with a <see cref="BreakIterator"/> and
/// allows subclasses to decompose these sentences into words.
/// <para>
/// This can be used by subclasses that need sentence context
/// for tokenization purposes, such as CJK segmenters.
/// </para>
/// <para>
/// Additionally it can be used by subclasses that want to mark
/// sentence boundaries (with a custom attribute, extra token, position
/// increment, etc) for downstream processing.
///
/// @lucene.experimental
/// </para>
/// </summary>
public abstract class SegmentingTokenizerBase : Tokenizer
{
protected const int BUFFERMAX = 1024;
protected readonly char[] m_buffer = new char[BUFFERMAX];
/// <summary>
/// true length of text in the buffer </summary>
private int length = 0;
/// <summary>
/// length in buffer that can be evaluated safely, up to a safe end point </summary>
private int usableLength = 0;
/// <summary>
/// accumulated offset of previous buffers for this reader, for offsetAtt </summary>
protected int m_offset = 0;
private readonly BreakIterator iterator;
private readonly CharArrayIterator wrapper = CharArrayIterator.NewSentenceInstance();
private readonly IOffsetAttribute offsetAtt;
/// <summary>
/// Construct a new SegmentingTokenizerBase, using
/// the provided <see cref="BreakIterator"/> for sentence segmentation.
/// <para>
/// Note that you should never share <see cref="BreakIterator"/>s across different
/// <see cref="TokenStream"/>s; instead, always provide a newly created or cloned
/// instance to this constructor.
/// </para>
/// </summary>
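/// <remarks>
/// A minimal usage sketch (hypothetical: <c>MySegmentingTokenizer</c> stands in for a concrete
/// subclass, and the <c>GetSentenceInstance</c>/<c>Clone</c> calls assume ICU4N mirrors the
/// ICU4J <see cref="BreakIterator"/> factory API):
/// <code>
/// // one shared prototype; every new TokenStream gets its own clone
/// static readonly BreakIterator sentenceProto = BreakIterator.GetSentenceInstance();
///
/// TextReader reader = new StringReader("One sentence. Another sentence.");
/// var tokenizer = new MySegmentingTokenizer(reader, (BreakIterator)sentenceProto.Clone());
/// </code>
/// </remarks>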
public SegmentingTokenizerBase(TextReader reader, BreakIterator iterator)
: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
{
}
/// <summary>
/// Construct a new SegmentingTokenizerBase, also supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>
/// </summary>
public SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
: base(factory, reader)
{
offsetAtt = AddAttribute<IOffsetAttribute>();
this.iterator = iterator;
}
public override sealed bool IncrementToken()
{
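// Emit another word from the current sentence if possible; otherwise advance
// to the next sentence, refilling the buffer from the reader as needed.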
if (length == 0 || !IncrementWord())
{
while (!IncrementSentence())
{
Refill();
if (length <= 0) // no more chars to read
{
return false;
}
}
}
return true;
}
public override void Reset()
{
base.Reset();
wrapper.SetText(m_buffer, 0, 0);
iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
length = usableLength = m_offset = 0;
}
public override sealed void End()
{
base.End();
int finalOffset = CorrectOffset(length < 0 ? m_offset : m_offset + length);
offsetAtt.SetOffset(finalOffset, finalOffset);
}
/// <summary>
/// Returns the last unambiguous break position in the text. </summary>
private int FindSafeEnd()
{
for (int i = length - 1; i >= 0; i--)
{
if (IsSafeEnd(m_buffer[i]))
{
return i + 1;
}
}
return -1;
}
/// <summary>
/// For sentence tokenization, these are the unambiguous break positions. </summary>
protected virtual bool IsSafeEnd(char ch)
{
switch ((int)ch)
{
case 0x000D: // CARRIAGE RETURN (CR)
case 0x000A: // LINE FEED (LF)
case 0x0085: // NEXT LINE (NEL)
case 0x2028: // LINE SEPARATOR
case 0x2029: // PARAGRAPH SEPARATOR
return true;
default:
return false;
}
}
/// <summary>
/// Refill the buffer, accumulating the offset and setting usableLength to the
/// last unambiguous break position
/// </summary>
private void Refill()
{
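// Slide any unprocessed tail of the previous buffer to the front,
// then top the buffer up from the reader.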
m_offset += usableLength;
int leftover = length - usableLength;
Array.Copy(m_buffer, usableLength, m_buffer, 0, leftover);
int requested = m_buffer.Length - leftover;
int returned = Read(m_input, m_buffer, leftover, requested);
length = returned < 0 ? leftover : returned + leftover;
if (returned < requested) // reader has been emptied, process the rest
{
usableLength = length;
}
else // still more data to be read, find a safe-stopping place
{
usableLength = FindSafeEnd();
if (usableLength < 0)
{
// more than BUFFERMAX chars of text without a safe break: tokens may be truncated
usableLength = length;
}
}
wrapper.SetText(m_buffer, 0, Math.Max(0, usableLength));
iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
// TODO: refactor to a shared readFully somewhere
// (NGramTokenizer does this too):
/// <summary>
/// commons-io's readFully, but without bugs if offset != 0 </summary>
private static int Read(TextReader input, char[] buffer, int offset, int length)
{
if (Debugging.AssertsEnabled) Debugging.Assert(length >= 0, () => "length must not be negative: " + length);
int remaining = length;
while (remaining > 0)
{
int location = length - remaining;
int count = input.Read(buffer, offset + location, remaining);
if (count <= 0) // EOF
{
break;
}
remaining -= count;
}
return length - remaining;
}
/// <summary>
/// Returns true if a token was produced from the buffer, or false if it is
/// exhausted.
/// </summary>
private bool IncrementSentence()
{
if (length == 0) // we must refill the buffer
{
return false;
}
while (true)
{
int start = iterator.Current;
if (start == BreakIterator.Done)
{
return false; // BreakIterator exhausted
}
// find the next set of boundaries
int end = iterator.Next();
if (end == BreakIterator.Done)
{
return false; // BreakIterator exhausted
}
SetNextSentence(start, end);
if (IncrementWord())
{
return true;
}
}
}
/// <summary>
/// Provides the next input sentence for analysis </summary>
protected abstract void SetNextSentence(int sentenceStart, int sentenceEnd);
/// <summary>
/// Returns true if another word is available </summary>
protected abstract bool IncrementWord();
}
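// ------------------------------------------------------------------------
// Illustrative sketch (not part of the original API): a minimal subclass that
// emits each detected sentence as a single token, showing how SetNextSentence
// and IncrementWord cooperate. The class name is hypothetical, and the
// BreakIterator.GetSentenceInstance() factory call assumes ICU4N mirrors the
// ICU4J API; adapt as needed.
// ------------------------------------------------------------------------
public sealed class ExampleSentenceTokenizer : SegmentingTokenizerBase
{
    private readonly ICharTermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private int sentenceStart, sentenceEnd;
    private bool sentencePending;

    public ExampleSentenceTokenizer(TextReader reader)
        : base(reader, BreakIterator.GetSentenceInstance()) // assumed ICU4N factory method
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    // Called by the base class with the bounds of the next sentence, relative to m_buffer.
    protected override void SetNextSentence(int sentenceStart, int sentenceEnd)
    {
        this.sentenceStart = sentenceStart;
        this.sentenceEnd = sentenceEnd;
        sentencePending = true;
    }

    // Emits the pending sentence as one token (trailing whitespace from the
    // sentence boundary is kept for simplicity), then reports exhaustion so
    // the base class advances to the next sentence, refilling as needed.
    protected override bool IncrementWord()
    {
        if (!sentencePending)
        {
            return false;
        }
        sentencePending = false;
        ClearAttributes();
        termAtt.CopyBuffer(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
        offsetAtt.SetOffset(
            CorrectOffset(m_offset + sentenceStart),
            CorrectOffset(m_offset + sentenceEnd));
        return true;
    }
}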
}
#endif