blob: 396cc003fc80fd003bee947ef45123beeaed0262 [file] [log] [blame]
using ICU4N.Text;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Diagnostics;
using System;
using System.Diagnostics;
using System.IO;
namespace Lucene.Net.Analysis.Util
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Breaks text into sentences with a <see cref="BreakIterator"/> and
/// allows subclasses to decompose these sentences into words.
/// <para>
/// This can be used by subclasses that need sentence context
/// for tokenization purposes, such as CJK segmenters.
/// </para>
/// <para>
/// Additionally it can be used by subclasses that want to mark
/// sentence boundaries (with a custom attribute, extra token, position
/// increment, etc) for downstream processing.
/// @lucene.experimental
/// </para>
/// </summary>
public abstract class SegmentingTokenizerBase : Tokenizer
/// <summary>
/// Capacity, in chars, of the internal text buffer <see cref="m_buffer"/>. </summary>
protected const int BUFFERMAX = 1024;
/// <summary>
/// Buffer holding the chunk of input text that is currently being segmented. </summary>
protected readonly char[] m_buffer = new char[BUFFERMAX];
/// <summary>
/// true length of text in the buffer </summary>
private int length = 0;
/// <summary>
/// length in buffer that can be evaluated safely, up to a safe end point </summary>
private int usableLength = 0;
/// <summary>
/// accumulated offset of previous buffers for this reader, for offsetAtt </summary>
protected int m_offset = 0;
/// <summary>
/// Sentence break iterator supplied to the constructor; it is re-targeted at the
/// buffered text every time the buffer changes. </summary>
private readonly BreakIterator iterator;
/// <summary>
/// Character iterator laid over the usable region of <see cref="m_buffer"/>; its text
/// is what gets handed to <see cref="iterator"/>. </summary>
private readonly CharArrayIterator wrapper = CharArrayIterator.NewSentenceInstance();
/// <summary>
/// Offset attribute of this stream; receives the final offset when the stream ends. </summary>
private readonly IOffsetAttribute offsetAtt;
/// <summary>
/// Construct a new <see cref="SegmentingTokenizerBase"/>, using
/// the provided <see cref="BreakIterator"/> for sentence segmentation.
/// <para>
/// Note that you should never share <see cref="BreakIterator"/>s across different
/// <see cref="TokenStream"/>s, instead a newly created or cloned one should always
/// be provided to this constructor.
/// </para>
/// <para>
/// This overload delegates to the three-argument constructor, passing
/// <c>AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY</c> as the factory.
/// </para>
/// </summary>
/// <param name="reader">The input text to tokenize.</param>
/// <param name="iterator">The sentence <see cref="BreakIterator"/> dedicated to this instance.</param>
public SegmentingTokenizerBase(TextReader reader, BreakIterator iterator)
: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader, iterator)
/// <summary>
/// Construct a new <see cref="SegmentingTokenizerBase"/> with an explicit
/// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/>, which is forwarded
/// together with the reader to the base <see cref="Tokenizer"/> constructor.
/// </summary>
/// <param name="factory">The attribute factory passed on to the base class.</param>
/// <param name="reader">The input text to tokenize.</param>
/// <param name="iterator">The sentence <see cref="BreakIterator"/> dedicated to this instance.</param>
public SegmentingTokenizerBase(AttributeFactory factory, TextReader reader, BreakIterator iterator)
    : base(factory, reader)
{
    // Keep the caller's sentence iterator, then register the offset attribute on this stream.
    this.iterator = iterator;
    this.offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Advances the stream to the next token.
/// <para>
/// The current sentence is asked for another word first. When it has none (or nothing
/// has been buffered yet), the tokenizer moves on to the next sentence, refilling the
/// buffer from the input whenever the buffered text is used up.
/// </para>
/// </summary>
/// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
public override sealed bool IncrementToken()
{
    if (length == 0 || !IncrementWord())
    {
        while (!IncrementSentence())
        {
            // No further sentence with a word in the buffered text: pull in more input.
            // Without this call the buffer would never be filled and no token would ever be produced.
            Refill();
            if (length <= 0) // no more chars to read
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Resets this tokenizer so that it can process a new input: the base class is reset
/// first, then the sentence iterator is pointed at an empty text and all buffer
/// bookkeeping (<c>length</c>, <c>usableLength</c>, <see cref="m_offset"/>) is zeroed.
/// </summary>
public override void Reset()
{
    // The TokenStream contract requires overrides to call the base implementation,
    // otherwise internal state of the base classes is not reset.
    base.Reset();
    // Empty region: nothing is buffered until the first refill.
    wrapper.SetText(m_buffer, 0, 0);
    // The break iterator is given a string copy of the wrapped region.
    iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
    length = usableLength = m_offset = 0;
}
/// <summary>
/// Performs end-of-stream operations: after the base class has done its own work, sets
/// both ends of the offset attribute to the corrected final offset of the input.
/// </summary>
public override sealed void End()
{
    // The TokenStream contract requires overrides to call the base implementation first.
    base.End();
    // Final offset = text consumed in earlier buffers plus whatever is in the current one.
    int finalOffset = CorrectOffset(length < 0 ? m_offset : m_offset + length);
    offsetAtt.SetOffset(finalOffset, finalOffset);
}
/// <summary>
/// Scans the buffered text backwards for the last unambiguous break position.
/// </summary>
/// <returns>
/// The index just past the last char accepted by <see cref="IsSafeEnd(char)"/>,
/// or <c>-1</c> when the buffered text contains no such char.
/// </returns>
private int FindSafeEnd()
{
    int index = length;
    while (--index >= 0)
    {
        if (IsSafeEnd(m_buffer[index]))
        {
            return index + 1;
        }
    }

    return -1;
}
/// <summary>
/// Tells whether <paramref name="ch"/> marks an unambiguous break position for
/// sentence tokenization. Subclasses may override this to accept other characters.
/// </summary>
/// <param name="ch">The character to test.</param>
/// <returns>
/// <c>true</c> for U+000D, U+000A, U+0085, U+2028 and U+2029; <c>false</c> for
/// every other character.
/// </returns>
protected virtual bool IsSafeEnd(char ch)
{
    return ch == '\r'       // U+000D carriage return
        || ch == '\n'       // U+000A line feed
        || ch == '\u0085'   // U+0085 next line
        || ch == '\u2028'   // U+2028 line separator
        || ch == '\u2029';  // U+2029 paragraph separator
}
/// <summary>
/// Refill the buffer, accumulating the offset and setting usableLength to the
/// last unambiguous break position.
/// <para>
/// The unconsumed tail of the buffer is moved to the front, the rest of the buffer is
/// filled from the input, and the sentence iterator is re-targeted at the usable part.
/// </para>
/// </summary>
private void Refill()
{
    // Text before usableLength has been handed out already; account for it in the running offset.
    m_offset += usableLength;
    int leftover = length - usableLength;
    // Slide the not-yet-evaluated tail to the start of the buffer.
    Array.Copy(m_buffer, usableLength, m_buffer, 0, leftover);
    int requested = m_buffer.Length - leftover;
    int returned = Read(m_input, m_buffer, leftover, requested);
    length = returned < 0 ? leftover : returned + leftover;
    if (returned < requested) // reader has been emptied, process the rest
    {
        usableLength = length;
    }
    else // still more data to be read, find a safe-stopping place
    {
        usableLength = FindSafeEnd();
        if (usableLength < 0)
        {
            /*
             * more than BUFFERMAX chars of text without breaks,
             * gonna possibly truncate tokens
             */
            usableLength = length;
        }
    }

    wrapper.SetText(m_buffer, 0, Math.Max(0, usableLength));
    // The break iterator is given a string copy of the wrapped region.
    iterator.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
}
// TODO: refactor to a shared readFully somewhere
// (NGramTokenizer does this too):
/// <summary>
/// commons-io's readFully, but without bugs if offset != 0.
/// Keeps reading until <paramref name="length"/> chars have been stored or the reader
/// reports end of input.
/// </summary>
/// <param name="input">The reader to pull chars from.</param>
/// <param name="buffer">The destination array.</param>
/// <param name="offset">The index in <paramref name="buffer"/> at which to store the first char.</param>
/// <param name="length">The maximum number of chars to read; must not be negative.</param>
/// <returns>The number of chars actually read, which is less than <paramref name="length"/> only at end of input.</returns>
private static int Read(TextReader input, char[] buffer, int offset, int length)
{
    if (Debugging.AssertsEnabled) Debugging.Assert(length >= 0,"length must not be negative: {0}", length);
    int remaining = length;
    while (remaining > 0)
    {
        int location = length - remaining;
        int count = input.Read(buffer, offset + location, remaining);
        if (count <= 0) // EOF
        {
            // Stop here: nothing more will arrive, and looping on would never terminate.
            break;
        }

        remaining -= count;
    }

    return length - remaining;
}
/// <summary>
/// Moves through the sentences of the buffered text until one of them yields a word.
/// </summary>
/// <returns>
/// <c>true</c> if there is a token from the buffer; <c>false</c> if the buffer is
/// empty or its sentences are exhausted.
/// </returns>
private bool IncrementSentence()
{
    if (length != 0)
    {
        for (;;)
        {
            int sentenceStart = iterator.Current;
            if (sentenceStart == BreakIterator.Done)
            {
                break; // BreakIterator exhausted
            }

            // find the next set of boundaries
            int sentenceEnd = iterator.Next();
            if (sentenceEnd == BreakIterator.Done)
            {
                break; // BreakIterator exhausted
            }

            SetNextSentence(sentenceStart, sentenceEnd);
            if (IncrementWord())
            {
                return true;
            }
        }
    }

    // Either nothing is buffered (we must refill the buffer) or the iterator ran out.
    return false;
}
/// <summary>
/// Provides the next input sentence for analysis.
/// It is invoked with two consecutive boundaries reported by the sentence
/// <see cref="BreakIterator"/>, immediately before <see cref="IncrementWord"/> is
/// asked for the first word of that sentence.
/// </summary>
/// <param name="sentenceStart">Boundary at which the sentence starts, as reported by the break iterator
/// (presumably an index into the buffered text - confirm against implementations).</param>
/// <param name="sentenceEnd">Boundary at which the sentence ends, as reported by the break iterator.</param>
protected abstract void SetNextSentence(int sentenceStart, int sentenceEnd);
/// <summary>
/// Returns true if another word is available.
/// When it returns false the tokenizer moves on to the next sentence.
/// </summary>
/// <returns><c>true</c> if another word is available; otherwise <c>false</c>.</returns>
protected abstract bool IncrementWord();