// blob: 0d2ed0ca18626a7f02b45a564899a1077906d3e0 [file] [log] [blame]
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Lucene.Net.Search.Highlight.Test
{
/// <summary>
/// Splits a string into tokens separated by delimiter characters.
/// </summary>
/// <remarks>
/// A token is a maximal run of non-delimiter characters. When the tokenizer is
/// created with <c>includeDelims</c> set to true, every single delimiter character
/// is additionally returned as a one-character token of its own.
/// <para>
/// The <see cref="IEnumerator{T}"/> members are thin aliases and do not follow the
/// usual enumerator contract: <see cref="MoveNext"/> only tests for a further token
/// without advancing, and every read of <see cref="Current"/> consumes a token.
/// </para>
/// </remarks>
public class Tokenizer : IEnumerator<string>
{
    /// <summary>Index into <see cref="chars"/> of the next character to examine.</summary>
    private int currentPos = 0;

    /// <summary>Whether delimiter characters are returned as tokens.</summary>
    private bool includeDelims = false;

    /// <summary>Characters of the string being tokenized.</summary>
    private char[] chars = null;

    /// <summary>
    /// Current delimiter set. Defaults to space, tab, newline, carriage return and form feed.
    /// </summary>
    private string delimiters = " \t\n\r\f";

    /// <summary>
    /// Initializes a new instance that tokenizes <paramref name="source"/> using the
    /// default delimiter set.
    /// </summary>
    /// <param name="source">String to tokenize</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="source"/> is null.</exception>
    public Tokenizer(System.String source)
    {
        if (source == null)
        {
            throw new System.ArgumentNullException("source");
        }

        this.chars = source.ToCharArray();
    }

    /// <summary>
    /// Initializes a new instance that tokenizes <paramref name="source"/> using the
    /// given delimiter characters.
    /// </summary>
    /// <param name="source">String to tokenize</param>
    /// <param name="delimiters">String containing the delimiters</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="delimiters"/> is null.
    /// </exception>
    public Tokenizer(System.String source, System.String delimiters)
        : this(source)
    {
        if (delimiters == null)
        {
            throw new System.ArgumentNullException("delimiters");
        }

        this.delimiters = delimiters;
    }

    /// <summary>
    /// Initializes a new instance that tokenizes <paramref name="source"/> using the
    /// given delimiter characters, optionally returning the delimiters as tokens.
    /// </summary>
    /// <param name="source">String to tokenize</param>
    /// <param name="delimiters">String containing the delimiters</param>
    /// <param name="includeDelims">Determines if delimiters are included in the results.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="source"/> or <paramref name="delimiters"/> is null.
    /// </exception>
    public Tokenizer(System.String source, System.String delimiters, bool includeDelims)
        : this(source, delimiters)
    {
        this.includeDelims = includeDelims;
    }

    /// <summary>
    /// Returns the next token, using the current delimiter set, and advances past it.
    /// </summary>
    /// <returns>The string value of the token</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">There are no more tokens.</exception>
    public System.String NextToken()
    {
        return NextToken(this.delimiters);
    }

    /// <summary>
    /// Returns the next token using the provided delimiters and advances past it.
    /// </summary>
    /// <param name="delimiters">String containing the delimiters to use</param>
    /// <returns>The string value of the token</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="delimiters"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// There are no more tokens. The position is left unchanged in that case.
    /// </exception>
    public System.String NextToken(System.String delimiters)
    {
        if (delimiters == null)
        {
            throw new System.ArgumentNullException("delimiters");
        }

        // The supplied set is not temporary: it replaces the current delimiters for
        // this call and every later one. It is stored before looking for a token, so
        // it stays in effect even when this call throws.
        this.delimiters = delimiters;

        int start;
        int end;
        if (!TryFindToken(this.currentPos, out start, out end))
        {
            throw new System.ArgumentOutOfRangeException();
        }

        this.currentPos = end;
        return new string(this.chars, start, end - start);
    }

    /// <summary>
    /// Determines if there are more tokens to return from the source string.
    /// The position is not changed.
    /// </summary>
    /// <returns>True or false, depending if there are more tokens</returns>
    public bool HasMoreTokens()
    {
        int start;
        int end;
        return TryFindToken(this.currentPos, out start, out end);
    }

    /// <summary>
    /// Number of tokens that remain from the current position, using the current
    /// delimiter set. The position is not changed.
    /// </summary>
    public int Count
    {
        get
        {
            int remaining = 0;
            int start;
            int end;
            int from = this.currentPos;

            // Walk over the rest of the input token by token without consuming it.
            while (TryFindToken(from, out start, out end))
            {
                remaining++;
                from = end;
            }

            return remaining;
        }
    }

    /// <summary>
    /// Performs the same action as NextToken: each read consumes and returns a token.
    /// </summary>
    /// <exception cref="System.ArgumentOutOfRangeException">There are no more tokens.</exception>
    public string Current
    {
        get { return this.NextToken(); }
    }

    /// <summary>
    /// Performs the same action as NextToken: each read consumes and returns a token.
    /// </summary>
    object IEnumerator.Current
    {
        get { return Current; }
    }

    /// <summary>
    /// Performs the same action as HasMoreTokens; the position is not advanced.
    /// </summary>
    /// <returns>True or false, depending if there are more tokens</returns>
    public bool MoveNext()
    {
        return this.HasMoreTokens();
    }

    /// <summary>
    /// Does nothing. In particular, the position is not moved back to the start.
    /// </summary>
    public void Reset()
    {
    }

    /// <summary>
    /// Does nothing.
    /// </summary>
    public void Dispose()
    {
    }

    /// <summary>
    /// Tells whether <paramref name="c"/> belongs to the current delimiter set.
    /// </summary>
    private bool IsDelimiter(char c)
    {
        return this.delimiters.IndexOf(c) >= 0;
    }

    /// <summary>
    /// Locates the token that begins at or after <paramref name="from"/> without
    /// changing the tokenizer position.
    /// </summary>
    /// <param name="from">Index at which to start looking.</param>
    /// <param name="start">Index of the first character of the token.</param>
    /// <param name="end">Index just past the last character of the token.</param>
    /// <returns>False when no token is left, true otherwise.</returns>
    private bool TryFindToken(int from, out int start, out int end)
    {
        int length = this.chars.Length;
        start = from;
        end = from;

        if (from >= length)
        {
            return false;
        }

        // A delimiter under the cursor is a token by itself when delimiters are returned.
        if (this.includeDelims && IsDelimiter(this.chars[from]))
        {
            end = from + 1;
            return true;
        }

        // Skip leading delimiters; reaching the end means only delimiters were left.
        while (start < length && IsDelimiter(this.chars[start]))
        {
            start++;
        }

        if (start == length)
        {
            return false;
        }

        // The token runs up to the next delimiter or the end of the input.
        end = start;
        while (end < length && !IsDelimiter(this.chars[end]))
        {
            end++;
        }

        return true;
    }
}
}