using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Support;
using System;
using System.IO;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Cjk
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
/// <para>
/// The tokens returned are every two adjacent characters with overlap match.
/// </para>
/// <para>
/// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
/// </para>
/// Additionally, the following is applied to Latin text (such as English):
/// <ul>
/// <li>Text is converted to lowercase.</li>
/// <li>Numeric digits, '+', '#', and '_' are tokenized as letters.</li>
/// <li>Full-width forms are converted to half-width forms.</li>
/// </ul>
/// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation,
/// see <a
/// href="http://www.google.com/search?q=word+chinese+segment">this Google search</a>.
/// </summary>
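/// <example>
/// A rough usage sketch (the consuming loop follows the standard Lucene.Net 4.x
/// <see cref="TokenStream"/> contract; the sample input text is illustrative):
/// <code>
/// using (var tokenizer = new CJKTokenizer(new StringReader("java 你好世界")))
/// {
///     var term = tokenizer.GetAttribute&lt;ICharTermAttribute&gt;();
///     tokenizer.Reset();
///     while (tokenizer.IncrementToken())
///     {
///         Console.WriteLine(term.ToString()); // "java", "你好", "好世", "世界"
///     }
///     tokenizer.End();
/// }
/// </code>
/// </example>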
/// @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
[Obsolete("Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.")]
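// A rough sketch of the replacement chain named in the deprecation message above
// (assuming the Lucene.Net 4.x analysis APIs; the method name and wiring are
// illustrative, mirroring what CJKAnalyzer builds minus its stop-word filtering):
//
//   TokenStream BuildCjkChain(LuceneVersion version, TextReader reader)
//   {
//       Tokenizer source = new StandardTokenizer(version, reader);
//       TokenStream stream = new CJKWidthFilter(source);   // full-width -> half-width
//       stream = new LowerCaseFilter(version, stream);     // lowercase Latin text
//       stream = new CJKBigramFilter(stream);              // overlapping CJK bigrams
//       return stream;
//   }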
public sealed class CJKTokenizer : Tokenizer
{
//~ Static fields/initializers ---------------------------------------------
/// <summary>
/// Word token type </summary>
internal const int WORD_TYPE = 0;
/// <summary>
/// Single byte token type </summary>
internal const int SINGLE_TOKEN_TYPE = 1;
/// <summary>
/// Double byte token type </summary>
internal const int DOUBLE_TOKEN_TYPE = 2;
/// <summary>
/// Names for token types </summary>
internal static readonly string[] TOKEN_TYPE_NAMES = new string[] { "word", "single", "double" };
/// <summary>
/// Max word length </summary>
private const int MAX_WORD_LEN = 255;
/// <summary>
/// I/O buffer size </summary>
private const int IO_BUFFER_SIZE = 256;
//~ Instance fields --------------------------------------------------------
/// <summary>
/// word offset, used to indicate which character in the input is currently being parsed </summary>
private int offset = 0;
/// <summary>
/// the index used only for ioBuffer </summary>
private int bufferIndex = 0;
/// <summary>
/// data length </summary>
private int dataLen = 0;
/// <summary>
/// character buffer, stores the characters which are used to compose
/// the returned Token
/// </summary>
private readonly char[] buffer = new char[MAX_WORD_LEN];
/// <summary>
/// I/O buffer, used to store the content of the input (one of the
/// members of Tokenizer)
/// </summary>
private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
/// <summary>
/// word type: single => ASCII, double => non-ASCII, word => default </summary>
private int tokenType = WORD_TYPE;
/// <summary>
/// flag: the previous character was a double-byte character that has already been
/// emitted as part of a bigram and pushed back to start the next one. For input
/// "C1C2C3C4" this yields the overlapping bigrams "C1C2", "C2C3", "C3C4"; when the
/// flag is set at a token boundary, the pushed-back character is dropped instead of
/// being emitted again as a single-character token.
/// </summary>
private bool preIsTokened = false;
private ICharTermAttribute termAtt;
private IOffsetAttribute offsetAtt;
private ITypeAttribute typeAtt;
//~ Constructors -----------------------------------------------------------
/// <summary>
/// Construct a token stream processing the given input.
/// </summary>
/// <param name="in"> I/O reader </param>
public CJKTokenizer(TextReader @in)
: base(@in)
{
Init();
}
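/// <summary>
/// Construct a token stream processing the given input, using the given
/// <see cref="AttributeFactory"/> to create its attributes.
/// </summary>
/// <param name="factory"> attribute factory </param>
/// <param name="in"> I/O reader </param>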
public CJKTokenizer(AttributeFactory factory, TextReader @in)
: base(factory, @in)
{
Init();
}
private void Init()
{
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
typeAtt = AddAttribute<ITypeAttribute>();
}
//~ Methods ----------------------------------------------------------------
/// <summary>
/// Returns true for the next token in the stream, or false at end of stream.
/// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
/// for details on the Unicode blocks referenced below.
/// </summary>
/// <returns> false for end of stream, true otherwise
/// </returns>
/// <exception cref="IOException"> when a read error occurs while reading from the input
/// </exception>
public override bool IncrementToken()
{
ClearAttributes();
// length: how many characters have been stored in the buffer
while (true) // loop until we find a non-empty token
{
int length = 0;
// start: the position used to create the Token
int start = offset;
while (true) // loop until we've found a full token
{
// c: the current character
char c;
offset++;
if (bufferIndex >= dataLen)
{
dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
bufferIndex = 0;
}
if (dataLen <= 0)
{
if (length > 0)
{
if (preIsTokened)
{
length = 0;
preIsTokened = false;
}
else
{
offset--;
}
break;
}
else
{
offset--;
return false;
}
}
else
{
//get current character
c = ioBuffer[bufferIndex++];
}
// if the current character is ASCII or extended ASCII
// LUCENENET Port Reference: https://msdn.microsoft.com/en-us/library/20bw873z.aspx#SupportedNamedBlocks
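// note: \p{IsBasicLatin} matches U+0000-U+007F and \p{IsHalfwidthandFullwidthForms} matches U+FF00-U+FFEF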
string charAsString = new string(new char[] { c });
bool isHalfwidthAndFullwidthForms = Regex.IsMatch(charAsString, @"\p{IsHalfwidthandFullwidthForms}");
if (Regex.IsMatch(charAsString, @"\p{IsBasicLatin}") || isHalfwidthAndFullwidthForms)
{
if (isHalfwidthAndFullwidthForms)
{
int i = (int)c;
if (i >= 65281 && i <= 65374)
{
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
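// e.g. fullwidth 'Ａ' (U+FF21 = 65313) becomes 'A' (U+0041 = 65): 65313 - 65248 = 65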
i = i - 65248;
c = (char)i;
}
}
// if the current character is a letter, a digit, or one of '_', '+', '#'
if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
{
if (length == 0)
{
// "javaC1C2C3C4linux" <br>
// ^--: the current character begin to token the ASCII
// letter
start = offset - 1;
}
else if (tokenType == DOUBLE_TOKEN_TYPE)
{
// "javaC1C2C3C4linux" <br>
// ^--: the previous non-ASCII
// : the current character
offset--;
bufferIndex--;
if (preIsTokened)
{
// only one non-ASCII character has been stored in the buffer
length = 0;
preIsTokened = false;
break;
}
else
{
break;
}
}
// store the lowercased character in the buffer
buffer[length++] = char.ToLower(c);
tokenType = SINGLE_TOKEN_TYPE;
// break out of the loop if the buffer is full
if (length == MAX_WORD_LEN)
{
break;
}
}
else if (length > 0)
{
if (preIsTokened)
{
length = 0;
preIsTokened = false;
}
else
{
break;
}
}
}
else
{
// non-ASCII letter, e.g. "C1C2C3C4"
if (Character.IsLetter(c))
{
if (length == 0)
{
start = offset - 1;
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
}
else
{
if (tokenType == SINGLE_TOKEN_TYPE)
{
offset--;
bufferIndex--;
//return the previous ASCII characters
break;
}
else
{
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2)
{
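// a full bigram is in the buffer; back up one position so that the second
// character also starts the next (overlapping) bigram, and remember that it
// has already been emitted as part of this token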
offset--;
bufferIndex--;
preIsTokened = true;
break;
}
}
}
}
else if (length > 0)
{
if (preIsTokened)
{
// empty the buffer
length = 0;
preIsTokened = false;
}
else
{
break;
}
}
}
}
if (length > 0)
{
termAtt.CopyBuffer(buffer, 0, length);
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
return true;
}
else if (dataLen <= 0)
{
offset--;
return false;
}
// Cycle back and try for the next token (don't
// return an empty string)
}
}
public override void End()
{
base.End();
// set final offset
int finalOffset = CorrectOffset(offset);
this.offsetAtt.SetOffset(finalOffset, finalOffset);
}
public override void Reset()
{
base.Reset();
offset = bufferIndex = dataLen = 0;
preIsTokened = false;
tokenType = WORD_TYPE;
}
}
}