| using Lucene.Net.Analysis.Tokenattributes; |
| using Lucene.Net.Support; |
| using System; |
| using System.IO; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Analysis.Cjk |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// CJKTokenizer is designed for Chinese, Japanese, and Korean languages. |
| /// <para> |
| /// The tokens returned are every two adjacent characters with overlap match. |
| /// </para> |
| /// <para> |
| /// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4". |
| /// </para> |
| /// Additionally, the following is applied to Latin text (such as English): |
| /// <list type="bullet"> |
| /// <item><description>Text is converted to lowercase (culture-invariant).</description></item> |
| /// <item><description>Numeric digits, '+', '#', and '_' are tokenized as letters.</description></item> |
| /// <item><description>Full-width forms are converted to half-width forms.</description></item> |
| /// </list> |
| /// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation, |
| /// please search <a href="http://www.google.com/search?q=word+chinese+segment">google</a>. |
| /// </summary> |
| /// <remarks> |
| /// Deprecated: use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead. |
| /// </remarks> |
| [Obsolete("Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.")] |
| public sealed class CJKTokenizer : Tokenizer |
| { |
|     //~ Static fields/initializers --------------------------------------------- |
| |
|     /// <summary> |
|     /// Word token type; the value of <see cref="tokenType"/> before any character is stored. |
|     /// </summary> |
|     internal const int WORD_TYPE = 0; |
| |
|     /// <summary> |
|     /// Single byte token type (Basic Latin / width-folded text). |
|     /// </summary> |
|     internal const int SINGLE_TOKEN_TYPE = 1; |
| |
|     /// <summary> |
|     /// Double byte token type (letters outside the Latin handling path). |
|     /// </summary> |
|     internal const int DOUBLE_TOKEN_TYPE = 2; |
| |
|     /// <summary> |
|     /// Names for token types, indexed by <see cref="WORD_TYPE"/>, |
|     /// <see cref="SINGLE_TOKEN_TYPE"/> and <see cref="DOUBLE_TOKEN_TYPE"/>. |
|     /// </summary> |
|     internal static readonly string[] TOKEN_TYPE_NAMES = new string[] { "word", "single", "double" }; |
| |
|     /// <summary> |
|     /// Max word length; a Latin token is cut once this many characters are buffered. |
|     /// </summary> |
|     private const int MAX_WORD_LEN = 255; |
| |
|     /// <summary> |
|     /// Size of the I/O buffer, in characters. |
|     /// </summary> |
|     private const int IO_BUFFER_SIZE = 256; |
| |
|     /// <summary> |
|     /// Last code point of the Basic Latin block (U+0000 to U+007F). |
|     /// </summary> |
|     private const char BASIC_LATIN_LAST = '\u007F'; |
| |
|     /// <summary> |
|     /// First code point of the Halfwidth and Fullwidth Forms block. |
|     /// </summary> |
|     private const char HALFWIDTH_FULLWIDTH_FIRST = '\uFF00'; |
| |
|     /// <summary> |
|     /// Last code point of the Halfwidth and Fullwidth Forms block. |
|     /// </summary> |
|     private const char HALFWIDTH_FULLWIDTH_LAST = '\uFFEF'; |
| |
|     /// <summary> |
|     /// First full-width character that is folded to Basic Latin (65281). |
|     /// </summary> |
|     private const char FULLWIDTH_ASCII_FIRST = '\uFF01'; |
| |
|     /// <summary> |
|     /// Last full-width character that is folded to Basic Latin (65374). |
|     /// </summary> |
|     private const char FULLWIDTH_ASCII_LAST = '\uFF5E'; |
| |
|     /// <summary> |
|     /// Distance subtracted from a full-width character to obtain its Basic Latin form (65248). |
|     /// </summary> |
|     private const int FULLWIDTH_TO_ASCII_DELTA = 0xFEE0; |
| |
|     //~ Instance fields -------------------------------------------------------- |
| |
|     /// <summary> |
|     /// Character offset into the input; incremented for every character consumed |
|     /// and decremented again when a character is pushed back. |
|     /// </summary> |
|     private int offset = 0; |
| |
|     /// <summary> |
|     /// Index of the next character to take from <see cref="ioBuffer"/>. |
|     /// </summary> |
|     private int bufferIndex = 0; |
| |
|     /// <summary> |
|     /// Number of characters delivered by the last read into <see cref="ioBuffer"/>; |
|     /// a value of zero or less means the input is exhausted. |
|     /// </summary> |
|     private int dataLen = 0; |
| |
|     /// <summary> |
|     /// Character buffer holding the characters that compose the token being built. |
|     /// </summary> |
|     private readonly char[] buffer = new char[MAX_WORD_LEN]; |
| |
|     /// <summary> |
|     /// I/O buffer that receives characters read from the tokenizer's input. |
|     /// </summary> |
|     private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
| |
|     /// <summary> |
|     /// Type of the token being built: one of <see cref="WORD_TYPE"/> (initial value), |
|     /// <see cref="SINGLE_TOKEN_TYPE"/> or <see cref="DOUBLE_TOKEN_TYPE"/>. |
|     /// </summary> |
|     private int tokenType = WORD_TYPE; |
| |
|     /// <summary> |
|     /// Set when a two-character token has just been emitted and its second character |
|     /// was pushed back to start the next overlapping pair. If no further pair can be |
|     /// formed, that lone character is dropped instead of being emitted again. |
|     /// For "C1C2C3C4" this yields "C1C2", "C2C3", "C3C4" and no trailing "C4". |
|     /// </summary> |
|     private bool preIsTokened = false; |
| |
|     private ICharTermAttribute termAtt; |
|     private IOffsetAttribute offsetAtt; |
|     private ITypeAttribute typeAtt; |
| |
|     //~ Constructors ----------------------------------------------------------- |
| |
|     /// <summary> |
|     /// Construct a token stream processing the given input. |
|     /// </summary> |
|     /// <param name="in"> I/O reader </param> |
|     public CJKTokenizer(TextReader @in) |
|         : base(@in) |
|     { |
|         Init(); |
|     } |
| |
|     /// <summary> |
|     /// Construct a token stream processing the given input, passing the given |
|     /// attribute factory to the base tokenizer. |
|     /// </summary> |
|     /// <param name="factory"> attribute factory handed to the base class </param> |
|     /// <param name="in"> I/O reader </param> |
|     public CJKTokenizer(AttributeFactory factory, TextReader @in) |
|         : base(factory, @in) |
|     { |
|         Init(); |
|     } |
| |
|     /// <summary> |
|     /// Registers the term, offset and type attributes this tokenizer populates. |
|     /// Shared by both constructors. |
|     /// </summary> |
|     private void Init() |
|     { |
|         termAtt = AddAttribute<ICharTermAttribute>(); |
|         offsetAtt = AddAttribute<IOffsetAttribute>(); |
|         typeAtt = AddAttribute<ITypeAttribute>(); |
|     } |
| |
|     //~ Methods ---------------------------------------------------------------- |
| |
|     /// <summary> |
|     /// Returns true if <paramref name="c"/> lies in the Basic Latin block (U+0000 to U+007F). |
|     /// </summary> |
|     private static bool IsBasicLatin(char c) |
|     { |
|         return c <= BASIC_LATIN_LAST; |
|     } |
| |
|     /// <summary> |
|     /// Returns true if <paramref name="c"/> lies in the Halfwidth and Fullwidth Forms |
|     /// block (U+FF00 to U+FFEF). |
|     /// </summary> |
|     private static bool IsHalfwidthOrFullwidthForm(char c) |
|     { |
|         return c >= HALFWIDTH_FULLWIDTH_FIRST && c <= HALFWIDTH_FULLWIDTH_LAST; |
|     } |
| |
|     /// <summary> |
|     /// Advances to the next token in the stream. |
|     /// <para> |
|     /// Characters in the Basic Latin or Halfwidth and Fullwidth Forms blocks are |
|     /// accumulated into lowercased "single" tokens; other letters are emitted as |
|     /// overlapping two-character "double" tokens. |
|     /// </para> |
|     /// </summary> |
|     /// <returns> false for end of stream, true otherwise </returns> |
|     /// <exception cref="IOException"> when a read error happens in the underlying reader </exception> |
|     public override bool IncrementToken() |
|     { |
|         ClearAttributes(); |
| |
|         while (true) // loop until we find a non-empty token |
|         { |
|             // how many character(s) have been stored in buffer |
|             int length = 0; |
| |
|             // the position used to create the token |
|             int start = offset; |
| |
|             while (true) // loop until we've found a full token |
|             { |
|                 offset++; |
| |
|                 if (bufferIndex >= dataLen) |
|                 { |
|                     // I/O buffer exhausted: refill it from the reader |
|                     dataLen = input.Read(ioBuffer, 0, ioBuffer.Length); |
|                     bufferIndex = 0; |
|                 } |
| |
|                 if (dataLen <= 0) |
|                 { |
|                     // end of input |
|                     if (length > 0) |
|                     { |
|                         if (preIsTokened) |
|                         { |
|                             // the buffered character was already emitted in the previous pair |
|                             length = 0; |
|                             preIsTokened = false; |
|                         } |
|                         else |
|                         { |
|                             // undo the increment made for the character we could not read |
|                             offset--; |
|                         } |
| |
|                         break; |
|                     } |
| |
|                     offset--; |
|                     return false; |
|                 } |
| |
|                 // current character |
|                 char c = ioBuffer[bufferIndex++]; |
| |
|                 // Plain range comparisons replace the per-character string allocation |
|                 // and \p{IsBasicLatin} / \p{IsHalfwidthandFullwidthForms} regex matches. |
|                 bool isHalfwidthAndFullwidthForms = IsHalfwidthOrFullwidthForm(c); |
|                 if (IsBasicLatin(c) || isHalfwidthAndFullwidthForms) |
|                 { |
|                     if (isHalfwidthAndFullwidthForms && c >= FULLWIDTH_ASCII_FIRST && c <= FULLWIDTH_ASCII_LAST) |
|                     { |
|                         // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN |
|                         c = (char)(c - FULLWIDTH_TO_ASCII_DELTA); |
|                     } |
| |
|                     // if the current character is a letter, a digit, or "_" "+" "#" |
|                     if (char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#') |
|                     { |
|                         if (length == 0) |
|                         { |
|                             // the current character begins a new Latin token |
|                             start = offset - 1; |
|                         } |
|                         else if (tokenType == DOUBLE_TOKEN_TYPE) |
|                         { |
|                             // the buffer holds non-ASCII text: push the current character |
|                             // back so that it starts the next token |
|                             offset--; |
|                             bufferIndex--; |
| |
|                             if (preIsTokened) |
|                             { |
|                                 // only one non-ASCII character is buffered and it was |
|                                 // already emitted as part of the previous pair |
|                                 length = 0; |
|                                 preIsTokened = false; |
|                             } |
| |
|                             break; |
|                         } |
| |
|                         // Store the lowercased character. The invariant culture is used so |
|                         // that terms do not depend on the current thread culture (for |
|                         // example, Turkish maps 'I' to dotless 'i'). |
|                         buffer[length++] = char.ToLowerInvariant(c); |
|                         tokenType = SINGLE_TOKEN_TYPE; |
| |
|                         // stop once the token buffer is full |
|                         if (length == MAX_WORD_LEN) |
|                         { |
|                             break; |
|                         } |
|                     } |
|                     else if (length > 0) |
|                     { |
|                         // a separator ends the token that is being built |
|                         if (preIsTokened) |
|                         { |
|                             length = 0; |
|                             preIsTokened = false; |
|                         } |
|                         else |
|                         { |
|                             break; |
|                         } |
|                     } |
|                 } |
|                 else |
|                 { |
|                     // non-ASCII letter, e.g. "C1C2C3C4" |
|                     if (Character.IsLetter(c)) |
|                     { |
|                         if (length == 0) |
|                         { |
|                             start = offset - 1; |
|                             buffer[length++] = c; |
|                             tokenType = DOUBLE_TOKEN_TYPE; |
|                         } |
|                         else if (tokenType == SINGLE_TOKEN_TYPE) |
|                         { |
|                             // push the character back and return the previous ASCII characters |
|                             offset--; |
|                             bufferIndex--; |
|                             break; |
|                         } |
|                         else |
|                         { |
|                             buffer[length++] = c; |
|                             tokenType = DOUBLE_TOKEN_TYPE; |
| |
|                             if (length == 2) |
|                             { |
|                                 // emit the pair, then re-read its second character as the |
|                                 // first character of the next overlapping pair |
|                                 offset--; |
|                                 bufferIndex--; |
|                                 preIsTokened = true; |
| |
|                                 break; |
|                             } |
|                         } |
|                     } |
|                     else if (length > 0) |
|                     { |
|                         if (preIsTokened) |
|                         { |
|                             // empty the buffer |
|                             length = 0; |
|                             preIsTokened = false; |
|                         } |
|                         else |
|                         { |
|                             break; |
|                         } |
|                     } |
|                 } |
|             } |
| |
|             if (length > 0) |
|             { |
|                 termAtt.CopyBuffer(buffer, 0, length); |
|                 offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length)); |
|                 typeAtt.Type = TOKEN_TYPE_NAMES[tokenType]; |
|                 return true; |
|             } |
|             else if (dataLen <= 0) |
|             { |
|                 offset--; |
|                 return false; |
|             } |
| |
|             // Cycle back and try for the next token (don't |
|             // return an empty string) |
|         } |
|     } |
| |
|     /// <summary> |
|     /// Calls the base implementation, then sets both the start and end offset |
|     /// to the corrected final offset. |
|     /// </summary> |
|     public override void End() |
|     { |
|         base.End(); |
|         // set final offset |
|         int finalOffset = CorrectOffset(offset); |
|         offsetAtt.SetOffset(finalOffset, finalOffset); |
|     } |
| |
|     /// <summary> |
|     /// Calls the base implementation, then clears the read position, the buffered |
|     /// data length, the pending-pair flag and the token type. |
|     /// </summary> |
|     public override void Reset() |
|     { |
|         base.Reset(); |
|         offset = bufferIndex = dataLen = 0; |
|         preIsTokened = false; |
|         tokenType = WORD_TYPE; |
|     } |
| } |
| } |