﻿using J2N;
using System.Globalization;

namespace Lucene.Net.Analysis.Miscellaneous
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// A BreakIterator-like API for iterating over subwords in text, according to <see cref="WordDelimiterFilter"/> rules.
    /// @lucene.internal
    /// </summary>
    public sealed class WordDelimiterIterator
    {
        /// <summary>
        /// Indicates the end of iteration </summary>
        public const int DONE = -1;

        /// <summary>
        /// Default character type table: 256 entries, initialized from LoadDefaultWordDelimTable() </summary>
        public static readonly byte[] DEFAULT_WORD_DELIM_TABLE = LoadDefaultWordDelimTable();

        /// <summary>
        /// Text currently being iterated; assigned by SetText </summary>
        internal char[] text;
        /// <summary>
        /// Length of the text, as passed to SetText </summary>
        private int length;

        /// <summary>
        /// Start position of text, excluding leading delimiters </summary>
        private int startBounds;
        /// <summary>
        /// End position of text, excluding trailing delimiters </summary>
        private int endBounds;

        /// <summary>
        /// Beginning of subword </summary>
        internal int current;
        /// <summary>
        /// End of subword, or <see cref="DONE"/> once iteration has finished </summary>
        internal int end;

        /// <summary>Does this string end with a possessive such as 's (recorded by SetBounds)</summary>
        private bool hasFinalPossessive = false;

        /// <summary>
        /// If false, causes case changes to be ignored (subwords will only be generated
        /// given SUBWORD_DELIM tokens). (Defaults to true)
        /// </summary>
        private readonly bool splitOnCaseChange;

        /// <summary>
        /// If false, causes numeric changes to be ignored (subwords will only be generated
        /// given SUBWORD_DELIM tokens). (Defaults to true)
        /// </summary>
        private readonly bool splitOnNumerics;

        /// <summary>
        /// If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
        /// <p/>
        /// "O'Neil's" => "O", "Neil"
        /// </summary>
        private readonly bool stemEnglishPossessive;

        /// <summary>
        /// Lookup table of character types, indexed by character value; characters at or beyond
        /// its length are classified by <see cref="GetType(int)"/> instead </summary>
        private readonly byte[] charTypeTable;

        /// <summary>
        /// If true, need to skip over a possessive found in the last call to Next() </summary>
        private bool skipPossessive = false;

        // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
        // done if separated by these chars?) "," would be an obvious candidate...
        /// <summary>
        /// Builds the 256-entry default character type table. Each value from 0 to 255 is tagged
        /// LOWER, UPPER or DIGIT (tested in that order); anything that matches none of them
        /// becomes SUBWORD_DELIM.
        /// </summary>
        /// <returns> the freshly populated table </returns>
        private static byte[] LoadDefaultWordDelimTable() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            byte[] table = new byte[256];
            for (int codePoint = 0; codePoint < 256; codePoint++)
            {
                // At most one of the three flags is picked; 0 means "nothing matched".
                int flags = 0;
                if (Character.IsLower(codePoint))
                {
                    flags = WordDelimiterFilter.LOWER;
                }
                else if (Character.IsUpper(codePoint))
                {
                    flags = WordDelimiterFilter.UPPER;
                }
                else if (Character.IsDigit(codePoint))
                {
                    flags = WordDelimiterFilter.DIGIT;
                }

                // Unclassified characters act as subword delimiters.
                table[codePoint] = (byte)(flags != 0 ? flags : WordDelimiterFilter.SUBWORD_DELIM);
            }
            return table;
        }

        /// <summary>
        /// Creates a <see cref="WordDelimiterIterator"/> that splits text according to the given rules.
        /// </summary>
        /// <param name="charTypeTable"> table containing character types </param>
        /// <param name="splitOnCaseChange"> if true, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless) </param>
        /// <param name="splitOnNumerics"> if true, causes "j2se" to be three tokens: "j" "2" "se" </param>
        /// <param name="stemEnglishPossessive"> if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" </param>
        internal WordDelimiterIterator(byte[] charTypeTable, bool splitOnCaseChange, bool splitOnNumerics, bool stemEnglishPossessive)
        {
            // Splitting rules first, then the lookup table they are applied with.
            this.stemEnglishPossessive = stemEnglishPossessive;
            this.splitOnNumerics = splitOnNumerics;
            this.splitOnCaseChange = splitOnCaseChange;
            this.charTypeTable = charTypeTable;
        }

        /// <summary>
        /// Moves the iterator to the next subword. On success <c>current</c> holds the start of the
        /// subword and <c>end</c> the position just past it.
        /// </summary>
        /// <returns> the end position of the subword that was found, or <see cref="DONE"/> if all subwords have been returned </returns>
        internal int Next()
        {
            if (end == DONE)
            {
                // Iteration already finished; stay finished.
                current = DONE;
                return DONE;
            }

            // Resume where the previous subword stopped, hopping over a pending "'s".
            current = skipPossessive ? end + 2 : end;
            skipPossessive = false;

            // Skip any delimiters in front of the next subword, remembering the last type seen.
            int previousType = 0;
            for (; current < endBounds; current++)
            {
                previousType = CharType(text[current]);
                if (!WordDelimiterFilter.IsSubwordDelim(previousType))
                {
                    break;
                }
            }

            if (current >= endBounds)
            {
                end = DONE;
                return DONE;
            }

            // Extend the subword until a type transition counts as a break.
            end = current + 1;
            while (end < endBounds)
            {
                int nextType = CharType(text[end]);
                if (IsBreak(previousType, nextType))
                {
                    break;
                }
                previousType = nextType;
                end++;
            }

            // Remember whether an "'s" directly follows, so the next call can step over it.
            skipPossessive = end < endBounds - 1 && EndsWithPossessive(end + 2);

            return end;
        }


        /// <summary>
        /// Gets the type of the current subword, taken from its first character.
        /// LOWER and UPPER are both reported as ALPHA; once iteration is
        /// <see cref="DONE"/> the value is 0.
        /// </summary>
        /// <returns> type of the current word </returns>
        internal int Type
        {
            get
            {
                if (end == DONE)
                {
                    return 0;
                }

                int firstCharType = CharType(text[current]);

                // Callers see a single ALPHA word type for either letter case.
                bool isCasedLetter = firstCharType == WordDelimiterFilter.LOWER
                    || firstCharType == WordDelimiterFilter.UPPER;
                return isCasedLetter ? WordDelimiterFilter.ALPHA : firstCharType;
            }
        }

        /// <summary>
        /// Points the iterator at new text and clears all iteration state, then
        /// recomputes the word bounds.
        /// </summary>
        /// <param name="text"> New text </param>
        /// <param name="length"> length of the text </param>
        internal void SetText(char[] text, int length)
        {
            this.text = text;
            this.length = length;

            // Bounds start out covering the whole text; SetBounds() trims them below.
            startBounds = 0;
            endBounds = length;

            current = 0;
            end = 0;

            hasFinalPossessive = false;
            skipPossessive = false;

            SetBounds();
        }

        // ================================================= Helper Methods ================================================

        /// <summary>
        /// Decides whether moving from a character of <paramref name="lastType"/> to one of
        /// <paramref name="type"/> ends the current subword.
        /// </summary>
        /// <param name="lastType"> Last subword type </param>
        /// <param name="type"> Current subword type </param>
        /// <returns> <c>true</c> if the transition indicates a break, <c>false</c> otherwise </returns>
        private bool IsBreak(int lastType, int type)
        {
            // Types that share any flag never break.
            if ((lastType & type) != 0)
            {
                return false;
            }

            // ALPHA->ALPHA: always ignore if case isn't considered.
            if (!splitOnCaseChange && WordDelimiterFilter.IsAlpha(lastType) && WordDelimiterFilter.IsAlpha(type))
            {
                return false;
            }

            // UPPER->letter: Don't split
            if (WordDelimiterFilter.IsUpper(lastType) && WordDelimiterFilter.IsAlpha(type))
            {
                return false;
            }

            // From here on only the letter/digit rule can still suppress the break.
            if (splitOnNumerics)
            {
                return true;
            }

            // ALPHA->NUMERIC, NUMERIC->ALPHA: Don't split
            bool alphaToDigit = WordDelimiterFilter.IsAlpha(lastType) && WordDelimiterFilter.IsDigit(type);
            return !(alphaToDigit || (WordDelimiterFilter.IsDigit(lastType) && WordDelimiterFilter.IsAlpha(type)));
        }

        /// <summary>
        /// Tells whether the current subword spans the whole bounded text, i.e. the word consists of
        /// a single subword. The word may still be surrounded by delimiters, and a final possessive
        /// (two characters) is not counted as part of it.
        /// </summary>
        /// <returns> <c>true</c> if the current word contains only one subword, <c>false</c> otherwise </returns>
        internal bool IsSingleWord()
        {
            // A trailing "'s" occupies the last two positions inside the bounds.
            int expectedEnd = hasFinalPossessive ? endBounds - 2 : endBounds;
            return current == startBounds && end == expectedEnd;
        }

        /// <summary>
        /// Trims leading and trailing delimiters by narrowing <c>startBounds</c> and <c>endBounds</c>,
        /// then positions <c>current</c> at the start. A final possessive is only recorded here,
        /// not removed.
        /// </summary>
        private void SetBounds()
        {
            // Advance past leading delimiters.
            for (; startBounds < length; startBounds++)
            {
                if (!WordDelimiterFilter.IsSubwordDelim(CharType(text[startBounds])))
                {
                    break;
                }
            }

            // Retreat past trailing delimiters, never crossing the start.
            for (; endBounds > startBounds; endBounds--)
            {
                if (!WordDelimiterFilter.IsSubwordDelim(CharType(text[endBounds - 1])))
                {
                    break;
                }
            }

            if (EndsWithPossessive(endBounds))
            {
                hasFinalPossessive = true;
            }

            current = startBounds;
        }

        /// <summary>
        /// Checks whether the two characters just before <paramref name="pos"/> form an English
        /// possessive ("'s" or "'S") that should be removed: possessive stemming must be enabled, the
        /// character before the apostrophe must be alphabetic, and <paramref name="pos"/> must be
        /// either the end bound or a subword delimiter.
        /// </summary>
        /// <param name="pos"> Position in the text to check if it indicates an English possessive </param>
        /// <returns> <c>true</c> if the text at the position indicates an English possessive, <c>false</c> otherwise </returns>
        private bool EndsWithPossessive(int pos)
        {
            // Needs stemming enabled and at least three characters in front of pos.
            if (!stemEnglishPossessive || pos <= 2)
            {
                return false;
            }

            if (text[pos - 2] != '\'')
            {
                return false;
            }

            if (text[pos - 1] != 's' && text[pos - 1] != 'S')
            {
                return false;
            }

            if (!WordDelimiterFilter.IsAlpha(CharType(text[pos - 3])))
            {
                return false;
            }

            // The "'s" must close the word: nothing after it, or only a delimiter.
            return pos == endBounds || WordDelimiterFilter.IsSubwordDelim(CharType(text[pos]));
        }

        /// <summary>
        /// Looks up the type of a character: values covered by the configured table come from the
        /// table, all others are classified by <see cref="GetType(int)"/>.
        /// </summary>
        /// <param name="ch"> Character whose type is to be determined </param>
        /// <returns> Type of the character </returns>
        private int CharType(int ch)
        {
            return ch < charTypeTable.Length
                ? charTypeTable[ch]
                : GetType(ch);
        }

        /// <summary>
        /// Classifies a character by its Unicode general category into one of the
        /// <see cref="WordDelimiterFilter"/> character types.
        /// </summary>
        /// <param name="ch"> Character whose type is to be determined </param>
        /// <returns> Type of the character </returns>
        public static byte GetType(int ch)
        {
            UnicodeCategory category = Character.GetType(ch);

            if (category == UnicodeCategory.UppercaseLetter)
            {
                return WordDelimiterFilter.UPPER;
            }

            if (category == UnicodeCategory.LowercaseLetter)
            {
                return WordDelimiterFilter.LOWER;
            }

            // Letters without a simple upper/lower case, and combining marks.
            bool isOtherLetterOrMark =
                category == UnicodeCategory.TitlecaseLetter ||
                category == UnicodeCategory.ModifierLetter ||
                category == UnicodeCategory.OtherLetter ||
                category == UnicodeCategory.NonSpacingMark ||
                category == UnicodeCategory.EnclosingMark || // depends what it encloses?
                category == UnicodeCategory.SpacingCombiningMark;
            if (isOtherLetterOrMark)
            {
                return WordDelimiterFilter.ALPHA;
            }

            bool isNumber =
                category == UnicodeCategory.DecimalDigitNumber ||
                category == UnicodeCategory.LetterNumber ||
                category == UnicodeCategory.OtherNumber;
            if (isNumber)
            {
                return WordDelimiterFilter.DIGIT;
            }

            // Surrogates carry both the ALPHA and the DIGIT flags.
            if (category == UnicodeCategory.Surrogate)
            {
                return WordDelimiterFilter.ALPHA | WordDelimiterFilter.DIGIT;
            }

            // Every remaining category (e.g. separators, control, format, private-use,
            // punctuation and symbols) is treated as a subword delimiter.
            return WordDelimiterFilter.SUBWORD_DELIM;
        }
    }
}