| using J2N; |
| using Lucene.Net.Analysis.Ja.Dict; |
| using Lucene.Net.Analysis.Ja.TokenAttributes; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Analysis.Util; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Fst; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Threading; |
| using Console = Lucene.Net.Util.SystemConsole; |
| |
| namespace Lucene.Net.Analysis.Ja |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Tokenizer for Japanese that uses morphological analysis. |
| /// </summary> |
| /// <remarks> |
| /// This tokenizer sets a number of additional attributes: |
| /// <list type="bullet"> |
| /// <item><description><see cref="IBaseFormAttribute"/> containing base form for inflected adjectives and verbs.</description></item> |
| /// <item><description><see cref="IPartOfSpeechAttribute"/> containing part-of-speech.</description></item> |
| /// <item><description><see cref="IReadingAttribute"/> containing reading and pronunciation.</description></item> |
| /// <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item> |
| /// </list> |
| /// <para/> |
| /// This tokenizer uses a rolling Viterbi search to find the |
| /// least cost segmentation (path) of the incoming characters. |
| /// For tokens that appear to be compound (> length 2 for all |
| /// Kanji, or > length 7 for non-Kanji), we see if there is a |
| /// 2nd best segmentation of that token after applying |
| /// penalties to the long tokens. If so, and the Mode is |
| /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation |
| /// as well. |
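| /// <para/> |
| /// A minimal usage sketch (hedged; the consume loop follows the standard |
| /// Lucene.Net <see cref="TokenStream"/> contract, and <c>reader</c> is an |
| /// arbitrary <see cref="TextReader"/> supplied by the caller): |
| /// <code> |
| /// using (var tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH)) |
| /// { |
| ///     var termAtt = tokenizer.GetAttribute&lt;ICharTermAttribute&gt;(); |
| ///     tokenizer.Reset(); |
| ///     while (tokenizer.IncrementToken()) |
| ///     { |
| ///         // termAtt now holds the surface form of the current token |
| ///     } |
| ///     tokenizer.End(); |
| /// } |
| /// </code> |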
| /// </remarks> |
| public sealed class JapaneseTokenizer : Tokenizer |
| { |
| // LUCENENET specific: de-nested Mode and renamed JapaneseTokenizerMode |
| |
| /// <summary> |
| /// Default tokenization mode. Currently this is <see cref="JapaneseTokenizerMode.SEARCH"/>. |
| /// </summary> |
| public static readonly JapaneseTokenizerMode DEFAULT_MODE = JapaneseTokenizerMode.SEARCH; |
| |
| // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType |
| |
| |
| private static readonly bool VERBOSE = false; |
| |
| private static readonly int SEARCH_MODE_KANJI_LENGTH = 2; |
| |
| private static readonly int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH |
| |
| private static readonly int SEARCH_MODE_KANJI_PENALTY = 3000; |
| |
| private static readonly int SEARCH_MODE_OTHER_PENALTY = 1700; |
| |
| // For safety: |
| private static readonly int MAX_UNKNOWN_WORD_LENGTH = 1024; |
| private static readonly int MAX_BACKTRACE_GAP = 1024; |
| |
| private readonly IDictionary<JapaneseTokenizerType, IDictionary> dictionaryMap = new Dictionary<JapaneseTokenizerType, IDictionary>(); |
| |
| private readonly TokenInfoFST fst; |
| private readonly TokenInfoDictionary dictionary; |
| private readonly UnknownDictionary unkDictionary; |
| private readonly ConnectionCosts costs; |
| private readonly UserDictionary userDictionary; |
| private readonly CharacterDefinition characterDefinition; |
| |
| private readonly FST.Arc<long?> arc = new FST.Arc<long?>(); |
| private readonly FST.BytesReader fstReader; |
| private readonly Int32sRef wordIdRef = new Int32sRef(); |
| |
| private readonly FST.BytesReader userFSTReader; |
| private readonly TokenInfoFST userFST; |
| |
| private readonly RollingCharBuffer buffer = new RollingCharBuffer(); |
| |
| private readonly WrappedPositionArray positions = new WrappedPositionArray(); |
| |
| private readonly bool discardPunctuation; |
| private readonly bool searchMode; |
| private readonly bool extendedMode; |
| private readonly bool outputCompounds; |
| |
| // Index of the last character of unknown word: |
| private int unknownWordEndIndex = -1; |
| |
| // True once we've hit the EOF from the input reader: |
| private bool end; |
| |
| // Last absolute position we backtraced from: |
| private int lastBackTracePos; |
| |
| // Position of last token we returned; we use this to |
| // figure out whether to set posIncr to 0 or 1: |
| private int lastTokenPos; |
| |
| // Next absolute position to process: |
| private int pos; |
| |
| // Tokens already parsed but not yet returned to the caller: |
| private readonly IList<Token> pending = new List<Token>(); |
| |
| private readonly ICharTermAttribute termAtt; |
| private readonly IOffsetAttribute offsetAtt; |
| private readonly IPositionIncrementAttribute posIncAtt; |
| private readonly IPositionLengthAttribute posLengthAtt; |
| private readonly IBaseFormAttribute basicFormAtt; |
| private readonly IPartOfSpeechAttribute posAtt; |
| private readonly IReadingAttribute readingAtt; |
| private readonly IInflectionAttribute inflectionAtt; |
| |
| /// <summary> |
| /// Create a new JapaneseTokenizer. |
| /// <para/> |
| /// Uses the default AttributeFactory. |
| /// </summary> |
| /// <param name="input">TextReader containing text.</param> |
| /// <param name="userDictionary">Optional: if non-null, user dictionary.</param> |
| /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param> |
| /// <param name="mode">Tokenization mode.</param> |
| public JapaneseTokenizer(TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode) |
| : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode) |
| { |
| } |
| |
| /// <summary> |
| /// Create a new JapaneseTokenizer. |
| /// </summary> |
| /// <param name="factory">The AttributeFactory to use.</param> |
| /// <param name="input">TextReader containing text.</param> |
| /// <param name="userDictionary">Optional: if non-null, user dictionary.</param> |
| /// <param name="discardPunctuation"><c>true</c> if punctuation tokens should be dropped from the output.</param> |
| /// <param name="mode">Tokenization mode.</param> |
| public JapaneseTokenizer |
| (AttributeFactory factory, TextReader input, UserDictionary userDictionary, bool discardPunctuation, JapaneseTokenizerMode mode) |
| : base(factory, input) |
| { |
| this.termAtt = AddAttribute<ICharTermAttribute>(); |
| this.offsetAtt = AddAttribute<IOffsetAttribute>(); |
| this.posIncAtt = AddAttribute<IPositionIncrementAttribute>(); |
| this.posLengthAtt = AddAttribute<IPositionLengthAttribute>(); |
| this.basicFormAtt = AddAttribute<IBaseFormAttribute>(); |
| this.posAtt = AddAttribute<IPartOfSpeechAttribute>(); |
| this.readingAtt = AddAttribute<IReadingAttribute>(); |
| this.inflectionAtt = AddAttribute<IInflectionAttribute>(); |
| |
| dictionary = TokenInfoDictionary.Instance; |
| fst = dictionary.FST; |
| unkDictionary = UnknownDictionary.Instance; |
| characterDefinition = unkDictionary.CharacterDefinition; |
| this.userDictionary = userDictionary; |
| costs = ConnectionCosts.Instance; |
| fstReader = fst.GetBytesReader(); |
| if (userDictionary != null) |
| { |
| userFST = userDictionary.FST; |
| userFSTReader = userFST.GetBytesReader(); |
| } |
| else |
| { |
| userFST = null; |
| userFSTReader = null; |
| } |
| this.discardPunctuation = discardPunctuation; |
| switch (mode) |
| { |
| case JapaneseTokenizerMode.SEARCH: |
| searchMode = true; |
| extendedMode = false; |
| outputCompounds = true; |
| break; |
| case JapaneseTokenizerMode.EXTENDED: |
| searchMode = true; |
| extendedMode = true; |
| outputCompounds = false; |
| break; |
| default: |
| searchMode = false; |
| extendedMode = false; |
| outputCompounds = false; |
| break; |
| } |
| buffer.Reset(this.m_input); |
| |
| ResetState(); |
| |
| dictionaryMap[JapaneseTokenizerType.KNOWN] = dictionary; |
| dictionaryMap[JapaneseTokenizerType.UNKNOWN] = unkDictionary; |
| dictionaryMap[JapaneseTokenizerType.USER] = userDictionary; |
| } |
| |
| private GraphvizFormatter dotOut; |
| |
| // LUCENENET specific - added getter and made into property |
| // so we can set this during object initialization. |
| |
| /// <summary> |
| /// Expert: set this to produce Graphviz (dot) output of |
| /// the Viterbi lattice. |
| /// </summary> |
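| /// <remarks> |
| /// A hedged sketch of enabling dot output via an object initializer; the |
| /// <c>GraphvizFormatter(ConnectionCosts)</c> constructor is assumed from the |
| /// corresponding Kuromoji class: |
| /// <code> |
| /// var tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH) |
| /// { |
| ///     GraphvizFormatter = new GraphvizFormatter(ConnectionCosts.Instance) |
| /// }; |
| /// </code> |
| /// </remarks> |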
| public GraphvizFormatter GraphvizFormatter |
| { |
| get => this.dotOut; |
| set => this.dotOut = value; |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| base.Dispose(disposing); |
| if (disposing) |
| { |
| buffer.Reset(m_input); |
| } |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| buffer.Reset(m_input); |
| ResetState(); |
| } |
| |
| private void ResetState() |
| { |
| positions.Reset(); |
| unknownWordEndIndex = -1; |
| pos = 0; |
| end = false; |
| lastBackTracePos = 0; |
| lastTokenPos = -1; |
| pending.Clear(); |
| |
| // Add BOS: |
| positions.Get(0).Add(0, 0, -1, -1, -1, JapaneseTokenizerType.KNOWN); |
| } |
| |
| public override void End() |
| { |
| base.End(); |
| // Set final offset |
| int finalOffset = CorrectOffset(pos); |
| offsetAtt.SetOffset(finalOffset, finalOffset); |
| } |
| |
| // Returns the added cost that a 2nd best segmentation is |
| // allowed to have. I.e., if we see a path with cost X, |
| // ending in a compound word, and this method returns |
| // threshold > 0, then we will also find the 2nd best |
| // segmentation and, if its path score is within this |
| // threshold of X, include it in the output: |
| private int ComputeSecondBestThreshold(int pos, int length) |
| { |
| // TODO: maybe we do something else here, instead of just |
| // using the penalty...? EG we can be more aggressive on |
| // when to also test for 2nd best path |
| return ComputePenalty(pos, length); |
| } |
| |
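| // Worked example using the constants above: a 4-Kanji candidate token is |
| // penalized (4 - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY |
| // = (4 - 2) * 3000 = 6000, while a 9-char non-Kanji token is penalized |
| // (9 - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY |
| // = (9 - 7) * 1700 = 3400. |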
| private int ComputePenalty(int pos, int length) |
| { |
| if (length > SEARCH_MODE_KANJI_LENGTH) |
| { |
| bool allKanji = true; |
| // check if node consists of only kanji |
| int endPos = pos + length; |
| for (int pos2 = pos; pos2 < endPos; pos2++) |
| { |
| if (!characterDefinition.IsKanji((char)buffer.Get(pos2))) |
| { |
| allKanji = false; |
| break; |
| } |
| } |
| if (allKanji) |
| { // Process only Kanji keywords |
| return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY; |
| } |
| else if (length > SEARCH_MODE_OTHER_LENGTH) |
| { |
| return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY; |
| } |
| } |
| return 0; |
| } |
| |
| // LUCENENET specific - de-nested Position class |
| |
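| // Extends the lattice with one arc: among all arcs arriving at fromPosData, |
| // picks the predecessor minimizing pathCost + bigramCost, then adds this |
| // word's cost once to the winner. E.g. with two incoming arcs of cost 10 |
| // and 12, bigram costs 5 and 1, and wordCost 7, the new arc is recorded |
| // with cost min(10 + 5, 12 + 1) + 7 = 13 + 7 = 20. |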
| private void Add(IDictionary dict, Position fromPosData, int endPos, int wordID, JapaneseTokenizerType type, bool addPenalty) |
| { |
| int wordCost = dict.GetWordCost(wordID); |
| int leftID = dict.GetLeftId(wordID); |
| int leastCost = int.MaxValue; |
| int leastIDX = -1; |
| if (Debugging.AssertsEnabled) Debugging.Assert(fromPosData.count > 0); |
| for (int idx = 0; idx < fromPosData.count; idx++) |
| { |
| // Cost is path cost so far, plus word cost (added at |
| // end of loop), plus bigram cost: |
| int cost = fromPosData.costs[idx] + costs.Get(fromPosData.lastRightID[idx], leftID); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.Get(fromPosData.lastRightID[idx], leftID) + " leftID=" + leftID); |
| } |
| if (cost < leastCost) |
| { |
| leastCost = cost; |
| leastIDX = idx; |
| if (VERBOSE) |
| { |
| Console.WriteLine(" **"); |
| } |
| } |
| } |
| |
| leastCost += wordCost; |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.Get(endPos).count); |
| } |
| |
| if ((addPenalty || (!outputCompounds && searchMode)) && type != JapaneseTokenizerType.USER) |
| { |
| int penalty = ComputePenalty(fromPosData.pos, endPos - fromPosData.pos); |
| if (VERBOSE) |
| { |
| if (penalty > 0) |
| { |
| Console.WriteLine(" + penalty=" + penalty + " cost=" + (leastCost + penalty)); |
| } |
| } |
| leastCost += penalty; |
| } |
| |
| //positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, wordID, type); |
| if (Debugging.AssertsEnabled) Debugging.Assert(leftID == dict.GetRightId(wordID)); |
| positions.Get(endPos).Add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type); |
| } |
| |
| public override bool IncrementToken() |
| { |
| |
| // parse() is able to return w/o producing any new |
| // tokens, when the tokens it had produced were entirely |
| // punctuation. So we loop here until we get a real |
| // token or we end: |
| while (pending.Count == 0) |
| { |
| if (end) |
| { |
| return false; |
| } |
| |
| // Push Viterbi forward some more: |
| Parse(); |
| } |
| |
| Token token = pending[pending.Count - 1]; // LUCENENET: the loop above ensures pending has at least 1 item |
| pending.RemoveAt(pending.Count - 1); // LUCENENET: RemoveAt is O(1) here and removes exactly the token we just read |
| |
| int position = token.Position; |
| int length = token.Length; |
| ClearAttributes(); |
| if (Debugging.AssertsEnabled) Debugging.Assert(length > 0); |
| //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length); |
| termAtt.CopyBuffer(token.SurfaceForm, token.Offset, length); |
| offsetAtt.SetOffset(CorrectOffset(position), CorrectOffset(position + length)); |
| basicFormAtt.SetToken(token); |
| posAtt.SetToken(token); |
| readingAtt.SetToken(token); |
| inflectionAtt.SetToken(token); |
| if (token.Position == lastTokenPos) |
| { |
| posIncAtt.PositionIncrement = 0; |
| posLengthAtt.PositionLength = token.PositionLength; |
| } |
| else |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(token.Position > lastTokenPos); |
| posIncAtt.PositionIncrement = 1; |
| posLengthAtt.PositionLength = 1; |
| } |
| if (VERBOSE) |
| { |
| Console.WriteLine(Thread.CurrentThread.Name + ": incToken: return token=" + token); |
| } |
| lastTokenPos = token.Position; |
| return true; |
| } |
| |
| /// <summary> |
| /// Incrementally parse some more characters. This runs |
| /// the Viterbi search forwards "enough" so that we |
| /// generate some more tokens. How far forward depends on |
| /// the chars coming in, since some chars could cause |
| /// longer-lasting ambiguity in the parsing. Once the |
| /// ambiguity is resolved, we backtrace, produce |
| /// the pending tokens, and return. |
| /// </summary> |
| private void Parse() |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine("\nPARSE"); |
| } |
| |
| // Advances over each position (character): |
| while (true) |
| { |
| |
| if (buffer.Get(pos) == -1) |
| { |
| // End |
| break; |
| } |
| |
| Position posData = positions.Get(pos); |
| bool isFrontier = positions.GetNextPos() == pos + 1; |
| |
| if (posData.count == 0) |
| { |
| // No arcs arrive here; move to next position: |
| if (VERBOSE) |
| { |
| Console.WriteLine(" no arcs in; skip pos=" + pos); |
| } |
| pos++; |
| continue; |
| } |
| |
| if (pos > lastBackTracePos && posData.count == 1 && isFrontier) |
| { |
| // We are at a "frontier", and only one node is |
| // alive, so whatever the eventual best path is must |
| // come through this node. So we can safely commit |
| // to the prefix of the best path at this point: |
| Backtrace(posData, 0); |
| |
| // Re-base cost so we don't risk int overflow: |
| posData.costs[0] = 0; |
| |
| if (pending.Count != 0) |
| { |
| return; |
| } |
| else |
| { |
| // This means the backtrace only produced |
| // punctuation tokens, so we must keep parsing. |
| } |
| } |
| |
| if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) |
| { |
| // Safety: if we've buffered too much, force a |
| // backtrace now. We find the least-cost partial |
| // path, across all paths, backtrace from it, and |
| // then prune all others. Note that this, in |
| // general, can produce the wrong result, if the |
| // total best path did not in fact back trace |
| // through this partial best path. But it's the |
| // best we can do... (short of not having a |
| // safety!). |
| |
| // First pass: find least cost partial path so far, |
| // including ending at future positions: |
| int leastIDX = -1; |
| int leastCost = int.MaxValue; |
| Position leastPosData = null; |
| for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++) |
| { |
| Position posData2 = positions.Get(pos2); |
| for (int idx = 0; idx < posData2.count; idx++) |
| { |
| //System.out.println(" idx=" + idx + " cost=" + cost); |
| int cost = posData2.costs[idx]; |
| if (cost < leastCost) |
| { |
| leastCost = cost; |
| leastIDX = idx; |
| leastPosData = posData2; |
| } |
| } |
| } |
| |
| // We will always have at least one live path: |
| if (Debugging.AssertsEnabled) Debugging.Assert(leastIDX != -1); |
| |
| // Second pass: prune all but the best path: |
| for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++) |
| { |
| Position posData2 = positions.Get(pos2); |
| if (posData2 != leastPosData) |
| { |
| posData2.Reset(); |
| } |
| else |
| { |
| if (leastIDX != 0) |
| { |
| posData2.costs[0] = posData2.costs[leastIDX]; |
| posData2.lastRightID[0] = posData2.lastRightID[leastIDX]; |
| posData2.backPos[0] = posData2.backPos[leastIDX]; |
| posData2.backIndex[0] = posData2.backIndex[leastIDX]; |
| posData2.backID[0] = posData2.backID[leastIDX]; |
| posData2.backType[0] = posData2.backType[leastIDX]; |
| } |
| posData2.count = 1; |
| } |
| } |
| |
| Backtrace(leastPosData, 0); |
| |
| // Re-base cost so we don't risk int overflow: |
| Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0); |
| |
| if (pos != leastPosData.pos) |
| { |
| // We jumped into a future position: |
| if (Debugging.AssertsEnabled) Debugging.Assert(pos < leastPosData.pos); |
| pos = leastPosData.pos; |
| } |
| |
| if (pending.Count != 0) |
| { |
| return; |
| } |
| else |
| { |
| // This means the backtrace only produced |
| // punctuation tokens, so we must keep parsing. |
| continue; |
| } |
| } |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine("\n extend @ pos=" + pos + " char=" + (char)buffer.Get(pos)); |
| } |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" " + posData.count + " arcs in"); |
| } |
| |
| bool anyMatches = false; |
| |
| // First try user dict: |
| if (userFST != null) |
| { |
| userFST.GetFirstArc(arc); |
| int output = 0; |
| for (int posAhead = posData.pos; ; posAhead++) |
| { |
| int ch = buffer.Get(posAhead); |
| if (ch == -1) |
| { |
| break; |
| } |
| if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) |
| { |
| break; |
| } |
| output += (int)arc.Output; |
| if (arc.IsFinal) |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1)); |
| } |
| Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false); |
| anyMatches = true; |
| } |
| } |
| } |
| |
| // TODO: we can be more aggressive about user |
| // matches? if we are "under" a user match then don't |
| // extend KNOWN/UNKNOWN paths? |
| |
| if (!anyMatches) |
| { |
| // Next, try known dictionary matches |
| fst.GetFirstArc(arc); |
| int output = 0; |
| |
| for (int posAhead = posData.pos; ; posAhead++) |
| { |
| int ch = buffer.Get(posAhead); |
| if (ch == -1) |
| { |
| break; |
| } |
| //System.out.println(" match " + (char) ch + " posAhead=" + posAhead); |
| |
| if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) |
| { |
| break; |
| } |
| |
| output += (int)arc.Output; |
| |
| // Optimization: for known words that are too-long |
| // (compound), we should pre-compute the 2nd |
| // best segmentation and store it in the |
| // dictionary instead of recomputing it each time a |
| // match is found. |
| |
| if (arc.IsFinal) |
| { |
| dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs"); |
| } |
| for (int ofs = 0; ofs < wordIdRef.Length; ofs++) |
| { |
| Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false); |
| anyMatches = true; |
| } |
| } |
| } |
| } |
| |
| // In normal mode, we don't process unknown words greedily. |
| |
| if (!searchMode && unknownWordEndIndex > posData.pos) |
| { |
| pos++; |
| continue; |
| } |
| |
| char firstCharacter = (char)buffer.Get(pos); |
| if (!anyMatches || characterDefinition.IsInvoke(firstCharacter)) |
| { |
| |
| // Find unknown match: |
| int characterId = characterDefinition.GetCharacterClass(firstCharacter); |
| bool isPunct = IsPunctuation(firstCharacter); |
| |
| // NOTE: copied from UnknownDictionary.lookup: |
| int unknownWordLength; |
| if (!characterDefinition.IsGroup(firstCharacter)) |
| { |
| unknownWordLength = 1; |
| } |
| else |
| { |
| // Extract the unknown word. Consecutive characters with the same character class are considered part of the same unknown word |
| unknownWordLength = 1; |
| for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) |
| { |
| int ch = buffer.Get(posAhead); |
| if (ch == -1) |
| { |
| break; |
| } |
| if (characterId == characterDefinition.GetCharacterClass((char)ch) && |
| IsPunctuation((char)ch) == isPunct) |
| { |
| unknownWordLength++; |
| } |
| else |
| { |
| break; |
| } |
| } |
| } |
| |
| unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same |
| if (VERBOSE) |
| { |
| Console.WriteLine(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs"); |
| } |
| for (int ofs = 0; ofs < wordIdRef.Length; ofs++) |
| { |
| Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false); |
| } |
| |
| unknownWordEndIndex = posData.pos + unknownWordLength; |
| } |
| |
| pos++; |
| } |
| |
| end = true; |
| |
| if (pos > 0) |
| { |
| |
| Position endPosData = positions.Get(pos); |
| int leastCost = int.MaxValue; |
| int leastIDX = -1; |
| if (VERBOSE) |
| { |
| Console.WriteLine(" end: " + endPosData.count + " nodes"); |
| } |
| for (int idx = 0; idx < endPosData.count; idx++) |
| { |
| // Add EOS cost: |
| int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0); |
| //System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]); |
| if (cost < leastCost) |
| { |
| leastCost = cost; |
| leastIDX = idx; |
| } |
| } |
| |
| Backtrace(endPosData, leastIDX); |
| } |
| else |
| { |
| // No characters in the input string; return no tokens! |
| } |
| } |
| |
| // Eliminates arcs from the lattice that are compound |
| // tokens (have a penalty) or are not congruent with the |
| // compound token we've matched (i.e., span across the |
| // startPos). This should be fairly efficient, because we |
| // just keep the already-intersected structure of the |
| // graph, e.g. we don't have to consult the FSTs again: |
| |
| private void PruneAndRescore(int startPos, int endPos, int bestStartIDX) |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" pruneAndRescore startPos=" + startPos + " endPos=" + endPos + " bestStartIDX=" + bestStartIDX); |
| } |
| |
| // First pass: walk backwards, building up the forward |
| // arcs and pruning inadmissible arcs: |
| for (int pos = endPos; pos > startPos; pos--) |
| { |
| Position posData = positions.Get(pos); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" back pos=" + pos); |
| } |
| for (int arcIDX = 0; arcIDX < posData.count; arcIDX++) |
| { |
| int backPos = posData.backPos[arcIDX]; |
| if (backPos >= startPos) |
| { |
| // Keep this arc: |
| //System.out.println(" keep backPos=" + backPos); |
| positions.Get(backPos).AddForward(pos, |
| arcIDX, |
| posData.backID[arcIDX], |
| posData.backType[arcIDX]); |
| } |
| else |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" prune"); |
| } |
| } |
| } |
| if (pos != startPos) |
| { |
| posData.count = 0; |
| } |
| } |
| |
| // Second pass: walk forward, re-scoring: |
| for (int pos = startPos; pos < endPos; pos++) |
| { |
| Position posData = positions.Get(pos); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" forward pos=" + pos + " count=" + posData.forwardCount); |
| } |
| if (posData.count == 0) |
| { |
| // No arcs arrive here... |
| if (VERBOSE) |
| { |
| Console.WriteLine(" skip"); |
| } |
| posData.forwardCount = 0; |
| continue; |
| } |
| |
| if (pos == startPos) |
| { |
| // On the initial position, only consider the best |
| // path so we "force congruence": the |
| // sub-segmentation is "in context" of what the best |
| // path (compound token) had matched: |
| int rightID; |
| if (startPos == 0) |
| { |
| rightID = 0; |
| } |
| else |
| { |
| rightID = GetDict(posData.backType[bestStartIDX]).GetRightId(posData.backID[bestStartIDX]); |
| } |
| int pathCost = posData.costs[bestStartIDX]; |
| for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) |
| { |
| JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX]; |
| IDictionary dict2 = GetDict(forwardType); |
| int wordID = posData.forwardID[forwardArcIDX]; |
| int toPos = posData.forwardPos[forwardArcIDX]; |
| int newCost = pathCost + dict2.GetWordCost(wordID) + |
| costs.Get(rightID, dict2.GetLeftId(wordID)) + |
| ComputePenalty(pos, toPos - pos); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + ComputePenalty(pos, toPos - pos) + " toPos.idx=" + positions.Get(toPos).count); |
| } |
| positions.Get(toPos).Add(newCost, |
| dict2.GetRightId(wordID), |
| pos, |
| bestStartIDX, |
| wordID, |
| forwardType); |
| } |
| } |
| else |
| { |
| // On non-initial positions, we maximize score |
| // across all arriving lastRightIDs: |
| for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++) |
| { |
| JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX]; |
| int toPos = posData.forwardPos[forwardArcIDX]; |
| if (VERBOSE) |
| { |
| Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos); |
| } |
| Add(GetDict(forwardType), |
| posData, |
| toPos, |
| posData.forwardID[forwardArcIDX], |
| forwardType, |
| true); |
| } |
| } |
| posData.forwardCount = 0; |
| } |
| } |
| |
| // Backtrace from the provided position, back to the last |
| // time we back-traced, accumulating the resulting tokens into |
| // the pending list. The pending list is then in reverse order |
| // (the last token should be returned first). |
| private void Backtrace(Position endPosData, int fromIDX) |
| { |
| int endPos = endPosData.pos; |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine("\n backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]); |
| } |
| |
| char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos); |
| |
| if (dotOut != null) |
| { |
| dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end); |
| } |
| |
| int pos = endPos; |
| int bestIDX = fromIDX; |
| Token altToken = null; |
| |
| // We trace backwards, so this will be the leftWordID of |
| // the token after the one we are now on: |
| int lastLeftWordID = -1; |
| |
| int backCount = 0; |
| |
| // TODO: sort of silly to make Token instances here; the |
| // back trace has all info needed to generate the |
| // token. So, we could just directly set the attrs, |
| // from the backtrace, in incrementToken w/o ever |
| // creating Token; we'd have to defer calling freeBefore |
| // until after the backtrace was fully "consumed" by |
| // incrementToken. |
| |
| while (pos > lastBackTracePos) |
| { |
| //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX); |
| Position posData = positions.Get(pos); |
| if (Debugging.AssertsEnabled) Debugging.Assert(bestIDX < posData.count); |
| |
| int backPos = posData.backPos[bestIDX]; |
| if (Debugging.AssertsEnabled) Debugging.Assert(backPos >= lastBackTracePos, () => "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos); |
| int length = pos - backPos; |
| JapaneseTokenizerType backType = posData.backType[bestIDX]; |
| int backID = posData.backID[bestIDX]; |
| int nextBestIDX = posData.backIndex[bestIDX]; |
| |
| if (outputCompounds && searchMode && altToken == null && backType != JapaneseTokenizerType.USER) |
| { |
| |
| // In searchMode, if best path had picked a too-long |
| // token, we use the "penalty" to compute the allowed |
| // max cost of an alternate back-trace. If we find an |
| // alternate back trace with cost below that |
| // threshold, we pursue it instead (but also output |
| // the long token). |
| //System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos); |
| |
| int penalty = ComputeSecondBestThreshold(backPos, pos - backPos); |
| |
| if (penalty > 0) |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID); |
| } |
| |
| // Use the penalty to set maxCost on the 2nd best |
| // segmentation: |
| int maxCost = posData.costs[bestIDX] + penalty; |
| if (lastLeftWordID != -1) |
| { |
| maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID); |
| } |
| |
| // Now, prune all too-long tokens from the graph: |
| PruneAndRescore(backPos, pos, |
| posData.backIndex[bestIDX]); |
| |
| // Finally, find 2nd best back-trace and resume |
| // backtrace there: |
| int leastCost = int.MaxValue; |
| int leastIDX = -1; |
| for (int idx = 0; idx < posData.count; idx++) |
| { |
| int cost = posData.costs[idx]; |
| //System.out.println(" idx=" + idx + " prevCost=" + cost); |
| |
| if (lastLeftWordID != -1) |
| { |
| cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]), |
| lastLeftWordID); |
| //System.out.println(" += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]), |
| //lastLeftWordID) + " -> " + cost); |
| } |
| //System.out.println("penalty " + posData.backPos[idx] + " to " + pos); |
| //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]); |
| if (cost < leastCost) |
| { |
| //System.out.println(" ** "); |
| leastCost = cost; |
| leastIDX = idx; |
| } |
| } |
| //System.out.println(" leastIDX=" + leastIDX); |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID); |
| } |
| |
| if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) |
| { |
| // We should have pruned the altToken from the graph: |
| if (Debugging.AssertsEnabled) Debugging.Assert(posData.backPos[leastIDX] != backPos); |
| |
| // Save the current compound token, to output when |
| // this alternate path joins back: |
| altToken = new Token(backID, |
| fragment, |
| backPos - lastBackTracePos, |
| length, |
| backType, |
| backPos, |
| GetDict(backType)); |
| |
| // Redirect our backtrace to 2nd best: |
| bestIDX = leastIDX; |
| nextBestIDX = posData.backIndex[bestIDX]; |
| |
| backPos = posData.backPos[bestIDX]; |
| length = pos - backPos; |
| backType = posData.backType[bestIDX]; |
| backID = posData.backID[bestIDX]; |
| backCount = 0; |
| //System.out.println(" do alt token!"); |
| |
| } |
| else |
| { |
| // I think in theory it's possible there is no |
| // 2nd best path, which is fine; in this case we |
| // only output the compound token: |
| //System.out.println(" no alt token! bestIDX=" + bestIDX); |
| } |
| } |
| } |
| |
| int offset = backPos - lastBackTracePos; |
| if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0); |
| |
| if (altToken != null && altToken.Position >= backPos) |
| { |
| |
| // We've backtraced to the position where the |
| // compound token starts; add it now: |
| |
| // The pruning we did when we created the altToken |
| // ensures that the back trace will align back with |
| // the start of the altToken: |
| if (Debugging.AssertsEnabled) Debugging.Assert(altToken.Position == backPos, () => altToken.Position + " vs " + backPos); |
| |
| // NOTE: not quite right: the compound token may |
| // have had all punctuation back traced so far, but |
| // then the decompounded token at this position is |
| // not punctuation. In this case backCount is 0, |
| // but we should maybe add the altToken anyway...? |
| |
| if (backCount > 0) |
| { |
| backCount++; |
| altToken.PositionLength = backCount; |
| if (VERBOSE) |
| { |
| Console.WriteLine(" add altToken=" + altToken); |
| } |
| pending.Add(altToken); |
| } |
| else |
| { |
| // This means alt token was all punct tokens: |
| if (VERBOSE) |
| { |
| Console.WriteLine(" discard all-punctuation altToken=" + altToken); |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(discardPunctuation); |
| } |
| altToken = null; |
| } |
| |
| IDictionary dict = GetDict(backType); |
| |
| if (backType == JapaneseTokenizerType.USER) |
| { |
| |
| // Expand the phraseID we recorded into the actual |
| // segmentation: |
| int[] wordIDAndLength = userDictionary.LookupSegmentation(backID); |
| int wordID = wordIDAndLength[0]; |
| int current = 0; |
| for (int j = 1; j < wordIDAndLength.Length; j++) |
| { |
| int len = wordIDAndLength[j]; |
| //System.out.println(" add user: len=" + len); |
| pending.Add(new Token(wordID + j - 1, |
| fragment, |
| current + offset, |
| len, |
| JapaneseTokenizerType.USER, |
| current + backPos, |
| dict)); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" add USER token=" + pending[pending.Count - 1]); |
| } |
| current += len; |
| } |
| |
| // Reverse the tokens we just added, because when we |
| // serve them up from incrementToken we serve in |
| // reverse: |
| Collections.Reverse(pending.SubList(pending.Count - (wordIDAndLength.Length - 1), |
| pending.Count)); |
| |
| backCount += wordIDAndLength.Length - 1; |
| } |
| else |
| { |
| |
| if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN) |
| { |
| // In EXTENDED mode we convert unknown word into |
| // unigrams: |
| int unigramTokenCount = 0; |
| for (int i = length - 1; i >= 0; i--) |
| { |
| int charLen = 1; |
| if (i > 0 && char.IsLowSurrogate(fragment[offset + i])) |
| { |
| i--; |
| charLen = 2; |
| } |
| //System.out.println(" extended tok offset=" |
| //+ (offset + i)); |
| if (!discardPunctuation || !IsPunctuation(fragment[offset + i])) |
| { |
| pending.Add(new Token(CharacterDefinition.NGRAM, |
| fragment, |
| offset + i, |
| charLen, |
| JapaneseTokenizerType.UNKNOWN, |
| backPos + i, |
| unkDictionary)); |
| unigramTokenCount++; |
| } |
| } |
| backCount += unigramTokenCount; |
| |
| } |
| else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset])) |
| { |
| pending.Add(new Token(backID, |
| fragment, |
| offset, |
| length, |
| backType, |
| backPos, |
| dict)); |
| if (VERBOSE) |
| { |
| Console.WriteLine(" add token=" + pending[pending.Count - 1]); |
| } |
| backCount++; |
| } |
| else |
| { |
| if (VERBOSE) |
| { |
| Console.WriteLine(" skip punctuation token=" + new string(fragment, offset, length)); |
| } |
| } |
| } |
| |
| lastLeftWordID = dict.GetLeftId(backID); |
| pos = backPos; |
| bestIDX = nextBestIDX; |
| } |
| |
| lastBackTracePos = endPos; |
| |
| if (VERBOSE) |
| { |
| Console.WriteLine(" freeBefore pos=" + endPos); |
| } |
| // Notify the circular buffers that we are done with |
| // these positions: |
| buffer.FreeBefore(endPos); |
| positions.FreeBefore(endPos); |
| } |
| |
| internal IDictionary GetDict(JapaneseTokenizerType type) |
| { |
| IDictionary result; |
| dictionaryMap.TryGetValue(type, out result); |
| return result; |
| } |
| |
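| // Treats separators, control/format chars, all punctuation categories, and |
| // symbols as punctuation; e.g. the ideographic full stop '。' (U+3002) and |
| // ideographic comma '、' (U+3001) fall under UnicodeCategory.OtherPunctuation. |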
| private static bool IsPunctuation(char ch) |
| { |
| switch (Character.GetType(ch)) |
| { |
| case UnicodeCategory.SpaceSeparator: |
| case UnicodeCategory.LineSeparator: |
| case UnicodeCategory.ParagraphSeparator: |
| case UnicodeCategory.Control: |
| case UnicodeCategory.Format: |
| case UnicodeCategory.DashPunctuation: |
| case UnicodeCategory.OpenPunctuation: |
| case UnicodeCategory.ClosePunctuation: |
| case UnicodeCategory.ConnectorPunctuation: |
| case UnicodeCategory.OtherPunctuation: |
| case UnicodeCategory.MathSymbol: |
| case UnicodeCategory.CurrencySymbol: |
| case UnicodeCategory.ModifierSymbol: |
| case UnicodeCategory.OtherSymbol: |
| case UnicodeCategory.InitialQuotePunctuation: |
| case UnicodeCategory.FinalQuotePunctuation: |
| return true; |
| default: |
| return false; |
| } |
| } |
| } |
| |
| // LUCENENET specific - de-nested Mode and renamed JapaneseTokenizerMode |
| |
| /// <summary> |
| /// Tokenization mode: this determines how the tokenizer handles |
| /// compound and unknown words. |
| /// </summary> |
| public enum JapaneseTokenizerMode |
| { |
| /// <summary> |
| /// Ordinary segmentation: no decomposition for compounds. |
| /// </summary> |
| NORMAL, |
| |
| /// <summary> |
| /// Segmentation geared towards search: this includes a |
| /// decompounding process for long nouns, also including |
| /// the full compound token as a synonym. |
| /// </summary> |
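| /// <remarks> |
| /// For example, a long compound such as 関西国際空港 ("Kansai International |
| /// Airport") is typically decompounded into 関西, 国際 and 空港, with the |
| /// original compound also emitted at the same position. |
| /// </remarks> |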
| SEARCH, |
| |
| /// <summary> |
| /// Extended mode outputs unigrams for unknown words. |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| EXTENDED |
| } |
| |
| // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType |
| |
| /// <summary> |
| /// Token type reflecting the original source of this token |
| /// </summary> |
| public enum JapaneseTokenizerType |
| { |
| /// <summary> |
| /// Known words from the system dictionary. |
| /// </summary> |
| KNOWN, |
| /// <summary> |
| /// Unknown words (heuristically segmented). |
| /// </summary> |
| UNKNOWN, |
| /// <summary> |
| /// Known words from the user dictionary. |
| /// </summary> |
| USER |
| } |
| |
| |
| // LUCENENET specific - De-nested Position |
| |
| // Holds all back pointers arriving to this position: |
| internal sealed class Position |
| { |
| |
| internal int pos; |
| |
| internal int count; |
| |
| // maybe single int array * 5? |
| internal int[] costs = new int[8]; |
| internal int[] lastRightID = new int[8]; |
| internal int[] backPos = new int[8]; |
| internal int[] backIndex = new int[8]; |
| internal int[] backID = new int[8]; |
| internal JapaneseTokenizerType[] backType = new JapaneseTokenizerType[8]; |
| |
| // Only used when finding 2nd best segmentation under a |
| // too-long token: |
| internal int forwardCount; |
| internal int[] forwardPos = new int[8]; |
| internal int[] forwardID = new int[8]; |
| internal int[] forwardIndex = new int[8]; |
| internal JapaneseTokenizerType[] forwardType = new JapaneseTokenizerType[8]; |
| |
| public void Grow() |
| { |
| costs = ArrayUtil.Grow(costs, 1 + count); |
| lastRightID = ArrayUtil.Grow(lastRightID, 1 + count); |
| backPos = ArrayUtil.Grow(backPos, 1 + count); |
| backIndex = ArrayUtil.Grow(backIndex, 1 + count); |
| backID = ArrayUtil.Grow(backID, 1 + count); |
| |
| // NOTE: sneaky: grow separately because |
| // ArrayUtil.grow will otherwise pick a different |
| // length than the int[]s we just grew: |
| JapaneseTokenizerType[] newBackType = new JapaneseTokenizerType[backID.Length]; |
| System.Array.Copy(backType, 0, newBackType, 0, backType.Length); |
| backType = newBackType; |
| } |
| |
| public void GrowForward() |
| { |
| forwardPos = ArrayUtil.Grow(forwardPos, 1 + forwardCount); |
| forwardID = ArrayUtil.Grow(forwardID, 1 + forwardCount); |
| forwardIndex = ArrayUtil.Grow(forwardIndex, 1 + forwardCount); |
| |
| // NOTE: sneaky: grow separately because |
| // ArrayUtil.grow will otherwise pick a different |
| // length than the int[]s we just grew: |
| JapaneseTokenizerType[] newForwardType = new JapaneseTokenizerType[forwardPos.Length]; |
| System.Array.Copy(forwardType, 0, newForwardType, 0, forwardType.Length); |
| forwardType = newForwardType; |
| } |
| |
| public void Add(int cost, int lastRightID, int backPos, int backIndex, int backID, JapaneseTokenizerType backType) |
| { |
| // NOTE: this isn't quite a true Viterbi search, |
| // because we should check if lastRightID is |
| // already present here, and only update if the new |
| // cost is less than the current cost, instead of |
| // simply appending. However, that will likely hurt |
| // performance (usually we add a lastRightID only once), |
| // and it means we actually create the full graph |
| // intersection instead of a "normal" Viterbi lattice: |
| if (count == costs.Length) |
| { |
| Grow(); |
| } |
| this.costs[count] = cost; |
| this.lastRightID[count] = lastRightID; |
| this.backPos[count] = backPos; |
| this.backIndex[count] = backIndex; |
| this.backID[count] = backID; |
| this.backType[count] = backType; |
| count++; |
| } |
| |
| public void AddForward(int forwardPos, int forwardIndex, int forwardID, JapaneseTokenizerType forwardType) |
| { |
| if (forwardCount == this.forwardID.Length) |
| { |
| GrowForward(); |
| } |
| this.forwardPos[forwardCount] = forwardPos; |
| this.forwardIndex[forwardCount] = forwardIndex; |
| this.forwardID[forwardCount] = forwardID; |
| this.forwardType[forwardCount] = forwardType; |
| forwardCount++; |
| } |
| |
| public void Reset() |
| { |
| count = 0; |
| // forwardCount naturally resets after it runs: |
| if (Debugging.AssertsEnabled) Debugging.Assert(forwardCount == 0, () => "pos=" + pos + " forwardCount=" + forwardCount); |
| } |
| } |
| |
| |
| // LUCENENET specific - de-nested WrappedPositionArray |
| |
| // TODO: make generic'd version of this "circular array"? |
| // It's a bit tricky because we do things to the Position |
| // (eg, set .pos = N on reuse)... |
| internal sealed class WrappedPositionArray |
| { |
| private Position[] positions = new Position[8]; |
| |
| public WrappedPositionArray() |
| { |
| for (int i = 0; i < positions.Length; i++) |
| { |
| positions[i] = new Position(); |
| } |
| } |
| |
| // Next array index to write to in positions: |
| private int nextWrite; |
| |
| // Next position to write: |
| private int nextPos; |
| |
| // How many valid Position instances are held in the |
| // positions array: |
| private int count; |
| |
| public void Reset() |
| { |
| nextWrite--; |
| while (count > 0) |
| { |
| if (nextWrite == -1) |
| { |
| nextWrite = positions.Length - 1; |
| } |
| positions[nextWrite--].Reset(); |
| count--; |
| } |
| nextWrite = 0; |
| nextPos = 0; |
| count = 0; |
| } |
| |
| /// <summary> |
| /// Get Position instance for this absolute position; |
| /// this is allowed to be arbitrarily far "in the |
| /// future" but cannot be before the last freeBefore. |
| /// </summary> |
| public Position Get(int pos) |
| { |
| while (pos >= nextPos) |
| { |
| //System.out.println("count=" + count + " vs len=" + positions.length); |
| if (count == positions.Length) |
| { |
| Position[] newPositions = new Position[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; |
| //System.out.println("grow positions " + newPositions.length); |
| System.Array.Copy(positions, nextWrite, newPositions, 0, positions.Length - nextWrite); |
| System.Array.Copy(positions, 0, newPositions, positions.Length - nextWrite, nextWrite); |
| for (int i = positions.Length; i < newPositions.Length; i++) |
| { |
| newPositions[i] = new Position(); |
| } |
| nextWrite = positions.Length; |
| positions = newPositions; |
| } |
| if (nextWrite == positions.Length) |
| { |
| nextWrite = 0; |
| } |
| // Should have already been reset: |
| if (Debugging.AssertsEnabled) Debugging.Assert(positions[nextWrite].count == 0); |
| positions[nextWrite++].pos = nextPos++; |
| count++; |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(InBounds(pos)); |
| int index = GetIndex(pos); |
| if (Debugging.AssertsEnabled) Debugging.Assert(positions[index].pos == pos); |
| return positions[index]; |
| } |
| |
| public int GetNextPos() |
| { |
| return nextPos; |
| } |
| |
| // For assert: |
| private bool InBounds(int pos) |
| { |
| return pos < nextPos && pos >= nextPos - count; |
| } |
| |
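| // Maps an absolute position to an index in the circular array. |
| // Worked example: with positions.Length == 8, nextPos == 10 and |
| // nextWrite == 2, GetIndex(9) == 2 - (10 - 9) == 1, and |
| // GetIndex(5) == 2 - (10 - 5) == -3, which wraps to -3 + 8 == 5. |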
| private int GetIndex(int pos) |
| { |
| int index = nextWrite - (nextPos - pos); |
| if (index < 0) |
| { |
| index += positions.Length; |
| } |
| return index; |
| } |
| |
| public void FreeBefore(int pos) |
| { |
| int toFree = count - (nextPos - pos); |
| if (Debugging.AssertsEnabled) |
| { |
| Debugging.Assert(toFree >= 0); |
| Debugging.Assert(toFree <= count); |
| } |
| int index = nextWrite - count; |
| if (index < 0) |
| { |
| index += positions.Length; |
| } |
| for (int i = 0; i < toFree; i++) |
| { |
| if (index == positions.Length) |
| { |
| index = 0; |
| } |
| //System.out.println(" fb idx=" + index); |
| positions[index].Reset(); |
| index++; |
| } |
| count -= toFree; |
| } |
| } |
| } |