src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;

 namespace Lucene.Net.Analysis.Wikipedia
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Extension of <see cref="Standard.StandardTokenizer"/> that is aware of Wikipedia syntax.  It is based off of the
     /// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     public sealed class WikipediaTokenizer : Tokenizer
     {
         // Type strings for the Wikipedia-specific token categories. Each of them also appears in
         // TOKEN_TYPES below, at the index given by the matching *_ID constant.
         public const string INTERNAL_LINK = "il";
         public const string EXTERNAL_LINK = "el";
         //The URL part of the link, i.e. the first token
         public const string EXTERNAL_LINK_URL = "elu";
         public const string CITATION = "ci";
         public const string CATEGORY = "c";
         public const string BOLD = "b";
         public const string ITALICS = "i";
         public const string BOLD_ITALICS = "bi";
         public const string HEADING = "h";
         public const string SUB_HEADING = "sh";

         // Integer token type ids. Each value is the index of the corresponding type string in
         // TOKEN_TYPES. Note that EXTERNAL_LINK_URL_ID comes last (17) even though
         // EXTERNAL_LINK_URL is declared third among the string constants above.
         public const int ALPHANUM_ID = 0;
         public const int APOSTROPHE_ID = 1;
         public const int ACRONYM_ID = 2;
         public const int COMPANY_ID = 3;
         public const int EMAIL_ID = 4;
         public const int HOST_ID = 5;
         public const int NUM_ID = 6;
         public const int CJ_ID = 7;
         public const int INTERNAL_LINK_ID = 8;
         public const int EXTERNAL_LINK_ID = 9;
         public const int CITATION_ID = 10;
         public const int CATEGORY_ID = 11;
         public const int BOLD_ID = 12;
         public const int ITALICS_ID = 13;
         public const int BOLD_ITALICS_ID = 14;
         public const int HEADING_ID = 15;
         public const int SUB_HEADING_ID = 16;
         public const int EXTERNAL_LINK_URL_ID = 17;

         /// <summary>
         /// String token types that correspond to token type int constants: the entry at index
         /// <c>X_ID</c> is the type string for <c>X</c> (for example, index
         /// <see cref="INTERNAL_LINK_ID"/> holds <see cref="INTERNAL_LINK"/>). </summary>
         public static readonly string[] TOKEN_TYPES = new string[] {
             "<ALPHANUM>",
             "<APOSTROPHE>",
             "<ACRONYM>",
             "<COMPANY>",
             "<EMAIL>",
             "<HOST>",
             "<NUM>",
             "<CJ>",
             INTERNAL_LINK,
             EXTERNAL_LINK,
             CITATION,
             CATEGORY,
             BOLD,
             ITALICS,
             BOLD_ITALICS,
             HEADING,
             SUB_HEADING,
             EXTERNAL_LINK_URL
         };

         /// <summary>
         /// Only output tokens
         /// </summary>
         public const int TOKENS_ONLY = 0;
         /// <summary>
         /// Only output untokenized tokens, which are tokens that would normally be split into several tokens
         /// </summary>
         public const int UNTOKENIZED_ONLY = 1;
         /// <summary>
         /// Output both the untokenized token and the splits
         /// </summary>
         public const int BOTH = 2;
         /// <summary>
         /// This flag is used to indicate that the produced "Token" would, if <see cref="TOKENS_ONLY"/> was used, produce multiple tokens.
         /// It is written to the flags attribute by the two collapse methods of this class.
         /// </summary>
         public const int UNTOKENIZED_TOKEN_FLAG = 1;
         /// <summary>
         /// A private instance of the JFlex-constructed scanner
         /// </summary>
         private readonly WikipediaTokenizerImpl scanner;

         // Output mode; Init only accepts TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.
         private int tokenOutput = TOKENS_ONLY;
         // Type strings that IncrementToken collapses into one token when tokenOutput is not TOKENS_ONLY.
         private ICollection<string> untokenizedTypes = Collections.EmptySet<string>();
         // Individual tokens captured by CollapseAndSaveTokens and replayed, one per call, by
         // IncrementToken. Null until something has been queued, and again after Reset.
         private IEnumerator<AttributeSource.State> tokens = null;

         // Attributes registered in Init; they carry the data of the token currently being emitted.
         private IOffsetAttribute offsetAtt;
         private ITypeAttribute typeAtt;
         private IPositionIncrementAttribute posIncrAtt;
         private ICharTermAttribute termAtt;
         private IFlagsAttribute flagsAtt;

         // Set by Reset and cleared once IncrementToken has emitted a scanned token; lets
         // IncrementToken avoid a position increment of 0 on the first token.
         private bool first;

         /// <summary>
         /// Creates a new instance of the <see cref="WikipediaTokenizer"/> that uses
         /// <see cref="TOKENS_ONLY"/> output and an empty set of untokenized types. Attaches the
         /// <paramref name="input"/> to a newly created JFlex scanner.
         /// </summary>
         /// <param name="input"> The Input <see cref="TextReader"/> </param>
         public WikipediaTokenizer(TextReader input)
             : this(input, TOKENS_ONLY, Collections.EmptySet<string>())
         {
         }

         /// <summary>
         /// Creates a new instance of the <see cref="WikipediaTokenizer"/>.  Attaches the
         /// <paramref name="input"/> to the newly created JFlex scanner.
         /// </summary>
         /// <param name="input"> The input </param>
         /// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
         /// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
         /// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
         public WikipediaTokenizer(TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
             : base(input)
         {
             this.scanner = new WikipediaTokenizerImpl(this.m_input);
             Init(tokenOutput, untokenizedTypes);
         }

         /// <summary>
         /// Creates a new instance of the <see cref="WikipediaTokenizer"/>.  Attaches the
         /// <paramref name="input"/> to the newly created JFlex scanner. Uses the given <see cref="AttributeSource.AttributeFactory"/>.
         /// </summary>
         /// <param name="factory"> The <see cref="AttributeSource.AttributeFactory"/> </param>
         /// <param name="input"> The input </param>
         /// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
         /// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
         /// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
         public WikipediaTokenizer(AttributeFactory factory, TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
             : base(factory, input)
         {
             this.scanner = new WikipediaTokenizerImpl(this.m_input);
             Init(tokenOutput, untokenizedTypes);
         }

         /// <summary>
         /// Shared constructor logic: validates and stores the output mode, stores the
         /// untokenized types and registers the attributes this tokenizer writes to.
         /// </summary>
         /// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
         /// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
         /// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
         private void Init(int tokenOutput, ICollection<string> untokenizedTypes)
         {
             // TODO: cutover to enum
             if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
             {
                 throw new ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
             }
             this.tokenOutput = tokenOutput;
             // A null collection used to be stored as-is and only failed later, with a
             // NullReferenceException inside IncrementToken. Fall back to the same empty set
             // the field is initialized with, so "no untokenized types" works in every mode.
             this.untokenizedTypes = untokenizedTypes ?? Collections.EmptySet<string>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
             typeAtt = AddAttribute<ITypeAttribute>();
             posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
             termAtt = AddAttribute<ICharTermAttribute>();
             flagsAtt = AddAttribute<IFlagsAttribute>();
         }

         /// <summary>
         /// Advances to the next token. <see cref="TokenStream.IncrementToken"/>
         /// </summary>
         /// <returns> <c>true</c> if a token was produced; <c>false</c> once the scanner reports end of input </returns>
         public override sealed bool IncrementToken()
         {
             // Individual tokens queued by CollapseAndSaveTokens are replayed before scanning on.
             if (tokens != null && tokens.MoveNext())
             {
                 RestoreState(tokens.Current);
                 return true;
             }

             ClearAttributes();
             int tokenType = scanner.GetNextToken();
             if (tokenType == WikipediaTokenizerImpl.YYEOF)
             {
                 return false;
             }

             string type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
             if (tokenOutput == TOKENS_ONLY || !untokenizedTypes.Contains(type))
             {
                 SetupToken();
             }
             else if (tokenOutput == UNTOKENIZED_ONLY)
             {
                 // Getting here already means untokenizedTypes contains the type,
                 // so no second lookup is needed.
                 CollapseTokens(tokenType);
             }
             else if (tokenOutput == BOTH)
             {
                 //collapse into a single token, add it to tokens AND output the individual tokens
                 //output the untokenized Token first
                 CollapseAndSaveTokens(tokenType, type);
             }

             int posinc = scanner.PositionIncrement;
             if (first && posinc == 0)
             {
                 posinc = 1; // don't emit posinc=0 for the first token!
             }
             posIncrAtt.PositionIncrement = posinc;
             typeAtt.Type = type;
             first = false;
             return true;
         }

         /// <summary>
         /// Merges the current token and the following tokens of the same wiki item into one
         /// untokenized term on this stream's attributes, and queues every individual token as a
         /// captured state in <c>tokens</c> so that <see cref="IncrementToken"/> can replay them.
         /// </summary>
         /// <param name="tokenType"> Scanner token type id that every token of the run must share </param>
         /// <param name="type"> Type string stored on each saved individual token </param>
         private void CollapseAndSaveTokens(int tokenType, string type)
         {
             StringBuilder collapsed = new StringBuilder(32);
             int appended = scanner.SetText(collapsed);
             //TODO: how to know how much whitespace to add
             int runStart = scanner.YyChar;
             int previousEnd = runStart + appended;
             int wikiTokensConsumed = 0;

             // The first individual token is saved with a position increment of 0.
             IList<AttributeSource.State> saved = new List<AttributeSource.State>();
             SetupSavedToken(0, type);
             saved.Add(CaptureState());

             int nextType;
             while (true)
             {
                 nextType = scanner.GetNextToken();
                 // Stop at end of input, at a token of another type, or when the scanner has
                 // moved on to a new wiki-item of the same type.
                 if (nextType == WikipediaTokenizerImpl.YYEOF
                     || nextType != tokenType
                     || scanner.NumWikiTokensSeen <= wikiTokensConsumed)
                 {
                     break;
                 }

                 int position = scanner.YyChar;
                 // Pad with one space per character skipped since the previous token.
                 int gap = position - previousEnd;
                 if (gap > 0)
                 {
                     collapsed.Append(' ', gap);
                 }
                 appended = scanner.SetText(collapsed);
                 SetupSavedToken(scanner.PositionIncrement, type);
                 saved.Add(CaptureState());
                 wikiTokensConsumed++;
                 previousEnd = position + appended;
             }

             // TODO: this is inefficient
             string term = collapsed.ToString().Trim();
             termAtt.SetEmpty().Append(term);
             int correctedStart = CorrectOffset(runStart);
             int correctedEnd = CorrectOffset(runStart + term.Length);
             offsetAtt.SetOffset(correctedStart, correctedEnd);
             flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

             // The loop always reads one token beyond the run; unless that was the end of the
             // input, hand its characters back to the scanner.
             if (nextType != WikipediaTokenizerImpl.YYEOF)
             {
                 scanner.YyPushBack(scanner.YyLength);
             }
             tokens = saved.GetEnumerator();
         }

         /// <summary>
         /// Fills the attributes for one individual token that is about to be captured: term
         /// and offsets from the scanner's current match, plus the given position increment and type.
         /// </summary>
         /// <param name="positionInc"> Position increment to store on the token </param>
         /// <param name="type"> Type string to store on the token </param>
         private void SetupSavedToken(int positionInc, string type)
         {
             // Term and offsets: the same steps SetupToken performs.
             scanner.GetText(termAtt);
             int tokenStart = scanner.YyChar;
             offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + termAtt.Length));

             posIncrAtt.PositionIncrement = positionInc;
             typeAtt.Type = type;
         }

         /// <summary>
         /// Merges the current token and the following tokens of the same wiki item into one
         /// untokenized term on this stream's attributes. The individual tokens are not kept.
         /// </summary>
         /// <param name="tokenType"> Scanner token type id that every token of the run must share </param>
         private void CollapseTokens(int tokenType)
         {
             StringBuilder collapsed = new StringBuilder(32);
             int appended = scanner.SetText(collapsed);
             //TODO: how to know how much whitespace to add
             int runStart = scanner.YyChar;
             int previousEnd = runStart + appended;
             int wikiTokensConsumed = 0;

             // Keep going while there is a token, it has the same type, and the scanner has not
             // moved on to a new wiki-item of the same type.
             int nextType = scanner.GetNextToken();
             while (nextType != WikipediaTokenizerImpl.YYEOF
                 && nextType == tokenType
                 && scanner.NumWikiTokensSeen > wikiTokensConsumed)
             {
                 int position = scanner.YyChar;
                 // Pad with one space per character skipped since the previous token.
                 int gap = position - previousEnd;
                 if (gap > 0)
                 {
                     collapsed.Append(' ', gap);
                 }
                 appended = scanner.SetText(collapsed);
                 wikiTokensConsumed++;
                 previousEnd = position + appended;

                 nextType = scanner.GetNextToken();
             }

             // TODO: this is inefficient
             string term = collapsed.ToString().Trim();
             termAtt.SetEmpty().Append(term);
             int correctedStart = CorrectOffset(runStart);
             int correctedEnd = CorrectOffset(runStart + term.Length);
             offsetAtt.SetOffset(correctedStart, correctedEnd);
             flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

             if (nextType == WikipediaTokenizerImpl.YYEOF)
             {
                 tokens = null;
             }
             else
             {
                 // The loop always reads one token beyond the run; hand its characters back
                 // to the scanner.
                 scanner.YyPushBack(scanner.YyLength);
             }
         }

         /// <summary>
         /// Copies the scanner's current match into the term attribute and sets the offset
         /// attribute to the corrected start and end of that match.
         /// </summary>
         private void SetupToken()
         {
             scanner.GetText(termAtt);
             int tokenStart = scanner.YyChar;
             int tokenEnd = tokenStart + termAtt.Length;
             offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd));
         }

         /// <summary>
         /// Disposes the base tokenizer and, when called with <paramref name="disposing"/> set,
         /// resets the scanner onto the tokenizer's current input reader.
         /// </summary>
         /// <param name="disposing"> Passed on to the base class; the scanner is only reset when it is <c>true</c> </param>
         protected override void Dispose(bool disposing)
         {
             base.Dispose(disposing);
             if (!disposing)
             {
                 return;
             }
             scanner.YyReset(m_input);
         }

         /// <summary>
         /// Prepares the tokenizer for a new pass over its input. <see cref="TokenStream.Reset"/>
         /// </summary>
         public override void Reset()
         {
             base.Reset();
             // Point the scanner at the tokenizer's current input reader.
             scanner.YyReset(m_input);
             // Drop any individual tokens still queued for replay by IncrementToken.
             tokens = null;
             // NOTE(review): presumably clears the scanner's own wiki-markup bookkeeping - confirm in WikipediaTokenizerImpl.
             scanner.Reset();
             // The next scanned token is the first one; IncrementToken uses this to avoid a position increment of 0.
             first = true;
         }

         /// <summary>
         /// Performs end-of-stream handling of the base class and then sets both the start and
         /// the end offset to the corrected value of the scanner's <c>YyChar + YyLength</c>.
         /// </summary>
         public override void End()
         {
             base.End();

             // set final offset
             int scannedUpTo = scanner.YyChar + scanner.YyLength;
             int finalOffset = CorrectOffset(scannedUpTo);
             offsetAtt.SetOffset(finalOffset, finalOffset);
         }
     }
 }
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Support;
	using Lucene.Net.Util;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Text;

	namespace Lucene.Net.Analysis.Wikipedia
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Extension of <see cref="Standard.StandardTokenizer"/> that is aware of Wikipedia syntax. It is based off of the
	/// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public sealed class WikipediaTokenizer : Tokenizer
	{
	// Type strings for the Wikipedia-specific token categories. Each of them also appears in
	// TOKEN_TYPES below, at the index given by the matching *_ID constant.
	public const string INTERNAL_LINK = "il";
	public const string EXTERNAL_LINK = "el";
	//The URL part of the link, i.e. the first token
	public const string EXTERNAL_LINK_URL = "elu";
	public const string CITATION = "ci";
	public const string CATEGORY = "c";
	public const string BOLD = "b";
	public const string ITALICS = "i";
	public const string BOLD_ITALICS = "bi";
	public const string HEADING = "h";
	public const string SUB_HEADING = "sh";

	// Integer token type ids. Each value is the index of the corresponding type string in
	// TOKEN_TYPES. Note that EXTERNAL_LINK_URL_ID comes last (17) even though
	// EXTERNAL_LINK_URL is declared third among the string constants above.
	public const int ALPHANUM_ID = 0;
	public const int APOSTROPHE_ID = 1;
	public const int ACRONYM_ID = 2;
	public const int COMPANY_ID = 3;
	public const int EMAIL_ID = 4;
	public const int HOST_ID = 5;
	public const int NUM_ID = 6;
	public const int CJ_ID = 7;
	public const int INTERNAL_LINK_ID = 8;
	public const int EXTERNAL_LINK_ID = 9;
	public const int CITATION_ID = 10;
	public const int CATEGORY_ID = 11;
	public const int BOLD_ID = 12;
	public const int ITALICS_ID = 13;
	public const int BOLD_ITALICS_ID = 14;
	public const int HEADING_ID = 15;
	public const int SUB_HEADING_ID = 16;
	public const int EXTERNAL_LINK_URL_ID = 17;

	/// <summary>
	/// String token types that correspond to token type int constants: the entry at index
	/// <c>X_ID</c> is the type string for <c>X</c> (for example, index
	/// <see cref="INTERNAL_LINK_ID"/> holds <see cref="INTERNAL_LINK"/>). </summary>
	public static readonly string[] TOKEN_TYPES = new string[] {
	"<ALPHANUM>",
	"<APOSTROPHE>",
	"<ACRONYM>",
	"<COMPANY>",
	"<EMAIL>",
	"<HOST>",
	"<NUM>",
	"<CJ>",
	INTERNAL_LINK,
	EXTERNAL_LINK,
	CITATION,
	CATEGORY,
	BOLD,
	ITALICS,
	BOLD_ITALICS,
	HEADING,
	SUB_HEADING,
	EXTERNAL_LINK_URL
	};

	/// <summary>
	/// Only output tokens
	/// </summary>
	public const int TOKENS_ONLY = 0;
	/// <summary>
	/// Only output untokenized tokens, which are tokens that would normally be split into several tokens
	/// </summary>
	public const int UNTOKENIZED_ONLY = 1;
	/// <summary>
	/// Output both the untokenized token and the splits
	/// </summary>
	public const int BOTH = 2;
	/// <summary>
	/// This flag is used to indicate that the produced "Token" would, if <see cref="TOKENS_ONLY"/> was used, produce multiple tokens.
	/// It is written to the flags attribute by the two collapse methods of this class.
	/// </summary>
	public const int UNTOKENIZED_TOKEN_FLAG = 1;
	/// <summary>
	/// A private instance of the JFlex-constructed scanner
	/// </summary>
	private readonly WikipediaTokenizerImpl scanner;

	// Output mode; Init only accepts TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.
	private int tokenOutput = TOKENS_ONLY;
	// Type strings that IncrementToken collapses into one token when tokenOutput is not TOKENS_ONLY.
	private ICollection<string> untokenizedTypes = Collections.EmptySet<string>();
	// Individual tokens captured by CollapseAndSaveTokens and replayed, one per call, by
	// IncrementToken. Null until something has been queued, and again after Reset.
	private IEnumerator<AttributeSource.State> tokens = null;

	// Attributes registered in Init; they carry the data of the token currently being emitted.
	private IOffsetAttribute offsetAtt;
	private ITypeAttribute typeAtt;
	private IPositionIncrementAttribute posIncrAtt;
	private ICharTermAttribute termAtt;
	private IFlagsAttribute flagsAtt;

	// Set by Reset and cleared once IncrementToken has emitted a scanned token; lets
	// IncrementToken avoid a position increment of 0 on the first token.
	private bool first;

	/// <summary>
	/// Creates a new instance of the <see cref="WikipediaTokenizer"/> that uses
	/// <see cref="TOKENS_ONLY"/> output and an empty set of untokenized types. Attaches the
	/// <paramref name="input"/> to a newly created JFlex scanner.
	/// </summary>
	/// <param name="input"> The Input <see cref="TextReader"/> </param>
	public WikipediaTokenizer(TextReader input)
	    : this(input, TOKENS_ONLY, Collections.EmptySet<string>())
	{
	}

	/// <summary>
	/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
	/// <paramref name="input"/> to the newly created JFlex scanner.
	/// </summary>
	/// <param name="input"> The input </param>
	/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
	/// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
	/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
	public WikipediaTokenizer(TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
	    : base(input)
	{
	    this.scanner = new WikipediaTokenizerImpl(this.m_input);
	    Init(tokenOutput, untokenizedTypes);
	}

	/// <summary>
	/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
	/// <paramref name="input"/> to the newly created JFlex scanner. Uses the given <see cref="AttributeSource.AttributeFactory"/>.
	/// </summary>
	/// <param name="factory"> The <see cref="AttributeSource.AttributeFactory"/> </param>
	/// <param name="input"> The input </param>
	/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
	/// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
	/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
	public WikipediaTokenizer(AttributeFactory factory, TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
	    : base(factory, input)
	{
	    this.scanner = new WikipediaTokenizerImpl(this.m_input);
	    Init(tokenOutput, untokenizedTypes);
	}

	/// <summary>
	/// Shared constructor logic: validates and stores the output mode, stores the
	/// untokenized types and registers the attributes this tokenizer writes to.
	/// </summary>
	/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
	/// <param name="untokenizedTypes"> Untokenized types; <c>null</c> is treated as an empty set </param>
	/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
	private void Init(int tokenOutput, ICollection<string> untokenizedTypes)
	{
	    // TODO: cutover to enum
	    if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
	    {
	        throw new ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
	    }
	    this.tokenOutput = tokenOutput;
	    // A null collection used to be stored as-is and only failed later, with a
	    // NullReferenceException inside IncrementToken. Fall back to the same empty set
	    // the field is initialized with, so "no untokenized types" works in every mode.
	    this.untokenizedTypes = untokenizedTypes ?? Collections.EmptySet<string>();
	    offsetAtt = AddAttribute<IOffsetAttribute>();
	    typeAtt = AddAttribute<ITypeAttribute>();
	    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
	    termAtt = AddAttribute<ICharTermAttribute>();
	    flagsAtt = AddAttribute<IFlagsAttribute>();
	}

	/// <summary>
	/// Advances to the next token. <see cref="TokenStream.IncrementToken"/>
	/// </summary>
	/// <returns> <c>true</c> if a token was produced; <c>false</c> once the scanner reports end of input </returns>
	public override sealed bool IncrementToken()
	{
	    // Individual tokens queued by CollapseAndSaveTokens are replayed before scanning on.
	    if (tokens != null && tokens.MoveNext())
	    {
	        RestoreState(tokens.Current);
	        return true;
	    }

	    ClearAttributes();
	    int tokenType = scanner.GetNextToken();
	    if (tokenType == WikipediaTokenizerImpl.YYEOF)
	    {
	        return false;
	    }

	    string type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
	    // The logical OR here had been mangled into an escaped "\|\|", which does not compile.
	    if (tokenOutput == TOKENS_ONLY || !untokenizedTypes.Contains(type))
	    {
	        SetupToken();
	    }
	    else if (tokenOutput == UNTOKENIZED_ONLY)
	    {
	        // Getting here already means untokenizedTypes contains the type,
	        // so no second lookup is needed.
	        CollapseTokens(tokenType);
	    }
	    else if (tokenOutput == BOTH)
	    {
	        //collapse into a single token, add it to tokens AND output the individual tokens
	        //output the untokenized Token first
	        CollapseAndSaveTokens(tokenType, type);
	    }

	    int posinc = scanner.PositionIncrement;
	    if (first && posinc == 0)
	    {
	        posinc = 1; // don't emit posinc=0 for the first token!
	    }
	    posIncrAtt.PositionIncrement = posinc;
	    typeAtt.Type = type;
	    first = false;
	    return true;
	}

	/// <summary>
	/// Merges the current token and the following tokens of the same wiki item into one
	/// untokenized term on this stream's attributes, and queues every individual token as a
	/// captured state in <c>tokens</c> so that <see cref="IncrementToken"/> can replay them.
	/// </summary>
	/// <param name="tokenType"> Scanner token type id that every token of the run must share </param>
	/// <param name="type"> Type string stored on each saved individual token </param>
	private void CollapseAndSaveTokens(int tokenType, string type)
	{
	    StringBuilder collapsed = new StringBuilder(32);
	    int appended = scanner.SetText(collapsed);
	    //TODO: how to know how much whitespace to add
	    int runStart = scanner.YyChar;
	    int previousEnd = runStart + appended;
	    int wikiTokensConsumed = 0;

	    // The first individual token is saved with a position increment of 0.
	    IList<AttributeSource.State> saved = new List<AttributeSource.State>();
	    SetupSavedToken(0, type);
	    saved.Add(CaptureState());

	    int nextType;
	    while (true)
	    {
	        nextType = scanner.GetNextToken();
	        // Stop at end of input, at a token of another type, or when the scanner has
	        // moved on to a new wiki-item of the same type.
	        if (nextType == WikipediaTokenizerImpl.YYEOF
	            || nextType != tokenType
	            || scanner.NumWikiTokensSeen <= wikiTokensConsumed)
	        {
	            break;
	        }

	        int position = scanner.YyChar;
	        // Pad with one space per character skipped since the previous token.
	        int gap = position - previousEnd;
	        if (gap > 0)
	        {
	            collapsed.Append(' ', gap);
	        }
	        appended = scanner.SetText(collapsed);
	        SetupSavedToken(scanner.PositionIncrement, type);
	        saved.Add(CaptureState());
	        wikiTokensConsumed++;
	        previousEnd = position + appended;
	    }

	    // TODO: this is inefficient
	    string term = collapsed.ToString().Trim();
	    termAtt.SetEmpty().Append(term);
	    int correctedStart = CorrectOffset(runStart);
	    int correctedEnd = CorrectOffset(runStart + term.Length);
	    offsetAtt.SetOffset(correctedStart, correctedEnd);
	    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

	    // The loop always reads one token beyond the run; unless that was the end of the
	    // input, hand its characters back to the scanner.
	    if (nextType != WikipediaTokenizerImpl.YYEOF)
	    {
	        scanner.YyPushBack(scanner.YyLength);
	    }
	    tokens = saved.GetEnumerator();
	}

	/// <summary>
	/// Fills the attributes for one individual token that is about to be captured: term
	/// and offsets from the scanner's current match, plus the given position increment and type.
	/// </summary>
	/// <param name="positionInc"> Position increment to store on the token </param>
	/// <param name="type"> Type string to store on the token </param>
	private void SetupSavedToken(int positionInc, string type)
	{
	    // Term and offsets: the same steps SetupToken performs.
	    scanner.GetText(termAtt);
	    int tokenStart = scanner.YyChar;
	    offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenStart + termAtt.Length));

	    posIncrAtt.PositionIncrement = positionInc;
	    typeAtt.Type = type;
	}

	/// <summary>
	/// Merges the current token and the following tokens of the same wiki item into one
	/// untokenized term on this stream's attributes. The individual tokens are not kept.
	/// </summary>
	/// <param name="tokenType"> Scanner token type id that every token of the run must share </param>
	private void CollapseTokens(int tokenType)
	{
	    StringBuilder collapsed = new StringBuilder(32);
	    int appended = scanner.SetText(collapsed);
	    //TODO: how to know how much whitespace to add
	    int runStart = scanner.YyChar;
	    int previousEnd = runStart + appended;
	    int wikiTokensConsumed = 0;

	    // Keep going while there is a token, it has the same type, and the scanner has not
	    // moved on to a new wiki-item of the same type.
	    int nextType = scanner.GetNextToken();
	    while (nextType != WikipediaTokenizerImpl.YYEOF
	        && nextType == tokenType
	        && scanner.NumWikiTokensSeen > wikiTokensConsumed)
	    {
	        int position = scanner.YyChar;
	        // Pad with one space per character skipped since the previous token.
	        int gap = position - previousEnd;
	        if (gap > 0)
	        {
	            collapsed.Append(' ', gap);
	        }
	        appended = scanner.SetText(collapsed);
	        wikiTokensConsumed++;
	        previousEnd = position + appended;

	        nextType = scanner.GetNextToken();
	    }

	    // TODO: this is inefficient
	    string term = collapsed.ToString().Trim();
	    termAtt.SetEmpty().Append(term);
	    int correctedStart = CorrectOffset(runStart);
	    int correctedEnd = CorrectOffset(runStart + term.Length);
	    offsetAtt.SetOffset(correctedStart, correctedEnd);
	    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;

	    if (nextType == WikipediaTokenizerImpl.YYEOF)
	    {
	        tokens = null;
	    }
	    else
	    {
	        // The loop always reads one token beyond the run; hand its characters back
	        // to the scanner.
	        scanner.YyPushBack(scanner.YyLength);
	    }
	}

	/// <summary>
	/// Copies the scanner's current match into the term attribute and sets the offset
	/// attribute to the corrected start and end of that match.
	/// </summary>
	private void SetupToken()
	{
	    scanner.GetText(termAtt);
	    int tokenStart = scanner.YyChar;
	    int tokenEnd = tokenStart + termAtt.Length;
	    offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd));
	}

	/// <summary>
	/// Disposes the base tokenizer and, when called with <paramref name="disposing"/> set,
	/// resets the scanner onto the tokenizer's current input reader.
	/// </summary>
	/// <param name="disposing"> Passed on to the base class; the scanner is only reset when it is <c>true</c> </param>
	protected override void Dispose(bool disposing)
	{
	    base.Dispose(disposing);
	    if (!disposing)
	    {
	        return;
	    }
	    scanner.YyReset(m_input);
	}

	/// <summary>
	/// Prepares the tokenizer for a new pass over its input. <see cref="TokenStream.Reset"/>
	/// </summary>
	public override void Reset()
	{
	base.Reset();
	// Point the scanner at the tokenizer's current input reader.
	scanner.YyReset(m_input);
	// Drop any individual tokens still queued for replay by IncrementToken.
	tokens = null;
	// NOTE(review): presumably clears the scanner's own wiki-markup bookkeeping - confirm in WikipediaTokenizerImpl.
	scanner.Reset();
	// The next scanned token is the first one; IncrementToken uses this to avoid a position increment of 0.
	first = true;
	}

	/// <summary>
	/// Performs end-of-stream handling of the base class and then sets both the start and
	/// the end offset to the corrected value of the scanner's <c>YyChar + YyLength</c>.
	/// </summary>
	public override void End()
	{
	    base.End();

	    // set final offset
	    int scannedUpTo = scanner.YyChar + scanner.YyLength;
	    int finalOffset = CorrectOffset(scannedUpTo);
	    offsetAtt.SetOffset(finalOffset, finalOffset);
	}
	}
	}