blob: 8adb7551b1904807f903be97790433ce22d5706f [file] [log] [blame]
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Wikipedia
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Extension of <see cref="Standard.StandardTokenizer"/> that is aware of Wikipedia syntax. It is based on the
/// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
/// <para/>
/// @lucene.experimental
/// </summary>
public sealed class WikipediaTokenizer : Tokenizer
// String token type names for the Wikipedia markup elements recognized by this tokenizer.
public const string INTERNAL_LINK = "il";
public const string EXTERNAL_LINK = "el";
//The URL part of the link, i.e. the first token
public const string EXTERNAL_LINK_URL = "elu";
public const string CITATION = "ci";
public const string CATEGORY = "c";
public const string BOLD = "b";
public const string ITALICS = "i";
public const string BOLD_ITALICS = "bi";
public const string HEADING = "h";
public const string SUB_HEADING = "sh";
// Integer token type ids (0-17, contiguous).
// NOTE(review): presumably each id is the index of the matching name in TOKEN_TYPES; confirm against the
// array initializer before renumbering anything here.
public const int ALPHANUM_ID = 0;
public const int APOSTROPHE_ID = 1;
public const int ACRONYM_ID = 2;
public const int COMPANY_ID = 3;
public const int EMAIL_ID = 4;
public const int HOST_ID = 5;
public const int NUM_ID = 6;
public const int CJ_ID = 7;
// Ids from here on have a Wikipedia-specific string constant of the same name declared above.
public const int INTERNAL_LINK_ID = 8;
public const int EXTERNAL_LINK_ID = 9;
public const int CITATION_ID = 10;
public const int CATEGORY_ID = 11;
public const int BOLD_ID = 12;
public const int ITALICS_ID = 13;
public const int BOLD_ITALICS_ID = 14;
public const int HEADING_ID = 15;
public const int SUB_HEADING_ID = 16;
public const int EXTERNAL_LINK_URL_ID = 17;
// NOTE(review): in this copy of the file the elements of the TOKEN_TYPES initializer and its closing "};"
// are not present; restore them before compiling.
/// <summary>
/// String token types that correspond to token type int constants </summary>
public static readonly string[] TOKEN_TYPES = new string[] {
/// <summary>
/// Only output tokens
/// </summary>
public const int TOKENS_ONLY = 0;
/// <summary>
/// Only output untokenized tokens, which are tokens that would normally be split into several tokens
/// </summary>
public const int UNTOKENIZED_ONLY = 1;
/// <summary>
/// Output both the untokenized token and the splits
/// </summary>
public const int BOTH = 2;
/// <summary>
/// This flag is used to indicate that the produced "Token" would, if <see cref="TOKENS_ONLY"/> was used, produce multiple tokens.
/// </summary>
public const int UNTOKENIZED_TOKEN_FLAG = 1;
/// <summary>
/// A private instance of the JFlex-constructed scanner
/// </summary>
private readonly WikipediaTokenizerImpl scanner;
// Output mode; Init only accepts TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH.
private int tokenOutput = TOKENS_ONLY;
// Token type names that IncrementToken looks up (via Contains) to decide whether a token's type
// is eligible for collapsing into a single untokenized token.
private ICollection<string> untokenizedTypes = Collections.EmptySet<string>();
// Pending saved attribute states: assigned in CollapseAndSaveTokens, consulted first by IncrementToken,
// and set back to null in Reset.
private IEnumerator<AttributeSource.State> tokens = null;
// Attribute instances registered through AddAttribute in Init.
private IOffsetAttribute offsetAtt;
private ITypeAttribute typeAtt;
private IPositionIncrementAttribute posIncrAtt;
private ICharTermAttribute termAtt;
private IFlagsAttribute flagsAtt;
// True from Reset until IncrementToken has emitted a scanner token; used there so that the first
// token never carries a position increment of 0.
private bool first;
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/> using the <see cref="TOKENS_ONLY"/>
/// output mode and an empty set of untokenized types. Attaches the
/// <paramref name="input"/> to a newly created JFlex scanner.
/// </summary>
/// <param name="input"> The input <see cref="TextReader"/> </param>
public WikipediaTokenizer(TextReader input)
    : this(input, TOKENS_ONLY, Collections.EmptySet<string>())
{
}
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
/// <paramref name="input"/> to the newly created JFlex scanner.
/// </summary>
/// <param name="input"> The input </param>
/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
/// <param name="untokenizedTypes"> Untokenized types </param>
/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
public WikipediaTokenizer(TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
    : base(input)
{
    this.scanner = new WikipediaTokenizerImpl(this.m_input);
    Init(tokenOutput, untokenizedTypes);
}
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
/// <paramref name="input"/> to the newly created JFlex scanner. Uses the given <see cref="AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="factory"> The <see cref="AttributeSource.AttributeFactory"/> </param>
/// <param name="input"> The input </param>
/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
/// <param name="untokenizedTypes"> Untokenized types </param>
/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
public WikipediaTokenizer(AttributeFactory factory, TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
    : base(factory, input)
{
    this.scanner = new WikipediaTokenizerImpl(this.m_input);
    Init(tokenOutput, untokenizedTypes);
}
/// <summary>
/// Shared constructor logic: validates the output mode, stores the configuration and registers
/// the attributes this tokenizer populates.
/// </summary>
/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
/// <param name="untokenizedTypes"> Token type names eligible for collapsing; <c>null</c> is treated as an empty set </param>
/// <exception cref="ArgumentException"> If <paramref name="tokenOutput"/> is not one of the three supported values </exception>
private void Init(int tokenOutput, ICollection<string> untokenizedTypes)
{
    // TODO: cutover to enum
    if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
    {
        throw new ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
    }
    this.tokenOutput = tokenOutput;
    // IncrementToken calls untokenizedTypes.Contains(...) in the non-TOKENS_ONLY modes; storing an empty
    // set for null avoids a late NullReferenceException there.
    this.untokenizedTypes = untokenizedTypes ?? Collections.EmptySet<string>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();
    posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
    termAtt = AddAttribute<ICharTermAttribute>();
    flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// Advances to the next token. Saved states queued by <see cref="CollapseAndSaveTokens"/> are replayed
/// first; otherwise the next scanner token is read and emitted either as-is or collapsed, depending on
/// the configured output mode and whether its type is one of the untokenized types.
/// <para/>
/// <see cref="TokenStream.IncrementToken"/>
/// </summary>
/// <returns> <c>true</c> if a token was produced; <c>false</c> when the scanner reports end of input </returns>
public override sealed bool IncrementToken()
{
    // Replay the individual tokens that were captured while building an untokenized token.
    if (tokens != null && tokens.MoveNext())
    {
        AttributeSource.State state = tokens.Current;
        RestoreState(state);
        return true;
    }

    ClearAttributes();
    int tokenType = scanner.GetNextToken();
    if (tokenType == WikipediaTokenizerImpl.YYEOF)
    {
        return false;
    }

    string type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
    // Contains is only consulted when the mode can collapse at all (same short-circuit as before).
    bool collapsible = tokenOutput != TOKENS_ONLY && untokenizedTypes.Contains(type);
    if (!collapsible)
    {
        SetupToken();
    }
    else if (tokenOutput == UNTOKENIZED_ONLY)
    {
        CollapseTokens(tokenType);
    }
    else if (tokenOutput == BOTH)
    {
        //collapse into a single token, add it to tokens AND output the individual tokens
        //output the untokenized Token first
        CollapseAndSaveTokens(tokenType, type);
    }

    int posinc = scanner.PositionIncrement;
    if (first && posinc == 0)
    {
        posinc = 1; // don't emit posinc=0 for the first token!
    }
    posIncrAtt.PositionIncrement = posinc;
    typeAtt.Type = type;
    first = false;
    return true;
}
/// <summary>
/// Collapses a run of consecutive scanner tokens of the same type into one untokenized token, which is
/// left in the current attributes and flagged with <see cref="UNTOKENIZED_TOKEN_FLAG"/>. The state of
/// every individual token in the run is captured as well and queued in <c>tokens</c>, so that
/// subsequent <see cref="IncrementToken"/> calls emit them one by one.
/// </summary>
/// <param name="tokenType"> Scanner token type id of the run being collapsed </param>
/// <param name="type"> String type name stamped on each saved token </param>
private void CollapseAndSaveTokens(int tokenType, string type)
{
    //collapse
    StringBuilder buffer = new StringBuilder(32);
    int numAdded = scanner.SetText(buffer);
    //TODO: how to know how much whitespace to add
    int theStart = scanner.YyChar;
    int lastPos = theStart + numAdded;
    int tmpTokType;
    int numSeen = 0;
    IList<AttributeSource.State> tmp = new List<AttributeSource.State>();
    // The first token of the run is saved with a position increment of 0.
    SetupSavedToken(0, type);
    tmp.Add(CaptureState());
    //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
    while ((tmpTokType = scanner.GetNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
    {
        int currPos = scanner.YyChar;
        //append whitespace
        for (int i = 0; i < (currPos - lastPos); i++)
        {
            buffer.Append(' ');
        }
        numAdded = scanner.SetText(buffer);
        SetupSavedToken(scanner.PositionIncrement, type);
        tmp.Add(CaptureState());
        numSeen++;
        lastPos = currPos + numAdded;
    }
    //trim the buffer
    // TODO: this is inefficient
    string s = buffer.ToString().Trim();
    termAtt.SetEmpty().Append(s);
    offsetAtt.SetOffset(CorrectOffset(theStart), CorrectOffset(theStart + s.Length));
    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
    //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
    if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
    {
        scanner.YyPushBack(scanner.YyLength);
    }
    tokens = tmp.GetEnumerator();
}
/// <summary>
/// Prepares the attributes for a token that is about to be captured by <see cref="CollapseAndSaveTokens"/>:
/// fills term and offsets via <see cref="SetupToken"/>, then applies the given position increment and type.
/// </summary>
/// <param name="positionInc"> Position increment to record for the saved token </param>
/// <param name="type"> Token type name to record for the saved token </param>
private void SetupSavedToken(int positionInc, string type)
{
    // Term text and offsets must be in place before the caller captures the state.
    SetupToken();
    posIncrAtt.PositionIncrement = positionInc;
    typeAtt.Type = type;
}
/// <summary>
/// Collapses a run of consecutive scanner tokens of the same type into one untokenized token, which is
/// left in the current attributes and flagged with <see cref="UNTOKENIZED_TOKEN_FLAG"/>. Unlike
/// <see cref="CollapseAndSaveTokens"/>, the individual tokens are not kept.
/// </summary>
/// <param name="tokenType"> Scanner token type id of the run being collapsed </param>
private void CollapseTokens(int tokenType)
{
    //collapse
    StringBuilder buffer = new StringBuilder(32);
    int numAdded = scanner.SetText(buffer);
    //TODO: how to know how much whitespace to add
    int theStart = scanner.YyChar;
    int lastPos = theStart + numAdded;
    int tmpTokType;
    int numSeen = 0;
    //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
    while ((tmpTokType = scanner.GetNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
    {
        int currPos = scanner.YyChar;
        //append whitespace
        for (int i = 0; i < (currPos - lastPos); i++)
        {
            buffer.Append(' ');
        }
        numAdded = scanner.SetText(buffer);
        numSeen++;
        lastPos = currPos + numAdded;
    }
    //trim the buffer
    // TODO: this is inefficient
    string s = buffer.ToString().Trim();
    termAtt.SetEmpty().Append(s);
    offsetAtt.SetOffset(CorrectOffset(theStart), CorrectOffset(theStart + s.Length));
    flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
    //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
    if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
    {
        scanner.YyPushBack(scanner.YyLength);
    }
    else
    {
        tokens = null;
    }
}
/// <summary>
/// Populates the term and offset attributes from the scanner's current match.
/// </summary>
private void SetupToken()
{
    // The term must be filled first: the end offset below is derived from termAtt.Length.
    scanner.GetText(termAtt);
    int start = scanner.YyChar;
    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
}
/// <summary>
/// Releases the resources held by this tokenizer. The base implementation runs first; when called
/// from an explicit dispose, the scanner is then re-pointed at the tokenizer's current reader.
/// </summary>
/// <param name="disposing"> <c>true</c> when called from <c>Dispose()</c>; <c>false</c> when called from a finalizer </param>
protected override void Dispose(bool disposing)
{
    base.Dispose(disposing);
    if (disposing)
    {
        // Keep the scanner's reader in step with whatever m_input refers to after the base dispose.
        scanner.YyReset(m_input);
    }
}
/// <summary>
/// Resets this tokenizer so that it can consume its (possibly new) input from the beginning: runs the
/// base reset, re-attaches the scanner to the current reader, drops any queued saved tokens and marks
/// the next token as the first one of the stream.
/// <para/>
/// <see cref="TokenStream.Reset"/>
/// </summary>
public override void Reset()
{
    base.Reset();
    // The scanner was constructed with the reader that was current at construction time; hand it the
    // reader that is current now.
    scanner.YyReset(m_input);
    tokens = null;
    scanner.Reset();
    first = true;
}
/// <summary>
/// Performs end-of-stream processing: runs the base implementation, then sets both offsets to the
/// corrected position just past the scanner's last match.
/// </summary>
public override void End()
{
    base.End();
    // set final offset
    int finalOffset = CorrectOffset(scanner.YyChar + scanner.YyLength);
    this.offsetAtt.SetOffset(finalOffset, finalOffset);
}