using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace Lucene.Net.Analysis.Wikipedia
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Extension of <see cref="Standard.StandardTokenizer"/> that is aware of Wikipedia syntax. It is based on the
/// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
/// <para/>
/// @lucene.experimental
/// </summary>
public sealed class WikipediaTokenizer : Tokenizer
{
public const string INTERNAL_LINK = "il";
public const string EXTERNAL_LINK = "el";
//The URL part of the link, i.e. the first token
public const string EXTERNAL_LINK_URL = "elu";
public const string CITATION = "ci";
public const string CATEGORY = "c";
public const string BOLD = "b";
public const string ITALICS = "i";
public const string BOLD_ITALICS = "bi";
public const string HEADING = "h";
public const string SUB_HEADING = "sh";
public const int ALPHANUM_ID = 0;
public const int APOSTROPHE_ID = 1;
public const int ACRONYM_ID = 2;
public const int COMPANY_ID = 3;
public const int EMAIL_ID = 4;
public const int HOST_ID = 5;
public const int NUM_ID = 6;
public const int CJ_ID = 7;
public const int INTERNAL_LINK_ID = 8;
public const int EXTERNAL_LINK_ID = 9;
public const int CITATION_ID = 10;
public const int CATEGORY_ID = 11;
public const int BOLD_ID = 12;
public const int ITALICS_ID = 13;
public const int BOLD_ITALICS_ID = 14;
public const int HEADING_ID = 15;
public const int SUB_HEADING_ID = 16;
public const int EXTERNAL_LINK_URL_ID = 17;
/// <summary>
/// String token types that correspond to token type int constants </summary>
public static readonly string[] TOKEN_TYPES = new string[] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
INTERNAL_LINK,
EXTERNAL_LINK,
CITATION,
CATEGORY,
BOLD,
ITALICS,
BOLD_ITALICS,
HEADING,
SUB_HEADING,
EXTERNAL_LINK_URL
};
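//The *_ID constants above are indexes into TOKEN_TYPES, keeping the numeric and
//string forms of each type in sync. A minimal sketch of the mapping:
//
//   string bold = WikipediaTokenizer.TOKEN_TYPES[WikipediaTokenizer.BOLD_ID]; // "b", i.e. BOLD
//   string num = WikipediaTokenizer.TOKEN_TYPES[WikipediaTokenizer.NUM_ID];   // "<NUM>"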
/// <summary>
/// Only output the regular tokens (no collapsed, untokenized tokens)
/// </summary>
public const int TOKENS_ONLY = 0;
/// <summary>
/// Only output untokenized tokens, which are tokens that would normally be split into several tokens
/// </summary>
public const int UNTOKENIZED_ONLY = 1;
/// <summary>
/// Output both the untokenized token and the splits
/// </summary>
public const int BOTH = 2;
/// <summary>
/// This flag indicates that the produced "Token" would, if <see cref="TOKENS_ONLY"/> were used, be split into multiple tokens.
/// </summary>
public const int UNTOKENIZED_TOKEN_FLAG = 1;
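//Consumers can test the flag through IFlagsAttribute; a minimal sketch, where
//"stream" is any TokenStream built on this tokenizer (hypothetical variable):
//
//   IFlagsAttribute flagsAtt = stream.GetAttribute<IFlagsAttribute>();
//   bool collapsed = (flagsAtt.Flags & WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG) != 0;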
/// <summary>
/// A private instance of the JFlex-constructed scanner
/// </summary>
private readonly WikipediaTokenizerImpl scanner;
private int tokenOutput = TOKENS_ONLY;
private ICollection<string> untokenizedTypes = Collections.EmptySet<string>();
private IEnumerator<AttributeSource.State> tokens = null;
private IOffsetAttribute offsetAtt;
private ITypeAttribute typeAtt;
private IPositionIncrementAttribute posIncrAtt;
private ICharTermAttribute termAtt;
private IFlagsAttribute flagsAtt;
private bool first;
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
/// <paramref name="input"/> to a newly created JFlex scanner.
/// </summary>
/// <param name="input"> The Input <see cref="TextReader"/> </param>
public WikipediaTokenizer(TextReader input)
: this(input, TOKENS_ONLY, Collections.EmptySet<string>())
{
}
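//A minimal usage sketch for this constructor (the wiki markup is a made-up sample);
//tokens are consumed with the usual Reset/IncrementToken/End protocol:
//
//   using (var tokenizer = new WikipediaTokenizer(new StringReader("'''Bold''' text and a [[link]].")))
//   {
//       ICharTermAttribute term = tokenizer.GetAttribute<ICharTermAttribute>();
//       ITypeAttribute type = tokenizer.GetAttribute<ITypeAttribute>();
//       tokenizer.Reset();
//       while (tokenizer.IncrementToken())
//       {
//           Console.WriteLine("{0} [{1}]", term, type.Type);
//       }
//       tokenizer.End();
//   }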
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
/// <paramref name="input"/> to a the newly created JFlex scanner.
/// </summary>
/// <param name="input"> The input </param>
/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
/// <param name="untokenizedTypes"> Untokenized types </param>
public WikipediaTokenizer(TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
: base(input)
{
this.scanner = new WikipediaTokenizerImpl(this.m_input);
Init(tokenOutput, untokenizedTypes);
}
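//A sketch of constructing the tokenizer in BOTH mode, assuming bold and italics
//runs should be emitted both collapsed and split ("reader" is a hypothetical TextReader):
//
//   var untokenized = new HashSet<string> { WikipediaTokenizer.BOLD, WikipediaTokenizer.ITALICS };
//   var tokenizer = new WikipediaTokenizer(reader, WikipediaTokenizer.BOTH, untokenized);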
/// <summary>
/// Creates a new instance of the <see cref="WikipediaTokenizer"/>. Attaches the
/// <paramref name="input"/> to a the newly created JFlex scanner. Uses the given <see cref="AttributeSource.AttributeFactory"/>.
/// </summary>
/// <param name="factory"> The <see cref="AttributeSource.AttributeFactory"/> </param>
/// <param name="input"> The input </param>
/// <param name="tokenOutput"> One of <see cref="TOKENS_ONLY"/>, <see cref="UNTOKENIZED_ONLY"/>, <see cref="BOTH"/> </param>
/// <param name="untokenizedTypes"> Untokenized types </param>
public WikipediaTokenizer(AttributeFactory factory, TextReader input, int tokenOutput, ICollection<string> untokenizedTypes)
: base(factory, input)
{
this.scanner = new WikipediaTokenizerImpl(this.m_input);
Init(tokenOutput, untokenizedTypes);
}
private void Init(int tokenOutput, ICollection<string> untokenizedTypes)
{
// TODO: cutover to enum
if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
{
throw new ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
}
this.tokenOutput = tokenOutput;
this.untokenizedTypes = untokenizedTypes;
offsetAtt = AddAttribute<IOffsetAttribute>();
typeAtt = AddAttribute<ITypeAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
termAtt = AddAttribute<ICharTermAttribute>();
flagsAtt = AddAttribute<IFlagsAttribute>();
}
/// <summary>
/// <see cref="TokenStream.IncrementToken"/>
/// </summary>
public override sealed bool IncrementToken()
{
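//replay any token states buffered by CollapseAndSaveTokens before scanning further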
if (tokens != null && tokens.MoveNext())
{
AttributeSource.State state = tokens.Current;
RestoreState(state);
return true;
}
ClearAttributes();
int tokenType = scanner.GetNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF)
{
return false;
}
string type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
if (tokenOutput == TOKENS_ONLY || !untokenizedTypes.Contains(type))
{
SetupToken();
}
else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.Contains(type))
{
CollapseTokens(tokenType);
}
else if (tokenOutput == BOTH)
{
//collapse into a single token, add it to tokens AND output the individual tokens
//output the untokenized Token first
CollapseAndSaveTokens(tokenType, type);
}
int posinc = scanner.PositionIncrement;
if (first && posinc == 0)
{
posinc = 1; // don't emit posinc=0 for the first token!
}
posIncrAtt.PositionIncrement = posinc;
typeAtt.Type = type;
first = false;
return true;
}
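/// <summary>
/// Collapses a run of same-typed wiki tokens into a single untokenized token
/// (padding the gaps with whitespace), while capturing each individual token's
/// state so <see cref="IncrementToken"/> can replay the splits afterwards.
/// </summary>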
private void CollapseAndSaveTokens(int tokenType, string type)
{
//collapse
StringBuilder buffer = new StringBuilder(32);
int numAdded = scanner.SetText(buffer);
//TODO: how to know how much whitespace to add
int theStart = scanner.YyChar;
int lastPos = theStart + numAdded;
int tmpTokType;
int numSeen = 0;
IList<AttributeSource.State> tmp = new List<AttributeSource.State>();
SetupSavedToken(0, type);
tmp.Add(CaptureState());
//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
while ((tmpTokType = scanner.GetNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
{
int currPos = scanner.YyChar;
//append whitespace
for (int i = 0; i < (currPos - lastPos); i++)
{
buffer.Append(' ');
}
numAdded = scanner.SetText(buffer);
SetupSavedToken(scanner.PositionIncrement, type);
tmp.Add(CaptureState());
numSeen++;
lastPos = currPos + numAdded;
}
//trim the buffer
// TODO: this is inefficient
string s = buffer.ToString().Trim();
termAtt.SetEmpty().Append(s);
offsetAtt.SetOffset(CorrectOffset(theStart), CorrectOffset(theStart + s.Length));
flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
//The way the loop is written, we will have proceeded to the next token. We need to push the scanner back to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
{
scanner.YyPushBack(scanner.YyLength);
}
tokens = tmp.GetEnumerator();
}
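/// <summary>
/// Populates the current token from the scanner, then overrides its position
/// increment and type before the state is captured.
/// </summary>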
private void SetupSavedToken(int positionInc, string type)
{
SetupToken();
posIncrAtt.PositionIncrement = positionInc;
typeAtt.Type = type;
}
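/// <summary>
/// Same collapsing logic as <see cref="CollapseAndSaveTokens"/>, but only the
/// single collapsed token is emitted; the individual splits are discarded.
/// </summary>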
private void CollapseTokens(int tokenType)
{
//collapse
StringBuilder buffer = new StringBuilder(32);
int numAdded = scanner.SetText(buffer);
//TODO: how to know how much whitespace to add
int theStart = scanner.YyChar;
int lastPos = theStart + numAdded;
int tmpTokType;
int numSeen = 0;
//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
while ((tmpTokType = scanner.GetNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
{
int currPos = scanner.YyChar;
//append whitespace
for (int i = 0; i < (currPos - lastPos); i++)
{
buffer.Append(' ');
}
numAdded = scanner.SetText(buffer);
numSeen++;
lastPos = currPos + numAdded;
}
//trim the buffer
// TODO: this is inefficient
string s = buffer.ToString().Trim();
termAtt.SetEmpty().Append(s);
offsetAtt.SetOffset(CorrectOffset(theStart), CorrectOffset(theStart + s.Length));
flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
//The way the loop is written, we will have proceeded to the next token. We need to push the scanner back to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
{
scanner.YyPushBack(scanner.YyLength);
}
else
{
tokens = null;
}
}
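/// <summary>
/// Copies the scanner's current text into the term attribute and sets the
/// corrected start/end offsets.
/// </summary>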
private void SetupToken()
{
scanner.GetText(termAtt);
int start = scanner.YyChar;
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
}
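/// <summary>
/// Releases resources and resets the JFlex scanner against the current input.
/// </summary>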
protected override void Dispose(bool disposing)
{
base.Dispose(disposing);
if (disposing)
{
scanner.YyReset(m_input);
}
}
/// <summary>
/// <see cref="TokenStream.Reset"/>
/// </summary>
public override void Reset()
{
base.Reset();
scanner.YyReset(m_input);
tokens = null;
scanner.Reset();
first = true;
}
public override void End()
{
base.End();
// set final offset
int finalOffset = CorrectOffset(scanner.YyChar + scanner.YyLength);
this.offsetAtt.SetOffset(finalOffset, finalOffset);
}
}
}