// blob: 100a89c83f85972b33ed095172115ebb194db1d0
/**************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*************************************************************/
package org.apache.openoffice.ooxml.viewer.xml;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.Vector;
/** Hand-written scanner that splits the characters of an XML stream into
 *  {@link Token} objects.
 *
 *  <p>The scanner has two modes.  Outside of tags everything up to the next
 *  '&lt;' is returned as one TEXT token.  Inside of tags (after "&lt;",
 *  "&lt;/", "&lt;?" or "&lt;!--") the input is split into symbols,
 *  identifiers and quoted attribute values.</p>
 *
 *  <p>Whitespace between tokens is skipped: WHITESPACE tokens are dropped in
 *  {@link #AddToken} and are therefore never returned to the caller.</p>
 *
 *  <p>Known restrictions that follow directly from the implementation:
 *  attribute values have to be enclosed in double quotes, identifiers consist
 *  only of lowercase letters, uppercase letters and decimal digits, and the
 *  content of comments is scanned with the same rules as the content of
 *  tags.</p>
 *
 *  <p>Malformed input is reported by throwing a RuntimeException.</p>
 */
public class XMLScanner
{
    /** Value returned by Reader.read() at the end of the input. */
    private static final int END_OF_INPUT = -1;

    /** Marker for an empty lookahead slot.  Reader.read() returns either -1
     *  or a value in the range 0..65535, so this value can not be confused
     *  with a character (not even NUL) or with the end of the input.
     */
    private static final int NO_LOOKAHEAD = -2;




    /** Create a scanner that reads its input as UTF-8 encoded text.
     *
     *  @param aIn
     *      The stream that provides the XML data.
     *  @throws RuntimeException
     *      when the UTF-8 encoding is not available.  The scanner would not
     *      be usable without a reader, so this is reported right away
     *      instead of failing later with a NullPointerException.
     */
    XMLScanner (final InputStream aIn)
    {
        final Reader aReader;
        try
        {
            aReader = new InputStreamReader(aIn, "UTF8");
        }
        catch (UnsupportedEncodingException e)
        {
            throw new RuntimeException("UTF8 encoding is not supported", e);
        }
        maIn = aReader;
        mnNextCharacter = NO_LOOKAHEAD;
        maTokens = new ArrayDeque<Token>();
        mbIsInsideTag = false;
        maTextBuffer = new StringBuilder(1024);
        mnOffset = 0;
    }




    /** Remove the next token from the input and return it.
     *  Once the end of the input is reached every call returns an EOF token.
     *
     *  @return
     *      The next token.  Never a WHITESPACE token.
     */
    public Token Next ()
    {
        FillTokenQueue();
        return maTokens.removeFirst();
    }




    /** Return the next token without removing it from the input.
     *
     *  @return
     *      The token that the next call to Next() will return.
     */
    public Token Peek()
    {
        FillTokenQueue();
        return maTokens.peekFirst();
    }




    /** Return the next token that is not whitespace.
     *  Whitespace tokens are already dropped by AddToken(), so the filter in
     *  this method is a safeguard only.
     */
    Token NextNonWhitespaceToken ()
    {
        while(true)
        {
            final Token aToken = Next();
            if (aToken.Type != TokenType.WHITESPACE)
                return aToken;
        }
    }




    /** Scan until at least one token is available.
     *  More than one call to ProvideToken() may be necessary because
     *  skipped whitespace does not produce a token.
     */
    private void FillTokenQueue ()
    {
        while (maTokens.isEmpty())
            ProvideToken();
    }




    /** Look at the next character and scan the construct that starts with it.
     *  May add no token at all when only whitespace is consumed.
     */
    private void ProvideToken ()
    {
        final int nC = PeekCharacter();
        if (nC == END_OF_INPUT)
        {
            AddToken(TokenType.EOF, "", mnOffset);
        }
        else if (IsWhitespace(nC))
        {
            ScanWhitespace();
        }
        else if (mbIsInsideTag)
        {
            switch(nC)
            {
                case '?':
                case '/':
                case '>':
                case '=':
                case ':':
                case '-':
                    switch(ScanSymbol())
                    {
                        case TAG_END:
                        case INTRO_END:
                        case ELEMENT_END:
                        case COMMENT_END:
                            // All of these close the construct that was
                            // opened by a '<...' symbol.  COMMENT_END has to
                            // be in this list or everything after a comment
                            // would still be scanned as tag content.
                            mbIsInsideTag = false;
                            break;

                        default:
                            break;
                    }
                    break;

                case '"':
                    ScanQuotedValue();
                    break;

                default:
                    ScanIdentifier();
                    break;
            }
        }
        else if (nC == '<')
        {
            mbIsInsideTag = true;
            ScanSymbol();
        }
        else
        {
            ScanText();
        }
    }




    /** Scan one of the symbols
     *  "&lt;", "&lt;/", "&lt;?", "&lt;!--", "&gt;", "/&gt;", "?&gt;",
     *  "--&gt;", "=" and ":" and add the corresponding token.
     *
     *  @return
     *      The type of the token that was added.
     *  @throws RuntimeException
     *      when the input does not start with one of the known symbols.
     */
    private TokenType ScanSymbol ()
    {
        final int nStartOffset = mnOffset;

        switch (PeekCharacter())
        {
            case END_OF_INPUT:
                return AddSymbol(TokenType.EOF, "", nStartOffset);

            case '<':
                GetNextCharacter();
                switch(PeekCharacter())
                {
                    case '/':
                        GetNextCharacter();
                        return AddSymbol(TokenType.END_TAG_START, "</", nStartOffset);

                    case '?':
                        GetNextCharacter();
                        return AddSymbol(TokenType.INTRO_START, "<?", nStartOffset);

                    case '!':
                        GetNextCharacter();
                        ExpectCharacter('-', "expected '-' after '<!'");
                        ExpectCharacter('-', "expected '-' after '<!-'");
                        return AddSymbol(TokenType.COMMENT_START, "<!--", nStartOffset);

                    default:
                        // The character after '<' belongs to the next token.
                        return AddSymbol(TokenType.TAG_START, "<", nStartOffset);
                }

            case '>':
                GetNextCharacter();
                return AddSymbol(TokenType.TAG_END, ">", nStartOffset);

            case '/':
                GetNextCharacter();
                ExpectCharacter('>', "expected '>' after '/'");
                return AddSymbol(TokenType.ELEMENT_END, "/>", nStartOffset);

            case '?':
                GetNextCharacter();
                ExpectCharacter('>', "expected '>' after '?'");
                return AddSymbol(TokenType.INTRO_END, "?>", nStartOffset);

            case '-':
                GetNextCharacter();
                ExpectCharacter('-', "expected '-' after '-'");
                ExpectCharacter('>', "expected '>' after '--'");
                return AddSymbol(TokenType.COMMENT_END, "-->", nStartOffset);

            case '=':
                GetNextCharacter();
                return AddSymbol(TokenType.ATTRIBUTE_DEFINE, "=", nStartOffset);

            case ':':
                GetNextCharacter();
                return AddSymbol(TokenType.COLON, ":", nStartOffset);

            default:
                throw new RuntimeException(String.format(
                    "unexpected character '%c' of type %d",
                    PeekCharacter(),
                    Character.getType(PeekCharacter())));
        }
    }




    /** Add a symbol token and return its type. */
    private TokenType AddSymbol (
        final TokenType eType,
        final String sText,
        final int nOffset)
    {
        AddToken(eType, sText, nOffset);
        return eType;
    }




    /** Consume the next character and make sure that it is the expected one.
     *
     *  @throws RuntimeException
     *      with the given message when another character, or the end of the
     *      input, is found.
     */
    private void ExpectCharacter (final int nExpected, final String sMessage)
    {
        if (GetNextCharacter() != nExpected)
            throw new RuntimeException(sMessage);
    }




    /** Scan a sequence of identifier characters and add it as IDENTIFIER
     *  token.
     *
     *  @return
     *      Always true.
     *  @throws RuntimeException
     *      when the input does not start with an identifier character.
     */
    private boolean ScanIdentifier ()
    {
        final int nStartOffset = mnOffset;
        maTextBuffer.setLength(0);

        while (IsIdentifierCharacter(PeekCharacter()))
            maTextBuffer.append((char)GetNextCharacter());

        if (maTextBuffer.length() == 0)
        {
            final int nC = PeekCharacter();
            // '%c' can not format -1, so the end of the input gets its own
            // message.
            if (nC == END_OF_INPUT)
                throw new RuntimeException("missing identifier, got end of input");
            throw new RuntimeException(
                String.format(
                    "missing identifier, got '%c' of type %d",
                    nC,
                    Character.getType(nC)));
        }

        AddToken(TokenType.IDENTIFIER, maTextBuffer.toString(), nStartOffset);
        return true;
    }




    /** Consume a run of whitespace characters.
     *  The text is handed to AddToken(), which drops WHITESPACE tokens.
     */
    private void ScanWhitespace ()
    {
        final StringBuilder aBuffer = new StringBuilder();
        final int nStartOffset = mnOffset;

        while (IsWhitespace(PeekCharacter()))
            aBuffer.append((char)GetNextCharacter());

        if (aBuffer.length() > 0)
            AddToken(TokenType.WHITESPACE, aBuffer.toString(), nStartOffset);
    }




    /** Scan a value that is enclosed in double quotes and add it as
     *  ATTRIBUTE_VALUE token.  The text of the token includes both quotes.
     *  Does nothing when the next character is not a double quote.
     *
     *  @throws RuntimeException
     *      when the input ends before the closing quote is found.
     */
    private void ScanQuotedValue ()
    {
        if (PeekCharacter() != '"')
            return;

        final int nStartOffset = mnOffset;
        maTextBuffer.setLength(0);
        // The opening quote.
        maTextBuffer.append((char)GetNextCharacter());

        while (true)
        {
            final int nC = GetNextCharacter();
            // Without this check an unterminated value would never leave
            // the loop.
            if (nC == END_OF_INPUT)
                throw new RuntimeException(
                    "unterminated quoted value that starts at offset " + nStartOffset);
            maTextBuffer.append((char)nC);
            if (nC == '"')
                break;
        }

        AddToken(TokenType.ATTRIBUTE_VALUE, maTextBuffer.toString(), nStartOffset);
    }




    /** Scan everything up to, but not including, the next '&lt;' or the end
     *  of the input and add it as TEXT token.
     *  The caller (ProvideToken) makes sure that the next character is
     *  neither '&lt;' nor the end of the input, so the text is not empty.
     */
    private void ScanText ()
    {
        final int nStartOffset = mnOffset;
        maTextBuffer.setLength(0);

        int nC = PeekCharacter();
        // Stop at the end of the input as well, otherwise text that is not
        // followed by a tag would never terminate the loop.
        while (nC != '<' && nC != END_OF_INPUT)
        {
            maTextBuffer.append((char)GetNextCharacter());
            nC = PeekCharacter();
        }

        AddToken(TokenType.TEXT, maTextBuffer.toString(), nStartOffset);
    }




    /** Return whether the given character is treated as whitespace, i.e.
     *  whether its general category is SPACE_SEPARATOR or CONTROL.
     *  These are the categories that Character.getType() really returns;
     *  the DIRECTIONALITY_* constants have the same numerical values (12 and
     *  15) but belong to Character.getDirectionality().
     */
    private static boolean IsWhitespace (final int nC)
    {
        if (nC < 0)
            return false;
        final int nType = Character.getType(nC);
        return nType == Character.SPACE_SEPARATOR
            || nType == Character.CONTROL;
    }




    /** Return whether the given character can be part of an identifier:
     *  lowercase letters, uppercase letters and decimal digits.
     */
    private static boolean IsIdentifierCharacter (final int nC)
    {
        if (nC < 0)
            return false;
        switch(Character.getType(nC))
        {
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.DECIMAL_DIGIT_NUMBER:
                return true;

            default:
                return false;
        }
    }




    /** Consume the next character and return it.
     *  The end of the input is never consumed: once it is reached every call
     *  returns -1 and the offset is not advanced any further.
     */
    private int GetNextCharacter ()
    {
        final int nC = PeekCharacter();
        if (nC != END_OF_INPUT)
        {
            mnNextCharacter = NO_LOOKAHEAD;
            ++mnOffset;
        }
        return nC;
    }




    /** Return the next character without consuming it.
     *  An IOException is printed and then treated like the end of the input.
     *
     *  @return
     *      The next character or -1 for the end of the input.
     */
    private int PeekCharacter ()
    {
        if (mnNextCharacter == NO_LOOKAHEAD)
        {
            try
            {
                mnNextCharacter = maIn.read();
            }
            catch (IOException e)
            {
                e.printStackTrace();
                mnNextCharacter = END_OF_INPUT;
            }
        }
        return mnNextCharacter;
    }




    /** Append a new token to the queue of scanned tokens.
     *  WHITESPACE tokens are dropped.
     *
     *  @param eType
     *      Type of the new token.
     *  @param sText
     *      Text of the new token.
     *  @param nOffset
     *      Offset of the first character of the token, counted in
     *      characters from the start of the input.
     */
    private void AddToken (
        final TokenType eType,
        final String sText,
        final int nOffset)
    {
        if (eType != TokenType.WHITESPACE)
            maTokens.add(new Token(eType, sText, nOffset));
    }




    /** Source of the characters. */
    private final Reader maIn;
    /** One character lookahead.  NO_LOOKAHEAD when the slot is empty. */
    private int mnNextCharacter;
    /** Tokens that have been scanned but not yet been returned by Next(). */
    private final Deque<Token> maTokens;
    /** True between a tag/intro/comment start symbol and its end symbol. */
    private boolean mbIsInsideTag;
    /** Reusable buffer for the text of the token that is being scanned. */
    private final StringBuilder maTextBuffer;
    /** Number of characters that have been consumed so far. */
    private int mnOffset;
}