blob: 51df3f5a48ac99f332340e9f743f5694b0f4c06c [file] [log] [blame]
/* $Id$ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.core.fuzzyml;
import org.apache.manifoldcf.core.interfaces.*;
import java.io.*;
import java.util.*;
/** Class to keep track of XML hierarchy in the face of possibly corrupt
* XML and with case-insensitive tags, etc.
* Basically, this class accepts what is supposedly XML but allows for various
* kinds of handwritten corruption. Specific kinds of errors allowed include:
*
* - Bad character encoding
* - Tag case match problems; all attributes are (optionally) bashed to lower case,
* and tag names are checked to match when all lower case, if case-sensitive didn't
* work
* - End tag matching problems, where someone lost an end tag somehow
* - Other parsing recoveries to be added as they arise
*
* The functionality of this class is also somewhat reduced compared to standard
* SAX-type parsers. No namespace interpretation is done, for instance; tag qnames
* are merely split into a namespace name and a local name, and nothing more. But if you need
* more power, you can write a class extension that will do that readily.
*/
public class XMLFuzzyHierarchicalParseState extends XMLFuzzyParseState
{
  /** The current context; null when no context has been set. */
  protected XMLParsingContext currentContext = null;
  /** The current value buffer; accumulates characters until they are flushed to the current context. */
  protected StringBuilder characterBuffer = new StringBuilder();
  /** Whether we're capturing escaped characters (set while inside a CDATA escape block). */
  protected boolean captureEscaped = false;

  /** This is the maximum size of a chunk of characters getting sent to the characters() method.
  */
  protected final static int MAX_CHUNK_SIZE = 4096;

  /** Constructor with default properties: every lower-casing option is enabled.
  */
  public XMLFuzzyHierarchicalParseState()
  {
    this(true,true,true,true,true,true);
  }

  /** Constructor.
  * All six flags are passed straight through to the superclass constructor.
  */
  public XMLFuzzyHierarchicalParseState(boolean lowerCaseAttributes, boolean lowerCaseTags,
    boolean lowerCaseQAttributes, boolean lowerCaseQTags,
    boolean lowerCaseBTags, boolean lowerCaseEscapeTags)
  {
    super(lowerCaseAttributes,lowerCaseTags,lowerCaseQAttributes,lowerCaseQTags,lowerCaseBTags,lowerCaseEscapeTags);
  }

  /** Set the context that will receive subsequent element and character events.
  *@param context is the new current context; may be null, in which case events are dropped.
  */
  public void setContext(XMLParsingContext context)
  {
    currentContext = context;
  }

  /** Get the current context.
  *@return the current context, or null if none is set.
  */
  public XMLParsingContext getContext()
  {
    return currentContext;
  }

  /** Call this method to clean up completely after a parse attempt, whether successful or failure.
  * If no context is currently set, this method does nothing.
  */
  public void cleanup()
    throws ManifoldCFException
  {
    // Every other method here tolerates a missing context, so cleanup must too;
    // otherwise cleaning up before a context is set (or cleaning up twice) would
    // fail with a NullPointerException.
    if (currentContext == null)
    {
      return;
    }
    // Per the original author's note: this sets currentContext == null as a side effect,
    // unless an error occurs during cleanup!!  (That behavior lives in the context class,
    // not here.)
    currentContext.cleanup();
  }

  /** Map version of the noteTag method.
  * Flushes any pending character data, then reports the start of the element to the
  * current context, if there is one.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteTagEx(String tagName, String nameSpace, String localName, Map<String,String> attributes)
    throws ManifoldCFException
  {
    // Pending text belongs before the tag, so deliver it first.
    flushCharacterBuffer();
    if (currentContext != null)
    {
      currentContext.startElement(nameSpace,localName,tagName,attributes);
    }
    return false;
  }

  /** Note end tag.
  * Flushes any pending character data, then reports the end of the element to the
  * current context, if there is one.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteEndTagEx(String tagName, String nameSpace, String localName)
    throws ManifoldCFException
  {
    // Pending text belongs inside the element being closed, so deliver it first.
    flushCharacterBuffer();
    if (currentContext != null)
    {
      currentContext.endElement(nameSpace,localName,tagName);
    }
    return false;
  }

  /** This method gets called for every character that is not part of a tag etc.
  * Override this method to intercept such characters.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteNormalCharacter(char thisChar)
    throws ManifoldCFException
  {
    appendToCharacterBuffer(thisChar);
    return false;
  }

  /** Append one character to the value buffer, flushing the buffer once it has
  * reached MAX_CHUNK_SIZE characters.
  *@param thisChar is the character to append.
  */
  protected void appendToCharacterBuffer(char thisChar)
    throws ManifoldCFException
  {
    characterBuffer.append(thisChar);
    if (characterBuffer.length() >= MAX_CHUNK_SIZE)
    {
      flushCharacterBuffer();
    }
  }

  /** Send any buffered characters to the current context and empty the buffer.
  * If no context is set, the buffered characters are discarded.
  */
  protected void flushCharacterBuffer()
    throws ManifoldCFException
  {
    if (characterBuffer.length() > 0)
    {
      if (currentContext != null)
      {
        currentContext.characters(characterBuffer.toString());
      }
      // Reset the buffer whether or not anybody received its contents.
      characterBuffer.setLength(0);
    }
  }

  /** New version of the noteEscapedTag method.
  * Turns on character capture when the escape token is "cdata" (compared case-insensitively);
  * other tokens leave the capture flag unchanged.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteEscapedEx(String token)
    throws ManifoldCFException
  {
    if (token.toLowerCase(Locale.ROOT).equals("cdata"))
    {
      captureEscaped = true;
    }
    return false;
  }

  /** This method gets called for every character that is found within an
  * escape block, e.g. CDATA.
  * The character is buffered only while capture is turned on.
  * Override this method to intercept such characters.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteEscapedCharacter(char thisChar)
    throws ManifoldCFException
  {
    if (captureEscaped)
    {
      appendToCharacterBuffer(thisChar);
    }
    return false;
  }

  /** Called for the end of every cdata-like tag.
  * Turns character capture back off.
  *@return true to halt further processing.  This implementation always returns false.
  */
  @Override
  protected boolean noteEndEscaped()
    throws ManifoldCFException
  {
    captureEscaped = false;
    return false;
  }

  /** Called at the end of everything.
  * Flushes any remaining buffered characters before delegating to the superclass.
  */
  @Override
  public void finishUp()
    throws ManifoldCFException
  {
    flushCharacterBuffer();
    super.finishUp();
  }
}