blob: 85f7acc3c5c5e0e13ebe254c4c1d2dcadc8ad35e [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.1 1999/11/09 01:03:17 twl
* Initial revision
*
* Revision 1.2 1999/11/08 20:45:38 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/RuntimeException.hpp>
#include <framework/XMLValidator.hpp>
#include <validators/DTD/CMBinaryOp.hpp>
#include <validators/DTD/CMLeaf.hpp>
#include <validators/DTD/CMUnaryOp.hpp>
#include <validators/DTD/DFAContentModel.hpp>
#include <validators/DTD/ContentSpecNode.hpp>
#include <validators/DTD/DTDElementDecl.hpp>
// ---------------------------------------------------------------------------
// Local static data
//
// gEOCFakeId
// gEpsilonFakeId
// We have to put in a couple of special CMLeaf nodes to represent
// special values, using fake element ids that we know won't conflict
// with real element ids.
// ---------------------------------------------------------------------------
static const unsigned int gEOCFakeId = 0xFFFFFFF1;
static const unsigned int gEpsilonFakeId = 0xFFFFFFF2;
// ---------------------------------------------------------------------------
// DFAContentModel: Constructors and Destructor
// ---------------------------------------------------------------------------
DFAContentModel::DFAContentModel(const DTDElementDecl& elemDecl) :
fElemDecl(elemDecl)
, fElemMap(0)
, fElemMapSize(0)
, fEmptyOk(false)
, fEOCPos(0)
, fFinalStateFlags(0)
, fFollowList(0)
, fHeadNode(0)
, fLeafCount(0)
, fLeafList(0)
, fSpecNode(0)
, fTransTable(0)
, fTransTableSize(0)
{
//
// Store away our content spec node. This is used all over the place
// here so its easier to store a member pointer than to pass it around.
//
fSpecNode = elemDecl.getContentSpec();
// And build the DFA data structures
buildDFA();
}
DFAContentModel::~DFAContentModel()
{
//
// Clean up all the stuff that is not just temporary representation
// data that was cleaned up after building the DFA.
//
delete [] fFinalStateFlags;
unsigned index;
for (index = 0; index < fTransTableSize; index++)
delete [] fTransTable[index];
delete [] fTransTable;
delete [] fElemMap;
}
// ---------------------------------------------------------------------------
// DFAContentModel: Implementation of the ContentModel virtual interface
// ---------------------------------------------------------------------------
int
DFAContentModel::validateContent( const unsigned int* childIds
, const unsigned int childCount) const
{
//
// If there are no children, then either we fail on the 0th element
// or we return success. It depends upon whether this content model
// accepts empty content, which we determined earlier.
//
if (!childCount)
{
if (fEmptyOk)
return -1;
return 0;
}
//
// Lets loop through the children in the array and move our way
// through the states. Note that we use the fElemMap array to map
// an element index to a state index.
//
unsigned int curState = 0;
unsigned int childIndex = 0;
for (; childIndex < childCount; childIndex++)
{
// Get the current element index out
const unsigned int curElem = childIds[childIndex];
// Look up this child in our element map
unsigned int elemIndex = 0;
for (; elemIndex < fElemMapSize; elemIndex++)
{
if (fElemMap[elemIndex] == curElem)
break;
}
// If we didn't find it, then obviously not valid
if (elemIndex == fElemMapSize)
return childIndex;
//
// Look up the next state for this input symbol when in the
// current state.
//
curState = fTransTable[curState][elemIndex];
// If its not a legal transition then we failed.
if (curState == -1)
return childIndex;
}
//
// We transitioned all the way through the input list. However, that
// does not mean that we ended in a final state. So check whether
// our ending state is a final state.
//
if (!fFinalStateFlags[curState])
return childIndex;
return XMLValidator::Success;
}
// ---------------------------------------------------------------------------
// DFAContentModel: Private helper methods
// ---------------------------------------------------------------------------
void DFAContentModel::buildDFA()
{
unsigned int index;
//
// The first step we need to take is to rewrite the content model using
// our CMNode objects, and in the process get rid of any repetition short
// cuts, converting them into '*' style repetitions or getting rid of
// repetitions altogether.
//
// The conversions done are:
//
// x+ -> (x|x*)
// x? -> (x|epsilon)
//
// This is a relatively complex scenario. What is happening is that we
// create a top level binary node of which the special EOC value is set
// as the right side node. The the left side is set to the rewritten
// syntax tree. The source is the original content model info from the
// decl pool. The rewrite is done by buildSyntaxTree() which recurses the
// decl pool's content of the element and builds a new tree in the
// process.
//
// Note that, during this operation, we set each non-epsilon leaf node's
// DFA state position and count the number of such leafs, which is left
// in the fLeafCount member.
//
CMLeaf* nodeEOC = new CMLeaf(gEOCFakeId);
CMNode* nodeOrgContent = buildSyntaxTree(fElemDecl.getContentSpec());
fHeadNode = new CMBinaryOp
(
ContentSpecNode::Sequence
, nodeOrgContent
, nodeEOC
);
//
// And handle specially the EOC node, which also must be numbered and
// counted as a non-epsilon leaf node. It could not be handled in the
// above tree build because it was created before all that started. We
// save the EOC position since its used during the DFA building loop.
//
fEOCPos = fLeafCount;
nodeEOC->setPosition(fLeafCount++);
//
// Ok, so now we have to iterate the new tree and do a little more work
// now that we know the leaf count. One thing we need to do is to
// calculate the first and last position sets of each node. This is
// cached away in each of the nodes.
//
// Along the way we also set the leaf count in each node as the maximum
// state count. They must know this in order to create their first/last
// position sets.
//
// We also need to build an array of references to the non-epsilon
// leaf nodes. Since we iterate here the same way as we did during the
// initial tree build (which built their position numbers, we will put
// them in the array according to their position values.
//
fLeafList = new CMLeaf*[fLeafCount];
postTreeBuildInit(fHeadNode, 0);
//
// And, moving onward... We now need to build the follow position sets
// for all the nodes. So we allocate an array of pointers to state sets,
// one for each leaf node (i.e. each significant DFA position.)
//
fFollowList = new CMStateSet*[fLeafCount];
for (index = 0; index < fLeafCount; index++)
fFollowList[index] = new CMStateSet(fLeafCount);
calcFollowList(fHeadNode);
//
// Check the DFA for ambiguity. If it is ambiguous, then set the
// ambiguous flag member and keep going.
//
fIsAmbiguous = isAmbiguous();
//
// Check to see whether this content model can handle an empty content,
// which is something we need to optimize by looking now before we
// throw away the info that would tell us that.
//
// If the left node of the head (the top level of the original content)
// is nullable, then its true.
//
fEmptyOk = nodeOrgContent->isNullable();
//
// And finally the big push... Now we build the DFA using all the states
// and the tree we've built up. First we set up the various data
// structures we are going to use while we do this.
//
// First of all we need an array of unique element ids in our content
// model. For each transition table entry, we need a set of contiguous
// indices to represent the transitions for a particular input element.
// So we need to a zero based range of indexes that map to element types.
// This element map provides that mapping.
//
fElemMap = new unsigned int[fLeafCount];
fElemMapSize = 0;
for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++)
{
// Get the current leaf's element index
const unsigned int elemId = fLeafList[outIndex]->getId();
// See if the current leaf node's element index is in the list
unsigned int inIndex = 0;
for (; inIndex < fElemMapSize; inIndex++)
{
if (fElemMap[inIndex] == elemId)
break;
}
// If it was not in the list, then add it and bump the map size
if (inIndex == fElemMapSize)
fElemMap[fElemMapSize++] = elemId;
}
//
// Next lets create some arrays, some that that hold transient info
// during the DFA build and some that are permament. These are kind of
// sticky since we cannot know how big they will get, but we don't want
// to use any collection type classes because of performance.
//
// Basically they will probably be about fLeafCount*2 on average, but can
// be as large as 2^(fLeafCount*2), worst case. So we start with
// fLeafCount*4 as a middle ground. This will be very unlikely to ever
// have to expand though, it if does, the overhead will be somewhat ugly.
//
unsigned int curArraySize = fLeafCount * 4;
const CMStateSet** statesToDo = new const CMStateSet*[curArraySize];
fFinalStateFlags = new bool[curArraySize];
fTransTable = new unsigned int*[curArraySize];
//
// Ok we start with the initial set as the first pos set of the head node
// (which is the seq node that holds the content model and the EOC node.)
//
const CMStateSet* setT = new CMStateSet(fHeadNode->getFirstPos());
//
// Init our two state flags. Basically the unmarked state counter is
// always chasing the current state counter. When it catches up, that
// means we made a pass through that did not add any new states to the
// lists, at which time we are done. We could have used a expanding array
// of flags which we used to mark off states as we complete them, but
// this is easier though less readable maybe.
//
unsigned int unmarkedState = 0;
unsigned int curState = 0;
//
// Init the first transition table entry, and put the initial state
// into the states to do list, then bump the current state.
//
fTransTable[curState] = makeDefStateList();
statesToDo[curState] = setT;
curState++;
//
// Ok, almost done with the algorithm from hell... We now enter the
// loop where we go until the states done counter catches up with
// the states to do counter.
//
CMStateSet* newSet = 0;
while (unmarkedState < curState)
{
//
// Get the next unmarked state out of the list of states to do.
// And get the associated transition table entry.
//
setT = statesToDo[unmarkedState];
unsigned int* transEntry = fTransTable[unmarkedState];
// Mark this one final if it contains the EOC state
fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos);
// Bump up the unmarked state count, marking this state done
unmarkedState++;
// Loop through each possible input symbol in the element map
for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
{
//
// Build up a set of states which is the union of all of the
// follow sets of DFA positions that are in the current state. If
// we gave away the new set last time through then create a new
// one. Otherwise, zero out the existing one.
//
if (!newSet)
newSet = new CMStateSet(fLeafCount);
else
newSet->zeroBits();
for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
{
// If this leaf index (DFA position) is in the current set...
if (setT->getBit(leafIndex))
{
//
// If this leaf is the current input symbol, then we want
// to add its follow list to the set of states to transition
// to from the current state.
//
if (fLeafList[leafIndex]->getId() == fElemMap[elemIndex])
*newSet |= *fFollowList[leafIndex];
}
}
//
// If this new set is not empty, then see if its in the list
// of states to do. If not, then add it.
//
if (!newSet->isEmpty())
{
//
// Search the 'states to do' list to see if this new
// state set is already in there.
//
unsigned int stateIndex = 0;
for (; stateIndex < curState; stateIndex++)
{
if (*statesToDo[stateIndex] == *newSet)
break;
}
// If we did not find it, then add it
if (stateIndex == curState)
{
//
// Put this new state into the states to do and init
// a new entry at the same index in the transition
// table.
//
statesToDo[curState] = newSet;
fTransTable[curState] = makeDefStateList();
// We now have a new state to do so bump the count
curState++;
//
// Null out the new set to indicate we adopted it. This
// will cause the creation of a new set on the next time
// around the loop.
//
newSet = 0;
}
//
// Now set this state in the transition table's entry for this
// element (using its index), with the DFA state we will move
// to from the current state when we see this input element.
//
transEntry[elemIndex] = stateIndex;
// Expand the arrays if we're full
if (curState == curArraySize)
{
//
// Yikes, we overflowed the initial array size, so we've
// got to expand all of these arrays. So adjust up the
// size by 50% and allocate new arrays.
//
const unsigned int newSize = (unsigned int)(curArraySize * 1.5);
const CMStateSet** newToDo = new const CMStateSet*[newSize];
bool* newFinalFlags = new bool[newSize];
unsigned int** newTransTable = new unsigned int*[newSize];
// Copy over all of the existing content
for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++)
{
newToDo[expIndex] = statesToDo[expIndex];
newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
newTransTable[expIndex] = fTransTable[expIndex];
}
// Clean up the old stuff
delete [] statesToDo;
delete [] fFinalStateFlags;
delete [] fTransTable;
// Store the new array size and pointers
curArraySize = newSize;
statesToDo = newToDo;
fFinalStateFlags = newFinalFlags;
fTransTable = newTransTable;
}
}
}
}
// Store the current state count in the trans table size
fTransTableSize = curState;
// If the last temp set was not stored, then clean it up
if (newSet)
delete newSet;
//
// Now we can clean up all of the temporary data that was needed during
// DFA build.
//
delete fHeadNode;
fHeadNode = 0;
for (index = 0; index < fLeafCount; index++)
delete fFollowList[index];
for (index = 0; index < curState; index++)
delete (CMStateSet*)statesToDo[index];
delete [] fLeafList;
delete [] fFollowList;
delete [] statesToDo;
}
CMNode* DFAContentModel::buildSyntaxTree(const ContentSpecNode* const curNode)
{
// Initialize a return node pointer
CMNode* retNode = 0;
// Get the spec type of the passed node
const ContentSpecNode::NodeTypes curType = curNode->getType();
if (curType == ContentSpecNode::Leaf)
{
//
// Create a new leaf node, and pass it the current leaf count, which
// is its DFA state position. Bump the leaf count after storing it.
// This makes the positions zero based since we store first and then
// increment.
//
retNode = new CMLeaf(curNode->getElemId(), fLeafCount++);
}
else
{
//
// Its not a leaf, so we have to recurse its left and maybe right
// nodes. Save both values before we recurse and trash the node.
//
const ContentSpecNode* leftNode = curNode->getFirst();
const ContentSpecNode* rightNode = curNode->getSecond();
if ((curType == ContentSpecNode::Choice)
|| (curType == ContentSpecNode::Sequence))
{
//
// Recurse on both children, and return a binary op node with the
// two created sub nodes as its children. The node type is the
// same type as the source.
//
CMNode* newLeft = buildSyntaxTree(leftNode);
CMNode* newRight = buildSyntaxTree(rightNode);
retNode = new CMBinaryOp(curType, newLeft, newRight);
}
else if (curType == ContentSpecNode::ZeroOrMore)
{
// This one is fine as is, just change to our form
retNode = new CMUnaryOp(curType, buildSyntaxTree(leftNode));
}
else if (curType == ContentSpecNode::ZeroOrOne)
{
// Convert to (x|epsilon)
CMNode* newLeft = buildSyntaxTree(leftNode);
retNode = new CMBinaryOp
(
ContentSpecNode::Choice
, newLeft
, new CMLeaf(gEpsilonFakeId)
);
}
else if (curType == ContentSpecNode::OneOrMore)
{
// Convert to (x,x*)
CMNode* newLeft = buildSyntaxTree(leftNode);
CMNode* newLeft2 = buildSyntaxTree(leftNode);
retNode = new CMBinaryOp
(
ContentSpecNode::Sequence
, newLeft
, new CMUnaryOp(ContentSpecNode::ZeroOrMore, newLeft2)
);
}
else
{
ThrowXML(RuntimeException, XML4CExcepts::CM_UnknownCMSpecType);
}
}
return retNode;
}
void DFAContentModel::calcFollowList(CMNode* const curNode)
{
// Get the spec type of the passed node
const ContentSpecNode::NodeTypes curType = curNode->getType();
if (curType == ContentSpecNode::Choice)
{
// Just recurse
calcFollowList(((CMBinaryOp*)curNode)->getLeft());
calcFollowList(((CMBinaryOp*)curNode)->getRight());
}
else if (curType == ContentSpecNode::Sequence)
{
// Recurse before we process this node
calcFollowList(((CMBinaryOp*)curNode)->getLeft());
calcFollowList(((CMBinaryOp*)curNode)->getRight());
//
// Now handle our level. We use our left child's last pos set and our
// right child's first pos set, so get them now for convenience.
//
const CMStateSet& last = ((CMBinaryOp*)curNode)->getLeft()->getLastPos();
const CMStateSet& first = ((CMBinaryOp*)curNode)->getRight()->getFirstPos();
//
// Now, for every position which is in our left child's last set
// add all of the states in our right child's first set to the
// follow set for that position.
//
for (unsigned int index = 0; index < fLeafCount; index++)
{
if (last.getBit(index))
*fFollowList[index] |= first;
}
}
else if (curType == ContentSpecNode::ZeroOrMore)
{
// Recurse first
calcFollowList(((CMUnaryOp*)curNode)->getChild());
//
// Now handle our level. We use our own first and last position
// sets, so get them up front.
//
const CMStateSet& first = curNode->getFirstPos();
const CMStateSet& last = curNode->getLastPos();
//
// For every position which is in our last position set, add all
// of our first position states to the follow set for that
// position.
//
for (unsigned int index = 0; index < fLeafCount; index++)
{
if (last.getBit(index))
*fFollowList[index] |= first;
}
}
else if ((curType == ContentSpecNode::OneOrMore)
|| (curType == ContentSpecNode::ZeroOrOne))
{
ThrowXML1(RuntimeException, XML4CExcepts::CM_NotValidForSpecType, "CalcFollowList");
}
}
bool DFAContentModel::isAmbiguous() const
{
unsigned int index;
//
// Run through the current leaves and remember the min and max element
// ids. These will be used to create a bit set that will map to element
// ids (adjusted by the min value.)
//
// Do fLeafCount - 1 because we don't want to pick up the 'EOC' node,
// which has a very large dummy value in it.
//
unsigned int minId = 0xFFFFFFFF;
unsigned int maxId = 0;
for (index = 0; index < fLeafCount - 1; index++)
{
const unsigned int curId = fLeafList[index]->getId();
if (curId < minId)
minId = curId;
if (curId > maxId)
maxId = curId;
}
//
// Ok, now we can create a range value that represents the spread
// between the min/max element value. The range can never be larger
// than the number of elements in the content model, so it would
// never be outrageously large unless the content model was just
// totally pathological.
//
// With this number we can create a state set that has a bit per
// possible entry in the leaf array.
//
const unsigned int idRange = (maxId - minId) + 1;
CMStateSet idSet(idRange);
// Check each follow list
for (index = 0; index < fLeafCount - 1; index++)
{
// Get the current follow list set
const CMStateSet* curSet = fFollowList[index];
//
// For each possible leaf, we now go through and get the element
// id, adjust it to zero base it, then check it in the idSet. If
// that bit is already on, its ambiguous. Else set that bit and
// keep going.
//
for (unsigned int inner = 0; inner < fLeafCount - 1; inner++)
{
// If this bit is not on in the follow set, skip to next
if (!curSet->getBit(inner))
continue;
// Get the adjusted id value for the element in the follow list
const unsigned int adjustedId = fLeafList[inner]->getId() - minId;
if (idSet.getBit(adjustedId))
return true;
idSet.setBit(adjustedId);
}
// Zero out the bitset for the next round
idSet.zeroBits();
}
return false;
}
//
// -1 is used to represent bad transitions in the transition table entry for
// each state. So each entry is initialized to an all -1 array. This method
// creates a new entry and initializes it.
//
unsigned int* DFAContentModel::makeDefStateList() const
{
unsigned int* retArray = new unsigned int[fElemMapSize];
for (unsigned int index = 0; index < fElemMapSize; index++)
retArray[index] = -1;
return retArray;
}
int DFAContentModel::postTreeBuildInit( CMNode* const nodeCur
, const unsigned int curIndex)
{
// Set the maximum states on this node
nodeCur->setMaxStates(fLeafCount);
// Get the spec type of the passed node
const ContentSpecNode::NodeTypes curType = nodeCur->getType();
// Get a copy of the index we can modify
unsigned int newIndex = curIndex;
// Recurse as required
if ((curType == ContentSpecNode::Choice)
|| (curType == ContentSpecNode::Sequence))
{
newIndex = postTreeBuildInit(((CMBinaryOp*)nodeCur)->getLeft(), newIndex);
newIndex = postTreeBuildInit(((CMBinaryOp*)nodeCur)->getRight(), newIndex);
}
else if (curType == ContentSpecNode::ZeroOrMore)
{
newIndex = postTreeBuildInit(((CMUnaryOp*)nodeCur)->getChild(), newIndex);
}
else if (curType == ContentSpecNode::Leaf)
{
//
// Put this node in the leaf list at the current index if its
// a non-epsilon leaf.
//
if (((CMLeaf*)nodeCur)->getId() != gEpsilonFakeId)
fLeafList[newIndex++] = (CMLeaf*)nodeCur;
}
else
{
ThrowXML(RuntimeException, XML4CExcepts::CM_UnknownCMSpecType);
}
return newIndex;
}