| /* |
| * The Apache Software License, Version 1.1 |
| * |
| * |
| * Copyright (c) 1999,2000 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Xerces" and "Apache Software Foundation" must |
| * not be used to endorse or promote products derived from this |
| * software without prior written permission. For written |
| * permission, please contact apache@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache", |
| * nor may "Apache" appear in their name, without prior written |
| * permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation and was |
| * originally based on software copyright (c) 1999, International |
| * Business Machines, Inc., http://www.apache.org. For more |
| * information on the Apache Software Foundation, please see |
| * <http://www.apache.org/>. |
| */ |
| |
| package org.apache.xerces.validators.common; |
| |
| import org.apache.xerces.framework.XMLContentSpec; |
| import org.apache.xerces.utils.ImplementationMessages; |
| import org.apache.xerces.utils.QName; |
| import org.apache.xerces.validators.schema.EquivClassComparator; |
| //import org.apache.xerces.utils.StringPool; |
| |
| /** |
| * DFAContentModel is the derivative of ContentModel that does |
| * all of the non-trivial element content validation. This class does |
| * the conversion from the regular expression to the DFA that |
| * it then uses in its validation algorithm. |
| * <p> |
| * <b>Note:</b> Upstream work insures that this class will never see |
| * a content model with PCDATA in it. Any model with PCDATA is 'mixed' |
| * and is handled via the MixedContentModel class since mixed models |
| * are very constrained in form and easily handled via a special case. |
| * This also makes implementation of this class much easier. |
| * |
| * @version $Id$ |
| */ |
| public class DFAContentModel |
| implements XMLContentModel { |
| |
| // |
| // Constants |
| // |
| // special strings |
| |
| /** Epsilon string. */ |
| //private static final String fEpsilonString = "<<CMNODE_EPSILON>>"; |
| private static final int EPSILON = -2; |
| |
| /** End-of-content string. */ |
| //private static final String fEOCString = "<<CMNODE_EOC>>"; |
| private static final int EOC = -3; |
| |
| // debugging |
| |
| /** Set to true to debug content model validation. */ |
| private static final boolean DEBUG_VALIDATE_CONTENT = false; |
| |
| // |
| // Data |
| // |
| |
| /* this is the EquivClassComparator object */ |
| private EquivClassComparator comparator = null; |
| |
| /** |
| * This is the map of unique input symbol elements to indices into |
| * each state's per-input symbol transition table entry. This is part |
| * of the built DFA information that must be kept around to do the |
| * actual validation. |
| */ |
| private QName fElemMap[] = null; |
| |
| /** |
| * This is a map of whether the element map contains information |
| * related to ANY models. |
| */ |
| private int fElemMapType[] = null; |
| |
| /** The element map size. */ |
| private int fElemMapSize = 0; |
| |
| /** Boolean to allow DTDs to validate even with namespace support. */ |
| private boolean fDTD; |
| |
| /** |
| * The string index for the 'end of content' string that we add to |
| * the string pool. This is used as the special name of an element |
| * that represents the end of the syntax tree. |
| */ |
| private int fEOCIndex = 0; |
| |
| /** |
| * The NFA position of the special EOC (end of content) node. This |
| * is saved away since it's used during the DFA build. |
| */ |
| private int fEOCPos = 0; |
| |
| /** |
| * The string index for the 'epsilon' string that we add to the |
| * string pool. This represents epsilon node transitions in the |
| * syntax tree. |
| */ |
| private int fEpsilonIndex = 0; |
| |
| /** |
| * This is an array of booleans, one per state (there are |
| * fTransTableSize states in the DFA) that indicates whether that |
| * state is a final state. |
| */ |
| private boolean fFinalStateFlags[] = null; |
| |
| /** |
| * The list of follow positions for each NFA position (i.e. for each |
| * non-epsilon leaf node.) This is only used during the building of |
| * the DFA, and is let go afterwards. |
| */ |
| private CMStateSet fFollowList[] = null; |
| |
| /** |
| * This is the head node of our intermediate representation. It is |
| * only non-null during the building of the DFA (just so that it |
| * does not have to be passed all around.) Once the DFA is built, |
| * this is no longer required so its nulled out. |
| */ |
| private CMNode fHeadNode = null; |
| |
| /** |
| * The count of leaf nodes. This is an important number that set some |
| * limits on the sizes of data structures in the DFA process. |
| */ |
| private int fLeafCount = 0; |
| |
| /** |
| * An array of non-epsilon leaf nodes, which is used during the DFA |
| * build operation, then dropped. |
| */ |
| private CMLeaf fLeafList[] = null; |
| |
| /** Array mapping ANY types to the leaf list. */ |
| private int fLeafListType[] = null; |
| |
| private ContentLeafNameTypeVector fLeafNameTypeVector = null; |
| |
| /** |
| * The string pool of our parser session. This is set during construction |
| * and kept around. |
| */ |
| //private StringPool fStringPool = null; |
| |
| /** |
| * This is the transition table that is the main by product of all |
| * of the effort here. It is an array of arrays of ints. The first |
| * dimension is the number of states we end up with in the DFA. The |
| * second dimensions is the number of unique elements in the content |
| * model (fElemMapSize). Each entry in the second dimension indicates |
| * the new state given that input for the first dimension's start |
| * state. |
| * <p> |
| * The fElemMap array handles mapping from element indexes to |
| * positions in the second dimension of the transition table. |
| */ |
| private int fTransTable[][] = null; |
| |
| /** |
| * The number of valid entries in the transition table, and in the other |
| * related tables such as fFinalStateFlags. |
| */ |
| private int fTransTableSize = 0; |
| |
| /** |
| * Flag that indicates that even though we have a "complicated" |
| * content model, it is valid to have no content. In other words, |
| * all parts of the content model are optional. For example: |
| * <pre> |
| * <!ELEMENT AllOptional (Optional*,NotRequired?)> |
| * </pre> |
| */ |
| private boolean fEmptyContentIsValid = false; |
| |
| // temp variables |
| |
| /** Temporary qualified name. */ |
| private QName fQName = new QName(); |
| |
| // |
| // Constructors |
| // |
| |
| /** |
| * Constructs a DFA content model. |
| * |
| * @param stringPool The string pool. |
| * @param syntaxTree The syntax tree of the content model. |
| * @param leafCount The number of leaves. |
| * |
| * @exception CMException Thrown if DMA can't be built. |
| */ |
| |
| // public DFAContentModel(StringPool stringPool, |
| public DFAContentModel( CMNode syntaxTree, |
| int leafCount) throws CMException { |
| this(syntaxTree, leafCount, false); |
| } |
| |
| /** |
| * Constructs a DFA content model. |
| * |
| * @param stringPool The string pool. |
| * @param syntaxTree The syntax tree of the content model. |
| * @param leafCount The number of leaves. |
| * |
| * @exception CMException Thrown if DMA can't be built. |
| */ |
| |
| // public DFAContentModel(StringPool stringPool, |
| public DFAContentModel( CMNode syntaxTree, |
| int leafCount, boolean dtd) throws CMException { |
| |
| // Store away our index and pools in members |
| //fStringPool = stringPool; |
| fLeafCount = leafCount; |
| |
| // |
| // Create some string pool indexes that represent the names of some |
| // magical nodes in the syntax tree. |
| // |
| /*** Defect 945 *** |
| if (fEpsilonString == null) |
| { |
| fEpsilonString = new String("<<CMNODE_EPSILON>>"); |
| fEpsilonString.intern(); |
| fEOCString = new String("<<CMNODE_EOC>>"); |
| fEOCString.intern(); |
| } |
| /***/ |
| |
| // fEpsilonIndex = fStringPool.addSymbol(fEpsilonString); |
| // fEOCIndex = fStringPool.addSymbol(fEOCString); |
| |
| fEpsilonIndex = EPSILON; |
| fEOCIndex = EOC; |
| |
| fDTD = dtd; |
| |
| // |
| // Ok, so lets grind through the building of the DFA. This method |
| // handles the high level logic of the algorithm, but it uses a |
| // number of helper classes to do its thing. |
| // |
| // In order to avoid having hundreds of references to the error and |
| // string handlers around, this guy and all of his helper classes |
| // just throw a simple exception and we then pass it along. |
| // |
| buildDFA(syntaxTree); |
| } |
| |
| // |
| // XMLContentModel methods |
| // |
| |
| /** |
| * Check that the specified content is valid according to this |
| * content model. This method can also be called to do 'what if' |
| * testing of content models just to see if they would be valid. |
| * <p> |
| * A value of -1 in the children array indicates a PCDATA node. All other |
| * indexes will be positive and represent child elements. The count can be |
| * zero, since some elements have the EMPTY content model and that must be |
| * confirmed. |
| * |
| * @param children The children of this element. Each integer is an index within |
| * the <code>StringPool</code> of the child element name. An index |
| * of -1 is used to indicate an occurrence of non-whitespace character |
| * data. |
| * @param offset Offset into the array where the children starts. |
| * @param length The number of entries in the <code>children</code> array. |
| * |
| * @return The value -1 if fully valid, else the 0 based index of the child |
| * that first failed. If the value returned is equal to the number |
| * of children, then the specified children are valid but additional |
| * content is required to reach a valid ending state. |
| * |
| * @exception CMException Thrown on error. |
| */ |
| public int validateContent(QName children[], int offset, int length) throws CMException { |
| |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("DFAContentModel#validateContent"); |
| |
| // |
| // A DFA content model must *always* have at least 1 child |
| // so a failure is given if no children present. |
| // |
| // Defect 782: This is an incorrect statement because a DFA |
| // content model is also used for constructions such as: |
| // |
| // (Optional*,NotRequired?) |
| // |
| // where a perfectly valid content would be NO CHILDREN. |
| // Therefore, if there are no children, we must check to |
| // see if the CMNODE_EOC marker is a valid start state! -Ac |
| // |
| if (length == 0) { |
| if (DEBUG_VALIDATE_CONTENT) { |
| System.out.println("!!! no children"); |
| System.out.println("elemMap="+fElemMap); |
| for (int i = 0; i < fElemMap.length; i++) { |
| int uriIndex = fElemMap[i].uri; |
| int localpartIndex = fElemMap[i].localpart; |
| /* |
| System.out.println("fElemMap["+i+"]="+uriIndex+","+ |
| localpartIndex+" ("+ |
| fStringPool.toString(uriIndex)+", "+ |
| fStringPool.toString(localpartIndex)+ |
| ')'); |
| */ |
| } |
| System.out.println("EOCIndex="+fEOCIndex); |
| } |
| |
| return fEmptyContentIsValid ? -1 : 0; |
| |
| } // if child count == 0 |
| |
| // |
| // Lets loop through the children in the array and move our way |
| // through the states. Note that we use the fElemMap array to map |
| // an element index to a state index. |
| // |
| int curState = 0; |
| for (int childIndex = 0; childIndex < length; childIndex++) |
| { |
| // Get the current element index out |
| final QName curElem = children[offset + childIndex]; |
| //System.out.println("children["+(offset+childIndex)+"]: "+curElem); |
| |
| // Look up this child in our element map |
| int elemIndex = 0; |
| for (; elemIndex < fElemMapSize; elemIndex++) |
| { |
| int type = fElemMapType[elemIndex] & 0x0f ; |
| if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) { |
| //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]); |
| if (fDTD) { |
| if (fElemMap[elemIndex].rawname == curElem.rawname) { |
| break; |
| } |
| } |
| else { |
| if (fElemMap[elemIndex].uri==curElem.uri |
| && fElemMap[elemIndex].localpart == curElem.localpart) |
| break; |
| } |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) { |
| int uri = fElemMap[elemIndex].uri; |
| if (uri == -1 || uri == curElem.uri) { |
| break; |
| } |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) { |
| if (curElem.uri == -1) { |
| break; |
| } |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) { |
| if (fElemMap[elemIndex].uri != curElem.uri) { |
| break; |
| } |
| } |
| } |
| |
| // If we didn't find it, then obviously not valid |
| if (elemIndex == fElemMapSize) { |
| if (DEBUG_VALIDATE_CONTENT) { |
| System.out.println("!!! didn't find it"); |
| |
| System.out.println("curElem : " +curElem ); |
| for (int i=0; i<fElemMapSize; i++) { |
| System.out.println("fElemMap["+i+"] = " +fElemMap[i] ); |
| System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] ); |
| } |
| } |
| |
| return childIndex; |
| } |
| |
| // |
| // Look up the next state for this input symbol when in the |
| // current state. |
| // |
| curState = fTransTable[curState][elemIndex]; |
| |
| // If its not a legal transition, then invalid |
| if (curState == -1) { |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("!!! not a legal transition"); |
| return childIndex; |
| } |
| } |
| |
| // |
| // We transitioned all the way through the input list. However, that |
| // does not mean that we ended in a final state. So check whether |
| // our ending state is a final state. |
| // |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("curState="+curState+", childCount="+length); |
| if (!fFinalStateFlags[curState]) |
| return length; |
| |
| // success! |
| return -1; |
| } |
| |
| private boolean isEqual(QName name1, QName name2) { |
| return name1.localpart == name2.localpart && |
| name1.uri == name2.uri; |
| } |
| |
| public int validateContentSpecial(QName children[], int offset, int length) throws Exception{ |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("DFAContentModel#validateContentSpecial"); |
| |
| if (comparator==null) { |
| return validateContent(children,offset, length); |
| } |
| |
| |
| if (length == 0) { |
| if (DEBUG_VALIDATE_CONTENT) { |
| System.out.println("!!! no children"); |
| System.out.println("elemMap="+fElemMap); |
| for (int i = 0; i < fElemMap.length; i++) { |
| int uriIndex = fElemMap[i].uri; |
| int localpartIndex = fElemMap[i].localpart; |
| } |
| System.out.println("EOCIndex="+fEOCIndex); |
| } |
| |
| return fEmptyContentIsValid ? -1 : 0; |
| |
| } // if child count == 0 |
| |
| // |
| // Lets loop through the children in the array and move our way |
| // through the states. Note that we use the fElemMap array to map |
| // an element index to a state index. |
| // |
| int curState = 0; |
| for (int childIndex = 0; childIndex < length; childIndex++) |
| { |
| // Get the current element index out |
| final QName curElem = children[offset + childIndex]; |
| |
| // Look up this child in our element map |
| int elemIndex = 0; |
| for (; elemIndex < fElemMapSize; elemIndex++) |
| { |
| int type = fElemMapType[elemIndex] & 0x0f; |
| if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) { |
| if (comparator.isEquivalentTo(curElem,fElemMap[elemIndex] ) ) |
| break; |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) { |
| int uri = fElemMap[elemIndex].uri; |
| if (uri == -1 || uri == curElem.uri) { |
| break; |
| } |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) { |
| if (curElem.uri == -1) { |
| break; |
| } |
| } |
| else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) { |
| if (fElemMap[elemIndex].uri != curElem.uri) { |
| break; |
| } |
| } |
| } |
| |
| // If we didn't find it, then obviously not valid |
| if (elemIndex == fElemMapSize) { |
| if (DEBUG_VALIDATE_CONTENT) { |
| System.out.println("!!! didn't find it"); |
| |
| System.out.println("curElem : " +curElem ); |
| for (int i=0; i<fElemMapSize; i++) { |
| System.out.println("fElemMap["+i+"] = " +fElemMap[i] ); |
| System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] ); |
| } |
| } |
| |
| return childIndex; |
| } |
| |
| // |
| // Look up the next state for this input symbol when in the |
| // current state. |
| // |
| curState = fTransTable[curState][elemIndex]; |
| |
| // If its not a legal transition, then invalid |
| if (curState == -1) { |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("!!! not a legal transition"); |
| return childIndex; |
| } |
| } |
| |
| // |
| // We transitioned all the way through the input list. However, that |
| // does not mean that we ended in a final state. So check whether |
| // our ending state is a final state. |
| // |
| if (DEBUG_VALIDATE_CONTENT) |
| System.out.println("curState="+curState+", childCount="+length); |
| if (!fFinalStateFlags[curState]) |
| return length; |
| |
| // success! |
| return -1; |
| } |
| |
| public void setEquivClassComparator(EquivClassComparator comparator) { |
| this.comparator = comparator; |
| } |
| |
| /** |
| * Returns information about which elements can be placed at a particular point |
| * in the passed element's content model. |
| * <p> |
| * Note that the incoming content model to test must be valid at least up to |
| * the insertion point. If not, then -1 will be returned and the info object |
| * will not have been filled in. |
| * <p> |
| * If, on return, the info.isValidEOC flag is set, then the 'insert after' |
| * element is a valid end of content. In other words, nothing needs to be |
| * inserted after it to make the parent element's content model valid. |
| * |
| * @param fullyValid Only return elements that can be inserted and still |
| * maintain the validity of subsequent elements past the |
| * insertion point (if any). If the insertion point is at |
| * the end, and this is true, then only elements that can |
| * be legal final states will be returned. |
| * @param info An object that contains the required input data for the method, |
| * and which will contain the output information if successful. |
| * |
| * @return The value -1 if fully valid, else the 0 based index of the child |
| * that first failed before the insertion point. If the value |
| * returned is equal to the number of children, then the specified |
| * children are valid but additional content is required to reach a |
| * valid ending state. |
| * |
| * @see InsertableElementsInfo |
| */ |
| public int whatCanGoHere(boolean fullyValid, |
| InsertableElementsInfo info) throws CMException { |
| |
| // |
| // First, lets make sure that the passed in current content is valid |
| // up to the insert point. |
| // |
| int curState = 0; |
| for (int childIndex = 0; childIndex < info.insertAt; childIndex++) |
| { |
| // Get the current element index out |
| final QName curElem = info.curChildren[childIndex]; |
| |
| // Look up this child in our element map |
| int elemIndex = 0; |
| for (; elemIndex < fElemMapSize; elemIndex++) |
| { |
| if (fElemMap[elemIndex].uri == curElem.uri && |
| fElemMap[elemIndex].localpart == curElem.localpart) |
| break; |
| } |
| |
| // If we didn't find it, then not valid so return failure index |
| if (elemIndex == fElemMapSize) |
| return childIndex; |
| |
| // |
| // Look up the next state for this input symbol when in the |
| // current state. |
| // |
| curState = fTransTable[curState][elemIndex]; |
| |
| // If its not a legal transition, then invalid |
| if (curState == -1) |
| return childIndex; |
| } |
| |
| // |
| // If we got here, then curState is set to the state that would be |
| // the transition before the insertion point. We let this sit until |
| // below, where it will be needed. |
| // |
| final int insertState = curState; |
| |
| // |
| // Set any stuff we can know right off the bat for all cases. We know |
| // that this content model will never get PCData nodes because that |
| // is a mixed model. We can also set the valid EOC flag at this point |
| // since its just based on the state we ended in at the insert point. |
| // |
| info.canHoldPCData = false; |
| info.isValidEOC = fFinalStateFlags[insertState]; |
| |
| // |
| // Set the results count member and then see if we need to reallocate |
| // the outgoing arrays. |
| // |
| info.resultsCount = fElemMapSize; |
| |
| if ((info.results == null) || (info.results.length < info.resultsCount)) |
| info.results = new boolean[info.resultsCount]; |
| |
| if ((info.possibleChildren == null) |
| || (info.possibleChildren.length < info.resultsCount)) |
| { |
| info.possibleChildren = new QName[info.resultsCount]; |
| for (int i = 0; i < info.possibleChildren.length; i++) { |
| info.possibleChildren[i] = new QName(); |
| } |
| } |
| |
| // |
| // Fill in the possible children array, from our array. For each one |
| // of them, see if there is a valid transition from our insert at |
| // state on that input. Mark the results index for that child according |
| // to whether there is a transition or not. |
| // |
| for (int index = 0; index < fElemMapSize; index++) |
| { |
| info.possibleChildren[index].setValues(fElemMap[index]); |
| info.results[index] = (fTransTable[insertState][index] != -1); |
| } |
| |
| // |
| // If the fully valid parameter is set, then we have to go through |
| // the grunt work of plugging in each possible insertable element |
| // and running the DFA from that point to see if it would create a |
| // fully valid content model. |
| // |
| // <TBD> When/if the validator is changed to be stateful, then change |
| // this stuff to start the exploratory validation at the insert state, |
| // not from the start each time. |
| // |
| if (fullyValid) |
| { |
| for (int index = 0; index < info.resultsCount; index++) |
| { |
| // Don't need to consider this one since its not insertable |
| if (!info.results[index]) |
| continue; |
| |
| // Stick this element into the insert at spot |
| info.curChildren[info.insertAt] = info.possibleChildren[index]; |
| |
| // And validate it. If it fails, then this one loses |
| if (validateContent(info.curChildren, 0, info.childCount) != -1) |
| info.results[index] = false; |
| } |
| } |
| |
| return -1; |
| } |
| |
| public ContentLeafNameTypeVector getContentLeafNameTypeVector() { |
| return fLeafNameTypeVector; |
| } |
| |
| // |
| // Private methods |
| // |
| |
| /** |
| * Builds the internal DFA transition table from the given syntax tree. |
| * |
| * @param syntaxTree The syntax tree. |
| * |
| * @exception CMException Thrown if DFA cannot be built. |
| */ |
| private void buildDFA(CMNode syntaxTree) throws CMException |
| { |
| // |
| // The first step we need to take is to rewrite the content model |
| // using our CMNode objects, and in the process get rid of any |
| // repetition short cuts, converting them into '*' style repetitions |
| // or getting rid of repetitions altogether. |
| // |
| // The conversions done are: |
| // |
| // x+ -> (x|x*) |
| // x? -> (x|epsilon) |
| // |
| // This is a relatively complex scenario. What is happening is that |
| // we create a top level binary node of which the special EOC value |
| // is set as the right side node. The the left side is set to the |
| // rewritten syntax tree. The source is the original content model |
| // info from the decl pool. The rewrite is done by buildSyntaxTree() |
| // which recurses the decl pool's content of the element and builds |
| // a new tree in the process. |
| // |
| // Note that, during this operation, we set each non-epsilon leaf |
| // node's DFA state position and count the number of such leafs, which |
| // is left in the fLeafCount member. |
| // |
| // The nodeTmp object is passed in just as a temp node to use during |
| // the recursion. Otherwise, we'd have to create a new node on every |
| // level of recursion, which would be piggy in Java (as is everything |
| // for that matter.) |
| // |
| fQName.setValues(-1, fEOCIndex, fEOCIndex); |
| CMLeaf nodeEOC = new CMLeaf(fQName); |
| fHeadNode = new CMBinOp |
| ( |
| XMLContentSpec.CONTENTSPECNODE_SEQ |
| , syntaxTree |
| , nodeEOC |
| ); |
| |
| // |
| // And handle specially the EOC node, which also must be numbered |
| // and counted as a non-epsilon leaf node. It could not be handled |
| // in the above tree build because it was created before all that |
| // started. We save the EOC position since its used during the DFA |
| // building loop. |
| // |
| fEOCPos = fLeafCount; |
| nodeEOC.setPosition(fLeafCount++); |
| |
| // |
| // Ok, so now we have to iterate the new tree and do a little more |
| // work now that we know the leaf count. One thing we need to do is |
| // to calculate the first and last position sets of each node. This |
| // is cached away in each of the nodes. |
| // |
| // Along the way we also set the leaf count in each node as the |
| // maximum state count. They must know this in order to create their |
| // first/last pos sets. |
| // |
| // We also need to build an array of references to the non-epsilon |
| // leaf nodes. Since we iterate it in the same way as before, this |
| // will put them in the array according to their position values. |
| // |
| fLeafList = new CMLeaf[fLeafCount]; |
| fLeafListType = new int[fLeafCount]; |
| postTreeBuildInit(fHeadNode, 0); |
| |
| // |
| // And, moving onward... We now need to build the follow position |
| // sets for all the nodes. So we allocate an array of state sets, |
| // one for each leaf node (i.e. each DFA position.) |
| // |
| fFollowList = new CMStateSet[fLeafCount]; |
| for (int index = 0; index < fLeafCount; index++) |
| fFollowList[index] = new CMStateSet(fLeafCount); |
| calcFollowList(fHeadNode); |
| // |
| // And finally the big push... Now we build the DFA using all the |
| // states and the tree we've built up. First we set up the various |
| // data structures we are going to use while we do this. |
| // |
| // First of all we need an array of unique element names in our |
| // content model. For each transition table entry, we need a set of |
| // contiguous indices to represent the transitions for a particular |
| // input element. So we need to a zero based range of indexes that |
| // map to element types. This element map provides that mapping. |
| // |
| fElemMap = new QName[fLeafCount]; |
| fElemMapType = new int[fLeafCount]; |
| fElemMapSize = 0; |
| for (int outIndex = 0; outIndex < fLeafCount; outIndex++) |
| { |
| fElemMap[outIndex] = new QName(); |
| |
| if ( (fLeafListType[outIndex] & 0x0f) != 0 ) { |
| if (fLeafNameTypeVector == null) { |
| fLeafNameTypeVector = new ContentLeafNameTypeVector(); |
| } |
| } |
| |
| // Get the current leaf's element index |
| final QName element = fLeafList[outIndex].getElement(); |
| |
| // See if the current leaf node's element index is in the list |
| int inIndex = 0; |
| for (; inIndex < fElemMapSize; inIndex++) |
| { |
| if (fDTD) { |
| if (fElemMap[inIndex].rawname == element.rawname) { |
| break; |
| } |
| } |
| else { |
| if (fElemMap[inIndex].uri == element.uri && |
| fElemMap[inIndex].localpart == element.localpart && |
| fElemMapType[inIndex] == fLeafListType[outIndex] ) |
| break; |
| } |
| } |
| |
| // If it was not in the list, then add it, if not the EOC node |
| if (inIndex == fElemMapSize) { |
| //if (fDTD) { |
| // fElemMap[fElemMapSize].setValues(-1, element.rawname, element.rawname, -1); |
| //} |
| //else { |
| fElemMap[fElemMapSize].setValues(element); |
| //} |
| fElemMapType[fElemMapSize] = fLeafListType[outIndex]; |
| fElemMapSize++; |
| } |
| } |
| // set up the fLeafNameTypeVector object if there is one. |
| if (fLeafNameTypeVector != null) { |
| fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize); |
| } |
| |
| // |
| // Next lets create some arrays, some that that hold transient |
| // information during the DFA build and some that are permament. |
| // These are kind of sticky since we cannot know how big they will |
| // get, but we don't want to use any Java collections because of |
| // performance. |
| // |
| // Basically they will probably be about fLeafCount*2 on average, |
| // but can be as large as 2^(fLeafCount*2), worst case. So we start |
| // with fLeafCount*4 as a middle ground. This will be very unlikely |
| // to ever have to expand, though it if does, the overhead will be |
| // somewhat ugly. |
| // |
| int curArraySize = fLeafCount * 4; |
| CMStateSet[] statesToDo = new CMStateSet[curArraySize]; |
| fFinalStateFlags = new boolean[curArraySize]; |
| fTransTable = new int[curArraySize][]; |
| |
| // |
| // Ok we start with the initial set as the first pos set of the |
| // head node (which is the seq node that holds the content model |
| // and the EOC node.) |
| // |
| CMStateSet setT = fHeadNode.firstPos(); |
| |
| // |
| // Init our two state flags. Basically the unmarked state counter |
| // is always chasing the current state counter. When it catches up, |
| // that means we made a pass through that did not add any new states |
| // to the lists, at which time we are done. We could have used a |
| // expanding array of flags which we used to mark off states as we |
| // complete them, but this is easier though less readable maybe. |
| // |
| int unmarkedState = 0; |
| int curState = 0; |
| |
| // |
| // Init the first transition table entry, and put the initial state |
| // into the states to do list, then bump the current state. |
| // |
| fTransTable[curState] = makeDefStateList(); |
| statesToDo[curState] = setT; |
| curState++; |
| |
| // |
| // Ok, almost done with the algorithm... We now enter the |
| // loop where we go until the states done counter catches up with |
| // the states to do counter. |
| // |
| while (unmarkedState < curState) |
| { |
| // |
| // Get the first unmarked state out of the list of states to do. |
| // And get the associated transition table entry. |
| // |
| setT = statesToDo[unmarkedState]; |
| int[] transEntry = fTransTable[unmarkedState]; |
| |
| // Mark this one final if it contains the EOC state |
| fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos); |
| |
| // Bump up the unmarked state count, marking this state done |
| unmarkedState++; |
| |
| // Loop through each possible input symbol in the element map |
| CMStateSet newSet = null; |
| for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) |
| { |
| // |
| // Build up a set of states which is the union of all of |
| // the follow sets of DFA positions that are in the current |
| // state. If we gave away the new set last time through then |
| // create a new one. Otherwise, zero out the existing one. |
| // |
| if (newSet == null) |
| newSet = new CMStateSet(fLeafCount); |
| else |
| newSet.zeroBits(); |
| |
| for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) |
| { |
| // If this leaf index (DFA position) is in the current set... |
| if (setT.getBit(leafIndex)) |
| { |
| // |
| // If this leaf is the current input symbol, then we |
| // want to add its follow list to the set of states to |
| // transition to from the current state. |
| // |
| final QName leaf = fLeafList[leafIndex].getElement(); |
| final QName element = fElemMap[elemIndex]; |
| if (fDTD) { |
| if (leaf.rawname == element.rawname) { |
| newSet.union(fFollowList[leafIndex]); |
| } |
| } |
| else { |
| if (leaf.uri == element.uri && |
| leaf.localpart == element.localpart) |
| newSet.union(fFollowList[leafIndex]); |
| } |
| } |
| } |
| |
| // |
| // If this new set is not empty, then see if its in the list |
| // of states to do. If not, then add it. |
| // |
| if (!newSet.isEmpty()) |
| { |
| // |
| // Search the 'states to do' list to see if this new |
| // state set is already in there. |
| // |
| int stateIndex = 0; |
| for (; stateIndex < curState; stateIndex++) |
| { |
| if (statesToDo[stateIndex].isSameSet(newSet)) |
| break; |
| } |
| |
| // If we did not find it, then add it |
| if (stateIndex == curState) |
| { |
| // |
| // Put this new state into the states to do and init |
| // a new entry at the same index in the transition |
| // table. |
| // |
| statesToDo[curState] = newSet; |
| fTransTable[curState] = makeDefStateList(); |
| |
| // We now have a new state to do so bump the count |
| curState++; |
| |
| // |
| // Null out the new set to indicate we adopted it. |
| // This will cause the creation of a new set on the |
| // next time around the loop. |
| // |
| newSet = null; |
| } |
| |
| // |
| // Now set this state in the transition table's entry |
| // for this element (using its index), with the DFA |
| // state we will move to from the current state when we |
| // see this input element. |
| // |
| transEntry[elemIndex] = stateIndex; |
| |
| // Expand the arrays if we're full |
| if (curState == curArraySize) |
| { |
| // |
| // Yikes, we overflowed the initial array size, so |
| // we've got to expand all of these arrays. So adjust |
| // up the size by 50% and allocate new arrays. |
| // |
| final int newSize = (int)(curArraySize * 1.5); |
| CMStateSet[] newToDo = new CMStateSet[newSize]; |
| boolean[] newFinalFlags = new boolean[newSize]; |
| int[][] newTransTable = new int[newSize][]; |
| |
| // Copy over all of the existing content |
| for (int expIndex = 0; expIndex < curArraySize; expIndex++) |
| { |
| newToDo[expIndex] = statesToDo[expIndex]; |
| newFinalFlags[expIndex] = fFinalStateFlags[expIndex]; |
| newTransTable[expIndex] = fTransTable[expIndex]; |
| } |
| |
| // Store the new array size |
| curArraySize = newSize; |
| statesToDo = newToDo; |
| fFinalStateFlags = newFinalFlags; |
| fTransTable = newTransTable; |
| } |
| } |
| } |
| } |
| |
| // Check to see if we can set the fEmptyContentIsValid flag. |
| fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable(); |
| |
| // |
| // And now we can say bye bye to the temp representation since we've |
| // built the DFA. |
| // |
| if (DEBUG_VALIDATE_CONTENT) |
| dumpTree(fHeadNode, 0); |
| fHeadNode = null; |
| fLeafList = null; |
| fFollowList = null; |
| |
| } |
| |
| /** |
| * Calculates the follow list of the current node. |
| * |
| * @param nodeCur The curent node. |
| * |
| * @exception CMException Thrown if follow list cannot be calculated. |
| */ |
| private void calcFollowList(CMNode nodeCur) throws CMException |
| { |
| // Recurse as required |
| if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE) |
| { |
| // Recurse only |
| calcFollowList(((CMBinOp)nodeCur).getLeft()); |
| calcFollowList(((CMBinOp)nodeCur).getRight()); |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ) |
| { |
| // Recurse first |
| calcFollowList(((CMBinOp)nodeCur).getLeft()); |
| calcFollowList(((CMBinOp)nodeCur).getRight()); |
| |
| // |
| // Now handle our level. We use our left child's last pos |
| // set and our right child's first pos set, so go ahead and |
| // get them ahead of time. |
| // |
| final CMStateSet last = ((CMBinOp)nodeCur).getLeft().lastPos(); |
| final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos(); |
| |
| // |
| // Now, for every position which is in our left child's last set |
| // add all of the states in our right child's first set to the |
| // follow set for that position. |
| // |
| for (int index = 0; index < fLeafCount; index++) |
| { |
| if (last.getBit(index)) |
| fFollowList[index].union(first); |
| } |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE) |
| { |
| // Recurse first |
| calcFollowList(((CMUniOp)nodeCur).getChild()); |
| |
| // |
| // Now handle our level. We use our own first and last position |
| // sets, so get them up front. |
| // |
| final CMStateSet first = nodeCur.firstPos(); |
| final CMStateSet last = nodeCur.lastPos(); |
| |
| // |
| // For every position which is in our last position set, add all |
| // of our first position states to the follow set for that |
| // position. |
| // |
| for (int index = 0; index < fLeafCount; index++) |
| { |
| if (last.getBit(index)) |
| fFollowList[index].union(first); |
| } |
| } |
| else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE) |
| || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)) |
| { |
| throw new CMException(ImplementationMessages.VAL_NIICM); |
| } |
| } |
| |
| /** |
| * Dumps the tree of the current node to standard output. |
| * |
| * @param nodeCur The current node. |
| * @param level The maximum levels to output. |
| * |
| * @exception CMException Thrown on error. |
| */ |
| private void dumpTree(CMNode nodeCur, int level) throws CMException |
| { |
| for (int index = 0; index < level; index++) |
| System.out.print(" "); |
| |
| int type = nodeCur.type(); |
| if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE) |
| || (type == XMLContentSpec.CONTENTSPECNODE_SEQ)) |
| { |
| if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE) |
| System.out.print("Choice Node "); |
| else |
| System.out.print("Seq Node "); |
| |
| if (nodeCur.isNullable()) |
| System.out.print("Nullable "); |
| |
| System.out.print("firstPos="); |
| System.out.print(nodeCur.firstPos().toString()); |
| System.out.print(" lastPos="); |
| System.out.println(nodeCur.lastPos().toString()); |
| |
| dumpTree(((CMBinOp)nodeCur).getLeft(), level+1); |
| dumpTree(((CMBinOp)nodeCur).getRight(), level+1); |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE) |
| { |
| System.out.print("Rep Node "); |
| |
| if (nodeCur.isNullable()) |
| System.out.print("Nullable "); |
| |
| System.out.print("firstPos="); |
| System.out.print(nodeCur.firstPos().toString()); |
| System.out.print(" lastPos="); |
| System.out.println(nodeCur.lastPos().toString()); |
| |
| dumpTree(((CMUniOp)nodeCur).getChild(), level+1); |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF) |
| { |
| System.out.print |
| ( |
| "Leaf: (pos=" |
| + ((CMLeaf)nodeCur).getPosition() |
| + "), " |
| + ((CMLeaf)nodeCur).getElement() |
| + "(elemIndex=" |
| + ((CMLeaf)nodeCur).getElement() |
| + ") " |
| ); |
| |
| if (nodeCur.isNullable()) |
| System.out.print(" Nullable "); |
| |
| System.out.print("firstPos="); |
| System.out.print(nodeCur.firstPos().toString()); |
| System.out.print(" lastPos="); |
| System.out.println(nodeCur.lastPos().toString()); |
| } |
| else |
| { |
| throw new CMException(ImplementationMessages.VAL_NIICM); |
| } |
| } |
| |
| |
| /** |
| * -1 is used to represent bad transitions in the transition table |
| * entry for each state. So each entry is initialized to an all -1 |
| * array. This method creates a new entry and initializes it. |
| */ |
| private int[] makeDefStateList() |
| { |
| int[] retArray = new int[fElemMapSize]; |
| for (int index = 0; index < fElemMapSize; index++) |
| retArray[index] = -1; |
| return retArray; |
| } |
| |
| /** Post tree build initialization. */ |
| private int postTreeBuildInit(CMNode nodeCur, int curIndex) throws CMException |
| { |
| // Set the maximum states on this node |
| nodeCur.setMaxStates(fLeafCount); |
| |
| // Recurse as required |
| if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY || |
| (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL || |
| (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) { |
| // REVISIT: Don't waste these structures. |
| QName qname = new QName(-1, -1, -1, ((CMAny)nodeCur).getURI()); |
| fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition()); |
| fLeafListType[curIndex] = nodeCur.type(); |
| curIndex++; |
| } |
| else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE) |
| || (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)) |
| { |
| curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex); |
| curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex); |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE) |
| { |
| curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex); |
| } |
| else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF) |
| { |
| // |
| // Put this node in the leaf list at the current index if its |
| // a non-epsilon leaf. |
| // |
| final QName node = ((CMLeaf)nodeCur).getElement(); |
| if (node.localpart != fEpsilonIndex) { |
| fLeafList[curIndex] = (CMLeaf)nodeCur; |
| fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF; |
| curIndex++; |
| } |
| } |
| else |
| { |
| throw new CMException(ImplementationMessages.VAL_NIICM); |
| } |
| return curIndex; |
| } |
| |
| |
| } // class DFAContentModel |