/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
package org.apache.vysper.xml.sax.impl;

import java.nio.charset.CharsetDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.mina.core.buffer.IoBuffer;
import org.apache.vysper.xml.sax.impl.XMLTokenizer.TokenListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

| /** |
| * |
| * @author The Apache MINA Project (dev@mina.apache.org) |
| */ |
| public class XMLParser implements TokenListener { |
| |
| private Logger log = LoggerFactory.getLogger(XMLParser.class); |
| |
| private static final String nameStartChar = ":A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD"; |
| |
| private static final String nameChar = nameStartChar + "-\\.0-9\\u00B7\\u0300-\\u036F\\u203F-\\u2040"; |
| |
| public static final Pattern NAME_PATTERN = Pattern.compile("^[" + nameStartChar + "][" + nameChar + "]*$"); |
| |
| public static final Pattern NAME_PREFIX_PATTERN = Pattern.compile("^xml", Pattern.CASE_INSENSITIVE); |
| |
| public static final Pattern UNESCAPE_UNICODE_PATTERN = Pattern.compile("\\&\\#(x?)(.+);"); |
| |
| private ContentHandler contentHandler; |
| |
| private ErrorHandler errorHandler; |
| |
| private ParserNamespaceResolver nsResolver = new ParserNamespaceResolver(); |
| |
| private static enum State { |
| START, IN_TAG, IN_DECLARATION, IN_END_TAG, AFTER_START_NAME, AFTER_END_NAME, IN_EMPTY_TAG, AFTER_ATTRIBUTE_NAME, AFTER_ATTRIBUTE_EQUALS, AFTER_ATTRIBUTE_FIRST_QUOTE, AFTER_ATTRIBUTE_VALUE, AFTER_COMMENT_BANG, AFTER_COMMENT_DASH1, AFTER_COMMENT_DASH2, AFTER_COMMENT, AFTER_COMMENT_CLOSING_DASH1, AFTER_COMMENT_CLOSING_DASH2, AFTER_COMMENT_ENDING_DASH1, AFTER_COMMENT_ENDING_DASH2, CLOSED |
| } |
| |
| private XMLTokenizer tokenizer; |
| |
| private State state = State.START; |
| |
| private String qname; |
| |
| // qname/value map |
| private Map<String, String> attributes; |
| |
| private String attributeName; |
| |
| // element names as {uri}qname |
| private Stack<String> elements = new Stack<String>(); |
| |
| private boolean sentStartDocument = false; |
| |
| // features |
| private boolean reportNsAttributes = false; |
| |
| private boolean commentsAllowed = true; |
| |
| private boolean restartsAllowed = false; |
| |
| private String restartQname = null; |
| |
| public XMLParser(ContentHandler contentHandler, ErrorHandler errorHandler, Map<String, Boolean> features, |
| Map<String, Object> properties) { |
| this.contentHandler = contentHandler; |
| this.errorHandler = errorHandler; |
| |
| commentsAllowed = feature(features, DefaultNonBlockingXMLReader.FEATURE_COMMENTS_ALLOWED, true); |
| reportNsAttributes = feature(features, DefaultNonBlockingXMLReader.FEATURE_NAMESPACE_PREFIXES, false); |
| reportNsAttributes = feature(features, DefaultNonBlockingXMLReader.FEATURE_NAMESPACE_PREFIXES, false); |
| restartsAllowed = feature(features, DefaultNonBlockingXMLReader.FEATURE_RESTART_ALLOWED, false); |
| restartQname = (String) properties.get(DefaultNonBlockingXMLReader.PROPERTY_RESTART_QNAME); |
| |
| this.tokenizer = new XMLTokenizer(this); |
| } |
| |
| private boolean feature(Map<String, Boolean> features, String name, boolean defaultValue) { |
| if (features.containsKey(name)) { |
| return features.get(name); |
| } else { |
| return defaultValue; |
| } |
| } |
| |
| public void parse(IoBuffer byteBuffer, CharsetDecoder charsetDecoder) throws SAXException { |
| if (state == State.CLOSED) |
| throw new SAXException("Parser is closed"); |
| |
| try { |
| tokenizer.parse(byteBuffer, charsetDecoder); |
| } catch (RuntimeException e) { |
| e.printStackTrace(); |
| fatalError(e.getMessage()); |
| } |
| } |
| |
| public void token(char c, String token) throws SAXException { |
| if (log.isTraceEnabled()) { |
| String s = (token == null) ? Character.toString(c) : token; |
| log.trace("Parser got token {} in state {}", s, state); |
| } |
| |
| switch (state) { |
| case START: |
| if (c == '<') { |
| state = State.IN_TAG; |
| attributes = new HashMap<String, String>(); |
| } else { |
| characters(token); |
| } |
| break; |
| case IN_TAG: |
| // token must be element name or / for a end tag |
| if (c == '/') { |
| state = State.IN_END_TAG; |
| } else if (c == '?') { |
| state = State.IN_DECLARATION; |
| xmlDeclaration(); |
| } else if (c == '!') { |
| if (commentsAllowed) { |
| state = State.AFTER_COMMENT_BANG; |
| } else { |
| fatalError("Comments not allowed"); |
| return; |
| } |
| } else { |
| if (token != null && isValidName(token)) { |
| qname = token; |
| state = State.AFTER_START_NAME; |
| } else { |
| if(token != null) { |
| fatalError("Invalid element name: " + qname); |
| } else { |
| fatalError("Not well-formed start tag"); |
| } |
| return; |
| } |
| } |
| break; |
| case IN_END_TAG: |
| // token must be element name |
| qname = token; |
| state = State.AFTER_END_NAME; |
| break; |
| case AFTER_START_NAME: |
| // token must be attribute name or > or / |
| if (c == '>') { |
| // end of start or end tag |
| if (state == State.AFTER_START_NAME) { |
| startElement(); |
| state = State.START; |
| attributes = null; |
| } else if (state == State.AFTER_END_NAME) { |
| state = State.START; |
| endElement(); |
| } |
| } else if (c == '/') { |
| state = State.IN_EMPTY_TAG; |
| } else { |
| // must be attribute name |
| attributeName = token; |
| state = State.AFTER_ATTRIBUTE_NAME; |
| } |
| break; |
| case AFTER_ATTRIBUTE_NAME: |
| // token must be = |
| if (c == '=') { |
| state = State.AFTER_ATTRIBUTE_EQUALS; |
| } else { |
| fatalError("Not wellformed"); |
| } |
| break; |
| case AFTER_ATTRIBUTE_EQUALS: |
| // token must be " or ' |
| if (c == '"' || c == '\'') { |
| state = State.AFTER_ATTRIBUTE_FIRST_QUOTE; |
| } |
| break; |
| case AFTER_ATTRIBUTE_FIRST_QUOTE: |
| // token must be attribute value |
| attributes.put(attributeName, unescape(token)); |
| state = State.AFTER_ATTRIBUTE_VALUE; |
| break; |
| case AFTER_ATTRIBUTE_VALUE: |
| // token must be " or ' |
| if (c == '"' || c == '\'') { |
| state = State.AFTER_START_NAME; |
| } else { |
| fatalError("Not wellformed"); |
| } |
| break; |
| case AFTER_END_NAME: |
| // token must be > |
| if (c == '>') { |
| state = State.START; |
| endElement(); |
| } |
| break; |
| case IN_EMPTY_TAG: |
| // token must be > |
| if (c == '>') { |
| startElement(); |
| attributes = null; |
| |
| if (state != State.CLOSED) { |
| state = State.START; |
| endElement(); |
| } |
| } |
| break; |
| case AFTER_COMMENT_BANG: |
| // token must be - |
| if (c == '-') { |
| state = State.AFTER_COMMENT_DASH1; |
| } else { |
| fatalError("Comment not wellformed"); |
| return; |
| } |
| break; |
| case AFTER_COMMENT_DASH1: |
| // token must be - |
| if (c == '-') { |
| state = State.AFTER_COMMENT_DASH2; |
| } else { |
| fatalError("Comment not wellformed"); |
| return; |
| } |
| break; |
| case AFTER_COMMENT_DASH2: |
| // we should now get the comment content, ignore |
| if (c == '-') { |
| state = State.AFTER_COMMENT_CLOSING_DASH1; |
| } else { |
| state = State.AFTER_COMMENT; |
| } |
| break; |
| case AFTER_COMMENT: |
| // token must be - or some text |
| if (c == '-') { |
| state = State.AFTER_COMMENT_CLOSING_DASH1; |
| } else if (c == '>') { |
| fatalError("Comment not wellformed"); |
| return; |
| } else { |
| // ignore |
| } |
| break; |
| case AFTER_COMMENT_CLOSING_DASH1: |
| // token must be - |
| if (c == '-') { |
| state = State.AFTER_COMMENT_CLOSING_DASH2; |
| } else { |
| fatalError("Comment not wellformed"); |
| return; |
| } |
| break; |
| case AFTER_COMMENT_CLOSING_DASH2: |
| // token must be > |
| if (c == '>') { |
| state = State.START; |
| } else { |
| fatalError("Comment not wellformed"); |
| return; |
| } |
| break; |
| case IN_DECLARATION: |
| // wait for > |
| if (c == '>') { |
| state = State.START; |
| } |
| break; |
| } |
| } |
| |
| private void characters(String s) throws SAXException { |
| // text only allowed in element |
| if (!elements.isEmpty()) { |
| String unescaped = unescape(s); |
| log.trace("Parser emitting characters \"{}\"", unescaped); |
| contentHandler.characters(unescaped.toCharArray(), 0, unescaped.length()); |
| } else if (s.trim().length() > 0) { |
| // must start document, even that document is not wellformed |
| startDocument(); |
| fatalError("Text only allowed in element"); |
| } else { |
| // ignorable whitespace |
| startDocument(); |
| } |
| } |
| |
| private boolean isValidName(String name) { |
| // element names must only contain valid characters |
| // element names must not begin with "xml" in any casing |
| return NAME_PATTERN.matcher(name).find() && !NAME_PREFIX_PATTERN.matcher(name).find(); |
| } |
| |
| private boolean needsRestart() { |
| return elements.size() > 0; |
| } |
| |
| private void restart() { |
| log.trace("Restarting XML stream"); |
| |
| elements.clear(); |
| nsResolver = new ParserNamespaceResolver(); |
| sentStartDocument = false; |
| tokenizer.restart(); |
| } |
| |
| private void xmlDeclaration() { |
| // we got an XML declaration, should we restart stream? |
| // TODO could also be a PI, if we want to support PIs, this code needs further attention |
| if (needsRestart()) { |
| if (restartsAllowed) { |
| // ok, restart |
| restart(); |
| } else { |
| // restarts not allowed, fail |
| } |
| } |
| } |
| |
| private void startDocument() throws SAXException { |
| if (!sentStartDocument) { |
| contentHandler.startDocument(); |
| sentStartDocument = true; |
| } |
| } |
| |
| private void startElement() throws SAXException { |
| log.trace("StartElement {}", qname); |
| |
| // check if this should restart stream |
| if (restartsAllowed && needsRestart() && qname.equals(restartQname)) { |
| restart(); |
| } |
| |
| if (elements.isEmpty()) { |
| startDocument(); |
| } |
| |
| // find all namespace declarations so we can populate the NS resolver |
| Map<String, String> nsDeclarations = new HashMap<String, String>(); |
| for (Entry<String, String> attribute : attributes.entrySet()) { |
| if (attribute.getKey().equals("xmlns")) { |
| // is namespace attribute |
| nsDeclarations.put("", attribute.getValue()); |
| } else if (attribute.getKey().startsWith("xmlns:")) { |
| nsDeclarations.put(attribute.getKey().substring(6), attribute.getValue()); |
| } |
| } |
| nsResolver.push(nsDeclarations); |
| |
| // find all non-namespace attributes |
| List<Attribute> nonNsAttributes = new ArrayList<Attribute>(); |
| for (Entry<String, String> attribute : attributes.entrySet()) { |
| String attQname = attribute.getKey(); |
| |
| // only report NS declaration attributes if the feature is set to |
| if (reportNsAttributes) { |
| nonNsAttributes.add(new Attribute(attQname, null, attQname, attribute.getValue())); |
| } else if (!attQname.equals("xmlns") && !attQname.startsWith("xmlns:")) { |
| String attLocalName = extractLocalName(attQname); |
| String attPrefix = extractNsPrefix(attQname); |
| String attUri; |
| if (attPrefix.length() > 0) { |
| attUri = nsResolver.resolveUri(attPrefix); |
| if (attUri == null) { |
| if (attPrefix.length() > 0) { |
| fatalError("Undeclared namespace prefix: " + attPrefix); |
| return; |
| } else { |
| attUri = ""; |
| } |
| } |
| } else { |
| // by default, attributes are in the empty namespace |
| attUri = ""; |
| } |
| nonNsAttributes.add(new Attribute(attLocalName, attUri, attQname, attribute.getValue())); |
| } |
| } |
| |
| String prefix = extractNsPrefix(qname); |
| String uri = nsResolver.resolveUri(prefix); |
| if (uri == null) { |
| if (prefix.length() > 0) { |
| fatalError("Undeclared namespace prefix: " + prefix); |
| return; |
| } else { |
| uri = ""; |
| } |
| } |
| |
| String localName = extractLocalName(qname); |
| |
| elements.add(fullyQualifiedName(uri, qname)); |
| |
| contentHandler.startElement(uri, localName, qname, new DefaultAttributes(nonNsAttributes)); |
| } |
| |
| private String extractLocalName(String qname) { |
| int index = qname.indexOf(':'); |
| |
| if (index > -1) { |
| return qname.substring(index + 1); |
| } else { |
| return qname; |
| } |
| } |
| |
| private String extractNsPrefix(String qname) { |
| int index = qname.indexOf(':'); |
| |
| if (index > -1) { |
| return qname.substring(0, index); |
| } else { |
| return ""; |
| } |
| } |
| |
| private String fullyQualifiedName(String uri, String qname) { |
| return "{" + uri + "}" + qname; |
| } |
| |
| private void endElement() throws SAXException { |
| log.trace("EndElement {}", qname); |
| |
| if (state == State.CLOSED) |
| return; |
| |
| String prefix = extractNsPrefix(qname); |
| String uri = nsResolver.resolveUri(prefix); |
| if (uri == null) { |
| if (prefix.length() > 0) { |
| fatalError("Undeclared namespace prefix: " + prefix); |
| return; |
| } else { |
| uri = ""; |
| } |
| } |
| |
| nsResolver.pop(); |
| |
| String localName = extractLocalName(qname); |
| |
| String fqn = elements.pop(); |
| if (fqn.equals(fullyQualifiedName(uri, qname))) { |
| contentHandler.endElement(uri, localName, qname); |
| |
| if (elements.isEmpty()) { |
| contentHandler.endDocument(); |
| state = State.CLOSED; |
| } |
| } else { |
| fatalError("Invalid element name " + qname); |
| } |
| } |
| |
| private void fatalError(String message) throws SAXException { |
| log.debug("Fatal error: {}", message); |
| state = State.CLOSED; |
| tokenizer.close(); |
| |
| // make sure we send a start document event |
| startDocument(); |
| |
| errorHandler.fatalError(new SAXParseException(message, null)); |
| } |
| |
| private String unescape(String s) { |
| s = s.replace("&", "&").replace(">", ">").replace("<", "<").replace("'", "'").replace(""", |
| "\""); |
| |
| StringBuffer sb = new StringBuffer(); |
| |
| Matcher matcher = UNESCAPE_UNICODE_PATTERN.matcher(s); |
| int end = 0; |
| while (matcher.find()) { |
| boolean isHex = matcher.group(1).equals("x"); |
| String unicodeCode = matcher.group(2); |
| |
| int base = isHex ? 16 : 10; |
| int i = Integer.valueOf(unicodeCode, base).intValue(); |
| char[] c = Character.toChars(i); |
| sb.append(s.substring(end, matcher.start())); |
| end = matcher.end(); |
| sb.append(c); |
| } |
| sb.append(s.substring(end, s.length())); |
| |
| return sb.toString(); |
| } |
| } |