| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| #include "precompiled_xmlreader.hxx" |
| #include "sal/config.h" |
| |
| #include <climits> |
| #include <cstddef> |
| |
| #include "com/sun/star/container/NoSuchElementException.hpp" |
| #include "com/sun/star/uno/Reference.hxx" |
| #include "com/sun/star/uno/RuntimeException.hpp" |
| #include "com/sun/star/uno/XInterface.hpp" |
| #include "osl/diagnose.h" |
| #include "osl/file.h" |
| #include "rtl/string.h" |
| #include "rtl/ustring.h" |
| #include "rtl/ustring.hxx" |
| #include "sal/types.h" |
| #include "xmlreader/pad.hxx" |
| #include "xmlreader/span.hxx" |
| #include "xmlreader/xmlreader.hxx" |
| |
| namespace xmlreader { |
| |
| namespace { |
| |
| namespace css = com::sun::star; |
| |
// Tells whether c is one of the four characters treated as white space here:
// tab (0x09), line feed (0x0A), carriage return (0x0D) or space.
bool isSpace(char c) {
    return c == ' ' || c == '\x09' || c == '\x0A' || c == '\x0D';
}
| |
| } |
| |
| XmlReader::XmlReader(rtl::OUString const & fileUrl) |
| SAL_THROW(( |
| css::container::NoSuchElementException, css::uno::RuntimeException)): |
| fileUrl_(fileUrl) |
| { |
| switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read)) |
| { |
| case osl_File_E_None: |
| break; |
| case osl_File_E_NOENT: |
| throw css::container::NoSuchElementException( |
| fileUrl_, css::uno::Reference< css::uno::XInterface >()); |
| default: |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| oslFileError e = osl_getFileSize(fileHandle_, &fileSize_); |
| if (e == osl_File_E_None) { |
| e = osl_mapFile( |
| fileHandle_, &fileAddress_, fileSize_, 0, |
| osl_File_MapFlag_WillNeed); |
| } |
| if (e != osl_File_E_None) { |
| e = osl_closeFile(fileHandle_); |
| if (e != osl_File_E_None) { |
| OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); |
| } |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| namespaceIris_.push_back( |
| Span( |
| RTL_CONSTASCII_STRINGPARAM( |
| "http://www.w3.org/XML/1998/namespace"))); |
| namespaces_.push_back( |
| NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML)); |
| pos_ = static_cast< char * >(fileAddress_); |
| end_ = pos_ + fileSize_; |
| state_ = STATE_CONTENT; |
| } |
| |
| XmlReader::~XmlReader() { |
| oslFileError e = osl_unmapFile(fileAddress_, fileSize_); |
| if (e != osl_File_E_None) { |
| OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e)); |
| } |
| e = osl_closeFile(fileHandle_); |
| if (e != osl_File_E_None) { |
| OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e)); |
| } |
| } |
| |
| int XmlReader::registerNamespaceIri(Span const & iri) { |
| int id = toNamespaceId(namespaceIris_.size()); |
| namespaceIris_.push_back(iri); |
| if (iri.equals( |
| Span( |
| RTL_CONSTASCII_STRINGPARAM( |
| "http://www.w3.org/2001/XMLSchema-instance")))) |
| { |
| // Old user layer .xcu files used the xsi namespace prefix without |
| // declaring a corresponding namespace binding, see issue 77174; reading |
| // those files during migration would fail without this hack that can be |
| // removed once migration is no longer relevant (see |
| // configmgr::Components::parseModificationLayer): |
| namespaces_.push_back( |
| NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id)); |
| } |
| return id; |
| } |
| |
| XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) |
| { |
| switch (state_) { |
| case STATE_CONTENT: |
| switch (reportText) { |
| case TEXT_NONE: |
| return handleSkippedText(data, nsId); |
| case TEXT_RAW: |
| return handleRawText(data); |
| case TEXT_NORMALIZED: |
| return handleNormalizedText(data); |
| } |
| case STATE_START_TAG: |
| return handleStartTag(nsId, data); |
| case STATE_END_TAG: |
| return handleEndTag(); |
| case STATE_EMPTY_ELEMENT_TAG: |
| handleElementEnd(); |
| return RESULT_END; |
| default: // STATE_DONE |
| return RESULT_DONE; |
| } |
| } |
| |
| bool XmlReader::nextAttribute(int * nsId, Span * localName) { |
| OSL_ASSERT(nsId != 0 && localName != 0); |
| if (firstAttribute_) { |
| currentAttribute_ = attributes_.begin(); |
| firstAttribute_ = false; |
| } else { |
| ++currentAttribute_; |
| } |
| if (currentAttribute_ == attributes_.end()) { |
| return false; |
| } |
| if (currentAttribute_->nameColon == 0) { |
| *nsId = NAMESPACE_NONE; |
| *localName = Span( |
| currentAttribute_->nameBegin, |
| currentAttribute_->nameEnd - currentAttribute_->nameBegin); |
| } else { |
| *nsId = getNamespaceId( |
| Span( |
| currentAttribute_->nameBegin, |
| currentAttribute_->nameColon - currentAttribute_->nameBegin)); |
| *localName = Span( |
| currentAttribute_->nameColon + 1, |
| currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); |
| } |
| return true; |
| } |
| |
| Span XmlReader::getAttributeValue(bool fullyNormalize) { |
| return handleAttributeValue( |
| currentAttribute_->valueBegin, currentAttribute_->valueEnd, |
| fullyNormalize); |
| } |
| |
| int XmlReader::getNamespaceId(Span const & prefix) const { |
| for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin()); |
| i != namespaces_.rend(); ++i) |
| { |
| if (prefix.equals(i->prefix)) { |
| return i->nsId; |
| } |
| } |
| return NAMESPACE_UNKNOWN; |
| } |
| |
| rtl::OUString XmlReader::getUrl() const { |
| return fileUrl_; |
| } |
| |
| void XmlReader::normalizeLineEnds(Span const & text) { |
| char const * p = text.begin; |
| sal_Int32 n = text.length; |
| for (;;) { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); |
| if (i < 0) { |
| break; |
| } |
| pad_.add(p, i); |
| p += i + 1; |
| n -= i + 1; |
| if (n == 0 || *p != '\x0A') { |
| pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); |
| } |
| } |
| pad_.add(p, n); |
| } |
| |
| void XmlReader::skipSpace() { |
| while (isSpace(peek())) { |
| ++pos_; |
| } |
| } |
| |
| bool XmlReader::skipComment() { |
| if (rtl_str_shortenedCompare_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), |
| RTL_CONSTASCII_LENGTH("--")) != |
| 0) |
| { |
| return false; |
| } |
| pos_ += RTL_CONSTASCII_LENGTH("--"); |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within comment) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("--"); |
| if (read() != '>') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "illegal \"--\" within comment in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| return true; |
| } |
| |
| void XmlReader::skipProcessingInstruction() { |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("?>"); |
| } |
| |
| void XmlReader::skipDocumentTypeDeclaration() { |
| // Neither is it checked that the doctypedecl is at the correct position in |
| // the document, nor that it is well-formed: |
| for (;;) { |
| char c = read(); |
| switch (c) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within DTD) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| case '"': |
| case '\'': |
| { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength( |
| pos_, end_ - pos_, c); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within DTD) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + 1; |
| } |
| break; |
| case '>': |
| return; |
| case '[': |
| for (;;) { |
| c = read(); |
| switch (c) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within DTD) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| case '"': |
| case '\'': |
| { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength( |
| pos_, end_ - pos_, c); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within DTD) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + 1; |
| } |
| break; |
| case '<': |
| switch (read()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within DTD) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| case '!': |
| skipComment(); |
| break; |
| case '?': |
| skipProcessingInstruction(); |
| break; |
| default: |
| break; |
| } |
| break; |
| case ']': |
| skipSpace(); |
| if (read() != '>') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "missing \">\" of DTD in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| return; |
| default: |
| break; |
| } |
| } |
| default: |
| break; |
| } |
| } |
| } |
| |
| Span XmlReader::scanCdataSection() { |
| if (rtl_str_shortenedCompare_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), |
| RTL_CONSTASCII_LENGTH("[CDATA[")) != |
| 0) |
| { |
| return Span(); |
| } |
| pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); |
| char const * begin = pos_; |
| sal_Int32 i = rtl_str_indexOfStr_WithLength( |
| pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "premature end (within CDATA section) of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); |
| return Span(begin, i); |
| } |
| |
| bool XmlReader::scanName(char const ** nameColon) { |
| OSL_ASSERT(nameColon != 0 && *nameColon == 0); |
| for (char const * begin = pos_;; ++pos_) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| case ' ': |
| case '/': |
| case '=': |
| case '>': |
| return pos_ != begin; |
| case ':': |
| *nameColon = pos_; |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| int XmlReader::scanNamespaceIri(char const * begin, char const * end) { |
| OSL_ASSERT(begin != 0 && begin <= end); |
| Span iri(handleAttributeValue(begin, end, false)); |
| for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { |
| if (namespaceIris_[i].equals(iri)) { |
| return toNamespaceId(i); |
| } |
| } |
| return XmlReader::NAMESPACE_UNKNOWN; |
| } |
| |
| char const * XmlReader::handleReference(char const * position, char const * end) |
| { |
| OSL_ASSERT(position != 0 && *position == '&' && position < end); |
| ++position; |
| if (*position == '#') { |
| ++position; |
| sal_Int32 val = 0; |
| char const * p; |
| if (*position == 'x') { |
| ++position; |
| p = position; |
| for (;; ++position) { |
| char c = *position; |
| if (c >= '0' && c <= '9') { |
| val = 16 * val + (c - '0'); |
| } else if (c >= 'A' && c <= 'F') { |
| val = 16 * val + (c - 'A') + 10; |
| } else if (c >= 'a' && c <= 'f') { |
| val = 16 * val + (c - 'a') + 10; |
| } else { |
| break; |
| } |
| if (val > 0x10FFFF) { // avoid overflow |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "'&#x...' too large in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| } |
| } else { |
| p = position; |
| for (;; ++position) { |
| char c = *position; |
| if (c >= '0' && c <= '9') { |
| val = 10 * val + (c - '0'); |
| } else { |
| break; |
| } |
| if (val > 0x10FFFF) { // avoid overflow |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "'&#...' too large in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| } |
| } |
| if (position == p || *position++ != ';') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| OSL_ASSERT(val >= 0 && val <= 0x10FFFF); |
| if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || |
| (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) |
| { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "character reference denoting invalid character in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char buf[4]; |
| sal_Int32 len; |
| if (val < 0x80) { |
| buf[0] = static_cast< char >(val); |
| len = 1; |
| } else if (val < 0x800) { |
| buf[0] = static_cast< char >((val >> 6) | 0xC0); |
| buf[1] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 2; |
| } else if (val < 0x10000) { |
| buf[0] = static_cast< char >((val >> 12) | 0xE0); |
| buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
| buf[2] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 3; |
| } else { |
| buf[0] = static_cast< char >((val >> 18) | 0xF0); |
| buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); |
| buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
| buf[3] = static_cast< char >((val & 0x3F) | 0x80); |
| len = 4; |
| } |
| pad_.addEphemeral(buf, len); |
| return position; |
| } else { |
| struct EntityRef { |
| char const * inBegin; |
| sal_Int32 inLength; |
| char const * outBegin; |
| sal_Int32 outLength; |
| }; |
| static EntityRef const refs[] = { |
| { RTL_CONSTASCII_STRINGPARAM("amp;"), |
| RTL_CONSTASCII_STRINGPARAM("&") }, |
| { RTL_CONSTASCII_STRINGPARAM("lt;"), |
| RTL_CONSTASCII_STRINGPARAM("<") }, |
| { RTL_CONSTASCII_STRINGPARAM("gt;"), |
| RTL_CONSTASCII_STRINGPARAM(">") }, |
| { RTL_CONSTASCII_STRINGPARAM("apos;"), |
| RTL_CONSTASCII_STRINGPARAM("'") }, |
| { RTL_CONSTASCII_STRINGPARAM("quot;"), |
| RTL_CONSTASCII_STRINGPARAM("\"") } }; |
| for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) { |
| if (rtl_str_shortenedCompare_WithLength( |
| position, end - position, refs[i].inBegin, refs[i].inLength, |
| refs[i].inLength) == |
| 0) |
| { |
| position += refs[i].inLength; |
| pad_.add(refs[i].outBegin, refs[i].outLength); |
| return position; |
| } |
| } |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| } |
| |
| Span XmlReader::handleAttributeValue( |
| char const * begin, char const * end, bool fullyNormalize) |
| { |
| pad_.clear(); |
| if (fullyNormalize) { |
| while (begin != end && isSpace(*begin)) { |
| ++begin; |
| } |
| while (end != begin && isSpace(end[-1])) { |
| --end; |
| } |
| char const * p = begin; |
| enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
| // a single true space character can go into the current span, |
| // everything else breaks the span |
| Space space = SPACE_NONE; |
| while (p != end) { |
| switch (*p) { |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| switch (space) { |
| case SPACE_NONE: |
| pad_.add(begin, p - begin); |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| space = SPACE_BREAK; |
| break; |
| case SPACE_SPAN: |
| pad_.add(begin, p - begin); |
| space = SPACE_BREAK; |
| break; |
| case SPACE_BREAK: |
| break; |
| } |
| begin = ++p; |
| break; |
| case ' ': |
| switch (space) { |
| case SPACE_NONE: |
| ++p; |
| space = SPACE_SPAN; |
| break; |
| case SPACE_SPAN: |
| pad_.add(begin, p - begin); |
| begin = ++p; |
| space = SPACE_BREAK; |
| break; |
| case SPACE_BREAK: |
| begin = ++p; |
| break; |
| } |
| break; |
| case '&': |
| pad_.add(begin, p - begin); |
| p = handleReference(p, end); |
| begin = p; |
| space = SPACE_NONE; |
| break; |
| default: |
| ++p; |
| space = SPACE_NONE; |
| break; |
| } |
| } |
| pad_.add(begin, p - begin); |
| } else { |
| char const * p = begin; |
| while (p != end) { |
| switch (*p) { |
| case '\x09': |
| case '\x0A': |
| pad_.add(begin, p - begin); |
| begin = ++p; |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| break; |
| case '\x0D': |
| pad_.add(begin, p - begin); |
| ++p; |
| if (peek() == '\x0A') { |
| ++p; |
| } |
| begin = p; |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| break; |
| case '&': |
| pad_.add(begin, p - begin); |
| p = handleReference(p, end); |
| begin = p; |
| break; |
| default: |
| ++p; |
| break; |
| } |
| } |
| pad_.add(begin, p - begin); |
| } |
| return pad_.get(); |
| } |
| |
| XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { |
| OSL_ASSERT(nsId != 0 && localName); |
| char const * nameBegin = pos_; |
| char const * nameColon = 0; |
| if (!scanName(&nameColon)) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * nameEnd = pos_; |
| NamespaceList::size_type inheritedNamespaces = namespaces_.size(); |
| bool hasDefaultNs = false; |
| int defaultNsId = NAMESPACE_NONE; |
| attributes_.clear(); |
| for (;;) { |
| char const * p = pos_; |
| skipSpace(); |
| if (peek() == '/' || peek() == '>') { |
| break; |
| } |
| if (pos_ == p) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "missing whitespace before attribute in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * attrNameBegin = pos_; |
| char const * attrNameColon = 0; |
| if (!scanName(&attrNameColon)) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * attrNameEnd = pos_; |
| skipSpace(); |
| if (read() != '=') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| skipSpace(); |
| char del = read(); |
| if (del != '\'' && del != '"') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * valueBegin = pos_; |
| sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM( |
| "unterminated attribute value in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * valueEnd = pos_ + i; |
| pos_ += i + 1; |
| if (attrNameColon == 0 && |
| Span(attrNameBegin, attrNameEnd - attrNameBegin).equals( |
| RTL_CONSTASCII_STRINGPARAM("xmlns"))) |
| { |
| hasDefaultNs = true; |
| defaultNsId = scanNamespaceIri(valueBegin, valueEnd); |
| } else if (attrNameColon != 0 && |
| Span(attrNameBegin, attrNameColon - attrNameBegin).equals( |
| RTL_CONSTASCII_STRINGPARAM("xmlns"))) |
| { |
| namespaces_.push_back( |
| NamespaceData( |
| Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), |
| scanNamespaceIri(valueBegin, valueEnd))); |
| } else { |
| attributes_.push_back( |
| AttributeData( |
| attrNameBegin, attrNameEnd, attrNameColon, valueBegin, |
| valueEnd)); |
| } |
| } |
| if (!hasDefaultNs && !elements_.empty()) { |
| defaultNsId = elements_.top().defaultNamespaceId; |
| } |
| firstAttribute_ = true; |
| if (peek() == '/') { |
| state_ = STATE_EMPTY_ELEMENT_TAG; |
| ++pos_; |
| } else { |
| state_ = STATE_CONTENT; |
| } |
| if (peek() != '>') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| ++pos_; |
| elements_.push( |
| ElementData( |
| Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, |
| defaultNsId)); |
| if (nameColon == 0) { |
| *nsId = defaultNsId; |
| *localName = Span(nameBegin, nameEnd - nameBegin); |
| } else { |
| *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); |
| *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); |
| } |
| return RESULT_BEGIN; |
| } |
| |
| XmlReader::Result XmlReader::handleEndTag() { |
| if (elements_.empty()) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| char const * nameBegin = pos_; |
| char const * nameColon = 0; |
| if (!scanName(&nameColon) || |
| !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) |
| { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| handleElementEnd(); |
| skipSpace(); |
| if (peek() != '>') { |
| throw css::uno::RuntimeException( |
| (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| ++pos_; |
| return RESULT_END; |
| } |
| |
| void XmlReader::handleElementEnd() { |
| OSL_ASSERT(!elements_.empty()); |
| namespaces_.resize(elements_.top().inheritedNamespaces); |
| elements_.pop(); |
| state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT; |
| } |
| |
| XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { |
| for (;;) { |
| sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<'); |
| if (i < 0) { |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| } |
| pos_ += i + 1; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (!skipComment() && !scanCdataSection().is()) { |
| skipDocumentTypeDeclaration(); |
| } |
| break; |
| case '/': |
| ++pos_; |
| return handleEndTag(); |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| break; |
| default: |
| return handleStartTag(nsId, data); |
| } |
| } |
| } |
| |
| XmlReader::Result XmlReader::handleRawText(Span * text) { |
| pad_.clear(); |
| for (char const * begin = pos_;;) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| case '\x0D': |
| pad_.add(begin, pos_ - begin); |
| ++pos_; |
| if (peek() != '\x0A') { |
| pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A")); |
| } |
| begin = pos_; |
| break; |
| case '&': |
| pad_.add(begin, pos_ - begin); |
| pos_ = handleReference(pos_, end_); |
| begin = pos_; |
| break; |
| case '<': |
| pad_.add(begin, pos_ - begin); |
| ++pos_; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (!skipComment()) { |
| Span cdata(scanCdataSection()); |
| if (cdata.is()) { |
| normalizeLineEnds(cdata); |
| } else { |
| skipDocumentTypeDeclaration(); |
| } |
| } |
| begin = pos_; |
| break; |
| case '/': |
| *text = pad_.get(); |
| ++pos_; |
| state_ = STATE_END_TAG; |
| return RESULT_TEXT; |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| begin = pos_; |
| break; |
| default: |
| *text = pad_.get(); |
| state_ = STATE_START_TAG; |
| return RESULT_TEXT; |
| } |
| break; |
| default: |
| ++pos_; |
| break; |
| } |
| } |
| } |
| |
| XmlReader::Result XmlReader::handleNormalizedText(Span * text) { |
| pad_.clear(); |
| char const * flowBegin = pos_; |
| char const * flowEnd = pos_; |
| enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
| // a single true space character can go into the current flow, |
| // everything else breaks the flow |
| Space space = SPACE_START; |
| for (;;) { |
| switch (peek()) { |
| case '\0': // i.e., EOF |
| throw css::uno::RuntimeException( |
| (rtl::OUString( |
| RTL_CONSTASCII_USTRINGPARAM("premature end of ")) + |
| fileUrl_), |
| css::uno::Reference< css::uno::XInterface >()); |
| case '\x09': |
| case '\x0A': |
| case '\x0D': |
| switch (space) { |
| case SPACE_START: |
| case SPACE_BREAK: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| space = SPACE_BREAK; |
| break; |
| } |
| ++pos_; |
| break; |
| case ' ': |
| switch (space) { |
| case SPACE_START: |
| case SPACE_BREAK: |
| break; |
| case SPACE_NONE: |
| space = SPACE_SPAN; |
| break; |
| case SPACE_SPAN: |
| space = SPACE_BREAK; |
| break; |
| } |
| ++pos_; |
| break; |
| case '&': |
| switch (space) { |
| case SPACE_START: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| pad_.add(flowBegin, pos_ - flowBegin); |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| break; |
| } |
| pos_ = handleReference(pos_, end_); |
| flowBegin = pos_; |
| flowEnd = pos_; |
| space = SPACE_NONE; |
| break; |
| case '<': |
| ++pos_; |
| switch (peek()) { |
| case '!': |
| ++pos_; |
| if (skipComment()) { |
| space = SPACE_BREAK; |
| } else { |
| Span cdata(scanCdataSection()); |
| if (cdata.is()) { |
| // CDATA is not normalized (similar to character |
| // references; it keeps the code simple), but it might |
| // arguably be better to normalize it: |
| switch (space) { |
| case SPACE_START: |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| pad_.add(flowBegin, pos_ - flowBegin); |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| break; |
| } |
| normalizeLineEnds(cdata); |
| flowBegin = pos_; |
| flowEnd = pos_; |
| space = SPACE_NONE; |
| } else { |
| skipDocumentTypeDeclaration(); |
| } |
| } |
| break; |
| case '/': |
| ++pos_; |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| *text = pad_.get(); |
| state_ = STATE_END_TAG; |
| return RESULT_TEXT; |
| case '?': |
| ++pos_; |
| skipProcessingInstruction(); |
| space = SPACE_BREAK; |
| break; |
| default: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| *text = pad_.get(); |
| state_ = STATE_START_TAG; |
| return RESULT_TEXT; |
| } |
| break; |
| default: |
| switch (space) { |
| case SPACE_START: |
| flowBegin = pos_; |
| break; |
| case SPACE_NONE: |
| case SPACE_SPAN: |
| break; |
| case SPACE_BREAK: |
| pad_.add(flowBegin, flowEnd - flowBegin); |
| pad_.add(RTL_CONSTASCII_STRINGPARAM(" ")); |
| flowBegin = pos_; |
| break; |
| } |
| flowEnd = ++pos_; |
| space = SPACE_NONE; |
| break; |
| } |
| } |
| } |
| |
| int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { |
| OSL_ASSERT(pos <= INT_MAX); |
| return static_cast< int >(pos); |
| } |
| |
| } |