| Index: src/org/apache/xerces/impl/XMLScanner.java
|
| ===================================================================
|
| --- src/org/apache/xerces/impl/XMLScanner.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
|
| +++ src/org/apache/xerces/impl/XMLScanner.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
|
| @@ -823,6 +823,7 @@
|
| String entityName = fEntityScanner.scanName(); |
| if (entityName == null) { |
| reportFatalError("NameRequiredInReference", null); |
| + entityName = "unknown"; |
| } |
| else if (entityDepth == fEntityDepth) { |
| fStringBuffer2.append(entityName); |
| @@ -1027,6 +1028,14 @@
|
| if (XMLChar.isMarkup(c) || c == ']') { |
| fStringBuffer.append((char)fEntityScanner.scanChar()); |
| } |
| + else if (XMLChar.isHighSurrogate(c)) { |
| + scanSurrogates(fStringBuffer); |
| + } |
| + else if (isInvalidLiteral(c)) { |
| + reportFatalError("InvalidCharInSystemID", |
| + new Object[] { Integer.toHexString(c) }); |
| + fEntityScanner.scanChar(); |
| + } |
| } while (fEntityScanner.scanLiteral(quote, ident) != quote); |
| fStringBuffer.append(ident); |
| ident = fStringBuffer; |
| Index: src/org/apache/xerces/impl/Constants.java
|
| ===================================================================
|
| --- src/org/apache/xerces/impl/Constants.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
|
| +++ src/org/apache/xerces/impl/Constants.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
|
| @@ -219,6 +219,9 @@
|
| |
| /** Warn on undeclared element feature ("validation/warn-on-undeclared-elemdef"). */ |
| public static final String WARN_ON_UNDECLARED_ELEMDEF_FEATURE = "validation/warn-on-undeclared-elemdef"; |
| + |
| + /** Ignore misencoded characters feature */ |
| + public static final String IGNORE_BADLY_ENCODED_CHARS = "ignore-badly-encoded-chars"; |
| |
| /** Warn on duplicate entity declaration feature ("warn-on-duplicate-entitydef"). */ |
| public static final String WARN_ON_DUPLICATE_ENTITYDEF_FEATURE = "warn-on-duplicate-entitydef"; |
| Index: src/org/apache/xerces/impl/XMLEntityManager.java
|
| ===================================================================
|
| --- src/org/apache/xerces/impl/XMLEntityManager.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
|
| +++ src/org/apache/xerces/impl/XMLEntityManager.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
|
| @@ -132,6 +132,10 @@
|
| protected static final String PARSER_SETTINGS = |
| Constants.XERCES_FEATURE_PREFIX + Constants.PARSER_SETTINGS; |
| |
| + /** Feature identifier: ignore badly encoded characters */ |
| + protected static final String IGNORE_BADLY_ENCODED_CHARS = |
| + Constants.XERCES_FEATURE_PREFIX + Constants.IGNORE_BADLY_ENCODED_CHARS; |
| + |
| // property identifiers |
| |
| /** Property identifier: symbol table. */ |
| @@ -167,7 +171,8 @@
|
| EXTERNAL_PARAMETER_ENTITIES, |
| ALLOW_JAVA_ENCODINGS, |
| WARN_ON_DUPLICATE_ENTITYDEF, |
| - STANDARD_URI_CONFORMANT |
| + STANDARD_URI_CONFORMANT, |
| + IGNORE_BADLY_ENCODED_CHARS |
| }; |
| |
| /** Feature defaults. */ |
| @@ -177,6 +182,7 @@
|
| Boolean.TRUE, |
| Boolean.FALSE, |
| Boolean.FALSE, |
| + Boolean.FALSE, |
| Boolean.FALSE |
| }; |
| |
| @@ -262,6 +268,12 @@
|
| */ |
| protected boolean fStrictURI; |
| |
| + /** |
| + * allow badly encoded characters (skip them) |
| + * http://apache.org/xml/features/ignore-badly-encoded-chars |
| + */ |
| + protected boolean fAllowBadlyEncodedChars; |
| + |
| // properties |
| |
| /** |
| @@ -1310,6 +1322,13 @@
|
| fStrictURI = false; |
| } |
| |
| + try { |
| + fAllowBadlyEncodedChars = componentManager.getFeature(IGNORE_BADLY_ENCODED_CHARS); |
| + } |
| + catch (XMLConfigurationException e) { |
| + fAllowBadlyEncodedChars = false; |
| + } |
| + |
| // xerces properties |
| fSymbolTable = (SymbolTable)componentManager.getProperty(SYMBOL_TABLE); |
| fErrorReporter = (XMLErrorReporter)componentManager.getProperty(ERROR_REPORTER); |
| @@ -2082,6 +2101,33 @@
|
| protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian) |
| throws IOException { |
| |
| + Reader internalReader = createInternalReader(inputStream, encoding, isBigEndian); |
| + if (fAllowBadlyEncodedChars) |
| + { |
| + // Wrap the reader so that bad characters are ignored rather than causing aborts |
| + return new LaxReader(internalReader); |
| + } |
| + return internalReader; |
| + } |
| + |
| + /** |
| + * Creates a reader capable of reading the given input stream in |
| + * the specified encoding. |
| + * |
| + * @param inputStream The input stream. |
| + * @param encoding The encoding name that the input stream is |
| + * encoded using. If the user has specified that |
| + * Java encoding names are allowed, then the |
| + * encoding name may be a Java encoding name; |
| + * otherwise, it is an ianaEncoding name. |
| + * @param isBigEndian For encodings (like uCS-4), whose names cannot |
| + * specify a byte order, this tells whether the order is bigEndian. null menas |
| + * unknown or not relevant. |
| + * |
| + * @return Returns a reader. |
| + */ |
| + protected Reader createInternalReader(InputStream inputStream, String encoding, Boolean isBigEndian) |
| + throws IOException { |
| // if the encoding is UTF-8 use the optimized UTF-8 reader |
| if (encoding == "UTF-8" || encoding == null) { |
| if (DEBUG_ENCODINGS) { |
| @@ -3025,6 +3071,9 @@
|
| return -1; |
| } |
| if (fOffset == fData.length) { |
| + if (fCurrentEntity.mayReadChunks) { |
| + return fInputStream.read(); |
| + } |
| byte[] newData = new byte[fOffset << 1]; |
| System.arraycopy(fData, 0, newData, 0, fOffset); |
| fData = newData; |
| @@ -3138,4 +3187,105 @@
|
| } |
| } // end of RewindableInputStream class |
| |
| + protected static class LaxReader extends Reader |
| + { |
| + protected Reader internalReader; |
| + |
| + public LaxReader(Reader internalReader) |
| + { |
| + this.internalReader = internalReader; |
| + } |
| + |
| + public int read() |
| + throws IOException |
| + { |
| + // Since we need to be able to skip ahead at the point of error, and not drop huge amounts on the floor, |
| + // all read operations for this class are channeled through the single-character operation. This is less |
| + // efficient, but hopefully not terribly so. |
| + try |
| + { |
| + return internalReader.read(); |
| + } |
| + catch (org.apache.xerces.impl.io.MalformedByteSequenceException e) |
| + { |
| + // When this fails, it means we detected a bad character. |
| + // However, the bad character has already been pulled off the stream, so we are free to stuff in a "?" and |
| + // just keep going. |
| + return (int)'?'; |
| + } |
| + } |
| + |
| + public int read(char[] cbuf) |
| + throws IOException |
| + { |
| + return read(cbuf,0,cbuf.length); |
| + } |
| + |
| + public int read(char[] cbuf, |
| + int off, |
| + int len) |
| + throws IOException |
| + { |
| + int amtRead = 0; |
| + while (amtRead < len) |
| + { |
| + int cval = read(); |
| + if (cval == -1) |
| + { |
| + if (amtRead == 0) |
| + return -1; |
| + else |
| + return amtRead; |
| + } |
| + cbuf[off++] = (char)cval; |
| + amtRead++; |
| + } |
| + return amtRead; |
| + } |
| + |
| + public long skip(long n) |
| + throws IOException |
| + { |
| + long skipped = 0; |
| + while (skipped < n) |
| + { |
| + int cval = read(); |
| + if (cval == -1) |
| + break; |
| + skipped++; |
| + } |
| + return skipped; |
| + } |
| + |
| + public boolean ready() |
| + throws IOException |
| + { |
| + return internalReader.ready(); |
| + } |
| + |
| + public boolean markSupported() |
| + { |
| + return internalReader.markSupported(); |
| + } |
| + |
| + public void mark(int readAheadLimit) |
| + throws IOException |
| + { |
| + internalReader.mark(readAheadLimit); |
| + } |
| + |
| + public void reset() |
| + throws IOException |
| + { |
| + internalReader.reset(); |
| + } |
| + |
| + public void close() |
| + throws IOException |
| + { |
| + internalReader.close(); |
| + } |
| + } |
| + |
| + |
| } // class XMLEntityManager |
| Index: src/org/apache/xerces/impl/XMLDocumentScannerImpl.java
|
| ===================================================================
|
| --- src/org/apache/xerces/impl/XMLDocumentScannerImpl.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
|
| +++ src/org/apache/xerces/impl/XMLDocumentScannerImpl.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
|
| @@ -783,6 +783,8 @@
|
| else { |
| reportFatalError("MarkupNotRecognizedInProlog", |
| null); |
| + // Don't loop forever! |
| + fEntityScanner.scanChar(); |
| } |
| } |
| else if (isValidNameStartChar(fEntityScanner.peekChar())) { |
| @@ -802,6 +804,8 @@
|
| else { |
| reportFatalError("MarkupNotRecognizedInProlog", |
| null); |
| + // Don't loop forever! |
| + fEntityScanner.scanChar(); |
| } |
| break; |
| } |
| @@ -872,6 +876,8 @@
|
| } |
| case SCANNER_STATE_REFERENCE: { |
| reportFatalError("ReferenceIllegalInProlog", null); |
| + // Don't loop forever! |
| + fEntityScanner.scanChar(); |
| } |
| } |
| } while (complete || again); |
| @@ -1277,6 +1283,8 @@
|
| else { |
| reportFatalError("MarkupNotRecognizedInMisc", |
| null); |
| + // Skip forward one character, otherwise we loop forever. |
| + fEntityScanner.scanChar(); |
| } |
| break; |
| } |