blob: cfe1c1e8d37ebca57b29f7c37420cc28dcf7ace4 [file] [log] [blame]
Index: src/org/apache/xerces/impl/XMLScanner.java
===================================================================
--- src/org/apache/xerces/impl/XMLScanner.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
+++ src/org/apache/xerces/impl/XMLScanner.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
@@ -823,6 +823,7 @@
String entityName = fEntityScanner.scanName();
if (entityName == null) {
reportFatalError("NameRequiredInReference", null);
+ entityName = "unknown";
}
else if (entityDepth == fEntityDepth) {
fStringBuffer2.append(entityName);
@@ -1027,6 +1028,14 @@
if (XMLChar.isMarkup(c) || c == ']') {
fStringBuffer.append((char)fEntityScanner.scanChar());
}
+ else if (XMLChar.isHighSurrogate(c)) {
+ scanSurrogates(fStringBuffer);
+ }
+ else if (isInvalidLiteral(c)) {
+ reportFatalError("InvalidCharInSystemID",
+ new Object[] { Integer.toHexString(c) });
+ fEntityScanner.scanChar();
+ }
} while (fEntityScanner.scanLiteral(quote, ident) != quote);
fStringBuffer.append(ident);
ident = fStringBuffer;
Index: src/org/apache/xerces/impl/Constants.java
===================================================================
--- src/org/apache/xerces/impl/Constants.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
+++ src/org/apache/xerces/impl/Constants.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
@@ -219,6 +219,9 @@
/** Warn on undeclared element feature ("validation/warn-on-undeclared-elemdef"). */
public static final String WARN_ON_UNDECLARED_ELEMDEF_FEATURE = "validation/warn-on-undeclared-elemdef";
+
+ /** Ignore badly encoded characters feature ("ignore-badly-encoded-chars"). */
+ public static final String IGNORE_BADLY_ENCODED_CHARS = "ignore-badly-encoded-chars";
/** Warn on duplicate entity declaration feature ("warn-on-duplicate-entitydef"). */
public static final String WARN_ON_DUPLICATE_ENTITYDEF_FEATURE = "warn-on-duplicate-entitydef";
Index: src/org/apache/xerces/impl/XMLEntityManager.java
===================================================================
--- src/org/apache/xerces/impl/XMLEntityManager.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
+++ src/org/apache/xerces/impl/XMLEntityManager.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
@@ -132,6 +132,10 @@
protected static final String PARSER_SETTINGS =
Constants.XERCES_FEATURE_PREFIX + Constants.PARSER_SETTINGS;
+ /** Feature identifier: ignore badly encoded characters. */
+ protected static final String IGNORE_BADLY_ENCODED_CHARS =
+ Constants.XERCES_FEATURE_PREFIX + Constants.IGNORE_BADLY_ENCODED_CHARS;
+
// property identifiers
/** Property identifier: symbol table. */
@@ -167,7 +171,8 @@
EXTERNAL_PARAMETER_ENTITIES,
ALLOW_JAVA_ENCODINGS,
WARN_ON_DUPLICATE_ENTITYDEF,
- STANDARD_URI_CONFORMANT
+ STANDARD_URI_CONFORMANT,
+ IGNORE_BADLY_ENCODED_CHARS
};
/** Feature defaults. */
@@ -177,6 +182,7 @@
Boolean.TRUE,
Boolean.FALSE,
Boolean.FALSE,
+ Boolean.FALSE,
Boolean.FALSE
};
@@ -262,6 +268,12 @@
*/
protected boolean fStrictURI;
+ /**
+ * Allow badly encoded characters (each one is read as '?' instead of raising an error).
+ * http://apache.org/xml/features/ignore-badly-encoded-chars
+ */
+ protected boolean fAllowBadlyEncodedChars;
+
// properties
/**
@@ -1310,6 +1322,13 @@
fStrictURI = false;
}
+ try {
+ fAllowBadlyEncodedChars = componentManager.getFeature(IGNORE_BADLY_ENCODED_CHARS);
+ }
+ catch (XMLConfigurationException e) {
+ fAllowBadlyEncodedChars = false;
+ }
+
// xerces properties
fSymbolTable = (SymbolTable)componentManager.getProperty(SYMBOL_TABLE);
fErrorReporter = (XMLErrorReporter)componentManager.getProperty(ERROR_REPORTER);
@@ -2082,6 +2101,33 @@
protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
throws IOException {
+ Reader internalReader = createInternalReader(inputStream, encoding, isBigEndian); // the decoding reader itself is built by createInternalReader
+ if (fAllowBadlyEncodedChars)
+ {
+ // Wrap the reader so that a badly encoded character is returned as '?' by LaxReader rather than causing an abort
+ return new LaxReader(internalReader);
+ }
+ return internalReader; // feature is off: return the reader unwrapped
+ }
+
+ /**
+ * Creates a reader capable of reading the given input stream in
+ * the specified encoding.
+ *
+ * @param inputStream The input stream.
+ * @param encoding The encoding name that the input stream is
+ * encoded using. If the user has specified that
+ * Java encoding names are allowed, then the
+ * encoding name may be a Java encoding name;
+ * otherwise, it is an ianaEncoding name.
+ * @param isBigEndian For encodings (like UCS-4), whose names cannot
+ * specify a byte order, this tells whether the order is bigEndian. null means
+ * unknown or not relevant.
+ *
+ * @return Returns a reader.
+ */
+ protected Reader createInternalReader(InputStream inputStream, String encoding, Boolean isBigEndian)
+ throws IOException {
// if the encoding is UTF-8 use the optimized UTF-8 reader
if (encoding == "UTF-8" || encoding == null) {
if (DEBUG_ENCODINGS) {
@@ -3025,6 +3071,9 @@
return -1;
}
if (fOffset == fData.length) {
+ if (fCurrentEntity.mayReadChunks) {
+ return fInputStream.read();
+ }
byte[] newData = new byte[fOffset << 1];
System.arraycopy(fData, 0, newData, 0, fOffset);
fData = newData;
@@ -3138,4 +3187,105 @@
}
} // end of RewindableInputStream class
+ protected static class LaxReader extends Reader // Reader wrapper that yields '?' where the wrapped reader reports a malformed byte sequence
+ {
+ protected Reader internalReader;
+
+ public LaxReader(Reader internalReader)
+ {
+ this.internalReader = internalReader;
+ }
+
+ public int read()
+ throws IOException
+ {
+ // Every read operation of this class is funnelled through this single-character method, so that a decoding
+ // error costs only the character where it occurred and not a whole buffer of input. This is less
+ // efficient, but hopefully not terribly so.
+ try
+ {
+ return internalReader.read();
+ }
+ catch (org.apache.xerces.impl.io.MalformedByteSequenceException e)
+ {
+ // The wrapped reader rejected a malformed byte sequence.
+ // NOTE(review): this assumes the offending bytes were already consumed before the exception was thrown, so the
+ // next read() makes progress; confirm for each reader createInternalReader can return. A '?' stands in for them.
+ return (int)'?';
+ }
+ }
+
+ public int read(char[] cbuf)
+ throws IOException
+ {
+ return read(cbuf,0,cbuf.length); // whole-array form of the ranged read below
+ }
+
+ public int read(char[] cbuf,
+ int off,
+ int len)
+ throws IOException
+ {
+ int amtRead = 0;
+ while (amtRead < len) // loops until len chars are stored or read() reports end of stream
+ {
+ int cval = read(); // single-char read, so a malformed sequence becomes one '?'
+ if (cval == -1)
+ {
+ if (amtRead == 0) // end of stream before any char was stored
+ return -1;
+ else
+ return amtRead;
+ }
+ cbuf[off++] = (char)cval;
+ amtRead++;
+ }
+ return amtRead;
+ }
+
+ public long skip(long n)
+ throws IOException
+ {
+ long skipped = 0;
+ while (skipped < n)
+ {
+ int cval = read(); // skip by reading and discarding, through the same tolerant read()
+ if (cval == -1)
+ break;
+ skipped++;
+ }
+ return skipped; // number of chars actually discarded, which is less than n if the stream ended
+ }
+
+ public boolean ready()
+ throws IOException
+ {
+ return internalReader.ready(); // this and the remaining methods delegate directly to the wrapped reader
+ }
+
+ public boolean markSupported()
+ {
+ return internalReader.markSupported();
+ }
+
+ public void mark(int readAheadLimit)
+ throws IOException
+ {
+ internalReader.mark(readAheadLimit);
+ }
+
+ public void reset()
+ throws IOException
+ {
+ internalReader.reset();
+ }
+
+ public void close()
+ throws IOException
+ {
+ internalReader.close();
+ }
+ }
+
+
} // class XMLEntityManager
Index: src/org/apache/xerces/impl/XMLDocumentScannerImpl.java
===================================================================
--- src/org/apache/xerces/impl/XMLDocumentScannerImpl.java (.../xerces/java/tags/Xerces-J_2_9_1) (revision 575832)
+++ src/org/apache/xerces/impl/XMLDocumentScannerImpl.java (.../incubator/lcf/upstream/xerces2-j-2.9.1-mcf/trunk) (working copy)
@@ -783,6 +783,8 @@
else {
reportFatalError("MarkupNotRecognizedInProlog",
null);
+ // Skip forward one character, otherwise we loop forever on the same unrecognized markup.
+ fEntityScanner.scanChar();
}
}
else if (isValidNameStartChar(fEntityScanner.peekChar())) {
@@ -802,6 +804,8 @@
else {
reportFatalError("MarkupNotRecognizedInProlog",
null);
+ // Skip forward one character, otherwise we loop forever on the same unrecognized markup.
+ fEntityScanner.scanChar();
}
break;
}
@@ -872,6 +876,8 @@
}
case SCANNER_STATE_REFERENCE: {
reportFatalError("ReferenceIllegalInProlog", null);
+ // Skip forward one character, otherwise we loop forever on the same illegal reference.
+ fEntityScanner.scanChar();
}
}
} while (complete || again);
@@ -1277,6 +1283,8 @@
else {
reportFatalError("MarkupNotRecognizedInMisc",
null);
+ // Skip forward one character, otherwise we loop forever.
+ fEntityScanner.scanChar();
}
break;
}