[XERCESC-2180] Remove the assertion that fired when a surrogate pair is split across
the boundary of an input buffer (transcoders try to avoid this, but the UTF-16 transcoder
doesn't have this check in place). The reader now pulls in more data on demand.
git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@1871620 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/xercesc/internal/XMLReader.cpp b/src/xercesc/internal/XMLReader.cpp
index befe51c..405474a 100644
--- a/src/xercesc/internal/XMLReader.cpp
+++ b/src/xercesc/internal/XMLReader.cpp
@@ -646,11 +646,16 @@
if (!token)
{
if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
- // make sure one more char is in the buffer, the transcoder
- // should put only a complete surrogate pair into the buffer
- assert(fCharIndex+1 < fCharsAvail);
- if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
- return false;
+ // if there isn't one more char in the buffer, read more data
+ if (fCharIndex+1 == fCharsAvail)
+ {
+ if (!refreshCharBuffer())
+ return false;
+ // reset the start buffer to the new location of the cursor
+ charIndex_start = fCharIndex;
+ }
+ if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
+ return false;
// Looks ok, so lets eat it
fCharIndex += 2;
@@ -675,9 +680,21 @@
// break out.
if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
{
- // make sure one more char is in the buffer, the transcoder
- // should put only a complete surrogate pair into the buffer
- assert(fCharIndex+1 < fCharsAvail);
+ // if there isn't one more char in the buffer, read more data
+ if (fCharIndex+1 == fCharsAvail)
+ {
+ // but first copy the accepted character(s), and update column
+ if (fCharIndex != charIndex_start)
+ {
+ fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+ toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
+ }
+
+ if (!refreshCharBuffer())
+ break;
+
+ charIndex_start = fCharIndex;
+ }
if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
(fCharBuf[fCharIndex+1] > 0xDFFF) )
break;
@@ -721,9 +738,14 @@
// what's the point in living mannnn? Just give up now. We only do this
// if its a name and not a name token that they want.
if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
- // make sure one more char is in the buffer, the transcoder
- // should put only a complete surrogate pair into the buffer
- assert(fCharIndex+1 < fCharsAvail);
+ // if there isn't one more char in the buffer, read more data
+ if (fCharIndex+1 == fCharsAvail)
+ {
+ if (!refreshCharBuffer())
+ return false;
+ // reset the start buffer to the new location of the cursor
+ charIndex_start = fCharIndex;
+ }
if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
return false;
@@ -758,7 +780,28 @@
// Check the current char and take it if it's a name char
while(fCharIndex < fCharsAvail)
{
- if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && fCharIndex+1 < fCharsAvail && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2;
+ if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))
+ {
+ // if there isn't one more char in the buffer, read more data
+ if (fCharIndex+1 == fCharsAvail)
+ {
+ // but first copy the accepted character(s), and update column
+ if (fCharIndex != charIndex_start)
+ {
+ fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+ toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
+ }
+
+ if (!refreshCharBuffer())
+ break;
+
+ charIndex_start = fCharIndex;
+ }
+ if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
+ (fCharBuf[fCharIndex+1] > 0xDFFF) )
+ break;
+ fCharIndex += 2;
+ }
else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
else break;
}
diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml
new file mode 100644
index 0000000..a8de93b
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml
Binary files differ
diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml
new file mode 100644
index 0000000..e46a9fd
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml
Binary files differ
diff --git a/tests/src/XSTSHarness/regression/XercesXML.testSet b/tests/src/XSTSHarness/regression/XercesXML.testSet
new file mode 100644
index 0000000..a7a2427
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XercesXML.testSet
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<TESTSUITE>
+ <TESTCASES xml:base="XERCESC-2180">
+ <!-- https://issues.apache.org/jira/browse/XERCESC-2180: Assertion when scanner splits a surrogate pair across two separate buffers -->
+ <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash.xml"/>
+ <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash2.xml"/>
+ </TESTCASES>
+</TESTSUITE>
\ No newline at end of file