[XERCESC-2180] Remove assertion when a surrogate pair is split by the boundary of an input buffer (transcoders try to avoid this, but the UTF-16 transcoder doesn't have this check in place). The reader now pulls in more data on demand.

git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@1871620 13f79535-47bb-0310-9956-ffa450edef68

commit: 8e21670f8dd6ffa8da789e5df8bb29f5229252fc [log] [tgz]
author: Alberto Massari <amassari@apache.org> Sun Dec 15 21:18:15 2019 +0000
committer: Alberto Massari <amassari@apache.org> Sun Dec 15 21:18:15 2019 +0000
tree: 174232d282fe05acc51242ce71a455d1dea75009
parent: 87507dcce3358a8ef1334b4757c322f19941d786 [diff]
diff --git a/src/xercesc/internal/XMLReader.cpp b/src/xercesc/internal/XMLReader.cpp
index befe51c..405474a 100644
--- a/src/xercesc/internal/XMLReader.cpp
+++ b/src/xercesc/internal/XMLReader.cpp

@@ -646,11 +646,16 @@
     if (!token)
     {
         if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
-           // make sure one more char is in the buffer, the transcoder
-           // should put only a complete surrogate pair into the buffer
-           assert(fCharIndex+1 < fCharsAvail);
-           if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
-               return false;
+            // if there isn't one more char in the buffer, read more data
+            if (fCharIndex+1 == fCharsAvail)
+            {
+                if (!refreshCharBuffer())
+                    return false;
+                // reset the start buffer to the new location of the cursor
+                charIndex_start = fCharIndex;
+            }
+            if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
+                return false;
 
             // Looks ok, so lets eat it
             fCharIndex += 2;
@@ -675,9 +680,21 @@
             //  break out.
             if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
             {
-                // make sure one more char is in the buffer, the transcoder
-                // should put only a complete surrogate pair into the buffer
-                assert(fCharIndex+1 < fCharsAvail);
+                // if there isn't one more char in the buffer, read more data
+                if (fCharIndex+1 == fCharsAvail)
+                {
+                    // but first copy the accepted character(s), and update column
+                    if (fCharIndex != charIndex_start)
+                    {
+                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
+                    }
+
+                    if (!refreshCharBuffer())
+                        break;
+
+                    charIndex_start = fCharIndex;
+                }
                 if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                         (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                     break;
@@ -721,9 +738,14 @@
     //  what's the point in living mannnn? Just give up now. We only do this
     //  if its a name and not a name token that they want.
     if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
-        // make sure one more char is in the buffer, the transcoder
-        // should put only a complete surrogate pair into the buffer
-        assert(fCharIndex+1 < fCharsAvail);
+        // if there isn't one more char in the buffer, read more data
+        if (fCharIndex+1 == fCharsAvail)
+        {
+            if (!refreshCharBuffer())
+                return false;
+            // reset the start buffer to the new location of the cursor
+            charIndex_start = fCharIndex;
+        }
         if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
             return false;
 
@@ -758,7 +780,28 @@
         //  Check the current char and take it if it's a name char
         while(fCharIndex < fCharsAvail)
         {
-            if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) && fCharIndex+1 < fCharsAvail && ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))) fCharIndex+=2;
+            if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))
+            {
+                // if there isn't one more char in the buffer, read more data
+                if (fCharIndex+1 == fCharsAvail)
+                {
+                    // but first copy the accepted character(s), and update column
+                    if (fCharIndex != charIndex_start)
+                    {
+                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
+                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
+                    }
+
+                    if (!refreshCharBuffer())
+                        break;
+
+                    charIndex_start = fCharIndex;
+                }
+                if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
+                    (fCharBuf[fCharIndex+1] > 0xDFFF)  )
+                    break;
+                fCharIndex += 2;
+            }
             else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
             else break;
         }

diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml
new file mode 100644
index 0000000..a8de93b
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XERCESC-2180/crash.xml
Binary files differ

diff --git a/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml
new file mode 100644
index 0000000..e46a9fd
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XERCESC-2180/crash2.xml
Binary files differ

diff --git a/tests/src/XSTSHarness/regression/XercesXML.testSet b/tests/src/XSTSHarness/regression/XercesXML.testSet
new file mode 100644
index 0000000..a7a2427
--- /dev/null
+++ b/tests/src/XSTSHarness/regression/XercesXML.testSet

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<TESTSUITE>
+  <TESTCASES xml:base="XERCESC-2180">
+    <!-- https://issues.apache.org/jira/browse/XERCESC-2180: Assertion when scanner splits a surrogate pair across two separate buffers -->
+    <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash.xml"/>
+    <TEST ID="XERCESC-2180" TYPE="invalid" URI="crash2.xml"/>
+  </TESTCASES>
+</TESTSUITE>
\ No newline at end of file
commit	8e21670f8dd6ffa8da789e5df8bb29f5229252fc	[log] [tgz]
author	Alberto Massari <amassari@apache.org>	Sun Dec 15 21:18:15 2019 +0000
committer	Alberto Massari <amassari@apache.org>	Sun Dec 15 21:18:15 2019 +0000
tree	174232d282fe05acc51242ce71a455d1dea75009
parent	87507dcce3358a8ef1334b4757c322f19941d786 [diff]