PROTON-834: further UTF-8 encoder fixes
After commit c65e897 it turned out there were still some issues with
strings containing a codepoint >0xDBFF, which was being incorrectly
treated as the first half of a surrogate pair in the calculateUTF8Length
method, causing the following character to be skipped.
Fixed this and added some more test coverage.
Closes #13
(cherry picked from commit 7b9b516d445ab9e86a0313709c77218d901435b1)
diff --git a/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java b/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
index 092894d..a035e94 100644
--- a/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
+++ b/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
@@ -97,7 +97,7 @@
{
len++;
// surrogate pairs should always combine to create a code point with a 4 octet representation
- if ((c & 0xD800) == 0xD800)
+ if ((c & 0xD800) == 0xD800 && c < 0xDC00)
{
i++;
}
diff --git a/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java b/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
index 7d78f65..7a44063 100644
--- a/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
+++ b/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
@@ -140,9 +140,17 @@
UnicodeBlock.MUSICAL_SYMBOLS,
/*UnicodeBlock.EMOTICONS,*/
/*UnicodeBlock.PLAYING_CARDS,*/
+ UnicodeBlock.BOX_DRAWING,
UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A,
UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B));
+ // some additional combinations of characters that could cause problems to the encoder
+ String[] boxDrawing = getAllStringsFromUnicodeBlocks(UnicodeBlock.BOX_DRAWING).toArray(new String[0]);
+ String[] halfFullWidthForms = getAllStringsFromUnicodeBlocks(UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS).toArray(new String[0]);
+ for (int i = 0; i < halfFullWidthForms.length; i++)
+ {
+ add(halfFullWidthForms[i] + boxDrawing[i % boxDrawing.length]);
+ }
}
};
}