OPENNLP-1268 -- fix StringUtil.toLowerCase() to work on codepoints, not chars (#356)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
index c3bd7e6..88f0fa6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/StringUtil.java
@@ -66,20 +66,15 @@
/**
* Converts to lower case independent of the current locale via
- * {@link Character#toLowerCase(char)} which uses mapping information
+ * {@link Character#toLowerCase(int)} which uses mapping information
* from the UnicodeData file.
*
* @param string
* @return lower cased String
*/
public static String toLowerCase(CharSequence string) {
- char[] lowerCaseChars = new char[string.length()];
-
- for (int i = 0; i < string.length(); i++) {
- lowerCaseChars[i] = Character.toLowerCase(string.charAt(i));
- }
-
- return new String(lowerCaseChars);
+ int[] cp = string.codePoints().map(Character::toLowerCase).toArray();
+ return new String(cp, 0, cp.length);
}
/**
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/StringUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/StringUtilTest.java
index f2cc41f..4aa0b59 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/StringUtilTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/StringUtilTest.java
@@ -60,4 +60,12 @@
StringUtil.isEmpty(null);
}
+ @Test
+ public void testLowercaseBeyondBMP() throws Exception {
+ int[] codePoints = new int[]{65,66578,67}; //A,Deseret capital BEE,C
+ int[] expectedCodePoints = new int[]{97,66618,99};//a,Deseret small BEE,c
+ String input = new String(codePoints, 0, codePoints.length);
+ String lc = StringUtil.toLowerCase(input);
+ Assert.assertArrayEquals(expectedCodePoints, lc.codePoints().toArray());
+ }
}