Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273)
An optimization introduced in commit
f382d61a03778ccf838c6c051bd8692e4834dec2
incorrectly assumed that letters and numbers/digits are ASCII.
For example, `RandomStringUtils.random(1, 0x4e00, 0x4e01, true, false)`
would take too long to terminate and would not output the correct
answer, that is the string with a single character `0x4e00`.
The issue is that `end` would be replaced by `'z' + 1 = 123` (the latest
ASCII letter + 1) there: https://github.com/apache/commons-lang/blob/f382d61a03778ccf838c6c051bd8692e4834dec2/src/main/java/org/apache/commons/lang3/RandomStringUtils.java#L266-L270
Thus, we would have `end < start` and `gap = end - start < 0`.
This PR fixes this issue by restricting the optimization to cases
where only ASCII characters are requested, that is `end < 0x7f`.
This PR also slightly clarifies the code, using constant variables
instead of '0', '9', 'A', and 'z'.
The PR also includes two regression tests.
Co-authored-by: Fabrice Benhamouda <yfabrice@amazon.com>
diff --git a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
index ca2f4a1..573f9ae 100644
--- a/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/RandomStringUtils.java
@@ -277,45 +277,50 @@ public static String random(int count, int start, int end, final boolean letters
end = Character.MAX_CODE_POINT;
}
- // Optimize generation of full alphanumerical characters
- // Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
- // In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
- // Instead we can pick directly from the right set of 62 characters, which requires
- // picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
- if (chars == null && letters && numbers && start <= '0' && end >= 'z' + 1) {
- return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
- }
+ // Optimizations and tests when chars == null and using ASCII characters (end <= 0x7f)
+ if (chars == null && end <= 0x7f) {
+ final int zeroDigitAscii = 48; // '0'
+ final int lastDigitAscii = 57; // '9'
+ final int firstLetterAscii = 65; // 'A'
+ final int lastLetterAscii = 122; // 'z'
- // Optimize start and end when filtering by letters and/or numbers:
- // The range provided may be too large since we filter anyway afterward.
- // Note the use of Math.min/max (as opposed to setting start to '0' for example),
- // since it is possible the range start/end excludes some of the letters/numbers,
- // e.g., it is possible that start already is '1' when numbers = true, and start
- // needs to stay equal to '1' in that case.
- if (chars == null) {
+ // Optimize generation of full alphanumerical characters
+ // Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
+ // In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
+ // Instead we can pick directly from the right set of 62 characters, which requires
+ // picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
+ if (letters && numbers && start <= zeroDigitAscii && end >= lastLetterAscii + 1) {
+ return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
+ }
+
+ if (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii) {
+ throw new IllegalArgumentException(
+ "Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
+ + "or greater then (" + firstLetterAscii + ") for generating letters.");
+ }
+
+ // Optimize start and end when filtering by letters and/or numbers:
+ // The range provided may be too large since we filter anyway afterward.
+ // Note the use of Math.min/max (as opposed to setting start to '0' for example),
+ // since it is possible the range start/end excludes some of the letters/numbers,
+ // e.g., it is possible that start already is '1' when numbers = true, and start
+ // needs to stay equal to '1' in that case.
+ // Note that because of the above test, we will always have start < end
+ // even after this optimization.
if (letters && numbers) {
- start = Math.max('0', start);
- end = Math.min('z' + 1, end);
+ start = Math.max(zeroDigitAscii, start);
+ end = Math.min(lastLetterAscii + 1, end);
} else if (numbers) {
// just numbers, no letters
- start = Math.max('0', start);
- end = Math.min('9' + 1, end);
+ start = Math.max(zeroDigitAscii, start);
+ end = Math.min(lastDigitAscii + 1, end);
} else if (letters) {
// just letters, no numbers
- start = Math.max('A', start);
- end = Math.min('z' + 1, end);
+ start = Math.max(firstLetterAscii, start);
+ end = Math.min(lastLetterAscii + 1, end);
}
}
- final int zeroDigitAscii = 48;
- final int firstLetterAscii = 65;
-
- if (chars == null && (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii)) {
- throw new IllegalArgumentException(
- "Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
- + "or greater then (" + firstLetterAscii + ") for generating letters.");
- }
-
final StringBuilder builder = new StringBuilder(count);
final int gap = end - start;
final int gapBits = Integer.SIZE - Integer.numberOfLeadingZeros(gap);
diff --git a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
index 4250f7b..2fb68e2 100644
--- a/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/RandomStringUtilsTest.java
@@ -732,4 +732,74 @@ public void testRandomWithChars(final RandomStringUtils rsu) {
assertNotEquals(r1, r3);
assertNotEquals(r2, r3);
}
+
+ /**
+ * Test {@code RandomStringUtils.random} works appropriately when letters=true
+ * and the range does not only include ASCII letters.
+ * Fails with probability less than 2^-40 (in practice this never happens).
+ */
+ @ParameterizedTest
+ @MethodSource("randomProvider")
+ void testNonASCIILetters(final RandomStringUtils rsu) {
+ // Check that the following create a string with 10 characters 0x4e00 (a non-ASCII letter)
+ String r1 = rsu.next(10, 0x4e00, 0x4e01, true, false);
+ assertEquals(10, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
+ }
+
+ // Same with both letters=true and numbers=true
+ r1 = rsu.next(10, 0x4e00, 0x4e01, true, true);
+ assertEquals(10, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
+ }
+
+ // Check that at least one letter is not ASCII
+ boolean found = false;
+ r1 = rsu.next(40, 'F', 0x3000, true, false);
+ assertEquals(40, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertTrue(Character.isLetter(r1.charAt(i)), "characters not all letters");
+ if (r1.charAt(i) > 0x7f) {
+ found = true;
+ }
+ }
+ assertTrue(found, "no non-ASCII letter generated");
+ }
+
+ /**
+ * Test {@code RandomStringUtils.random} works appropriately when numbers=true
+ * and the range does not only include ASCII numbers/digits.
+ * Fails with probability less than 2^-40 (in practice this never happens).
+ */
+ @ParameterizedTest
+ @MethodSource("randomProvider")
+ void testNonASCIINumbers(final RandomStringUtils rsu) {
+ // Check that the following create a string with 10 characters 0x0660 (a non-ASCII digit)
+ String r1 = rsu.next(10, 0x0660, 0x0661, false, true);
+ assertEquals(10, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
+ }
+
+ // Same with both letters=true and numbers=true
+ r1 = rsu.next(10, 0x0660, 0x0661, true, true);
+ assertEquals(10, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
+ }
+
+ // Check that at least one letter is not ASCII
+ boolean found = false;
+ r1 = rsu.next(40, 'F', 0x3000, false, true);
+ assertEquals(40, r1.length(), "wrong length");
+ for (int i = 0; i < r1.length(); i++) {
+ assertTrue(Character.isDigit(r1.charAt(i)), "characters not all numbers");
+ if (r1.charAt(i) > 0x7f) {
+ found = true;
+ }
+ }
+ assertTrue(found, "no non-ASCII number generated");
+ }
}