apache · zhongyujiang · Sep 18, 2024 · Sep 19, 2024 · RussellSpitzer · Sep 19, 2024
diff --git a/api/src/main/java/org/apache/iceberg/types/Comparators.java b/api/src/main/java/org/apache/iceberg/types/Comparators.java
@@ -321,9 +321,9 @@ private CharSeqComparator() {}
      * represented using two Java characters (using UTF-16 surrogate pairs). Character by character
      * comparison may yield incorrect results while comparing a 4 byte UTF-8 character to a java
      * char. Character by character comparison works as expected if both characters are <= 3 byte
-     * UTF-8 character or both characters are 4 byte UTF-8 characters.
-     * isCharInUTF16HighSurrogateRange method detects a 4-byte character and considers that
-     * character to be lexicographically greater than any 3 byte or lower UTF-8 character.
+     * UTF-8 character or both characters are 4 byte UTF-8 characters. The isCharHighSurrogate
+     * method detects a high surrogate (the first Java char of a 4 byte UTF-8 character) and
+     * considers it to be lexicographically greater than any 3 byte or lower UTF-8 character.
      */
     @Override
     public int compare(CharSequence s1, CharSequence s2) {

diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -82,9 +82,9 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu
     for (int i = length - 1; i >= 0; i--) {
       // Get the offset in the truncated string buffer where the number of unicode characters = i
       int offsetByCodePoint = truncatedStringBuilder.offsetByCodePoints(0, i);
-      int nextCodePoint = truncatedStringBuilder.codePointAt(offsetByCodePoint) + 1;
+      int nextCodePoint = incrementCodePoint(truncatedStringBuilder.codePointAt(offsetByCodePoint));
       // No overflow
-      if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
+      if (nextCodePoint != 0) {
         truncatedStringBuilder.setLength(offsetByCodePoint);
         // Append next code point to the truncated substring
         truncatedStringBuilder.appendCodePoint(nextCodePoint);
@@ -93,4 +93,24 @@ public static Literal<CharSequence> truncateStringMax(Literal<CharSequence> inpu
     }
     return null; // Cannot find a valid upper bound
   }
+
+  /** Returns the Unicode scalar value following {@code codePoint}, or 0 on overflow. */
+  private static int incrementCodePoint(int codePoint) {
+    // Surrogates (U+D800..U+DFFF) are not Unicode scalar values and any UTF-8 byte sequence
+    // that would map to them is ill-formed, so a surrogate input is rejected outright.
+    // see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27288
+    boolean isSurrogate =
+        Character.MIN_SURROGATE <= codePoint && codePoint <= Character.MAX_SURROGATE;
+    Preconditions.checkArgument(!isSurrogate, "invalid code point: %s", codePoint);
+
+    if (codePoint == Character.MAX_CODE_POINT) {
+      // overflow: nothing can follow the maximum code point, signal it with 0
+      return 0;
+    }
+    if (codePoint == Character.MIN_SURROGATE - 1) {
+      // jump over the surrogate range to the next Unicode scalar value
+      return Character.MAX_SURROGATE + 1;
+    }
+    return codePoint + 1;
+  }
 }
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -274,4 +274,17 @@ public void testTruncateStringMax() {
             "Test input with multiple 4 byte UTF-8 character where the first unicode character should be incremented")
         .isEqualTo(0);
   }
+
+  @Test
+  public void testTruncateStringMaxUpperBound() {
+    // The 16th char is U+D7FF; incrementing it must skip the surrogate range and yield U+E000.
+    String max = "abcdefghigklmno" + (char) (Character.MIN_SURROGATE - 1) + "p";
+    String expectedUpper = "abcdefghigklmno" + (char) (Character.MAX_SURROGATE + 1);
+    Comparator<CharSequence> cmp = Literal.of(max).comparator();
+    CharSequence truncatedUpper = truncateStringMax(Literal.of(max), 16).value();
+    assertThat(cmp.compare(truncatedUpper, max))
+        .as("Truncated upper bound should be greater than the input max")
+        .isGreaterThan(0);
+    assertThat(truncatedUpper).usingComparator(cmp).isEqualTo(expectedUpper);
+  }
 }