Skip to content

Commit e77b292

Browse files
Make characters with Line_Break=Ambiguous ambiguous (#61)
1 parent 5a7fced commit e77b292

File tree

4 files changed

+39
-15
lines changed

4 files changed

+39
-15
lines changed

scripts/unicode.py

+12
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,19 @@
1515
# - DerivedCoreProperties.txt
1616
# - EastAsianWidth.txt
1717
# - HangulSyllableType.txt
18+
# - LineBreak.txt
1819
# - NormalizationTest.txt (for tests only)
1920
# - PropList.txt
2021
# - ReadMe.txt
2122
# - UnicodeData.txt
2223
# - auxiliary/GraphemeBreakProperty.txt
2324
# - emoji/emoji-data.txt
25+
# - emoji/emoji-test.txt (for tests only)
2426
# - emoji/emoji-variation-sequences.txt
27+
# - extracted/DerivedCombiningClass.txt
2528
# - extracted/DerivedGeneralCategory.txt
29+
# - extracted/DerivedJoiningGroup.txt
30+
# - extracted/DerivedJoiningType.txt
2631
#
2732
# Since this should not require frequent updates, we just store this
2833
# out-of-line and check the generated module into git.
@@ -429,6 +434,13 @@ def load_east_asian_widths() -> list[EastAsianWidth]:
429434
# Catch any leftover codepoints and assign them implicit Neutral/narrow width.
430435
width_map.append(EastAsianWidth.NARROW)
431436

437+
# Characters with Line_Break=AI (Ambiguous) are treated as East Asian Width Ambiguous
438+
load_property(
439+
"LineBreak.txt",
440+
"AI",
441+
lambda cp: (operator.setitem(width_map, cp, EastAsianWidth.AMBIGUOUS)),
442+
)
443+
432444
# Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
433445
load_property(
434446
"extracted/DerivedGeneralCategory.txt",

src/lib.rs

+8-3
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,11 @@
119119
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
120120
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
121121
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
122-
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
123-
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
124-
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
122+
//! - Fulfills one of the following conditions:
123+
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
124+
//! - Has a [`Line_Break`] of [`AI`], or
125+
//! - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
126+
//! - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and
125127
//! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`.
126128
//! 7. All other characters have width 1.
127129
//!
@@ -138,13 +140,16 @@
138140
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
139141
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
140142
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
143+
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
141144
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
142145
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
143146
//!
144147
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
145148
//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
146149
//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
147150
//!
151+
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
152+
//!
148153
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
149154
//!
150155
//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence

src/tables.rs

+12-12
Original file line numberDiff line numberDiff line change
@@ -1030,8 +1030,8 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([
10301030
],
10311031
#[cfg(feature = "cjk")]
10321032
[
1033-
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD,
1034-
0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
1033+
0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0x39, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
1034+
0xAD, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38,
10351035
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10361036
0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
10371037
0x39, 0x39, 0x39, 0x39,
@@ -1878,7 +1878,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
18781878
#[cfg(feature = "cjk")]
18791879
[
18801880
0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1881-
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
1881+
0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x5A, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA,
18821882
0x5A, 0x55,
18831883
],
18841884
#[cfg(feature = "cjk")]
@@ -1914,13 +1914,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19141914
#[cfg(feature = "cjk")]
19151915
[
19161916
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
1917-
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA,
1918-
0xAA, 0xAA,
1919-
],
1920-
#[cfg(feature = "cjk")]
1921-
[
1922-
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
1923-
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55,
1917+
0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56,
19241918
0x55, 0x55,
19251919
],
19261920
#[cfg(feature = "cjk")]
@@ -1931,7 +1925,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19311925
],
19321926
#[cfg(feature = "cjk")]
19331927
[
1934-
0x55, 0x69, 0x59, 0xA5, 0x55, 0x5F, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1928+
0x55, 0x69, 0x59, 0xA5, 0x55, 0xAF, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
19351929
0x55, 0x66, 0x55, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55, 0x9A, 0x9A, 0x6A, 0x9A, 0x55, 0x55,
19361930
0x55, 0xD5,
19371931
],
@@ -1948,6 +1942,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19481942
0xAA, 0xAA,
19491943
],
19501944
#[cfg(feature = "cjk")]
1945+
[
1946+
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xFD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55,
1947+
0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1948+
0x55, 0x55,
1949+
],
1950+
#[cfg(feature = "cjk")]
19511951
[
19521952
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
19531953
0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0xAD, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
@@ -1973,7 +1973,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19731973
],
19741974
#[cfg(feature = "cjk")]
19751975
[
1976-
0xAA, 0xAA, 0x6A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
1976+
0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA,
19771977
0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA,
19781978
0xAA, 0xAA,
19791979
],

tests/tests.rs

+7
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,13 @@ fn emoji_test_file() {
588588
}
589589
}
590590

591+
#[test]
592+
fn ambiguous_line_break() {
593+
assert_width!("\u{24EA}", 1, 2);
594+
assert_width!("\u{2616}", 1, 2);
595+
assert_width!("\u{2780}", 1, 2);
596+
}
597+
591598
// Test traits are unsealed
592599

593600
#[cfg(feature = "cjk")]

0 commit comments

Comments
 (0)