Skip to content

Commit 2dbe07f

Browse files
Don't handle noncharacters differently than other unassigned codepoints
1 parent 0b13808 commit 2dbe07f

File tree

5 files changed

+5
-32
lines changed

5 files changed

+5
-32
lines changed

scripts/unicode.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def _load_unicode_data(self):
106106

107107
# Characters that cannot be part of a combining character sequence:
108108
# control characters, format characters other than ZWJ and ZWNJ,
109-
# the line and paragraph separators, and noncharacters.
109+
# and the line and paragraph separators.
110110
self.not_in_ccs = []
111111

112112
assigned_start = 0;
@@ -147,14 +147,6 @@ def _load_unicode_data(self):
147147

148148
self.general_category_public_assigned.append((assigned_start, prev_char_int))
149149

150-
# Mark noncharacters as nongraphic
151-
for i in range(0xFDD0, 0xFDF0):
152-
self.not_in_ccs.append(i)
153-
for prefix in range(0, 0x11):
154-
shifted = prefix << 16
155-
self.not_in_ccs.append(shifted | 0xFFFE)
156-
self.not_in_ccs.append(shifted | 0xFFFF)
157-
158150
self.not_in_ccs.sort()
159151

160152
def _load_default_ignorable_marks(self):

src/correct_ccs.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ impl CcsKind {
4141
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
4242
/// by inserting U+00A0 NO-BREAK SPACE in front of them.
4343
///
44-
/// For the purposes of this iterator, private use characters,
45-
/// as well as unassigned codepoints other than noncharacters,
44+
/// For the purposes of this iterator, private use characters and unassigned codepoints
4645
/// are considered valid base characters,
4746
/// so combining character sequences that follow them will not be modified.
4847
///

src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
139139
/// with the correct advance width,
140140
/// in diverse contexts (for example, when printed to a terminal).
141141
///
142-
/// Sequences following a private use character or an unassigned codepoint that is not a noncharacter
142+
/// Sequences following a private use character or an unassigned codepoint
143143
/// are not corrected. Additionally, combining character sequences consisting entirely of
144144
/// [default-ignorable code points](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I8.1.40715)
145145
/// are also left untouched. Handling this last case may require the iterator

src/tables.rs

+1-19
Original file line numberDiff line numberDiff line change
@@ -20973,33 +20973,15 @@ pub fn not_in_ccs(c: char) -> bool {
2097320973
| '\u{2028}'..='\u{202E}'
2097420974
| '\u{2060}'..='\u{2064}'
2097520975
| '\u{2066}'..='\u{206F}'
20976-
| '\u{FDD0}'..='\u{FDEF}'
2097720976
| '\u{FEFF}'
2097820977
| '\u{FFF9}'..='\u{FFFB}'
20979-
| '\u{FFFE}'..='\u{FFFF}'
2098020978
| '\u{110BD}'
2098120979
| '\u{110CD}'
2098220980
| '\u{13430}'..='\u{1343F}'
2098320981
| '\u{1BCA0}'..='\u{1BCA3}'
2098420982
| '\u{1D173}'..='\u{1D17A}'
20985-
| '\u{1FFFE}'..='\u{1FFFF}'
20986-
| '\u{2FFFE}'..='\u{2FFFF}'
20987-
| '\u{3FFFE}'..='\u{3FFFF}'
20988-
| '\u{4FFFE}'..='\u{4FFFF}'
20989-
| '\u{5FFFE}'..='\u{5FFFF}'
20990-
| '\u{6FFFE}'..='\u{6FFFF}'
20991-
| '\u{7FFFE}'..='\u{7FFFF}'
20992-
| '\u{8FFFE}'..='\u{8FFFF}'
20993-
| '\u{9FFFE}'..='\u{9FFFF}'
20994-
| '\u{AFFFE}'..='\u{AFFFF}'
20995-
| '\u{BFFFE}'..='\u{BFFFF}'
20996-
| '\u{CFFFE}'..='\u{CFFFF}'
20997-
| '\u{DFFFE}'..='\u{DFFFF}'
2099820983
| '\u{E0001}'
20999-
| '\u{E0020}'..='\u{E007F}'
21000-
| '\u{EFFFE}'..='\u{EFFFF}'
21001-
| '\u{FFFFE}'..='\u{FFFFF}'
21002-
| '\u{10FFFE}'..='\u{10FFFF}' => true,
20984+
| '\u{E0020}'..='\u{E007F}' => true,
2100320985
_ => false,
2100420986
}
2100520987
}

tests/correct_defective_ccs.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ fn defective_css() {
1818
check_ccs!("\u{200C}\u{0301}bcde", "\u{00A0}\u{200C}\u{0301}bcde");
1919
check_ccs!("\u{200C}bcde", "\u{200C}bcde");
2020
check_ccs!("\u{180F}bcde", "\u{180F}bcde");
21-
check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{00A0}\u{0301}bcde");
21+
check_ccs!("\u{FFFF}\u{0301}bcde", "\u{FFFF}\u{0301}bcde");
2222
check_ccs!("\u{10FFFD}\u{0301}bcde", "\u{10FFFD}\u{0301}bcde");
2323
check_ccs!("\u{180F}\u{180F}\u{180F}", "\u{180F}\u{180F}\u{180F}");
2424
check_ccs!("\u{180F}\u{180F}\u{180F}a", "\u{180F}\u{180F}\u{180F}a");

0 commit comments

Comments
 (0)