Skip to content

Commit d4decae

Browse files
Add API to correct defective combining character sequences
1 parent ac8fa20 commit d4decae

File tree

5 files changed

+413
-1
lines changed

5 files changed

+413
-1
lines changed

scripts/unicode.py

+114-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# Since this should not require frequent updates, we just store this
2020
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
2121
import collections
22+
import re
2223
import urllib.request
2324
from itertools import batched
2425

@@ -67,6 +68,8 @@
6768
class UnicodeData(object):
6869
def __init__(self):
6970
self._load_unicode_data()
71+
self._load_default_ignorable_marks()
72+
7073
self.norm_props = self._load_norm_props()
7174
self.norm_tests = self._load_norm_tests()
7275

@@ -101,6 +104,11 @@ def _load_unicode_data(self):
101104
self.general_category_mark = []
102105
self.general_category_public_assigned = []
103106

107+
# Characters that cannot be part of a combining character sequence:
108+
# control characters, format characters other than ZWJ and ZWNJ,
109+
# the line and paragraph separators, and noncharacters.
110+
self.not_in_ccs = []
111+
104112
assigned_start = 0;
105113
prev_char_int = -1;
106114
prev_name = "";
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
126134
if category == 'M' or 'M' in expanded_categories.get(category, []):
127135
self.general_category_mark.append(char_int)
128136

137+
if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
138+
self.not_in_ccs.append(char_int)
139+
129140
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
130141
if category not in ['Co', 'Cs']:
131142
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -136,6 +147,44 @@ def _load_unicode_data(self):
136147

137148
self.general_category_public_assigned.append((assigned_start, prev_char_int))
138149

150+
# Noncharacters cannot be part of a combining character sequence either
151+
for i in range(0xFDD0, 0xFDF0):
152+
self.not_in_ccs.append(i)
153+
for prefix in range(0, 0x11):
154+
shifted = prefix << 16
155+
self.not_in_ccs.append(shifted | 0xFFFE)
156+
self.not_in_ccs.append(shifted | 0xFFFF)
157+
158+
self.not_in_ccs.sort()
159+
160+
def _load_default_ignorable_marks(self):
161+
default_ignorable_cps = set()
162+
163+
single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
164+
multiple = re.compile(
165+
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
166+
)
167+
168+
for line in self._fetch("DerivedCoreProperties.txt").splitlines():
169+
raw_data = None # (low, high)
170+
if match := single.match(line):
171+
raw_data = (match.group(1), match.group(1))
172+
elif match := multiple.match(line):
173+
raw_data = (match.group(1), match.group(2))
174+
else:
175+
continue
176+
low = int(raw_data[0], 16)
177+
high = int(raw_data[1], 16)
178+
for cp in range(low, high + 1):
179+
default_ignorable_cps.add(cp)
180+
181+
self.default_ignorable_marks = []
182+
for cp in self.general_category_mark:
183+
if cp in default_ignorable_cps:
184+
self.default_ignorable_marks.append(cp)
185+
186+
self.default_ignorable_marks.sort()
187+
139188
def _load_cjk_compat_ideograph_variants(self):
140189
for line in self._fetch("StandardizedVariants.txt").splitlines():
141190
strip_comments = line.split('#', 1)[0].strip()
@@ -461,7 +510,7 @@ def gen_combining_mark(general_category_mark, out):
461510

462511
def gen_public_assigned(general_category_public_assigned, out):
463512
# This could be done as a hash but the table is somewhat small.
464-
out.write("#[inline]\n")
513+
out.write("\n#[inline]\n")
465514
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
466515
out.write(" match c {\n")
467516

@@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out):
482531
out.write(" }\n")
483532
out.write("}\n")
484533

534+
def _gen_char_predicate(fn_name, codepoints, out):
    """Write a Rust `pub fn <fn_name>(c: char) -> bool` to `out`.

    The generated function is a single `match` that returns `true` for every
    code point in `codepoints` and `false` otherwise. `codepoints` is
    expected in ascending order: runs of consecutive values are collapsed
    into inclusive `first..=last` range patterns. An empty list produces a
    function that only has the `_ => false` arm.
    """
    # List of codepoints to list of inclusive (first, last) ranges
    range_list = []
    for cp in codepoints:
        if range_list and range_list[-1][1] == cp - 1:
            range_list[-1] = (range_list[-1][0], cp)
        else:
            range_list.append((cp, cp))

    patterns = []
    for first, last in range_list:
        if first == last:
            patterns.append("'\\u{%s}'" % hexify(first))
        else:
            patterns.append("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))

    # NOTE(review): the whitespace inside the emitted string literals is kept
    # exactly as found; confirm it matches the layout expected in tables.rs.
    out.write("\n#[inline]\n")
    out.write("pub fn %s(c: char) -> bool {\n" % fn_name)
    out.write(" match c {\n")
    if patterns:
        # Without this guard an empty table would emit a pattern-less
        # ` => true,` arm, which is not valid Rust.
        out.write(" " + "\n | ".join(patterns) + " => true,\n")
    out.write(" _ => false,\n")
    out.write(" }\n")
    out.write("}\n")


def gen_not_in_ccs(not_in_ccs, out):
    """Emit `not_in_ccs(c)`, true for the code points in the sorted list `not_in_ccs`."""
    _gen_char_predicate("not_in_ccs", not_in_ccs, out)


def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Emit `is_default_ignorable_mark(c)`, true for the sorted code points given."""
    _gen_char_predicate("is_default_ignorable_mark", default_ignorable_marks, out)
593+
485594
def gen_stream_safe(leading, trailing, out):
486595
# This could be done as a hash but the table is very small.
487596
out.write("#[inline]\n")
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602711

603712
gen_public_assigned(data.general_category_public_assigned, out)
604713

714+
gen_not_in_ccs(data.not_in_ccs, out)
715+
716+
gen_default_ignorable_mark(data.default_ignorable_marks, out)
717+
605718
gen_nfc_qc(data.norm_props, out)
606719

607720
gen_nfkc_qc(data.norm_props, out)

src/correct_ccs.rs

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#[cfg(not(feature = "std"))]
2+
use alloc::collections::VecDeque;
3+
use core::iter::FusedIterator;
4+
#[cfg(feature = "std")]
5+
use std::collections::VecDeque;
6+
7+
use crate::{lookups, tables};
8+
9+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
10+
enum CcsKind {
11+
/// A CCS base character (graphic character other than combining mark).
12+
Base,
13+
14+
/// A combining character other than a `Default_Ignorable_Code_Point`.
15+
NonIgnorableCombining,
16+
17+
/// A default-ignorable combining character, ZWJ, or ZWNJ.
18+
IgnorableCombining,
19+
}
20+
21+
impl CcsKind {
22+
fn of(c: char) -> Option<Self> {
23+
if c == '\u{200C}' || c == '\u{200D}' {
24+
// ZWNJ || ZWJ
25+
Some(CcsKind::IgnorableCombining)
26+
} else if lookups::is_combining_mark(c) {
27+
if tables::is_default_ignorable_mark(c) {
28+
Some(CcsKind::IgnorableCombining)
29+
} else {
30+
Some(CcsKind::NonIgnorableCombining)
31+
}
32+
} else if tables::not_in_ccs(c) {
33+
None
34+
} else {
35+
Some(CcsKind::Base)
36+
}
37+
}
38+
}
39+
40+
/// Iterator adaptor that repairs
/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
/// by emitting U+00A0 NO-BREAK SPACE immediately before each of them.
///
/// Private use characters, and unassigned code points other than noncharacters,
/// count as valid base characters here, so a combining character sequence
/// that begins with one of them is passed through unchanged.
///
/// A combining character sequence made up solely of `Default_Ignorable_Code_Point`s
/// is also left unchanged. Deciding that requires look-ahead, so this iterator may
/// buffer up to the entire length of its input; it is *not* "stream-safe",
/// *even when* combined with [`StreamSafe`][crate::StreamSafe].
#[derive(Clone, Debug)]
pub struct CorrectDefectiveCcs<I> {
    /// Whether the most recently emitted character was part of a CCS.
    in_ccs: bool,
    /// Items already pulled from `iter` but not yet emitted.
    buffer: VecDeque<Option<char>>,
    /// Whether the last character in `buffer` is part of a CCS.
    /// (Updated only when `in_ccs` is set from false to true.)
    end_of_buffer_in_ccs: bool,
    /// The wrapped character iterator.
    iter: I,
}
62+
63+
impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
64+
type Item = char;
65+
66+
fn next(&mut self) -> Option<Self::Item> {
67+
if self.in_ccs {
68+
if let Some(c) = self.buffer.pop_front() {
69+
// Empty buffer
70+
71+
if self.buffer.is_empty() {
72+
self.in_ccs = self.end_of_buffer_in_ccs;
73+
}
74+
c
75+
} else {
76+
// Forward from inner iterator
77+
78+
let c = self.iter.next();
79+
if c.map_or(true, tables::not_in_ccs) {
80+
self.in_ccs = false;
81+
}
82+
c
83+
}
84+
} else {
85+
if self.buffer.is_empty() {
86+
// We don't have a buffer of default ignorable combining characters built up
87+
88+
let c = self.iter.next()?;
89+
match CcsKind::of(c) {
90+
// Character not in CCS, just forward it
91+
None => return Some(c),
92+
93+
// Character starts non-defective CCS,
94+
// label ourselves as in CCS and forward it
95+
Some(CcsKind::Base) => {
96+
self.in_ccs = true;
97+
return Some(c);
98+
}
99+
100+
// Character starts defective CCS and is not default-ignorable.
101+
// Put it in the buffer to emit on next iteration,
102+
// mark ourselves as in CCS,
103+
// and emit NO-BREAK SPACE
104+
Some(CcsKind::NonIgnorableCombining) => {
105+
self.in_ccs = true;
106+
self.end_of_buffer_in_ccs = true;
107+
self.buffer.push_back(Some(c));
108+
return Some('\u{00A0}'); // NO-BREAK SPACE
109+
}
110+
111+
// Character starts defective CCS and is default-ignorable.
112+
// Put it in the buffer, and fall through to loop below
113+
// to find out whether we emit a NO-BREAK SPACE first.
114+
Some(CcsKind::IgnorableCombining) => {
115+
self.buffer.push_back(Some(c));
116+
}
117+
}
118+
}
119+
120+
loop {
121+
// We do have a buffer of default ignorable combining characters built up,
122+
// and we need to figure out whether to emit a NO-BREAK SPACE first.
123+
124+
let c = self.iter.next();
125+
match c.and_then(CcsKind::of) {
126+
// Inner iterator yielded character outside CCS (or `None`).
127+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
128+
None => {
129+
self.in_ccs = true;
130+
self.end_of_buffer_in_ccs = false;
131+
let ret = self.buffer.pop_front().unwrap();
132+
self.buffer.push_back(c);
133+
return ret;
134+
}
135+
136+
// Inner iterator yielded character that starts a new CCS.
137+
// Emit the built-up buffer with no leading NO-BREAK SPACE.
138+
Some(CcsKind::Base) => {
139+
self.in_ccs = true;
140+
self.end_of_buffer_in_ccs = true;
141+
let ret = self.buffer.pop_front().unwrap();
142+
self.buffer.push_back(c);
143+
return ret;
144+
}
145+
146+
// Inner iterator yielded non-ignorable combining character.
147+
// Emit the built-up buffer with leading NO-BREAK SPACE.
148+
Some(CcsKind::NonIgnorableCombining) => {
149+
self.in_ccs = true;
150+
self.end_of_buffer_in_ccs = true;
151+
self.buffer.push_back(c);
152+
return Some('\u{00A0}'); // NO-BREAK SPACE
153+
}
154+
155+
// Inner iterator yielded ignorable combining character.
156+
// Add it to the buffer, don't emit anything.
157+
Some(CcsKind::IgnorableCombining) => {
158+
self.buffer.push_back(c);
159+
}
160+
}
161+
}
162+
}
163+
}
164+
}
165+
166+
// Marker impl: available only when the wrapped iterator is itself fused.
impl<I> FusedIterator for CorrectDefectiveCcs<I> where I: Iterator<Item = char> + FusedIterator {}
167+
168+
impl<I> CorrectDefectiveCcs<I> {
169+
pub(crate) fn new(iter: I) -> Self {
170+
Self {
171+
in_ccs: false,
172+
buffer: VecDeque::new(),
173+
end_of_buffer_in_ccs: false,
174+
iter,
175+
}
176+
}
177+
}

0 commit comments

Comments
 (0)