unicode-rs · Jules-Bertholet · Mar 4, 2024 · Mar 5, 2024 · Mar 13, 2024
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -19,6 +19,7 @@
 # Since this should not require frequent updates, we just store this
 # out-of-line and check the tables.rs and normalization_tests.rs files into git.
 import collections
+import re
 import urllib.request
 from itertools import batched
 
@@ -67,6 +68,8 @@
 class UnicodeData(object):
     def __init__(self):
         self._load_unicode_data()
+        self._load_default_ignorable_marks()
+
         self.norm_props = self._load_norm_props()
         self.norm_tests = self._load_norm_tests()
 
@@ -101,6 +104,11 @@ def _load_unicode_data(self):
         self.general_category_mark = []
         self.general_category_public_assigned = []
 
+        # Characters that cannot be part of a combining character sequence:
+        # control characters, format characters other than ZWJ and ZWNJ,
+        # and the line and paragraph separators.
+        self.not_in_ccs = []
+
         assigned_start = 0;
         prev_char_int = -1;
         prev_name = "";
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
             if category == 'M' or 'M' in expanded_categories.get(category, []):
                 self.general_category_mark.append(char_int)
 
+            if category in ['Cc', 'Cf', 'Zl', 'Zp'] and char_int not in [0x200C, 0x200D]:
+                self.not_in_ccs.append(char_int)
+
             assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
             if category not in ['Co', 'Cs']:
                 if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
@@ -136,6 +147,36 @@ def _load_unicode_data(self):
 
         self.general_category_public_assigned.append((assigned_start, prev_char_int))
 
+        self.not_in_ccs.sort()
+
+    def _load_default_ignorable_marks(self):
+        default_ignorable_cps = set()
+
+        single = re.compile(r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+")
+        multiple = re.compile(
+            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
+        )
+
+        for line in self._fetch("DerivedCoreProperties.txt").splitlines():
+            raw_data = None  # (low, high)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            for cp in range(low, high + 1):
+                default_ignorable_cps.add(cp)
+
+        self.default_ignorable_marks = []
+        for cp in self.general_category_mark:
+            if cp in default_ignorable_cps:
+                self.default_ignorable_marks.append(cp)
+
+        self.default_ignorable_marks.sort()
+
     def _load_cjk_compat_ideograph_variants(self):
         for line in self._fetch("StandardizedVariants.txt").splitlines():
             strip_comments = line.split('#', 1)[0].strip()
@@ -461,7 +502,7 @@ def gen_combining_mark(general_category_mark, out):
 
 def gen_public_assigned(general_category_public_assigned, out):
     # This could be done as a hash but the table is somewhat small.
-    out.write("#[inline]\n")
+    out.write("\n#[inline]\n")
     out.write("pub fn is_public_assigned(c: char) -> bool {\n")
     out.write("    match c {\n")
 
@@ -482,6 +523,66 @@ def gen_public_assigned(general_category_public_assigned, out):
     out.write("    }\n")
     out.write("}\n")
 
+def gen_not_in_ccs(not_in_ccs, out):
+    # List of codepoints to list of ranges
+    range_list = []
+    for cp in not_in_ccs:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
+def gen_default_ignorable_mark(default_ignorable_marks, out):
+    # List of codepoints to list of ranges
+    range_list = []
+    for cp in default_ignorable_marks:
+        if len(range_list) != 0 and range_list[-1][1] == cp - 1:
+            range_list[-1] = (range_list[-1][0], cp)
+        else:
+            range_list.append((cp, cp))
+
+    out.write("\n#[inline]\n")
+    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
+    out.write("    match c {\n")
+
+    start = True
+    for first, last in range_list:
+        if start:
+            out.write("        ")
+            start = False
+        else:
+            out.write("\n        | ")
+        if first == last:
+            out.write("'\\u{%s}'" % hexify(first))
+        else:
+            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
+    out.write(" => true,\n")
+
+    out.write("        _ => false,\n")
+    out.write("    }\n")
+    out.write("}\n")
+
 def gen_stream_safe(leading, trailing, out):
     # This could be done as a hash but the table is very small.
     out.write("#[inline]\n")
@@ -602,6 +703,10 @@ def minimal_perfect_hash(d):
 
         gen_public_assigned(data.general_category_public_assigned, out)
 
+        gen_not_in_ccs(data.not_in_ccs, out)
+
+        gen_default_ignorable_mark(data.default_ignorable_marks, out)
+
         gen_nfc_qc(data.norm_props, out)
 
         gen_nfkc_qc(data.norm_props, out)

diff --git a/src/correct_ccs.rs b/src/correct_ccs.rs
@@ -0,0 +1,176 @@
+#[cfg(not(feature = "std"))]
+use alloc::collections::VecDeque;
+use core::iter::FusedIterator;
+#[cfg(feature = "std")]
+use std::collections::VecDeque;
+
+use crate::{lookups, tables};
+
+/// The role a character can play inside a combining character sequence (CCS).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum CcsKind {
+    /// A base character of a CCS: a graphic character that is not a combining mark.
+    Base,
+
+    /// A combining character that is not a `Default_Ignorable_Code_Point`.
+    NonIgnorableCombining,
+
+    /// ZWJ, ZWNJ, or a combining character that is default-ignorable.
+    IgnorableCombining,
+}
+
+impl CcsKind {
+    /// Classifies `c`, or returns `None` when `c` cannot take part in a CCS
+    /// (as reported by `tables::not_in_ccs`).
+    ///
+    /// The arms are tried top to bottom, so ZWNJ/ZWJ win over the table
+    /// lookups and the combining-mark test wins over `not_in_ccs`.
+    fn of(c: char) -> Option<Self> {
+        match c {
+            // ZWNJ | ZWJ
+            '\u{200C}' | '\u{200D}' => Some(CcsKind::IgnorableCombining),
+
+            _ if lookups::is_combining_mark(c) => {
+                let kind = if tables::is_default_ignorable_mark(c) {
+                    CcsKind::IgnorableCombining
+                } else {
+                    CcsKind::NonIgnorableCombining
+                };
+                Some(kind)
+            }
+
+            _ if tables::not_in_ccs(c) => None,
+
+            _ => Some(CcsKind::Base),
+        }
+    }
+}
+
+/// An iterator over the string that corrects
+/// [defective combining character sequences](https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#I6.1.36487)
+/// by inserting U+00A0 NO-BREAK SPACE in front of them.
+///
+/// For the purposes of this iterator, private use characters and unassigned codepoints
+/// are considered valid base characters,
+/// so combining character sequences that follow such will not be modified.
+///
+/// In addition, combining character sequences that consist entirely of `Default_Ignorable_Code_Point`s
+/// will not be modified. (Because of this, this iterator may buffer up to the entire length of its input;
+/// it is *not* "stream-safe" *even if* used with [`StreamSafe`][crate::StreamSafe]).
+#[derive(Clone, Debug)]
+pub struct CorrectDefectiveCcs<I> {
+    /// Whether the last character emitted was part of a CCS.
+    in_ccs: bool,
+    /// Items already pulled from `iter` but not yet emitted, oldest first.
+    /// Entries are stored exactly as `iter.next()` returned them, so a `None`
+    /// entry is the inner iterator's end-of-input result waiting to be replayed.
+    buffer: VecDeque<Option<char>>,
+    /// Whether the last character in `buffer` is part of a CCS.
+    /// (Updated only when `in_ccs` is set from false to true).
+    end_of_buffer_in_ccs: bool,
+    /// The wrapped source of characters.
+    iter: I,
+}
+
+impl<I: Iterator<Item = char>> Iterator for CorrectDefectiveCcs<I> {
+    type Item = char;
+
+    /// Yields the next character of the corrected stream.
+    ///
+    /// While `in_ccs` is set, queued items in `buffer` are replayed first and
+    /// then characters are forwarded straight from `iter`. Otherwise the next
+    /// character decides what happens: a base character or a character outside
+    /// any CCS is forwarded, a non-ignorable combining character is preceded by
+    /// U+00A0, and ignorable combining characters are buffered until a later
+    /// character shows whether a U+00A0 is needed in front of them.
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.in_ccs {
+            if let Some(c) = self.buffer.pop_front() {
+                // Drain the buffer: replay queued items before polling `iter` again.
+
+                if self.buffer.is_empty() {
+                    // That was the last queued item; `end_of_buffer_in_ccs`
+                    // recorded whether it belongs to a CCS.
+                    self.in_ccs = self.end_of_buffer_in_ccs;
+                }
+                // `c` is an `Option<char>`: a queued `None` is returned as-is.
+                c
+            } else {
+                // Forward from inner iterator
+
+                let c = self.iter.next();
+                // End of input or a character that cannot be in a CCS ends the sequence.
+                if c.map_or(true, tables::not_in_ccs) {
+                    self.in_ccs = false;
+                }
+                c
+            }
+        } else {
+            if self.buffer.is_empty() {
+                // We don't have a buffer of default ignorable combining characters built up
+
+                let c = self.iter.next()?;
+                match CcsKind::of(c) {
+                    // Character not in CCS, just forward it
+                    None => return Some(c),
+
+                    // Character starts non-defective CCS,
+                    // label ourselves as in CCS and forward it
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        return Some(c);
+                    }
+
+                    // Character starts defective CCS and is not default-ignorable.
+                    // Put it in the buffer to emit on next iteration,
+                    // mark ourselves as in CCS,
+                    // and emit NO-BREAK SPACE
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(Some(c));
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Character starts defective CCS and is default-ignorable.
+                    // Put it in the buffer, and fall through to loop below
+                    // to find out whether we emit a NO-BREAK SPACE first.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(Some(c));
+                    }
+                }
+            }
+
+            loop {
+                // We do have a buffer of default ignorable combining characters built up,
+                // and we need to figure out whether to emit a NO-BREAK SPACE first.
+
+                let c = self.iter.next();
+                match c.and_then(CcsKind::of) {
+                    // Inner iterator yielded character outside CCS (or `None`).
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    // `c` itself is queued behind the buffered characters so it
+                    // comes out after them; it is not part of a CCS.
+                    None => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = false;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded character that starts a new CCS.
+                    // Emit the built-up buffer with no leading NO-BREAK SPACE.
+                    // The base character is queued last, so the buffer ends inside a CCS.
+                    Some(CcsKind::Base) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        let ret = self.buffer.pop_front().unwrap();
+                        self.buffer.push_back(c);
+                        return ret;
+                    }
+
+                    // Inner iterator yielded non-ignorable combining character.
+                    // Emit the built-up buffer with leading NO-BREAK SPACE.
+                    // Nothing is popped here: the whole buffer, now ending with `c`,
+                    // is replayed by the `in_ccs` branch on the following calls.
+                    Some(CcsKind::NonIgnorableCombining) => {
+                        self.in_ccs = true;
+                        self.end_of_buffer_in_ccs = true;
+                        self.buffer.push_back(c);
+                        return Some('\u{00A0}'); // NO-BREAK SPACE
+                    }
+
+                    // Inner iterator yielded ignorable combining character.
+                    // Add it to the buffer, don't emit anything.
+                    Some(CcsKind::IgnorableCombining) => {
+                        self.buffer.push_back(c);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// The adapter keeps no "finished" flag of its own: once its buffer is drained,
+// every call to `next` polls the inner iterator again, so it is only marked
+// fused when the inner iterator is.
+impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for CorrectDefectiveCcs<I> {}
+
+impl<I> CorrectDefectiveCcs<I> {
+    /// Wraps `iter` in its initial state: outside any combining character
+    /// sequence, with nothing buffered.
+    pub(crate) fn new(iter: I) -> Self {
+        let buffer = VecDeque::new();
+        Self {
+            iter,
+            buffer,
+            in_ccs: false,
+            end_of_buffer_in_ccs: false,
+        }
+    }
+}