19
19
# Since this should not require frequent updates, we just store this
20
20
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
21
21
import collections
22
+ import re
22
23
import urllib .request
23
24
from itertools import batched
24
25
67
68
class UnicodeData (object ):
68
69
def __init__ (self ):
69
70
self ._load_unicode_data ()
71
+ self ._load_default_ignorable_marks ()
72
+
70
73
self .norm_props = self ._load_norm_props ()
71
74
self .norm_tests = self ._load_norm_tests ()
72
75
@@ -101,6 +104,11 @@ def _load_unicode_data(self):
101
104
self .general_category_mark = []
102
105
self .general_category_public_assigned = []
103
106
107
+ # Characters that cannot be part of a combining character sequence:
108
+ # control characters, format characters other than ZWJ and ZWNJ,
109
+ # the line and paragraph separators, and noncharacters.
110
+ self .not_in_ccs = []
111
+
104
112
assigned_start = 0 ;
105
113
prev_char_int = - 1 ;
106
114
prev_name = "" ;
@@ -126,6 +134,9 @@ def _load_unicode_data(self):
126
134
if category == 'M' or 'M' in expanded_categories .get (category , []):
127
135
self .general_category_mark .append (char_int )
128
136
137
+ if category in ['Cc' , 'Cf' , 'Zl' , 'Zp' ] and char_int not in [0x200C , 0x200D ]:
138
+ self .not_in_ccs .append (char_int )
139
+
129
140
assert category != 'Cn' , "Unexpected: Unassigned codepoint in UnicodeData.txt"
130
141
if category not in ['Co' , 'Cs' ]:
131
142
if char_int != prev_char_int + 1 and not is_first_and_last (prev_name , name ):
@@ -136,6 +147,44 @@ def _load_unicode_data(self):
136
147
137
148
self .general_category_public_assigned .append ((assigned_start , prev_char_int ))
138
149
150
+ # Noncharacters cannot be part of a combining character sequence either
151
+ for i in range (0xFDD0 , 0xFDF0 ):
152
+ self .not_in_ccs .append (i )
153
+ for prefix in range (0 , 0x11 ):
154
+ shifted = prefix << 16
155
+ self .not_in_ccs .append (shifted | 0xFFFE )
156
+ self .not_in_ccs .append (shifted | 0xFFFF )
157
+
158
+ self .not_in_ccs .sort ()
159
+
160
+ def _load_default_ignorable_marks (self ):
161
+ default_ignorable_cps = set ()
162
+
163
+ single = re .compile (r"^([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+" )
164
+ multiple = re .compile (
165
+ r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+Default_Ignorable_Code_Point\s+"
166
+ )
167
+
168
+ for line in self ._fetch ("DerivedCoreProperties.txt" ).splitlines ():
169
+ raw_data = None # (low, high)
170
+ if match := single .match (line ):
171
+ raw_data = (match .group (1 ), match .group (1 ))
172
+ elif match := multiple .match (line ):
173
+ raw_data = (match .group (1 ), match .group (2 ))
174
+ else :
175
+ continue
176
+ low = int (raw_data [0 ], 16 )
177
+ high = int (raw_data [1 ], 16 )
178
+ for cp in range (low , high + 1 ):
179
+ default_ignorable_cps .add (cp )
180
+
181
+ self .default_ignorable_marks = []
182
+ for cp in self .general_category_mark :
183
+ if cp in default_ignorable_cps :
184
+ self .default_ignorable_marks .append (cp )
185
+
186
+ self .default_ignorable_marks .sort ()
187
+
139
188
def _load_cjk_compat_ideograph_variants (self ):
140
189
for line in self ._fetch ("StandardizedVariants.txt" ).splitlines ():
141
190
strip_comments = line .split ('#' , 1 )[0 ].strip ()
@@ -461,7 +510,7 @@ def gen_combining_mark(general_category_mark, out):
461
510
462
511
def gen_public_assigned (general_category_public_assigned , out ):
463
512
# This could be done as a hash but the table is somewhat small.
464
- out .write ("#[inline]\n " )
513
+ out .write ("\n #[inline]\n " )
465
514
out .write ("pub fn is_public_assigned(c: char) -> bool {\n " )
466
515
out .write (" match c {\n " )
467
516
@@ -482,6 +531,66 @@ def gen_public_assigned(general_category_public_assigned, out):
482
531
out .write (" }\n " )
483
532
out .write ("}\n " )
484
533
534
def gen_not_in_ccs(not_in_ccs, out):
    """Write the Rust ``not_in_ccs(c: char) -> bool`` predicate to ``out``.

    ``not_in_ccs`` is a list of integer code points. It is expected to be
    sorted ascending: a value is merged into the previous range only when it
    is exactly one greater than that range's end. ``out`` is any object with
    a ``write(str)`` method.

    The generated function is a single ``match`` whose first arm lists every
    range and yields ``true``; everything else yields ``false``.
    """
    # Collapse runs of consecutive code points into inclusive (first, last) pairs.
    ranges = []
    for cp in not_in_ccs:
        if ranges and ranges[-1][1] == cp - 1:
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))

    # Render each pair as a Rust char literal or inclusive char range pattern.
    patterns = [
        "'\\u{%s}'" % hexify(first)
        if first == last
        else "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
        for first, last in ranges
    ]

    out.write("\n#[inline]\n")
    out.write("pub fn not_in_ccs(c: char) -> bool {\n")
    out.write("    match c {\n")
    # With no patterns a bare " => true," arm would be invalid Rust, so the
    # arm is only emitted when there is at least one range.
    if patterns:
        out.write("        " + "\n        | ".join(patterns) + " => true,\n")
    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
563
+
564
def gen_default_ignorable_mark(default_ignorable_marks, out):
    """Write the Rust ``is_default_ignorable_mark(c: char) -> bool`` predicate.

    ``default_ignorable_marks`` is a list of integer code points. It is
    expected to be sorted ascending: a value is merged into the previous
    range only when it is exactly one greater than that range's end. ``out``
    is any object with a ``write(str)`` method.

    The generated function is a single ``match`` whose first arm lists every
    range and yields ``true``; everything else yields ``false``.
    """
    # Collapse runs of consecutive code points into inclusive (first, last) pairs.
    ranges = []
    for cp in default_ignorable_marks:
        if ranges and ranges[-1][1] == cp - 1:
            ranges[-1] = (ranges[-1][0], cp)
        else:
            ranges.append((cp, cp))

    # Render each pair as a Rust char literal or inclusive char range pattern.
    patterns = [
        "'\\u{%s}'" % hexify(first)
        if first == last
        else "'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last))
        for first, last in ranges
    ]

    out.write("\n#[inline]\n")
    out.write("pub fn is_default_ignorable_mark(c: char) -> bool {\n")
    out.write("    match c {\n")
    # With no patterns a bare " => true," arm would be invalid Rust, so the
    # arm is only emitted when there is at least one range.
    if patterns:
        out.write("        " + "\n        | ".join(patterns) + " => true,\n")
    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
593
+
485
594
def gen_stream_safe (leading , trailing , out ):
486
595
# This could be done as a hash but the table is very small.
487
596
out .write ("#[inline]\n " )
@@ -602,6 +711,10 @@ def minimal_perfect_hash(d):
602
711
603
712
gen_public_assigned (data .general_category_public_assigned , out )
604
713
714
+ gen_not_in_ccs (data .not_in_ccs , out )
715
+
716
+ gen_default_ignorable_mark (data .default_ignorable_marks , out )
717
+
605
718
gen_nfc_qc (data .norm_props , out )
606
719
607
720
gen_nfkc_qc (data .norm_props , out )
0 commit comments