@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155
155
line = " " * indent + chunk
156
156
f .write (line )
157
157
158
- def load_properties (f , interestingprops ):
158
+ def load_properties (f , interestingprops : "list[str | tuple[str, str]] | None" = None ):
159
159
fetch (f )
160
160
props = {}
161
- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
162
- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
161
+ re1 = re .compile (r"^\s *([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
162
+ re2 = re .compile (r"^\s *([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
163
163
164
164
for line in fileinput .input (os .path .basename (f )):
165
165
prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168
168
m = re1 .match (line )
169
169
if m :
170
170
d_lo = m .group (1 )
171
- d_hi = m . group ( 1 )
171
+ d_hi = d_lo
172
172
prop = m .group (2 )
173
+ value = m .group (3 )
173
174
else :
174
175
m = re2 .match (line )
175
176
if m :
176
177
d_lo = m .group (1 )
177
178
d_hi = m .group (2 )
178
179
prop = m .group (3 )
180
+ value = m .group (4 )
179
181
else :
180
182
continue
181
- if interestingprops and prop not in interestingprops :
183
+ if value is not None :
184
+ prop = (prop , value )
185
+ if interestingprops is not None and prop not in interestingprops :
182
186
continue
183
187
d_lo = int (d_lo , 16 )
184
188
d_hi = int (d_hi , 16 )
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195
199
def escape_char(c):
    """Format an integer code point as a Rust ``char`` literal.

    The result includes the surrounding single quotes and uses a Unicode
    escape with lowercase, unpadded hex digits, e.g. ``'\\u{1f600}'``.
    """
    # Nothing may sit between the backslash and `u`; a stray space there
    # would make the emitted text an invalid Rust escape sequence.
    return "'\\u{%x}'" % c
197
201
198
- def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
202
+ def emit_table (f , name , t_data , t_type = "&[(char, char)]" , is_pub = True ,
199
203
pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])), is_const = True ):
200
204
pub_string = "const"
201
205
if not is_const :
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217
221
f .write ("""
218
222
pub mod util {
219
223
#[inline]
220
- pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224
+ pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221
225
use core::cmp::Ordering::{Equal, Less, Greater};
222
226
r.binary_search_by(|&(lo,hi)| {
223
227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252
256
253
257
""" )
254
258
255
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
    """Write a public Rust module containing one lookup function per property.

    f    -- writable text stream receiving the generated Rust source.
    mod  -- name of the Rust module to emit.
    tbl  -- mapping from property key to its range table; indexed with each
            entry of ``emit`` exactly as given (string or tuple).
    emit -- property keys to emit. A plain string is used as the Rust
            identifier directly; a ``(property, value)`` tuple becomes
            ``property_value``.

    For every key a non-public ``<ident>_table`` is written through
    ``emit_table`` followed by a ``pub fn <ident>(c: char) -> bool`` that
    searches it with ``super::util::bsearch_range_table``. Entries are
    emitted in order of their Rust identifier.
    """
    f.write("pub mod %s {\n" % mod)

    # Pair each key with the identifier used in the generated code.
    # isinstance() rather than an exact type check so tuple subclasses
    # are treated as (property, value) keys too.
    cats = [
        (f"{cat[0]}_{cat[1]}" if isinstance(cat, tuple) else cat, cat)
        for cat in emit
    ]
    cats.sort(key=lambda pair: pair[0])

    for cat_str, cat in cats:
        emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
        f.write("    #[inline]\n")
        f.write("    pub fn %s(c: char) -> bool {\n" % cat_str)
        f.write("        super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
        f.write("    }\n\n")
    f.write("}\n\n")
264
277
@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303
316
f .write ((" %sC_" % Name [0 ]) + cat + ",\n " )
304
317
f .write (""" }
305
318
306
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319
+ fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307
320
use core::cmp::Ordering::{Equal, Less, Greater};
308
321
match r.binary_search_by(|&(lo, hi, _)| {
309
322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355
368
else :
356
369
lookup_type = "u32"
357
370
358
- emit_table (f , "%s_cat_lookup" % name , lookup_table , "&'static [%s]" % lookup_type ,
371
+ emit_table (f , "%s_cat_lookup" % name , lookup_table , "&[%s]" % lookup_type ,
359
372
pfun = lambda x : "%d" % x ,
360
373
is_pub = False , is_const = True )
361
374
362
- emit_table (f , "%s_cat_table" % name , break_table , "&'static [(char, char, %sCat)]" % Name ,
375
+ emit_table (f , "%s_cat_table" % name , break_table , "&[(char, char, %sCat)]" % Name ,
363
376
pfun = lambda x : "(%s,%s,%sC_%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), Name [0 ], x [2 ]),
364
377
is_pub = False , is_const = True )
365
378
f .write ("}\n " )
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379
392
380
393
# Download and parse the source data files.
gencats = load_gencats("UnicodeData.txt")
derived_props_wanted = [
    "Alphabetic",
    ("InCB", "Consonant"),
    ("InCB", "Extend"),
    ("InCB", "Linker"),
]
derived = load_properties("DerivedCoreProperties.txt", derived_props_wanted)

emit_util_mod(rf)

# One generated Rust module per entry: (module name, parsed table, keys to emit).
property_modules = [
    ("general_category", gencats, ["N"]),
    ("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]),
]
for name, cat, pfuns in property_modules:
    emit_property_module(rf, name, cat, pfuns)
388
401
402
# Emit `is_incb_linker` as a single `matches!` over every InCB=Linker range.
# Each alternative is written with a leading `|`, which Rust accepts in
# front of the first pattern.
rf.write("""pub fn is_incb_linker(c: char) -> bool {
    matches!(c,""")

for (lo, hi) in derived[("InCB", "Linker")]:
    rf.write(f" | '\\u{{{lo:X}}}'")
    if lo != hi:
        # The upper bound must be `hi` (not `lo` again), and the range must
        # be written with `..=` so that `hi` itself is matched.
        rf.write(f"..='\\u{{{hi:X}}}'")

rf.write(")\n}\n\n")
411
+
389
412
### grapheme cluster module
390
413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391
- grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" , [])
392
-
414
+ grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" )
393
415
# Control
394
416
# Note:
395
417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398
420
# Drop the surrogate code points from the Control category.
grapheme_cats["Control"] = group_cat(list(
    set(ungroup_cat(grapheme_cats["Control"]))
    - set(ungroup_cat([surrogate_codepoints]))))

# Fold the extra categories into the same mapping so that they share the
# grapheme lookup table with the break-property categories.
grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]

# Flatten to (lo, hi, category) triples ordered by range start.
grapheme_table = []
for cat, ranges in grapheme_cats.items():
    grapheme_table.extend([(x, y, cat) for (x, y) in ranges])
grapheme_table.sort(key=lambda w: w[0])

# A single table can only hold one category per code point, so every range
# must start after the previous one ends.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # Raising a bare string is itself a TypeError in Python 3 and would
        # hide this message; raise a real exception instead.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]

emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
rf.write("\n")
415
437
416
- word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [] )
438
+ word_cats = load_properties ("auxiliary/WordBreakProperty.txt" )
417
439
word_table = []
418
440
for cat in word_cats :
419
441
word_table .extend ([(x , y , cat ) for (x , y ) in word_cats [cat ]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425
447
emoji_table = [(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]]
426
448
emit_break_module (rf , emoji_table , ["Extended_Pictographic" ], "emoji" )
427
449
428
- sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [] )
450
+ sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" )
429
451
sentence_table = []
430
452
for cat in sentence_cats :
431
453
sentence_table .extend ([(x , y , cat ) for (x , y ) in sentence_cats [cat ]])
0 commit comments