Skip to content

Commit 592ce00

Browse files
authored
Merge pull request #134 from Jules-Bertholet/fix
Fix #125
2 parents 3ff9de6 + dce3a34 commit 592ce00

File tree

11 files changed

+1271
-2069
lines changed

11 files changed

+1271
-2069
lines changed

.github/workflows/rust.yml

+9-6
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,31 @@ on:
77
branches: [ master ]
88

99
env:
10+
CARGO_INCREMENTAL: 0
1011
CARGO_TERM_COLOR: always
12+
RUST_BACKTRACE: 1
13+
RUSTFLAGS: -D warnings
14+
RUSTDOCFLAGS: -D warnings
1115

1216
jobs:
1317
build:
14-
1518
runs-on: ubuntu-latest
16-
1719
steps:
1820
- uses: actions/checkout@v2
1921
- name: Build
2022
run: cargo build --verbose
2123
- name: Run tests
2224
run: cargo test --verbose
23-
fmt:
25+
- name: Run clippy
26+
run: cargo clippy --all-targets --all --verbose
2427

28+
fmt:
2529
runs-on: ubuntu-latest
26-
2730
steps:
2831
- uses: actions/checkout@v2
2932
- name: Rustfmt
30-
run: cargo fmt --check
33+
run: cargo fmt --all --check
3134
- name: Verify regenerated files
3235
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
3336
- name: Verify regenerated tests
34-
run: ./scripts/unicode_gen_breaktests.py && rustfmt testdata.rs && diff testdata.rs src/testdata.rs
37+
run: ./scripts/unicode_gen_breaktests.py && diff testdata.rs tests/testdata/mod.rs

benches/chars.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
//! is how much slower full unicode handling is.
77
88
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
9-
use unicode_segmentation;
109

1110
use std::fs;
1211
use unicode_segmentation::UnicodeSegmentation;
@@ -24,14 +23,14 @@ const FILES: &[&str] = &[
2423

2524
#[inline(always)]
2625
fn grapheme(text: &str) {
27-
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
26+
for c in UnicodeSegmentation::graphemes(black_box(text), true) {
2827
black_box(c);
2928
}
3029
}
3130

3231
#[inline(always)]
3332
fn scalar(text: &str) {
34-
for c in black_box(&*text).chars() {
33+
for c in black_box(text).chars() {
3534
black_box(c);
3635
}
3736
}

scripts/unicode.py

+48-26
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155
line = " "*indent + chunk
156156
f.write(line)
157157

158-
def load_properties(f, interestingprops):
158+
def load_properties(f, interestingprops: "list[str | tuple[str, str]] | None" = None):
159159
fetch(f)
160160
props = {}
161-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
162-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
161+
re1 = re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
162+
re2 = re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
163163

164164
for line in fileinput.input(os.path.basename(f)):
165165
prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168
m = re1.match(line)
169169
if m:
170170
d_lo = m.group(1)
171-
d_hi = m.group(1)
171+
d_hi = d_lo
172172
prop = m.group(2)
173+
value = m.group(3)
173174
else:
174175
m = re2.match(line)
175176
if m:
176177
d_lo = m.group(1)
177178
d_hi = m.group(2)
178179
prop = m.group(3)
180+
value = m.group(4)
179181
else:
180182
continue
181-
if interestingprops and prop not in interestingprops:
183+
if value is not None:
184+
prop = (prop, value)
185+
if interestingprops is not None and prop not in interestingprops:
182186
continue
183187
d_lo = int(d_lo, 16)
184188
d_hi = int(d_hi, 16)
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195199
def escape_char(c):
196200
return "'\\u{%x}'" % c
197201

198-
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
202+
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
199203
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
200204
pub_string = "const"
201205
if not is_const:
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221
f.write("""
218222
pub mod util {
219223
#[inline]
220-
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224+
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225
use core::cmp::Ordering::{Equal, Less, Greater};
222226
r.binary_search_by(|&(lo,hi)| {
223227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257
""")
254258

255-
def emit_property_module(f, mod, tbl, emit):
256-
f.write("mod %s {\n" % mod)
257-
for cat in sorted(emit):
258-
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
259+
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
260+
f.write("pub mod %s {\n" % mod)
261+
262+
cats = []
263+
for cat in emit:
264+
if type(cat) is tuple:
265+
cats.append((f"{cat[0]}_{cat[1]}", cat))
266+
else:
267+
cats.append((cat, cat))
268+
cats.sort(key=lambda x: x[0])
269+
270+
for cat_str, cat in cats:
271+
emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
259272
f.write(" #[inline]\n")
260-
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
261-
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
273+
f.write(" pub fn %s(c: char) -> bool {\n" % cat_str)
274+
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
262275
f.write(" }\n\n")
263276
f.write("}\n\n")
264277

@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316
f.write((" %sC_" % Name[0]) + cat + ",\n")
304317
f.write(""" }
305318
306-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320
use core::cmp::Ordering::{Equal, Less, Greater};
308321
match r.binary_search_by(|&(lo, hi, _)| {
309322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368
else:
356369
lookup_type = "u32"
357370

358-
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
371+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&[%s]" % lookup_type,
359372
pfun=lambda x: "%d" % x,
360373
is_pub=False, is_const=True)
361374

362-
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
375+
emit_table(f, "%s_cat_table" % name, break_table, "&[(char, char, %sCat)]" % Name,
363376
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
364377
is_pub=False, is_const=True)
365378
f.write("}\n")
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392

380393
# download and parse all the data
381394
gencats = load_gencats("UnicodeData.txt")
382-
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
395+
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
383396

384397
emit_util_mod(rf)
385398
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
386-
("derived_property", derived, ["Alphabetic"]):
399+
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
387400
emit_property_module(rf, name, cat, pfuns)
388401

402+
rf.write("""pub fn is_incb_linker(c: char) -> bool {
403+
matches!(c,""")
404+
405+
for (lo, hi) in derived[("InCB", "Linker")]:
406+
rf.write(f" | '\\u{{{lo:X}}}'")
407+
if lo != hi:
408+
rf.write(f"..='\\u{{{hi:X}}}'")  # close the range with its inclusive upper bound `hi` (was `lo`, with a half-open `..`)
409+
410+
rf.write(")\n}\n\n")
411+
389412
### grapheme cluster module
390413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391-
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
392-
414+
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt")
393415
# Control
394416
# Note:
395417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420
grapheme_cats["Control"] = group_cat(list(
399421
set(ungroup_cat(grapheme_cats["Control"]))
400422
- set(ungroup_cat([surrogate_codepoints]))))
401-
423+
grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
424+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
425+
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]
402426
grapheme_table = []
403427
for cat in grapheme_cats:
404428
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
405-
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
406-
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
407429
grapheme_table.sort(key=lambda w: w[0])
408430
last = -1
409431
for chars in grapheme_table:
410432
if chars[0] <= last:
411433
raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")  # Python 3 cannot raise a bare str
412434
last = chars[1]
413-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
435+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
414436
rf.write("\n")
415437

416-
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
438+
word_cats = load_properties("auxiliary/WordBreakProperty.txt")
417439
word_table = []
418440
for cat in word_cats:
419441
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
426448
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
427449

428-
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
450+
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt")
429451
sentence_table = []
430452
for cat in sentence_cats:
431453
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])

scripts/unicode_gen_breaktests.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ def showfun(x):
140140
return outstr
141141

142142
def create_grapheme_data(f):
143-
# rules 9.1 and 9.2 are for extended graphemes only
144-
optsplits = ['9.1','9.2']
143+
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
144+
optsplits = ['9.1', '9.2', '9.3']
145145
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
146146

147147
test_same = []
@@ -169,8 +169,8 @@ def create_grapheme_data(f):
169169
else:
170170
test_diff.append((allchars, extgraphs, c))
171171

172-
stype = "&'static [(&'static str, &'static [&'static str])]"
173-
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
172+
stype = "&[(&str, &[&str])]"
173+
dtype = "&[(&str, &[&str], &[&str])]"
174174
f.write(" // official Unicode test data\n")
175175
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
@@ -185,7 +185,7 @@ def create_words_data(f):
185185
allchars = [cn for s in c for cn in s]
186186
test.append((allchars, c))
187187

188-
wtype = "&'static [(&'static str, &'static [&'static str])]"
188+
wtype = "&[(&str, &[&str])]"
189189
f.write(" // official Unicode test data\n")
190190
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
@@ -199,7 +199,7 @@ def create_sentence_data(f):
199199
allchars = [cn for s in c for cn in s]
200200
test.append((allchars, c))
201201

202-
wtype = "&'static [(&'static str, &'static [&'static str])]"
202+
wtype = "&[(&str, &[&str])]"
203203
f.write(" // official Unicode test data\n")
204204
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

0 commit comments

Comments
 (0)