Skip to content

Commit 4c8eb0c

Browse files
committed
Remove private field access from generate ns
Use Unicode standard files for regex categories, blocks and scripts, and just hardcode POSIX, Java and binary regex properties.
1 parent d7577c1 commit 4c8eb0c

File tree

8 files changed

+2080
-285
lines changed

8 files changed

+2080
-285
lines changed

clj/bin/update-unicode

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
3+
# Unicode release number; it selects the /Public/<version>/ucd/ directory on unicode.org.
VERSION='13.0.0'
4+
5+
# Download PropertyValueAliases.txt for the selected Unicode release.
# --fail makes curl exit non-zero on an HTTP error (e.g. a 404 for a bad VERSION)
# instead of writing the server's error page into the data file.
curl --fail "https://www.unicode.org/Public/$VERSION/ucd/PropertyValueAliases.txt" -o resources/unicode/PropertyValueAliases.txt
6+
# Download Blocks.txt for the selected Unicode release.
# --fail makes curl exit non-zero on an HTTP error (e.g. a 404 for a bad VERSION)
# instead of writing the server's error page into the data file.
curl --fail "https://www.unicode.org/Public/$VERSION/ucd/Blocks.txt" -o resources/unicode/Blocks.txt

clj/project.clj

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
:license {:name "Vim License"
55
:url "http://vimdoc.sourceforge.net/htmldoc/uganda.html#license"
66
:comments ":help license"}
7-
:dependencies [[org.clojure/clojure "1.10.1"]
7+
:dependencies [[org.clojure/data.csv "1.0.0"]
8+
[org.clojure/clojure "1.10.1"]
89
[frak "0.1.9"]])

clj/resources/unicode/Blocks.txt

+344
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
# Blocks-13.0.0.txt
2+
# Date: 2019-07-10, 19:06:00 GMT [KW]
3+
# © 2019 Unicode®, Inc.
4+
# For terms of use, see http://www.unicode.org/terms_of_use.html
5+
#
6+
# Unicode Character Database
7+
# For documentation, see http://www.unicode.org/reports/tr44/
8+
#
9+
# Format:
10+
# Start Code..End Code; Block Name
11+
12+
# ================================================
13+
14+
# Note: When comparing block names, casing, whitespace, hyphens,
15+
# and underbars are ignored.
16+
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
17+
# For more information on the comparison of property values,
18+
# see UAX #44: http://www.unicode.org/reports/tr44/
19+
#
20+
# All block ranges start with a value where (cp MOD 16) = 0,
21+
# and end with a value where (cp MOD 16) = 15. In other words,
22+
# the last hexadecimal digit of the start of range is ...0
23+
# and the last hexadecimal digit of the end of range is ...F.
24+
# This constraint on block ranges guarantees that allocations
25+
# are done in terms of whole columns, and that code chart display
26+
# never involves splitting columns in the charts.
27+
#
28+
# All code points not explicitly listed for Block
29+
# have the value No_Block.
30+
31+
# Property: Block
32+
#
33+
# @missing: 0000..10FFFF; No_Block
34+
35+
0000..007F; Basic Latin
36+
0080..00FF; Latin-1 Supplement
37+
0100..017F; Latin Extended-A
38+
0180..024F; Latin Extended-B
39+
0250..02AF; IPA Extensions
40+
02B0..02FF; Spacing Modifier Letters
41+
0300..036F; Combining Diacritical Marks
42+
0370..03FF; Greek and Coptic
43+
0400..04FF; Cyrillic
44+
0500..052F; Cyrillic Supplement
45+
0530..058F; Armenian
46+
0590..05FF; Hebrew
47+
0600..06FF; Arabic
48+
0700..074F; Syriac
49+
0750..077F; Arabic Supplement
50+
0780..07BF; Thaana
51+
07C0..07FF; NKo
52+
0800..083F; Samaritan
53+
0840..085F; Mandaic
54+
0860..086F; Syriac Supplement
55+
08A0..08FF; Arabic Extended-A
56+
0900..097F; Devanagari
57+
0980..09FF; Bengali
58+
0A00..0A7F; Gurmukhi
59+
0A80..0AFF; Gujarati
60+
0B00..0B7F; Oriya
61+
0B80..0BFF; Tamil
62+
0C00..0C7F; Telugu
63+
0C80..0CFF; Kannada
64+
0D00..0D7F; Malayalam
65+
0D80..0DFF; Sinhala
66+
0E00..0E7F; Thai
67+
0E80..0EFF; Lao
68+
0F00..0FFF; Tibetan
69+
1000..109F; Myanmar
70+
10A0..10FF; Georgian
71+
1100..11FF; Hangul Jamo
72+
1200..137F; Ethiopic
73+
1380..139F; Ethiopic Supplement
74+
13A0..13FF; Cherokee
75+
1400..167F; Unified Canadian Aboriginal Syllabics
76+
1680..169F; Ogham
77+
16A0..16FF; Runic
78+
1700..171F; Tagalog
79+
1720..173F; Hanunoo
80+
1740..175F; Buhid
81+
1760..177F; Tagbanwa
82+
1780..17FF; Khmer
83+
1800..18AF; Mongolian
84+
18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
85+
1900..194F; Limbu
86+
1950..197F; Tai Le
87+
1980..19DF; New Tai Lue
88+
19E0..19FF; Khmer Symbols
89+
1A00..1A1F; Buginese
90+
1A20..1AAF; Tai Tham
91+
1AB0..1AFF; Combining Diacritical Marks Extended
92+
1B00..1B7F; Balinese
93+
1B80..1BBF; Sundanese
94+
1BC0..1BFF; Batak
95+
1C00..1C4F; Lepcha
96+
1C50..1C7F; Ol Chiki
97+
1C80..1C8F; Cyrillic Extended-C
98+
1C90..1CBF; Georgian Extended
99+
1CC0..1CCF; Sundanese Supplement
100+
1CD0..1CFF; Vedic Extensions
101+
1D00..1D7F; Phonetic Extensions
102+
1D80..1DBF; Phonetic Extensions Supplement
103+
1DC0..1DFF; Combining Diacritical Marks Supplement
104+
1E00..1EFF; Latin Extended Additional
105+
1F00..1FFF; Greek Extended
106+
2000..206F; General Punctuation
107+
2070..209F; Superscripts and Subscripts
108+
20A0..20CF; Currency Symbols
109+
20D0..20FF; Combining Diacritical Marks for Symbols
110+
2100..214F; Letterlike Symbols
111+
2150..218F; Number Forms
112+
2190..21FF; Arrows
113+
2200..22FF; Mathematical Operators
114+
2300..23FF; Miscellaneous Technical
115+
2400..243F; Control Pictures
116+
2440..245F; Optical Character Recognition
117+
2460..24FF; Enclosed Alphanumerics
118+
2500..257F; Box Drawing
119+
2580..259F; Block Elements
120+
25A0..25FF; Geometric Shapes
121+
2600..26FF; Miscellaneous Symbols
122+
2700..27BF; Dingbats
123+
27C0..27EF; Miscellaneous Mathematical Symbols-A
124+
27F0..27FF; Supplemental Arrows-A
125+
2800..28FF; Braille Patterns
126+
2900..297F; Supplemental Arrows-B
127+
2980..29FF; Miscellaneous Mathematical Symbols-B
128+
2A00..2AFF; Supplemental Mathematical Operators
129+
2B00..2BFF; Miscellaneous Symbols and Arrows
130+
2C00..2C5F; Glagolitic
131+
2C60..2C7F; Latin Extended-C
132+
2C80..2CFF; Coptic
133+
2D00..2D2F; Georgian Supplement
134+
2D30..2D7F; Tifinagh
135+
2D80..2DDF; Ethiopic Extended
136+
2DE0..2DFF; Cyrillic Extended-A
137+
2E00..2E7F; Supplemental Punctuation
138+
2E80..2EFF; CJK Radicals Supplement
139+
2F00..2FDF; Kangxi Radicals
140+
2FF0..2FFF; Ideographic Description Characters
141+
3000..303F; CJK Symbols and Punctuation
142+
3040..309F; Hiragana
143+
30A0..30FF; Katakana
144+
3100..312F; Bopomofo
145+
3130..318F; Hangul Compatibility Jamo
146+
3190..319F; Kanbun
147+
31A0..31BF; Bopomofo Extended
148+
31C0..31EF; CJK Strokes
149+
31F0..31FF; Katakana Phonetic Extensions
150+
3200..32FF; Enclosed CJK Letters and Months
151+
3300..33FF; CJK Compatibility
152+
3400..4DBF; CJK Unified Ideographs Extension A
153+
4DC0..4DFF; Yijing Hexagram Symbols
154+
4E00..9FFF; CJK Unified Ideographs
155+
A000..A48F; Yi Syllables
156+
A490..A4CF; Yi Radicals
157+
A4D0..A4FF; Lisu
158+
A500..A63F; Vai
159+
A640..A69F; Cyrillic Extended-B
160+
A6A0..A6FF; Bamum
161+
A700..A71F; Modifier Tone Letters
162+
A720..A7FF; Latin Extended-D
163+
A800..A82F; Syloti Nagri
164+
A830..A83F; Common Indic Number Forms
165+
A840..A87F; Phags-pa
166+
A880..A8DF; Saurashtra
167+
A8E0..A8FF; Devanagari Extended
168+
A900..A92F; Kayah Li
169+
A930..A95F; Rejang
170+
A960..A97F; Hangul Jamo Extended-A
171+
A980..A9DF; Javanese
172+
A9E0..A9FF; Myanmar Extended-B
173+
AA00..AA5F; Cham
174+
AA60..AA7F; Myanmar Extended-A
175+
AA80..AADF; Tai Viet
176+
AAE0..AAFF; Meetei Mayek Extensions
177+
AB00..AB2F; Ethiopic Extended-A
178+
AB30..AB6F; Latin Extended-E
179+
AB70..ABBF; Cherokee Supplement
180+
ABC0..ABFF; Meetei Mayek
181+
AC00..D7AF; Hangul Syllables
182+
D7B0..D7FF; Hangul Jamo Extended-B
183+
D800..DB7F; High Surrogates
184+
DB80..DBFF; High Private Use Surrogates
185+
DC00..DFFF; Low Surrogates
186+
E000..F8FF; Private Use Area
187+
F900..FAFF; CJK Compatibility Ideographs
188+
FB00..FB4F; Alphabetic Presentation Forms
189+
FB50..FDFF; Arabic Presentation Forms-A
190+
FE00..FE0F; Variation Selectors
191+
FE10..FE1F; Vertical Forms
192+
FE20..FE2F; Combining Half Marks
193+
FE30..FE4F; CJK Compatibility Forms
194+
FE50..FE6F; Small Form Variants
195+
FE70..FEFF; Arabic Presentation Forms-B
196+
FF00..FFEF; Halfwidth and Fullwidth Forms
197+
FFF0..FFFF; Specials
198+
10000..1007F; Linear B Syllabary
199+
10080..100FF; Linear B Ideograms
200+
10100..1013F; Aegean Numbers
201+
10140..1018F; Ancient Greek Numbers
202+
10190..101CF; Ancient Symbols
203+
101D0..101FF; Phaistos Disc
204+
10280..1029F; Lycian
205+
102A0..102DF; Carian
206+
102E0..102FF; Coptic Epact Numbers
207+
10300..1032F; Old Italic
208+
10330..1034F; Gothic
209+
10350..1037F; Old Permic
210+
10380..1039F; Ugaritic
211+
103A0..103DF; Old Persian
212+
10400..1044F; Deseret
213+
10450..1047F; Shavian
214+
10480..104AF; Osmanya
215+
104B0..104FF; Osage
216+
10500..1052F; Elbasan
217+
10530..1056F; Caucasian Albanian
218+
10600..1077F; Linear A
219+
10800..1083F; Cypriot Syllabary
220+
10840..1085F; Imperial Aramaic
221+
10860..1087F; Palmyrene
222+
10880..108AF; Nabataean
223+
108E0..108FF; Hatran
224+
10900..1091F; Phoenician
225+
10920..1093F; Lydian
226+
10980..1099F; Meroitic Hieroglyphs
227+
109A0..109FF; Meroitic Cursive
228+
10A00..10A5F; Kharoshthi
229+
10A60..10A7F; Old South Arabian
230+
10A80..10A9F; Old North Arabian
231+
10AC0..10AFF; Manichaean
232+
10B00..10B3F; Avestan
233+
10B40..10B5F; Inscriptional Parthian
234+
10B60..10B7F; Inscriptional Pahlavi
235+
10B80..10BAF; Psalter Pahlavi
236+
10C00..10C4F; Old Turkic
237+
10C80..10CFF; Old Hungarian
238+
10D00..10D3F; Hanifi Rohingya
239+
10E60..10E7F; Rumi Numeral Symbols
240+
10E80..10EBF; Yezidi
241+
10F00..10F2F; Old Sogdian
242+
10F30..10F6F; Sogdian
243+
10FB0..10FDF; Chorasmian
244+
10FE0..10FFF; Elymaic
245+
11000..1107F; Brahmi
246+
11080..110CF; Kaithi
247+
110D0..110FF; Sora Sompeng
248+
11100..1114F; Chakma
249+
11150..1117F; Mahajani
250+
11180..111DF; Sharada
251+
111E0..111FF; Sinhala Archaic Numbers
252+
11200..1124F; Khojki
253+
11280..112AF; Multani
254+
112B0..112FF; Khudawadi
255+
11300..1137F; Grantha
256+
11400..1147F; Newa
257+
11480..114DF; Tirhuta
258+
11580..115FF; Siddham
259+
11600..1165F; Modi
260+
11660..1167F; Mongolian Supplement
261+
11680..116CF; Takri
262+
11700..1173F; Ahom
263+
11800..1184F; Dogra
264+
118A0..118FF; Warang Citi
265+
11900..1195F; Dives Akuru
266+
119A0..119FF; Nandinagari
267+
11A00..11A4F; Zanabazar Square
268+
11A50..11AAF; Soyombo
269+
11AC0..11AFF; Pau Cin Hau
270+
11C00..11C6F; Bhaiksuki
271+
11C70..11CBF; Marchen
272+
11D00..11D5F; Masaram Gondi
273+
11D60..11DAF; Gunjala Gondi
274+
11EE0..11EFF; Makasar
275+
11FB0..11FBF; Lisu Supplement
276+
11FC0..11FFF; Tamil Supplement
277+
12000..123FF; Cuneiform
278+
12400..1247F; Cuneiform Numbers and Punctuation
279+
12480..1254F; Early Dynastic Cuneiform
280+
13000..1342F; Egyptian Hieroglyphs
281+
13430..1343F; Egyptian Hieroglyph Format Controls
282+
14400..1467F; Anatolian Hieroglyphs
283+
16800..16A3F; Bamum Supplement
284+
16A40..16A6F; Mro
285+
16AD0..16AFF; Bassa Vah
286+
16B00..16B8F; Pahawh Hmong
287+
16E40..16E9F; Medefaidrin
288+
16F00..16F9F; Miao
289+
16FE0..16FFF; Ideographic Symbols and Punctuation
290+
17000..187FF; Tangut
291+
18800..18AFF; Tangut Components
292+
18B00..18CFF; Khitan Small Script
293+
18D00..18D8F; Tangut Supplement
294+
1B000..1B0FF; Kana Supplement
295+
1B100..1B12F; Kana Extended-A
296+
1B130..1B16F; Small Kana Extension
297+
1B170..1B2FF; Nushu
298+
1BC00..1BC9F; Duployan
299+
1BCA0..1BCAF; Shorthand Format Controls
300+
1D000..1D0FF; Byzantine Musical Symbols
301+
1D100..1D1FF; Musical Symbols
302+
1D200..1D24F; Ancient Greek Musical Notation
303+
1D2E0..1D2FF; Mayan Numerals
304+
1D300..1D35F; Tai Xuan Jing Symbols
305+
1D360..1D37F; Counting Rod Numerals
306+
1D400..1D7FF; Mathematical Alphanumeric Symbols
307+
1D800..1DAAF; Sutton SignWriting
308+
1E000..1E02F; Glagolitic Supplement
309+
1E100..1E14F; Nyiakeng Puachue Hmong
310+
1E2C0..1E2FF; Wancho
311+
1E800..1E8DF; Mende Kikakui
312+
1E900..1E95F; Adlam
313+
1EC70..1ECBF; Indic Siyaq Numbers
314+
1ED00..1ED4F; Ottoman Siyaq Numbers
315+
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
316+
1F000..1F02F; Mahjong Tiles
317+
1F030..1F09F; Domino Tiles
318+
1F0A0..1F0FF; Playing Cards
319+
1F100..1F1FF; Enclosed Alphanumeric Supplement
320+
1F200..1F2FF; Enclosed Ideographic Supplement
321+
1F300..1F5FF; Miscellaneous Symbols and Pictographs
322+
1F600..1F64F; Emoticons
323+
1F650..1F67F; Ornamental Dingbats
324+
1F680..1F6FF; Transport and Map Symbols
325+
1F700..1F77F; Alchemical Symbols
326+
1F780..1F7FF; Geometric Shapes Extended
327+
1F800..1F8FF; Supplemental Arrows-C
328+
1F900..1F9FF; Supplemental Symbols and Pictographs
329+
1FA00..1FA6F; Chess Symbols
330+
1FA70..1FAFF; Symbols and Pictographs Extended-A
331+
1FB00..1FBFF; Symbols for Legacy Computing
332+
20000..2A6DF; CJK Unified Ideographs Extension B
333+
2A700..2B73F; CJK Unified Ideographs Extension C
334+
2B740..2B81F; CJK Unified Ideographs Extension D
335+
2B820..2CEAF; CJK Unified Ideographs Extension E
336+
2CEB0..2EBEF; CJK Unified Ideographs Extension F
337+
2F800..2FA1F; CJK Compatibility Ideographs Supplement
338+
30000..3134F; CJK Unified Ideographs Extension G
339+
E0000..E007F; Tags
340+
E0100..E01EF; Variation Selectors Supplement
341+
F0000..FFFFF; Supplementary Private Use Area-A
342+
100000..10FFFF; Supplementary Private Use Area-B
343+
344+
# EOF

0 commit comments

Comments
 (0)