Skip to content

Commit ff77d5e

Browse files
committedDec 30, 2020
Remove private field access from generate ns
Use Unicode standard files for regex categories, blocks and scripts, and just hardcode POSIX, Java and binary regex properties.
1 parent d7577c1 commit ff77d5e

File tree

8 files changed

+2074
-283
lines changed

8 files changed

+2074
-283
lines changed
 

‎clj/bin/update-unicode

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
#
# Refresh the vendored Unicode Character Database files.
#
# The output paths are relative to the current working directory, so this is
# meant to be run from the clj/ directory (the parent of resources/unicode).

# Stop at the first failed command, unset variable or failed pipeline stage,
# so a broken download is not silently followed by another one.
set -euo pipefail

VERSION=13.0.0

# --fail makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as if it were Unicode data.
curl --fail "https://www.unicode.org/Public/$VERSION/ucd/PropertyValueAliases.txt" -o resources/unicode/PropertyValueAliases.txt
curl --fail "https://www.unicode.org/Public/$VERSION/ucd/Blocks.txt" -o resources/unicode/Blocks.txt

‎clj/project.clj

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
:license {:name "Vim License"
55
:url "http://vimdoc.sourceforge.net/htmldoc/uganda.html#license"
66
:comments ":help license"}
7-
:dependencies [[org.clojure/clojure "1.10.1"]
7+
:dependencies [[org.clojure/data.csv "1.0.0"]
8+
[org.clojure/clojure "1.10.1"]
89
[frak "0.1.9"]])

‎clj/resources/unicode/Blocks.txt

+344
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
# Blocks-13.0.0.txt
2+
# Date: 2019-07-10, 19:06:00 GMT [KW]
3+
# © 2019 Unicode®, Inc.
4+
# For terms of use, see http://www.unicode.org/terms_of_use.html
5+
#
6+
# Unicode Character Database
7+
# For documentation, see http://www.unicode.org/reports/tr44/
8+
#
9+
# Format:
10+
# Start Code..End Code; Block Name
11+
12+
# ================================================
13+
14+
# Note: When comparing block names, casing, whitespace, hyphens,
15+
# and underbars are ignored.
16+
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
17+
# For more information on the comparison of property values,
18+
# see UAX #44: http://www.unicode.org/reports/tr44/
19+
#
20+
# All block ranges start with a value where (cp MOD 16) = 0,
21+
# and end with a value where (cp MOD 16) = 15. In other words,
22+
# the last hexadecimal digit of the start of range is ...0
23+
# and the last hexadecimal digit of the end of range is ...F.
24+
# This constraint on block ranges guarantees that allocations
25+
# are done in terms of whole columns, and that code chart display
26+
# never involves splitting columns in the charts.
27+
#
28+
# All code points not explicitly listed for Block
29+
# have the value No_Block.
30+
31+
# Property: Block
32+
#
33+
# @missing: 0000..10FFFF; No_Block
34+
35+
0000..007F; Basic Latin
36+
0080..00FF; Latin-1 Supplement
37+
0100..017F; Latin Extended-A
38+
0180..024F; Latin Extended-B
39+
0250..02AF; IPA Extensions
40+
02B0..02FF; Spacing Modifier Letters
41+
0300..036F; Combining Diacritical Marks
42+
0370..03FF; Greek and Coptic
43+
0400..04FF; Cyrillic
44+
0500..052F; Cyrillic Supplement
45+
0530..058F; Armenian
46+
0590..05FF; Hebrew
47+
0600..06FF; Arabic
48+
0700..074F; Syriac
49+
0750..077F; Arabic Supplement
50+
0780..07BF; Thaana
51+
07C0..07FF; NKo
52+
0800..083F; Samaritan
53+
0840..085F; Mandaic
54+
0860..086F; Syriac Supplement
55+
08A0..08FF; Arabic Extended-A
56+
0900..097F; Devanagari
57+
0980..09FF; Bengali
58+
0A00..0A7F; Gurmukhi
59+
0A80..0AFF; Gujarati
60+
0B00..0B7F; Oriya
61+
0B80..0BFF; Tamil
62+
0C00..0C7F; Telugu
63+
0C80..0CFF; Kannada
64+
0D00..0D7F; Malayalam
65+
0D80..0DFF; Sinhala
66+
0E00..0E7F; Thai
67+
0E80..0EFF; Lao
68+
0F00..0FFF; Tibetan
69+
1000..109F; Myanmar
70+
10A0..10FF; Georgian
71+
1100..11FF; Hangul Jamo
72+
1200..137F; Ethiopic
73+
1380..139F; Ethiopic Supplement
74+
13A0..13FF; Cherokee
75+
1400..167F; Unified Canadian Aboriginal Syllabics
76+
1680..169F; Ogham
77+
16A0..16FF; Runic
78+
1700..171F; Tagalog
79+
1720..173F; Hanunoo
80+
1740..175F; Buhid
81+
1760..177F; Tagbanwa
82+
1780..17FF; Khmer
83+
1800..18AF; Mongolian
84+
18B0..18FF; Unified Canadian Aboriginal Syllabics Extended
85+
1900..194F; Limbu
86+
1950..197F; Tai Le
87+
1980..19DF; New Tai Lue
88+
19E0..19FF; Khmer Symbols
89+
1A00..1A1F; Buginese
90+
1A20..1AAF; Tai Tham
91+
1AB0..1AFF; Combining Diacritical Marks Extended
92+
1B00..1B7F; Balinese
93+
1B80..1BBF; Sundanese
94+
1BC0..1BFF; Batak
95+
1C00..1C4F; Lepcha
96+
1C50..1C7F; Ol Chiki
97+
1C80..1C8F; Cyrillic Extended-C
98+
1C90..1CBF; Georgian Extended
99+
1CC0..1CCF; Sundanese Supplement
100+
1CD0..1CFF; Vedic Extensions
101+
1D00..1D7F; Phonetic Extensions
102+
1D80..1DBF; Phonetic Extensions Supplement
103+
1DC0..1DFF; Combining Diacritical Marks Supplement
104+
1E00..1EFF; Latin Extended Additional
105+
1F00..1FFF; Greek Extended
106+
2000..206F; General Punctuation
107+
2070..209F; Superscripts and Subscripts
108+
20A0..20CF; Currency Symbols
109+
20D0..20FF; Combining Diacritical Marks for Symbols
110+
2100..214F; Letterlike Symbols
111+
2150..218F; Number Forms
112+
2190..21FF; Arrows
113+
2200..22FF; Mathematical Operators
114+
2300..23FF; Miscellaneous Technical
115+
2400..243F; Control Pictures
116+
2440..245F; Optical Character Recognition
117+
2460..24FF; Enclosed Alphanumerics
118+
2500..257F; Box Drawing
119+
2580..259F; Block Elements
120+
25A0..25FF; Geometric Shapes
121+
2600..26FF; Miscellaneous Symbols
122+
2700..27BF; Dingbats
123+
27C0..27EF; Miscellaneous Mathematical Symbols-A
124+
27F0..27FF; Supplemental Arrows-A
125+
2800..28FF; Braille Patterns
126+
2900..297F; Supplemental Arrows-B
127+
2980..29FF; Miscellaneous Mathematical Symbols-B
128+
2A00..2AFF; Supplemental Mathematical Operators
129+
2B00..2BFF; Miscellaneous Symbols and Arrows
130+
2C00..2C5F; Glagolitic
131+
2C60..2C7F; Latin Extended-C
132+
2C80..2CFF; Coptic
133+
2D00..2D2F; Georgian Supplement
134+
2D30..2D7F; Tifinagh
135+
2D80..2DDF; Ethiopic Extended
136+
2DE0..2DFF; Cyrillic Extended-A
137+
2E00..2E7F; Supplemental Punctuation
138+
2E80..2EFF; CJK Radicals Supplement
139+
2F00..2FDF; Kangxi Radicals
140+
2FF0..2FFF; Ideographic Description Characters
141+
3000..303F; CJK Symbols and Punctuation
142+
3040..309F; Hiragana
143+
30A0..30FF; Katakana
144+
3100..312F; Bopomofo
145+
3130..318F; Hangul Compatibility Jamo
146+
3190..319F; Kanbun
147+
31A0..31BF; Bopomofo Extended
148+
31C0..31EF; CJK Strokes
149+
31F0..31FF; Katakana Phonetic Extensions
150+
3200..32FF; Enclosed CJK Letters and Months
151+
3300..33FF; CJK Compatibility
152+
3400..4DBF; CJK Unified Ideographs Extension A
153+
4DC0..4DFF; Yijing Hexagram Symbols
154+
4E00..9FFF; CJK Unified Ideographs
155+
A000..A48F; Yi Syllables
156+
A490..A4CF; Yi Radicals
157+
A4D0..A4FF; Lisu
158+
A500..A63F; Vai
159+
A640..A69F; Cyrillic Extended-B
160+
A6A0..A6FF; Bamum
161+
A700..A71F; Modifier Tone Letters
162+
A720..A7FF; Latin Extended-D
163+
A800..A82F; Syloti Nagri
164+
A830..A83F; Common Indic Number Forms
165+
A840..A87F; Phags-pa
166+
A880..A8DF; Saurashtra
167+
A8E0..A8FF; Devanagari Extended
168+
A900..A92F; Kayah Li
169+
A930..A95F; Rejang
170+
A960..A97F; Hangul Jamo Extended-A
171+
A980..A9DF; Javanese
172+
A9E0..A9FF; Myanmar Extended-B
173+
AA00..AA5F; Cham
174+
AA60..AA7F; Myanmar Extended-A
175+
AA80..AADF; Tai Viet
176+
AAE0..AAFF; Meetei Mayek Extensions
177+
AB00..AB2F; Ethiopic Extended-A
178+
AB30..AB6F; Latin Extended-E
179+
AB70..ABBF; Cherokee Supplement
180+
ABC0..ABFF; Meetei Mayek
181+
AC00..D7AF; Hangul Syllables
182+
D7B0..D7FF; Hangul Jamo Extended-B
183+
D800..DB7F; High Surrogates
184+
DB80..DBFF; High Private Use Surrogates
185+
DC00..DFFF; Low Surrogates
186+
E000..F8FF; Private Use Area
187+
F900..FAFF; CJK Compatibility Ideographs
188+
FB00..FB4F; Alphabetic Presentation Forms
189+
FB50..FDFF; Arabic Presentation Forms-A
190+
FE00..FE0F; Variation Selectors
191+
FE10..FE1F; Vertical Forms
192+
FE20..FE2F; Combining Half Marks
193+
FE30..FE4F; CJK Compatibility Forms
194+
FE50..FE6F; Small Form Variants
195+
FE70..FEFF; Arabic Presentation Forms-B
196+
FF00..FFEF; Halfwidth and Fullwidth Forms
197+
FFF0..FFFF; Specials
198+
10000..1007F; Linear B Syllabary
199+
10080..100FF; Linear B Ideograms
200+
10100..1013F; Aegean Numbers
201+
10140..1018F; Ancient Greek Numbers
202+
10190..101CF; Ancient Symbols
203+
101D0..101FF; Phaistos Disc
204+
10280..1029F; Lycian
205+
102A0..102DF; Carian
206+
102E0..102FF; Coptic Epact Numbers
207+
10300..1032F; Old Italic
208+
10330..1034F; Gothic
209+
10350..1037F; Old Permic
210+
10380..1039F; Ugaritic
211+
103A0..103DF; Old Persian
212+
10400..1044F; Deseret
213+
10450..1047F; Shavian
214+
10480..104AF; Osmanya
215+
104B0..104FF; Osage
216+
10500..1052F; Elbasan
217+
10530..1056F; Caucasian Albanian
218+
10600..1077F; Linear A
219+
10800..1083F; Cypriot Syllabary
220+
10840..1085F; Imperial Aramaic
221+
10860..1087F; Palmyrene
222+
10880..108AF; Nabataean
223+
108E0..108FF; Hatran
224+
10900..1091F; Phoenician
225+
10920..1093F; Lydian
226+
10980..1099F; Meroitic Hieroglyphs
227+
109A0..109FF; Meroitic Cursive
228+
10A00..10A5F; Kharoshthi
229+
10A60..10A7F; Old South Arabian
230+
10A80..10A9F; Old North Arabian
231+
10AC0..10AFF; Manichaean
232+
10B00..10B3F; Avestan
233+
10B40..10B5F; Inscriptional Parthian
234+
10B60..10B7F; Inscriptional Pahlavi
235+
10B80..10BAF; Psalter Pahlavi
236+
10C00..10C4F; Old Turkic
237+
10C80..10CFF; Old Hungarian
238+
10D00..10D3F; Hanifi Rohingya
239+
10E60..10E7F; Rumi Numeral Symbols
240+
10E80..10EBF; Yezidi
241+
10F00..10F2F; Old Sogdian
242+
10F30..10F6F; Sogdian
243+
10FB0..10FDF; Chorasmian
244+
10FE0..10FFF; Elymaic
245+
11000..1107F; Brahmi
246+
11080..110CF; Kaithi
247+
110D0..110FF; Sora Sompeng
248+
11100..1114F; Chakma
249+
11150..1117F; Mahajani
250+
11180..111DF; Sharada
251+
111E0..111FF; Sinhala Archaic Numbers
252+
11200..1124F; Khojki
253+
11280..112AF; Multani
254+
112B0..112FF; Khudawadi
255+
11300..1137F; Grantha
256+
11400..1147F; Newa
257+
11480..114DF; Tirhuta
258+
11580..115FF; Siddham
259+
11600..1165F; Modi
260+
11660..1167F; Mongolian Supplement
261+
11680..116CF; Takri
262+
11700..1173F; Ahom
263+
11800..1184F; Dogra
264+
118A0..118FF; Warang Citi
265+
11900..1195F; Dives Akuru
266+
119A0..119FF; Nandinagari
267+
11A00..11A4F; Zanabazar Square
268+
11A50..11AAF; Soyombo
269+
11AC0..11AFF; Pau Cin Hau
270+
11C00..11C6F; Bhaiksuki
271+
11C70..11CBF; Marchen
272+
11D00..11D5F; Masaram Gondi
273+
11D60..11DAF; Gunjala Gondi
274+
11EE0..11EFF; Makasar
275+
11FB0..11FBF; Lisu Supplement
276+
11FC0..11FFF; Tamil Supplement
277+
12000..123FF; Cuneiform
278+
12400..1247F; Cuneiform Numbers and Punctuation
279+
12480..1254F; Early Dynastic Cuneiform
280+
13000..1342F; Egyptian Hieroglyphs
281+
13430..1343F; Egyptian Hieroglyph Format Controls
282+
14400..1467F; Anatolian Hieroglyphs
283+
16800..16A3F; Bamum Supplement
284+
16A40..16A6F; Mro
285+
16AD0..16AFF; Bassa Vah
286+
16B00..16B8F; Pahawh Hmong
287+
16E40..16E9F; Medefaidrin
288+
16F00..16F9F; Miao
289+
16FE0..16FFF; Ideographic Symbols and Punctuation
290+
17000..187FF; Tangut
291+
18800..18AFF; Tangut Components
292+
18B00..18CFF; Khitan Small Script
293+
18D00..18D8F; Tangut Supplement
294+
1B000..1B0FF; Kana Supplement
295+
1B100..1B12F; Kana Extended-A
296+
1B130..1B16F; Small Kana Extension
297+
1B170..1B2FF; Nushu
298+
1BC00..1BC9F; Duployan
299+
1BCA0..1BCAF; Shorthand Format Controls
300+
1D000..1D0FF; Byzantine Musical Symbols
301+
1D100..1D1FF; Musical Symbols
302+
1D200..1D24F; Ancient Greek Musical Notation
303+
1D2E0..1D2FF; Mayan Numerals
304+
1D300..1D35F; Tai Xuan Jing Symbols
305+
1D360..1D37F; Counting Rod Numerals
306+
1D400..1D7FF; Mathematical Alphanumeric Symbols
307+
1D800..1DAAF; Sutton SignWriting
308+
1E000..1E02F; Glagolitic Supplement
309+
1E100..1E14F; Nyiakeng Puachue Hmong
310+
1E2C0..1E2FF; Wancho
311+
1E800..1E8DF; Mende Kikakui
312+
1E900..1E95F; Adlam
313+
1EC70..1ECBF; Indic Siyaq Numbers
314+
1ED00..1ED4F; Ottoman Siyaq Numbers
315+
1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
316+
1F000..1F02F; Mahjong Tiles
317+
1F030..1F09F; Domino Tiles
318+
1F0A0..1F0FF; Playing Cards
319+
1F100..1F1FF; Enclosed Alphanumeric Supplement
320+
1F200..1F2FF; Enclosed Ideographic Supplement
321+
1F300..1F5FF; Miscellaneous Symbols and Pictographs
322+
1F600..1F64F; Emoticons
323+
1F650..1F67F; Ornamental Dingbats
324+
1F680..1F6FF; Transport and Map Symbols
325+
1F700..1F77F; Alchemical Symbols
326+
1F780..1F7FF; Geometric Shapes Extended
327+
1F800..1F8FF; Supplemental Arrows-C
328+
1F900..1F9FF; Supplemental Symbols and Pictographs
329+
1FA00..1FA6F; Chess Symbols
330+
1FA70..1FAFF; Symbols and Pictographs Extended-A
331+
1FB00..1FBFF; Symbols for Legacy Computing
332+
20000..2A6DF; CJK Unified Ideographs Extension B
333+
2A700..2B73F; CJK Unified Ideographs Extension C
334+
2B740..2B81F; CJK Unified Ideographs Extension D
335+
2B820..2CEAF; CJK Unified Ideographs Extension E
336+
2CEB0..2EBEF; CJK Unified Ideographs Extension F
337+
2F800..2FA1F; CJK Compatibility Ideographs Supplement
338+
30000..3134F; CJK Unified Ideographs Extension G
339+
E0000..E007F; Tags
340+
E0100..E01EF; Variation Selectors Supplement
341+
F0000..FFFFF; Supplementary Private Use Area-A
342+
100000..10FFFF; Supplementary Private Use Area-B
343+
344+
# EOF

‎clj/resources/unicode/PropertyValueAliases.txt

+1,595
Large diffs are not rendered by default.

‎clj/resources/unicode/license

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
2+
See Terms of Use for definitions of Unicode Inc.'s
3+
Data Files and Software.
4+
5+
NOTICE TO USER: Carefully read the following legal agreement.
6+
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
7+
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
8+
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
9+
TERMS AND CONDITIONS OF THIS AGREEMENT.
10+
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
11+
THE DATA FILES OR SOFTWARE.
12+
13+
COPYRIGHT AND PERMISSION NOTICE
14+
15+
Copyright © 1991-2020 Unicode, Inc. All rights reserved.
16+
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
17+
18+
Permission is hereby granted, free of charge, to any person obtaining
19+
a copy of the Unicode data files and any associated documentation
20+
(the "Data Files") or Unicode software and any associated documentation
21+
(the "Software") to deal in the Data Files or Software
22+
without restriction, including without limitation the rights to use,
23+
copy, modify, merge, publish, distribute, and/or sell copies of
24+
the Data Files or Software, and to permit persons to whom the Data Files
25+
or Software are furnished to do so, provided that either
26+
(a) this copyright and permission notice appear with all copies
27+
of the Data Files or Software, or
28+
(b) this copyright and permission notice appear in associated
29+
Documentation.
30+
31+
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
32+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
33+
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
34+
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
35+
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
36+
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
37+
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
38+
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
39+
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
40+
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
41+
42+
Except as contained in this notice, the name of a copyright holder
43+
shall not be used in advertising or otherwise to promote the sale,
44+
use or other dealings in these Data Files or Software without prior
45+
written authorization of the copyright holder.

‎clj/src/vim_clojure_static/generate.clj

+69-40
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,12 @@
66
[clojure.java.shell :refer [sh]]
77
[clojure.set :as set]
88
[clojure.string :as string]
9+
[clojure.data.csv :as csv]
910
[frak :as f])
1011
(:import (clojure.lang MultiFn)
11-
(java.lang Character$UnicodeBlock Character$UnicodeScript)
12-
(java.lang.reflect Field)
1312
(java.text SimpleDateFormat)
1413
(java.util Date)
15-
(java.util.regex Pattern Pattern$CharPropertyNames UnicodeProp)))
14+
(java.util.regex Pattern)))
1615

1716
;;
1817
;; Helpers
@@ -40,23 +39,12 @@
4039
(name group)
4140
(property-pattern (format fmt (vim-frak-pattern props)) braces?))))
4241

43-
(defn- get-private-field
44-
"Violate encapsulation and get the value of a private field."
45-
[^Class cls fieldname]
46-
(let [^Field field (first (filter #(= fieldname (.getName ^Field %))
47-
(.getDeclaredFields cls)))]
48-
(.setAccessible field true)
49-
(.get field field)))
50-
5142
(defn- fn-var? [v]
5243
(let [f @v]
5344
(or (contains? (meta v) :arglists)
5445
(fn? f)
5546
(instance? MultiFn f))))
5647

57-
(defn- inner-class-name [^Class cls]
58-
(string/replace (.getName cls) #".*\$(.+)" "$1"))
59-
6048
(defn- map-keyword-names [coll]
6149
(reduce
6250
(fn [v x]
@@ -129,33 +117,74 @@
129117
(set/difference syms group-syms)]))
130118
[builtins coresyms] group-preds))))
131119

120+
;; Java 8 Character class implements Unicode standard 6.2 from 2012 [1],
121+
;; Java 15 implements Unicode standard 13 from 2020 [2],
122+
;; the latter standard includes a few more scripts and removes some.
123+
;; Unicode Technical Standard #18 [3] describes Unicode Regular Expressions.
124+
;; java.util.regex.Pattern [4] describes which parts of the Unicode standard are supported.
125+
;; Some values that are mentioned in neither the Unicode standard nor the Javadoc are also supported [5].
126+
;;
127+
;; [1] https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html
128+
;; [2] https://docs.oracle.com/en/java/javase/15/docs/api/java.base/java/lang/Character.html
129+
;; [3] https://unicode.org/reports/tr18/
130+
;; [4] https://docs.oracle.com/en/java/javase/15/docs/api/java.base/java/util/regex/Pattern.html
131+
;; [5] https://github.com/openjdk/jdk/blob/4d13bf33d4932cc210a29c4e3a68f848db18575b/src/java.base/share/classes/java/util/regex/CharPredicates.java
132+
133+
(defn- read-unicode-data
  "Parse the semicolon-separated Unicode data file found on the classpath at
  `path` into a fully realized sequence of rows, each row being a vector of
  whitespace-trimmed fields. Comment and blank lines are not removed here;
  they come back as rows too and must be filtered by the caller.
  Throws ex-info when the resource cannot be found."
  [path]
  (if-let [resource (io/resource path)]
    (with-open [f (io/reader resource)]
      ;; read-csv is lazy: force every row before with-open closes the reader.
      (doall
        (map (fn [row]
               (mapv string/trim row))
             (csv/read-csv f :separator \;))))
    (throw (ex-info (str "Unicode data file not found on classpath: " path)
                    {:path path}))))

(def unicode-property-value-aliases
  "Rows of resources/unicode/PropertyValueAliases.txt, one vector per line."
  (read-unicode-data "unicode/PropertyValueAliases.txt"))

(def unicode-blocks
  "Rows of resources/unicode/Blocks.txt, one vector per line."
  (read-unicode-data "unicode/Blocks.txt"))
146+
147+
(defn- block-alt-names
  "Return a vector of three upper-cased spellings of `block-name`:
  the name as given, the name with every space and hyphen turned into an
  underscore, and the name with spaces removed. The spellings may coincide
  for names that contain neither spaces nor hyphens."
  [block-name]
  (let [canonical      (string/upper-case block-name)
        constant-style (string/replace canonical #"[ -]" "_")
        squashed       (string/replace canonical #" " "")]
    [canonical constant-style squashed]))
152+
132153
(def character-properties
133-
"Character property names derived via reflection."
134-
(let [props (->> (get-private-field Pattern$CharPropertyNames "map")
135-
(mapv (fn [[prop field]] [(inner-class-name (class field)) prop]))
136-
(group-by first)
137-
(reduce-kv (fn [m k v] (assoc m k (mapv peek v))) {}))
138-
binary (concat (map #(.name ^UnicodeProp %) (get-private-field UnicodeProp "$VALUES"))
139-
(keys (get-private-field UnicodeProp "aliases")))
140-
script (concat (map #(.name ^Character$UnicodeScript %) (Character$UnicodeScript/values))
141-
(keys (get-private-field Character$UnicodeScript "aliases")))
142-
block (keys (get-private-field Character$UnicodeBlock "map"))]
143-
;;
144-
;; * The keys "1"…"5" reflect the order of CharPropertyFactory
145-
;; declarations in Pattern.java!
146-
;;
147-
;; * The "L1" (Latin-1) category is not defined by Unicode and exists
148-
;; merely as an alias for the first 8 bits of code points.
149-
;;
150-
;; * The "all" category is the Unicode "Any" category by a different name,
151-
;; and thus excluded.
152-
;;
153-
{:posix (disj (set (mapcat (partial get props) ["2" "3"])) "L1")
154-
:java (set (get props "4"))
155-
:binary (set binary)
156-
:category (set (get props "1"))
157-
:script (set script)
158-
:block (set block)}))
154+
{:posix #{"Lower" "Space" "XDigit" "Alnum" "Cntrl" "Graph" "Alpha" "Print"
155+
"Blank" "Digit" "Upper" "Punct" "ASCII"}
156+
:java #{"javaSpaceChar" "javaUnicodeIdentifierPart" "javaLetterOrDigit"
157+
"javaTitleCase" "javaLowerCase" "javaDefined" "javaAlphabetic"
158+
"javaIdentifierIgnorable" "javaJavaIdentifierStart"
159+
"javaIdeographic" "javaWhitespace" "javaMirrored"
160+
"javaUnicodeIdentifierStart" "javaISOControl" "javaUpperCase"
161+
"javaDigit" "javaLetter" "javaJavaIdentifierPart"}
162+
:binary #{"IDEOGRAPHIC" "HEX_DIGIT" "ALPHABETIC" "NONCHARACTERCODEPOINT"
163+
"GRAPH" "PUNCTUATION" "WORD" "LETTER" "TITLECASE" "JOIN_CONTROL"
164+
"CONTROL" "HEXDIGIT" "LOWERCASE" "NONCHARACTER_CODE_POINT"
165+
"JOINCONTROL" "BLANK" "WHITESPACE" "ALNUM" "DIGIT" "WHITE_SPACE"
166+
"ASSIGNED" "UPPERCASE" "PRINT"}
167+
;; https://www.unicode.org/reports/tr44/#General_Category_Values
168+
:category (->> unicode-property-value-aliases
169+
(filter #(= "gc" (first %)))
170+
(map second)
171+
;; Supported by Java but not in standard or docs.
172+
(concat ["LD"])
173+
set)
174+
:script (->> unicode-property-value-aliases
175+
(filter #(= "sc" (first %)))
176+
(mapcat #(subvec % 1 3))
177+
(map string/upper-case)
178+
set)
179+
:block (->> unicode-blocks
180+
(filter (fn [row]
181+
(and (seq (first row))
182+
(not (string/starts-with? (first row) "#")))))
183+
(map second)
184+
;; Old names supported by Java
185+
(concat ["GREEK" "COMBINING MARKS FOR SYMBOLS" "CYRILLIC SUPPLEMENTARY"])
186+
(mapcat block-alt-names)
187+
set)})
159188

160189
(def lispwords
161190
"Specially indented symbols in clojure.core and clojure.test. The following

‎clj/test/vim_clojure_static/generate_test.clj

+10-239
Large diffs are not rendered by default.

‎syntax/clojure.vim

+3-3
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.