#!/usr/bin/env python3
#
# Tokenizer.py: Port of my Perl tokenizer (Tokenizer.pm).
# 2012-08-22: Written by Steven J. DeRose.
#
import sys
import codecs
import unicodedata
import urllib
import html
#from html.parser import HTMLParser
import regex as re # Adds support for \p{}. See https://pypi.org/project/regex/
def unicode(s, encoding="utf-8", errors="strict"):
    if (not isinstance(s, str)): return str(s, encoding, errors)
    return s
__metadata__ = {
"title" : "Tokenizer",
"description" : "An NLP tokenizer that knows about Unicode.",
"rightsHolder" : "Steven J. DeRose",
"creator" : "http://viaf.org/viaf/50334488",
"type" : "http://purl.org/dc/dcmitype/Software",
"language" : "Python 3.7",
"created" : "2012-08-22",
"modified" : "2022-03-11",
"publisher" : "http://github.com/sderose",
"license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
__version__ = __metadata__["modified"]
descr = """
=Description=
Divide strings into tokens, trying to be sensible about Unicode issues,
complicated numbers, dates and times, DNA, URLs, emails, contractions, etc.
This is a natural-language tokenizer, intended as a front-end to NLP
software, particularly lexico-statistical calculators. It's pretty
knowledgeable about Unicode, especially characters used in quality typography.
It can also be used to normalize text without tokenizing, or
as a preprocessor for more extensive NLP stacks.
If you run this package directly from the command line, it will tokenize
and display the specified input file(s) or STDIN. If you specify "*" instead
of a filename, it will use some text of its own as a test.
There are several tokenizers included:
* The ''SimpleTokenizer'' class is quick and should be adequate for
many uses. It drops soft/optional hyphens, normalizes Unicode (accents,
ligatures, etc.), then applies a few regexes to insert
extra spaces (such as before the apostrophe of contractions, around em dashes,
after opening punctuation, etc.), and then splits on space-runs.
It offers an option to use ''TokensEN.py'' for contractions, but
otherwise applies a simplified rule for them.
This is the easiest to use, for example:
    from Tokenizer import SimpleTokenizer
    myTok = SimpleTokenizer()
    for rec in fh:
        tokens = myTok.tokenize(rec)
        print("\n".join(tokens))
It provides a few keyword options on the constructor:
**'''normalize''' -- What Unicode normalization to apply.
Default: 'NFKD' (others are 'NFC', 'NFD', and 'NFKC').
**'''breakHyphens''' -- Whether to split hyphenated words. Default: False.
**'''fancyContractions''' -- Whether to use the ''TokensEN'' package
to handle contractions, or just some local regexes. Default: False.
**'''verbose''' -- Whether to print steps of progress. Default: False.
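For example, to keep composed characters and split hyphenated words (a sketch; only the constructor keywords listed above are assumed):
    myTok = SimpleTokenizer(normalize="NFC", breakHyphens=True)
    for rec in fh:
        print(myTok.tokenize(rec))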
* ''NLTKTokenizerPlus'' applies the NLTK tokenizer, then
makes a few adjustments. Usage is the same as just shown,
just substitute "NLTKTokenizerPlus".
* ''HeavyTokenizer'' provides a variety of useful but complex steps, and
can do a lot of manipulation to help when gathering lexicostatistics.
Usage is the same as just shown, just substitute "HeavyTokenizer". However, you'll
probably want to use ''setOption''() to pick exactly what behaviors you want.
==Details on the HeavyTokenizer==
* Expand format-specific special characters like < %42 \\uFFFF
* Normalize the character inventory (accents, ligatures, quotes,
dashes (soft, em, etc.), spaces (non-breaking, em-width, etc.), etc.)
* Shorten long repetition sequences like 'argggggg'
* Deal with contractions (this part is English-only so far)
* Identify many specific types of non-word tokens (numbers, dates, URIs, DNA, emoticons, ...)
* Filter out or merge particular tokens (for example, sometimes you
might want only alphabetic tokens; or no punctuation; or want a "cover"
token such as "99" to represent ''all'' numbers; etc.) This is especially
useful when developing lexica from corpora.
* Generate the actual tokenized result.
HeavyTokenizer focuses on handling complex Unicode issues well (other than
word-division in space-less orthographies, which it doesn't attempt).
Some of the features in more detail:
* Characters represented in special ways, such as %xx codes used in URIs,
named and numeric character references in HTML and XML, and so on. These are
very often found in lexical databases, and are often handled incorrectly.
* Less-common characters such as ligatures, accents,
non-Latin digits, fractions, hyphens and dashes, quotes, and spaces,
presentation variants, footnote markers, angstrom vs. a-with-ring, etc.
Many of the NLP systems I've examined fail on quite common cases such as
hard spaces, ligatures, curly quotes, soft hyphens, and em dashes.
This tokenizer is intended not to.
* Many kinds of non-word tokens, such as URIs, Twitter hashtags and userids,
and jargon; numbers, dates, times, email addresses, etc. Imho, NLP systems
shouldn't break URIs at every "/" and then try to POS-tag the directory names.
* Contemporary conventions such as emphasis via special punctuation
(*word*), or via repeating letters (aaaarrrrrggggghhhhhh, hahahaha).
* Special cases such as contractions and possessives
(with and without explicit apostrophes), hyphenated words
(not the same thing as em-dash-separated clauses), etc.
* When collecting or measuring vocabulary,
options to filter out unwanted tokens are very useful.
For example, the non-word types already mentioned are important for some purposes, but
not for others. Words already listed in given dictionaries can be discarded, as can tokens
in all-lower, all-upper, title, camel, or other case patterns; numbers;
tokens containing special characters; and long or short tokens. There are many filtering
options, so you can easily winnow a list down to just what you want.
* Numerics are, I think, handled better than is typical. You don't need integers listed
in a lexicon, or to miss ones that aren't. Likewise for floating-point numbers,
currency, etc. Decades such as 50's, 60s, '70s, and '80's should also just work.
=Usage=
==Example==
    from Tokenizer import HeavyTokenizer
    myTok = HeavyTokenizer("characters")
    myTok.setOption("Uppercase_Letter", "lower")
    for rec in fh:
        tokens = myTok.tokenize(rec)
        for token in tokens:
            counts[token] += 1
There are several steps to the process of tokenizing.
They are described in order below, with the options applicable to each.
Option names appear in '''BOLD''', and values in ''ITALIC'' below.
The type of value expected is shown in (parentheses): either (boolean), (int),
or (disp), unless otherwise described.
==1: Expand escaped characters==
These options all begin with "X_" and all take (boolean) values,
for whether to expand them to a literal character.
* '''X_BACKSLASH''' -- A lot of cases are covered.
* '''X_URI''' -- %-escapes as used in URIs.
Not to be confused with the '''T_URI''' option for tokenizing (see below).
* '''X_ENTITY''' -- Covers HTML and XML named entities and
numeric character references (assuming the caller didn't already parse and
expand them).
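For example, to expand both URI %-escapes and XML/HTML character references before tokenizing (a sketch; only the option names above and ''setOption''(), described under Methods, are assumed):
    myTok = HeavyTokenizer("words")
    myTok.setOption("X_URI", True)
    myTok.setOption("X_ENTITY", True)
    tokens = myTok.tokenize("Tom%20&amp;%20Jerry")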
==2: Normalize the character set==
These options are distinguished by being named in Title_Case with underscores
(following the Perl convention for Unicode character class names).
See [http://unicode.org/reports/tr44/tr44-4.html#General_Category_Values].
This all assumes that the data is already Unicode, so be careful of CP1252.
* '''Ascii_Only''' (boolean) -- a special case.
Discards all non-ASCII characters, and turns control characters (such as
CR, LF, FF, VT, and TAB) to space. If you specify this, you should not specify
other character set normalization options.
All other character set normalization options are of type (disp):
(disp) values that apply to any character category at all:
"keep" -- Don't change the characters
"delete" -- Delete the characters entirely
"space" -- Replace the characters with a space
"unify" -- Convert all matches to a single character (see below)
(disp) values only for Number and its subtypes:
"value" -- Replace with the value
(disp) values only for Letter and its subtypes:
"upper" -- Force to upper-case
"lower" -- Force to lower-case
"strip" -- Decompose (NFKD) and then strip any diacritics
"decompose" -- Decompose (NFKD) into component characters
''Letter'' and its subcategories default to `keep`; all other
character categories default to `unify` (see below for the
meaning of "unify" for each case).
'''Note''': A character may have multiple decompositions, or may be
undecomposable. The resulting string will also be in Compatibility decomposition
(see [http://unicode.org/reports/tr15/]) and
Unicode's Canonical Ordering Behavior. Compatibility decomposition combines
stylistic variations such as font, breaking, cursive, circled, width,
rotation, superscript, squared, fractions, ''some'' ligatures
(for example ff but not oe), and pairs like angstrom vs. A with ring,
ohm vs omega, long s vs. s.
`unify` changes each character of the given class
to one particular ASCII character to represent the class (this is useful for
finding interesting patterns of use):
Letter unifies to "A"
Cased_Letter unifies to "A"
Uppercase_Letter unifies to "A"
Lowercase_Letter unifies to "a"
Titlecase_Letter unifies to "Fi"
Modifier_Letter unifies to "A"
Other_Letter unifies to "A"
Mark unifies to " "
Nonspacing_Mark unifies to " "
Spacing_Mark unifies to " "
Enclosing_Mark unifies to " "
Number unifies to "9"
Decimal_Number unifies to "9"
Letter_Number unifies to "9"
Other_Number unifies to "9"
Punctuation unifies to "."
Connector_Punctuation unifies to "_"
Dash_Punctuation unifies to "-"
Open_Punctuation unifies to "("
Close_Punctuation unifies to ")"
Initial_Punctuation unifies to "`"
Final_Punctuation unifies to "'"
Other_Punctuation unifies to "*"
Symbol unifies to "#"
Math_Symbol unifies to "="
Currency_Symbol unifies to "\\$"
Modifier_Symbol unifies to "#"
Other_Symbol unifies to "#"
Separator unifies to " "
Space_Separator unifies to " "
Line_Separator unifies to " "
Paragraph_Separator unifies to " "
Other unifies to "?"
Control unifies to "?"
(includes more than 64 characters, for example the C1 range U+0080 to U+009F.)
Format unifies to "?"
Surrogate unifies to "?"
Private_Use unifies to "?"
Unassigned unifies to "?"
`unify` can also be used for the Non-word token options (see below); in that
case, each option has a particular value to which matching ''tokens'' unify.
Setting the option for a cover category (such as ''Letter'') is merely shorthand
for setting all its subcategories to that value. Some or all subcategories can
still be reset afterward, but any ''earlier'' setting for a subcategory
is discarded when you set its cover category.
To get a list of the category options run `Tokenizer.pm -list`.
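For example (a sketch; only the category names and (disp) values described above are assumed):
    myTok.setOption("Letter", "keep")              # set the cover category first
    myTok.setOption("Uppercase_Letter", "lower")   # then adjust a subcategory
    myTok.setOption("Number", "unify")             # all digits unify to "9"
    myTok.setOption("Punctuation", "keep")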
The following character set normalization options can also be used
(but are not Unicode General Categories):
* '''Accent''' --
These are related to Unicode '''Nonspacing_Mark''',
but that also would include vowel marks, which this doesn't.
''decompose'' and ''strip'' are important values for this option:
the former splits a composed letter+diacritic or similar combination
into its component parts; the latter discards the diacritic instead.
''delete'' discards the whole accent+letter combination (?).
'''Note''': There is a separate Unicode property called "Diacritic",
but it isn't available here yet.
* '''Control_0''' -- The C0 control characters.
That is, the usual ones from \\x00 to \\x1F.
This option only matters if ''Control'' is set to `keep`.
* '''Control_1''' -- The C1 control characters.
That is, the "upper half" ones from \\x80 to \\x9F.
'''Note''': These are graphical characters in the common Windows(r) character
set known as "CP1252", but not in Unicode or most other sets.
This option only matters if ''Control'' is set to `keep`.
* '''Digit''' -- characters 0-9 -- Cf Unicode '''Number''', which is broader.
* '''Ligature''' characters -- This also includes titlecase and digraph
characters. '''Note''': Some Unicode ligatures, particularly in Greek, may also
be involved in accent normalization.
See also [http://en.wikipedia.org/wiki/Typographic_ligature]
'''(not yet supported)'''
* '''Fullwidth''' --
See [http://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms]
'''(not yet supported)'''
* '''Math''' -- Unicode includes many variants of the entire Latin
alphabet, such as script, sans serif, and others.
These are in the Unicode '''Math''' general category.
'''(not yet supported)'''
* '''Nbsp''' -- The non-breaking space character, U+00A0. This
defaults to being changed to a regular space.
* '''Soft_Hyphen''' -- The soft (optional) hyphen characters,
U+00AD and U+1806. These default to being deleted.
==3: Shorten runs of the same character==
These options are all (boolean).
* '''N_CHAR''' Reduce runs of >= N of the same
word-character in a row, to just N occurrences. This is for things like
"aaaaaaaarrrrrrrrrgggggggghhhhhh". However, it does not yet cover things
like "hahahaha".
* '''N_SPACE''' Reduce runs of >= N white-space characters
(not necessarily all the same) to just N.
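For example (a sketch; these options are (boolean) as stated above):
    myTok.setOption("N_CHAR", True)    # "aaaarrrgggghhh" gets shortened
    myTok.setOption("N_SPACE", True)   # runs of white-space get shortened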
==4: Non-word tokens==
This step can tweak various kinds of non-word tokens, such as
numbers, URIs, etc. The options are of type (disp), but the
only meaningful settings are "keep", "delete", "space", and "unify".
* '''T_TIME''' tokens, such as "6:24 pm".
* '''T_DATE''' tokens, such as "2012-08-22" or "2012 BCE".
Month names and abbreviations are not yet supported.
* '''T_FRACTION''' (including Unicode fraction characters if they
were not already normalized).
* '''T_NUMBER''' tokens, including signed or unsigned integers, reals,
and exponential notation (however, fractions are dealt with separately).
This does not include spelled-out numbers such as "five hundred".
(not yet supported)
* '''T_CURRENCY''' tokens, consisting of a currency symbol and a number,
such as $1, $29.95, etc.
* '''T_EMOTICON''' items
* '''T_HASHTAG''' items as in Twitter (#ibm)
* '''T_USER''' items as in Twitter (@john)
* '''T_EMAIL''' addresses
* '''T_URI''' items (see also the '''X_URI''' unescaping option earlier)
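For example, to reduce URIs and email addresses to cover tokens while leaving dates and times alone (a sketch; only the option names above are assumed):
    myTok.setOption("T_URI", "unify")
    myTok.setOption("T_EMAIL", "unify")
    myTok.setOption("T_DATE", "keep")
    myTok.setOption("T_TIME", "keep")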
==5: Split tokens==
As a first approximation, text can be broken at each white-space character(s),
at all individual `characters`, or `none` at all. The choice depends on the
''TOKENTYPE'' option. There is a host of other rules for various cases.
Then leading and trailing punctuation are broken off.
This prevents leaving parentheses, commas, quotes, etc. attached to words.
However, the script is not smart (at least, yet) about special cases such as:
$12 ~5.2 #1 +12
5'6" 5! 5%
U.S. p.m.
). ." +/-
(a) 501(c)(3)
@user #topic ~a AT&T
e'tre D'Avaux let's y'all and/or
house(s)
This needs some adjustments re. which punctuation is allowed on which
end. Harder problems include plural genitives: "The three ''dogs''' tails."
and abbreviations versus sentence-ends.
A few special cases are controlled by these ("S_") options, such as
re-mapping contractions and breaking up hyphenated words (by inserting
extra spaces).
* '''S_CONTRACTION''' can be set to "unify" in order to
expand most English contractions. For example:
won't, ain't, we'll, we'd, we're, we'll, somebody'd,
y'all, let's, gonna, cannot.
Not very useful for non-English cases like "dell'" or "c'est".
(see also POS/multitagTokens).
* '''S_HYPHENATED''' break at hyphens, making the hyphen a separate
token. (Doesn't deal with soft hyphens or other '''Format''' characters.)
* '''S_GENITIVE''' break "'s" to a separate token. This does not actually
catch all genitives, even in English (and many "'s" cases in English
can be either genitives or contractions of "is").
'''(not yet supported)'''
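For example (a sketch; '''S_GENITIVE''' is omitted since it is not yet supported, and the value type for '''S_HYPHENATED''' is an assumption):
    myTok.setOption("S_CONTRACTION", "unify")   # expand "won't", "let's", etc.
    myTok.setOption("S_HYPHENATED", True)       # break at hyphens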
==6: Filter out unwanted tokens ("words" mode only)==
These options are all (boolean) except for '''F_MINLENGTH''' and '''F_MAXLENGTH'''.
For Boolean filter options, the default is off, which means the tokens
are not discarded.
* '''F_MINLENGTH''' (int) -- Discard all tokens shorter than this.
* '''F_MAXLENGTH''' (int) -- Discard all tokens longer than this.
* '''F_SPACE''' (boolean) -- can be used to delete all white-space items.
* Filter by case and special-character pattern
Each of the following (disjoint) categories
can be controlled separately (see also ''--ignoreCase'', ''--Letter'', etc.):
** '''F_UPPER''' (boolean) -- remove words with only capital or caseless letters
** '''F_LOWER''' (boolean) -- remove words with only lower case or caseless letters
** '''F_TITLE''' (boolean) -- remove words with only an initial capital or
titlecase letter, followed by only lower case or caseless letters.
** '''F_MIXED''' (boolean) -- remove words with at least two capital and/or
titlecase letters, along with any number of lower case or caseless letters.
** '''F_ALNUM''' (boolean) -- remove words that contain both digits and
letters.
** '''F_PUNCT''' (boolean) -- remove words that contain both punctuation and
letters. However, hyphens, apostrophes, and periods do not count.
* Tokens in any specified '''F_DICT''' list are discarded. '''F_MINLENGTH''' ''4''
(see above) can serve as a passable substitute for a dictionary of
function words.
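For example, to keep only reasonably long, mostly-alphabetic tokens (a sketch; only the option names above are assumed):
    myTok.setOption("F_MINLENGTH", 4)
    myTok.setOption("F_SPACE", True)    # discard white-space tokens
    myTok.setOption("F_UPPER", True)    # discard all-caps tokens
    myTok.setOption("F_PUNCT", True)    # discard tokens mixing punctuation and letters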
=Outline of token types (unfinished)=
(see also earlier sections)
(is there a useful type for "nested syntax"? dates, times, formulae,
phone numbers, music, ...)
Numeric
Int
Dec, Oct, Bin, Hex, Roman
Real
Float, Exp, Frac, pct/pmil
Ordinal
1st, #1, first
Complex
Matrix
Math forms, roman and frac unicode, circled,....
Formula
Special constants: pi, euler, c, angstrom, micro prefix
Date/time (under numeric? unit?)
Unit
Currency
Dimension, scale
32F 100C 273.15K
Punc
Quote
Left/right/plain, single/double/angle
Dash
em/en/fig/soft?
Brace
left/right/shape/balanced
Grammatical
period, ellipsis, colon, semi, comma
Verbal
&, &c
Identifier
URL
domain name
email (incl. mailto?)
hashtag
@user
Phone
PostCode
element
substance
Lexeme
simplex
lower, title, mixed, caps, uncased
abbrev
single initial?
mixed-script
construct
dimethyltrichloroacetate
genome (incl. end indicators)
multiplex
hyphenated
acronym
contraction (vs. possessive)
gonna, ima, afaik
idiom
as far as, so as to,
Dingbat
Bullet, arrow
emoticon, emoji
sepline
Mixture
(>1 of alpha, num, punc)
Oddball cases:
4x4 1'2" AT&T and/or
60's
Ph.D. vs. PhD (treat like soft hyphen?)
> or | for email quoting
Mg+2 H2O
gender symbols
footnote numbers, daggers, etc.
dominos, cards, dice
c/o
+/-
O'Donnell
=Some high-order problem cases (not including Unicode cases)=
and/or b/c +/-
JRR vs. J.R.R. vs. J. R. R. Tolkien. Mr. J. Smith is here.
See section (II) (A) (3), and also part B. Or Part [C].
A: Introduction--fer real-life use.
1'2". She is 5' tall. Or so they "think."
I'm gonna be a contraction that'd be the writer's bane.
=Differences from some other tokenizers=
Such as Stanford https://nlp.stanford.edu/software/tokenizer.shtml.
* This tokenizer knows a lot about Unicode, especially punctuation, such
as the many varieties of dash, space, and brackets. Stanford also has several
Unicode-related options, including the thoughtful:
strictTreebank3: PTBTokenizer deliberately deviates from strict PTB3
WSJ tokenization in two cases. Setting this improves compatibility for
those cases. They are: (i) When an acronym is followed by a sentence
end, such as "U.K." at the end of a sentence, the PTB3 has tokens of
"Corp" and ".", while by default PTBTokenizer duplicates the period
returning tokens of "Corp." and ".", and (ii) PTBTokenizer will return
numbers with a whole number and a fractional part like "5 7/8" as a
single token, with a non-breaking space in the middle, while the PTB3
separates them into two tokens "5" and "7/8". (Exception: for only
"U.S." the treebank does have the two tokens "U.S." and "." like our
default; strictTreebank3 now does that too.) The default is false.
* Many tokenizers only break contractions rather than expanding them. This means your lexicon
has to know "tokens" like:
ca n't ai gim gon na lem haf ta da whad dya t shan
Adding these to the lexicon typically means they are accepted as real words
even when they didn't come from expanding a contraction (and most tokenizers
don't pass along any signal of where the contractions were).
* One ''advantage'' of merely splitting is that a few cases are ambiguous:
"'s" can represent either "is" or "was", while "'re" can represent either
"are" or "were" and
"'d" can represent "would", "had", or possibly "did".
If you expand, you have to choose.
* Contracted "not" sometimes re-uses the "n" from the base, as in "can't".
Tokenizers vary in getting can|'t, ca|n't, can|n't, can|not. The only reason
seems to be a desire to not insert another "n"; but why avoid that gnat
when swallowing the camel of adding all these fragments to your lexicon?
Stanford, for one, (I think wisely) duplicates "." with sentence-end abbreviations,
which seems to me essentially the same thing.
* Some cases, like "ima" for "i am going to" don't even have enough letters
to do the "split" approach, so it's not strictly possible to be consistent
without adding new characters (or new information in some form).
* Worse, some of those tokens collide with real words, adding lexical
ambiguity that just isn't there: "coulda" means "a" can be "HAVE", while
"buncha" means "a" can be "OF" -- but only in those cases. If you build
probability for POS tagging from those tokens, you're worse off.
* I haven't yet seen a tokenizer that accurately keeps offsets into a source
text, so everyone already accepts that you can't easily map back to the user's
"ur" input (say, to highlight something). This is bad enough in the case of
whitespace runs, line-breaks, tabs, etc; many tokenizers also normalize
(some, but rarely all) Unicode Zs (white-space-ish) characters to ASCII " ".
This is appropriate for some characters, but the notion that tab, LF,
non-breaking space, and hair space are all the same is wrong; they mean
very different things to actual readers.
* A very few contractions start with an apostrophe, making it harder to know
when to split that off.
* Some tokenizers "normalize" double-quotes (including curly ones, but rarely
chevrons or others) into apostrophes. This loses information that
can be important for syntactic processing, and also raises edge cases where
single and double quotes co-occur, which is very common: "Walk the 'dog'," she said.
* Some other tokenizers also have never heard of clausal dashes, turning two
hyphens (or, if they thought of it, Unicode em dash) into a single hyphen.
=Methods=
* '''new'''(tokenType)
Instantiate the tokenizer, and set it up for the ''tokenType'' to be
either '''characters''' or '''words'''.
* '''addOptionsToGetoptLongArg(hashRef,prefix)'''
Add the options for this package to ''hashRef'', in the form expected by
`argparse`. If ''prefix'' is provided, add it to the beginning of each
option name (to avoid name conflicts). All the options for this package
are distinct even ignoring case, so callers may ignore or regard case
for options as desired.
* '''setOption'''(name,value)
Change the value of the named option.
Option names are case-sensitive (but see previous method).
'''Note''': Setting the option for a Unicode cover category
(such as '''Letter''' rather than '''Uppercase_Letter'''), is merely
shorthand for setting all its subcategories to that value
(subcategories can still be reset afterward).
* '''getOption'''(name)
Return the present value of the named option.
Option names are case-sensitive.
* '''tokenize'''(string)
Break ''string'' into tokens according to the settings in effect, and return
a list of them. '''Note''': This method uses several other
internal methods; they can be invoked separately if desired, but are not
documented fully here; the methods are as shown below (`s` is a string to
handle):
s = tkz.expand(s)
s = tkz.normalize(s)
s = tkz.shorten(s)
s = tkz.nonWordTokens(s)
tokens = tkz.splitTokens(s)
tokens = tkz.filter(tokens)
=Known Bugs and Limitations=
==True tokenization issues==
* house(s) breaks funny.
* Can't break words in orthographies that lack spaces (e.g., Japanese).
* Too generous about expanding contractions (e.g. "Tom's")
==Other==
Not all options are finished. For example:
''Ligature, Math, Fullwidth, S_GENITIVE,'' etc.
''T_NUMBER'' is disabled for the moment.
Titlecase characters, etc.
Some of this can be done in a pre-pass with:
iconv -f utf8 -t ascii//TRANSLIT
(disp) values upper, lower, and decompose do not restrict themselves
to just a single category, but affect all if set for any.
Can't distinguish single vs. double quotes while unifying variants.
Abbreviations, acronyms, and other cases with word-final punctuation
are a little wonky: "U.S." loses the final ".".
Acronyms with periods ''and'' spaces aren't caught at all.
Acronyms aren't allowed within Named Entity references.
With testTokenizer defaults: turns B&O into 9/9&O and "..." into \\.\\.\\.; doesn't
separate "}". Default unifies URIs, emails, and some (?) numerics. Doesn't do @userid.
Probably should move Unification out of Tokenizer?
Processing XML/HTML with default options ends up splitting the SGML delimiters
apart from their constructs. Use `dropXMLtags` first if necessary.
=Related commands and data=
This uses the "regex" library [https://pypi.org/project/regex] instead of
the built-in Python "re". It adds support for \\p{}.
This Python program is mostly a port of a much earlier Perl one, which is also
available.
The `TokensEN.py` package gives access to lists of English tokens that are
especially useful in tokenizing, such as abbreviations (for splitting the "."),
contractions (for distinguishing them from possessives), personal titles, and
week and month names.
Related commands include `vocab`, `ngrams`, `normalizeSpace`, `SimplifyUnicode`,
`volsunga`, `findNERcandidates`,....
There is some test data inline, and more extensive data at
[https://github.com/sderose/Data/NLPFormatSamples/blob/master/TokenizerTestData.txt].
See [https://github.com/sderose/Lexicon.git/blob/master/xsv/contractions.xsv],
[https://github.com/sderose/Lexicon.git/blob/master/python/abbreviations.py]
=Related Commands=
Tokenize.py, Tokenizer.pm, Volsunga tokenizer (qv).
The "smoke test" data included in the code, has a lot of nice cases.
=Known bugs and Limitations=
Ported from Tokenizer.pm (also) by Steven J. DeRose.
=To do=
More testing.
Add timer.
Incorporate abbrev list.
Add any options Stanford or others have that this doesn't, such as:
[https://nlp.stanford.edu/software/tokenizer.shtml]
-encoding charset The character set encoding. By default, it assumes
utf-8, but you can tell it to use another character encoding.
-preserveLines Keep the input line breaks vs. one token per line.
-oneLinePerElement Print the tokens of an element space-separated on one line.
-parseInside regex Only tokenize inside XML elements which match the regex.
-filter regex Delete any token that matches() (in its entirety).
-lowerCase Force lowercase.
-dump Print out everything about each token.
-options optionString (bunch more)
-ioFileList file+ Treat files as lists of in\tout files.
-fileList file+ Treat files as lists of input filenames, one per line.
-untok Makes a best effort attempt at undoing PTB tokenization.
Split each stage (expand, filter, etc.) into a separate class?
Consider switching this to Antlr 4?
"''" as double quote
Emoticons
Should ellipsis plus [.,;?!] count as one punc unit or two?
=History=
* 2012: Written by Steven J. DeRose, in Perl.
* ????: Ported to Python.
* 2020-03-04: Fixes.
* 2021-04-09: Clean up. Spell NFKD right. Re-sync versions.
Clean up handling of `dispTypes`, quotes and general lint.
* 2022-03-11: Drop Python2. Lint.
=Rights=
Copyright 2012 by Steven J. DeRose. This work is licensed under a Creative Commons
Attribution-Share Alike 3.0 Unported License. For further information on
this license, see [http://creativecommons.org/licenses/by-sa/3.0].
For the most recent version, see [http://www.derose.net/steve/utilities] or
[http://github.com/sderose].
=Options=
"""
# Unicode "general categories", available via unicodedata.category(c)
# https://stackoverflow.com/questions/1832893/
# regex also supports single-char cover categories, I think.
#
unicodeCategories = {
"Cc": "Other, Control",
"Cf": "Other, Format",
"Cn": "Other, Not Assigned",
"Co": "Other, Private Use",
"Cs": "Other, Surrogate",
"LC": "Letter, Cased",
"Ll": "Letter, Lowercase",
"Lm": "Letter, Modifier",
"Lo": "Letter, Other",
"Lt": "Letter, Titlecase",
"Lu": "Letter, Uppercase",
"Mc": "Mark, Spacing Combining",
"Me": "Mark, Enclosing",
"Mn": "Mark, Nonspacing",
"Nd": "Number, Decimal Digit",
"Nl": "Number, Letter",
"No": "Number, Other",
"Pc": "Punctuation, Connector",
"Pd": "Punctuation, Dash",
"Pe": "Punctuation, Close",
"Pf": "Punctuation, Final quote",
"Pi": "Punctuation, Initial quote",
"Po": "Punctuation, Other",
"Ps": "Punctuation, Open",
"Sc": "Symbol, Currency",
"Sk": "Symbol, Modifier",
"Sm": "Symbol, Math",
"So": "Symbol, Other",
"Zl": "Separator, Line",
"Zp": "Separator, Paragraph",
"Zs": "Separator, Space",
}
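# For example (illustration only, not used elsewhere in this file):
#     unicodedata.category("\u2013") returns "Pd", so
#     unicodeCategories[unicodedata.category("\u2013")] == "Punctuation, Dash".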
unicodeSpaces = [
0x0009, # "CHARACTER TABULATION",
0x000A, # "LINE FEED",
0x000B, # "LINE TABULATION",
0x000C, # "FORM FEED",
0x000D, # "CARRIAGE RETURN",
0x0020, # "SPACE",
0x0089, # "CHARACTER TABULATION WITH JUSTIFICATION",
0x00A0, # "NO-BREAK SPACE",
0x2000, # "EN QUAD",
0x2001, # "EM QUAD",
0x2002, # "EN SPACE",
0x2003, # "EM SPACE",
0x2004, # "THREE-PER-EM SPACE",
0x2005, # "FOUR-PER-EM SPACE",
0x2006, # "SIX-PER-EM SPACE",
0x2007, # "FIGURE SPACE",
0x2008, # "PUNCTUATION SPACE",
0x2009, # "THIN SPACE",
0x200A, # "HAIR SPACE",
0x200B, # "ZERO WIDTH SPACE",
0x202F, # "NARROW NO-BREAK SPACE",
0x205F, # "MEDIUM MATHEMATICAL SPACE",
0x2420, # "SYMBOL FOR SPACE",
0x3000, # "IDEOGRAPHIC SPACE",
0x303F, # "IDEOGRAPHIC HALF FILL SPACE",
]
unicodeDashes = [
0x002D, # "HYPHEN-MINUS",
0x058A, # "ARMENIAN HYPHEN",
0x1B60, # "BALINESE PAMENENG (line-breaking hyphen)",
0x2010, # "HYPHEN",
0x2011, # "NON-BREAKING HYPHEN",
0x2012, # "FIGURE DASH",
0x2013, # "EN DASH",
0x2043, # "HYPHEN BULLET",
0x2212, # "MINUS",
0x2448, # "OCR DASH",
0xFE63, # "SMALL HYPHEN-MINUS",
0xFF0D, # "FULLWIDTH HYPHEN-MINUS",
]
# What of primes U+2032...U+2037, U+2057
unicodeLQuotes = [
0x00AB, # "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK *",
0x2018, # "LEFT SINGLE QUOTATION MARK",
0x201A, # "SINGLE LOW-9 QUOTATION MARK",
0x201C, # "LEFT DOUBLE QUOTATION MARK",
0x201E, # "DOUBLE LOW-9 QUOTATION MARK",
0x2039, # "SINGLE LEFT-POINTING ANGLE QUOTATION MARK",
0x2E02, # "LEFT SUBSTITUTION BRACKET",
0x2E04, # "LEFT DOTTED SUBSTITUTION BRACKET",
0x2E09, # "LEFT TRANSPOSITION BRACKET",
0x2E0C, # "LEFT RAISED OMISSION BRACKET",
0x2E1C, # "LEFT LOW PARAPHRASE BRACKET",
0x2E20, # "LEFT VERTICAL BAR WITH QUILL",
]
unicodeRQuotes = [
0x00BB, # "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK *",
0x2019, # "RIGHT SINGLE QUOTATION MARK",
0x201B, # "SINGLE HIGH-REVERSED-9 QUOTATION MARK",
0x201D, # "RIGHT DOUBLE QUOTATION MARK",
0x201F, # "DOUBLE HIGH-REVERSED-9 QUOTATION MARK",
0x203A, # "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK",
0x2358, # "APL FUNCTIONAL SYMBOL QUOTE UNDERBAR",
0x235E, # "APL FUNCTIONAL SYMBOL QUOTE QUAD",
0x2E03, # "RIGHT SUBSTITUTION BRACKET",
0x2E05, # "RIGHT DOTTED SUBSTITUTION BRACKET",
0x2E0A, # "RIGHT TRANSPOSITION BRACKET",
0x2E0D, # "RIGHT RAISED OMISSION BRACKET",
0x2E1D, # "RIGHT LOW PARAPHRASE BRACKET",
0x2E21, # "RIGHT VERTICAL BAR WITH QUILL",
]
unicodeEllipses = [
0x0eaf, # LAO ELLIPSIS
0x1801, # MONGOLIAN ELLIPSIS
0x2026, # HORIZONTAL ELLIPSIS
0x22ee, # VERTICAL ELLIPSIS
0x22ef, # MIDLINE HORIZONTAL ELLIPSIS
]
unicodeToDelete = [
0x00AD, # "SOFT HYPHEN"
0x1806, # "MONGOLIAN TODO SOFT HYPHEN"
0x2027, # "HYPHENATION POINT"
]
unicodeDoubles = [
0x2014, # EM DASH
0x30a0, # KATAKANA-HIRAGANA DOUBLE HYPHEN
0xfe58, # SMALL EM DASH
]
# (Cyrillic, Armenian, Hebrew, and many, many Arabic ligatures omitted)
# Could do this instead with unicodedb "Decompose" feature.
unicodeLigatures = [
0x0132, # "IJ", #"LATIN CAPITAL LIGATURE IJ",
0x0133, # "ij", #"LATIN SMALL LIGATURE IJ",
0x0152, # "OE", #"LATIN CAPITAL LIGATURE OE",
0x0153, # "oe", #"LATIN SMALL LIGATURE OE",
0xa7f9, # "oe", #"MODIFIER LETTER SMALL LIGATURE OE",
0xfb00, # "ff", #"LATIN SMALL LIGATURE FF",
0xfb01, # "fi", #"LATIN SMALL LIGATURE FI",
0xfb02, # "fl", #"LATIN SMALL LIGATURE FL",
0xfb03, # "ffi", #"LATIN SMALL LIGATURE FFI",
0xfb04, # "ffl", #"LATIN SMALL LIGATURE FFL",
0xfb05, # "st", #"LATIN SMALL LIGATURE LONG S T",
0xfb06, # "st", #"LATIN SMALL LIGATURE ST",
]
if (False):
    elRegex = "[" + "".join([ chr(x) for x in unicodeEllipses]) + "]"   # --
    spRegex = "[" + "".join([ chr(x) for x in unicodeSpaces ]) + "]"    # Zs
    hyRegex = "[" + "".join([ chr(x) for x in unicodeDashes ]) + "]"    # Pd
    lqRegex = "[" + "".join([ chr(x) for x in unicodeLQuotes ]) + "]"   # Pi
    rqRegex = "[" + "".join([ chr(x) for x in unicodeRQuotes ]) + "]"   # Pf
    delRegex = "[" + "".join([ chr(x) for x in unicodeToDelete]) + "]"  # --
    dblRegex = "[" + "".join([ chr(x) for x in unicodeDoubles ]) + "]"  # --
    ligRegex = "[" + "".join([ chr(x) for x in unicodeLigatures]) + "]"
    #print("ligRegex: '%s'." % (ligRegex))
else:
    elRegex = "[" + "".join([ chr(x) for x in unicodeEllipses]) + "]"   # --
    spRegex = r"\p{Zs}"
    hyRegex = r"\p{Pd}"
    lqRegex = r"\p{Pi}"
    rqRegex = r"\p{Pf}"
    delRegex = "[" + "".join([ chr(x) for x in unicodeToDelete]) + "]"  # --
    dblRegex = "[" + "".join([ chr(x) for x in unicodeDoubles ]) + "]"  # --
    ligRegex = "[" + "".join([ chr(x) for x in unicodeLigatures]) + "]"
    #print("ligRegex: '%s'." % (ligRegex))
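# For example (illustration only): with the "regex" module, \p{Pd} matches any
# Unicode dash, so re.sub(hyRegex, "-", "foo\u2013bar") yields "foo-bar", and
# re.sub(spRegex, " ", "a\u00A0b") yields "a b".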
lexemeTypes = {
"lower": [ "lower", r"[a-z]+" ],
"upper": [ "upper", r"[A-Z]+" ],
"title": [ "title", r"[A-Z][a-z]+" ],
"mixed": [ "mixed", r"\w+" ],
}
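# For example (illustration only): re.fullmatch(lexemeTypes["title"][1], "Boston")
# matches, while re.fullmatch(lexemeTypes["upper"][1], "Boston") does not.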
# Reserved set of option values, to specify how to map char classes.
# Use numbers for faster tests in map().
#
DT_KEEP = "KEEP"
DT_UNIFY = "UNIFY"
DT_DELETE = "DELETE"
DT_SPACE = "SPACE"
DT_STRIP = "STRIP"
DT_VALUE = "VALUE"
DT_UPPER = "UPPER"
DT_LOWER = "LOWER"
DT_DECOMPOSE = "DECOMPOSE"
dispTypes = {
# Keyword: ( DT_NAME, map?, whichClasses),
DT_KEEP: ( DT_KEEP, False, "*" ),
DT_UNIFY: ( DT_UNIFY, True, "*" ),
DT_DELETE: ( DT_DELETE, True, "*" ),
DT_SPACE: ( DT_SPACE, True, "*" ),
DT_STRIP: ( DT_STRIP, True, "Letter" ),
DT_VALUE: ( DT_VALUE, True, "Number" ),
DT_UPPER: ( DT_UPPER, False, "Letter" ),
DT_LOWER: ( DT_LOWER, False, "Letter" ),