Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c8e26eb

Browse files
committed Jul 23, 2022
Adopt uax29 segmenter
Replacing blevesearch/segment. ~2x throughput improvement. Refactor allocations, now ~O(1). Add tests & multilingual sample text to ensure identical behavior. Known differences from previous segmenter: - The original segmenter splits runs of spaces into separate tokens; uax29 concatenates runs into a single token. - The original segmenter doesn’t handle emoji skin tone modifiers, the new one does, attributable to Unicode version update.
1 parent ec0d3aa commit c8e26eb

File tree

5 files changed

+562
-106
lines changed

5 files changed

+562
-106
lines changed
 

‎analysis/tokenizer/unicode/testdata/sample.txt

+376
Large diffs are not rendered by default.

‎analysis/tokenizer/unicode/unicode.go

+35-75
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
package unicode
1616

1717
import (
18-
"github.com/blevesearch/segment"
18+
"github.com/clipperhouse/uax29/iterators/filter"
19+
"github.com/clipperhouse/uax29/words"
1920

2021
"github.com/blevesearch/bleve/v2/analysis"
2122
"github.com/blevesearch/bleve/v2/registry"
@@ -31,83 +32,44 @@ func NewUnicodeTokenizer() *UnicodeTokenizer {
3132
}
3233

3334
func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
34-
rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
35-
rv := make(analysis.TokenStream, 0, 1)
35+
inputBytes := len(input)
3636

37-
ta := []analysis.Token(nil)
38-
taNext := 0
37+
// An optimization to pre-allocate & avoid re-sizing
38+
const guessBytesPerToken = 6
39+
guessTokens := (inputBytes / guessBytesPerToken) | 1 // ensure minimum of 1
3940

40-
segmenter := segment.NewWordSegmenterDirect(input)
41-
start := 0
42-
pos := 1
41+
result := make(analysis.TokenStream, 0, guessTokens)
4342

44-
guessRemaining := func(end int) int {
45-
avgSegmentLen := end / (len(rv) + 1)
46-
if avgSegmentLen < 1 {
47-
avgSegmentLen = 1
48-
}
49-
50-
remainingLen := len(input) - end
43+
// Pre-allocate token pool
44+
pool := make([]analysis.Token, guessTokens)
45+
poolIndex := 0
5146

52-
return remainingLen / avgSegmentLen
53-
}
47+
segmenter := words.NewSegmenter(input)
48+
segmenter.Filter(filter.AlphaNumeric)
5449

55-
for segmenter.Segment() {
56-
segmentBytes := segmenter.Bytes()
57-
end := start + len(segmentBytes)
58-
if segmenter.Type() != segment.None {
59-
if taNext >= len(ta) {
60-
remainingSegments := guessRemaining(end)
61-
if remainingSegments > 1000 {
62-
remainingSegments = 1000
63-
}
64-
if remainingSegments < 1 {
65-
remainingSegments = 1
66-
}
67-
68-
ta = make([]analysis.Token, remainingSegments)
69-
taNext = 0
70-
}
71-
72-
token := &ta[taNext]
73-
taNext++
74-
75-
token.Term = segmentBytes
76-
token.Start = start
77-
token.End = end
78-
token.Position = pos
79-
token.Type = convertType(segmenter.Type())
80-
81-
if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
82-
rvx = append(rvx, rv)
83-
84-
rvCap := cap(rv) * 2
85-
if rvCap > 256 {
86-
rvCap = 256
87-
}
88-
89-
rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
90-
}
91-
92-
rv = append(rv, token)
93-
pos++
50+
for segmenter.Next() {
51+
if poolIndex >= len(pool) {
52+
bytesSoFar := segmenter.End()
53+
tokensSoFar := len(result) | 1
54+
avgBytesPerToken := (bytesSoFar / tokensSoFar) | 1
55+
guessTokensRemaining := ((inputBytes - bytesSoFar) / avgBytesPerToken) | 1
56+
pool = make([]analysis.Token, guessTokensRemaining)
57+
poolIndex = 0
9458
}
95-
start = end
96-
}
9759

98-
if len(rvx) > 0 {
99-
n := len(rv)
100-
for _, r := range rvx {
101-
n += len(r)
102-
}
103-
rall := make(analysis.TokenStream, 0, n)
104-
for _, r := range rvx {
105-
rall = append(rall, r...)
106-
}
107-
return append(rall, rv...)
60+
token := &pool[poolIndex]
61+
poolIndex++
62+
63+
token.Term = segmenter.Bytes()
64+
token.Start = segmenter.Start()
65+
token.End = segmenter.End()
66+
token.Position = len(result) + 1 // 1-indexed
67+
token.Type = getType(segmenter.Bytes())
68+
69+
result = append(result, token)
10870
}
10971

110-
return rv
72+
return result
11173
}
11274

11375
func UnicodeTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
@@ -118,13 +80,11 @@ func init() {
11880
registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor)
11981
}
12082

121-
func convertType(segmentWordType int) analysis.TokenType {
122-
switch segmentWordType {
123-
case segment.Ideo:
124-
return analysis.Ideographic
125-
case segment.Kana:
83+
func getType(segment []byte) analysis.TokenType {
84+
switch {
85+
case words.BleveIdeographic(segment):
12686
return analysis.Ideographic
127-
case segment.Number:
87+
case words.BleveNumeric(segment):
12888
return analysis.Numeric
12989
}
13090
return analysis.AlphaNumeric

‎analysis/tokenizer/unicode/unicode_test.go

+148-31
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package unicode
1616

1717
import (
18+
"bytes"
19+
"io/ioutil"
1820
"reflect"
1921
"testing"
2022

@@ -151,52 +153,167 @@ func TestUnicode(t *testing.T) {
151153
actual := tokenizer.Tokenize(test.input)
152154

153155
if !reflect.DeepEqual(actual, test.output) {
154-
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
156+
t.Errorf("Expected\n%v\n, got\n%v\n for %q", test.output, actual, string(test.input))
155157
}
156158
}
157159
}
158160

159-
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
160-
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
161-
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
162-
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
163-
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
164-
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
161+
func TestPreviousIdentical(t *testing.T) {
162+
file, err := ioutil.ReadFile("testdata/sample.txt")
163+
if err != nil {
164+
t.Fatal(err)
165+
}
166+
167+
previous := TokenizePrevious(file)
168+
current := NewUnicodeTokenizer().Tokenize(file)
169+
170+
if len(previous) != len(current) {
171+
t.Errorf("len(previous): %d, len(current): %d\n", len(previous), len(current))
172+
}
173+
174+
for i := range previous {
175+
prev := previous[i]
176+
curr := current[i]
177+
if !bytes.Equal(prev.Term, curr.Term) {
178+
t.Fatalf("previous term: %q, current term: %q", prev.Term, curr.Term)
179+
}
180+
if prev.Start != curr.Start {
181+
t.Fatalf("prev.Start: %d, curr.Start: %d", prev.Start, curr.Start)
182+
}
183+
if prev.End != curr.End {
184+
t.Fatalf("prev.End: %d, curr.End: %d", prev.End, curr.End)
185+
}
186+
if prev.Position != curr.Position {
187+
t.Fatalf("prev.Position: %d, curr.Position: %d", prev.Position, curr.Position)
188+
}
189+
if prev.Type != curr.Type {
190+
t.Errorf("prev.Type: %v, curr.Type: %v\n\n", prev.Type, curr.Type)
191+
}
192+
}
193+
}
165194

166-
func BenchmarkTokenizeEnglishText(b *testing.B) {
195+
func BenchmarkTokenizeMultilingual(b *testing.B) {
196+
file, err := ioutil.ReadFile("testdata/sample.txt")
197+
if err != nil {
198+
b.Fatal(err)
199+
}
167200

168201
tokenizer := NewUnicodeTokenizer()
169-
b.ResetTimer()
170202

203+
b.SetBytes(int64(len(file)))
204+
205+
b.ResetTimer()
171206
for i := 0; i < b.N; i++ {
172-
tokenizer.Tokenize(sampleLargeInput)
207+
tokens := tokenizer.Tokenize(file)
208+
b.ReportMetric(float64(len(tokens)), "tokens")
173209
}
210+
}
174211

212+
func BenchmarkTokenizeMultilingualPrevious(b *testing.B) {
213+
file, err := ioutil.ReadFile("testdata/sample.txt")
214+
if err != nil {
215+
b.Fatal(err)
216+
}
217+
218+
b.SetBytes(int64(len(file)))
219+
220+
b.ResetTimer()
221+
for i := 0; i < b.N; i++ {
222+
tokens := TokenizePrevious(file)
223+
b.ReportMetric(float64(len(tokens)), "tokens")
224+
}
175225
}
176226

177-
func TestConvertType(t *testing.T) {
178-
tests := []struct {
179-
in int
180-
out analysis.TokenType
181-
}{
182-
{
183-
segment.Ideo, analysis.Ideographic,
184-
},
185-
{
186-
segment.Kana, analysis.Ideographic,
187-
},
188-
{
189-
segment.Number, analysis.Numeric,
190-
},
191-
{
192-
segment.Letter, analysis.AlphaNumeric,
193-
},
227+
// Previous implementation for testing ↓
228+
229+
func TokenizePrevious(input []byte) analysis.TokenStream {
230+
rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
231+
rv := make(analysis.TokenStream, 0, 1)
232+
233+
ta := []analysis.Token(nil)
234+
taNext := 0
235+
236+
segmenter := segment.NewWordSegmenterDirect(input)
237+
start := 0
238+
pos := 1
239+
240+
guessRemaining := func(end int) int {
241+
avgSegmentLen := end / (len(rv) + 1)
242+
if avgSegmentLen < 1 {
243+
avgSegmentLen = 1
244+
}
245+
246+
remainingLen := len(input) - end
247+
248+
return remainingLen / avgSegmentLen
194249
}
195250

196-
for _, test := range tests {
197-
actual := convertType(test.in)
198-
if actual != test.out {
199-
t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
251+
for segmenter.Segment() {
252+
segmentBytes := segmenter.Bytes()
253+
end := start + len(segmentBytes)
254+
if segmenter.Type() != segment.None {
255+
if taNext >= len(ta) {
256+
remainingSegments := guessRemaining(end)
257+
if remainingSegments > 1000 {
258+
remainingSegments = 1000
259+
}
260+
if remainingSegments < 1 {
261+
remainingSegments = 1
262+
}
263+
264+
ta = make([]analysis.Token, remainingSegments)
265+
taNext = 0
266+
}
267+
268+
token := &ta[taNext]
269+
taNext++
270+
271+
token.Term = segmentBytes
272+
token.Start = start
273+
token.End = end
274+
token.Position = pos
275+
token.Type = convertType(segmenter.Type())
276+
277+
if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
278+
rvx = append(rvx, rv)
279+
280+
rvCap := cap(rv) * 2
281+
if rvCap > 256 {
282+
rvCap = 256
283+
}
284+
285+
rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
286+
}
287+
288+
rv = append(rv, token)
289+
pos++
290+
}
291+
start = end
292+
}
293+
294+
if len(rvx) > 0 {
295+
n := len(rv)
296+
for _, r := range rvx {
297+
n += len(r)
200298
}
299+
rall := make(analysis.TokenStream, 0, n)
300+
for _, r := range rvx {
301+
rall = append(rall, r...)
302+
}
303+
return append(rall, rv...)
304+
}
305+
306+
return rv
307+
}
308+
309+
func convertType(segmentWordType int) analysis.TokenType {
310+
switch segmentWordType {
311+
case segment.Ideo:
312+
return analysis.Ideographic
313+
case segment.Kana:
314+
return analysis.Ideographic
315+
case segment.Number:
316+
return analysis.Numeric
201317
}
318+
return analysis.AlphaNumeric
202319
}

‎go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ require (
2222
github.com/blevesearch/zapx/v13 v13.3.5
2323
github.com/blevesearch/zapx/v14 v14.3.5
2424
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2
25+
github.com/clipperhouse/uax29 v1.12.4
2526
github.com/couchbase/moss v0.2.0
2627
github.com/golang/protobuf v1.3.2
2728
github.com/spf13/cobra v0.0.5

‎go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ github.com/blevesearch/zapx/v14 v14.3.5 h1:hEvVjZaagFCvOUJrlFQ6/Z6Jjy0opM3g7TMEo
4141
github.com/blevesearch/zapx/v14 v14.3.5/go.mod h1:954A/eKFb+pg/ncIYWLWCKY+mIjReM9FGTGIO2Wu1cU=
4242
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2 h1:kXBfopw0cwKD37xif91wzHqSkP9eyI9MAZv4MNxPR4w=
4343
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2/go.mod h1:QMUh2hXCaYIWFKPYGavq/Iga2zbHWZ9DZAa9uFbWyvg=
44+
github.com/clipperhouse/uax29 v1.12.4 h1:on+uPLg2CYxLMReDh3xrIv4F43PtluOmZfszJctSmgI=
45+
github.com/clipperhouse/uax29 v1.12.4/go.mod h1:JGonRhbyeZzi0GciYzJmXCDP3C/sxVSSv1rBh3zURuU=
4446
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
4547
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
4648
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=

0 commit comments

Comments
 (0)
Please sign in to comment.