Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c8e26eb

Browse files
committed Jul 23, 2022
Adopt uax29 segmenter
Replacing blevesearch/segment. ~2x throughput improvement. Refactor allocations, now ~O(1). Add tests & multilingual sample text to ensure identical behavior. Known differences from previous segmenter: - The original segmenter splits runs of spaces into separate tokens; uax29 concatenates runs into a single token. - The original segmenter doesn’t handle emoji skin tone modifiers, the new one does, attributable to Unicode version update.
1 parent ec0d3aa commit c8e26eb

File tree

5 files changed

+562
-106
lines changed

5 files changed

+562
-106
lines changed
 

‎analysis/tokenizer/unicode/testdata/sample.txt

+376
Large diffs are not rendered by default.

‎analysis/tokenizer/unicode/unicode.go

+35-75
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
package unicode
1616

1717
import (
18-
"github.com/blevesearch/segment"
18+
"github.com/clipperhouse/uax29/iterators/filter"
19+
"github.com/clipperhouse/uax29/words"
1920

2021
"github.com/blevesearch/bleve/v2/analysis"
2122
"github.com/blevesearch/bleve/v2/registry"
@@ -31,83 +32,44 @@ func NewUnicodeTokenizer() *UnicodeTokenizer {
3132
}
3233

3334
func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
34-
rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
35-
rv := make(analysis.TokenStream, 0, 1)
35+
inputBytes := len(input)
3636

37-
ta := []analysis.Token(nil)
38-
taNext := 0
37+
// An optimization to pre-allocate & avoid re-sizing
38+
const guessBytesPerToken = 6
39+
guessTokens := (inputBytes / guessBytesPerToken) | 1 // ensure minimum of 1
3940

40-
segmenter := segment.NewWordSegmenterDirect(input)
41-
start := 0
42-
pos := 1
41+
result := make(analysis.TokenStream, 0, guessTokens)
4342

44-
guessRemaining := func(end int) int {
45-
avgSegmentLen := end / (len(rv) + 1)
46-
if avgSegmentLen < 1 {
47-
avgSegmentLen = 1
48-
}
49-
50-
remainingLen := len(input) - end
43+
// Pre-allocate token pool
44+
pool := make([]analysis.Token, guessTokens)
45+
poolIndex := 0
5146

52-
return remainingLen / avgSegmentLen
53-
}
47+
segmenter := words.NewSegmenter(input)
48+
segmenter.Filter(filter.AlphaNumeric)
5449

55-
for segmenter.Segment() {
56-
segmentBytes := segmenter.Bytes()
57-
end := start + len(segmentBytes)
58-
if segmenter.Type() != segment.None {
59-
if taNext >= len(ta) {
60-
remainingSegments := guessRemaining(end)
61-
if remainingSegments > 1000 {
62-
remainingSegments = 1000
63-
}
64-
if remainingSegments < 1 {
65-
remainingSegments = 1
66-
}
67-
68-
ta = make([]analysis.Token, remainingSegments)
69-
taNext = 0
70-
}
71-
72-
token := &ta[taNext]
73-
taNext++
74-
75-
token.Term = segmentBytes
76-
token.Start = start
77-
token.End = end
78-
token.Position = pos
79-
token.Type = convertType(segmenter.Type())
80-
81-
if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
82-
rvx = append(rvx, rv)
83-
84-
rvCap := cap(rv) * 2
85-
if rvCap > 256 {
86-
rvCap = 256
87-
}
88-
89-
rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
90-
}
91-
92-
rv = append(rv, token)
93-
pos++
50+
for segmenter.Next() {
51+
if poolIndex >= len(pool) {
52+
bytesSoFar := segmenter.End()
53+
tokensSoFar := len(result) | 1
54+
avgBytesPerToken := (bytesSoFar / tokensSoFar) | 1
55+
guessTokensRemaining := ((inputBytes - bytesSoFar) / avgBytesPerToken) | 1
56+
pool = make([]analysis.Token, guessTokensRemaining)
57+
poolIndex = 0
9458
}
95-
start = end
96-
}
9759

98-
if len(rvx) > 0 {
99-
n := len(rv)
100-
for _, r := range rvx {
101-
n += len(r)
102-
}
103-
rall := make(analysis.TokenStream, 0, n)
104-
for _, r := range rvx {
105-
rall = append(rall, r...)
106-
}
107-
return append(rall, rv...)
60+
token := &pool[poolIndex]
61+
poolIndex++
62+
63+
token.Term = segmenter.Bytes()
64+
token.Start = segmenter.Start()
65+
token.End = segmenter.End()
66+
token.Position = len(result) + 1 // 1-indexed
67+
token.Type = getType(segmenter.Bytes())
68+
69+
result = append(result, token)
10870
}
10971

110-
return rv
72+
return result
11173
}
11274

11375
func UnicodeTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
@@ -118,13 +80,11 @@ func init() {
11880
registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor)
11981
}
12082

121-
func convertType(segmentWordType int) analysis.TokenType {
122-
switch segmentWordType {
123-
case segment.Ideo:
124-
return analysis.Ideographic
125-
case segment.Kana:
83+
func getType(segment []byte) analysis.TokenType {
84+
switch {
85+
case words.BleveIdeographic(segment):
12686
return analysis.Ideographic
127-
case segment.Number:
87+
case words.BleveNumeric(segment):
12888
return analysis.Numeric
12989
}
13090
return analysis.AlphaNumeric

‎analysis/tokenizer/unicode/unicode_test.go

+148-31
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package unicode
1616

1717
import (
18+
"bytes"
19+
"io/ioutil"
1820
"reflect"
1921
"testing"
2022

@@ -151,52 +153,167 @@ func TestUnicode(t *testing.T) {
151153
actual := tokenizer.Tokenize(test.input)
152154

153155
if !reflect.DeepEqual(actual, test.output) {
154-
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
156+
t.Errorf("Expected\n%v\n, got\n%v\n for %q", test.output, actual, string(test.input))
155157
}
156158
}
157159
}
158160

159-
var sampleLargeInput = []byte(`There are three characteristics of liquids which are relevant to the discussion of a BLEVE:
160-
If a liquid in a sealed container is boiled, the pressure inside the container increases. As the liquid changes to a gas it expands - this expansion in a vented container would cause the gas and liquid to take up more space. In a sealed container the gas and liquid are not able to take up more space and so the pressure rises. Pressurized vessels containing liquids can reach an equilibrium where the liquid stops boiling and the pressure stops rising. This occurs when no more heat is being added to the system (either because it has reached ambient temperature or has had a heat source removed).
161-
The boiling temperature of a liquid is dependent on pressure - high pressures will yield high boiling temperatures, and low pressures will yield low boiling temperatures. A common simple experiment is to place a cup of water in a vacuum chamber, and then reduce the pressure in the chamber until the water boils. By reducing the pressure the water will boil even at room temperature. This works both ways - if the pressure is increased beyond normal atmospheric pressures, the boiling of hot water could be suppressed far beyond normal temperatures. The cooling system of a modern internal combustion engine is a real-world example.
162-
When a liquid boils it turns into a gas. The resulting gas takes up far more space than the liquid did.
163-
Typically, a BLEVE starts with a container of liquid which is held above its normal, atmospheric-pressure boiling temperature. Many substances normally stored as liquids, such as CO2, oxygen, and other similar industrial gases have boiling temperatures, at atmospheric pressure, far below room temperature. In the case of water, a BLEVE could occur if a pressurized chamber of water is heated far beyond the standard 100 °C (212 °F). That container, because the boiling water pressurizes it, is capable of holding liquid water at very high temperatures.
164-
If the pressurized vessel, containing liquid at high temperature (which may be room temperature, depending on the substance) ruptures, the pressure which prevents the liquid from boiling is lost. If the rupture is catastrophic, where the vessel is immediately incapable of holding any pressure at all, then there suddenly exists a large mass of liquid which is at very high temperature and very low pressure. This causes the entire volume of liquid to instantaneously boil, which in turn causes an extremely rapid expansion. Depending on temperatures, pressures and the substance involved, that expansion may be so rapid that it can be classified as an explosion, fully capable of inflicting severe damage on its surroundings.`)
161+
func TestPreviousIdentical(t *testing.T) {
162+
file, err := ioutil.ReadFile("testdata/sample.txt")
163+
if err != nil {
164+
t.Fatal(err)
165+
}
166+
167+
previous := TokenizePrevious(file)
168+
current := NewUnicodeTokenizer().Tokenize(file)
169+
170+
if len(previous) != len(current) {
171+
t.Errorf("len(previous): %d, len(current): %d\n", len(previous), len(current))
172+
}
173+
174+
for i := range previous {
175+
prev := previous[i]
176+
curr := current[i]
177+
if !bytes.Equal(prev.Term, curr.Term) {
178+
t.Fatalf("previous term: %q, current term: %q", prev.Term, curr.Term)
179+
}
180+
if prev.Start != curr.Start {
181+
t.Fatalf("prev.Start: %d, curr.Start: %d", prev.Start, curr.Start)
182+
}
183+
if prev.End != curr.End {
184+
t.Fatalf("prev.End: %d, curr.End: %d", prev.End, curr.End)
185+
}
186+
if prev.Position != curr.Position {
187+
t.Fatalf("prev.Position: %d, curr.Position: %d", prev.Position, curr.Position)
188+
}
189+
if prev.Type != curr.Type {
190+
t.Errorf("prev.Type: %v, curr.Type: %v\n\n", prev.Type, curr.Type)
191+
}
192+
}
193+
}
165194

166-
func BenchmarkTokenizeEnglishText(b *testing.B) {
195+
func BenchmarkTokenizeMultilingual(b *testing.B) {
196+
file, err := ioutil.ReadFile("testdata/sample.txt")
197+
if err != nil {
198+
b.Fatal(err)
199+
}
167200

168201
tokenizer := NewUnicodeTokenizer()
169-
b.ResetTimer()
170202

203+
b.SetBytes(int64(len(file)))
204+
205+
b.ResetTimer()
171206
for i := 0; i < b.N; i++ {
172-
tokenizer.Tokenize(sampleLargeInput)
207+
tokens := tokenizer.Tokenize(file)
208+
b.ReportMetric(float64(len(tokens)), "tokens")
173209
}
210+
}
174211

212+
func BenchmarkTokenizeMultilingualPrevious(b *testing.B) {
213+
file, err := ioutil.ReadFile("testdata/sample.txt")
214+
if err != nil {
215+
b.Fatal(err)
216+
}
217+
218+
b.SetBytes(int64(len(file)))
219+
220+
b.ResetTimer()
221+
for i := 0; i < b.N; i++ {
222+
tokens := TokenizePrevious(file)
223+
b.ReportMetric(float64(len(tokens)), "tokens")
224+
}
175225
}
176226

177-
func TestConvertType(t *testing.T) {
178-
tests := []struct {
179-
in int
180-
out analysis.TokenType
181-
}{
182-
{
183-
segment.Ideo, analysis.Ideographic,
184-
},
185-
{
186-
segment.Kana, analysis.Ideographic,
187-
},
188-
{
189-
segment.Number, analysis.Numeric,
190-
},
191-
{
192-
segment.Letter, analysis.AlphaNumeric,
193-
},
227+
// Previous implementation for testing ↓
228+
229+
func TokenizePrevious(input []byte) analysis.TokenStream {
230+
rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
231+
rv := make(analysis.TokenStream, 0, 1)
232+
233+
ta := []analysis.Token(nil)
234+
taNext := 0
235+
236+
segmenter := segment.NewWordSegmenterDirect(input)
237+
start := 0
238+
pos := 1
239+
240+
guessRemaining := func(end int) int {
241+
avgSegmentLen := end / (len(rv) + 1)
242+
if avgSegmentLen < 1 {
243+
avgSegmentLen = 1
244+
}
245+
246+
remainingLen := len(input) - end
247+
248+
return remainingLen / avgSegmentLen
194249
}
195250

196-
for _, test := range tests {
197-
actual := convertType(test.in)
198-
if actual != test.out {
199-
t.Errorf("expected %d, got %d for %d", test.out, actual, test.in)
251+
for segmenter.Segment() {
252+
segmentBytes := segmenter.Bytes()
253+
end := start + len(segmentBytes)
254+
if segmenter.Type() != segment.None {
255+
if taNext >= len(ta) {
256+
remainingSegments := guessRemaining(end)
257+
if remainingSegments > 1000 {
258+
remainingSegments = 1000
259+
}
260+
if remainingSegments < 1 {
261+
remainingSegments = 1
262+
}
263+
264+
ta = make([]analysis.Token, remainingSegments)
265+
taNext = 0
266+
}
267+
268+
token := &ta[taNext]
269+
taNext++
270+
271+
token.Term = segmentBytes
272+
token.Start = start
273+
token.End = end
274+
token.Position = pos
275+
token.Type = convertType(segmenter.Type())
276+
277+
if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
278+
rvx = append(rvx, rv)
279+
280+
rvCap := cap(rv) * 2
281+
if rvCap > 256 {
282+
rvCap = 256
283+
}
284+
285+
rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
286+
}
287+
288+
rv = append(rv, token)
289+
pos++
290+
}
291+
start = end
292+
}
293+
294+
if len(rvx) > 0 {
295+
n := len(rv)
296+
for _, r := range rvx {
297+
n += len(r)
200298
}
299+
rall := make(analysis.TokenStream, 0, n)
300+
for _, r := range rvx {
301+
rall = append(rall, r...)
302+
}
303+
return append(rall, rv...)
304+
}
305+
306+
return rv
307+
}
308+
309+
func convertType(segmentWordType int) analysis.TokenType {
310+
switch segmentWordType {
311+
case segment.Ideo:
312+
return analysis.Ideographic
313+
case segment.Kana:
314+
return analysis.Ideographic
315+
case segment.Number:
316+
return analysis.Numeric
201317
}
318+
return analysis.AlphaNumeric
202319
}

‎go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ require (
2222
github.com/blevesearch/zapx/v13 v13.3.5
2323
github.com/blevesearch/zapx/v14 v14.3.5
2424
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2
25+
github.com/clipperhouse/uax29 v1.12.4
2526
github.com/couchbase/moss v0.2.0
2627
github.com/golang/protobuf v1.3.2
2728
github.com/spf13/cobra v0.0.5

‎go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ github.com/blevesearch/zapx/v14 v14.3.5 h1:hEvVjZaagFCvOUJrlFQ6/Z6Jjy0opM3g7TMEo
4141
github.com/blevesearch/zapx/v14 v14.3.5/go.mod h1:954A/eKFb+pg/ncIYWLWCKY+mIjReM9FGTGIO2Wu1cU=
4242
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2 h1:kXBfopw0cwKD37xif91wzHqSkP9eyI9MAZv4MNxPR4w=
4343
github.com/blevesearch/zapx/v15 v15.3.5-0.20220722171731-5dd118e621d2/go.mod h1:QMUh2hXCaYIWFKPYGavq/Iga2zbHWZ9DZAa9uFbWyvg=
44+
github.com/clipperhouse/uax29 v1.12.4 h1:on+uPLg2CYxLMReDh3xrIv4F43PtluOmZfszJctSmgI=
45+
github.com/clipperhouse/uax29 v1.12.4/go.mod h1:JGonRhbyeZzi0GciYzJmXCDP3C/sxVSSv1rBh3zURuU=
4446
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
4547
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
4648
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=

0 commit comments

Comments
 (0)
Please sign in to comment.