Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

matchtree: do not depend on String for symbol all optimization #746

Merged
merged 3 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 56 additions & 23 deletions matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ type noVisitMatchTree struct {
type regexpMatchTree struct {
regexp *regexp.Regexp

// origRegexp is the original parsed regexp from the query structure. It
// does not include mutations such as case sensitivity.
origRegexp *syntax.Regexp

fileName bool

// mutable
Expand All @@ -193,6 +197,19 @@ type regexpMatchTree struct {
bruteForceMatchTree
}

func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}

return &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
origRegexp: s.Regexp,
fileName: s.FileName,
}
}

// \bLITERAL\b
type wordMatchTree struct {
word string
Expand Down Expand Up @@ -972,15 +989,7 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
// provide something faster.
tr = wmt
} else {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}

tr = &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
fileName: s.FileName,
}
tr = newRegexpMatchTree(s)
}

return &andMatchTree{
Expand Down Expand Up @@ -1103,19 +1112,19 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
}, nil
}

var regexp *regexp.Regexp
var regexpMT *regexpMatchTree
visitMatchTree(subMT, func(mt matchTree) {
if t, ok := mt.(*regexpMatchTree); ok {
regexp = t.regexp
regexpMT = t
}
})
if regexp == nil {
if regexpMT == nil {
return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
}

return &symbolRegexpMatchTree{
regexp: regexp,
all: regexp.String() == "(?i)(?-s:.*)",
regexp: regexpMT.regexp,
all: isRegexpAll(regexpMT.origRegexp),
matchTree: subMT,
}, nil

Expand Down Expand Up @@ -1230,15 +1239,12 @@ func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error)
}

if utf8.RuneCountInString(s.Pattern) < ngramSize {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}
t := &regexpMatchTree{
regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
fileName: s.FileName,
}
return t, nil
return newRegexpMatchTree(&query.Regexp{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks equivalent to me, but it's not 100% obvious!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah this was duplicating the logic we did in the main regexpMatchTree construction so this feels good to avoid that duplicate logic.

What was the most hard to validate part? I am guessing that regexp.QuoteMeta is equivalent to the direct syntax.Regexp use?

Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
FileName: s.FileName,
Content: s.Content,
CaseSensitive: s.CaseSensitive,
}), nil
}

result, err := d.iterateNgrams(s)
Expand Down Expand Up @@ -1375,3 +1381,30 @@ func pruneMatchTree(mt matchTree) (matchTree, error) {
}
return mt, err
}

// isRegexpAll returns true if the query matches all possible lines.
//
// Note: it is possible for a funky regex to actually match all but this
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did the previous implementation have the same limitation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The previous implementation had this limitation in a worse way. You had to a have a very specific representation after parsing. While this implementation will ignore a bunch of stuff that won't matter for this test.

// returns false. This returns true for normal looking regexes like ".*" or
// "(?-s:.*)".
func isRegexpAll(r *syntax.Regexp) bool {
// Note: we don't care about any flags since we are looking for .* and we
// don't mind if . matches all or everything but newline.

// for loop is for visiting children of OpCapture/OpConcat until we hit .*
for {
// Our main target: .*
if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
// (?s:.) or (?-s:.)
return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
}

// Strip away expressions being wrapped in paranthesis
if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
r = r.Sub[0]
continue
}

return false
}
}
36 changes: 36 additions & 0 deletions matchtree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package zoekt

import (
"reflect"
"regexp/syntax"
"testing"

"github.com/RoaringBitmap/roaring"
Expand Down Expand Up @@ -385,3 +386,38 @@ func TestRepoIDs(t *testing.T) {
t.Fatalf("expected %d document, but got at least 1 more", len(want))
}
}

func TestIsRegexpAll(t *testing.T) {
valid := []string{
".*",
"(.*)",
"(?-s:.*)",
"(?s:.*)",
"(?i)(?-s:.*)",
}
invalid := []string{
".",
"foo",
"(foo.*)",
}

for _, s := range valid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if !isRegexpAll(r) {
t.Errorf("expected %q to match all", s)
}
}

for _, s := range invalid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if isRegexpAll(r) {
t.Errorf("expected %q to not match all", s)
}
}
}
Loading