Skip to content

Commit

Permalink
matchtree: do not depend on String for symbol all optimization (#746)
Browse files Browse the repository at this point in the history
In go 1.22 the way go converts a query like .* into a String changed.
This meant that when we compiled zoekt in the sourcegraph codebase our
"regex matches all" optimization stopped working.

This commit moves us to a simple inspection of the syntax tree which
will be stable across versions and to be honest just feels better.

To do this we introduce a field "origRegexp" to the regexpMatchTree so
we can avoid needing to recompile the syntax tree. Note: Regexp does not
offer a way to get at its internal syntax.Regexp which would be nice.

Test Plan: go test and added a unit test
  • Loading branch information
keegancsmith authored Mar 13, 2024
1 parent 52a28e3 commit cc5e093
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 23 deletions.
79 changes: 56 additions & 23 deletions matchtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ type noVisitMatchTree struct {
type regexpMatchTree struct {
regexp *regexp.Regexp

// origRegexp is the original parsed regexp from the query structure. It
// does not include mutations such as case sensitivity.
origRegexp *syntax.Regexp

fileName bool

// mutable
Expand All @@ -193,6 +197,19 @@ type regexpMatchTree struct {
bruteForceMatchTree
}

func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}

return &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
origRegexp: s.Regexp,
fileName: s.FileName,
}
}

// \bLITERAL\b
type wordMatchTree struct {
word string
Expand Down Expand Up @@ -972,15 +989,7 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
// provide something faster.
tr = wmt
} else {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}

tr = &regexpMatchTree{
regexp: regexp.MustCompile(prefix + s.Regexp.String()),
fileName: s.FileName,
}
tr = newRegexpMatchTree(s)
}

return &andMatchTree{
Expand Down Expand Up @@ -1103,19 +1112,19 @@ func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error)
}, nil
}

var regexp *regexp.Regexp
var regexpMT *regexpMatchTree
visitMatchTree(subMT, func(mt matchTree) {
if t, ok := mt.(*regexpMatchTree); ok {
regexp = t.regexp
regexpMT = t
}
})
if regexp == nil {
if regexpMT == nil {
return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
}

return &symbolRegexpMatchTree{
regexp: regexp,
all: regexp.String() == "(?i)(?-s:.*)",
regexp: regexpMT.regexp,
all: isRegexpAll(regexpMT.origRegexp),
matchTree: subMT,
}, nil

Expand Down Expand Up @@ -1230,15 +1239,12 @@ func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error)
}

if utf8.RuneCountInString(s.Pattern) < ngramSize {
prefix := ""
if !s.CaseSensitive {
prefix = "(?i)"
}
t := &regexpMatchTree{
regexp: regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
fileName: s.FileName,
}
return t, nil
return newRegexpMatchTree(&query.Regexp{
Regexp: &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
FileName: s.FileName,
Content: s.Content,
CaseSensitive: s.CaseSensitive,
}), nil
}

result, err := d.iterateNgrams(s)
Expand Down Expand Up @@ -1375,3 +1381,30 @@ func pruneMatchTree(mt matchTree) (matchTree, error) {
}
return mt, err
}

// isRegexpAll returns true if the query matches all possible lines.
//
// Note: it is possible for a funky regex to actually match all but this
// returns false. This returns true for normal looking regexes like ".*" or
// "(?-s:.*)".
func isRegexpAll(r *syntax.Regexp) bool {
// Note: we don't care about any flags since we are looking for .* and we
// don't mind if . matches all or everything but newline.

// for loop is for visiting children of OpCapture/OpConcat until we hit .*
for {
// Our main target: .*
if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
// (?s:.) or (?-s:.)
return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
}

// Strip away expressions being wrapped in paranthesis
if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
r = r.Sub[0]
continue
}

return false
}
}
36 changes: 36 additions & 0 deletions matchtree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package zoekt

import (
"reflect"
"regexp/syntax"
"testing"

"github.com/RoaringBitmap/roaring"
Expand Down Expand Up @@ -385,3 +386,38 @@ func TestRepoIDs(t *testing.T) {
t.Fatalf("expected %d document, but got at least 1 more", len(want))
}
}

func TestIsRegexpAll(t *testing.T) {
valid := []string{
".*",
"(.*)",
"(?-s:.*)",
"(?s:.*)",
"(?i)(?-s:.*)",
}
invalid := []string{
".",
"foo",
"(foo.*)",
}

for _, s := range valid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if !isRegexpAll(r) {
t.Errorf("expected %q to match all", s)
}
}

for _, s := range invalid {
r, err := syntax.Parse(s, syntax.Perl)
if err != nil {
t.Fatal(err)
}
if isRegexpAll(r) {
t.Errorf("expected %q to not match all", s)
}
}
}

0 comments on commit cc5e093

Please sign in to comment.