score: introduce query.Boost to scale score

keegancsmith · keegancsmith · commit 6ab827fad53c · 2024-01-29T13:29:32.000+02:00
This commit introduces a new primitive, Boost, to our query language. It
allows boosting (or dampening) the contribution that the atoms matched by
a query make to the score.

To achieve this we introduce boostMatchTree, which records this weight.
We then adjust visitMatches to take an initial score weight (1.0), and
each time we recurse through a boostMatchTree the score weight is
multiplied by the boost weight. Additionally, candidateMatch now has a
new field, scoreWeight, which records the weight at the time of candidate
collection. Without boosting in the query this value will always be 1.

Finally when scoring a candidateMatch we take the final score for it and
multiply it by scoreWeight.

Note: we do not expose a way to set this in the query language, only the
query API.

Test Plan: Manual testing against webserver via the new phrase-boost URL
param. Additionally updated ranking tests to use the phrase booster.
diff --git a/api_test.go b/api_test.go
@@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) {
 		size: 112,
 	}, {
 		v:    candidateMatch{},
-		size: 72,
+		size: 80,
 	}, {
 		v:    candidateChunk{},
 		size: 40,
diff --git a/bits.go b/bits.go
@@ -16,6 +16,7 @@ package zoekt
 
 import (
 	"encoding/binary"
+	"math"
 	"sort"
 	"unicode"
 	"unicode/utf8"
@@ -391,3 +392,7 @@ func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
 func (m runeOffsetMap) sizeBytes() int {
 	return 8 * len(m)
 }
+
+func epsilonEqualsOne(scoreWeight float64) bool {
+	return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
+}
diff --git a/contentprovider.go b/contentprovider.go
@@ -660,6 +660,14 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str
 			}
 		}
 
+		// scoreWeight != 1 means it affects score
+		if !epsilonEqualsOne(m.scoreWeight) {
+			score.score = score.score * m.scoreWeight
+			if debug {
+				score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
+			}
+		}
+
 		if score.score > maxScore.score {
 			maxScore.score = score.score
 			maxScore.what = score.what
diff --git a/eval.go b/eval.go
@@ -420,7 +420,7 @@ nextFileMatch:
 // whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 	atomMatchCount := 0
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		atomMatchCount++
 	})
 
@@ -544,6 +544,15 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 	return m[i].byteOffset < m[j].byteOffset
 }
 
+// setScoreWeight is a helper used by gatherMatches to set the weight based on
+// the score weight of the matchTree.
+func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
+	for _, m := range cm {
+		m.scoreWeight = scoreWeight
+	}
+	return cm
+}
+
 // Gather matches from this document. This never returns a mixture of
 // filename/content matches: if there are content matches, all
 // filename matches are trimmed from the result. The matches are
@@ -554,18 +563,18 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 // but adjacent matches will remain.
 func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
 	var cands []*candidateMatch
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
 		if smt, ok := mt.(*substrMatchTree); ok {
-			cands = append(cands, smt.current...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
 		}
 		if rmt, ok := mt.(*regexpMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if rmt, ok := mt.(*wordMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
-			cands = append(cands, smt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
 		}
 	})
 
@@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 		// are non-overlapping.
 		sort.Sort((sortByOffsetSlice)(cands))
 		res = cands[:0]
+		mergeRun := 1
 		for i, c := range cands {
 			if i == 0 {
 				res = append(res, c)
@@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 			lastEnd := last.byteOffset + last.byteMatchSz
 			end := c.byteOffset + c.byteMatchSz
 			if lastEnd >= c.byteOffset {
+				mergeRun++
+
+				// Average out the score across the merged candidates. Only do it if
+				// we are boosting to avoid floating point funkiness in the normal
+				// case.
+				if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
+					last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
+				}
+
+				// latest candidate goes further, update our end
 				if end > lastEnd {
 					last.byteMatchSz = end - last.byteOffset
 				}
+
 				continue
+			} else {
+				mergeRun = 1
 			}
 
 			res = append(res, c)
@@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int {
 // returns all branches containing docID.
 func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
 	var mask uint64
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		bq, ok := mt.(*branchQueryMatchTree)
 		if !ok {
 			return
diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go
@@ -118,6 +118,10 @@ func TestRanking(t *testing.T) {
 				t.Fatal(err)
 			}
 
+			// q is marshalled as part of the test, so avoid our rewrites for
+			// ranking.
+			qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})
+
 			sOpts := zoekt.SearchOptions{
 				// Use the same options sourcegraph has by default
 				ChunkMatches:       true,
@@ -128,7 +132,7 @@ func TestRanking(t *testing.T) {
 
 				DebugScore: *debugScore,
 			}
-			result, err := ss.Search(context.Background(), q, &sOpts)
+			result, err := ss.Search(context.Background(), qSearch, &sOpts)
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt
@@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr
 targetRank: 1
 
 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go**
+30:	return nil, errors.New("assets are not configured for this binary, please see ui/assets")
+34:	panic("assets are not configured for this binary, please see ui/assets")
 33:func (p FailingAssetsProvider) Assets() http.FileSystem {
-14:	Assets() http.FileSystem
-1:package assets
 hidden 12 more line matches
 
 github.com/sourcegraph/sourcegraph/schema/schema.go
diff --git a/internal/e2e/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt
@@ -1,6 +1,30 @@
 queryString: generate unit test
 query: (and substr:"generate" substr:"unit" substr:"test")
-targetRank: 11
+targetRank: 1
+
+**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts**
+16:    public title = 'Generate Unit Test'
+14:export class GenerateTest implements Recipe {
+15:    public id: RecipeID = 'generate-unit-test'
+hidden 3 more line matches
+
+github.com/sourcegraph/sourcegraph/client/jetbrains/README.md
+40:- Generate unit test
+41:- Generate docstring
+61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code.
+hidden 7 more line matches
+
+github.com/sourcegraph/cody/vscode/CHANGELOG.md
+298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602)
+218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+hidden 17 more line matches
+
+github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md
+158:- Generate unit test
+138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://<your-instance>.sourcegraph.com/users/<your-instance>/settings/tokens`). From here, generate a new access token.
+159:- Generate docstring
+hidden 3 more line matches
 
 github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go
 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) {
@@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo
 75:	SampleUnit        string // Unit for the sample data from the profile.
 hidden 48 more line matches
 
-github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua
-9:  generate = function(_, paths)
-6:  patterns = { pattern.new_path_basename "sg-test" },
-8:  -- Invoked as part of unit tests for the autoindexing service
-hidden 1 more line matches
-
-github.com/golang/go/src/cmd/internal/testdir/testdir_test.go
-273:type test struct {
-74:func Test(t *testing.T) {
-263:type testCommon struct {
-hidden 120 more line matches
-
-github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go
-65:	Unit string // seconds, nanoseconds, bytes, etc
-77:	NumUnit  map[string][]string
-68:	unitX int64
-hidden 44 more line matches
-
-github.com/golang/go/src/cmd/link/internal/loader/loader.go
-79:	unit         *sym.CompilationUnit
-1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit {
-228:	generatedSyms        Bitmap // symbols that generate their content, indexed by ext sym idx
-hidden 50 more line matches
-
 hidden 245 more file matches
diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt
@@ -1,4 +1,4 @@
 queries: 14
-recall@1: 7 (50%)
-recall@5: 9 (64%)
-mrr: 0.579471
+recall@1: 9 (64%)
+recall@5: 11 (79%)
+mrr: 0.710733
diff --git a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt
@@ -1,6 +1,11 @@
 queryString: sourcegraph/server docker image build
 query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build")
-targetRank: 14
+targetRank: 1
+
+**github.com/sourcegraph/sourcegraph/dev/tools.go**
+7:	// zoekt-* used in sourcegraph/server docker image build
+1://go:build tools
+2:// +build tools
 
 github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go
 458:	Build       int
@@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go
 50:	latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8")
 hidden 19 more line matches
 
-github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md
-1:# Docker Single Container Deployment
-294:### Insiders build
-238:### File system performance on Docker for Mac
-hidden 52 more line matches
-
 hidden 15 more file matches
diff --git a/matchiter.go b/matchiter.go
@@ -27,6 +27,8 @@ type candidateMatch struct {
 	substrBytes   []byte
 	substrLowered []byte
 
+	scoreWeight float64
+
 	file      uint32
 	symbolIdx uint32
 
diff --git a/matchtree.go b/matchtree.go
diff --git a/query/boost.go b/query/boost.go
diff --git a/query/query.go b/query/query.go
diff --git a/web/server.go b/web/server.go

Original file line number	Diff line number	Diff line change
`@@ -660,6 +660,14 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str`
`660`	`660`	`}`
`661`	`661`	`}`
`662`	`662`
	`663`	`+ // scoreWeight != 1 means it affects score`
	`664`	`+ if !epsilonEqualsOne(m.scoreWeight) {`
	`665`	`+ score.score = score.score * m.scoreWeight`
	`666`	`+ if debug {`
	`667`	`+ score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)`
	`668`	`+ }`
	`669`	`+ }`
	`670`	`+`
`663`	`671`	`if score.score > maxScore.score {`
`664`	`672`	`maxScore.score = score.score`
`665`	`673`	`maxScore.what = score.what`
Original file line number	Diff line number	Diff line change
`@@ -118,6 +118,10 @@ func TestRanking(t *testing.T) {`
`118`	`118`	`t.Fatal(err)`
`119`	`119`	`}`
`120`	`120`
	`121`	`+ // q is marshalled as part of the test, so avoid our rewrites for`
	`122`	`+ // ranking.`
	`123`	`+ qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})`
	`124`	`+`
`121`	`125`	`sOpts := zoekt.SearchOptions{`
`122`	`126`	`// Use the same options sourcegraph has by default`
`123`	`127`	`ChunkMatches: true,`
`@@ -128,7 +132,7 @@ func TestRanking(t *testing.T) {`
`128`	`132`
`129`	`133`	`DebugScore: *debugScore,`
`130`	`134`	`}`
`131`		`- result, err := ss.Search(context.Background(), q, &sOpts)`
	`135`	`+ result, err := ss.Search(context.Background(), qSearch, &sOpts)`
`132`	`136`	`if err != nil {`
`133`	`137`	`t.Fatal(err)`
`134`	`138`	`}`