sourcegraph · keegancsmith · Jan 29, 2024 · Jan 28, 2024
diff --git a/api_test.go b/api_test.go
@@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) {
 		size: 112,
 	}, {
 		v:    candidateMatch{},
-		size: 72,
+		size: 80,
 	}, {
 		v:    candidateChunk{},
 		size: 40,

diff --git a/bits.go b/bits.go
@@ -16,6 +16,7 @@ package zoekt
 
 import (
 	"encoding/binary"
+	"math"
 	"sort"
 	"unicode"
 	"unicode/utf8"
@@ -391,3 +392,7 @@ func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
 func (m runeOffsetMap) sizeBytes() int {
 	return 8 * len(m)
 }
+
+func epsilonEqualsOne(scoreWeight float64) bool {
+	return math.Abs(scoreWeight-1) < 1e-9 // exact 1 gives |0|, so no separate == check is needed
+}
diff --git a/contentprovider.go b/contentprovider.go
@@ -660,6 +660,14 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str
 			}
 		}
 
+		// scoreWeight != 1 means it affects score
+		if !epsilonEqualsOne(m.scoreWeight) {
+			score.score = score.score * m.scoreWeight
+			if debug {
+				score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
+			}
+		}
+
 		if score.score > maxScore.score {
 			maxScore.score = score.score
 			maxScore.what = score.what

diff --git a/eval.go b/eval.go
@@ -420,7 +420,7 @@ nextFileMatch:
 // whether there's an exact match on a symbol, the number of query clauses that matched, etc.
 func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
 	atomMatchCount := 0
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		atomMatchCount++
 	})
 
@@ -544,6 +544,15 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 	return m[i].byteOffset < m[j].byteOffset
 }
 
+// setScoreWeight stamps scoreWeight onto every candidate in cm and returns
+// cm itself, so gatherMatches can wrap a slice inline while appending it.
+func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
+	for i := range cm {
+		cm[i].scoreWeight = scoreWeight
+	}
+	return cm
+}
+
 // Gather matches from this document. This never returns a mixture of
 // filename/content matches: if there are content matches, all
 // filename matches are trimmed from the result. The matches are
@@ -554,18 +563,18 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
 // but adjacent matches will remain.
 func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
 	var cands []*candidateMatch
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
 		if smt, ok := mt.(*substrMatchTree); ok {
-			cands = append(cands, smt.current...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
 		}
 		if rmt, ok := mt.(*regexpMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if rmt, ok := mt.(*wordMatchTree); ok {
-			cands = append(cands, rmt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
 		}
 		if smt, ok := mt.(*symbolRegexpMatchTree); ok {
-			cands = append(cands, smt.found...)
+			cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
 		}
 	})
 
@@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 		// are non-overlapping.
 		sort.Sort((sortByOffsetSlice)(cands))
 		res = cands[:0]
+		mergeRun := 1
 		for i, c := range cands {
 			if i == 0 {
 				res = append(res, c)
@@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
 			lastEnd := last.byteOffset + last.byteMatchSz
 			end := c.byteOffset + c.byteMatchSz
 			if lastEnd >= c.byteOffset {
+				mergeRun++
+
+				// Average out the score across the merged candidates. Only do it if
+				// we are boosting to avoid floating point funkiness in the normal
+				// case.
+				if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
+					last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
+				}
+
+				// latest candidate goes further, update our end
 				if end > lastEnd {
 					last.byteMatchSz = end - last.byteOffset
 				}
+
 				continue
+			} else {
+				mergeRun = 1
 			}
 
 			res = append(res, c)
@@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int {
 // returns all branches containing docID.
 func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
 	var mask uint64
-	visitMatches(mt, known, func(mt matchTree) {
+	visitMatchAtoms(mt, known, func(mt matchTree) {
 		bq, ok := mt.(*branchQueryMatchTree)
 		if !ok {
 			return

diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go
@@ -118,6 +118,10 @@ func TestRanking(t *testing.T) {
 				t.Fatal(err)
 			}
 
+			// q is marshalled as part of the test, so avoid our rewrites for
+			// ranking.
+			qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})
+
 			sOpts := zoekt.SearchOptions{
 				// Use the same options sourcegraph has by default
 				ChunkMatches:       true,
@@ -128,7 +132,7 @@ func TestRanking(t *testing.T) {
 
 				DebugScore: *debugScore,
 			}
-			result, err := ss.Search(context.Background(), q, &sOpts)
+			result, err := ss.Search(context.Background(), qSearch, &sOpts)
 			if err != nil {
 				t.Fatal(err)
 			}

diff --git a/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt b/internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt
@@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr
 targetRank: 1
 
 **github.com/sourcegraph/sourcegraph/ui/assets/assets.go**
+30:	return nil, errors.New("assets are not configured for this binary, please see ui/assets")
+34:	panic("assets are not configured for this binary, please see ui/assets")
 33:func (p FailingAssetsProvider) Assets() http.FileSystem {
-14:	Assets() http.FileSystem
-1:package assets
 hidden 12 more line matches
 
 github.com/sourcegraph/sourcegraph/schema/schema.go

diff --git a/internal/e2e/testdata/generate_unit_test.txt b/internal/e2e/testdata/generate_unit_test.txt
@@ -1,6 +1,30 @@
 queryString: generate unit test
 query: (and substr:"generate" substr:"unit" substr:"test")
-targetRank: 11
+targetRank: 1
+
+**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts**
+16:    public title = 'Generate Unit Test'
+14:export class GenerateTest implements Recipe {
+15:    public id: RecipeID = 'generate-unit-test'
+hidden 3 more line matches
+
+github.com/sourcegraph/sourcegraph/client/jetbrains/README.md
+40:- Generate unit test
+41:- Generate docstring
+61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code.
+hidden 7 more line matches
+
+github.com/sourcegraph/cody/vscode/CHANGELOG.md
+298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602)
+218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
+hidden 17 more line matches
+
+github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md
+158:- Generate unit test
+138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://<your-instance>.sourcegraph.com/users/<your-instance>/settings/tokens`). From here, generate a new access token.
+159:- Generate docstring
+hidden 3 more line matches
 
 github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go
 300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) {
@@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo
 75:	SampleUnit        string // Unit for the sample data from the profile.
 hidden 48 more line matches
 
-github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua
-9:  generate = function(_, paths)
-6:  patterns = { pattern.new_path_basename "sg-test" },
-8:  -- Invoked as part of unit tests for the autoindexing service
-hidden 1 more line matches
-
-github.com/golang/go/src/cmd/internal/testdir/testdir_test.go
-273:type test struct {
-74:func Test(t *testing.T) {
-263:type testCommon struct {
-hidden 120 more line matches
-
-github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go
-65:	Unit string // seconds, nanoseconds, bytes, etc
-77:	NumUnit  map[string][]string
-68:	unitX int64
-hidden 44 more line matches
-
-github.com/golang/go/src/cmd/link/internal/loader/loader.go
-79:	unit         *sym.CompilationUnit
-1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit {
-228:	generatedSyms        Bitmap // symbols that generate their content, indexed by ext sym idx
-hidden 50 more line matches
-
 hidden 245 more file matches
diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt
@@ -1,4 +1,4 @@
 queries: 14
-recall@1: 7 (50%)
-recall@5: 9 (64%)
-mrr: 0.579471
+recall@1: 9 (64%)
+recall@5: 11 (79%)
+mrr: 0.710733
diff --git a/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt b/internal/e2e/testdata/sourcegraphserver_docker_image_build.txt
@@ -1,6 +1,11 @@
 queryString: sourcegraph/server docker image build
 query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build")
-targetRank: 14
+targetRank: 1
+
+**github.com/sourcegraph/sourcegraph/dev/tools.go**
+7:	// zoekt-* used in sourcegraph/server docker image build
+1://go:build tools
+2:// +build tools
 
 github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go
 458:	Build       int
@@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go
 50:	latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8")
 hidden 19 more line matches
 
-github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md
-1:# Docker Single Container Deployment
-294:### Insiders build
-238:### File system performance on Docker for Mac
-hidden 52 more line matches
-
 hidden 15 more file matches
diff --git a/matchiter.go b/matchiter.go
@@ -27,6 +27,8 @@ type candidateMatch struct {
 	substrBytes   []byte
 	substrLowered []byte
 
+	scoreWeight float64
+
 	file      uint32
 	symbolIdx uint32