Skip to content

Commit 6ab827f

Browse files
committed
score: introduce query.Boost to scale score
This commit introduces a new primitive, Boost, to our query language. It allows boosting (or dampening) the contribution that the matches of a query atom make to the score. To achieve this, we introduce boostMatchTree, which records this weight. We then adjust visitMatches to take an initial score weight (1.0); each time we recurse through a boostMatchTree, the score weight is multiplied by the boost weight. Additionally, candidateMatch now has a new field, scoreWeight, which records the weight at the time of candidate collection. Without boosting in the query, this value will always be 1. Finally, when scoring a candidateMatch, we take its final score and multiply it by scoreWeight. Note: we do not expose a way to set this in the query language, only in the query API. Test Plan: Manual testing against the webserver via the new phrase-boost URL param. Additionally, updated the ranking tests to use the phrase booster.
1 parent cdb1665 commit 6ab827f

14 files changed

+230
-52
lines changed

api_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ func TestMatchSize(t *testing.T) {
152152
size: 112,
153153
}, {
154154
v: candidateMatch{},
155-
size: 72,
155+
size: 80,
156156
}, {
157157
v: candidateChunk{},
158158
size: 40,

bits.go

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package zoekt
1616

1717
import (
1818
"encoding/binary"
19+
"math"
1920
"sort"
2021
"unicode"
2122
"unicode/utf8"
@@ -391,3 +392,7 @@ func (m runeOffsetMap) lookup(runeOffset uint32) (uint32, uint32) {
391392
func (m runeOffsetMap) sizeBytes() int {
392393
return 8 * len(m)
393394
}
395+
396+
func epsilonEqualsOne(scoreWeight float64) bool {
397+
return scoreWeight == 1 || math.Abs(scoreWeight-1.0) < 1e-9
398+
}

contentprovider.go

+8
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,14 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str
660660
}
661661
}
662662

663+
// scoreWeight != 1 means it affects score
664+
if !epsilonEqualsOne(m.scoreWeight) {
665+
score.score = score.score * m.scoreWeight
666+
if debug {
667+
score.what += fmt.Sprintf("boost:%.2f, ", m.scoreWeight)
668+
}
669+
}
670+
663671
if score.score > maxScore.score {
664672
maxScore.score = score.score
665673
maxScore.what = score.what

eval.go

+30-7
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ nextFileMatch:
420420
// whether there's an exact match on a symbol, the number of query clauses that matched, etc.
421421
func (d *indexData) scoreFile(fileMatch *FileMatch, doc uint32, mt matchTree, known map[matchTree]bool, opts *SearchOptions) {
422422
atomMatchCount := 0
423-
visitMatches(mt, known, func(mt matchTree) {
423+
visitMatchAtoms(mt, known, func(mt matchTree) {
424424
atomMatchCount++
425425
})
426426

@@ -544,6 +544,15 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
544544
return m[i].byteOffset < m[j].byteOffset
545545
}
546546

547+
// setScoreWeight is a helper used by gatherMatches to set the weight based on
548+
// the score weight of the matchTree.
549+
func setScoreWeight(scoreWeight float64, cm []*candidateMatch) []*candidateMatch {
550+
for _, m := range cm {
551+
m.scoreWeight = scoreWeight
552+
}
553+
return cm
554+
}
555+
547556
// Gather matches from this document. This never returns a mixture of
548557
// filename/content matches: if there are content matches, all
549558
// filename matches are trimmed from the result. The matches are
@@ -554,18 +563,18 @@ func (m sortByOffsetSlice) Less(i, j int) bool {
554563
// but adjacent matches will remain.
555564
func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candidateMatch {
556565
var cands []*candidateMatch
557-
visitMatches(mt, known, func(mt matchTree) {
566+
visitMatches(mt, known, 1, func(mt matchTree, scoreWeight float64) {
558567
if smt, ok := mt.(*substrMatchTree); ok {
559-
cands = append(cands, smt.current...)
568+
cands = append(cands, setScoreWeight(scoreWeight, smt.current)...)
560569
}
561570
if rmt, ok := mt.(*regexpMatchTree); ok {
562-
cands = append(cands, rmt.found...)
571+
cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
563572
}
564573
if rmt, ok := mt.(*wordMatchTree); ok {
565-
cands = append(cands, rmt.found...)
574+
cands = append(cands, setScoreWeight(scoreWeight, rmt.found)...)
566575
}
567576
if smt, ok := mt.(*symbolRegexpMatchTree); ok {
568-
cands = append(cands, smt.found...)
577+
cands = append(cands, setScoreWeight(scoreWeight, smt.found)...)
569578
}
570579
})
571580

@@ -590,6 +599,7 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
590599
// are non-overlapping.
591600
sort.Sort((sortByOffsetSlice)(cands))
592601
res = cands[:0]
602+
mergeRun := 1
593603
for i, c := range cands {
594604
if i == 0 {
595605
res = append(res, c)
@@ -599,10 +609,23 @@ func gatherMatches(mt matchTree, known map[matchTree]bool, merge bool) []*candid
599609
lastEnd := last.byteOffset + last.byteMatchSz
600610
end := c.byteOffset + c.byteMatchSz
601611
if lastEnd >= c.byteOffset {
612+
mergeRun++
613+
614+
// Average out the score across the merged candidates. Only do it if
615+
// we are boosting to avoid floating point funkiness in the normal
616+
// case.
617+
if !(epsilonEqualsOne(last.scoreWeight) && epsilonEqualsOne(c.scoreWeight)) {
618+
last.scoreWeight = ((last.scoreWeight * float64(mergeRun-1)) + c.scoreWeight) / float64(mergeRun)
619+
}
620+
621+
// latest candidate goes further, update our end
602622
if end > lastEnd {
603623
last.byteMatchSz = end - last.byteOffset
604624
}
625+
605626
continue
627+
} else {
628+
mergeRun = 1
606629
}
607630

608631
res = append(res, c)
@@ -649,7 +672,7 @@ func (d *indexData) branchIndex(docID uint32) int {
649672
// returns all branches containing docID.
650673
func (d *indexData) gatherBranches(docID uint32, mt matchTree, known map[matchTree]bool) []string {
651674
var mask uint64
652-
visitMatches(mt, known, func(mt matchTree) {
675+
visitMatchAtoms(mt, known, func(mt matchTree) {
653676
bq, ok := mt.(*branchQueryMatchTree)
654677
if !ok {
655678
return

internal/e2e/e2e_rank_test.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ func TestRanking(t *testing.T) {
118118
t.Fatal(err)
119119
}
120120

121+
// q is marshalled as part of the test, so avoid our rewrites for
122+
// ranking.
123+
qSearch := query.ExpirementalPhraseBoost(q, rq.Query, query.ExperimentalPhraseBoostOptions{})
124+
121125
sOpts := zoekt.SearchOptions{
122126
// Use the same options sourcegraph has by default
123127
ChunkMatches: true,
@@ -128,7 +132,7 @@ func TestRanking(t *testing.T) {
128132

129133
DebugScore: *debugScore,
130134
}
131-
result, err := ss.Search(context.Background(), q, &sOpts)
135+
result, err := ss.Search(context.Background(), qSearch, &sOpts)
132136
if err != nil {
133137
t.Fatal(err)
134138
}

internal/e2e/testdata/assets_are_not_configured_for_this_binary.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ query: (and substr:"assets" substr:"are" substr:"not" substr:"configured" substr
33
targetRank: 1
44

55
**github.com/sourcegraph/sourcegraph/ui/assets/assets.go**
6+
30: return nil, errors.New("assets are not configured for this binary, please see ui/assets")
7+
34: panic("assets are not configured for this binary, please see ui/assets")
68
33:func (p FailingAssetsProvider) Assets() http.FileSystem {
7-
14: Assets() http.FileSystem
8-
1:package assets
99
hidden 12 more line matches
1010

1111
github.com/sourcegraph/sourcegraph/schema/schema.go
+25-25
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,30 @@
11
queryString: generate unit test
22
query: (and substr:"generate" substr:"unit" substr:"test")
3-
targetRank: 11
3+
targetRank: 1
4+
5+
**github.com/sourcegraph/cody/lib/shared/src/chat/recipes/generate-test.ts**
6+
16: public title = 'Generate Unit Test'
7+
14:export class GenerateTest implements Recipe {
8+
15: public id: RecipeID = 'generate-unit-test'
9+
hidden 3 more line matches
10+
11+
github.com/sourcegraph/sourcegraph/client/jetbrains/README.md
12+
40:- Generate unit test
13+
41:- Generate docstring
14+
61:Cody is powered by Sourcegraph’s code graph and uses context of your codebase to extend its capabilities. By using context from entire repositories, Cody is able to give more accurate answers and generate idiomatic code.
15+
hidden 7 more line matches
16+
17+
github.com/sourcegraph/cody/vscode/CHANGELOG.md
18+
298:- The `/test` (Generate Unit Test) command was updated to use file dependencies and test examples when fetching context, in order to produce better results. To use this command, select code in your editor and run the `/test` command. It is recommended to set up test files before running the command to get optimal results. [pull/683](https://github.com/sourcegraph/cody/pull/683) [pull/602](https://github.com/sourcegraph/cody/pull/602)
19+
218:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
20+
264:- The `Generate Unit Tests` command has been improved with an enhanced context fetching process that produces test results with better quality. [pull/907](https://github.com/sourcegraph/cody/pull/907)
21+
hidden 17 more line matches
22+
23+
github.com/sourcegraph/sourcegraph/doc/cody/overview/install-jetbrains.md
24+
158:- Generate unit test
25+
138:Log in to your Sourcegraph instance and go to `settings` / `access token` (`https://<your-instance>.sourcegraph.com/users/<your-instance>/settings/tokens`). From here, generate a new access token.
26+
159:- Generate docstring
27+
hidden 3 more line matches
428

529
github.com/sourcegraph/sourcegraph/cmd/frontend/internal/insights/resolvers/insight_series_resolver.go
630
300:func (j *seriesResolverGenerator) Generate(ctx context.Context, series types.InsightViewSeries, baseResolver baseInsightResolver, filters types.InsightViewFilters, options types.SeriesDisplayOptions) ([]graphqlbackend.InsightSeriesResolver, error) {
@@ -14,28 +38,4 @@ github.com/golang/go/src/cmd/vendor/github.com/google/pprof/internal/report/repo
1438
75: SampleUnit string // Unit for the sample data from the profile.
1539
hidden 48 more line matches
1640

17-
github.com/sourcegraph/sourcegraph/internal/codeintel/autoindexing/internal/inference/lua/test.lua
18-
9: generate = function(_, paths)
19-
6: patterns = { pattern.new_path_basename "sg-test" },
20-
8: -- Invoked as part of unit tests for the autoindexing service
21-
hidden 1 more line matches
22-
23-
github.com/golang/go/src/cmd/internal/testdir/testdir_test.go
24-
273:type test struct {
25-
74:func Test(t *testing.T) {
26-
263:type testCommon struct {
27-
hidden 120 more line matches
28-
29-
github.com/golang/go/src/cmd/vendor/github.com/google/pprof/profile/profile.go
30-
65: Unit string // seconds, nanoseconds, bytes, etc
31-
77: NumUnit map[string][]string
32-
68: unitX int64
33-
hidden 44 more line matches
34-
35-
github.com/golang/go/src/cmd/link/internal/loader/loader.go
36-
79: unit *sym.CompilationUnit
37-
1544:func (l *Loader) SymUnit(i Sym) *sym.CompilationUnit {
38-
228: generatedSyms Bitmap // symbols that generate their content, indexed by ext sym idx
39-
hidden 50 more line matches
40-
4141
hidden 245 more file matches

internal/e2e/testdata/rank_stats.txt

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
queries: 14
2-
recall@1: 7 (50%)
3-
recall@5: 9 (64%)
4-
mrr: 0.579471
2+
recall@1: 9 (64%)
3+
recall@5: 11 (79%)
4+
mrr: 0.710733

internal/e2e/testdata/sourcegraphserver_docker_image_build.txt

+6-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
queryString: sourcegraph/server docker image build
22
query: (and substr:"sourcegraph/server" substr:"docker" substr:"image" substr:"build")
3-
targetRank: 14
3+
targetRank: 1
4+
5+
**github.com/sourcegraph/sourcegraph/dev/tools.go**
6+
7: // zoekt-* used in sourcegraph/server docker image build
7+
1://go:build tools
8+
2:// +build tools
49

510
github.com/sourcegraph/sourcegraph/dev/sg/internal/images/images.go
611
458: Build int
@@ -32,10 +37,4 @@ github.com/sourcegraph/sourcegraph/internal/updatecheck/handler.go
3237
50: latestReleaseDockerComposeOrPureDocker = newPingResponse("5.1.8")
3338
hidden 19 more line matches
3439

35-
github.com/sourcegraph/sourcegraph/doc/admin/deploy/docker-single-container/index.md
36-
1:# Docker Single Container Deployment
37-
294:### Insiders build
38-
238:### File system performance on Docker for Mac
39-
hidden 52 more line matches
40-
4140
hidden 15 more file matches

matchiter.go

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ type candidateMatch struct {
2727
substrBytes []byte
2828
substrLowered []byte
2929

30+
scoreWeight float64
31+
3032
file uint32
3133
symbolIdx uint32
3234

0 commit comments

Comments
 (0)