Skip to content

Commit

Permalink
Explicitly pass nil content to lang detection
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed Feb 5, 2025
1 parent 114d8b2 commit 8ffda23
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 9 deletions.
20 changes: 11 additions & 9 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,6 @@ var _ = log.Println

const ngramSize = 3

// MaxFileSize 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22
// NOTE: if you change this, you must also update gitIndex to use the same value when fetching the repo.
// Change here as well, if you're changing the value https://sourcegraph.com/github.com/sourcegraph/zoekt/-/blob/cmd/zoekt-sourcegraph-indexserver/main.go?L167-169
const MaxFileSize = 1 << 20

// searchableString wraps a byte slice in its data field.
// NOTE(review): presumably this holds the text that gets indexed for
// search; confirm against the code that uses it.
type searchableString struct {
data []byte
}
Expand Down Expand Up @@ -401,7 +396,16 @@ func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
}

// DetermineLanguageIfUnknown fills in doc.Language when it is empty and
// leaves an already-set language untouched. For a document with a non-empty
// SkipReason the language is looked up from doc.Name alone (nil content is
// passed to languages.GetLanguage); otherwise both doc.Name and doc.Content
// are passed.
func DetermineLanguageIfUnknown(doc *Document) {
// NOTE(review): the next line looks like the pre-change condition kept by
// the diff view; the early-return guard that follows it appears to be its
// replacement. Confirm against the committed file.
if doc.Language == "" {
if doc.Language != "" {
return
}

if doc.SkipReason != "" {
// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
// In this case, we just guess the language based on file name to avoid examining the contents.
// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
doc.Language = languages.GetLanguage(doc.Name, nil)
} else {
doc.Language = languages.GetLanguage(doc.Name, doc.Content)
}
}
Expand All @@ -410,9 +414,7 @@ func DetermineLanguageIfUnknown(doc *Document) {
func (b *ShardBuilder) Add(doc Document) error {
hasher := crc64.New(crc64.MakeTable(crc64.ISO))

if len(doc.Content) > MaxFileSize {
doc.SkipReason = fmt.Sprintf("file size %d exceeds maximum size %d", len(doc.Content), MaxFileSize)
} else if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
}

Expand Down
46 changes: 46 additions & 0 deletions index/shard_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,49 @@ func TestShardName(t *testing.T) {
})
}
}

// TestDetermineLanguageIfUnknown runs DetermineLanguageIfUnknown over a table
// of documents and checks the resulting Document.Language:
//   - a document whose Language is already set keeps that value, even when
//     the file name suggests a different language;
//   - a skipped document (non-empty SkipReason) with a recognizable file name
//     gets the language implied by that name;
//   - a skipped document whose name carries no recognizable extension ends up
//     with an empty language.
func TestDetermineLanguageIfUnknown(t *testing.T) {
	tests := []struct {
		name     string
		doc      Document
		wantLang string
	}{
		{
			name: "already has language",
			doc: Document{
				Name:     "test.java",
				Language: "Go",
				Content:  []byte("package main"),
			},
			wantLang: "Go",
		},
		{
			name: "skipped file",
			doc: Document{
				Name:       "large.js",
				SkipReason: "too large",
				Content:    []byte(notIndexedMarker + "too large"),
			},
			wantLang: "JavaScript",
		},
		{
			name: "skipped file with unknown extension",
			doc: Document{
				Name:       "deadb33f",
				SkipReason: "binary",
				Content:    []byte(notIndexedMarker + "binary"),
			},
			wantLang: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// The function mutates the document in place, so hand it a
			// pointer to this case's copy and inspect the result afterwards.
			DetermineLanguageIfUnknown(&tt.doc)
			if tt.doc.Language != tt.wantLang {
				t.Errorf("DetermineLanguageIfUnknown() got language = %v, want %v", tt.doc.Language, tt.wantLang)
			}
		})
	}
}

0 comments on commit 8ffda23

Please sign in to comment.