Remove the MaxFileSize limit from the shard builder and detect the language of
skipped documents from the file name only.

Summary of what this patch does:
- index/shard_builder.go: deletes the exported MaxFileSize constant (1 << 20)
  and its comments, and removes the "file size ... exceeds maximum size"
  branch at the top of ShardBuilder.Add. In the visible hunk, the NUL-byte
  (binary content) check is the only remaining place that sets SkipReason.
- DetermineLanguageIfUnknown: returns immediately when doc.Language is already
  set. When doc.SkipReason is non-empty it calls languages.GetLanguage with
  nil content so the document contents are not examined; otherwise it passes
  doc.Content as before.
- index/shard_builder_test.go: adds the table-driven test
  TestDetermineLanguageIfUnknown.

NOTE(review): the skipContent field of the test table is never set or read.
NOTE(review): no test case covers a document with neither Language nor
SkipReason set, i.e. the branch that passes doc.Content.
NOTE(review): MaxFileSize was exported, and the removed comment mentions
gitIndex and cmd/zoekt-sourcegraph-indexserver -- confirm nothing else still
references it.

diff --git a/index/shard_builder.go b/index/shard_builder.go
index 3f76cc8e..8a3057b6 100644
--- a/index/shard_builder.go
+++ b/index/shard_builder.go
@@ -37,11 +37,6 @@ var _ = log.Println
 
 const ngramSize = 3
 
-// MaxFileSize 1 MB; match https://sourcegraph.sgdev.org/github.com/sourcegraph/sourcegraph/-/blob/cmd/symbols/internal/symbols/search.go#L22
-// NOTE: if you change this, you must also update gitIndex to use the same value when fetching the repo.
-// Change here as well, if you're changing the value https://sourcegraph.com/github.com/sourcegraph/zoekt/-/blob/cmd/zoekt-sourcegraph-indexserver/main.go?L167-169
-const MaxFileSize = 1 << 20
-
 type searchableString struct {
 	data []byte
 }
@@ -401,7 +396,16 @@ func (b *ShardBuilder) addSymbols(symbols []*zoekt.Symbol) {
 }
 
 func DetermineLanguageIfUnknown(doc *Document) {
-	if doc.Language == "" {
+	if doc.Language != "" {
+		return
+	}
+
+	if doc.SkipReason != "" {
+		// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
+		// In this case, we just guess the language based on file name to avoid examining the contents.
+		// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
+		doc.Language = languages.GetLanguage(doc.Name, nil)
+	} else {
 		doc.Language = languages.GetLanguage(doc.Name, doc.Content)
 	}
 }
@@ -410,9 +414,7 @@ func DetermineLanguageIfUnknown(doc *Document) {
 func (b *ShardBuilder) Add(doc Document) error {
 	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
 
-	if len(doc.Content) > MaxFileSize {
-		doc.SkipReason = fmt.Sprintf("file size %d exceeds maximum size %d", len(doc.Content), MaxFileSize)
-	} else if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
+	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
 		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
 	}
 
diff --git a/index/shard_builder_test.go b/index/shard_builder_test.go
index e66b3cf8..662ac8cd 100644
--- a/index/shard_builder_test.go
+++ b/index/shard_builder_test.go
@@ -47,3 +47,49 @@ func TestShardName(t *testing.T) {
 		})
 	}
 }
+
+func TestDetermineLanguageIfUnknown(t *testing.T) {
+	tests := []struct {
+		name        string
+		doc         Document
+		wantLang    string
+		skipContent bool
+	}{
+		{
+			name: "already has language",
+			doc: Document{
+				Name:     "test.java",
+				Language: "Go",
+				Content:  []byte("package main"),
+			},
+			wantLang: "Go",
+		},
+		{
+			name: "skipped file",
+			doc: Document{
+				Name:       "large.js",
+				SkipReason: "too large",
+				Content:    []byte(notIndexedMarker + "too large"),
+			},
+			wantLang: "JavaScript",
+		},
+		{
+			name: "skipped file with unknown extension",
+			doc: Document{
+				Name:       "deadb33f",
+				SkipReason: "binary",
+				Content:    []byte(notIndexedMarker + "binary"),
+			},
+			wantLang: "",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			DetermineLanguageIfUnknown(&tt.doc)
+			if tt.doc.Language != tt.wantLang {
+				t.Errorf("DetermineLanguageIfUnknown() got language = %v, want %v", tt.doc.Language, tt.wantLang)
+			}
+		})
+	}
+}