Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow code search by filename #32210

Merged
merged 5 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions models/fixtures/repo_unit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -712,3 +712,24 @@
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810

-
id: 108
repo_id: 62
type: 1
config: "{}"
created_unix: 946684810

-
id: 109
repo_id: 62
type: 2
config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
created_unix: 946684810

-
id: 110
repo_id: 62
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810
31 changes: 31 additions & 0 deletions models/fixtures/repository.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1768,3 +1768,34 @@
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false

-
id: 62
owner_id: 42
owner_name: org42
lower_name: search-by-path
name: search-by-path
default_branch: master
num_watches: 0
num_stars: 0
num_forks: 0
num_issues: 0
num_closed_issues: 0
num_pulls: 0
num_closed_pulls: 0
num_milestones: 0
num_closed_milestones: 0
num_projects: 0
num_closed_projects: 0
is_private: false
is_empty: false
is_archived: false
is_mirror: false
status: 0
is_fork: false
fork_id: 0
is_template: false
template_id: 0
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false
37 changes: 37 additions & 0 deletions models/fixtures/user.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1517,3 +1517,40 @@
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false

-
id: 42
lower_name: org42
name: org42
full_name: Org42
email: org42@example.com
keep_email_private: false
email_notifications_preference: onmention
passwd: ZogKvWdyEx:password
passwd_hash_algo: dummy
must_change_password: false
login_source: 0
login_name: org42
type: 1
salt: ZogKvWdyEx
max_repo_creation: -1
is_active: false
is_admin: false
is_restricted: false
allow_git_hook: false
allow_import_local: false
allow_create_organization: true
prohibit_login: false
avatar: avatar42
avatar_email: org42@example.com
use_custom_avatar: false
num_followers: 0
num_following: 0
num_stars: 0
num_repos: 1
num_teams: 0
num_members: 0
visibility: 0
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false
6 changes: 3 additions & 3 deletions models/repo/repo_list_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,12 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
count: 33,
count: 34,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
count: 38,
count: 39,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
Expand All @@ -158,7 +158,7 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfOrganization",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
count: 33,
count: 34,
},
{
name: "AllTemplates",
Expand Down
5 changes: 4 additions & 1 deletion models/user/user_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) {
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
[]int64{26, 41})

testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
[]int64{42})

testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
[]int64{})

// test users
Expand Down
44 changes: 37 additions & 7 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
Expand Down Expand Up @@ -53,6 +54,7 @@ type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
Filename string
Language string
UpdatedAt time.Time
}
Expand All @@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {

const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6
repoIndexerLatestVersion = 7
)

// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
Expand All @@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)

fileNamedMapping := bleve.NewTextFieldMapping()
fileNamedMapping.IncludeInAll = false
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)

termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
Expand All @@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)

mapping := bleve.NewIndexMapping()

if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
Expand All @@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
}); err != nil {
return nil, err
}

if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
}); err != nil {
return nil, err
}

mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
Expand Down Expand Up @@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
Expand Down Expand Up @@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)

phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)

contentQuery := bleve.NewMatchQuery(opts.Keyword)
contentQuery.FieldVal = "Content"

if opts.IsKeywordFuzzy {
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}

keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)

if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
Expand Down Expand Up @@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int

from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true

if len(opts.Language) == 0 {
Expand Down Expand Up @@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
endIndex = locationEnd
}
}
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}

language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
Expand Down
101 changes: 101 additions & 0 deletions modules/indexer/code/bleve/token/path/path.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright 2024 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT

package path

import (
"slices"
"strings"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const (
Name = "gitea/path"
)

type TokenFilter struct{}

func NewTokenFilter() *TokenFilter {
return &TokenFilter{}
}

func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewTokenFilter(), nil
}
delvh marked this conversation as resolved.
Show resolved Hide resolved

func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
if len(input) == 1 {
// if there is only one token, we dont need to generate the reversed chain
return generatePathTokens(input, false)
}

normal := generatePathTokens(input, false)
reversed := generatePathTokens(input, true)

return append(normal, reversed...)
}

// Generates path tokens from the input tokens.
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
//
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
// to efficiently search for filenames without supplying the fullpath.
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
delvh marked this conversation as resolved.
Show resolved Hide resolved
terms := make([]string, 0, len(input))
longestTerm := 0

if reversed {
slices.Reverse(input)
}

for i := 0; i < len(input); i++ {
var sb strings.Builder
sb.WriteString(string(input[0].Term))

for j := 1; j < i; j++ {
sb.WriteString("/")
sb.WriteString(string(input[j].Term))
}

term := sb.String()

if longestTerm < len(term) {
longestTerm = len(term)
}

terms = append(terms, term)
}

output := make(analysis.TokenStream, 0, len(terms))

for _, term := range terms {
var start, end int

if reversed {
start = 0
end = len(term)
} else {
start = longestTerm - len(term)
end = longestTerm
}

token := analysis.Token{
Position: 1,
Start: start,
End: end,
Type: analysis.AlphaNumeric,
Term: []byte(term),
}

output = append(output, &token)
}

return output
}

func init() {
registry.RegisterTokenFilter(Name, TokenFilterConstructor)
}
Loading
Loading