Skip to content

Commit

Permalink
Refactor code indexer (#9313)
Browse files Browse the repository at this point in the history
* Refactor code indexer

* fix test

* fix test

* refactor code indexer

* fix import

* improve code

* fix typo

* fix test and make code clean

* fix lint
  • Loading branch information
lunny authored Dec 23, 2019
1 parent 2f9564f commit 89b4e04
Show file tree
Hide file tree
Showing 13 changed files with 649 additions and 638 deletions.
529 changes: 247 additions & 282 deletions modules/indexer/code/bleve.go

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions modules/indexer/code/bleve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,66 @@
package code

import (
"os"
"path/filepath"
"testing"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"

"github.com/stretchr/testify/assert"
)

func TestMain(m *testing.M) {
models.MainTest(m, filepath.Join("..", "..", ".."))
}

func TestIndexAndSearch(t *testing.T) {
models.PrepareTestEnv(t)

dir := "./bleve.index"
os.RemoveAll(dir)

setting.Indexer.RepoIndexerEnabled = true
idx, _, err := NewBleveIndexer(dir)
if err != nil {
idx.Close()
log.Fatal("indexer.Init: %v", err)
}

err = idx.Index(1)
assert.NoError(t, err)

var (
keywords = []struct {
Keyword string
IDs []int64
}{
{
Keyword: "Description",
IDs: []int64{1},
},
{
Keyword: "repo1",
IDs: []int64{1},
},
{
Keyword: "non-exist",
IDs: []int64{},
},
}
)

for _, kw := range keywords {
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.EqualValues(t, kw.IDs, ids)
}
}
147 changes: 147 additions & 0 deletions modules/indexer/code/git.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package code

import (
"strconv"
"strings"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

type fileUpdate struct {
Filename string
BlobSha string
}

// repoChanges changes (file additions/updates/removals) to a repo
type repoChanges struct {
Updates []fileUpdate
RemovedFilenames []string
}

func getDefaultBranchSha(repo *models.Repository) (string, error) {
stdout, err := git.NewCommand("show-ref", "-s", git.BranchPrefix+repo.DefaultBranch).RunInDir(repo.RepoPath())
if err != nil {
return "", err
}
return strings.TrimSpace(stdout), nil
}

// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *models.Repository, revision string) (*repoChanges, error) {
if err := repo.GetIndexerStatus(); err != nil {
return nil, err
}

if len(repo.IndexerStatus.CommitSha) == 0 {
return genesisChanges(repo, revision)
}
return nonGenesisChanges(repo, revision)
}

func isIndexable(entry *git.TreeEntry) bool {
if !entry.IsRegular() && !entry.IsExecutable() {
return false
}
name := strings.ToLower(entry.Name())
for _, g := range setting.Indexer.ExcludePatterns {
if g.Match(name) {
return false
}
}
for _, g := range setting.Indexer.IncludePatterns {
if g.Match(name) {
return true
}
}
return len(setting.Indexer.IncludePatterns) == 0
}

// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
func parseGitLsTreeOutput(stdout []byte) ([]fileUpdate, error) {
entries, err := git.ParseTreeEntries(stdout)
if err != nil {
return nil, err
}
var idxCount = 0
updates := make([]fileUpdate, len(entries))
for _, entry := range entries {
if isIndexable(entry) {
updates[idxCount] = fileUpdate{
Filename: entry.Name(),
BlobSha: entry.ID.String(),
}
idxCount++
}
}
return updates[:idxCount], nil
}

// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
var changes repoChanges
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
RunInDirBytes(repo.RepoPath())
if err != nil {
return nil, err
}
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
}

// nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
diffCmd := git.NewCommand("diff", "--name-status",
repo.IndexerStatus.CommitSha, revision)
stdout, err := diffCmd.RunInDir(repo.RepoPath())
if err != nil {
// previous commit sha may have been removed by a force push, so
// try rebuilding from scratch
log.Warn("git diff: %v", err)
if err = indexer.Delete(repo.ID); err != nil {
return nil, err
}
return genesisChanges(repo, revision)
}
var changes repoChanges
updatedFilenames := make([]string, 0, 10)
for _, line := range strings.Split(stdout, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
filename := strings.TrimSpace(line[1:])
if len(filename) == 0 {
continue
} else if filename[0] == '"' {
filename, err = strconv.Unquote(filename)
if err != nil {
return nil, err
}
}

switch status := line[0]; status {
case 'M', 'A':
updatedFilenames = append(updatedFilenames, filename)
case 'D':
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
default:
log.Warn("Unrecognized status: %c (line=%s)", status, line)
}
}

cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
cmd.AddArguments(updatedFilenames...)
lsTreeStdout, err := cmd.RunInDirBytes(repo.RepoPath())
if err != nil {
return nil, err
}
changes.Updates, err = parseGitLsTreeOutput(lsTreeStdout)
return &changes, err
}
107 changes: 54 additions & 53 deletions modules/indexer/code/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,72 +5,73 @@
package code

import (
"os"
"strconv"
"time"

"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
var (
indexer Indexer
)

// indexerID a bleve-compatible unique identifier for an integer id
func indexerID(id int64) string {
return strconv.FormatInt(id, 36)
// SearchResult result of performing a search in a repo
type SearchResult struct {
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
}

// numericEqualityQuery a numeric equality query for the given value and field
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
f := float64(value)
tru := true
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
q.SetField(field)
return q
// Indexer defines an interface to indexer issues contents
type Indexer interface {
Index(repoID int64) error
Delete(repoID int64) error
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
Close()
}

const unicodeNormalizeName = "unicodeNormalize"
// Init initialize the repo indexer
func Init() {
if !setting.Indexer.RepoIndexerEnabled {
return
}

func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
waitChannel := make(chan time.Duration)
go func() {
start := time.Now()
log.Info("Initializing Repository Indexer")
var created bool
var err error
indexer, created, err = NewBleveIndexer(setting.Indexer.RepoPath)
if err != nil {
indexer.Close()
log.Fatal("indexer.Init: %v", err)
}

const maxBatchSize = 16
go processRepoIndexerOperationQueue(indexer)

// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(setting.Indexer.IssuePath)
if err != nil && os.IsNotExist(err) {
return nil, nil
} else if err != nil {
return nil, err
}
if created {
go populateRepoIndexer()
}

metadata, err := rupture.ReadIndexMetadata(path)
if err != nil {
return nil, err
}
if metadata.Version < latestVersion {
// the indexer is using a previous version, so we should delete it and
// re-populate
return nil, os.RemoveAll(path)
}
waitChannel <- time.Since(start)
}()

index, err := bleve.Open(path)
if err != nil && err == upsidedown.IncompatibleVersion {
// the indexer was built with a previous version of bleve, so we should
// delete it and re-populate
return nil, os.RemoveAll(path)
} else if err != nil {
return nil, err
if setting.Indexer.StartupTimeout > 0 {
go func() {
timeout := setting.Indexer.StartupTimeout
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
timeout += setting.GracefulHammerTime
}
select {
case duration := <-waitChannel:
log.Info("Repository Indexer Initialization took %v", duration)
case <-time.After(timeout):
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
}
}()
}
return index, nil
}
Loading

0 comments on commit 89b4e04

Please sign in to comment.