-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve sync performance for pull-mirrors (#19125)
This addresses #18352. It aims to improve performance (and resource use) of the `SyncReleasesWithTags` operation for pull-mirrors. For large repositories with many tags, `SyncReleasesWithTags` can be a costly operation (taking several minutes to complete). The reason is two-fold: 1. on sync, every upstream repo tag is compared (for changes) against existing local entries in the release table to ensure that they are up-to-date. 2. the procedure for getting _each tag_ involves a series of git operations ```bash git show-ref --tags -- v8.2.4477 git cat-file -t 29ab6ce9f36660cffaad3c8789e71162e5db5d2f git cat-file -p 29ab6ce9f36660cffaad3c8789e71162e5db5d2f git rev-list --count 29ab6ce9f36660cffaad3c8789e71162e5db5d2f ``` of which the `git rev-list --count` can be particularly heavy. This PR optimizes performance for pull-mirrors. We utilize the fact that a pull-mirror is always identical to its upstream and rebuild the entire release table on every sync and use a batch `git for-each-ref .. refs/tags` call to retrieve all tags in one go. For large mirror repos, with hundreds of annotated tags, this brings down the duration of the sync operation from several minutes to a few seconds. A few unscientific examples run on my local machine: - https://github.com/spring-projects/spring-boot (223 tags) - before: `0m28,673s` - after: `0m2,244s` - https://github.com/kubernetes/kubernetes (890 tags) - before: `8m00s` - after: `0m8,520s` - https://github.com/vim/vim (13954 tags) - before: `14m20,383s` - after: `0m35,467s` I added a `foreachref` package which contains a flexible way of specifying which reference fields are of interest (`git-for-each-ref(1)`) and to produce a parser for the expected output. These could be reused in other places where `for-each-ref` is used. I'll add unit tests for those if the overall PR looks promising.
- Loading branch information
1 parent
b877504
commit e28cc79
Showing
7 changed files
with
834 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package foreachref | ||
|
||
import ( | ||
"encoding/hex" | ||
"fmt" | ||
"io" | ||
"strings" | ||
) | ||
|
||
// nullChar is a single NUL byte. NewFormat uses it as the delimiter between
// the fields of one reference.
var nullChar = []byte("\x00")

// dualNullChar is two consecutive NUL bytes. NewFormat uses it as the
// delimiter between references.
var dualNullChar = []byte("\x00\x00")
|
||
// Format describes an output format for 'git for-each-ref': which fields are
// requested and which byte sequences delimit them. It can both render the
// matching '--format' flag value and produce a parser for the resulting
// output. See git-for-each-ref(1) for available fields.
type Format struct {
	// fieldNames lists the field names that are emitted as %(fieldname) in
	// the '--format' flag of for-each-ref. See git-for-each-ref(1) for
	// available fields.
	fieldNames []string

	// fieldDelim separates the fields belonging to a single reference. It
	// must not collide with refDelim and must not occur in field values.
	fieldDelim []byte
	// fieldDelimStr is fieldDelim as a string, kept alongside so the
	// delimiter does not have to be converted every time a string is needed.
	fieldDelimStr string
	// refDelim separates one reference from the next in the output. It must
	// not collide with fieldDelim and must not occur in field values.
	refDelim []byte
}
|
||
// NewFormat creates a forEachRefFormat using the specified fieldNames. See | ||
// git-for-each-ref(1) for available fields. | ||
func NewFormat(fieldNames ...string) Format { | ||
return Format{ | ||
fieldNames: fieldNames, | ||
fieldDelim: nullChar, | ||
fieldDelimStr: string(nullChar), | ||
refDelim: dualNullChar, | ||
} | ||
} | ||
|
||
// Flag returns a for-each-ref --format flag value that captures the fieldNames. | ||
func (f Format) Flag() string { | ||
var formatFlag strings.Builder | ||
for i, field := range f.fieldNames { | ||
// field key and field value | ||
formatFlag.WriteString(fmt.Sprintf("%s %%(%s)", field, field)) | ||
|
||
if i < len(f.fieldNames)-1 { | ||
// note: escape delimiters to allow control characters as | ||
// delimiters. For example, '%00' for null character or '%0a' | ||
// for newline. | ||
formatFlag.WriteString(f.hexEscaped(f.fieldDelim)) | ||
} | ||
} | ||
formatFlag.WriteString(f.hexEscaped(f.refDelim)) | ||
return formatFlag.String() | ||
} | ||
|
||
// Parser returns a Parser capable of parsing 'git for-each-ref' output produced | ||
// with this Format. | ||
func (f Format) Parser(r io.Reader) *Parser { | ||
return NewParser(r, f) | ||
} | ||
|
||
// hexEscaped produces hex-escpaed characters from a string. For example, "\n\0" | ||
// would turn into "%0a%00". | ||
func (f Format) hexEscaped(delim []byte) string { | ||
escaped := "" | ||
for i := 0; i < len(delim); i++ { | ||
escaped += "%" + hex.EncodeToString([]byte{delim[i]}) | ||
} | ||
return escaped | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package foreachref_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"code.gitea.io/gitea/modules/git/foreachref" | ||
|
||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestFormat_Flag(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
|
||
givenFormat foreachref.Format | ||
|
||
wantFlag string | ||
}{ | ||
{ | ||
name: "references are delimited by dual null chars", | ||
|
||
// no reference fields requested | ||
givenFormat: foreachref.NewFormat(), | ||
|
||
// only a reference delimiter field in --format | ||
wantFlag: "%00%00", | ||
}, | ||
|
||
{ | ||
name: "a field is a space-separated key-value pair", | ||
|
||
givenFormat: foreachref.NewFormat("refname:short"), | ||
|
||
// only a reference delimiter field | ||
wantFlag: "refname:short %(refname:short)%00%00", | ||
}, | ||
|
||
{ | ||
name: "fields are separated by a null char field-delimiter", | ||
|
||
givenFormat: foreachref.NewFormat("refname:short", "author"), | ||
|
||
wantFlag: "refname:short %(refname:short)%00author %(author)%00%00", | ||
}, | ||
|
||
{ | ||
name: "multiple fields", | ||
|
||
givenFormat: foreachref.NewFormat("refname:short", "objecttype", "objectname"), | ||
|
||
wantFlag: "refname:short %(refname:short)%00objecttype %(objecttype)%00objectname %(objectname)%00%00", | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
tc := test // don't close over loop variable | ||
t.Run(tc.name, func(t *testing.T) { | ||
gotFlag := tc.givenFormat.Flag() | ||
|
||
require.Equal(t, tc.wantFlag, gotFlag, "unexpected for-each-ref --format string. wanted: '%s', got: '%s'", tc.wantFlag, gotFlag) | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
// Copyright 2022 The Gitea Authors. All rights reserved. | ||
// Use of this source code is governed by a MIT-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package foreachref | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"strings" | ||
) | ||
|
||
// Parser parses 'git for-each-ref' output according to a given output Format. | ||
type Parser struct { | ||
// tokenizes 'git for-each-ref' output into "reference paragraphs". | ||
scanner *bufio.Scanner | ||
|
||
// format represents the '--format' string that describes the expected | ||
// 'git for-each-ref' output structure. | ||
format Format | ||
|
||
// err holds the last encountered error during parsing. | ||
err error | ||
} | ||
|
||
// NewParser creates a 'git for-each-ref' output parser that will parse all | ||
// references in the provided Reader. The references in the output are assumed | ||
// to follow the specified Format. | ||
func NewParser(r io.Reader, format Format) *Parser { | ||
scanner := bufio.NewScanner(r) | ||
|
||
// in addition to the reference delimiter we specified in the --format, | ||
// `git for-each-ref` will always add a newline after every reference. | ||
refDelim := make([]byte, 0, len(format.refDelim)+1) | ||
refDelim = append(refDelim, format.refDelim...) | ||
refDelim = append(refDelim, '\n') | ||
|
||
// Split input into delimiter-separated "reference blocks". | ||
scanner.Split( | ||
func(data []byte, atEOF bool) (advance int, token []byte, err error) { | ||
// Scan until delimiter, marking end of reference. | ||
delimIdx := bytes.Index(data, refDelim) | ||
if delimIdx >= 0 { | ||
token := data[:delimIdx] | ||
advance := delimIdx + len(refDelim) | ||
return advance, token, nil | ||
} | ||
// If we're at EOF, we have a final, non-terminated reference. Return it. | ||
if atEOF { | ||
return len(data), data, nil | ||
} | ||
// Not yet a full field. Request more data. | ||
return 0, nil, nil | ||
}) | ||
|
||
return &Parser{ | ||
scanner: scanner, | ||
format: format, | ||
err: nil, | ||
} | ||
} | ||
|
||
// Next returns the next reference as a collection of key-value pairs. nil | ||
// denotes EOF but is also returned on errors. The Err method should always be | ||
// consulted after Next returning nil. | ||
// | ||
// It could, for example return something like: | ||
// | ||
// { "objecttype": "tag", "refname:short": "v1.16.4", "object": "f460b7543ed500e49c133c2cd85c8c55ee9dbe27" } | ||
// | ||
func (p *Parser) Next() map[string]string { | ||
if !p.scanner.Scan() { | ||
return nil | ||
} | ||
fields, err := p.parseRef(p.scanner.Text()) | ||
if err != nil { | ||
p.err = err | ||
return nil | ||
} | ||
return fields | ||
} | ||
|
||
// Err returns the latest encountered parsing error. | ||
func (p *Parser) Err() error { | ||
return p.err | ||
} | ||
|
||
// parseRef parses out all key-value pairs from a single reference block, such as | ||
// | ||
// "objecttype tag\0refname:short v1.16.4\0object f460b7543ed500e49c133c2cd85c8c55ee9dbe27" | ||
// | ||
func (p *Parser) parseRef(refBlock string) (map[string]string, error) { | ||
if refBlock == "" { | ||
// must be at EOF | ||
return nil, nil | ||
} | ||
|
||
fieldValues := make(map[string]string) | ||
|
||
fields := strings.Split(refBlock, p.format.fieldDelimStr) | ||
if len(fields) != len(p.format.fieldNames) { | ||
return nil, fmt.Errorf("unexpected number of reference fields: wanted %d, was %d", | ||
len(fields), len(p.format.fieldNames)) | ||
} | ||
for i, field := range fields { | ||
field = strings.TrimSpace(field) | ||
|
||
var fieldKey string | ||
var fieldVal string | ||
firstSpace := strings.Index(field, " ") | ||
if firstSpace > 0 { | ||
fieldKey = field[:firstSpace] | ||
fieldVal = field[firstSpace+1:] | ||
} else { | ||
// could be the case if the requested field had no value | ||
fieldKey = field | ||
} | ||
|
||
// enforce the format order of fields | ||
if p.format.fieldNames[i] != fieldKey { | ||
return nil, fmt.Errorf("unexpected field name at position %d: wanted: '%s', was: '%s'", | ||
i, p.format.fieldNames[i], fieldKey) | ||
} | ||
|
||
fieldValues[fieldKey] = fieldVal | ||
} | ||
|
||
return fieldValues, nil | ||
} |
Oops, something went wrong.