Refactor iterate git tree

This commit is contained in:
Lunny Xiao 2024-05-25 17:18:19 +08:00
parent ec771fdfcd
commit c7359599c7
No known key found for this signature in database
GPG Key ID: C3B7C91B632F738A
6 changed files with 105 additions and 46 deletions

View File

@ -5,6 +5,7 @@ package actions
import ( import (
"bytes" "bytes"
"context"
"io" "io"
"strings" "strings"
@ -55,7 +56,7 @@ func ListWorkflows(commit *git.Commit) (git.Entries, error) {
return nil, err return nil, err
} }
entries, err := tree.ListEntriesRecursiveFast() entries, err := tree.ListEntriesRecursiveFast(context.Background())
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -24,8 +24,15 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
var sepSpace = []byte{' '} var sepSpace = []byte{' '}
func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) { func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
var err error
entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1) entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)
return entries, iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
entries = append(entries, entry)
return nil
})
}
func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
var err error
for pos := 0; pos < len(data); { for pos := 0; pos < len(data); {
// expect line to be of the form: // expect line to be of the form:
// <mode> <type> <sha> <space-padded-size>\t<filename> // <mode> <type> <sha> <space-padded-size>\t<filename>
@ -39,7 +46,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
line := data[pos:posEnd] line := data[pos:posEnd]
posTab := bytes.IndexByte(line, '\t') posTab := bytes.IndexByte(line, '\t')
if posTab == -1 { if posTab == -1 {
return nil, fmt.Errorf("invalid ls-tree output (no tab): %q", line) return fmt.Errorf("invalid ls-tree output (no tab): %q", line)
} }
entry := new(TreeEntry) entry := new(TreeEntry)
@ -69,27 +76,29 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
case "040000", "040755": // git uses 040000 for tree object, but some users may get 040755 for unknown reasons case "040000", "040755": // git uses 040000 for tree object, but some users may get 040755 for unknown reasons
entry.entryMode = EntryModeTree entry.entryMode = EntryModeTree
default: default:
return nil, fmt.Errorf("unknown type: %v", string(entryMode)) return fmt.Errorf("unknown type: %v", string(entryMode))
} }
entry.ID, err = NewIDFromString(string(entryObjectID)) entry.ID, err = NewIDFromString(string(entryObjectID))
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err) return fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err)
} }
if len(entryName) > 0 && entryName[0] == '"' { if len(entryName) > 0 && entryName[0] == '"' {
entry.name, err = strconv.Unquote(string(entryName)) entry.name, err = strconv.Unquote(string(entryName))
if err != nil { if err != nil {
return nil, fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err) return fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err)
} }
} else { } else {
entry.name = string(entryName) entry.name = string(entryName)
} }
pos = posEnd + 1 pos = posEnd + 1
entries = append(entries, entry) if err := f(entry); err != nil {
return err
}
} }
return entries, nil return nil
} }
func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) { func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {

View File

@ -54,11 +54,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
tree := commit.Tree tree := commit.Tree
entries, err := tree.ListEntriesRecursiveWithSize()
if err != nil {
return nil, err
}
checker, deferable := repo.CheckAttributeReader(commitID) checker, deferable := repo.CheckAttributeReader(commitID)
defer deferable() defer deferable()
@ -74,10 +69,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
firstExcludedLanguage := "" firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0) firstExcludedLanguageSize := int64(0)
for _, f := range entries { if err := tree.IterateEntriesWithSize(func(f *TreeEntry) error {
select { select {
case <-repo.Ctx.Done(): case <-repo.Ctx.Done():
return sizes, repo.Ctx.Err() return repo.Ctx.Err()
default: default:
} }
@ -85,7 +80,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
content = contentBuf.Bytes() content = contentBuf.Bytes()
if f.Size() == 0 { if f.Size() == 0 {
continue return nil
} }
isVendored := optional.None[bool]() isVendored := optional.None[bool]()
@ -98,22 +93,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
if err == nil { if err == nil {
isVendored = AttributeToBool(attrs, AttributeLinguistVendored) isVendored = AttributeToBool(attrs, AttributeLinguistVendored)
if isVendored.ValueOrDefault(false) { if isVendored.ValueOrDefault(false) {
continue return nil
} }
isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated) isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated)
if isGenerated.ValueOrDefault(false) { if isGenerated.ValueOrDefault(false) {
continue return nil
} }
isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation) isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation)
if isDocumentation.ValueOrDefault(false) { if isDocumentation.ValueOrDefault(false) {
continue return nil
} }
isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable) isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable)
if !isDetectable.ValueOrDefault(true) { if !isDetectable.ValueOrDefault(true) {
continue return nil
} }
hasLanguage := TryReadLanguageAttribute(attrs) hasLanguage := TryReadLanguageAttribute(attrs)
@ -128,7 +123,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
// this language will always be added to the size // this language will always be added to the size
sizes[language] += f.Size() sizes[language] += f.Size()
continue return nil
} }
} }
} }
@ -137,19 +132,19 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
enry.IsDotFile(f.Name()) || enry.IsDotFile(f.Name()) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) || (!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
enry.IsConfiguration(f.Name()) { enry.IsConfiguration(f.Name()) {
continue return nil
} }
// If content can not be read or file is too big just do detection by filename // If content can not be read or file is too big just do detection by filename
if f.Size() <= bigFileSize { if f.Size() <= bigFileSize {
if err := writeID(f.ID.String()); err != nil { if err := writeID(f.ID.String()); err != nil {
return nil, err return err
} }
_, _, size, err := ReadBatchLine(batchReader) _, _, size, err := ReadBatchLine(batchReader)
if err != nil { if err != nil {
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err) log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err return err
} }
sizeToRead := size sizeToRead := size
@ -161,22 +156,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead)) _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil { if err != nil {
return nil, err return err
} }
content = contentBuf.Bytes() content = contentBuf.Bytes()
if err := DiscardFull(batchReader, discard); err != nil { if err := DiscardFull(batchReader, discard); err != nil {
return nil, err return err
} }
} }
if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) { if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) {
continue return nil
} }
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content. // - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content) language := analyze.GetCodeLanguage(f.Name(), content)
if language == "" { if language == "" {
continue return nil
} }
// group languages, such as Pug -> HTML; SCSS -> CSS // group languages, such as Pug -> HTML; SCSS -> CSS
@ -197,6 +192,9 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
firstExcludedLanguage = language firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size() firstExcludedLanguageSize += f.Size()
} }
return nil
}); err != nil {
return sizes, err
} }
// If there are no included languages add the first excluded language // If there are no included languages add the first excluded language

View File

@ -6,6 +6,8 @@
package git package git
import ( import (
"bufio"
"context"
"io" "io"
"strings" "strings"
) )
@ -88,34 +90,83 @@ func (t *Tree) ListEntries() (Entries, error) {
// listEntriesRecursive returns all entries of current tree recursively including all subtrees // listEntriesRecursive returns all entries of current tree recursively including all subtrees
// extraArgs could be "-l" to get the size, which is slower // extraArgs could be "-l" to get the size, which is slower
func (t *Tree) listEntriesRecursive(extraArgs TrustedCmdArgs) (Entries, error) { func (t *Tree) listEntriesRecursive(ctx context.Context, extraArgs TrustedCmdArgs) (Entries, error) {
if t.entriesRecursiveParsed { if t.entriesRecursiveParsed {
return t.entriesRecursive, nil return t.entriesRecursive, nil
} }
stdout, _, runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r"). t.entriesRecursive = make([]*TreeEntry, 0)
AddArguments(extraArgs...). err := t.iterateEntriesRecursive(func(entry *TreeEntry) error {
AddDynamicArguments(t.ID.String()). select {
RunStdBytes(&RunOpts{Dir: t.repo.Path}) case <-ctx.Done():
if runErr != nil { return ctx.Err()
return nil, runErr default:
} }
t.entriesRecursive = append(t.entriesRecursive, entry)
var err error return nil
t.entriesRecursive, err = parseTreeEntries(stdout, t) }, extraArgs)
if err == nil { if err == nil {
t.entriesRecursiveParsed = true t.entriesRecursiveParsed = true
} }
return t.entriesRecursive, err return t.entriesRecursive, err
} }
// ListEntriesRecursiveFast returns all entries of current tree recursively including all subtrees, no size // ListEntriesRecursiveFast returns all entries of current tree recursively including all subtrees, no size
func (t *Tree) ListEntriesRecursiveFast() (Entries, error) { func (t *Tree) ListEntriesRecursiveFast(ctx context.Context) (Entries, error) {
return t.listEntriesRecursive(nil) return t.listEntriesRecursive(ctx, nil)
} }
// ListEntriesRecursiveWithSize returns all entries of current tree recursively including all subtrees, with size // ListEntriesRecursiveWithSize returns all entries of current tree recursively including all subtrees, with size
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) { func (t *Tree) ListEntriesRecursiveWithSize(ctx context.Context) (Entries, error) {
return t.listEntriesRecursive(TrustedCmdArgs{"--long"}) return t.listEntriesRecursive(ctx, TrustedCmdArgs{"--long"})
}
// iterateEntriesRecursive streams the output of `git ls-tree -t -r` for this tree and
// calls f once for every parsed entry, in the order git prints them, without first
// collecting the whole listing in memory.
//
// extraArgs could be "-l" (or "--long") to get the size, which is slower.
//
// The entry cache (t.entriesRecursive / t.entriesRecursiveParsed) is deliberately left
// untouched here: this function never stores the entries it yields, so marking the tree
// as parsed would make later cached lookups see an empty list. Callers that build the
// cache are responsible for setting the flag themselves.
//
// It returns the first error produced by f, by parsing a line, by reading the command
// output, or by the git command itself; nil when every entry was delivered.
func (t *Tree) iterateEntriesRecursive(f func(entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
	reader, writer := io.Pipe()

	// Buffered so the producer goroutine can always hand over its result and exit,
	// whatever path this function takes.
	done := make(chan error, 1)
	go func() {
		runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r").
			AddArguments(extraArgs...).
			AddDynamicArguments(t.ID.String()).
			Run(&RunOpts{
				Dir:    t.repo.Path,
				Stdout: writer,
			})
		// A nil error closes the pipe with io.EOF; a failure is surfaced to the
		// reading side instead of looking like a clean end of output.
		_ = writer.CloseWithError(runErr)
		done <- runErr
	}()

	iterErr := func() error {
		scanner := bufio.NewScanner(reader)
		for scanner.Scan() {
			// Each token is one ls-tree line (without the trailing newline).
			if err := iterateTreeEntries(scanner.Bytes(), t, f); err != nil {
				return err
			}
		}
		// Scan reports failures only through Err once it has returned false.
		return scanner.Err()
	}()

	// Closing the read side makes any further pipe write fail, so a producer that
	// still has output pending is not left blocked after an early stop.
	_ = reader.Close()

	// Wait for the goroutine so it never outlives this call.
	runErr := <-done
	if iterErr != nil {
		return iterErr
	}
	return runErr
}
func (t *Tree) IterateEntriesWithSize(f func(*TreeEntry) error) error {
return t.iterateEntriesRecursive(f, TrustedCmdArgs{"--long"})
} }

View File

@ -21,7 +21,7 @@ func TreeList(ctx *context.Context) {
return return
} }
entries, err := tree.ListEntriesRecursiveFast() entries, err := tree.ListEntriesRecursiveFast(ctx)
if err != nil { if err != nil {
ctx.ServerError("ListEntriesRecursiveFast", err) ctx.ServerError("ListEntriesRecursiveFast", err)
return return

View File

@ -28,7 +28,7 @@ func GetTreeBySHA(ctx context.Context, repo *repo_model.Repository, gitRepo *git
tree.URL = repo.APIURL() + "/git/trees/" + url.PathEscape(tree.SHA) tree.URL = repo.APIURL() + "/git/trees/" + url.PathEscape(tree.SHA)
var entries git.Entries var entries git.Entries
if recursive { if recursive {
entries, err = gitTree.ListEntriesRecursiveWithSize() entries, err = gitTree.ListEntriesRecursiveWithSize(ctx)
} else { } else {
entries, err = gitTree.ListEntries() entries, err = gitTree.ListEntries()
} }