From c7359599c78b53d2ed07a35ef3847a706811f52a Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Sat, 25 May 2024 17:18:19 +0800 Subject: [PATCH] Refactor iterate git tree --- modules/actions/workflows.go | 3 +- modules/git/parse_nogogit.go | 23 ++++-- modules/git/repo_language_stats_nogogit.go | 38 +++++----- modules/git/tree_nogogit.go | 83 +++++++++++++++++----- routers/web/repo/treelist.go | 2 +- services/repository/files/tree.go | 2 +- 6 files changed, 105 insertions(+), 46 deletions(-) diff --git a/modules/actions/workflows.go b/modules/actions/workflows.go index 0d2b0dd919..1edf8498aa 100644 --- a/modules/actions/workflows.go +++ b/modules/actions/workflows.go @@ -5,6 +5,7 @@ package actions import ( "bytes" + "context" "io" "strings" @@ -55,7 +56,7 @@ func ListWorkflows(commit *git.Commit) (git.Entries, error) { return nil, err } - entries, err := tree.ListEntriesRecursiveFast() + entries, err := tree.ListEntriesRecursiveFast(context.Background()) if err != nil { return nil, err } diff --git a/modules/git/parse_nogogit.go b/modules/git/parse_nogogit.go index 546b38be37..619d86a72a 100644 --- a/modules/git/parse_nogogit.go +++ b/modules/git/parse_nogogit.go @@ -24,8 +24,15 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) { var sepSpace = []byte{' '} func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) { - var err error entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1) + return entries, iterateTreeEntries(data, ptree, func(entry *TreeEntry) error { + entries = append(entries, entry) + return nil + }) +} + +func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error { + var err error for pos := 0; pos < len(data); { // expect line to be of the form: // \t @@ -39,7 +46,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) { line := data[pos:posEnd] posTab := bytes.IndexByte(line, '\t') if posTab == -1 { - return nil, fmt.Errorf("invalid ls-tree output (no tab): %q", line) + return fmt.Errorf("invalid ls-tree output (no tab): %q", line) } entry := new(TreeEntry) @@ -69,27 +76,29 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) { case "040000", "040755": // git uses 040000 for tree object, but some users may get 040755 for unknown reasons entry.entryMode = EntryModeTree default: - return nil, fmt.Errorf("unknown type: %v", string(entryMode)) + return fmt.Errorf("unknown type: %v", string(entryMode)) } entry.ID, err = NewIDFromString(string(entryObjectID)) if err != nil { - return nil, fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err) + return fmt.Errorf("invalid ls-tree output (invalid object id): %q, err: %w", line, err) } if len(entryName) > 0 && entryName[0] == '"' { entry.name, err = strconv.Unquote(string(entryName)) if err != nil { - return nil, fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err) + return fmt.Errorf("invalid ls-tree output (invalid name): %q, err: %w", line, err) } } else { entry.name = string(entryName) } pos = posEnd + 1 - entries = append(entries, entry) + if err := f(entry); err != nil { + return err + } } - return entries, nil + return nil } func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) { diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 318fc091ce..5cb51aef76 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -54,11 +54,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err tree := commit.Tree - entries, err := tree.ListEntriesRecursiveWithSize() - if err != nil { - return nil, err - } - checker, deferable := repo.CheckAttributeReader(commitID) defer deferable() @@ -74,10 +69,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err firstExcludedLanguage := "" firstExcludedLanguageSize := int64(0) - for _, f := range entries { + if err := tree.IterateEntriesWithSize(func(f *TreeEntry) error { select { case <-repo.Ctx.Done(): - return sizes, repo.Ctx.Err() + return repo.Ctx.Err() default: } @@ -85,7 +80,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err content = contentBuf.Bytes() if f.Size() == 0 { - continue + return nil } isVendored := optional.None[bool]() @@ -98,22 +93,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err if err == nil { isVendored = AttributeToBool(attrs, AttributeLinguistVendored) if isVendored.ValueOrDefault(false) { - continue + return nil } isGenerated = AttributeToBool(attrs, AttributeLinguistGenerated) if isGenerated.ValueOrDefault(false) { - continue + return nil } isDocumentation = AttributeToBool(attrs, AttributeLinguistDocumentation) if isDocumentation.ValueOrDefault(false) { - continue + return nil } isDetectable = AttributeToBool(attrs, AttributeLinguistDetectable) if !isDetectable.ValueOrDefault(true) { - continue + return nil } hasLanguage := TryReadLanguageAttribute(attrs) @@ -128,7 +123,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err // this language will always be added to the size sizes[language] += f.Size() - continue + return nil } } } @@ -137,19 +132,19 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err enry.IsDotFile(f.Name()) || (!isDocumentation.Has() && enry.IsDocumentation(f.Name())) || enry.IsConfiguration(f.Name()) { - continue + return nil } // If content can not be read or file is too big just do detection by filename if f.Size() <= bigFileSize { if err := writeID(f.ID.String()); err != nil { - return nil, err + return err } _, _, size, err := ReadBatchLine(batchReader) if err != nil { log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err) - return nil, err + return err } sizeToRead := size @@ -161,22 +156,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err _, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead)) if err != nil { - return nil, err + return err } content = contentBuf.Bytes() if err := DiscardFull(batchReader, discard); err != nil { - return nil, err + return err } } if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) { - continue + return nil } // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? // - eg. do the all the detection tests using filename first before reading content. language := analyze.GetCodeLanguage(f.Name(), content) if language == "" { - continue + return nil } // group languages, such as Pug -> HTML; SCSS -> CSS @@ -197,6 +192,9 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err firstExcludedLanguage = language firstExcludedLanguageSize += f.Size() } + return nil + }); err != nil { + return sizes, err } // If there are no included languages add the first excluded language diff --git a/modules/git/tree_nogogit.go b/modules/git/tree_nogogit.go index e0a72de5b8..84283f73ed 100644 --- a/modules/git/tree_nogogit.go +++ b/modules/git/tree_nogogit.go @@ -6,6 +6,8 @@ package git import ( + "bufio" + "context" "io" "strings" ) @@ -88,34 +90,83 @@ func (t *Tree) ListEntries() (Entries, error) { // listEntriesRecursive returns all entries of current tree recursively including all subtrees // extraArgs could be "-l" to get the size, which is slower -func (t *Tree) listEntriesRecursive(extraArgs TrustedCmdArgs) (Entries, error) { +func (t *Tree) listEntriesRecursive(ctx context.Context, extraArgs TrustedCmdArgs) (Entries, error) { if t.entriesRecursiveParsed { return t.entriesRecursive, nil } - stdout, _, runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r"). - AddArguments(extraArgs...). - AddDynamicArguments(t.ID.String()). - RunStdBytes(&RunOpts{Dir: t.repo.Path}) - if runErr != nil { - return nil, runErr - } - - var err error - t.entriesRecursive, err = parseTreeEntries(stdout, t) + t.entriesRecursive = make([]*TreeEntry, 0) + err := t.iterateEntriesRecursive(func(entry *TreeEntry) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + t.entriesRecursive = append(t.entriesRecursive, entry) + return nil + }, extraArgs) if err == nil { t.entriesRecursiveParsed = true } - return t.entriesRecursive, err } // ListEntriesRecursiveFast returns all entries of current tree recursively including all subtrees, no size -func (t *Tree) ListEntriesRecursiveFast() (Entries, error) { - return t.listEntriesRecursive(nil) +func (t *Tree) ListEntriesRecursiveFast(ctx context.Context) (Entries, error) { + return t.listEntriesRecursive(ctx, nil) } // ListEntriesRecursiveWithSize returns all entries of current tree recursively including all subtrees, with size -func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) { - return t.listEntriesRecursive(TrustedCmdArgs{"--long"}) +func (t *Tree) ListEntriesRecursiveWithSize(ctx context.Context) (Entries, error) { + return t.listEntriesRecursive(ctx, TrustedCmdArgs{"--long"}) +} + +// iterateEntriesRecursive returns iterate entries of current tree recursively including all subtrees +// extraArgs could be "-l" to get the size, which is slower +func (t *Tree) iterateEntriesRecursive(f func(entry *TreeEntry) error, extraArgs TrustedCmdArgs) error { + if t.entriesRecursiveParsed { + return nil + } + + reader, writer := io.Pipe() + done := make(chan error) + + go func(done chan error, writer *io.PipeWriter, reader *io.PipeReader) { + runErr := NewCommand(t.repo.Ctx, "ls-tree", "-t", "-r"). + AddArguments(extraArgs...). + AddDynamicArguments(t.ID.String()). + Run(&RunOpts{ + Dir: t.repo.Path, + Stdout: writer, + }) + + _ = writer.Close() + _ = reader.Close() + + done <- runErr + }(done, writer, reader) + + scanner := bufio.NewScanner(reader) + for scanner.Scan() { + if err := scanner.Err(); err != nil { + return err + } + data := scanner.Bytes() + if err := iterateTreeEntries(data, t, func(entry *TreeEntry) error { + select { + case runErr := <-done: + return runErr + default: + return f(entry) + } + }); err != nil { + return err + } + } + t.entriesRecursiveParsed = true + return nil +} + +func (t *Tree) IterateEntriesWithSize(f func(*TreeEntry) error) error { + return t.iterateEntriesRecursive(f, TrustedCmdArgs{"--long"}) } diff --git a/routers/web/repo/treelist.go b/routers/web/repo/treelist.go index d11af4669f..3626081e28 100644 --- a/routers/web/repo/treelist.go +++ b/routers/web/repo/treelist.go @@ -21,7 +21,7 @@ func TreeList(ctx *context.Context) { return } - entries, err := tree.ListEntriesRecursiveFast() + entries, err := tree.ListEntriesRecursiveFast(ctx) if err != nil { ctx.ServerError("ListEntriesRecursiveFast", err) return diff --git a/services/repository/files/tree.go b/services/repository/files/tree.go index e3a7f3b8b0..eaffd21625 100644 --- a/services/repository/files/tree.go +++ b/services/repository/files/tree.go @@ -28,7 +28,7 @@ func GetTreeBySHA(ctx context.Context, repo *repo_model.Repository, gitRepo *git tree.URL = repo.APIURL() + "/git/trees/" + url.PathEscape(tree.SHA) var entries git.Entries if recursive { - entries, err = gitTree.ListEntriesRecursiveWithSize() + entries, err = gitTree.ListEntriesRecursiveWithSize(ctx) } else { entries, err = gitTree.ListEntries() }