From 7dcccc3bb19655a6f83dd495ffc332708d0c8678 Mon Sep 17 00:00:00 2001 From: Rowan Bohde Date: Fri, 1 Nov 2024 22:29:37 -0500 Subject: [PATCH] improve performance of diffs (#32393) This has two major changes that significantly reduce the amount of work done for large diffs: * Kill a running git process when reaching the maximum number of files in a diff, preventing it from processing the entire diff. * When loading a diff with the URL param `file-only=true`, skip loading stats. This speeds up loading both hidden files of a diff and sections of a diff when clicking the "Show More" button. A couple of minor things from profiling are also included: * Reuse existing repo in `PrepareViewPullInfo` if head and base are the same. The performance impact is going to depend heavily on the individual diff and the hardware it runs on, but when testing locally on a diff changing 100k+ lines over hundreds of files, I'm seeing a roughly 75% reduction in time to load the result of "Show More" --------- Co-authored-by: wxiaoguang --- routers/web/repo/commit.go | 1 + routers/web/repo/compare.go | 3 ++ routers/web/repo/pull.go | 7 +-- services/gitdiff/gitdiff.go | 62 +++++++++++--------------- services/repository/files/temp_repo.go | 13 ++++-- 5 files changed, 44 insertions(+), 42 deletions(-) diff --git a/routers/web/repo/commit.go b/routers/web/repo/commit.go index 0e4e10bf509..d7865e18d63 100644 --- a/routers/web/repo/commit.go +++ b/routers/web/repo/commit.go @@ -328,6 +328,7 @@ func Diff(ctx *context.Context) { MaxLineCharacters: setting.Git.MaxGitDiffLineCharacters, MaxFiles: maxFiles, WhitespaceBehavior: gitdiff.GetWhitespaceFlag(ctx.Data["WhitespaceBehavior"].(string)), + FileOnly: fileOnly, }, files...) if err != nil { ctx.NotFound("GetDiff", err) diff --git a/routers/web/repo/compare.go b/routers/web/repo/compare.go index e6a04782e5b..39279728375 100644 --- a/routers/web/repo/compare.go +++ b/routers/web/repo/compare.go @@ -611,6 +611,8 @@ func PrepareCompareDiff( maxLines, maxFiles = -1, -1 } + fileOnly := ctx.FormBool("file-only") + diff, err := gitdiff.GetDiff(ctx, ci.HeadGitRepo, &gitdiff.DiffOptions{ BeforeCommitID: beforeCommitID, @@ -621,6 +623,7 @@ func PrepareCompareDiff( MaxFiles: maxFiles, WhitespaceBehavior: whitespaceBehavior, DirectComparison: ci.DirectComparison, + FileOnly: fileOnly, }, ctx.FormStrings("files")...) if err != nil { ctx.ServerError("GetDiffRangeWithWhitespaceBehavior", err) diff --git a/routers/web/repo/pull.go b/routers/web/repo/pull.go index 0efe1be76a3..cc554a71ff6 100644 --- a/routers/web/repo/pull.go +++ b/routers/web/repo/pull.go @@ -395,12 +395,12 @@ func PrepareViewPullInfo(ctx *context.Context, issue *issues_model.Issue) *git.C var headBranchSha string // HeadRepo may be missing if pull.HeadRepo != nil { - headGitRepo, err := gitrepo.OpenRepository(ctx, pull.HeadRepo) + headGitRepo, closer, err := gitrepo.RepositoryFromContextOrOpen(ctx, pull.HeadRepo) if err != nil { - ctx.ServerError("OpenRepository", err) + ctx.ServerError("RepositoryFromContextOrOpen", err) return nil } - defer headGitRepo.Close() + defer closer.Close() if pull.Flow == issues_model.PullRequestFlowGithub { headBranchExist = headGitRepo.IsBranchExist(pull.HeadBranch) @@ -747,6 +747,7 @@ func viewPullFiles(ctx *context.Context, specifiedStartCommit, specifiedEndCommi MaxLineCharacters: setting.Git.MaxGitDiffLineCharacters, MaxFiles: maxFiles, WhitespaceBehavior: gitdiff.GetWhitespaceFlag(ctx.Data["WhitespaceBehavior"].(string)), + FileOnly: fileOnly, } if !willShowSpecifiedCommit { diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 0ddd5a48e21..cf7da0707b8 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -380,18 +380,11 @@ func (diffFile *DiffFile) GetType() int { } // GetTailSection creates a fake DiffLineSection if the last section is not the end of the file -func (diffFile *DiffFile) GetTailSection(gitRepo *git.Repository, leftCommitID, rightCommitID string) *DiffSection { +func (diffFile *DiffFile) GetTailSection(gitRepo *git.Repository, leftCommit, rightCommit *git.Commit) *DiffSection { if len(diffFile.Sections) == 0 || diffFile.Type != DiffFileChange || diffFile.IsBin || diffFile.IsLFSFile { return nil } - leftCommit, err := gitRepo.GetCommit(leftCommitID) - if err != nil { - return nil - } - rightCommit, err := gitRepo.GetCommit(rightCommitID) - if err != nil { - return nil - } + lastSection := diffFile.Sections[len(diffFile.Sections)-1] lastLine := lastSection.Lines[len(lastSection.Lines)-1] leftLineCount := getCommitFileLineCount(leftCommit, diffFile.Name) @@ -536,11 +529,6 @@ parsingLoop: lastFile := createDiffFile(diff, line) diff.End = lastFile.Name diff.IsIncomplete = true - _, err := io.Copy(io.Discard, reader) - if err != nil { - // By the definition of io.Copy this never returns io.EOF - return diff, fmt.Errorf("error during io.Copy: %w", err) - } break parsingLoop } @@ -1101,6 +1089,7 @@ type DiffOptions struct { MaxFiles int WhitespaceBehavior git.TrustedCmdArgs DirectComparison bool + FileOnly bool } // GetDiff builds a Diff between two commits of a repository. @@ -1109,12 +1098,16 @@ type DiffOptions struct { func GetDiff(ctx context.Context, gitRepo *git.Repository, opts *DiffOptions, files ...string) (*Diff, error) { repoPath := gitRepo.Path + var beforeCommit *git.Commit commit, err := gitRepo.GetCommit(opts.AfterCommitID) if err != nil { return nil, err } - cmdDiff := git.NewCommand(gitRepo.Ctx) + cmdCtx, cmdCancel := context.WithCancel(ctx) + defer cmdCancel() + + cmdDiff := git.NewCommand(cmdCtx) objectFormat, err := gitRepo.GetObjectFormat() if err != nil { return nil, err @@ -1136,6 +1129,12 @@ func GetDiff(ctx context.Context, gitRepo *git.Repository, opts *DiffOptions, fi AddArguments(opts.WhitespaceBehavior...). AddDynamicArguments(actualBeforeCommitID, opts.AfterCommitID) opts.BeforeCommitID = actualBeforeCommitID + + var err error + beforeCommit, err = gitRepo.GetCommit(opts.BeforeCommitID) + if err != nil { + return nil, err + } } // In git 2.31, git diff learned --skip-to which we can use to shortcut skip to file @@ -1163,14 +1162,16 @@ func GetDiff(ctx context.Context, gitRepo *git.Repository, opts *DiffOptions, fi Dir: repoPath, Stdout: writer, Stderr: stderr, - }); err != nil { + }); err != nil && err.Error() != "signal: killed" { log.Error("error during GetDiff(git diff dir: %s): %v, stderr: %s", repoPath, err, stderr.String()) } _ = writer.Close() }() - diff, err := ParsePatch(ctx, opts.MaxLines, opts.MaxLineCharacters, opts.MaxFiles, reader, parsePatchSkipToFile) + diff, err := ParsePatch(cmdCtx, opts.MaxLines, opts.MaxLineCharacters, opts.MaxFiles, reader, parsePatchSkipToFile) + // Ensure the git process is killed if it didn't exit already + cmdCancel() if err != nil { return nil, fmt.Errorf("unable to ParsePatch: %w", err) } @@ -1205,37 +1206,28 @@ func GetDiff(ctx context.Context, gitRepo *git.Repository, opts *DiffOptions, fi } diffFile.IsGenerated = isGenerated.Value() - tailSection := diffFile.GetTailSection(gitRepo, opts.BeforeCommitID, opts.AfterCommitID) + tailSection := diffFile.GetTailSection(gitRepo, beforeCommit, commit) if tailSection != nil { diffFile.Sections = append(diffFile.Sections, tailSection) } } - separator := "..." - if opts.DirectComparison { - separator = ".." + if opts.FileOnly { + return diff, nil } - diffPaths := []string{opts.BeforeCommitID + separator + opts.AfterCommitID} - if len(opts.BeforeCommitID) == 0 || opts.BeforeCommitID == objectFormat.EmptyObjectID().String() { - diffPaths = []string{objectFormat.EmptyTree().String(), opts.AfterCommitID} - } - diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) - if err != nil && strings.Contains(err.Error(), "no merge base") { - // git >= 2.28 now returns an error if base and head have become unrelated. - // previously it would return the results of git diff --shortstat base head so let's try that... - diffPaths = []string{opts.BeforeCommitID, opts.AfterCommitID} - diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) - } + stats, err := GetPullDiffStats(gitRepo, opts) if err != nil { return nil, err } + diff.NumFiles, diff.TotalAddition, diff.TotalDeletion = stats.NumFiles, stats.TotalAddition, stats.TotalDeletion + return diff, nil } type PullDiffStats struct { - TotalAddition, TotalDeletion int + NumFiles, TotalAddition, TotalDeletion int } // GetPullDiffStats @@ -1259,12 +1251,12 @@ func GetPullDiffStats(gitRepo *git.Repository, opts *DiffOptions) (*PullDiffStat diffPaths = []string{objectFormat.EmptyTree().String(), opts.AfterCommitID} } - _, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) + diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) if err != nil && strings.Contains(err.Error(), "no merge base") { // git >= 2.28 now returns an error if base and head have become unrelated. // previously it would return the results of git diff --shortstat base head so let's try that... diffPaths = []string{opts.BeforeCommitID, opts.AfterCommitID} - _, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) + diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(gitRepo.Ctx, repoPath, nil, diffPaths...) } if err != nil { return nil, err diff --git a/services/repository/files/temp_repo.go b/services/repository/files/temp_repo.go index d70b1e8d54e..ce98967e797 100644 --- a/services/repository/files/temp_repo.go +++ b/services/repository/files/temp_repo.go @@ -358,6 +358,7 @@ func (t *TemporaryUploadRepository) DiffIndex() (*gitdiff.Diff, error) { Stderr: stderr, PipelineFunc: func(ctx context.Context, cancel context.CancelFunc) error { _ = stdoutWriter.Close() + defer cancel() diff, finalErr = gitdiff.ParsePatch(t.ctx, setting.Git.MaxGitDiffLines, setting.Git.MaxGitDiffLineCharacters, setting.Git.MaxGitDiffFiles, stdoutReader, "") if finalErr != nil { log.Error("ParsePatch: %v", finalErr) @@ -371,10 +372,14 @@ func (t *TemporaryUploadRepository) DiffIndex() (*gitdiff.Diff, error) { log.Error("Unable to ParsePatch in temporary repo %s (%s). Error: %v", t.repo.FullName(), t.basePath, finalErr) return nil, finalErr } - log.Error("Unable to run diff-index pipeline in temporary repo %s (%s). Error: %v\nStderr: %s", - t.repo.FullName(), t.basePath, err, stderr) - return nil, fmt.Errorf("Unable to run diff-index pipeline in temporary repo %s. Error: %w\nStderr: %s", - t.repo.FullName(), err, stderr) + + // If the process exited early, don't error + if err != context.Canceled { + log.Error("Unable to run diff-index pipeline in temporary repo %s (%s). Error: %v\nStderr: %s", + t.repo.FullName(), t.basePath, err, stderr) + return nil, fmt.Errorf("Unable to run diff-index pipeline in temporary repo %s. Error: %w\nStderr: %s", + t.repo.FullName(), err, stderr) + } } diff.NumFiles, diff.TotalAddition, diff.TotalDeletion, err = git.GetDiffShortStat(t.ctx, t.basePath, git.TrustedCmdArgs{"--cached"}, "HEAD")