From 6bab678bed726e9d5371cd10050ea217ab076c24 Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 16 Oct 2020 18:13:18 +0100 Subject: [PATCH] Fix diff skipping lines (#13154) * Fix diff skipping lines ParsePatch previously just skipped all lines that start with "+++ " or "--- " and makes no attempt to see these lines in context. This PR rewrites ParsePatch to pay attention to context and position within a patch, ensuring that --- and +++ are only skipped if appropriate. This PR also fixes several issues with incomplete files. Fix https://codeberg.org/Codeberg/Community/issues/308 Fix #13153 Signed-off-by: Andrew Thornton * Add testcase Signed-off-by: Andrew Thornton * fix comment * simplify error handling Signed-off-by: Andrew Thornton * never return io.EOF Signed-off-by: Andrew Thornton Co-authored-by: Lauris BH --- services/gitdiff/gitdiff.go | 529 ++++++++++++++++++++----------- services/gitdiff/gitdiff_test.go | 21 ++ 2 files changed, 358 insertions(+), 192 deletions(-) diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 1882b48ed4c..91105399db8 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -441,91 +441,252 @@ func (diff *Diff) LoadComments(issue *models.Issue, currentUser *models.User) er const cmdDiffHead = "diff --git " -// ParsePatch builds a Diff object from a io.Reader and some -// parameters. -// TODO: move this function to gogits/git-module +// ParsePatch builds a Diff object from a io.Reader and some parameters. func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) { - var ( - diff = &Diff{Files: make([]*DiffFile, 0)} - curFile = &DiffFile{} - curSection = &DiffSection{ - Lines: make([]*DiffLine, 0, 10), + var curFile *DiffFile + + diff := &Diff{Files: make([]*DiffFile, 0)} + + sb := strings.Builder{} + + // OK let's set a reasonable buffer size. + // This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger. + readerSize := maxLineCharacters + if readerSize < 4096 { + readerSize = 4096 + } + + input := bufio.NewReaderSize(reader, readerSize) + line, err := input.ReadString('\n') + if err != nil { + if err == io.EOF { + return diff, nil + } + return diff, err + } +parsingLoop: + for { + // 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted) + // if it does not we have bad input! + if !strings.HasPrefix(line, cmdDiffHead) { + return diff, fmt.Errorf("Invalid first file line: %s", line) } - leftLine, rightLine int - lineCount int - curFileLinesCount int - curFileLFSPrefix bool + // TODO: Handle skipping first n files + if len(diff.Files) >= maxFiles { + diff.IsIncomplete = true + _, err := io.Copy(ioutil.Discard, reader) + if err != nil { + // By the definition of io.Copy this never returns io.EOF + return diff, fmt.Errorf("Copy: %v", err) + } + break parsingLoop + } + + curFile = createDiffFile(diff, line) + diff.Files = append(diff.Files, curFile) + + // 2. It is followed by one or more extended header lines: + // + // old mode + // new mode + // deleted file mode + // new file mode + // copy from + // copy to + // rename from + // rename to + // similarity index + // dissimilarity index + // index .. + // + // * 6-digit octal numbers including the file type and file permission bits. + // * does not include the a/ and b/ prefixes + // * percentage of unchanged lines for similarity, percentage of changed + // lines dissimilarity as integer rounded down with terminal %. 100% => equal files. + // * The index line includes the blob object names before and after the change. + // The is included if the file mode does not change; otherwise, separate + // lines indicate the old and the new mode. + // 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...) + // + // --- a/ + // +++ b/ + // + // With multiple hunks + // + // @@ @@ + // +added line + // -removed line + // unchanged line + // + // 4. Binary files get: + // + // Binary files a/ and b/ differ + // + // but one of a/ and b/ could be /dev/null. + curFileLoop: + for { + line, err = input.ReadString('\n') + if err != nil { + if err != io.EOF { + return diff, err + } + break parsingLoop + } + switch { + case strings.HasPrefix(line, "old mode ") || + strings.HasPrefix(line, "new mode "): + if strings.HasSuffix(line, " 160000\n") { + curFile.IsSubmodule = true + } + case strings.HasPrefix(line, "copy from "): + curFile.IsRenamed = true + curFile.Type = DiffFileCopy + case strings.HasPrefix(line, "copy to "): + curFile.IsRenamed = true + curFile.Type = DiffFileCopy + case strings.HasPrefix(line, "new file"): + curFile.Type = DiffFileAdd + curFile.IsCreated = true + if strings.HasSuffix(line, " 160000\n") { + curFile.IsSubmodule = true + } + case strings.HasPrefix(line, "deleted"): + curFile.Type = DiffFileDel + curFile.IsDeleted = true + if strings.HasSuffix(line, " 160000\n") { + curFile.IsSubmodule = true + } + case strings.HasPrefix(line, "index"): + if strings.HasSuffix(line, " 160000\n") { + curFile.IsSubmodule = true + } + case strings.HasPrefix(line, "similarity index 100%"): + curFile.Type = DiffFileRename + case strings.HasPrefix(line, "Binary"): + curFile.IsBin = true + case strings.HasPrefix(line, "--- "): + // Do nothing with this line + case strings.HasPrefix(line, "+++ "): + // Do nothing with this line + lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input) + diff.TotalAddition += curFile.Addition + diff.TotalDeletion += curFile.Deletion + if err != nil { + if err != io.EOF { + return diff, err + } + break parsingLoop + } + sb.Reset() + _, _ = sb.Write(lineBytes) + for isFragment { + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + // Now by the definition of ReadLine this cannot be io.EOF + return diff, fmt.Errorf("Unable to ReadLine: %v", err) + } + _, _ = sb.Write(lineBytes) + } + line = sb.String() + sb.Reset() + + break curFileLoop + } + } + + } + + // FIXME: There are numerous issues with this: + // - we might want to consider detecting encoding while parsing but... + // - we're likely to fail to get the correct encoding here anyway as we won't have enough information + // - and this doesn't really account for changes in encoding + var buf bytes.Buffer + for _, f := range diff.Files { + buf.Reset() + for _, sec := range f.Sections { + for _, l := range sec.Lines { + if l.Type == DiffLineSection { + continue + } + buf.WriteString(l.Content[1:]) + buf.WriteString("\n") + } + } + charsetLabel, err := charset.DetectEncoding(buf.Bytes()) + if charsetLabel != "UTF-8" && err == nil { + encoding, _ := stdcharset.Lookup(charsetLabel) + if encoding != nil { + d := encoding.NewDecoder() + for _, sec := range f.Sections { + for _, l := range sec.Lines { + if l.Type == DiffLineSection { + continue + } + if c, _, err := transform.String(d, l.Content[1:]); err == nil { + l.Content = l.Content[0:1] + c + } + } + } + } + } + } + + diff.NumFiles = len(diff.Files) + return diff, nil +} + +func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) { + sb := strings.Builder{} + + var ( + curSection *DiffSection + curFileLinesCount int + curFileLFSPrefix bool ) - input := bufio.NewReader(reader) - isEOF := false - for !isEOF { - var linebuf bytes.Buffer - for { - b, err := input.ReadByte() - if err != nil { - if err == io.EOF { - isEOF = true - break - } else { - return nil, fmt.Errorf("ReadByte: %v", err) - } + leftLine, rightLine := 1, 1 + + for { + sb.Reset() + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + if err == io.EOF { + return } - if b == '\n' { - break - } - if linebuf.Len() < maxLineCharacters { - linebuf.WriteByte(b) - } else if linebuf.Len() == maxLineCharacters { + err = fmt.Errorf("Unable to ReadLine: %v", err) + return + } + if lineBytes[0] == 'd' { + // End of hunks + return + } + + switch lineBytes[0] { + case '@': + if curFileLinesCount >= maxLines { curFile.IsIncomplete = true + continue } - } - line := linebuf.String() - if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 { - continue - } - - trimLine := strings.Trim(line, "+- ") - - if trimLine == models.LFSMetaFileIdentifier { - curFileLFSPrefix = true - } - - if curFileLFSPrefix && strings.HasPrefix(trimLine, models.LFSMetaFileOidPrefix) { - oid := strings.TrimPrefix(trimLine, models.LFSMetaFileOidPrefix) - - if len(oid) == 64 { - m := &models.LFSMetaObject{Oid: oid} - count, err := models.Count(m) - - if err == nil && count > 0 { - curFile.IsBin = true - curFile.IsLFSFile = true - curSection.Lines = nil + _, _ = sb.Write(lineBytes) + for isFragment { + // This is very odd indeed - we're in a section header and the line is too long + // This really shouldn't happen... + lineBytes, isFragment, err = input.ReadLine() + if err != nil { + // Now by the definition of ReadLine this cannot be io.EOF + err = fmt.Errorf("Unable to ReadLine: %v", err) + return } + _, _ = sb.Write(lineBytes) } - } + line := sb.String() - curFileLinesCount++ - lineCount++ - - // Diff data too large, we only show the first about maxLines lines - if curFileLinesCount >= maxLines { - curFile.IsIncomplete = true - } - switch { - case line[0] == ' ': - diffLine := &DiffLine{Type: DiffLinePlain, Content: line, LeftIdx: leftLine, RightIdx: rightLine} - leftLine++ - rightLine++ - curSection.Lines = append(curSection.Lines, diffLine) - curSection.FileName = curFile.Name - continue - case line[0] == '@': + // Create a new section to represent this hunk curSection = &DiffSection{} curFile.Sections = append(curFile.Sections, curSection) + lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1) diffLine := &DiffLine{ Type: DiffLineSection, @@ -538,148 +699,132 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D leftLine = lineSectionInfo.LeftIdx rightLine = lineSectionInfo.RightIdx continue - case line[0] == '+': + case '\\': + if curFileLinesCount >= maxLines { + curFile.IsIncomplete = true + continue + } + // This is used only to indicate that the current file does not have a terminal newline + if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) { + err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes)) + return + } + // Technically this should be the end the file! + // FIXME: we should be putting a marker at the end of the file if there is no terminal new line + continue + case '+': + curFileLinesCount++ curFile.Addition++ - diff.TotalAddition++ - diffLine := &DiffLine{Type: DiffLineAdd, Content: line, RightIdx: rightLine} + if curFileLinesCount >= maxLines { + curFile.IsIncomplete = true + continue + } + diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine} rightLine++ curSection.Lines = append(curSection.Lines, diffLine) - curSection.FileName = curFile.Name - continue - case line[0] == '-': + case '-': + curFileLinesCount++ curFile.Deletion++ - diff.TotalDeletion++ - diffLine := &DiffLine{Type: DiffLineDel, Content: line, LeftIdx: leftLine} + if curFileLinesCount >= maxLines { + curFile.IsIncomplete = true + continue + } + diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine} if leftLine > 0 { leftLine++ } curSection.Lines = append(curSection.Lines, diffLine) - curSection.FileName = curFile.Name - case strings.HasPrefix(line, "Binary"): - curFile.IsBin = true - continue + case ' ': + curFileLinesCount++ + if curFileLinesCount >= maxLines { + curFile.IsIncomplete = true + continue + } + diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine} + leftLine++ + rightLine++ + curSection.Lines = append(curSection.Lines, diffLine) + default: + // This is unexpected + err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes)) + return } - // Get new file. - if strings.HasPrefix(line, cmdDiffHead) { - if len(diff.Files) >= maxFiles { - diff.IsIncomplete = true - _, err := io.Copy(ioutil.Discard, reader) + line := string(lineBytes) + if isFragment { + curFile.IsIncomplete = true + for isFragment { + lineBytes, isFragment, err = input.ReadLine() if err != nil { - return nil, fmt.Errorf("Copy: %v", err) + // Now by the definition of ReadLine this cannot be io.EOF + err = fmt.Errorf("Unable to ReadLine: %v", err) + return } - break } + } + curSection.Lines[len(curSection.Lines)-1].Content = line - // Note: In case file name is surrounded by double quotes (it happens only in git-shell). - // e.g. diff --git "a/xxx" "b/xxx" - var a string - var b string + // handle LFS + if line[1:] == models.LFSMetaFileIdentifier { + curFileLFSPrefix = true + } else if curFileLFSPrefix && strings.HasPrefix(line[1:], models.LFSMetaFileOidPrefix) { + oid := strings.TrimPrefix(line[1:], models.LFSMetaFileOidPrefix) + if len(oid) == 64 { + m := &models.LFSMetaObject{Oid: oid} + count, err := models.Count(m) - rd := strings.NewReader(line[len(cmdDiffHead):]) - char, _ := rd.ReadByte() - _ = rd.UnreadByte() - if char == '"' { - fmt.Fscanf(rd, "%q ", &a) - if a[0] == '\\' { - a = a[1:] - } - } else { - fmt.Fscanf(rd, "%s ", &a) - } - char, _ = rd.ReadByte() - _ = rd.UnreadByte() - if char == '"' { - fmt.Fscanf(rd, "%q", &b) - if b[0] == '\\' { - b = b[1:] - } - } else { - fmt.Fscanf(rd, "%s", &b) - } - a = a[2:] - b = b[2:] - - curFile = &DiffFile{ - Name: b, - OldName: a, - Index: len(diff.Files) + 1, - Type: DiffFileChange, - Sections: make([]*DiffSection, 0, 10), - IsRenamed: a != b, - } - diff.Files = append(diff.Files, curFile) - curFileLinesCount = 0 - leftLine = 1 - rightLine = 1 - curFileLFSPrefix = false - - // Check file diff type and is submodule. - for { - line, err := input.ReadString('\n') - if err != nil { - if err == io.EOF { - isEOF = true - } else { - return nil, fmt.Errorf("ReadString: %v", err) - } - } - - switch { - case strings.HasPrefix(line, "copy from "): - curFile.IsRenamed = true - curFile.Type = DiffFileCopy - case strings.HasPrefix(line, "copy to "): - curFile.IsRenamed = true - curFile.Type = DiffFileCopy - case strings.HasPrefix(line, "new file"): - curFile.Type = DiffFileAdd - curFile.IsCreated = true - case strings.HasPrefix(line, "deleted"): - curFile.Type = DiffFileDel - curFile.IsDeleted = true - case strings.HasPrefix(line, "index"): - curFile.Type = DiffFileChange - case strings.HasPrefix(line, "similarity index 100%"): - curFile.Type = DiffFileRename - } - if curFile.Type > 0 { - if strings.HasSuffix(line, " 160000\n") { - curFile.IsSubmodule = true - } - break + if err == nil && count > 0 { + curFile.IsBin = true + curFile.IsLFSFile = true + curSection.Lines = nil } } } } +} - // FIXME: detect encoding while parsing. - var buf bytes.Buffer - for _, f := range diff.Files { - buf.Reset() - for _, sec := range f.Sections { - for _, l := range sec.Lines { - buf.WriteString(l.Content) - buf.WriteString("\n") - } - } - charsetLabel, err := charset.DetectEncoding(buf.Bytes()) - if charsetLabel != "UTF-8" && err == nil { - encoding, _ := stdcharset.Lookup(charsetLabel) - if encoding != nil { - d := encoding.NewDecoder() - for _, sec := range f.Sections { - for _, l := range sec.Lines { - if c, _, err := transform.String(d, l.Content); err == nil { - l.Content = c - } - } - } - } - } +func createDiffFile(diff *Diff, line string) *DiffFile { + // The a/ and b/ filenames are the same unless rename/copy is involved. + // Especially, even for a creation or a deletion, /dev/null is not used + // in place of the a/ or b/ filenames. + // + // When rename/copy is involved, file1 and file2 show the name of the + // source file of the rename/copy and the name of the file that rename/copy + // produces, respectively. + // + // Path names are quoted if necessary. + // + // This means that you should always be able to determine the file name even when there + // there is potential ambiguity... + // + // but we can be simpler with our heuristics by just forcing git to prefix things nicely + curFile := &DiffFile{ + Index: len(diff.Files) + 1, + Type: DiffFileChange, + Sections: make([]*DiffSection, 0, 10), } - diff.NumFiles = len(diff.Files) - return diff, nil + + rd := strings.NewReader(line[len(cmdDiffHead):] + " ") + curFile.Type = DiffFileChange + curFile.OldName = readFileName(rd) + curFile.Name = readFileName(rd) + curFile.IsRenamed = curFile.Name != curFile.OldName + return curFile +} + +func readFileName(rd *strings.Reader) string { + var name string + char, _ := rd.ReadByte() + _ = rd.UnreadByte() + if char == '"' { + fmt.Fscanf(rd, "%q ", &name) + if name[0] == '\\' { + name = name[1:] + } + } else { + fmt.Fscanf(rd, "%s ", &name) + } + return name[2:] } // GetDiffRange builds a Diff between two commits of a repository. diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index b90ad657f61..64cd4f1c21f 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -182,6 +182,27 @@ rename to a b/a a/file b/b file oldFilename: "a b/file b/a a/file", filename: "a b/a a/file b/b file", }, + { + name: "minuses-and-pluses", + gitdiff: `diff --git a/minuses-and-pluses b/minuses-and-pluses +index 6961180..9ba1a00 100644 +--- a/minuses-and-pluses ++++ b/minuses-and-pluses +@@ -1,4 +1,4 @@ +--- 1st line +-++ 2nd line +--- 3rd line +-++ 4th line ++++ 1st line ++-- 2nd line ++++ 3rd line ++-- 4th line +`, + oldFilename: "minuses-and-pluses", + filename: "minuses-and-pluses", + addition: 4, + deletion: 4, + }, } for _, testcase := range tests {