From c7bb3aa03436314e20d43e0ae44e791dd3e64909 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sat, 4 May 2024 09:48:16 +0800 Subject: [PATCH] Fix markdown URL parsing for commit ID (#30812) --- modules/markup/html.go | 122 +++++++++++++++------------ modules/markup/html_codepreview.go | 3 +- modules/markup/html_internal_test.go | 59 +++++++++---- modules/markup/html_test.go | 7 +- 4 files changed, 116 insertions(+), 75 deletions(-) diff --git a/modules/markup/html.go b/modules/markup/html.go index 5ae0cc8755..2958dc9646 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -10,6 +10,7 @@ import ( "path" "path/filepath" "regexp" + "slices" "strings" "sync" @@ -54,7 +55,7 @@ var ( shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`) // anyHashPattern splits url containing SHA into parts - anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~_%.a-zA-Z0-9/]+)?(#[-+~_%.a-zA-Z0-9]+)?`) + anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~%./\w]+)?(\?[-+~%.\w&=]+)?(#[-+~%.\w]+)?`) // comparePattern matches "http://domain/org/repo/compare/COMMIT1...COMMIT2#hash" comparePattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{7,64})(\.\.\.?)([0-9a-f]{7,64})?(#[-+~_%.a-zA-Z0-9]+)?`) @@ -591,7 +592,8 @@ func replaceContentList(node *html.Node, i, j int, newNodes []*html.Node) { func mentionProcessor(ctx *RenderContext, node *html.Node) { start := 0 - for node != nil { + nodeStop := node.NextSibling + for node != nodeStop { found, loc := references.FindFirstMentionBytes(util.UnsafeStringToBytes(node.Data[start:])) if !found { node = node.NextSibling @@ -962,57 +964,68 @@ func commitCrossReferencePatternProcessor(ctx *RenderContext, node *html.Node) { } } +type anyHashPatternResult struct { + PosStart int + PosEnd int + FullURL string + CommitID string + SubPath string + QueryHash string +} + +func anyHashPatternExtract(s string) (ret anyHashPatternResult, ok bool) { + m := anyHashPattern.FindStringSubmatchIndex(s) + if m == nil { + return ret, false + } + + ret.PosStart, ret.PosEnd = m[0], m[1] + ret.FullURL = s[ret.PosStart:ret.PosEnd] + if strings.HasSuffix(ret.FullURL, ".") { + // if url ends in '.', it's very likely that it is not part of the actual url but used to finish a sentence. + ret.PosEnd-- + ret.FullURL = ret.FullURL[:len(ret.FullURL)-1] + for i := 0; i < len(m); i++ { + m[i] = min(m[i], ret.PosEnd) + } + } + + ret.CommitID = s[m[2]:m[3]] + if m[5] > 0 { + ret.SubPath = s[m[4]:m[5]] + } + + lastStart, lastEnd := m[len(m)-2], m[len(m)-1] + if lastEnd > 0 { + ret.QueryHash = s[lastStart:lastEnd][1:] + } + return ret, true +} + // fullHashPatternProcessor renders SHA containing URLs func fullHashPatternProcessor(ctx *RenderContext, node *html.Node) { if ctx.Metas == nil { return } - - next := node.NextSibling - for node != nil && node != next { - m := anyHashPattern.FindStringSubmatchIndex(node.Data) - if m == nil { - return + nodeStop := node.NextSibling + for node != nodeStop { + if node.Type != html.TextNode { + node = node.NextSibling + continue } - - urlFull := node.Data[m[0]:m[1]] - text := base.ShortSha(node.Data[m[2]:m[3]]) - - // 3rd capture group matches a optional path - subpath := "" - if m[5] > 0 { - subpath = node.Data[m[4]:m[5]] + ret, ok := anyHashPatternExtract(node.Data) + if !ok { + node = node.NextSibling + continue } - - // 4th capture group matches a optional url hash - hash := "" - if m[7] > 0 { - hash = node.Data[m[6]:m[7]][1:] + text := base.ShortSha(ret.CommitID) + if ret.SubPath != "" { + text += ret.SubPath } - - start := m[0] - end := m[1] - - // If url ends in '.', it's very likely that it is not part of the - // actual url but used to finish a sentence. - if strings.HasSuffix(urlFull, ".") { - end-- - urlFull = urlFull[:len(urlFull)-1] - if hash != "" { - hash = hash[:len(hash)-1] - } else if subpath != "" { - subpath = subpath[:len(subpath)-1] - } + if ret.QueryHash != "" { + text += " (" + ret.QueryHash + ")" } - - if subpath != "" { - text += subpath - } - - if hash != "" { - text += " (" + hash + ")" - } - replaceContent(node, start, end, createCodeLink(urlFull, text, "commit")) + replaceContent(node, ret.PosStart, ret.PosEnd, createCodeLink(ret.FullURL, text, "commit")) node = node.NextSibling.NextSibling } } @@ -1021,19 +1034,16 @@ func comparePatternProcessor(ctx *RenderContext, node *html.Node) { if ctx.Metas == nil { return } - - next := node.NextSibling - for node != nil && node != next { - m := comparePattern.FindStringSubmatchIndex(node.Data) - if m == nil { - return + nodeStop := node.NextSibling + for node != nodeStop { + if node.Type != html.TextNode { + node = node.NextSibling + continue } - - // Ensure that every group (m[0]...m[7]) has a match - for i := 0; i < 8; i++ { - if m[i] == -1 { - return - } + m := comparePattern.FindStringSubmatchIndex(node.Data) + if m == nil || slices.Contains(m[:8], -1) { // ensure that every group (m[0]...m[7]) has a match + node = node.NextSibling + continue } urlFull := node.Data[m[0]:m[1]] diff --git a/modules/markup/html_codepreview.go b/modules/markup/html_codepreview.go index d9da24ea34..5ef2217e3d 100644 --- a/modules/markup/html_codepreview.go +++ b/modules/markup/html_codepreview.go @@ -60,7 +60,8 @@ func renderCodeBlock(ctx *RenderContext, node *html.Node) (urlPosStart, urlPosSt } func codePreviewPatternProcessor(ctx *RenderContext, node *html.Node) { - for node != nil { + nodeStop := node.NextSibling + for node != nodeStop { if node.Type != html.TextNode { node = node.NextSibling continue diff --git a/modules/markup/html_internal_test.go b/modules/markup/html_internal_test.go index e313be7040..3ff0597851 100644 --- a/modules/markup/html_internal_test.go +++ b/modules/markup/html_internal_test.go @@ -399,36 +399,61 @@ func TestRegExp_sha1CurrentPattern(t *testing.T) { } func TestRegExp_anySHA1Pattern(t *testing.T) { - testCases := map[string][]string{ + testCases := map[string]anyHashPatternResult{ "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js#L2703": { - "a644101ed04d0beacea864ce805e0c4f86ba1cd1", - "/test/unit/event.js", - "#L2703", + CommitID: "a644101ed04d0beacea864ce805e0c4f86ba1cd1", + SubPath: "/test/unit/event.js", + QueryHash: "L2703", }, "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js": { - "a644101ed04d0beacea864ce805e0c4f86ba1cd1", - "/test/unit/event.js", - "", + CommitID: "a644101ed04d0beacea864ce805e0c4f86ba1cd1", + SubPath: "/test/unit/event.js", }, "https://github.com/jquery/jquery/commit/0705be475092aede1eddae01319ec931fb9c65fc": { - "0705be475092aede1eddae01319ec931fb9c65fc", - "", - "", + CommitID: "0705be475092aede1eddae01319ec931fb9c65fc", }, "https://github.com/jquery/jquery/tree/0705be475092aede1eddae01319ec931fb9c65fc/src": { - "0705be475092aede1eddae01319ec931fb9c65fc", - "/src", - "", + CommitID: "0705be475092aede1eddae01319ec931fb9c65fc", + SubPath: "/src", }, "https://try.gogs.io/gogs/gogs/commit/d8a994ef243349f321568f9e36d5c3f444b99cae#diff-2": { - "d8a994ef243349f321568f9e36d5c3f444b99cae", - "", - "#diff-2", + CommitID: "d8a994ef243349f321568f9e36d5c3f444b99cae", + QueryHash: "diff-2", + }, + "non-url": {}, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678?a=b#L1-L2": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + QueryHash: "L1-L2", + }, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678.": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + }, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678/sub.": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + SubPath: "/sub", + }, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678?a=b.": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + }, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678?a=b&c=d": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + }, + "http://a/b/c/d/e/1234567812345678123456781234567812345678123456781234567812345678#hash.": { + CommitID: "1234567812345678123456781234567812345678123456781234567812345678", + QueryHash: "hash", }, } for k, v := range testCases { - assert.Equal(t, anyHashPattern.FindStringSubmatch(k)[1:], v) + ret, ok := anyHashPatternExtract(k) + if v.CommitID == "" { + assert.False(t, ok) + } else { + assert.EqualValues(t, strings.TrimSuffix(k, "."), ret.FullURL) + assert.EqualValues(t, v.CommitID, ret.CommitID) + assert.EqualValues(t, v.SubPath, ret.SubPath) + assert.EqualValues(t, v.QueryHash, ret.QueryHash) + } } } diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index 916e74fb62..a2ae18d777 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -124,6 +124,11 @@ func TestRender_CrossReferences(t *testing.T) { test( util.URLJoin(markup.TestAppURL, "gogitea", "some-repo-name", "issues", "12345"), `

gogitea/some-repo-name#12345

`) + + inputURL := "https://host/a/b/commit/0123456789012345678901234567890123456789/foo.txt?a=b#L2-L3" + test( + inputURL, + `

0123456789/foo.txt (L2-L3)

`) } func TestMisc_IsSameDomain(t *testing.T) { @@ -695,7 +700,7 @@ func TestIssue18471(t *testing.T) { }, strings.NewReader(data), &res) assert.NoError(t, err) - assert.Equal(t, "783b039...da951ce", res.String()) + assert.Equal(t, `783b039...da951ce`, res.String()) } func TestIsFullURL(t *testing.T) {