mirror of
https://github.com/go-gitea/gitea
synced 2025-01-18 19:47:44 +01:00
8a20fba8eb
Remove unmaintainable sanitizer rules. No need to add special "class" regexp rules anymore, use RenderInternal.SafeAttr instead, more details (and examples) are in the tests
446 lines
14 KiB
Go
446 lines
14 KiB
Go
// Copyright 2017 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package markup
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"regexp"
|
|
"slices"
|
|
"strings"
|
|
"sync"
|
|
|
|
"code.gitea.io/gitea/modules/markup/common"
|
|
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
"mvdan.cc/xurls/v2"
|
|
)
|
|
|
|
// Issue name styles
|
|
const (
|
|
IssueNameStyleNumeric = "numeric"
|
|
IssueNameStyleAlphanumeric = "alphanumeric"
|
|
IssueNameStyleRegexp = "regexp"
|
|
)
|
|
|
|
type globalVarsType struct {
|
|
hashCurrentPattern *regexp.Regexp
|
|
shortLinkPattern *regexp.Regexp
|
|
anyHashPattern *regexp.Regexp
|
|
comparePattern *regexp.Regexp
|
|
fullURLPattern *regexp.Regexp
|
|
emailRegex *regexp.Regexp
|
|
blackfridayExtRegex *regexp.Regexp
|
|
emojiShortCodeRegex *regexp.Regexp
|
|
issueFullPattern *regexp.Regexp
|
|
filesChangedFullPattern *regexp.Regexp
|
|
codePreviewPattern *regexp.Regexp
|
|
|
|
tagCleaner *regexp.Regexp
|
|
nulCleaner *strings.Replacer
|
|
}
|
|
|
|
var globalVars = sync.OnceValue[*globalVarsType](func() *globalVarsType {
|
|
v := &globalVarsType{}
|
|
// NOTE: All below regex matching do not perform any extra validation.
|
|
// Thus a link is produced even if the linked entity does not exist.
|
|
// While fast, this is also incorrect and lead to false positives.
|
|
// TODO: fix invalid linking issue
|
|
|
|
// valid chars in encoded path and parameter: [-+~_%.a-zA-Z0-9/]
|
|
|
|
// hashCurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae
|
|
// Although SHA1 hashes are 40 chars long, SHA256 are 64, the regex matches the hash from 7 to 64 chars in length
|
|
// so that abbreviated hash links can be used as well. This matches git and GitHub usability.
|
|
v.hashCurrentPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-f]{7,64})(?:\s|$|\)|\]|[.,:](\s|$))`)
|
|
|
|
// shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax
|
|
v.shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`)
|
|
|
|
// anyHashPattern splits url containing SHA into parts
|
|
v.anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~%./\w]+)?(\?[-+~%.\w&=]+)?(#[-+~%.\w]+)?`)
|
|
|
|
// comparePattern matches "http://domain/org/repo/compare/COMMIT1...COMMIT2#hash"
|
|
v.comparePattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{7,64})(\.\.\.?)([0-9a-f]{7,64})?(#[-+~_%.a-zA-Z0-9]+)?`)
|
|
|
|
// fullURLPattern matches full URL like "mailto:...", "https://..." and "ssh+git://..."
|
|
v.fullURLPattern = regexp.MustCompile(`^[a-z][-+\w]+:`)
|
|
|
|
// emailRegex is definitely not perfect with edge cases,
|
|
// it is still accepted by the CommonMark specification, as well as the HTML5 spec:
|
|
// http://spec.commonmark.org/0.28/#email-address
|
|
// https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
|
|
v.emailRegex = regexp.MustCompile("(?:\\s|^|\\(|\\[)([a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9]{2,}(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)(?:\\s|$|\\)|\\]|;|,|\\?|!|\\.(\\s|$))")
|
|
|
|
// blackfridayExtRegex is for blackfriday extensions create IDs like fn:user-content-footnote
|
|
v.blackfridayExtRegex = regexp.MustCompile(`[^:]*:user-content-`)
|
|
|
|
// emojiShortCodeRegex find emoji by alias like :smile:
|
|
v.emojiShortCodeRegex = regexp.MustCompile(`:[-+\w]+:`)
|
|
|
|
// example: https://domain/org/repo/pulls/27#hash
|
|
v.issueFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/(?:issues|pulls)/((?:\w{1,10}-)?[1-9][0-9]*)([\?|#](\S+)?)?\b`)
|
|
|
|
// example: https://domain/org/repo/pulls/27/files#hash
|
|
v.filesChangedFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/pulls/((?:\w{1,10}-)?[1-9][0-9]*)/files([\?|#](\S+)?)?\b`)
|
|
|
|
// codePreviewPattern matches "http://domain/.../{owner}/{repo}/src/commit/{commit}/{filepath}#L10-L20"
|
|
v.codePreviewPattern = regexp.MustCompile(`https?://\S+/([^\s/]+)/([^\s/]+)/src/commit/([0-9a-f]{7,64})(/\S+)#(L\d+(-L\d+)?)`)
|
|
|
|
v.tagCleaner = regexp.MustCompile(`<((?:/?\w+/\w+)|(?:/[\w ]+/)|(/?[hH][tT][mM][lL]\b)|(/?[hH][eE][aA][dD]\b))`)
|
|
v.nulCleaner = strings.NewReplacer("\000", "")
|
|
return v
|
|
})
|
|
|
|
// IsFullURLBytes reports whether link fits valid format.
|
|
func IsFullURLBytes(link []byte) bool {
|
|
return globalVars().fullURLPattern.Match(link)
|
|
}
|
|
|
|
func IsFullURLString(link string) bool {
|
|
return globalVars().fullURLPattern.MatchString(link)
|
|
}
|
|
|
|
func IsNonEmptyRelativePath(link string) bool {
|
|
return link != "" && !IsFullURLString(link) && link[0] != '/' && link[0] != '?' && link[0] != '#'
|
|
}
|
|
|
|
// CustomLinkURLSchemes allows for additional schemes to be detected when parsing links within text
|
|
func CustomLinkURLSchemes(schemes []string) {
|
|
schemes = append(schemes, "http", "https")
|
|
withAuth := make([]string, 0, len(schemes))
|
|
validScheme := regexp.MustCompile(`^[a-z]+$`)
|
|
for _, s := range schemes {
|
|
if !validScheme.MatchString(s) {
|
|
continue
|
|
}
|
|
without := false
|
|
for _, sna := range xurls.SchemesNoAuthority {
|
|
if s == sna {
|
|
without = true
|
|
break
|
|
}
|
|
}
|
|
if without {
|
|
s += ":"
|
|
} else {
|
|
s += "://"
|
|
}
|
|
withAuth = append(withAuth, s)
|
|
}
|
|
common.GlobalVars().LinkRegex, _ = xurls.StrictMatchingScheme(strings.Join(withAuth, "|"))
|
|
}
|
|
|
|
type postProcessError struct {
|
|
context string
|
|
err error
|
|
}
|
|
|
|
func (p *postProcessError) Error() string {
|
|
return "PostProcess: " + p.context + ", " + p.err.Error()
|
|
}
|
|
|
|
type processor func(ctx *RenderContext, node *html.Node)
|
|
|
|
var defaultProcessors = []processor{
|
|
fullIssuePatternProcessor,
|
|
comparePatternProcessor,
|
|
codePreviewPatternProcessor,
|
|
fullHashPatternProcessor,
|
|
shortLinkProcessor,
|
|
linkProcessor,
|
|
mentionProcessor,
|
|
issueIndexPatternProcessor,
|
|
commitCrossReferencePatternProcessor,
|
|
hashCurrentPatternProcessor,
|
|
emailAddressProcessor,
|
|
emojiProcessor,
|
|
emojiShortCodeProcessor,
|
|
}
|
|
|
|
// PostProcess does the final required transformations to the passed raw HTML
|
|
// data, and ensures its validity. Transformations include: replacing links and
|
|
// emails with HTML links, parsing shortlinks in the format of [[Link]], like
|
|
// MediaWiki, linking issues in the format #ID, and mentions in the format
|
|
// @user, and others.
|
|
func PostProcess(ctx *RenderContext, input io.Reader, output io.Writer) error {
|
|
return postProcess(ctx, defaultProcessors, input, output)
|
|
}
|
|
|
|
var commitMessageProcessors = []processor{
|
|
fullIssuePatternProcessor,
|
|
comparePatternProcessor,
|
|
fullHashPatternProcessor,
|
|
linkProcessor,
|
|
mentionProcessor,
|
|
issueIndexPatternProcessor,
|
|
commitCrossReferencePatternProcessor,
|
|
hashCurrentPatternProcessor,
|
|
emailAddressProcessor,
|
|
emojiProcessor,
|
|
emojiShortCodeProcessor,
|
|
}
|
|
|
|
// RenderCommitMessage will use the same logic as PostProcess, but will disable
|
|
// the shortLinkProcessor and will add a defaultLinkProcessor if defaultLink is
|
|
// set, which changes every text node into a link to the passed default link.
|
|
func RenderCommitMessage(ctx *RenderContext, content string) (string, error) {
|
|
procs := commitMessageProcessors
|
|
return renderProcessString(ctx, procs, content)
|
|
}
|
|
|
|
var commitMessageSubjectProcessors = []processor{
|
|
fullIssuePatternProcessor,
|
|
comparePatternProcessor,
|
|
fullHashPatternProcessor,
|
|
linkProcessor,
|
|
mentionProcessor,
|
|
issueIndexPatternProcessor,
|
|
commitCrossReferencePatternProcessor,
|
|
hashCurrentPatternProcessor,
|
|
emojiShortCodeProcessor,
|
|
emojiProcessor,
|
|
}
|
|
|
|
var emojiProcessors = []processor{
|
|
emojiShortCodeProcessor,
|
|
emojiProcessor,
|
|
}
|
|
|
|
// RenderCommitMessageSubject will use the same logic as PostProcess and
|
|
// RenderCommitMessage, but will disable the shortLinkProcessor and
|
|
// emailAddressProcessor, will add a defaultLinkProcessor if defaultLink is set,
|
|
// which changes every text node into a link to the passed default link.
|
|
func RenderCommitMessageSubject(ctx *RenderContext, defaultLink, content string) (string, error) {
|
|
procs := slices.Clone(commitMessageSubjectProcessors)
|
|
procs = append(procs, func(ctx *RenderContext, node *html.Node) {
|
|
ch := &html.Node{Parent: node, Type: html.TextNode, Data: node.Data}
|
|
node.Type = html.ElementNode
|
|
node.Data = "a"
|
|
node.DataAtom = atom.A
|
|
node.Attr = []html.Attribute{{Key: "href", Val: defaultLink}, {Key: "class", Val: "muted"}}
|
|
node.FirstChild, node.LastChild = ch, ch
|
|
})
|
|
return renderProcessString(ctx, procs, content)
|
|
}
|
|
|
|
// RenderIssueTitle to process title on individual issue/pull page
|
|
func RenderIssueTitle(ctx *RenderContext, title string) (string, error) {
|
|
// do not render other issue/commit links in an issue's title - which in most cases is already a link.
|
|
return renderProcessString(ctx, []processor{
|
|
emojiShortCodeProcessor,
|
|
emojiProcessor,
|
|
}, title)
|
|
}
|
|
|
|
func renderProcessString(ctx *RenderContext, procs []processor, content string) (string, error) {
|
|
var buf strings.Builder
|
|
if err := postProcess(ctx, procs, strings.NewReader(content), &buf); err != nil {
|
|
return "", err
|
|
}
|
|
return buf.String(), nil
|
|
}
|
|
|
|
// RenderDescriptionHTML will use similar logic as PostProcess, but will
|
|
// use a single special linkProcessor.
|
|
func RenderDescriptionHTML(ctx *RenderContext, content string) (string, error) {
|
|
return renderProcessString(ctx, []processor{
|
|
descriptionLinkProcessor,
|
|
emojiShortCodeProcessor,
|
|
emojiProcessor,
|
|
}, content)
|
|
}
|
|
|
|
// RenderEmoji for when we want to just process emoji and shortcodes
|
|
// in various places it isn't already run through the normal markdown processor
|
|
func RenderEmoji(ctx *RenderContext, content string) (string, error) {
|
|
return renderProcessString(ctx, emojiProcessors, content)
|
|
}
|
|
|
|
func postProcess(ctx *RenderContext, procs []processor, input io.Reader, output io.Writer) error {
|
|
defer ctx.Cancel()
|
|
// FIXME: don't read all content to memory
|
|
rawHTML, err := io.ReadAll(input)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// parse the HTML
|
|
node, err := html.Parse(io.MultiReader(
|
|
// prepend "<html><body>"
|
|
strings.NewReader("<html><body>"),
|
|
// Strip out nuls - they're always invalid
|
|
bytes.NewReader(globalVars().tagCleaner.ReplaceAll([]byte(globalVars().nulCleaner.Replace(string(rawHTML))), []byte("<$1"))),
|
|
// close the tags
|
|
strings.NewReader("</body></html>"),
|
|
))
|
|
if err != nil {
|
|
return &postProcessError{"invalid HTML", err}
|
|
}
|
|
|
|
if node.Type == html.DocumentNode {
|
|
node = node.FirstChild
|
|
}
|
|
|
|
visitNode(ctx, procs, node)
|
|
|
|
newNodes := make([]*html.Node, 0, 5)
|
|
|
|
if node.Data == "html" {
|
|
node = node.FirstChild
|
|
for node != nil && node.Data != "body" {
|
|
node = node.NextSibling
|
|
}
|
|
}
|
|
if node != nil {
|
|
if node.Data == "body" {
|
|
child := node.FirstChild
|
|
for child != nil {
|
|
newNodes = append(newNodes, child)
|
|
child = child.NextSibling
|
|
}
|
|
} else {
|
|
newNodes = append(newNodes, node)
|
|
}
|
|
}
|
|
|
|
// Render everything to buf.
|
|
for _, node := range newNodes {
|
|
if err := html.Render(output, node); err != nil {
|
|
return &postProcessError{"error rendering processed HTML", err}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func isEmojiNode(node *html.Node) bool {
|
|
if node.Type == html.ElementNode && node.Data == atom.Span.String() {
|
|
for _, attr := range node.Attr {
|
|
if (attr.Key == "class" || attr.Key == "data-attr-class") && strings.Contains(attr.Val, "emoji") {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func visitNode(ctx *RenderContext, procs []processor, node *html.Node) *html.Node {
|
|
// Add user-content- to IDs and "#" links if they don't already have them
|
|
for idx, attr := range node.Attr {
|
|
val := strings.TrimPrefix(attr.Val, "#")
|
|
notHasPrefix := !(strings.HasPrefix(val, "user-content-") || globalVars().blackfridayExtRegex.MatchString(val))
|
|
|
|
if attr.Key == "id" && notHasPrefix {
|
|
node.Attr[idx].Val = "user-content-" + attr.Val
|
|
}
|
|
|
|
if attr.Key == "href" && strings.HasPrefix(attr.Val, "#") && notHasPrefix {
|
|
node.Attr[idx].Val = "#user-content-" + val
|
|
}
|
|
}
|
|
|
|
switch node.Type {
|
|
case html.TextNode:
|
|
for _, proc := range procs {
|
|
proc(ctx, node) // it might add siblings
|
|
}
|
|
|
|
case html.ElementNode:
|
|
if isEmojiNode(node) {
|
|
// TextNode emoji will be converted to `<span class="emoji">`, then the next iteration will visit the "span"
|
|
// if we don't stop it, it will go into the TextNode again and create an infinite recursion
|
|
return node.NextSibling
|
|
} else if node.Data == "code" || node.Data == "pre" {
|
|
return node.NextSibling // ignore code and pre nodes
|
|
} else if node.Data == "img" {
|
|
return visitNodeImg(ctx, node)
|
|
} else if node.Data == "video" {
|
|
return visitNodeVideo(ctx, node)
|
|
} else if node.Data == "a" {
|
|
procs = emojiProcessors // Restrict text in links to emojis
|
|
}
|
|
for n := node.FirstChild; n != nil; {
|
|
n = visitNode(ctx, procs, n)
|
|
}
|
|
default:
|
|
}
|
|
return node.NextSibling
|
|
}
|
|
|
|
// createKeyword() renders a highlighted version of an action keyword
|
|
func createKeyword(ctx *RenderContext, content string) *html.Node {
|
|
// CSS class for action keywords (e.g. "closes: #1")
|
|
const keywordClass = "issue-keyword"
|
|
|
|
span := &html.Node{
|
|
Type: html.ElementNode,
|
|
Data: atom.Span.String(),
|
|
Attr: []html.Attribute{},
|
|
}
|
|
span.Attr = append(span.Attr, ctx.RenderInternal.NodeSafeAttr("class", keywordClass))
|
|
|
|
text := &html.Node{
|
|
Type: html.TextNode,
|
|
Data: content,
|
|
}
|
|
span.AppendChild(text)
|
|
|
|
return span
|
|
}
|
|
|
|
func createLink(ctx *RenderContext, href, content, class string) *html.Node {
|
|
a := &html.Node{
|
|
Type: html.ElementNode,
|
|
Data: atom.A.String(),
|
|
Attr: []html.Attribute{{Key: "href", Val: href}},
|
|
}
|
|
if !RenderBehaviorForTesting.DisableInternalAttributes {
|
|
a.Attr = append(a.Attr, html.Attribute{Key: "data-markdown-generated-content"})
|
|
}
|
|
if class != "" {
|
|
a.Attr = append(a.Attr, ctx.RenderInternal.NodeSafeAttr("class", class))
|
|
}
|
|
|
|
text := &html.Node{
|
|
Type: html.TextNode,
|
|
Data: content,
|
|
}
|
|
|
|
a.AppendChild(text)
|
|
return a
|
|
}
|
|
|
|
// replaceContent takes text node, and in its content it replaces a section of
|
|
// it with the specified newNode.
|
|
func replaceContent(node *html.Node, i, j int, newNode *html.Node) {
|
|
replaceContentList(node, i, j, []*html.Node{newNode})
|
|
}
|
|
|
|
// replaceContentList takes text node, and in its content it replaces a section of
|
|
// it with the specified newNodes. An example to visualize how this can work can
|
|
// be found here: https://play.golang.org/p/5zP8NnHZ03s
|
|
func replaceContentList(node *html.Node, i, j int, newNodes []*html.Node) {
|
|
// get the data before and after the match
|
|
before := node.Data[:i]
|
|
after := node.Data[j:]
|
|
|
|
// Replace in the current node the text, so that it is only what it is
|
|
// supposed to have.
|
|
node.Data = before
|
|
|
|
// Get the current next sibling, before which we place the replaced data,
|
|
// and after that we place the new text node.
|
|
nextSibling := node.NextSibling
|
|
for _, n := range newNodes {
|
|
node.Parent.InsertBefore(n, nextSibling)
|
|
}
|
|
if after != "" {
|
|
node.Parent.InsertBefore(&html.Node{
|
|
Type: html.TextNode,
|
|
Data: after,
|
|
}, nextSibling)
|
|
}
|
|
}
|