From 8c461eb261c060811859600c37ad980a7189bd2d Mon Sep 17 00:00:00 2001 From: zeripath Date: Wed, 17 Mar 2021 08:58:58 +0000 Subject: [PATCH] Fix several render issues (#14986) (#15013) Backport #14986 * Fix an issue with panics related to attributes * Wrap goldmark render in a recovery function * Reduce memory use in render emoji * Use a pipe for rendering goldmark - still needs more work and a limiter Signed-off-by: Andrew Thornton Co-authored-by: Lauris BH --- modules/emoji/emoji.go | 75 +++++++++++------ modules/emoji/emoji_test.go | 33 ++++++++ modules/markup/html.go | 24 ++++-- modules/markup/markdown/goldmark.go | 6 ++ modules/markup/markdown/markdown.go | 102 +++++++++++++++++++++-- modules/markup/markdown/markdown_test.go | 20 +++++ 6 files changed, 216 insertions(+), 44 deletions(-) diff --git a/modules/emoji/emoji.go b/modules/emoji/emoji.go index 169ee0a182d..01fb764ce36 100644 --- a/modules/emoji/emoji.go +++ b/modules/emoji/emoji.go @@ -30,6 +30,9 @@ var ( // aliasMap provides a map of the alias to its emoji data. aliasMap map[string]int + // emptyReplacer is the string replacer for emoji codes. + emptyReplacer *strings.Replacer + // codeReplacer is the string replacer for emoji codes. codeReplacer *strings.Replacer @@ -49,6 +52,7 @@ func loadMap() { // process emoji codes and aliases codePairs := make([]string, 0) + emptyPairs := make([]string, 0) aliasPairs := make([]string, 0) // sort from largest to small so we match combined emoji first @@ -64,6 +68,7 @@ func loadMap() { // setup codes codeMap[e.Emoji] = i codePairs = append(codePairs, e.Emoji, ":"+e.Aliases[0]+":") + emptyPairs = append(emptyPairs, e.Emoji, e.Emoji) // setup aliases for _, a := range e.Aliases { @@ -77,6 +82,7 @@ func loadMap() { } // create replacers + emptyReplacer = strings.NewReplacer(emptyPairs...) codeReplacer = strings.NewReplacer(codePairs...) aliasReplacer = strings.NewReplacer(aliasPairs...) }) @@ -127,38 +133,53 @@ func ReplaceAliases(s string) string { return aliasReplacer.Replace(s) } +type rememberSecondWriteWriter struct { + pos int + idx int + end int + writecount int +} + +func (n *rememberSecondWriteWriter) Write(p []byte) (int, error) { + n.writecount++ + if n.writecount == 2 { + n.idx = n.pos + n.end = n.pos + len(p) + } + n.pos += len(p) + return len(p), nil +} + +func (n *rememberSecondWriteWriter) WriteString(s string) (int, error) { + n.writecount++ + if n.writecount == 2 { + n.idx = n.pos + n.end = n.pos + len(s) + } + n.pos += len(s) + return len(s), nil +} + // FindEmojiSubmatchIndex returns index pair of longest emoji in a string func FindEmojiSubmatchIndex(s string) []int { loadMap() - found := make(map[int]int) - keys := make([]int, 0) + secondWriteWriter := rememberSecondWriteWriter{} - //see if there are any emoji in string before looking for position of specific ones - //no performance difference when there is a match but 10x faster when there are not - if s == ReplaceCodes(s) { + // A faster and clean implementation would copy the trie tree formation in strings.NewReplacer but + // we can be lazy here. + // + // The implementation of strings.Replacer.WriteString is such that the first index of the emoji + // submatch is simply the second thing that is written to WriteString in the writer. + // + // Therefore we can simply take the index of the second write as our first emoji + // + // FIXME: just copy the trie implementation from strings.NewReplacer + _, _ = emptyReplacer.WriteString(&secondWriteWriter, s) + + // if we wrote less than twice then we never "replaced" + if secondWriteWriter.writecount < 2 { return nil } - // get index of first emoji occurrence while also checking for longest combination - for j := range GemojiData { - i := strings.Index(s, GemojiData[j].Emoji) - if i != -1 { - if _, ok := found[i]; !ok { - if len(keys) == 0 || i < keys[0] { - found[i] = j - keys = []int{i} - } - if i == 0 { - break - } - } - } - } - - if len(keys) > 0 { - index := keys[0] - return []int{index, index + len(GemojiData[found[index]].Emoji)} - } - - return nil + return []int{secondWriteWriter.idx, secondWriteWriter.end} } diff --git a/modules/emoji/emoji_test.go b/modules/emoji/emoji_test.go index 3eca3a8d8a6..def252896fd 100644 --- a/modules/emoji/emoji_test.go +++ b/modules/emoji/emoji_test.go @@ -8,6 +8,8 @@ package emoji import ( "reflect" "testing" + + "github.com/stretchr/testify/assert" ) func TestDumpInfo(t *testing.T) { @@ -65,3 +67,34 @@ func TestReplacers(t *testing.T) { } } } + +func TestFindEmojiSubmatchIndex(t *testing.T) { + type testcase struct { + teststring string + expected []int + } + + testcases := []testcase{ + { + "\U0001f44d", + []int{0, len("\U0001f44d")}, + }, + { + "\U0001f44d +1 \U0001f44d \U0001f37a", + []int{0, 4}, + }, + { + " \U0001f44d", + []int{1, 1 + len("\U0001f44d")}, + }, + { + string([]byte{'\u0001'}) + "\U0001f44d", + []int{1, 1 + len("\U0001f44d")}, + }, + } + + for _, kase := range testcases { + actual := FindEmojiSubmatchIndex(kase.teststring) + assert.Equal(t, kase.expected, actual) + } +} diff --git a/modules/markup/html.go b/modules/markup/html.go index 1d4a9be58c2..a81b03f0711 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -298,19 +298,27 @@ func RenderEmoji( return ctx.postProcess(rawHTML) } +var tagCleaner = regexp.MustCompile(`<((?:/?\w+/\w+)|(?:/[\w ]+/)|(/?[hH][tT][mM][lL][ />]))`) +var nulCleaner = strings.NewReplacer("\000", "") + func (ctx *postProcessCtx) postProcess(rawHTML []byte) ([]byte, error) { if ctx.procs == nil { ctx.procs = defaultProcessors } // give a generous extra 50 bytes - res := make([]byte, 0, len(rawHTML)+50) - res = append(res, ""...) - res = append(res, rawHTML...) - res = append(res, ""...) + res := bytes.NewBuffer(make([]byte, 0, len(rawHTML)+50)) + // prepend "" + _, _ = res.WriteString("") + + // Strip out nuls - they're always invalid + _, _ = nulCleaner.WriteString(res, string(tagCleaner.ReplaceAll(rawHTML, []byte("<$1")))) + + // close the tags + _, _ = res.WriteString("") // parse the HTML - nodes, err := html.ParseFragment(bytes.NewReader(res), nil) + nodes, err := html.ParseFragment(res, nil) if err != nil { return nil, &postProcessError{"invalid HTML", err} } @@ -347,17 +355,17 @@ func (ctx *postProcessCtx) postProcess(rawHTML []byte) ([]byte, error) { // Create buffer in which the data will be placed again. We know that the // length will be at least that of res; to spare a few alloc+copy, we // reuse res, resetting its length to 0. - buf := bytes.NewBuffer(res[:0]) + res.Reset() // Render everything to buf. for _, node := range nodes { - err = html.Render(buf, node) + err = html.Render(res, node) if err != nil { return nil, &postProcessError{"error rendering processed HTML", err} } } // Everything done successfully, return parsed data. - return buf.Bytes(), nil + return res.Bytes(), nil } func (ctx *postProcessCtx) visitNode(node *html.Node, visitText bool) { diff --git a/modules/markup/markdown/goldmark.go b/modules/markup/markdown/goldmark.go index b1f5d58d485..48544b89ab0 100644 --- a/modules/markup/markdown/goldmark.go +++ b/modules/markup/markdown/goldmark.go @@ -77,6 +77,12 @@ func (g *ASTTransformer) Transform(node *ast.Document, reader text.Reader, pc pa header.ID = util.BytesToReadOnlyString(id.([]byte)) } toc = append(toc, header) + } else { + for _, attr := range v.Attributes() { + if _, ok := attr.Value.([]byte); !ok { + v.SetAttribute(attr.Name, []byte(fmt.Sprintf("%v", attr.Value))) + } + } } case *ast.Image: // Images need two things: diff --git a/modules/markup/markdown/markdown.go b/modules/markup/markdown/markdown.go index 999ae52bb52..93235d77e5a 100644 --- a/modules/markup/markdown/markdown.go +++ b/modules/markup/markdown/markdown.go @@ -6,7 +6,8 @@ package markdown import ( - "bytes" + "fmt" + "io" "strings" "sync" @@ -18,7 +19,7 @@ import ( chromahtml "github.com/alecthomas/chroma/formatters/html" "github.com/yuin/goldmark" - "github.com/yuin/goldmark-highlighting" + highlighting "github.com/yuin/goldmark-highlighting" meta "github.com/yuin/goldmark-meta" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/parser" @@ -34,6 +35,44 @@ var urlPrefixKey = parser.NewContextKey() var isWikiKey = parser.NewContextKey() var renderMetasKey = parser.NewContextKey() +type closesWithError interface { + io.WriteCloser + CloseWithError(err error) error +} + +type limitWriter struct { + w closesWithError + sum int64 + limit int64 +} + +// Write implements the standard Write interface: +func (l *limitWriter) Write(data []byte) (int, error) { + leftToWrite := l.limit - l.sum + if leftToWrite < int64(len(data)) { + n, err := l.w.Write(data[:leftToWrite]) + l.sum += int64(n) + if err != nil { + return n, err + } + _ = l.w.Close() + return n, fmt.Errorf("Rendered content too large - truncating render") + } + n, err := l.w.Write(data) + l.sum += int64(n) + return n, err +} + +// Close closes the writer +func (l *limitWriter) Close() error { + return l.w.Close() +} + +// CloseWithError closes the writer +func (l *limitWriter) CloseWithError(err error) error { + return l.w.CloseWithError(err) +} + // NewGiteaParseContext creates a parser.Context with the gitea context set func NewGiteaParseContext(urlPrefix string, metas map[string]string, isWiki bool) parser.Context { pc := parser.NewContext(parser.WithIDs(newPrefixedIDs())) @@ -43,8 +82,8 @@ func NewGiteaParseContext(urlPrefix string, metas map[string]string, isWiki bool return pc } -// render renders Markdown to HTML without handling special links. -func render(body []byte, urlPrefix string, metas map[string]string, wikiMarkdown bool) []byte { +// actualRender renders Markdown to HTML without handling special links. +func actualRender(body []byte, urlPrefix string, metas map[string]string, wikiMarkdown bool) []byte { once.Do(func() { converter = goldmark.New( goldmark.WithExtensions(extension.Table, @@ -119,12 +158,57 @@ func render(body []byte, urlPrefix string, metas map[string]string, wikiMarkdown }) - pc := NewGiteaParseContext(urlPrefix, metas, wikiMarkdown) - var buf bytes.Buffer - if err := converter.Convert(giteautil.NormalizeEOL(body), &buf, parser.WithContext(pc)); err != nil { - log.Error("Unable to render: %v", err) + rd, wr := io.Pipe() + defer func() { + _ = rd.Close() + _ = wr.Close() + }() + + lw := &limitWriter{ + w: wr, + limit: setting.UI.MaxDisplayFileSize * 3, } - return markup.SanitizeReader(&buf).Bytes() + + // FIXME: should we include a timeout that closes the pipe to abort the parser and sanitizer if it takes too long? + go func() { + defer func() { + err := recover() + if err == nil { + return + } + + log.Warn("Unable to render markdown due to panic in goldmark: %v", err) + if log.IsDebug() { + log.Debug("Panic in markdown: %v\n%s", err, string(log.Stack(2))) + } + _ = lw.CloseWithError(fmt.Errorf("%v", err)) + }() + + pc := NewGiteaParseContext(urlPrefix, metas, wikiMarkdown) + if err := converter.Convert(giteautil.NormalizeEOL(body), lw, parser.WithContext(pc)); err != nil { + log.Error("Unable to render: %v", err) + _ = lw.CloseWithError(err) + return + } + _ = lw.Close() + }() + return markup.SanitizeReader(rd).Bytes() +} + +func render(body []byte, urlPrefix string, metas map[string]string, wikiMarkdown bool) (ret []byte) { + defer func() { + err := recover() + if err == nil { + return + } + + log.Warn("Unable to render markdown due to panic in goldmark - will return sanitized raw bytes") + if log.IsDebug() { + log.Debug("Panic in markdown: %v\n%s", err, string(log.Stack(2))) + } + ret = markup.SanitizeBytes(body) + }() + return actualRender(body, urlPrefix, metas, wikiMarkdown) } var ( diff --git a/modules/markup/markdown/markdown_test.go b/modules/markup/markdown/markdown_test.go index 176bf4baade..d5b211b3e0c 100644 --- a/modules/markup/markdown/markdown_test.go +++ b/modules/markup/markdown/markdown_test.go @@ -309,6 +309,25 @@ func TestRender_RenderParagraphs(t *testing.T) { test(t, "A\n\n\nB\nC\n", 2) } +func TestMarkdownRenderRaw(t *testing.T) { + testcases := [][]byte{ + { // clusterfuzz_testcase_minimized_fuzz_markdown_render_raw_6267570554535936 + 0x2a, 0x20, 0x2d, 0x0a, 0x09, 0x20, 0x60, 0x5b, 0x0a, 0x09, 0x20, 0x60, + 0x5b, + }, + { // clusterfuzz_testcase_minimized_fuzz_markdown_render_raw_6278827345051648 + 0x2d, 0x20, 0x2d, 0x0d, 0x09, 0x60, 0x0d, 0x09, 0x60, + }, + { // clusterfuzz_testcase_minimized_fuzz_markdown_render_raw_6016973788020736[] = { + 0x7b, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x35, 0x7d, 0x0a, 0x3d, + }, + } + + for _, testcase := range testcases { + _ = RenderRaw(testcase, "", false) + } +} + func TestRenderSiblingImages_Issue12925(t *testing.T) { testcase := `![image1](/image1) ![image2](/image2) @@ -318,4 +337,5 @@ func TestRenderSiblingImages_Issue12925(t *testing.T) { ` res := string(RenderRaw([]byte(testcase), "", false)) assert.Equal(t, expected, res) + }