gitea/vendor/github.com/andybalholm/brotli/literal_cost.go

package brotli

// utf8Position classifies the byte that follows c, given that last was the
// byte before c. The result is 0, 1 or 2 and is never larger than clamp
// (for clamp <= 2).
func utf8Position(last uint, c uint, clamp uint) uint {
	switch {
	case c < 128:
		/* Next one is the 'Byte 1' again. */
		return 0
	case c >= 192:
		/* Next one is the 'Byte 2' of utf-8 encoding. */
		return brotli_min_size_t(1, clamp)
	case last < 0xE0:
		/* c is in [128, 192): decide over the last byte if this ends the
		   sequence. Here it does: completed two or three byte coding. */
		return 0
	default:
		/* Next one is the 'Byte 3' of utf-8 encoding. */
		return brotli_min_size_t(2, clamp)
	}
}

// decideMultiByteStatsLevel walks len bytes of data starting at pos (every
// index is wrapped with mask), tallies how often each utf8Position class
// occurs, and returns the multi-byte modeling level to use: 0 when classes 1
// and 2 together occur fewer than 25 times, otherwise 1.
func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {
	var tally [3]uint
	var prev uint
	for off := uint(0); off < len; off++ {
		cur := uint(data[(pos+off)&mask])
		tally[utf8Position(prev, cur, 2)]++
		prev = cur
	}

	/* Hardly any multi-byte positions were seen. */
	if tally[1]+tally[2] < 25 {
		return 0
	}

	/* should be 2, but 1 compresses better. The count of class 2 alone
	   therefore never changes the answer. */
	return 1
}

// estimateBitCostsForLiteralsUTF8 writes into cost[i], for every i in
// [0, len), an estimated bit cost for the literal data[(pos+i)&mask].
//
// Each byte is assigned a position class (via utf8Position on the two bytes
// before it) and counted in the histogram of that class. The histograms
// follow a sliding window around the current offset, and the cost is derived
// from the class total and the count of the byte's value within that class.
func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {
	const halfWindow uint = 495

	/* level is 0 (normal ASCII single byte modeling),
	   1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
	level := decideMultiByteStatsLevel(pos, len, mask, data)

	var hist [3][256]uint
	var total [3]uint

	/* contextAt returns the position class of the byte at offset k, decided
	   by the two bytes before it. Bytes before offset 0 count as 0. */
	contextAt := func(k uint) uint {
		var prev, beforePrev uint
		if k >= 1 {
			prev = uint(data[(pos+k-1)&mask])
		}
		if k >= 2 {
			beforePrev = uint(data[(pos+k-2)&mask])
		}
		return utf8Position(beforePrev, prev, level)
	}

	/* Bootstrap histograms. */
	seed := brotli_min_size_t(halfWindow, len)
	for k := uint(0); k < seed; k++ {
		slot := contextAt(k)
		hist[slot][data[(pos+k)&mask]]++
		total[slot]++
	}

	/* Compute bit costs with sliding window. */
	for i := uint(0); i < len; i++ {
		/* Remove a byte in the past. */
		if i >= halfWindow {
			leaving := i - halfWindow
			slot := contextAt(leaving)
			hist[slot][data[(pos+leaving)&mask]]--
			total[slot]--
		}

		/* Add a byte in the future. */
		if i+halfWindow < len {
			entering := i + halfWindow
			slot := contextAt(entering)
			hist[slot][data[(pos+entering)&mask]]++
			total[slot]++
		}

		slot := contextAt(i)
		seen := hist[slot][data[(pos+i)&mask]]
		if seen == 0 {
			/* Treat an empty bucket as a single occurrence. */
			seen = 1
		}

		var bits float64 = fastLog2(total[slot]) - fastLog2(seen)
		bits += 0.02905
		if bits < 1.0 {
			/* Estimates below one bit are halved and raised by 0.5. */
			bits = bits*0.5 + 0.5
		}

		/* Make the first bytes more expensive -- seems to help, not sure why.
		   Perhaps because the entropy source is changing its properties
		   rapidly in the beginning of the file, perhaps because the beginning
		   of the data is a statistical "anomaly". */
		if i < 2000 {
			bits += 0.7 - (float64(2000-i) / 2000.0 * 0.35)
		}

		cost[i] = float32(bits)
	}
}

// estimateBitCostsForLiterals writes into cost[i], for every i in [0, len),
// an estimated bit cost for the literal data[(pos+i)&mask].
//
// When isMostlyUTF8 reports true for the input, the work is handed to
// estimateBitCostsForLiteralsUTF8. Otherwise a single byte histogram over a
// sliding window around the current offset is used.
func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {
	if isMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio) {
		estimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost)
		return
	}

	const halfWindow uint = 2000

	var hist [256]uint

	/* Bootstrap histogram. */
	windowSize := brotli_min_size_t(halfWindow, len)
	for k := uint(0); k < windowSize; k++ {
		hist[data[(pos+k)&mask]]++
	}

	/* Compute bit costs with sliding window. */
	for i := uint(0); i < len; i++ {
		/* Remove a byte in the past. */
		if i >= halfWindow {
			hist[data[(pos+i-halfWindow)&mask]]--
			windowSize--
		}

		/* Add a byte in the future. */
		if i+halfWindow < len {
			hist[data[(pos+i+halfWindow)&mask]]++
			windowSize++
		}

		seen := hist[data[(pos+i)&mask]]
		if seen == 0 {
			/* Treat an empty bucket as a single occurrence. */
			seen = 1
		}

		var bits float64 = fastLog2(windowSize) - fastLog2(seen)
		bits += 0.029
		if bits < 1.0 {
			/* Estimates below one bit are halved and raised by 0.5. */
			bits = bits*0.5 + 0.5
		}

		cost[i] = float32(bits)
	}
}
Dump: add output format tar and output to stdout (#10376) * Dump: Use mholt/archive/v3 to support tar including many compressions Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Allow dump output to stdout Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: Fixed bug present since #6677 where SessionConfig.Provider is never "file" Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never pack RepoRootPath, LFS.ContentPath and LogRootPath when they are below AppDataPath Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: also dump LFS (fixes #10058) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Dump: never dump CustomPath if CustomPath is a subdir of or equal to AppDataPath (fixes #10365) Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * Use log.Info instead of fmt.Fprintf Signed-off-by: Philipp Homann <homann.philipp@googlemail.com> * import ordering * make fmt Co-authored-by: zeripath <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io> Co-authored-by: Matti R <matti@mdranta.net> 2020-06-05 22:47:39 +02:00			`package brotli`

			`func utf8Position(last uint, c uint, clamp uint) uint {`
			`if c < 128 {`
			`return 0 /* Next one is the 'Byte 1' again. */`
			`} else if c >= 192 { /* Next one is the 'Byte 2' of utf-8 encoding. */`
			`return brotli_min_size_t(1, clamp)`
			`} else {`
			`/* Let's decide over the last byte if this ends the sequence. */`
			`if last < 0xE0 {`
			`return 0 /* Completed two or three byte coding. / / Next one is the 'Byte 3' of utf-8 encoding. */`
			`} else {`
			`return brotli_min_size_t(2, clamp)`
			`}`
			`}`
			`}`

			`func decideMultiByteStatsLevel(pos uint, len uint, mask uint, data []byte) uint {`
			`var counts = [3]uint{0} /* should be 2, but 1 compresses better. */`
			`var max_utf8 uint = 1`
			`var last_c uint = 0`
			`var i uint`
			`for i = 0; i < len; i++ {`
			`var c uint = uint(data[(pos+i)&mask])`
			`counts[utf8Position(last_c, c, 2)]++`
			`last_c = c`
			`}`

			`if counts[2] < 500 {`
			`max_utf8 = 1`
			`}`

			`if counts[1]+counts[2] < 25 {`
			`max_utf8 = 0`
			`}`

			`return max_utf8`
			`}`

			`func estimateBitCostsForLiteralsUTF8(pos uint, len uint, mask uint, data []byte, cost []float32) {`
			`var max_utf8 uint = decideMultiByteStatsLevel(pos, uint(len), mask, data)`
			`/* Bootstrap histograms. */`
			`var histogram = [3][256]uint{[256]uint{0}}`
			`var window_half uint = 495`
			`var in_window uint = brotli_min_size_t(window_half, uint(len))`
			`var in_window_utf8 = [3]uint{0}`
			`/* max_utf8 is 0 (normal ASCII single byte modeling),`
			`1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */`

			`var i uint`
			`{`
			`var last_c uint = 0`
			`var utf8_pos uint = 0`
			`for i = 0; i < in_window; i++ {`
			`var c uint = uint(data[(pos+i)&mask])`
			`histogram[utf8_pos][c]++`
			`in_window_utf8[utf8_pos]++`
			`utf8_pos = utf8Position(last_c, c, max_utf8)`
			`last_c = c`
			`}`
			`}`

			`/* Compute bit costs with sliding window. */`
			`for i = 0; i < len; i++ {`
			`if i >= window_half {`
			`var c uint`
			`var last_c uint`
			`if i < window_half+1 {`
			`c = 0`
			`} else {`
			`c = uint(data[(pos+i-window_half-1)&mask])`
			`}`
			`if i < window_half+2 {`
			`last_c = 0`
			`} else {`
			`last_c = uint(data[(pos+i-window_half-2)&mask])`
			`}`
			`/* Remove a byte in the past. */`

			`var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)`
			`histogram[utf8_pos2][data[(pos+i-window_half)&mask]]--`
			`in_window_utf8[utf8_pos2]--`
			`}`

			`if i+window_half < len {`
			`var c uint = uint(data[(pos+i+window_half-1)&mask])`
			`var last_c uint = uint(data[(pos+i+window_half-2)&mask])`
			`/* Add a byte in the future. */`

			`var utf8_pos2 uint = utf8Position(last_c, c, max_utf8)`
			`histogram[utf8_pos2][data[(pos+i+window_half)&mask]]++`
			`in_window_utf8[utf8_pos2]++`
			`}`
			`{`
			`var c uint`
			`var last_c uint`
			`if i < 1 {`
			`c = 0`
			`} else {`
			`c = uint(data[(pos+i-1)&mask])`
			`}`
			`if i < 2 {`
			`last_c = 0`
			`} else {`
			`last_c = uint(data[(pos+i-2)&mask])`
			`}`
			`var utf8_pos uint = utf8Position(last_c, c, max_utf8)`
			`var masked_pos uint = (pos + i) & mask`
			`var histo uint = histogram[utf8_pos][data[masked_pos]]`
			`var lit_cost float64`
			`if histo == 0 {`
			`histo = 1`
			`}`

			`lit_cost = fastLog2(in_window_utf8[utf8_pos]) - fastLog2(histo)`
			`lit_cost += 0.02905`
			`if lit_cost < 1.0 {`
			`lit_cost *= 0.5`
			`lit_cost += 0.5`
			`}`

			`/* Make the first bytes more expensive -- seems to help, not sure why.`
			`Perhaps because the entropy source is changing its properties`
			`rapidly in the beginning of the file, perhaps because the beginning`
			`of the data is a statistical "anomaly". */`
			`if i < 2000 {`
			`lit_cost += 0.7 - (float64(2000-i) / 2000.0 * 0.35)`
			`}`

			`cost[i] = float32(lit_cost)`
			`}`
			`}`
			`}`

			`func estimateBitCostsForLiterals(pos uint, len uint, mask uint, data []byte, cost []float32) {`
			`if isMostlyUTF8(data, pos, mask, uint(len), kMinUTF8Ratio) {`
			`estimateBitCostsForLiteralsUTF8(pos, uint(len), mask, data, cost)`
			`return`
			`} else {`
			`var histogram = [256]uint{0}`
			`var window_half uint = 2000`
			`var in_window uint = brotli_min_size_t(window_half, uint(len))`
			`var i uint`
			`/* Bootstrap histogram. */`
			`for i = 0; i < in_window; i++ {`
			`histogram[data[(pos+i)&mask]]++`
			`}`

			`/* Compute bit costs with sliding window. */`
			`for i = 0; i < len; i++ {`
			`var histo uint`
			`if i >= window_half {`
			`/* Remove a byte in the past. */`
			`histogram[data[(pos+i-window_half)&mask]]--`

			`in_window--`
			`}`

			`if i+window_half < len {`
			`/* Add a byte in the future. */`
			`histogram[data[(pos+i+window_half)&mask]]++`

			`in_window++`
			`}`

			`histo = histogram[data[(pos+i)&mask]]`
			`if histo == 0 {`
			`histo = 1`
			`}`
			`{`
			`var lit_cost float64 = fastLog2(in_window) - fastLog2(histo)`
			`lit_cost += 0.029`
			`if lit_cost < 1.0 {`
			`lit_cost *= 0.5`
			`lit_cost += 0.5`
			`}`

			`cost[i] = float32(lit_cost)`
			`}`
			`}`
			`}`
			`}`