272 lines
8.5 KiB
Plaintext
272 lines
8.5 KiB
Plaintext
|
// TITLE("Compute Checksum")
|
|||
|
//++
|
|||
|
//
|
|||
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|||
|
//
|
|||
|
// Module Name:
|
|||
|
//
|
|||
|
// xsum.s
|
|||
|
//
|
|||
|
// Abstract:
|
|||
|
//
|
|||
|
// This module implements a function to compute the checksum of a buffer.
|
|||
|
//
|
|||
|
// Author:
|
|||
|
//
|
|||
|
// John Vert (jvert) 11-Jul-1994
|
|||
|
//
|
|||
|
// Environment:
|
|||
|
//
|
|||
|
// Revision History:
|
|||
|
//
|
|||
|
//--
|
|||
|
|
|||
|
#include "ksalpha.h"
|
|||
|
|
|||
|
|
|||
|
SBTTL("Compute Checksum")
|
|||
|
//++
|
|||
|
//
|
|||
|
// ULONG
|
|||
|
// tcpxsum (
|
|||
|
// IN ULONG Checksum,
|
|||
|
// IN PUSHORT Source,
|
|||
|
// IN ULONG Length
|
|||
|
// )
|
|||
|
//
|
|||
|
// Routine Description:
|
|||
|
//
|
|||
|
// This function computes the checksum of the specified buffer.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Checksum (a0) - Supplies the initial checksum value.
|
|||
|
//
|
|||
|
// Source (a1) - Supplies a pointer to the checksum buffer
|
|||
|
//
|
|||
|
// Length (a2) - Supplies the length of the buffer in bytes.
|
|||
|
//
|
|||
|
// Return Value:
|
|||
|
//
|
|||
|
// The computed checksum is returned as the function value.
|
|||
|
//
|
|||
|
//--
|
|||
|
|
|||
|
LEAF_ENTRY(tcpxsum)

//
// Sums the bytes of the buffer as little-endian 16-bit words using 64-bit
// one's complement (end-around carry) arithmetic, adds the caller-supplied
// initial checksum, and folds the result to 16 bits. The folded sum is
// returned as is; it is not complemented here.
//
// Register usage:
//
//      a0 - initial checksum, zero extended to 32 bits
//      a1 - current buffer address (advances as data is consumed)
//      a2 - number of bytes still to be summed
//      t6 - original buffer address (bit 0 selects the final byte swap)
//      v0 - 64-bit accumulated checksum
//      t0 - t5 - scratch
//

        zap     a0, 0xf0, a0            // clear high half of a0
        bis     a1, zero, t6            // save initial buffer address
        bis     zero, zero, v0          // clear accumulated checksum

//
// A zero-length buffer contributes nothing to the checksum. Leave before
// the first load so the source address is never dereferenced when there is
// no data to read.
//

        beq     a2, 65f                 // if zero bytes, don't touch the buffer

//
// Check if the buffer is quadword aligned.
//
// If the buffer is not quadword aligned, then add the leading bytes to the
// checksum.
//

        ldq_u   t0, 0(a1)               // get containing quadword of first part
        blbc    a1, 10f                 // if low bit clear, buffer is word aligned

//
// The buffer starts on an odd address. Seed the accumulator with the
// leading byte shifted up by one byte; the whole sum is byte swapped at
// label 60 to compensate for the odd starting offset.
//

        extbl   t0, a1, t1              // get leading byte
        sll     t1, 8, v0               // shift it to correct spot for later byte swap
        addq    a1, 1, a1               // increment buffer to first full word
        subq    a2, 1, a2               // decrement byte count

10:
        and     a1, 6, t2               // check if buffer quadword aligned
        beq     t2, 20f                 // if eq, quadword aligned
        extql   t0, t2, t0              // extract bytes to checksum
        and     a1, 7, t3               // compute offset within quadword
        subq    zero, t3, t3            //
        addq    t3, 8, t3               // t3 = bytes from a1 to next quadword
        addq    a1, 8, a1               // advance buffer address to next qword
        bic     a1, 7, a1               //
        subq    a2, t3, t2              // t2 = bytes left after leading part

//
// If the buffer ends inside this first quadword, a2 still holds the true
// byte count and t0 holds the extracted bytes; label 55 masks off the bytes
// past the end of the buffer.
//

        blt     t2, 55f                 // if ltz, too many, jump to residual code

        addq    v0, t0, v0              // add bytes to partial checksum
        cmpult  v0, t0, t1              // generate carry
        addq    t1, v0, v0              // add carry back into checksum

        bis     t2, zero, a2            // reduce count of bytes to checksum
        beq     t2, 60f                 // if eq, no more bytes

20:
//
// Compute the checksum in 64-byte blocks.
//
// When the quadword byte count is not a multiple of 64, the first pass
// enters the code vector at label 30 part way through, so that only the
// leftover quadwords are summed. t3 is the number of bytes that pass falls
// short of 64; the buffer address is biased down by t3 so the fixed load
// offsets in the vector address the right data. Each quadword (8 data
// bytes) is handled by four 4-byte instructions (16 code bytes), hence t3
// is doubled to form the code offset.
//

        bic     a2, 7, t4               // subtract out residual bytes
        beq     t4, 40f                 // if eq, no quadwords to checksum
        subq    zero, t4, t2            // compute negative of byte count
        and     t2, 15 << 2, t3         // compute bytes to skip in first iteration
        ldq     t0, 0(a1)               // get first quadword to checksum
        beq     t3, 35f                 // if eq, full 64-byte block
        subq    a1, t3, a1              // bias buffer address by offset
        bic     t4, 64-1, t4            // subtract out bytes in first iteration
        lda     t2, 30f                 // get base address of code vector
        addl    t3, t3, t3              // two code bytes per data byte
        addq    t3, t2, t2              // compute code vector offset
        bis     t0, zero, t1            // copy first quadword; entry group may add t0 or t1
        jmp     (t2)                    // dispatch

30:
//
// The following code vector computes the checksum of a 64-byte block.
//
// Each group of four instructions sums one quadword and preloads the next.
// Reordering is disabled because the computed jump above depends on the
// exact instruction layout.
//

        .set    noreorder
        ldq     t1, 8(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 16(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 24(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 32(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 40(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 48(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 56(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        addq    a1, 64, a1
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0
        .set    reorder

        beq     t4, 40f                 // if zero, end of block

35:
        ldq     t0, 0(a1)
//
// The following loop is allowed to be reordered by the assembler for
// optimal scheduling. It is never branched into.
//
        subq    t4, 64, t4              // reduce count of bytes left in 64-byte blocks

        ldq     t1, 8(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 16(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 24(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 32(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 40(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        ldq     t0, 48(a1)
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        ldq     t1, 56(a1)
        addq    v0, t0, v0
        cmpult  v0, t0, t2
        addq    v0, t2, v0

        addq    a1, 64, a1
        addq    v0, t1, v0
        cmpult  v0, t1, t2
        addq    v0, t2, v0

        bne     t4, 35b                 // if ne zero, not end of block

40:
//
// Check for any remaining bytes.
//
        and     a2, 7, a2               // isolate residual bytes
        beq     a2, 60f                 // if eq, no residual bytes
50:
//
// Checksum remaining bytes.
//
// The technique we use here is to load the final quadword, then
// zero out the bytes that are not included.
//
        ldq     t0, 0(a1)               // get quadword surrounding remainder
55:
//
// On entry t0 holds the candidate bytes and a2 the number of them that
// belong to the buffer. The low eight bits of (all ones << a2) select
// bytes a2 through 7 for zeroing.
//
        ornot   zero, zero, t1          // get FF mask
        sll     t1, a2, t2              // shift to produce byte mask
        zap     t0, t2, t0              // zero out bytes past end of buffer
        addq    v0, t0, v0              // add quadword to partial checksum
        cmpult  v0, t0, t1              // generate carry
        addq    t1, v0, v0              // add carry back into checksum
60:
//
// Byte swap the 64-bit checksum if the start of the buffer was not word aligned
//
        blbc    t6, 65f                 // if low bit clear, start was word aligned
        zap     v0, 0xAA, t0            // isolate even bytes
        sll     t0, 8, t0               // shift even bytes into odd positions
        srl     v0, 8, t1               // shift odd bytes into even positions
        zap     t1, 0xAA, t1            // isolate odd bytes
        bis     t0, t1, v0              // merge bytes back together

65:
//
// add computed checksum to original checksum, and fold the 64-bit
// result down to 16 bits.
//
        addq    v0, a0, v0              // add computed checksum to original
        cmpult  v0, a0, t0              // generate carry
        addq    v0, t0, v0              // add carry back into checksum

//
// swap the longwords in order to sum two longwords and their carry in one add.
//
        sll     v0, 32, t0              // shift low longword into high
        srl     v0, 32, t1              // shift high longword into low
        bis     t1, t0, t5              // merge back together

        addq    v0, t5, t0              // produce sum + carry in high longword
        srl     t0, 32, t1              // shift back down to low half
//
// swap the words in order to sum two words and their carry in one add
//
        sll     t1, 16, t2              // shift low word into high
        srl     t1, 16, t3              // shift high word into low
        bis     t2, t3, t4              // merge back together
        addq    t4, t1, t2              // produce sum and carry in high word
        extwl   t2, 2, v0               // extract result.
        ret     zero, (ra)              // return

        .end    tcpxsum
|
|||
|
|