From 73d978c31e9516f7523cca05995cc6f311f0cb29 Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov Date: Mon, 10 Jan 2022 18:01:15 +0300 Subject: [PATCH] Rewrite AVX2 loop in assembly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moving the per-byte loop into assembly means the operands are loaded (MOV) and the constants are generated once per call instead of once per input byte. ``` name old time/op new time/op delta Sum/AVX2Inline_digest-8 1.57ms ± 2% 1.41ms ± 0% -10.52% (p=0.000 n=9+9) name old speed new speed delta Sum/AVX2Inline_digest-8 63.6MB/s ± 1% 71.1MB/s ± 0% +11.76% (p=0.000 n=9+9) ``` Signed-off-by: Evgenii Stratonikov --- tz/avx2_amd64.s | 18 +++++++++++++++--- tz/avx2_inline.go | 6 +++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tz/avx2_amd64.s b/tz/avx2_amd64.s index a9188d8..4e974fd 100644 --- a/tz/avx2_amd64.s +++ b/tz/avx2_amd64.s @@ -18,8 +18,8 @@ VPAND out_1, Y2, Y4 \ VPXOR Y4, in_1, out_2 \ -// func mulByteRightx2(c00c10, c01c11 *[4]uint64, b byte) -TEXT ·mulByteRightx2(SB),NOSPLIT,$0 +// func mulByteSliceRightx2(c00c10, c01c11 *[4]uint64, n int, data *byte) +TEXT ·mulByteSliceRightx2(SB),NOSPLIT,$0 MOVQ c00c10+0(FP), AX VMOVDQU (AX), Y0 MOVQ c01c11+8(FP), BX @@ -31,8 +31,17 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0 VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1) VPSLLQ $63, Y10, Y14 // Y14 = 0x10000000... (packed quad-words with HSB set) - VPBROADCASTB b+16(FP), X10 // X10 = packed bytes of b. + MOVQ n+16(FP), CX + MOVQ data+24(FP), DX + +loop: + CMPQ CX, $0 + JEQ finish + SUBQ $1, CX + + VPBROADCASTB (DX), X10 // X10 = packed bytes of the current data byte, read from (DX). VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
+ ADDQ $1, DX mulBit($7, Y0, Y8, Y5, Y6) mulBit($6, Y5, Y6, Y0, Y8) @@ -43,6 +52,9 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0 mulBit($1, Y0, Y8, Y5, Y6) mulBit($0, Y5, Y6, Y0, Y8) + JMP loop + +finish: VMOVDQU Y8, (BX) VMOVDQU Y0, (AX) diff --git a/tz/avx2_inline.go b/tz/avx2_inline.go index 09afbc1..3e68c3d 100644 --- a/tz/avx2_inline.go +++ b/tz/avx2_inline.go @@ -25,8 +25,8 @@ func newAVX2Inline() *digest3 { func (d *digest3) Write(data []byte) (n int, err error) { n = len(data) - for _, b := range data { - mulByteRightx2(&d.x[0], &d.x[1], b) + if len(data) != 0 { + mulByteSliceRightx2(&d.x[0], &d.x[1], n, &data[0]) } return } @@ -56,4 +56,4 @@ func (d *digest3) checkSum() (b [hashSize]byte) { return } -func mulByteRightx2(c00c10 *avx2.GF127x2, c01c11 *avx2.GF127x2, b byte) +func mulByteSliceRightx2(c00c10 *avx2.GF127x2, c01c11 *avx2.GF127x2, n int, data *byte)