From 921f8b057906c3807b77346a3e0ba0bc994fe629 Mon Sep 17 00:00:00 2001 From: Evgenii Stratonikov <evgeniy@nspcc.ru> Date: Wed, 29 Dec 2021 15:56:02 +0300 Subject: [PATCH] Optimize AVX implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Do the same mask trick as with AVX2. 2. Get rid of load, generate constant on the fly. ``` name old time/op new time/op delta Sum/AVXInline_digest-8 2.26ms ± 4% 2.17ms ± 5% -4.05% (p=0.000 n=19+17) name old speed new speed delta Sum/AVXInline_digest-8 44.3MB/s ± 4% 46.2MB/s ± 5% +4.25% (p=0.000 n=19+17) ``` Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru> --- tz/avx_amd64.s | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s index 7c867b7..3648741 100644 --- a/tz/avx_amd64.s +++ b/tz/avx_amd64.s @@ -6,22 +6,15 @@ VPSLLQ $1, FROM, TO \ VPALIGNR $8, TO, FROM, R2 \ VPSRLQ $63, R2, R2 \ - VMOVDQU ·x127x63(SB), R3 \ - VANDPD TO, R3, R3 \ + VANDPD TO, X14, R3 \ VPUNPCKHQDQ R3, R3, R3 \ VXORPD R2, TO, TO \ VXORPD R3, TO, TO -#define mask(bit, src, tmp, to1, to2) \ - MOVQ src, tmp \ - SHRQ bit, tmp \ - ANDQ $1, tmp \ - NEGQ tmp \ - MOVQ tmp, to1 \ - VSHUFPS $0, to1, to1, to2 - // VPBROADCASTB to1, to2 - // Can't use VPBROADCASTB because it is AVX2 instruction - //https://software.intel.com/en-us/forums/intel-isa-extensions/topic/301461 +#define mask(bit, tmp, to) \ + VPSRLW bit, X10, tmp \ + VPAND X12, tmp, to \ // to = 0x000<bit>000<bit>... + VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit #define mulBit(bit) \ VMOVDQU X0, X8 \ @@ -30,7 +23,7 @@ VXORPD X1, X5, X0 \ mul2(X2, X5, X6, X7) \ VXORPD X3, X5, X2 \ - mask(bit, CX, DX, X6, X5) \ + mask(bit, X6, X5) \ VANDPD X0, X5, X1 \ VXORPD X8, X1, X1 \ VANDPD X2, X5, X3 \ @@ -49,6 +42,11 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0 MOVQ c11+24(FP), DX VMOVDQU (DX), X3 + VPXOR X13, X13, X13 // Y13 = 0x0000... + VPCMPEQB X14, X14, X14 // Y14 = 0xFFFF... + VPSUBQ X14, X13, X13 + VPSLLQ $63, X13, X14 + mul2(X0, X5, X6, X7) // c00 *= 2 VXORPD X5, X1, X0 // c00 += c01 mul2(X2, X5, X6, X7) // c10 *= 2 @@ -77,8 +75,19 @@ TEXT ·mulByteRight(SB),NOSPLIT,$0 VMOVDQU (CX), X2 MOVQ c11+24(FP), DX VMOVDQU (DX), X3 + MOVQ $0, CX MOVB b+32(FP), CX + VPXOR X13, X13, X13 // X13 = 0x0000... + VPCMPEQB X14, X14, X14 // X14 = 0xFFFF... + VPSUBQ X14, X13, X10 + VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1) + VPSLLQ $63, X10, X14 // X14 = 0x10000000... (packed quad-words with HSB set) + + MOVQ CX, X10 + VPSHUFLW $0, X10, X11 + VPSHUFD $0, X11, X10 + mulBit($7) mulBit($6) mulBit($5)