From a370c525ba95e6470d77634598b944d934e8e2e2 Mon Sep 17 00:00:00 2001
From: Evgenii Stratonikov
Date: Tue, 28 Dec 2021 15:24:46 +0300
Subject: [PATCH] Replace all SSE instructions with AVX ones

Also use integer MOV* variant instead of floating-point one.

Signed-off-by: Evgenii Stratonikov
---
 tz/avx_amd64.s | 60 +++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s
index d8b5e53..7c867b7 100644
--- a/tz/avx_amd64.s
+++ b/tz/avx_amd64.s
@@ -5,12 +5,12 @@
 #define mul2(FROM, TO, R2, R3) \
 	VPSLLQ $1, FROM, TO \
 	VPALIGNR $8, TO, FROM, R2 \
-	PSRLQ $63, R2 \
-	MOVUPD ·x127x63(SB), R3 \
-	ANDPD TO, R3 \
+	VPSRLQ $63, R2, R2 \
+	VMOVDQU ·x127x63(SB), R3 \
+	VANDPD TO, R3, R3 \
 	VPUNPCKHQDQ R3, R3, R3 \
-	XORPD R2, TO \
-	XORPD R3, TO
+	VXORPD R2, TO, TO \
+	VXORPD R3, TO, TO
 
 #define mask(bit, src, tmp, to1, to2) \
 	MOVQ src, tmp \
@@ -24,59 +24,59 @@
 
 //https://software.intel.com/en-us/forums/intel-isa-extensions/topic/301461
 #define mulBit(bit) \
-	MOVUPD X0, X8 \
-	MOVUPD X2, X9 \
+	VMOVDQU X0, X8 \
+	VMOVDQU X2, X9 \
 	mul2(X0, X5, X6, X7) \
 	VXORPD X1, X5, X0 \
 	mul2(X2, X5, X6, X7) \
 	VXORPD X3, X5, X2 \
 	mask(bit, CX, DX, X6, X5) \
 	VANDPD X0, X5, X1 \
-	XORPD X8, X1 \
+	VXORPD X8, X1, X1 \
 	VANDPD X2, X5, X3 \
-	XORPD X9, X3
+	VXORPD X9, X3, X3
 
 // func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
 TEXT ·mulBitRight(SB),NOSPLIT,$0
 	MOVQ c00+0(FP), AX
-	MOVUPD (AX), X0
-	MOVUPD X0, X8 // remember c00 value
+	VMOVDQU (AX), X0
+	VMOVDQU X0, X8 // remember c00 value
 	MOVQ c01+8(FP), BX
-	MOVUPD (BX), X1
+	VMOVDQU (BX), X1
 	MOVQ c10+16(FP), CX
-	MOVUPD (CX), X2
-	MOVUPD X2, X9 // remember c10 value
+	VMOVDQU (CX), X2
+	VMOVDQU X2, X9 // remember c10 value
 	MOVQ c11+24(FP), DX
-	MOVUPD (DX), X3
+	VMOVDQU (DX), X3
 
 	mul2(X0, X5, X6, X7) // c00 *= 2
 	VXORPD X5, X1, X0    // c00 += c01
 	mul2(X2, X5, X6, X7) // c10 *= 2
 	VXORPD X3, X5, X2    // c10 += c11
 	MOVQ e+32(FP), CX
-	MOVUPD (CX), X5
+	VMOVDQU (CX), X5
 
 	VANDPD X0, X5, X1 // c01 = c00 + e
-	XORPD X8, X1      // c01 += X8 (old c00)
+	VXORPD X8, X1, X1 // c01 += X8 (old c00)
 	VANDPD X2, X5, X3 // c11 = c10 + e
-	XORPD X9, X3      // c11 += x9 (old c10)
+	VXORPD X9, X3, X3 // c11 += x9 (old c10)
 
-	MOVUPD X0, (AX)
+	VMOVDQU X0, (AX)
 	MOVQ c10+16(FP), CX
-	MOVUPD X2, (CX)
-	MOVUPD X1, (BX)
-	MOVUPD X3, (DX)
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
+	VMOVDQU X3, (DX)
 	RET
 
 TEXT ·mulByteRight(SB),NOSPLIT,$0
 	MOVQ c00+0(FP), AX
-	MOVUPD (AX), X0
+	VMOVDQU (AX), X0
 	MOVQ c01+8(FP), BX
-	MOVUPD (BX), X1
+	VMOVDQU (BX), X1
 	MOVQ c10+16(FP), CX
-	MOVUPD (CX), X2
+	VMOVDQU (CX), X2
 	MOVQ c11+24(FP), DX
-	MOVUPD (DX), X3
+	VMOVDQU (DX), X3
 	MOVB b+32(FP), CX
 
 	mulBit($7)
@@ -88,11 +88,11 @@ TEXT ·mulByteRight(SB),NOSPLIT,$0
 	mulBit($1)
 	mulBit($0)
 
-	MOVUPD X0, (AX)
+	VMOVDQU X0, (AX)
 	MOVQ c10+16(FP), CX
-	MOVUPD X2, (CX)
-	MOVUPD X1, (BX)
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
 	MOVQ c11+24(FP), DX
-	MOVUPD X3, (DX)
+	VMOVDQU X3, (DX)
 	RET
 