diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s
index 7c867b7..3648741 100644
--- a/tz/avx_amd64.s
+++ b/tz/avx_amd64.s
@@ -6,22 +6,15 @@
 	VPSLLQ $1, FROM, TO       \
 	VPALIGNR $8, TO, FROM, R2 \
 	VPSRLQ $63, R2, R2        \
-	VMOVDQU ·x127x63(SB), R3  \
-	VANDPD TO, R3, R3         \
+	VANDPD TO, X14, R3        \
 	VPUNPCKHQDQ R3, R3, R3    \
 	VXORPD R2, TO, TO         \
 	VXORPD R3, TO, TO
 
-#define mask(bit, src, tmp, to1, to2) \
-	MOVQ src, tmp             \
-	SHRQ bit, tmp             \
-	ANDQ $1, tmp              \
-	NEGQ tmp                  \
-	MOVQ tmp, to1             \
-	VSHUFPS $0, to1, to1, to2
-	// VPBROADCASTB to1, to2
-	// Can't use VPBROADCASTB because it is AVX2 instruction
-	//https://software.intel.com/en-us/forums/intel-isa-extensions/topic/301461
+#define mask(bit, tmp, to) \
+	VPSRLW bit, X10, tmp \
+	VPAND X12, tmp, to   \ // to = 0x0001 or 0x0000 in every word (the selected bit of b)
+	VPSUBW to, X13, to     // to = 0xFFFF... or 0x0000... depending on the bit
 
 #define mulBit(bit) \
 	VMOVDQU X0, X8            \
@@ -30,7 +23,7 @@
 	VXORPD X1, X5, X0         \
 	mul2(X2, X5, X6, X7)      \
 	VXORPD X3, X5, X2         \
-	mask(bit, CX, DX, X6, X5) \
+	mask(bit, X6, X5)         \
 	VANDPD X0, X5, X1         \
 	VXORPD X8, X1, X1         \
 	VANDPD X2, X5, X3         \
@@ -49,6 +42,11 @@ TEXT ·mulBitRight(SB),NOSPLIT,$0
 	MOVQ c11+24(FP), DX
 	VMOVDQU (DX), X3
 
+	VPXOR X13, X13, X13    // X13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
+	VPSUBQ X14, X13, X13   // X13 = 0x0000000000000001 (packed quad-words of 1)
+	VPSLLQ $63, X13, X14   // X14 = 0x8000000000000000 (MSB of every quad-word set)
+
 	mul2(X0, X5, X6, X7)   // c00 *= 2
 	VXORPD X5, X1, X0      // c00 += c01
 	mul2(X2, X5, X6, X7)   // c10 *= 2
@@ -77,8 +75,19 @@ TEXT ·mulByteRight(SB),NOSPLIT,$0
 	VMOVDQU (CX), X2
 	MOVQ c11+24(FP), DX
 	VMOVDQU (DX), X3
+	MOVQ $0, CX            // zero CX so the byte load below leaves no stale high bits
 	MOVB b+32(FP), CX
 
+	VPXOR X13, X13, X13    // X13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
+	VPSUBQ X14, X13, X10   // X10 = 0x0000000000000001 (packed quad-words of 1)
+	VPSUBW X14, X13, X12   // X12 = 0x00010001... (packed words of 1)
+	VPSLLQ $63, X10, X14   // X14 = 0x8000000000000000 (packed quad-words with MSB set)
+
+	MOVQ CX, X10           // X10 = b in the low quad-word
+	VPSHUFLW $0, X10, X11  // broadcast b to the four low words
+	VPSHUFD $0, X11, X10   // X10 = b in every 16-bit word
+
 	mulBit($7)
 	mulBit($6)
 	mulBit($5)
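
For readers less familiar with this SSE trick, the sketch below is a minimal pure-Go model of what one expansion of the rewritten mask() macro computes, given the setup in the diff (X10 = the byte b broadcast into every 16-bit word, X12 = packed words of 1, X13 = zero). The function name, lane count, and main() driver are illustrative only and are not part of the tzhash code.

// mask_sketch.go -- illustrative only, not part of the repository.
package main

import "fmt"

// maskLanes models one mask(bit, tmp, to) expansion: every 16-bit lane of
// the result is 0xFFFF when bit `bit` of b is set and 0x0000 otherwise.
func maskLanes(b byte, bit uint) [8]uint16 {
	var lanes [8]uint16
	for i := range lanes {
		w := uint16(b)   // MOVQ/VPSHUFLW/VPSHUFD: b broadcast into every word (X10)
		w >>= bit        // VPSRLW bit, X10, tmp
		w &= 1           // VPAND X12, tmp, to   (X12 = packed words of 1)
		lanes[i] = 0 - w // VPSUBW to, X13, to   (X13 = 0; wraps to 0xFFFF when w == 1)
	}
	return lanes
}

func main() {
	fmt.Printf("%#04x\n", maskLanes(0xA2, 7)) // bit 7 of 0xA2 is set   -> all 0xffff
	fmt.Printf("%#04x\n", maskLanes(0xA2, 6)) // bit 6 of 0xA2 is clear -> all 0x0000
}

The wrap-around subtraction (0 - bit) is what lets VPSUBW turn a 0/1 word into an all-zeros/all-ones mask without a branch or a general-purpose register, which is the point of dropping the old MOVQ/SHRQ/ANDQ/NEGQ sequence and the memory-resident ·x127x63 constant.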