diff --git a/tz/avx2_inline_amd64.s b/tz/avx2_amd64.s
similarity index 62%
rename from tz/avx2_inline_amd64.s
rename to tz/avx2_amd64.s
index fb7f83a..200163c 100644
--- a/tz/avx2_inline_amd64.s
+++ b/tz/avx2_amd64.s
@@ -44,3 +44,29 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0
     VMOVDQA Y0, (AX)
 
     RET
+
+// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
+TEXT ·mulBitRightx2(SB),NOSPLIT,$0  // rewrites *c00c10 and *c01c11 in place; *e is only read
+    MOVQ c00c10+0(FP), AX           // AX = c00c10 (first argument)
+    VMOVDQA (AX), Y0                // Y0 = 32 bytes at c00c10; NOTE(review): aligned load, faults unless AX is 32-byte aligned - confirm callers
+    MOVQ c01c11+8(FP), BX           // BX = c01c11 (second argument)
+    VMOVDQA (BX), Y8                // Y8 = 32 bytes at c01c11 (same alignment requirement)
+    // Per 128-bit lane: Y3 = Y0 shifted left one bit; if bit 127 of the shifted value is set it is cleared and bits 63 and 0 are flipped.
+    VPSLLQ $1, Y0, Y1               // Y1 = each 64-bit word of Y0 << 1
+    VPALIGNR $8, Y1, Y0, Y2         // per lane: Y2.lo = Y1.hi, Y2.hi = Y0.lo
+    VPSRLQ $63, Y2, Y2              // keep only the top bit of each word, moved down to bit 0
+    VPXOR Y1, Y2, Y2                // Y2 = Y1 with those bits XORed into bit 0 of each word
+    VPSRLQ $63, Y1, Y3              // Y3 = top bit of each word of Y1 ...
+    VPSLLQ $63, Y3, Y3              // ... put back at bit 63, all other bits zero
+    VPUNPCKHQDQ Y3, Y3, Y3          // copy the high word of each lane into both words
+    VPXOR Y2, Y3, Y3                // Y3 = Y2 ^ Y3: clears bit 127, flips bit 63 when bit 127 was set
+    // NOTE(review): the sequence above looks like multiply-by-x with reduction modulo x^127 + x^63 + 1 - confirm against the Go reference.
+    MOVQ e+16(FP), CX               // CX = e (third argument)
+    VBROADCASTI128 (CX), Y2         // Y2 = the 16 bytes at e, repeated in both 128-bit lanes
+    // Combine the shifted value with c01c11 and the value broadcast from e.
+    VPXOR Y3, Y8, Y3                // Y3 = Y3 ^ Y8
+    VPAND Y3, Y2, Y4                // Y4 = Y3 & Y2
+    VPXOR Y4, Y0, Y8                // Y8 = Y4 ^ Y0 (Y0 still holds the value loaded from c00c10)
+    VMOVDQA Y8, (BX)                // *c01c11 = Y8
+    VMOVDQA Y3, (AX)                // *c00c10 = Y3
+    RET                             // NOTE(review): no VZEROUPPER before return - confirm whether one is wanted
diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s
index a380b3a..d8b5e53 100644
--- a/tz/avx_amd64.s
+++ b/tz/avx_amd64.s
@@ -96,29 +96,3 @@ TEXT ·mulByteRight(SB),NOSPLIT,$0
     MOVUPD X3, (DX)
 
     RET
-
-// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
-TEXT ·mulBitRightx2(SB),NOSPLIT,$0
-    MOVQ c00c10+0(FP), AX
-    VMOVDQA (AX), Y0
-    MOVQ c01c11+8(FP), BX
-    VMOVDQA (BX), Y8
-
-    VPSLLQ $1, Y0, Y1
-    VPALIGNR $8, Y1, Y0, Y2
-    VPSRLQ $63, Y2, Y2
-    VPXOR Y1, Y2, Y2
-    VPSRLQ $63, Y1, Y3
-    VPSLLQ $63, Y3, Y3
-    VPUNPCKHQDQ Y3, Y3, Y3
-    VPXOR Y2, Y3, Y3
-
-    MOVQ e+16(FP), CX
-    VBROADCASTI128 (CX), Y2
-
-    VPXOR Y3, Y8, Y3
-    VPAND Y3, Y2, Y4
-    VPXOR Y4, Y0, Y8
-    VMOVDQA Y8, (BX)
-    VMOVDQA Y3, (AX)
-    RET