diff --git a/tz/avx2_amd64.s b/tz/avx2_amd64.s
index 9aa8a70..787dfc7 100644
--- a/tz/avx2_amd64.s
+++ b/tz/avx2_amd64.s
@@ -18,14 +18,14 @@
 	VPXOR Y3, Y8, Y3 \
 	VPAND Y3, Y2, Y4 \
 	VPXOR Y4, Y0, Y8 \
-	VMOVDQA Y3, Y0
+	VMOVDQU Y3, Y0
 
 // func mulByteRightx2(c00c10, c01c11 *[4]uint64, b byte)
 TEXT ·mulByteRightx2(SB),NOSPLIT,$0
 	MOVQ c00c10+0(FP), AX
-	VMOVDQA (AX), Y0
+	VMOVDQU (AX), Y0
 	MOVQ c01c11+8(FP), BX
-	VMOVDQA (BX), Y8
+	VMOVDQU (BX), Y8
 
 	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
 	VPCMPEQB Y12, Y12, Y12 // Y12 = 0xFFFF...
@@ -43,17 +43,17 @@ TEXT ·mulByteRightx2(SB),NOSPLIT,$0
 	mulBit($1)
 	mulBit($0)
 
-	VMOVDQA Y8, (BX)
-	VMOVDQA Y0, (AX)
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y0, (AX)
 	RET
 
 // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
 TEXT ·mulBitRightx2(SB),NOSPLIT,$0
 	MOVQ c00c10+0(FP), AX
-	VMOVDQA (AX), Y0
+	VMOVDQU (AX), Y0
 	MOVQ c01c11+8(FP), BX
-	VMOVDQA (BX), Y8
+	VMOVDQU (BX), Y8
 
 	VPSLLQ $1, Y0, Y1
 	VPALIGNR $8, Y1, Y0, Y2
@@ -70,6 +70,6 @@ TEXT ·mulBitRightx2(SB),NOSPLIT,$0
 	VPXOR Y3, Y8, Y3
 	VPAND Y3, Y2, Y4
 	VPXOR Y4, Y0, Y8
-	VMOVDQA Y8, (BX)
-	VMOVDQA Y3, (AX)
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y3, (AX)
 	RET