diff --git a/gf127/avx/gf127_amd64.s b/gf127/avx/gf127_amd64.s
index d159814..281efc6 100644
--- a/gf127/avx/gf127_amd64.s
+++ b/gf127/avx/gf127_amd64.s
@@ -1,81 +1,81 @@
 #include "textflag.h"
 
 // func Add(a, b, c *[2]uint64)
-TEXT ·Add(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	MOVQ b+8(FP), BX
-	MOVUPD (BX), X1
-	XORPD X1, X0
-	MOVQ c+16(FP), CX
-	MOVUPD X0, (CX)
-	RET
+TEXT ·Add(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	MOVQ b+8(FP), BX
+	MOVUPD (BX), X1
+	XORPD X1, X0
+	MOVQ c+16(FP), CX
+	MOVUPD X0, (CX)
+	RET
 
 // func Mul10(a, b *[2]uint64)
-TEXT ·Mul10(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	VPSLLQ $1, X0, X1
-	VPALIGNR $8, X1, X0, X2
-	PSRLQ $63, X2
-	MOVUPD ·x127x63(SB), X3
-	ANDPD X1, X3
-	VPUNPCKHQDQ X3, X3, X3
-	XORPD X2, X1
-	XORPD X3, X1
-	MOVQ b+8(FP), AX
-	MOVUPD X1, (AX)
-	RET
+TEXT ·Mul10(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	VPSLLQ $1, X0, X1
+	VPALIGNR $8, X1, X0, X2
+	PSRLQ $63, X2
+	MOVUPD ·x127x63(SB), X3
+	ANDPD X1, X3
+	VPUNPCKHQDQ X3, X3, X3
+	XORPD X2, X1
+	XORPD X3, X1
+	MOVQ b+8(FP), AX
+	MOVUPD X1, (AX)
+	RET
 
 // func Mul11(a, b *[2]uint64)
-TEXT ·Mul11(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	VPSLLQ $1, X0, X1
-	VPALIGNR $8, X1, X0, X2
-	PSRLQ $63, X2
-	MOVUPD ·x127x63(SB), X3
-	ANDPD X1, X3
-	VPUNPCKHQDQ X3, X3, X3
-	XORPD X2, X1
-	XORPD X3, X1
-	XORPD X0, X1
-	MOVQ b+8(FP), AX
-	MOVUPD X1, (AX)
-	RET
+TEXT ·Mul11(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	VPSLLQ $1, X0, X1
+	VPALIGNR $8, X1, X0, X2
+	PSRLQ $63, X2
+	MOVUPD ·x127x63(SB), X3
+	ANDPD X1, X3
+	VPUNPCKHQDQ X3, X3, X3
+	XORPD X2, X1
+	XORPD X3, X1
+	XORPD X0, X1
+	MOVQ b+8(FP), AX
+	MOVUPD X1, (AX)
+	RET
 
 // func Mul(a, b, c *[2]uint64)
-TEXT ·Mul(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX // X0 = a0 . a1
-	MOVUPD (AX), X0 // X0 = a0 . a1
-	MOVQ b+8(FP), BX // X1 = b0 . b1
-	MOVUPD (BX), X1 // X1 = b0 . b1
-	VPUNPCKLQDQ X1, X0, X2 // X2 = a0 . b0
-	VPUNPCKHQDQ X1, X0, X3 // X3 = a1 . b1
-	XORPD X2, X3 // X3 = (a0 + a1) . (b0 + b1)
-	PCLMULQDQ $0x10, X3, X3 // X3 = (a0 + a1) * (b0 + b1)
-	VPCLMULQDQ $0x00, X0, X1, X4 // X4 = a0 * b0
-	VPCLMULQDQ $0x11, X0, X1, X5 // X5 = a1 * b1
-	XORPD X4, X3 //
-	XORPD X5, X3 // X3 = a0 * b1 + a1 * b0
-	VPSLLDQ $8, X3, X2 //
-	XORPD X2, X4 // X4 = a0 * b0 + lo(X3)
-	VPSRLDQ $8, X3, X6 //
-	XORPD X6, X5 // X5 = a1 * b1 + hi(X3)
+TEXT ·Mul(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX // X0 = a0 . a1
+	MOVUPD (AX), X0 // X0 = a0 . a1
+	MOVQ b+8(FP), BX // X1 = b0 . b1
+	MOVUPD (BX), X1 // X1 = b0 . b1
+	VPUNPCKLQDQ X1, X0, X2 // X2 = a0 . b0
+	VPUNPCKHQDQ X1, X0, X3 // X3 = a1 . b1
+	XORPD X2, X3 // X3 = (a0 + a1) . (b0 + b1)
+	PCLMULQDQ $0x10, X3, X3 // X3 = (a0 + a1) * (b0 + b1)
+	VPCLMULQDQ $0x00, X0, X1, X4 // X4 = a0 * b0
+	VPCLMULQDQ $0x11, X0, X1, X5 // X5 = a1 * b1
+	XORPD X4, X3
+	XORPD X5, X3 // X3 = a0 * b1 + a1 * b0
+	VPSLLDQ $8, X3, X2
+	XORPD X2, X4 // X4 = a0 * b0 + lo(X3)
+	VPSRLDQ $8, X3, X6
+	XORPD X6, X5 // X5 = a1 * b1 + hi(X3)
 
-	// at this point, a * b = X4 . X5 (as 256-bit number)
-	// reduction modulo x^127 + x^63 + 1
-	VPALIGNR $8, X4, X5, X3
-	XORPD X5, X3
-	PSLLQ $1, X5
-	XORPD X5, X4
-	VPUNPCKHQDQ X3, X5, X5
-	XORPD X5, X4
-	PSRLQ $63, X3
-	XORPD X3, X4
-	VPUNPCKLQDQ X3, X3, X5
-	PSLLQ $63, X5
-	XORPD X5, X4
-	MOVQ c+16(FP), CX
-	MOVUPD X4, (CX)
-	RET
+	// at this point, a * b = X4 . X5 (as 256-bit number)
+	// reduction modulo x^127 + x^63 + 1
+	VPALIGNR $8, X4, X5, X3
+	XORPD X5, X3
+	PSLLQ $1, X5
+	XORPD X5, X4
+	VPUNPCKHQDQ X3, X5, X5
+	XORPD X5, X4
+	PSRLQ $63, X3
+	XORPD X3, X4
+	VPUNPCKLQDQ X3, X3, X5
+	PSLLQ $63, X5
+	XORPD X5, X4
+	MOVQ c+16(FP), CX
+	MOVUPD X4, (CX)
+	RET
diff --git a/gf127/avx2/gf127x2_amd64.s b/gf127/avx2/gf127x2_amd64.s
index 05f3700..ac708e5 100644
--- a/gf127/avx2/gf127x2_amd64.s
+++ b/gf127/avx2/gf127x2_amd64.s
@@ -1,35 +1,34 @@
 #include "textflag.h"
 
 // func Mul10x2(a, b) *[4]uint64
-TEXT ·Mul10x2(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	VMOVDQA (AX), Y0
-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
-	MOVQ b+8(FP), AX
-	VMOVDQA Y3, (AX)
-	RET
-
+TEXT ·Mul10x2(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	VMOVDQA (AX), Y0
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3
+	MOVQ b+8(FP), AX
+	VMOVDQA Y3, (AX)
+	RET
 
 // func Mul11x2(a, b) *[4]uint64
-TEXT ·Mul11x2(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	VMOVDQA (AX), Y0
-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
-	VPXOR Y0, Y3, Y3
-	MOVQ b+8(FP), AX
-	VMOVDQA Y3, (AX)
-	RET
+TEXT ·Mul11x2(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	VMOVDQA (AX), Y0
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3
+	VPXOR Y0, Y3, Y3
+	MOVQ b+8(FP), AX
+	VMOVDQA Y3, (AX)
+	RET
diff --git a/tz/avx2_amd64.s b/tz/avx2_amd64.s
index 4e974fd..0a0de51 100644
--- a/tz/avx2_amd64.s
+++ b/tz/avx2_amd64.s
@@ -1,87 +1,87 @@
 #include "textflag.h"
 
 #define mask(bit, tmp, to) \
-	VPSRLW bit, Y10, tmp \
-	VPAND Y12, tmp, to \ // to = 0x000000...
-	VPSUBW to, Y13, to // to = 0xFFFF.. or 0x0000 depending on bit
+	VPSRLW bit, Y10, tmp \
+	VPAND Y12, tmp, to \ // to = 0x000000...
+	VPSUBW to, Y13, to // to = 0xFFFF.. or 0x0000 depending on bit
 
 #define mulBit(bit, in_1, in_2, out_1, out_2) \
-	VPSLLQ $1, in_1, Y1 \
-	VPALIGNR $8, Y1, in_1, Y2 \
-	VPSRLQ $63, Y2, Y2 \
-	VPXOR Y1, Y2, Y2 \
-	VPAND Y1, Y14, Y3 \
-	VPUNPCKHQDQ Y3, Y3, Y3 \
-	VPXOR Y2, Y3, Y3 \
-	mask(bit, Y11, Y2) \
-	VPXOR Y3, in_2, out_1 \
-	VPAND out_1, Y2, Y4 \
-	VPXOR Y4, in_1, out_2 \
+	VPSLLQ $1, in_1, Y1 \
+	VPALIGNR $8, Y1, in_1, Y2 \
+	VPSRLQ $63, Y2, Y2 \
+	VPXOR Y1, Y2, Y2 \
+	VPAND Y1, Y14, Y3 \
+	VPUNPCKHQDQ Y3, Y3, Y3 \
+	VPXOR Y2, Y3, Y3 \
+	mask(bit, Y11, Y2) \
+	VPXOR Y3, in_2, out_1 \
+	VPAND out_1, Y2, Y4 \
+	VPXOR Y4, in_1, out_2 \
 
 // func mulByteSliceRightx2(c00c10, c01c11 *[4]uint64, n int, data *byte)
-TEXT ·mulByteSliceRightx2(SB),NOSPLIT,$0
-	MOVQ c00c10+0(FP), AX
-	VMOVDQU (AX), Y0
-	MOVQ c01c11+8(FP), BX
-	VMOVDQU (BX), Y8
+TEXT ·mulByteSliceRightx2(SB), NOSPLIT, $0
+	MOVQ c00c10+0(FP), AX
+	VMOVDQU (AX), Y0
+	MOVQ c01c11+8(FP), BX
+	VMOVDQU (BX), Y8
 
-	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
-	VPCMPEQB Y14, Y14, Y14 // Y14 = 0xFFFF...
-	VPSUBQ Y14, Y13, Y10
-	VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1)
-	VPSLLQ $63, Y10, Y14 // Y14 = 0x10000000... (packed quad-words with HSB set)
+	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
+	VPCMPEQB Y14, Y14, Y14 // Y14 = 0xFFFF...
+	VPSUBQ Y14, Y13, Y10
+	VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1)
+	VPSLLQ $63, Y10, Y14 // Y14 = 0x10000000... (packed quad-words with HSB set)
 
-	MOVQ n+16(FP), CX
-	MOVQ data+24(FP), DX
+	MOVQ n+16(FP), CX
+	MOVQ data+24(FP), DX
 
 loop:
-	CMPQ CX, $0
-	JEQ finish
-	SUBQ $1, CX
+	CMPQ CX, $0
+	JEQ finish
+	SUBQ $1, CX
 
-	VPBROADCASTB (DX), X10 // X10 = packed bytes of b.
-	VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
-	ADDQ $1, DX
+	VPBROADCASTB (DX), X10 // X10 = packed bytes of b.
+	VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
+	ADDQ $1, DX
 
-	mulBit($7, Y0, Y8, Y5, Y6)
-	mulBit($6, Y5, Y6, Y0, Y8)
-	mulBit($5, Y0, Y8, Y5, Y6)
-	mulBit($4, Y5, Y6, Y0, Y8)
-	mulBit($3, Y0, Y8, Y5, Y6)
-	mulBit($2, Y5, Y6, Y0, Y8)
-	mulBit($1, Y0, Y8, Y5, Y6)
-	mulBit($0, Y5, Y6, Y0, Y8)
+	mulBit($7, Y0, Y8, Y5, Y6)
+	mulBit($6, Y5, Y6, Y0, Y8)
+	mulBit($5, Y0, Y8, Y5, Y6)
+	mulBit($4, Y5, Y6, Y0, Y8)
+	mulBit($3, Y0, Y8, Y5, Y6)
+	mulBit($2, Y5, Y6, Y0, Y8)
+	mulBit($1, Y0, Y8, Y5, Y6)
+	mulBit($0, Y5, Y6, Y0, Y8)
 
-	JMP loop
+	JMP loop
 
 finish:
-	VMOVDQU Y8, (BX)
-	VMOVDQU Y0, (AX)
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y0, (AX)
 
-	RET
+	RET
 
 // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
-TEXT ·mulBitRightx2(SB),NOSPLIT,$0
-	MOVQ c00c10+0(FP), AX
-	VMOVDQU (AX), Y0
-	MOVQ c01c11+8(FP), BX
-	VMOVDQU (BX), Y8
+TEXT ·mulBitRightx2(SB), NOSPLIT, $0
+	MOVQ c00c10+0(FP), AX
+	VMOVDQU (AX), Y0
+	MOVQ c01c11+8(FP), BX
+	VMOVDQU (BX), Y8
 
-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3
 
-	MOVQ e+16(FP), CX
-	VBROADCASTI128 (CX), Y2
+	MOVQ e+16(FP), CX
+	VBROADCASTI128 (CX), Y2
 
-	VPXOR Y3, Y8, Y3
-	VPAND Y3, Y2, Y4
-	VPXOR Y4, Y0, Y8
-	VMOVDQU Y8, (BX)
-	VMOVDQU Y3, (AX)
-	RET
+	VPXOR Y3, Y8, Y3
+	VPAND Y3, Y2, Y4
+	VPXOR Y4, Y0, Y8
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y3, (AX)
+	RET
diff --git a/tz/avx_amd64.s b/tz/avx_amd64.s
index 3648741..7a56a5c 100644
--- a/tz/avx_amd64.s
+++ b/tz/avx_amd64.s
@@ -3,105 +3,105 @@
 // mul2 multiplicates FROM by 2, stores result in R1
 // and uses R1, R2 and R3 for internal computations.
 #define mul2(FROM, TO, R2, R3) \
-	VPSLLQ $1, FROM, TO \
-	VPALIGNR $8, TO, FROM, R2 \
-	VPSRLQ $63, R2, R2 \
-	VANDPD TO, X14, R3 \
-	VPUNPCKHQDQ R3, R3, R3 \
-	VXORPD R2, TO, TO \
-	VXORPD R3, TO, TO
+	VPSLLQ $1, FROM, TO \
+	VPALIGNR $8, TO, FROM, R2 \
+	VPSRLQ $63, R2, R2 \
+	VANDPD TO, X14, R3 \
+	VPUNPCKHQDQ R3, R3, R3 \
+	VXORPD R2, TO, TO \
+	VXORPD R3, TO, TO
 
 #define mask(bit, tmp, to) \
-	VPSRLW bit, X10, tmp \
-	VPAND X12, tmp, to \ // to = 0x000000...
-	VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit
+	VPSRLW bit, X10, tmp \
+	VPAND X12, tmp, to \ // to = 0x000000...
+	VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit
 
 #define mulBit(bit) \
-	VMOVDQU X0, X8 \
-	VMOVDQU X2, X9 \
-	mul2(X0, X5, X6, X7) \
-	VXORPD X1, X5, X0 \
-	mul2(X2, X5, X6, X7) \
-	VXORPD X3, X5, X2 \
-	mask(bit, X6, X5) \
-	VANDPD X0, X5, X1 \
-	VXORPD X8, X1, X1 \
-	VANDPD X2, X5, X3 \
-	VXORPD X9, X3, X3
+	VMOVDQU X0, X8 \
+	VMOVDQU X2, X9 \
+	mul2(X0, X5, X6, X7) \
+	VXORPD X1, X5, X0 \
+	mul2(X2, X5, X6, X7) \
+	VXORPD X3, X5, X2 \
+	mask(bit, X6, X5) \
+	VANDPD X0, X5, X1 \
+	VXORPD X8, X1, X1 \
+	VANDPD X2, X5, X3 \
+	VXORPD X9, X3, X3
 
 // func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
-TEXT ·mulBitRight(SB),NOSPLIT,$0
-	MOVQ c00+0(FP), AX
-	VMOVDQU (AX), X0
-	VMOVDQU X0, X8 // remember c00 value
-	MOVQ c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ c10+16(FP), CX
-	VMOVDQU (CX), X2
-	VMOVDQU X2, X9 // remember c10 value
-	MOVQ c11+24(FP), DX
-	VMOVDQU (DX), X3
+TEXT ·mulBitRight(SB), NOSPLIT, $0
+	MOVQ c00+0(FP), AX
+	VMOVDQU (AX), X0
+	VMOVDQU X0, X8 // remember c00 value
+	MOVQ c01+8(FP), BX
+	VMOVDQU (BX), X1
+	MOVQ c10+16(FP), CX
+	VMOVDQU (CX), X2
+	VMOVDQU X2, X9 // remember c10 value
+	MOVQ c11+24(FP), DX
+	VMOVDQU (DX), X3
 
-	VPXOR X13, X13, X13 // Y13 = 0x0000...
-	VPCMPEQB X14, X14, X14 // Y14 = 0xFFFF...
-	VPSUBQ X14, X13, X13
-	VPSLLQ $63, X13, X14
+	VPXOR X13, X13, X13 // Y13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // Y14 = 0xFFFF...
+	VPSUBQ X14, X13, X13
+	VPSLLQ $63, X13, X14
 
-	mul2(X0, X5, X6, X7) // c00 *= 2
-	VXORPD X5, X1, X0 // c00 += c01
-	mul2(X2, X5, X6, X7) // c10 *= 2
-	VXORPD X3, X5, X2 // c10 += c11
-	MOVQ e+32(FP), CX
-	VMOVDQU (CX), X5
-	VANDPD X0, X5, X1 // c01 = c00 + e
-	VXORPD X8, X1, X1 // c01 += X8 (old c00)
-	VANDPD X2, X5, X3 // c11 = c10 + e
-	VXORPD X9, X3, X3 // c11 += x9 (old c10)
+	mul2(X0, X5, X6, X7) // c00 *= 2
+	VXORPD X5, X1, X0 // c00 += c01
+	mul2(X2, X5, X6, X7) // c10 *= 2
+	VXORPD X3, X5, X2 // c10 += c11
+	MOVQ e+32(FP), CX
+	VMOVDQU (CX), X5
+	VANDPD X0, X5, X1 // c01 = c00 + e
+	VXORPD X8, X1, X1 // c01 += X8 (old c00)
+	VANDPD X2, X5, X3 // c11 = c10 + e
+	VXORPD X9, X3, X3 // c11 += x9 (old c10)
 
-	VMOVDQU X0, (AX)
-	MOVQ c10+16(FP), CX
-	VMOVDQU X2, (CX)
-	VMOVDQU X1, (BX)
-	VMOVDQU X3, (DX)
+	VMOVDQU X0, (AX)
+	MOVQ c10+16(FP), CX
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
+	VMOVDQU X3, (DX)
 
-	RET
+	RET
 
-TEXT ·mulByteRight(SB),NOSPLIT,$0
-	MOVQ c00+0(FP), AX
-	VMOVDQU (AX), X0
-	MOVQ c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ c10+16(FP), CX
-	VMOVDQU (CX), X2
-	MOVQ c11+24(FP), DX
-	VMOVDQU (DX), X3
-	MOVQ $0, CX
-	MOVB b+32(FP), CX
+TEXT ·mulByteRight(SB), NOSPLIT, $0
+	MOVQ c00+0(FP), AX
+	VMOVDQU (AX), X0
+	MOVQ c01+8(FP), BX
+	VMOVDQU (BX), X1
+	MOVQ c10+16(FP), CX
+	VMOVDQU (CX), X2
+	MOVQ c11+24(FP), DX
+	VMOVDQU (DX), X3
+	MOVQ $0, CX
+	MOVB b+32(FP), CX
 
-	VPXOR X13, X13, X13 // X13 = 0x0000...
-	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
-	VPSUBQ X14, X13, X10
-	VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1)
-	VPSLLQ $63, X10, X14 // X14 = 0x10000000... (packed quad-words with HSB set)
+	VPXOR X13, X13, X13 // X13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
+	VPSUBQ X14, X13, X10
+	VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1)
+	VPSLLQ $63, X10, X14 // X14 = 0x10000000... (packed quad-words with HSB set)
 
-	MOVQ CX, X10
-	VPSHUFLW $0, X10, X11
-	VPSHUFD $0, X11, X10
+	MOVQ CX, X10
+	VPSHUFLW $0, X10, X11
+	VPSHUFD $0, X11, X10
 
-	mulBit($7)
-	mulBit($6)
-	mulBit($5)
-	mulBit($4)
-	mulBit($3)
-	mulBit($2)
-	mulBit($1)
-	mulBit($0)
+	mulBit($7)
+	mulBit($6)
+	mulBit($5)
+	mulBit($4)
+	mulBit($3)
+	mulBit($2)
+	mulBit($1)
+	mulBit($0)
 
-	VMOVDQU X0, (AX)
-	MOVQ c10+16(FP), CX
-	VMOVDQU X2, (CX)
-	VMOVDQU X1, (BX)
-	MOVQ c11+24(FP), DX
-	VMOVDQU X3, (DX)
+	VMOVDQU X0, (AX)
+	MOVQ c10+16(FP), CX
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
+	MOVQ c11+24(FP), DX
+	VMOVDQU X3, (DX)
 
-	RET
+	RET
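
For readers following the comments in ·Mul above (Karatsuba-style carry-less multiplication followed by reduction modulo x^127 + x^63 + 1), here is a small plain-Go reference sketch of the same arithmetic. It is illustrative only and not part of the patch: the name mulGF127Ref is made up, and it assumes the [2]uint64 element layout is little-endian, with bit 0 of the first word holding the x^0 coefficient.

// Reference sketch only (not part of the patch). Assumes little-endian
// limb order: bit 0 of a[0] is the coefficient of x^0.
package main

import "fmt"

// mulGF127Ref multiplies two GF(2^127) elements the slow, obvious way:
// carry-less schoolbook multiplication into a 256-bit product, then
// reduction modulo x^127 + x^63 + 1 (using x^127 = x^63 + 1).
func mulGF127Ref(a, b [2]uint64) [2]uint64 {
	var p [4]uint64
	for i := uint(0); i < 127; i++ {
		if (a[i/64]>>(i%64))&1 == 0 {
			continue
		}
		for j := uint(0); j < 2; j++ {
			w, s := i/64+j, i%64
			p[w] ^= b[j] << s
			if s != 0 {
				p[w+1] ^= b[j] >> (64 - s)
			}
		}
	}
	// Fold every product bit at position i >= 127 down onto i-64 and i-127.
	for i := 252; i >= 127; i-- {
		w, s := i/64, uint(i)%64
		if (p[w]>>s)&1 == 0 {
			continue
		}
		p[w] ^= 1 << s
		p[(i-64)/64] ^= 1 << (uint(i-64) % 64)
		p[(i-127)/64] ^= 1 << (uint(i-127) % 64)
	}
	return [2]uint64{p[0], p[1]}
}

func main() {
	// x * x^63 = x^64, i.e. bit 0 of the second word.
	fmt.Println(mulGF127Ref([2]uint64{2, 0}, [2]uint64{1 << 63, 0})) // [0 1]
}

The assembly reaches the same result without bit loops: PCLMULQDQ/VPCLMULQDQ produce the three Karatsuba partial products, and the fixed shift-and-XOR sequence after the "reduction modulo x^127 + x^63 + 1" comment performs the fold.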