*: format assembly code with asmfmt

Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
Authored by Evgenii Stratonikov on 2022-03-09 14:38:53 +03:00; committed by Alex Vanin
parent 1520cde665
commit 0fa6b1314e
4 changed files with 251 additions and 252 deletions
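
asmfmt (github.com/klauspost/asmfmt) formats Go assembly the way gofmt formats Go source: tab indentation, aligned operands and comments, and a space after each comma in TEXT directives, which is the layout-only churn this diff shows. One way to keep such files formatted, assuming asmfmt supports gofmt's -w flag and using an illustrative file and package name, is a go:generate hook next to the assembly:

	//go:generate asmfmt -w gf127_amd64.s
	package gf127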

@@ -1,81 +1,81 @@
 #include "textflag.h"

 // func Add(a, b, c *[2]uint64)
-TEXT ·Add(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	MOVQ b+8(FP), BX
-	MOVUPD (BX), X1
-	XORPD X1, X0
-	MOVQ c+16(FP), CX
-	MOVUPD X0, (CX)
-	RET
+TEXT ·Add(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	MOVQ b+8(FP), BX
+	MOVUPD (BX), X1
+	XORPD X1, X0
+	MOVQ c+16(FP), CX
+	MOVUPD X0, (CX)
+	RET

 // func Mul10(a, b *[2]uint64)
-TEXT ·Mul10(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	VPSLLQ $1, X0, X1
-	VPALIGNR $8, X1, X0, X2
-	PSRLQ $63, X2
-	MOVUPD ·x127x63(SB), X3
-	ANDPD X1, X3
-	VPUNPCKHQDQ X3, X3, X3
-	XORPD X2, X1
-	XORPD X3, X1
-	MOVQ b+8(FP), AX
-	MOVUPD X1, (AX)
-	RET
+TEXT ·Mul10(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	VPSLLQ $1, X0, X1
+	VPALIGNR $8, X1, X0, X2
+	PSRLQ $63, X2
+	MOVUPD ·x127x63(SB), X3
+	ANDPD X1, X3
+	VPUNPCKHQDQ X3, X3, X3
+	XORPD X2, X1
+	XORPD X3, X1
+	MOVQ b+8(FP), AX
+	MOVUPD X1, (AX)
+	RET

 // func Mul11(a, b *[2]uint64)
-TEXT ·Mul11(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	MOVUPD (AX), X0
-	VPSLLQ $1, X0, X1
-	VPALIGNR $8, X1, X0, X2
-	PSRLQ $63, X2
-	MOVUPD ·x127x63(SB), X3
-	ANDPD X1, X3
-	VPUNPCKHQDQ X3, X3, X3
-	XORPD X2, X1
-	XORPD X3, X1
-	XORPD X0, X1
-	MOVQ b+8(FP), AX
-	MOVUPD X1, (AX)
-	RET
+TEXT ·Mul11(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	MOVUPD (AX), X0
+	VPSLLQ $1, X0, X1
+	VPALIGNR $8, X1, X0, X2
+	PSRLQ $63, X2
+	MOVUPD ·x127x63(SB), X3
+	ANDPD X1, X3
+	VPUNPCKHQDQ X3, X3, X3
+	XORPD X2, X1
+	XORPD X3, X1
+	XORPD X0, X1
+	MOVQ b+8(FP), AX
+	MOVUPD X1, (AX)
+	RET

 // func Mul(a, b, c *[2]uint64)
-TEXT ·Mul(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX // X0 = a0 . a1
-	MOVUPD (AX), X0 // X0 = a0 . a1
-	MOVQ b+8(FP), BX // X1 = b0 . b1
-	MOVUPD (BX), X1 // X1 = b0 . b1
-	VPUNPCKLQDQ X1, X0, X2 // X2 = a0 . b0
-	VPUNPCKHQDQ X1, X0, X3 // X3 = a1 . b1
-	XORPD X2, X3 // X3 = (a0 + a1) . (b0 + b1)
-	PCLMULQDQ $0x10, X3, X3 // X3 = (a0 + a1) * (b0 + b1)
-	VPCLMULQDQ $0x00, X0, X1, X4 // X4 = a0 * b0
-	VPCLMULQDQ $0x11, X0, X1, X5 // X5 = a1 * b1
-	XORPD X4, X3 //
-	XORPD X5, X3 // X3 = a0 * b1 + a1 * b0
-	VPSLLDQ $8, X3, X2 //
-	XORPD X2, X4 // X4 = a0 * b0 + lo(X3)
-	VPSRLDQ $8, X3, X6 //
-	XORPD X6, X5 // X5 = a1 * b1 + hi(X3)
+TEXT ·Mul(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX // X0 = a0 . a1
+	MOVUPD (AX), X0 // X0 = a0 . a1
+	MOVQ b+8(FP), BX // X1 = b0 . b1
+	MOVUPD (BX), X1 // X1 = b0 . b1
+	VPUNPCKLQDQ X1, X0, X2 // X2 = a0 . b0
+	VPUNPCKHQDQ X1, X0, X3 // X3 = a1 . b1
+	XORPD X2, X3 // X3 = (a0 + a1) . (b0 + b1)
+	PCLMULQDQ $0x10, X3, X3 // X3 = (a0 + a1) * (b0 + b1)
+	VPCLMULQDQ $0x00, X0, X1, X4 // X4 = a0 * b0
+	VPCLMULQDQ $0x11, X0, X1, X5 // X5 = a1 * b1
+	XORPD X4, X3
+	XORPD X5, X3 // X3 = a0 * b1 + a1 * b0
+	VPSLLDQ $8, X3, X2
+	XORPD X2, X4 // X4 = a0 * b0 + lo(X3)
+	VPSRLDQ $8, X3, X6
+	XORPD X6, X5 // X5 = a1 * b1 + hi(X3)

-	// at this point, a * b = X4 . X5 (as 256-bit number)
-	// reduction modulo x^127 + x^63 + 1
-	VPALIGNR $8, X4, X5, X3
-	XORPD X5, X3
-	PSLLQ $1, X5
-	XORPD X5, X4
-	VPUNPCKHQDQ X3, X5, X5
-	XORPD X5, X4
-	PSRLQ $63, X3
-	XORPD X3, X4
-	VPUNPCKLQDQ X3, X3, X5
-	PSLLQ $63, X5
-	XORPD X5, X4
-	MOVQ c+16(FP), CX
-	MOVUPD X4, (CX)
-	RET
+	// at this point, a * b = X4 . X5 (as 256-bit number)
+	// reduction modulo x^127 + x^63 + 1
+	VPALIGNR $8, X4, X5, X3
+	XORPD X5, X3
+	PSLLQ $1, X5
+	XORPD X5, X4
+	VPUNPCKHQDQ X3, X5, X5
+	XORPD X5, X4
+	PSRLQ $63, X3
+	XORPD X3, X4
+	VPUNPCKLQDQ X3, X3, X5
+	PSLLQ $63, X5
+	XORPD X5, X4
+	MOVQ c+16(FP), CX
+	MOVUPD X4, (CX)
+	RET
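
The comments in Mul above spell out the whole scheme: three carry-less multiplications arranged as Karatsuba, then reduction modulo x^127 + x^63 + 1. The same computation can be written in pure Go; the sketch below is illustrative and not part of this commit (mulGeneric and clmul64 are hypothetical names, with clmul64 standing in for PCLMULQDQ in software):

	// clmul64 returns the 128-bit carry-less product of x and y.
	func clmul64(x, y uint64) (lo, hi uint64) {
		for i := uint(0); i < 64; i++ {
			if y>>i&1 == 1 {
				lo ^= x << i
				if i != 0 {
					hi ^= x >> (64 - i)
				}
			}
		}
		return
	}

	// mulGeneric mirrors the assembly: Karatsuba with three carry-less
	// multiplications, then folding the 256-bit product back into 127
	// bits using x^127 = x^63 + 1.
	func mulGeneric(a, b, c *[2]uint64) {
		l0, l1 := clmul64(a[0], b[0])           // a0 * b0 (X4)
		h0, h1 := clmul64(a[1], b[1])           // a1 * b1 (X5)
		m0, m1 := clmul64(a[0]^a[1], b[0]^b[1]) // (a0 + a1) * (b0 + b1)
		m0 ^= l0 ^ h0                           // m = a0*b1 + a1*b0 (X3)
		m1 ^= l1 ^ h1
		d0, d1 := l0, l1^m0 // a * b as the 256-bit number d3.d2.d1.d0
		d2, d3 := h0^m1, h1
		t0 := d1>>63 | d2<<1 // t = (a * b) >> 127
		t1 := d2>>63 | d3<<1
		c[0] = d0 ^ t0 ^ t0<<63 ^ t1 ^ t1<<63
		c[1] = d1&(1<<63-1) ^ t1 ^ t0>>1 ^ t1>>1
	}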

@@ -1,35 +1,34 @@
 #include "textflag.h"

 // func Mul10x2(a, b *[4]uint64)
-TEXT ·Mul10x2(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	VMOVDQA (AX), Y0
-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
-	MOVQ b+8(FP), AX
-	VMOVDQA Y3, (AX)
-	RET
+TEXT ·Mul10x2(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	VMOVDQA (AX), Y0
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3
+	MOVQ b+8(FP), AX
+	VMOVDQA Y3, (AX)
+	RET

 // func Mul11x2(a, b *[4]uint64)
-TEXT ·Mul11x2(SB),NOSPLIT,$0
-	MOVQ a+0(FP), AX
-	VMOVDQA (AX), Y0
-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
-	VPXOR Y0, Y3, Y3
-	MOVQ b+8(FP), AX
-	VMOVDQA Y3, (AX)
-	RET
+TEXT ·Mul11x2(SB), NOSPLIT, $0
+	MOVQ a+0(FP), AX
+	VMOVDQA (AX), Y0
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3
+	VPXOR Y0, Y3, Y3
+	MOVQ b+8(FP), AX
+	VMOVDQA Y3, (AX)
+	RET
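
Mul10x2 and Mul11x2 are the two-lane AVX2 counterparts of Mul10 and Mul11 in the previous file: each YMM register holds two field elements, so one instruction sequence multiplies both by the polynomial x ("10") or x+1 ("11") at once. Per lane the arithmetic is a shift plus a fold of the overflowing bit, sketched here in plain Go (illustrative names, not part of this commit; assumes b does not alias a):

	// mul10Generic multiplies a by x modulo x^127 + x^63 + 1.
	func mul10Generic(a, b *[2]uint64) {
		c := a[1] >> 62                          // bit 127 of the doubled value
		b[0] = a[0]<<1 ^ c ^ c<<63               // fold x^127 back into x^63 + 1
		b[1] = (a[1]<<1 | a[0]>>63) &^ (1 << 63) // keep the low 127 bits
	}

	// mul11Generic multiplies a by x+1: mul10 plus one extra addition
	// of a itself, the extra VPXOR Y0, Y3, Y3 in Mul11x2 above.
	func mul11Generic(a, b *[2]uint64) {
		mul10Generic(a, b)
		b[0] ^= a[0]
		b[1] ^= a[1]
	}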

@@ -1,87 +1,87 @@
 #include "textflag.h"

 #define mask(bit, tmp, to) \
-	VPSRLW bit, Y10, tmp \
-	VPAND Y12, tmp, to \ // to = 0x000<bit>000<bit>...
-	VPSUBW to, Y13, to // to = 0xFFFF.. or 0x0000 depending on bit
+	VPSRLW bit, Y10, tmp \
+	VPAND Y12, tmp, to \ // to = 0x000<bit>000<bit>...
+	VPSUBW to, Y13, to // to = 0xFFFF.. or 0x0000 depending on bit

 #define mulBit(bit, in_1, in_2, out_1, out_2) \
-	VPSLLQ $1, in_1, Y1 \
-	VPALIGNR $8, Y1, in_1, Y2 \
-	VPSRLQ $63, Y2, Y2 \
-	VPXOR Y1, Y2, Y2 \
-	VPAND Y1, Y14, Y3 \
-	VPUNPCKHQDQ Y3, Y3, Y3 \
-	VPXOR Y2, Y3, Y3 \
-	mask(bit, Y11, Y2) \
-	VPXOR Y3, in_2, out_1 \
-	VPAND out_1, Y2, Y4 \
-	VPXOR Y4, in_1, out_2 \
+	VPSLLQ $1, in_1, Y1 \
+	VPALIGNR $8, Y1, in_1, Y2 \
+	VPSRLQ $63, Y2, Y2 \
+	VPXOR Y1, Y2, Y2 \
+	VPAND Y1, Y14, Y3 \
+	VPUNPCKHQDQ Y3, Y3, Y3 \
+	VPXOR Y2, Y3, Y3 \
+	mask(bit, Y11, Y2) \
+	VPXOR Y3, in_2, out_1 \
+	VPAND out_1, Y2, Y4 \
+	VPXOR Y4, in_1, out_2 \

 // func mulByteSliceRightx2(c00c10, c01c11 *[4]uint64, n int, data *byte)
-TEXT ·mulByteSliceRightx2(SB),NOSPLIT,$0
-	MOVQ c00c10+0(FP), AX
-	VMOVDQU (AX), Y0
-	MOVQ c01c11+8(FP), BX
-	VMOVDQU (BX), Y8
+TEXT ·mulByteSliceRightx2(SB), NOSPLIT, $0
+	MOVQ c00c10+0(FP), AX
+	VMOVDQU (AX), Y0
+	MOVQ c01c11+8(FP), BX
+	VMOVDQU (BX), Y8

-	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
-	VPCMPEQB Y14, Y14, Y14 // Y14 = 0xFFFF...
-	VPSUBQ Y14, Y13, Y10
-	VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1)
-	VPSLLQ $63, Y10, Y14 // Y14 = 0x10000000... (packed quad-words with HSB set)
+	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
+	VPCMPEQB Y14, Y14, Y14 // Y14 = 0xFFFF...
+	VPSUBQ Y14, Y13, Y10
+	VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1)
+	VPSLLQ $63, Y10, Y14 // Y14 = 0x10000000... (packed quad-words with HSB set)

-	MOVQ n+16(FP), CX
-	MOVQ data+24(FP), DX
+	MOVQ n+16(FP), CX
+	MOVQ data+24(FP), DX

 loop:
-	CMPQ CX, $0
-	JEQ finish
-	SUBQ $1, CX
+	CMPQ CX, $0
+	JEQ finish
+	SUBQ $1, CX

-	VPBROADCASTB (DX), X10 // X10 = packed bytes of b.
-	VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
-	ADDQ $1, DX
+	VPBROADCASTB (DX), X10 // X10 = packed bytes of b.
+	VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
+	ADDQ $1, DX

-	mulBit($7, Y0, Y8, Y5, Y6)
-	mulBit($6, Y5, Y6, Y0, Y8)
-	mulBit($5, Y0, Y8, Y5, Y6)
-	mulBit($4, Y5, Y6, Y0, Y8)
-	mulBit($3, Y0, Y8, Y5, Y6)
-	mulBit($2, Y5, Y6, Y0, Y8)
-	mulBit($1, Y0, Y8, Y5, Y6)
-	mulBit($0, Y5, Y6, Y0, Y8)
+	mulBit($7, Y0, Y8, Y5, Y6)
+	mulBit($6, Y5, Y6, Y0, Y8)
+	mulBit($5, Y0, Y8, Y5, Y6)
+	mulBit($4, Y5, Y6, Y0, Y8)
+	mulBit($3, Y0, Y8, Y5, Y6)
+	mulBit($2, Y5, Y6, Y0, Y8)
+	mulBit($1, Y0, Y8, Y5, Y6)
+	mulBit($0, Y5, Y6, Y0, Y8)

-	JMP loop
+	JMP loop

 finish:
-	VMOVDQU Y8, (BX)
-	VMOVDQU Y0, (AX)
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y0, (AX)

-	RET
+	RET

 // func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
-TEXT ·mulBitRightx2(SB),NOSPLIT,$0
-	MOVQ c00c10+0(FP), AX
-	VMOVDQU (AX), Y0
-	MOVQ c01c11+8(FP), BX
-	VMOVDQU (BX), Y8
+TEXT ·mulBitRightx2(SB), NOSPLIT, $0
+	MOVQ c00c10+0(FP), AX
+	VMOVDQU (AX), Y0
+	MOVQ c01c11+8(FP), BX
+	VMOVDQU (BX), Y8

-	VPSLLQ $1, Y0, Y1
-	VPALIGNR $8, Y1, Y0, Y2
-	VPSRLQ $63, Y2, Y2
-	VPXOR Y1, Y2, Y2
-	VPSRLQ $63, Y1, Y3
-	VPSLLQ $63, Y3, Y3
-	VPUNPCKHQDQ Y3, Y3, Y3
-	VPXOR Y2, Y3, Y3
+	VPSLLQ $1, Y0, Y1
+	VPALIGNR $8, Y1, Y0, Y2
+	VPSRLQ $63, Y2, Y2
+	VPXOR Y1, Y2, Y2
+	VPSRLQ $63, Y1, Y3
+	VPSLLQ $63, Y3, Y3
+	VPUNPCKHQDQ Y3, Y3, Y3
+	VPXOR Y2, Y3, Y3

-	MOVQ e+16(FP), CX
-	VBROADCASTI128 (CX), Y2
+	MOVQ e+16(FP), CX
+	VBROADCASTI128 (CX), Y2

-	VPXOR Y3, Y8, Y3
-	VPAND Y3, Y2, Y4
-	VPXOR Y4, Y0, Y8
-	VMOVDQU Y8, (BX)
-	VMOVDQU Y3, (AX)
-	RET
+	VPXOR Y3, Y8, Y3
+	VPAND Y3, Y2, Y4
+	VPXOR Y4, Y0, Y8
+	VMOVDQU Y8, (BX)
+	VMOVDQU Y3, (AX)
+	RET
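
mulBitRightx2 advances the hash state by one input bit: the rows (c00, c10) and (c01, c11) each live in one YMM register, and e arrives broadcast as an all-ones or all-zeros mask. Per row, the register data flow above reduces to the following scalar sketch (hypothetical helper names, reusing mul10Generic from the previous sketch):

	// mulBitGeneric performs one bit step on a single matrix row:
	// e is ^uint64(0) for a 1 bit and 0 for a 0 bit.
	func mulBitGeneric(c00, c01 *[2]uint64, e uint64) {
		old := *c00
		var t [2]uint64
		mul10Generic(c00, &t) // Y3 = c00 * x
		t[0] ^= c01[0]        // Y3 ^= c01
		t[1] ^= c01[1]
		*c00 = t                 // store the new c00
		c01[0] = t[0]&e ^ old[0] // new c01 = (new c00 & e) ^ old c00
		c01[1] = t[1]&e ^ old[1]
	}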

@@ -3,105 +3,105 @@
 // mul2 multiplies FROM by 2, stores the result in TO
 // and uses R2 and R3 for internal computations.
 #define mul2(FROM, TO, R2, R3) \
-	VPSLLQ $1, FROM, TO \
-	VPALIGNR $8, TO, FROM, R2 \
-	VPSRLQ $63, R2, R2 \
-	VANDPD TO, X14, R3 \
-	VPUNPCKHQDQ R3, R3, R3 \
-	VXORPD R2, TO, TO \
-	VXORPD R3, TO, TO
+	VPSLLQ $1, FROM, TO \
+	VPALIGNR $8, TO, FROM, R2 \
+	VPSRLQ $63, R2, R2 \
+	VANDPD TO, X14, R3 \
+	VPUNPCKHQDQ R3, R3, R3 \
+	VXORPD R2, TO, TO \
+	VXORPD R3, TO, TO

 #define mask(bit, tmp, to) \
-	VPSRLW bit, X10, tmp \
-	VPAND X12, tmp, to \ // to = 0x000<bit>000<bit>...
-	VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit
+	VPSRLW bit, X10, tmp \
+	VPAND X12, tmp, to \ // to = 0x000<bit>000<bit>...
+	VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit

 #define mulBit(bit) \
-	VMOVDQU X0, X8 \
-	VMOVDQU X2, X9 \
-	mul2(X0, X5, X6, X7) \
-	VXORPD X1, X5, X0 \
-	mul2(X2, X5, X6, X7) \
-	VXORPD X3, X5, X2 \
-	mask(bit, X6, X5) \
-	VANDPD X0, X5, X1 \
-	VXORPD X8, X1, X1 \
-	VANDPD X2, X5, X3 \
-	VXORPD X9, X3, X3
+	VMOVDQU X0, X8 \
+	VMOVDQU X2, X9 \
+	mul2(X0, X5, X6, X7) \
+	VXORPD X1, X5, X0 \
+	mul2(X2, X5, X6, X7) \
+	VXORPD X3, X5, X2 \
+	mask(bit, X6, X5) \
+	VANDPD X0, X5, X1 \
+	VXORPD X8, X1, X1 \
+	VANDPD X2, X5, X3 \
+	VXORPD X9, X3, X3

 // func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
-TEXT ·mulBitRight(SB),NOSPLIT,$0
-	MOVQ c00+0(FP), AX
-	VMOVDQU (AX), X0
-	VMOVDQU X0, X8 // remember c00 value
-	MOVQ c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ c10+16(FP), CX
-	VMOVDQU (CX), X2
-	VMOVDQU X2, X9 // remember c10 value
-	MOVQ c11+24(FP), DX
-	VMOVDQU (DX), X3
+TEXT ·mulBitRight(SB), NOSPLIT, $0
+	MOVQ c00+0(FP), AX
+	VMOVDQU (AX), X0
+	VMOVDQU X0, X8 // remember c00 value
+	MOVQ c01+8(FP), BX
+	VMOVDQU (BX), X1
+	MOVQ c10+16(FP), CX
+	VMOVDQU (CX), X2
+	VMOVDQU X2, X9 // remember c10 value
+	MOVQ c11+24(FP), DX
+	VMOVDQU (DX), X3

-	VPXOR X13, X13, X13 // X13 = 0x0000...
-	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
-	VPSUBQ X14, X13, X13
-	VPSLLQ $63, X13, X14
+	VPXOR X13, X13, X13 // X13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
+	VPSUBQ X14, X13, X13
+	VPSLLQ $63, X13, X14

-	mul2(X0, X5, X6, X7) // c00 *= 2
-	VXORPD X5, X1, X0 // c00 += c01
-	mul2(X2, X5, X6, X7) // c10 *= 2
-	VXORPD X3, X5, X2 // c10 += c11
-	MOVQ e+32(FP), CX
-	VMOVDQU (CX), X5
-	VANDPD X0, X5, X1 // c01 = c00 & e
-	VXORPD X8, X1, X1 // c01 += X8 (old c00)
-	VANDPD X2, X5, X3 // c11 = c10 & e
-	VXORPD X9, X3, X3 // c11 += X9 (old c10)
+	mul2(X0, X5, X6, X7) // c00 *= 2
+	VXORPD X5, X1, X0 // c00 += c01
+	mul2(X2, X5, X6, X7) // c10 *= 2
+	VXORPD X3, X5, X2 // c10 += c11
+	MOVQ e+32(FP), CX
+	VMOVDQU (CX), X5
+	VANDPD X0, X5, X1 // c01 = c00 & e
+	VXORPD X8, X1, X1 // c01 += X8 (old c00)
+	VANDPD X2, X5, X3 // c11 = c10 & e
+	VXORPD X9, X3, X3 // c11 += X9 (old c10)

-	VMOVDQU X0, (AX)
-	MOVQ c10+16(FP), CX
-	VMOVDQU X2, (CX)
-	VMOVDQU X1, (BX)
-	VMOVDQU X3, (DX)
+	VMOVDQU X0, (AX)
+	MOVQ c10+16(FP), CX
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
+	VMOVDQU X3, (DX)

-	RET
+	RET

-TEXT ·mulByteRight(SB),NOSPLIT,$0
-	MOVQ c00+0(FP), AX
-	VMOVDQU (AX), X0
-	MOVQ c01+8(FP), BX
-	VMOVDQU (BX), X1
-	MOVQ c10+16(FP), CX
-	VMOVDQU (CX), X2
-	MOVQ c11+24(FP), DX
-	VMOVDQU (DX), X3
-	MOVQ $0, CX
-	MOVB b+32(FP), CX
+TEXT ·mulByteRight(SB), NOSPLIT, $0
+	MOVQ c00+0(FP), AX
+	VMOVDQU (AX), X0
+	MOVQ c01+8(FP), BX
+	VMOVDQU (BX), X1
+	MOVQ c10+16(FP), CX
+	VMOVDQU (CX), X2
+	MOVQ c11+24(FP), DX
+	VMOVDQU (DX), X3
+	MOVQ $0, CX
+	MOVB b+32(FP), CX

-	VPXOR X13, X13, X13 // X13 = 0x0000...
-	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
-	VPSUBQ X14, X13, X10
-	VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1)
-	VPSLLQ $63, X10, X14 // X14 = 0x10000000... (packed quad-words with HSB set)
+	VPXOR X13, X13, X13 // X13 = 0x0000...
+	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
+	VPSUBQ X14, X13, X10
+	VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1)
+	VPSLLQ $63, X10, X14 // X14 = 0x10000000... (packed quad-words with HSB set)

-	MOVQ CX, X10
-	VPSHUFLW $0, X10, X11
-	VPSHUFD $0, X11, X10
+	MOVQ CX, X10
+	VPSHUFLW $0, X10, X11
+	VPSHUFD $0, X11, X10

-	mulBit($7)
-	mulBit($6)
-	mulBit($5)
-	mulBit($4)
-	mulBit($3)
-	mulBit($2)
-	mulBit($1)
-	mulBit($0)
+	mulBit($7)
+	mulBit($6)
+	mulBit($5)
+	mulBit($4)
+	mulBit($3)
+	mulBit($2)
+	mulBit($1)
+	mulBit($0)

-	VMOVDQU X0, (AX)
-	MOVQ c10+16(FP), CX
-	VMOVDQU X2, (CX)
-	VMOVDQU X1, (BX)
-	MOVQ c11+24(FP), DX
-	VMOVDQU X3, (DX)
+	VMOVDQU X0, (AX)
+	MOVQ c10+16(FP), CX
+	VMOVDQU X2, (CX)
+	VMOVDQU X1, (BX)
+	MOVQ c11+24(FP), DX
+	VMOVDQU X3, (DX)

-	RET
+	RET
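
mulByteRight and mulByteSliceRightx2 unroll eight of these bit steps per input byte, most significant bit first, after broadcasting the byte into packed words so that mask() can turn each bit into an all-ones or all-zeros word mask. In scalar terms (a sketch built on the hypothetical helpers above; the second row c10/c11 is updated the same way):

	// mulByteGeneric processes one input byte, high bit first, matching
	// the eight unrolled mulBit() invocations above.
	func mulByteGeneric(c00, c01 *[2]uint64, b byte) {
		for bit := 7; bit >= 0; bit-- {
			var e uint64
			if b>>uint(bit)&1 == 1 {
				e = ^uint64(0)
			}
			mulBitGeneric(c00, c01, e)
		}
	}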