*: format assembly code with asmfmt

Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
Evgenii Stratonikov 2022-03-09 14:38:53 +03:00 committed by Alex Vanin
parent 1520cde665
commit 0fa6b1314e
4 changed files with 251 additions and 252 deletions


@@ -1,81 +1,81 @@
#include "textflag.h"

// func Add(a, b, c *[2]uint64)
TEXT ·Add(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX
	MOVUPD (AX), X0
	MOVQ b+8(FP), BX
	MOVUPD (BX), X1
	XORPD X1, X0
	MOVQ c+16(FP), CX
	MOVUPD X0, (CX)
	RET

// func Mul10(a, b *[2]uint64)
TEXT ·Mul10(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX
	MOVUPD (AX), X0
	VPSLLQ $1, X0, X1
	VPALIGNR $8, X1, X0, X2
	PSRLQ $63, X2
	MOVUPD ·x127x63(SB), X3
	ANDPD X1, X3
	VPUNPCKHQDQ X3, X3, X3
	XORPD X2, X1
	XORPD X3, X1
	MOVQ b+8(FP), AX
	MOVUPD X1, (AX)
	RET

// func Mul11(a, b *[2]uint64)
TEXT ·Mul11(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX
	MOVUPD (AX), X0
	VPSLLQ $1, X0, X1
	VPALIGNR $8, X1, X0, X2
	PSRLQ $63, X2
	MOVUPD ·x127x63(SB), X3
	ANDPD X1, X3
	VPUNPCKHQDQ X3, X3, X3
	XORPD X2, X1
	XORPD X3, X1
	XORPD X0, X1
	MOVQ b+8(FP), AX
	MOVUPD X1, (AX)
	RET

// func Mul(a, b, c *[2]uint64)
TEXT ·Mul(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX // X0 = a0 . a1
	MOVUPD (AX), X0 // X0 = a0 . a1
	MOVQ b+8(FP), BX // X1 = b0 . b1
	MOVUPD (BX), X1 // X1 = b0 . b1
	VPUNPCKLQDQ X1, X0, X2 // X2 = a0 . b0
	VPUNPCKHQDQ X1, X0, X3 // X3 = a1 . b1
	XORPD X2, X3 // X3 = (a0 + a1) . (b0 + b1)
	PCLMULQDQ $0x10, X3, X3 // X3 = (a0 + a1) * (b0 + b1)
	VPCLMULQDQ $0x00, X0, X1, X4 // X4 = a0 * b0
	VPCLMULQDQ $0x11, X0, X1, X5 // X5 = a1 * b1
	XORPD X4, X3
	XORPD X5, X3 // X3 = a0 * b1 + a1 * b0
	VPSLLDQ $8, X3, X2
	XORPD X2, X4 // X4 = a0 * b0 + lo(X3)
	VPSRLDQ $8, X3, X6
	XORPD X6, X5 // X5 = a1 * b1 + hi(X3)
	// at this point, a * b = X4 . X5 (as 256-bit number)
	// reduction modulo x^127 + x^63 + 1
	VPALIGNR $8, X4, X5, X3
	XORPD X5, X3
	PSLLQ $1, X5
	XORPD X5, X4
	VPUNPCKHQDQ X3, X5, X5
	XORPD X5, X4
	PSRLQ $63, X3
	XORPD X3, X4
	VPUNPCKLQDQ X3, X3, X5
	PSLLQ $63, X5
	XORPD X5, X4
	MOVQ c+16(FP), CX
	MOVUPD X4, (CX)
	RET
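
For reference, the semantics of these four routines can be written out in pure Go. This is an editorial sketch, not part of the commit: the names (gf127ref, GF127, clmul64, reduce) are illustrative, and it assumes the [2]uint64 layout is little-endian, with the coefficients of x^0..x^63 in word 0 and x^64..x^126 in word 1, reduced modulo x^127 + x^63 + 1 as the comments above state.

package gf127ref

// GF127 mirrors the *[2]uint64 layout used by the assembly.
type GF127 [2]uint64

// Add is addition in GF(2^127): plain XOR, as in TEXT ·Add.
func Add(a, b *GF127) GF127 {
	return GF127{a[0] ^ b[0], a[1] ^ b[1]}
}

// Mul10 multiplies a by x: shift left one bit, then fold the outgoing
// x^127 coefficient back in, using x^127 = x^63 + 1.
func Mul10(a *GF127) GF127 {
	c0 := a[0] << 1
	c1 := a[1]<<1 | a[0]>>63
	t := c1 >> 63 // coefficient of x^127 after the shift
	c1 &^= 1 << 63
	return GF127{c0 ^ t ^ t<<63, c1}
}

// Mul11 multiplies a by x+1: Mul10 plus an extra XOR with a
// (the XORPD X0, X1 that distinguishes Mul11 from Mul10).
func Mul11(a *GF127) GF127 {
	c := Mul10(a)
	return GF127{c[0] ^ a[0], c[1] ^ a[1]}
}

// clmul64 is a bit-by-bit stand-in for PCLMULQDQ: the 128-bit
// carry-less product of x and y.
func clmul64(x, y uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if y>>i&1 == 1 {
			lo ^= x << i
			if i > 0 {
				hi ^= x >> (64 - i)
			}
		}
	}
	return
}

// Mul follows the same Karatsuba scheme as TEXT ·Mul: three carry-less
// multiplications, then reduction of the 256-bit product.
func Mul(a, b *GF127) GF127 {
	d1, d0 := clmul64(a[0], b[0])           // a0 * b0
	e1, e0 := clmul64(a[1], b[1])           // a1 * b1
	m1, m0 := clmul64(a[0]^a[1], b[0]^b[1]) // (a0 + a1) * (b0 + b1)
	m0 ^= d0 ^ e0                           // m = a0 * b1 + a1 * b0
	m1 ^= d1 ^ e1
	// product = d + m<<64 + e<<128, as four little-endian words
	return reduce([4]uint64{d0, d1 ^ m0, e0 ^ m1, e1})
}

// reduce folds the product modulo x^127 + x^63 + 1 by applying
// x^k = x^(k-64) + x^(k-127) to every set bit with k >= 127.
func reduce(r [4]uint64) GF127 {
	for k := 252; k >= 127; k-- {
		if r[k/64]>>(uint(k)&63)&1 == 1 {
			r[k/64] ^= 1 << (uint(k) & 63)
			r[(k-64)/64] ^= 1 << (uint(k-64) & 63)
			r[(k-127)/64] ^= 1 << (uint(k-127) & 63)
		}
	}
	return GF127{r[0], r[1]}
}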


@@ -1,35 +1,34 @@
#include "textflag.h"

// func Mul10x2(a, b *[4]uint64)
TEXT ·Mul10x2(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX
	VMOVDQA (AX), Y0
	VPSLLQ $1, Y0, Y1
	VPALIGNR $8, Y1, Y0, Y2
	VPSRLQ $63, Y2, Y2
	VPXOR Y1, Y2, Y2
	VPSRLQ $63, Y1, Y3
	VPSLLQ $63, Y3, Y3
	VPUNPCKHQDQ Y3, Y3, Y3
	VPXOR Y2, Y3, Y3
	MOVQ b+8(FP), AX
	VMOVDQA Y3, (AX)
	RET

// func Mul11x2(a, b *[4]uint64)
TEXT ·Mul11x2(SB), NOSPLIT, $0
	MOVQ a+0(FP), AX
	VMOVDQA (AX), Y0
	VPSLLQ $1, Y0, Y1
	VPALIGNR $8, Y1, Y0, Y2
	VPSRLQ $63, Y2, Y2
	VPXOR Y1, Y2, Y2
	VPSRLQ $63, Y1, Y3
	VPSLLQ $63, Y3, Y3
	VPUNPCKHQDQ Y3, Y3, Y3
	VPXOR Y2, Y3, Y3
	VPXOR Y0, Y3, Y3
	MOVQ b+8(FP), AX
	VMOVDQA Y3, (AX)
	RET
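
The x2 variants run the same shift-and-fold on two field elements at once, one per 128-bit lane of a YMM register. In terms of the reference sketch above (GF127x2 and the lane split are assumptions of this sketch, not names from the commit):

// GF127x2 packs two GF127 elements into one [4]uint64, matching the
// *[4]uint64 arguments above: the lanes are words (0,1) and (2,3).
type GF127x2 [4]uint64

// Mul10x2 applies Mul10 to both lanes, as the VPSLLQ/VPALIGNR/VPXOR
// sequence does on Y0; Mul11x2 adds the final VPXOR Y0, Y3, Y3, i.e.
// an extra XOR with the input in each lane.
func Mul10x2(a *GF127x2) GF127x2 {
	lo := Mul10(&GF127{a[0], a[1]})
	hi := Mul10(&GF127{a[2], a[3]})
	return GF127x2{lo[0], lo[1], hi[0], hi[1]}
}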


@@ -1,87 +1,87 @@
#include "textflag.h"

#define mask(bit, tmp, to) \
	VPSRLW bit, Y10, tmp \
	VPAND Y12, tmp, to \ // to = 0x000<bit>000<bit>...
	VPSUBW to, Y13, to // to = 0xFFFF.. or 0x0000 depending on bit

#define mulBit(bit, in_1, in_2, out_1, out_2) \
	VPSLLQ $1, in_1, Y1 \
	VPALIGNR $8, Y1, in_1, Y2 \
	VPSRLQ $63, Y2, Y2 \
	VPXOR Y1, Y2, Y2 \
	VPAND Y1, Y14, Y3 \
	VPUNPCKHQDQ Y3, Y3, Y3 \
	VPXOR Y2, Y3, Y3 \
	mask(bit, Y11, Y2) \
	VPXOR Y3, in_2, out_1 \
	VPAND out_1, Y2, Y4 \
	VPXOR Y4, in_1, out_2 \

// func mulByteSliceRightx2(c00c10, c01c11 *[4]uint64, n int, data *byte)
TEXT ·mulByteSliceRightx2(SB), NOSPLIT, $0
	MOVQ c00c10+0(FP), AX
	VMOVDQU (AX), Y0
	MOVQ c01c11+8(FP), BX
	VMOVDQU (BX), Y8
	VPXOR Y13, Y13, Y13 // Y13 = 0x0000...
	VPCMPEQB Y14, Y14, Y14 // Y14 = 0xFFFF...
	VPSUBQ Y14, Y13, Y10
	VPSUBW Y14, Y13, Y12 // Y12 = 0x00010001... (packed words of 1)
	VPSLLQ $63, Y10, Y14 // Y14 = 0x8000... (packed quad-words with the high bit set)
	MOVQ n+16(FP), CX
	MOVQ data+24(FP), DX

loop:
	CMPQ CX, $0
	JEQ finish
	SUBQ $1, CX
	VPBROADCASTB (DX), X10 // X10 = packed bytes of b.
	VPMOVZXBW X10, Y10 // Extend with zeroes to packed words.
	ADDQ $1, DX
	mulBit($7, Y0, Y8, Y5, Y6)
	mulBit($6, Y5, Y6, Y0, Y8)
	mulBit($5, Y0, Y8, Y5, Y6)
	mulBit($4, Y5, Y6, Y0, Y8)
	mulBit($3, Y0, Y8, Y5, Y6)
	mulBit($2, Y5, Y6, Y0, Y8)
	mulBit($1, Y0, Y8, Y5, Y6)
	mulBit($0, Y5, Y6, Y0, Y8)
	JMP loop

finish:
	VMOVDQU Y8, (BX)
	VMOVDQU Y0, (AX)
	RET

// func mulBitRightx2(c00c10, c01c11 *[4]uint64, e *[2]uint64)
TEXT ·mulBitRightx2(SB), NOSPLIT, $0
	MOVQ c00c10+0(FP), AX
	VMOVDQU (AX), Y0
	MOVQ c01c11+8(FP), BX
	VMOVDQU (BX), Y8
	VPSLLQ $1, Y0, Y1
	VPALIGNR $8, Y1, Y0, Y2
	VPSRLQ $63, Y2, Y2
	VPXOR Y1, Y2, Y2
	VPSRLQ $63, Y1, Y3
	VPSLLQ $63, Y3, Y3
	VPUNPCKHQDQ Y3, Y3, Y3
	VPXOR Y2, Y3, Y3
	MOVQ e+16(FP), CX
	VBROADCASTI128 (CX), Y2
	VPXOR Y3, Y8, Y3
	VPAND Y3, Y2, Y4
	VPXOR Y4, Y0, Y8
	VMOVDQU Y8, (BX)
	VMOVDQU Y3, (AX)
	RET
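
An editorial reading of what the bit step computes, in terms of the earlier reference sketch: the state is a 2x2 matrix over GF(2^127), kept as the column pairs (c00,c10) and (c01,c11), and for each input bit the broadcast mask e (all ones for a set bit, all zeros otherwise) selects between the two outcomes of the right-multiplication. mulByteSliceRightx2 simply unrolls this for the eight bits of each byte, most significant first. The bool argument below stands in for the mask e; GF127, Add and Mul10 come from the first sketch.

// mulBitRight is a plain-Go model of mulBitRightx2, with both matrix
// rows handled by the same code path instead of the two YMM lanes.
func mulBitRight(c00, c01, c10, c11 *GF127, bit bool) {
	o00, o10 := *c00, *c10 // the old left column
	t := Mul10(c00)
	*c00 = Add(&t, c01) // c00 = c00*x + c01
	t = Mul10(c10)
	*c10 = Add(&t, c11) // c10 = c10*x + c11
	if bit {
		// e = 0xFF..: the new left column XOR the old one
		*c01 = Add(c00, &o00)
		*c11 = Add(c10, &o10)
	} else {
		// e = 0: the AND zeroes out, the old column survives
		*c01, *c11 = o00, o10
	}
}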


@@ -3,105 +3,105 @@
// mul2 multiplies FROM by 2 (i.e. by x), stores the result in TO
// and uses R2 and R3 for internal computations.
#define mul2(FROM, TO, R2, R3) \
	VPSLLQ $1, FROM, TO \
	VPALIGNR $8, TO, FROM, R2 \
	VPSRLQ $63, R2, R2 \
	VANDPD TO, X14, R3 \
	VPUNPCKHQDQ R3, R3, R3 \
	VXORPD R2, TO, TO \
	VXORPD R3, TO, TO

#define mask(bit, tmp, to) \
	VPSRLW bit, X10, tmp \
	VPAND X12, tmp, to \ // to = 0x000<bit>000<bit>...
	VPSUBW to, X13, to // to = 0xFFFF.. or 0x0000 depending on bit

#define mulBit(bit) \
	VMOVDQU X0, X8 \
	VMOVDQU X2, X9 \
	mul2(X0, X5, X6, X7) \
	VXORPD X1, X5, X0 \
	mul2(X2, X5, X6, X7) \
	VXORPD X3, X5, X2 \
	mask(bit, X6, X5) \
	VANDPD X0, X5, X1 \
	VXORPD X8, X1, X1 \
	VANDPD X2, X5, X3 \
	VXORPD X9, X3, X3

// func mulBitRight(c00, c01, c10, c11, e *[2]uint64)
TEXT ·mulBitRight(SB), NOSPLIT, $0
	MOVQ c00+0(FP), AX
	VMOVDQU (AX), X0
	VMOVDQU X0, X8 // remember c00 value
	MOVQ c01+8(FP), BX
	VMOVDQU (BX), X1
	MOVQ c10+16(FP), CX
	VMOVDQU (CX), X2
	VMOVDQU X2, X9 // remember c10 value
	MOVQ c11+24(FP), DX
	VMOVDQU (DX), X3
	VPXOR X13, X13, X13 // X13 = 0x0000...
	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
	VPSUBQ X14, X13, X13
	VPSLLQ $63, X13, X14
	mul2(X0, X5, X6, X7) // c00 *= 2
	VXORPD X5, X1, X0 // c00 += c01
	mul2(X2, X5, X6, X7) // c10 *= 2
	VXORPD X3, X5, X2 // c10 += c11
	MOVQ e+32(FP), CX
	VMOVDQU (CX), X5
	VANDPD X0, X5, X1 // c01 = c00 & e
	VXORPD X8, X1, X1 // c01 += X8 (old c00)
	VANDPD X2, X5, X3 // c11 = c10 & e
	VXORPD X9, X3, X3 // c11 += X9 (old c10)
	VMOVDQU X0, (AX)
	MOVQ c10+16(FP), CX
	VMOVDQU X2, (CX)
	VMOVDQU X1, (BX)
	VMOVDQU X3, (DX)
	RET

TEXT ·mulByteRight(SB), NOSPLIT, $0
	MOVQ c00+0(FP), AX
	VMOVDQU (AX), X0
	MOVQ c01+8(FP), BX
	VMOVDQU (BX), X1
	MOVQ c10+16(FP), CX
	VMOVDQU (CX), X2
	MOVQ c11+24(FP), DX
	VMOVDQU (DX), X3
	MOVQ $0, CX
	MOVB b+32(FP), CX
	VPXOR X13, X13, X13 // X13 = 0x0000...
	VPCMPEQB X14, X14, X14 // X14 = 0xFFFF...
	VPSUBQ X14, X13, X10
	VPSUBW X14, X13, X12 // X12 = 0x00010001... (packed words of 1)
	VPSLLQ $63, X10, X14 // X14 = 0x8000... (packed quad-words with the high bit set)
	MOVQ CX, X10
	VPSHUFLW $0, X10, X11
	VPSHUFD $0, X11, X10
	mulBit($7)
	mulBit($6)
	mulBit($5)
	mulBit($4)
	mulBit($3)
	mulBit($2)
	mulBit($1)
	mulBit($0)
	VMOVDQU X0, (AX)
	MOVQ c10+16(FP), CX
	VMOVDQU X2, (CX)
	VMOVDQU X1, (BX)
	MOVQ c11+24(FP), DX
	VMOVDQU X3, (DX)
	RET
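
The byte variant drives the same bit step eight times per byte. A model of the mulBit($7)..mulBit($0) unrolling above, again in terms of the sketch code (mulBitRight as modeled earlier, bits consumed most significant first):

// mulByteRight feeds the bits of b to the bit step, high bit first,
// matching the descending mulBit immediates in the assembly.
func mulByteRight(c00, c01, c10, c11 *GF127, b byte) {
	for i := 7; i >= 0; i-- {
		mulBitRight(c00, c01, c10, c11, (b>>uint(i))&1 == 1)
	}
}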