200 lines
5.7 KiB
ArmAsm
200 lines
5.7 KiB
ArmAsm
|
// The MIT License (MIT)
|
||
|
//
|
||
|
// Copyright (C) 2016-2017 Vivint, Inc.
|
||
|
// Copyright (c) 2015 Klaus Post
|
||
|
// Copyright (c) 2015 Backblaze
|
||
|
//
|
||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||
|
// of this software and associated documentation files (the "Software"), to deal
|
||
|
// in the Software without restriction, including without limitation the rights
|
||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||
|
// copies of the Software, and to permit persons to whom the Software is
|
||
|
// furnished to do so, subject to the following conditions:
|
||
|
//
|
||
|
// The above copyright notice and this permission notice shall be included in all
|
||
|
// copies or substantial portions of the Software.
|
||
|
//
|
||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||
|
// SOFTWARE.
|
||
|
|
||
|
/*
|
||
|
The corresponding C implementations:
|
||
|
|
||
|
void addmul(
|
||
|
uint8_t * restrict lowhigh,
|
||
|
uint8_t * restrict in,
|
||
|
uint8_t * restrict out,
|
||
|
int n
|
||
|
) {
|
||
|
for(int i = 0; i < n; i++){
|
||
|
int value = in[i];
|
||
|
int low = value & 15;
|
||
|
int high = value >> 4;
|
||
|
out[i] = out[i] ^ lowhigh[low] ^ lowhigh[high+16];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void addmulSSSE3(
|
||
|
uint8_t * restrict lowhigh,
|
||
|
uint8_t * restrict in,
|
||
|
uint8_t * restrict out,
|
||
|
int n
|
||
|
) {
|
||
|
int i = 0;
|
||
|
|
||
|
__m128i lotbl = _mm_loadu_si128((__m128i*)(&lowhigh[0]));
|
||
|
__m128i hitbl = _mm_loadu_si128((__m128i*)(&lowhigh[16]));
|
||
|
|
||
|
__m128i lomask = _mm_set1_epi8(0xF);
|
||
|
|
||
|
#pragma nounroll
|
||
|
for(i = 0; i < (n/16)*16; i += 16){
|
||
|
__m128i input8 = _mm_loadu_si128((__m128i*)(&in[i]));
|
||
|
__m128i output8 = _mm_loadu_si128((__m128i*)(&out[i]));
|
||
|
|
||
|
__m128i lo8 = _mm_and_si128(lomask, input8);
|
||
|
__m128i hi8 = _mm_and_si128(lomask, _mm_srli_epi64(input8, 4)); // simulate srli epi8: shift by 4 bits, then mask off bits that crossed byte boundaries
|
||
|
|
||
|
output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(lotbl, lo8));
|
||
|
output8 = _mm_xor_si128(output8, _mm_shuffle_epi8(hitbl, hi8));
|
||
|
|
||
|
_mm_storeu_si128((__m128i*)(&out[i]), output8);
|
||
|
}
|
||
|
}
|
||
|
*/
|
||
|
|
||
|
#include "textflag.h"
|
||
|
// nybble_mask is a 32-byte, read-only, pointer-free constant in which every
// byte is 0x0F. addmulSSSE3 loads 16 bytes of it into LOMASK and ANDs it with
// the input to isolate a 4-bit nibble in each byte lane.
DATA nybble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||
|
DATA nybble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||
|
DATA nybble_mask<>+0x10(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||
|
DATA nybble_mask<>+0x18(SB)/8, $0x0F0F0F0F0F0F0F0F
|
||
|
GLOBL nybble_mask<>(SB), (NOPTR+RODATA), $32
|
||
|
|
||
|
// Register aliases used by addmulSSSE3:
//   LOWHIGH (DI)  - base pointer of the lookup table currently in use
//   LOW     (X8)  - 16 bytes loaded from 0(LOWHIGH)  (table for low nibbles)
//   HIGH    (X9)  - 16 bytes loaded from 16(LOWHIGH) (table for high nibbles)
//   IN      (SI)  - input pointer
//   OUT     (DX)  - output pointer
//   INDEX   (AX)  - byte offset into IN and OUT
//   LEN     (CX)  - total byte count
//   LEN16   (R8)  - LEN rounded down to a multiple of 16
//   LOMASK  (X7)  - 0x0F in every byte
#define LOWHIGH DI
|
||
|
#define LOW X8
|
||
|
#define HIGH X9
|
||
|
#define IN SI
|
||
|
#define OUT DX
|
||
|
#define INDEX AX
|
||
|
|
||
|
#define LEN CX
|
||
|
#define LEN16 R8 // LEN16 = (LEN / 16) * 16
|
||
|
|
||
|
#define LOMASK X7 // LOMASK = repeated 15
|
||
|
// X0-X5 temps
|
||
|
|
||
|
// func addmulSSSE3(lowhigh *[2][16]byte, in, out *byte, len int)
//
// XORs a table-lookup transform of in[0:len] into out[0:len].
//
// Arguments are read from the frame at: lowhigh 0(FP), in 8(FP), out 16(FP),
// len 24(FP). The byte-at-a-time tail additionally reads a pointer at 32(FP).
// NOTE(review): that fifth argument is not listed in the prototype comment
// above — confirm against the Go declaration.
//
// Fast path (16 bytes per iteration, for the first LEN16 bytes):
//   out[i] ^= LOW[in[i] & 15] ^ HIGH[in[i] >> 4]
// Tail (remaining LEN - LEN16 bytes, one byte per iteration):
//   out[i] ^= table[in[i]]   where table is the pointer read from 32(FP)
//
// TEXT flag 7 is presumably NOPROF|DUPOK|NOSPLIT from textflag.h; the frame
// size is $0 (no locals).
|
||
|
TEXT ·addmulSSSE3(SB), 7, $0
|
||
|
MOVQ _in+8(FP), IN
|
||
|
MOVQ _out+16(FP), OUT
|
||
|
MOVQ _len+24(FP), LEN
|
||
|
|
||
|
MOVQ LEN, LEN16
|
||
|
ANDQ $-16, LEN16
|
||
|
|
||
|
JLE start_slow // flags come from the ANDQ: if LEN16 <= 0 { goto start_slow } (skip the 16-byte loop)
|
||
|
|
||
|
MOVQ _lohi+0(FP), LOWHIGH
|
||
|
MOVOU (LOWHIGH), LOW
|
||
|
MOVOU 16(LOWHIGH), HIGH
|
||
|
|
||
|
MOVOU nybble_mask<>(SB), LOMASK
|
||
|
XORQ INDEX, INDEX // INDEX = 0
|
||
|
|
||
|
loop16:
|
||
|
MOVOU (IN)(INDEX*1), X0 // X0 = IN[INDEX : INDEX+16]
|
||
|
MOVOU LOW, X4 // X4 = copy(LOW); PSHUFB overwrites its table operand
|
||
|
MOVOU (OUT)(INDEX*1), X2 // X2 = OUT[INDEX : INDEX+16]
|
||
|
MOVOU X0, X1 // X1 = copy(X0), kept for the high-nibble path
|
||
|
MOVOU HIGH, X5 // X5 = copy(HIGH); PSHUFB overwrites its table operand
|
||
|
|
||
|
PAND LOMASK, X0 // X0 = input & 15 (low nibble of every byte)
|
||
|
PSRLQ $4, X1 // shift each 64-bit lane right by 4 bits; stray bits from the neighbouring byte are masked off below
|
||
|
PSHUFB X0, X4 // X4 = LOW[X0]
|
||
|
|
||
|
PAND LOMASK, X1 // X1 = (input >> 4) & 15 (high nibble of every byte)
|
||
|
PSHUFB X1, X5 // X5 = HIGH[X1]
|
||
|
PXOR X4, X2 // X2 = OUT[INDEX] ^ X4 ^ X5
|
||
|
PXOR X5, X2
|
||
|
|
||
|
MOVOU X2, 0(OUT)(INDEX*1) // store the 16 result bytes back to OUT
|
||
|
|
||
|
ADDQ $16, INDEX
|
||
|
CMPQ LEN16, INDEX // loop while INDEX < LEN16
|
||
|
JG loop16
|
||
|
|
||
|
start_slow:
|
||
|
// NOTE(review): despite the "_len" name, the value at 32(FP) is used below as
|
||
|
// the base of a byte-indexed lookup table, so LOWHIGH is repurposed here —
|
||
|
// confirm the argument name against the Go declaration.
|
||
|
MOVQ _len+32(FP), LOWHIGH
|
||
|
MOVQ LEN16, INDEX // resume where the 16-byte loop stopped
|
||
|
CMPQ LEN, INDEX // if LEN <= INDEX there is no tail
|
||
|
JLE done
|
||
|
|
||
|
loop1:
|
||
|
MOVBQZX (IN)(INDEX*1), R9 // R9 := in[index]
|
||
|
MOVBQZX (LOWHIGH)(R9*1), R10 // R10 := multiply[R9]
|
||
|
XORB R10B, (OUT)(INDEX*1) // out[index] ^= R10
|
||
|
INCQ INDEX
|
||
|
CMPQ LEN, INDEX // loop while INDEX < LEN
|
||
|
JG loop1
|
||
|
|
||
|
done:
|
||
|
RET
|
||
|
|
||
|
// Drop the addmulSSSE3 register aliases so the macro names do not apply to
// the code that follows (addmulAVX2 uses raw register names).
#undef LOWHIGH
|
||
|
#undef LOW
|
||
|
#undef HIGH
|
||
|
#undef IN
|
||
|
#undef OUT
|
||
|
#undef LEN
|
||
|
#undef INDEX
|
||
|
#undef LEN16
|
||
|
#undef LOMASK
|
||
|
|
||
|
// func addmulAVX2(lowhigh *[2][16]byte, in, out *byte, len int)
//
// 32-bytes-per-iteration variant of the nibble-table transform:
//   out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
//
// Arguments are read from the frame at: lowhigh 0(FP), in 8(FP), out 16(FP),
// len 24(FP). Only len/32 full blocks are processed; the trailing len%32 bytes
// are not touched by this routine — presumably the caller handles them (verify).
//
// The AVX2 instructions are emitted as raw bytes via LONG/WORD/BYTE; the
// trailing comment on each such line gives the intended mnemonic in Intel
// operand order (destination first).
|
||
|
TEXT ·addmulAVX2(SB), 7, $0
|
||
|
MOVQ low+0(FP), SI // SI: &lowhigh
|
||
|
MOVOU (SI), X6 // X6: low
|
||
|
MOVOU 16(SI), X7 // X7: high
|
||
|
|
||
|
MOVQ $15, BX // BX: low mask
|
||
|
MOVQ BX, X5 // X5: 0x0F in its lowest byte, source for the broadcast below
|
||
|
|
||
|
MOVQ len+24(FP), R9 // R9: len(in), len(out)
|
||
|
|
||
|
LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low table copied into the upper 128-bit lane too
|
||
|
LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high table copied into the upper 128-bit lane too
|
||
|
LONG $0x787d62c4; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5 ; YMM8: lomask, 0x0F in all 32 bytes
|
||
|
|
||
|
SHRQ $5, R9 // R9 = len / 32 = number of 32-byte blocks
|
||
|
MOVQ out+16(FP), DX // DX: &out
|
||
|
MOVQ in+8(FP), SI // SI: &in (SI is reused; &lowhigh is no longer needed)
|
||
|
TESTQ R9, R9
|
||
|
JZ done_xor_avx2 // no full block: skip the loop
|
||
|
|
||
|
loopback_xor_avx2:
|
||
|
LONG $0x066ffec5 // VMOVDQU YMM0, [rsi] ; YMM0: 32 input bytes
|
||
|
LONG $0x226ffec5 // VMOVDQU YMM4, [rdx] ; YMM4: 32 output bytes
|
||
|
LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4 ; YMM1: input shifted right 4 bits per 64-bit lane
|
||
|
LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8 ; YMM0: low nibbles
|
||
|
LONG $0xdb75c1c4; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8 ; YMM1: high nibbles
|
||
|
LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0 ; YMM2: mul low part
|
||
|
LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1 ; YMM3: mul high part
|
||
|
LONG $0xdbefedc5 // VPXOR YMM3, YMM2, YMM3 ; YMM3: low part ^ high part
|
||
|
LONG $0xe4efe5c5 // VPXOR YMM4, YMM3, YMM4 ; YMM4: out ^ YMM3 (final result)
|
||
|
LONG $0x227ffec5 // VMOVDQU [rdx], YMM4 ; store 32 result bytes
|
||
|
|
||
|
ADDQ $32, SI // in+=32
|
||
|
ADDQ $32, DX // out+=32
|
||
|
SUBQ $1, R9 // one block done; loop until the block count reaches zero
|
||
|
JNZ loopback_xor_avx2
|
||
|
|
||
|
done_xor_avx2:
|
||
|
// VZEROUPPER
|
||
|
BYTE $0xc5; BYTE $0xf8; BYTE $0x77
|
||
|
RET
|