diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go new file mode 100644 index 000000000..87f1e369c --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go @@ -0,0 +1,17 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.11 +// +build !gccgo,!appengine + +package chacha20 + +const bufSize = 256 + +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s new file mode 100644 index 000000000..b3a16ef75 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s @@ -0,0 +1,308 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.11 +// +build !gccgo,!appengine + +#include "textflag.h" + +#define NUM_ROUNDS 10 + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD dst+0(FP), R1 + MOVD src+24(FP), R2 + MOVD src_len+32(FP), R3 + MOVD key+48(FP), R4 + MOVD nonce+56(FP), R6 + MOVD counter+64(FP), R7 + + MOVD $·constants(SB), R10 + MOVD $·incRotMatrix(SB), R11 + + MOVW (R7), R20 + + AND $~255, R3, R13 + ADD R2, R13, R12 // R12 for block end + AND $255, R3, R13 +loop: + MOVD $NUM_ROUNDS, R21 + VLD1 (R11), [V30.S4, V31.S4] + + // load contants + // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x4D60E940 + + // load keys + // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4] + WORD $0x4DFFE884 + // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4] + WORD $0x4DFFE888 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V12.S4] + WORD $0x4D40C8EC + + // VLD3R (R6), [V13.S4, V14.S4, V15.S4] + WORD $0x4D40E8CD + + // update counter + VADD V30.S4, V12.S4, V12.S4 + +chacha: + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) + VADD V8.S4, V12.S4, V8.S4 + VADD V9.S4, V13.S4, V9.S4 + VADD V10.S4, V14.S4, V10.S4 + VADD V11.S4, V15.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $12, V16.S4, V4.S4 + VSHL $12, V17.S4, V5.S4 + VSHL $12, V18.S4, V6.S4 + VSHL $12, V19.S4, V7.S4 + VSRI $20, V16.S4, V4.S4 + VSRI $20, V17.S4, V5.S4 + VSRI $20, V18.S4, V6.S4 + VSRI $20, V19.S4, V7.S4 + + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) + VADD V12.S4, V8.S4, V8.S4 + VADD V13.S4, V9.S4, V9.S4 + VADD V14.S4, V10.S4, V10.S4 + VADD V15.S4, V11.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $7, V16.S4, V4.S4 + VSHL $7, V17.S4, V5.S4 + VSHL $7, V18.S4, V6.S4 + VSHL $7, V19.S4, V7.S4 + VSRI $25, V16.S4, V4.S4 + VSRI $25, V17.S4, V5.S4 + VSRI $25, V18.S4, V6.S4 + VSRI $25, V19.S4, V7.S4 + + // V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) + VADD V0.S4, V5.S4, V0.S4 + VADD V1.S4, V6.S4, V1.S4 + VADD V2.S4, V7.S4, V2.S4 + VADD V3.S4, V4.S4, V3.S4 + VEOR V15.B16, V0.B16, V15.B16 + VEOR V12.B16, V1.B16, V12.B16 + VEOR V13.B16, V2.B16, V13.B16 + VEOR V14.B16, V3.B16, V14.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 12) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $12, V16.S4, V5.S4 + VSHL $12, V17.S4, V6.S4 + VSHL $12, V18.S4, V7.S4 + VSHL $12, V19.S4, V4.S4 + VSRI $20, V16.S4, V5.S4 + VSRI $20, V17.S4, V6.S4 + VSRI $20, V18.S4, V7.S4 + VSRI $20, V19.S4, V4.S4 + + // V0 += V5; V15 <<<= ((V0 XOR V15), 8) + // ... + VADD V5.S4, V0.S4, V0.S4 + VADD V6.S4, V1.S4, V1.S4 + VADD V7.S4, V2.S4, V2.S4 + VADD V4.S4, V3.S4, V3.S4 + VEOR V0.B16, V15.B16, V15.B16 + VEOR V1.B16, V12.B16, V12.B16 + VEOR V2.B16, V13.B16, V13.B16 + VEOR V3.B16, V14.B16, V14.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 7) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $7, V16.S4, V5.S4 + VSHL $7, V17.S4, V6.S4 + VSHL $7, V18.S4, V7.S4 + VSHL $7, V19.S4, V4.S4 + VSRI $25, V16.S4, V5.S4 + VSRI $25, V17.S4, V6.S4 + VSRI $25, V18.S4, V7.S4 + VSRI $25, V19.S4, V4.S4 + + SUB $1, R21 + CBNZ R21, chacha + + // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4] + WORD $0x4D60E950 + + // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4] + WORD $0x4DFFE894 + VADD V30.S4, V12.S4, V12.S4 + VADD V16.S4, V0.S4, V0.S4 + VADD V17.S4, V1.S4, V1.S4 + VADD V18.S4, V2.S4, V2.S4 + VADD V19.S4, V3.S4, V3.S4 + // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4] + WORD $0x4DFFE898 + // restore R4 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V28.S4] + WORD $0x4D40C8FC + // VLD3R (R6), [V29.S4, V30.S4, V31.S4] + WORD $0x4D40E8DD + + VADD V20.S4, V4.S4, V4.S4 + VADD V21.S4, V5.S4, V5.S4 + VADD V22.S4, V6.S4, V6.S4 + VADD V23.S4, V7.S4, V7.S4 + VADD V24.S4, V8.S4, V8.S4 + VADD V25.S4, V9.S4, V9.S4 + VADD V26.S4, V10.S4, V10.S4 + VADD V27.S4, V11.S4, V11.S4 + VADD V28.S4, V12.S4, V12.S4 + VADD V29.S4, V13.S4, V13.S4 + VADD V30.S4, V14.S4, V14.S4 + VADD V31.S4, V15.S4, V15.S4 + + VZIP1 V1.S4, V0.S4, V16.S4 + VZIP2 V1.S4, V0.S4, V17.S4 + VZIP1 V3.S4, V2.S4, V18.S4 + VZIP2 V3.S4, V2.S4, V19.S4 + VZIP1 V5.S4, V4.S4, V20.S4 + VZIP2 V5.S4, V4.S4, V21.S4 + VZIP1 V7.S4, V6.S4, V22.S4 + VZIP2 V7.S4, V6.S4, V23.S4 + VZIP1 V9.S4, V8.S4, V24.S4 + VZIP2 V9.S4, V8.S4, V25.S4 + VZIP1 V11.S4, V10.S4, V26.S4 + VZIP2 V11.S4, V10.S4, V27.S4 + VZIP1 V13.S4, V12.S4, V28.S4 + VZIP2 V13.S4, V12.S4, V29.S4 + VZIP1 V15.S4, V14.S4, V30.S4 + VZIP2 V15.S4, V14.S4, V31.S4 + VZIP1 V18.D2, V16.D2, V0.D2 + VZIP2 V18.D2, V16.D2, V4.D2 + VZIP1 V19.D2, V17.D2, V8.D2 + VZIP2 V19.D2, V17.D2, V12.D2 + VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] + + VZIP1 V22.D2, V20.D2, V1.D2 + VZIP2 V22.D2, V20.D2, V5.D2 + VZIP1 V23.D2, V21.D2, V9.D2 + VZIP2 V23.D2, V21.D2, V13.D2 + VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] + VZIP1 V26.D2, V24.D2, V2.D2 + VZIP2 V26.D2, V24.D2, V6.D2 + VZIP1 V27.D2, V25.D2, V10.D2 + VZIP2 V27.D2, V25.D2, V14.D2 + VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] + VZIP1 V30.D2, V28.D2, V3.D2 + VZIP2 V30.D2, V28.D2, V7.D2 + VZIP1 V31.D2, V29.D2, V11.D2 + VZIP2 V31.D2, V29.D2, V15.D2 + VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] + VEOR V0.B16, V16.B16, V16.B16 + VEOR V1.B16, V17.B16, V17.B16 + VEOR V2.B16, V18.B16, V18.B16 + VEOR V3.B16, V19.B16, V19.B16 + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) + VEOR V4.B16, V20.B16, V20.B16 + VEOR V5.B16, V21.B16, V21.B16 + VEOR V6.B16, V22.B16, V22.B16 + VEOR V7.B16, V23.B16, V23.B16 + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1) + VEOR V8.B16, V24.B16, V24.B16 + VEOR V9.B16, V25.B16, V25.B16 + VEOR V10.B16, V26.B16, V26.B16 + VEOR V11.B16, V27.B16, V27.B16 + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1) + VEOR V12.B16, V28.B16, V28.B16 + VEOR V13.B16, V29.B16, V29.B16 + VEOR V14.B16, V30.B16, V30.B16 + VEOR V15.B16, V31.B16, V31.B16 + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1) + + ADD $4, R20 + MOVW R20, (R7) // update counter + + CMP R2, R12 + BGT loop + + RET + + +DATA ·constants+0x00(SB)/4, $0x61707865 +DATA ·constants+0x04(SB)/4, $0x3320646e +DATA ·constants+0x08(SB)/4, $0x79622d32 +DATA ·constants+0x0c(SB)/4, $0x6b206574 +GLOBL ·constants(SB), NOPTR|RODATA, $32 + +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 +DATA ·incRotMatrix+0x10(SB)/4, $0x02010003 +DATA ·incRotMatrix+0x14(SB)/4, $0x06050407 +DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B +DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go new file mode 100644 index 000000000..098ec9f6b --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go @@ -0,0 +1,364 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms +// as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01. +package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" + "errors" + "math/bits" + + "golang.org/x/crypto/internal/subtle" +) + +const ( + // KeySize is the size of the key used by this cipher, in bytes. + KeySize = 32 + + // NonceSize is the size of the nonce used with the standard variant of this + // cipher, in bytes. + // + // Note that this is too short to be safely generated at random if the same + // key is reused more than 2³² times. + NonceSize = 12 + + // NonceSizeX is the size of the nonce used with the XChaCha20 variant of + // this cipher, in bytes. + NonceSizeX = 24 +) + +// Cipher is a stateful instance of ChaCha20 or XChaCha20 using a particular key +// and nonce. A *Cipher implements the cipher.Stream interface. +type Cipher struct { + // The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter + // (incremented after each block), and 3 of nonce. + key [8]uint32 + counter uint32 + nonce [3]uint32 + + // The last len bytes of buf are leftover key stream bytes from the previous + // XORKeyStream invocation. The size of buf depends on how many blocks are + // computed at a time. + buf [bufSize]byte + len int + + // The counter-independent results of the first round are cached after they + // are computed the first time. + precompDone bool + p1, p5, p9, p13 uint32 + p2, p6, p10, p14 uint32 + p3, p7, p11, p15 uint32 +} + +var _ cipher.Stream = (*Cipher)(nil) + +// NewUnauthenticatedCipher creates a new ChaCha20 stream cipher with the given +// 32 bytes key and a 12 or 24 bytes nonce. If a nonce of 24 bytes is provided, +// the XChaCha20 construction will be used. It returns an error if key or nonce +// have any other length. +// +// Note that ChaCha20, like all stream ciphers, is not authenticated and allows +// attackers to silently tamper with the plaintext. For this reason, it is more +// appropriate as a building block than as a standalone encryption mechanism. +// Instead, consider using package golang.org/x/crypto/chacha20poly1305. +func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) { + // This function is split into a wrapper so that the Cipher allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. + c := &Cipher{} + return newUnauthenticatedCipher(c, key, nonce) +} + +func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong key size") + } + if len(nonce) == NonceSizeX { + // XChaCha20 uses the ChaCha20 core to mix 16 bytes of the nonce into a + // derived key, allowing it to operate on a nonce of 24 bytes. See + // draft-irtf-cfrg-xchacha-01, Section 2.3. + key, _ = HChaCha20(key, nonce[0:16]) + cNonce := make([]byte, NonceSize) + copy(cNonce[4:12], nonce[16:24]) + nonce = cNonce + } else if len(nonce) != NonceSize { + return nil, errors.New("chacha20: wrong nonce size") + } + + c.key = [8]uint32{ + binary.LittleEndian.Uint32(key[0:4]), + binary.LittleEndian.Uint32(key[4:8]), + binary.LittleEndian.Uint32(key[8:12]), + binary.LittleEndian.Uint32(key[12:16]), + binary.LittleEndian.Uint32(key[16:20]), + binary.LittleEndian.Uint32(key[20:24]), + binary.LittleEndian.Uint32(key[24:28]), + binary.LittleEndian.Uint32(key[28:32]), + } + c.nonce = [3]uint32{ + binary.LittleEndian.Uint32(nonce[0:4]), + binary.LittleEndian.Uint32(nonce[4:8]), + binary.LittleEndian.Uint32(nonce[8:12]), + } + return c, nil +} + +// The constant first 4 words of the ChaCha20 state. +const ( + j0 uint32 = 0x61707865 // expa + j1 uint32 = 0x3320646e // nd 3 + j2 uint32 = 0x79622d32 // 2-by + j3 uint32 = 0x6b206574 // te k +) + +const blockSize = 64 + +// quarterRound is the core of ChaCha20. It shuffles the bits of 4 state words. +// It's executed 4 times for each of the 20 ChaCha20 rounds, operating on all 16 +// words each round, in columnar or diagonal groups of 4 at a time. +func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) { + a += b + d ^= a + d = bits.RotateLeft32(d, 16) + c += d + b ^= c + b = bits.RotateLeft32(b, 12) + a += b + d ^= a + d = bits.RotateLeft32(d, 8) + c += d + b ^= c + b = bits.RotateLeft32(b, 7) + return a, b, c, d +} + +// XORKeyStream XORs each byte in the given slice with a byte from the +// cipher's key stream. Dst and src must overlap entirely or not at all. +// +// If len(dst) < len(src), XORKeyStream will panic. It is acceptable +// to pass a dst bigger than src, and in that case, XORKeyStream will +// only update dst[:len(src)] and will not touch the rest of dst. +// +// Multiple calls to XORKeyStream behave as if the concatenation of +// the src buffers was passed in a single run. That is, Cipher +// maintains state and does not reset at each XORKeyStream call. +func (s *Cipher) XORKeyStream(dst, src []byte) { + if len(src) == 0 { + return + } + if len(dst) < len(src) { + panic("chacha20: output smaller than input") + } + dst = dst[:len(src)] + if subtle.InexactOverlap(dst, src) { + panic("chacha20: invalid buffer overlap") + } + + // First, drain any remaining key stream from a previous XORKeyStream. + if s.len != 0 { + keyStream := s.buf[bufSize-s.len:] + if len(src) < len(keyStream) { + keyStream = keyStream[:len(src)] + } + _ = src[len(keyStream)-1] // bounds check elimination hint + for i, b := range keyStream { + dst[i] = src[i] ^ b + } + s.len -= len(keyStream) + src = src[len(keyStream):] + dst = dst[len(keyStream):] + } + + const blocksPerBuf = bufSize / blockSize + numBufs := (uint64(len(src)) + bufSize - 1) / bufSize + if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 { + panic("chacha20: counter overflow") + } + + // xorKeyStreamBlocks implementations expect input lengths that are a + // multiple of bufSize. Platform-specific ones process multiple blocks at a + // time, so have bufSizes that are a multiple of blockSize. + + rem := len(src) % bufSize + full := len(src) - rem + + if full > 0 { + s.xorKeyStreamBlocks(dst[:full], src[:full]) + } + + // If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and + // keep the leftover keystream for the next XORKeyStream invocation. + if rem > 0 { + s.buf = [bufSize]byte{} + copy(s.buf[:], src[full:]) + s.xorKeyStreamBlocks(s.buf[:], s.buf[:]) + s.len = bufSize - copy(dst[full:], s.buf[:]) + } +} + +func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) { + if len(dst) != len(src) || len(dst)%blockSize != 0 { + panic("chacha20: internal error: wrong dst and/or src length") + } + + // To generate each block of key stream, the initial cipher state + // (represented below) is passed through 20 rounds of shuffling, + // alternatively applying quarterRounds by columns (like 1, 5, 9, 13) + // or by diagonals (like 1, 6, 11, 12). + // + // 0:cccccccc 1:cccccccc 2:cccccccc 3:cccccccc + // 4:kkkkkkkk 5:kkkkkkkk 6:kkkkkkkk 7:kkkkkkkk + // 8:kkkkkkkk 9:kkkkkkkk 10:kkkkkkkk 11:kkkkkkkk + // 12:bbbbbbbb 13:nnnnnnnn 14:nnnnnnnn 15:nnnnnnnn + // + // c=constant k=key b=blockcount n=nonce + var ( + c0, c1, c2, c3 = j0, j1, j2, j3 + c4, c5, c6, c7 = s.key[0], s.key[1], s.key[2], s.key[3] + c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7] + _, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2] + ) + + // Three quarters of the first round don't depend on the counter, so we can + // calculate them here, and reuse them for multiple blocks in the loop, and + // for future XORKeyStream invocations. + if !s.precompDone { + s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13) + s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14) + s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15) + s.precompDone = true + } + + for i := 0; i < len(src); i += blockSize { + // The remainder of the first column round. + fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter) + + // The second diagonal round. + x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15) + x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12) + x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13) + x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14) + + // The remaining 18 rounds. + for i := 0; i < 9; i++ { + // Column round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Diagonal round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + // Finally, add back the initial state to generate the key stream. + x0 += c0 + x1 += c1 + x2 += c2 + x3 += c3 + x4 += c4 + x5 += c5 + x6 += c6 + x7 += c7 + x8 += c8 + x9 += c9 + x10 += c10 + x11 += c11 + x12 += s.counter + x13 += c13 + x14 += c14 + x15 += c15 + + s.counter += 1 + if s.counter == 0 { + panic("chacha20: internal error: counter overflow") + } + + in, out := src[i:], dst[i:] + in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint + + // XOR the key stream with the source and write out the result. + xor(out[0:], in[0:], x0) + xor(out[4:], in[4:], x1) + xor(out[8:], in[8:], x2) + xor(out[12:], in[12:], x3) + xor(out[16:], in[16:], x4) + xor(out[20:], in[20:], x5) + xor(out[24:], in[24:], x6) + xor(out[28:], in[28:], x7) + xor(out[32:], in[32:], x8) + xor(out[36:], in[36:], x9) + xor(out[40:], in[40:], x10) + xor(out[44:], in[44:], x11) + xor(out[48:], in[48:], x12) + xor(out[52:], in[52:], x13) + xor(out[56:], in[56:], x14) + xor(out[60:], in[60:], x15) + } +} + +// HChaCha20 uses the ChaCha20 core to generate a derived key from a 32 bytes +// key and a 16 bytes nonce. It returns an error if key or nonce have any other +// length. It is used as part of the XChaCha20 construction. +func HChaCha20(key, nonce []byte) ([]byte, error) { + // This function is split into a wrapper so that the slice allocation will + // be inlined, and depending on how the caller uses the return value, won't + // escape to the heap. + out := make([]byte, 32) + return hChaCha20(out, key, nonce) +} + +func hChaCha20(out, key, nonce []byte) ([]byte, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20: wrong HChaCha20 key size") + } + if len(nonce) != 16 { + return nil, errors.New("chacha20: wrong HChaCha20 nonce size") + } + + x0, x1, x2, x3 := j0, j1, j2, j3 + x4 := binary.LittleEndian.Uint32(key[0:4]) + x5 := binary.LittleEndian.Uint32(key[4:8]) + x6 := binary.LittleEndian.Uint32(key[8:12]) + x7 := binary.LittleEndian.Uint32(key[12:16]) + x8 := binary.LittleEndian.Uint32(key[16:20]) + x9 := binary.LittleEndian.Uint32(key[20:24]) + x10 := binary.LittleEndian.Uint32(key[24:28]) + x11 := binary.LittleEndian.Uint32(key[28:32]) + x12 := binary.LittleEndian.Uint32(nonce[0:4]) + x13 := binary.LittleEndian.Uint32(nonce[4:8]) + x14 := binary.LittleEndian.Uint32(nonce[8:12]) + x15 := binary.LittleEndian.Uint32(nonce[12:16]) + + for i := 0; i < 10; i++ { + // Diagonal round. + x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12) + x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13) + x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14) + x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15) + + // Column round. + x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15) + x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12) + x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13) + x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14) + } + + _ = out[31] // bounds check elimination hint + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + return out, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go new file mode 100644 index 000000000..ec609ed86 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go @@ -0,0 +1,13 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo appengine + +package chacha20 + +const bufSize = blockSize + +func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) { + s.xorKeyStreamBlocksGeneric(dst, src) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go new file mode 100644 index 000000000..d0ec61f08 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go @@ -0,0 +1,16 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !gccgo,!appengine + +package chacha20 + +const bufSize = 256 + +//go:noescape +func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter) +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s new file mode 100644 index 000000000..533014ea3 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s @@ -0,0 +1,449 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +// Code for the perl script that generates the ppc64 assembler +// can be found in the cryptogams repository at the link below. It is based on +// the original from openssl. + +// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91 + +// The differences in this and the original implementation are +// due to the calling conventions and initialization of constants. + +// +build !gccgo,!appengine + +#include "textflag.h" + +#define OUT R3 +#define INP R4 +#define LEN R5 +#define KEY R6 +#define CNT R7 +#define TMP R15 + +#define CONSTBASE R16 +#define BLOCKS R17 + +DATA consts<>+0x00(SB)/8, $0x3320646e61707865 +DATA consts<>+0x08(SB)/8, $0x6b20657479622d32 +DATA consts<>+0x10(SB)/8, $0x0000000000000001 +DATA consts<>+0x18(SB)/8, $0x0000000000000000 +DATA consts<>+0x20(SB)/8, $0x0000000000000004 +DATA consts<>+0x28(SB)/8, $0x0000000000000000 +DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d +DATA consts<>+0x38(SB)/8, $0x0203000106070405 +DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c +DATA consts<>+0x48(SB)/8, $0x0102030005060704 +DATA consts<>+0x50(SB)/8, $0x6170786561707865 +DATA consts<>+0x58(SB)/8, $0x6170786561707865 +DATA consts<>+0x60(SB)/8, $0x3320646e3320646e +DATA consts<>+0x68(SB)/8, $0x3320646e3320646e +DATA consts<>+0x70(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x78(SB)/8, $0x79622d3279622d32 +DATA consts<>+0x80(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x88(SB)/8, $0x6b2065746b206574 +DATA consts<>+0x90(SB)/8, $0x0000000100000000 +DATA consts<>+0x98(SB)/8, $0x0000000300000002 +GLOBL consts<>(SB), RODATA, $0xa0 + +//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) +TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 + MOVD out+0(FP), OUT + MOVD inp+8(FP), INP + MOVD len+16(FP), LEN + MOVD key+24(FP), KEY + MOVD counter+32(FP), CNT + + // Addressing for constants + MOVD $consts<>+0x00(SB), CONSTBASE + MOVD $16, R8 + MOVD $32, R9 + MOVD $48, R10 + MOVD $64, R11 + SRD $6, LEN, BLOCKS + // V16 + LXVW4X (CONSTBASE)(R0), VS48 + ADD $80,CONSTBASE + + // Load key into V17,V18 + LXVW4X (KEY)(R0), VS49 + LXVW4X (KEY)(R8), VS50 + + // Load CNT, NONCE into V19 + LXVW4X (CNT)(R0), VS51 + + // Clear V27 + VXOR V27, V27, V27 + + // V28 + LXVW4X (CONSTBASE)(R11), VS60 + + // splat slot from V19 -> V26 + VSPLTW $0, V19, V26 + + VSLDOI $4, V19, V27, V19 + VSLDOI $12, V27, V19, V19 + + VADDUWM V26, V28, V26 + + MOVD $10, R14 + MOVD R14, CTR + +loop_outer_vsx: + // V0, V1, V2, V3 + LXVW4X (R0)(CONSTBASE), VS32 + LXVW4X (R8)(CONSTBASE), VS33 + LXVW4X (R9)(CONSTBASE), VS34 + LXVW4X (R10)(CONSTBASE), VS35 + + // splat values from V17, V18 into V4-V11 + VSPLTW $0, V17, V4 + VSPLTW $1, V17, V5 + VSPLTW $2, V17, V6 + VSPLTW $3, V17, V7 + VSPLTW $0, V18, V8 + VSPLTW $1, V18, V9 + VSPLTW $2, V18, V10 + VSPLTW $3, V18, V11 + + // VOR + VOR V26, V26, V12 + + // splat values from V19 -> V13, V14, V15 + VSPLTW $1, V19, V13 + VSPLTW $2, V19, V14 + VSPLTW $3, V19, V15 + + // splat const values + VSPLTISW $-16, V27 + VSPLTISW $12, V28 + VSPLTISW $8, V29 + VSPLTISW $7, V30 + +loop_vsx: + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + VRLW V15, V27, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V28, V4 + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VXOR V12, V0, V12 + VXOR V13, V1, V13 + VXOR V14, V2, V14 + VXOR V15, V3, V15 + + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + VRLW V15, V29, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V30, V4 + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V27, V15 + VRLW V12, V27, V12 + VRLW V13, V27, V13 + VRLW V14, V27, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + VRLW V4, V28, V4 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VXOR V15, V0, V15 + VXOR V12, V1, V12 + VXOR V13, V2, V13 + VXOR V14, V3, V14 + + VRLW V15, V29, V15 + VRLW V12, V29, V12 + VRLW V13, V29, V13 + VRLW V14, V29, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + VRLW V4, V30, V4 + BC 16, LT, loop_vsx + + VADDUWM V12, V26, V12 + + WORD $0x13600F8C // VMRGEW V0, V1, V27 + WORD $0x13821F8C // VMRGEW V2, V3, V28 + + WORD $0x10000E8C // VMRGOW V0, V1, V0 + WORD $0x10421E8C // VMRGOW V2, V3, V2 + + WORD $0x13A42F8C // VMRGEW V4, V5, V29 + WORD $0x13C63F8C // VMRGEW V6, V7, V30 + + XXPERMDI VS32, VS34, $0, VS33 + XXPERMDI VS32, VS34, $3, VS35 + XXPERMDI VS59, VS60, $0, VS32 + XXPERMDI VS59, VS60, $3, VS34 + + WORD $0x10842E8C // VMRGOW V4, V5, V4 + WORD $0x10C63E8C // VMRGOW V6, V7, V6 + + WORD $0x13684F8C // VMRGEW V8, V9, V27 + WORD $0x138A5F8C // VMRGEW V10, V11, V28 + + XXPERMDI VS36, VS38, $0, VS37 + XXPERMDI VS36, VS38, $3, VS39 + XXPERMDI VS61, VS62, $0, VS36 + XXPERMDI VS61, VS62, $3, VS38 + + WORD $0x11084E8C // VMRGOW V8, V9, V8 + WORD $0x114A5E8C // VMRGOW V10, V11, V10 + + WORD $0x13AC6F8C // VMRGEW V12, V13, V29 + WORD $0x13CE7F8C // VMRGEW V14, V15, V30 + + XXPERMDI VS40, VS42, $0, VS41 + XXPERMDI VS40, VS42, $3, VS43 + XXPERMDI VS59, VS60, $0, VS40 + XXPERMDI VS59, VS60, $3, VS42 + + WORD $0x118C6E8C // VMRGOW V12, V13, V12 + WORD $0x11CE7E8C // VMRGOW V14, V15, V14 + + VSPLTISW $4, V27 + VADDUWM V26, V27, V26 + + XXPERMDI VS44, VS46, $0, VS45 + XXPERMDI VS44, VS46, $3, VS47 + XXPERMDI VS61, VS62, $0, VS44 + XXPERMDI VS61, VS62, $3, VS46 + + VADDUWM V0, V16, V0 + VADDUWM V4, V17, V4 + VADDUWM V8, V18, V8 + VADDUWM V12, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + // Bottom of loop + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V1, V16, V0 + VADDUWM V5, V17, V4 + VADDUWM V9, V18, V8 + VADDUWM V13, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + VXOR V27, V0, V27 + + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(V10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V2, V16, V0 + VADDUWM V6, V17, V4 + VADDUWM V10, V18, V8 + VADDUWM V14, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V3, V16, V0 + VADDUWM V7, V17, V4 + VADDUWM V11, V18, V8 + VADDUWM V15, V19, V12 + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + + MOVD $10, R14 + MOVD R14, CTR + BNE loop_outer_vsx + +done_vsx: + // Increment counter by number of 64 byte blocks + MOVD (CNT), R14 + ADD BLOCKS, R14 + MOVD R14, (CNT) + RET + +tail_vsx: + ADD $32, R1, R11 + MOVD LEN, CTR + + // Save values on stack to copy from + STXVW4X VS32, (R11)(R0) + STXVW4X VS36, (R11)(R8) + STXVW4X VS40, (R11)(R9) + STXVW4X VS44, (R11)(R10) + ADD $-1, R11, R12 + ADD $-1, INP + ADD $-1, OUT + +looptail_vsx: + // Copying the result to OUT + // in bytes. + MOVBZU 1(R12), KEY + MOVBZU 1(INP), TMP + XOR KEY, TMP, KEY + MOVBU KEY, 1(OUT) + BC 16, LT, looptail_vsx + + // Clear the stack values + STXVW4X VS48, (R11)(R0) + STXVW4X VS48, (R11)(R8) + STXVW4X VS48, (R11)(R9) + STXVW4X VS48, (R11)(R10) + BR done_vsx diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go new file mode 100644 index 000000000..cd55f45a3 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go @@ -0,0 +1,26 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !gccgo,!appengine + +package chacha20 + +import "golang.org/x/sys/cpu" + +var haveAsm = cpu.S390X.HasVX + +const bufSize = 256 + +// xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only +// be called when the vector facility is available. Implementation in asm_s390x.s. +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) { + if cpu.S390X.HasVX { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) + } else { + c.xorKeyStreamBlocksGeneric(dst, src) + } +} diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s new file mode 100644 index 000000000..de52a2ea8 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s @@ -0,0 +1,224 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !gccgo,!appengine + +#include "go_asm.h" +#include "textflag.h" + +// This is an implementation of the ChaCha20 encryption algorithm as +// specified in RFC 7539. It uses vector instructions to compute +// 4 keystream blocks in parallel (256 bytes) which are then XORed +// with the bytes in the input slice. + +GLOBL ·constants<>(SB), RODATA|NOPTR, $32 +// BSWAP: swap bytes in each 4-byte element +DATA ·constants<>+0x00(SB)/4, $0x03020100 +DATA ·constants<>+0x04(SB)/4, $0x07060504 +DATA ·constants<>+0x08(SB)/4, $0x0b0a0908 +DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c +// J0: [j0, j1, j2, j3] +DATA ·constants<>+0x10(SB)/4, $0x61707865 +DATA ·constants<>+0x14(SB)/4, $0x3320646e +DATA ·constants<>+0x18(SB)/4, $0x79622d32 +DATA ·constants<>+0x1c(SB)/4, $0x6b206574 + +#define BSWAP V5 +#define J0 V6 +#define KEY0 V7 +#define KEY1 V8 +#define NONCE V9 +#define CTR V10 +#define M0 V11 +#define M1 V12 +#define M2 V13 +#define M3 V14 +#define INC V15 +#define X0 V16 +#define X1 V17 +#define X2 V18 +#define X3 V19 +#define X4 V20 +#define X5 V21 +#define X6 V22 +#define X7 V23 +#define X8 V24 +#define X9 V25 +#define X10 V26 +#define X11 V27 +#define X12 V28 +#define X13 V29 +#define X14 V30 +#define X15 V31 + +#define NUM_ROUNDS 20 + +#define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $16, a2, a2 \ + VERLLF $16, b2, b2 \ + VERLLF $16, c2, c2 \ + VERLLF $16, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $12, a1, a1 \ + VERLLF $12, b1, b1 \ + VERLLF $12, c1, c1 \ + VERLLF $12, d1, d1 \ + VAF a1, a0, a0 \ + VAF b1, b0, b0 \ + VAF c1, c0, c0 \ + VAF d1, d0, d0 \ + VX a0, a2, a2 \ + VX b0, b2, b2 \ + VX c0, c2, c2 \ + VX d0, d2, d2 \ + VERLLF $8, a2, a2 \ + VERLLF $8, b2, b2 \ + VERLLF $8, c2, c2 \ + VERLLF $8, d2, d2 \ + VAF a2, a3, a3 \ + VAF b2, b3, b3 \ + VAF c2, c3, c3 \ + VAF d2, d3, d3 \ + VX a3, a1, a1 \ + VX b3, b1, b1 \ + VX c3, c1, c1 \ + VX d3, d1, d1 \ + VERLLF $7, a1, a1 \ + VERLLF $7, b1, b1 \ + VERLLF $7, c1, c1 \ + VERLLF $7, d1, d1 + +#define PERMUTE(mask, v0, v1, v2, v3) \ + VPERM v0, v0, mask, v0 \ + VPERM v1, v1, mask, v1 \ + VPERM v2, v2, mask, v2 \ + VPERM v3, v3, mask, v3 + +#define ADDV(x, v0, v1, v2, v3) \ + VAF x, v0, v0 \ + VAF x, v1, v1 \ + VAF x, v2, v2 \ + VAF x, v3, v3 + +#define XORV(off, dst, src, v0, v1, v2, v3) \ + VLM off(src), M0, M3 \ + PERMUTE(BSWAP, v0, v1, v2, v3) \ + VX v0, M0, M0 \ + VX v1, M1, M1 \ + VX v2, M2, M2 \ + VX v3, M3, M3 \ + VSTM M0, M3, off(dst) + +#define SHUFFLE(a, b, c, d, t, u, v, w) \ + VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]} + VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]} + VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]} + VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]} + VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]} + VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]} + VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]} + VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]} + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD $·constants<>(SB), R1 + MOVD dst+0(FP), R2 // R2=&dst[0] + LMG src+24(FP), R3, R4 // R3=&src[0] R4=len(src) + MOVD key+48(FP), R5 // R5=key + MOVD nonce+56(FP), R6 // R6=nonce + MOVD counter+64(FP), R7 // R7=counter + + // load BSWAP and J0 + VLM (R1), BSWAP, J0 + + // setup + MOVD $95, R0 + VLM (R5), KEY0, KEY1 + VLL R0, (R6), NONCE + VZERO M0 + VLEIB $7, $32, M0 + VSRLB M0, NONCE, NONCE + + // initialize counter values + VLREPF (R7), CTR + VZERO INC + VLEIF $1, $1, INC + VLEIF $2, $2, INC + VLEIF $3, $3, INC + VAF INC, CTR, CTR + VREPIF $4, INC + +chacha: + VREPF $0, J0, X0 + VREPF $1, J0, X1 + VREPF $2, J0, X2 + VREPF $3, J0, X3 + VREPF $0, KEY0, X4 + VREPF $1, KEY0, X5 + VREPF $2, KEY0, X6 + VREPF $3, KEY0, X7 + VREPF $0, KEY1, X8 + VREPF $1, KEY1, X9 + VREPF $2, KEY1, X10 + VREPF $3, KEY1, X11 + VLR CTR, X12 + VREPF $1, NONCE, X13 + VREPF $2, NONCE, X14 + VREPF $3, NONCE, X15 + + MOVD $(NUM_ROUNDS/2), R1 + +loop: + ROUND4(X0, X4, X12, X8, X1, X5, X13, X9, X2, X6, X14, X10, X3, X7, X15, X11) + ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8, X3, X4, X14, X9) + + ADD $-1, R1 + BNE loop + + // decrement length + ADD $-256, R4 + + // rearrange vectors + SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3) + ADDV(J0, X0, X1, X2, X3) + SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3) + ADDV(KEY0, X4, X5, X6, X7) + SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3) + ADDV(KEY1, X8, X9, X10, X11) + VAF CTR, X12, X12 + SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3) + ADDV(NONCE, X12, X13, X14, X15) + + // increment counters + VAF INC, CTR, CTR + + // xor keystream with plaintext + XORV(0*64, R2, R3, X0, X4, X8, X12) + XORV(1*64, R2, R3, X1, X5, X9, X13) + XORV(2*64, R2, R3, X2, X6, X10, X14) + XORV(3*64, R2, R3, X3, X7, X11, X15) + + // increment pointers + MOVD $256(R2), R2 + MOVD $256(R3), R3 + + CMPBNE R4, $0, chacha + + VSTEF $0, CTR, (R7) + RET diff --git a/vendor/golang.org/x/crypto/chacha20/xor.go b/vendor/golang.org/x/crypto/chacha20/xor.go new file mode 100644 index 000000000..0110c9865 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/xor.go @@ -0,0 +1,41 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found src the LICENSE file. + +package chacha20 + +import "runtime" + +// Platforms that have fast unaligned 32-bit little endian accesses. +const unaligned = runtime.GOARCH == "386" || + runtime.GOARCH == "amd64" || + runtime.GOARCH == "arm64" || + runtime.GOARCH == "ppc64le" || + runtime.GOARCH == "s390x" + +// xor reads a little endian uint32 from src, XORs it with u and +// places the result in little endian byte order in dst. +func xor(dst, src []byte, u uint32) { + _, _ = src[3], dst[3] // eliminate bounds checks + if unaligned { + // The compiler should optimize this code into + // 32-bit unaligned little endian loads and stores. + // TODO: delete once the compiler does a reliably + // good job with the generic code below. + // See issue #25111 for more details. + v := uint32(src[0]) + v |= uint32(src[1]) << 8 + v |= uint32(src[2]) << 16 + v |= uint32(src[3]) << 24 + v ^= u + dst[0] = byte(v) + dst[1] = byte(v >> 8) + dst[2] = byte(v >> 16) + dst[3] = byte(v >> 24) + } else { + dst[0] = src[0] ^ byte(u) + dst[1] = src[1] ^ byte(u>>8) + dst[2] = src[2] ^ byte(u>>16) + dst[3] = src[3] ^ byte(u>>24) + } +} diff --git a/vendor/golang.org/x/crypto/poly1305/bits_compat.go b/vendor/golang.org/x/crypto/poly1305/bits_compat.go new file mode 100644 index 000000000..157a69f61 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/bits_compat.go @@ -0,0 +1,39 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !go1.13 + +package poly1305 + +// Generic fallbacks for the math/bits intrinsics, copied from +// src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had +// variable time fallbacks until Go 1.13. + +func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { + sum = x + y + carry + carryOut = ((x & y) | ((x | y) &^ sum)) >> 63 + return +} + +func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { + diff = x - y - borrow + borrowOut = ((^x & y) | (^(x ^ y) & diff)) >> 63 + return +} + +func bitsMul64(x, y uint64) (hi, lo uint64) { + const mask32 = 1<<32 - 1 + x0 := x & mask32 + x1 := x >> 32 + y0 := y & mask32 + y1 := y >> 32 + w0 := x0 * y0 + t := x1*y0 + w0>>32 + w1 := t & mask32 + w2 := t >> 32 + w1 += x0 * y1 + hi = x1*y1 + w2 + w1>>32 + lo = x * y + return +} diff --git a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go b/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go new file mode 100644 index 000000000..a0a185f0f --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go @@ -0,0 +1,21 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 + +package poly1305 + +import "math/bits" + +func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) { + return bits.Add64(x, y, carry) +} + +func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) { + return bits.Sub64(x, y, borrow) +} + +func bitsMul64(x, y uint64) (hi, lo uint64) { + return bits.Mul64(x, y) +}