vendor: update resumable dependency

Updates resumable hash implementation to Go 1.8 equivalent. This should
be a major speedup, since it includes a number of optimizations from Go
1.7.

Signed-off-by: Stephen J Day <stephen.day@docker.com>
This commit is contained in:
Stephen J Day 2017-03-02 13:39:41 -08:00
parent 83f857ca12
commit f01bcc8f62
No known key found for this signature in database
GPG key ID: 67B3DED84EDC823F
19 changed files with 1118 additions and 98 deletions

View file

@ -23,7 +23,7 @@ github.com/mitchellh/mapstructure 482a9fd5fa83e8c4e7817413b80f3eb8feec03ef
github.com/ncw/swift b964f2ca856aac39885e258ad25aec08d5f64ee6 github.com/ncw/swift b964f2ca856aac39885e258ad25aec08d5f64ee6
github.com/spf13/cobra 312092086bed4968099259622145a0c9ae280064 github.com/spf13/cobra 312092086bed4968099259622145a0c9ae280064
github.com/spf13/pflag 5644820622454e71517561946e3d94b9f9db6842 github.com/spf13/pflag 5644820622454e71517561946e3d94b9f9db6842
github.com/stevvooe/resumable 51ad44105773cafcbe91927f70ac68e1bf78f8b4 github.com/stevvooe/resumable 2aaf90b2ceea5072cb503ef2a620b08ff3119870
github.com/xenolf/lego a9d8cec0e6563575e5868a005359ac97911b5985 github.com/xenolf/lego a9d8cec0e6563575e5868a005359ac97911b5985
github.com/yvasiyarov/go-metrics 57bccd1ccd43f94bb17fdd8bf3007059b802f85e github.com/yvasiyarov/go-metrics 57bccd1ccd43f94bb17fdd8bf3007059b802f85e
github.com/yvasiyarov/gorelic a9bba5b9ab508a086f9a12b8c51fab68478e2128 github.com/yvasiyarov/gorelic a9bba5b9ab508a086f9a12b8c51fab68478e2128

View file

@ -26,7 +26,15 @@
// functions. // functions.
package resumable package resumable
import "hash" import (
"fmt"
"hash"
)
var (
// ErrBadState is returned if Restore fails post-unmarshaling validation.
ErrBadState = fmt.Errorf("bad hash state")
)
// Hash is the common interface implemented by all resumable hash functions. // Hash is the common interface implemented by all resumable hash functions.
type Hash interface { type Hash interface {

View file

@ -2,8 +2,10 @@ package sha256
import ( import (
"bytes" "bytes"
"crypto"
"encoding/gob" "encoding/gob"
"github.com/stevvooe/resumable"
// import to ensure that our init function runs after the standard package // import to ensure that our init function runs after the standard package
_ "crypto/sha256" _ "crypto/sha256"
) )
@ -18,10 +20,15 @@ func (d *digest) State() ([]byte, error) {
var buf bytes.Buffer var buf bytes.Buffer
encoder := gob.NewEncoder(&buf) encoder := gob.NewEncoder(&buf)
function := crypto.SHA256
if d.is224 {
function = crypto.SHA224
}
// We encode this way so that we do not have // We encode this way so that we do not have
// to export these fields of the digest struct. // to export these fields of the digest struct.
vals := []interface{}{ vals := []interface{}{
d.h, d.x, d.nx, d.len, d.is224, d.h, d.x, d.nx, d.len, function,
} }
for _, val := range vals { for _, val := range vals {
@ -37,10 +44,12 @@ func (d *digest) State() ([]byte, error) {
func (d *digest) Restore(state []byte) error { func (d *digest) Restore(state []byte) error {
decoder := gob.NewDecoder(bytes.NewReader(state)) decoder := gob.NewDecoder(bytes.NewReader(state))
var function uint
// We decode this way so that we do not have // We decode this way so that we do not have
// to export these fields of the digest struct. // to export these fields of the digest struct.
vals := []interface{}{ vals := []interface{}{
&d.h, &d.x, &d.nx, &d.len, &d.is224, &d.h, &d.x, &d.nx, &d.len, &function,
} }
for _, val := range vals { for _, val := range vals {
@ -49,5 +58,14 @@ func (d *digest) Restore(state []byte) error {
} }
} }
switch crypto.Hash(function) {
case crypto.SHA224:
d.is224 = true
case crypto.SHA256:
d.is224 = false
default:
return resumable.ErrBadState
}
return nil return nil
} }

View file

@ -2,8 +2,6 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// +build !386,!amd64
// SHA256 block step. // SHA256 block step.
// In its own file so that a faster assembly or C version // In its own file so that a faster assembly or C version
// can be substituted easily. // can be substituted easily.
@ -77,7 +75,7 @@ var _K = []uint32{
0xc67178f2, 0xc67178f2,
} }
func block(dig *digest, p []byte) { func blockGeneric(dig *digest, p []byte) {
var w [64]uint32 var w [64]uint32
h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
for len(p) >= chunk { for len(p) >= chunk {

View file

@ -141,7 +141,7 @@
MSGSCHEDULE1(index); \ MSGSCHEDULE1(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h) SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
TEXT ·block(SB),0,$296-12 TEXT ·block(SB),0,$296-16
MOVL p_base+4(FP), SI MOVL p_base+4(FP), SI
MOVL p_len+8(FP), DX MOVL p_len+8(FP), DX
SHRL $6, DX SHRL $6, DX

View file

@ -9,7 +9,18 @@
// The algorithm is detailed in FIPS 180-4: // The algorithm is detailed in FIPS 180-4:
// //
// http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf // http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// The avx2-version is described in an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
// To find it, surf to http://www.intel.com/p/en_US/embedded
// and search for that title.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// Wt = Mt; for 0 <= t <= 15 // Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 // Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// //
@ -140,7 +151,420 @@
MSGSCHEDULE1(index); \ MSGSCHEDULE1(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h) SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
TEXT ·block(SB),0,$264-32
// Definitions for AVX2 version
// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
ADDL P2, P1; \
MOVL P1, P2
#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7
#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7
#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11
#define XFER Y9
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13
#define NUM_BYTES DX
#define INP DI
#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11
#define old_h R11
#define TBL BP
#define SRND SI // SRND is same register as CTX
#define T1 R12
#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI
// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8
#define TMP_SIZE 4
#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define _TMP _INP + INP_SIZE
#define STACK_SIZE _TMP + TMP_SIZE
#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
; \ // ############################# RND N + 0 ############################//
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
; \
ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // disp = k + w
ORL c, y3; \ // y3 = a|c // MAJA
VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
MOVL f, y2; \ // y2 = f // CH
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
XORL g, y2; \ // y2 = f^g // CH
VPADDD XDWORD0, XTMP0, XTMP0; \ // XTMP0 = W[-7] + W[-16] // y1 = (e >> 6) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
; \
ANDL e, y2; \ // y2 = (f^g)&e // CH
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
ADDL h, d; \ // d = k + w + h + d // --
; \
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
; \
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
VPSRLD $7, XTMP1, XTMP2; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL c, T1; \ // T1 = a&c // MAJB
; \
ADDL y0, y2; \ // y2 = S1 + CH // --
VPSLLD $(32-7), XTMP1, XTMP3; \
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
; \
ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
VPOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7
; \
VPSRLD $18, XTMP1, XTMP2; \
ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
ADDL y3, h // h = t1 + S0 + MAJ // --
#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
; \ // ################################### RND N + 1 ############################
; \
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
VPSRLD $3, XTMP1, XTMP4; \ // XTMP4 = W[-15] >> 3
MOVL f, y2; \ // y2 = f // CH
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
XORL g, y2; \ // y2 = f^g // CH
; \
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
ANDL e, y2; \ // y2 = (f^g)&e // CH
ADDL h, d; \ // d = k + w + h + d // --
; \
VPSLLD $(32-18), XTMP1, XTMP1; \
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
; \
VPXOR XTMP1, XTMP3, XTMP3; \
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
; \
VPXOR XTMP2, XTMP3, XTMP3; \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
VPXOR XTMP4, XTMP3, XTMP1; \ // XTMP1 = s0
VPSHUFD $0xFA, XDWORD3, XTMP2; \ // XTMP2 = W[-2] {BBAA}
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
; \
VPADDD XTMP1, XTMP0, XTMP0; \ // XTMP0 = W[-16] + W[-7] + s0
ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
ADDL y3, h; \ // h = t1 + S0 + MAJ // --
; \
VPSRLD $10, XTMP2, XTMP4 // XTMP4 = W[-2] >> 10 {BBAA}
#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
; \ // ################################### RND N + 2 ############################
; \
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
; \
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xBxA}
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
ORL c, y3; \ // y3 = a|c // MAJA
MOVL f, y2; \ // y2 = f // CH
XORL g, y2; \ // y2 = f^g // CH
; \
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xBxA}
ANDL e, y2; \ // y2 = (f^g)&e // CH
; \
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
VPXOR XTMP3, XTMP2, XTMP2; \
ADDL h, d; \ // d = k + w + h + d // --
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
VPXOR XTMP2, XTMP4, XTMP4; \ // XTMP4 = s1 {xBxA}
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
; \
MOVL f, _TMP(SP); \
MOVQ $shuff_00BA<>(SB), f; \ // f is used to keep SHUF_00BA
VPSHUFB (f), XTMP4, XTMP4; \ // XTMP4 = s1 {00BA}
MOVL _TMP(SP), f; \ // f is restored
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
VPADDD XTMP4, XTMP0, XTMP0; \ // XTMP0 = {..., ..., W[1], W[0]}
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
VPSHUFD $80, XTMP0, XTMP2; \ // XTMP2 = W[-2] {DDCC}
; \
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
; \
ADDL y3, h // h = t1 + S0 + MAJ // --
#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
; \ // ################################### RND N + 3 ############################
; \
MOVL a, y3; \ // y3 = a // MAJA
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
VPSRLD $10, XTMP2, XTMP5; \ // XTMP5 = W[-2] >> 10 {DDCC}
MOVL f, y2; \ // y2 = f // CH
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
XORL g, y2; \ // y2 = f^g // CH
; \
VPSRLQ $19, XTMP2, XTMP3; \ // XTMP3 = W[-2] ror 19 {xDxC}
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
ANDL e, y2; \ // y2 = (f^g)&e // CH
ADDL h, d; \ // d = k + w + h + d // --
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
; \
VPSRLQ $17, XTMP2, XTMP2; \ // XTMP2 = W[-2] ror 17 {xDxC}
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
; \
VPXOR XTMP3, XTMP2, XTMP2; \
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
VPXOR XTMP2, XTMP5, XTMP5; \ // XTMP5 = s1 {xDxC}
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
; \
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
; \
MOVL f, _TMP(SP); \ // Save f
MOVQ $shuff_DC00<>(SB), f; \ // SHUF_00DC
VPSHUFB (f), XTMP5, XTMP5; \ // XTMP5 = s1 {DC00}
MOVL _TMP(SP), f; \ // Restore f
; \
VPADDD XTMP0, XTMP5, XDWORD0; \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL c, T1; \ // T1 = a&c // MAJB
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
; \
ADDL y1, h; \ // h = k + w + h + S0 // --
ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
ADDL y3, h // h = t1 + S0 + MAJ // --
#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
; \ // ################################### RND N + 0 ###########################
MOVL f, y2; \ // y2 = f // CH
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
XORL g, y2; \ // y2 = f^g // CH
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
ANDL e, y2; \ // y2 = (f^g)&e // CH
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
MOVL a, y3; \ // y3 = a // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
ADDL h, d; \ // d = k + w + h + d // --
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
; \ // ################################### RND N + 1 ###########################
ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
MOVL f, y2; \ // y2 = f // CH
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
XORL g, y2; \ // y2 = f^g // CH
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
ANDL e, y2; \ // y2 = (f^g)&e // CH
ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
MOVL a, y3; \ // y3 = a // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
ADDL h, d; \ // d = k + w + h + d // --
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
; \
ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
; \ // ################################### RND N + 2 ##############################
ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
MOVL f, y2; \ // y2 = f // CH
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
XORL g, y2; \ // y2 = f^g // CH
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
ANDL e, y2; \ // y2 = (f^g)&e // CH
ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
MOVL a, y3; \ // y3 = a // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
ADDL h, d; \ // d = k + w + h + d // --
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
; \
ADDL y2, d // d = k + w + h + d + S1 + CH = d + t1 // --
#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
; \ // ################################### RND N + 3 ###########################
ADDL y2, old_h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
MOVL f, y2; \ // y2 = f // CH
RORXL $25, e, y0; \ // y0 = e >> 25 // S1A
RORXL $11, e, y1; \ // y1 = e >> 11 // S1B
XORL g, y2; \ // y2 = f^g // CH
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) // S1
RORXL $6, e, y1; \ // y1 = (e >> 6) // S1
ANDL e, y2; \ // y2 = (f^g)&e // CH
ADDL y3, old_h; \ // h = t1 + S0 + MAJ // --
; \
XORL y1, y0; \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6) // S1
RORXL $13, a, T1; \ // T1 = a >> 13 // S0B
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
RORXL $22, a, y1; \ // y1 = a >> 22 // S0A
MOVL a, y3; \ // y3 = a // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) // S0
RORXL $2, a, T1; \ // T1 = (a >> 2) // S0
ADDL (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h // --
ORL c, y3; \ // y3 = a|c // MAJA
; \
XORL T1, y1; \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2) // S0
MOVL a, T1; \ // T1 = a // MAJB
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
ANDL c, T1; \ // T1 = a&c // MAJB
ADDL y0, y2; \ // y2 = S1 + CH // --
; \
ADDL h, d; \ // d = k + w + h + d // --
ORL T1, y3; \ // y3 = MAJ = (a|c)&b)|(a&c) // MAJ
ADDL y1, h; \ // h = k + w + h + S0 // --
; \
ADDL y2, d; \ // d = k + w + h + d + S1 + CH = d + t1 // --
; \
ADDL y2, h; \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
; \
ADDL y3, h // h = t1 + S0 + MAJ // --
TEXT ·block(SB), 0, $536-32
CMPB runtime·support_avx2(SB), $0
JE noavx2bmi2
CMPB runtime·support_bmi2(SB), $1 // check for RORXL instruction
JE avx2
noavx2bmi2:
MOVQ p_base+8(FP), SI MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX MOVQ p_len+16(FP), DX
SHRQ $6, DX SHRQ $6, DX
@ -162,7 +586,7 @@ TEXT ·block(SB),0,$264-32
MOVL (7*4)(BP), R15 // h = H7 MOVL (7*4)(BP), R15 // h = H7
loop: loop:
MOVQ SP, BP // message schedule MOVQ SP, BP
SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15) SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14) SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
@ -254,3 +678,367 @@ loop:
end: end:
RET RET
avx2:
MOVQ dig+0(FP), CTX // d.h[8]
MOVQ p_base+8(FP), INP
MOVQ p_len+16(FP), NUM_BYTES
LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
MOVQ NUM_BYTES, _INP_END(SP)
CMPQ NUM_BYTES, INP
JE avx2_only_one_block
// Load initial digest
MOVL 0(CTX), a // a = H0
MOVL 4(CTX), b // b = H1
MOVL 8(CTX), c // c = H2
MOVL 12(CTX), d // d = H3
MOVL 16(CTX), e // e = H4
MOVL 20(CTX), f // f = H5
MOVL 24(CTX), g // g = H6
MOVL 28(CTX), h // h = H7
avx2_loop0: // at each iteration works with one block (512 bit)
VMOVDQU (0*32)(INP), XTMP0
VMOVDQU (1*32)(INP), XTMP1
VMOVDQU (2*32)(INP), XTMP2
VMOVDQU (3*32)(INP), XTMP3
MOVQ $flip_mask<>(SB), BP // BYTE_FLIP_MASK
VMOVDQU (BP), BYTE_FLIP_MASK
// Apply Byte Flip Mask: LE -> BE
VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
// Transpose data into high/low parts
VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants
avx2_last_block_enter:
ADDQ $64, INP
MOVQ INP, _INP(SP)
XORQ SRND, SRND
avx2_loop1: // for w0 - w47
// Do 4 rounds and scheduling
VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
// Do 4 rounds and scheduling
VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
// Do 4 rounds and scheduling
VPADDD 2*32(TBL)(SRND*1), XDWORD2, XFER
VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
// Do 4 rounds and scheduling
VPADDD 3*32(TBL)(SRND*1), XDWORD3, XFER
VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
ADDQ $4*32, SRND
CMPQ SRND, $3*4*32
JB avx2_loop1
avx2_loop2:
// w48 - w63 processed with no scheduliung (last 16 rounds)
VPADDD 0*32(TBL)(SRND*1), XDWORD0, XFER
VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)
VPADDD 1*32(TBL)(SRND*1), XDWORD1, XFER
VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)
ADDQ $2*32, SRND
VMOVDQU XDWORD2, XDWORD0
VMOVDQU XDWORD3, XDWORD1
CMPQ SRND, $4*4*32
JB avx2_loop2
MOVQ dig+0(FP), CTX // d.h[8]
MOVQ _INP(SP), INP
addm( 0(CTX), a)
addm( 4(CTX), b)
addm( 8(CTX), c)
addm( 12(CTX), d)
addm( 16(CTX), e)
addm( 20(CTX), f)
addm( 24(CTX), g)
addm( 28(CTX), h)
CMPQ _INP_END(SP), INP
JB done_hash
XORQ SRND, SRND
avx2_loop3: // Do second block using previously scheduled results
DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)
DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)
ADDQ $2*32, SRND
CMPQ SRND, $4*4*32
JB avx2_loop3
MOVQ dig+0(FP), CTX // d.h[8]
MOVQ _INP(SP), INP
ADDQ $64, INP
addm( 0(CTX), a)
addm( 4(CTX), b)
addm( 8(CTX), c)
addm( 12(CTX), d)
addm( 16(CTX), e)
addm( 20(CTX), f)
addm( 24(CTX), g)
addm( 28(CTX), h)
CMPQ _INP_END(SP), INP
JA avx2_loop0
JB done_hash
avx2_do_last_block:
VMOVDQU 0(INP), XWORD0
VMOVDQU 16(INP), XWORD1
VMOVDQU 32(INP), XWORD2
VMOVDQU 48(INP), XWORD3
MOVQ $flip_mask<>(SB), BP
VMOVDQU (BP), X_BYTE_FLIP_MASK
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
MOVQ $K256<>(SB), TBL
JMP avx2_last_block_enter
avx2_only_one_block:
// Load initial digest
MOVL 0(CTX), a // a = H0
MOVL 4(CTX), b // b = H1
MOVL 8(CTX), c // c = H2
MOVL 12(CTX), d // d = H3
MOVL 16(CTX), e // e = H4
MOVL 20(CTX), f // f = H5
MOVL 24(CTX), g // g = H6
MOVL 28(CTX), h // h = H7
JMP avx2_do_last_block
done_hash:
VZEROUPPER
RET
// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32
// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32
// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32
// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5
DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3
DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174
DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc
DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da
DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7
DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967
DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13
DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85
DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3
DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070
DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5
DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3
DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208
DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2
GLOBL K256<>(SB), (NOPTR + RODATA), $512

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// +build 386 amd64 // +build 386 amd64 s390x ppc64le
package sha256 package sha256

View file

@ -0,0 +1,9 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!386,!s390x,!ppc64le
package sha256
var block = blockGeneric

View file

@ -0,0 +1,12 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha256
// featureCheck reports whether the CPU supports the
// SHA256 compute intermediate message digest (KIMD)
// function code.
func featureCheck() bool
var useAsm = featureCheck()

View file

@ -0,0 +1,34 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func featureCheck() bool
TEXT ·featureCheck(SB),NOSPLIT,$16-1
LA tmp-16(SP), R1
XOR R0, R0 // query function code is 0
WORD $0xB93E0006 // KIMD (R6 is ignored)
MOVBZ tmp-16(SP), R4 // get the first byte
AND $0x20, R4 // bit 2 (big endian) for SHA256
CMPBEQ R4, $0, nosha256
MOVB $1, ret+0(FP)
RET
nosha256:
MOVB $0, ret+0(FP)
RET
// func block(dig *digest, p []byte)
TEXT ·block(SB),NOSPLIT,$0-32
MOVBZ ·useAsm(SB), R4
LMG dig+0(FP), R1, R3 // R2 = &p[0], R3 = len(p)
CMPBNE R4, $1, generic
MOVBZ $2, R0 // SHA256 function code
loop:
WORD $0xB93E0002 // KIMD R2
BVS loop // continue if interrupted
done:
XOR R0, R0 // restore R0
RET
generic:
BR ·blockGeneric(SB)

View file

@ -2,8 +2,11 @@ package sha512
import ( import (
"bytes" "bytes"
"crypto"
"encoding/gob" "encoding/gob"
"github.com/stevvooe/resumable"
// import to ensure that our init function runs after the standard package // import to ensure that our init function runs after the standard package
_ "crypto/sha512" _ "crypto/sha512"
) )
@ -21,7 +24,7 @@ func (d *digest) State() ([]byte, error) {
// We encode this way so that we do not have // We encode this way so that we do not have
// to export these fields of the digest struct. // to export these fields of the digest struct.
vals := []interface{}{ vals := []interface{}{
d.h, d.x, d.nx, d.len, d.is384, d.h, d.x, d.nx, d.len, d.function,
} }
for _, val := range vals { for _, val := range vals {
@ -40,7 +43,7 @@ func (d *digest) Restore(state []byte) error {
// We decode this way so that we do not have // We decode this way so that we do not have
// to export these fields of the digest struct. // to export these fields of the digest struct.
vals := []interface{}{ vals := []interface{}{
&d.h, &d.x, &d.nx, &d.len, &d.is384, &d.h, &d.x, &d.nx, &d.len, &d.function,
} }
for _, val := range vals { for _, val := range vals {
@ -49,5 +52,12 @@ func (d *digest) Restore(state []byte) error {
} }
} }
switch d.function {
case crypto.SHA384, crypto.SHA512, crypto.SHA512_224, crypto.SHA512_256:
break
default:
return resumable.ErrBadState
}
return nil return nil
} }

View file

@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Package sha512 implements the SHA384 and SHA512 hash algorithms as defined // Package sha512 implements the SHA-384, SHA-512, SHA-512/224, and SHA-512/256
// in FIPS 180-2. // hash algorithms as defined in FIPS 180-4.
package sha512 package sha512
import ( import (
@ -14,16 +14,27 @@ import (
func init() { func init() {
crypto.RegisterHash(crypto.SHA384, New384) crypto.RegisterHash(crypto.SHA384, New384)
crypto.RegisterHash(crypto.SHA512, New) crypto.RegisterHash(crypto.SHA512, New)
crypto.RegisterHash(crypto.SHA512_224, New512_224)
crypto.RegisterHash(crypto.SHA512_256, New512_256)
} }
// The size of a SHA512 checksum in bytes. const (
const Size = 64 // Size is the size, in bytes, of a SHA-512 checksum.
Size = 64
// The size of a SHA384 checksum in bytes. // Size224 is the size, in bytes, of a SHA-512/224 checksum.
const Size384 = 48 Size224 = 28
// The blocksize of SHA512 and SHA384 in bytes. // Size256 is the size, in bytes, of a SHA-512/256 checksum.
const BlockSize = 128 Size256 = 32
// Size384 is the size, in bytes, of a SHA-384 checksum.
Size384 = 48
// BlockSize is the block size, in bytes, of the SHA-512/224,
// SHA-512/256, SHA-384 and SHA-512 hash functions.
BlockSize = 128
)
const ( const (
chunk = 128 chunk = 128
@ -35,6 +46,22 @@ const (
init5 = 0x9b05688c2b3e6c1f init5 = 0x9b05688c2b3e6c1f
init6 = 0x1f83d9abfb41bd6b init6 = 0x1f83d9abfb41bd6b
init7 = 0x5be0cd19137e2179 init7 = 0x5be0cd19137e2179
init0_224 = 0x8c3d37c819544da2
init1_224 = 0x73e1996689dcd4d6
init2_224 = 0x1dfab7ae32ff9c82
init3_224 = 0x679dd514582f9fcf
init4_224 = 0x0f6d2b697bd44da8
init5_224 = 0x77e36f7304c48942
init6_224 = 0x3f9d85a86a1d36c8
init7_224 = 0x1112e6ad91d692a1
init0_256 = 0x22312194fc2bf72c
init1_256 = 0x9f555fa3c84c64c2
init2_256 = 0x2393b86b6f53b151
init3_256 = 0x963877195940eabd
init4_256 = 0x96283ee2a88effe3
init5_256 = 0xbe5e1e2553863992
init6_256 = 0x2b0199fc2c85b8aa
init7_256 = 0x0eb72ddc81c52ca2
init0_384 = 0xcbbb9d5dc1059ed8 init0_384 = 0xcbbb9d5dc1059ed8
init1_384 = 0x629a292a367cd507 init1_384 = 0x629a292a367cd507
init2_384 = 0x9159015a3070dd17 init2_384 = 0x9159015a3070dd17
@ -51,20 +78,12 @@ type digest struct {
x [chunk]byte x [chunk]byte
nx int nx int
len uint64 len uint64
is384 bool // mark if this digest is SHA-384 function crypto.Hash
} }
func (d *digest) Reset() { func (d *digest) Reset() {
if !d.is384 { switch d.function {
d.h[0] = init0 case crypto.SHA384:
d.h[1] = init1
d.h[2] = init2
d.h[3] = init3
d.h[4] = init4
d.h[5] = init5
d.h[6] = init6
d.h[7] = init7
} else {
d.h[0] = init0_384 d.h[0] = init0_384
d.h[1] = init1_384 d.h[1] = init1_384
d.h[2] = init2_384 d.h[2] = init2_384
@ -73,31 +92,77 @@ func (d *digest) Reset() {
d.h[5] = init5_384 d.h[5] = init5_384
d.h[6] = init6_384 d.h[6] = init6_384
d.h[7] = init7_384 d.h[7] = init7_384
case crypto.SHA512_224:
d.h[0] = init0_224
d.h[1] = init1_224
d.h[2] = init2_224
d.h[3] = init3_224
d.h[4] = init4_224
d.h[5] = init5_224
d.h[6] = init6_224
d.h[7] = init7_224
case crypto.SHA512_256:
d.h[0] = init0_256
d.h[1] = init1_256
d.h[2] = init2_256
d.h[3] = init3_256
d.h[4] = init4_256
d.h[5] = init5_256
d.h[6] = init6_256
d.h[7] = init7_256
default:
d.h[0] = init0
d.h[1] = init1
d.h[2] = init2
d.h[3] = init3
d.h[4] = init4
d.h[5] = init5
d.h[6] = init6
d.h[7] = init7
} }
d.nx = 0 d.nx = 0
d.len = 0 d.len = 0
} }
// New returns a new hash.Hash computing the SHA512 checksum. // New returns a new hash.Hash computing the SHA-512 checksum.
func New() hash.Hash { func New() hash.Hash {
d := new(digest) d := &digest{function: crypto.SHA512}
d.Reset() d.Reset()
return d return d
} }
// New384 returns a new hash.Hash computing the SHA384 checksum. // New512_224 returns a new hash.Hash computing the SHA-512/224 checksum.
func New512_224() hash.Hash {
d := &digest{function: crypto.SHA512_224}
d.Reset()
return d
}
// New512_256 returns a new hash.Hash computing the SHA-512/256 checksum.
func New512_256() hash.Hash {
d := &digest{function: crypto.SHA512_256}
d.Reset()
return d
}
// New384 returns a new hash.Hash computing the SHA-384 checksum.
func New384() hash.Hash { func New384() hash.Hash {
d := new(digest) d := &digest{function: crypto.SHA384}
d.is384 = true
d.Reset() d.Reset()
return d return d
} }
func (d *digest) Size() int { func (d *digest) Size() int {
if !d.is384 { switch d.function {
case crypto.SHA512_224:
return Size224
case crypto.SHA512_256:
return Size256
case crypto.SHA384:
return Size384
default:
return Size return Size
} }
return Size384
} }
func (d *digest) BlockSize() int { return BlockSize } func (d *digest) BlockSize() int { return BlockSize }
@ -130,11 +195,17 @@ func (d0 *digest) Sum(in []byte) []byte {
d := new(digest) d := new(digest)
*d = *d0 *d = *d0
hash := d.checkSum() hash := d.checkSum()
if d.is384 { switch d.function {
case crypto.SHA384:
return append(in, hash[:Size384]...) return append(in, hash[:Size384]...)
} case crypto.SHA512_224:
return append(in, hash[:Size224]...)
case crypto.SHA512_256:
return append(in, hash[:Size256]...)
default:
return append(in, hash[:]...) return append(in, hash[:]...)
} }
}
func (d *digest) checkSum() [Size]byte { func (d *digest) checkSum() [Size]byte {
// Padding. Add a 1 bit and 0 bits until 112 bytes mod 128. // Padding. Add a 1 bit and 0 bits until 112 bytes mod 128.
@ -159,7 +230,7 @@ func (d *digest) checkSum() [Size]byte {
} }
h := d.h[:] h := d.h[:]
if d.is384 { if d.function == crypto.SHA384 {
h = d.h[:6] h = d.h[:6]
} }
@ -180,7 +251,7 @@ func (d *digest) checkSum() [Size]byte {
// Sum512 returns the SHA512 checksum of the data. // Sum512 returns the SHA512 checksum of the data.
func Sum512(data []byte) [Size]byte { func Sum512(data []byte) [Size]byte {
var d digest d := digest{function: crypto.SHA512}
d.Reset() d.Reset()
d.Write(data) d.Write(data)
return d.checkSum() return d.checkSum()
@ -188,11 +259,30 @@ func Sum512(data []byte) [Size]byte {
// Sum384 returns the SHA384 checksum of the data. // Sum384 returns the SHA384 checksum of the data.
func Sum384(data []byte) (sum384 [Size384]byte) { func Sum384(data []byte) (sum384 [Size384]byte) {
var d digest d := digest{function: crypto.SHA384}
d.is384 = true
d.Reset() d.Reset()
d.Write(data) d.Write(data)
sum := d.checkSum() sum := d.checkSum()
copy(sum384[:], sum[:Size384]) copy(sum384[:], sum[:Size384])
return return
} }
// Sum512_224 returns the Sum512/224 checksum of the data.
func Sum512_224(data []byte) (sum224 [Size224]byte) {
d := digest{function: crypto.SHA512_224}
d.Reset()
d.Write(data)
sum := d.checkSum()
copy(sum224[:], sum[:Size224])
return
}
// Sum512_256 returns the Sum512/256 checksum of the data.
func Sum512_256(data []byte) (sum256 [Size256]byte) {
d := digest{function: crypto.SHA512_256}
d.Reset()
d.Write(data)
sum := d.checkSum()
copy(sum256[:], sum[:Size256])
return
}

View file

@ -2,8 +2,6 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// +build !amd64
// SHA512 block step. // SHA512 block step.
// In its own file so that a faster assembly or C version // In its own file so that a faster assembly or C version
// can be substituted easily. // can be substituted easily.
@ -93,7 +91,7 @@ var _K = []uint64{
0x6c44198c4a475817, 0x6c44198c4a475817,
} }
func block(dig *digest, p []byte) { func blockGeneric(dig *digest, p []byte) {
var w [80]uint64 var w [80]uint64
h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
for len(p) >= chunk { for len(p) >= chunk {

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// +build amd64 // +build amd64 s390x ppc64le
package sha512 package sha512

View file

@ -0,0 +1,9 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!s390x,!ppc64le
package sha512
var block = blockGeneric

View file

@ -0,0 +1,12 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha512
// featureCheck reports whether the CPU supports the
// SHA512 compute intermediate message digest (KIMD)
// function code.
func featureCheck() bool
var useAsm = featureCheck()

View file

@ -0,0 +1,34 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func featureCheck() bool
TEXT ·featureCheck(SB),NOSPLIT,$16-1
LA tmp-16(SP), R1
XOR R0, R0 // query function code is 0
WORD $0xB93E0006 // KIMD (R6 is ignored)
MOVBZ tmp-16(SP), R4 // get the first byte
AND $0x10, R4 // bit 3 (big endian) for SHA512
CMPBEQ R4, $0, nosha512
MOVB $1, ret+0(FP)
RET
nosha512:
MOVB $0, ret+0(FP)
RET
// func block(dig *digest, p []byte)
TEXT ·block(SB),NOSPLIT,$0-32
MOVBZ ·useAsm(SB), R4
LMG dig+0(FP), R1, R3 // R2 = &p[0], R3 = len(p)
CMPBNE R4, $1, generic
MOVBZ $3, R0 // SHA512 function code
loop:
WORD $0xB93E0002 // KIMD R2
BVS loop // continue if interrupted
done:
XOR R0, R0 // restore R0
RET
generic:
BR ·blockGeneric(SB)