libbb/sha256: optional x86 hardware accelerated hashing

64 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     730    +730
.rodata                                           108314  108586    +272
sha256_begin                                          31      83     +52
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1055/-1)          Total: 1054 bytes

32 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     747    +747
.rodata                                           104318  104590    +272
sha256_begin                                          29      84     +55
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1075/-1)          Total: 1074 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2022-02-03 14:15:20 +01:00
parent 205042c07a
commit 6472ac9428
9 changed files with 612 additions and 26 deletions

View File

@ -70,6 +70,12 @@ config SHA1_HWACCEL
On x86, this adds ~590 bytes of code. Throughput
is about twice as fast as fully-unrolled generic code.
config SHA256_HWACCEL
bool "SHA256: Use hardware accelerated instructions if possible"
default y
help
On x86, this adds ~1k bytes of code.
config SHA3_SMALL
int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
default 1 # all "fast or small" options default to small

View File

@ -59,6 +59,8 @@ lib-y += hash_md5_sha.o
lib-y += hash_md5_sha_x86-64.o
lib-y += hash_md5_sha_x86-64_shaNI.o
lib-y += hash_md5_sha_x86-32_shaNI.o
lib-y += hash_md5_sha256_x86-64_shaNI.o
lib-y += hash_md5_sha256_x86-32_shaNI.o
# Alternative (disabled) MD5 implementation
#lib-y += hash_md5prime.o
lib-y += messages.o

View File

@ -13,6 +13,27 @@
#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
{
asm ("cpuid"
: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
);
}
static smallint shaNI;
void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
# if defined(__i386__)
struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
# endif
# if defined(__x86_64__)
struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
# endif
# endif
#endif
/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
* (for rotX32, there is no difference). Why? My guess is that
* macro requires clever common subexpression elimination heuristics
@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
}
#endif /* NEED_SHA512 */
#if ENABLE_SHA1_HWACCEL
# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
{
asm ("cpuid"
: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
: "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
);
}
void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
# if defined(__i386__)
struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
# endif
# if defined(__x86_64__)
struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
# endif
# endif
#endif
void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
{
ctx->hash[0] = 0x67452301;
@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
#if ENABLE_SHA1_HWACCEL
# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
{
static smallint shaNI;
if (!shaNI) {
unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
cpuid(&eax, &ebx, &ecx, &edx);
@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
memcpy(&ctx->total64, init256, sizeof(init256));
/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
ctx->process_block = sha256_process_block64;
#if ENABLE_SHA256_HWACCEL
# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
{
if (!shaNI) {
unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
cpuid(&eax, &ebx, &ecx, &edx);
shaNI = ((ebx >> 29) << 1) - 1;
}
if (shaNI > 0)
ctx->process_block = sha256_process_block64_shaNI;
}
# endif
#endif
}
#if NEED_SHA512

View File

@ -0,0 +1,283 @@
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */
// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA1 insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).
//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps
.section .text.sha256_process_block64_shaNI, "ax", @progbits
.globl sha256_process_block64_shaNI
.hidden sha256_process_block64_shaNI
.type sha256_process_block64_shaNI, @function
#define DATA_PTR %eax
#define SHA256CONSTANTS %ecx
#define MSG %xmm0
#define STATE0 %xmm1
#define STATE1 %xmm2
#define MSGTMP0 %xmm3
#define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6
#define MSGTMP4 %xmm7
.balign 8 # allow decoders to fetch at least 3 first insns
sha256_process_block64_shaNI:
pushl %ebp
movl %esp, %ebp
subl $32, %esp
andl $~0xF, %esp # paddd needs aligned memory operand
movu128 76+0*16(%eax), STATE0
movu128 76+1*16(%eax), STATE1
shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
mova128 STATE0, MSGTMP4
palignr $8, STATE1, STATE0 /* ABEF */
pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
# mova128 PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK
lea K256, SHA256CONSTANTS
/* Save hash values for addition after rounds */
mova128 STATE0, 0*16(%esp)
mova128 STATE1, 1*16(%esp)
/* Rounds 0-3 */
movu128 0*16(DATA_PTR), MSG
pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG
mova128 MSG, MSGTMP0
paddd 0*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 4-7 */
movu128 1*16(DATA_PTR), MSG
pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG
mova128 MSG, MSGTMP1
paddd 1*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 8-11 */
movu128 2*16(DATA_PTR), MSG
pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG
mova128 MSG, MSGTMP2
paddd 2*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 12-15 */
movu128 3*16(DATA_PTR), MSG
pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG
mova128 MSG, MSGTMP3
paddd 3*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 16-19 */
mova128 MSGTMP0, MSG
paddd 4*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 20-23 */
mova128 MSGTMP1, MSG
paddd 5*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 24-27 */
mova128 MSGTMP2, MSG
paddd 6*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 28-31 */
mova128 MSGTMP3, MSG
paddd 7*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 32-35 */
mova128 MSGTMP0, MSG
paddd 8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 36-39 */
mova128 MSGTMP1, MSG
paddd 9*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 40-43 */
mova128 MSGTMP2, MSG
paddd 10*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 44-47 */
mova128 MSGTMP3, MSG
paddd 11*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 48-51 */
mova128 MSGTMP0, MSG
paddd 12*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 52-55 */
mova128 MSGTMP1, MSG
paddd 13*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 56-59 */
mova128 MSGTMP2, MSG
paddd 14*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 60-63 */
mova128 MSGTMP3, MSG
paddd 15*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Add current hash values with previously saved */
paddd 0*16(%esp), STATE0
paddd 1*16(%esp), STATE1
/* Write hash values back in the correct order */
shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
mova128 STATE0, MSGTMP4
pblendw $0xF0, STATE1, STATE0 /* DCBA */
palignr $8, MSGTMP4, STATE1 /* HGFE */
movu128 STATE0, 76+0*16(%eax)
movu128 STATE1, 76+1*16(%eax)
movl %ebp, %esp
popl %ebp
ret
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
.section .rodata.cst256.K256, "aM", @progbits, 256
.balign 16
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
.balign 16
PSHUFFLE_BSWAP32_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
#endif

View File

@ -0,0 +1,281 @@
#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
/* The code is adapted from Linux kernel's source */
// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA1 insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).
//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps
.section .text.sha256_process_block64_shaNI, "ax", @progbits
.globl sha256_process_block64_shaNI
.hidden sha256_process_block64_shaNI
.type sha256_process_block64_shaNI, @function
#define DATA_PTR %rdi
#define SHA256CONSTANTS %rax
#define MSG %xmm0
#define STATE0 %xmm1
#define STATE1 %xmm2
#define MSGTMP0 %xmm3
#define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6
#define MSGTMP4 %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
.balign 8 # allow decoders to fetch at least 2 first insns
sha256_process_block64_shaNI:
movu128 80+0*16(%rdi), STATE0
movu128 80+1*16(%rdi), STATE1
shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
mova128 STATE0, MSGTMP4
palignr $8, STATE1, STATE0 /* ABEF */
pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK
lea K256(%rip), SHA256CONSTANTS
/* Save hash values for addition after rounds */
mova128 STATE0, ABEF_SAVE
mova128 STATE1, CDGH_SAVE
/* Rounds 0-3 */
movu128 0*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
mova128 MSG, MSGTMP0
paddd 0*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 4-7 */
movu128 1*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
mova128 MSG, MSGTMP1
paddd 1*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 8-11 */
movu128 2*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
mova128 MSG, MSGTMP2
paddd 2*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 12-15 */
movu128 3*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG
mova128 MSG, MSGTMP3
paddd 3*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 16-19 */
mova128 MSGTMP0, MSG
paddd 4*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 20-23 */
mova128 MSGTMP1, MSG
paddd 5*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 24-27 */
mova128 MSGTMP2, MSG
paddd 6*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 28-31 */
mova128 MSGTMP3, MSG
paddd 7*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 32-35 */
mova128 MSGTMP0, MSG
paddd 8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 36-39 */
mova128 MSGTMP1, MSG
paddd 9*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP1, MSGTMP0
/* Rounds 40-43 */
mova128 MSGTMP2, MSG
paddd 10*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP2, MSGTMP1
/* Rounds 44-47 */
mova128 MSGTMP3, MSG
paddd 11*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4
palignr $4, MSGTMP2, MSGTMP4
paddd MSGTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP3, MSGTMP2
/* Rounds 48-51 */
mova128 MSGTMP0, MSG
paddd 12*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4
palignr $4, MSGTMP3, MSGTMP4
paddd MSGTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
sha256msg1 MSGTMP0, MSGTMP3
/* Rounds 52-55 */
mova128 MSGTMP1, MSG
paddd 13*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4
palignr $4, MSGTMP0, MSGTMP4
paddd MSGTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 56-59 */
mova128 MSGTMP2, MSG
paddd 14*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4
palignr $4, MSGTMP1, MSGTMP4
paddd MSGTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Rounds 60-63 */
mova128 MSGTMP3, MSG
paddd 15*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1
shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0
/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
paddd CDGH_SAVE, STATE1
/* Write hash values back in the correct order */
shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
mova128 STATE0, MSGTMP4
pblendw $0xF0, STATE1, STATE0 /* DCBA */
palignr $8, MSGTMP4, STATE1 /* HGFE */
movu128 STATE0, 80+0*16(%rdi)
movu128 STATE1, 80+1*16(%rdi)
ret
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
.section .rodata.cst256.K256, "aM", @progbits, 256
.balign 16
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
.balign 16
PSHUFFLE_BSWAP32_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
#endif

View File

@ -20,7 +20,7 @@
#define extr128_32 pextrd
//#define extr128_32 extractps # not shorter
.section .text.sha1_process_block64_shaNI,"ax",@progbits
.section .text.sha1_process_block64_shaNI, "ax", @progbits
.globl sha1_process_block64_shaNI
.hidden sha1_process_block64_shaNI
.type sha1_process_block64_shaNI, @function
@ -224,7 +224,7 @@ sha1_process_block64_shaNI:
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
.balign 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x000102030405060708090a0b0c0d0e0f

View File

@ -1394,7 +1394,7 @@ sha1_process_block64:
.size sha1_process_block64, .-sha1_process_block64
.section .rodata.cst16.sha1const, "aM", @progbits, 16
.align 16
.balign 16
rconst0x5A827999:
.long 0x5A827999
.long 0x5A827999

View File

@ -433,7 +433,7 @@ echo "
.size sha1_process_block64, .-sha1_process_block64
.section .rodata.cst16.sha1const, \"aM\", @progbits, 16
.align 16
.balign 16
rconst0x5A827999:
.long 0x5A827999
.long 0x5A827999

View File

@ -20,7 +20,7 @@
#define extr128_32 pextrd
//#define extr128_32 extractps # not shorter
.section .text.sha1_process_block64_shaNI,"ax",@progbits
.section .text.sha1_process_block64_shaNI, "ax", @progbits
.globl sha1_process_block64_shaNI
.hidden sha1_process_block64_shaNI
.type sha1_process_block64_shaNI, @function
@ -218,7 +218,7 @@ sha1_process_block64_shaNI:
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
.balign 16
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x000102030405060708090a0b0c0d0e0f