libbb/sha1: x86_64 version: move to a separate .S file, no code changes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Commit: 05fd13ebec
Author: Denys Vlasenko
Date: 2022-01-03 01:57:29 +01:00
Parent: 5c0c558231
3 changed files, 1353 insertions(+), 389 deletions(-)

View File

@ -56,6 +56,7 @@ lib-y += login.o
lib-y += make_directory.o
lib-y += makedev.o
lib-y += hash_md5_sha.o
# New in this commit: SHA1 x86-64 assembly implementation, moved out of
# hash_md5_sha.c into its own .S file (see hash_md5_sha_x86-64.S)
lib-y += hash_md5_sha_x86-64.o
# Alternative (disabled) MD5 implementation
#lib-y += hash_md5prime.o
lib-y += messages.o

View File

@ -696,397 +696,11 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
#undef RCONST
}
# elif defined(__GNUC__) && defined(__x86_64__)
/*
 * sha1_process_block64 - SHA1 compression function for one 64-byte block,
 * implemented entirely in x86-64 GNU extended inline assembly.
 *
 * Input:  %rdi = ctx; the 64 message bytes are read from ctx (offsets 0..63),
 *         byte-swapped to host order, and kept as the 16-word schedule W[]:
 *         W[0..7] live on the stack at -32+4*n(%rsp) (negative %rsp offsets —
 *         presumably relying on the x86-64 SysV 128-byte red zone; TODO confirm),
 *         W[8..15] live in %r8d..%r15d.
 * State:  a..d in %eax..%edx, e in %ebp; temporaries in %esi/%edi.
 * Output: ctx->hash[0..4] at byte offsets 80..96 are updated in place
 *         (the BUILD_BUG_ON below pins offsetof(sha1_ctx_t, hash) == 80).
 *
 * The C macros RD1A..RD4B below expand to string fragments that are pasted
 * into the single asm() statement; STR() stringifies register suffixes and
 * round constants into the asm text.
 */
static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
{
BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
asm(
/* Prologue: save callee-saved registers and ctx; then gas .macro helpers
 * loadW/storeW/xorW dispatch on the W index n: 0..7 -> stack slot,
 * 8..15 -> %r8d..%r15d.  After the macros, the 64 input bytes are
 * byte-swapped into W[8..15] (registers) and W[0..3 pairs] (stack via the
 * "movl $3, %eax; 1:" loop), and a..e are loaded from ctx->hash[0..4]. */
"\n\
	pushq	%r15		#	\n\
	pushq	%r14		#	\n\
	pushq	%r13		#	\n\
	pushq	%r12		#	\n\
	pushq	%rbp		#	\n\
	pushq	%rbx		#	\n\
	pushq	%rdi		# we need ctx at the end	\n\
						\n\
#Register and stack use:			\n\
# eax..edx: a..d				\n\
# ebp: e					\n\
# esi,edi: temps				\n\
# -32+4*n(%rsp),r8...r15: W[0..7,8..15]		\n\
.macro loadW n,r				\n\
.if \\n == 0					\n\
	movl	-32+4*0(%rsp),\\r		\n\
.endif						\n\
.if \\n == 1					\n\
	movl	-32+4*1(%rsp),\\r		\n\
.endif						\n\
.if \\n == 2					\n\
	movl	-32+4*2(%rsp),\\r		\n\
.endif						\n\
.if \\n == 3					\n\
	movl	-32+4*3(%rsp),\\r		\n\
.endif						\n\
.if \\n == 4					\n\
	movl	-32+4*4(%rsp),\\r		\n\
.endif						\n\
.if \\n == 5					\n\
	movl	-32+4*5(%rsp),\\r		\n\
.endif						\n\
.if \\n == 6					\n\
	movl	-32+4*6(%rsp),\\r		\n\
.endif						\n\
.if \\n == 7					\n\
	movl	-32+4*7(%rsp),\\r		\n\
.endif						\n\
.if \\n == 8					\n\
	movl	%r8d,\\r			\n\
.endif						\n\
.if \\n == 9					\n\
	movl	%r9d,\\r			\n\
.endif						\n\
.if \\n == 10					\n\
	movl	%r10d,\\r			\n\
.endif						\n\
.if \\n == 11					\n\
	movl	%r11d,\\r			\n\
.endif						\n\
.if \\n == 12					\n\
	movl	%r12d,\\r			\n\
.endif						\n\
.if \\n == 13					\n\
	movl	%r13d,\\r			\n\
.endif						\n\
.if \\n == 14					\n\
	movl	%r14d,\\r			\n\
.endif						\n\
.if \\n == 15					\n\
	movl	%r15d,\\r			\n\
.endif						\n\
.endm						\n\
						\n\
.macro storeW r,n				\n\
.if \\n == 0					\n\
	movl	\\r,-32+4*0(%rsp)		\n\
.endif						\n\
.if \\n == 1					\n\
	movl	\\r,-32+4*1(%rsp)		\n\
.endif						\n\
.if \\n == 2					\n\
	movl	\\r,-32+4*2(%rsp)		\n\
.endif						\n\
.if \\n == 3					\n\
	movl	\\r,-32+4*3(%rsp)		\n\
.endif						\n\
.if \\n == 4					\n\
	movl	\\r,-32+4*4(%rsp)		\n\
.endif						\n\
.if \\n == 5					\n\
	movl	\\r,-32+4*5(%rsp)		\n\
.endif						\n\
.if \\n == 6					\n\
	movl	\\r,-32+4*6(%rsp)		\n\
.endif						\n\
.if \\n == 7					\n\
	movl	\\r,-32+4*7(%rsp)		\n\
.endif						\n\
.if \\n == 8					\n\
	movl	\\r,%r8d			\n\
.endif						\n\
.if \\n == 9					\n\
	movl	\\r,%r9d			\n\
.endif						\n\
.if \\n == 10					\n\
	movl	\\r,%r10d			\n\
.endif						\n\
.if \\n == 11					\n\
	movl	\\r,%r11d			\n\
.endif						\n\
.if \\n == 12					\n\
	movl	\\r,%r12d			\n\
.endif						\n\
.if \\n == 13					\n\
	movl	\\r,%r13d			\n\
.endif						\n\
.if \\n == 14					\n\
	movl	\\r,%r14d			\n\
.endif						\n\
.if \\n == 15					\n\
	movl	\\r,%r15d			\n\
.endif						\n\
.endm						\n\
						\n\
.macro xorW n,r					\n\
.if \\n == 0					\n\
	xorl	-32+4*0(%rsp),\\r		\n\
.endif						\n\
.if \\n == 1					\n\
	xorl	-32+4*1(%rsp),\\r		\n\
.endif						\n\
.if \\n == 2					\n\
	xorl	-32+4*2(%rsp),\\r		\n\
.endif						\n\
.if \\n == 3					\n\
	xorl	-32+4*3(%rsp),\\r		\n\
.endif						\n\
.if \\n == 4					\n\
	xorl	-32+4*4(%rsp),\\r		\n\
.endif						\n\
.if \\n == 5					\n\
	xorl	-32+4*5(%rsp),\\r		\n\
.endif						\n\
.if \\n == 6					\n\
	xorl	-32+4*6(%rsp),\\r		\n\
.endif						\n\
.if \\n == 7					\n\
	xorl	-32+4*7(%rsp),\\r		\n\
.endif						\n\
.if \\n == 8					\n\
	xorl	%r8d,\\r			\n\
.endif						\n\
.if \\n == 9					\n\
	xorl	%r9d,\\r			\n\
.endif						\n\
.if \\n == 10					\n\
	xorl	%r10d,\\r			\n\
.endif						\n\
.if \\n == 11					\n\
	xorl	%r11d,\\r			\n\
.endif						\n\
.if \\n == 12					\n\
	xorl	%r12d,\\r			\n\
.endif						\n\
.if \\n == 13					\n\
	xorl	%r13d,\\r			\n\
.endif						\n\
.if \\n == 14					\n\
	xorl	%r14d,\\r			\n\
.endif						\n\
.if \\n == 15					\n\
	xorl	%r15d,\\r			\n\
.endif						\n\
.endm						\n\
						\n\
	movq	4*8(%rdi), %r8			\n\
	bswap	%r8				\n\
	movl	%r8d, %r9d			\n\
	shrq	$32, %r8			\n\
	movq	4*10(%rdi), %r10		\n\
	bswap	%r10				\n\
	movl	%r10d, %r11d			\n\
	shrq	$32, %r10			\n\
	movq	4*12(%rdi), %r12		\n\
	bswap	%r12				\n\
	movl	%r12d, %r13d			\n\
	shrq	$32, %r12			\n\
	movq	4*14(%rdi), %r14		\n\
	bswap	%r14				\n\
	movl	%r14d, %r15d			\n\
	shrq	$32, %r14			\n\
						\n\
	movl	$3, %eax			\n\
1:						\n\
	movq	(%rdi,%rax,8), %rsi		\n\
	bswap	%rsi				\n\
	rolq	$32, %rsi			\n\
	movq	%rsi, -32(%rsp,%rax,8)		\n\
	decl	%eax				\n\
	jns	1b				\n\
	movl	80(%rdi), %eax		# a = ctx->hash[0]	\n\
	movl	84(%rdi), %ebx		# b = ctx->hash[1]	\n\
	movl	88(%rdi), %ecx		# c = ctx->hash[2]	\n\
	movl	92(%rdi), %edx		# d = ctx->hash[3]	\n\
	movl	96(%rdi), %ebp		# e = ctx->hash[4]	\n\
"
/* Round-1 step, variant A (n == 0 only): W[0] is already in %esi, so the
 * loadW is commented out in the asm text. f = (((c ^ d) & b) ^ d). */
#define RD1As(a,b,c,d,e, n, RCONST) \
"\n\
	##loadW	"n", %esi		# n=0, W[0] already in %esi	\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	andl	%e"b", %edi		# &b		\n\
	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)	\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]	\n\
	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
/* Round-1 step, variant B (n in 1..7): W[n] fetched via loadW. */
#define RD1Bs(a,b,c,d,e, n, RCONST) \
"\n\
	loadW	"n", %esi		# W[n]		\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	andl	%e"b", %edi		# &b		\n\
	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)	\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]	\n\
	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
/* Round-1 step, variant C (n in 8..15): W[n] is already a register (%r8..%r15),
 * so it goes straight into the lea index — no load needed. */
#define RD1Cs(a,b,c,d,e, n, RCONST) \
"\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	andl	%e"b", %edi		# &b		\n\
	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)	\n\
	leal	"RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n]	\n\
	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
/* Round-1 step, variant D (n >= 16): first computes the schedule word
 * W[n&15] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1) in place,
 * then the same f-function step as above. */
#define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	loadW	"n13", %esi		# W[(n+13) & 15]	\n\
	xorW	"n8", %esi		# ^W[(n+8) & 15]	\n\
	xorW	"n2", %esi		# ^W[(n+2) & 15]	\n\
	xorW	"n", %esi		# ^W[n & 15]	\n\
	roll	%esi			#		\n\
	storeW	%esi, "n"		# store to W[n & 15]	\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	andl	%e"b", %edi		# &b		\n\
	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)	\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W	\n\
	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
/* Wrappers stringify register suffixes, W indices (mod 16) and RCONST. */
#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
#define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
/* Rounds 0..19: K1 = 0x5A827999.  The a..e register roles rotate each step
 * instead of shuffling values between registers. */
#undef RCONST
#define RCONST 0x5A827999
	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9)
	RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14)
	RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19)
/* Round-2 step (rounds 20..39): schedule update as in RD1D, f = c ^ d ^ b. */
#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	loadW	"n13", %esi		# W[(n+13) & 15]	\n\
	xorW	"n8", %esi		# ^W[(n+8) & 15]	\n\
	xorW	"n2", %esi		# ^W[(n+2) & 15]	\n\
	xorW	"n", %esi		# ^W[n & 15]	\n\
	roll	%esi			#		\n\
	storeW	%esi, "n"		# store to W[n & 15]	\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	xorl	%e"b", %edi		# ^b		\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W	\n\
	addl	%edi, %e"e"		# e += (c ^ d ^ b)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
/* Rounds 20..39: K2 = 0x6ED9EBA1. */
#undef RCONST
#define RCONST 0x6ED9EBA1
	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
/* Round-3 step (rounds 40..59): f = (b & c) | ((b | c) & d), i.e. majority. */
#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	movl	%e"b", %edi		# di: b		\n\
	movl	%e"b", %esi		# si: b		\n\
	orl	%e"c", %edi		# di: b | c	\n\
	andl	%e"c", %esi		# si: b & c	\n\
	andl	%e"d", %edi		# di: (b | c) & d	\n\
	orl	%esi, %edi		# ((b | c) & d) | (b & c)	\n\
	loadW	"n13", %esi		# W[(n+13) & 15]	\n\
	xorW	"n8", %esi		# ^W[(n+8) & 15]	\n\
	xorW	"n2", %esi		# ^W[(n+2) & 15]	\n\
	xorW	"n", %esi		# ^W[n & 15]	\n\
	roll	%esi			#		\n\
	storeW	%esi, "n"		# store to W[n & 15]	\n\
	addl	%edi, %e"e"		# += ((b | c) & d) | (b & c)\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
/* Rounds 40..59: K3 = 0x8F1BBCDC, written as its negative two's-complement
 * equivalent because lea takes a sign-extended signed 32-bit displacement. */
#undef RCONST
//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement"
#define RCONST -0x70e44324
	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
/* NOTE(review): the next three lines look like the ADDED side of the diff
 * (the replacement declaration pointing at the new .S file) spliced into the
 * middle of the REMOVED function by the diff renderer — they do not belong
 * inside this function body and would break compilation here; verify against
 * the actual commit before treating this text as a buildable file. */
/* in hash_md5_sha_x86-64.S */
struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
/* Round-4 step, variant A (rounds 60..77): same f = c ^ d ^ b as round 2. */
#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	loadW	"n13", %esi		# W[(n+13) & 15]	\n\
	xorW	"n8", %esi		# ^W[(n+8) & 15]	\n\
	xorW	"n2", %esi		# ^W[(n+2) & 15]	\n\
	xorW	"n", %esi		# ^W[n & 15]	\n\
	roll	%esi			#		\n\
	storeW	%esi, "n"		# store to W[n & 15]	\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	xorl	%e"b", %edi		# ^b		\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W	\n\
	addl	%edi, %e"e"		# e += (c ^ d ^ b)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
/* Round-4 step, variant B (last rounds): the updated W word is never read
 * again, so the storeW writeback is elided (commented out in the asm text). */
#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
"\n\
	loadW	"n13", %esi		# W[(n+13) & 15]	\n\
	xorW	"n8", %esi		# ^W[(n+8) & 15]	\n\
	xorW	"n2", %esi		# ^W[(n+2) & 15]	\n\
	xorW	"n", %esi		# ^W[n & 15]	\n\
	roll	%esi			#		\n\
	#storeW	%esi, "n"		# store to W[n & 15] elided	\n\
	movl	%e"c", %edi		# c		\n\
	xorl	%e"d", %edi		# ^d		\n\
	xorl	%e"b", %edi		# ^b		\n\
	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W	\n\
	addl	%edi, %e"e"		# e += (c ^ d ^ b)	\n\
	movl	%e"a", %esi		#		\n\
	roll	$5, %esi		# rotl32(a,5)	\n\
	addl	%esi, %e"e"		# e += rotl32(a,5)	\n\
	rorl	$2, %e"b"		# b = rotl32(b,30)	\n\
"
#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
/* Rounds 60..79: K4 = 0xCA62C1D6, again encoded as a negative displacement. */
#undef RCONST
//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement"
#define RCONST -0x359d3e2a
	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
/* Epilogue: recover ctx (pushed in the prologue), fold a..e back into
 * ctx->hash[0..4], and restore the callee-saved registers. */
"\n\
	popq	%rdi			#	\n\
	addl	%eax, 80(%rdi)		# ctx->hash[0] += a	\n\
	addl	%ebx, 84(%rdi)		# ctx->hash[1] += b	\n\
	addl	%ecx, 88(%rdi)		# ctx->hash[2] += c	\n\
	addl	%edx, 92(%rdi)		# ctx->hash[3] += d	\n\
	addl	%ebp, 96(%rdi)		# ctx->hash[4] += e	\n\
	popq	%rbx			#	\n\
	popq	%rbp			#	\n\
	popq	%r12			#	\n\
	popq	%r13			#	\n\
	popq	%r14			#	\n\
	popq	%r15			#	\n\
"
	); /* asm */
#undef RCONST
}
# else
/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
* It seems further speedup can be achieved by handling more than

New file libbb/hash_md5_sha_x86-64.S (1349 lines added); its diff is suppressed here because it is too large to display.