libbb/sha256: optional x86 hardware accelerated hashing

64 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     730    +730
.rodata                                           108314  108586    +272
sha256_begin                                          31      83     +52
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1055/-1)          Total: 1054 bytes

32 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     747    +747
.rodata                                           104318  104590    +272
sha256_begin                                          29      84     +55
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1075/-1)          Total: 1074 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
2022-02-03 14:15:20 +01:00 · 2022-02-03 14:15:20 +01:00 · 6472ac9428
commit 6472ac9428
parent 205042c07a
9 changed files with 612 additions and 26 deletions
--- a/libbb/Config.src
+++ b/libbb/Config.src
@ -70,6 +70,12 @@ config SHA1_HWACCEL
 	On x86, this adds ~590 bytes of code. Throughput
 	is about twice as fast as fully-unrolled generic code.
 config SHA256_HWACCEL
 	bool "SHA256: Use hardware accelerated instructions if possible"
 	default y
 	help
 	On x86, this adds ~1k bytes of code.
 config SHA3_SMALL
 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
 	default 1  # all "fast or small" options default to small
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@ -59,6 +59,8 @@ lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
 lib-y += hash_md5_sha_x86-64_shaNI.o
 lib-y += hash_md5_sha_x86-32_shaNI.o
 lib-y += hash_md5_sha256_x86-64_shaNI.o
 lib-y += hash_md5_sha256_x86-32_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@ -13,6 +13,27 @@
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
 #if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 /* Execute the CPUID instruction.
  * All four pointed-to values are loaded into EAX/EBX/ECX/EDX before
  * the instruction and overwritten with the register contents after it,
  * so the caller selects the leaf via *eax (and the subleaf via *ecx).
  */
 static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
 {
 	asm ("cpuid"
 		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
 		: "0"(*eax),  "1"(*ebx),  "2"(*ecx),  "3"(*edx)
 	);
 }
 /* Cached result of the hardware probe, shared by the *_begin() functions:
  *  0: not probed yet (initial value),
  * >0: install the *_shaNI block function,
  * <0: keep the generic block function.
  */
 static smallint shaNI;
 /* Implemented in the hash_md5_sha*_shaNI.S assembly files */
 void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
 void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
 /* Compile-time layout checks: the assembly addresses ctx->hash through
  * a hard-coded displacement (76 on i386, 80 on x86-64). If the real
  * offset differs, the array size below becomes -1 and the build fails.
  */
 #  if defined(__i386__)
 struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
 #  endif
 #  if defined(__x86_64__)
 struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
 #  endif
 # endif
 #endif
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
 * (for rotX32, there is no difference). Why? My guess is that
 * macro requires clever common subexpression elimination heuristics
@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 }
 #endif /* NEED_SHA512 */
 #if ENABLE_SHA1_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
 {
 	asm ("cpuid"
 		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
 		: "0"(*eax),  "1"(*ebx),  "2"(*ecx),  "3"(*edx)
 	);
 }
 void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
 #  if defined(__i386__)
 struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
 #  endif
 #  if defined(__x86_64__)
 struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
 #  endif
 # endif
 #endif
 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 {
 	ctx->hash[0] = 0x67452301;
@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 #if ENABLE_SHA1_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	{
 		static smallint shaNI;
 		if (!shaNI) {
 			unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
 			cpuid(&eax, &ebx, &ecx, &edx);
@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 	memcpy(&ctx->total64, init256, sizeof(init256));
 	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 	ctx->process_block = sha256_process_block64;
 #if ENABLE_SHA256_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	{
 		/* shaNI: 0 = not probed yet, 1 = SHA extensions present, -1 = absent */
 		if (!shaNI) {
 			/* CPUID leaf 7, subleaf 0. ebx/edx are outputs only: the
 			 * self-initialization merely silences "uninitialized" warnings.
 			 */
 			unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
 			cpuid(&eax, &ebx, &ecx, &edx);
 			/* The SHA feature flag is EBX bit 29. Bits 30 and 31 are
 			 * unrelated features (AVX512BW, AVX512VL) and must be masked
 			 * off: an unmasked (ebx >> 29) is nonzero on CPUs which have
 			 * AVX-512 but no SHA insns, and the hardware routine would
 			 * then be selected and die with SIGILL.
 			 * Bit 29 set -> 2 - 1 = 1, clear -> 0 - 1 = -1.
 			 */
 			shaNI = ((ebx >> 28) & 2) - 1;
 		}
 		if (shaNI > 0)
 			ctx->process_block = sha256_process_block64_shaNI;
 	}
 # endif
 #endif
 }
 #if NEED_SHA512
--- a/libbb/hash_md5_sha256_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@ -0,0 +1,283 @@
 #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
 /* The code is adapted from Linux kernel's source */
 // We use shorter insns, even though they are for "wrong"
 // data type (fp, not int).
 // For Intel, there is no penalty for doing it at all
 // (CPUs which do have such penalty do not support SHA1 insns).
 // For AMD, the penalty is one extra cycle
 // (allegedly: I failed to find measurable difference).
 //#define mova128 movdqa
 #define mova128 movaps
 //#define movu128 movdqu
 #define movu128 movups
 //#define shuf128_32 pshufd
 #define shuf128_32 shufps
 	/* Declare that this object does not need an executable stack.
 	 * Hand-written assembly does not get this note automatically, and
 	 * an object without it can make the linker mark the stack of the
 	 * whole program executable.
 	 */
 	.section	.note.GNU-stack, "", @progbits
 	/*
 	 * sha256_process_block64_shaNI - SHA-256 compression of one 64-byte
 	 * block using the x86 SHA extensions (i386 build, AT&T syntax).
 	 *
 	 * In:       %eax = ctx. The 64 message bytes are read from offset 0,
 	 *           the eight 32-bit hash words are read from and written
 	 *           back to offset 76.
 	 * Out:      nothing in registers; the hash words in ctx are updated.
 	 * Clobbers: %ecx, %xmm0-%xmm7, flags. %ebp and %esp are restored.
 	 * Stack:    32 bytes of 16-byte aligned scratch holding the incoming
 	 *           state, which is added back after the 64 rounds.
 	 */
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
 	.type	sha256_process_block64_shaNI, @function
 	/* Register roles. MSGTMP0..3 hold the rolling 16-word message
 	 * schedule, MSGTMP4 is a scratch register.
 	 */
 #define DATA_PTR	%eax
 #define SHA256CONSTANTS	%ecx
 #define MSG		%xmm0
 #define STATE0		%xmm1
 #define STATE1		%xmm2
 #define MSGTMP0		%xmm3
 #define MSGTMP1		%xmm4
 #define MSGTMP2		%xmm5
 #define MSGTMP3		%xmm6
 #define MSGTMP4		%xmm7
 	.balign	8	# allow decoders to fetch at least 3 first insns
 sha256_process_block64_shaNI:
 	pushl		%ebp
 	movl		%esp, %ebp
 	subl		$32, %esp
 	andl		$~0xF, %esp	# paddd needs aligned memory operand
 	/* Load the hash words and regroup them into the ABEF/CDGH
 	 * pairing used by the sha256rnds2 sequence below
 	 */
 	movu128		76+0*16(%eax), STATE0
 	movu128		76+1*16(%eax), STATE1
 	shuf128_32	$0xB1, STATE0,  STATE0		/* CDAB */
 	shuf128_32	$0x1B, STATE1,  STATE1		/* EFGH */
 	mova128		STATE0, MSGTMP4
 	palignr		$8, STATE1,  STATE0		/* ABEF */
 	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
 #	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK
 	lea		K256, SHA256CONSTANTS
 	/* Save hash values for addition after rounds */
 	mova128		STATE0, 0*16(%esp)
 	mova128		STATE1, 1*16(%esp)
 	/* Rounds 0-3 */
 	/* (the first 16 rounds take their words straight from the block,
 	 * byte-swapped within each 32-bit word by pshufb)
 	 */
 	movu128		0*16(DATA_PTR), MSG
 	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
 	mova128		MSG, MSGTMP0
 		paddd		0*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 4-7 */
 	movu128		1*16(DATA_PTR), MSG
 	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
 	mova128		MSG, MSGTMP1
 		paddd		1*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 8-11 */
 	movu128		2*16(DATA_PTR), MSG
 	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
 	mova128		MSG, MSGTMP2
 		paddd		2*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 12-15 */
 	movu128		3*16(DATA_PTR), MSG
 	pshufb		PSHUFFLE_BSWAP32_FLIP_MASK, MSG
 	mova128		MSG, MSGTMP3
 		paddd		3*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 16-19 */
 	/* (from here on the words come from the message schedule, computed
 	 * by sha256msg1/sha256msg2 while the MSGTMP registers rotate roles)
 	 */
 	mova128		MSGTMP0, MSG
 		paddd		4*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 20-23 */
 	mova128		MSGTMP1, MSG
 		paddd		5*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 24-27 */
 	mova128		MSGTMP2, MSG
 		paddd		6*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 28-31 */
 	mova128		MSGTMP3, MSG
 		paddd		7*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 32-35 */
 	mova128		MSGTMP0, MSG
 		paddd		8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 36-39 */
 	mova128		MSGTMP1, MSG
 		paddd		9*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 40-43 */
 	mova128		MSGTMP2, MSG
 		paddd		10*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 44-47 */
 	mova128		MSGTMP3, MSG
 		paddd		11*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 48-51 */
 	mova128		MSGTMP0, MSG
 		paddd		12*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 52-55 */
 	/* (no sha256msg1 from here on: no further schedule words are needed) */
 	mova128		MSGTMP1, MSG
 		paddd		13*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 56-59 */
 	mova128		MSGTMP2, MSG
 		paddd		14*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 60-63 */
 	mova128		MSGTMP3, MSG
 		paddd		15*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Add current hash values with previously saved */
 	paddd		0*16(%esp), STATE0
 	paddd		1*16(%esp), STATE1
 	/* Write hash values back in the correct order */
 	shuf128_32	$0x1B, STATE0,  STATE0		/* FEBA */
 	shuf128_32	$0xB1, STATE1,  STATE1		/* DCHG */
 	mova128		STATE0, MSGTMP4
 	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
 	palignr		$8, MSGTMP4, STATE1		/* HGFE */
 	movu128		STATE0, 76+0*16(%eax)
 	movu128		STATE1, 76+1*16(%eax)
 	/* Drop the aligned scratch area and restore the caller's frame */
 	movl	%ebp, %esp
 	popl	%ebp
 	ret
 	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
 /* SHA-256 round constants, 64 32-bit words laid out four per row.
  * The code adds row i (byte offset i*16) to the message words of
  * rounds 4*i .. 4*i+3. Rows are used as paddd memory operands,
  * hence the 16-byte alignment.
  */
 .section	.rodata.cst256.K256, "aM", @progbits, 256
 .balign 16
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 /* pshufb control mask: reverses the byte order inside each 32-bit
  * word of the loaded message block (byte indexes 3,2,1,0, 7,6,5,4, ...).
  * Used directly as a memory operand, hence the 16-byte alignment.
  */
 .section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
 .balign 16
 PSHUFFLE_BSWAP32_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 #endif
--- a/libbb/hash_md5_sha256_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@ -0,0 +1,281 @@
 #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
 /* The code is adapted from Linux kernel's source */
 // We use shorter insns, even though they are for "wrong"
 // data type (fp, not int).
 // For Intel, there is no penalty for doing it at all
 // (CPUs which do have such penalty do not support SHA1 insns).
 // For AMD, the penalty is one extra cycle
 // (allegedly: I failed to find measurable difference).
 //#define mova128 movdqa
 #define mova128 movaps
 //#define movu128 movdqu
 #define movu128 movups
 //#define shuf128_32 pshufd
 #define shuf128_32 shufps
 	/* Declare that this object does not need an executable stack.
 	 * Hand-written assembly does not get this note automatically, and
 	 * an object without it can make the linker mark the stack of the
 	 * whole program executable.
 	 */
 	.section	.note.GNU-stack, "", @progbits
 	/*
 	 * sha256_process_block64_shaNI - SHA-256 compression of one 64-byte
 	 * block using the x86 SHA extensions (x86-64 build, AT&T syntax).
 	 *
 	 * In:       %rdi = ctx. The 64 message bytes are read from offset 0,
 	 *           the eight 32-bit hash words are read from and written
 	 *           back to offset 80.
 	 * Out:      nothing in registers; the hash words in ctx are updated.
 	 * Clobbers: %rax, %xmm0-%xmm10.
 	 * Stack:    not used; the incoming state is kept in %xmm9/%xmm10.
 	 */
 	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
 	.globl	sha256_process_block64_shaNI
 	.hidden	sha256_process_block64_shaNI
 	.type	sha256_process_block64_shaNI, @function
 	/* Register roles. MSGTMP0..3 hold the rolling 16-word message
 	 * schedule, MSGTMP4 is a scratch register.
 	 */
 #define DATA_PTR	%rdi
 #define SHA256CONSTANTS	%rax
 #define MSG		%xmm0
 #define STATE0		%xmm1
 #define STATE1		%xmm2
 #define MSGTMP0		%xmm3
 #define MSGTMP1		%xmm4
 #define MSGTMP2		%xmm5
 #define MSGTMP3		%xmm6
 #define MSGTMP4		%xmm7
 #define SHUF_MASK	%xmm8
 #define ABEF_SAVE	%xmm9
 #define CDGH_SAVE	%xmm10
 	.balign	8	# allow decoders to fetch at least 2 first insns
 sha256_process_block64_shaNI:
 	/* Load the hash words and regroup them into the ABEF/CDGH
 	 * pairing used by the sha256rnds2 sequence below
 	 */
 	movu128		80+0*16(%rdi), STATE0
 	movu128		80+1*16(%rdi), STATE1
 	shuf128_32	$0xB1, STATE0,  STATE0		/* CDAB */
 	shuf128_32	$0x1B, STATE1,  STATE1		/* EFGH */
 	mova128		STATE0, MSGTMP4
 	palignr		$8, STATE1,  STATE0		/* ABEF */
 	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
 	mova128		PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK
 	lea		K256(%rip), SHA256CONSTANTS
 	/* Save hash values for addition after rounds */
 	mova128		STATE0, ABEF_SAVE
 	mova128		STATE1, CDGH_SAVE
 	/* Rounds 0-3 */
 	/* (the first 16 rounds take their words straight from the block,
 	 * byte-swapped within each 32-bit word by pshufb)
 	 */
 	movu128		0*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	mova128		MSG, MSGTMP0
 		paddd		0*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 4-7 */
 	movu128		1*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	mova128		MSG, MSGTMP1
 		paddd		1*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 8-11 */
 	movu128		2*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	mova128		MSG, MSGTMP2
 		paddd		2*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 12-15 */
 	movu128		3*16(DATA_PTR), MSG
 	pshufb		SHUF_MASK, MSG
 	mova128		MSG, MSGTMP3
 		paddd		3*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 16-19 */
 	/* (from here on the words come from the message schedule, computed
 	 * by sha256msg1/sha256msg2 while the MSGTMP registers rotate roles)
 	 */
 	mova128		MSGTMP0, MSG
 		paddd		4*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 20-23 */
 	mova128		MSGTMP1, MSG
 		paddd		5*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 24-27 */
 	mova128		MSGTMP2, MSG
 		paddd		6*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 28-31 */
 	mova128		MSGTMP3, MSG
 		paddd		7*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 32-35 */
 	mova128		MSGTMP0, MSG
 		paddd		8*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 36-39 */
 	mova128		MSGTMP1, MSG
 		paddd		9*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP1, MSGTMP0
 	/* Rounds 40-43 */
 	mova128		MSGTMP2, MSG
 		paddd		10*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP2, MSGTMP1
 	/* Rounds 44-47 */
 	mova128		MSGTMP3, MSG
 		paddd		11*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP3, MSGTMP4
 	palignr		$4, MSGTMP2, MSGTMP4
 	paddd		MSGTMP4, MSGTMP0
 	sha256msg2	MSGTMP3, MSGTMP0
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP3, MSGTMP2
 	/* Rounds 48-51 */
 	mova128		MSGTMP0, MSG
 		paddd		12*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP0, MSGTMP4
 	palignr		$4, MSGTMP3, MSGTMP4
 	paddd		MSGTMP4, MSGTMP1
 	sha256msg2	MSGTMP0, MSGTMP1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	sha256msg1	MSGTMP0, MSGTMP3
 	/* Rounds 52-55 */
 	/* (no sha256msg1 from here on: no further schedule words are needed) */
 	mova128		MSGTMP1, MSG
 		paddd		13*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP1, MSGTMP4
 	palignr		$4, MSGTMP0, MSGTMP4
 	paddd		MSGTMP4, MSGTMP2
 	sha256msg2	MSGTMP1, MSGTMP2
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 56-59 */
 	mova128		MSGTMP2, MSG
 		paddd		14*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 	mova128		MSGTMP2, MSGTMP4
 	palignr		$4, MSGTMP1, MSGTMP4
 	paddd		MSGTMP4, MSGTMP3
 	sha256msg2	MSGTMP2, MSGTMP3
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Rounds 60-63 */
 	mova128		MSGTMP3, MSG
 		paddd		15*16(SHA256CONSTANTS), MSG
 		sha256rnds2	STATE0, STATE1
 		shuf128_32	$0x0E, MSG, MSG
 		sha256rnds2	STATE1, STATE0
 	/* Add current hash values with previously saved */
 	paddd		ABEF_SAVE, STATE0
 	paddd		CDGH_SAVE, STATE1
 	/* Write hash values back in the correct order */
 	shuf128_32	$0x1B, STATE0,  STATE0		/* FEBA */
 	shuf128_32	$0xB1, STATE1,  STATE1		/* DCHG */
 	mova128		STATE0, MSGTMP4
 	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
 	palignr		$8, MSGTMP4, STATE1		/* HGFE */
 	movu128		STATE0, 80+0*16(%rdi)
 	movu128		STATE1, 80+1*16(%rdi)
 	ret
 	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
 /* SHA-256 round constants, 64 32-bit words laid out four per row.
  * The code adds row i (byte offset i*16) to the message words of
  * rounds 4*i .. 4*i+3. Rows are used as paddd memory operands,
  * hence the 16-byte alignment.
  */
 .section	.rodata.cst256.K256, "aM", @progbits, 256
 .balign 16
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 /* pshufb control mask: reverses the byte order inside each 32-bit
  * word of the loaded message block (byte indexes 3,2,1,0, 7,6,5,4, ...).
  * Loaded with an aligned move, hence the 16-byte alignment.
  */
 .section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
 .balign 16
 PSHUFFLE_BSWAP32_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 #endif
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@ -224,7 +224,7 @@ sha1_process_block64_shaNI:
 	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
 .section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+.balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x000102030405060708090a0b0c0d0e0f
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@ -1394,7 +1394,7 @@ sha1_process_block64:
 	.size	sha1_process_block64, .-sha1_process_block64
 	.section	.rodata.cst16.sha1const, "aM", @progbits, 16
-	.align	16
+	.balign	16
 rconst0x5A827999:
 	.long	0x5A827999
 	.long	0x5A827999
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@ -433,7 +433,7 @@ echo "
 	.size	sha1_process_block64, .-sha1_process_block64
 	.section	.rodata.cst16.sha1const, \"aM\", @progbits, 16
-	.align	16
+	.balign	16
 rconst0x5A827999:
 	.long	0x5A827999
 	.long	0x5A827999
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@ -218,7 +218,7 @@ sha1_process_block64_shaNI:
 	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
 .section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+.balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x000102030405060708090a0b0c0d0e0f