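libbb/sha256: code shrink in 64-bit x86

Drop the dedicated SHUF_MASK register (%xmm8) and keep the byte-swap
flip mask in %xmm7 instead. %xmm7 is otherwise only a scratch register
(renamed MSGTMP4 -> XMMTMP4 to reflect that), and it is free from the
point the mask is loaded until the last pshufb in rounds 12-15.
In legacy SSE encodings, instructions that reference %xmm8..%xmm15 need
an extra REX prefix byte, so the mask load and the four pshufb uses
each become one byte shorter.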

function                                             old     new   delta
sha256_process_block64_shaNI                         706     701      -5

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
Denys Vlasenko 2022-02-06 00:30:03 +01:00
parent a1429fbb8c
commit 31c1c31077

View File

@ -31,9 +31,7 @@
#define MSGTMP1 %xmm4 #define MSGTMP1 %xmm4
#define MSGTMP2 %xmm5 #define MSGTMP2 %xmm5
#define MSGTMP3 %xmm6 #define MSGTMP3 %xmm6
#define MSGTMP4 %xmm7 #define XMMTMP4 %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9 #define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10 #define CDGH_SAVE %xmm10
@ -45,11 +43,12 @@ sha256_process_block64_shaNI:
shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */
shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */
mova128 STATE0, MSGTMP4 mova128 STATE0, XMMTMP4
palignr $8, STATE1, STATE0 /* ABEF */ palignr $8, STATE1, STATE0 /* ABEF */
pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */
mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK /* XMMTMP4 holds flip mask from here... */
mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4
leaq K256+8*16(%rip), SHA256CONSTANTS leaq K256+8*16(%rip), SHA256CONSTANTS
/* Save hash values for addition after rounds */ /* Save hash values for addition after rounds */
@ -58,7 +57,7 @@ sha256_process_block64_shaNI:
/* Rounds 0-3 */ /* Rounds 0-3 */
movu128 0*16(DATA_PTR), MSG movu128 0*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG pshufb XMMTMP4, MSG
mova128 MSG, MSGTMP0 mova128 MSG, MSGTMP0
paddd 0*16-8*16(SHA256CONSTANTS), MSG paddd 0*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
@ -67,7 +66,7 @@ sha256_process_block64_shaNI:
/* Rounds 4-7 */ /* Rounds 4-7 */
movu128 1*16(DATA_PTR), MSG movu128 1*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG pshufb XMMTMP4, MSG
mova128 MSG, MSGTMP1 mova128 MSG, MSGTMP1
paddd 1*16-8*16(SHA256CONSTANTS), MSG paddd 1*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
@ -77,7 +76,7 @@ sha256_process_block64_shaNI:
/* Rounds 8-11 */ /* Rounds 8-11 */
movu128 2*16(DATA_PTR), MSG movu128 2*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG pshufb XMMTMP4, MSG
mova128 MSG, MSGTMP2 mova128 MSG, MSGTMP2
paddd 2*16-8*16(SHA256CONSTANTS), MSG paddd 2*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
@ -87,13 +86,14 @@ sha256_process_block64_shaNI:
/* Rounds 12-15 */ /* Rounds 12-15 */
movu128 3*16(DATA_PTR), MSG movu128 3*16(DATA_PTR), MSG
pshufb SHUF_MASK, MSG pshufb XMMTMP4, MSG
/* ...to here */
mova128 MSG, MSGTMP3 mova128 MSG, MSGTMP3
paddd 3*16-8*16(SHA256CONSTANTS), MSG paddd 3*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4 mova128 MSGTMP3, XMMTMP4
palignr $4, MSGTMP2, MSGTMP4 palignr $4, MSGTMP2, XMMTMP4
paddd MSGTMP4, MSGTMP0 paddd XMMTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -103,9 +103,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP0, MSG mova128 MSGTMP0, MSG
paddd 4*16-8*16(SHA256CONSTANTS), MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4 mova128 MSGTMP0, XMMTMP4
palignr $4, MSGTMP3, MSGTMP4 palignr $4, MSGTMP3, XMMTMP4
paddd MSGTMP4, MSGTMP1 paddd XMMTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -115,9 +115,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP1, MSG mova128 MSGTMP1, MSG
paddd 5*16-8*16(SHA256CONSTANTS), MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4 mova128 MSGTMP1, XMMTMP4
palignr $4, MSGTMP0, MSGTMP4 palignr $4, MSGTMP0, XMMTMP4
paddd MSGTMP4, MSGTMP2 paddd XMMTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -127,9 +127,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP2, MSG mova128 MSGTMP2, MSG
paddd 6*16-8*16(SHA256CONSTANTS), MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4 mova128 MSGTMP2, XMMTMP4
palignr $4, MSGTMP1, MSGTMP4 palignr $4, MSGTMP1, XMMTMP4
paddd MSGTMP4, MSGTMP3 paddd XMMTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -139,9 +139,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP3, MSG mova128 MSGTMP3, MSG
paddd 7*16-8*16(SHA256CONSTANTS), MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4 mova128 MSGTMP3, XMMTMP4
palignr $4, MSGTMP2, MSGTMP4 palignr $4, MSGTMP2, XMMTMP4
paddd MSGTMP4, MSGTMP0 paddd XMMTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -151,9 +151,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP0, MSG mova128 MSGTMP0, MSG
paddd 8*16-8*16(SHA256CONSTANTS), MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4 mova128 MSGTMP0, XMMTMP4
palignr $4, MSGTMP3, MSGTMP4 palignr $4, MSGTMP3, XMMTMP4
paddd MSGTMP4, MSGTMP1 paddd XMMTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -163,9 +163,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP1, MSG mova128 MSGTMP1, MSG
paddd 9*16-8*16(SHA256CONSTANTS), MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4 mova128 MSGTMP1, XMMTMP4
palignr $4, MSGTMP0, MSGTMP4 palignr $4, MSGTMP0, XMMTMP4
paddd MSGTMP4, MSGTMP2 paddd XMMTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -175,9 +175,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP2, MSG mova128 MSGTMP2, MSG
paddd 10*16-8*16(SHA256CONSTANTS), MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4 mova128 MSGTMP2, XMMTMP4
palignr $4, MSGTMP1, MSGTMP4 palignr $4, MSGTMP1, XMMTMP4
paddd MSGTMP4, MSGTMP3 paddd XMMTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -187,9 +187,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP3, MSG mova128 MSGTMP3, MSG
paddd 11*16-8*16(SHA256CONSTANTS), MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP3, MSGTMP4 mova128 MSGTMP3, XMMTMP4
palignr $4, MSGTMP2, MSGTMP4 palignr $4, MSGTMP2, XMMTMP4
paddd MSGTMP4, MSGTMP0 paddd XMMTMP4, MSGTMP0
sha256msg2 MSGTMP3, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -199,9 +199,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP0, MSG mova128 MSGTMP0, MSG
paddd 12*16-8*16(SHA256CONSTANTS), MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP0, MSGTMP4 mova128 MSGTMP0, XMMTMP4
palignr $4, MSGTMP3, MSGTMP4 palignr $4, MSGTMP3, XMMTMP4
paddd MSGTMP4, MSGTMP1 paddd XMMTMP4, MSGTMP1
sha256msg2 MSGTMP0, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -211,9 +211,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP1, MSG mova128 MSGTMP1, MSG
paddd 13*16-8*16(SHA256CONSTANTS), MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP1, MSGTMP4 mova128 MSGTMP1, XMMTMP4
palignr $4, MSGTMP0, MSGTMP4 palignr $4, MSGTMP0, XMMTMP4
paddd MSGTMP4, MSGTMP2 paddd XMMTMP4, MSGTMP2
sha256msg2 MSGTMP1, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -222,9 +222,9 @@ sha256_process_block64_shaNI:
mova128 MSGTMP2, MSG mova128 MSGTMP2, MSG
paddd 14*16-8*16(SHA256CONSTANTS), MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG
sha256rnds2 STATE0, STATE1 sha256rnds2 STATE0, STATE1
mova128 MSGTMP2, MSGTMP4 mova128 MSGTMP2, XMMTMP4
palignr $4, MSGTMP1, MSGTMP4 palignr $4, MSGTMP1, XMMTMP4
paddd MSGTMP4, MSGTMP3 paddd XMMTMP4, MSGTMP3
sha256msg2 MSGTMP2, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3
shuf128_32 $0x0E, MSG, MSG shuf128_32 $0x0E, MSG, MSG
sha256rnds2 STATE1, STATE0 sha256rnds2 STATE1, STATE0
@ -243,9 +243,9 @@ sha256_process_block64_shaNI:
/* Write hash values back in the correct order */ /* Write hash values back in the correct order */
shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */
shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */
mova128 STATE0, MSGTMP4 mova128 STATE0, XMMTMP4
pblendw $0xF0, STATE1, STATE0 /* DCBA */ pblendw $0xF0, STATE1, STATE0 /* DCBA */
palignr $8, MSGTMP4, STATE1 /* HGFE */ palignr $8, XMMTMP4, STATE1 /* HGFE */
movu128 STATE0, 80+0*16(%rdi) movu128 STATE0, 80+0*16(%rdi)
movu128 STATE1, 80+1*16(%rdi) movu128 STATE1, 80+1*16(%rdi)