libbb/sha1: shrink and speed up unrolled x86-64 code
function                                             old     new   delta
sha1_process_block64                                3514    3482     -32

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
This commit is contained in:
parent
987be932ed
commit
c193cbd6df
@ -257,8 +257,8 @@ sha256_process_block64_shaNI:
|
|||||||
ret
|
ret
|
||||||
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
|
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
|
||||||
|
|
||||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||||
.balign 16
|
.balign 16
|
||||||
K256:
|
K256:
|
||||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||||
@ -277,8 +277,8 @@ K256:
|
|||||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||||
|
|
||||||
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
|
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
|
||||||
.balign 16
|
.balign 16
|
||||||
PSHUFFLE_BSWAP32_FLIP_MASK:
|
PSHUFFLE_BSWAP32_FLIP_MASK:
|
||||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||||
|
|
||||||
|
@ -253,8 +253,8 @@ sha256_process_block64_shaNI:
|
|||||||
ret
|
ret
|
||||||
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
|
.size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
|
||||||
|
|
||||||
.section .rodata.cst256.K256, "aM", @progbits, 256
|
.section .rodata.cst256.K256, "aM", @progbits, 256
|
||||||
.balign 16
|
.balign 16
|
||||||
K256:
|
K256:
|
||||||
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||||
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||||
@ -273,8 +273,8 @@ K256:
|
|||||||
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||||
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||||
|
|
||||||
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
|
.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
|
||||||
.balign 16
|
.balign 16
|
||||||
PSHUFFLE_BSWAP32_FLIP_MASK:
|
PSHUFFLE_BSWAP32_FLIP_MASK:
|
||||||
.octa 0x0c0d0e0f08090a0b0405060700010203
|
.octa 0x0c0d0e0f08090a0b0405060700010203
|
||||||
|
|
||||||
|
@ -223,8 +223,8 @@ sha1_process_block64_shaNI:
|
|||||||
ret
|
ret
|
||||||
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
|
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
|
||||||
|
|
||||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||||
.balign 16
|
.balign 16
|
||||||
PSHUFFLE_BYTE_FLIP_MASK:
|
PSHUFFLE_BYTE_FLIP_MASK:
|
||||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||||
|
|
||||||
|
@ -180,8 +180,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
||||||
movaps %xmm3, %xmm4
|
movaps %xmm3, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm0, %xmm5
|
||||||
|
shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm0 # ^
|
xorps %xmm5, %xmm0 # ^
|
||||||
@ -252,8 +257,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
||||||
movaps %xmm0, %xmm4
|
movaps %xmm0, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm1, %xmm5
|
||||||
|
shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm1 # ^
|
xorps %xmm5, %xmm1 # ^
|
||||||
@ -323,8 +333,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
||||||
movaps %xmm1, %xmm4
|
movaps %xmm1, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm2, %xmm5
|
||||||
|
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm2 # ^
|
xorps %xmm5, %xmm2 # ^
|
||||||
@ -392,8 +407,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
||||||
movaps %xmm2, %xmm4
|
movaps %xmm2, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm3, %xmm5
|
||||||
|
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm3 # ^
|
xorps %xmm5, %xmm3 # ^
|
||||||
@ -457,8 +477,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
||||||
movaps %xmm3, %xmm4
|
movaps %xmm3, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm0, %xmm5
|
||||||
|
shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm0 # ^
|
xorps %xmm5, %xmm0 # ^
|
||||||
@ -522,8 +547,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
||||||
movaps %xmm0, %xmm4
|
movaps %xmm0, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm1, %xmm5
|
||||||
|
shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm1 # ^
|
xorps %xmm5, %xmm1 # ^
|
||||||
@ -588,8 +618,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
||||||
movaps %xmm1, %xmm4
|
movaps %xmm1, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm2, %xmm5
|
||||||
|
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm2 # ^
|
xorps %xmm5, %xmm2 # ^
|
||||||
@ -653,8 +688,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
||||||
movaps %xmm2, %xmm4
|
movaps %xmm2, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm3, %xmm5
|
||||||
|
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm3 # ^
|
xorps %xmm5, %xmm3 # ^
|
||||||
@ -718,8 +758,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
||||||
movaps %xmm3, %xmm4
|
movaps %xmm3, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm0, %xmm5
|
||||||
|
shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm0 # ^
|
xorps %xmm5, %xmm0 # ^
|
||||||
@ -795,8 +840,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
||||||
movaps %xmm0, %xmm4
|
movaps %xmm0, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm1, %xmm5
|
||||||
|
shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm1 # ^
|
xorps %xmm5, %xmm1 # ^
|
||||||
@ -872,8 +922,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
||||||
movaps %xmm1, %xmm4
|
movaps %xmm1, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm2, %xmm5
|
||||||
|
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm2 # ^
|
xorps %xmm5, %xmm2 # ^
|
||||||
@ -950,8 +1005,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
||||||
movaps %xmm2, %xmm4
|
movaps %xmm2, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm3, %xmm5
|
||||||
|
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm3 # ^
|
xorps %xmm5, %xmm3 # ^
|
||||||
@ -1027,8 +1087,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
|
||||||
movaps %xmm3, %xmm4
|
movaps %xmm3, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm0, %xmm5
|
||||||
|
shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm0 # ^
|
xorps %xmm5, %xmm0 # ^
|
||||||
@ -1104,8 +1169,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
|
||||||
movaps %xmm0, %xmm4
|
movaps %xmm0, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm1, %xmm5
|
||||||
|
shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm1 # ^
|
xorps %xmm5, %xmm1 # ^
|
||||||
@ -1169,8 +1239,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
|
||||||
movaps %xmm1, %xmm4
|
movaps %xmm1, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm2, %xmm5
|
||||||
|
shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm2 # ^
|
xorps %xmm5, %xmm2 # ^
|
||||||
@ -1234,8 +1309,13 @@ sha1_process_block64:
|
|||||||
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
|
||||||
movaps %xmm2, %xmm4
|
movaps %xmm2, %xmm4
|
||||||
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps %xmm3, %xmm5
|
||||||
|
shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
xorps %xmm5, %xmm3 # ^
|
xorps %xmm5, %xmm3 # ^
|
||||||
|
@ -203,8 +203,13 @@ echo "# PREP $@
|
|||||||
movaps $xmmW12, $xmmT1
|
movaps $xmmW12, $xmmT1
|
||||||
psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
|
||||||
|
|
||||||
pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
|
||||||
punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
|
||||||
|
# same result as above, but shorter and faster:
|
||||||
|
# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
|
||||||
|
# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
|
||||||
|
movaps $xmmW0, $xmmT2
|
||||||
|
shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
|
||||||
|
|
||||||
xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
|
||||||
xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
|
||||||
|
@ -217,8 +217,8 @@ sha1_process_block64_shaNI:
|
|||||||
ret
|
ret
|
||||||
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
|
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
|
||||||
|
|
||||||
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
|
||||||
.balign 16
|
.balign 16
|
||||||
PSHUFFLE_BYTE_FLIP_MASK:
|
PSHUFFLE_BYTE_FLIP_MASK:
|
||||||
.octa 0x000102030405060708090a0b0c0d0e0f
|
.octa 0x000102030405060708090a0b0c0d0e0f
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user