diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 52c5d47ef5a14..603d159de4007 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -41,8 +41,7 @@ */ #define CRC32C_PCL_BREAKEVEN 512 -asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, - unsigned int crc_init); +asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -159,7 +158,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, */ if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); - *crcp = crc_pcl(data, len, *crcp); + *crcp = crc32c_x86_3way(*crcp, data, len); kernel_fpu_end(); } else *crcp = crc32c_intel_le_hw(*crcp, data, len); @@ -171,7 +170,7 @@ static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, { if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); - *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + *(__le32 *)out = ~cpu_to_le32(crc32c_x86_3way(*crcp, data, len)); kernel_fpu_end(); } else *(__le32 *)out = diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 752812bc4991d..9b8770503bbcd 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -52,15 +52,16 @@ # regular CRC code that does not interleave the CRC instructions. #define SMALL_SIZE 200 -# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); +# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len); .text -SYM_FUNC_START(crc_pcl) -#define bufp %rdi -#define bufp_d %edi -#define len %esi -#define crc_init %edx -#define crc_init_q %rdx +SYM_FUNC_START(crc32c_x86_3way) +#define crc0 %edi +#define crc0_q %rdi +#define bufp %rsi +#define bufp_d %esi +#define len %rdx +#define len_dw %edx #define n_misaligned %ecx /* overlaps chunk_bytes! */ #define n_misaligned_q %rcx #define chunk_bytes %ecx /* overlaps n_misaligned! */ @@ -85,9 +86,9 @@ SYM_FUNC_START(crc_pcl) .Ldo_align: movq (bufp), %rax add n_misaligned_q, bufp - sub n_misaligned, len + sub n_misaligned_q, len .Lalign_loop: - crc32b %al, crc_init # compute crc32 of 1-byte + crc32b %al, crc0 # compute crc32 of 1-byte shr $8, %rax # get next byte dec n_misaligned jne .Lalign_loop @@ -102,7 +103,7 @@ SYM_FUNC_START(crc_pcl) .Lpartial_block: # Compute floor(len / 24) to get num qwords to process from each lane. - imul $2731, len, %eax # 2731 = ceil(2^16 / 24) + imul $2731, len_dw, %eax # 2731 = ceil(2^16 / 24) shr $16, %eax jmp .Lcrc_3lanes @@ -125,16 +126,16 @@ SYM_FUNC_START(crc_pcl) # Unroll the loop by a factor of 4 to reduce the overhead of the loop # bookkeeping instructions, which can compete with crc32q for the ALUs. .Lcrc_3lanes_4x_loop: - crc32q (bufp), crc_init_q + crc32q (bufp), crc0_q crc32q (bufp,chunk_bytes_q), crc1 crc32q (bufp,chunk_bytes_q,2), crc2 - crc32q 8(bufp), crc_init_q + crc32q 8(bufp), crc0_q crc32q 8(bufp,chunk_bytes_q), crc1 crc32q 8(bufp,chunk_bytes_q,2), crc2 - crc32q 16(bufp), crc_init_q + crc32q 16(bufp), crc0_q crc32q 16(bufp,chunk_bytes_q), crc1 crc32q 16(bufp,chunk_bytes_q,2), crc2 - crc32q 24(bufp), crc_init_q + crc32q 24(bufp), crc0_q crc32q 24(bufp,chunk_bytes_q), crc1 crc32q 24(bufp,chunk_bytes_q,2), crc2 add $32, bufp @@ -146,7 +147,7 @@ SYM_FUNC_START(crc_pcl) jz .Lcrc_3lanes_last_qword .Lcrc_3lanes_1x_loop: - crc32q (bufp), crc_init_q + crc32q (bufp), crc0_q crc32q (bufp,chunk_bytes_q), crc1 crc32q (bufp,chunk_bytes_q,2), crc2 add $8, bufp @@ -154,7 +155,7 @@ SYM_FUNC_START(crc_pcl) jnz .Lcrc_3lanes_1x_loop .Lcrc_3lanes_last_qword: - crc32q (bufp), crc_init_q + crc32q (bufp), crc0_q crc32q (bufp,chunk_bytes_q), crc1 # SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet @@ -165,9 +166,9 @@ SYM_FUNC_START(crc_pcl) lea (K_table-8)(%rip), %rax # first entry is for idx 1 pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 - sub %eax, len # len -= chunk_bytes * 3 + sub %rax, len # len -= chunk_bytes * 3 - movq crc_init_q, %xmm1 # CRC for block 1 + movq crc0_q, %xmm1 # CRC for block 1 pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 @@ -176,8 +177,8 @@ SYM_FUNC_START(crc_pcl) pxor %xmm2,%xmm1 movq %xmm1, %rax xor (bufp,chunk_bytes_q,2), %rax - mov crc2, crc_init_q - crc32 %rax, crc_init_q + mov crc2, crc0_q + crc32 %rax, crc0_q lea 8(bufp,chunk_bytes_q,2), bufp ################################################################ @@ -193,34 +194,34 @@ SYM_FUNC_START(crc_pcl) ## 6) Process any remainder without interleaving: ####################################################################### .Lsmall: - test len, len + test len_dw, len_dw jz .Ldone - mov len, %eax + mov len_dw, %eax shr $3, %eax jz .Ldo_dword .Ldo_qwords: - crc32q (bufp), crc_init_q + crc32q (bufp), crc0_q add $8, bufp dec %eax jnz .Ldo_qwords .Ldo_dword: - test $4, len + test $4, len_dw jz .Ldo_word - crc32l (bufp), crc_init + crc32l (bufp), crc0 add $4, bufp .Ldo_word: - test $2, len + test $2, len_dw jz .Ldo_byte - crc32w (bufp), crc_init + crc32w (bufp), crc0 add $2, bufp .Ldo_byte: - test $1, len + test $1, len_dw jz .Ldone - crc32b (bufp), crc_init + crc32b (bufp), crc0 .Ldone: - mov crc_init, %eax + mov crc0, %eax RET -SYM_FUNC_END(crc_pcl) +SYM_FUNC_END(crc32c_x86_3way) .section .rodata, "a", @progbits ################################################################