-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
arm64: crypto: add NEON accelerated XOR implementation
This is a NEON acceleration method that can improve performance by approximately 20%. I got the following data from the centos 7.5 on Huawei's HISI1616 chip: [ 93.837726] xor: measuring software checksum speed [ 93.874039] 8regs : 7123.200 MB/sec [ 93.914038] 32regs : 7180.300 MB/sec [ 93.954043] arm64_neon: 9856.000 MB/sec [ 93.954047] xor: using function: arm64_neon (9856.000 MB/sec) I believe this code can bring some optimization for all arm64 platform. thanks for Ard Biesheuvel's suggestions. Signed-off-by: Jackie Liu <liuyun01@kylinos.cn> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Will Deacon <will.deacon@arm.com>
- Loading branch information
Jackie Liu
authored and
Will Deacon
committed
Dec 6, 2018
1 parent
21e2854
commit cc9f834
Showing
4 changed files
with
263 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,4 +27,3 @@ generic-y += trace_clock.h | |
generic-y += unaligned.h | ||
generic-y += user.h | ||
generic-y += vga.h | ||
generic-y += xor.h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
/* | ||
* arch/arm64/include/asm/xor.h | ||
* | ||
* Authors: Jackie Liu <liuyun01@kylinos.cn> | ||
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License version 2 as | ||
* published by the Free Software Foundation. | ||
*/ | ||
|
||
#include <linux/hardirq.h> | ||
#include <asm-generic/xor.h> | ||
#include <asm/hwcap.h> | ||
#include <asm/neon.h> | ||
|
||
#ifdef CONFIG_KERNEL_MODE_NEON | ||
|
||
extern struct xor_block_template const xor_block_inner_neon; | ||
|
||
static void | ||
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_2(bytes, p1, p2); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_3(bytes, p1, p2, p3); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3, unsigned long *p4) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4); | ||
kernel_neon_end(); | ||
} | ||
|
||
static void | ||
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, | ||
unsigned long *p3, unsigned long *p4, unsigned long *p5) | ||
{ | ||
kernel_neon_begin(); | ||
xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5); | ||
kernel_neon_end(); | ||
} | ||
|
||
static struct xor_block_template xor_block_arm64 = { | ||
.name = "arm64_neon", | ||
.do_2 = xor_neon_2, | ||
.do_3 = xor_neon_3, | ||
.do_4 = xor_neon_4, | ||
.do_5 = xor_neon_5 | ||
}; | ||
#undef XOR_TRY_TEMPLATES | ||
#define XOR_TRY_TEMPLATES \ | ||
do { \ | ||
xor_speed(&xor_block_8regs); \ | ||
xor_speed(&xor_block_32regs); \ | ||
if (cpu_has_neon()) { \ | ||
xor_speed(&xor_block_arm64);\ | ||
} \ | ||
} while (0) | ||
|
||
#endif /* ! CONFIG_KERNEL_MODE_NEON */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
/* | ||
* arch/arm64/lib/xor-neon.c | ||
* | ||
* Authors: Jackie Liu <liuyun01@kylinos.cn> | ||
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd. | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License version 2 as | ||
* published by the Free Software Foundation. | ||
*/ | ||
|
||
#include <linux/raid/xor.h> | ||
#include <linux/module.h> | ||
#include <asm/neon-intrinsics.h> | ||
|
||
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3, unsigned long *p4) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
uint64_t *dp4 = (uint64_t *)p4; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* p1 ^= p4 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
dp4 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1, | ||
unsigned long *p2, unsigned long *p3, | ||
unsigned long *p4, unsigned long *p5) | ||
{ | ||
uint64_t *dp1 = (uint64_t *)p1; | ||
uint64_t *dp2 = (uint64_t *)p2; | ||
uint64_t *dp3 = (uint64_t *)p3; | ||
uint64_t *dp4 = (uint64_t *)p4; | ||
uint64_t *dp5 = (uint64_t *)p5; | ||
|
||
register uint64x2_t v0, v1, v2, v3; | ||
long lines = bytes / (sizeof(uint64x2_t) * 4); | ||
|
||
do { | ||
/* p1 ^= p2 */ | ||
v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0)); | ||
v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2)); | ||
v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4)); | ||
v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); | ||
|
||
/* p1 ^= p3 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp3 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp3 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp3 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp3 + 6)); | ||
|
||
/* p1 ^= p4 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp4 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp4 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp4 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp4 + 6)); | ||
|
||
/* p1 ^= p5 */ | ||
v0 = veorq_u64(v0, vld1q_u64(dp5 + 0)); | ||
v1 = veorq_u64(v1, vld1q_u64(dp5 + 2)); | ||
v2 = veorq_u64(v2, vld1q_u64(dp5 + 4)); | ||
v3 = veorq_u64(v3, vld1q_u64(dp5 + 6)); | ||
|
||
/* store */ | ||
vst1q_u64(dp1 + 0, v0); | ||
vst1q_u64(dp1 + 2, v1); | ||
vst1q_u64(dp1 + 4, v2); | ||
vst1q_u64(dp1 + 6, v3); | ||
|
||
dp1 += 8; | ||
dp2 += 8; | ||
dp3 += 8; | ||
dp4 += 8; | ||
dp5 += 8; | ||
} while (--lines > 0); | ||
} | ||
|
||
struct xor_block_template const xor_block_inner_neon = { | ||
.name = "__inner_neon__", | ||
.do_2 = xor_arm64_neon_2, | ||
.do_3 = xor_arm64_neon_3, | ||
.do_4 = xor_arm64_neon_4, | ||
.do_5 = xor_arm64_neon_5, | ||
}; | ||
EXPORT_SYMBOL(xor_block_inner_neon); | ||
|
||
MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>"); | ||
MODULE_DESCRIPTION("ARMv8 XOR Extensions"); | ||
MODULE_LICENSE("GPL"); |