Skip to content

Commit

Permalink
crypto: arm64/chacha20 - add XChaCha20 support
Browse files Browse the repository at this point in the history
Add an XChaCha20 implementation that is hooked up to the ARM64 NEON
implementation of ChaCha20.  This can be used by Adiantum.

A NEON implementation of single-block HChaCha20 is also added so that
XChaCha20 can use it rather than the generic implementation.  This
required refactoring the ChaCha20 permutation into its own function.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
  • Loading branch information
Eric Biggers authored and Herbert Xu committed Dec 13, 2018
1 parent a00fa0c commit cc7cf99
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 43 deletions.
2 changes: 1 addition & 1 deletion arch/arm64/crypto/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ config CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_SIMD

config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
tristate "ChaCha20 and XChaCha20 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
Expand Down
65 changes: 48 additions & 17 deletions arch/arm64/crypto/chacha20-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,20 @@
.text
.align 6

ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i

//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//

// x0..3 = s0..3
adr x3, ROT8
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
ld1 {v12.4s}, [x3]
/*
* chacha20_permute - permute one block
*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers v0-v3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*
* Clobbers: x3, x10, v4, v12
*/
chacha20_permute:

mov x3, #10
adr x10, ROT8
ld1 {v12.4s}, [x10]

.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
Expand Down Expand Up @@ -105,6 +100,23 @@ ENTRY(chacha20_block_xor_neon)
subs x3, x3, #1
b.ne .Ldoubleround

ret
ENDPROC(chacha20_permute)

ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i

stp x29, x30, [sp, #-16]!
mov x29, sp

// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]

bl chacha20_permute

ld1 {v4.16b-v7.16b}, [x2]

// o0 = i0 ^ (x0 + s0)
Expand All @@ -125,9 +137,28 @@ ENTRY(chacha20_block_xor_neon)

st1 {v0.16b-v3.16b}, [x1]

ldp x29, x30, [sp], #16
ret
ENDPROC(chacha20_block_xor_neon)

/*
 * hchacha20_block_neon - permute one state matrix and emit two of its rows
 *
 * Loads the 64-byte state matrix at x0 into v0-v3, runs chacha20_permute on
 * it, and stores v0 followed by v3 (2 x 16 bytes = 8 32-bit words) at x1.
 * No copy of the input state is kept; the permuted rows are stored as-is.
 */
ENTRY(hchacha20_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)

// Save x29/x30: the bl below overwrites x30, which the final ret needs.
stp x29, x30, [sp, #-16]!
mov x29, sp

// v0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]

// Permutes v0-v3 in place (documented to clobber x3, x10, v4, v12).
bl chacha20_permute

// Write the first row, post-incrementing x1 by 16, then the last row
// directly after it.
st1 {v0.16b}, [x1], #16
st1 {v3.16b}, [x1]

// Restore the saved x29/x30 pair and return to the caller.
ldp x29, x30, [sp], #16
ret
ENDPROC(hchacha20_block_neon)

.align 6
ENTRY(chacha20_4block_xor_neon)
// x0: Input state matrix, s
Expand Down
101 changes: 76 additions & 25 deletions arch/arm64/crypto/chacha20-neon-glue.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

/*
 * Routines implemented in assembly in chacha20-neon-core.S.  The callers in
 * this file invoke them between kernel_neon_begin() and kernel_neon_end().
 */

/* state: input state matrix; dst: one output data block; src: one input data block. */
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
/*
 * state: input state matrix.
 * NOTE(review): presumably the four-block counterpart of the routine above;
 * its body is not visible here, so confirm the dst/src sizes against the asm.
 */
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
/* state: input state matrix; out: receives 8 32-bit words. */
asmlinkage void hchacha20_block_neon(const u32 *state, u32 *out);

static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
Expand Down Expand Up @@ -65,20 +66,16 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
kernel_neon_end();
}

static int chacha20_neon(struct skcipher_request *req)
static int chacha20_neon_stream_xor(struct skcipher_request *req,
struct chacha_ctx *ctx, u8 *iv)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
u32 state[16];
int err;

if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
return crypto_chacha_crypt(req);

err = skcipher_walk_virt(&walk, req, false);

crypto_chacha_init(state, ctx, walk.iv);
crypto_chacha_init(state, ctx, iv);

while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
Expand All @@ -94,35 +91,86 @@ static int chacha20_neon(struct skcipher_request *req)
return err;
}

static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,

.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_neon,
.decrypt = chacha20_neon,
/*
 * chacha20_neon - encrypt/decrypt handler of the "chacha20-neon" skcipher.
 *
 * Requests longer than CHACHA_BLOCK_SIZE are run through
 * chacha20_neon_stream_xor() with the request's own IV, provided
 * may_use_simd() allows it.  All other requests are handed to
 * crypto_chacha_crypt().
 *
 * Returns the result of whichever implementation handled the request.
 */
static int chacha20_neon(struct skcipher_request *req)
{
	struct chacha_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));

	/* Length is tested first, so may_use_simd() is only asked for long requests. */
	if (req->cryptlen > CHACHA_BLOCK_SIZE && may_use_simd())
		return chacha20_neon_stream_xor(req, ctx, req->iv);

	return crypto_chacha_crypt(req);
}

/*
 * xchacha20_neon - encrypt/decrypt handler of the "xchacha20-neon" skcipher.
 *
 * Requests of at most CHACHA_BLOCK_SIZE bytes, or requests made when
 * may_use_simd() is false, are handed to crypto_xchacha_crypt().
 *
 * Otherwise the work is split in two steps:
 *  1. A state is initialised from the tfm key and req->iv, and
 *     hchacha20_block_neon() turns it into the key of a temporary context.
 *  2. A 16-byte IV is assembled from req->iv bytes 24..31 followed by bytes
 *     16..23, and the data is processed by chacha20_neon_stream_xor() using
 *     the temporary context and that IV.
 *
 * Returns the result of whichever implementation handled the request.
 */
static int xchacha20_neon(struct skcipher_request *req)
{
	struct chacha_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	struct chacha_ctx subctx;
	u32 hstate[16];
	u8 stream_iv[16];

	if (req->cryptlen > CHACHA_BLOCK_SIZE && may_use_simd()) {
		crypto_chacha_init(hstate, ctx, req->iv);

		/* The NEON routine may only run inside a kernel_neon_begin/end section. */
		kernel_neon_begin();
		hchacha20_block_neon(hstate, subctx.key);
		kernel_neon_end();

		/* The two trailing 8-byte halves of req->iv are used in swapped order. */
		memcpy(stream_iv, req->iv + 24, 8);
		memcpy(stream_iv + 8, req->iv + 16, 8);

		return chacha20_neon_stream_xor(req, &subctx, stream_iv);
	}

	return crypto_xchacha_crypt(req);
}

/*
 * skcipher algorithms provided by this module.
 *
 * Both entries share the same key size, chunk size, walk size and setkey
 * routine; they differ in name, IV size and encrypt/decrypt handler.
 */
static struct skcipher_alg algs[] = {
	{
		/* ChaCha20 */
		.base = {
			.cra_name	 = "chacha20",
			.cra_driver_name = "chacha20-neon",
			.cra_priority	 = 300,
			.cra_blocksize	 = 1,
			.cra_ctxsize	 = sizeof(struct chacha_ctx),
			.cra_module	 = THIS_MODULE,
		},

		.min_keysize	= CHACHA_KEY_SIZE,
		.max_keysize	= CHACHA_KEY_SIZE,
		.ivsize		= CHACHA_IV_SIZE,
		.chunksize	= CHACHA_BLOCK_SIZE,
		.walksize	= 4 * CHACHA_BLOCK_SIZE,
		.setkey		= crypto_chacha20_setkey,
		.encrypt	= chacha20_neon,
		.decrypt	= chacha20_neon,
	},
	{
		/* XChaCha20: same parameters except for the IV size and handlers */
		.base = {
			.cra_name	 = "xchacha20",
			.cra_driver_name = "xchacha20-neon",
			.cra_priority	 = 300,
			.cra_blocksize	 = 1,
			.cra_ctxsize	 = sizeof(struct chacha_ctx),
			.cra_module	 = THIS_MODULE,
		},

		.min_keysize	= CHACHA_KEY_SIZE,
		.max_keysize	= CHACHA_KEY_SIZE,
		.ivsize		= XCHACHA_IV_SIZE,
		.chunksize	= CHACHA_BLOCK_SIZE,
		.walksize	= 4 * CHACHA_BLOCK_SIZE,
		.setkey		= crypto_chacha20_setkey,
		.encrypt	= xchacha20_neon,
		.decrypt	= xchacha20_neon,
	},
};

static int __init chacha20_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;

return crypto_register_skcipher(&alg);
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}

static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha20_simd_mod_init);
Expand All @@ -131,3 +179,6 @@ module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-neon");

0 comments on commit cc7cf99

Please sign in to comment.