crypto: arm64/sha1-ce - move SHA-1 ARMv8 implementation to base layer
This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Authored by Ard Biesheuvel, committed by Herbert Xu on Apr 10, 2015
Parent: 9205b94, commit: 07eb54d
Showing 2 changed files with 59 additions and 125 deletions.
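
For context, the "base layer" the commit message refers to is the generic SHA-1 helper code in include/crypto/sha1_base.h, which owns partial-block buffering, padding and length encoding so that an architecture driver only supplies a block-transform callback. The sketch below summarizes that contract as the new glue code uses it; it is a condensed description assumed to match the header, not a copy of it.

/*
 * Sketch of the sha1_base contract (assumed to match include/crypto/sha1_base.h).
 * A driver provides a single callback that consumes whole 64-byte blocks:
 */
#include <linux/types.h>
#include <crypto/sha.h>

typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks);

/*
 * ...and drives it through four static inline helpers:
 *
 *   sha1_base_init(desc)                      - seed the state with SHA1_H0..H4
 *   sha1_base_do_update(desc, data, len, fn)  - buffer partial blocks, invoke
 *                                               fn on every complete block
 *   sha1_base_do_finalize(desc, fn)           - append the 0x80 padding and the
 *                                               64-bit big-endian bit count
 *   sha1_base_finish(desc, out)               - write out the 20-byte digest
 *
 * Everything the old glue code did by hand now lives behind these helpers.
 */
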
33 changes: 15 additions & 18 deletions arch/arm64/crypto/sha1-ce-core.S
@@ -66,8 +66,8 @@
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6

/*
* void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
* u8 *head, long bytes)
* void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
*/
ENTRY(sha1_ce_transform)
/* load round constants */
@@ -78,25 +78,22 @@ ENTRY(sha1_ce_transform)
ld1r {k3.4s}, [x6]

/* load state */
ldr dga, [x2]
ldr dgb, [x2, #16]
ldr dga, [x0]
ldr dgb, [x0, #16]

/* load partial state (if supplied) */
cbz x3, 0f
ld1 {v8.4s-v11.4s}, [x3]
b 1f
/* load sha1_ce_state::finalize */
ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize]

/* load input */
0: ld1 {v8.4s-v11.4s}, [x1], #64
sub w0, w0, #1
sub w2, w2, #1

1:
CPU_LE( rev32 v8.16b, v8.16b )
CPU_LE( rev32 v9.16b, v9.16b )
CPU_LE( rev32 v10.16b, v10.16b )
CPU_LE( rev32 v11.16b, v11.16b )

2: add t0.4s, v8.4s, k0.4s
1: add t0.4s, v8.4s, k0.4s
mov dg0v.16b, dgav.16b

add_update c, ev, k0, 8, 9, 10, 11, dgb
@@ -127,15 +124,15 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s

cbnz w0, 0b
cbnz w2, 0b

/*
* Final block: add padding and total bit count.
* Skip if we have no total byte count in x4. In that case, the input
* size was not a round multiple of the block size, and the padding is
* handled by the C code.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
cbz x4, 3f
ldr x4, [x0, #:lo12:sha1_ce_offsetof_count]
movi v9.2d, #0
mov x8, #0x80000000
movi v10.2d, #0
Expand All @@ -144,10 +141,10 @@ CPU_LE( rev32 v11.16b, v11.16b )
mov x4, #0
mov v11.d[0], xzr
mov v11.d[1], x7
b 2b
b 1b

/* store new state */
3: str dga, [x2]
str dgb, [x2, #16]
3: str dga, [x0]
str dgb, [x0, #16]
ret
ENDPROC(sha1_ce_transform)
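
The rewritten entry point takes the sha1_ce_state pointer in x0 and the block count in w2, and the two new ldr instructions read sha1_ce_state::count and sha1_ce_state::finalize directly from that context. The offsets are not hard-coded in the assembly; they arrive as absolute symbols emitted from the C side. The snippet below condenses that mechanism from the glue diff that follows; the wrapper function is illustrative only, added because extended asm with an input operand has to sit inside a function body (the real exports are issued from sha1_ce_finup()).

/*
 * Condensed from the glue code below: export struct offsets as absolute
 * symbols so the assembly can use
 *     ldr x4, [x0, #:lo12:sha1_ce_offsetof_count]
 *     ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize]
 * without knowing the layout of struct sha1_ce_state.
 */
#include <linux/stddef.h>               /* offsetof() */
#include <linux/types.h>
#include <crypto/sha.h>                 /* struct sha1_state */

#define ASM_EXPORT(sym, val) \
        asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));

struct sha1_ce_state {
        struct sha1_state       sst;            /* state[5], count, buffer[] */
        u32                     finalize;       /* nonzero: asm emits the padding block */
};

static void sha1_ce_export_offsets(void)
{
        /* Illustrative wrapper, not part of the commit: operands are only
         * allowed in function-scope asm, which is also why the real glue
         * code places these exports inside sha1_ce_finup().
         */
        ASM_EXPORT(sha1_ce_offsetof_count,
                   offsetof(struct sha1_ce_state, sst.count));
        ASM_EXPORT(sha1_ce_offsetof_finalize,
                   offsetof(struct sha1_ce_state, finalize));
}
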
151 changes: 44 additions & 107 deletions arch/arm64/crypto/sha1-ce-glue.c
@@ -12,144 +12,81 @@
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>

#define ASM_EXPORT(sym, val) \
asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val));

MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");

asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
u8 *head, long bytes);
struct sha1_ce_state {
struct sha1_state sst;
u32 finalize;
};

static int sha1_init(struct shash_desc *desc)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);

*sctx = (struct sha1_state){
.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
};
return 0;
}

static int sha1_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;

sctx->count += len;

if ((partial + len) >= SHA1_BLOCK_SIZE) {
int blocks;

if (partial) {
int p = SHA1_BLOCK_SIZE - partial;
struct sha1_ce_state *sctx = shash_desc_ctx(desc);

memcpy(sctx->buffer + partial, data, p);
data += p;
len -= p;
}

blocks = len / SHA1_BLOCK_SIZE;
len %= SHA1_BLOCK_SIZE;

kernel_neon_begin_partial(16);
sha1_ce_transform(blocks, data, sctx->state,
partial ? sctx->buffer : NULL, 0);
kernel_neon_end();
sctx->finalize = 0;
kernel_neon_begin_partial(16);
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();

data += blocks * SHA1_BLOCK_SIZE;
partial = 0;
}
if (len)
memcpy(sctx->buffer + partial, data, len);
return 0;
}

static int sha1_final(struct shash_desc *desc, u8 *out)
static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE);

struct sha1_state *sctx = shash_desc_ctx(desc);
__be64 bits = cpu_to_be64(sctx->count << 3);
__be32 *dst = (__be32 *)out;
int i;

u32 padlen = SHA1_BLOCK_SIZE
- ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);

sha1_update(desc, padding, padlen);
sha1_update(desc, (const u8 *)&bits, sizeof(bits));

for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
put_unaligned_be32(sctx->state[i], dst++);

*sctx = (struct sha1_state){};
return 0;
}

static int sha1_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
__be32 *dst = (__be32 *)out;
int blocks;
int i;

if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
sha1_update(desc, data, len);
return sha1_final(desc, out);
}
ASM_EXPORT(sha1_ce_offsetof_count,
offsetof(struct sha1_ce_state, sst.count));
ASM_EXPORT(sha1_ce_offsetof_finalize,
offsetof(struct sha1_ce_state, finalize));

/*
* Use a fast path if the input is a multiple of 64 bytes. In
* this case, there is no need to copy data around, and we can
* perform the entire digest calculation in a single invocation
* of sha1_ce_transform()
* Allow the asm code to perform the finalization if there is no
* partial data and the input is a round multiple of the block size.
*/
blocks = len / SHA1_BLOCK_SIZE;
sctx->finalize = finalize;

kernel_neon_begin_partial(16);
sha1_ce_transform(blocks, data, sctx->state, NULL, len);
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
if (!finalize)
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();

for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
put_unaligned_be32(sctx->state[i], dst++);

*sctx = (struct sha1_state){};
return 0;
return sha1_base_finish(desc, out);
}

static int sha1_export(struct shash_desc *desc, void *out)
static int sha1_ce_final(struct shash_desc *desc, u8 *out)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
struct sha1_state *dst = out;

*dst = *sctx;
return 0;
}

static int sha1_import(struct shash_desc *desc, const void *in)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
struct sha1_state const *src = in;

*sctx = *src;
return 0;
kernel_neon_begin_partial(16);
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
}

static struct shash_alg alg = {
.init = sha1_init,
.update = sha1_update,
.final = sha1_final,
.finup = sha1_finup,
.export = sha1_export,
.import = sha1_import,
.descsize = sizeof(struct sha1_state),
.init = sha1_base_init,
.update = sha1_ce_update,
.final = sha1_ce_final,
.finup = sha1_ce_finup,
.descsize = sizeof(struct sha1_ce_state),
.digestsize = SHA1_DIGEST_SIZE,
.statesize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
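The payoff of the new finalize field is visible in sha1_ce_finup() above: when nothing is buffered (sst.count == 0) and the trailing data is a whole number of blocks, the flag tells the assembly to append the padding block itself, so the entire digest runs inside a single kernel_neon_begin/end section. Below is a hedged usage sketch from the caller's side; it uses the standard crypto_shash API as it existed at the time, requests the driver by its cra_driver_name purely for illustration, and trims error handling to the essentials.

/* Illustrative caller, not part of the commit: one-shot SHA-1 via "sha1-ce". */
#include <crypto/hash.h>
#include <linux/err.h>

static int sha1_ce_demo(const u8 *data, unsigned int len, u8 *digest /* 20 bytes */)
{
        struct crypto_shash *tfm;
        int err;

        tfm = crypto_alloc_shash("sha1-ce", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        {
                SHASH_DESC_ON_STACK(desc, tfm);

                desc->tfm = tfm;
                desc->flags = 0;        /* shash_desc still had a flags field in 2015 */

                /*
                 * The one-shot path ends up in ->finup; if len is a multiple
                 * of SHA1_BLOCK_SIZE (64), finalize is set and the assembly
                 * handles the padding block in the same NEON region.
                 */
                err = crypto_shash_digest(desc, data, len, digest);
        }

        crypto_free_shash(tfm);
        return err;
}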
