arm64/crypto: improve performance of GHASH algorithm
This patch modifies the GHASH secure hash implementation to switch to a
faster, polynomial-multiplication-based reduction instead of one that uses
shifts and rotates.
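
For background, the stand-alone sketch below is purely illustrative and is not part of the patch: it models the idea of a multiplication-based reduction in plain C, folding the 256-bit carry-less product back into 128 bits with two further carry-less multiplications by the low half of the field polynomial x^128 + x^7 + x^2 + x + 1, instead of a chain of shifts and XORs. It assumes the GCC/Clang unsigned __int128 extension, uses the straightforward (non-reflected) bit order for clarity (the actual assembly works on a bit-reflected representation and folds with the constant 0xe1 << 57), and the names clmul and gf128_mul are invented for this example.

/* Stand-alone model of GF(2^128) multiplication with a multiplication-based
 * reduction; not kernel code.  Bit i represents the coefficient of x^i.
 */
#include <stdio.h>

typedef unsigned __int128 u128;

/* low half of the field polynomial x^128 + x^7 + x^2 + x + 1 */
#define POLY_LO ((u128)0x87)

/* carry-less multiply: 128 x 128 -> 256 bits, returned as lo/hi halves */
static void clmul(u128 a, u128 b, u128 *lo, u128 *hi)
{
        u128 l = 0, h = 0;
        int i;

        for (i = 0; i < 128; i++) {
                if ((b >> i) & 1) {
                        l ^= a << i;
                        if (i)
                                h ^= a >> (128 - i);
                }
        }
        *lo = l;
        *hi = h;
}

/* multiply x by h modulo x^128 + x^7 + x^2 + x + 1 */
static u128 gf128_mul(u128 x, u128 h)
{
        u128 lo, hi, flo, fhi, f2lo, f2hi;

        clmul(x, h, &lo, &hi);

        /* x^128 == x^7 + x^2 + x + 1 (mod g), so fold the high half down
         * with two more carry-less multiplications instead of a chain of
         * shifts and XORs. */
        clmul(hi, POLY_LO, &flo, &fhi);
        lo ^= flo;
        clmul(fhi, POLY_LO, &f2lo, &f2hi);      /* second fold: at most 7 bits remain */
        lo ^= f2lo;

        return lo;
}

int main(void)
{
        u128 a = ((u128)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
        u128 b = ((u128)0x0f0e0d0c0b0a0908ULL << 64) | 0x0706050403020100ULL;
        u128 r = gf128_mul(a, b);

        printf("%016llx%016llx\n",
               (unsigned long long)(r >> 64), (unsigned long long)r);
        return 0;
}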

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Ard Biesheuvel authored and Catalin Marinas committed Jun 18, 2014
1 parent 6aa8b20 commit b913a64
Showing 2 changed files with 40 additions and 56 deletions.
92 changes: 38 additions & 54 deletions arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
  *
  * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- *   Author: Huang Ying <ying.huang@intel.com>
- *           Vinodh Gopal
- *           Erdinc Ozturk
- *           Deniz Karakoyunlu
- *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
  * by the Free Software Foundation.
@@ -19,13 +11,15 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>

-       DATA            .req    v0
-       SHASH           .req    v1
-       IN1             .req    v2
+       SHASH           .req    v0
+       SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
-       T3              .req    v4
-       VZR             .req    v5
+       MASK            .req    v4
+       XL              .req    v5
+       XM              .req    v6
+       XH              .req    v7
+       IN1             .req    v7

        .text
        .arch           armv8-a+crypto
@@ -35,61 +29,51 @@
  *                         struct ghash_key const *k, const char *head)
  */
 ENTRY(pmull_ghash_update)
-       ld1             {DATA.16b}, [x1]
        ld1             {SHASH.16b}, [x3]
-       eor             VZR.16b, VZR.16b, VZR.16b
+       ld1             {XL.16b}, [x1]
+       movi            MASK.16b, #0xe1
+       ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
+       shl             MASK.2d, MASK.2d, #57
+       eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        /* do the head block first, if supplied */
        cbz             x4, 0f
-       ld1             {IN1.2d}, [x4]
+       ld1             {T1.2d}, [x4]
        b               1f

-0:     ld1             {IN1.2d}, [x2], #16
+0:     ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1
-1:     ext             IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE( rev64           IN1.16b, IN1.16b )
-       eor             DATA.16b, DATA.16b, IN1.16b

-       /* multiply DATA by SHASH in GF(2^128) */
-       ext             T2.16b, DATA.16b, DATA.16b, #8
-       ext             T3.16b, SHASH.16b, SHASH.16b, #8
-       eor             T2.16b, T2.16b, DATA.16b
-       eor             T3.16b, T3.16b, SHASH.16b
+1:     /* multiply XL by SHASH in GF(2^128) */
+CPU_LE( rev64           T1.16b, T1.16b )

-       pmull2          T1.1q, SHASH.2d, DATA.2d        // a1 * b1
-       pmull           DATA.1q, SHASH.1d, DATA.1d      // a0 * b0
-       pmull           T2.1q, T2.1d, T3.1d             // (a1 + a0)(b1 + b0)
-       eor             T2.16b, T2.16b, T1.16b          // (a0 * b1) + (a1 * b0)
-       eor             T2.16b, T2.16b, DATA.16b
+       ext             T2.16b, XL.16b, XL.16b, #8
+       ext             IN1.16b, T1.16b, T1.16b, #8
+       eor             T1.16b, T1.16b, T2.16b
+       eor             XL.16b, XL.16b, IN1.16b

-       ext             T3.16b, VZR.16b, T2.16b, #8
-       ext             T2.16b, T2.16b, VZR.16b, #8
-       eor             DATA.16b, DATA.16b, T3.16b
-       eor             T1.16b, T1.16b, T2.16b          // <T1:DATA> is result of
-                                                       // carry-less multiplication
+       pmull2          XH.1q, SHASH.2d, XL.2d          // a1 * b1
+       eor             T1.16b, T1.16b, XL.16b
+       pmull           XL.1q, SHASH.1d, XL.1d          // a0 * b0
+       pmull           XM.1q, SHASH2.1d, T1.1d         // (a1 + a0)(b1 + b0)

-       /* first phase of the reduction */
-       shl             T3.2d, DATA.2d, #1
-       eor             T3.16b, T3.16b, DATA.16b
-       shl             T3.2d, T3.2d, #5
-       eor             T3.16b, T3.16b, DATA.16b
-       shl             T3.2d, T3.2d, #57
-       ext             T2.16b, VZR.16b, T3.16b, #8
-       ext             T3.16b, T3.16b, VZR.16b, #8
-       eor             DATA.16b, DATA.16b, T2.16b
-       eor             T1.16b, T1.16b, T3.16b
+       ext             T1.16b, XL.16b, XH.16b, #8
+       eor             T2.16b, XL.16b, XH.16b
+       eor             XM.16b, XM.16b, T1.16b
+       eor             XM.16b, XM.16b, T2.16b
+       pmull           T2.1q, XL.1d, MASK.1d

-       /* second phase of the reduction */
-       ushr            T2.2d, DATA.2d, #5
-       eor             T2.16b, T2.16b, DATA.16b
-       ushr            T2.2d, T2.2d, #1
-       eor             T2.16b, T2.16b, DATA.16b
-       ushr            T2.2d, T2.2d, #1
-       eor             T1.16b, T1.16b, T2.16b
-       eor             DATA.16b, DATA.16b, T1.16b
+       mov             XH.d[0], XM.d[1]
+       mov             XM.d[1], XL.d[0]
+
+       eor             XL.16b, XM.16b, T2.16b
+       ext             T2.16b, XL.16b, XL.16b, #8
+       pmull           XL.1q, XL.1d, MASK.1d
+       eor             T2.16b, T2.16b, XH.16b
+       eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

-       st1             {DATA.16b}, [x1]
+       st1             {XL.16b}, [x1]
        ret
 ENDPROC(pmull_ghash_update)
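
For readers of the new code above: the three PMULL instructions follow the standard Karatsuba decomposition of the 128-bit carry-less product into 64-bit halves,

    a·b = (a1·b1)·x^128 ⊕ [ (a1⊕a0)·(b1⊕b0) ⊕ a1·b1 ⊕ a0·b0 ]·x^64 ⊕ a0·b0,

with XH holding a1·b1, XL holding a0·b0 and XM accumulating the middle term; the two PMULLs against MASK (the constant 0xe1 shifted left by 57 bits) then perform the two folding steps of the reduction that replace the old shift-and-rotate sequence.
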
4 changes: 2 additions & 2 deletions arch/arm64/crypto/ghash-ce-glue.c
@@ -67,7 +67,7 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
                blocks = len / GHASH_BLOCK_SIZE;
                len %= GHASH_BLOCK_SIZE;

-               kernel_neon_begin_partial(6);
+               kernel_neon_begin_partial(8);
                pmull_ghash_update(blocks, ctx->digest, src, key,
                                   partial ? ctx->buf : NULL);
                kernel_neon_end();
@@ -89,7 +89,7 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)

                memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);

-               kernel_neon_begin_partial(6);
+               kernel_neon_begin_partial(8);
                pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
                kernel_neon_end();
        }
