crypto: cast6/avx - avoid using temporary stack buffers
Introduce new assembler functions to avoid the use of temporary stack buffers
in glue code. This also allows the use of vector instructions for XORing
output in CTR and CBC modes and for constructing IVs in CTR mode.

ECB mode sees a ~0.5% decrease in speed because one extra function call was
added. CBC mode decryption and CTR mode benefit from the vector operations
and gain ~2%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Authored by Jussi Kivilinna, committed by Herbert Xu on Oct 24, 2012
Parent 5899098 · Commit cba1cce
Showing 3 changed files with 227 additions and 125 deletions.
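
To make the change concrete before the diff, here is a rough C model of the
new cast6_cbc_dec_8way() contract. This is an illustrative sketch under
assumed semantics, not the kernel's code; the helper names mirror functions
visible in the diff, and the real routine does all of this with vector loads
and XORs in xmm registers:

	/* Sketch only: assumed semantics of the 8-way CBC-decrypt entry
	 * point. Assumes dst != src for clarity; the real AVX code keeps
	 * the ciphertext blocks in registers, so it also works in place. */
	static void cast6_cbc_dec_8way_model(struct cast6_ctx *ctx, u8 *dst,
					     const u8 *src)
	{
		u128 *d = (u128 *)dst;
		const u128 *s = (const u128 *)src;
		int i;

		/* decrypt eight blocks; __cast6_dec_blk8 does this in
		 * parallel in xmm registers */
		for (i = 0; i < 8; i++)
			__cast6_decrypt(ctx, (u8 *)&d[i], (const u8 *)&s[i]);

		/* CBC chaining: XOR each plaintext with the previous
		 * ciphertext block; XORing block 0 with the IV is left to
		 * the common glue code */
		for (i = 7; i >= 1; i--)
			u128_xor(&d[i], &d[i], &s[i - 1]);
	}

This is exactly the work the removed cast6_decrypt_cbc_xway() helper did with
a temporary ivs[] stack array; moving it into the assembler is what
eliminates that buffer.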
190 changes: 123 additions & 67 deletions arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
*
*/

#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast6_s1
@@ -205,51 +207,29 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;

#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
vpshufb rmask, x3, x3;

.data

.align 16
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@

.text

.align 16
.global __cast6_enc_blk_8way
.type __cast6_enc_blk_8way,@function;
.align 8
.type __cast6_enc_blk8,@function;

__cast6_enc_blk_8way:
__cast6_enc_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/

pushq %rbp;
pushq %rbx;
pushq %rcx;

vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;

leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

movq %rsi, %r11;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

preload_rkr(0, dummy, none);
Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
QBAR(10);
QBAR(11);

popq %rcx;
popq %rbx;
popq %rbp;

vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax;

testb %cl, %cl;
jnz __enc_xor8;

outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

ret;

__enc_xor8:
outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

ret;

.align 16
.global cast6_dec_blk_8way
.type cast6_dec_blk_8way,@function;
.align 8
.type __cast6_dec_blk8,@function;

cast6_dec_blk_8way:
__cast6_dec_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/

pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;

leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

movq %rsi, %r11;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
popq %rbp;

vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

ret;

.align 8
.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;

cast6_ecb_enc_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/

movq %rsi, %r11;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __cast6_enc_blk8;

store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;

.align 8
.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;

cast6_ecb_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/

movq %rsi, %r11;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __cast6_dec_blk8;

store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

ret;

.align 8
.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;

cast6_cbc_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/

pushq %r12;

movq %rsi, %r11;
movq %rdx, %r12;

load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

call __cast6_dec_blk8;

store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

popq %r12;

ret;

.align 8
.global cast6_ctr_8way
.type cast6_ctr_8way,@function;

cast6_ctr_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/

pushq %r12;

movq %rsi, %r11;
movq %rdx, %r12;

load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RX, RKR, RKM);

call __cast6_enc_blk8;

store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

popq %r12;

ret;
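
The load/store helpers used above come from the newly included
glue_helper-asm-avx.S. As a C-flavoured sketch of their assumed per-macro
semantics (the macros themselves are AVX assembler built on vmovdqu, vpshufb
and vpxor; the names x[], ctrblk[] and keystream[] below are purely
illustrative):

	/* load_8way(src, x0..x7):  eight 16-byte loads into xmm registers
	 * store_8way(dst, x0..x7): eight 16-byte stores from xmm registers */

	/* store_cbc_8way(src, dst, x0..x7), mirroring the removed C helper
	 * cast6_decrypt_cbc_xway() in the glue file below: */
	for (i = 7; i >= 1; i--)
		u128_xor(&dst[i], &x[i], &src[i - 1]);	/* chain previous ct */
	dst[0] = x[0];		/* block 0: IV XOR is the caller's job */

	/* load_ctr_8way(iv, .Lbswap128_mask, x0..x7, ...): build eight
	 * big-endian counter blocks from the little-endian 128-bit IV and
	 * advance the IV by eight: */
	for (i = 0; i < 8; i++) {
		le128_to_be128(&ctrblk[i], iv);	/* byte swap, vpshufb */
		le128_inc(iv);			/* 128-bit increment */
	}

	/* store_ctr_8way(src, dst, x0..x7): XOR the encrypted counters
	 * (the keystream) with the source blocks and store: */
	for (i = 0; i < 8; i++)
		u128_xor(&dst[i], &src[i], &keystream[i]);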
71 changes: 13 additions & 58 deletions arch/x86/crypto/cast6_avx_glue.c
@@ -40,43 +40,15 @@

#define CAST6_PARALLEL_BLOCKS 8

asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);

static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast6_enc_blk_8way(ctx, dst, src, false);
}

static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast6_enc_blk_8way(ctx, dst, src, true);
}

static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
cast6_dec_blk_8way(ctx, dst, src);
}


static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
unsigned int j;

for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];

cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
le128 *iv);

static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
@@ -89,30 +61,13 @@ static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128_xor(dst, src, (u128 *)&ctrblk);
}

static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
le128 *iv)
{
be128 ctrblks[CAST6_PARALLEL_BLOCKS];
unsigned int i;

for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];

le128_to_be128(&ctrblks[i], iv);
le128_inc(iv);
}

cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}

static const struct common_glue_ctx cast6_enc = {
.num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {

.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}

@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);

if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}


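For context on how these tables are consumed: the common glue walk tries the
widest function first, so full 8-block chunks hit the new _8way entry points
and only the tail falls back to the 1-block functions. A simplified, assumed
sketch of that dispatch loop (gctx, wsrc, wdst and bsize are illustrative
names, not the exact glue_helper code):

	while (nbytes >= bsize) {
		for (i = 0; i < gctx->num_funcs; i++) {
			unsigned int func_bytes =
				bsize * gctx->funcs[i].num_blocks;

			if (nbytes < func_bytes)
				continue;

			/* e.g. cast6_ecb_enc_8way or __cast6_encrypt */
			gctx->funcs[i].fn_u.ecb(ctx, wdst, wsrc);
			wsrc += func_bytes;
			wdst += func_bytes;
			nbytes -= func_bytes;
			break;
		}
	}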