---
r: 344375
b: refs/heads/master
c: cba1cce
h: refs/heads/master
i:
  344373: 8b25abe
  344371: 1757ebd
  344367: 8ede56e
v: v3
Jussi Kivilinna authored and Herbert Xu committed Oct 24, 2012
1 parent 0fc270d commit 82cf8d1
Showing 4 changed files with 228 additions and 126 deletions.
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
 ---
-refs/heads/master: 58990986f1cba40c23c0c10592ace08616de3ffa
+refs/heads/master: cba1cce05498d55f363c28cd2512368e95605518
190 changes: 123 additions & 67 deletions trunk/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
 .extern cast6_s1
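The include above is what lets the inpack/outunpack macros in the next hunk shed their memory operands: glue_helper-asm-avx.S supplies shared 8-block load/store macros that the new mode wrappers call instead. A minimal sketch of the two basic macros, assuming eight contiguous 16-byte blocks (the exact definitions live in glue_helper-asm-avx.S):

	#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
		vmovdqu (0*16)(src), x0; \
		vmovdqu (1*16)(src), x1; \
		vmovdqu (2*16)(src), x2; \
		vmovdqu (3*16)(src), x3; \
		vmovdqu (4*16)(src), x4; \
		vmovdqu (5*16)(src), x5; \
		vmovdqu (6*16)(src), x6; \
		vmovdqu (7*16)(src), x7;

	#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
		vmovdqu x0, (0*16)(dst); \
		vmovdqu x1, (1*16)(dst); \
		vmovdqu x2, (2*16)(dst); \
		vmovdqu x3, (3*16)(dst); \
		vmovdqu x4, (4*16)(dst); \
		vmovdqu x5, (5*16)(dst); \
		vmovdqu x6, (6*16)(dst); \
		vmovdqu x7, (7*16)(dst);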
@@ -205,51 +207,29 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
-	vmovdqu (2*4*4)(in), x2; \
-	vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
 	vpshufb rmask, x3, x3; \
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out); \
-	vmovdqu x2, (2*4*4)(out); \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpshufb rmask, x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
 
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
-
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
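cast6_cbc_dec_8way above saves the source pointer in %r12 because CBC decryption needs the original ciphertext again after __cast6_dec_blk8 has run: each plaintext block is the decrypted block XORed with the previous ciphertext block. A sketch of how a store_cbc_8way macro can implement that chaining, assuming the same 16-byte block layout as load_8way (block 0 is left untouched for the caller to XOR with the IV):

	#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
		vpxor (0*16)(src), x1, x1; \
		vpxor (1*16)(src), x2, x2; \
		vpxor (2*16)(src), x3, x3; \
		vpxor (3*16)(src), x4, x4; \
		vpxor (4*16)(src), x5, x5; \
		vpxor (5*16)(src), x6, x6; \
		vpxor (6*16)(src), x7, x7; \
		store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);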

+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
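cast6_ctr_8way hands the IV to load_ctr_8way together with .Lbswap128_mask (added to the .data section above), which byte-swaps the big-endian counter into little-endian register form so it can be incremented once per block, then shuffles each incremented value back to big-endian byte order to form the actual counter blocks. Since AVX has no 128-bit integer add, a helper along these lines can propagate the carry from the low quadword to the high one; a sketch, where minus_one holds -1 in the low qword and 0 in the high qword (e.g. from vpcmpeqd t0, t0, t0; vpsrldq $8, t0, t0):

	#define inc_le128(x, minus_one, tmp) \
		vpcmpeqq minus_one, x, tmp; /* tmp.lo = all-ones iff x.lo == -1, i.e. the add will carry */ \
		vpsubq minus_one, x, x;     /* x.lo -= -1, i.e. x.lo += 1 (x.hi unchanged) */ \
		vpslldq $8, tmp, tmp;       /* move the carry mask up into the high qword */ \
		vpsubq tmp, x, x;           /* x.hi -= -1 when carrying, i.e. x.hi += 1 */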