Commit 76c2289

---
yaml
---
r: 350210
b: refs/heads/master
c: f317820
h: refs/heads/master
v: v3
Jan Beulich authored and Ingo Molnar committed Jan 25, 2013
1 parent adef201 commit 76c2289
Showing 4 changed files with 188 additions and 19 deletions.
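
This commit adds a second family of SSE xor routines, xor_sse_{2,3,4,5}_pf64(), which process 256 bytes per loop iteration and, via the new BLK64() helper, issue a single prefetch per source for each 64-byte cache line rather than multiple prefetches per line as in the existing xor_sse_*() routines, hence the template name "prefetch64-sse". The new routines are registered in both the 32-bit and 64-bit XOR_TRY_TEMPLATES benchmark lists, and XOR_SELECT_TEMPLATE is now defined once in xor.h as AVX_SELECT(FASTEST), so the measured fastest routine is actually used. Two illustrative userspace sketches, which are not part of the patch, follow the xor.h and xor_64.h hunks below.
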
2 changes: 1 addition & 1 deletion [refs]
@@ -1,2 +1,2 @@
---
refs/heads/master: e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33
refs/heads/master: f317820cb6ee3fb173319bf76e0e62437be78ad2
172 changes: 172 additions & 0 deletions trunk/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)

#define BLK64(pf, op, i) \
pf(i) \
op(i, 0) \
op(i + 1, 1) \
op(i + 2, 2) \
op(i + 3, 3)

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -110,6 +118,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
kernel_fpu_end();
}

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
unsigned long lines = bytes >> 8;

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(NOP, ST, i) \

" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");

kernel_fpu_end();
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
@@ -169,6 +211,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
unsigned long lines = bytes >> 8;

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(NOP, ST, i) \

" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines),
[p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");

kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
@@ -235,6 +314,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
unsigned long lines = bytes >> 8;

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(NOP, ST, i) \

" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1),
[p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");

kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
unsigned long lines = bytes >> 8;

kernel_fpu_begin();

asm volatile(
#undef BLOCK
#define BLOCK(i) \
BLK64(PF0, LD, i) \
BLK64(PF1, XO1, i) \
BLK64(PF2, XO2, i) \
BLK64(PF3, XO3, i) \
BLK64(PF4, XO4, i) \
BLK64(NOP, ST, i) \

" .align 32 ;\n"
" 1: ;\n"

BLOCK(0)
BLOCK(4)
BLOCK(8)
BLOCK(12)

" add %[inc], %[p1] ;\n"
" add %[inc], %[p2] ;\n"
" add %[inc], %[p3] ;\n"
" add %[inc], %[p4] ;\n"
" add %[inc], %[p5] ;\n"
" dec %[cnt] ;\n"
" jnz 1b ;\n"
: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
[p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
: "memory");

kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
.name = "prefetch64-sse",
.do_2 = xor_sse_2_pf64,
.do_3 = xor_sse_3_pf64,
.do_4 = xor_sse_4_pf64,
.do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
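
The xor_sse_*_pf64() routines above reuse the LD/ST/XO1/PF0/PF1 helpers defined earlier in this header, outside the hunks shown. As a rough guide to the access pattern only, not the kernel code, here is a self-contained userspace sketch of xor_sse_2_pf64() using SSE intrinsics. It assumes, by analogy with the XO2..XO4 macros visible above, that LD/ST move aligned 16-byte values between p1 and xmm registers, that XO1 xors in 16 bytes from p2, and that PF0/PF1 issue non-temporal prefetches of p1/p2 a fixed distance ahead (taken here as 256 bytes, standing in for the header's PF_OFFS() constant); kernel_fpu_begin()/end() have no userspace counterpart and are omitted.

#include <emmintrin.h>		/* SSE2: __m128i, _mm_load/store/xor_si128 */
#include <xmmintrin.h>		/* _mm_prefetch, _MM_HINT_NTA */

/*
 * Userspace sketch of the xor_sse_2_pf64() access pattern: 256 bytes per
 * loop iteration, one prefetch per source per 64-byte cache line.  Both
 * buffers must be 16-byte aligned and 'bytes' a multiple of 256, as with
 * the kernel routine.  As in the kernel code, the trailing prefetches may
 * reach past the end of the buffers, which is harmless on x86.
 */
static void xor_2_pf64_sketch(unsigned long bytes, unsigned long *p1,
			      unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256-byte chunks, as in the patch */
	char *d = (char *)p1;
	const char *s = (const char *)p2;

	while (lines--) {
		/* BLOCK(0), BLOCK(4), BLOCK(8), BLOCK(12): four 64-byte lines */
		for (int i = 0; i < 4; i++) {
			__m128i x0, x1, x2, x3;

			/* BLK64(PF0, LD, i): one prefetch of p1, then load the line */
			_mm_prefetch(d + i * 64 + 256, _MM_HINT_NTA);
			x0 = _mm_load_si128((const __m128i *)(d + i * 64 +  0));
			x1 = _mm_load_si128((const __m128i *)(d + i * 64 + 16));
			x2 = _mm_load_si128((const __m128i *)(d + i * 64 + 32));
			x3 = _mm_load_si128((const __m128i *)(d + i * 64 + 48));

			/* BLK64(PF1, XO1, i): one prefetch of p2, then xor the line in */
			_mm_prefetch(s + i * 64 + 256, _MM_HINT_NTA);
			x0 = _mm_xor_si128(x0, _mm_load_si128((const __m128i *)(s + i * 64 +  0)));
			x1 = _mm_xor_si128(x1, _mm_load_si128((const __m128i *)(s + i * 64 + 16)));
			x2 = _mm_xor_si128(x2, _mm_load_si128((const __m128i *)(s + i * 64 + 32)));
			x3 = _mm_xor_si128(x3, _mm_load_si128((const __m128i *)(s + i * 64 + 48)));

			/* BLK64(NOP, ST, i): no prefetch, store the result back to p1 */
			_mm_store_si128((__m128i *)(d + i * 64 +  0), x0);
			_mm_store_si128((__m128i *)(d + i * 64 + 16), x1);
			_mm_store_si128((__m128i *)(d + i * 64 + 32), x2);
			_mm_store_si128((__m128i *)(d + i * 64 + 48), x3);
		}
		d += 256;
		s += 256;
	}
}

The same pattern extends to the 3-, 4- and 5-source variants by adding one prefetch-plus-xor pass per extra source, exactly as BLK64(PF2, XO2, i) through BLK64(PF4, XO4, i) do in the hunks above.
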
23 changes: 11 additions & 12 deletions trunk/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
/* Also try the generic routines. */
#include <asm-generic/xor.h>

/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \
AVX_XOR_SPEED; \
if (cpu_has_xmm) \
if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
if (cpu_has_mmx) { \
xor_speed(&xor_block_sse_pf64); \
} else if (cpu_has_mmx) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
} else { \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \
} \
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

#endif /* _ASM_X86_XOR_32_H */
10 changes: 4 additions & 6 deletions trunk/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
/* Also try the AVX routines */
#include <asm/xor_avx.h>

/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
AVX_XOR_SPEED; \
xor_speed(&xor_block_sse_pf64); \
xor_speed(&xor_block_sse); \
} while (0)

/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */
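
XOR_TRY_TEMPLATES and XOR_SELECT_TEMPLATE are consumed by the generic xor code, which benchmarks every registered template and keeps the fastest. The sketch below mimics that flow in userspace with a deliberately simplified stand-in for struct xor_block_template (only a name and a do_2() hook, whereas the initializer for xor_block_sse_pf64 above also sets .do_3/.do_4/.do_5) and a measure() helper standing in for xor_speed(); none of the names beyond those visible in the diff are the kernel's own.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

/* Simplified stand-in for struct xor_block_template; see the
 * xor_block_sse_pf64 initializer above for the fields the real one uses. */
struct xor_tmpl {
	const char *name;
	void (*do_2)(unsigned long bytes, unsigned long *p1, unsigned long *p2);
};

/* Portable reference routine playing the role of one candidate. */
static void xor_2_generic(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	for (unsigned long i = 0; i < bytes / sizeof(long); i++)
		p1[i] ^= p2[i];
}

/* Rough analogue of xor_speed(): time one candidate over a fixed buffer. */
static double measure(const struct xor_tmpl *t, unsigned long *a,
		      unsigned long *b, unsigned long bytes, int reps)
{
	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < reps; i++)
		t->do_2(bytes, a, b);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int main(void)
{
	static unsigned long a[4096 / sizeof(long)], b[4096 / sizeof(long)];
	struct xor_tmpl candidates[] = {
		{ "generic", xor_2_generic },
		/* { "prefetch64-sse", xor_2_pf64_sketch },  -- from the sketch above */
	};
	const struct xor_tmpl *fastest = NULL;
	double best = 0.0;

	/* XOR_TRY_TEMPLATES, in spirit: benchmark every registered candidate. */
	for (unsigned i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
		double t = measure(&candidates[i], a, b, sizeof(a), 10000);

		printf("%-16s %.6f s\n", candidates[i].name, t);
		if (!fastest || t < best) {
			fastest = &candidates[i];
			best = t;
		}
	}

	/* XOR_SELECT_TEMPLATE(FASTEST), in spirit: the measured winner is used;
	 * in the kernel, AVX_SELECT() may still upgrade it to an AVX routine. */
	printf("selected: %s\n", fastest->name);
	return 0;
}

That appears to be the point of moving XOR_SELECT_TEMPLATE into xor.h as AVX_SELECT(FASTEST): the prefetch64-sse variant can now actually be selected whenever it measures faster than the existing SSE routines.
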
