Skip to content

Commit

Permalink
s390/fpu: always enable the vector facility if it is available
Browse files Browse the repository at this point in the history
If the kernel detects that the s390 hardware supports the vector
facility, it is enabled by default at an early stage.  To force
it off, use the novx kernel parameter.  Note that there is a small
time window, where the vector facility is enabled before it is
forced to be off.

With enabling the vector facility by default, the FPU save and
restore functions can be improved.  They do not longer require
to manage expensive control register updates to enable or disable
the vector enablement control for particular processes.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
  • Loading branch information
Hendrik Brueckner authored and Martin Schwidefsky committed Oct 14, 2015
1 parent 395e6aa commit b5510d9
Show file tree
Hide file tree
Showing 13 changed files with 70 additions and 199 deletions.
2 changes: 0 additions & 2 deletions arch/s390/include/asm/ctl_reg.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ static inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
__ctl_load(reg, cr, cr);
}

void __ctl_set_vx(void);

void smp_ctl_set_bit(int cr, int bit);
void smp_ctl_clear_bit(int cr, int bit);

Expand Down
14 changes: 2 additions & 12 deletions arch/s390/include/asm/fpu-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@
#ifndef _ASM_S390_FPU_INTERNAL_H
#define _ASM_S390_FPU_INTERNAL_H

#define FPU_USE_VX 1 /* Vector extension is active */

#ifndef __ASSEMBLY__

#include <linux/errno.h>
#include <linux/string.h>
#include <asm/linkage.h>
Expand All @@ -20,7 +16,6 @@

struct fpu {
__u32 fpc; /* Floating-point control */
__u32 flags;
union {
void *regs;
freg_t *fprs; /* Floating-point register save area */
Expand All @@ -30,9 +25,6 @@ struct fpu {

void save_fpu_regs(void);

#define is_vx_fpu(fpu) (!!((fpu)->flags & FPU_USE_VX))
#define is_vx_task(tsk) (!!((tsk)->thread.fpu.flags & FPU_USE_VX))

/* VX array structure for address operand constraints in inline assemblies */
struct vx_array { __vector128 _[__NUM_VXRS]; };

Expand Down Expand Up @@ -89,7 +81,7 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
{
fpregs->pad = 0;
if (is_vx_fpu(fpu))
if (MACHINE_HAS_VX)
convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
else
memcpy((freg_t *)&fpregs->fprs, fpu->fprs,
Expand All @@ -98,13 +90,11 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)

static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
{
if (is_vx_fpu(fpu))
if (MACHINE_HAS_VX)
convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
else
memcpy(fpu->fprs, (freg_t *)&fpregs->fprs,
sizeof(fpregs->fprs));
}

#endif

#endif /* _ASM_S390_FPU_INTERNAL_H */
1 change: 0 additions & 1 deletion arch/s390/kernel/asm-offsets.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ int main(void)
BLANK();
DEFINE(__THREAD_ksp, offsetof(struct thread_struct, ksp));
DEFINE(__THREAD_FPU_fpc, offsetof(struct thread_struct, fpu.fpc));
DEFINE(__THREAD_FPU_flags, offsetof(struct thread_struct, fpu.flags));
DEFINE(__THREAD_FPU_regs, offsetof(struct thread_struct, fpu.regs));
DEFINE(__THREAD_per_cause, offsetof(struct thread_struct, per_event.cause));
DEFINE(__THREAD_per_address, offsetof(struct thread_struct, per_event.address));
Expand Down
7 changes: 3 additions & 4 deletions arch/s390/kernel/compat_signal.c
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ static int save_sigregs_ext32(struct pt_regs *regs,
return -EFAULT;

/* Save vector registers to signal stack */
if (is_vx_task(current)) {
if (MACHINE_HAS_VX) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
Expand Down Expand Up @@ -277,7 +277,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
*(__u32 *)&regs->gprs[i] = gprs_high[i];

/* Restore vector registers from signal stack */
if (is_vx_task(current)) {
if (MACHINE_HAS_VX) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
__copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
Expand Down Expand Up @@ -470,8 +470,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
*/
uc_flags = UC_GPRS_HIGH;
if (MACHINE_HAS_VX) {
if (is_vx_task(current))
uc_flags |= UC_VXRS;
uc_flags |= UC_VXRS;
} else
frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) +
sizeof(frame->uc.uc_mcontext_ext.vxrs_high);
Expand Down
12 changes: 11 additions & 1 deletion arch/s390/kernel/early.c
Original file line number Diff line number Diff line change
Expand Up @@ -329,9 +329,19 @@ static __init void detect_machine_facilities(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
if (test_facility(51))
S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
if (test_facility(129))
if (test_facility(129)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_VX;
__ctl_set_bit(0, 17);
}
}

static int __init disable_vector_extension(char *str)
{
S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
__ctl_clear_bit(0, 17);
return 1;
}
early_param("novx", disable_vector_extension);

static int __init cad_setup(char *str)
{
Expand Down
102 changes: 21 additions & 81 deletions arch/s390/kernel/entry.S
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
#include <asm/page.h>
#include <asm/sigp.h>
#include <asm/irq.h>
#include <asm/fpu-internal.h>
#include <asm/vx-insn.h>

__PT_R0 = __PT_GPRS
Expand Down Expand Up @@ -748,15 +747,12 @@ ENTRY(psw_idle)
br %r14
.Lpsw_idle_end:

/* Store floating-point controls and floating-point or vector extension
* registers instead. A critical section cleanup assures that the registers
* are stored even if interrupted for some other work. The register %r2
* designates a struct fpu to store register contents. If the specified
* structure does not contain a register save area, the register store is
* omitted (see also comments in arch_dup_task_struct()).
*
* The CIF_FPU flag is set in any case. The CIF_FPU triggers a lazy restore
* of the register contents at system call or io return.
/*
* Store floating-point controls and floating-point or vector register
* depending whether the vector facility is available. A critical section
* cleanup assures that the registers are stored even if interrupted for
* some other work. The CIF_FPU flag is set to trigger a lazy restore
* of the register contents at return from io or a system call.
*/
ENTRY(save_fpu_regs)
lg %r2,__LC_CURRENT
Expand All @@ -768,7 +764,7 @@ ENTRY(save_fpu_regs)
lg %r3,__THREAD_FPU_regs(%r2)
ltgr %r3,%r3
jz .Lsave_fpu_regs_done # no save area -> set CIF_FPU
tm __THREAD_FPU_flags+3(%r2),FPU_USE_VX
tm __LC_MACHINE_FLAGS+5,4 # MACHINE_HAS_VX
jz .Lsave_fpu_regs_fp # no -> store FP regs
.Lsave_fpu_regs_vx_low:
VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3)
Expand Down Expand Up @@ -797,41 +793,30 @@ ENTRY(save_fpu_regs)
br %r14
.Lsave_fpu_regs_end:

/* Load floating-point controls and floating-point or vector extension
* registers. A critical section cleanup assures that the register contents
* are loaded even if interrupted for some other work. Depending on the saved
* FP/VX state, the vector-enablement control, CR0.46, is either set or cleared.
/*
* Load floating-point controls and floating-point or vector registers.
* A critical section cleanup assures that the register contents are
* loaded even if interrupted for some other work.
*
* There are special calling conventions to fit into sysc and io return work:
* %r15: <kernel stack>
* The function requires:
* %r4 and __SF_EMPTY+32(%r15)
* %r4
*/
load_fpu_regs:
lg %r4,__LC_CURRENT
aghi %r4,__TASK_thread
tm __LC_CPU_FLAGS+7,_CIF_FPU
bnor %r14
lfpc __THREAD_FPU_fpc(%r4)
stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0
tm __THREAD_FPU_flags+3(%r4),FPU_USE_VX # VX-enabled task ?
tm __LC_MACHINE_FLAGS+5,4 # MACHINE_HAS_VX
lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area
jz .Lload_fpu_regs_fp_ctl # -> no VX, load FP regs
.Lload_fpu_regs_vx_ctl:
tm __SF_EMPTY+32+5(%r15),2 # test VX control
jo .Lload_fpu_regs_vx
oi __SF_EMPTY+32+5(%r15),2 # set VX control
lctlg %c0,%c0,__SF_EMPTY+32(%r15)
jz .Lload_fpu_regs_fp # -> no VX, load FP regs
.Lload_fpu_regs_vx:
VLM %v0,%v15,0,%r4
.Lload_fpu_regs_vx_high:
VLM %v16,%v31,256,%r4
j .Lload_fpu_regs_done
.Lload_fpu_regs_fp_ctl:
tm __SF_EMPTY+32+5(%r15),2 # test VX control
jz .Lload_fpu_regs_fp
ni __SF_EMPTY+32+5(%r15),253 # clear VX control
lctlg %c0,%c0,__SF_EMPTY+32(%r15)
.Lload_fpu_regs_fp:
ld 0,0(%r4)
ld 1,8(%r4)
Expand All @@ -854,16 +839,6 @@ load_fpu_regs:
br %r14
.Lload_fpu_regs_end:

/* Test and set the vector enablement control in CR0.46 */
ENTRY(__ctl_set_vx)
stctg %c0,%c0,__SF_EMPTY(%r15)
tm __SF_EMPTY+5(%r15),2
bor %r14
oi __SF_EMPTY+5(%r15),2
lctlg %c0,%c0,__SF_EMPTY(%r15)
br %r14
.L__ctl_set_vx_end:

.L__critical_end:

/*
Expand Down Expand Up @@ -1019,10 +994,6 @@ cleanup_critical:
jl 0f
clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end
jl .Lcleanup_load_fpu_regs
clg %r9,BASED(.Lcleanup_table+112) # __ctl_set_vx
jl 0f
clg %r9,BASED(.Lcleanup_table+120) # .L__ctl_set_vx_end
jl .Lcleanup___ctl_set_vx
0: br %r14

.align 8
Expand All @@ -1041,8 +1012,6 @@ cleanup_critical:
.quad .Lsave_fpu_regs_end
.quad load_fpu_regs
.quad .Lload_fpu_regs_end
.quad __ctl_set_vx
.quad .L__ctl_set_vx_end

#if IS_ENABLED(CONFIG_KVM)
.Lcleanup_table_sie:
Expand Down Expand Up @@ -1226,7 +1195,7 @@ cleanup_critical:
lg %r3,__THREAD_FPU_regs(%r2)
ltgr %r3,%r3
jz 5f # no save area -> set CIF_FPU
tm __THREAD_FPU_flags+3(%r2),FPU_USE_VX
tm __LC_MACHINE_FLAGS+5,4 # MACHINE_HAS_VX
jz 4f # no VX -> store FP regs
2: # Store vector registers (V0-V15)
VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3)
Expand Down Expand Up @@ -1272,37 +1241,21 @@ cleanup_critical:
jhe 1f
clg %r9,BASED(.Lcleanup_load_fpu_regs_fp)
jhe 2f
clg %r9,BASED(.Lcleanup_load_fpu_regs_fp_ctl)
jhe 3f
clg %r9,BASED(.Lcleanup_load_fpu_regs_vx_high)
jhe 4f
jhe 3f
clg %r9,BASED(.Lcleanup_load_fpu_regs_vx)
jhe 5f
clg %r9,BASED(.Lcleanup_load_fpu_regs_vx_ctl)
jhe 6f
jhe 4f
lg %r4,__LC_CURRENT
aghi %r4,__TASK_thread
lfpc __THREAD_FPU_fpc(%r4)
tm __THREAD_FPU_flags+3(%r4),FPU_USE_VX # VX-enabled task ?
tm __LC_MACHINE_FLAGS+5,4 # MACHINE_HAS_VX
lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area
jz 3f # -> no VX, load FP regs
6: # Set VX-enablement control
stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0
tm __SF_EMPTY+32+5(%r15),2 # test VX control
jo 5f
oi __SF_EMPTY+32+5(%r15),2 # set VX control
lctlg %c0,%c0,__SF_EMPTY+32(%r15)
5: # Load V0 ..V15 registers
jz 2f # -> no VX, load FP regs
4: # Load V0 ..V15 registers
VLM %v0,%v15,0,%r4
4: # Load V16..V31 registers
3: # Load V16..V31 registers
VLM %v16,%v31,256,%r4
j 1f
3: # Clear VX-enablement control for FP
stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0
tm __SF_EMPTY+32+5(%r15),2 # test VX control
jz 2f
ni __SF_EMPTY+32+5(%r15),253 # clear VX control
lctlg %c0,%c0,__SF_EMPTY+32(%r15)
2: # Load floating-point registers
ld 0,0(%r4)
ld 1,8(%r4)
Expand All @@ -1324,28 +1277,15 @@ cleanup_critical:
ni __LC_CPU_FLAGS+7,255-_CIF_FPU
lg %r9,48(%r11) # return from load_fpu_regs
br %r14
.Lcleanup_load_fpu_regs_vx_ctl:
.quad .Lload_fpu_regs_vx_ctl
.Lcleanup_load_fpu_regs_vx:
.quad .Lload_fpu_regs_vx
.Lcleanup_load_fpu_regs_vx_high:
.quad .Lload_fpu_regs_vx_high
.Lcleanup_load_fpu_regs_fp_ctl:
.quad .Lload_fpu_regs_fp_ctl
.Lcleanup_load_fpu_regs_fp:
.quad .Lload_fpu_regs_fp
.Lcleanup_load_fpu_regs_done:
.quad .Lload_fpu_regs_done

.Lcleanup___ctl_set_vx:
stctg %c0,%c0,__SF_EMPTY(%r15)
tm __SF_EMPTY+5(%r15),2
bor %r14
oi __SF_EMPTY+5(%r15),2
lctlg %c0,%c0,__SF_EMPTY(%r15)
lg %r9,48(%r11) # return from __ctl_set_vx
br %r14

/*
* Integer constants
*/
Expand Down
2 changes: 0 additions & 2 deletions arch/s390/kernel/entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ void psw_idle(struct s390_idle_data *, unsigned long);
asmlinkage long do_syscall_trace_enter(struct pt_regs *regs);
asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);

int alloc_vector_registers(struct task_struct *tsk);

void do_protection_exception(struct pt_regs *regs);
void do_dat_exception(struct pt_regs *regs);

Expand Down
32 changes: 15 additions & 17 deletions arch/s390/kernel/process.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,31 +87,29 @@ void arch_release_task_struct(struct task_struct *tsk)

int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
size_t fpu_regs_size;

*dst = *src;

/* Set up a new floating-point register save area */
dst->thread.fpu.fpc = 0;
dst->thread.fpu.flags = 0; /* Always start with VX disabled */
dst->thread.fpu.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
GFP_KERNEL|__GFP_REPEAT);
if (!dst->thread.fpu.fprs)
/*
* If the vector extension is available, it is enabled for all tasks,
* and, thus, the FPU register save area must be allocated accordingly.
*/
fpu_regs_size = MACHINE_HAS_VX ? sizeof(__vector128) * __NUM_VXRS
: sizeof(freg_t) * __NUM_FPRS;
dst->thread.fpu.regs = kzalloc(fpu_regs_size, GFP_KERNEL|__GFP_REPEAT);
if (!dst->thread.fpu.regs)
return -ENOMEM;

/*
* Save the floating-point or vector register state of the current
* task. The state is not saved for early kernel threads, for example,
* the init_task, which do not have an allocated save area.
* The CIF_FPU flag is set in any case to lazy clear or restore a saved
* state when switching to a different task or returning to user space.
* task and set the CIF_FPU flag to lazy restore the FPU register
* state when returning to user space.
*/
save_fpu_regs();
dst->thread.fpu.fpc = current->thread.fpu.fpc;
if (is_vx_task(current))
convert_vx_to_fp(dst->thread.fpu.fprs,
current->thread.fpu.vxrs);
else
memcpy(dst->thread.fpu.fprs, current->thread.fpu.fprs,
sizeof(freg_t) * __NUM_FPRS);
memcpy(dst->thread.fpu.regs, current->thread.fpu.regs, fpu_regs_size);

return 0;
}

Expand Down Expand Up @@ -199,7 +197,7 @@ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
save_fpu_regs();
fpregs->fpc = current->thread.fpu.fpc;
fpregs->pad = 0;
if (is_vx_task(current))
if (MACHINE_HAS_VX)
convert_vx_to_fp((freg_t *)&fpregs->fprs,
current->thread.fpu.vxrs);
else
Expand Down
Loading

0 comments on commit b5510d9

Please sign in to comment.