From f781a9e96138d8839663af5e88649ab1fbed74f8 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 22 Mar 2016 07:46:56 -0700 Subject: [PATCH] Set index_arch_AVX_Fast_Unaligned_Load only for Intel processors Since only Intel processors with AVX2 have fast unaligned load, we should set index_arch_AVX_Fast_Unaligned_Load only for Intel processors. Move AVX, AVX2, AVX512, FMA and FMA4 detection into get_common_indeces and call get_common_indeces for other processors. Add CPU_FEATURES_CPU_P and CPU_FEATURES_ARCH_P to aoid loading GLRO(dl_x86_cpu_features) in cpu-features.c. [BZ #19583] * sysdeps/x86/cpu-features.c (get_common_indeces): Remove inline. Check family before setting family, model and extended_model. Set AVX, AVX2, AVX512, FMA and FMA4 usable bits here. (init_cpu_features): Replace HAS_CPU_FEATURE and HAS_ARCH_FEATURE with CPU_FEATURES_CPU_P and CPU_FEATURES_ARCH_P. Set index_arch_AVX_Fast_Unaligned_Load for Intel processors with usable AVX2. Call get_common_indeces for other processors with family == NULL. * sysdeps/x86/cpu-features.h (CPU_FEATURES_CPU_P): New macro. (CPU_FEATURES_ARCH_P): Likewise. (HAS_CPU_FEATURE): Use CPU_FEATURES_CPU_P. (HAS_ARCH_FEATURE): Use CPU_FEATURES_ARCH_P. --- ChangeLog | 17 +++++ sysdeps/x86/cpu-features.c | 152 +++++++++++++++++++------------------ sysdeps/x86/cpu-features.h | 10 ++- 3 files changed, 105 insertions(+), 74 deletions(-) diff --git a/ChangeLog b/ChangeLog index 36360d9134..522bc4fbc8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2016-03-22 H.J. Lu + + [BZ #19583] + * sysdeps/x86/cpu-features.c (get_common_indeces): Remove + inline. Check family before setting family, model and + extended_model. Set AVX, AVX2, AVX512, FMA and FMA4 usable + bits here. + (init_cpu_features): Replace HAS_CPU_FEATURE and + HAS_ARCH_FEATURE with CPU_FEATURES_CPU_P and + CPU_FEATURES_ARCH_P. Set index_arch_AVX_Fast_Unaligned_Load + for Intel processors with usable AVX2. Call get_common_indeces + for other processors with family == NULL. + * sysdeps/x86/cpu-features.h (CPU_FEATURES_CPU_P): New macro. + (CPU_FEATURES_ARCH_P): Likewise. + (HAS_CPU_FEATURE): Use CPU_FEATURES_CPU_P. + (HAS_ARCH_FEATURE): Use CPU_FEATURES_ARCH_P. + 2016-03-22 Samuel Thibault * malloc/Makefile ($(objpfx)tst-malloc-backtrace, diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 1787716220..c8f81efd03 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -19,23 +19,79 @@ #include #include -static inline void +static void get_common_indeces (struct cpu_features *cpu_features, unsigned int *family, unsigned int *model, unsigned int *extended_model) { - unsigned int eax; - __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx, - cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx, - cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx); - GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax; - *family = (eax >> 8) & 0x0f; - *model = (eax >> 4) & 0x0f; - *extended_model = (eax >> 12) & 0xf0; - if (*family == 0x0f) + if (family) { - *family += (eax >> 20) & 0xff; - *model += *extended_model; + unsigned int eax; + __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx, + cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx, + cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx); + cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax; + *family = (eax >> 8) & 0x0f; + *model = (eax >> 4) & 0x0f; + *extended_model = (eax >> 12) & 0xf0; + if (*family == 0x0f) + { + *family += (eax >> 20) & 0xff; + *model += *extended_model; + } + } + + if (cpu_features->max_cpuid >= 7) + __cpuid_count (7, 0, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx, + cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx); + + /* Can we call xgetbv? */ + if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE)) + { + unsigned int xcrlow; + unsigned int xcrhigh; + asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); + /* Is YMM and XMM state usable? */ + if ((xcrlow & (bit_YMM_state | bit_XMM_state)) == + (bit_YMM_state | bit_XMM_state)) + { + /* Determine if AVX is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, AVX)) + cpu_features->feature[index_arch_AVX_Usable] + |= bit_arch_AVX_Usable; + /* Determine if AVX2 is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, AVX2)) + cpu_features->feature[index_arch_AVX2_Usable] + |= bit_arch_AVX2_Usable; + /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and + ZMM16-ZMM31 state are enabled. */ + if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state + | bit_ZMM16_31_state)) == + (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state)) + { + /* Determine if AVX512F is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, AVX512F)) + { + cpu_features->feature[index_arch_AVX512F_Usable] + |= bit_arch_AVX512F_Usable; + /* Determine if AVX512DQ is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, AVX512DQ)) + cpu_features->feature[index_arch_AVX512DQ_Usable] + |= bit_arch_AVX512DQ_Usable; + } + } + /* Determine if FMA is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, FMA)) + cpu_features->feature[index_arch_FMA_Usable] + |= bit_arch_FMA_Usable; + /* Determine if FMA4 is usable. */ + if (CPU_FEATURES_CPU_P (cpu_features, FMA4)) + cpu_features->feature[index_arch_FMA4_Usable] + |= bit_arch_FMA4_Usable; + } } } @@ -135,6 +191,12 @@ init_cpu_features (struct cpu_features *cpu_features) break; } } + + /* Unaligned load with 256-bit AVX registers are faster on + Intel processors with AVX2. */ + if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load] + |= bit_arch_AVX_Fast_Unaligned_Load; } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) @@ -165,73 +227,19 @@ init_cpu_features (struct cpu_features *cpu_features) } } else - kind = arch_kind_other; + { + kind = arch_kind_other; + get_common_indeces (cpu_features, NULL, NULL, NULL); + } /* Support i586 if CX8 is available. */ - if (HAS_CPU_FEATURE (CX8)) + if (CPU_FEATURES_CPU_P (cpu_features, CX8)) cpu_features->feature[index_arch_I586] |= bit_arch_I586; /* Support i686 if CMOV is available. */ - if (HAS_CPU_FEATURE (CMOV)) + if (CPU_FEATURES_CPU_P (cpu_features, CMOV)) cpu_features->feature[index_arch_I686] |= bit_arch_I686; - if (cpu_features->max_cpuid >= 7) - __cpuid_count (7, 0, - cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax, - cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx, - cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx, - cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx); - - /* Can we call xgetbv? */ - if (HAS_CPU_FEATURE (OSXSAVE)) - { - unsigned int xcrlow; - unsigned int xcrhigh; - asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); - /* Is YMM and XMM state usable? */ - if ((xcrlow & (bit_YMM_state | bit_XMM_state)) == - (bit_YMM_state | bit_XMM_state)) - { - /* Determine if AVX is usable. */ - if (HAS_CPU_FEATURE (AVX)) - cpu_features->feature[index_arch_AVX_Usable] - |= bit_arch_AVX_Usable; -#if index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load -# error index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load -#endif - /* Determine if AVX2 is usable. Unaligned load with 256-bit - AVX registers are faster on processors with AVX2. */ - if (HAS_CPU_FEATURE (AVX2)) - cpu_features->feature[index_arch_AVX2_Usable] - |= bit_arch_AVX2_Usable | bit_arch_AVX_Fast_Unaligned_Load; - /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and - ZMM16-ZMM31 state are enabled. */ - if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state - | bit_ZMM16_31_state)) == - (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state)) - { - /* Determine if AVX512F is usable. */ - if (HAS_CPU_FEATURE (AVX512F)) - { - cpu_features->feature[index_arch_AVX512F_Usable] - |= bit_arch_AVX512F_Usable; - /* Determine if AVX512DQ is usable. */ - if (HAS_CPU_FEATURE (AVX512DQ)) - cpu_features->feature[index_arch_AVX512DQ_Usable] - |= bit_arch_AVX512DQ_Usable; - } - } - /* Determine if FMA is usable. */ - if (HAS_CPU_FEATURE (FMA)) - cpu_features->feature[index_arch_FMA_Usable] - |= bit_arch_FMA_Usable; - /* Determine if FMA4 is usable. */ - if (HAS_CPU_FEATURE (FMA4)) - cpu_features->feature[index_arch_FMA4_Usable] - |= bit_arch_FMA4_Usable; - } - } - #if !HAS_CPUID no_cpuid: #endif diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index 0624a92b2f..e06eb7e41b 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -204,11 +204,17 @@ extern const struct cpu_features *__get_cpu_features (void) # endif +/* Only used directly in cpu-features.c. */ +# define CPU_FEATURES_CPU_P(ptr, name) \ + ((ptr->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0) +# define CPU_FEATURES_ARCH_P(ptr, name) \ + ((ptr->feature[index_arch_##name] & (bit_arch_##name)) != 0) + /* HAS_* evaluates to true if we may use the feature at runtime. */ # define HAS_CPU_FEATURE(name) \ - ((__get_cpu_features ()->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0) + CPU_FEATURES_CPU_P (__get_cpu_features (), name) # define HAS_ARCH_FEATURE(name) \ - ((__get_cpu_features ()->feature[index_arch_##name] & (bit_arch_##name)) != 0) + CPU_FEATURES_ARCH_P (__get_cpu_features (), name) # define index_cpu_CX8 COMMON_CPUID_INDEX_1 # define index_cpu_CMOV COMMON_CPUID_INDEX_1