From 4d1827485acecac0016eaaec199e97697afdaa84 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 31 Jul 2023 14:02:11 +0800 Subject: [PATCH 001/560] tools/power/turbostat: Fix failure with new uncore sysfs On some platforms, turbostat fails during launch time like below, turbostat version 2023.03.17 - Len Brown ... cpu40: MSR_IA32_PACKAGE_THERM_STATUS: 0x884c0000 (24 C) cpu40: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x00000003 (100 C, 100 C) turbostat: snapshot_sysfs_counter(/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz): No data available This is because new uncore sysfs is used on these platforms as introduced by commit 9b8dea80e3cb ("platform/x86/intel-uncore-freq: Support for cluster level controls"). With the new uncore sysfs interface, /sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz is still available, but reading it fails. How to support the fabric cluster level uncore sysfs is not settled yet, as a short term fix, clear the BIC_UNCORE_MHZ bit when new sysfs I/F is detected. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9a10512e34078..9de1ff6f82ce0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4163,6 +4163,10 @@ static void intel_uncore_frequency_probe(void) if (access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00", R_OK)) return; + /* Cluster level sysfs not supported yet. */ + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK)) + return; + if (!access("/sys/devices/system/cpu/intel_uncore_frequency/package_00_die_00/current_freq_khz", R_OK)) BIC_PRESENT(BIC_UNCORE_MHZ); From 137f01b3529d292a68d22e9681e2f903c768f790 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 25 Mar 2023 21:57:07 +0800 Subject: [PATCH 002/560] tools/power/turbostat: Fix a knl bug MSR_KNL_CORE_C6_RESIDENCY should be evaluated only if 1. this is KNL platform AND 2. need to get C6 residency or need to calculate C1 residency Fix the broken logic introduced by commit 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value"). Fixes: 1e9042b9c8d4 ("tools/power turbostat: Fix CPU%C1 display value") Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9de1ff6f82ce0..fb6c6ddbdb604 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2180,7 +2180,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates || soft_c1_residency_display(BIC_CPU_c6)) { + } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } From b61b7d8c4c22c4298a50ae5d0ee88facb85ce665 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Mon, 27 Mar 2023 11:17:44 +0800 Subject: [PATCH 003/560] tools/power/turbostat: Enable the C-state Pre-wake printing Currently the C-state Pre-wake will not be printed due to the probe has not been invoked. Invoke the probe function accordingly. Fixes: aeb01e6d71ff ("tools/power turbostat: Print the C-state Pre-wake settings") Signed-off-by: Chen Yu Reviewed-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fb6c6ddbdb604..03d4f09b103a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5794,6 +5794,7 @@ void process_cpuid() rapl_probe(family, model); perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); + prewake_cstate_probe(family, model); check_tcc_offset(model_orig); From b98a6d78768ec459d394db8bc086d071eb6556c8 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:30:55 +0800 Subject: [PATCH 004/560] tools/power/turbostat: Enable TCC Offset on more models All Models that duplicate INTEL_FAM6_CANNONLAKE_L support TCC Offset. Enable this feature on all these models. Delete obsolete model_orig. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 03d4f09b103a1..d7880870ef688 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -211,9 +211,6 @@ int *fd_instr_count_percpu; struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; -/* Save original CPU model */ -unsigned int model_orig; - unsigned int num_iterations; unsigned int header_iterations; unsigned int debug; @@ -4046,14 +4043,7 @@ void check_tcc_offset(int model) switch (model) { case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_COMETLAKE: + case INTEL_FAM6_CANNONLAKE_L: if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { msr = (msr >> 30) & 1; if (msr) @@ -5573,10 +5563,9 @@ void process_cpuid() edx_flags & (1 << 22) ? "ACPI-TM" : "-", edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } - if (genuine_intel) { - model_orig = model; + + if (genuine_intel) model = intel_model_duplicates(model); - } if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -5796,7 +5785,7 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model_orig); + check_tcc_offset(model); if (!quiet) dump_cstate_pstate_config_info(family, model); From 2c019d657968bdd93e11615e0919d8181a54742d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 21 Aug 2022 22:39:16 +0800 Subject: [PATCH 005/560] tools/power/turbostat: Support alternative graphics sysfs knobs /sys/class/graphics/fb0/device/drm/card0/ and /sys/class/drm/card0/ point to the same device node. But in some cases, one exists and the other one does not. Prefer to use /sys/class/drm/card0/, and fall back to /sys/class/graphics/fb0/device/drm/card0/. This recovers the "GFXMHz" and "GFXAMHz" columns on some platforms like a SPR server. Reviewed-by: Rodrigo Vivi Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d7880870ef688..f9bc2230db73a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3188,8 +3188,8 @@ int snapshot_gfx_rc6_ms(void) /* * snapshot_gfx_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz + * when /sys/class/drm/card0/gt_cur_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3198,9 +3198,11 @@ int snapshot_gfx_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_cur_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -3215,8 +3217,8 @@ int snapshot_gfx_mhz(void) /* * snapshot_gfx_cur_mhz() * - * record snapshot of - * /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * fall back to /sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz + * when /sys/class/drm/card0/gt_act_freq_mhz is not available. * * return 1 if config change requires a restart, else return 0 */ @@ -3225,9 +3227,11 @@ int snapshot_gfx_act_mhz(void) static FILE *fp; int retval; - if (fp == NULL) - fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); - else { + if (fp == NULL) { + fp = fopen("/sys/class/drm/card0/gt_act_freq_mhz", "r"); + if (!fp) + fp = fopen_or_die("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", "r"); + } else { rewind(fp); fflush(fp); } @@ -5804,10 +5808,12 @@ void process_cpuid() if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXMHz); - if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) BIC_PRESENT(BIC_GFXACTMHz); if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) From 6d306d6ec7e0f9a5e90f5afd70e3b6c32ae50ce6 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:26:25 +0800 Subject: [PATCH 006/560] tools/power/turbostat: Replace raw value cpu model with Macro Kernel already has #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ Use standard Macro for CPU Model instead of raw value. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9bc2230db73a..8d3a8af3692a7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5413,7 +5413,7 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; From bbfc33b1e49f443296e56d8b76c77373f700aedc Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 24 Mar 2023 16:27:30 +0800 Subject: [PATCH 007/560] tools/power/turbostat: Remove redundant duplicates Remove redundant duplicates in intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d3a8af3692a7..2420300939da0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5412,24 +5412,20 @@ unsigned int intel_model_duplicates(unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ return INTEL_FAM6_NEHALEM_EX; case INTEL_FAM6_XEON_PHI_KNM: return INTEL_FAM6_XEON_PHI_KNL; - case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ return INTEL_FAM6_BROADWELL_X; - case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: From 48674c1bb6124fe392e8fed80a39fcb3f62e6551 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 12:36:55 +0800 Subject: [PATCH 008/560] tools/power/turbostat: Remove pseudo check for two models INTEL_FAM6_ATOM_SILVERMONT_MID/INTEL_FAM6_ATOM_AIRMONT_MID are not listed in probe_nhm_msrs(). This means that most of the turbostat features are not available on these two platforms. Further more, checking for these two models in has_slv_msrs() is dead code. Because has_slv_msrs() is called by the code guarded by probe_nhm_msrs(). For these two reasons, remove pseudo check for INTEL_FAM6_ATOM_SILVERMONT_MID and INTEL_FAM6_ATOM_AIRMONT_MID. Will add back the support when we can access these two platforms. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2420300939da0..b8874d3dc83ea 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3792,8 +3792,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_MID: return 1; } return 0; From 45232ab168a3c5abad86eafaef2beed8d7037666 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:52:53 +0800 Subject: [PATCH 009/560] tools/power/turbostat: Add skeleton support for table driven feature enumeration Turbostat supports a series of features that may diverge among different CPU models. Current code uses various of CPU model checks in different places to handle this, which makes the code hard to maintain. Add skeleton support for table driven feature enumeration to replace the current error-prone CPU model checks and global variables. Note: by comparing the CPU models with intel-family.h, it is found that turbostat support for below four Models are missing, including INTEL_FAM6_ICELAKE, INTEL_FAM6_ATOM_SILVERMONT_MID, INTEL_FAM6_ATOM_AIRMONT_MID and INTEL_FAM6_ATOM_AIRMONT_NP. Adding support for these models is a different work, thus it is not covered in this patch set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 192 ++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b8874d3dc83ea..4deea374188a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -281,6 +281,197 @@ unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; +/* Model specific support Start */ + +/* List of features that may diverge among different platforms */ +struct platform_features { +}; + +struct platform_data { + unsigned int model; + const struct platform_features *features; +}; + +static const struct platform_features nhm_features = { +}; + +static const struct platform_features nhx_features = { +}; + +static const struct platform_features snb_features = { +}; + +static const struct platform_features snx_features = { +}; + +static const struct platform_features ivb_features = { +}; + +static const struct platform_features ivx_features = { +}; + +static const struct platform_features hsw_features = { +}; + +static const struct platform_features hsx_features = { +}; + +static const struct platform_features hswl_features = { +}; + +static const struct platform_features hswg_features = { +}; + +static const struct platform_features bdw_features = { +}; + +static const struct platform_features bdwg_features = { +}; + +static const struct platform_features bdx_features = { +}; + +static const struct platform_features skl_features = { +}; + +static const struct platform_features cnl_features = { +}; + +static const struct platform_features skx_features = { +}; + +static const struct platform_features icx_features = { +}; + +static const struct platform_features spr_features = { +}; + +static const struct platform_features slv_features = { +}; + +static const struct platform_features slvd_features = { +}; + +static const struct platform_features amt_features = { +}; + +static const struct platform_features gmt_features = { +}; + +static const struct platform_features gmtd_features = { +}; + +static const struct platform_features gmtp_features = { +}; + +static const struct platform_features tmt_features = { +}; + +static const struct platform_features tmtd_features = { +}; + +static const struct platform_features knl_features = { +}; + +static const struct platform_features default_features = { +}; + +static const struct platform_features amd_features = { +}; + +static const struct platform_data turbostat_pdata[] = { + { INTEL_FAM6_NEHALEM, &nhm_features }, + { INTEL_FAM6_NEHALEM_G, &nhm_features }, + { INTEL_FAM6_NEHALEM_EP, &nhm_features }, + { INTEL_FAM6_NEHALEM_EX, &nhx_features }, + { INTEL_FAM6_WESTMERE, &nhm_features }, + { INTEL_FAM6_WESTMERE_EP, &nhm_features }, + { INTEL_FAM6_WESTMERE_EX, &nhx_features }, + { INTEL_FAM6_SANDYBRIDGE, &snb_features }, + { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, + { INTEL_FAM6_IVYBRIDGE, &ivb_features }, + { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, + { INTEL_FAM6_HASWELL, &hsw_features }, + { INTEL_FAM6_HASWELL_X, &hsx_features }, + { INTEL_FAM6_HASWELL_L, &hswl_features }, + { INTEL_FAM6_HASWELL_G, &hswg_features }, + { INTEL_FAM6_BROADWELL, &bdw_features }, + { INTEL_FAM6_BROADWELL_G, &bdwg_features }, + { INTEL_FAM6_BROADWELL_X, &bdx_features }, + { INTEL_FAM6_BROADWELL_D, &bdx_features }, + { INTEL_FAM6_SKYLAKE_L, &skl_features }, + { INTEL_FAM6_SKYLAKE, &skl_features }, + { INTEL_FAM6_SKYLAKE_X, &skx_features }, + { INTEL_FAM6_KABYLAKE_L, &skl_features }, + { INTEL_FAM6_KABYLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE, &skl_features }, + { INTEL_FAM6_COMETLAKE_L, &skl_features }, + { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_X, &icx_features }, + { INTEL_FAM6_ICELAKE_D, &icx_features }, + { INTEL_FAM6_ICELAKE_L, &cnl_features }, + { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, + { INTEL_FAM6_ROCKETLAKE, &cnl_features }, + { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, + { INTEL_FAM6_TIGERLAKE, &cnl_features }, + { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_LAKEFIELD, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &cnl_features }, + { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_METEORLAKE, &cnl_features }, + { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, + { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, + { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, + { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, + { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, + { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + /* + * Missing support for + * INTEL_FAM6_ICELAKE + * INTEL_FAM6_ATOM_SILVERMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_MID + * INTEL_FAM6_ATOM_AIRMONT_NP + */ + { 0, NULL }, +}; + +static const struct platform_features *platform; + +void probe_platform_features(unsigned int family, unsigned int model) +{ + int i; + + if (authentic_amd || hygon_genuine) { + platform = &amd_features; + return; + } + + platform = &default_features; + + if (!genuine_intel || family != 6) + return; + + for (i = 0; turbostat_pdata[i].features; i++) { + if (turbostat_pdata[i].model == model) { + platform = turbostat_pdata[i].features; + return; + } + } +} + +/* Model specific support End */ + #define RAPL_PKG (1 << 0) /* 0x610 MSR_PKG_POWER_LIMIT */ /* 0x611 MSR_PKG_ENERGY_STATUS */ @@ -5562,6 +5753,7 @@ void process_cpuid() edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } + probe_platform_features(family, model); if (genuine_intel) model = intel_model_duplicates(model); From 778fc34a7a3db2811e28ce570318dd047f278cb2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 15:26:32 +0800 Subject: [PATCH 010/560] tools/power/turbostat: Abstract MSR_MISC_FEATURE_CONTROL support Abstract MSR_MISC_FEATURE_CONTROL support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4deea374188a5..7eaa0adf72e0a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -277,7 +277,6 @@ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ -unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { + bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ }; struct platform_data { @@ -299,51 +299,67 @@ static const struct platform_features nhx_features = { }; static const struct platform_features snb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features snx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivb_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features ivx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hsx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features hswg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdw_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdwg_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features bdx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features cnl_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features skx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features icx_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features spr_features = { + .has_msr_misc_feature_control = 1, }; static const struct platform_features slv_features = { @@ -3883,7 +3899,6 @@ void check_permissions(void) * * Side effect: * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL - * sets has_misc_feature_control */ int probe_nhm_msrs(unsigned int family, unsigned int model) { @@ -3909,7 +3924,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ pkg_cstate_limits = snb_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ @@ -3921,16 +3935,13 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ pkg_cstate_limits = hsw_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ pkg_cstate_limits = skx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ICELAKE_X: /* ICX */ pkg_cstate_limits = icx_pkg_cstate_limits; - has_misc_feature_control = 1; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ no_MSR_MISC_PWR_MGMT = 1; @@ -5541,7 +5552,7 @@ void decode_misc_feature_control(void) { unsigned long long msr; - if (!has_misc_feature_control) + if (!platform->has_msr_misc_feature_control) return; if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) From 3dd0e7547d11e770bafb40ad41f2631cc4b16649 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:12:57 +0800 Subject: [PATCH 011/560] tools/power/turbostat: Abstract MSR_MISC_PWR_MGMT support Abstract MSR_MISC_PWR_MGMT support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 7eaa0adf72e0a..9507f310e2124 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -238,7 +238,6 @@ unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; unsigned int do_nhm_platform_info; -unsigned int no_MSR_MISC_PWR_MGMT; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -285,6 +284,7 @@ int ignore_stdin; /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ + bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ }; struct platform_data { @@ -293,100 +293,125 @@ struct platform_data { }; static const struct platform_features nhm_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features nhx_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features slv_features = { }; static const struct platform_features slvd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features amt_features = { }; static const struct platform_features gmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features gmtp_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmt_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features tmtd_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features knl_features = { + .has_msr_misc_pwr_mgmt = 1, }; static const struct platform_features default_features = { @@ -3944,14 +3969,12 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) pkg_cstate_limits = icx_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - no_MSR_MISC_PWR_MGMT = 1; /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ pkg_cstate_limits = slv_pkg_cstate_limits; break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ pkg_cstate_limits = amt_pkg_cstate_limits; - no_MSR_MISC_PWR_MGMT = 1; break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ pkg_cstate_limits = phi_pkg_cstate_limits; @@ -5576,7 +5599,7 @@ void decode_misc_pwr_mgmt_msr(void) if (!do_nhm_platform_info) return; - if (no_MSR_MISC_PWR_MGMT) + if (!platform->has_msr_misc_pwr_mgmt) return; if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) From 71e841293c715797d8c6ae8cdc3f74b4396c5570 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 21 Aug 2023 22:22:48 +0800 Subject: [PATCH 012/560] tools/power/turbostat: Abstract BCLK frequency support Abstract CPU base clock frequency support. Note that bclk is used by 1. calculate base_hz using MSR_PLATFORM_INFO, which is guarded by probe_nhm_msrs(). 2. dump MSR_PLATFORM_INFO and Turbo Ratio Limit MSRs, which are also guarded by probe_nhm_msrs(). Thus probe_bclk() works for probe_nhm_msrs() models only. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 131 +++++++++++++++++--------- 1 file changed, 87 insertions(+), 44 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9507f310e2124..66ba70017d53f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -269,7 +269,6 @@ unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; -double discover_bclk(unsigned int family, unsigned int model); unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ @@ -279,12 +278,15 @@ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int first_counter_read = 1; int ignore_stdin; +int get_msr(int cpu, off_t offset, unsigned long long *msr); + /* Model specific support Start */ /* List of features that may diverge among different platforms */ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + int bclk_freq; /* CPU base clock */ }; struct platform_data { @@ -292,126 +294,185 @@ struct platform_data { const struct platform_features *features; }; +/* For BCLK */ +enum bclk_freq { + BCLK_100MHZ = 1, + BCLK_133MHZ, + BCLK_SLV, +}; + +#define SLM_BCLK_FREQS 5 +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; + +double slm_bclk(void) +{ + unsigned long long msr = 3; + unsigned int i; + double freq; + + if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) + fprintf(outf, "SLM BCLK: unknown\n"); + + i = msr & 0xf; + if (i >= SLM_BCLK_FREQS) { + fprintf(outf, "SLM BCLK[%d] invalid\n", i); + i = 3; + } + freq = slm_freq_table[i]; + + if (!quiet) + fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); + + return freq; +} + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features slv_features = { + .bclk_freq = BCLK_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_SLV, }; static const struct platform_features amt_features = { + .bclk_freq = BCLK_133MHZ, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .bclk_freq = BCLK_100MHZ, }; static const struct platform_features default_features = { @@ -3907,6 +3968,30 @@ void check_permissions(void) exit(-6); } +void probe_bclk(void) +{ + unsigned long long msr; + unsigned int base_ratio; + + if (!do_nhm_platform_info) + return; + + if (platform->bclk_freq == BCLK_100MHZ) + bclk = 100.00; + else if (platform->bclk_freq == BCLK_133MHZ) + bclk = 133.33; + else if (platform->bclk_freq == BCLK_SLV) + bclk = slm_bclk(); + else + return; + + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); + base_ratio = (msr >> 8) & 0xFF; + + base_hz = base_ratio * bclk * 1000000; + has_base_hz = 1; +} + /* * NHM adds support for additional MSRs: * @@ -3928,7 +4013,6 @@ void check_permissions(void) int probe_nhm_msrs(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int base_ratio; int *pkg_cstate_limits; if (!genuine_intel) @@ -3937,8 +4021,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) if (family != 6) return 0; - bclk = discover_bclk(family, model); - switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ @@ -3992,11 +4074,6 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; - get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); - base_ratio = (msr >> 8) & 0xFF; - - base_hz = base_ratio * bclk * 1000000; - has_base_hz = 1; return 1; } @@ -5403,41 +5480,6 @@ unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) return 1; } -#define SLM_BCLK_FREQS 5 -double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; - -double slm_bclk(void) -{ - unsigned long long msr = 3; - unsigned int i; - double freq; - - if (get_msr(base_cpu, MSR_FSB_FREQ, &msr)) - fprintf(outf, "SLM BCLK: unknown\n"); - - i = msr & 0xf; - if (i >= SLM_BCLK_FREQS) { - fprintf(outf, "SLM BCLK[%d] invalid\n", i); - i = 3; - } - freq = slm_freq_table[i]; - - if (!quiet) - fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq); - - return freq; -} - -double discover_bclk(unsigned int family, unsigned int model) -{ - if (has_snb_msrs(family, model) || is_knl(family, model)) - return 100.00; - else if (is_slm(family, model)) - return slm_bclk(); - else - return 133.33; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5929,6 +5971,7 @@ void process_cpuid() BIC_PRESENT(BIC_CPU_c6); BIC_PRESENT(BIC_SMI); } + probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); if (do_snb_cstates) From 3989fc890782c8002477895e9f24ffb98a132293 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:06 +0800 Subject: [PATCH 013/560] tools/power/turbostat: Abstract Package cstate limit decoding support Abstract the support for decoding package cstate limit from MSR_PKG_CST_CONFIG_CONTROL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 112 ++++++++++++++++++++------ 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 66ba70017d53f..bcad9332a3b26 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -287,6 +287,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ int bclk_freq; /* CPU base clock */ + int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; struct platform_data { @@ -326,153 +327,193 @@ double slm_bclk(void) return freq; } +/* For Package cstate limit */ +enum package_cstate_limit { + CST_LIMIT_NHM = 1, + CST_LIMIT_SNB, + CST_LIMIT_HSW, + CST_LIMIT_SKX, + CST_LIMIT_ICX, + CST_LIMIT_SLV, + CST_LIMIT_AMT, + CST_LIMIT_KNL, + CST_LIMIT_GMT, +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SNB, }; static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_HSW, }; static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_ICX, }; static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_SLV, + .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { .bclk_freq = BCLK_133MHZ, + .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .bclk_freq = BCLK_100MHZ, + .cst_limit = CST_LIMIT_KNL, }; static const struct platform_features default_features = { @@ -2704,6 +2745,50 @@ int icx_pkg_cstate_limits[16] = PCLRSV, PCLRSV }; +void probe_cst_limit(void) +{ + unsigned long long msr; + int *pkg_cstate_limits; + + if (!do_nhm_platform_info) + return; + + switch (platform->cst_limit) { + case CST_LIMIT_NHM: + pkg_cstate_limits = nhm_pkg_cstate_limits; + break; + case CST_LIMIT_SNB: + pkg_cstate_limits = snb_pkg_cstate_limits; + break; + case CST_LIMIT_HSW: + pkg_cstate_limits = hsw_pkg_cstate_limits; + break; + case CST_LIMIT_SKX: + pkg_cstate_limits = skx_pkg_cstate_limits; + break; + case CST_LIMIT_ICX: + pkg_cstate_limits = icx_pkg_cstate_limits; + break; + case CST_LIMIT_SLV: + pkg_cstate_limits = slv_pkg_cstate_limits; + break; + case CST_LIMIT_AMT: + pkg_cstate_limits = amt_pkg_cstate_limits; + break; + case CST_LIMIT_KNL: + pkg_cstate_limits = phi_pkg_cstate_limits; + break; + case CST_LIMIT_GMT: + pkg_cstate_limits = glm_pkg_cstate_limits; + break; + default: + return; + } + + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); + pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; +} + static void calculate_tsc_tweak() { tsc_tweak = base_hz / tsc_hz; @@ -4006,15 +4091,9 @@ void probe_bclk(void) * MSR_PKG_C6_RESIDENCY 0x000003f9 * MSR_CORE_C3_RESIDENCY 0x000003fc * MSR_CORE_C6_RESIDENCY 0x000003fd - * - * Side effect: - * sets global pkg_cstate_limit to decode MSR_PKG_CST_CONFIG_CONTROL */ int probe_nhm_msrs(unsigned int family, unsigned int model) { - unsigned long long msr; - int *pkg_cstate_limits; - if (!genuine_intel) return 0; @@ -4024,14 +4103,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - pkg_cstate_limits = nhm_pkg_cstate_limits; - break; case INTEL_FAM6_SANDYBRIDGE: /* SNB */ case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ case INTEL_FAM6_IVYBRIDGE: /* IVB */ case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - pkg_cstate_limits = snb_pkg_cstate_limits; - break; case INTEL_FAM6_HASWELL: /* HSW */ case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_HASWELL_X: /* HSX */ @@ -4041,38 +4116,22 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - pkg_cstate_limits = hsw_pkg_cstate_limits; - break; case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - pkg_cstate_limits = skx_pkg_cstate_limits; - break; case INTEL_FAM6_ICELAKE_X: /* ICX */ - pkg_cstate_limits = icx_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - /* FALLTHRU */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - pkg_cstate_limits = slv_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - pkg_cstate_limits = amt_pkg_cstate_limits; - break; case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - pkg_cstate_limits = phi_pkg_cstate_limits; - break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - pkg_cstate_limits = glm_pkg_cstate_limits; break; default: return 0; } - get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); - pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; return 1; } @@ -5964,6 +6023,7 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_cst_limit(); if (probe_nhm_msrs(family, model)) { do_nhm_platform_info = 1; BIC_PRESENT(BIC_CPU_c1); From fcfa1ce074ab76272639961d4d7900b91657a8d5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 16:52:23 +0800 Subject: [PATCH 014/560] tools/power/turbostat: Abstract Nehalem MSRs support MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, and the TRL MSRs are always available for platforms since Nehalem. Support for these msrs can be described altogether. Abstract the support for these MSRs. Delete probe_nhm_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 101 +++++++++----------------- 1 file changed, 34 insertions(+), 67 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bcad9332a3b26..bc221e800aec6 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -237,7 +237,6 @@ unsigned int authentic_amd; unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; -unsigned int do_nhm_platform_info; unsigned int aperf_mperf_multiplier = 1; double bclk; double base_hz; @@ -286,6 +285,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ }; @@ -342,12 +342,14 @@ enum package_cstate_limit { static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, }; @@ -355,6 +357,7 @@ static const struct platform_features nhx_features = { static const struct platform_features snb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -362,6 +365,7 @@ static const struct platform_features snb_features = { static const struct platform_features snx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -369,6 +373,7 @@ static const struct platform_features snx_features = { static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -376,6 +381,7 @@ static const struct platform_features ivb_features = { static const struct platform_features ivx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, }; @@ -383,6 +389,7 @@ static const struct platform_features ivx_features = { static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -390,6 +397,7 @@ static const struct platform_features hsw_features = { static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -397,6 +405,7 @@ static const struct platform_features hsx_features = { static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -404,6 +413,7 @@ static const struct platform_features hswl_features = { static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -411,6 +421,7 @@ static const struct platform_features hswg_features = { static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -418,6 +429,7 @@ static const struct platform_features bdw_features = { static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -425,6 +437,7 @@ static const struct platform_features bdwg_features = { static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -432,6 +445,7 @@ static const struct platform_features bdx_features = { static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -439,6 +453,7 @@ static const struct platform_features skl_features = { static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, }; @@ -446,6 +461,7 @@ static const struct platform_features cnl_features = { static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; @@ -453,6 +469,7 @@ static const struct platform_features skx_features = { static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, }; @@ -460,58 +477,68 @@ static const struct platform_features icx_features = { static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, }; static const struct platform_features slv_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, }; static const struct platform_features amt_features = { + .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, }; static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, }; static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, }; @@ -2750,7 +2777,7 @@ void probe_cst_limit(void) unsigned long long msr; int *pkg_cstate_limits; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; switch (platform->cst_limit) { @@ -4058,7 +4085,7 @@ void probe_bclk(void) unsigned long long msr; unsigned int base_ratio; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (platform->bclk_freq == BCLK_100MHZ) @@ -4077,65 +4104,6 @@ void probe_bclk(void) has_base_hz = 1; } -/* - * NHM adds support for additional MSRs: - * - * MSR_SMI_COUNT 0x00000034 - * - * MSR_PLATFORM_INFO 0x000000ce - * MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 - * - * MSR_MISC_PWR_MGMT 0x000001aa - * - * MSR_PKG_C3_RESIDENCY 0x000003f8 - * MSR_PKG_C6_RESIDENCY 0x000003f9 - * MSR_CORE_C3_RESIDENCY 0x000003fc - * MSR_CORE_C6_RESIDENCY 0x000003fd - */ -int probe_nhm_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_SANDYBRIDGE: /* SNB */ - case INTEL_FAM6_SANDYBRIDGE_X: /* SNB Xeon */ - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - case INTEL_FAM6_ATOM_AIRMONT: /* AMT */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - break; - default: - return 0; - } - - return 1; -} - /* * SLV client has support for unique MSRs: * @@ -4461,7 +4429,7 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; dump_nhm_platform_info(); @@ -5606,7 +5574,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk } /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) goto guess; if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) @@ -5697,7 +5665,7 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!do_nhm_platform_info) + if (!platform->has_nhm_msrs) return; if (!platform->has_msr_misc_pwr_mgmt) @@ -6024,8 +5992,7 @@ void process_cpuid() BIC_PRESENT(BIC_TSC_MHz); probe_cst_limit(); - if (probe_nhm_msrs(family, model)) { - do_nhm_platform_info = 1; + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); BIC_PRESENT(BIC_CPU_c6); From c2c25e85df316a624e4a8ee85d65ea29265f486d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:57:06 +0800 Subject: [PATCH 015/560] tools/power/turbostat: Remove a redundant check Platforms with has_msr_misc_pwr_mgmt set is a subset of platforms with has_nhm_msrs set. Thus remove the redudant check for platform->has_nhm_msrs. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bc221e800aec6..4eb10491f7143 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5665,9 +5665,6 @@ void decode_misc_pwr_mgmt_msr(void) { unsigned long long msr; - if (!platform->has_nhm_msrs) - return; - if (!platform->has_msr_misc_pwr_mgmt) return; From 8b7199c0855e3b22532a21aaaed78e699431e4e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 27 Mar 2023 14:58:35 +0800 Subject: [PATCH 016/560] tools/power/turbostat: Rename some functions Rename dump_nhm_platform_info() and dump_nhm_cst_cfg() to dump_platform_info() and dump_cst_cfg() because these MSRs' behavior is consistent when they're available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4eb10491f7143..654ae1ce130c7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2823,7 +2823,7 @@ static void calculate_tsc_tweak() void prewake_cstate_probe(unsigned int family, unsigned int model); -static void dump_nhm_platform_info(void) +static void dump_platform_info(void) { unsigned long long msr; unsigned int ratio; @@ -3059,7 +3059,7 @@ static void dump_knl_turbo_ratio_limits(void) ratio[i], bclk, ratio[i] * bclk, cores[i]); } -static void dump_nhm_cst_cfg(void) +static void dump_cst_cfg(void) { unsigned long long msr; @@ -4432,9 +4432,9 @@ static void dump_cstate_pstate_config_info(unsigned int family, unsigned int mod if (!platform->has_nhm_msrs) return; - dump_nhm_platform_info(); + dump_platform_info(); dump_turbo_ratio_info(family, model); - dump_nhm_cst_cfg(); + dump_cst_cfg(); } static int read_sysfs_int(char *path) From 10d85d85ab4f3ae7faca4450ce7b0a166fd396f0 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:00 +0800 Subject: [PATCH 017/560] tools/power/turbostat: Abstract Turbo Ratio Limit MSRs support Abstract the support for MSR_TURBO_RATIO_LIMIT, MSR_TRUBO_RATIO_LIMIT1, MSR_TURBO_RATIO_LIMIT2, MSR_SECONDARY_TURBO_RATIO_LIMIT, MSR_ATOM_CORE_RATIOS and MSR_ATOM_CORE_TURBO_RATIOS. Delete has_turbo_ratio_group_limits(), has_turbo_ratio_limit(), has_atom_turbo_ratio_limit(), has_ivt_turbo_ratio_limit(), has_hsw_turbo_ratio_limit(), has_knl_turbo_ratio_limit() and has_glm_turbo_ratio_limit() CPU model checks. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 171 +++++++------------------- 1 file changed, 46 insertions(+), 125 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 654ae1ce130c7..b6c53ec740d39 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ }; struct platform_data { @@ -340,11 +341,22 @@ enum package_cstate_limit { CST_LIMIT_GMT, }; +/* For Turbo Ratio Limit MSRs */ +enum turbo_ratio_limit_msrs { + TRL_BASE = BIT(0), + TRL_LIMIT1 = BIT(1), + TRL_LIMIT2 = BIT(2), + TRL_ATOM = BIT(3), + TRL_KNL = BIT(4), + TRL_CORECOUNT = BIT(5), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_NHM, + .trl_msrs = TRL_BASE, }; static const struct platform_features nhx_features = { @@ -360,6 +372,7 @@ static const struct platform_features snb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features snx_features = { @@ -368,6 +381,7 @@ static const struct platform_features snx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivb_features = { @@ -376,6 +390,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE, }; static const struct platform_features ivx_features = { @@ -384,6 +399,7 @@ static const struct platform_features ivx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, + .trl_msrs = TRL_BASE | TRL_LIMIT1, }; static const struct platform_features hsw_features = { @@ -392,6 +408,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hsx_features = { @@ -400,6 +417,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, }; static const struct platform_features hswl_features = { @@ -408,6 +426,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features hswg_features = { @@ -416,6 +435,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdw_features = { @@ -424,6 +444,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdwg_features = { @@ -432,6 +453,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features bdx_features = { @@ -440,6 +462,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skl_features = { @@ -448,6 +471,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features cnl_features = { @@ -456,6 +480,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .trl_msrs = TRL_BASE, }; static const struct platform_features skx_features = { @@ -464,6 +489,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features icx_features = { @@ -472,6 +498,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features spr_features = { @@ -480,12 +507,14 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_ATOM, }; static const struct platform_features slvd_features = { @@ -493,12 +522,14 @@ static const struct platform_features slvd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, + .trl_msrs = TRL_BASE, }; static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, .cst_limit = CST_LIMIT_AMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features gmt_features = { @@ -506,6 +537,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtd_features = { @@ -513,6 +545,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features gmtp_features = { @@ -520,6 +553,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmt_features = { @@ -527,6 +561,7 @@ static const struct platform_features tmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE, }; static const struct platform_features tmtd_features = { @@ -534,6 +569,7 @@ static const struct platform_features tmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; static const struct platform_features knl_features = { @@ -541,6 +577,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, + .trl_msrs = TRL_KNL, }; static const struct platform_features default_features = { @@ -2911,29 +2948,7 @@ static void dump_ivt_turbo_ratio_limits(void) return; } -int has_turbo_ratio_group_limits(int family, int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - default: - return 0; - } -} - -static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) +static void dump_turbo_ratio_limits(int trl_msr_offset) { unsigned long long msr, core_counts; int shift; @@ -2942,7 +2957,7 @@ static void dump_turbo_ratio_limits(int trl_msr_offset, int family, int model) fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr); - if (has_turbo_ratio_group_limits(family, model)) { + if (platform->trl_msrs & TRL_CORECOUNT) { get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts); fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts); } else { @@ -4236,100 +4251,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 0; - - if (family != 6) - return 0; - - switch (model) { - /* Nehalem compatible, but do not include turbo-ratio limit support */ - case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ - case INTEL_FAM6_XEON_PHI_KNL: /* PHI - Knights Landing (different MSR definition) */ - return 0; - default: - return 1; - } -} - -int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (has_slv_msrs(family, model)) - return 1; - - return 0; -} - -int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSW Xeon */ - return 1; - default: - return 0; - } -} - -int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - -int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - default: - return 0; - } -} - int has_config_tdp(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -4404,23 +4325,23 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (!has_turbo) return; - if (has_hsw_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT2) dump_hsw_turbo_ratio_limits(); - if (has_ivt_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_LIMIT1) dump_ivt_turbo_ratio_limits(); - if (has_turbo_ratio_limit(family, model)) { - dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT, family, model); + if (platform->trl_msrs & TRL_BASE) { + dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); if (is_hybrid) - dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT, family, model); + dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT); } - if (has_atom_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_ATOM) dump_atom_turbo_ratio_limits(); - if (has_knl_turbo_ratio_limit(family, model)) + if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); if (has_config_tdp(family, model)) From a3943deaf98f713c819dd7e67af734d4ed4da030 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 13:54:25 +0800 Subject: [PATCH 018/560] tools/power/turbostat: Rename some TRL functions Rename dump_hsw_turbo_ratio_limits() and dump_ivt_turbo_ratio_limits() to dump_turbo_ratio_limit2() and dump_turbo_ratio_limit1() because they dump MSR_TURBO_RATIO_LIMIT1/LIMIT2, and the MSRs' behavior is consistent when they are available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b6c53ec740d39..abcc055ea0e1a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2886,7 +2886,7 @@ static void dump_platform_info(void) return; } -static void dump_hsw_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit2(void) { unsigned long long msr; unsigned int ratio; @@ -2905,7 +2905,7 @@ static void dump_hsw_turbo_ratio_limits(void) return; } -static void dump_ivt_turbo_ratio_limits(void) +static void dump_turbo_ratio_limit1(void) { unsigned long long msr; unsigned int ratio; @@ -4326,10 +4326,10 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) return; if (platform->trl_msrs & TRL_LIMIT2) - dump_hsw_turbo_ratio_limits(); + dump_turbo_ratio_limit2(); if (platform->trl_msrs & TRL_LIMIT1) - dump_ivt_turbo_ratio_limits(); + dump_turbo_ratio_limit1(); if (platform->trl_msrs & TRL_BASE) { dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT); From a61c9cb478c0bf31a50d1829834e822e92876c95 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:19:46 +0800 Subject: [PATCH 019/560] tools/power/turbostat: Abstract Config TDP MSRs support Abstract the support for MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL and MSR_TURBO_ACTIVATION_RATIO. Delete has_config_tdp() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 54 ++++++++++----------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index abcc055ea0e1a..bb9d8c2605c84 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -388,6 +389,7 @@ static const struct platform_features ivb_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, @@ -406,6 +408,7 @@ static const struct platform_features hsw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -415,6 +418,7 @@ static const struct platform_features hsx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, @@ -424,6 +428,7 @@ static const struct platform_features hswl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -433,6 +438,7 @@ static const struct platform_features hswg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -442,6 +448,7 @@ static const struct platform_features bdw_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -451,6 +458,7 @@ static const struct platform_features bdwg_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -460,6 +468,7 @@ static const struct platform_features bdx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -469,6 +478,7 @@ static const struct platform_features skl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -478,6 +488,7 @@ static const struct platform_features cnl_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, @@ -487,6 +498,7 @@ static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -496,6 +508,7 @@ static const struct platform_features icx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -505,6 +518,7 @@ static const struct platform_features spr_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -575,6 +589,7 @@ static const struct platform_features tmtd_features = { static const struct platform_features knl_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, + .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, @@ -4251,35 +4266,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -int has_config_tdp(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ - return 1; - default: - return 0; - } -} - /* * tcc_offset_bits: * 0: Tcc Offset not supported (Default) @@ -4320,7 +4306,7 @@ static void remove_underbar(char *s) *to = 0; } -static void dump_turbo_ratio_info(unsigned int family, unsigned int model) +static void dump_turbo_ratio_info(void) { if (!has_turbo) return; @@ -4344,17 +4330,17 @@ static void dump_turbo_ratio_info(unsigned int family, unsigned int model) if (platform->trl_msrs & TRL_KNL) dump_knl_turbo_ratio_limits(); - if (has_config_tdp(family, model)) + if (platform->has_config_tdp) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) +static void dump_cstate_pstate_config_info(void) { if (!platform->has_nhm_msrs) return; dump_platform_info(); - dump_turbo_ratio_info(family, model); + dump_turbo_ratio_info(); dump_cst_cfg(); } @@ -6000,7 +5986,7 @@ void process_cpuid() check_tcc_offset(model); if (!quiet) - dump_cstate_pstate_config_info(family, model); + dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); if (!quiet) From d8e1623baa0b49aa90cf5801cfaac9e3d3aa1e19 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Aug 2023 23:04:44 +0800 Subject: [PATCH 020/560] tools/power/turbostat: Abstract TCC Offset bits support Abstract the support for different TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET. Delete check_tcc_offset() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 53 ++++++--------------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bb9d8c2605c84..c39beb4078da7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -256,7 +256,6 @@ unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; unsigned int tj_max; unsigned int tj_max_override; -int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; @@ -290,6 +289,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; struct platform_data { @@ -482,6 +482,7 @@ static const struct platform_features skl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features cnl_features = { @@ -492,6 +493,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, }; static const struct platform_features skx_features = { @@ -4266,33 +4268,6 @@ int is_jvl(unsigned int family, unsigned int model) return 0; } -/* - * tcc_offset_bits: - * 0: Tcc Offset not supported (Default) - * 6: Bit 29:24 of MSR_PLATFORM_INFO - * 4: Bit 27:24 of MSR_PLATFORM_INFO - */ -void check_tcc_offset(int model) -{ - unsigned long long msr; - - if (!genuine_intel) - return; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_CANNONLAKE_L: - if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { - msr = (msr >> 30) & 1; - if (msr) - tcc_offset_bits = 6; - } - return; - default: - return; - } -} - static void remove_underbar(char *s) { char *to = s; @@ -5490,20 +5465,18 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk tcc_default = (msr >> 16) & 0xFF; if (!quiet) { - switch (tcc_offset_bits) { - case 4: - tcc_offset = (msr >> 24) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - case 6: - tcc_offset = (msr >> 24) & 0x3F; + int bits = platform->tcc_offset_bits; + unsigned long long enabled = 0; + + if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled)) + enabled = (enabled >> 30) & 1; + + if (bits && enabled) { + tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0); fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); - break; - default: + } else { fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default); - break; } } @@ -5983,8 +5956,6 @@ void process_cpuid() automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); - check_tcc_offset(model); - if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); From 0c057cf7a0e163bf9631b83c002b3a55691674c7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 30 Jul 2023 14:25:17 +0800 Subject: [PATCH 021/560] tools/power/turbostat: Abstract Perf Limit Reasons MSRs support Abstract the support for MSR_CORE/GFX/RING_PERF_LIMIT_REASONS MSRs. Delete perf_limit_reasons_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 +++++++++------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c39beb4078da7..1207845340ada 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,11 +259,8 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int do_core_perf_limit_reasons; unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; -unsigned int do_gfx_perf_limit_reasons; -unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -289,6 +286,7 @@ struct platform_features { int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ + int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -352,6 +350,13 @@ enum turbo_ratio_limit_msrs { TRL_CORECOUNT = BIT(5), }; +/* For Perf Limit Reason MSRs */ +enum perf_limit_reason_msrs { + PLR_CORE = BIT(0), + PLR_GFX = BIT(1), + PLR_RING = BIT(2), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -412,6 +417,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hsx_features = { @@ -422,6 +428,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, + .plr_msrs = PLR_CORE | PLR_RING, }; static const struct platform_features hswl_features = { @@ -432,6 +439,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features hswg_features = { @@ -442,6 +450,7 @@ static const struct platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, }; static const struct platform_features bdw_features = { @@ -4657,7 +4666,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data return -1; } - if (do_core_perf_limit_reasons) { + if (platform->plr_msrs & PLR_CORE) { get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)", @@ -4690,7 +4699,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } - if (do_gfx_perf_limit_reasons) { + if (platform->plr_msrs & PLR_GFX) { get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)", @@ -4710,7 +4719,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 25) ? "GFXPwr, " : "", (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } - if (do_ring_perf_limit_reasons) { + if (platform->plr_msrs & PLR_RING) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr); fprintf(outf, " (Active: %s%s%s%s%s%s)", @@ -5002,28 +5011,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void perf_limit_reasons_probe(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return; - - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - do_gfx_perf_limit_reasons = 1; - /* FALLTHRU */ - case INTEL_FAM6_HASWELL_X: /* HSX */ - do_core_perf_limit_reasons = 1; - do_ring_perf_limit_reasons = 1; - default: - return; - } -} - void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { if (family != 6) @@ -5952,7 +5939,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); From d90120bf9f111bec8ba0b8ef86c46ccbcd9df188 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:29:18 +0800 Subject: [PATCH 022/560] tools/power/turbostat: Abstract Automatic Cstate Conversion support Abstract the support for AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL. Delete automatic_cstate_conversion_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1207845340ada..a235cbf7b581b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -259,7 +259,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int has_automatic_cstate_conversion; unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; @@ -285,6 +284,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ + bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ @@ -480,6 +480,7 @@ static const struct platform_features bdx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, }; @@ -512,6 +513,7 @@ static const struct platform_features skx_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, + .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -3116,7 +3118,7 @@ static void dump_cst_cfg(void) (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); #define AUTOMATIC_CSTATE_CONVERSION (1UL << 16) - if (has_automatic_cstate_conversion) { + if (platform->has_cst_auto_convension) { fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); } @@ -5011,18 +5013,6 @@ void rapl_probe(unsigned int family, unsigned int model) rapl_probe_amd(family, model); } -void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) -{ - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - has_automatic_cstate_conversion = 1; - } -} - void prewake_cstate_probe(unsigned int family, unsigned int model) { if (is_icx(family, model) || is_spr(family, model)) @@ -5939,7 +5929,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(family, model); - automatic_cstate_conversion_probe(family, model); prewake_cstate_probe(family, model); if (!quiet) From a5d1ab93a0993616efbf61378e491d6673f4684d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:39:49 +0800 Subject: [PATCH 023/560] tools/power/turbostat: Abstract hardcoded Crystal Clock frequency Abstract the support for hardcoded Crystal Clock frequency, which is used when crystal clock is not available from CPUID.15. Delete CPU model checks in process_cpuid(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a235cbf7b581b..c76baa10f4ebc 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ + int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -490,6 +491,7 @@ static const struct platform_features skl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 24000000, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -563,6 +565,7 @@ static const struct platform_features gmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -571,6 +574,7 @@ static const struct platform_features gmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, }; @@ -579,6 +583,7 @@ static const struct platform_features gmtp_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, }; @@ -5796,26 +5801,12 @@ void process_cpuid() __cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx); if (ebx_tsc != 0) { - if (!quiet && (ebx != 0)) fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n", eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - crystal_hz = 24000000; /* 24.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - crystal_hz = 25000000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - crystal_hz = 19200000; /* 19.2 MHz */ - break; - default: - crystal_hz = 0; - } + crystal_hz = platform->crystal_freq; if (crystal_hz) { tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; From b9cd66833d3a651cea10666674e9abcf2182e8ad Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:38:38 +0800 Subject: [PATCH 024/560] tools/power/turbostat: Redefine RAPL macros Redefine RAPL macros to make the code more readable. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 102 ++++++++++++-------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c76baa10f4ebc..4829e8289feb7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -358,6 +358,40 @@ enum perf_limit_reason_msrs { PLR_RING = BIT(2), }; +/* For RAPL MSRs */ +enum rapl_msrs { + RAPL_PKG_POWER_LIMIT = BIT(0), /* 0x610 MSR_PKG_POWER_LIMIT */ + RAPL_PKG_ENERGY_STATUS = BIT(1), /* 0x611 MSR_PKG_ENERGY_STATUS */ + RAPL_PKG_PERF_STATUS = BIT(2), /* 0x613 MSR_PKG_PERF_STATUS */ + RAPL_PKG_POWER_INFO = BIT(3), /* 0x614 MSR_PKG_POWER_INFO */ + RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */ + RAPL_DRAM_ENERGY_STATUS = BIT(5), /* 0x619 MSR_DRAM_ENERGY_STATUS */ + RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */ + RAPL_DRAM_POWER_INFO = BIT(7), /* 0x61c MSR_DRAM_POWER_INFO */ + RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */ + RAPL_CORE_ENERGY_STATUS = BIT(9), /* 0x639 MSR_PP0_ENERGY_STATUS */ + RAPL_CORE_POLICY = BIT(10), /* 0x63a MSR_PP0_POLICY */ + RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */ + RAPL_GFX_ENERGY_STATUS = BIT(12), /* 0x641 MSR_PP1_ENERGY_STATUS */ + RAPL_GFX_POLICY = BIT(13), /* 0x642 MSR_PP1_POLICY */ + RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ + RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ + RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ + RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. */ +}; + +#define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) +#define RAPL_DRAM (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT) +#define RAPL_CORE (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT) +#define RAPL_GFX (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS) + +#define RAPL_PKG_ALL (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO) +#define RAPL_DRAM_ALL (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO) +#define RAPL_CORE_ALL (RAPL_CORE | RAPL_CORE_POLICY) +#define RAPL_GFX_ALL (RAPL_GFX | RAPL_GFX_POLIGY) + +#define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -712,42 +746,6 @@ void probe_platform_features(unsigned int family, unsigned int model) /* Model specific support End */ -#define RAPL_PKG (1 << 0) - /* 0x610 MSR_PKG_POWER_LIMIT */ - /* 0x611 MSR_PKG_ENERGY_STATUS */ -#define RAPL_PKG_PERF_STATUS (1 << 1) - /* 0x613 MSR_PKG_PERF_STATUS */ -#define RAPL_PKG_POWER_INFO (1 << 2) - /* 0x614 MSR_PKG_POWER_INFO */ - -#define RAPL_DRAM (1 << 3) - /* 0x618 MSR_DRAM_POWER_LIMIT */ - /* 0x619 MSR_DRAM_ENERGY_STATUS */ -#define RAPL_DRAM_PERF_STATUS (1 << 4) - /* 0x61b MSR_DRAM_PERF_STATUS */ -#define RAPL_DRAM_POWER_INFO (1 << 5) - /* 0x61c MSR_DRAM_POWER_INFO */ - -#define RAPL_CORES_POWER_LIMIT (1 << 6) - /* 0x638 MSR_PP0_POWER_LIMIT */ -#define RAPL_CORE_POLICY (1 << 7) - /* 0x63a MSR_PP0_POLICY */ - -#define RAPL_GFX (1 << 8) - /* 0x640 MSR_PP1_POWER_LIMIT */ - /* 0x641 MSR_PP1_ENERGY_STATUS */ - /* 0x642 MSR_PP1_POLICY */ - -#define RAPL_CORES_ENERGY_STATUS (1 << 9) - /* 0x639 MSR_PP0_ENERGY_STATUS */ -#define RAPL_PER_CORE_ENERGY (1 << 10) - /* Indicates cores energy collection is per-core, - * not per-package. */ -#define RAPL_AMD_F17H (1 << 11) - /* 0xc0010299 MSR_RAPL_PWR_UNIT */ - /* 0xc001029a MSR_CORE_ENERGY_STAT */ - /* 0xc001029b MSR_PKG_ENERGY_STAT */ -#define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT) #define TJMAX_DEFAULT 100 /* MSRs that are not yet in the kernel-provided header. */ @@ -948,7 +946,7 @@ int idx_valid(int idx) case IDX_DRAM_ENERGY: return do_rapl & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORES_ENERGY_STATUS; + return do_rapl & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: return do_rapl & RAPL_GFX; case IDX_PKG_PERF: @@ -2710,7 +2708,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORES_ENERGY_STATUS) { + if (do_rapl & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; @@ -4810,7 +4808,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_HASWELL_G: /* HSW */ case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4830,9 +4828,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4846,7 +4842,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL; BIC_PRESENT(BIC_PKG__); if (rapl_joules) BIC_PRESENT(BIC_Pkg_J); @@ -4855,9 +4851,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS - | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4878,9 +4872,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = - RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4893,9 +4885,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = - RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | - RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4910,7 +4900,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORES; + do_rapl = RAPL_PKG | RAPL_CORE; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4920,9 +4910,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = - RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | - RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -5195,7 +5183,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORES_POWER_LIMIT) { + if (do_rapl & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", From a98f886035d5f7e0ec66036dd6bf98b40e75b692 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 26 Aug 2023 14:57:12 +0800 Subject: [PATCH 025/560] tools/power/turbostat: Simplify the logic for RAPL enumeration The support for each RAPL domains, as well as the support for the perf status of each RAPL domains, can be detected by checking the availabilities of the corresponding RAPL MSRs. Change the code accordingly and remove the hardcoded logic for each model. Note that this also fixes the INTEL_FAM6_ATOM_TREMONT model, which has RAPL_PKG_PERF_STATUS and MSR_DRAM_PERF_STATUS but doesn't have BIC_PKG__ and BIC_RAM__ set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 ++++++--------------------- 1 file changed, 22 insertions(+), 77 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4829e8289feb7..b2da36437b12c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4809,62 +4809,20 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_BROADWELL: /* BDW */ case INTEL_FAM6_BROADWELL_G: /* BDW */ do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ do_rapl = RAPL_PKG_ALL; - BIC_PRESENT(BIC_PKG__); - if (rapl_joules) - BIC_PRESENT(BIC_Pkg_J); - else - BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - BIC_PRESENT(BIC_GFX_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - BIC_PRESENT(BIC_GFXWatt); - } break; case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ @@ -4873,60 +4831,47 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - BIC_PRESENT(BIC_RAM_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - BIC_PRESENT(BIC_RAMWatt); - } break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ do_rapl = RAPL_PKG | RAPL_CORE; - if (rapl_joules) { - BIC_PRESENT(BIC_Pkg_J); - BIC_PRESENT(BIC_Cor_J); - } else { - BIC_PRESENT(BIC_PkgWatt); - BIC_PRESENT(BIC_CorWatt); - } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - BIC_PRESENT(BIC_PKG__); - BIC_PRESENT(BIC_RAM__); - if (rapl_joules) { + break; + default: + return; + } + + if (rapl_joules) { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - } else { + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFX_J); + } else { + if (do_rapl & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); + if (do_rapl & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); + if (do_rapl & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - } - break; - default: - return; + if (do_rapl & RAPL_GFX_ENERGY_STATUS) + BIC_PRESENT(BIC_GFXWatt); } + if (do_rapl & RAPL_PKG_PERF_STATUS) + BIC_PRESENT(BIC_PKG__); + if (do_rapl & RAPL_DRAM_PERF_STATUS) + BIC_PRESENT(BIC_RAM__); + /* units on package 0, verify later other packages match */ if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr)) return; From 86ba263d9b72b7766a99059e9b3bd104d089b7fa Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 28 Aug 2023 15:09:40 +0800 Subject: [PATCH 026/560] tools/power/turbostat: Abstract RAPL MSRs support Abstract the support for RAPL MSRs. Delete CPU model checks in rapl_probe_intel(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 189 ++++++++++++-------------- 1 file changed, 85 insertions(+), 104 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b2da36437b12c..90e1abe96dd56 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -288,6 +288,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ + int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -414,6 +415,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features snx_features = { @@ -423,6 +425,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features ivb_features = { @@ -433,6 +436,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features ivx_features = { @@ -442,6 +446,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hsw_features = { @@ -453,6 +458,7 @@ static const struct platform_features hsw_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hsx_features = { @@ -464,6 +470,7 @@ static const struct platform_features hsx_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features hswl_features = { @@ -475,6 +482,7 @@ static const struct platform_features hswl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features hswg_features = { @@ -486,6 +494,7 @@ static const struct platform_features hswg_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdw_features = { @@ -496,6 +505,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdwg_features = { @@ -506,6 +516,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; static const struct platform_features bdx_features = { @@ -517,6 +528,7 @@ static const struct platform_features bdx_features = { .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features skl_features = { @@ -529,6 +541,7 @@ static const struct platform_features skl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features cnl_features = { @@ -540,6 +553,7 @@ static const struct platform_features cnl_features = { .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features skx_features = { @@ -551,6 +565,7 @@ static const struct platform_features skx_features = { .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features icx_features = { @@ -561,6 +576,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features spr_features = { @@ -571,6 +587,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features slv_features = { @@ -578,6 +595,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features slvd_features = { @@ -586,6 +604,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_CORE, }; static const struct platform_features amt_features = { @@ -602,6 +621,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features gmtd_features = { @@ -611,6 +631,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; static const struct platform_features gmtp_features = { @@ -620,6 +641,7 @@ static const struct platform_features gmtp_features = { .crystal_freq = 19200000, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; static const struct platform_features tmt_features = { @@ -628,6 +650,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, }; static const struct platform_features tmtd_features = { @@ -636,6 +659,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL, }; static const struct platform_features knl_features = { @@ -645,6 +669,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; static const struct platform_features default_features = { @@ -653,6 +678,10 @@ static const struct platform_features default_features = { static const struct platform_features amd_features = { }; +static const struct platform_features amd_features_with_rapl = { + .rapl_msrs = RAPL_AMD_F17H, +}; + static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_NEHALEM, &nhm_features }, { INTEL_FAM6_NEHALEM_G, &nhm_features }, @@ -728,6 +757,17 @@ void probe_platform_features(unsigned int family, unsigned int model) if (authentic_amd || hygon_genuine) { platform = &amd_features; + + if (max_extended_level >= 0x80000007) { + unsigned int eax, ebx, ecx, edx; + + __cpuid(0x80000007, eax, ebx, ecx, edx); + /* RAPL (Fam 17h+) */ + if ((edx & (1 << 14)) && family >= 0x17) { + platform = &amd_features_with_rapl; + do_rapl = RAPL_PER_CORE_ENERGY; + } + } return; } @@ -882,7 +922,7 @@ off_t idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - if (do_rapl & RAPL_AMD_F17H) + if (platform->rapl_msrs & RAPL_AMD_F17H) offset = MSR_PKG_ENERGY_STAT; else offset = MSR_PKG_ENERGY_STATUS; @@ -942,17 +982,17 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return do_rapl & (RAPL_PKG | RAPL_AMD_F17H); + return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: - return do_rapl & RAPL_DRAM; + return platform->rapl_msrs & RAPL_DRAM; case IDX_PP0_ENERGY: - return do_rapl & RAPL_CORE_ENERGY_STATUS; + return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS; case IDX_PP1_ENERGY: - return do_rapl & RAPL_GFX; + return platform->rapl_msrs & RAPL_GFX; case IDX_PKG_PERF: - return do_rapl & RAPL_PKG_PERF_STATUS; + return platform->rapl_msrs & RAPL_PKG_PERF_STATUS; case IDX_DRAM_PERF: - return do_rapl & RAPL_DRAM_PERF_STATUS; + return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS; default: return 0; } @@ -1330,10 +1370,10 @@ void print_header(char *delim) if (DO_BIC(BIC_CORE_THROT_CNT)) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1392,7 +1432,7 @@ void print_header(char *delim) if (DO_BIC(BIC_SYS_LPI)) outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : "")); - if (do_rapl && !rapl_joules) { + if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -1405,7 +1445,7 @@ void print_header(char *delim) outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : "")); if (DO_BIC(BIC_RAM__)) outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : "")); - } else if (do_rapl && rapl_joules) { + } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : "")); if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -2638,7 +2678,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_CORE_THROT_CNT)) get_core_throt_cnt(cpu, &c->core_throt_cnt); - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) return -14; c->core_energy = msr & 0xFFFFFFFF; @@ -2703,37 +2743,37 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_SYS_LPI)) p->sys_lpi = cpuidle_cur_sys_lpi_us; - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STATUS, &msr)) return -13; p->energy_pkg = msr; } - if (do_rapl & RAPL_CORE_ENERGY_STATUS) { + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) { if (get_msr_sum(cpu, MSR_PP0_ENERGY_STATUS, &msr)) return -14; p->energy_cores = msr; } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr_sum(cpu, MSR_DRAM_ENERGY_STATUS, &msr)) return -15; p->energy_dram = msr; } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr_sum(cpu, MSR_PP1_ENERGY_STATUS, &msr)) return -16; p->energy_gfx = msr; } - if (do_rapl & RAPL_PKG_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) { if (get_msr_sum(cpu, MSR_PKG_PERF_STATUS, &msr)) return -16; p->rapl_pkg_perf_status = msr; } - if (do_rapl & RAPL_DRAM_PERF_STATUS) { + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) { if (get_msr_sum(cpu, MSR_DRAM_PERF_STATUS, &msr)) return -16; p->rapl_dram_perf_status = msr; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { if (get_msr_sum(cpu, MSR_PKG_ENERGY_STAT, &msr)) return -13; p->energy_pkg = msr; @@ -4750,7 +4790,7 @@ double get_tdp_intel(unsigned int model) { unsigned long long msr; - if (do_rapl & RAPL_PKG_POWER_INFO) + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; @@ -4791,85 +4831,35 @@ static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) } } -void rapl_probe_intel(unsigned int family, unsigned int model) +void rapl_probe_intel(unsigned int model) { unsigned long long msr; unsigned int time_unit; double tdp; - if (family != 6) - return; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - do_rapl = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - do_rapl = RAPL_PKG | RAPL_PKG_POWER_INFO; - break; - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - do_rapl = RAPL_PKG_ALL; - break; - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX; - break; - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL; - break; - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL; - break; - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - do_rapl = RAPL_PKG | RAPL_CORE; - break; - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS; - break; - default: - return; - } - if (rapl_joules) { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_Pkg_J); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_Cor_J); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAM_J); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFX_J); } else { - if (do_rapl & RAPL_PKG_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_PKG_ENERGY_STATUS) BIC_PRESENT(BIC_PkgWatt); - if (do_rapl & RAPL_CORE_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS) BIC_PRESENT(BIC_CorWatt); - if (do_rapl & RAPL_DRAM_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_ENERGY_STATUS) BIC_PRESENT(BIC_RAMWatt); - if (do_rapl & RAPL_GFX_ENERGY_STATUS) + if (platform->rapl_msrs & RAPL_GFX_ENERGY_STATUS) BIC_PRESENT(BIC_GFXWatt); } - if (do_rapl & RAPL_PKG_PERF_STATUS) + if (platform->rapl_msrs & RAPL_PKG_PERF_STATUS) BIC_PRESENT(BIC_PKG__); - if (do_rapl & RAPL_DRAM_PERF_STATUS) + if (platform->rapl_msrs & RAPL_DRAM_PERF_STATUS) BIC_PRESENT(BIC_RAM__); /* units on package 0, verify later other packages match */ @@ -4900,22 +4890,10 @@ void rapl_probe_intel(unsigned int family, unsigned int model) void rapl_probe_amd(unsigned int family, unsigned int model) { unsigned long long msr; - unsigned int eax, ebx, ecx, edx; - unsigned int has_rapl = 0; double tdp; UNUSED(model); - if (max_extended_level >= 0x80000007) { - __cpuid(0x80000007, eax, ebx, ecx, edx); - /* RAPL (Fam 17h+) */ - has_rapl = edx & (1 << 14); - } - - if (!has_rapl || family < 0x17) - return; - - do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4941,12 +4919,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) /* * rapl_probe() * - * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units + * sets rapl_power_units, rapl_energy_units, rapl_time_units */ void rapl_probe(unsigned int family, unsigned int model) { + if (!platform->rapl_msrs) + return; + if (genuine_intel) - rapl_probe_intel(family, model); + rapl_probe_intel(model); if (authentic_amd || hygon_genuine) rapl_probe_amd(family, model); } @@ -5040,7 +5021,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) UNUSED(c); UNUSED(p); - if (!do_rapl) + if (!platform->rapl_msrs) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ @@ -5053,7 +5034,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } - if (do_rapl & RAPL_AMD_F17H) { + if (platform->rapl_msrs & RAPL_AMD_F17H) { msr_name = "MSR_RAPL_PWR_UNIT"; if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr)) return -1; @@ -5066,7 +5047,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units); - if (do_rapl & RAPL_PKG_POWER_INFO) { + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) return -5; @@ -5079,7 +5060,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_PKG) { + if (platform->rapl_msrs & RAPL_PKG) { if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr)) return -9; @@ -5103,7 +5084,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN"); } - if (do_rapl & RAPL_DRAM_POWER_INFO) { + if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) return -6; @@ -5114,7 +5095,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); } - if (do_rapl & RAPL_DRAM) { + if (platform->rapl_msrs & RAPL_DRAM) { if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", @@ -5122,20 +5103,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) print_power_limit_msr(cpu, msr, "DRAM Limit"); } - if (do_rapl & RAPL_CORE_POLICY) { + if (platform->rapl_msrs & RAPL_CORE_POLICY) { if (get_msr(cpu, MSR_PP0_POLICY, &msr)) return -7; fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF); } - if (do_rapl & RAPL_CORE_POWER_LIMIT) { + if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) { if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } - if (do_rapl & RAPL_GFX) { + if (platform->rapl_msrs & RAPL_GFX) { if (get_msr(cpu, MSR_PP1_POLICY, &msr)) return -8; From e338831b14d2da921348c8c4055b9f2c94effe73 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:37:46 +0800 Subject: [PATCH 027/560] tools/power/turbostat: Abstract Per Core RAPL support Abstract the support for Per Core RAPL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 90e1abe96dd56..6314c40dc15b3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -245,7 +245,6 @@ double tsc_tweak = 1.0; unsigned int show_pkg_only; unsigned int show_core_only; char *output_buffer, *outp; -unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; @@ -289,6 +288,7 @@ struct platform_features { int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ + bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -378,7 +378,6 @@ enum rapl_msrs { RAPL_AMD_PWR_UNIT = BIT(14), /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */ RAPL_AMD_CORE_ENERGY_STAT = BIT(15), /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */ RAPL_AMD_PKG_ENERGY_STAT = BIT(16), /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */ - RAPL_PER_CORE_ENERGY = BIT(17), /* Indicates cores energy collection is per-core, not per-package. */ }; #define RAPL_PKG (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT) @@ -680,6 +679,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, + .has_per_core_rapl = 1, }; static const struct platform_data turbostat_pdata[] = { @@ -763,10 +763,8 @@ void probe_platform_features(unsigned int family, unsigned int model) __cpuid(0x80000007, eax, ebx, ecx, edx); /* RAPL (Fam 17h+) */ - if ((edx & (1 << 14)) && family >= 0x17) { + if ((edx & (1 << 14)) && family >= 0x17) platform = &amd_features_with_rapl; - do_rapl = RAPL_PER_CORE_ENERGY; - } } return; } @@ -1371,10 +1369,10 @@ void print_header(char *delim) outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : "")); if (platform->rapl_msrs && !rapl_joules) { - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); } else if (platform->rapl_msrs && rapl_joules) { - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); } @@ -1435,7 +1433,7 @@ void print_header(char *delim) if (platform->rapl_msrs && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_GFXWatt)) outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : "")); @@ -1448,7 +1446,7 @@ void print_header(char *delim) } else if (platform->rapl_msrs && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : "")); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : "")); @@ -1750,10 +1748,10 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data fmt8 = "%s%.2f"; - if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); - if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ @@ -1818,7 +1816,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) @@ -1830,7 +1828,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); - if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) + if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); From 6d35b8c4a661c849361239fe316035ea952606a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 11:59:04 +0800 Subject: [PATCH 028/560] tools/power/turbostat: Abstract RAPL divisor support INTEL_FAM6_ATOM_SILVERMONT model needs a divisor to convert the raw Energy Units value from MSR_RAPL_POWER_UNIT. Abstract the support for RAPL divisor. Delete CPU model check in rapl_probe_intel(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6314c40dc15b3..a317243356715 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -289,6 +289,7 @@ struct platform_features { int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ + bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -595,6 +596,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .has_rapl_divisor = 1, }; static const struct platform_features slvd_features = { @@ -4865,7 +4867,7 @@ void rapl_probe_intel(unsigned int model) return; rapl_power_units = 1.0 / (1 << (msr & 0xF)); - if (model == INTEL_FAM6_ATOM_SILVERMONT) + if (platform->has_rapl_divisor) rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000; else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); From 9e6f35159cdef148c711d2fb7d5fd2b2b6fb772d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 22 Apr 2023 12:21:58 +0800 Subject: [PATCH 029/560] tools/power/turbostat: Abstract fixed DRAM Energy unit support Abstract the support for fixed Dram domain energy unit. Delete rapl_dram_energy_units_probe() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 31 +++++++++------------------ 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a317243356715..a26ae5a2e2bd1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -290,6 +290,7 @@ struct platform_features { int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ + bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -471,6 +472,7 @@ static const struct platform_features hsx_features = { .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features hswl_features = { @@ -529,6 +531,7 @@ static const struct platform_features bdx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features skl_features = { @@ -566,6 +569,7 @@ static const struct platform_features skx_features = { .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features icx_features = { @@ -577,6 +581,7 @@ static const struct platform_features icx_features = { .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features spr_features = { @@ -671,6 +676,7 @@ static const struct platform_features knl_features = { .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, + .has_fixed_rapl_unit = 1, }; static const struct platform_features default_features = { @@ -4811,26 +4817,6 @@ double get_tdp_amd(unsigned int family) return 280.0; } -/* - * rapl_dram_energy_units_probe() - * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. - */ -static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) -{ - /* only called for genuine_intel, family 6 */ - - switch (model) { - case INTEL_FAM6_HASWELL_X: /* HSX */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - return (rapl_dram_energy_units = 15.3 / 1000000); - default: - return (rapl_energy_units); - } -} - void rapl_probe_intel(unsigned int model) { unsigned long long msr; @@ -4872,7 +4858,10 @@ void rapl_probe_intel(unsigned int model) else rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F)); - rapl_dram_energy_units = rapl_dram_energy_units_probe(model, rapl_energy_units); + if (platform->has_fixed_rapl_unit) + rapl_dram_energy_units = (15.3 / 1000000); + else + rapl_dram_energy_units = rapl_energy_units; time_unit = msr >> 16 & 0xF; if (time_unit == 0) From 7c60409382a4be05d601e0b45db7b0166845b0cf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:21:10 +0800 Subject: [PATCH 030/560] tools/power/turbostat: Abstract hardcoded TDP value Different hardcoded TDP values are used when TDP can not be retrieved from the hardware. Abstract hardcoded TDP value. Delete CPU model checks in get_tdp_intel(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a26ae5a2e2bd1..45698c3a9e72a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -291,6 +291,7 @@ struct platform_features { bool has_per_core_rapl; /* Indicates cores energy collection is per-core, not per-package. AMD specific for now */ bool has_rapl_divisor; /* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */ bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ + int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ }; @@ -602,6 +603,7 @@ static const struct platform_features slv_features = { .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, + .rapl_quirk_tdp = 30, }; static const struct platform_features slvd_features = { @@ -611,6 +613,7 @@ static const struct platform_features slvd_features = { .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, + .rapl_quirk_tdp = 30, }; static const struct platform_features amt_features = { @@ -688,6 +691,7 @@ static const struct platform_features amd_features = { static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, + .rapl_quirk_tdp = 280, /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ }; static const struct platform_data turbostat_pdata[] = { @@ -4792,29 +4796,31 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +double get_quirk_tdp(void) +{ + if (platform->rapl_quirk_tdp) + return platform->rapl_quirk_tdp; + + return 135.0; +} + double get_tdp_intel(unsigned int model) { unsigned long long msr; + UNUSED(model); + if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_D: - return 30.0; - default: - return 135.0; - } + return get_quirk_tdp(); } double get_tdp_amd(unsigned int family) { UNUSED(family); - /* This is the max stock TDP of HEDT/Server Fam17h+ chips */ - return 280.0; + return get_quirk_tdp(); } void rapl_probe_intel(unsigned int model) From bf1ad57c3f92c551dbfdcecd49797253f55cb7c1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:28:04 +0800 Subject: [PATCH 031/560] tools/power/turbostat: Remove unused family/model parameters for RAPL functions RAPL probing can be done without family/model checking. Remove these parameters in rapl probe functions. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 45698c3a9e72a..44d7321b004e3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4804,26 +4804,22 @@ double get_quirk_tdp(void) return 135.0; } -double get_tdp_intel(unsigned int model) +double get_tdp_intel(void) { unsigned long long msr; - UNUSED(model); - if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr)) return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units; return get_quirk_tdp(); } -double get_tdp_amd(unsigned int family) +double get_tdp_amd(void) { - UNUSED(family); - return get_quirk_tdp(); } -void rapl_probe_intel(unsigned int model) +void rapl_probe_intel(void) { unsigned long long msr; unsigned int time_unit; @@ -4875,20 +4871,18 @@ void rapl_probe_intel(unsigned int model) rapl_time_units = 1.0 / (1 << (time_unit)); - tdp = get_tdp_intel(model); + tdp = get_tdp_intel(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -void rapl_probe_amd(unsigned int family, unsigned int model) +void rapl_probe_amd(void) { unsigned long long msr; double tdp; - UNUSED(model); - if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4904,7 +4898,7 @@ void rapl_probe_amd(unsigned int family, unsigned int model) rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f)); rapl_power_units = ldexp(1.0, -(msr & 0xf)); - tdp = get_tdp_amd(family); + tdp = get_tdp_amd(); rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) @@ -4916,15 +4910,15 @@ void rapl_probe_amd(unsigned int family, unsigned int model) * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(unsigned int family, unsigned int model) +void rapl_probe(void) { if (!platform->rapl_msrs) return; if (genuine_intel) - rapl_probe_intel(model); + rapl_probe_intel(); if (authentic_amd || hygon_genuine) - rapl_probe_amd(family, model); + rapl_probe_amd(); } void prewake_cstate_probe(unsigned int family, unsigned int model) @@ -5828,7 +5822,7 @@ void process_cpuid() if (!quiet && has_slv_msrs(family, model)) decode_c6_demotion_policy_msr(); - rapl_probe(family, model); + rapl_probe(); prewake_cstate_probe(family, model); if (!quiet) From 485a017c45200ed82518f6cdeea554f77e9a0562 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 10:37:23 +0800 Subject: [PATCH 032/560] tools/power/turbostat: Abstract TSC tweak support On some models, the CPU base frequency is different from the TSC frequency, and the aperf/mperf counters are running at CPU base frequency instead of TSC frequency. Abstract support for TSC tweak. Given that tsc_tweak depends on base_hz, move the code to probe_bclk() after base_hz is available. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44d7321b004e3..05385fabc83a7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -293,6 +293,7 @@ struct platform_features { bool has_fixed_rapl_unit; /* Fixed Energy Unit used for DRAM RAPL Domain */ int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ + bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ }; struct platform_data { @@ -546,6 +547,7 @@ static const struct platform_features skl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features cnl_features = { @@ -558,6 +560,7 @@ static const struct platform_features cnl_features = { .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features skx_features = { @@ -660,6 +663,7 @@ static const struct platform_features tmt_features = { .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, }; static const struct platform_features tmtd_features = { @@ -2934,11 +2938,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -static void calculate_tsc_tweak() -{ - tsc_tweak = base_hz / tsc_hz; -} - void prewake_cstate_probe(unsigned int family, unsigned int model); static void dump_platform_info(void) @@ -4198,6 +4197,9 @@ void probe_bclk(void) base_hz = base_ratio * bclk * 1000000; has_base_hz = 1; + + if (platform->enable_tsc_tweak) + tsc_tweak = base_hz / tsc_hz; } /* @@ -5836,9 +5838,6 @@ void process_cpuid() if (!quiet) dump_sysfs_pstate_config(); - if (has_skl_msrs(family, model) || is_ehl(family, model)) - calculate_tsc_tweak(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); From 3c6a17b8ae44b0116e303402803c173fe2a3da92 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 22:37:37 +0800 Subject: [PATCH 033/560] tools/power/turbostat: Add skeleton support for cstate enumeration Add skeleton support for cstate enumeration. Note that the previous logic may override the cstate setting for multiple times for different reasons. The conversion to new cstate enumeration must be done step by step following the previous code order strictly. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 57 ++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 05385fabc83a7..6a49eb941fe0c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -283,6 +283,7 @@ struct platform_features { bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ + int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ @@ -396,6 +397,21 @@ enum rapl_msrs { #define RAPL_AMD_F17H (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT) +/* For Cstates */ +enum cstates { + CC1 = BIT(0), + CC3 = BIT(1), + CC6 = BIT(2), + CC7 = BIT(3), + PC2 = BIT(4), + PC3 = BIT(5), + PC6 = BIT(6), + PC7 = BIT(7), + PC8 = BIT(8), + PC9 = BIT(9), + PC10 = BIT(10), +}; + static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, @@ -5560,6 +5576,44 @@ void linux_perf_init(void) BIC_PRESENT(BIC_IPC); } +void probe_cstates(void) +{ + probe_cst_limit(); + + if (platform->supported_cstates & CC1) + BIC_PRESENT(BIC_CPU_c1); + + if (platform->supported_cstates & CC3) + BIC_PRESENT(BIC_CPU_c3); + + if (platform->supported_cstates & CC6) + BIC_PRESENT(BIC_CPU_c6); + + if (platform->supported_cstates & CC7) + BIC_PRESENT(BIC_CPU_c7); + + if (platform->supported_cstates & PC2 && (pkg_cstate_limit >= PCL__2)) + BIC_PRESENT(BIC_Pkgpc2); + + if (platform->supported_cstates & PC3 && (pkg_cstate_limit >= PCL__3)) + BIC_PRESENT(BIC_Pkgpc3); + + if (platform->supported_cstates & PC6 && (pkg_cstate_limit >= PCL__6)) + BIC_PRESENT(BIC_Pkgpc6); + + if (platform->supported_cstates & PC7 && (pkg_cstate_limit >= PCL__7)) + BIC_PRESENT(BIC_Pkgpc7); + + if (platform->supported_cstates & PC8 && (pkg_cstate_limit >= PCL__8)) + BIC_PRESENT(BIC_Pkgpc8); + + if (platform->supported_cstates & PC9 && (pkg_cstate_limit >= PCL__9)) + BIC_PRESENT(BIC_Pkgpc9); + + if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) + BIC_PRESENT(BIC_Pkgpc10); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5741,7 +5795,8 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); - probe_cst_limit(); + probe_cstates(); + if (platform->has_nhm_msrs) { BIC_PRESENT(BIC_CPU_c1); BIC_PRESENT(BIC_CPU_c3); From ce7ddf8af2f96a8a7af4cc2273843518c5810166 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:27 +0800 Subject: [PATCH 034/560] tools/power/turbostat: Adjust cstate for models with .has_nhm_msrs set Enable CC1/CC3/CC6 for platforms with .has_nhm_msrs set. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6a49eb941fe0c..c7345e0c5185e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,6 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -424,6 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_NHM, }; @@ -432,6 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -442,6 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -453,6 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -463,6 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -474,6 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -486,6 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -499,6 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -511,6 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -523,6 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -534,6 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,6 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -559,6 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -572,6 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,6 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -598,6 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -610,6 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -618,6 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -629,6 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -638,6 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -647,6 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -657,6 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -667,6 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -676,6 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -686,6 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -696,6 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC3 | CC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5797,12 +5824,8 @@ void process_cpuid() probe_cstates(); - if (platform->has_nhm_msrs) { - BIC_PRESENT(BIC_CPU_c1); - BIC_PRESENT(BIC_CPU_c3); - BIC_PRESENT(BIC_CPU_c6); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - } probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); From 942c854d8d0f6c6fc0864a9da5f5e374a8e146e5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:16:56 +0800 Subject: [PATCH 035/560] tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable CC7 and PC2 for has_snb_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 ++++++++++++--------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c7345e0c5185e..174a8d0750da5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5829,12 +5829,7 @@ void process_cpuid() probe_bclk(); do_snb_cstates = has_snb_msrs(family, model); - if (do_snb_cstates) - BIC_PRESENT(BIC_CPU_c7); - do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__2)) - BIC_PRESENT(BIC_Pkgpc2); if (pkg_cstate_limit >= PCL__3) BIC_PRESENT(BIC_Pkgpc3); if (pkg_cstate_limit >= PCL__6) From 6f1935c036f79b56b6a1dc6e51c8c6fe483983ec Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:23 +0800 Subject: [PATCH 036/560] tools/power/turbostat: Adjust cstate for models with .cst_limit set Enable PC3/PC6 for platforms with .cst_limit set because package cstates are guarded by pkg_cstate_limit. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++-------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 174a8d0750da5..2bfbf4ccf5acf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -416,7 +416,7 @@ static const struct platform_features nhm_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, .trl_msrs = TRL_BASE, }; @@ -425,7 +425,7 @@ static const struct platform_features nhx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_NHM, }; @@ -434,7 +434,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +445,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +457,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +468,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +480,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +493,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +507,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +520,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +533,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +545,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +557,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +572,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +586,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +600,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +614,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +627,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -636,7 +636,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -648,7 +648,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -658,7 +658,7 @@ static const struct platform_features slvd_features = { static const struct platform_features amt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_133MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_AMT, .trl_msrs = TRL_BASE, }; @@ -668,7 +668,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +679,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +690,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +700,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +711,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -722,7 +722,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6, + .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -5830,10 +5830,6 @@ void process_cpuid() do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (pkg_cstate_limit >= PCL__3) - BIC_PRESENT(BIC_Pkgpc3); - if (pkg_cstate_limit >= PCL__6) - BIC_PRESENT(BIC_Pkgpc6); if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { From 192cbf0468ae31062526287e257f5b56214d2da5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:17:58 +0800 Subject: [PATCH 037/560] tools/power/turbostat: Adjust cstate for has_snb_msrs() models Enable PC7 for has_snb_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 46 ++++++++++++--------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2bfbf4ccf5acf..f3d44e81d7d51 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_snb_cstates; unsigned int do_knl_cstates; unsigned int do_slm_cstates; unsigned int use_c1_residency_msr; @@ -434,7 +433,7 @@ static const struct platform_features snb_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -445,7 +444,7 @@ static const struct platform_features snx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -457,7 +456,7 @@ static const struct platform_features ivb_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -468,7 +467,7 @@ static const struct platform_features ivx_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, @@ -480,7 +479,7 @@ static const struct platform_features hsw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -493,7 +492,7 @@ static const struct platform_features hsx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, @@ -507,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -520,7 +519,7 @@ static const struct platform_features hswg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -533,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -545,7 +544,7 @@ static const struct platform_features bdwg_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -557,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -572,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -586,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -600,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -614,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -627,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -668,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -679,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -690,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -700,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -711,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -5827,11 +5826,8 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); probe_bclk(); - do_snb_cstates = has_snb_msrs(family, model); do_irtl_snb = has_snb_msrs(family, model); - if (do_snb_cstates && (pkg_cstate_limit >= PCL__7)) - BIC_PRESENT(BIC_Pkgpc7); if (has_slv_msrs(family, model)) { BIC_NOT_PRESENT(BIC_Pkgpc2); BIC_NOT_PRESENT(BIC_Pkgpc3); From ff206149551f09117f44883650a45ae692745703 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:07:08 +0800 Subject: [PATCH 038/560] tools/power/turbostat: Adjust cstate for has_slv_msrs() models Disable PC2/PC3/PC7 and enable PC6 for has_slv_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3d44e81d7d51..972f5a9b14e69 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC3 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -5829,10 +5829,6 @@ void process_cpuid() do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } From 3d982ac0dafeab860b9d42f5cc41a78753275fdd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:09:20 +0800 Subject: [PATCH 039/560] tools/power/turbostat: Adjust cstate for is_jvl() models Disable CC3/CC7/PC2/PC3/PC6/PC7 for is_jvl() models. Delete is_jvl() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 972f5a9b14e69..e95972edde3c1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -710,7 +710,7 @@ static const struct platform_features tmtd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, @@ -4361,21 +4361,6 @@ int is_ehl(unsigned int family, unsigned int model) return 0; } -int is_jvl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT_D: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5832,14 +5817,6 @@ void process_cpuid() BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; } - if (is_jvl(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc2); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_Pkgpc6); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_dnv(family, model)) { BIC_PRESENT(BIC_CPU_c1); BIC_NOT_PRESENT(BIC_CPU_c3); From 8e20ced057423f0edaf40b650facc221e8030b33 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:12:29 +0800 Subject: [PATCH 040/560] tools/power/turbostat: Adjust cstate for is_dnv() models Enable CC1 and disable CC3/CC7/PC3/PC7 for is_dnv() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e95972edde3c1..069704ef3c803 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -678,7 +678,7 @@ static const struct platform_features gmtd_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 25000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, @@ -5818,11 +5818,6 @@ void process_cpuid() use_c1_residency_msr = 1; } if (is_dnv(family, model)) { - BIC_PRESENT(BIC_CPU_c1); - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { From 24d16bec379db6eea2d72e18c97c6c80e486a5e1 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:27 +0800 Subject: [PATCH 041/560] tools/power/turbostat: Adjust cstate for is_skx()/is_icx()/is_spr() models Disable CC3/CC7/PC3/PC7 for is_skx()/is_icx()/is_spr() models. Delete is_skx() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 28 +++------------------------ 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 069704ef3c803..262af40fe35d1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -599,7 +599,7 @@ static const struct platform_features skx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, @@ -613,7 +613,7 @@ static const struct platform_features icx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -626,7 +626,7 @@ static const struct platform_features spr_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4298,22 +4298,6 @@ int is_bdx(unsigned int family, unsigned int model) return 0; } -int is_skx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5820,12 +5804,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_skx(family, model) || is_icx(family, model) || is_spr(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c3); - BIC_NOT_PRESENT(BIC_Pkgpc3); - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (is_bdx(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c7); BIC_NOT_PRESENT(BIC_Pkgpc7); From 1109694817fb4fcdfccb1a86fd58e69fb60f4eab Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:16:52 +0800 Subject: [PATCH 042/560] tools/power/turbostat: Adjust cstate for is_bdx() models Disable CC7/PC7 for is_bdx() models. Delete is_bdx() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 262af40fe35d1..27ca29f0545ad 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -556,7 +556,7 @@ static const struct platform_features bdx_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, @@ -4282,22 +4282,6 @@ int is_dnv(unsigned int family, unsigned int model) return 0; } -int is_bdx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_BROADWELL_X: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5804,10 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (is_bdx(family, model)) { - BIC_NOT_PRESENT(BIC_CPU_c7); - BIC_NOT_PRESENT(BIC_Pkgpc7); - } if (has_c8910_msrs(family, model)) { if (pkg_cstate_limit >= PCL__8) BIC_PRESENT(BIC_Pkgpc8); From 4d2c95d40a90877ffd8f961055419f1f550a7ed9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:20:02 +0800 Subject: [PATCH 043/560] tools/power/turbostat: Adjust cstate for has_c8910_msrs() models Enable PC8/PC9/PC10 for has_c8910_msrs() models. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 27ca29f0545ad..b0bc973c077de 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -506,7 +506,7 @@ static const struct platform_features hswl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, @@ -532,7 +532,7 @@ static const struct platform_features bdw_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -571,7 +571,7 @@ static const struct platform_features skl_features = { .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 24000000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -667,7 +667,7 @@ static const struct platform_features gmt_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -689,7 +689,7 @@ static const struct platform_features gmtp_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, .crystal_freq = 19200000, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, + .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5788,14 +5788,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - if (has_c8910_msrs(family, model)) { - if (pkg_cstate_limit >= PCL__8) - BIC_PRESENT(BIC_Pkgpc8); - if (pkg_cstate_limit >= PCL__9) - BIC_PRESENT(BIC_Pkgpc9); - if (pkg_cstate_limit >= PCL_10) - BIC_PRESENT(BIC_Pkgpc10); - } do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); From cd7a2b6a61100a0fe8a40916d1afbd72d8833d57 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:24:19 +0800 Subject: [PATCH 044/560] tools/power/turbostat: Adjust cstate for is_slm()/is_knl()/is_cnl()/is_ehl() models Disable CC3 for is_slm()/is_knl()/is_cnl()/is_ehl() models. Delete is_cnl()/is_ehl() CPU model checks. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 +++------------------------ 1 file changed, 5 insertions(+), 39 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b0bc973c077de..24f470883ca24 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -585,7 +585,7 @@ static const struct platform_features cnl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, @@ -635,7 +635,7 @@ static const struct platform_features spr_features = { static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC6, + .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -647,7 +647,7 @@ static const struct platform_features slvd_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, @@ -699,7 +699,7 @@ static const struct platform_features tmt_features = { .has_msr_misc_pwr_mgmt = 1, .has_nhm_msrs = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -721,7 +721,7 @@ static const struct platform_features knl_features = { .has_nhm_msrs = 1, .has_config_tdp = 1, .bclk_freq = BCLK_100MHZ, - .supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6, + .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -4314,21 +4314,6 @@ int is_spr(unsigned int family, unsigned int model) return 0; } -int is_ehl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_TREMONT: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -5248,22 +5233,6 @@ int is_knl(unsigned int family, unsigned int model) return 0; } -int is_cnl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - - return 0; -} - unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) { if (is_knl(family, model)) @@ -5798,9 +5767,6 @@ void process_cpuid() do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model)) - BIC_NOT_PRESENT(BIC_CPU_c3); - if (!quiet) decode_misc_pwr_mgmt_msr(); From 8c382f9e74663072805565036026d4e79de96425 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:27:26 +0800 Subject: [PATCH 045/560] tools/power/turbostat: Use fine grained IRTL output It is pointless to dump the IRTL register for a package cstate that is not supported by the platform. Print IRTL only for states that are available in platform->supported_cstates. Delete has_c8910_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 99 ++++++++++----------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 24f470883ca24..680373010b01e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -229,7 +229,6 @@ unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; unsigned int do_irtl_snb; -unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -3269,39 +3268,47 @@ void print_irtl(void) { unsigned long long msr; - get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); - - get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC3) { + get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - if (!do_irtl_hsw) - return; + if (platform->supported_cstates & PC6) { + get_msr(base_cpu, MSR_PKGC6_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC7) { + get_msr(base_cpu, MSR_PKGC7_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC8) { + get_msr(base_cpu, MSR_PKGC8_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } - get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); - fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); - fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", - (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + if (platform->supported_cstates & PC9) { + get_msr(base_cpu, MSR_PKGC9_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } + if (platform->supported_cstates & PC10) { + get_msr(base_cpu, MSR_PKGC10_IRTL, &msr); + fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr); + fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT", + (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); + } } void free_fd_percpu(void) @@ -5145,39 +5152,6 @@ int has_snb_msrs(unsigned int family, unsigned int model) return 0; } -/* - * HSW ULT added support for C8/C9/C10 MSRs: - * - * MSR_PKG_C8_RESIDENCY 0x00000630 - * MSR_PKG_C9_RESIDENCY 0x00000631 - * MSR_PKG_C10_RESIDENCY 0x00000632 - * - * MSR_PKGC8_IRTL 0x00000633 - * MSR_PKGC9_IRTL 0x00000634 - * MSR_PKGC10_IRTL 0x00000635 - * - */ -int has_c8910_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5757,7 +5731,6 @@ void process_cpuid() if (is_dnv(family, model)) { use_c1_residency_msr = 1; } - do_irtl_hsw = has_c8910_msrs(family, model); if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); From 148df4fd04a98fb24198ecb4419c87e07d38af30 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Sep 2023 23:18:51 +0800 Subject: [PATCH 046/560] tools/power/turbostat: Abstract IRTL support Abstract the support for MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL. Delete has_snb_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 72 ++++++++++----------------- 1 file changed, 26 insertions(+), 46 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 680373010b01e..44be06b763b28 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -228,7 +228,6 @@ unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; -unsigned int do_irtl_snb; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; @@ -284,6 +283,7 @@ struct platform_features { int supported_cstates; /* Core cstates and Package cstates supported */ int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ + bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -434,6 +434,7 @@ static const struct platform_features snb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -445,6 +446,7 @@ static const struct platform_features snx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -457,6 +459,7 @@ static const struct platform_features ivb_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -468,6 +471,7 @@ static const struct platform_features ivx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_SNB, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL, }; @@ -480,6 +484,7 @@ static const struct platform_features hsw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -493,6 +498,7 @@ static const struct platform_features hsx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2, .plr_msrs = PLR_CORE | PLR_RING, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -507,6 +513,7 @@ static const struct platform_features hswl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -520,6 +527,7 @@ static const struct platform_features hswg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .plr_msrs = PLR_CORE | PLR_GFX | PLR_RING, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, @@ -533,6 +541,7 @@ static const struct platform_features bdw_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -545,6 +554,7 @@ static const struct platform_features bdwg_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO, }; @@ -557,6 +567,7 @@ static const struct platform_features bdx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -572,6 +583,7 @@ static const struct platform_features skl_features = { .crystal_freq = 24000000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -586,6 +598,7 @@ static const struct platform_features cnl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -600,6 +613,7 @@ static const struct platform_features skx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .has_cst_auto_convension = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, @@ -614,6 +628,7 @@ static const struct platform_features icx_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -627,6 +642,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -668,6 +684,7 @@ static const struct platform_features gmt_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -679,6 +696,7 @@ static const struct platform_features gmtd_features = { .crystal_freq = 25000000, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -690,6 +708,7 @@ static const struct platform_features gmtp_features = { .crystal_freq = 19200000, .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO, }; @@ -700,6 +719,7 @@ static const struct platform_features tmt_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, .enable_tsc_tweak = 1, @@ -711,6 +731,7 @@ static const struct platform_features tmtd_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6, .cst_limit = CST_LIMIT_GMT, + .has_irtl_msrs = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL, }; @@ -3268,6 +3289,9 @@ void print_irtl(void) { unsigned long long msr; + if (!platform->has_irtl_msrs) + return; + if (platform->supported_cstates & PC3) { get_msr(base_cpu, MSR_PKGC3_IRTL, &msr); fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr); @@ -5109,49 +5133,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SNB adds support for additional MSRs: - * - * MSR_PKG_C7_RESIDENCY 0x000003fa - * MSR_CORE_C7_RESIDENCY 0x000003fe - * MSR_PKG_C2_RESIDENCY 0x0000060d - */ - -int has_snb_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_SAPPHIRERAPIDS_X: /* SPR */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ - return 1; - } - return 0; -} - /* * SKL adds support for additional MSRS: * @@ -5723,7 +5704,6 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - do_irtl_snb = has_snb_msrs(family, model); if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); use_c1_residency_msr = 1; @@ -6098,7 +6078,7 @@ void turbostat_init() if (!quiet) for_all_cpus(print_thermal, ODD_COUNTERS); - if (!quiet && do_irtl_snb) + if (!quiet) print_irtl(); if (DO_BIC(BIC_IPC)) From 76d83d2ae8e3099d9a6bd67fba918108824d7d4d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:41:30 +0800 Subject: [PATCH 047/560] tools/power/turbostat: Abstract MSR_CORE_C1_RES support Abstract the support for MSR_CORE_C1_RES. Delete is_dnv() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 30 ++++++--------------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 44be06b763b28..de9260c966784 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -223,7 +223,6 @@ unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; unsigned int do_slm_cstates; -unsigned int use_c1_residency_msr; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -284,6 +283,7 @@ struct platform_features { int cst_limit; /* MSR_PKG_CST_CONFIG_CONTROL */ bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ + bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -652,6 +652,7 @@ static const struct platform_features slv_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -697,6 +698,7 @@ static const struct platform_features gmtd_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_GMT, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS, }; @@ -2069,7 +2071,7 @@ void delta_core(struct core_data *new, struct core_data *old) int soft_c1_residency_display(int bic) { - if (!DO_BIC(BIC_CPU_c1) || use_c1_residency_msr) + if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res) return 0; return DO_BIC_READ(bic); @@ -2118,7 +2120,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d } } - if (use_c1_residency_msr) { + if (platform->has_msr_core_c1_res) { /* * Some models have a dedicated C1 residency MSR, * which should be more accurate than the derivation below. @@ -2700,7 +2702,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -5; t->smi_count = msr & 0xFFFFFFFF; } - if (DO_BIC(BIC_CPU_c1) && use_c1_residency_msr) { + if (DO_BIC(BIC_CPU_c1) && platform->has_msr_core_c1_res) { if (get_msr(cpu, MSR_CORE_C1_RES, &t->c1)) return -6; } @@ -4297,22 +4299,6 @@ int has_slv_msrs(unsigned int family, unsigned int model) return 0; } -int is_dnv(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_GOLDMONT_D: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5706,10 +5692,6 @@ void process_cpuid() if (has_slv_msrs(family, model)) { BIC_PRESENT(BIC_Mod_c6); - use_c1_residency_msr = 1; - } - if (is_dnv(family, model)) { - use_c1_residency_msr = 1; } if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); From 9cc1c1038526a5b6c9a57397def80ba79c260ff2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:47:30 +0800 Subject: [PATCH 048/560] tools/power/turbostat: Abstract MSR_MODULE_C6_RES_MS support Abstract MSR_MODULE_C6_RES_MS support. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index de9260c966784..5f90d96bd9e3e 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -284,6 +284,7 @@ struct platform_features { bool has_cst_auto_convension; /* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */ bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ + bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -653,6 +654,7 @@ static const struct platform_features slv_features = { .supported_cstates = CC1 | CC6 | PC6, .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -5690,9 +5692,9 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (has_slv_msrs(family, model)) { + if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - } + if (has_skl_msrs(family, model)) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); From 6c36882e09dbc9a44d64180ba7972838d3f45488 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 14:52:24 +0800 Subject: [PATCH 049/560] tools/power/turbostat: Abstract MSR_CC6/MC6_DEMOTION_POLICY_CONFIG support Abstract the support for MSR_CC6/MC6_DEMOTION_POLICY_CONFIG. Delete has_slv_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++++--------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f90d96bd9e3e..f8d7ba87f9689 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -285,6 +285,7 @@ struct platform_features { bool has_irtl_msrs; /* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */ bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ + bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -655,6 +656,7 @@ static const struct platform_features slv_features = { .cst_limit = CST_LIMIT_SLV, .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, + .has_msr_c6_demotion_policy_config = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -4279,28 +4281,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -/* - * SLV client has support for unique MSRs: - * - * MSR_CC6_DEMOTION_POLICY_CONFIG - * MSR_MC6_DEMOTION_POLICY_CONFIG - */ - -int has_slv_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: - return 1; - } - return 0; -} - int is_icx(unsigned int family, unsigned int model) { @@ -5358,6 +5338,9 @@ void decode_c6_demotion_policy_msr(void) { unsigned long long msr; + if (!platform->has_msr_c6_demotion_policy_config) + return; + if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr)) fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n", base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); @@ -5707,7 +5690,7 @@ void process_cpuid() if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet && has_slv_msrs(family, model)) + if (!quiet) decode_c6_demotion_policy_msr(); rapl_probe(); From c8202a6c3acf7bbde42b5e389eec40fd8e1b8358 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:26:14 +0800 Subject: [PATCH 050/560] tools/power/turbostat: Abstract MSR_ATOM_PKG_C6_RESIDENCY support Abstract the support for MSR_ATOM_PKG_C6_RESIDENCY. Delete is_slm() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f8d7ba87f9689..a04861846d33d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -222,7 +222,6 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int do_knl_cstates; -unsigned int do_slm_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_core_c1_res; /* MSR_CORE_C1_RES */ bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ + bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -657,6 +657,7 @@ static const struct platform_features slv_features = { .has_msr_core_c1_res = 1, .has_msr_module_c6_res_ms = 1, .has_msr_c6_demotion_policy_config = 1, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_ATOM, .rapl_msrs = RAPL_PKG | RAPL_CORE, .has_rapl_divisor = 1, @@ -669,6 +670,7 @@ static const struct platform_features slvd_features = { .bclk_freq = BCLK_SLV, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_SLV, + .has_msr_atom_pkg_c6_residency = 1, .trl_msrs = TRL_BASE, .rapl_msrs = RAPL_PKG | RAPL_CORE, .rapl_quirk_tdp = 30, @@ -2795,7 +2797,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3)) return -9; if (DO_BIC(BIC_Pkgpc6)) { - if (do_slm_cstates) { + if (platform->has_msr_atom_pkg_c6_residency) { if (get_msr(cpu, MSR_ATOM_PKG_C6_RESIDENCY, &p->pc6)) return -10; } else { @@ -5125,22 +5127,6 @@ int has_skl_msrs(unsigned int family, unsigned int model) return 0; } -int is_slm(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ - case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5684,7 +5670,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_slm_cstates = is_slm(family, model); do_knl_cstates = is_knl(family, model); if (!quiet) From 80d132cb45f2cc171395bfaacd74567a183ab160 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:00:58 +0800 Subject: [PATCH 051/560] tools/power/turbostat: Abstract MSR_KNL_CORE_C6_RESIDENCY support Abstract the support for MSR_KNL_CORE_C6_RESIDENCY. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a04861846d33d..f0a99e092fa78 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -221,7 +221,6 @@ unsigned int rapl_joules; unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; -unsigned int do_knl_cstates; unsigned int has_aperf; unsigned int has_epb; unsigned int has_turbo; @@ -286,6 +285,7 @@ struct platform_features { bool has_msr_module_c6_res_ms; /* MSR_MODULE_C6_RES_MS */ bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ + bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -751,6 +751,7 @@ static const struct platform_features knl_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC3 | PC6, .cst_limit = CST_LIMIT_KNL, + .has_msr_knl_core_c6_residency = 1, .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -2727,10 +2728,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -6; } - if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !do_knl_cstates) { + if ((DO_BIC(BIC_CPU_c6) || soft_c1_residency_display(BIC_CPU_c6)) && !platform->has_msr_knl_core_c6_residency) { if (get_msr(cpu, MSR_CORE_C6_RESIDENCY, &c->c6)) return -7; - } else if (do_knl_cstates && soft_c1_residency_display(BIC_CPU_c6)) { + } else if (platform->has_msr_knl_core_c6_residency && soft_c1_residency_display(BIC_CPU_c6)) { if (get_msr(cpu, MSR_KNL_CORE_C6_RESIDENCY, &c->c6)) return -7; } @@ -5670,7 +5671,6 @@ void process_cpuid() BIC_PRESENT(BIC_GFX_c0); BIC_PRESENT(BIC_CPUGFX); } - do_knl_cstates = is_knl(family, model); if (!quiet) decode_misc_pwr_mgmt_msr(); From 58ddb691d8d8a281535406766da4e23e0f011126 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:07:43 +0800 Subject: [PATCH 052/560] tools/power/turbostat: Abstract extended cstate MSRs support Abstract the support for MSR_PKG_WEIGHTED_CORE_C0_RES, MSR_PKG_ANY_CORE_C0_RES, MSR_PKG_ANY_GFXE_C0_RES and MSR_PKG_BOTH_CORE_GFXE_C0_RES. Delete has_skl_msrs() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 ++++----------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f0a99e092fa78..613b284f2f090 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -286,6 +286,7 @@ struct platform_features { bool has_msr_c6_demotion_policy_config; /* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */ bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ + bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -586,6 +587,7 @@ static const struct platform_features skl_features = { .supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -601,6 +603,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, @@ -5104,30 +5107,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -/* - * SKL adds support for additional MSRS: - * - * MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 - * MSR_PKG_ANY_CORE_C0_RES 0x00000659 - * MSR_PKG_ANY_GFXE_C0_RES 0x0000065A - * MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B - */ -int has_skl_msrs(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - return 1; - } - return 0; -} - int is_knl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -5665,7 +5644,7 @@ void process_cpuid() if (platform->has_msr_module_c6_res_ms) BIC_PRESENT(BIC_Mod_c6); - if (has_skl_msrs(family, model)) { + if (platform->has_ext_cst_msrs) { BIC_PRESENT(BIC_Totl_c0); BIC_PRESENT(BIC_Any_c0); BIC_PRESENT(BIC_GFX_c0); From ed43247b15a4fb4df01c3408fcab6e206f93ab87 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:49:10 +0800 Subject: [PATCH 053/560] tools/power/turbostat: Abstract aperf/mperf multiplier support Abstract aperf/mperf multiplier support. Delete is_knl() CPU model check. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 613b284f2f090..65a507d82fc42 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -296,6 +296,7 @@ struct platform_features { int rapl_quirk_tdp; /* Hardcoded TDP value when cannot be retrieved from hardware */ int tcc_offset_bits; /* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */ bool enable_tsc_tweak; /* Use CPU Base freq instead of TSC freq for aperf/mperf counter */ + bool need_perf_multiplier; /* mperf/aperf multiplier */ }; struct platform_data { @@ -758,6 +759,7 @@ static const struct platform_features knl_features = { .trl_msrs = TRL_KNL, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, + .need_perf_multiplier = 1, }; static const struct platform_features default_features = { @@ -5107,28 +5109,6 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int is_knl(unsigned int family, unsigned int model) -{ - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - return 1; - } - return 0; -} - -unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) -{ - if (is_knl(family, model)) - return 1024; - return 1; -} - int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned int eax, ebx, ecx, edx; @@ -5630,7 +5610,7 @@ void process_cpuid() } if (has_aperf) - aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model); + aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1; BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); From 7d0ebe6f7eaf8c0ac069ddab1fe3793401139fb3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:57:44 +0800 Subject: [PATCH 054/560] tools/power/turbostat: Abstract cstate prewake bit support Abstract cstate prewake bit support. Delete is_icx()/is_spr() CPU model checks. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 47 +++------------------------ 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 65a507d82fc42..580cc2a3b9478 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -252,7 +252,6 @@ unsigned int tj_max_override; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; -unsigned int dis_cstate_prewake; unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; @@ -287,6 +286,7 @@ struct platform_features { bool has_msr_atom_pkg_c6_residency; /* MSR_ATOM_PKG_C6_RESIDENCY */ bool has_msr_knl_core_c6_residency; /* MSR_KNL_CORE_C6_RESIDENCY */ bool has_ext_cst_msrs; /* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */ + bool has_cst_prewake_bit; /* Cstate prewake bit in MSR_IA32_POWER_CTL */ int trl_msrs; /* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */ int plr_msrs; /* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */ int rapl_msrs; /* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */ @@ -635,6 +635,7 @@ static const struct platform_features icx_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_ICX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, .has_fixed_rapl_unit = 1, @@ -649,6 +650,7 @@ static const struct platform_features spr_features = { .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; @@ -3014,8 +3016,6 @@ void probe_cst_limit(void) pkg_cstate_limit = pkg_cstate_limits[msr & 0xF]; } -void prewake_cstate_probe(unsigned int family, unsigned int model); - static void dump_platform_info(void) { unsigned long long msr; @@ -3036,7 +3036,7 @@ static void dump_platform_info(void) base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ - if (dis_cstate_prewake) + if (platform->has_cst_prewake_bit) fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN"); return; @@ -4289,38 +4289,6 @@ void probe_bclk(void) tsc_tweak = base_hz / tsc_hz; } -int is_icx(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_ICELAKE_X: - return 1; - } - return 0; -} - -int is_spr(unsigned int family, unsigned int model) -{ - - if (!genuine_intel) - return 0; - - if (family != 6) - return 0; - - switch (model) { - case INTEL_FAM6_SAPPHIRERAPIDS_X: - return 1; - } - return 0; -} - static void remove_underbar(char *s) { char *to = s; @@ -4910,12 +4878,6 @@ void rapl_probe(void) rapl_probe_amd(); } -void prewake_cstate_probe(unsigned int family, unsigned int model) -{ - if (is_icx(family, model) || is_spr(family, model)) - dis_cstate_prewake = 1; -} - int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5638,7 +5600,6 @@ void process_cpuid() decode_c6_demotion_policy_msr(); rapl_probe(); - prewake_cstate_probe(family, model); if (!quiet) dump_cstate_pstate_config_info(); From d085b3b0f11af7d23601b832e0d9e446d18df968 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 21:53:21 +0800 Subject: [PATCH 055/560] tools/power/turbostat: Delete intel_model_duplicates() Now CPU model checks have been cleaned up, no code depends on the duplicated CPU model value. Delete intel_model_duplicates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 59 --------------------------- 1 file changed, 59 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 580cc2a3b9478..89f53e1ac63a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5258,63 +5258,6 @@ void decode_c6_demotion_policy_msr(void) base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS"); } -/* - * When models are the same, for the purpose of turbostat, reuse - */ -unsigned int intel_model_duplicates(unsigned int model) -{ - - switch (model) { - case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ - case INTEL_FAM6_NEHALEM_G: /* Core i7 and i5 Processor - Nehalem */ - case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ - case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ - return INTEL_FAM6_NEHALEM; - - case INTEL_FAM6_WESTMERE_EX: /* Westmere-EX Xeon - Eagleton */ - return INTEL_FAM6_NEHALEM_EX; - - case INTEL_FAM6_XEON_PHI_KNM: - return INTEL_FAM6_XEON_PHI_KNL; - - case INTEL_FAM6_BROADWELL_D: /* BDX-DE */ - return INTEL_FAM6_BROADWELL_X; - - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: - case INTEL_FAM6_COMETLAKE_L: - case INTEL_FAM6_COMETLAKE: - return INTEL_FAM6_SKYLAKE_L; - - case INTEL_FAM6_ICELAKE_L: - case INTEL_FAM6_ICELAKE_NNPI: - case INTEL_FAM6_TIGERLAKE_L: - case INTEL_FAM6_TIGERLAKE: - case INTEL_FAM6_ROCKETLAKE: - case INTEL_FAM6_LAKEFIELD: - case INTEL_FAM6_ALDERLAKE: - case INTEL_FAM6_ALDERLAKE_L: - case INTEL_FAM6_ATOM_GRACEMONT: - case INTEL_FAM6_RAPTORLAKE: - case INTEL_FAM6_RAPTORLAKE_P: - case INTEL_FAM6_RAPTORLAKE_S: - case INTEL_FAM6_METEORLAKE: - case INTEL_FAM6_METEORLAKE_L: - return INTEL_FAM6_CANNONLAKE_L; - - case INTEL_FAM6_ATOM_TREMONT_L: - return INTEL_FAM6_ATOM_TREMONT; - - case INTEL_FAM6_ICELAKE_D: - return INTEL_FAM6_ICELAKE_X; - - case INTEL_FAM6_EMERALDRAPIDS_X: - return INTEL_FAM6_SAPPHIRERAPIDS_X; - } - return model; -} - void print_dev_latency(void) { char *path = "/dev/cpu_dma_latency"; @@ -5457,8 +5400,6 @@ void process_cpuid() } probe_platform_features(family, model); - if (genuine_intel) - model = intel_model_duplicates(model); if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); From 32e8c6169af7ef7c938b1bd996d27ab171e27d80 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 14:44:06 +0800 Subject: [PATCH 056/560] tools/power/turbostat: Improve probe_platform_features() logic AMD/Hygon platforms that don't have RAPL use 'amd_features' to describe the platform features. Unknown Intel platforms use 'default_features' to describe the platform features. As none of the platform feature is set for 'amd_features' or 'default_features', there is no need to maintain both of them. Remove 'amd_features' structure and improve the logic in probe_platform_features(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 89f53e1ac63a1..102ba515cf4be 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -767,9 +767,6 @@ static const struct platform_features knl_features = { static const struct platform_features default_features = { }; -static const struct platform_features amd_features = { -}; - static const struct platform_features amd_features_with_rapl = { .rapl_msrs = RAPL_AMD_F17H, .has_per_core_rapl = 1, @@ -849,9 +846,9 @@ void probe_platform_features(unsigned int family, unsigned int model) { int i; - if (authentic_amd || hygon_genuine) { - platform = &amd_features; + platform = &default_features; + if (authentic_amd || hygon_genuine) { if (max_extended_level >= 0x80000007) { unsigned int eax, ebx, ecx, edx; @@ -863,8 +860,6 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - platform = &default_features; - if (!genuine_intel || family != 6) return; From 045acf6064c5567011163e97c28906e0a7791414 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 29 Aug 2023 13:47:58 +0800 Subject: [PATCH 057/560] tools/power/turbostat: Relocate cstate probing code Move all cstate probing related code into probe_cstates(). Note that dump_platform_info() actually dumps both MSR_PLATFORM_INFO and MSR_IA32_POWER_CTL. MSR_PLATFORM_INFO is for pstate and MSR_IA32_POWER_CTL is for cstate. So split dump_platform_info() and dump MSR_IA32_POWER_CTL in probe_cstates(). Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 50 +++++++++++++++++---------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 102ba515cf4be..e5ca1586961e7 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -272,7 +272,7 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr); struct platform_features { bool has_msr_misc_feature_control; /* MSR_MISC_FEATURE_CONTROL */ bool has_msr_misc_pwr_mgmt; /* MSR_MISC_PWR_MGMT */ - bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, TRL MSRs */ + bool has_nhm_msrs; /* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */ bool has_config_tdp; /* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */ int bclk_freq; /* CPU base clock */ int crystal_freq; /* Crystal clock to use when not available from CPUID.15 */ @@ -3025,6 +3025,14 @@ static void dump_platform_info(void) ratio = (msr >> 8) & 0xFF; fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); +} + +static void dump_power_ctl(void) +{ + unsigned long long msr; + + if (!platform->has_nhm_msrs) + return; get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", @@ -3229,6 +3237,9 @@ static void dump_cst_cfg(void) { unsigned long long msr; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr); fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr); @@ -4332,7 +4343,6 @@ static void dump_cstate_pstate_config_info(void) dump_platform_info(); dump_turbo_ratio_info(); - dump_cst_cfg(); } static int read_sysfs_int(char *path) @@ -5332,6 +5342,25 @@ void probe_cstates(void) if (platform->supported_cstates & PC10 && (pkg_cstate_limit >= PCL_10)) BIC_PRESENT(BIC_Pkgpc10); + + if (platform->has_msr_module_c6_res_ms) + BIC_PRESENT(BIC_Mod_c6); + + if (platform->has_ext_cst_msrs) { + BIC_PRESENT(BIC_Totl_c0); + BIC_PRESENT(BIC_Any_c0); + BIC_PRESENT(BIC_GFX_c0); + BIC_PRESENT(BIC_CPUGFX); + } + + if (quiet) + return; + + dump_power_ctl(); + dump_cst_cfg(); + decode_c6_demotion_policy_msr(); + print_dev_latency(); + dump_sysfs_cstate_config(); } void process_cpuid() @@ -5519,32 +5548,15 @@ void process_cpuid() BIC_PRESENT(BIC_SMI); probe_bclk(); - if (platform->has_msr_module_c6_res_ms) - BIC_PRESENT(BIC_Mod_c6); - - if (platform->has_ext_cst_msrs) { - BIC_PRESENT(BIC_Totl_c0); - BIC_PRESENT(BIC_Any_c0); - BIC_PRESENT(BIC_GFX_c0); - BIC_PRESENT(BIC_CPUGFX); - } - if (!quiet) decode_misc_pwr_mgmt_msr(); - if (!quiet) - decode_c6_demotion_policy_msr(); - rapl_probe(); if (!quiet) dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - print_dev_latency(); - if (!quiet) - dump_sysfs_cstate_config(); if (!quiet) dump_sysfs_pstate_config(); From 11cd9a09f3e827605cf8fc7c343ddb8c9f4ee95d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 15:58:10 +0800 Subject: [PATCH 058/560] tools/power/turbostat: Relocate pstate probing code Introduce probe_pstates() and move all pstate probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e5ca1586961e7..1bcaee6d0eadf 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3016,6 +3016,9 @@ static void dump_platform_info(void) unsigned long long msr; unsigned int ratio; + if (!platform->has_nhm_msrs) + return; + get_msr(base_cpu, MSR_PLATFORM_INFO, &msr); fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); @@ -4313,6 +4316,9 @@ static void dump_turbo_ratio_info(void) if (!has_turbo) return; + if (!platform->has_nhm_msrs) + return; + if (platform->trl_msrs & TRL_LIMIT2) dump_turbo_ratio_limit2(); @@ -4336,15 +4342,6 @@ static void dump_turbo_ratio_info(void) dump_config_tdp(); } -static void dump_cstate_pstate_config_info(void) -{ - if (!platform->has_nhm_msrs) - return; - - dump_platform_info(); - dump_turbo_ratio_info(); -} - static int read_sysfs_int(char *path) { FILE *input; @@ -5363,6 +5360,19 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_pstates(void) +{ + probe_bclk(); + + if (quiet) + return; + + dump_platform_info(); + dump_turbo_ratio_info(); + dump_sysfs_pstate_config(); + decode_misc_pwr_mgmt_msr(); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5542,24 +5552,17 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); + probe_pstates(); + probe_cstates(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - probe_bclk(); - - if (!quiet) - decode_misc_pwr_mgmt_msr(); rapl_probe(); - if (!quiet) - dump_cstate_pstate_config_info(); intel_uncore_frequency_probe(); - if (!quiet) - dump_sysfs_pstate_config(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); From 622c8f23556266a5afdd657aa0518b7d70d5dfc7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:00:15 +0800 Subject: [PATCH 059/560] tools/power/turbostat: Rename uncore probing function Rename intel_uncore_frequency_probe() to probe_intel_uncore_frequency() to be consistent with other probing function names. Probe uncore frequency right after probing cstates. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1bcaee6d0eadf..a956a30d81dee 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4378,7 +4378,7 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void intel_uncore_frequency_probe(void) +static void probe_intel_uncore_frequency(void) { int i, j; char path[128]; @@ -5556,13 +5556,13 @@ void process_cpuid() probe_cstates(); + probe_intel_uncore_frequency(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); rapl_probe(); - intel_uncore_frequency_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); From 6cb13609a07ba466b4613fdce7da8c98508069b7 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:00:33 +0800 Subject: [PATCH 060/560] tools/power/turbostat: Rename rapl probing function Rename rapl_probe() to probe_rapl() to be consistent with other probing function names. Probe rapl after probing uncore frequency. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a956a30d81dee..bf2b1d1b26271 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4865,11 +4865,11 @@ void rapl_probe_amd(void) } /* - * rapl_probe() + * probe_rapl() * * sets rapl_power_units, rapl_energy_units, rapl_time_units */ -void rapl_probe(void) +void probe_rapl(void) { if (!platform->rapl_msrs) return; @@ -5558,11 +5558,11 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_rapl(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - rapl_probe(); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) BIC_PRESENT(BIC_GFX_rc6); From 2538d1673d02f66f6bdf01eebf36e271228778e9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:12 +0800 Subject: [PATCH 061/560] tools/power/turbostat: Relocate graphics probing code Introduce probe_graphics(), and move all graphics probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bf2b1d1b26271..feff9ecff368c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4424,6 +4424,20 @@ static void probe_intel_uncore_frequency(void) } } +static void probe_graphics(void) +{ + if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) + BIC_PRESENT(BIC_GFX_rc6); + + if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXMHz); + + if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || + !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) + BIC_PRESENT(BIC_GFXACTMHz); +} + static void dump_sysfs_cstate_config(void) { char path[64]; @@ -5558,22 +5572,13 @@ void process_cpuid() probe_intel_uncore_frequency(); + probe_graphics(); + probe_rapl(); if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) - BIC_PRESENT(BIC_GFX_rc6); - - if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXMHz); - - if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK) || - !access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK)) - BIC_PRESENT(BIC_GFXACTMHz); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) BIC_PRESENT(BIC_CPU_LPI); else From e7d7b82de192464b733fb2bcc9e305ea6f6ea47e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:01:36 +0800 Subject: [PATCH 062/560] tools/power/turbostat: Relocate lpi probing code Introduce probe_lpi(), and move all lpi probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 38 ++++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index feff9ecff368c..ad9147757d5a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5374,6 +5374,26 @@ void probe_cstates(void) dump_sysfs_cstate_config(); } +void probe_lpi(void) +{ + if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) + BIC_PRESENT(BIC_CPU_LPI); + else + BIC_NOT_PRESENT(BIC_CPU_LPI); + + if (!access(sys_lpi_file_sysfs, R_OK)) { + sys_lpi_file = sys_lpi_file_sysfs; + BIC_PRESENT(BIC_SYS_LPI); + } else if (!access(sys_lpi_file_debugfs, R_OK)) { + sys_lpi_file = sys_lpi_file_debugfs; + BIC_PRESENT(BIC_SYS_LPI); + } else { + sys_lpi_file_sysfs = NULL; + BIC_NOT_PRESENT(BIC_SYS_LPI); + } + +} + void probe_pstates(void) { probe_bclk(); @@ -5570,6 +5590,8 @@ void process_cpuid() probe_cstates(); + probe_lpi(); + probe_intel_uncore_frequency(); probe_graphics(); @@ -5579,27 +5601,11 @@ void process_cpuid() if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK)) - BIC_PRESENT(BIC_CPU_LPI); - else - BIC_NOT_PRESENT(BIC_CPU_LPI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!access(sys_lpi_file_sysfs, R_OK)) { - sys_lpi_file = sys_lpi_file_sysfs; - BIC_PRESENT(BIC_SYS_LPI); - } else if (!access(sys_lpi_file_debugfs, R_OK)) { - sys_lpi_file = sys_lpi_file_debugfs; - BIC_PRESENT(BIC_SYS_LPI); - } else { - sys_lpi_file_sysfs = NULL; - BIC_NOT_PRESENT(BIC_SYS_LPI); - } - if (!quiet) decode_misc_feature_control(); From db735f8ba78bf7692579b78f0ca1e40563ef79fd Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 31 Aug 2023 16:02:16 +0800 Subject: [PATCH 063/560] tools/power/turbostat: Relocate thermal probing code Introduce probe_thermal(), and move all thermal probing related code into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ad9147757d5a5..8dae576234a45 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4894,6 +4894,14 @@ void probe_rapl(void) rapl_probe_amd(); } +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; @@ -5598,14 +5606,11 @@ void process_cpuid() probe_rapl(); + probe_thermal(); + if (platform->has_nhm_msrs) BIC_PRESENT(BIC_SMI); - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); - if (!quiet) decode_misc_feature_control(); From ce7a32c2a4cdba06ed5ead4ed8f980c893e20cd2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:36:06 +0800 Subject: [PATCH 064/560] tools/power/turbostat: Reorder some functions Reorder some functions to solve code depdency introduced by next patch. No functional change. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 214 +++++++++++++------------- 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8dae576234a45..eb333612bdecb 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4878,92 +4878,6 @@ void rapl_probe_amd(void) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); } -/* - * probe_rapl() - * - * sets rapl_power_units, rapl_energy_units, rapl_time_units - */ -void probe_rapl(void) -{ - if (!platform->rapl_msrs) - return; - - if (genuine_intel) - rapl_probe_intel(); - if (authentic_amd || hygon_genuine) - rapl_probe_amd(); -} - -void probe_thermal(void) -{ - if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) - BIC_PRESENT(BIC_CORE_THROT_CNT); - else - BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); -} - -int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) -{ - unsigned long long msr; - unsigned int dts, dts2; - int cpu; - - UNUSED(c); - UNUSED(p); - - if (!(do_dts || do_ptm)) - return 0; - - cpu = t->cpu_id; - - /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) - return 0; - - if (cpu_migrate(cpu)) { - fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); - return -1; - } - - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); - - if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - if (do_dts && debug) { - unsigned int resolution; - - if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - resolution = (msr >> 27) & 0xF; - fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tj_max - dts, resolution); - - if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) - return 0; - - dts = (msr >> 16) & 0x7F; - dts2 = (msr >> 8) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tj_max - dts, tj_max - dts2); - } - - return 0; -} - void print_power_limit_msr(int cpu, unsigned long long msr, char *label) { fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n", @@ -5095,29 +5009,20 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } -int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +/* + * probe_rapl() + * + * sets rapl_power_units, rapl_energy_units, rapl_time_units + */ +void probe_rapl(void) { - unsigned int eax, ebx, ecx, edx; - - UNUSED(c); - UNUSED(p); - - if (!genuine_intel) - return 0; - - if (cpu_migrate(t->cpu_id)) { - fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); - return -1; - } - - if (max_level < 0x1a) - return 0; + if (!platform->rapl_msrs) + return; - __cpuid(0x1a, eax, ebx, ecx, edx); - eax = (eax >> 24) & 0xFF; - if (eax == 0x20) - t->is_atom = true; - return 0; + if (genuine_intel) + rapl_probe_intel(); + if (authentic_amd || hygon_genuine) + rapl_probe_amd(); } /* @@ -5200,6 +5105,101 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return 0; } +int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned long long msr; + unsigned int dts, dts2; + int cpu; + + UNUSED(c); + UNUSED(p); + + if (!(do_dts || do_ptm)) + return 0; + + cpu = t->cpu_id; + + /* DTS is per-core, no need to print for each thread */ + if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + return 0; + + if (cpu_migrate(cpu)) { + fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu); + return -1; + } + + if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); + + if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + if (do_dts && debug) { + unsigned int resolution; + + if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + resolution = (msr >> 27) & 0xF; + fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", + cpu, msr, tj_max - dts, resolution); + + if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) + return 0; + + dts = (msr >> 16) & 0x7F; + dts2 = (msr >> 8) & 0x7F; + fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", + cpu, msr, tj_max - dts, tj_max - dts2); + } + + return 0; +} + +void probe_thermal(void) +{ + if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK)) + BIC_PRESENT(BIC_CORE_THROT_CNT); + else + BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); +} + +int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned int eax, ebx, ecx, edx; + + UNUSED(c); + UNUSED(p); + + if (!genuine_intel) + return 0; + + if (cpu_migrate(t->cpu_id)) { + fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); + return -1; + } + + if (max_level < 0x1a) + return 0; + + __cpuid(0x1a, eax, ebx, ecx, edx); + eax = (eax >> 24) & 0xFF; + if (eax == 0x20) + t->is_atom = true; + return 0; +} + void decode_feature_control_msr(void) { unsigned long long msr; From 5612b2c89bd0a3a2f7c9e8756e6ead971b03f8a3 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Aug 2023 16:54:51 +0800 Subject: [PATCH 065/560] tools/power/turbostat: Relocate more probing related code Relocate more feature probing code outside of process_cpuids() into the corresponding probing functions. This improves the readability of code and the turbostat output. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 37 ++++++++++++--------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index eb333612bdecb..ffeee48e8d85a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5023,6 +5023,11 @@ void probe_rapl(void) rapl_probe_intel(); if (authentic_amd || hygon_genuine) rapl_probe_amd(); + + if (quiet) + return; + + for_all_cpus(print_rapl, ODD_COUNTERS); } /* @@ -5173,6 +5178,13 @@ void probe_thermal(void) BIC_PRESENT(BIC_CORE_THROT_CNT); else BIC_NOT_PRESENT(BIC_CORE_THROT_CNT); + + for_all_cpus(set_temperature_target, ODD_COUNTERS); + + if (quiet) + return; + + for_all_cpus(print_thermal, ODD_COUNTERS); } int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) @@ -5380,6 +5392,7 @@ void probe_cstates(void) decode_c6_demotion_policy_msr(); print_dev_latency(); dump_sysfs_cstate_config(); + print_irtl(); } void probe_lpi(void) @@ -5413,6 +5426,10 @@ void probe_pstates(void) dump_turbo_ratio_info(); dump_sysfs_pstate_config(); decode_misc_pwr_mgmt_msr(); + + for_all_cpus(print_hwp, ODD_COUNTERS); + for_all_cpus(print_epb, ODD_COUNTERS); + for_all_cpus(print_perf_limit, ODD_COUNTERS); } void process_cpuid() @@ -5897,29 +5914,9 @@ void turbostat_init() process_cpuid(); linux_perf_init(); - if (!quiet) - for_all_cpus(print_hwp, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_epb, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_perf_limit, ODD_COUNTERS); - - if (!quiet) - for_all_cpus(print_rapl, ODD_COUNTERS); - - for_all_cpus(set_temperature_target, ODD_COUNTERS); - for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (!quiet) - for_all_cpus(print_thermal, ODD_COUNTERS); - - if (!quiet) - print_irtl(); - if (DO_BIC(BIC_IPC)) (void)get_instr_count_fd(base_cpu); } From 7ee39d8d593e3d28eced0fa1a8c8c6bdcbd4156e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 13 Sep 2023 23:32:39 +0800 Subject: [PATCH 066/560] tools/power/turbostat: Introduce probe_pm_features() Feature probe has nothing to do with CPUID, thus it should not be in process_cpuids(). Introduce probe_pm_features() and move all feature probing functions into it. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ffeee48e8d85a..607152b36c1a3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5610,7 +5610,10 @@ void process_cpuid() BIC_PRESENT(BIC_IRQ); BIC_PRESENT(BIC_TSC_MHz); +} +void probe_pm_features(void) +{ probe_pstates(); probe_cstates(); @@ -5630,8 +5633,6 @@ void process_cpuid() if (!quiet) decode_misc_feature_control(); - - return; } /* @@ -5912,6 +5913,7 @@ void turbostat_init() check_dev_msr(); check_permissions(); process_cpuid(); + probe_pm_features(); linux_perf_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); From 05ad96ff0fb9d1b16abb5022b9c62636c6780fc2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 27 Aug 2023 15:33:27 +0800 Subject: [PATCH 067/560] tools/power/turbostat: Enable MSR_CORE_C1_RES on recent Intel client platforms All recent Intel client platforms have MSR_CORE_C1_RES. Enable the support on these platforms, including CNL/ICL/LKF/RKL/TGL/ADL/RPL/MTL. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 607152b36c1a3..9895f348b6373 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -604,6 +604,7 @@ static const struct platform_features cnl_features = { .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10, .cst_limit = CST_LIMIT_HSW, .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, .has_ext_cst_msrs = 1, .trl_msrs = TRL_BASE, .tcc_offset_bits = 6, From 6b74a30b767e362eda7deeac52edcd546c5f6d8f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:26:51 +0800 Subject: [PATCH 068/560] tools/power/turbostat: Remove PC7/PC9 support on ADL/RPL Compared with other platforms that share cnl_features, ADL/RPL don't have PC7/PC9. Clone a new platform feature set from cnl_features for ADL/RPL, with PC7/PC9 removed. Signed-off-by: Zhang Rui Reviewed-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 29 +++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9895f348b6373..a769daa59b126 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -612,6 +612,23 @@ static const struct platform_features cnl_features = { .enable_tsc_tweak = 1, }; +static const struct platform_features adl_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10, + .cst_limit = CST_LIMIT_HSW, + .has_irtl_msrs = 1, + .has_msr_core_c1_res = 1, + .has_ext_cst_msrs = 1, + .trl_msrs = TRL_BASE, + .tcc_offset_bits = 6, + .rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX, + .enable_tsc_tweak = 1, +}; + static const struct platform_features skx_features = { .has_msr_misc_feature_control = 1, .has_msr_misc_pwr_mgmt = 1, @@ -812,11 +829,11 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &cnl_features }, - { INTEL_FAM6_ALDERLAKE_L, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &cnl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &cnl_features }, + { INTEL_FAM6_ALDERLAKE, &adl_features }, + { INTEL_FAM6_ALDERLAKE_L, &adl_features }, + { INTEL_FAM6_RAPTORLAKE, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, + { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, @@ -828,7 +845,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &cnl_features }, + { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* From 71cfd1da9f0635ccd7124ad8b67b9aed596be491 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 11:01:17 +0800 Subject: [PATCH 069/560] tools/power/turbostat: Introduce cpu_allowed_set Turbostat supports "-c" parameter which limits output to system summary plus the specified cpu-set. But some code still uses cpu_present_set to read and dump the counters. Introduce cpu_allowed_set for code that should obey the specified cpu-set. No functional change. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a769daa59b126..d8f44ea5b4bf9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -1157,6 +1157,11 @@ int cpu_is_not_present(int cpu) return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); } +int cpu_is_not_allowed(int cpu) +{ + return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set); +} + /* * run func(thread, core, package) in topology order * skip non-present cpus @@ -3396,6 +3401,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_allowed_set); + cpu_allowed_set = NULL; + cpu_allowed_setsize = 0; + CPU_FREE(cpu_affinity_set); cpu_affinity_set = NULL; cpu_affinity_setsize = 0; @@ -5697,13 +5706,29 @@ void topology_probe() CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_allowed_set + */ + cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_allowed_set == NULL) + err(3, "CPU_ALLOC"); + cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); + /* * Validate that all cpus in cpu_subset are also in cpu_present_set */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) + if (!cpu_subset) { + if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + continue; + } + if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) err(1, "cpu%d not present", i); + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } /* From 4ede6d1ce7acba9cafe7df4e935b174623cd2181 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 13:52:02 +0800 Subject: [PATCH 070/560] tools/power/turbostat: Obey allowed CPUs when accessing CPU counters for_all_cpus/for_all_cpus_2 are used for accessing the per CPU counters, and they should follow the cpu_allowed_set instead of cpu_present_set. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d8f44ea5b4bf9..202cf5231d7a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1182,7 +1182,7 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; c = GET_CORE(core_base, core_no, node_no, pkg_no); @@ -3618,7 +3618,7 @@ int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); - if (cpu_is_not_present(t->cpu_id)) + if (cpu_is_not_allowed(t->cpu_id)) continue; t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); From 7bb3fe27ad4f62039b2ac80a2147452a608b474f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 14:25:25 +0800 Subject: [PATCH 071/560] tools/power/turbostat: Obey allowed CPUs during startup Set turbostat CPU affinity to make sure turbostat is running on one of the allowed CPUs. Set base_cpu to the first allowed CPU so that some platform information is dumped using one of the allowed CPUs. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 202cf5231d7a5..f81ce832d17ae 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5731,6 +5731,10 @@ void topology_probe() } } + if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) + err(-ENODEV, "No valid cpus found"); + sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set); + /* * Allocate and initialize cpu_affinity_set */ @@ -5941,12 +5945,17 @@ void setup_all_buffers(void) void set_base_cpu(void) { - base_cpu = sched_getcpu(); - if (base_cpu < 0) - err(-ENODEV, "No valid cpus found"); + int i; - if (debug > 1) - fprintf(outf, "base_cpu = %d\n", base_cpu); + for (i = 0; i < topo.max_cpu_num + 1; ++i) { + if (cpu_is_not_allowed(i)) + continue; + base_cpu = i; + if (debug > 1) + fprintf(outf, "base_cpu = %d\n", base_cpu); + return; + } + err(-ENODEV, "No valid cpus found"); } void turbostat_init() @@ -5976,8 +5985,6 @@ int fork_it(char **argv) first_counter_read = 0; if (status) exit(status); - /* clear affinity side-effect of get_counters() */ - sched_setaffinity(0, cpu_present_setsize, cpu_present_set); gettimeofday(&tv_even, (struct timezone *)NULL); child_pid = fork(); From 74318add132365db3026e281ac06836a26cda857 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 4 Oct 2023 15:13:23 +0800 Subject: [PATCH 072/560] tools/power/turbostat: Abstract several functions When detecting the primary thread/core in a core/package, current code doesn't handle the allowed CPUs. Abstract several functions for further fix of this issue. No functional change. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++++++++-------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f81ce832d17ae..5f6bc076e1dd5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1198,6 +1198,30 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk return 0; } +int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); +} + +int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + +int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + UNUSED(c); + UNUSED(p); + + return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); +} + int cpu_migrate(int cpu) { CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set); @@ -1682,11 +1706,11 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data int printed = 0; /* if showing only 1st thread in core and this isn't one, bail out */ - if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (show_core_only && !is_cpu_first_thread_in_core(t, c, p)) return 0; /* if showing only 1st thread in pkg and this isn't one, bail out */ - if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p)) return 0; /*if not summary line and --cpu is used */ @@ -1820,7 +1844,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); /* print per-core data only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3)) @@ -1867,7 +1891,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); /* print per-package data only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; /* PkgTmp */ @@ -2202,7 +2226,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, int retval = 0; /* calculate core delta only for 1st thread in core */ - if (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) + if (is_cpu_first_thread_in_core(t, c, p)) delta_core(c, c2); /* always calculate thread delta */ @@ -2211,7 +2235,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, return retval; /* calculate package delta only for 1st core in package */ - if (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE) + if (is_cpu_first_core_in_package(t, c, p)) retval = delta_package(p, p2); return retval; @@ -2325,7 +2349,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-core values only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; average.cores.c3 += c->c3; @@ -2345,7 +2369,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* sum per-pkg values only for 1st core in pkg */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) return 0; if (DO_BIC(BIC_Totl_c0)) @@ -2745,7 +2769,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* collect core counters only for 1st thread in core */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) goto done; if (DO_BIC(BIC_CPU_c3) || soft_c1_residency_display(BIC_CPU_c3)) { @@ -2800,7 +2824,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) } /* collect package counters only for 1st core in package */ - if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_core_in_package(t, c, p)) goto done; if (DO_BIC(BIC_Totl_c0)) { @@ -4581,7 +4605,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* EPB is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4630,7 +4654,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu = t->cpu_id; /* MSR_HWP_CAPABILITIES is per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4713,7 +4737,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data cpu = t->cpu_id; /* per-package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -4930,7 +4954,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; /* RAPL counters are per package, so print only for 1st thread/package */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5083,7 +5107,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return 0; /* this is a per-package concept */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) + if (!is_cpu_first_thread_in_package(t, c, p)) return 0; cpu = t->cpu_id; @@ -5152,7 +5176,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p cpu = t->cpu_id; /* DTS is per-core, no need to print for each thread */ - if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) + if (!is_cpu_first_thread_in_core(t, c, p)) return 0; if (cpu_migrate(cpu)) { @@ -5160,7 +5184,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return -1; } - if (do_ptm && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) { + if (do_ptm && is_cpu_first_core_in_package(t, c, p)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return 0; From ccf8a0528061b4ca5f5c0e73c8d888e7e6d8b054 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 09:59:14 +0800 Subject: [PATCH 073/560] tools/power/turbostat: Obey allowed CPUs for primary thread/core detection Thread_id doesn't tell if a CPU is allowed or not. Detect allowed CPUs only and use the first detected thread/core as the primary thread/core of a core/package. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 35 ++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5f6bc076e1dd5..e586164906fad 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -926,12 +926,11 @@ struct thread_data { unsigned int x2apic_id; unsigned int flags; bool is_atom; -#define CPU_IS_FIRST_THREAD_IN_CORE 0x2 -#define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { + int base_cpu; unsigned long long c3; unsigned long long c6; unsigned long long c7; @@ -944,6 +943,7 @@ struct core_data { } *core_even, *core_odd; struct pkg_data { + int base_cpu; unsigned long long pc2; unsigned long long pc3; unsigned long long pc6; @@ -1200,26 +1200,21 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); UNUSED(p); - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE); + return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0); } int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { UNUSED(c); - UNUSED(p); - return (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0); } int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - UNUSED(c); - UNUSED(p); - - return (t->flags & CPU_IS_FIRST_THREAD_IN_CORE) && (t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE); + return is_cpu_first_thread_in_core(t, c, p) && is_cpu_first_core_in_package(t, c, p); } int cpu_migrate(int cpu) @@ -2263,9 +2258,6 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->irq_count = 0; t->smi_count = 0; - /* tells format_counters to dump all fields from this set */ - t->flags = CPU_IS_FIRST_THREAD_IN_CORE | CPU_IS_FIRST_CORE_IN_PACKAGE; - c->c3 = 0; c->c6 = 0; c->c7 = 0; @@ -5872,15 +5864,19 @@ void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_ if (*c == NULL) goto error; - for (i = 0; i < num_cores; i++) + for (i = 0; i < num_cores; i++) { (*c)[i].core_id = -1; + (*c)[i].base_cpu = -1; + } *p = calloc(topo.num_packages, sizeof(struct pkg_data)); if (*p == NULL) goto error; - for (i = 0; i < topo.num_packages; i++) + for (i = 0; i < topo.num_packages; i++) { (*p)[i].package_id = i; + (*p)[i].base_cpu = -1; + } return; error: @@ -5913,10 +5909,11 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, p = GET_PKG(pkg_base, pkg_id); t->cpu_id = cpu_id; - if (thread_id == 0) { - t->flags |= CPU_IS_FIRST_THREAD_IN_CORE; - if (cpu_is_first_core_in_package(cpu_id)) - t->flags |= CPU_IS_FIRST_CORE_IN_PACKAGE; + if (!cpu_is_not_allowed(cpu_id)) { + if (c->base_cpu < 0) + c->base_cpu = t->cpu_id; + if (p->base_cpu < 0) + p->base_cpu = t->cpu_id; } c->core_id = core_id; From 0fe3752901370b83b63d4ddbce708b23c8e41ea9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 6 Oct 2023 18:35:26 +0800 Subject: [PATCH 074/560] tools/power/turbostat: Obey allowed CPUs for system summary System summary should summarize the information for allowed CPUs instead of all the present CPUs. Introduce topology information for allowed CPUs, and use them to get system summary. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 71 +++++++++++++++++---------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e586164906fad..91aa039893179 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1134,6 +1134,9 @@ struct topo_params { int num_die; int num_cpus; int num_cores; + int allowed_packages; + int allowed_cpus; + int allowed_cores; int max_cpu_num; int max_node_num; int nodes_per_pkg; @@ -1179,7 +1182,6 @@ int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pk struct thread_data *t; struct core_data *c; struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_allowed(t->cpu_id)) @@ -2426,40 +2428,40 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data /* Use the global time delta for the average. */ average.threads.tv_delta = tv_delta; - average.threads.tsc /= topo.num_cpus; - average.threads.aperf /= topo.num_cpus; - average.threads.mperf /= topo.num_cpus; - average.threads.instr_count /= topo.num_cpus; - average.threads.c1 /= topo.num_cpus; + average.threads.tsc /= topo.allowed_cpus; + average.threads.aperf /= topo.allowed_cpus; + average.threads.mperf /= topo.allowed_cpus; + average.threads.instr_count /= topo.allowed_cpus; + average.threads.c1 /= topo.allowed_cpus; if (average.threads.irq_count > 9999999) sums_need_wide_columns = 1; - average.cores.c3 /= topo.num_cores; - average.cores.c6 /= topo.num_cores; - average.cores.c7 /= topo.num_cores; - average.cores.mc6_us /= topo.num_cores; + average.cores.c3 /= topo.allowed_cores; + average.cores.c6 /= topo.allowed_cores; + average.cores.c7 /= topo.allowed_cores; + average.cores.mc6_us /= topo.allowed_cores; if (DO_BIC(BIC_Totl_c0)) - average.packages.pkg_wtd_core_c0 /= topo.num_packages; + average.packages.pkg_wtd_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_Any_c0)) - average.packages.pkg_any_core_c0 /= topo.num_packages; + average.packages.pkg_any_core_c0 /= topo.allowed_packages; if (DO_BIC(BIC_GFX_c0)) - average.packages.pkg_any_gfxe_c0 /= topo.num_packages; + average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages; if (DO_BIC(BIC_CPUGFX)) - average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages; + average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages; - average.packages.pc2 /= topo.num_packages; + average.packages.pc2 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc3)) - average.packages.pc3 /= topo.num_packages; + average.packages.pc3 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc6)) - average.packages.pc6 /= topo.num_packages; + average.packages.pc6 /= topo.allowed_packages; if (DO_BIC(BIC_Pkgpc7)) - average.packages.pc7 /= topo.num_packages; + average.packages.pc7 /= topo.allowed_packages; - average.packages.pc8 /= topo.num_packages; - average.packages.pc9 /= topo.num_packages; - average.packages.pc10 /= topo.num_packages; + average.packages.pc8 /= topo.allowed_packages; + average.packages.pc9 /= topo.allowed_packages; + average.packages.pc10 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2469,7 +2471,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data sums_need_wide_columns = 1; continue; } - average.threads.counter[i] /= topo.num_cpus; + average.threads.counter[i] /= topo.allowed_cpus; } for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2478,7 +2480,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.cores.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.cores.counter[i] /= topo.num_cores; + average.cores.counter[i] /= topo.allowed_cores; } for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -2487,7 +2489,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data if (average.packages.counter[i] > 9999999) sums_need_wide_columns = 1; } - average.packages.counter[i] /= topo.num_packages; + average.packages.counter[i] /= topo.allowed_packages; } } @@ -3690,7 +3692,7 @@ void re_initialize(void) { free_all_buffers(); setup_all_buffers(); - fprintf(outf, "turbostat: re-initialized with num_cpus %d\n", topo.num_cpus); + fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } void set_max_cpu_num(void) @@ -5953,6 +5955,24 @@ void allocate_irq_buffers(void) err(-1, "calloc %d", topo.max_cpu_num + 1); } +int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + topo.allowed_cpus++; + if ((int)t->cpu_id == c->base_cpu) + topo.allowed_cores++; + if ((int)t->cpu_id == p->base_cpu) + topo.allowed_packages++; + + return 0; +} + +void topology_update(void) +{ + topo.allowed_cpus = 0; + topo.allowed_cores = 0; + topo.allowed_packages = 0; + for_all_cpus(update_topo, ODD_COUNTERS); +} void setup_all_buffers(void) { topology_probe(); @@ -5962,6 +5982,7 @@ void setup_all_buffers(void) allocate_counters(&thread_odd, &core_odd, &package_odd); allocate_output_buffer(); for_all_proc_cpus(initialize_counters); + topology_update(); } void set_base_cpu(void) From c25ef0e5d9d7d5fb9e1679286fc7a11e70f16c70 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 14:20:47 +0800 Subject: [PATCH 075/560] tools/power/turbostat: Handle offlined CPUs in cpu_subset It is possible that the cpu_subset contains offlined CPUs. If this happens during start, exit immediately because this is likely an operator error that is best fixed by re-invoking. If this happens at runtime, give a warning only because turbostat should do its best effort to continue running. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 31 ++++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 91aa039893179..f9ea07961a226 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1149,7 +1149,7 @@ struct timeval tv_even, tv_odd, tv_delta; int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ -void setup_all_buffers(void); +void setup_all_buffers(bool startup); char *sys_lpi_file; char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us"; @@ -3691,7 +3691,7 @@ int for_all_proc_cpus(int (func) (int)) void re_initialize(void) { free_all_buffers(); - setup_all_buffers(); + setup_all_buffers(false); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -5692,7 +5692,7 @@ int dir_filter(const struct dirent *dirp) return 0; } -void topology_probe() +void topology_probe(bool startup) { int i; int max_core_id = 0; @@ -5734,7 +5734,12 @@ void topology_probe() CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate that all cpus in cpu_subset are also in cpu_present_set + * Validate cpu_subset and update cpu_allowed_set. + * + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, + * and give a warning when cpus in cpu_subset become unavailable at runtime. + * + * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { if (!cpu_subset) { @@ -5743,9 +5748,15 @@ void topology_probe() continue; } if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - err(1, "cpu%d not present", i); - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + /* all cpus in cpu_subset must be in cpu_present_set during startup */ + if (startup) + err(1, "cpu%d not present", i); + else + fprintf(stderr, "cpu%d not present\n", i); + } else { + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + } } } @@ -5973,9 +5984,9 @@ void topology_update(void) topo.allowed_packages = 0; for_all_cpus(update_topo, ODD_COUNTERS); } -void setup_all_buffers(void) +void setup_all_buffers(bool startup) { - topology_probe(); + topology_probe(startup); allocate_irq_buffers(); allocate_fd_percpu(); allocate_counters(&thread_even, &core_even, &package_even); @@ -6002,7 +6013,7 @@ void set_base_cpu(void) void turbostat_init() { - setup_all_buffers(); + setup_all_buffers(true); set_base_cpu(); check_dev_msr(); check_permissions(); From 8c3dd2c9e54273922ea71b2a4c0e77fc624c396b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:45:21 +0800 Subject: [PATCH 076/560] tools/power/turbostat: Abstrct function for parsing cpu string Abstract parse_cpu_str() which can update any specified cpu_set by a given cpu string. This can be used to handle further CPU limitations from other sources like cgroup. The cpu string parsing code is also enhanced to handle the strings that have an extra '\n' before string terminator. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 104 ++++++++++++++------------ 1 file changed, 55 insertions(+), 49 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9ea07961a226..3a759b49f25e1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3565,6 +3565,59 @@ int get_physical_node_id(struct cpu_topology *thiscpu) return -1; } +static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size) +{ + unsigned int start, end; + char *next = cpu_str; + + while (next && *next) { + + if (*next == '-') /* no negative cpu numbers */ + return 1; + + start = strtoul(next, &next, 10); + + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + + if (*next == '\0' || *next == '\n') + break; + + if (*next == ',') { + next += 1; + continue; + } + + if (*next == '-') { + next += 1; /* start range */ + } else if (*next == '.') { + next += 1; + if (*next == '.') + next += 1; /* start range */ + else + return 1; + } + + end = strtoul(next, &next, 10); + if (end <= start) + return 1; + + while (++start <= end) { + if (start >= CPU_SUBSET_MAXCPUS) + return 1; + CPU_SET_S(start, cpu_set_size, cpu_set); + } + + if (*next == ',') + next += 1; + else if (*next != '\0' && *next != '\n') + return 1; + } + + return 0; +} + int get_thread_siblings(struct cpu_topology *thiscpu) { char path[80], character; @@ -6384,9 +6437,6 @@ void probe_sysfs(void) */ void parse_cpu_command(char *optarg) { - unsigned int start, end; - char *next; - if (!strcmp(optarg, "core")) { if (cpu_subset) goto error; @@ -6409,52 +6459,8 @@ void parse_cpu_command(char *optarg) CPU_ZERO_S(cpu_subset_size, cpu_subset); - next = optarg; - - while (next && *next) { - - if (*next == '-') /* no negative cpu numbers */ - goto error; - - start = strtoul(next, &next, 10); - - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - - if (*next == '\0') - break; - - if (*next == ',') { - next += 1; - continue; - } - - if (*next == '-') { - next += 1; /* start range */ - } else if (*next == '.') { - next += 1; - if (*next == '.') - next += 1; /* start range */ - else - goto error; - } - - end = strtoul(next, &next, 10); - if (end <= start) - goto error; - - while (++start <= end) { - if (start >= CPU_SUBSET_MAXCPUS) - goto error; - CPU_SET_S(start, cpu_subset_size, cpu_subset); - } - - if (*next == ',') - next += 1; - else if (*next != '\0') - goto error; - } + if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size)) + goto error; return; From f638858da0925b29122f05135663013dc240eaf9 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 20 Oct 2023 09:39:22 +0800 Subject: [PATCH 077/560] tools/power/turbostat: Handle cgroup v2 cpu limitation CPUs can be isolated via cgroup settings and turbostat should avoid migrating to these CPUs, just like it does for the '-c' cpus. Introduce cpu_effective_set to save the cgroup cpu limitation info from /sys/fs/cgroup/cpuset.cpus.effective. And use cpu_allowed_set as the intersection of cpu_present_set, cpu_effective_set and cpu_subset. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 95 ++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 15 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3a759b49f25e1..0ef6fba118b1b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -904,8 +904,8 @@ int backwards_count; char *progname; #define CPU_SUBSET_MAXCPUS 1024 /* need to use before probe... */ -cpu_set_t *cpu_present_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; -size_t cpu_present_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; +cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset; +size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size; #define MAX_ADDED_COUNTERS 8 #define MAX_ADDED_THREAD_COUNTERS 24 #define BITMASK_SIZE 32 @@ -3419,6 +3419,10 @@ void free_all_buffers(void) cpu_present_set = NULL; cpu_present_setsize = 0; + CPU_FREE(cpu_effective_set); + cpu_effective_set = NULL; + cpu_effective_setsize = 0; + CPU_FREE(cpu_allowed_set); cpu_allowed_set = NULL; cpu_allowed_setsize = 0; @@ -3741,6 +3745,46 @@ int for_all_proc_cpus(int (func) (int)) return 0; } +#define PATH_EFFECTIVE_CPUS "/sys/fs/cgroup/cpuset.cpus.effective" + +static char cpu_effective_str[1024]; + +static int update_effective_str(bool startup) +{ + FILE *fp; + char *pos; + char buf[1024]; + int ret; + + if (cpu_effective_str[0] == '\0' && !startup) + return 0; + + fp = fopen(PATH_EFFECTIVE_CPUS, "r"); + if (!fp) + return 0; + + pos = fgets(buf, 1024, fp); + if (!pos) + err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS); + + fclose(fp); + + ret = strncmp(cpu_effective_str, buf, 1024); + if (!ret) + return 0; + + strncpy(cpu_effective_str, buf, 1024); + return 1; +} + +static void update_effective_set(bool startup) +{ + update_effective_str(startup); + + if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize)) + err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str); +} + void re_initialize(void) { free_all_buffers(); @@ -4257,6 +4301,10 @@ void turbostat_loop() re_initialize(); goto restart; } + if (update_effective_str(false)) { + re_initialize(); + goto restart; + } do_sleep(); if (snapshot_proc_sysfs_files()) goto restart; @@ -5777,6 +5825,16 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_present_setsize, cpu_present_set); for_all_proc_cpus(mark_cpu_present); + /* + * Allocate and initialize cpu_effective_set + */ + cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1)); + if (cpu_effective_set == NULL) + err(3, "CPU_ALLOC"); + cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); + CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set); + update_effective_set(startup); + /* * Allocate and initialize cpu_allowed_set */ @@ -5787,30 +5845,37 @@ void topology_probe(bool startup) CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set); /* - * Validate cpu_subset and update cpu_allowed_set. + * Validate and update cpu_allowed_set. * - * Make sure all cpus in cpu_subset are also in cpu_present_set during startup, - * and give a warning when cpus in cpu_subset become unavailable at runtime. + * Make sure all cpus in cpu_subset are also in cpu_present_set during startup. + * Give a warning when cpus in cpu_subset become unavailable at runtime. + * Give a warning when cpus are not effective because of cgroup setting. * - * cpu_allowed_set is the intersection of cpu_present_set and cpu_subset. + * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset. */ for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) { - if (!cpu_subset) { - if (CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); + if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) continue; - } - if (CPU_ISSET_S(i, cpu_subset_size, cpu_subset)) { - if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { - /* all cpus in cpu_subset must be in cpu_present_set during startup */ + + if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) { + if (cpu_subset) { + /* cpus in cpu_subset must be in cpu_present_set during startup */ if (startup) err(1, "cpu%d not present", i); else fprintf(stderr, "cpu%d not present\n", i); - } else { - CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } + continue; } + + if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) { + if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) { + fprintf(stderr, "cpu%d not effective\n", i); + continue; + } + } + + CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set); } if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set)) From 37f68a2940558b4f6f8e51b7b1d00f084b4bdde2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 24 Jan 2023 10:39:53 -0800 Subject: [PATCH 078/560] tools/power/turbostat: Move process to root cgroup When available CPUs are reduced via cgroup cpuset controller, turbostat will exit with errors (For example): get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 get_counters: Could not migrate to CPU 0 turbostat: re-initialized with num_cpus 20 Move the turbostat to root cgroup, which has every CPU. Writing the value 0 to a cgroup.procs file causes the writing process to be moved to the corresponding cgroup. Signed-off-by: Srinivas Pandruvada Tested-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0ef6fba118b1b..fea63d9d8e02d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6666,6 +6666,19 @@ void cmdline(int argc, char **argv) int main(int argc, char **argv) { + int fd, ret; + + fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY); + if (fd < 0) + goto skip_cgroup_setting; + + ret = write(fd, "0\n", 2); + if (ret == -1) + perror("Can't update cgroup\n"); + + close(fd); + +skip_cgroup_setting: outf = stderr; cmdline(argc, argv); From 0e3f10e6aa97b0134b526ec9cdc3ccdac2239b43 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:04:33 +0800 Subject: [PATCH 079/560] tools/power/turbostat: Add MSR_CORE_C1_RES support for spr_features Add MSR_CORE_C1_RES support for spr_features because both Sapphirerapids and Emeraldrapids support this MSR. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index fea63d9d8e02d..bbeeec02bf5bd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -667,6 +667,7 @@ static const struct platform_features spr_features = { .bclk_freq = BCLK_100MHZ, .supported_cstates = CC1 | CC6 | PC2 | PC6, .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, .has_irtl_msrs = 1, .has_cst_prewake_bit = 1, .trl_msrs = TRL_BASE | TRL_CORECOUNT, From 5feab4a6b8a730438a0fe8758dfa0700f951edde Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 9 Sep 2023 13:28:07 +0800 Subject: [PATCH 080/560] tools/power/turbostat: Add initial support for GraniteRapids Add initial support for GraniteRapids. It shares the same features with SapphireRapids. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index bbeeec02bf5bd..981d39454b493 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -829,6 +829,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_TIGERLAKE, &cnl_features }, { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, + { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, { INTEL_FAM6_LAKEFIELD, &cnl_features }, { INTEL_FAM6_ALDERLAKE, &adl_features }, { INTEL_FAM6_ALDERLAKE_L, &adl_features }, From d33605f367414c7e0009978d1fbe9af01a36e221 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 28 Sep 2023 13:09:02 +0800 Subject: [PATCH 081/560] tools/power/turbostat: Add initial support for SierraForest Add initial support for SierraForest. It shares the same features with SapphireRapids, except that it has MSR_MODULE_C6_RES_MS support. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 981d39454b493..a50b6d071f6ab 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -674,6 +674,22 @@ static const struct platform_features spr_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features srf_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6 | PC2 | PC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -848,6 +864,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, + { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* From 5a6efcb9102af4210d5a59182dbfbc594ae50fd4 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Oct 2023 11:00:07 +0800 Subject: [PATCH 082/560] tools/power/turbostat: Add initial support for GrandRidge Add initial support for GrandRidge. It shares the same features as SierraForest, except that it does not support PC2/PC6. Signed-off-by: Zhang Rui --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a50b6d071f6ab..3407f08593f8d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -690,6 +690,22 @@ static const struct platform_features srf_features = { .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, }; +static const struct platform_features grr_features = { + .has_msr_misc_feature_control = 1, + .has_msr_misc_pwr_mgmt = 1, + .has_nhm_msrs = 1, + .has_config_tdp = 1, + .bclk_freq = BCLK_100MHZ, + .supported_cstates = CC1 | CC6, + .cst_limit = CST_LIMIT_SKX, + .has_msr_core_c1_res = 1, + .has_msr_module_c6_res_ms = 1, + .has_irtl_msrs = 1, + .has_cst_prewake_bit = 1, + .trl_msrs = TRL_BASE | TRL_CORECOUNT, + .rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL, +}; + static const struct platform_features slv_features = { .has_nhm_msrs = 1, .bclk_freq = BCLK_SLV, @@ -865,6 +881,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, /* From 7b57e7b683e3872b02117f46bd7dc7ad765888a8 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: [PATCH 083/560] tools/power/turbostat: Add initial support for ArrowLake Add initial support for ArrowLake platform. It shares the same features with CannonLake. Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3407f08593f8d..6c52cceae1d71 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -870,6 +870,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, + { INTEL_FAM6_ARROWLAKE, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, From 956dbd3de400a5665faf08a8588556db9c1bb56e Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Tue, 23 May 2023 17:46:45 +0530 Subject: [PATCH 084/560] tools/power/turbostat: Add initial support for LunarLake Add initial support for LunarLake platform. It shares the same features with CannonLake. Signed-off-by: Sumeet Pawnikar --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6c52cceae1d71..8a311d7272e75 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -871,6 +871,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_FAM6_METEORLAKE, &cnl_features }, { INTEL_FAM6_METEORLAKE_L, &cnl_features }, { INTEL_FAM6_ARROWLAKE, &cnl_features }, + { INTEL_FAM6_LUNARLAKE_M, &cnl_features }, { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, From 3503895788d402d6a3814085ed582c364ec3e903 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 31 Oct 2023 12:02:06 -0400 Subject: [PATCH 085/560] virtio_pci: move structure to a header These are guest/host interfaces, so they belong in the header where e.g. qemu will know to find them. Note: we added a new structure as opposed to extending existing one because someone might be relying on the size of the existing structure staying unchanged. Add a warning to avoid using sizeof. Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_modern_dev.c | 7 ++++--- include/linux/virtio_pci_modern.h | 7 ------- include/uapi/linux/virtio_pci.h | 11 +++++++++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index e2a1fe7bb66cc..7de8b1ebabac4 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -294,9 +294,10 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) err = -EINVAL; mdev->common = vp_modern_map_capability(mdev, common, - sizeof(struct virtio_pci_common_cfg), 4, - 0, sizeof(struct virtio_pci_modern_common_cfg), - &mdev->common_len, NULL); + sizeof(struct virtio_pci_common_cfg), 4, 0, + offsetofend(struct virtio_pci_modern_common_cfg, + queue_reset), + &mdev->common_len, NULL); if (!mdev->common) goto err_map_common; mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index d0f2797420f70..a09e13a577a99 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -5,13 +5,6 @@ #include #include -struct virtio_pci_modern_common_cfg { - struct virtio_pci_common_cfg cfg; - - __le16 queue_notify_data; /* read-write */ - __le16 queue_reset; /* read-write */ -}; - /** * struct virtio_pci_modern_device - info for modern PCI virtio * @pci_dev: Ptr to the PCI device struct diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index f703afc7ad31b..44f4dd2add188 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -166,6 +166,17 @@ struct virtio_pci_common_cfg { __le32 queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. + */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + __le16 queue_notify_data; /* read-write */ + __le16 queue_reset; /* read-write */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; From 0d82410252ea324f0064e75b9865bb74cccc1dda Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 31 Oct 2023 15:43:39 +0100 Subject: [PATCH 086/560] vdpa_sim_blk: allocate the buffer zeroed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deleting and recreating a device can lead to having the same content as the old device, so let's always allocate buffers completely zeroed out. Fixes: abebb16254b3 ("vdpa_sim_blk: support shared backend") Suggested-by: Qing Wang Signed-off-by: Stefano Garzarella Message-Id: <20231031144339.121453-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index b3a3cb1657955..b137f36793439 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -437,7 +437,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (blk->shared_backend) { blk->buffer = shared_buffer; } else { - blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + blk->buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!blk->buffer) { ret = -ENOMEM; @@ -495,7 +495,7 @@ static int __init vdpasim_blk_init(void) goto parent_err; if (shared_backend) { - shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + shared_buffer = kvzalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, GFP_KERNEL); if (!shared_buffer) { ret = -ENOMEM; From f2de37a572853d340f945a7748f74e3ed8c6b743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 12 Oct 2023 12:28:52 +0200 Subject: [PATCH 087/560] riscv, qemu_fw_cfg: Add support for RISC-V architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Qemu fw_cfg support was missing for RISC-V, which made it hard to do proper vmcore dumps from qemu. Add the missing RISC-V arch-defines. You can now do vmcore dumps from qemu. Add "-device vmcoreinfo" to the qemu command-line. From the qemu monitor: (qemu) dump-guest-memory vmcore The vmcore can now be used, e.g., with the "crash" utility. Acked-by: "Michael S. Tsirkin" Acked-by: Alistair Francis Tested-by: Clément Léger Signed-off-by: Björn Töpel Message-Id: <20231012102852.234442-1-bjorn@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/firmware/Kconfig | 2 +- drivers/firmware/qemu_fw_cfg.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index b59e3041fd627..f05ff56629b39 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -155,7 +155,7 @@ config RASPBERRYPI_FIRMWARE config FW_CFG_SYSFS tristate "QEMU fw_cfg device support in sysfs" - depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || SPARC || X86) + depends on SYSFS && (ARM || ARM64 || PARISC || PPC_PMAC || RISCV || SPARC || X86) depends on HAS_IOPORT_MAP default n help diff --git a/drivers/firmware/qemu_fw_cfg.c b/drivers/firmware/qemu_fw_cfg.c index a69399a6b7c00..1448f61173b35 100644 --- a/drivers/firmware/qemu_fw_cfg.c +++ b/drivers/firmware/qemu_fw_cfg.c @@ -211,7 +211,7 @@ static void fw_cfg_io_cleanup(void) /* arch-specific ctrl & data register offsets are not available in ACPI, DT */ #if !(defined(FW_CFG_CTRL_OFF) && defined(FW_CFG_DATA_OFF)) -# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64)) +# if (defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_RISCV)) # define FW_CFG_CTRL_OFF 0x08 # define FW_CFG_DATA_OFF 0x00 # define FW_CFG_DMA_OFF 0x10 From b2c8b644fac1087dfe69a1762c04df090178a5ae Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 25 Oct 2023 16:53:19 +0200 Subject: [PATCH 088/560] virtio_pci: Switch away from deprecated irq_set_affinity_hint Since commit 65c7cdedeb30 ("genirq: Provide new interfaces for affinity hints") irq_set_affinity_hint is being phased out. Switch to new interfaces for setting and applying irq affinity hints. Signed-off-by: Jakub Sitnicki Message-Id: <20231025145319.380775-1-jakub@cloudflare.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_pci_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index c2524a7207cfa..7a5593997e0ef 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -242,7 +242,7 @@ void vp_del_vqs(struct virtio_device *vdev) if (v != VIRTIO_MSI_NO_VECTOR) { int irq = pci_irq_vector(vp_dev->pci_dev, v); - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } @@ -443,10 +443,10 @@ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) - irq_set_affinity_hint(irq, NULL); + irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); - irq_set_affinity_hint(irq, mask); + irq_set_affinity_and_hint(irq, mask); } } return 0; From e07754e0a1ea2d63fb29574253d1fd7405607343 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Oct 2023 15:12:54 +0300 Subject: [PATCH 089/560] vhost-vdpa: fix use after free in vhost_vdpa_probe() The put_device() calls vhost_vdpa_release_dev() which calls ida_simple_remove() and frees "v". So this call to ida_simple_remove() is a use after free and a double free. Fixes: ebe6a354fa7e ("vhost-vdpa: Call ida_simple_remove() when failed") Signed-off-by: Dan Carpenter Message-Id: Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 30df5c58db73a..da7ec77cdaff0 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1582,7 +1582,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) err: put_device(&v->dev); - ida_simple_remove(&vhost_vdpa_ida, v->minor); return r; } From dec96fc2dcb59723e041416b8dc53e011b4bfc2e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 13 Oct 2023 10:05:48 +0100 Subject: [PATCH 090/560] btrfs: use u64 for buffer sizes in the tree search ioctls In the tree search v2 ioctl we use the type size_t, which is an unsigned long, to track the buffer size in the local variable 'buf_size'. An unsigned long is 32 bits wide on a 32 bits architecture. The buffer size defined in struct btrfs_ioctl_search_args_v2 is a u64, so when we later try to copy the local variable 'buf_size' to the argument struct, when the search returns -EOVERFLOW, we copy only 32 bits which will be a problem on big endian systems. Fix this by using a u64 type for the buffer sizes, not only at btrfs_ioctl_tree_search_v2(), but also everywhere down the call chain so that we can use the u64 at btrfs_ioctl_tree_search_v2(). Fixes: cc68a8a5a433 ("btrfs: new ioctl TREE_SEARCH_V2") Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/ce6f4bd6-9453-4ffe-ba00-cee35495e10f@moroto.mountain/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89cd212735ea6..f7e94aff41ae5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1528,7 +1528,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1660,7 @@ static noinline int copy_to_sk(struct btrfs_path *path, static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1733,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1763,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; From b8212814d1e8428a082234223105e4071b844fab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 12 Oct 2023 12:42:55 +0300 Subject: [PATCH 091/560] btrfs: directly return 0 on no error code in btrfs_insert_raid_extent() It's more obvious to return a literal zero instead of "return ret;". Plus Smatch complains that ret could be uninitialized if the ordered_extent->bioc_list list is empty and this silences that warning. Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 944e8f1862aaa..9589362acfbf9 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, btrfs_put_bioc(bioc); } - return ret; + return 0; } int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, From dfcb03ae8a341600d72fbf3c79429f306764d653 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 16:23:22 +0900 Subject: [PATCH 092/560] btrfs: zoned: drop no longer valid write pointer check There is a check of the write pointer vs the zone size to reject an invalid write pointer. However, as of now, we have RAID0/RAID10 on the zoned mode, we can have a block group whose size is larger than the zone size. As an equivalent check against the block group's zone_capacity is already there, we can just drop this invalid check. Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3504ade30cb0c..188378ca19c7f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1661,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", From 776a838f1fa95670c1c1cf7109a898090b473fa3 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 17 Oct 2023 17:00:31 +0900 Subject: [PATCH 093/560] btrfs: zoned: wait for data BG to be finished on direct IO allocation Running the fio command below on a ZNS device results in "Resource temporarily unavailable" error. $ sudo fio --name=w --directory=/mnt --filesize=1GB --bs=16MB --numjobs=16 \ --rw=write --ioengine=libaio --iodepth=128 --direct=1 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=117440512, buflen=16777216 fio: io_u error on file /mnt/w.2.0: Resource temporarily unavailable: write offset=134217728, buflen=16777216 ... This happens because -EAGAIN error returned from btrfs_reserve_extent() called from btrfs_new_extent_direct() is spilling over to the userland. btrfs_reserve_extent() returns -EAGAIN when there is no active zone available. Then, the caller should wait for some other on-going IO to finish a zone and retry the allocation. This logic is already implemented for buffered write in cow_file_range(), but it is missing for the direct IO counterpart. Implement the same logic for it. Reported-by: Shinichiro Kawasaki Fixes: 2ce543f47843 ("btrfs: zoned: wait until zone is finished when allocation didn't progress") CC: stable@vger.kernel.org # 6.1+ Tested-by: Shinichiro Kawasaki Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b388505c91cc1..137c4824ff912 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6979,8 +6979,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); From d8ba2a91fc3cd0347823435971f58f473cbba7aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 Oct 2023 15:18:17 -0400 Subject: [PATCH 094/560] btrfs: get correct owning_root when dropping snapshot Dave reported a bug where we were aborting the transaction while trying to cleanup the squota reservation for an extent. This turned out to be because we're doing btrfs_header_owner(next) in do_walk_down when we decide to free the block. However in this code block we haven't explicitly read next, so it could be stale. We would then get whatever garbage happened to be in the pages at this point. The commit that introduced that is "btrfs: track owning root in btrfs_ref". Fix this by saving the owner_root when we do the btrfs_lookup_extent_info(). We always do this in do_walk_down, it is how we make the decision of whether or not to delete the block. This is cheap because we've already done the extent item lookup at this point, so it's straightforward to just grab the owner root as well. Then we can use this when deleting the metadata block without needing to force a read of the extent buffer to find the owner. This fixes the problem that Dave reported. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- fs/btrfs/extent-tree.h | 3 ++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c0c5f22398207..14cefeaf96221 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -421,7 +421,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c8e5b4715b495..0455935ff5588 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -167,6 +169,8 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { ret = -EUCLEAN; btrfs_err(fs_info, @@ -226,6 +230,8 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -5234,7 +5240,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; @@ -5301,7 +5307,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5391,6 +5398,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5434,7 +5442,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5567,8 +5576,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, - btrfs_header_owner(next)); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5635,7 +5643,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5880,7 +5889,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0716f65d9753b..2e066035cceee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, From 47e2b06b7b5cb356a987ba3429550c3a89ea89d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 28 Oct 2023 13:28:45 +1030 Subject: [PATCH 095/560] btrfs: make found_logical_ret parameter mandatory for function queue_scrub_stripe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a compilation warning reported on commit ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO"), where gcc (14.0.0 20231022 experimental) is reporting the following uninitialized variable: fs/btrfs/scrub.c: In function ‘scrub_simple_mirror.isra’: fs/btrfs/scrub.c:2075:29: error: ‘found_logical’ may be used uninitialized [-Werror=maybe-uninitialized[https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wmaybe-uninitialized]] 2075 | cur_logical = found_logical + BTRFS_STRIPE_LEN; fs/btrfs/scrub.c:2040:21: note: ‘found_logical’ was declared here 2040 | u64 found_logical; | ^~~~~~~~~~~~~ [CAUSE] This is a false alert, as @found_logical is passed as parameter @found_logical_ret of function queue_scrub_stripe(). As long as queue_scrub_stripe() returned 0, we would update @found_logical_ret. And if queue_scrub_stripe() returned >0 or <0, the caller would not utilized @found_logical, thus there should be nothing wrong. Although the triggering gcc is still experimental, it looks like the extra check on "if (found_logical_ret)" can sometimes confuse the compiler. Meanwhile the only caller of queue_scrub_stripe() is always passing a valid pointer, there is no need for such check at all. [FIX] Although the report itself is a false alert, we can still make it more explicit by: - Replace the check for @found_logical_ret with ASSERT() - Initialize @found_logical to U64_MAX - Add one extra ASSERT() to make sure @found_logical got updated Link: https://lore.kernel.org/linux-btrfs/87fs1x1p93.fsf@gentoo.org/ Fixes: ae76d8e3e135 ("btrfs: scrub: fix grouping of read IO") Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ce5be21b0360..f62a408671cbc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1868,6 +1868,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * */ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, @@ -1876,8 +1879,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; - if (found_logical_ret) - *found_logical_ret = stripe->logical; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; /* We filled one group, submit it. */ @@ -2080,7 +2082,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 found_logical; + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -2115,6 +2117,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ + ASSERT(found_logical != U64_MAX); cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ From cd63ffbd23edc176f09cac5c9287db732d7cbb73 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 30 Oct 2023 11:54:23 +0000 Subject: [PATCH 096/560] btrfs: fix error pointer dereference after failure to allocate fs devices At device_list_add() we allocate a btrfs_fs_devices structure and then before checking if the allocation failed (pointer is ERR_PTR(-ENOMEM)), we dereference the error pointer in a memcpy() argument if the feature BTRFS_FEATURE_INCOMPAT_METADATA_UUID is enabled. Fix this by checking for an allocation error before trying the memcpy(). Fixes: f7361d8c3fc3 ("btrfs: sipmlify uuid parameters of alloc_fs_devices()") Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1fdfa9153e30c..dd279241f78c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -746,13 +746,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return ERR_CAST(fs_devices); + if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); - if (IS_ERR(fs_devices)) - return ERR_CAST(fs_devices); - if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; From c692800cb2ef7a4f4940c68d765cd4649aff3e46 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 2 Nov 2023 02:33:14 +0300 Subject: [PATCH 097/560] MAINTAINERS: Add Intel TDX entry Add myself as Intel TDX maintainer. I drove upstreaming most of TDX code so far and I will continue working on TDX for foreseeable future. [ dhansen: * Add myself as a reviewer too * Swap Maintained=>Supported. I double checked Kirill is still being paid * Add drivers/virt/coco/tdx-guest ] Suggested-by: Dave Hansen Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Acked-by: Dave Hansen Acked-by: Kuppuswamy Sathyanarayanan Acked-by: Kai Huang Link: https://lore.kernel.org/all/20231101233314.2567-1-kirill.shutemov%40linux.intel.com --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index dd5de540ec0b5..b697020eca1c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23460,6 +23460,20 @@ F: arch/x86/kernel/dumpstack.c F: arch/x86/kernel/stacktrace.c F: arch/x86/kernel/unwind_*.c +X86 TRUST DOMAIN EXTENSIONS (TDX) +M: Kirill A. Shutemov +R: Dave Hansen +L: x86@kernel.org +L: linux-coco@lists.linux.dev +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx +F: arch/x86/boot/compressed/tdx* +F: arch/x86/coco/tdx/ +F: arch/x86/include/asm/shared/tdx.h +F: arch/x86/include/asm/tdx.h +F: arch/x86/virt/vmx/tdx/ +F: drivers/virt/coco/tdx-guest + X86 VDSO M: Andy Lutomirski L: linux-kernel@vger.kernel.org From d3badb15613c14dd35d3495b1dde5c90fcd616dd Mon Sep 17 00:00:00 2001 From: Fang Xiang Date: Mon, 30 Oct 2023 16:32:56 +0800 Subject: [PATCH 098/560] irqchip/gic-v3-its: Flush ITS tables correctly in non-coherent GIC designs In non-coherent GIC designs, the ITS tables must be flushed before writing to the GITS_BASER registers, otherwise the ITS could read dirty tables, which results in unpredictable behavior. Flush the tables right at the begin of its_setup_baser() to prevent that. [ tglx: Massage changelog ] Fixes: a8707f553884 ("irqchip/gic-v3: Add Rockchip 3588001 erratum workaround") Suggested-by: Marc Zyngier Signed-off-by: Fang Xiang Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20231030083256.4345-1-fangxiang3@xiaomi.com --- drivers/irqchip/irq-gic-v3-its.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index a8c89df1a9978..9a7a74239eabb 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2379,12 +2379,12 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, break; } + if (!shr) + gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); + its_write_baser(its, baser, val); tmp = baser->val; - if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) - tmp &= ~GITS_BASER_SHAREABILITY_MASK; - if ((val ^ tmp) & GITS_BASER_SHAREABILITY_MASK) { /* * Shareability didn't stick. Just use @@ -2394,10 +2394,9 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser, * non-cacheable as well. */ shr = tmp & GITS_BASER_SHAREABILITY_MASK; - if (!shr) { + if (!shr) cache = GITS_BASER_nC; - gic_flush_dcache_to_poc(base, PAGE_ORDER_TO_SIZE(order)); - } + goto retry_baser; } @@ -2609,6 +2608,11 @@ static int its_alloc_tables(struct its_node *its) /* erratum 24313: ignore memory access type */ cache = GITS_BASER_nCnB; + if (its->flags & ITS_FLAGS_FORCE_NON_SHAREABLE) { + cache = GITS_BASER_nC; + shr = 0; + } + for (i = 0; i < GITS_BASER_NR_REGS; i++) { struct its_baser *baser = its->tables + i; u64 val = its_read_baser(its, baser); From 18216762bcf618c52b85719d3563243f80e4a2d4 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Mon, 6 Nov 2023 17:12:04 +0700 Subject: [PATCH 099/560] x86/Documentation: Indent 'note::' directive for protocol version number note The protocol version number note is between the protocol version table and the memory layout section. As such, Sphinx renders the note directive not only on the actual note, but until the end of doc. Indent the directive so that only the actual protocol version number note is rendered as such. Fixes: 2c33c27fd603 ("x86/boot: Introduce kernel_info") Signed-off-by: Bagas Sanjaya Signed-off-by: Ingo Molnar Cc: Jonathan Corbet Link: https://lore.kernel.org/r/20231106101206.76487-2-bagasdotme@gmail.com Signed-off-by: Ingo Molnar --- Documentation/arch/x86/boot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst index f5d2f2414de8b..22cc7a040dae0 100644 --- a/Documentation/arch/x86/boot.rst +++ b/Documentation/arch/x86/boot.rst @@ -77,7 +77,7 @@ Protocol 2.14 BURNT BY INCORRECT COMMIT Protocol 2.15 (Kernel 5.5) Added the kernel_info and kernel_info.setup_type_max. ============= ============================================================ -.. note:: + .. note:: The protocol version number should be changed only if the setup header is changed. There is no need to update the version number if boot_params or kernel_info are changed. Additionally, it is recommended to use From 258ea41c926b7b3a16d0d7aa210a1401c4a1601b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Nov 2023 12:06:52 +0100 Subject: [PATCH 100/560] Revert "phy: realtek: usb: Add driver for the Realtek SoC USB 3.0 PHY" This reverts commit adda6e82a7de7d6d478f6c8ef127f0ac51c510a1. The recently added Realtek PHY drivers depend on the new port status notification mechanism which was built on the deprecated USB PHY implementation and devicetree binding. Specifically, using these PHYs would require describing the very same PHY using both the generic "phy" property and the deprecated "usb-phy" property which is clearly wrong. We should not be building new functionality on top of the legacy USB PHY implementation even if it is currently stuck in some kind of transitional limbo. Revert the new Realtek PHY drivers for now so that the port status notification interface can be reverted and replaced. Fixes: adda6e82a7de ("phy: realtek: usb: Add driver for the Realtek SoC USB 3.0 PHY") Cc: stable@vger.kernel.org # 6.6 Cc: Stanley Chang Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20231106110654.31090-2-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/phy/realtek/Kconfig | 12 - drivers/phy/realtek/Makefile | 1 - drivers/phy/realtek/phy-rtk-usb3.c | 761 ----------------------------- 3 files changed, 774 deletions(-) delete mode 100644 drivers/phy/realtek/phy-rtk-usb3.c diff --git a/drivers/phy/realtek/Kconfig b/drivers/phy/realtek/Kconfig index 75ac7e7c31aec..7455877510709 100644 --- a/drivers/phy/realtek/Kconfig +++ b/drivers/phy/realtek/Kconfig @@ -17,16 +17,4 @@ config PHY_RTK_RTD_USB2PHY DWC3 USB IP. This driver will do the PHY initialization of the parameters. -config PHY_RTK_RTD_USB3PHY - tristate "Realtek RTD USB3 PHY Transceiver Driver" - depends on USB_SUPPORT - select GENERIC_PHY - select USB_PHY - select USB_COMMON - help - Enable this to support Realtek SoC USB3 phy transceiver. - The DHC (digital home center) RTD series SoCs used the Synopsys - DWC3 USB IP. This driver will do the PHY initialization - of the parameters. - endif # ARCH_REALTEK || COMPILE_TEST diff --git a/drivers/phy/realtek/Makefile b/drivers/phy/realtek/Makefile index ed7b47ff8a268..cf5d440841a27 100644 --- a/drivers/phy/realtek/Makefile +++ b/drivers/phy/realtek/Makefile @@ -1,3 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_PHY_RTK_RTD_USB2PHY) += phy-rtk-usb2.o -obj-$(CONFIG_PHY_RTK_RTD_USB3PHY) += phy-rtk-usb3.o diff --git a/drivers/phy/realtek/phy-rtk-usb3.c b/drivers/phy/realtek/phy-rtk-usb3.c deleted file mode 100644 index 67446a85e9688..0000000000000 --- a/drivers/phy/realtek/phy-rtk-usb3.c +++ /dev/null @@ -1,761 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * phy-rtk-usb3.c RTK usb3.0 phy driver - * - * copyright (c) 2023 realtek semiconductor corporation - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define USB_MDIO_CTRL_PHY_BUSY BIT(7) -#define USB_MDIO_CTRL_PHY_WRITE BIT(0) -#define USB_MDIO_CTRL_PHY_ADDR_SHIFT 8 -#define USB_MDIO_CTRL_PHY_DATA_SHIFT 16 - -#define MAX_USB_PHY_DATA_SIZE 0x30 -#define PHY_ADDR_0X09 0x09 -#define PHY_ADDR_0X0B 0x0b -#define PHY_ADDR_0X0D 0x0d -#define PHY_ADDR_0X10 0x10 -#define PHY_ADDR_0X1F 0x1f -#define PHY_ADDR_0X20 0x20 -#define PHY_ADDR_0X21 0x21 -#define PHY_ADDR_0X30 0x30 - -#define REG_0X09_FORCE_CALIBRATION BIT(9) -#define REG_0X0B_RX_OFFSET_RANGE_MASK 0xc -#define REG_0X0D_RX_DEBUG_TEST_EN BIT(6) -#define REG_0X10_DEBUG_MODE_SETTING 0x3c0 -#define REG_0X10_DEBUG_MODE_SETTING_MASK 0x3f8 -#define REG_0X1F_RX_OFFSET_CODE_MASK 0x1e - -#define USB_U3_TX_LFPS_SWING_TRIM_SHIFT 4 -#define USB_U3_TX_LFPS_SWING_TRIM_MASK 0xf -#define AMPLITUDE_CONTROL_COARSE_MASK 0xff -#define AMPLITUDE_CONTROL_FINE_MASK 0xffff -#define AMPLITUDE_CONTROL_COARSE_DEFAULT 0xff -#define AMPLITUDE_CONTROL_FINE_DEFAULT 0xffff - -#define PHY_ADDR_MAP_ARRAY_INDEX(addr) (addr) -#define ARRAY_INDEX_MAP_PHY_ADDR(index) (index) - -struct phy_reg { - void __iomem *reg_mdio_ctl; -}; - -struct phy_data { - u8 addr; - u16 data; -}; - -struct phy_cfg { - int param_size; - struct phy_data param[MAX_USB_PHY_DATA_SIZE]; - - bool check_efuse; - bool do_toggle; - bool do_toggle_once; - bool use_default_parameter; - bool check_rx_front_end_offset; -}; - -struct phy_parameter { - struct phy_reg phy_reg; - - /* Get from efuse */ - u8 efuse_usb_u3_tx_lfps_swing_trim; - - /* Get from dts */ - u32 amplitude_control_coarse; - u32 amplitude_control_fine; -}; - -struct rtk_phy { - struct usb_phy phy; - struct device *dev; - - struct phy_cfg *phy_cfg; - int num_phy; - struct phy_parameter *phy_parameter; - - struct dentry *debug_dir; -}; - -#define PHY_IO_TIMEOUT_USEC (50000) -#define PHY_IO_DELAY_US (100) - -static inline int utmi_wait_register(void __iomem *reg, u32 mask, u32 result) -{ - int ret; - unsigned int val; - - ret = read_poll_timeout(readl, val, ((val & mask) == result), - PHY_IO_DELAY_US, PHY_IO_TIMEOUT_USEC, false, reg); - if (ret) { - pr_err("%s can't program USB phy\n", __func__); - return -ETIMEDOUT; - } - - return 0; -} - -static int rtk_phy3_wait_vbusy(struct phy_reg *phy_reg) -{ - return utmi_wait_register(phy_reg->reg_mdio_ctl, USB_MDIO_CTRL_PHY_BUSY, 0); -} - -static u16 rtk_phy_read(struct phy_reg *phy_reg, char addr) -{ - unsigned int tmp; - u32 value; - - tmp = (addr << USB_MDIO_CTRL_PHY_ADDR_SHIFT); - - writel(tmp, phy_reg->reg_mdio_ctl); - - rtk_phy3_wait_vbusy(phy_reg); - - value = readl(phy_reg->reg_mdio_ctl); - value = value >> USB_MDIO_CTRL_PHY_DATA_SHIFT; - - return (u16)value; -} - -static int rtk_phy_write(struct phy_reg *phy_reg, char addr, u16 data) -{ - unsigned int val; - - val = USB_MDIO_CTRL_PHY_WRITE | - (addr << USB_MDIO_CTRL_PHY_ADDR_SHIFT) | - (data << USB_MDIO_CTRL_PHY_DATA_SHIFT); - - writel(val, phy_reg->reg_mdio_ctl); - - rtk_phy3_wait_vbusy(phy_reg); - - return 0; -} - -static void do_rtk_usb3_phy_toggle(struct rtk_phy *rtk_phy, int index, bool connect) -{ - struct phy_cfg *phy_cfg = rtk_phy->phy_cfg; - struct phy_reg *phy_reg; - struct phy_parameter *phy_parameter; - struct phy_data *phy_data; - u8 addr; - u16 data; - int i; - - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - if (!phy_cfg->do_toggle) - return; - - i = PHY_ADDR_MAP_ARRAY_INDEX(PHY_ADDR_0X09); - phy_data = phy_cfg->param + i; - addr = phy_data->addr; - data = phy_data->data; - - if (!addr && !data) { - addr = PHY_ADDR_0X09; - data = rtk_phy_read(phy_reg, addr); - phy_data->addr = addr; - phy_data->data = data; - } - - rtk_phy_write(phy_reg, addr, data & (~REG_0X09_FORCE_CALIBRATION)); - mdelay(1); - rtk_phy_write(phy_reg, addr, data | REG_0X09_FORCE_CALIBRATION); -} - -static int do_rtk_phy_init(struct rtk_phy *rtk_phy, int index) -{ - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - struct phy_parameter *phy_parameter; - int i = 0; - - phy_cfg = rtk_phy->phy_cfg; - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - if (phy_cfg->use_default_parameter) - goto do_toggle; - - for (i = 0; i < phy_cfg->param_size; i++) { - struct phy_data *phy_data = phy_cfg->param + i; - u8 addr = phy_data->addr; - u16 data = phy_data->data; - - if (!addr && !data) - continue; - - rtk_phy_write(phy_reg, addr, data); - } - -do_toggle: - if (phy_cfg->do_toggle_once) - phy_cfg->do_toggle = true; - - do_rtk_usb3_phy_toggle(rtk_phy, index, false); - - if (phy_cfg->do_toggle_once) { - u16 check_value = 0; - int count = 10; - u16 value_0x0d, value_0x10; - - /* Enable Debug mode by set 0x0D and 0x10 */ - value_0x0d = rtk_phy_read(phy_reg, PHY_ADDR_0X0D); - value_0x10 = rtk_phy_read(phy_reg, PHY_ADDR_0X10); - - rtk_phy_write(phy_reg, PHY_ADDR_0X0D, - value_0x0d | REG_0X0D_RX_DEBUG_TEST_EN); - rtk_phy_write(phy_reg, PHY_ADDR_0X10, - (value_0x10 & ~REG_0X10_DEBUG_MODE_SETTING_MASK) | - REG_0X10_DEBUG_MODE_SETTING); - - check_value = rtk_phy_read(phy_reg, PHY_ADDR_0X30); - - while (!(check_value & BIT(15))) { - check_value = rtk_phy_read(phy_reg, PHY_ADDR_0X30); - mdelay(1); - if (count-- < 0) - break; - } - - if (!(check_value & BIT(15))) - dev_info(rtk_phy->dev, "toggle fail addr=0x%02x, data=0x%04x\n", - PHY_ADDR_0X30, check_value); - - /* Disable Debug mode by set 0x0D and 0x10 to default*/ - rtk_phy_write(phy_reg, PHY_ADDR_0X0D, value_0x0d); - rtk_phy_write(phy_reg, PHY_ADDR_0X10, value_0x10); - - phy_cfg->do_toggle = false; - } - - if (phy_cfg->check_rx_front_end_offset) { - u16 rx_offset_code, rx_offset_range; - u16 code_mask = REG_0X1F_RX_OFFSET_CODE_MASK; - u16 range_mask = REG_0X0B_RX_OFFSET_RANGE_MASK; - bool do_update = false; - - rx_offset_code = rtk_phy_read(phy_reg, PHY_ADDR_0X1F); - if (((rx_offset_code & code_mask) == 0x0) || - ((rx_offset_code & code_mask) == code_mask)) - do_update = true; - - rx_offset_range = rtk_phy_read(phy_reg, PHY_ADDR_0X0B); - if (((rx_offset_range & range_mask) == range_mask) && do_update) { - dev_warn(rtk_phy->dev, "Don't update rx_offset_range (rx_offset_code=0x%x, rx_offset_range=0x%x)\n", - rx_offset_code, rx_offset_range); - do_update = false; - } - - if (do_update) { - u16 tmp1, tmp2; - - tmp1 = rx_offset_range & (~range_mask); - tmp2 = rx_offset_range & range_mask; - tmp2 += (1 << 2); - rx_offset_range = tmp1 | (tmp2 & range_mask); - rtk_phy_write(phy_reg, PHY_ADDR_0X0B, rx_offset_range); - goto do_toggle; - } - } - - return 0; -} - -static int rtk_phy_init(struct phy *phy) -{ - struct rtk_phy *rtk_phy = phy_get_drvdata(phy); - int ret = 0; - int i; - unsigned long phy_init_time = jiffies; - - for (i = 0; i < rtk_phy->num_phy; i++) - ret = do_rtk_phy_init(rtk_phy, i); - - dev_dbg(rtk_phy->dev, "Initialized RTK USB 3.0 PHY (take %dms)\n", - jiffies_to_msecs(jiffies - phy_init_time)); - - return ret; -} - -static int rtk_phy_exit(struct phy *phy) -{ - return 0; -} - -static const struct phy_ops ops = { - .init = rtk_phy_init, - .exit = rtk_phy_exit, - .owner = THIS_MODULE, -}; - -static void rtk_phy_toggle(struct usb_phy *usb3_phy, bool connect, int port) -{ - int index = port; - struct rtk_phy *rtk_phy = NULL; - - rtk_phy = dev_get_drvdata(usb3_phy->dev); - - if (index > rtk_phy->num_phy) { - dev_err(rtk_phy->dev, "%s: The port=%d is not in usb phy (num_phy=%d)\n", - __func__, index, rtk_phy->num_phy); - return; - } - - do_rtk_usb3_phy_toggle(rtk_phy, index, connect); -} - -static int rtk_phy_notify_port_status(struct usb_phy *x, int port, - u16 portstatus, u16 portchange) -{ - bool connect = false; - - pr_debug("%s port=%d portstatus=0x%x portchange=0x%x\n", - __func__, port, (int)portstatus, (int)portchange); - if (portstatus & USB_PORT_STAT_CONNECTION) - connect = true; - - if (portchange & USB_PORT_STAT_C_CONNECTION) - rtk_phy_toggle(x, connect, port); - - return 0; -} - -#ifdef CONFIG_DEBUG_FS -static struct dentry *create_phy_debug_root(void) -{ - struct dentry *phy_debug_root; - - phy_debug_root = debugfs_lookup("phy", usb_debug_root); - if (!phy_debug_root) - phy_debug_root = debugfs_create_dir("phy", usb_debug_root); - - return phy_debug_root; -} - -static int rtk_usb3_parameter_show(struct seq_file *s, void *unused) -{ - struct rtk_phy *rtk_phy = s->private; - struct phy_cfg *phy_cfg; - int i, index; - - phy_cfg = rtk_phy->phy_cfg; - - seq_puts(s, "Property:\n"); - seq_printf(s, " check_efuse: %s\n", - phy_cfg->check_efuse ? "Enable" : "Disable"); - seq_printf(s, " do_toggle: %s\n", - phy_cfg->do_toggle ? "Enable" : "Disable"); - seq_printf(s, " do_toggle_once: %s\n", - phy_cfg->do_toggle_once ? "Enable" : "Disable"); - seq_printf(s, " use_default_parameter: %s\n", - phy_cfg->use_default_parameter ? "Enable" : "Disable"); - - for (index = 0; index < rtk_phy->num_phy; index++) { - struct phy_reg *phy_reg; - struct phy_parameter *phy_parameter; - - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - seq_printf(s, "PHY %d:\n", index); - - for (i = 0; i < phy_cfg->param_size; i++) { - struct phy_data *phy_data = phy_cfg->param + i; - u8 addr = ARRAY_INDEX_MAP_PHY_ADDR(i); - u16 data = phy_data->data; - - if (!phy_data->addr && !data) - seq_printf(s, " addr = 0x%02x, data = none ==> read value = 0x%04x\n", - addr, rtk_phy_read(phy_reg, addr)); - else - seq_printf(s, " addr = 0x%02x, data = 0x%04x ==> read value = 0x%04x\n", - addr, data, rtk_phy_read(phy_reg, addr)); - } - - seq_puts(s, "PHY Property:\n"); - seq_printf(s, " efuse_usb_u3_tx_lfps_swing_trim: 0x%x\n", - (int)phy_parameter->efuse_usb_u3_tx_lfps_swing_trim); - seq_printf(s, " amplitude_control_coarse: 0x%x\n", - (int)phy_parameter->amplitude_control_coarse); - seq_printf(s, " amplitude_control_fine: 0x%x\n", - (int)phy_parameter->amplitude_control_fine); - } - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(rtk_usb3_parameter); - -static inline void create_debug_files(struct rtk_phy *rtk_phy) -{ - struct dentry *phy_debug_root = NULL; - - phy_debug_root = create_phy_debug_root(); - - if (!phy_debug_root) - return; - - rtk_phy->debug_dir = debugfs_create_dir(dev_name(rtk_phy->dev), phy_debug_root); - - debugfs_create_file("parameter", 0444, rtk_phy->debug_dir, rtk_phy, - &rtk_usb3_parameter_fops); - - return; -} - -static inline void remove_debug_files(struct rtk_phy *rtk_phy) -{ - debugfs_remove_recursive(rtk_phy->debug_dir); -} -#else -static inline void create_debug_files(struct rtk_phy *rtk_phy) { } -static inline void remove_debug_files(struct rtk_phy *rtk_phy) { } -#endif /* CONFIG_DEBUG_FS */ - -static int get_phy_data_by_efuse(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter, int index) -{ - struct phy_cfg *phy_cfg = rtk_phy->phy_cfg; - u8 value = 0; - struct nvmem_cell *cell; - - if (!phy_cfg->check_efuse) - goto out; - - cell = nvmem_cell_get(rtk_phy->dev, "usb_u3_tx_lfps_swing_trim"); - if (IS_ERR(cell)) { - dev_dbg(rtk_phy->dev, "%s no usb_u3_tx_lfps_swing_trim: %ld\n", - __func__, PTR_ERR(cell)); - } else { - unsigned char *buf; - size_t buf_size; - - buf = nvmem_cell_read(cell, &buf_size); - if (!IS_ERR(buf)) { - value = buf[0] & USB_U3_TX_LFPS_SWING_TRIM_MASK; - kfree(buf); - } - nvmem_cell_put(cell); - } - - if (value > 0 && value < 0x8) - phy_parameter->efuse_usb_u3_tx_lfps_swing_trim = 0x8; - else - phy_parameter->efuse_usb_u3_tx_lfps_swing_trim = (u8)value; - -out: - return 0; -} - -static void update_amplitude_control_value(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter) -{ - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - - phy_reg = &phy_parameter->phy_reg; - phy_cfg = rtk_phy->phy_cfg; - - if (phy_parameter->amplitude_control_coarse != AMPLITUDE_CONTROL_COARSE_DEFAULT) { - u16 val_mask = AMPLITUDE_CONTROL_COARSE_MASK; - u16 data; - - if (!phy_cfg->param[PHY_ADDR_0X20].addr && !phy_cfg->param[PHY_ADDR_0X20].data) { - phy_cfg->param[PHY_ADDR_0X20].addr = PHY_ADDR_0X20; - data = rtk_phy_read(phy_reg, PHY_ADDR_0X20); - } else { - data = phy_cfg->param[PHY_ADDR_0X20].data; - } - - data &= (~val_mask); - data |= (phy_parameter->amplitude_control_coarse & val_mask); - - phy_cfg->param[PHY_ADDR_0X20].data = data; - } - - if (phy_parameter->efuse_usb_u3_tx_lfps_swing_trim) { - u8 efuse_val = phy_parameter->efuse_usb_u3_tx_lfps_swing_trim; - u16 val_mask = USB_U3_TX_LFPS_SWING_TRIM_MASK; - int val_shift = USB_U3_TX_LFPS_SWING_TRIM_SHIFT; - u16 data; - - if (!phy_cfg->param[PHY_ADDR_0X20].addr && !phy_cfg->param[PHY_ADDR_0X20].data) { - phy_cfg->param[PHY_ADDR_0X20].addr = PHY_ADDR_0X20; - data = rtk_phy_read(phy_reg, PHY_ADDR_0X20); - } else { - data = phy_cfg->param[PHY_ADDR_0X20].data; - } - - data &= ~(val_mask << val_shift); - data |= ((efuse_val & val_mask) << val_shift); - - phy_cfg->param[PHY_ADDR_0X20].data = data; - } - - if (phy_parameter->amplitude_control_fine != AMPLITUDE_CONTROL_FINE_DEFAULT) { - u16 val_mask = AMPLITUDE_CONTROL_FINE_MASK; - - if (!phy_cfg->param[PHY_ADDR_0X21].addr && !phy_cfg->param[PHY_ADDR_0X21].data) - phy_cfg->param[PHY_ADDR_0X21].addr = PHY_ADDR_0X21; - - phy_cfg->param[PHY_ADDR_0X21].data = - phy_parameter->amplitude_control_fine & val_mask; - } -} - -static int parse_phy_data(struct rtk_phy *rtk_phy) -{ - struct device *dev = rtk_phy->dev; - struct phy_parameter *phy_parameter; - int ret = 0; - int index; - - rtk_phy->phy_parameter = devm_kzalloc(dev, sizeof(struct phy_parameter) * - rtk_phy->num_phy, GFP_KERNEL); - if (!rtk_phy->phy_parameter) - return -ENOMEM; - - for (index = 0; index < rtk_phy->num_phy; index++) { - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - - phy_parameter->phy_reg.reg_mdio_ctl = of_iomap(dev->of_node, 0) + index; - - /* Amplitude control address 0x20 bit 0 to bit 7 */ - if (of_property_read_u32(dev->of_node, "realtek,amplitude-control-coarse-tuning", - &phy_parameter->amplitude_control_coarse)) - phy_parameter->amplitude_control_coarse = AMPLITUDE_CONTROL_COARSE_DEFAULT; - - /* Amplitude control address 0x21 bit 0 to bit 16 */ - if (of_property_read_u32(dev->of_node, "realtek,amplitude-control-fine-tuning", - &phy_parameter->amplitude_control_fine)) - phy_parameter->amplitude_control_fine = AMPLITUDE_CONTROL_FINE_DEFAULT; - - get_phy_data_by_efuse(rtk_phy, phy_parameter, index); - - update_amplitude_control_value(rtk_phy, phy_parameter); - } - - return ret; -} - -static int rtk_usb3phy_probe(struct platform_device *pdev) -{ - struct rtk_phy *rtk_phy; - struct device *dev = &pdev->dev; - struct phy *generic_phy; - struct phy_provider *phy_provider; - const struct phy_cfg *phy_cfg; - int ret; - - phy_cfg = of_device_get_match_data(dev); - if (!phy_cfg) { - dev_err(dev, "phy config are not assigned!\n"); - return -EINVAL; - } - - rtk_phy = devm_kzalloc(dev, sizeof(*rtk_phy), GFP_KERNEL); - if (!rtk_phy) - return -ENOMEM; - - rtk_phy->dev = &pdev->dev; - rtk_phy->phy.dev = rtk_phy->dev; - rtk_phy->phy.label = "rtk-usb3phy"; - rtk_phy->phy.notify_port_status = rtk_phy_notify_port_status; - - rtk_phy->phy_cfg = devm_kzalloc(dev, sizeof(*phy_cfg), GFP_KERNEL); - - memcpy(rtk_phy->phy_cfg, phy_cfg, sizeof(*phy_cfg)); - - rtk_phy->num_phy = 1; - - ret = parse_phy_data(rtk_phy); - if (ret) - goto err; - - platform_set_drvdata(pdev, rtk_phy); - - generic_phy = devm_phy_create(rtk_phy->dev, NULL, &ops); - if (IS_ERR(generic_phy)) - return PTR_ERR(generic_phy); - - phy_set_drvdata(generic_phy, rtk_phy); - - phy_provider = devm_of_phy_provider_register(rtk_phy->dev, of_phy_simple_xlate); - if (IS_ERR(phy_provider)) - return PTR_ERR(phy_provider); - - ret = usb_add_phy_dev(&rtk_phy->phy); - if (ret) - goto err; - - create_debug_files(rtk_phy); - -err: - return ret; -} - -static void rtk_usb3phy_remove(struct platform_device *pdev) -{ - struct rtk_phy *rtk_phy = platform_get_drvdata(pdev); - - remove_debug_files(rtk_phy); - - usb_remove_phy(&rtk_phy->phy); -} - -static const struct phy_cfg rtd1295_phy_cfg = { - .param_size = MAX_USB_PHY_DATA_SIZE, - .param = { [0] = {0x01, 0x4008}, [1] = {0x01, 0xe046}, - [2] = {0x02, 0x6046}, [3] = {0x03, 0x2779}, - [4] = {0x04, 0x72f5}, [5] = {0x05, 0x2ad3}, - [6] = {0x06, 0x000e}, [7] = {0x07, 0x2e00}, - [8] = {0x08, 0x3591}, [9] = {0x09, 0x525c}, - [10] = {0x0a, 0xa600}, [11] = {0x0b, 0xa904}, - [12] = {0x0c, 0xc000}, [13] = {0x0d, 0xef1c}, - [14] = {0x0e, 0x2000}, [15] = {0x0f, 0x0000}, - [16] = {0x10, 0x000c}, [17] = {0x11, 0x4c00}, - [18] = {0x12, 0xfc00}, [19] = {0x13, 0x0c81}, - [20] = {0x14, 0xde01}, [21] = {0x15, 0x0000}, - [22] = {0x16, 0x0000}, [23] = {0x17, 0x0000}, - [24] = {0x18, 0x0000}, [25] = {0x19, 0x4004}, - [26] = {0x1a, 0x1260}, [27] = {0x1b, 0xff00}, - [28] = {0x1c, 0xcb00}, [29] = {0x1d, 0xa03f}, - [30] = {0x1e, 0xc2e0}, [31] = {0x1f, 0x2807}, - [32] = {0x20, 0x947a}, [33] = {0x21, 0x88aa}, - [34] = {0x22, 0x0057}, [35] = {0x23, 0xab66}, - [36] = {0x24, 0x0800}, [37] = {0x25, 0x0000}, - [38] = {0x26, 0x040a}, [39] = {0x27, 0x01d6}, - [40] = {0x28, 0xf8c2}, [41] = {0x29, 0x3080}, - [42] = {0x2a, 0x3082}, [43] = {0x2b, 0x2078}, - [44] = {0x2c, 0xffff}, [45] = {0x2d, 0xffff}, - [46] = {0x2e, 0x0000}, [47] = {0x2f, 0x0040}, }, - .check_efuse = false, - .do_toggle = true, - .do_toggle_once = false, - .use_default_parameter = false, - .check_rx_front_end_offset = false, -}; - -static const struct phy_cfg rtd1619_phy_cfg = { - .param_size = MAX_USB_PHY_DATA_SIZE, - .param = { [8] = {0x08, 0x3591}, - [38] = {0x26, 0x840b}, - [40] = {0x28, 0xf842}, }, - .check_efuse = false, - .do_toggle = true, - .do_toggle_once = false, - .use_default_parameter = false, - .check_rx_front_end_offset = false, -}; - -static const struct phy_cfg rtd1319_phy_cfg = { - .param_size = MAX_USB_PHY_DATA_SIZE, - .param = { [1] = {0x01, 0xac86}, - [6] = {0x06, 0x0003}, - [9] = {0x09, 0x924c}, - [10] = {0x0a, 0xa608}, - [11] = {0x0b, 0xb905}, - [14] = {0x0e, 0x2010}, - [32] = {0x20, 0x705a}, - [33] = {0x21, 0xf645}, - [34] = {0x22, 0x0013}, - [35] = {0x23, 0xcb66}, - [41] = {0x29, 0xff00}, }, - .check_efuse = true, - .do_toggle = true, - .do_toggle_once = false, - .use_default_parameter = false, - .check_rx_front_end_offset = false, -}; - -static const struct phy_cfg rtd1619b_phy_cfg = { - .param_size = MAX_USB_PHY_DATA_SIZE, - .param = { [1] = {0x01, 0xac8c}, - [6] = {0x06, 0x0017}, - [9] = {0x09, 0x724c}, - [10] = {0x0a, 0xb610}, - [11] = {0x0b, 0xb90d}, - [13] = {0x0d, 0xef2a}, - [15] = {0x0f, 0x9050}, - [16] = {0x10, 0x000c}, - [32] = {0x20, 0x70ff}, - [34] = {0x22, 0x0013}, - [35] = {0x23, 0xdb66}, - [38] = {0x26, 0x8609}, - [41] = {0x29, 0xff13}, - [42] = {0x2a, 0x3070}, }, - .check_efuse = true, - .do_toggle = false, - .do_toggle_once = true, - .use_default_parameter = false, - .check_rx_front_end_offset = false, -}; - -static const struct phy_cfg rtd1319d_phy_cfg = { - .param_size = MAX_USB_PHY_DATA_SIZE, - .param = { [1] = {0x01, 0xac89}, - [4] = {0x04, 0xf2f5}, - [6] = {0x06, 0x0017}, - [9] = {0x09, 0x424c}, - [10] = {0x0a, 0x9610}, - [11] = {0x0b, 0x9901}, - [12] = {0x0c, 0xf000}, - [13] = {0x0d, 0xef2a}, - [14] = {0x0e, 0x1000}, - [15] = {0x0f, 0x9050}, - [32] = {0x20, 0x7077}, - [35] = {0x23, 0x0b62}, - [37] = {0x25, 0x10ec}, - [42] = {0x2a, 0x3070}, }, - .check_efuse = true, - .do_toggle = false, - .do_toggle_once = true, - .use_default_parameter = false, - .check_rx_front_end_offset = true, -}; - -static const struct of_device_id usbphy_rtk_dt_match[] = { - { .compatible = "realtek,rtd1295-usb3phy", .data = &rtd1295_phy_cfg }, - { .compatible = "realtek,rtd1319-usb3phy", .data = &rtd1319_phy_cfg }, - { .compatible = "realtek,rtd1319d-usb3phy", .data = &rtd1319d_phy_cfg }, - { .compatible = "realtek,rtd1619-usb3phy", .data = &rtd1619_phy_cfg }, - { .compatible = "realtek,rtd1619b-usb3phy", .data = &rtd1619b_phy_cfg }, - {}, -}; -MODULE_DEVICE_TABLE(of, usbphy_rtk_dt_match); - -static struct platform_driver rtk_usb3phy_driver = { - .probe = rtk_usb3phy_probe, - .remove_new = rtk_usb3phy_remove, - .driver = { - .name = "rtk-usb3phy", - .of_match_table = usbphy_rtk_dt_match, - }, -}; - -module_platform_driver(rtk_usb3phy_driver); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform: rtk-usb3phy"); -MODULE_AUTHOR("Stanley Chang "); -MODULE_DESCRIPTION("Realtek usb 3.0 phy driver"); From 7a784bcdd7e54f0599da3b2360e472238412623e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Nov 2023 12:06:53 +0100 Subject: [PATCH 101/560] Revert "phy: realtek: usb: Add driver for the Realtek SoC USB 2.0 PHY" This reverts commit 134e6d25f6bd06071e5aac0a7eefcea6f7713955. The recently added Realtek PHY drivers depend on the new port status notification mechanism which was built on the deprecated USB PHY implementation and devicetree binding. Specifically, using these PHYs would require describing the very same PHY using both the generic "phy" property and the deprecated "usb-phy" property which is clearly wrong. We should not be building new functionality on top of the legacy USB PHY implementation even if it is currently stuck in some kind of transitional limbo. Revert the new Realtek PHY drivers for now so that the port status notification interface can be reverted and replaced. Fixes: 134e6d25f6bd ("phy: realtek: usb: Add driver for the Realtek SoC USB 2.0 PHY") Cc: stable@vger.kernel.org # 6.6 Cc: Stanley Chang Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20231106110654.31090-3-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/phy/Kconfig | 1 - drivers/phy/Makefile | 1 - drivers/phy/realtek/Kconfig | 20 - drivers/phy/realtek/Makefile | 2 - drivers/phy/realtek/phy-rtk-usb2.c | 1325 ---------------------------- 5 files changed, 1349 deletions(-) delete mode 100644 drivers/phy/realtek/Kconfig delete mode 100644 drivers/phy/realtek/Makefile delete mode 100644 drivers/phy/realtek/phy-rtk-usb2.c diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig index 787354b849c75..4cef568231bf0 100644 --- a/drivers/phy/Kconfig +++ b/drivers/phy/Kconfig @@ -87,7 +87,6 @@ source "drivers/phy/motorola/Kconfig" source "drivers/phy/mscc/Kconfig" source "drivers/phy/qualcomm/Kconfig" source "drivers/phy/ralink/Kconfig" -source "drivers/phy/realtek/Kconfig" source "drivers/phy/renesas/Kconfig" source "drivers/phy/rockchip/Kconfig" source "drivers/phy/samsung/Kconfig" diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile index 868a220ed0f6d..fb3dc9de61115 100644 --- a/drivers/phy/Makefile +++ b/drivers/phy/Makefile @@ -26,7 +26,6 @@ obj-y += allwinner/ \ mscc/ \ qualcomm/ \ ralink/ \ - realtek/ \ renesas/ \ rockchip/ \ samsung/ \ diff --git a/drivers/phy/realtek/Kconfig b/drivers/phy/realtek/Kconfig deleted file mode 100644 index 7455877510709..0000000000000 --- a/drivers/phy/realtek/Kconfig +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Phy drivers for Realtek platforms -# - -if ARCH_REALTEK || COMPILE_TEST - -config PHY_RTK_RTD_USB2PHY - tristate "Realtek RTD USB2 PHY Transceiver Driver" - depends on USB_SUPPORT - select GENERIC_PHY - select USB_PHY - select USB_COMMON - help - Enable this to support Realtek SoC USB2 phy transceiver. - The DHC (digital home center) RTD series SoCs used the Synopsys - DWC3 USB IP. This driver will do the PHY initialization - of the parameters. - -endif # ARCH_REALTEK || COMPILE_TEST diff --git a/drivers/phy/realtek/Makefile b/drivers/phy/realtek/Makefile deleted file mode 100644 index cf5d440841a27..0000000000000 --- a/drivers/phy/realtek/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_PHY_RTK_RTD_USB2PHY) += phy-rtk-usb2.o diff --git a/drivers/phy/realtek/phy-rtk-usb2.c b/drivers/phy/realtek/phy-rtk-usb2.c deleted file mode 100644 index 0a6426285c67f..0000000000000 --- a/drivers/phy/realtek/phy-rtk-usb2.c +++ /dev/null @@ -1,1325 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * phy-rtk-usb2.c RTK usb2.0 PHY driver - * - * Copyright (C) 2023 Realtek Semiconductor Corporation - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* GUSB2PHYACCn register */ -#define PHY_NEW_REG_REQ BIT(25) -#define PHY_VSTS_BUSY BIT(23) -#define PHY_VCTRL_SHIFT 8 -#define PHY_REG_DATA_MASK 0xff - -#define GET_LOW_NIBBLE(addr) ((addr) & 0x0f) -#define GET_HIGH_NIBBLE(addr) (((addr) & 0xf0) >> 4) - -#define EFUS_USB_DC_CAL_RATE 2 -#define EFUS_USB_DC_CAL_MAX 7 - -#define EFUS_USB_DC_DIS_RATE 1 -#define EFUS_USB_DC_DIS_MAX 7 - -#define MAX_PHY_DATA_SIZE 20 -#define OFFEST_PHY_READ 0x20 - -#define MAX_USB_PHY_NUM 4 -#define MAX_USB_PHY_PAGE0_DATA_SIZE 16 -#define MAX_USB_PHY_PAGE1_DATA_SIZE 16 -#define MAX_USB_PHY_PAGE2_DATA_SIZE 8 - -#define SET_PAGE_OFFSET 0xf4 -#define SET_PAGE_0 0x9b -#define SET_PAGE_1 0xbb -#define SET_PAGE_2 0xdb - -#define PAGE_START 0xe0 -#define PAGE0_0XE4 0xe4 -#define PAGE0_0XE6 0xe6 -#define PAGE0_0XE7 0xe7 -#define PAGE1_0XE0 0xe0 -#define PAGE1_0XE2 0xe2 - -#define SENSITIVITY_CTRL (BIT(4) | BIT(5) | BIT(6)) -#define ENABLE_AUTO_SENSITIVITY_CALIBRATION BIT(2) -#define DEFAULT_DC_DRIVING_VALUE (0x8) -#define DEFAULT_DC_DISCONNECTION_VALUE (0x6) -#define HS_CLK_SELECT BIT(6) - -struct phy_reg { - void __iomem *reg_wrap_vstatus; - void __iomem *reg_gusb2phyacc0; - int vstatus_index; -}; - -struct phy_data { - u8 addr; - u8 data; -}; - -struct phy_cfg { - int page0_size; - struct phy_data page0[MAX_USB_PHY_PAGE0_DATA_SIZE]; - int page1_size; - struct phy_data page1[MAX_USB_PHY_PAGE1_DATA_SIZE]; - int page2_size; - struct phy_data page2[MAX_USB_PHY_PAGE2_DATA_SIZE]; - - int num_phy; - - bool check_efuse; - int check_efuse_version; -#define CHECK_EFUSE_V1 1 -#define CHECK_EFUSE_V2 2 - int efuse_dc_driving_rate; - int efuse_dc_disconnect_rate; - int dc_driving_mask; - int dc_disconnect_mask; - bool usb_dc_disconnect_at_page0; - int driving_updated_for_dev_dis; - - bool do_toggle; - bool do_toggle_driving; - bool use_default_parameter; - bool is_double_sensitivity_mode; -}; - -struct phy_parameter { - struct phy_reg phy_reg; - - /* Get from efuse */ - s8 efuse_usb_dc_cal; - s8 efuse_usb_dc_dis; - - /* Get from dts */ - bool inverse_hstx_sync_clock; - u32 driving_level; - s32 driving_level_compensate; - s32 disconnection_compensate; -}; - -struct rtk_phy { - struct usb_phy phy; - struct device *dev; - - struct phy_cfg *phy_cfg; - int num_phy; - struct phy_parameter *phy_parameter; - - struct dentry *debug_dir; -}; - -/* mapping 0xE0 to 0 ... 0xE7 to 7, 0xF0 to 8 ,,, 0xF7 to 15 */ -static inline int page_addr_to_array_index(u8 addr) -{ - return (int)((((addr) - PAGE_START) & 0x7) + - ((((addr) - PAGE_START) & 0x10) >> 1)); -} - -static inline u8 array_index_to_page_addr(int index) -{ - return ((((index) + PAGE_START) & 0x7) + - ((((index) & 0x8) << 1) + PAGE_START)); -} - -#define PHY_IO_TIMEOUT_USEC (50000) -#define PHY_IO_DELAY_US (100) - -static inline int utmi_wait_register(void __iomem *reg, u32 mask, u32 result) -{ - int ret; - unsigned int val; - - ret = read_poll_timeout(readl, val, ((val & mask) == result), - PHY_IO_DELAY_US, PHY_IO_TIMEOUT_USEC, false, reg); - if (ret) { - pr_err("%s can't program USB phy\n", __func__); - return -ETIMEDOUT; - } - - return 0; -} - -static char rtk_phy_read(struct phy_reg *phy_reg, char addr) -{ - void __iomem *reg_gusb2phyacc0 = phy_reg->reg_gusb2phyacc0; - unsigned int val; - int ret = 0; - - addr -= OFFEST_PHY_READ; - - /* polling until VBusy == 0 */ - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return (char)ret; - - /* VCtrl = low nibble of addr, and set PHY_NEW_REG_REQ */ - val = PHY_NEW_REG_REQ | (GET_LOW_NIBBLE(addr) << PHY_VCTRL_SHIFT); - writel(val, reg_gusb2phyacc0); - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return (char)ret; - - /* VCtrl = high nibble of addr, and set PHY_NEW_REG_REQ */ - val = PHY_NEW_REG_REQ | (GET_HIGH_NIBBLE(addr) << PHY_VCTRL_SHIFT); - writel(val, reg_gusb2phyacc0); - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return (char)ret; - - val = readl(reg_gusb2phyacc0); - - return (char)(val & PHY_REG_DATA_MASK); -} - -static int rtk_phy_write(struct phy_reg *phy_reg, char addr, char data) -{ - unsigned int val; - void __iomem *reg_wrap_vstatus = phy_reg->reg_wrap_vstatus; - void __iomem *reg_gusb2phyacc0 = phy_reg->reg_gusb2phyacc0; - int shift_bits = phy_reg->vstatus_index * 8; - int ret = 0; - - /* write data to VStatusOut2 (data output to phy) */ - writel((u32)data << shift_bits, reg_wrap_vstatus); - - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return ret; - - /* VCtrl = low nibble of addr, set PHY_NEW_REG_REQ */ - val = PHY_NEW_REG_REQ | (GET_LOW_NIBBLE(addr) << PHY_VCTRL_SHIFT); - - writel(val, reg_gusb2phyacc0); - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return ret; - - /* VCtrl = high nibble of addr, set PHY_NEW_REG_REQ */ - val = PHY_NEW_REG_REQ | (GET_HIGH_NIBBLE(addr) << PHY_VCTRL_SHIFT); - - writel(val, reg_gusb2phyacc0); - ret = utmi_wait_register(reg_gusb2phyacc0, PHY_VSTS_BUSY, 0); - if (ret) - return ret; - - return 0; -} - -static int rtk_phy_set_page(struct phy_reg *phy_reg, int page) -{ - switch (page) { - case 0: - return rtk_phy_write(phy_reg, SET_PAGE_OFFSET, SET_PAGE_0); - case 1: - return rtk_phy_write(phy_reg, SET_PAGE_OFFSET, SET_PAGE_1); - case 2: - return rtk_phy_write(phy_reg, SET_PAGE_OFFSET, SET_PAGE_2); - default: - pr_err("%s error page=%d\n", __func__, page); - } - - return -EINVAL; -} - -static u8 __updated_dc_disconnect_level_page0_0xe4(struct phy_cfg *phy_cfg, - struct phy_parameter *phy_parameter, u8 data) -{ - u8 ret; - s32 val; - s32 dc_disconnect_mask = phy_cfg->dc_disconnect_mask; - int offset = 4; - - val = (s32)((data >> offset) & dc_disconnect_mask) - + phy_parameter->efuse_usb_dc_dis - + phy_parameter->disconnection_compensate; - - if (val > dc_disconnect_mask) - val = dc_disconnect_mask; - else if (val < 0) - val = 0; - - ret = (data & (~(dc_disconnect_mask << offset))) | - (val & dc_disconnect_mask) << offset; - - return ret; -} - -/* updated disconnect level at page0 */ -static void update_dc_disconnect_level_at_page0(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter, bool update) -{ - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - struct phy_data *phy_data_page; - struct phy_data *phy_data; - u8 addr, data; - int offset = 4; - s32 dc_disconnect_mask; - int i; - - phy_cfg = rtk_phy->phy_cfg; - phy_reg = &phy_parameter->phy_reg; - - /* Set page 0 */ - phy_data_page = phy_cfg->page0; - rtk_phy_set_page(phy_reg, 0); - - i = page_addr_to_array_index(PAGE0_0XE4); - phy_data = phy_data_page + i; - if (!phy_data->addr) { - phy_data->addr = PAGE0_0XE4; - phy_data->data = rtk_phy_read(phy_reg, PAGE0_0XE4); - } - - addr = phy_data->addr; - data = phy_data->data; - dc_disconnect_mask = phy_cfg->dc_disconnect_mask; - - if (update) - data = __updated_dc_disconnect_level_page0_0xe4(phy_cfg, phy_parameter, data); - else - data = (data & ~(dc_disconnect_mask << offset)) | - (DEFAULT_DC_DISCONNECTION_VALUE << offset); - - if (rtk_phy_write(phy_reg, addr, data)) - dev_err(rtk_phy->dev, - "%s: Error to set page1 parameter addr=0x%x value=0x%x\n", - __func__, addr, data); -} - -static u8 __updated_dc_disconnect_level_page1_0xe2(struct phy_cfg *phy_cfg, - struct phy_parameter *phy_parameter, u8 data) -{ - u8 ret; - s32 val; - s32 dc_disconnect_mask = phy_cfg->dc_disconnect_mask; - - if (phy_cfg->check_efuse_version == CHECK_EFUSE_V1) { - val = (s32)(data & dc_disconnect_mask) - + phy_parameter->efuse_usb_dc_dis - + phy_parameter->disconnection_compensate; - } else { /* for CHECK_EFUSE_V2 or no efuse */ - if (phy_parameter->efuse_usb_dc_dis) - val = (s32)(phy_parameter->efuse_usb_dc_dis + - phy_parameter->disconnection_compensate); - else - val = (s32)((data & dc_disconnect_mask) + - phy_parameter->disconnection_compensate); - } - - if (val > dc_disconnect_mask) - val = dc_disconnect_mask; - else if (val < 0) - val = 0; - - ret = (data & (~dc_disconnect_mask)) | (val & dc_disconnect_mask); - - return ret; -} - -/* updated disconnect level at page1 */ -static void update_dc_disconnect_level_at_page1(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter, bool update) -{ - struct phy_cfg *phy_cfg; - struct phy_data *phy_data_page; - struct phy_data *phy_data; - struct phy_reg *phy_reg; - u8 addr, data; - s32 dc_disconnect_mask; - int i; - - phy_cfg = rtk_phy->phy_cfg; - phy_reg = &phy_parameter->phy_reg; - - /* Set page 1 */ - phy_data_page = phy_cfg->page1; - rtk_phy_set_page(phy_reg, 1); - - i = page_addr_to_array_index(PAGE1_0XE2); - phy_data = phy_data_page + i; - if (!phy_data->addr) { - phy_data->addr = PAGE1_0XE2; - phy_data->data = rtk_phy_read(phy_reg, PAGE1_0XE2); - } - - addr = phy_data->addr; - data = phy_data->data; - dc_disconnect_mask = phy_cfg->dc_disconnect_mask; - - if (update) - data = __updated_dc_disconnect_level_page1_0xe2(phy_cfg, phy_parameter, data); - else - data = (data & ~dc_disconnect_mask) | DEFAULT_DC_DISCONNECTION_VALUE; - - if (rtk_phy_write(phy_reg, addr, data)) - dev_err(rtk_phy->dev, - "%s: Error to set page1 parameter addr=0x%x value=0x%x\n", - __func__, addr, data); -} - -static void update_dc_disconnect_level(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter, bool update) -{ - struct phy_cfg *phy_cfg = rtk_phy->phy_cfg; - - if (phy_cfg->usb_dc_disconnect_at_page0) - update_dc_disconnect_level_at_page0(rtk_phy, phy_parameter, update); - else - update_dc_disconnect_level_at_page1(rtk_phy, phy_parameter, update); -} - -static u8 __update_dc_driving_page0_0xe4(struct phy_cfg *phy_cfg, - struct phy_parameter *phy_parameter, u8 data) -{ - s32 driving_level_compensate = phy_parameter->driving_level_compensate; - s32 dc_driving_mask = phy_cfg->dc_driving_mask; - s32 val; - u8 ret; - - if (phy_cfg->check_efuse_version == CHECK_EFUSE_V1) { - val = (s32)(data & dc_driving_mask) + driving_level_compensate - + phy_parameter->efuse_usb_dc_cal; - } else { /* for CHECK_EFUSE_V2 or no efuse */ - if (phy_parameter->efuse_usb_dc_cal) - val = (s32)((phy_parameter->efuse_usb_dc_cal & dc_driving_mask) - + driving_level_compensate); - else - val = (s32)(data & dc_driving_mask); - } - - if (val > dc_driving_mask) - val = dc_driving_mask; - else if (val < 0) - val = 0; - - ret = (data & (~dc_driving_mask)) | (val & dc_driving_mask); - - return ret; -} - -static void update_dc_driving_level(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter) -{ - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - - phy_reg = &phy_parameter->phy_reg; - phy_cfg = rtk_phy->phy_cfg; - if (!phy_cfg->page0[4].addr) { - rtk_phy_set_page(phy_reg, 0); - phy_cfg->page0[4].addr = PAGE0_0XE4; - phy_cfg->page0[4].data = rtk_phy_read(phy_reg, PAGE0_0XE4); - } - - if (phy_parameter->driving_level != DEFAULT_DC_DRIVING_VALUE) { - u32 dc_driving_mask; - u8 driving_level; - u8 data; - - data = phy_cfg->page0[4].data; - dc_driving_mask = phy_cfg->dc_driving_mask; - driving_level = data & dc_driving_mask; - - dev_dbg(rtk_phy->dev, "%s driving_level=%d => dts driving_level=%d\n", - __func__, driving_level, phy_parameter->driving_level); - - phy_cfg->page0[4].data = (data & (~dc_driving_mask)) | - (phy_parameter->driving_level & dc_driving_mask); - } - - phy_cfg->page0[4].data = __update_dc_driving_page0_0xe4(phy_cfg, - phy_parameter, - phy_cfg->page0[4].data); -} - -static void update_hs_clk_select(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter) -{ - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - - phy_cfg = rtk_phy->phy_cfg; - phy_reg = &phy_parameter->phy_reg; - - if (phy_parameter->inverse_hstx_sync_clock) { - if (!phy_cfg->page0[6].addr) { - rtk_phy_set_page(phy_reg, 0); - phy_cfg->page0[6].addr = PAGE0_0XE6; - phy_cfg->page0[6].data = rtk_phy_read(phy_reg, PAGE0_0XE6); - } - - phy_cfg->page0[6].data = phy_cfg->page0[6].data | HS_CLK_SELECT; - } -} - -static void do_rtk_phy_toggle(struct rtk_phy *rtk_phy, - int index, bool connect) -{ - struct phy_parameter *phy_parameter; - struct phy_cfg *phy_cfg; - struct phy_reg *phy_reg; - struct phy_data *phy_data_page; - u8 addr, data; - int i; - - phy_cfg = rtk_phy->phy_cfg; - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - if (!phy_cfg->do_toggle) - goto out; - - if (phy_cfg->is_double_sensitivity_mode) - goto do_toggle_driving; - - /* Set page 0 */ - rtk_phy_set_page(phy_reg, 0); - - addr = PAGE0_0XE7; - data = rtk_phy_read(phy_reg, addr); - - if (connect) - rtk_phy_write(phy_reg, addr, data & (~SENSITIVITY_CTRL)); - else - rtk_phy_write(phy_reg, addr, data | (SENSITIVITY_CTRL)); - -do_toggle_driving: - - if (!phy_cfg->do_toggle_driving) - goto do_toggle; - - /* Page 0 addr 0xE4 driving capability */ - - /* Set page 0 */ - phy_data_page = phy_cfg->page0; - rtk_phy_set_page(phy_reg, 0); - - i = page_addr_to_array_index(PAGE0_0XE4); - addr = phy_data_page[i].addr; - data = phy_data_page[i].data; - - if (connect) { - rtk_phy_write(phy_reg, addr, data); - } else { - u8 value; - s32 tmp; - s32 driving_updated = - phy_cfg->driving_updated_for_dev_dis; - s32 dc_driving_mask = phy_cfg->dc_driving_mask; - - tmp = (s32)(data & dc_driving_mask) + driving_updated; - - if (tmp > dc_driving_mask) - tmp = dc_driving_mask; - else if (tmp < 0) - tmp = 0; - - value = (data & (~dc_driving_mask)) | (tmp & dc_driving_mask); - - rtk_phy_write(phy_reg, addr, value); - } - -do_toggle: - /* restore dc disconnect level before toggle */ - update_dc_disconnect_level(rtk_phy, phy_parameter, false); - - /* Set page 1 */ - rtk_phy_set_page(phy_reg, 1); - - addr = PAGE1_0XE0; - data = rtk_phy_read(phy_reg, addr); - - rtk_phy_write(phy_reg, addr, data & - (~ENABLE_AUTO_SENSITIVITY_CALIBRATION)); - mdelay(1); - rtk_phy_write(phy_reg, addr, data | - (ENABLE_AUTO_SENSITIVITY_CALIBRATION)); - - /* update dc disconnect level after toggle */ - update_dc_disconnect_level(rtk_phy, phy_parameter, true); - -out: - return; -} - -static int do_rtk_phy_init(struct rtk_phy *rtk_phy, int index) -{ - struct phy_parameter *phy_parameter; - struct phy_cfg *phy_cfg; - struct phy_data *phy_data_page; - struct phy_reg *phy_reg; - int i; - - phy_cfg = rtk_phy->phy_cfg; - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - if (phy_cfg->use_default_parameter) { - dev_dbg(rtk_phy->dev, "%s phy#%d use default parameter\n", - __func__, index); - goto do_toggle; - } - - /* Set page 0 */ - phy_data_page = phy_cfg->page0; - rtk_phy_set_page(phy_reg, 0); - - for (i = 0; i < phy_cfg->page0_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = phy_data->addr; - u8 data = phy_data->data; - - if (!addr) - continue; - - if (rtk_phy_write(phy_reg, addr, data)) { - dev_err(rtk_phy->dev, - "%s: Error to set page0 parameter addr=0x%x value=0x%x\n", - __func__, addr, data); - return -EINVAL; - } - } - - /* Set page 1 */ - phy_data_page = phy_cfg->page1; - rtk_phy_set_page(phy_reg, 1); - - for (i = 0; i < phy_cfg->page1_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = phy_data->addr; - u8 data = phy_data->data; - - if (!addr) - continue; - - if (rtk_phy_write(phy_reg, addr, data)) { - dev_err(rtk_phy->dev, - "%s: Error to set page1 parameter addr=0x%x value=0x%x\n", - __func__, addr, data); - return -EINVAL; - } - } - - if (phy_cfg->page2_size == 0) - goto do_toggle; - - /* Set page 2 */ - phy_data_page = phy_cfg->page2; - rtk_phy_set_page(phy_reg, 2); - - for (i = 0; i < phy_cfg->page2_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = phy_data->addr; - u8 data = phy_data->data; - - if (!addr) - continue; - - if (rtk_phy_write(phy_reg, addr, data)) { - dev_err(rtk_phy->dev, - "%s: Error to set page2 parameter addr=0x%x value=0x%x\n", - __func__, addr, data); - return -EINVAL; - } - } - -do_toggle: - do_rtk_phy_toggle(rtk_phy, index, false); - - return 0; -} - -static int rtk_phy_init(struct phy *phy) -{ - struct rtk_phy *rtk_phy = phy_get_drvdata(phy); - unsigned long phy_init_time = jiffies; - int i, ret = 0; - - if (!rtk_phy) - return -EINVAL; - - for (i = 0; i < rtk_phy->num_phy; i++) - ret = do_rtk_phy_init(rtk_phy, i); - - dev_dbg(rtk_phy->dev, "Initialized RTK USB 2.0 PHY (take %dms)\n", - jiffies_to_msecs(jiffies - phy_init_time)); - return ret; -} - -static int rtk_phy_exit(struct phy *phy) -{ - return 0; -} - -static const struct phy_ops ops = { - .init = rtk_phy_init, - .exit = rtk_phy_exit, - .owner = THIS_MODULE, -}; - -static void rtk_phy_toggle(struct usb_phy *usb2_phy, bool connect, int port) -{ - int index = port; - struct rtk_phy *rtk_phy = NULL; - - rtk_phy = dev_get_drvdata(usb2_phy->dev); - - if (index > rtk_phy->num_phy) { - dev_err(rtk_phy->dev, "%s: The port=%d is not in usb phy (num_phy=%d)\n", - __func__, index, rtk_phy->num_phy); - return; - } - - do_rtk_phy_toggle(rtk_phy, index, connect); -} - -static int rtk_phy_notify_port_status(struct usb_phy *x, int port, - u16 portstatus, u16 portchange) -{ - bool connect = false; - - pr_debug("%s port=%d portstatus=0x%x portchange=0x%x\n", - __func__, port, (int)portstatus, (int)portchange); - if (portstatus & USB_PORT_STAT_CONNECTION) - connect = true; - - if (portchange & USB_PORT_STAT_C_CONNECTION) - rtk_phy_toggle(x, connect, port); - - return 0; -} - -#ifdef CONFIG_DEBUG_FS -static struct dentry *create_phy_debug_root(void) -{ - struct dentry *phy_debug_root; - - phy_debug_root = debugfs_lookup("phy", usb_debug_root); - if (!phy_debug_root) - phy_debug_root = debugfs_create_dir("phy", usb_debug_root); - - return phy_debug_root; -} - -static int rtk_usb2_parameter_show(struct seq_file *s, void *unused) -{ - struct rtk_phy *rtk_phy = s->private; - struct phy_cfg *phy_cfg; - int i, index; - - phy_cfg = rtk_phy->phy_cfg; - - seq_puts(s, "Property:\n"); - seq_printf(s, " check_efuse: %s\n", - phy_cfg->check_efuse ? "Enable" : "Disable"); - seq_printf(s, " check_efuse_version: %d\n", - phy_cfg->check_efuse_version); - seq_printf(s, " efuse_dc_driving_rate: %d\n", - phy_cfg->efuse_dc_driving_rate); - seq_printf(s, " dc_driving_mask: 0x%x\n", - phy_cfg->dc_driving_mask); - seq_printf(s, " efuse_dc_disconnect_rate: %d\n", - phy_cfg->efuse_dc_disconnect_rate); - seq_printf(s, " dc_disconnect_mask: 0x%x\n", - phy_cfg->dc_disconnect_mask); - seq_printf(s, " usb_dc_disconnect_at_page0: %s\n", - phy_cfg->usb_dc_disconnect_at_page0 ? "true" : "false"); - seq_printf(s, " do_toggle: %s\n", - phy_cfg->do_toggle ? "Enable" : "Disable"); - seq_printf(s, " do_toggle_driving: %s\n", - phy_cfg->do_toggle_driving ? "Enable" : "Disable"); - seq_printf(s, " driving_updated_for_dev_dis: 0x%x\n", - phy_cfg->driving_updated_for_dev_dis); - seq_printf(s, " use_default_parameter: %s\n", - phy_cfg->use_default_parameter ? "Enable" : "Disable"); - seq_printf(s, " is_double_sensitivity_mode: %s\n", - phy_cfg->is_double_sensitivity_mode ? "Enable" : "Disable"); - - for (index = 0; index < rtk_phy->num_phy; index++) { - struct phy_parameter *phy_parameter; - struct phy_reg *phy_reg; - struct phy_data *phy_data_page; - - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - phy_reg = &phy_parameter->phy_reg; - - seq_printf(s, "PHY %d:\n", index); - - seq_puts(s, "Page 0:\n"); - /* Set page 0 */ - phy_data_page = phy_cfg->page0; - rtk_phy_set_page(phy_reg, 0); - - for (i = 0; i < phy_cfg->page0_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = array_index_to_page_addr(i); - u8 data = phy_data->data; - u8 value = rtk_phy_read(phy_reg, addr); - - if (phy_data->addr) - seq_printf(s, " Page 0: addr=0x%x data=0x%02x ==> read value=0x%02x\n", - addr, data, value); - else - seq_printf(s, " Page 0: addr=0x%x data=none ==> read value=0x%02x\n", - addr, value); - } - - seq_puts(s, "Page 1:\n"); - /* Set page 1 */ - phy_data_page = phy_cfg->page1; - rtk_phy_set_page(phy_reg, 1); - - for (i = 0; i < phy_cfg->page1_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = array_index_to_page_addr(i); - u8 data = phy_data->data; - u8 value = rtk_phy_read(phy_reg, addr); - - if (phy_data->addr) - seq_printf(s, " Page 1: addr=0x%x data=0x%02x ==> read value=0x%02x\n", - addr, data, value); - else - seq_printf(s, " Page 1: addr=0x%x data=none ==> read value=0x%02x\n", - addr, value); - } - - if (phy_cfg->page2_size == 0) - goto out; - - seq_puts(s, "Page 2:\n"); - /* Set page 2 */ - phy_data_page = phy_cfg->page2; - rtk_phy_set_page(phy_reg, 2); - - for (i = 0; i < phy_cfg->page2_size; i++) { - struct phy_data *phy_data = phy_data_page + i; - u8 addr = array_index_to_page_addr(i); - u8 data = phy_data->data; - u8 value = rtk_phy_read(phy_reg, addr); - - if (phy_data->addr) - seq_printf(s, " Page 2: addr=0x%x data=0x%02x ==> read value=0x%02x\n", - addr, data, value); - else - seq_printf(s, " Page 2: addr=0x%x data=none ==> read value=0x%02x\n", - addr, value); - } - -out: - seq_puts(s, "PHY Property:\n"); - seq_printf(s, " efuse_usb_dc_cal: %d\n", - (int)phy_parameter->efuse_usb_dc_cal); - seq_printf(s, " efuse_usb_dc_dis: %d\n", - (int)phy_parameter->efuse_usb_dc_dis); - seq_printf(s, " inverse_hstx_sync_clock: %s\n", - phy_parameter->inverse_hstx_sync_clock ? "Enable" : "Disable"); - seq_printf(s, " driving_level: %d\n", - phy_parameter->driving_level); - seq_printf(s, " driving_level_compensate: %d\n", - phy_parameter->driving_level_compensate); - seq_printf(s, " disconnection_compensate: %d\n", - phy_parameter->disconnection_compensate); - } - - return 0; -} -DEFINE_SHOW_ATTRIBUTE(rtk_usb2_parameter); - -static inline void create_debug_files(struct rtk_phy *rtk_phy) -{ - struct dentry *phy_debug_root = NULL; - - phy_debug_root = create_phy_debug_root(); - if (!phy_debug_root) - return; - - rtk_phy->debug_dir = debugfs_create_dir(dev_name(rtk_phy->dev), - phy_debug_root); - - debugfs_create_file("parameter", 0444, rtk_phy->debug_dir, rtk_phy, - &rtk_usb2_parameter_fops); - - return; -} - -static inline void remove_debug_files(struct rtk_phy *rtk_phy) -{ - debugfs_remove_recursive(rtk_phy->debug_dir); -} -#else -static inline void create_debug_files(struct rtk_phy *rtk_phy) { } -static inline void remove_debug_files(struct rtk_phy *rtk_phy) { } -#endif /* CONFIG_DEBUG_FS */ - -static int get_phy_data_by_efuse(struct rtk_phy *rtk_phy, - struct phy_parameter *phy_parameter, int index) -{ - struct phy_cfg *phy_cfg = rtk_phy->phy_cfg; - u8 value = 0; - struct nvmem_cell *cell; - struct soc_device_attribute rtk_soc_groot[] = { - { .family = "Realtek Groot",}, - { /* empty */ } }; - - if (!phy_cfg->check_efuse) - goto out; - - /* Read efuse for usb dc cal */ - cell = nvmem_cell_get(rtk_phy->dev, "usb-dc-cal"); - if (IS_ERR(cell)) { - dev_dbg(rtk_phy->dev, "%s no usb-dc-cal: %ld\n", - __func__, PTR_ERR(cell)); - } else { - unsigned char *buf; - size_t buf_size; - - buf = nvmem_cell_read(cell, &buf_size); - if (!IS_ERR(buf)) { - value = buf[0] & phy_cfg->dc_driving_mask; - kfree(buf); - } - nvmem_cell_put(cell); - } - - if (phy_cfg->check_efuse_version == CHECK_EFUSE_V1) { - int rate = phy_cfg->efuse_dc_driving_rate; - - if (value <= EFUS_USB_DC_CAL_MAX) - phy_parameter->efuse_usb_dc_cal = (int8_t)(value * rate); - else - phy_parameter->efuse_usb_dc_cal = -(int8_t) - ((EFUS_USB_DC_CAL_MAX & value) * rate); - - if (soc_device_match(rtk_soc_groot)) { - dev_dbg(rtk_phy->dev, "For groot IC we need a workaround to adjust efuse_usb_dc_cal\n"); - - /* We don't multiple dc_cal_rate=2 for positive dc cal compensate */ - if (value <= EFUS_USB_DC_CAL_MAX) - phy_parameter->efuse_usb_dc_cal = (int8_t)(value); - - /* We set max dc cal compensate is 0x8 if otp is 0x7 */ - if (value == 0x7) - phy_parameter->efuse_usb_dc_cal = (int8_t)(value + 1); - } - } else { /* for CHECK_EFUSE_V2 */ - phy_parameter->efuse_usb_dc_cal = value & phy_cfg->dc_driving_mask; - } - - /* Read efuse for usb dc disconnect level */ - value = 0; - cell = nvmem_cell_get(rtk_phy->dev, "usb-dc-dis"); - if (IS_ERR(cell)) { - dev_dbg(rtk_phy->dev, "%s no usb-dc-dis: %ld\n", - __func__, PTR_ERR(cell)); - } else { - unsigned char *buf; - size_t buf_size; - - buf = nvmem_cell_read(cell, &buf_size); - if (!IS_ERR(buf)) { - value = buf[0] & phy_cfg->dc_disconnect_mask; - kfree(buf); - } - nvmem_cell_put(cell); - } - - if (phy_cfg->check_efuse_version == CHECK_EFUSE_V1) { - int rate = phy_cfg->efuse_dc_disconnect_rate; - - if (value <= EFUS_USB_DC_DIS_MAX) - phy_parameter->efuse_usb_dc_dis = (int8_t)(value * rate); - else - phy_parameter->efuse_usb_dc_dis = -(int8_t) - ((EFUS_USB_DC_DIS_MAX & value) * rate); - } else { /* for CHECK_EFUSE_V2 */ - phy_parameter->efuse_usb_dc_dis = value & phy_cfg->dc_disconnect_mask; - } - -out: - return 0; -} - -static int parse_phy_data(struct rtk_phy *rtk_phy) -{ - struct device *dev = rtk_phy->dev; - struct device_node *np = dev->of_node; - struct phy_parameter *phy_parameter; - int ret = 0; - int index; - - rtk_phy->phy_parameter = devm_kzalloc(dev, sizeof(struct phy_parameter) * - rtk_phy->num_phy, GFP_KERNEL); - if (!rtk_phy->phy_parameter) - return -ENOMEM; - - for (index = 0; index < rtk_phy->num_phy; index++) { - phy_parameter = &((struct phy_parameter *)rtk_phy->phy_parameter)[index]; - - phy_parameter->phy_reg.reg_wrap_vstatus = of_iomap(np, 0); - phy_parameter->phy_reg.reg_gusb2phyacc0 = of_iomap(np, 1) + index; - phy_parameter->phy_reg.vstatus_index = index; - - if (of_property_read_bool(np, "realtek,inverse-hstx-sync-clock")) - phy_parameter->inverse_hstx_sync_clock = true; - else - phy_parameter->inverse_hstx_sync_clock = false; - - if (of_property_read_u32_index(np, "realtek,driving-level", - index, &phy_parameter->driving_level)) - phy_parameter->driving_level = DEFAULT_DC_DRIVING_VALUE; - - if (of_property_read_u32_index(np, "realtek,driving-level-compensate", - index, &phy_parameter->driving_level_compensate)) - phy_parameter->driving_level_compensate = 0; - - if (of_property_read_u32_index(np, "realtek,disconnection-compensate", - index, &phy_parameter->disconnection_compensate)) - phy_parameter->disconnection_compensate = 0; - - get_phy_data_by_efuse(rtk_phy, phy_parameter, index); - - update_dc_driving_level(rtk_phy, phy_parameter); - - update_hs_clk_select(rtk_phy, phy_parameter); - } - - return ret; -} - -static int rtk_usb2phy_probe(struct platform_device *pdev) -{ - struct rtk_phy *rtk_phy; - struct device *dev = &pdev->dev; - struct phy *generic_phy; - struct phy_provider *phy_provider; - const struct phy_cfg *phy_cfg; - int ret = 0; - - phy_cfg = of_device_get_match_data(dev); - if (!phy_cfg) { - dev_err(dev, "phy config are not assigned!\n"); - return -EINVAL; - } - - rtk_phy = devm_kzalloc(dev, sizeof(*rtk_phy), GFP_KERNEL); - if (!rtk_phy) - return -ENOMEM; - - rtk_phy->dev = &pdev->dev; - rtk_phy->phy.dev = rtk_phy->dev; - rtk_phy->phy.label = "rtk-usb2phy"; - rtk_phy->phy.notify_port_status = rtk_phy_notify_port_status; - - rtk_phy->phy_cfg = devm_kzalloc(dev, sizeof(*phy_cfg), GFP_KERNEL); - - memcpy(rtk_phy->phy_cfg, phy_cfg, sizeof(*phy_cfg)); - - rtk_phy->num_phy = phy_cfg->num_phy; - - ret = parse_phy_data(rtk_phy); - if (ret) - goto err; - - platform_set_drvdata(pdev, rtk_phy); - - generic_phy = devm_phy_create(rtk_phy->dev, NULL, &ops); - if (IS_ERR(generic_phy)) - return PTR_ERR(generic_phy); - - phy_set_drvdata(generic_phy, rtk_phy); - - phy_provider = devm_of_phy_provider_register(rtk_phy->dev, - of_phy_simple_xlate); - if (IS_ERR(phy_provider)) - return PTR_ERR(phy_provider); - - ret = usb_add_phy_dev(&rtk_phy->phy); - if (ret) - goto err; - - create_debug_files(rtk_phy); - -err: - return ret; -} - -static void rtk_usb2phy_remove(struct platform_device *pdev) -{ - struct rtk_phy *rtk_phy = platform_get_drvdata(pdev); - - remove_debug_files(rtk_phy); - - usb_remove_phy(&rtk_phy->phy); -} - -static const struct phy_cfg rtd1295_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0x90}, - [3] = {0xe3, 0x3a}, - [4] = {0xe4, 0x68}, - [6] = {0xe6, 0x91}, - [13] = {0xf5, 0x81}, - [15] = {0xf7, 0x02}, }, - .page1_size = 8, - .page1 = { /* default parameter */ }, - .page2_size = 0, - .page2 = { /* no parameter */ }, - .num_phy = 1, - .check_efuse = false, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = false, -}; - -static const struct phy_cfg rtd1395_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [4] = {0xe4, 0xac}, - [13] = {0xf5, 0x00}, - [15] = {0xf7, 0x02}, }, - .page1_size = 8, - .page1 = { /* default parameter */ }, - .page2_size = 0, - .page2 = { /* no parameter */ }, - .num_phy = 1, - .check_efuse = false, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = false, -}; - -static const struct phy_cfg rtd1395_phy_cfg_2port = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [4] = {0xe4, 0xac}, - [13] = {0xf5, 0x00}, - [15] = {0xf7, 0x02}, }, - .page1_size = 8, - .page1 = { /* default parameter */ }, - .page2_size = 0, - .page2 = { /* no parameter */ }, - .num_phy = 2, - .check_efuse = false, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = false, -}; - -static const struct phy_cfg rtd1619_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [4] = {0xe4, 0x68}, }, - .page1_size = 8, - .page1 = { /* default parameter */ }, - .page2_size = 0, - .page2 = { /* no parameter */ }, - .num_phy = 1, - .check_efuse = true, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = false, -}; - -static const struct phy_cfg rtd1319_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0x18}, - [4] = {0xe4, 0x6a}, - [7] = {0xe7, 0x71}, - [13] = {0xf5, 0x15}, - [15] = {0xf7, 0x32}, }, - .page1_size = 8, - .page1 = { [3] = {0xe3, 0x44}, }, - .page2_size = MAX_USB_PHY_PAGE2_DATA_SIZE, - .page2 = { [0] = {0xe0, 0x01}, }, - .num_phy = 1, - .check_efuse = true, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = true, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = true, -}; - -static const struct phy_cfg rtd1312c_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0x14}, - [4] = {0xe4, 0x67}, - [5] = {0xe5, 0x55}, }, - .page1_size = 8, - .page1 = { [3] = {0xe3, 0x23}, - [6] = {0xe6, 0x58}, }, - .page2_size = MAX_USB_PHY_PAGE2_DATA_SIZE, - .page2 = { /* default parameter */ }, - .num_phy = 1, - .check_efuse = true, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = 1, - .dc_driving_mask = 0xf, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = true, - .do_toggle = true, - .do_toggle_driving = true, - .driving_updated_for_dev_dis = 0xf, - .use_default_parameter = false, - .is_double_sensitivity_mode = true, -}; - -static const struct phy_cfg rtd1619b_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0xa3}, - [4] = {0xe4, 0x88}, - [5] = {0xe5, 0x4f}, - [6] = {0xe6, 0x02}, }, - .page1_size = 8, - .page1 = { [3] = {0xe3, 0x64}, }, - .page2_size = MAX_USB_PHY_PAGE2_DATA_SIZE, - .page2 = { [7] = {0xe7, 0x45}, }, - .num_phy = 1, - .check_efuse = true, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = EFUS_USB_DC_CAL_RATE, - .dc_driving_mask = 0x1f, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = false, - .do_toggle = true, - .do_toggle_driving = true, - .driving_updated_for_dev_dis = 0x8, - .use_default_parameter = false, - .is_double_sensitivity_mode = true, -}; - -static const struct phy_cfg rtd1319d_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0xa3}, - [4] = {0xe4, 0x8e}, - [5] = {0xe5, 0x4f}, - [6] = {0xe6, 0x02}, }, - .page1_size = MAX_USB_PHY_PAGE1_DATA_SIZE, - .page1 = { [14] = {0xf5, 0x1}, }, - .page2_size = MAX_USB_PHY_PAGE2_DATA_SIZE, - .page2 = { [7] = {0xe7, 0x44}, }, - .check_efuse = true, - .num_phy = 1, - .check_efuse_version = CHECK_EFUSE_V1, - .efuse_dc_driving_rate = EFUS_USB_DC_CAL_RATE, - .dc_driving_mask = 0x1f, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = false, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0x8, - .use_default_parameter = false, - .is_double_sensitivity_mode = true, -}; - -static const struct phy_cfg rtd1315e_phy_cfg = { - .page0_size = MAX_USB_PHY_PAGE0_DATA_SIZE, - .page0 = { [0] = {0xe0, 0xa3}, - [4] = {0xe4, 0x8c}, - [5] = {0xe5, 0x4f}, - [6] = {0xe6, 0x02}, }, - .page1_size = MAX_USB_PHY_PAGE1_DATA_SIZE, - .page1 = { [3] = {0xe3, 0x7f}, - [14] = {0xf5, 0x01}, }, - .page2_size = MAX_USB_PHY_PAGE2_DATA_SIZE, - .page2 = { [7] = {0xe7, 0x44}, }, - .num_phy = 1, - .check_efuse = true, - .check_efuse_version = CHECK_EFUSE_V2, - .efuse_dc_driving_rate = EFUS_USB_DC_CAL_RATE, - .dc_driving_mask = 0x1f, - .efuse_dc_disconnect_rate = EFUS_USB_DC_DIS_RATE, - .dc_disconnect_mask = 0xf, - .usb_dc_disconnect_at_page0 = false, - .do_toggle = true, - .do_toggle_driving = false, - .driving_updated_for_dev_dis = 0x8, - .use_default_parameter = false, - .is_double_sensitivity_mode = true, -}; - -static const struct of_device_id usbphy_rtk_dt_match[] = { - { .compatible = "realtek,rtd1295-usb2phy", .data = &rtd1295_phy_cfg }, - { .compatible = "realtek,rtd1312c-usb2phy", .data = &rtd1312c_phy_cfg }, - { .compatible = "realtek,rtd1315e-usb2phy", .data = &rtd1315e_phy_cfg }, - { .compatible = "realtek,rtd1319-usb2phy", .data = &rtd1319_phy_cfg }, - { .compatible = "realtek,rtd1319d-usb2phy", .data = &rtd1319d_phy_cfg }, - { .compatible = "realtek,rtd1395-usb2phy", .data = &rtd1395_phy_cfg }, - { .compatible = "realtek,rtd1395-usb2phy-2port", .data = &rtd1395_phy_cfg_2port }, - { .compatible = "realtek,rtd1619-usb2phy", .data = &rtd1619_phy_cfg }, - { .compatible = "realtek,rtd1619b-usb2phy", .data = &rtd1619b_phy_cfg }, - {}, -}; -MODULE_DEVICE_TABLE(of, usbphy_rtk_dt_match); - -static struct platform_driver rtk_usb2phy_driver = { - .probe = rtk_usb2phy_probe, - .remove_new = rtk_usb2phy_remove, - .driver = { - .name = "rtk-usb2phy", - .of_match_table = usbphy_rtk_dt_match, - }, -}; - -module_platform_driver(rtk_usb2phy_driver); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform: rtk-usb2phy"); -MODULE_AUTHOR("Stanley Chang "); -MODULE_DESCRIPTION("Realtek usb 2.0 phy driver"); From 1a229d8690a0f8951fc4aa8b76a7efab0d8de342 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 6 Nov 2023 12:06:54 +0100 Subject: [PATCH 102/560] Revert "usb: phy: add usb phy notify port status API" This reverts commit a08799cf17c22375752abfad3b4a2b34b3acb287. The recently added Realtek PHY drivers depend on the new port status notification mechanism which was built on the deprecated USB PHY implementation and devicetree binding. Specifically, using these PHYs would require describing the very same PHY using both the generic "phy" property and the deprecated "usb-phy" property which is clearly wrong. We should not be building new functionality on top of the legacy USB PHY implementation even if it is currently stuck in some kind of transitional limbo. Revert the new notification interface which is broken by design. Fixes: a08799cf17c2 ("usb: phy: add usb phy notify port status API") Cc: stable@vger.kernel.org # 6.6 Cc: Stanley Chang Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20231106110654.31090-4-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 23 ----------------------- include/linux/usb/phy.h | 13 ------------- 2 files changed, 36 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index b4584a0cd4845..87480a6e6d934 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -622,29 +622,6 @@ static int hub_ext_port_status(struct usb_hub *hub, int port1, int type, ret = 0; } mutex_unlock(&hub->status_mutex); - - /* - * There is no need to lock status_mutex here, because status_mutex - * protects hub->status, and the phy driver only checks the port - * status without changing the status. - */ - if (!ret) { - struct usb_device *hdev = hub->hdev; - - /* - * Only roothub will be notified of port state changes, - * since the USB PHY only cares about changes at the next - * level. - */ - if (is_root_hub(hdev)) { - struct usb_hcd *hcd = bus_to_hcd(hdev->bus); - - if (hcd->usb_phy) - usb_phy_notify_port_status(hcd->usb_phy, - port1 - 1, *status, *change); - } - } - return ret; } diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index b513749582d77..e4de6bc1f69b6 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -144,10 +144,6 @@ struct usb_phy { */ int (*set_wakeup)(struct usb_phy *x, bool enabled); - /* notify phy port status change */ - int (*notify_port_status)(struct usb_phy *x, int port, - u16 portstatus, u16 portchange); - /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); @@ -320,15 +316,6 @@ usb_phy_set_wakeup(struct usb_phy *x, bool enabled) return 0; } -static inline int -usb_phy_notify_port_status(struct usb_phy *x, int port, u16 portstatus, u16 portchange) -{ - if (x && x->notify_port_status) - return x->notify_port_status(x, port, portstatus, portchange); - else - return 0; -} - static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { From f2c1dba31133233697fc96e808c6005fc304a8e9 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 28 Jun 2023 09:40:53 -0400 Subject: [PATCH 103/560] tools/power/turbostat: bugfix "--show IPC" turbostat --show IPC displays "inf" for the IPC column turbostat was missing the explicit dependency of IPC on APERF, and thus neglected to collect APERF when only IPC was requested. typcial use: turbostat --quiet --show CPU,IPC Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8a311d7272e75..5aa6598a40a95 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2202,7 +2202,8 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->c1 = new->c1 - old->c1; - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { old->aperf = new->aperf - old->aperf; old->mperf = new->mperf - old->mperf; @@ -2724,7 +2725,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) + || soft_c1_residency_display(BIC_Avg_MHz)) { unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; /* From b8337e6a780dad9505f9d44da07c0a5c52fa0a04 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 7 Nov 2023 23:28:30 -0500 Subject: [PATCH 104/560] tools/power turbostat: version 2023.11.07 Turbostat features are now table-driven (Rui Zhang) Add support for some new platforms (Sumeet Pawnikar, Rui Zhang) Gracefully run in configs when CPUs are limited (Rui Zhang, Srinivas Pandruvada) misc minor fixes. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5aa6598a40a95..7a334377f92b9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -6259,7 +6259,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2023.03.17 - Len Brown \n"); + fprintf(outf, "turbostat version 2023.11.07 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 From 31255e072b2e91f97645d792d25b2db744186dd1 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Tue, 7 Nov 2023 10:22:51 -0800 Subject: [PATCH 105/560] x86/shstk: Delay signal entry SSP write until after user accesses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a signal is being delivered, the kernel needs to make accesses to userspace. These accesses could encounter an access error, in which case the signal delivery itself will trigger a segfault. Usually this would result in the kernel killing the process. But in the case of a SEGV signal handler being configured, the failure of the first signal delivery will result in *another* signal getting delivered. The second signal may succeed if another thread has resolved the issue that triggered the segfault (i.e. a well timed mprotect()/mmap()), or the second signal is being delivered to another stack (i.e. an alt stack). On x86, in the non-shadow stack case, all the accesses to userspace are done before changes to the registers (in pt_regs). The operation is aborted when an access error occurs, so although there may be writes done for the first signal, control flow changes for the signal (regs->ip, regs->sp, etc) are not committed until all the accesses have already completed successfully. This means that the second signal will be delivered as if it happened at the time of the first signal. It will effectively replace the first aborted signal, overwriting the half-written frame of the aborted signal. So on sigreturn from the second signal, control flow will resume happily from the point of control flow where the original signal was delivered. The problem is, when shadow stack is active, the shadow stack SSP register/MSR is updated *before* some of the userspace accesses. This means if the earlier accesses succeed and the later ones fail, the second signal will not be delivered at the same spot on the shadow stack as the first one. So on sigreturn from the second signal, the SSP will be pointing to the wrong location on the shadow stack (off by a frame). Pengfei privately reported that while using a shadow stack enabled glibc, the “signal06” test in the LTP test-suite hung. It turns out it is testing the above described double signal scenario. When this test was compiled with shadow stack, the first signal pushed a shadow stack sigframe, then the second pushed another. When the second signal was handled, the SSP was at the first shadow stack signal frame instead of the original location. The test then got stuck as the #CP from the twice incremented SSP was incorrect and generated segfaults in a loop. Fix this by adjusting the SSP register only after any userspace accesses, such that there can be no failures after the SSP is adjusted. Do this by moving the shadow stack sigframe push logic to happen after all other userspace accesses. Note, sigreturn (as opposed to the signal delivery dealt with in this patch) has ordering behavior that could lead to similar failures. The ordering issues there extend beyond shadow stack to include the alt stack restoration. Fixing that would require cross-arch changes, and the ordering today does not cause any known test or apps breakages. So leave it as is, for now. [ dhansen: minor changelog/subject tweak ] Fixes: 05e36022c054 ("x86/shstk: Handle signals for shadow stack") Reported-by: Pengfei Xu Signed-off-by: Rick Edgecombe Signed-off-by: Dave Hansen Tested-by: Pengfei Xu Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20231107182251.91276-1-rick.p.edgecombe%40intel.com Link: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/signal/signal06.c --- arch/x86/kernel/signal_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cacf2ede62175..23d8aaf8d9fd1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -175,9 +175,6 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp); uc_flags = frame_uc_flags(regs); - if (setup_signal_shadow_stack(ksig)) - return -EFAULT; - if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -198,6 +195,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) return -EFAULT; } + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + /* Set up registers for signal handler */ regs->di = ksig->sig; /* In case the signal handler was declared without prototypes */ From 65120498aaf8d7320647a8b6d6de7db42e74ea52 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:27 +0100 Subject: [PATCH 106/560] stackleak: add declarations for global functions With -Wmissing-prototypes enabled, the stackleak code produces a couple of warnings that have no declarations because they are only called from assembler: stackleak.c:127:25: error: no previous prototype for 'stackleak_erase' [-Werror=missing-prototypes] stackleak.c:139:25: error: no previous prototype for 'stackleak_erase_on_task_stack' [-Werror=missing-prototypes] stackleak.c:151:25: error: no previous prototype for 'stackleak_erase_off_task_stack' [-Werror=missing-prototypes] stackleak.c:159:49: error: no previous prototype for 'stackleak_track_stack' [-Werror=missing-prototypes] Add declarations to the stackleak header to shut up the warnings. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231108125843.3806765-7-arnd@kernel.org Signed-off-by: Kees Cook --- include/linux/stackleak.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index c36e7a3b45e7e..3be2cb564710b 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -14,6 +14,7 @@ #ifdef CONFIG_GCC_PLUGIN_STACKLEAK #include +#include /* * The lowest address on tsk's stack which we can plausibly erase. @@ -76,6 +77,11 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } +asmlinkage void noinstr stackleak_erase(void); +asmlinkage void noinstr stackleak_erase_on_task_stack(void); +asmlinkage void noinstr stackleak_erase_off_task_stack(void); +void __no_caller_saved_registers noinstr stackleak_track_stack(void); + #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif From 1ee60356c2dca938362528404af95b8ef3e49b6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Nov 2023 13:43:37 -0700 Subject: [PATCH 107/560] gcc-plugins: randstruct: Only warn about true flexible arrays The randstruct GCC plugin tried to discover "fake" flexible arrays to issue warnings about them in randomized structs. In the future LSM overhead reduction series, it would be legal to have a randomized struct with a 1-element array, and this should _not_ be treated as a flexible array, especially since commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3"). Disable the 0-sized and 1-element array discovery logic in the plugin, but keep the "true" flexible array check. Cc: KP Singh Cc: linux-hardening@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311021532.iBwuZUZ0-lkp@intel.com/ Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Reviewed-by: Bill Wendling Acked-by: "Gustavo A. R. Silva" Link: https://lore.kernel.org/r/20231104204334.work.160-kees@kernel.org Signed-off-by: Kees Cook --- scripts/gcc-plugins/randomize_layout_plugin.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index 366395cab490d..910bd21d08f48 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -278,8 +278,6 @@ static bool is_flexible_array(const_tree field) { const_tree fieldtype; const_tree typesize; - const_tree elemtype; - const_tree elemsize; fieldtype = TREE_TYPE(field); typesize = TYPE_SIZE(fieldtype); @@ -287,20 +285,12 @@ static bool is_flexible_array(const_tree field) if (TREE_CODE(fieldtype) != ARRAY_TYPE) return false; - elemtype = TREE_TYPE(fieldtype); - elemsize = TYPE_SIZE(elemtype); - /* size of type is represented in bits */ if (typesize == NULL_TREE && TYPE_DOMAIN(fieldtype) != NULL_TREE && TYPE_MAX_VALUE(TYPE_DOMAIN(fieldtype)) == NULL_TREE) return true; - if (typesize != NULL_TREE && - (TREE_CONSTANT(typesize) && (!tree_to_uhwi(typesize) || - tree_to_uhwi(typesize) == tree_to_uhwi(elemsize)))) - return true; - return false; } From 19597cad64d608aa8ac2f8aef50a50187a565223 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Mon, 30 Oct 2023 12:19:12 +0530 Subject: [PATCH 108/560] scsi: qla2xxx: Fix system crash due to bad pointer access User experiences system crash when running AER error injection. The perturbation causes the abort-all-I/O path to trigger. The driver assumes all I/O on this path is FCP only. If there is both NVMe & FCP traffic, a system crash happens. Add additional check to see if I/O is FCP or not before access. PID: 999019 TASK: ff35d769f24722c0 CPU: 53 COMMAND: "kworker/53:1" 0 [ff3f78b964847b58] machine_kexec at ffffffffae86973d 1 [ff3f78b964847ba8] __crash_kexec at ffffffffae9be29d 2 [ff3f78b964847c70] crash_kexec at ffffffffae9bf528 3 [ff3f78b964847c78] oops_end at ffffffffae8282ab 4 [ff3f78b964847c98] exc_page_fault at ffffffffaf2da502 5 [ff3f78b964847cc0] asm_exc_page_fault at ffffffffaf400b62 [exception RIP: qla2x00_abort_srb+444] RIP: ffffffffc07b5f8c RSP: ff3f78b964847d78 RFLAGS: 00010046 RAX: 0000000000000282 RBX: ff35d74a0195a200 RCX: ff35d76886fd03a0 RDX: 0000000000000001 RSI: ffffffffc07c5ec8 RDI: ff35d74a0195a200 RBP: ff35d76913d22080 R8: ff35d7694d103200 R9: ff35d7694d103200 R10: 0000000100000000 R11: ffffffffb05d6630 R12: 0000000000010000 R13: ff3f78b964847df8 R14: ff35d768d8754000 R15: ff35d768877248e0 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 6 [ff3f78b964847d70] qla2x00_abort_srb at ffffffffc07b5f84 [qla2xxx] 7 [ff3f78b964847de0] __qla2x00_abort_all_cmds at ffffffffc07b6238 [qla2xxx] 8 [ff3f78b964847e38] qla2x00_abort_all_cmds at ffffffffc07ba635 [qla2xxx] 9 [ff3f78b964847e58] qla2x00_terminate_rport_io at ffffffffc08145eb [qla2xxx] 10 [ff3f78b964847e70] fc_terminate_rport_io at ffffffffc045987e [scsi_transport_fc] 11 [ff3f78b964847e88] process_one_work at ffffffffae914f15 12 [ff3f78b964847ed0] worker_thread at ffffffffae9154c0 13 [ff3f78b964847f10] kthread at ffffffffae91c456 14 [ff3f78b964847f50] ret_from_fork at ffffffffae8036ef Cc: stable@vger.kernel.org Fixes: f45bca8c5052 ("scsi: qla2xxx: Fix double scsi_done for abort path") Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20231030064912.37912-1-njavali@marvell.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 7e103d7118253..d24410944f7db 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1837,8 +1837,16 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, } spin_lock_irqsave(qp->qp_lock_ptr, *flags); - if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) - sp->done(sp, res); + switch (sp->type) { + case SRB_SCSI_CMD: + if (ret_cmd && blk_mq_request_started(scsi_cmd_to_rq(cmd))) + sp->done(sp, res); + break; + default: + if (ret_cmd) + sp->done(sp, res); + break; + } } else { sp->done(sp, res); } From defde5a50d91c74e1ce71a7f0bce7fb1ae311d84 Mon Sep 17 00:00:00 2001 From: Naomi Chu Date: Thu, 2 Nov 2023 13:24:24 +0800 Subject: [PATCH 109/560] scsi: ufs: core: Expand MCQ queue slot to DeviceQueueDepth + 1 The UFSHCI 4.0 specification mandates that there should always be at least one empty slot in each queue for distinguishing between full and empty states. Enlarge 'hwq->max_entries' to 'DeviceQueueDepth + 1' to allow UFSHCI 4.0 controllers to fully utilize MCQ queue slots. Fixes: 4682abfae2eb ("scsi: ufs: core: mcq: Allocate memory for MCQ mode") Signed-off-by: Naomi Chu Link: https://lore.kernel.org/r/20231102052426.12006-2-naomi.chu@mediatek.com Reviewed-by: Stanley Chu Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Reviewed-by: Chun-Hung Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 2ba8ec254dcee..5c75ab9d6bb50 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -436,7 +436,7 @@ int ufshcd_mcq_init(struct ufs_hba *hba) for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; - hwq->max_entries = hba->nutrs; + hwq->max_entries = hba->nutrs + 1; spin_lock_init(&hwq->sq_lock); spin_lock_init(&hwq->cq_lock); mutex_init(&hwq->sq_mutex); From 27900d7119c464b43cd9eac69c85884d17bae240 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 6 Nov 2023 15:51:17 +0800 Subject: [PATCH 110/560] scsi: ufs: core: Fix racing issue between ufshcd_mcq_abort() and ISR If command timeout happens and cq complete IRQ is raised at the same time, ufshcd_mcq_abort clears lprb->cmd and a NULL pointer deref happens in the ISR. Error log: ufshcd_abort: Device abort task at tag 18 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000108 pc : [0xffffffe27ef867ac] scsi_dma_unmap+0xc/0x44 lr : [0xffffffe27f1b898c] ufshcd_release_scsi_cmd+0x24/0x114 Fixes: f1304d442077 ("scsi: ufs: mcq: Added ufshcd_mcq_abort()") Cc: stable@vger.kernel.org Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20231106075117.8995-1-peter.wang@mediatek.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 5c75ab9d6bb50..0787456c2b892 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -630,6 +630,7 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) int tag = scsi_cmd_to_rq(cmd)->tag; struct ufshcd_lrb *lrbp = &hba->lrb[tag]; struct ufs_hw_queue *hwq; + unsigned long flags; int err = FAILED; if (!ufshcd_cmd_inflight(lrbp->cmd)) { @@ -670,8 +671,10 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) } err = SUCCESS; + spin_lock_irqsave(&hwq->cq_lock, flags); if (ufshcd_cmd_inflight(lrbp->cmd)) ufshcd_release_scsi_cmd(hba, lrbp); + spin_unlock_irqrestore(&hwq->cq_lock, flags); out: return err; From 860c3d03bbc3f17aef8600662c488f27fd093142 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:04:33 +0300 Subject: [PATCH 111/560] scsi: scsi_debug: Fix some bugs in sdebug_error_write() There are two bug in this code: 1) If count is zero, then it will lead to a NULL dereference. The kmalloc() will successfully allocate zero bytes and the test for "if (buf[0] == '-')" will read beyond the end of the zero size buffer and Oops. 2) The code does not ensure that the user's string is properly NUL terminated which could lead to a read overflow. Fixes: a9996d722b11 ("scsi: scsi_debug: Add interface to manage error injection for a single device") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/7733643d-e102-4581-8d29-769472011c97@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 67922e2c4c191..0dd21598f7b6d 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1019,7 +1019,7 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf, struct sdebug_err_inject *inject; struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private; - buf = kmalloc(count, GFP_KERNEL); + buf = kzalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; From 037fbd3fcfbd99145f9310d93f6637012807cfd0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:05:04 +0300 Subject: [PATCH 112/560] scsi: scsi_debug: Delete some bogus error checking Smatch complains that "dentry" is never initialized. These days everyone initializes all their stack variables to zero so this means that it will trigger a warning every time this function is run. Really, debugfs functions are not supposed to be checked for errors in normal code. For example, if we updated this code to check the correct variable then it would print a warning if CONFIG_DEBUGFS was disabled. We don't want that. Just delete the check. Fixes: f084fe52c640 ("scsi: scsi_debug: Add debugfs interface to fail target reset") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/c602c9ad-5e35-4e18-a47f-87ed956a9ec2@moroto.mountain Reviewed-by: Wenchao Hao Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 0dd21598f7b6d..6d8218a441226 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1132,7 +1132,6 @@ static const struct file_operations sdebug_target_reset_fail_fops = { static int sdebug_target_alloc(struct scsi_target *starget) { struct sdebug_target_info *targetip; - struct dentry *dentry; targetip = kzalloc(sizeof(struct sdebug_target_info), GFP_KERNEL); if (!targetip) @@ -1140,15 +1139,9 @@ static int sdebug_target_alloc(struct scsi_target *starget) targetip->debugfs_entry = debugfs_create_dir(dev_name(&starget->dev), sdebug_debugfs_root); - if (IS_ERR_OR_NULL(targetip->debugfs_entry)) - pr_info("%s: failed to create debugfs directory for target %s\n", - __func__, dev_name(&starget->dev)); debugfs_create_file("fail_reset", 0600, targetip->debugfs_entry, starget, &sdebug_target_reset_fail_fops); - if (IS_ERR_OR_NULL(dentry)) - pr_info("%s: failed to create fail_reset file for target %s\n", - __func__, dev_name(&starget->dev)); starget->hostdata = targetip; From 3b83486399a6a9feb9c681b74c21a227d48d7020 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 6 Nov 2023 17:13:04 -0600 Subject: [PATCH 113/560] scsi: sd: Fix sshdr use in sd_suspend_common() If scsi_execute_cmd() returns < 0, it doesn't initialize the sshdr, so we shouldn't access the sshdr. If it returns 0, then the cmd executed successfully, so there is no need to check the sshdr. sd_sync_cache() will only access the sshdr if it's been setup because it calls scsi_status_is_check_condition() before accessing it. However, the sd_sync_cache() caller, sd_suspend_common(), does not check. sd_suspend_common() is only checking for ILLEGAL_REQUEST which it's using to determine if the command is supported. If it's not it just ignores the error. So to fix its sshdr use this patch just moves that check to sd_sync_cache() where it converts ILLEGAL_REQUEST to success/0. sd_suspend_common() was ignoring that error and sd_shutdown() doesn't check for errors so there will be no behavior changes. Signed-off-by: Mike Christie Link: https://lore.kernel.org/r/20231106231304.5694-2-michael.christie@oracle.com Reviewed-by: Christoph Hellwig Reviewed-by: Martin Wilck Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 53 ++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 864db30b9c8e1..3ef3b00397fe2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1565,24 +1565,21 @@ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } -static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) +static int sd_sync_cache(struct scsi_disk *sdkp) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; - struct scsi_sense_hdr my_sshdr; + struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, - /* caller might not be interested in sense, but we need it */ - .sshdr = sshdr ? : &my_sshdr, + .sshdr = &sshdr, }; if (!scsi_device_online(sdp)) return -ENODEV; - sshdr = exec_args.sshdr; - for (retries = 3; retries > 0; --retries) { unsigned char cmd[16] = { 0 }; @@ -1607,15 +1604,23 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) return res; if (scsi_status_is_check_condition(res) && - scsi_sense_valid(sshdr)) { - sd_print_sense_hdr(sdkp, sshdr); + scsi_sense_valid(&sshdr)) { + sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ - if (sshdr->asc == 0x3a || /* medium not present */ - sshdr->asc == 0x20 || /* invalid command */ - (sshdr->asc == 0x74 && sshdr->ascq == 0x71)) /* drive is password locked */ + if (sshdr.asc == 0x3a || /* medium not present */ + sshdr.asc == 0x20 || /* invalid command */ + (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; + /* + * This drive doesn't support sync and there's not much + * we can do because this is called during shutdown + * or suspend so just return success so those operations + * can proceed. + */ + if (sshdr.sense_key == ILLEGAL_REQUEST) + return 0; } switch (host_byte(res)) { @@ -3774,7 +3779,7 @@ static void sd_shutdown(struct device *dev) if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - sd_sync_cache(sdkp, NULL); + sd_sync_cache(sdkp); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { @@ -3786,7 +3791,6 @@ static void sd_shutdown(struct device *dev) static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ @@ -3795,24 +3799,13 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); - ret = sd_sync_cache(sdkp, &sshdr); - - if (ret) { - /* ignore OFFLINE device */ - if (ret == -ENODEV) - return 0; - - if (!scsi_sense_valid(&sshdr) || - sshdr.sense_key != ILLEGAL_REQUEST) - return ret; + ret = sd_sync_cache(sdkp); + /* ignore OFFLINE device */ + if (ret == -ENODEV) + return 0; - /* - * sshdr.sense_key == ILLEGAL_REQUEST means this drive - * doesn't support sync. There's not much to do and - * suspend shouldn't fail. - */ - ret = 0; - } + if (ret) + return ret; } if (sdkp->device->manage_start_stop) { From e439e4a62a8ea3c39d65c546de3af7d1c594077c Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Mon, 30 Oct 2023 10:43:11 +0100 Subject: [PATCH 114/560] scsi: ufs: qcom-ufs: dt-bindings: Document the SM8650 UFS Controller Document the UFS Controller on the SM8650 Platform. Reviewed-by: Alim Akhtar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20231030-topic-sm8650-upstream-bindings-ufs-v3-1-a96364463fd5@linaro.org Reviewed-by: Krzysztof Kozlowski Signed-off-by: Martin K. Petersen --- Documentation/devicetree/bindings/ufs/qcom,ufs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 462ead5a1cec3..2cf3d016db42c 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -36,6 +36,7 @@ properties: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc - const: qcom,ufshc - const: jedec,ufs-2.0 @@ -122,6 +123,7 @@ allOf: - qcom,sm8350-ufshc - qcom,sm8450-ufshc - qcom,sm8550-ufshc + - qcom,sm8650-ufshc then: properties: clocks: From 6c8e69e4a702b072206f166111c003d704de15d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 3 Nov 2023 12:26:25 +0000 Subject: [PATCH 115/560] btrfs: fix race between accounting qgroup extents and removing a qgroup When doing qgroup accounting for an extent, we take the spinlock fs_info->qgroup_lock and then add qgroups to the local list (iterator) named "qgroups". These qgroups are found in the fs_info->qgroup_tree rbtree. After we're done, we unlock fs_info->qgroup_lock and then call qgroup_iterator_nested_clean(), which will iterate over all the qgroups added to the local list "qgroups" and then delete them from the list. Deleting a qgroup from the list can however result in a use-after-free if a qgroup remove operation happens after we unlock fs_info->qgroup_lock and before or while we are at qgroup_iterator_nested_clean(). Fix this by calling qgroup_iterator_nested_clean() while still holding the lock fs_info->qgroup_lock - we don't need it under the 'out' label since before taking the lock the "qgroups" list is always empty. This guarantees safety because btrfs_remove_qgroup() takes that lock before removing a qgroup from the rbtree fs_info->qgroup_tree. This was reported by syzbot with the following stack traces: BUG: KASAN: slab-use-after-free in __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 Read of size 8 at addr ffff888027e420b0 by task kworker/u4:3/48 CPU: 1 PID: 48 Comm: kworker/u4:3 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Workqueue: btrfs-qgroup-rescan btrfs_work_helper Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x163/0x540 mm/kasan/report.c:475 kasan_report+0x175/0x1b0 mm/kasan/report.c:588 __list_del_entry_valid_or_report+0x2f/0x130 lib/list_debug.c:49 __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_init include/linux/list.h:287 [inline] qgroup_iterator_nested_clean fs/btrfs/qgroup.c:2623 [inline] btrfs_qgroup_account_extent+0x18b/0x1150 fs/btrfs/qgroup.c:2883 qgroup_rescan_leaf fs/btrfs/qgroup.c:3543 [inline] btrfs_qgroup_rescan_worker+0x1078/0x1c60 fs/btrfs/qgroup.c:3604 btrfs_work_helper+0x37c/0xbd0 fs/btrfs/async-thread.c:315 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x90f/0x1400 kernel/workqueue.c:2703 worker_thread+0xa5f/0xff0 kernel/workqueue.c:2784 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Allocated by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:383 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] btrfs_quota_enable+0xee9/0x2060 fs/btrfs/qgroup.c:1209 btrfs_ioctl_quota_ctl+0x143/0x190 fs/btrfs/ioctl.c:3705 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 6355: kasan_save_stack mm/kasan/common.c:45 [inline] kasan_set_track+0x4f/0x70 mm/kasan/common.c:52 kasan_save_free_info+0x28/0x40 mm/kasan/generic.c:522 ____kasan_slab_free+0xd6/0x120 mm/kasan/common.c:236 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook mm/slub.c:1826 [inline] slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0x263/0x3a0 mm/slub.c:3822 btrfs_remove_qgroup+0x764/0x8c0 fs/btrfs/qgroup.c:1787 btrfs_ioctl_qgroup_create+0x185/0x1e0 fs/btrfs/ioctl.c:3811 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Second to last potentially related work creation: kasan_save_stack+0x3f/0x60 mm/kasan/common.c:45 __kasan_record_aux_stack+0xad/0xc0 mm/kasan/generic.c:492 __call_rcu_common kernel/rcu/tree.c:2667 [inline] call_rcu+0x167/0xa70 kernel/rcu/tree.c:2781 kthread_worker_fn+0x4ba/0xa90 kernel/kthread.c:823 kthread+0x2d3/0x370 kernel/kthread.c:388 ret_from_fork+0x48/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 The buggy address belongs to the object at ffff888027e42000 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 176 bytes inside of freed 512-byte region [ffff888027e42000, ffff888027e42200) The buggy address belongs to the physical page: page:ffffea00009f9000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x27e40 head:ffffea00009f9000 order:2 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000840(slab|head|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000840 ffff888012c41c80 ffffea0000a5ba00 dead000000000002 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 4514, tgid 4514 (udevadm), ts 24598439480, free_ts 23755696267 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x1e6/0x210 mm/page_alloc.c:1536 prep_new_page mm/page_alloc.c:1543 [inline] get_page_from_freelist+0x31db/0x3360 mm/page_alloc.c:3170 __alloc_pages+0x255/0x670 mm/page_alloc.c:4426 alloc_slab_page+0x6a/0x160 mm/slub.c:1870 allocate_slab mm/slub.c:2017 [inline] new_slab+0x84/0x2f0 mm/slub.c:2070 ___slab_alloc+0xc85/0x1310 mm/slub.c:3223 __slab_alloc mm/slub.c:3322 [inline] __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] __kmem_cache_alloc_node+0x19d/0x270 mm/slub.c:3517 kmalloc_trace+0x2a/0xe0 mm/slab_common.c:1098 kmalloc include/linux/slab.h:600 [inline] kzalloc include/linux/slab.h:721 [inline] kernfs_fop_open+0x3e7/0xcc0 fs/kernfs/file.c:670 do_dentry_open+0x8fd/0x1590 fs/open.c:948 do_open fs/namei.c:3622 [inline] path_openat+0x2845/0x3280 fs/namei.c:3779 do_filp_open+0x234/0x490 fs/namei.c:3809 do_sys_openat2+0x13e/0x1d0 fs/open.c:1440 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1136 [inline] free_unref_page_prepare+0x8c3/0x9f0 mm/page_alloc.c:2312 free_unref_page+0x37/0x3f0 mm/page_alloc.c:2405 discard_slab mm/slub.c:2116 [inline] __unfreeze_partials+0x1dc/0x220 mm/slub.c:2655 put_cpu_partial+0x17b/0x250 mm/slub.c:2731 __slab_free+0x2b6/0x390 mm/slub.c:3679 qlink_free mm/kasan/quarantine.c:166 [inline] qlist_free_all+0x75/0xe0 mm/kasan/quarantine.c:185 kasan_quarantine_reduce+0x14b/0x160 mm/kasan/quarantine.c:292 __kasan_slab_alloc+0x23/0x70 mm/kasan/common.c:305 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook+0x67/0x3d0 mm/slab.h:762 slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x104/0x2c0 mm/slub.c:3502 getname_flags+0xbc/0x4f0 fs/namei.c:140 do_sys_openat2+0xd2/0x1d0 fs/open.c:1434 do_sys_open fs/open.c:1455 [inline] __do_sys_openat fs/open.c:1471 [inline] __se_sys_openat fs/open.c:1466 [inline] __x64_sys_openat+0x247/0x290 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff888027e41f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888027e42000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff888027e42080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888027e42100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888027e42180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Reported-by: syzbot+e0b615318f8fcfc01ceb@syzkaller.appspotmail.com Fixes: dce28769a33a ("btrfs: qgroup: use qgroup_iterator_nested to in qgroup_update_refcnt()") CC: stable@vger.kernel.org # 6.6 Link: https://lore.kernel.org/linux-btrfs/00000000000091a5b2060936bf6d@google.com/ Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index edb84cc032377..e48eba7e93793 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2874,13 +2874,19 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); + /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. + */ + qgroup_iterator_nested_clean(&qgroups); + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: - qgroup_iterator_nested_clean(&qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; From 609d99379736aa6c5b0658654084198aa808035a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Nov 2023 20:17:37 +0000 Subject: [PATCH 116/560] btrfs: fix qgroup record leaks when using simple quotas When using simple quotas we are not supposed to allocate qgroup records when adding delayed references. However we allocate them if either mode of quotas is enabled (the new simple one or the old one), but then we never free them because running the accounting, which frees the records, is only run when using the old quotas (at btrfs_qgroup_account_extents()), resulting in a memory leak of the records allocated when adding delayed references. Fix this by allocating the records only if the old quotas mode is enabled. Also fix btrfs_qgroup_trace_extent_nolock() to return 1 if the old quotas mode is not enabled - meaning the caller has to free the record. Fixes: 182940f4f4db ("btrfs: qgroup: add new quota mode for simple quotas") Reported-by: syzbot+d3ddc6dcc6386dea398b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000004769106097f9a34@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/qgroup.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9223934d95f47..891ea2fa263c9 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e48eba7e93793..ce446d9d7f23d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, u64 bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) - return 0; + return 1; lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); From d3933152442b7f94419e9ea71835d71b620baf0e Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 3 Nov 2023 11:38:04 -0700 Subject: [PATCH 117/560] btrfs: make OWNER_REF_KEY type value smallest among inline refs BTRFS_EXTENT_OWNER_REF_KEY is the type of simple quotas extent owner refs. This special inline ref goes in front of all other inline refs. In general, inline refs have a required sorted order s.t. type never decreases (among other requirements). This was recently reified into a tree-checker and fsck rule, which broke simple quotas. To be fair, though, in a sense, the new owner ref item had also violated that not yet fully enforced requirement. This fix brings the owner ref item into compliance with the requirement that inline ref type never decrease. btrfs/301 exercises this behavior and should pass again with this fix. Fixes: d9a620f77e33 ("btrfs: new inline ref storing owning subvol of data extents") Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index c25fc96145947..d24e8e121507b 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -219,6 +219,22 @@ */ #define BTRFS_METADATA_ITEM_KEY 169 +/* + * Special inline ref key which stores the id of the subvolume which originally + * created the extent. This subvolume owns the extent permanently from the + * perspective of simple quotas. Needed to know which subvolume to free quota + * usage from when the extent is deleted. + * + * Stored as an inline ref rather to avoid wasting space on a separate item on + * top of the existing extent item. However, unlike the other inline refs, + * there is one one owner ref per extent rather than one per extent. + * + * Because of this, it goes at the front of the list of inline refs, and thus + * must have a lower type value than any other inline ref type (to satisfy the + * disk format rule that inline refs have non-decreasing type). + */ +#define BTRFS_EXTENT_OWNER_REF_KEY 172 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -233,14 +249,6 @@ #define BTRFS_SHARED_DATA_REF_KEY 184 -/* - * Special inline ref key which stores the id of the subvolume which originally - * created the extent. This subvolume owns the extent permanently from the - * perspective of simple quotas. Needed to know which subvolume to free quota - * usage from when the extent is deleted. - */ -#define BTRFS_EXTENT_OWNER_REF_KEY 188 - /* * block groups give us hints into the extent allocation trees. Which * blocks are free etc etc From ec9aedb2aa1ab7ac420c00b31f5edc5be15ec167 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 3 Jul 2023 00:28:02 +0800 Subject: [PATCH 118/560] x86/acpi: Ignore invalid x2APIC entries Currently, the kernel enumerates the possible CPUs by parsing both ACPI MADT Local APIC entries and x2APIC entries. So CPUs with "valid" APIC IDs, even if they have duplicated APIC IDs in Local APIC and x2APIC, are always enumerated. Below is what ACPI MADT Local APIC and x2APIC describes on an Ivebridge-EP system, [02Ch 0044 1] Subtable Type : 00 [Processor Local APIC] [02Fh 0047 1] Local Apic ID : 00 ... [164h 0356 1] Subtable Type : 00 [Processor Local APIC] [167h 0359 1] Local Apic ID : 39 [16Ch 0364 1] Subtable Type : 00 [Processor Local APIC] [16Fh 0367 1] Local Apic ID : FF ... [3ECh 1004 1] Subtable Type : 09 [Processor Local x2APIC] [3F0h 1008 4] Processor x2Apic ID : 00000000 ... [B5Ch 2908 1] Subtable Type : 09 [Processor Local x2APIC] [B60h 2912 4] Processor x2Apic ID : 00000077 As a result, kernel shows "smpboot: Allowing 168 CPUs, 120 hotplug CPUs". And this wastes significant amount of memory for the per-cpu data. Plus this also breaks https://lore.kernel.org/all/87edm36qqb.ffs@tglx/, because __max_logical_packages is over-estimated by the APIC IDs in the x2APIC entries. According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure: "[Compatibility note] On some legacy OSes, Logical processors with APIC ID values less than 255 (whether in XAPIC or X2APIC mode) must use the Processor Local APIC structure to convey their APIC information to OSPM, and those processors must be declared in the DSDT using the Processor() keyword. Logical processors with APIC ID values 255 and greater must use the Processor Local x2APIC structure and be declared using the Device() keyword." Therefore prevent the registration of x2APIC entries with an APIC ID less than 255 if the local APIC table enumerates valid APIC IDs. [ tglx: Simplify the logic ] Signed-off-by: Zhang Rui Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra Link: https://lore.kernel.org/r/20230702162802.344176-1-rui.zhang@intel.com --- arch/x86/kernel/acpi/boot.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c55c0ef47a187..fc5bce1b50476 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,6 +63,7 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif @@ -232,6 +233,14 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) if (!acpi_is_processor_usable(processor->lapic_flags)) return 0; + /* + * According to https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-local-x2apic-structure + * when MADT provides both valid LAPIC and x2APIC entries, the APIC ID + * in x2APIC must be equal or greater than 0xff. + */ + if (has_lapic_cpus && apic_id < 0xff) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -1114,10 +1123,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { - int count; - int x2count = 0; - int ret; - struct acpi_subtable_proc madt_proc[2]; + int count, x2count = 0; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1126,21 +1132,11 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - memset(madt_proc, 0, sizeof(madt_proc)); - madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; - madt_proc[0].handler = acpi_parse_lapic; - madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; - madt_proc[1].handler = acpi_parse_x2apic; - ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, - sizeof(struct acpi_table_madt), - madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); - if (ret < 0) { - pr_err("Error parsing LAPIC/X2APIC entries\n"); - return ret; - } - - count = madt_proc[0].count; - x2count = madt_proc[1].count; + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, + acpi_parse_lapic, MAX_LOCAL_APIC); + has_lapic_cpus = count > 0; + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_LOCAL_APIC); } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 9 Nov 2023 18:43:28 +0100 Subject: [PATCH 119/560] selftests: bpf: xskxceiver: ksft_print_msg: fix format type error Crossbuilding selftests/bpf for architecture arm64, format specifies type error show up like. xskxceiver.c:912:34: error: format specifies type 'int' but the argument has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", ~~ %llu __func__, pkt->pkt_nb, meta->count); ^~~~~~~~~~~ xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); ~~~~ ^~~~ Fixing the issues by casting to (unsigned long long) and changing the specifiers to be %llu from %d and %u, since with u64s it might be %llx or %lx, depending on architecture. Signed-off-by: Anders Roxell Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 591ca9637b23e..b604c570309a7 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) struct xdp_info *meta = data - sizeof(struct xdp_info); if (meta->count != pkt->pkt_nb) { - ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", - __func__, pkt->pkt_nb, meta->count); + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n", + __func__, pkt->pkt_nb, + (unsigned long long)meta->count); return false; } @@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp if (addr >= umem->num_frames * umem->frame_size || addr + len > umem->num_frames * umem->frame_size) { - ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag invalid addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) { - ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len); + ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", + (unsigned long long)addr, len); return false; } @@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size) u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1); ksft_print_msg("[%s] Too many packets completed\n", __func__); - ksft_print_msg("Last completion address: %llx\n", addr); + ksft_print_msg("Last completion address: %llx\n", + (unsigned long long)addr); return TEST_FAILURE; } @@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject) } if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) { - ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n", - __func__, stats.tx_invalid_descs, + ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n", + __func__, + (unsigned long long)stats.tx_invalid_descs, ifobject->xsk->pkt_stream->nb_pkts); return TEST_FAILURE; } From 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:36 -0800 Subject: [PATCH 120/560] bpf: handle ldimm64 properly in check_cfg() ldimm64 instructions are 16-byte long, and so have to be handled appropriately in check_cfg(), just like the rest of BPF verifier does. This has implications in three places: - when determining next instruction for non-jump instructions; - when determining next instruction for callback address ldimm64 instructions (in visit_func_call_insn()); - when checking for unreachable instructions, where second half of ldimm64 is expected to be unreachable; We take this also as an opportunity to report jump into the middle of ldimm64. And adjust few test_verifier tests accordingly. Acked-by: Eduard Zingerman Reported-by: Hao Sun Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++-- kernel/bpf/verifier.c | 27 ++++++++++++++----- .../testing/selftests/bpf/verifier/ld_imm64.c | 8 +++--- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb292..35bff17396c0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -909,10 +909,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static bool bpf_is_ldimm64(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { - return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && - insn->src_reg == BPF_PSEUDO_FUNC; + return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bd1c42eb540f1..b87715b364fda 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15439,15 +15439,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); @@ -15469,15 +15470,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -15607,11 +15610,21 @@ static int check_cfg(struct bpf_verifier_env *env) } for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index f9297900cea6d..78f19c255f20b 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -9,8 +9,8 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { @@ -23,8 +23,8 @@ BPF_LD_IMM64(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { From 4bb7ea946a370707315ab774432963ce47291946 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:37 -0800 Subject: [PATCH 121/560] bpf: fix precision backtracking instruction iteration Fix an edge case in __mark_chain_precision() which prematurely stops backtracking instructions in a state if it happens that state's first and last instruction indexes are the same. This situations doesn't necessarily mean that there were no instructions simulated in a state, but rather that we starting from the instruction, jumped around a bit, and then ended up at the same instruction before checkpointing or marking precision. To distinguish between these two possible situations, we need to consult jump history. If it's empty or contain a single record "bridging" parent state and first instruction of processed state, then we indeed backtracked all instructions in this state. But if history is not empty, we are definitely not done yet. Move this logic inside get_prev_insn_idx() to contain it more nicely. Use -ENOENT return code to denote "we are out of instructions" situation. This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once the next fix in this patch set is applied. Acked-by: Eduard Zingerman Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b87715b364fda..484c742f733e9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3516,12 +3516,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. + * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -4401,10 +4418,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; subseq_idx = i; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask From 62ccdb11d3c63dc697dea1fd92b3496fe43dcc1e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:38 -0800 Subject: [PATCH 122/560] selftests/bpf: add edge case backtracking logic test Add a dedicated selftests to try to set up conditions to have a state with same first and last instruction index, but it actually is a loop 3->4->1->2->3. This confuses mark_chain_precision() if verifier doesn't take into account jump history. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_precision.c | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 193c0f8272d05..6b564d4c09866 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -91,3 +91,43 @@ __naked int bpf_end_bswap(void) } #endif /* v4 instruction */ + +SEC("?raw_tp") +__success __log_level(2) +/* + * Without the bug fix there will be no history between "last_idx 3 first_idx 3" + * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * expected log messages to the one specific mark_chain_precision operation. + * + * This is quite fragile: if verifier checkpointing heuristic changes, this + * might need adjusting. + */ +__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") +__msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") +__msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") +__msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") +__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") +__msg("3: R0_w=6") +__naked int state_loop_first_last_equal(void) +{ + asm volatile ( + "r0 = 0;" + "l0_%=:" + "r0 += 1;" + "r0 += 1;" + /* every few iterations we'll have a checkpoint here with + * first_idx == last_idx, potentially confusing precision + * backtracking logic + */ + "if r0 >= 10 goto l1_%=;" /* checkpoint + mark_precise */ + "goto l0_%=;" + "l1_%=:" + "exit;" + ::: __clobber_common + ); +} + +char _license[] SEC("license") = "GPL"; From 10e14e9652bf9e8104151bfd9200433083deae3d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:10 -0800 Subject: [PATCH 123/560] bpf: fix control-flow graph checking in privileged mode When BPF program is verified in privileged mode, BPF verifier allows bounded loops. This means that from CFG point of view there are definitely some back-edges. Original commit adjusted check_cfg() logic to not detect back-edges in control flow graph if they are resulting from conditional jumps, which the idea that subsequent full BPF verification process will determine whether such loops are bounded or not, and either accept or reject the BPF program. At least that's my reading of the intent. Unfortunately, the implementation of this idea doesn't work correctly in all possible situations. Conditional jump might not result in immediate back-edge, but just a few unconditional instructions later we can arrive at back-edge. In such situations check_cfg() would reject BPF program even in privileged mode, despite it might be bounded loop. Next patch adds one simple program demonstrating such scenario. To keep things simple, instead of trying to detect back edges in privileged mode, just assume every back edge is valid and let subsequent BPF verification prove or reject bounded loops. Note a few test changes. For unknown reason, we have a few tests that are specified to detect a back-edge in a privileged mode, but looking at their code it seems like the right outcome is passing check_cfg() and letting subsequent verification to make a decision about bounded or not bounded looping. Bounded recursion case is also interesting. The example should pass, as recursion is limited to just a few levels and so we never reach maximum number of nested frames and never exhaust maximum stack depth. But the way that max stack depth logic works today it falsely detects this as exceeding max nested frame count. This patch series doesn't attempt to fix this orthogonal problem, so we just adjust expected verifier failure. Suggested-by: Alexei Starovoitov Fixes: 2589726d12a1 ("bpf: introduce bounded loops") Reported-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 23 +++++++------------ .../selftests/bpf/progs/verifier_loops1.c | 9 +++++--- tools/testing/selftests/bpf/verifier/calls.c | 6 ++--- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 484c742f733e9..a2267d5ed14ed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15403,8 +15403,7 @@ enum { * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, - bool loop_ok) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) { int *insn_stack = env->cfg.insn_stack; int *insn_state = env->cfg.insn_state; @@ -15436,7 +15435,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->bpf_capable) + if (env->bpf_capable) return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -15459,7 +15458,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, int ret, insn_sz; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); if (ret) return ret; @@ -15469,12 +15468,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, if (visit_callee) { mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, - /* It's ok to allow recursion from CFG point of - * view. __check_func_call() will do the actual - * check. - */ - bpf_pseudo_func(insns + t)); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); } return ret; } @@ -15496,7 +15490,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) != BPF_JMP && BPF_CLASS(insn->code) != BPF_JMP32) { insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + return push_insn(t, t + insn_sz, FALLTHROUGH, env); } switch (BPF_OP(insn->code)) { @@ -15543,8 +15537,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) off = insn->imm; /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env, - true); + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); if (ret) return ret; @@ -15557,11 +15550,11 @@ static int visit_insn(int t, struct bpf_verifier_env *env) /* conditional jump with two edges */ mark_prune_point(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + ret = push_insn(t, t + 1, FALLTHROUGH, env); if (ret) return ret; - return push_insn(t, t + insn->off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env); } } diff --git a/tools/testing/selftests/bpf/progs/verifier_loops1.c b/tools/testing/selftests/bpf/progs/verifier_loops1.c index 5bc86af80a9ad..71735dbf33d4f 100644 --- a/tools/testing/selftests/bpf/progs/verifier_loops1.c +++ b/tools/testing/selftests/bpf/progs/verifier_loops1.c @@ -75,9 +75,10 @@ l0_%=: r0 += 1; \ " ::: __clobber_all); } -SEC("tracepoint") +SEC("socket") __description("bounded loop, start in the middle") -__failure __msg("back-edge") +__success +__failure_unpriv __msg_unpriv("back-edge") __naked void loop_start_in_the_middle(void) { asm volatile (" \ @@ -136,7 +137,9 @@ l0_%=: exit; \ SEC("tracepoint") __description("bounded recursion") -__failure __msg("back-edge") +__failure +/* verifier limitation in detecting max stack depth */ +__msg("the call stack of 8 frames is too deep !") __naked void bounded_recursion(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 1bdf2b43e49ea..3d5cd51071f04 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -442,7 +442,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge from insn 0 to 0", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -799,7 +799,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { @@ -811,7 +811,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_TRACEPOINT, - .errstr = "back-edge", + .errstr = "the call stack of 9 frames is too deep", .result = REJECT, }, { From e2e57d637aa5da0a2f49d83ad44e9febf95df7b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 22:14:11 -0800 Subject: [PATCH 124/560] selftests/bpf: add more test cases for check_cfg() Add a few more simple cases to validate proper privileged vs unprivileged loop detection behavior. conditional_loop2 is the one reported by Hao Sun that triggered this set of fixes. Acked-by: Eduard Zingerman Suggested-by: Hao Sun Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110061412.2995786-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_cfg.c | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_cfg.c b/tools/testing/selftests/bpf/progs/verifier_cfg.c index df7697b94007b..c1f55e1d80a42 100644 --- a/tools/testing/selftests/bpf/progs/verifier_cfg.c +++ b/tools/testing/selftests/bpf/progs/verifier_cfg.c @@ -97,4 +97,66 @@ l0_%=: r2 = r0; \ " ::: __clobber_all); } +SEC("socket") +__description("conditional loop (2)") +__success +__failure_unpriv __msg_unpriv("back-edge from insn 10 to 11") +__naked void conditional_loop2(void) +{ + asm volatile (" \ + r9 = 2 ll; \ + r3 = 0x20 ll; \ + r4 = 0x35 ll; \ + r8 = r4; \ + goto l1_%=; \ +l0_%=: r9 -= r3; \ + r9 -= r4; \ + r9 -= r8; \ +l1_%=: r8 += r4; \ + if r8 < 0x64 goto l0_%=; \ + r0 = r9; \ + exit; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +__failure __msg("infinite loop detected") +__failure_unpriv __msg_unpriv("back-edge from insn 3 to 2") +__naked void uncond_loop_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 = 1; \ + goto l0_%=; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + + +__naked __noinline __used +static unsigned long never_ending_subprog() +{ + asm volatile (" \ + r0 = r1; \ + goto -1; \ +" ::: __clobber_all); +} + +SEC("socket") +__description("unconditional loop after conditional jump") +/* infinite loop is detected *after* check_cfg() */ +__failure __msg("infinite loop detected") +__naked void uncond_loop_in_subprog_after_cond_jmp(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 > 0 goto l1_%=; \ +l0_%=: r0 += 1; \ + call never_ending_subprog; \ +l1_%=: exit; \ +" ::: __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 8a4f030dbced6fc255cbe67b2d0a129947e18493 Mon Sep 17 00:00:00 2001 From: Yuran Pereira Date: Wed, 8 Nov 2023 02:18:36 +0530 Subject: [PATCH 125/560] ptp: Fixes a null pointer dereference in ptp_ioctl Syzkaller found a null pointer dereference in ptp_ioctl originating from the lack of a null check for tsevq. ``` general protection fault, probably for non-canonical address 0xdffffc000000020b: 0000 [#1] PREEMPT SMP KASAN KASAN: probably user-memory-access in range [0x0000000000001058-0x000000000000105f] CPU: 0 PID: 5053 Comm: syz-executor353 Not tainted 6.6.0-syzkaller-10396-g4652b8e4f3ff #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:ptp_ioctl+0xcb7/0x1d10 drivers/ptp/ptp_chardev.c:476 ... Call Trace: posix_clock_ioctl+0xf8/0x160 kernel/time/posix-clock.c:86 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b ``` This patch fixes the issue by adding a check for tsevq and ensuring ptp_ioctl returns with an error if tsevq is null. Reported-by: syzbot+8a78ecea7ac1a2ea26e5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8a78ecea7ac1a2ea26e5 Fixes: c5a445b1e934 ("ptp: support event queue reader channel masks") Signed-off-by: Yuran Pereira Reviewed-by: Przemek Kitszel Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a747888024..e95a6ed130ef4 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,6 +176,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; + if (!tsevq) + return -EINVAL; switch (cmd) { From 871019b22d1bcc9fab2d1feba1b9a564acbb6e99 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Nov 2023 13:13:25 -0800 Subject: [PATCH 126/560] net: set SOCK_RCU_FREE before inserting socket into hashtable We've started to see the following kernel traces: WARNING: CPU: 83 PID: 0 at net/core/filter.c:6641 sk_lookup+0x1bd/0x1d0 Call Trace: __bpf_skc_lookup+0x10d/0x120 bpf_sk_lookup+0x48/0xd0 bpf_sk_lookup_tcp+0x19/0x20 bpf_prog_+0x37c/0x16a3 cls_bpf_classify+0x205/0x2e0 tcf_classify+0x92/0x160 __netif_receive_skb_core+0xe52/0xf10 __netif_receive_skb_list_core+0x96/0x2b0 napi_complete_done+0x7b5/0xb70 _poll+0x94/0xb0 net_rx_action+0x163/0x1d70 __do_softirq+0xdc/0x32e asm_call_irq_on_stack+0x12/0x20 do_softirq_own_stack+0x36/0x50 do_softirq+0x44/0x70 __inet_hash can race with lockless (rcu) readers on the other cpus: __inet_hash __sk_nulls_add_node_rcu <- (bpf triggers here) sock_set_flag(SOCK_RCU_FREE) Let's move the SOCK_RCU_FREE part up a bit, before we are inserting the socket into hashtables. Note, that the race is really harmless; the bpf callers are handling this situation (where listener socket doesn't have SOCK_RCU_FREE set) correctly, so the only annoyance is a WARN_ONCE. More details from Eric regarding SOCK_RCU_FREE timeline: Commit 3b24d854cb35 ("tcp/dccp: do not touch listener sk_refcnt under synflood") added SOCK_RCU_FREE. At that time, the precise location of sock_set_flag(sk, SOCK_RCU_FREE) did not matter, because the thread calling __inet_hash() owns a reference on sk. SOCK_RCU_FREE was only tested at dismantle time. Commit 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") started checking SOCK_RCU_FREE _after_ the lookup to infer whether the refcount has been taken care of. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Reviewed-by: Eric Dumazet Signed-off-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 598c1b114d2c2..a532f749e4778 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -751,12 +751,12 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (err) goto unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); - sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); From cbe9e68e1e0f965fd3b1caf975eb29083c65e6b0 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Fri, 10 Nov 2023 14:57:49 +0530 Subject: [PATCH 127/560] MAINTAINERS: net: Update reviewers for TI's Ethernet drivers Grygorii is no longer associated with TI and messages addressed to him bounce. Add Siddharth, Roger and myself as reviewers. Signed-off-by: Ravi Gunasekaran Signed-off-by: David S. Miller --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 350d00657f6b5..e3acb36989f00 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21769,7 +21769,9 @@ F: Documentation/devicetree/bindings/counter/ti-eqep.yaml F: drivers/counter/ti-eqep.c TI ETHERNET SWITCH DRIVER (CPSW) -R: Grygorii Strashko +R: Siddharth Vadapalli +R: Ravi Gunasekaran +R: Roger Quadros L: linux-omap@vger.kernel.org L: netdev@vger.kernel.org S: Maintained From 18f039428c7df183b09c69ebf10ffd4e521035d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 15:22:41 +0000 Subject: [PATCH 128/560] ipvlan: add ipvlan_route_v6_outbound() helper Inspired by syzbot reports using a stack of multiple ipvlan devices. Reduce stack size needed in ipvlan_process_v6_outbound() by moving the flowi6 struct used for the route lookup in an non inlined helper. ipvlan_route_v6_outbound() needs 120 bytes on the stack, immediately reclaimed. Also make sure ipvlan_process_v4_outbound() is not inlined. We might also have to lower MAX_NEST_DEV, because only syzbot uses setups with more than four stacked devices. BUG: TASK stack guard page was hit at ffffc9000e803ff8 (stack is ffffc9000e804000..ffffc9000e808000) stack guard page: 0000 [#1] SMP KASAN CPU: 0 PID: 13442 Comm: syz-executor.4 Not tainted 6.1.52-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:kasan_check_range+0x4/0x2a0 mm/kasan/generic.c:188 Code: 48 01 c6 48 89 c7 e8 db 4e c1 03 31 c0 5d c3 cc 0f 0b eb 02 0f 0b b8 ea ff ff ff 5d c3 cc 00 00 cc cc 00 00 cc cc 55 48 89 e5 <41> 57 41 56 41 55 41 54 53 b0 01 48 85 f6 0f 84 a4 01 00 00 48 89 RSP: 0018:ffffc9000e804000 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817e5bf2 RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffffffff887c6568 RBP: ffffc9000e804000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: dffffc0000000001 R12: 1ffff92001d0080c R13: dffffc0000000000 R14: ffffffff87e6b100 R15: 0000000000000000 FS: 00007fd0c55826c0(0000) GS:ffff8881f6800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000e803ff8 CR3: 0000000170ef7000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <#DF> [] __kasan_check_read+0x11/0x20 mm/kasan/shadow.c:31 [] instrument_atomic_read include/linux/instrumented.h:72 [inline] [] _test_bit include/asm-generic/bitops/instrumented-non-atomic.h:141 [inline] [] cpumask_test_cpu include/linux/cpumask.h:506 [inline] [] cpu_online include/linux/cpumask.h:1092 [inline] [] trace_lock_acquire include/trace/events/lock.h:24 [inline] [] lock_acquire+0xe2/0x590 kernel/locking/lockdep.c:5632 [] rcu_lock_acquire+0x2e/0x40 include/linux/rcupdate.h:306 [] rcu_read_lock include/linux/rcupdate.h:747 [inline] [] ip6_pol_route+0x15d/0x1440 net/ipv6/route.c:2221 [] ip6_pol_route_output+0x50/0x80 net/ipv6/route.c:2606 [] pol_lookup_func include/net/ip6_fib.h:584 [inline] [] fib6_rule_lookup+0x265/0x620 net/ipv6/fib6_rules.c:116 [] ip6_route_output_flags_noref+0x2d9/0x3a0 net/ipv6/route.c:2638 [] ip6_route_output_flags+0xca/0x340 net/ipv6/route.c:2651 [] ip6_route_output include/net/ip6_route.h:100 [inline] [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:473 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0xc33/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_hh_output include/net/neighbour.h:529 [inline] [] neigh_output include/net/neighbour.h:543 [inline] [] ip6_finish_output2+0x160d/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] ip6_local_out+0x10f/0x140 net/ipv6/output_core.c:161 [] ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:483 [inline] [] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:529 [inline] [] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:602 [inline] [] ipvlan_queue_xmit+0x1174/0x1be0 drivers/net/ipvlan/ipvlan_core.c:677 [] ipvlan_start_xmit+0x49/0x100 drivers/net/ipvlan/ipvlan_main.c:229 [] netdev_start_xmit include/linux/netdevice.h:4966 [inline] [] xmit_one net/core/dev.c:3644 [inline] [] dev_hard_start_xmit+0x320/0x980 net/core/dev.c:3660 [] __dev_queue_xmit+0x16b2/0x3370 net/core/dev.c:4324 [] dev_queue_xmit include/linux/netdevice.h:3067 [inline] [] neigh_resolve_output+0x64e/0x750 net/core/neighbour.c:1560 [] neigh_output include/net/neighbour.h:545 [inline] [] ip6_finish_output2+0x1643/0x1ae0 net/ipv6/ip6_output.c:139 [] __ip6_finish_output net/ipv6/ip6_output.c:200 [inline] [] ip6_finish_output+0x6c6/0xb10 net/ipv6/ip6_output.c:211 [] NF_HOOK_COND include/linux/netfilter.h:298 [inline] [] ip6_output+0x2bc/0x3d0 net/ipv6/ip6_output.c:232 [] dst_output include/net/dst.h:444 [inline] [] NF_HOOK include/linux/netfilter.h:309 [inline] [] ip6_xmit+0x11a4/0x1b20 net/ipv6/ip6_output.c:352 [] sctp_v6_xmit+0x9ae/0x1230 net/sctp/ipv6.c:250 [] sctp_packet_transmit+0x25de/0x2bc0 net/sctp/output.c:653 [] sctp_packet_singleton+0x202/0x310 net/sctp/outqueue.c:783 [] sctp_outq_flush_ctrl net/sctp/outqueue.c:914 [inline] [] sctp_outq_flush+0x661/0x3d40 net/sctp/outqueue.c:1212 [] sctp_outq_uncork+0x79/0xb0 net/sctp/outqueue.c:764 [] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline] [] sctp_do_sm+0x55c0/0x5c30 net/sctp/sm_sideeffect.c:1170 [] sctp_primitive_ASSOCIATE+0x97/0xc0 net/sctp/primitive.c:73 [] sctp_sendmsg_to_asoc+0xf62/0x17b0 net/sctp/socket.c:1839 [] sctp_sendmsg+0x212e/0x33b0 net/sctp/socket.c:2029 [] inet_sendmsg+0x149/0x310 net/ipv4/af_inet.c:849 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] ____sys_sendmsg+0x572/0x8c0 net/socket.c:2504 [] ___sys_sendmsg net/socket.c:2558 [inline] [] __sys_sendmsg+0x271/0x360 net/socket.c:2587 [] __do_sys_sendmsg net/socket.c:2596 [inline] [] __se_sys_sendmsg net/socket.c:2594 [inline] [] __x64_sys_sendmsg+0x7f/0x90 net/socket.c:2594 [] do_syscall_x64 arch/x86/entry/common.c:51 [inline] [] do_syscall_64+0x53/0x80 arch/x86/entry/common.c:84 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Mahesh Bandewar Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_core.c | 41 +++++++++++++++++++------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 21e9cac731218..2d5b021b4ea60 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -411,7 +411,7 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, return addr; } -static int ipvlan_process_v4_outbound(struct sk_buff *skb) +static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; @@ -453,13 +453,11 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_IPV6) -static int ipvlan_process_v6_outbound(struct sk_buff *skb) + +static noinline_for_stack int +ipvlan_route_v6_outbound(struct net_device *dev, struct sk_buff *skb) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); - struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); - struct dst_entry *dst; - int err, ret = NET_XMIT_DROP; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .daddr = ip6h->daddr, @@ -469,27 +467,38 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto = ip6h->nexthdr, }; + struct dst_entry *dst; + int err; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - ret = dst->error; + dst = ip6_route_output(dev_net(dev), NULL, &fl6); + err = dst->error; + if (err) { dst_release(dst); - goto err; + return err; } skb_dst_set(skb, dst); + return 0; +} + +static int ipvlan_process_v6_outbound(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + int err, ret = NET_XMIT_DROP; + + err = ipvlan_route_v6_outbound(dev, skb); + if (unlikely(err)) { + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); + return err; + } memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - err = ip6_local_out(net, skb->sk, skb); + err = ip6_local_out(dev_net(dev), skb->sk, skb); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; - goto out; -err: - DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); -out: return ret; } #else From 719639853d88071dfdfd8d9971eca9c283ff314c Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 9 Nov 2023 00:44:20 +0900 Subject: [PATCH 129/560] tty: Fix uninit-value access in ppp_sync_receive() KMSAN reported the following uninit-value access issue: ===================================================== BUG: KMSAN: uninit-value in ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] BUG: KMSAN: uninit-value in ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 ppp_sync_input drivers/net/ppp/ppp_synctty.c:690 [inline] ppp_sync_receive+0xdc9/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: __alloc_pages+0x75d/0xe80 mm/page_alloc.c:4591 __alloc_pages_node include/linux/gfp.h:238 [inline] alloc_pages_node include/linux/gfp.h:261 [inline] __page_frag_cache_refill+0x9a/0x2c0 mm/page_alloc.c:4691 page_frag_alloc_align+0x91/0x5d0 mm/page_alloc.c:4722 page_frag_alloc include/linux/gfp.h:322 [inline] __netdev_alloc_skb+0x215/0x6d0 net/core/skbuff.c:728 netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0x237/0xe70 drivers/net/ppp/ppp_synctty.c:334 tiocsti+0x328/0x450 drivers/tty/tty_io.c:2295 tty_ioctl+0x808/0x1920 drivers/tty/tty_io.c:2694 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl+0x211/0x400 fs/ioctl.c:857 __x64_sys_ioctl+0x97/0xe0 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 12950 Comm: syz-executor.1 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== ppp_sync_input() checks the first 2 bytes of the data are PPP_ALLSTATIONS and PPP_UI. However, if the data length is 1 and the first byte is PPP_ALLSTATIONS, an access to an uninitialized value occurs when checking PPP_UI. This patch resolves this issue by checking the data length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ebcdffdf4f0e0..ea261a628786b 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -687,7 +687,7 @@ ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count) /* strip address/control field if present */ p = skb->data; - if (p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { + if (skb->len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto err; From 4b21a669ca21ed8f24ef4530b2918be5730114de Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 10 Nov 2023 15:16:06 +0800 Subject: [PATCH 130/560] ALSA: hda/realtek - Add Dell ALC295 to pin fall back table Add ALC295 to pin fall back table. Remove 5 pin quirks for Dell ALC295. ALC295 was only support MIC2 for external MIC function. ALC295 assigned model "ALC269_FIXUP_DELL1_MIC_NO_PRESENCE" for pin fall back table. It was assigned wrong model. So, let's remove it. Fixes: fbc571290d9f ("ALSA: hda/realtek - Fixed Headphone Mic can't record on Dell platform") Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/7c1998e873834df98d59bd7e0d08c72e@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 669ae3d6e447e..d689f0050aaeb 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10821,22 +10821,6 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60130}, {0x17, 0x90170110}, {0x21, 0x03211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211020}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, - {0x14, 0x90170110}, - {0x21, 0x04211030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014020}, - {0x18, 0x21a19030}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS, - {0x17, 0x21014040}, - {0x18, 0x21a19050}), - SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, - ALC295_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_STANDARD_PINS, {0x17, 0x90170110}), @@ -10880,6 +10864,9 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1b, 0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + {0x19, 0x40000000}, + {0x1b, 0x40000000}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, {0x19, 0x40000000}, {0x1a, 0x40000000}), From 8384c0baf223e1c3bc7b1c711d80a4c6106d210e Mon Sep 17 00:00:00 2001 From: Eymen Yigit Date: Fri, 10 Nov 2023 18:07:15 +0300 Subject: [PATCH 131/560] ALSA: hda/realtek: Enable Mute LED on HP 255 G8 This HP Notebook uses ALC236 codec with COEF 0x07 idx 1 controlling the mute LED. Enable already existing quirk for this device. Signed-off-by: Eymen Yigit Cc: Luka Guzenko Cc: Link: https://lore.kernel.org/r/20231110150715.5141-1-eymenyg01@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d689f0050aaeb..c7de743961850 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9832,6 +9832,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8902, "HP OMEN 16", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x890e, "HP 255 G8 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8919, "HP Pavilion Aero Laptop 13-be0xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896d, "HP ZBook Firefly 16 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x896e, "HP EliteBook x360 830 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From c3803203bc5ec910a3eb06172cf6fb368e0e4390 Mon Sep 17 00:00:00 2001 From: Ani Sinha Date: Mon, 16 Oct 2023 19:01:22 +0530 Subject: [PATCH 132/560] hv/hv_kvp_daemon: Some small fixes for handling NM keyfiles Some small fixes: - lets make sure we are not adding ipv4 addresses in ipv6 section in keyfile and vice versa. - ADDR_FAMILY_IPV6 is a bit in addr_family. Test that bit instead of checking the whole value of addr_family. - Some trivial fixes in hv_set_ifconfig.sh. These fixes are proposed after doing some internal testing at Red Hat. CC: Shradha Gupta CC: Saurabh Sengar Fixes: 42999c904612 ("hv/hv_kvp_daemon:Support for keyfile based connection profile") Signed-off-by: Ani Sinha Reviewed-by: Shradha Gupta Signed-off-by: Wei Liu Message-ID: <20231016133122.2419537-1-anisinha@redhat.com> --- tools/hv/hv_kvp_daemon.c | 20 ++++++++++++-------- tools/hv/hv_set_ifconfig.sh | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 264eeb9c46a9f..318e2dad27e04 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -1421,7 +1421,7 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error) goto setval_error; - if (new_val->addr_family == ADDR_FAMILY_IPV6) { + if (new_val->addr_family & ADDR_FAMILY_IPV6) { error = fprintf(nmfile, "\n[ipv6]\n"); if (error < 0) goto setval_error; @@ -1455,14 +1455,18 @@ static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val) if (error < 0) goto setval_error; - error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); - if (error < 0) - goto setval_error; - - error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); - if (error < 0) - goto setval_error; + /* we do not want ipv4 addresses in ipv6 section and vice versa */ + if (is_ipv6 != is_ipv4((char *)new_val->gate_way)) { + error = fprintf(nmfile, "gateway=%s\n", (char *)new_val->gate_way); + if (error < 0) + goto setval_error; + } + if (is_ipv6 != is_ipv4((char *)new_val->dns_addr)) { + error = fprintf(nmfile, "dns=%s\n", (char *)new_val->dns_addr); + if (error < 0) + goto setval_error; + } fclose(nmfile); fclose(ifcfg_file); diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh index ae5a7a8249a20..440a91b35823b 100755 --- a/tools/hv/hv_set_ifconfig.sh +++ b/tools/hv/hv_set_ifconfig.sh @@ -53,7 +53,7 @@ # or "manual" if no boot-time protocol should be used) # # address1=ipaddr1/plen -# address=ipaddr2/plen +# address2=ipaddr2/plen # # gateway=gateway1;gateway2 # @@ -61,7 +61,7 @@ # # [ipv6] # address1=ipaddr1/plen -# address2=ipaddr1/plen +# address2=ipaddr2/plen # # gateway=gateway1;gateway2 # From 5c0930ccaad5a74d74e8b18b648c5eb21ed2fe94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Nov 2023 15:57:13 +0100 Subject: [PATCH 133/560] hrtimers: Push pending hrtimers away from outgoing CPU earlier 2b8272ff4a70 ("cpu/hotplug: Prevent self deadlock on CPU hot-unplug") solved the straight forward CPU hotplug deadlock vs. the scheduler bandwidth timer. Yu discovered a more involved variant where a task which has a bandwidth timer started on the outgoing CPU holds a lock and then gets throttled. If the lock required by one of the CPU hotplug callbacks the hotplug operation deadlocks because the unthrottling timer event is not handled on the dying CPU and can only be recovered once the control CPU reaches the hotplug state which pulls the pending hrtimers from the dead CPU. Solve this by pushing the hrtimers away from the dying CPU in the dying callbacks. Nothing can queue a hrtimer on the dying CPU at that point because all other CPUs spin in stop_machine() with interrupts disabled and once the operation is finished the CPU is marked offline. Reported-by: Yu Liao Signed-off-by: Thomas Gleixner Tested-by: Liu Tie Link: https://lore.kernel.org/r/87a5rphara.ffs@tglx --- include/linux/cpuhotplug.h | 1 + include/linux/hrtimer.h | 4 ++-- kernel/cpu.c | 8 +++++++- kernel/time/hrtimer.c | 33 ++++++++++++--------------------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 068f7738be22a..448f5f995adc7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -193,6 +193,7 @@ enum cpuhp_state { CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, + CPUHP_AP_HRTIMERS_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f102..f2044d5a652b5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -531,9 +531,9 @@ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU -int hrtimers_dead_cpu(unsigned int cpu); +int hrtimers_cpu_dying(unsigned int cpu); #else -#define hrtimers_dead_cpu NULL +#define hrtimers_cpu_dying NULL #endif #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74eee..2e69a1deaa31e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2098,7 +2098,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { [CPUHP_HRTIMERS_PREPARE] = { .name = "hrtimers:prepare", .startup.single = hrtimers_prepare_cpu, - .teardown.single = hrtimers_dead_cpu, + .teardown.single = NULL, }, [CPUHP_SMPCFD_PREPARE] = { .name = "smpcfd:prepare", @@ -2190,6 +2190,12 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = NULL, .teardown.single = smpcfd_dying_cpu, }, + [CPUHP_AP_HRTIMERS_DYING] = { + .name = "hrtimers:dying", + .startup.single = NULL, + .teardown.single = hrtimers_cpu_dying, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 238262e4aba7e..760793998cdd7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -int hrtimers_dead_cpu(unsigned int scpu) +int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, ncpu = cpumask_first(cpu_active_mask); - BUG_ON(cpu_online(scpu)); - tick_cancel_sched_timer(scpu); + tick_cancel_sched_timer(dying_cpu); + + old_base = this_cpu_ptr(&hrtimer_bases); + new_base = &per_cpu(hrtimer_bases, ncpu); - /* - * this BH disable ensures that raise_softirq_irqoff() does - * not wakeup ksoftirqd (and acquire the pi-lock) while - * holding the cpu_base lock - */ - local_bh_disable(); - local_irq_disable(); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - raw_spin_lock(&new_base->lock); - raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&old_base->lock); + raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], @@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu) * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ - hrtimer_update_softirq_timer(new_base, false); + __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); + /* Tell the other CPU to retrigger the next event */ + smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); - raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); - /* Check, if we got expired work to do */ - __hrtimer_peek_ahead_timers(); - local_irq_enable(); - local_bh_enable(); return 0; } From e409d7346648c9acff84c3cc8d291767ee2d5326 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:02 +0100 Subject: [PATCH 134/560] net: ti: icssg-prueth: Add missing icss_iep_put to error path Analogously to prueth_remove, just also taking care for NULL'ing the iep pointers. Fixes: 186734c15886 ("net: ti: icssg-prueth: add packet timestamping and ptp support") Fixes: 443a2367ba3c ("net: ti: icssg-prueth: am65x SR2.0 add 10M full duplex support") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: MD Danish Anwar Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c4b64227ac88..3abbeba26f1b0 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2105,10 +2105,7 @@ static int prueth_probe(struct platform_device *pdev) prueth->iep1 = icss_iep_get_idx(np, 1); if (IS_ERR(prueth->iep1)) { ret = dev_err_probe(dev, PTR_ERR(prueth->iep1), "iep1 get failed\n"); - icss_iep_put(prueth->iep0); - prueth->iep0 = NULL; - prueth->iep1 = NULL; - goto free_pool; + goto put_iep0; } if (prueth->pdata.quirk_10m_link_issue) { @@ -2205,6 +2202,12 @@ static int prueth_probe(struct platform_device *pdev) exit_iep: if (prueth->pdata.quirk_10m_link_issue) icss_iep_exit_fw(prueth->iep1); + icss_iep_put(prueth->iep1); + +put_iep0: + icss_iep_put(prueth->iep0); + prueth->iep0 = NULL; + prueth->iep1 = NULL; free_pool: gen_pool_free(prueth->sram_pool, From 2bd5b559a1f391f05927bbb0b31381fa71c61e26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 10 Nov 2023 17:13:08 +0100 Subject: [PATCH 135/560] net: ti: icssg-prueth: Fix error cleanup on failing pruss_request_mem_region We were just continuing in this case, surely not desired. Fixes: 128d5874c082 ("net: ti: icssg-prueth: Add ICSSG ethernet driver") Signed-off-by: Jan Kiszka Reviewed-by: Wojciech Drewek Reviewed-by: Roger Quadros Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 3abbeba26f1b0..411898a4f38ca 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -2063,7 +2063,7 @@ static int prueth_probe(struct platform_device *pdev) &prueth->shram); if (ret) { dev_err(dev, "unable to get PRUSS SHRD RAM2: %d\n", ret); - pruss_put(prueth->pruss); + goto put_pruss; } prueth->sram_pool = of_gen_pool_get(np, "sram", 0); @@ -2215,6 +2215,8 @@ static int prueth_probe(struct platform_device *pdev) put_mem: pruss_release_mem_region(prueth->pruss, &prueth->shram); + +put_pruss: pruss_put(prueth->pruss); put_cores: From 713f040cd22285fcc506f40a0d259566e6758c3c Mon Sep 17 00:00:00 2001 From: Chandradeep Dey Date: Sat, 11 Nov 2023 19:25:49 +0100 Subject: [PATCH 136/560] ALSA: hda/realtek - Enable internal speaker of ASUS K6500ZC Apply the already existing quirk chain ALC294_FIXUP_ASUS_SPK to enable the internal speaker of ASUS K6500ZC. Signed-off-by: Chandradeep Dey Cc: Link: https://lore.kernel.org/r/NizcVHQ--3-9@chandradeepdey.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c7de743961850..cdd808e02b44d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9907,6 +9907,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x10a1, "ASUS UX391UA", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x10c0, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x10d0, "ASUS X540LA/X540LJ", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x10d3, "ASUS K6500ZC", ALC294_FIXUP_ASUS_SPK), SND_PCI_QUIRK(0x1043, 0x115d, "Asus 1015E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1043, 0x11c0, "ASUS X556UR", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x125e, "ASUS Q524UQK", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), From 7e8037b099c0bbe8f2109dc452dbcab8d400fc53 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sat, 11 Nov 2023 00:37:47 -0800 Subject: [PATCH 137/560] x86/hyperv: Fix the detection of E820_TYPE_PRAM in a Gen2 VM A Gen2 VM doesn't support legacy PCI/PCIe, so both raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> pcibios_init() doesn't call pcibios_resource_survey() -> e820__reserve_resources_late(); as a result, any emulated persistent memory of E820_TYPE_PRAM (12) via the kernel parameter memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be detected by register_e820_pmem(). Fix this by directly calling e820__reserve_resources_late() in hv_pci_init(), which is called from arch_initcall(pci_arch_init). It's ok to move a Gen2 VM's e820__reserve_resources_late() from subsys_initcall(pci_subsys_init) to arch_initcall(pci_arch_init) because the code in-between doesn't depend on the E820 resources. e820__reserve_resources_late() depends on e820__reserve_resources(), which has been called earlier from setup_arch(). For a Gen-2 VM, the new hv_pci_init() also adds any memory of E820_TYPE_PMEM (7) into iomem_resource, and acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> region_intersects() returns REGION_INTERSECTS, so the memory of E820_TYPE_PMEM won't get added twice. Changed the local variable "int gen2vm" to "bool gen2vm". Signed-off-by: Saurabh Sengar Signed-off-by: Dexuan Cui Signed-off-by: Wei Liu Message-ID: <1699691867-9827-1-git-send-email-ssengar@linux.microsoft.com> --- arch/x86/hyperv/hv_init.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21556ad87f4ba..8f3a4d16bb791 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -286,15 +287,31 @@ static int hv_cpu_die(unsigned int cpu) static int __init hv_pci_init(void) { - int gen2vm = efi_enabled(EFI_BOOT); + bool gen2vm = efi_enabled(EFI_BOOT); /* - * For Generation-2 VM, we exit from pci_arch_init() by returning 0. - * The purpose is to suppress the harmless warning: + * A Generation-2 VM doesn't support legacy PCI/PCIe, so both + * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> + * pcibios_init() doesn't call pcibios_resource_survey() -> + * e820__reserve_resources_late(); as a result, any emulated persistent + * memory of E820_TYPE_PRAM (12) via the kernel parameter + * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be + * detected by register_e820_pmem(). Fix this by directly calling + * e820__reserve_resources_late() here: e820__reserve_resources_late() + * depends on e820__reserve_resources(), which has been called earlier + * from setup_arch(). Note: e820__reserve_resources_late() also adds + * any memory of E820_TYPE_PMEM (7) into iomem_resource, and + * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> + * region_intersects() returns REGION_INTERSECTS, so the memory of + * E820_TYPE_PMEM won't get added twice. + * + * We return 0 here so that pci_arch_init() won't print the warning: * "PCI: Fatal: No config space access function found" */ - if (gen2vm) + if (gen2vm) { + e820__reserve_resources_late(); return 0; + } /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ return 1; From 7b211c7671212cad0b83603c674838c7e824d845 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Fri, 10 Nov 2023 10:30:11 +0100 Subject: [PATCH 138/560] Revert "i2c: pxa: move to generic GPIO recovery" This reverts commit 0b01392c18b9993a584f36ace1d61118772ad0ca. Conversion of PXA to generic I2C recovery, makes the I2C bus completely lock up if recovery pinctrl is present in the DT and I2C recovery is enabled. So, until the generic I2C recovery can also work with PXA lets revert to have working I2C and I2C recovery again. Signed-off-by: Robert Marko Cc: stable@vger.kernel.org # 5.11+ Acked-by: Andi Shyti Acked-by: Russell King (Oracle) Acked-by: Linus Walleij Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pxa.c | 76 ++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 1d76482427492..76f79b68cef84 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -265,6 +265,9 @@ struct pxa_i2c { u32 hs_mask; struct i2c_bus_recovery_info recovery; + struct pinctrl *pinctrl; + struct pinctrl_state *pinctrl_default; + struct pinctrl_state *pinctrl_recovery; }; #define _IBMR(i2c) ((i2c)->reg_ibmr) @@ -1299,12 +1302,13 @@ static void i2c_pxa_prepare_recovery(struct i2c_adapter *adap) */ gpiod_set_value(i2c->recovery.scl_gpiod, ibmr & IBMR_SCLS); gpiod_set_value(i2c->recovery.sda_gpiod, ibmr & IBMR_SDAS); + + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery)); } static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) { struct pxa_i2c *i2c = adap->algo_data; - struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; u32 isr; /* @@ -1318,7 +1322,7 @@ static void i2c_pxa_unprepare_recovery(struct i2c_adapter *adap) i2c_pxa_do_reset(i2c); } - WARN_ON(pinctrl_select_state(bri->pinctrl, bri->pins_default)); + WARN_ON(pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default)); dev_dbg(&i2c->adap.dev, "recovery: IBMR 0x%08x ISR 0x%08x\n", readl(_IBMR(i2c)), readl(_ISR(i2c))); @@ -1340,20 +1344,76 @@ static int i2c_pxa_init_recovery(struct pxa_i2c *i2c) if (IS_ENABLED(CONFIG_I2C_PXA_SLAVE)) return 0; - bri->pinctrl = devm_pinctrl_get(dev); - if (PTR_ERR(bri->pinctrl) == -ENODEV) { - bri->pinctrl = NULL; + i2c->pinctrl = devm_pinctrl_get(dev); + if (PTR_ERR(i2c->pinctrl) == -ENODEV) + i2c->pinctrl = NULL; + if (IS_ERR(i2c->pinctrl)) + return PTR_ERR(i2c->pinctrl); + + if (!i2c->pinctrl) + return 0; + + i2c->pinctrl_default = pinctrl_lookup_state(i2c->pinctrl, + PINCTRL_STATE_DEFAULT); + i2c->pinctrl_recovery = pinctrl_lookup_state(i2c->pinctrl, "recovery"); + + if (IS_ERR(i2c->pinctrl_default) || IS_ERR(i2c->pinctrl_recovery)) { + dev_info(dev, "missing pinmux recovery information: %ld %ld\n", + PTR_ERR(i2c->pinctrl_default), + PTR_ERR(i2c->pinctrl_recovery)); + return 0; + } + + /* + * Claiming GPIOs can influence the pinmux state, and may glitch the + * I2C bus. Do this carefully. + */ + bri->scl_gpiod = devm_gpiod_get(dev, "scl", GPIOD_OUT_HIGH_OPEN_DRAIN); + if (bri->scl_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + if (IS_ERR(bri->scl_gpiod)) { + dev_info(dev, "missing scl gpio recovery information: %pe\n", + bri->scl_gpiod); + return 0; + } + + /* + * We have SCL. Pull SCL low and wait a bit so that SDA glitches + * have no effect. + */ + gpiod_direction_output(bri->scl_gpiod, 0); + udelay(10); + bri->sda_gpiod = devm_gpiod_get(dev, "sda", GPIOD_OUT_HIGH_OPEN_DRAIN); + + /* Wait a bit in case of a SDA glitch, and then release SCL. */ + udelay(10); + gpiod_direction_output(bri->scl_gpiod, 1); + + if (bri->sda_gpiod == ERR_PTR(-EPROBE_DEFER)) + return -EPROBE_DEFER; + + if (IS_ERR(bri->sda_gpiod)) { + dev_info(dev, "missing sda gpio recovery information: %pe\n", + bri->sda_gpiod); return 0; } - if (IS_ERR(bri->pinctrl)) - return PTR_ERR(bri->pinctrl); bri->prepare_recovery = i2c_pxa_prepare_recovery; bri->unprepare_recovery = i2c_pxa_unprepare_recovery; + bri->recover_bus = i2c_generic_scl_recovery; i2c->adap.bus_recovery_info = bri; - return 0; + /* + * Claiming GPIOs can change the pinmux state, which confuses the + * pinctrl since pinctrl's idea of the current setting is unaffected + * by the pinmux change caused by claiming the GPIO. Work around that + * by switching pinctrl to the GPIO state here. We do it this way to + * avoid glitching the I2C bus. + */ + pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_recovery); + + return pinctrl_select_state(i2c->pinctrl, i2c->pinctrl_default); } static int i2c_pxa_probe(struct platform_device *dev) From 2a5db859c6825b5d50377dda9c3cc729c20cad43 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:17 +0800 Subject: [PATCH 139/560] xfs: factor out xfs_defer_pending_abort Factor out xfs_defer_pending_abort() from xfs_defer_trans_abort(), which not use transaction parameter, so it can be used after the transaction life cycle. Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cdd..88388e12f8e7c 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,21 +245,18 @@ xfs_defer_create_intents( return ret; } -/* Abort all the intents that were committed. */ STATIC void -xfs_defer_trans_abort( - struct xfs_trans *tp, - struct list_head *dop_pending) +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct list_head *dop_list) { struct xfs_defer_pending *dfp; const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { + list_for_each_entry(dfp, dop_list, dfp_list) { ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; @@ -267,6 +264,16 @@ xfs_defer_trans_abort( } } +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct list_head *dop_pending) +{ + trace_xfs_defer_trans_abort(tp, _RET_IP_); + xfs_defer_pending_abort(tp->t_mountp, dop_pending); +} + /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. From f8f9d952e42dd49ae534f61f2fa7ca0876cb9848 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 31 Jul 2023 20:46:18 +0800 Subject: [PATCH 140/560] xfs: abort intent items when recovery intents fail When recovering intents, we capture newly created intent items as part of committing recovered intent items. If intent recovery fails at a later point, we forget to remove those newly created intent items from the AIL and hang: [root@localhost ~]# cat /proc/539/stack [<0>] xfs_ail_push_all_sync+0x174/0x230 [<0>] xfs_unmount_flush_inodes+0x8d/0xd0 [<0>] xfs_mountfs+0x15f7/0x1e70 [<0>] xfs_fs_fill_super+0x10ec/0x1b20 [<0>] get_tree_bdev+0x3c8/0x730 [<0>] vfs_get_tree+0x89/0x2c0 [<0>] path_mount+0xecf/0x1800 [<0>] do_mount+0xf3/0x110 [<0>] __x64_sys_mount+0x154/0x1f0 [<0>] do_syscall_64+0x39/0x80 [<0>] entry_SYSCALL_64_after_hwframe+0x63/0xcd When newly created intent items fail to commit via transaction, intent recovery hasn't created done items for these newly created intent items, so the capture structure is the sole owner of the captured intent items. We must release them explicitly or else they leak: unreferenced object 0xffff888016719108 (size 432): comm "mount", pid 529, jiffies 4294706839 (age 144.463s) hex dump (first 32 bytes): 08 91 71 16 80 88 ff ff 08 91 71 16 80 88 ff ff ..q.......q..... 18 91 71 16 80 88 ff ff 18 91 71 16 80 88 ff ff ..q.......q..... backtrace: [] xfs_efi_init+0x18f/0x1d0 [] xfs_extent_free_create_intent+0x50/0x150 [] xfs_defer_create_intents+0x16a/0x340 [] xfs_defer_ops_capture_and_commit+0x8e/0xad0 [] xfs_cui_item_recover+0x819/0x980 [] xlog_recover_process_intents+0x246/0xb70 [] xlog_recover_finish+0x8a/0x9a0 [] xfs_log_mount_finish+0x2bb/0x4a0 [] xfs_mountfs+0x14bf/0x1e70 [] xfs_fs_fill_super+0x10d0/0x1b20 [] get_tree_bdev+0x3d2/0x6d0 [] vfs_get_tree+0x89/0x2c0 [] path_mount+0xecf/0x1800 [] do_mount+0xf3/0x110 [] __x64_sys_mount+0x154/0x1f0 [] do_syscall_64+0x39/0x80 Fix the problem above by abort intent items that don't have a done item when recovery intents fail. Fixes: e6fff81e4870 ("xfs: proper replay of deferred ops queued during log recovery") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 5 +++-- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 88388e12f8e7c..f71679ce23b95 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -763,12 +763,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -809,7 +810,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3c..8788ad5f6a731 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 13b94d2e605bd..a1e18b24971a2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2511,7 +2511,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } From 00080503612f61d1ad67be641ed9cb4f9f6ba40e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 19 Sep 2023 17:12:45 -0400 Subject: [PATCH 141/560] XFS: Update MAINTAINERS to catch all XFS documentation Assumes that all XFS documentation will be prefixed with xfs-, which seems like a good policy anyway. Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Chandan Babu R --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..a78795dc9aeef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23872,8 +23872,7 @@ T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git P: Documentation/filesystems/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-delayed-logging-design.rst -F: Documentation/filesystems/xfs-self-describing-metadata.rst +F: Documentation/filesystems/xfs-* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h From 55f669f34184ecb25b8353f29c7f6f1ae5b313d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Oct 2023 17:28:52 +0200 Subject: [PATCH 142/560] xfs: only remap the written blocks in xfs_reflink_end_cow_extent xfs_reflink_end_cow_extent looks up the COW extent and the data fork extent at offset_fsb, and then proceeds to remap the common subset between the two. It does however not limit the remapped extent to the passed in [*offset_fsbm end_fsb] range and thus potentially remaps more blocks than the one handled by the current I/O completion. This means that with sufficiently large data and COW extents we could be remapping COW fork mappings that have not been written to, leading to a stale data exposure on a powerfail event. We use to have a xfs_trim_range to make the remap fit the I/O completion range, but that got (apparently accidentally) removed in commit df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents"). Note that I've only found this by code inspection, and a test case would probably require very specific delay and error injection. Fixes: df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents") Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 658edee8381dc..e5b62dc284664 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; From 471de20303dda0b67981e06d59cc6c4a83fd2a3c Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Mon, 30 Oct 2023 13:33:49 -0700 Subject: [PATCH 143/560] xfs: up(ic_sema) if flushing data device fails We flush the data device cache before we issue external log IO. If the flush fails, we shut down the log immediately and return. However, the iclog->ic_sema is left in a decremented state so let's add an up(). Prior to this patch, xfs/438 would fail consistently when running with an external log device: sync -> xfs_log_force -> xlog_write_iclog -> down(&iclog->ic_sema) -> blkdev_issue_flush (fail causes us to intiate shutdown) -> xlog_force_shutdown -> return unmount -> xfs_log_umount -> xlog_wait_iclog_completion -> down(&iclog->ic_sema) --------> HANG There is a second early return / shutdown. Make sure the up() happens for it as well. Also make sure we cleanup the iclog state, xlog_state_done_syncing, before dropping the iclog lock. Fixes: b5d721eaae47 ("xfs: external logs need to flush data device") Fixes: 842a42d126b4 ("xfs: shutdown on failure to add page to log bio") Fixes: 7d839e325af2 ("xfs: check return codes when flushing block devices") Signed-off-by: Leah Rumancik Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 51c100c861770..ee206facf0dc0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1893,9 +1893,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1925,20 +1923,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1959,6 +1954,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* From f63a5b3769ad7659da4c0420751d78958ab97675 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 1 Nov 2023 09:41:45 -0700 Subject: [PATCH 144/560] xfs: fix internal error from AGFL exhaustion We've been seeing XFS errors like the following: XFS: Internal error i != 1 at line 3526 of file fs/xfs/libxfs/xfs_btree.c. Caller xfs_btree_insert+0x1ec/0x280 ... Call Trace: xfs_corruption_error+0x94/0xa0 xfs_btree_insert+0x221/0x280 xfs_alloc_fixup_trees+0x104/0x3e0 xfs_alloc_ag_vextent_size+0x667/0x820 xfs_alloc_fix_freelist+0x5d9/0x750 xfs_free_extent_fix_freelist+0x65/0xa0 __xfs_free_extent+0x57/0x180 ... This is the XFS_IS_CORRUPT() check in xfs_btree_insert() when xfs_btree_insrec() fails. After converting this into a panic and dissecting the core dump, I found that xfs_btree_insrec() is failing because it's trying to split a leaf node in the cntbt when the AG free list is empty. In particular, it's failing to get a block from the AGFL _while trying to refill the AGFL_. If a single operation splits every level of the bnobt and the cntbt (and the rmapbt if it is enabled) at once, the free list will be empty. Then, when the next operation tries to refill the free list, it allocates space. If the allocation does not use a full extent, it will need to insert records for the remaining space in the bnobt and cntbt. And if those new records go in full leaves, the leaves (and potentially more nodes up to the old root) need to be split. Fix it by accounting for the additional splits that may be required to refill the free list in the calculation for the minimum free list size. P.S. As far as I can tell, this bug has existed for a long time -- maybe back to xfs-history commit afdf80ae7405 ("Add XFS_AG_MAXLEVELS macros ...") in April 1994! It requires a very unlucky sequence of events, and in fact we didn't hit it until a particular sparse mmap workload updated from 5.12 to 5.19. But this bug existed in 5.12, so it must've been exposed by some other change in allocation or writeback patterns. It's also much less likely to be hit with the rmapbt enabled, since that increases the minimum free list size and is unlikely to split at the same time as the bnobt and cntbt. Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Omar Sandoval Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd0..100ab5931b313 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist( ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + mp->m_rmap_maxlevels) * 2 - 2; return min_free; } From a2e4388adfa44684c7c428a5a5980efe0d75e13e Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Sun, 5 Nov 2023 20:23:18 +0100 Subject: [PATCH 145/560] xfs: fix again select in kconfig XFS_ONLINE_SCRUB_STATS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 57c0f4a8ea3a attempted to fix the select in the kconfig entry XFS_ONLINE_SCRUB_STATS by selecting XFS_DEBUG, but the original intention was to select DEBUG_FS, since the feature relies on debugfs to export the related scrub statistics. Fixes: 57c0f4a8ea3a ("xfs: fix select in config XFS_ONLINE_SCRUB_STATS") Reported-by: Holger Hoffstätte Signed-off-by: Anthony Iliopoulos Reviewed-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index ed0bc8cbc703d..567fb37274d35 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select XFS_DEBUG + select DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number From 038ca189c0d2c1570b4d922f25b524007c85cf94 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:13 +1100 Subject: [PATCH 146/560] xfs: inode recovery does not validate the recovered inode Discovered when trying to track down a weird recovery corruption issue that wasn't detected at recovery time. The specific corruption was a zero extent count field when big extent counts are in use, and it turns out the dinode verifier doesn't detect that specific corruption case, either. So fix it too. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +++ fs/xfs/xfs_inode_item_recover.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 543f3748c2a35..137a65bda95dc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -510,6 +510,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 0e5dba2343ea1..1c5ba2a732b05 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; @@ -528,8 +529,19 @@ xlog_recover_inode_commit_pass2( (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. */ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; From 7930d9e103700cde15833638855b750715c12091 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 10 Nov 2023 15:33:14 +1100 Subject: [PATCH 147/560] xfs: recovery should not clear di_flushiter unconditionally Because on v3 inodes, di_flushiter doesn't exist. It overlaps with zero padding in the inode, except when NREXT64=1 configurations are in use and the zero padding is no longer padding but holds the 64 bit extent counter. This manifests obviously on big endian platforms (e.g. s390) because the log dinode is in host order and the overlap is the LSBs of the extent count field. It is not noticed on little endian machines because the overlap is at the MSB end of the extent count field and we need to get more than 2^^48 extents in the inode before it manifests. i.e. the heat death of the universe will occur before we see the problem in little endian machines. This is a zero-day issue for NREXT64=1 configuraitons on big endian machines. Fix it by only clearing di_flushiter on v2 inodes during recovery. Fixes: 9b7d16e34bbe ("xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers") cc: stable@kernel.org # 5.19+ Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode_item_recover.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 1c5ba2a732b05..144198a6b2702 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -370,24 +370,26 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_has_v3inodes(mp) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; + if (!xfs_has_v3inodes(mp)) { + if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; } - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && From e64e7c74b99ec9e439abca75f522f4b98f220bd1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 24 Oct 2023 13:51:36 +0200 Subject: [PATCH 148/560] xen/events: avoid using info_for_irq() in xen_send_IPI_one() xen_send_IPI_one() is being used by cpuhp_report_idle_dead() after it calls rcu_report_dead(), meaning that any RCU usage by xen_send_IPI_one() is a bad idea. Unfortunately xen_send_IPI_one() is using notify_remote_via_irq() today, which is using irq_get_chip_data() via info_for_irq(). And irq_get_chip_data() in turn is using a maple-tree lookup requiring RCU. Avoid this problem by caching the ipi event channels in another percpu variable, allowing the use notify_remote_via_evtchn() in xen_send_IPI_one(). Fixes: 721255b9826b ("genirq: Use a maple tree for interrupt descriptor management") Reported-by: David Woodhouse Signed-off-by: Juergen Gross Tested-by: David Woodhouse Acked-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6de6b084ea60d..237e6b884f722 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -164,6 +164,8 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Cache for IPI event channels - needed for hot cpu unplug (avoid RCU usage). */ +static DEFINE_PER_CPU(evtchn_port_t [XEN_NR_IPIS], ipi_to_evtchn) = {[0 ... XEN_NR_IPIS-1] = 0}; /* Event channel distribution data */ static atomic_t channels_on_cpu[NR_CPUS]; @@ -366,6 +368,7 @@ static int xen_irq_info_ipi_setup(unsigned cpu, info->u.ipi = ipi; per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); } @@ -981,6 +984,7 @@ static void __unbind_from_irq(unsigned int irq) break; case IRQT_IPI: per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1632,7 +1636,7 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq; + evtchn_port_t evtchn; #ifdef CONFIG_X86 if (unlikely(vector == XEN_NMI_VECTOR)) { @@ -1643,9 +1647,9 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) return; } #endif - irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + BUG_ON(evtchn == 0); + notify_remote_via_evtchn(evtchn); } struct evtchn_loop_ctrl { From bfa993b355d33a438a746523e7129391c8664e8a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 8 Nov 2023 16:25:15 -0500 Subject: [PATCH 149/560] acpi/processor: sanitize _OSC/_PDC capabilities for Xen dom0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Processor capability bits notify ACPI of the OS capabilities, and so ACPI can adjust the return of other Processor methods taking the OS capabilities into account. When Linux is running as a Xen dom0, the hypervisor is the entity in charge of processor power management, and hence Xen needs to make sure the capabilities reported by _OSC/_PDC match the capabilities of the driver in Xen. Introduce a small helper to sanitize the buffer when running as Xen dom0. When Xen supports HWP, this serves as the equivalent of commit a21211672c9a ("ACPI / processor: Request native thermal interrupt handling via _OSC") to avoid SMM crashes. Xen will set bit ACPI_PROC_CAP_COLLAB_PROC_PERF (bit 12) in the capability bits and the _OSC/_PDC call will apply it. [ jandryuk: Mention Xen HWP's need. Support _OSC & _PDC ] Signed-off-by: Roger Pau Monné Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Michal Wilczynski Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20231108212517.72279-1-jandryuk@gmail.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/acpi.h | 14 ++++++++++++++ arch/x86/include/asm/xen/hypervisor.h | 9 +++++++++ drivers/xen/pcpu.c | 22 ++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index c8a7fc23f63c6..f896eed4516c7 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -16,6 +16,9 @@ #include #include #include +#include + +#include #ifdef CONFIG_ACPI_APEI # include @@ -127,6 +130,17 @@ static inline void arch_acpi_set_proc_cap_bits(u32 *cap) if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_option_idle_override == IDLE_NOMWAIT) *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. + */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 7048dfacc04b2..a9088250770f2 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -100,4 +100,13 @@ static inline void leave_lazy(enum xen_lazy_mode mode) enum xen_lazy_mode xen_get_lazy_mode(void); +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index b3e3d1bb37f3e..5086552731453 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -47,6 +47,9 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif /* * @cpu_id: Xen physical cpu logic number @@ -400,4 +403,23 @@ bool __init xen_processor_present(uint32_t acpi_id) return online; } + +void xen_sanitize_proc_cap_bits(uint32_t *cap) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + u32 buf[3] = { ACPI_PDC_REVISION_ID, 1, *cap }; + int ret; + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + ret = HYPERVISOR_platform_op(&op); + if (ret) + pr_err("sanitize of _PDC buffer bits from Xen failed: %d\n", + ret); + else + *cap = buf[2]; +} #endif From 50e865a56876bd7a74a79c1778025631150f104a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 5 Nov 2023 21:56:31 -0800 Subject: [PATCH 150/560] xen/shbuf: eliminate 17 kernel-doc warnings Don't use kernel-doc markers ("/**") for comments that are not in kernel-doc format. This prevents multiple kernel-doc warnings: xen-front-pgdir-shbuf.c:25: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * This structure represents the structure of a shared page xen-front-pgdir-shbuf.c:37: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Shared buffer ops which are differently implemented xen-front-pgdir-shbuf.c:65: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get granted reference to the very first page of the xen-front-pgdir-shbuf.c:85: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map granted references of the shared buffer. xen-front-pgdir-shbuf.c:106: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap granted references of the shared buffer. xen-front-pgdir-shbuf.c:127: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Free all the resources of the shared buffer. xen-front-pgdir-shbuf.c:154: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get the number of pages the page directory consumes itself. xen-front-pgdir-shbuf.c:164: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:176: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Calculate the number of grant references needed to share the buffer xen-front-pgdir-shbuf.c:194: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Unmap the buffer previously mapped with grant references xen-front-pgdir-shbuf.c:242: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Map the buffer with grant references provided by the backend. xen-front-pgdir-shbuf.c:324: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:354: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Fill page directory with grant references to the pages of the xen-front-pgdir-shbuf.c:393: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant references to the frontend's buffer pages. xen-front-pgdir-shbuf.c:422: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Grant all the references needed to share the buffer. xen-front-pgdir-shbuf.c:470: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate all required structures to mange shared buffer. xen-front-pgdir-shbuf.c:510: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Allocate a new instance of a shared buffer. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: lore.kernel.org/r/202311060203.yQrpPZhm-lkp@intel.com Acked-by: Juergen Gross Cc: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: xen-devel@lists.xenproject.org Link: https://lore.kernel.org/r/20231106055631.21520-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- drivers/xen/xen-front-pgdir-shbuf.c | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index b52e0fa595a99..223870a0111b2 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -21,7 +21,7 @@ #include -/** +/* * This structure represents the structure of a shared page * that contains grant references to the pages of the shared * buffer. This structure is common to many Xen para-virtualized @@ -33,7 +33,7 @@ struct xen_page_directory { grant_ref_t gref[]; /* Variable length */ }; -/** +/* * Shared buffer ops which are differently implemented * depending on the allocation mode, e.g. if the buffer * is allocated by the corresponding backend or frontend. @@ -61,7 +61,7 @@ struct xen_front_pgdir_shbuf_ops { int (*unmap)(struct xen_front_pgdir_shbuf *buf); }; -/** +/* * Get granted reference to the very first page of the * page directory. Usually this is passed to the backend, * so it can find/fill the grant references to the buffer's @@ -81,7 +81,7 @@ xen_front_pgdir_shbuf_get_dir_start(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); -/** +/* * Map granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -102,7 +102,7 @@ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); -/** +/* * Unmap granted references of the shared buffer. * * Depending on the shared buffer mode of allocation @@ -123,7 +123,7 @@ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) } EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_unmap); -/** +/* * Free all the resources of the shared buffer. * * \param buf shared buffer which resources to be freed. @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_free); offsetof(struct xen_page_directory, \ gref)) / sizeof(grant_ref_t)) -/** +/* * Get the number of pages the page directory consumes itself. * * \param buf shared buffer. @@ -160,7 +160,7 @@ static int get_num_pages_dir(struct xen_front_pgdir_shbuf *buf) return DIV_ROUND_UP(buf->num_pages, XEN_NUM_GREFS_PER_PAGE); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when backend allocates the buffer. * @@ -172,7 +172,7 @@ static void backend_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) buf->num_grefs = get_num_pages_dir(buf); } -/** +/* * Calculate the number of grant references needed to share the buffer * and its pages when frontend allocates the buffer. * @@ -190,7 +190,7 @@ static void guest_calc_num_grefs(struct xen_front_pgdir_shbuf *buf) #define xen_page_to_vaddr(page) \ ((uintptr_t)pfn_to_kaddr(page_to_xen_pfn(page))) -/** +/* * Unmap the buffer previously mapped with grant references * provided by the backend. * @@ -238,7 +238,7 @@ static int backend_unmap(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Map the buffer with grant references provided by the backend. * * \param buf shared buffer. @@ -320,7 +320,7 @@ static int backend_map(struct xen_front_pgdir_shbuf *buf) return ret; } -/** +/* * Fill page directory with grant references to the pages of the * page directory itself. * @@ -350,7 +350,7 @@ static void backend_fill_page_dir(struct xen_front_pgdir_shbuf *buf) page_dir->gref_dir_next_page = XEN_GREF_LIST_END; } -/** +/* * Fill page directory with grant references to the pages of the * page directory and the buffer we share with the backend. * @@ -389,7 +389,7 @@ static void guest_fill_page_dir(struct xen_front_pgdir_shbuf *buf) } } -/** +/* * Grant references to the frontend's buffer pages. * * These will be shared with the backend, so it can @@ -418,7 +418,7 @@ static int guest_grant_refs_for_buffer(struct xen_front_pgdir_shbuf *buf, return 0; } -/** +/* * Grant all the references needed to share the buffer. * * Grant references to the page directory pages and, if @@ -466,7 +466,7 @@ static int grant_references(struct xen_front_pgdir_shbuf *buf) return 0; } -/** +/* * Allocate all required structures to mange shared buffer. * * \param buf shared buffer. @@ -506,7 +506,7 @@ static const struct xen_front_pgdir_shbuf_ops local_ops = { .grant_refs_for_buffer = guest_grant_refs_for_buffer, }; -/** +/* * Allocate a new instance of a shared buffer. * * \param cfg configuration to be used while allocating a new shared buffer. From 472a2ff63efb30234cbf6b2cdaf8117f21b4f8bc Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:07 +0800 Subject: [PATCH 151/560] net: hns3: fix add VLAN fail issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hclge_sync_vlan_filter is called in periodic task, trying to remove VLAN from vlan_del_fail_bmap. It can be concurrence with VLAN adding operation from user. So once user failed to delete a VLAN id, and add it again soon, it may be removed by the periodic task, which may cause the software configuration being inconsistent with hardware. So add mutex handling to avoid this. user hns3 driver periodic task │ add vlan 10 ───── hns3_vlan_rx_add_vid │ │ (suppose success) │ │ │ del vlan 10 ───── hns3_vlan_rx_kill_vid │ │ (suppose fail,add to │ │ vlan_del_fail_bmap) │ │ │ add vlan 10 ───── hns3_vlan_rx_add_vid │ (suppose success) │ foreach vlan_del_fail_bmp del vlan 10 Fixes: fe4144d47eef ("net: hns3: sync VLAN filter entries when kill VLAN ID failed") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- .../hisilicon/hns3/hns3pf/hclge_main.c | 28 +++++++++++++------ .../hisilicon/hns3/hns3vf/hclgevf_main.c | 11 ++++++-- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 66e5807903a02..e22279e5d43f3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -10025,8 +10025,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, struct hclge_vport_vlan_cfg *vlan, *tmp; struct hclge_dev *hdev = vport->back; - mutex_lock(&hdev->vport_lock); - list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) { if (vlan->vlan_id == vlan_id) { if (is_write_tbl && vlan->hd_tbl_status) @@ -10041,8 +10039,6 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id, break; } } - - mutex_unlock(&hdev->vport_lock); } void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list) @@ -10451,11 +10447,16 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, * handle mailbox. Just record the vlan id, and remove it after * reset finished. */ + mutex_lock(&hdev->vport_lock); if ((test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) || test_bit(HCLGE_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, vport->vlan_del_fail_bmap)) { + clear_bit(vlan_id, vport->vlan_del_fail_bmap); } + mutex_unlock(&hdev->vport_lock); /* when port base vlan enabled, we use port base vlan as the vlan * filter entry. In this case, we don't update vlan filter table @@ -10470,17 +10471,22 @@ int hclge_set_vlan_filter(struct hnae3_handle *handle, __be16 proto, } if (!ret) { - if (!is_kill) + if (!is_kill) { hclge_add_vport_vlan_table(vport, vlan_id, writen_to_tbl); - else if (is_kill && vlan_id != 0) + } else if (is_kill && vlan_id != 0) { + mutex_lock(&hdev->vport_lock); hclge_rm_vport_vlan_table(vport, vlan_id, false); + mutex_unlock(&hdev->vport_lock); + } } else if (is_kill) { /* when remove hw vlan filter failed, record the vlan id, * and try to remove it from hw later, to be consistence * with stack */ + mutex_lock(&hdev->vport_lock); set_bit(vlan_id, vport->vlan_del_fail_bmap); + mutex_unlock(&hdev->vport_lock); } hclge_set_vport_vlan_fltr_change(vport); @@ -10520,6 +10526,7 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) int i, ret, sync_cnt = 0; u16 vlan_id; + mutex_lock(&hdev->vport_lock); /* start from vport 1 for PF is always alive */ for (i = 0; i < hdev->num_alloc_vport; i++) { struct hclge_vport *vport = &hdev->vport[i]; @@ -10530,21 +10537,26 @@ static void hclge_sync_vlan_filter(struct hclge_dev *hdev) ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q), vport->vport_id, vlan_id, true); - if (ret && ret != -EINVAL) + if (ret && ret != -EINVAL) { + mutex_unlock(&hdev->vport_lock); return; + } clear_bit(vlan_id, vport->vlan_del_fail_bmap); hclge_rm_vport_vlan_table(vport, vlan_id, false); hclge_set_vport_vlan_fltr_change(vport); sync_cnt++; - if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) + if (sync_cnt >= HCLGE_MAX_SYNC_COUNT) { + mutex_unlock(&hdev->vport_lock); return; + } vlan_id = find_first_bit(vport->vlan_del_fail_bmap, VLAN_N_VID); } } + mutex_unlock(&hdev->vport_lock); hclge_sync_vlan_fltr_state(hdev); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a4d68fb216fb9..1c62e58ff6d89 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1206,6 +1206,8 @@ static int hclgevf_set_vlan_filter(struct hnae3_handle *handle, test_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state)) && is_kill) { set_bit(vlan_id, hdev->vlan_del_fail_bmap); return -EBUSY; + } else if (!is_kill && test_bit(vlan_id, hdev->vlan_del_fail_bmap)) { + clear_bit(vlan_id, hdev->vlan_del_fail_bmap); } hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_VLAN, @@ -1233,20 +1235,25 @@ static void hclgevf_sync_vlan_filter(struct hclgevf_dev *hdev) int ret, sync_cnt = 0; u16 vlan_id; + if (bitmap_empty(hdev->vlan_del_fail_bmap, VLAN_N_VID)) + return; + + rtnl_lock(); vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); while (vlan_id != VLAN_N_VID) { ret = hclgevf_set_vlan_filter(handle, htons(ETH_P_8021Q), vlan_id, true); if (ret) - return; + break; clear_bit(vlan_id, hdev->vlan_del_fail_bmap); sync_cnt++; if (sync_cnt >= HCLGEVF_MAX_SYNC_COUNT) - return; + break; vlan_id = find_first_bit(hdev->vlan_del_fail_bmap, VLAN_N_VID); } + rtnl_unlock(); } static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) From ac92c0a9a0603fb448e60f38e63302e4eebb8035 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:08 +0800 Subject: [PATCH 152/560] net: hns3: add barrier in vf mailbox reply process In hclgevf_mbx_handler() and hclgevf_get_mbx_resp() functions, there is a typical store-store and load-load scenario between received_resp and additional_info. This patch adds barrier to fix the problem. Fixes: 4671042f1ef0 ("net: hns3: add match_id to check mailbox response from PF to VF") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c index bbf7b14079de3..85c2a634c8f96 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_mbx.c @@ -63,6 +63,9 @@ static int hclgevf_get_mbx_resp(struct hclgevf_dev *hdev, u16 code0, u16 code1, i++; } + /* ensure additional_info will be seen after received_resp */ + smp_rmb(); + if (i >= HCLGEVF_MAX_TRY_TIMES) { dev_err(&hdev->pdev->dev, "VF could not get mbx(%u,%u) resp(=%d) from PF in %d tries\n", @@ -178,6 +181,10 @@ static void hclgevf_handle_mbx_response(struct hclgevf_dev *hdev, resp->resp_status = hclgevf_resp_to_errno(resp_status); memcpy(resp->additional_info, req->msg.resp_data, HCLGE_MBX_MAX_RESP_DATA_SIZE * sizeof(u8)); + + /* ensure additional_info will be seen before setting received_resp */ + smp_wmb(); + if (match_id) { /* If match_id is not zero, it means PF support match_id. * if the match_id is right, VF get the right response, or From 75b247b57d8b71bcb679e4cb37d0db104848806c Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Fri, 10 Nov 2023 17:37:09 +0800 Subject: [PATCH 153/560] net: hns3: fix incorrect capability bit display for copper port Currently, the FEC capability bit is default set for device version V2. It's incorrect for the copper port. Eventhough it doesn't make the nic work abnormal, but the capability information display in debugfs may confuse user. So clear it when driver get the port type inforamtion. Fixes: 433ccce83504 ("net: hns3: use FEC capability queried from firmware") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e22279e5d43f3..c393b4ee4a320 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11663,6 +11663,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) goto err_msi_irq_uninit; if (hdev->hw.mac.media_type == HNAE3_MEDIA_TYPE_COPPER) { + clear_bit(HNAE3_DEV_SUPPORT_FEC_B, ae_dev->caps); if (hnae3_dev_phy_imp_supported(hdev)) ret = hclge_update_tp_port_info(hdev); else From 53aba458f23846112c0d44239580ff59bc5c36c3 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:10 +0800 Subject: [PATCH 154/560] net: hns3: fix out-of-bounds access may occur when coalesce info is read via debugfs The hns3 driver define an array of string to show the coalesce info, but if the kernel adds a new mode or a new state, out-of-bounds access may occur when coalesce info is read via debugfs, this patch fix the problem. Fixes: c99fead7cb07 ("net: hns3: add debugfs support for interrupt coalesce") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 0b138635bafa9..c083d1d10767b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -503,11 +503,14 @@ static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector, } sprintf(result[j++], "%d", i); - sprintf(result[j++], "%s", dim_state_str[dim->state]); + sprintf(result[j++], "%s", dim->state < ARRAY_SIZE(dim_state_str) ? + dim_state_str[dim->state] : "unknown"); sprintf(result[j++], "%u", dim->profile_ix); - sprintf(result[j++], "%s", dim_cqe_mode_str[dim->mode]); + sprintf(result[j++], "%s", dim->mode < ARRAY_SIZE(dim_cqe_mode_str) ? + dim_cqe_mode_str[dim->mode] : "unknown"); sprintf(result[j++], "%s", - dim_tune_stat_str[dim->tune_state]); + dim->tune_state < ARRAY_SIZE(dim_tune_stat_str) ? + dim_tune_stat_str[dim->tune_state] : "unknown"); sprintf(result[j++], "%u", dim->steps_left); sprintf(result[j++], "%u", dim->steps_right); sprintf(result[j++], "%u", dim->tired); From dbd2f3b20c6ae425665b6975d766e3653d453e73 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 10 Nov 2023 17:37:11 +0800 Subject: [PATCH 155/560] net: hns3: fix variable may not initialized problem in hns3_init_mac_addr() When a VF is calling hns3_init_mac_addr(), get_mac_addr() may return fail, then the value of mac_addr_temp is not initialized. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 06117502001f9..b618797a7e8de 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -5139,7 +5139,7 @@ static int hns3_init_mac_addr(struct net_device *netdev) struct hns3_nic_priv *priv = netdev_priv(netdev); char format_mac_addr[HNAE3_FORMAT_MAC_ADDR_LEN]; struct hnae3_handle *h = priv->ae_handle; - u8 mac_addr_temp[ETH_ALEN]; + u8 mac_addr_temp[ETH_ALEN] = {0}; int ret = 0; if (h->ae_algo->ops->get_mac_addr) From 65e98bb56fa3ce2edb400930c05238c9b380500e Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:12 +0800 Subject: [PATCH 156/560] net: hns3: fix VF reset fail issue Currently the reset process in hns3 and firmware watchdog init process is asynchronous. We think firmware watchdog initialization is completed before VF clear the interrupt source. However, firmware initialization may not complete early. So VF will receive multiple reset interrupts and fail to reset. So we add delay before VF interrupt source and 5 ms delay is enough to avoid second reset interrupt. Fixes: 427900d27d86 ("net: hns3: fix the timing issue of VF clearing interrupt sources") Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 14 +++++++++++++- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 1c62e58ff6d89..0aa9beefd1c7e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1981,8 +1981,18 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, return HCLGEVF_VECTOR0_EVENT_OTHER; } +static void hclgevf_reset_timer(struct timer_list *t) +{ + struct hclgevf_dev *hdev = from_timer(hdev, t, reset_timer); + + hclgevf_clear_event_cause(hdev, HCLGEVF_VECTOR0_EVENT_RST); + hclgevf_reset_task_schedule(hdev); +} + static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) { +#define HCLGEVF_RESET_DELAY 5 + enum hclgevf_evt_cause event_cause; struct hclgevf_dev *hdev = data; u32 clearval; @@ -1994,7 +2004,8 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) switch (event_cause) { case HCLGEVF_VECTOR0_EVENT_RST: - hclgevf_reset_task_schedule(hdev); + mod_timer(&hdev->reset_timer, + jiffies + msecs_to_jiffies(HCLGEVF_RESET_DELAY)); break; case HCLGEVF_VECTOR0_EVENT_MBX: hclgevf_mbx_handler(hdev); @@ -2937,6 +2948,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 81c16b8c8da29..a73f2bf3a56a6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -219,6 +219,7 @@ struct hclgevf_dev { enum hnae3_reset_type reset_level; unsigned long reset_pending; enum hnae3_reset_type reset_type; + struct timer_list reset_timer; #define HCLGEVF_RESET_REQUESTED 0 #define HCLGEVF_RESET_PENDING 1 From dff655e82faffc287d4a72a59f66fa120bf904e4 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Fri, 10 Nov 2023 17:37:13 +0800 Subject: [PATCH 157/560] net: hns3: fix VF wrong speed and duplex issue If PF is down, firmware will returns 10 Mbit/s rate and half-duplex mode when PF queries the port information from firmware. After imp reset command is executed, PF status changes to down, and PF will query link status and updates port information from firmware in a periodic scheduled task. However, there is a low probability that port information is updated when PF is down, and then PF link status changes to up. In this case, PF synchronizes incorrect rate and duplex mode to VF. This patch fixes it by updating port information before PF synchronizes the rate and duplex to the VF when PF changes to up. Fixes: 18b6e31f8bf4 ("net: hns3: PF add support for pushing link status to VFs") Signed-off-by: Jijie Shao Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c393b4ee4a320..5ea9e59569eff 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -61,6 +61,7 @@ static void hclge_sync_fd_table(struct hclge_dev *hdev); static void hclge_update_fec_stats(struct hclge_dev *hdev); static int hclge_mac_link_status_wait(struct hclge_dev *hdev, int link_ret, int wait_cnt); +static int hclge_update_port_info(struct hclge_dev *hdev); static struct hnae3_ae_algo ae_algo; @@ -3041,6 +3042,9 @@ static void hclge_update_link_status(struct hclge_dev *hdev) if (state != hdev->hw.mac.link) { hdev->hw.mac.link = state; + if (state == HCLGE_LINK_STATUS_UP) + hclge_update_port_info(hdev); + client->ops->link_status_change(handle, state); hclge_config_mac_tnl_int(hdev, state); if (rclient && rclient->ops->link_status_change) From f726eaa787e9f9bc858c902d18a09af6bcbfcdaf Mon Sep 17 00:00:00 2001 From: Jan Bottorff Date: Thu, 9 Nov 2023 03:19:27 +0000 Subject: [PATCH 158/560] i2c: designware: Fix corrupted memory seen in the ISR When running on a many core ARM64 server, errors were happening in the ISR that looked like corrupted memory. These corruptions would fix themselves if small delays were inserted in the ISR. Errors reported by the driver included "i2c_designware APMC0D0F:00: i2c_dw_xfer_msg: invalid target address" and "i2c_designware APMC0D0F:00:controller timed out" during in-band IPMI SSIF stress tests. The problem was determined to be memory writes in the driver were not becoming visible to all cores when execution rapidly shifted between cores, like when a register write immediately triggers an ISR. Processors with weak memory ordering, like ARM64, make no guarantees about the order normal memory writes become globally visible, unless barrier instructions are used to control ordering. To solve this, regmap accessor functions configured by this driver were changed to use non-relaxed forms of the low-level register access functions, which include a barrier on platforms that require it. This assures memory writes before a controller register access are visible to all cores. The community concluded defaulting to correct operation outweighed defaulting to the small performance gains from using relaxed access functions. Being a low speed device added weight to this choice of default register access behavior. Signed-off-by: Jan Bottorff Acked-by: Jarkko Nikula Tested-by: Serge Semin Reviewed-by: Serge Semin Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-common.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index affcfb243f0f5..35f762872b8a5 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -63,7 +63,7 @@ static int dw_reg_read(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readl_relaxed(dev->base + reg); + *val = readl(dev->base + reg); return 0; } @@ -72,7 +72,7 @@ static int dw_reg_write(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(val, dev->base + reg); + writel(val, dev->base + reg); return 0; } @@ -81,7 +81,7 @@ static int dw_reg_read_swab(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = swab32(readl_relaxed(dev->base + reg)); + *val = swab32(readl(dev->base + reg)); return 0; } @@ -90,7 +90,7 @@ static int dw_reg_write_swab(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writel_relaxed(swab32(val), dev->base + reg); + writel(swab32(val), dev->base + reg); return 0; } @@ -99,8 +99,8 @@ static int dw_reg_read_word(void *context, unsigned int reg, unsigned int *val) { struct dw_i2c_dev *dev = context; - *val = readw_relaxed(dev->base + reg) | - (readw_relaxed(dev->base + reg + 2) << 16); + *val = readw(dev->base + reg) | + (readw(dev->base + reg + 2) << 16); return 0; } @@ -109,8 +109,8 @@ static int dw_reg_write_word(void *context, unsigned int reg, unsigned int val) { struct dw_i2c_dev *dev = context; - writew_relaxed(val, dev->base + reg); - writew_relaxed(val >> 16, dev->base + reg + 2); + writew(val, dev->base + reg); + writew(val >> 16, dev->base + reg + 2); return 0; } From 6979a51ecaec4dfb7c768eb2a77b77df73a74c8e Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Mon, 13 Nov 2023 15:16:56 +0530 Subject: [PATCH 159/560] MAINTAINERS: add entry for TI ICSSG Ethernet driver Add record for TI Industrial Communication Subsystem - Gigabit (ICSSG) Ethernet driver. Also add Roger and myself as maintainer. Signed-off-by: MD Danish Anwar Signed-off-by: David S. Miller --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e3acb36989f00..0443f4d9f7366 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21795,6 +21795,15 @@ F: Documentation/devicetree/bindings/media/i2c/ti,ds90* F: drivers/media/i2c/ds90* F: include/media/i2c/ds90* +TI ICSSG ETHERNET DRIVER (ICSSG) +R: MD Danish Anwar +R: Roger Quadros +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +L: netdev@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/ti,icss*.yaml +F: drivers/net/ethernet/ti/icssg/* + TI J721E CSI2RX DRIVER M: Jai Luthra L: linux-media@vger.kernel.org From 438cbcdf105d84449fceb39a2d0e16d0ec20708f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 10 Nov 2023 13:05:46 +0100 Subject: [PATCH 160/560] net: mdio: fix typo in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quotes symbol in "EEE "link partner ability 1 should be at the end of the register name "EEE link partner ability 1" Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 8fa23bdcedbf9..007fd9c3e4b62 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -420,7 +420,7 @@ static inline u32 linkmode_adv_to_mii_t1_adv_m_t(unsigned long *advertising) * A function that translates value of following registers to the linkmode: * IEEE 802.3-2018 45.2.3.10 "EEE control and capability 1" register (3.20) * IEEE 802.3-2018 45.2.7.13 "EEE advertisement 1" register (7.60) - * IEEE 802.3-2018 45.2.7.14 "EEE "link partner ability 1 register (7.61) + * IEEE 802.3-2018 45.2.7.14 "EEE link partner ability 1" register (7.61) */ static inline void mii_eee_cap1_mod_linkmode_t(unsigned long *adv, u32 val) { From e6daf129ccb79d3781129f623f82bc676f2cb02c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 10 Nov 2023 10:36:00 -0500 Subject: [PATCH 161/560] net: gso_test: support CONFIG_MAX_SKB_FRAGS up to 45 The test allocs a single page to hold all the frag_list skbs. This is insufficient on kernels with CONFIG_MAX_SKB_FRAGS=45, due to the increased skb_shared_info frags[] array length. gso_test_func: ASSERTION FAILED at net/core/gso_test.c:210 Expected alloc_size <= ((1UL) << 12), but alloc_size == 5075 (0x13d3) ((1UL) << 12) == 4096 (0x1000) Simplify the logic. Just allocate a page for each frag_list skb. Fixes: 4688ecb1385f ("net: expand skb_segment unit test with frag_list coverage") Signed-off-by: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/gso_test.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/core/gso_test.c b/net/core/gso_test.c index ceb684be4cbf8..4c2e77bd12f4b 100644 --- a/net/core/gso_test.c +++ b/net/core/gso_test.c @@ -180,18 +180,17 @@ static void gso_test_func(struct kunit *test) } if (tcase->frag_skbs) { - unsigned int total_size = 0, total_true_size = 0, alloc_size = 0; + unsigned int total_size = 0, total_true_size = 0; struct sk_buff *frag_skb, *prev = NULL; - page = alloc_page(GFP_KERNEL); - KUNIT_ASSERT_NOT_NULL(test, page); - page_ref_add(page, tcase->nr_frag_skbs - 1); - for (i = 0; i < tcase->nr_frag_skbs; i++) { unsigned int frag_size; + page = alloc_page(GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, page); + frag_size = tcase->frag_skbs[i]; - frag_skb = build_skb(page_address(page) + alloc_size, + frag_skb = build_skb(page_address(page), frag_size + shinfo_size); KUNIT_ASSERT_NOT_NULL(test, frag_skb); __skb_put(frag_skb, frag_size); @@ -204,11 +203,8 @@ static void gso_test_func(struct kunit *test) total_size += frag_size; total_true_size += frag_skb->truesize; - alloc_size += frag_size + shinfo_size; } - KUNIT_ASSERT_LE(test, alloc_size, PAGE_SIZE); - skb->len += total_size; skb->data_len += total_size; skb->truesize += total_true_size; From fb317eb23b5ee4c37b0656a9a52a3db58d9dd072 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 11 Nov 2023 01:39:47 +0900 Subject: [PATCH 162/560] tipc: Fix kernel-infoleak due to uninitialized TLV value KMSAN reported the following kernel-infoleak issue: ===================================================== BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in copy_to_user_iter lib/iov_iter.c:24 [inline] BUG: KMSAN: kernel-infoleak in iterate_ubuf include/linux/iov_iter.h:29 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance2 include/linux/iov_iter.h:245 [inline] BUG: KMSAN: kernel-infoleak in iterate_and_advance include/linux/iov_iter.h:271 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 instrument_copy_to_user include/linux/instrumented.h:114 [inline] copy_to_user_iter lib/iov_iter.c:24 [inline] iterate_ubuf include/linux/iov_iter.h:29 [inline] iterate_and_advance2 include/linux/iov_iter.h:245 [inline] iterate_and_advance include/linux/iov_iter.h:271 [inline] _copy_to_iter+0x4ec/0x2bc0 lib/iov_iter.c:186 copy_to_iter include/linux/uio.h:197 [inline] simple_copy_to_iter net/core/datagram.c:532 [inline] __skb_datagram_iter.5+0x148/0xe30 net/core/datagram.c:420 skb_copy_datagram_iter+0x52/0x210 net/core/datagram.c:546 skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline] netlink_recvmsg+0x43d/0x1630 net/netlink/af_netlink.c:1967 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg net/socket.c:1066 [inline] __sys_recvfrom+0x476/0x860 net/socket.c:2246 __do_sys_recvfrom net/socket.c:2264 [inline] __se_sys_recvfrom net/socket.c:2260 [inline] __x64_sys_recvfrom+0x130/0x200 net/socket.c:2260 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x103/0x9e0 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5f7/0xb50 mm/slub.c:3523 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x2fd/0x770 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] tipc_tlv_alloc net/tipc/netlink_compat.c:156 [inline] tipc_get_err_tlv+0x90/0x5d0 net/tipc/netlink_compat.c:170 tipc_nl_compat_recv+0x1042/0x15d0 net/tipc/netlink_compat.c:1324 genl_family_rcv_msg_doit net/netlink/genetlink.c:972 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1052 [inline] genl_rcv_msg+0x1220/0x12c0 net/netlink/genetlink.c:1067 netlink_rcv_skb+0x4a4/0x6a0 net/netlink/af_netlink.c:2545 genl_rcv+0x41/0x60 net/netlink/genetlink.c:1076 netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] netlink_unicast+0xf4b/0x1230 net/netlink/af_netlink.c:1368 netlink_sendmsg+0x1242/0x1420 net/netlink/af_netlink.c:1910 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x997/0xd60 net/socket.c:2588 ___sys_sendmsg+0x271/0x3b0 net/socket.c:2642 __sys_sendmsg net/socket.c:2671 [inline] __do_sys_sendmsg net/socket.c:2680 [inline] __se_sys_sendmsg net/socket.c:2678 [inline] __x64_sys_sendmsg+0x2fa/0x4a0 net/socket.c:2678 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Bytes 34-35 of 36 are uninitialized Memory access of size 36 starts at ffff88802d464a00 Data copied to user address 00007ff55033c0a0 CPU: 0 PID: 30322 Comm: syz-executor.0 Not tainted 6.6.0-14500-g1c41041124bd #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc38 04/01/2014 ===================================================== tipc_add_tlv() puts TLV descriptor and value onto `skb`. This size is calculated with TLV_SPACE() macro. It adds the size of struct tlv_desc and the length of TLV value passed as an argument, and aligns the result to a multiple of TLV_ALIGNTO, i.e., a multiple of 4 bytes. If the size of struct tlv_desc plus the length of TLV value is not aligned, the current implementation leaves the remaining bytes uninitialized. This is the cause of the above kernel-infoleak issue. This patch resolves this issue by clearing data up to an aligned size. Fixes: d0796d1ef63d ("tipc: convert legacy nl bearer dump to nl compat") Signed-off-by: Shigeru Yoshida Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5bc076f2fa74a..c763008a8adba 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -102,6 +102,7 @@ static int tipc_add_tlv(struct sk_buff *skb, u16 type, void *data, u16 len) return -EMSGSIZE; skb_put(skb, TLV_SPACE(len)); + memset(tlv, 0, TLV_SPACE(len)); tlv->tlv_type = htons(type); tlv->tlv_len = htons(TLV_LENGTH(len)); if (len && data) From ca8add922f9c7f6e2e3c71039da8e0dcc64b87ed Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 11 Nov 2023 05:41:12 +0100 Subject: [PATCH 163/560] net: mvneta: fix calls to page_pool_get_stats Calling page_pool_get_stats in the mvneta driver without checks leads to kernel crashes. First the page pool is only available if the bm is not used. The page pool is also not allocated when the port is stopped. It can also be not allocated in case of errors. The current implementation leads to the following crash calling ethstats on a port that is down or when calling it at the wrong moment: ble to handle kernel NULL pointer dereference at virtual address 00000070 [00000070] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Hardware name: Marvell Armada 380/385 (Device Tree) PC is at page_pool_get_stats+0x18/0x1cc LR is at mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] pc : [] lr : [] psr: a0000013 sp : f1439d48 ip : f1439dc0 fp : 0000001d r10: 00000100 r9 : c4816b80 r8 : f0d75150 r7 : bf0b400c r6 : c238f000 r5 : 00000000 r4 : f1439d68 r3 : c2091040 r2 : ffffffd8 r1 : f1439d68 r0 : 00000000 Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 066b004a DAC: 00000051 Register r0 information: NULL pointer Register r1 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r2 information: non-paged memory Register r3 information: slab kmalloc-2k start c2091000 pointer offset 64 size 2048 Register r4 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Register r5 information: NULL pointer Register r6 information: slab kmalloc-cg-4k start c238f000 pointer offset 0 size 4096 Register r7 information: 15-page vmalloc region starting at 0xbf0a8000 allocated at load_module+0xa30/0x219c Register r8 information: 1-page vmalloc region starting at 0xf0d75000 allocated at ethtool_get_stats+0x138/0x208 Register r9 information: slab task_struct start c4816b80 pointer offset 0 Register r10 information: non-paged memory Register r11 information: non-paged memory Register r12 information: 2-page vmalloc region starting at 0xf1438000 allocated at kernel_clone+0x9c/0x390 Process snmpd (pid: 733, stack limit = 0x38de3a88) Stack: (0xf1439d48 to 0xf143a000) 9d40: 000000c0 00000001 c238f000 bf0b400c f0d75150 c4816b80 9d60: 00000100 bf0a98d8 00000000 00000000 00000000 00000000 00000000 00000000 9d80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9da0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9dc0: 00000dc0 5335509c 00000035 c238f000 bf0b2214 01067f50 f0d75000 c0b9b9c8 9de0: 0000001d 00000035 c2212094 5335509c c4816b80 c238f000 c5ad6e00 01067f50 9e00: c1b0be80 c4816b80 00014813 c0b9d7f0 00000000 00000000 0000001d 0000001d 9e20: 00000000 00001200 00000000 00000000 c216ed90 c73943b8 00000000 00000000 9e40: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 9e60: 00000000 c0ad9034 00000000 00000000 00000000 00000000 00000000 00000000 9e80: 00000000 00000000 00000000 5335509c c1b0be80 f1439ee4 00008946 c1b0be80 9ea0: 01067f50 f1439ee3 00000000 00000046 b6d77ae0 c0b383f0 00008946 becc83e8 9ec0: c1b0be80 00000051 0000000b c68ca480 c7172d00 c0ad8ff0 f1439ee3 cf600e40 9ee0: 01600e40 32687465 00000000 00000000 00000000 01067f50 00000000 00000000 9f00: 00000000 5335509c 00008946 00008946 00000000 c68ca480 becc83e8 c05e2de0 9f20: f1439fb0 c03002f0 00000006 5ac3c35a c4816b80 00000006 b6d77ae0 c030caf0 9f40: c4817350 00000014 f1439e1c 0000000c 00000000 00000051 01000000 00000014 9f60: 00003fec f1439edc 00000001 c0372abc b6d77ae0 c0372abc cf600e40 5335509c 9f80: c21e6800 01015c9c 0000000b 00008946 00000036 c03002f0 c4816b80 00000036 9fa0: b6d77ae0 c03000c0 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 40000030 0000000b 00000000 00000000 page_pool_get_stats from mvneta_ethtool_get_stats+0xa0/0xe0 [mvneta] mvneta_ethtool_get_stats [mvneta] from ethtool_get_stats+0x154/0x208 ethtool_get_stats from dev_ethtool+0xf48/0x2480 dev_ethtool from dev_ioctl+0x538/0x63c dev_ioctl from sock_ioctl+0x49c/0x53c sock_ioctl from sys_ioctl+0x134/0xbd8 sys_ioctl from ret_fast_syscall+0x0/0x1c Exception stack(0xf1439fa8 to 0xf1439ff0) 9fa0: 01015c9c 0000000b 0000000b 00008946 becc83e8 00000000 9fc0: 01015c9c 0000000b 00008946 00000036 00000035 010678a0 b6d797ec b6d77ae0 9fe0: b6dbf738 becc838c b6d186d7 b6baa858 Code: e28dd004 e1a05000 e2514000 0a00006a (e5902070) This commit adds the proper checks before calling page_pool_get_stats. Fixes: b3fc79225f05 ("net: mvneta: add support for page_pool_get_stats") Signed-off-by: Sven Auhagen Reported-by: Paulo Da Silva Acked-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 28 +++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 90817136808d4..29aac327574d6 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4790,14 +4790,17 @@ static void mvneta_ethtool_get_strings(struct net_device *netdev, u32 sset, u8 *data) { if (sset == ETH_SS_STATS) { + struct mvneta_port *pp = netdev_priv(netdev); int i; for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) memcpy(data + i * ETH_GSTRING_LEN, mvneta_statistics[i].name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); - page_pool_ethtool_stats_get_strings(data); + if (!pp->bm_priv) { + data += ETH_GSTRING_LEN * ARRAY_SIZE(mvneta_statistics); + page_pool_ethtool_stats_get_strings(data); + } } } @@ -4915,8 +4918,10 @@ static void mvneta_ethtool_pp_stats(struct mvneta_port *pp, u64 *data) struct page_pool_stats stats = {}; int i; - for (i = 0; i < rxq_number; i++) - page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + for (i = 0; i < rxq_number; i++) { + if (pp->rxqs[i].page_pool) + page_pool_get_stats(pp->rxqs[i].page_pool, &stats); + } page_pool_ethtool_stats_get(data, &stats); } @@ -4932,14 +4937,21 @@ static void mvneta_ethtool_get_stats(struct net_device *dev, for (i = 0; i < ARRAY_SIZE(mvneta_statistics); i++) *data++ = pp->ethtool_stats[i]; - mvneta_ethtool_pp_stats(pp, data); + if (!pp->bm_priv) + mvneta_ethtool_pp_stats(pp, data); } static int mvneta_ethtool_get_sset_count(struct net_device *dev, int sset) { - if (sset == ETH_SS_STATS) - return ARRAY_SIZE(mvneta_statistics) + - page_pool_ethtool_stats_get_count(); + if (sset == ETH_SS_STATS) { + int count = ARRAY_SIZE(mvneta_statistics); + struct mvneta_port *pp = netdev_priv(dev); + + if (!pp->bm_priv) + count += page_pool_ethtool_stats_get_count(); + + return count; + } return -EOPNOTSUPP; } From c0a2a1b0d631fc460d830f52d06211838874d655 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 12 Nov 2023 22:16:32 -0500 Subject: [PATCH 164/560] ppp: limit MRU to 64K ppp_sync_ioctl allows setting device MRU, but does not sanity check this input. Limit to a sane upper bound of 64KB. No implementation I could find generates larger than 64KB frames. RFC 2823 mentions an upper bound of PPP over SDL of 64KB based on the 16-bit length field. Other protocols will be smaller, such as PPPoE (9KB jumbo frame) and PPPoA (18190 maximum CPCS-SDU size, RFC 2364). PPTP and L2TP encapsulate in IP. Syzbot managed to trigger alloc warning in __alloc_pages: if (WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)) WARNING: CPU: 1 PID: 37 at mm/page_alloc.c:4544 __alloc_pages+0x3ab/0x4a0 mm/page_alloc.c:4544 __alloc_skb+0x12b/0x330 net/core/skbuff.c:651 __netdev_alloc_skb+0x72/0x3f0 net/core/skbuff.c:715 netdev_alloc_skb include/linux/skbuff.h:3225 [inline] dev_alloc_skb include/linux/skbuff.h:3238 [inline] ppp_sync_input drivers/net/ppp/ppp_synctty.c:669 [inline] ppp_sync_receive+0xff/0x680 drivers/net/ppp/ppp_synctty.c:334 tty_ldisc_receive_buf+0x14c/0x180 drivers/tty/tty_buffer.c:390 tty_port_default_receive_buf+0x70/0xb0 drivers/tty/tty_port.c:37 receive_buf drivers/tty/tty_buffer.c:444 [inline] flush_to_ldisc+0x261/0x780 drivers/tty/tty_buffer.c:494 process_one_work+0x884/0x15c0 kernel/workqueue.c:2630 With call ioctl$PPPIOCSMRU1(r1, 0x40047452, &(0x7f0000000100)=0x5e6417a8) Similar code exists in other drivers that implement ppp_channel_ops ioctl PPPIOCSMRU. Those might also be in scope. Notably excluded from this are pppol2tp_ioctl and pppoe_ioctl. This code goes back to the start of git history. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+6177e1f90d92583bcc58@syzkaller.appspotmail.com Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_synctty.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index ea261a628786b..52d05ce4a2819 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -453,6 +453,10 @@ ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) case PPPIOCSMRU: if (get_user(val, (int __user *) argp)) break; + if (val > U16_MAX) { + err = -EINVAL; + break; + } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; From 47d970204054f859f35a2237baa75c2d84fcf436 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 25 Sep 2023 17:54:13 +0200 Subject: [PATCH 165/560] xen/events: fix delayed eoi list handling When delaying eoi handling of events, the related elements are queued into the percpu lateeoi list. In case the list isn't empty, the elements should be sorted by the time when eoi handling is to happen. Unfortunately a new element will never be queued at the start of the list, even if it has a handling time lower than all other list elements. Fix that by handling that case the same way as for an empty list. Fixes: e99502f76271 ("xen/events: defer eoi in case of excessive number of events") Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 237e6b884f722..cd33a418344a8 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -604,7 +604,9 @@ static void lateeoi_list_add(struct irq_info *info) spin_lock_irqsave(&eoi->eoi_list_lock, flags); - if (list_empty(&eoi->eoi_list)) { + elem = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + if (!elem || info->eoi_time < elem->eoi_time) { list_add(&info->eoi_list, &eoi->eoi_list); mod_delayed_work_on(info->eoi_cpu, system_wq, &eoi->delayed, delay); From f96c6c588ca81255566a5168e51c9cbbe7b86def Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 26 Sep 2023 14:29:54 +0200 Subject: [PATCH 166/560] xen/events: remove unused functions There are no users of xen_irq_from_pirq() and xen_set_irq_pending(). Remove those functions. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 30 ------------------------------ include/xen/events.h | 4 ---- 2 files changed, 34 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index cd33a418344a8..c5d86128eb733 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1169,29 +1169,6 @@ int xen_destroy_irq(int irq) return rc; } -int xen_irq_from_pirq(unsigned pirq) -{ - int irq; - - struct irq_info *info; - - mutex_lock(&irq_mapping_update_lock); - - list_for_each_entry(info, &xen_irq_list_head, list) { - if (info->type != IRQT_PIRQ) - continue; - irq = info->irq; - if (info->u.pirq.pirq == pirq) - goto out; - } - irq = -1; -out: - mutex_unlock(&irq_mapping_update_lock); - - return irq; -} - - int xen_pirq_from_irq(unsigned irq) { return pirq_from_irq(irq); @@ -2031,13 +2008,6 @@ void xen_clear_irq_pending(int irq) event_handler_exit(info); } EXPORT_SYMBOL(xen_clear_irq_pending); -void xen_set_irq_pending(int irq) -{ - evtchn_port_t evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - set_evtchn(evtchn); -} bool xen_test_irq_pending(int irq) { diff --git a/include/xen/events.h b/include/xen/events.h index 23932b0673dc7..a129cafa80ed0 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -88,7 +88,6 @@ void xen_irq_resume(void); /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq); -void xen_set_irq_pending(int irq); bool xen_test_irq_pending(int irq); /* Poll waiting for an irq to become pending. In the usual case, the @@ -122,9 +121,6 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, /* De-allocates the above mentioned physical interrupt. */ int xen_destroy_irq(int irq); -/* Return irq from pirq */ -int xen_irq_from_pirq(unsigned pirq); - /* Return the pirq allocated to the irq. */ int xen_pirq_from_irq(unsigned irq); From b0077e269f6c152e807fdac90b58caf012cdbaab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Nov 2023 11:52:31 +0800 Subject: [PATCH 167/560] blk-mq: make sure active queue usage is held for bio_integrity_prep() blk_integrity_unregister() can come if queue usage counter isn't held for one bio with integrity prepared, so this request may be completed with calling profile->complete_fn, then kernel panic. Another constraint is that bio_integrity_prep() needs to be called before bio merge. Fix the issue by: - call bio_integrity_prep() with one queue usage counter grabbed reliably - call bio_integrity_prep() before bio merge Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()") Reported-by: Yi Zhang Cc: Christoph Hellwig Signed-off-by: Ming Lei Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 75 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e2d11183f62e3..900c1be1fee18 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2858,11 +2858,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; + return NULL; rq_qos_throttle(q, bio); @@ -2878,35 +2875,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: - blk_queue_exit(q); return NULL; } -static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) +/* return true if this @rq can be used for @bio */ +static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) { - struct request *rq; - enum hctx_type type, hctx_type; + enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); + enum hctx_type hctx_type = rq->mq_hctx->type; - if (!plug) - return NULL; - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; - return NULL; - } - - type = blk_mq_get_hctx_type((*bio)->bi_opf); - hctx_type = rq->mq_hctx->type; if (type != hctx_type && !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) - return NULL; + return false; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return false; /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2914,12 +2899,12 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, * before we throttle. */ plug->cached_rq = rq_list_next(rq); - rq_qos_throttle(q, *bio); + rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, 0); - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return rq; + return true; } static void bio_set_ioprio(struct bio *bio) @@ -2949,7 +2934,7 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq; + struct request *rq = NULL; unsigned int nr_segs = 1; blk_status_t ret; @@ -2960,20 +2945,36 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (!bio_integrity_prep(bio)) - return; - bio_set_ioprio(bio); - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); - if (!rq) { - if (!bio) + if (plug) { + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q != q) + rq = NULL; + } + if (rq) { + if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; + if (blk_mq_can_use_cached_rq(rq, plug, bio)) + goto done; + percpu_ref_get(&q->q_usage_counter); + } else { + if (unlikely(bio_queue_enter(bio))) + return; + if (!bio_integrity_prep(bio)) + goto fail; + } + + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) { +fail: + blk_queue_exit(q); + return; } +done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); From d02ef87db9d6137fc2a98231b92f24ead4f7966d Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 13 Nov 2023 16:40:29 +0000 Subject: [PATCH 168/560] ALSA: hda: cs35l56: Enable low-power hibernation mode on i2c This can now be re-enabled as the sequence to reliably wake the device has been implemented in the shared ASoC code. This has a functional dependency on commit 3df761bdbc8b ("ASoC: cs35l56: Wake transactions need to be issued twice") To protect against this, enabling hibernation is conditional on CS35L56_WAKE_HOLD_TIME_US being defined, which indicates that the new hibernation sequences are available. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://lore.kernel.org/r/20231113164029.1156669-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l56_hda_i2c.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/cs35l56_hda_i2c.c b/sound/pci/hda/cs35l56_hda_i2c.c index 757a4d193e0fb..a9ef6d86de839 100644 --- a/sound/pci/hda/cs35l56_hda_i2c.c +++ b/sound/pci/hda/cs35l56_hda_i2c.c @@ -21,6 +21,10 @@ static int cs35l56_hda_i2c_probe(struct i2c_client *clt) return -ENOMEM; cs35l56->base.dev = &clt->dev; + +#ifdef CS35L56_WAKE_HOLD_TIME_US + cs35l56->base.can_hibernate = true; +#endif cs35l56->base.regmap = devm_regmap_init_i2c(clt, &cs35l56_regmap_i2c); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); From 58c09cad1754c56cb000ef07477e8781e3fad4d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 13 Nov 2023 08:52:24 -0800 Subject: [PATCH 169/560] drm/ci: make github dependabot happy again The drm CI scripts for gitlab have a requirements file that makes the github 'dependabot' worry about a few of the required tooling versions. It wants to update the pip requirements from 23.2.1 to 23.3: "When installing a package from a Mercurial VCS URL, e.g. pip install hg+..., with pip prior to v23.3, the specified Mercurial revision could be used to inject arbitrary configuration options to the hg clone call (e.g. --config). Controlling the Mercurial configuration can modify how and which repository is installed. This vulnerability does not affect users who aren't installing from Mercurial" and upgrade the urllib3 requirements from 2.0.4 to 2.0.7 due to two issues: "urllib3's request body not stripped after redirect from 303 status changes request method to GET" "`Cookie` HTTP header isn't stripped on cross-origin redirects" The file also ends up not having a newline at the end, that my editor ends up wanting to fix automatically. Link: https://github.com/dependabot Tested-by: Helen Koike Signed-off-by: Linus Torvalds --- drivers/gpu/drm/ci/xfails/requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/ci/xfails/requirements.txt b/drivers/gpu/drm/ci/xfails/requirements.txt index d8856d1581fdb..e9994c9db799b 100644 --- a/drivers/gpu/drm/ci/xfails/requirements.txt +++ b/drivers/gpu/drm/ci/xfails/requirements.txt @@ -5,7 +5,7 @@ termcolor==2.3.0 certifi==2023.7.22 charset-normalizer==3.2.0 idna==3.4 -pip==23.2.1 +pip==23.3 python-gitlab==3.15.0 requests==2.31.0 requests-toolbelt==1.0.0 @@ -13,5 +13,5 @@ ruamel.yaml==0.17.32 ruamel.yaml.clib==0.2.7 setuptools==68.0.0 tenacity==8.2.3 -urllib3==2.0.4 -wheel==0.41.1 \ No newline at end of file +urllib3==2.0.7 +wheel==0.41.1 From 382561d16854a747e6df71034da08d20d6013dfe Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 12 Nov 2023 18:32:45 -0800 Subject: [PATCH 170/560] i2c: ocores: Move system PM hooks to the NOIRQ phase When an I2C device contains a wake IRQ subordinate to a regmap-irq chip, the regmap-irq code must be able to perform I2C transactions during suspend_device_irqs() and resume_device_irqs(). Therefore, the bus must be suspended/resumed during the NOIRQ phase. Signed-off-by: Samuel Holland Acked-by: Peter Korsgaard Reviewed-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-ocores.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 041a76f71a49c..e106af83cef4d 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -771,8 +771,8 @@ static int ocores_i2c_resume(struct device *dev) return ocores_init(dev, i2c); } -static DEFINE_SIMPLE_DEV_PM_OPS(ocores_i2c_pm, - ocores_i2c_suspend, ocores_i2c_resume); +static DEFINE_NOIRQ_DEV_PM_OPS(ocores_i2c_pm, + ocores_i2c_suspend, ocores_i2c_resume); static struct platform_driver ocores_i2c_driver = { .probe = ocores_i2c_probe, From 7a1aba89ac54ccf6cad23a91a34c0ab24b1d7997 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 13 Oct 2023 12:25:10 +0200 Subject: [PATCH 171/560] ice: dpll: fix initial lock status of dpll When dpll device is registered and dpll subsystem performs notify of a new device, the lock state value provided to dpll subsystem equals 0 which is invalid value for the `enum dpll_lock_status`. Provide correct value by obtaining it from firmware before registering the dpll device. Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Signed-off-by: Aleksandr Loktionov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 835c419ccc743..607f534055b6f 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1756,6 +1756,7 @@ ice_dpll_init_dpll(struct ice_pf *pf, struct ice_dpll *d, bool cgu, } d->pf = pf; if (cgu) { + ice_dpll_update_state(pf, d, true); ret = dpll_device_register(d->dpll, type, &ice_dpll_ops, d); if (ret) { dpll_device_put(d->dpll); @@ -1796,8 +1797,6 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct ice_dplls *d = &pf->dplls; struct kthread_worker *kworker; - ice_dpll_update_state(pf, &d->eec, true); - ice_dpll_update_state(pf, &d->pps, true); kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); kworker = kthread_create_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); From 4a4027f25dc3f39c2aafb3bf8926125c5378c9dc Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:06:54 +0100 Subject: [PATCH 172/560] ice: dpll: fix check for dpll input priority range Supported priority value for input pins may differ with regard of NIC firmware version. E810T NICs with 3.20/4.00 FW versions would accept priority range 0-31, where firmware 4.10+ would support the range 0-9 and extra value of 255. Remove the in-range check as the driver has no information on supported values from the running firmware, let firmware decide if given value is correct and return extack error if the value is not supported. Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Reviewed-by: Przemek Kitszel Reviewed-by: Jacob Keller Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 6 ------ drivers/net/ethernet/intel/ice/ice_dpll.h | 1 - 2 files changed, 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 607f534055b6f..831ba6683962e 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -815,12 +815,6 @@ ice_dpll_input_prio_set(const struct dpll_pin *pin, void *pin_priv, struct ice_pf *pf = d->pf; int ret; - if (prio > ICE_DPLL_PRIO_MAX) { - NL_SET_ERR_MSG_FMT(extack, "prio out of supported range 0-%d", - ICE_DPLL_PRIO_MAX); - return -EINVAL; - } - mutex_lock(&pf->dplls.lock); ret = ice_dpll_hw_input_prio_set(pf, d, p, prio, extack); mutex_unlock(&pf->dplls.lock); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h index bb32b6d88373e..93172e93995b9 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.h +++ b/drivers/net/ethernet/intel/ice/ice_dpll.h @@ -6,7 +6,6 @@ #include "ice.h" -#define ICE_DPLL_PRIO_MAX 0xF #define ICE_DPLL_RCLK_NUM_MAX 4 /** ice_dpll_pin - store info about pins From 6db5f2cd9ebb12e930a82c01714a6589576cd50f Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Tue, 31 Oct 2023 18:08:00 +0100 Subject: [PATCH 173/560] ice: dpll: fix output pin capabilities The dpll output pins which are used to feed clock signal of PHY and MAC circuits cannot be disconnected, those integrated circuits require clock signal for operation. By stopping assignment of DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE pin capability, prevent the user from invoking the state set callback on those pins, setting the state on those pins already returns error, as firmware doesn't allow the change of their state. Fixes: d7999f5ea64b ("ice: implement dpll interface to control cgu") Fixes: 8a3a565ff210 ("ice: add admin commands to access cgu configuration") Reviewed-by: Andrii Staikov Signed-off-by: Arkadiusz Kubalewski Tested-by: Sunitha Mekala (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dpll.c | 12 +++-- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 54 +++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_ptp_hw.h | 2 + 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 831ba6683962e..86b180cb32a02 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -1823,6 +1823,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, int num_pins, i, ret = -EINVAL; struct ice_hw *hw = &pf->hw; struct ice_dpll_pin *pins; + unsigned long caps; u8 freq_supp_num; bool input; @@ -1842,6 +1843,7 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, } for (i = 0; i < num_pins; i++) { + caps = 0; pins[i].idx = i; pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input); pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input); @@ -1854,8 +1856,8 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, &dp->input_prio[i]); if (ret) return ret; - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE; + caps |= (DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE | + DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE); pins[i].prop.phase_range.min = pf->dplls.input_phase_adj_max; pins[i].prop.phase_range.max = @@ -1865,9 +1867,11 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf, pf->dplls.output_phase_adj_max; pins[i].prop.phase_range.max = -pf->dplls.output_phase_adj_max; + ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps); + if (ret) + return ret; } - pins[i].prop.capabilities |= - DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + pins[i].prop.capabilities = caps; ret = ice_dpll_pin_state_update(pf, &pins[i], pin_type, NULL); if (ret) return ret; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index 6d573908de7a0..a00b55e14aac4 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -3961,3 +3961,57 @@ int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num) return ret; } + +/** + * ice_cgu_get_output_pin_state_caps - get output pin state capabilities + * @hw: pointer to the hw struct + * @pin_id: id of a pin + * @caps: capabilities to modify + * + * Return: + * * 0 - success, state capabilities were modified + * * negative - failure, capabilities were not modified + */ +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps) +{ + bool can_change = true; + + switch (hw->device_id) { + case ICE_DEV_ID_E810C_SFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3) + can_change = false; + break; + case ICE_DEV_ID_E810C_QSFP: + if (pin_id == ZL_OUT2 || pin_id == ZL_OUT3 || pin_id == ZL_OUT4) + can_change = false; + break; + case ICE_DEV_ID_E823L_10G_BASE_T: + case ICE_DEV_ID_E823L_1GBE: + case ICE_DEV_ID_E823L_BACKPLANE: + case ICE_DEV_ID_E823L_QSFP: + case ICE_DEV_ID_E823L_SFP: + case ICE_DEV_ID_E823C_10G_BASE_T: + case ICE_DEV_ID_E823C_BACKPLANE: + case ICE_DEV_ID_E823C_QSFP: + case ICE_DEV_ID_E823C_SFP: + case ICE_DEV_ID_E823C_SGMII: + if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL30632_80032 && + pin_id == ZL_OUT2) + can_change = false; + else if (hw->cgu_part_number == + ICE_AQC_GET_LINK_TOPO_NODE_NR_SI5383_5384 && + pin_id == SI_OUT1) + can_change = false; + break; + default: + return -EINVAL; + } + if (can_change) + *caps |= DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + else + *caps &= ~DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE; + + return 0; +} diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h index 36aeeef99ec07..cf76701566c72 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -282,6 +282,8 @@ int ice_get_cgu_state(struct ice_hw *hw, u8 dpll_idx, int ice_get_cgu_rclk_pin_info(struct ice_hw *hw, u8 *base_idx, u8 *pin_num); void ice_ptp_init_phy_model(struct ice_hw *hw); +int ice_cgu_get_output_pin_state_caps(struct ice_hw *hw, u8 pin_id, + unsigned long *caps); #define PFTSYN_SEM_BYTES 4 From a778616e4cc2d5e3a253c7d8959aafa5218fc5e4 Mon Sep 17 00:00:00 2001 From: Dan Nowlin Date: Tue, 7 Nov 2023 12:32:27 -0500 Subject: [PATCH 174/560] ice: fix DDP package download for packages without signature segment Commit 3cbdb0343022 ("ice: Add support for E830 DDP package segment") incorrectly removed support for package download for packages without a signature segment. These packages include the signature buffer inline in the configurations buffers, and not in a signature segment. Fix package download by providing download support for both packages with (ice_download_pkg_with_sig_seg()) and without signature segment (ice_download_pkg_without_sig_seg()). Fixes: 3cbdb0343022 ("ice: Add support for E830 DDP package segment") Reported-by: Maciej Fijalkowski Closes: https://lore.kernel.org/netdev/ZUT50a94kk2pMGKb@boxer/ Tested-by: Maciej Fijalkowski Reviewed-by: Wojciech Drewek Reviewed-by: Jacob Keller Signed-off-by: Dan Nowlin Signed-off-by: Paul Greenwalt Reviewed-by: Simon Horman Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ddp.c | 103 ++++++++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index cfb1580f5850b..8b7504a9df316 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1479,14 +1479,14 @@ ice_post_dwnld_pkg_actions(struct ice_hw *hw) } /** - * ice_download_pkg + * ice_download_pkg_with_sig_seg * @hw: pointer to the hardware structure * @pkg_hdr: pointer to package header * * Handles the download of a complete package. */ static enum ice_ddp_state -ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) +ice_download_pkg_with_sig_seg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) { enum ice_aq_err aq_err = hw->adminq.sq_last_status; enum ice_ddp_state state = ICE_DDP_PKG_ERR; @@ -1519,6 +1519,103 @@ ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr) state = ice_post_dwnld_pkg_actions(hw); ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_dwnld_cfg_bufs + * @hw: pointer to the hardware structure + * @bufs: pointer to an array of buffers + * @count: the number of buffers in the array + * + * Obtains global config lock and downloads the package configuration buffers + * to the firmware. + */ +static enum ice_ddp_state +ice_dwnld_cfg_bufs(struct ice_hw *hw, struct ice_buf *bufs, u32 count) +{ + enum ice_ddp_state state; + struct ice_buf_hdr *bh; + int status; + + if (!bufs || !count) + return ICE_DDP_PKG_ERR; + + /* If the first buffer's first section has its metadata bit set + * then there are no buffers to be downloaded, and the operation is + * considered a success. + */ + bh = (struct ice_buf_hdr *)bufs; + if (le32_to_cpu(bh->section_entry[0].type) & ICE_METADATA_BUF) + return ICE_DDP_PKG_SUCCESS; + + status = ice_acquire_global_cfg_lock(hw, ICE_RES_WRITE); + if (status) { + if (status == -EALREADY) + return ICE_DDP_PKG_ALREADY_LOADED; + return ice_map_aq_err_to_ddp_state(hw->adminq.sq_last_status); + } + + state = ice_dwnld_cfg_bufs_no_lock(hw, bufs, 0, count, true); + if (!state) + state = ice_post_dwnld_pkg_actions(hw); + + ice_release_global_cfg_lock(hw); + + return state; +} + +/** + * ice_download_pkg_without_sig_seg + * @hw: pointer to the hardware structure + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package without signature segment. + */ +static enum ice_ddp_state +ice_download_pkg_without_sig_seg(struct ice_hw *hw, struct ice_seg *ice_seg) +{ + struct ice_buf_table *ice_buf_tbl; + + ice_debug(hw, ICE_DBG_PKG, "Segment format version: %d.%d.%d.%d\n", + ice_seg->hdr.seg_format_ver.major, + ice_seg->hdr.seg_format_ver.minor, + ice_seg->hdr.seg_format_ver.update, + ice_seg->hdr.seg_format_ver.draft); + + ice_debug(hw, ICE_DBG_PKG, "Seg: type 0x%X, size %d, name %s\n", + le32_to_cpu(ice_seg->hdr.seg_type), + le32_to_cpu(ice_seg->hdr.seg_size), ice_seg->hdr.seg_id); + + ice_buf_tbl = ice_find_buf_table(ice_seg); + + ice_debug(hw, ICE_DBG_PKG, "Seg buf count: %d\n", + le32_to_cpu(ice_buf_tbl->buf_count)); + + return ice_dwnld_cfg_bufs(hw, ice_buf_tbl->buf_array, + le32_to_cpu(ice_buf_tbl->buf_count)); +} + +/** + * ice_download_pkg + * @hw: pointer to the hardware structure + * @pkg_hdr: pointer to package header + * @ice_seg: pointer to the segment of the package to be downloaded + * + * Handles the download of a complete package. + */ +static enum ice_ddp_state +ice_download_pkg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr, + struct ice_seg *ice_seg) +{ + enum ice_ddp_state state; + + if (hw->pkg_has_signing_seg) + state = ice_download_pkg_with_sig_seg(hw, pkg_hdr); + else + state = ice_download_pkg_without_sig_seg(hw, ice_seg); + ice_post_pkg_dwnld_vlan_mode_cfg(hw); return state; @@ -2083,7 +2180,7 @@ enum ice_ddp_state ice_init_pkg(struct ice_hw *hw, u8 *buf, u32 len) /* initialize package hints and then download package */ ice_init_pkg_hints(hw, seg); - state = ice_download_pkg(hw, pkg); + state = ice_download_pkg(hw, pkg, seg); if (state == ICE_DDP_PKG_ALREADY_LOADED) { ice_debug(hw, ICE_DBG_INIT, "package previously loaded - no work.\n"); From ff31ba19d732efb9aca3633935d71085e68d5076 Mon Sep 17 00:00:00 2001 From: Anastasia Belova Date: Mon, 13 Nov 2023 17:52:32 +0300 Subject: [PATCH 175/560] cifs: spnego: add ';' in HOST_KEY_LEN "host=" should start with ';' (as in cifs_get_spnego_key) So its length should be 6. Found by Linux Verification Center (linuxtesting.org) with SVACE. Reviewed-by: Paulo Alcantara (SUSE) Fixes: 7c9c3760b3a5 ("[CIFS] add constants for string lengths of keynames in SPNEGO upcall string") Signed-off-by: Anastasia Belova Co-developed-by: Ekaterina Esina Signed-off-by: Ekaterina Esina Signed-off-by: Steve French --- fs/smb/client/cifs_spnego.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 6f3285f1dfee5..af7849e5974ff 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* strlen of "host=" */ -#define HOST_KEY_LEN 5 +/* strlen of ";host=" */ +#define HOST_KEY_LEN 6 /* strlen of ";ip4=" or ";ip6=" */ #define IP_KEY_LEN 5 From 181724fc72486dec2bec8803459be05b5162aaa8 Mon Sep 17 00:00:00 2001 From: Ekaterina Esina Date: Mon, 13 Nov 2023 19:42:41 +0300 Subject: [PATCH 176/560] cifs: fix check of rc in function generate_smb3signingkey Remove extra check after condition, add check after generating key for encryption. The check is needed to return non zero rc before rewriting it with generating key for decryption. Found by Linux Verification Center (linuxtesting.org) with SVACE. Reviewed-by: Paulo Alcantara (SUSE) Fixes: d70e9fa55884 ("cifs: try opening channels after mounting") Signed-off-by: Ekaterina Esina Co-developed-by: Anastasia Belova Signed-off-by: Anastasia Belova Signed-off-by: Steve French --- fs/smb/client/smb2transport.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 84ea67301303c..5a3ca62d2f07f 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -458,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses, ptriplet->encryption.context, ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, @@ -466,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; } - if (rc) - return rc; - #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); /* From 48d584b7f90f48d2623fb01743b099efbf0d36c2 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 6 Nov 2023 10:34:01 +0800 Subject: [PATCH 177/560] bcachefs: make bch2_target_to_text_sb static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bch2_target_to_text_sb are not used outside the file disk_groups.c, so the modification is defined as static. fs/bcachefs/disk_groups.c:583:6: warning: no previous prototype for ‘bch2_target_to_text_sb’. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7144 Signed-off-by: Jiapeng Chong Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index d613695abf9f6..1f334124055ba 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -580,7 +580,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } } -void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) { struct target t = target_decode(v); From c4f1f80a0e8d829ce4e29ca52cb7f74b22f67454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 12:30:19 -0500 Subject: [PATCH 178/560] bcachefs: Use correct fgf_t type as function argument This quiets a sparse complaint. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io-pagecache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 8bd9bcdd27f73..ff664fd0d8ef8 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -13,7 +13,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, loff_t start, u64 end, - int fgp_flags, gfp_t gfp, + fgf_t fgp_flags, gfp_t gfp, folios *fs) { struct folio *f; diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index a2222ad586e9e..27f712ae37a68 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -7,7 +7,7 @@ typedef DARRAY(struct folio *) folios; int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, - u64, int, gfp_t, folios *); + u64, fgf_t, gfp_t, folios *); int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); /* From 1b8bc556280d3f4970407480e6a5ff49efe5601b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 16:27:02 -0600 Subject: [PATCH 179/560] bcachefs: Use DECLARE_FLEX_ARRAY() helper and fix multiple -Warray-bounds warnings Transform zero-length array `s` into a proper flexible-array member in `struct snapshot_table` via the DECLARE_FLEX_ARRAY() helper; and fix tons of the following -Warray-bounds warnings: fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.c:135:70: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] fs/bcachefs/snapshot.h:36:21: warning: array subscript is outside array bounds of 'struct snapshot_t[0]' [-Warray-bounds=] This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 86833445af205..2d2e66a4e4681 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -20,7 +20,7 @@ struct snapshot_t { }; struct snapshot_table { - struct snapshot_t s[0]; + DECLARE_FLEX_ARRAY(struct snapshot_t, s); }; typedef struct { From 274c2f8fd27158d15524abe63c3df6fb96707dd3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 6 Nov 2023 15:40:22 -0600 Subject: [PATCH 180/560] bcachefs: Fix multiple -Warray-bounds warnings Transform zero-length array `entries` into a proper flexible-array member in `struct journal_seq_blacklist_table`; and fix the following -Warray-bounds warnings: fs/bcachefs/journal_seq_blacklist.c:148:26: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:150:30: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:154:27: warning: array subscript idx is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:176:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:177:27: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:297:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:298:34: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] fs/bcachefs/journal_seq_blacklist.c:300:31: warning: array subscript i is outside array bounds of 'struct journal_seq_blacklist_table_entry[0]' [-Warray-bounds=] This results in no differences in binary output. This helps with the ongoing efforts to globally enable -Warray-bounds. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 9cb8684959ee1..403aa3389fccf 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -617,7 +617,7 @@ struct journal_seq_blacklist_table { u64 start; u64 end; bool dirty; - } entries[0]; + } entries[]; }; struct journal_keys { From 03cc1e67a243cbb2c85d5fd84f369449f94d4269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 7 Nov 2023 10:30:22 -0500 Subject: [PATCH 181/560] bcachefs: Fix null ptr deref in bch2_backpointer_get_node() bch2_btree_iter_peek_node() can return a NULL ptr (when the tree is shorter than the search depth); handle this with an early return. Signed-off-by: Kent Overstreet Reported-by: Dan Carpenter Fixes: https://lore.kernel.org/linux-bcachefs/5fc3c28b-c232-4ec7-b0ac-4ef220ddf976@moroto.mountain/T/ Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ef02c9bb03541..23c0834a97a4a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, bp.level - 1, 0); b = bch2_btree_iter_peek_node(iter); - if (IS_ERR(b)) + if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.level - 1); - if (b && extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) return b; - if (b && btree_node_will_make_reachable(b)) { + if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); From 4d6128dca6d940015fe2aa383ec1a0eeb9632f08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 11:59:05 -0500 Subject: [PATCH 182/560] bcachefs: Guard against insufficient devices to create stripes We can't create stripes if we don't have enough devices - this manifested as an integer underflow bug later. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 875f7c5a6fca6..2a77de18c004e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, int ret; h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); - if (!h) - bch_err(c, "no stripe head"); if (IS_ERR_OR_NULL(h)) return h; From 1bd5bcc9f5eef968ed021d72b14a157be7abdb49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:44:14 -0500 Subject: [PATCH 183/560] bcachefs: Split out btree_key_cache_types.h More consistent organization. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache_types.h | 30 +++++++++++++++++++++++++++++ fs/bcachefs/btree_types.h | 27 +------------------------- 2 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 fs/bcachefs/btree_key_cache_types.h diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 0000000000000..0f967808d766d --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 941841a0c5bf6..be5d6027e796c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,7 +5,7 @@ #include #include -//#include "bkey_methods.h" +#include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" #include "errcode.h" @@ -312,31 +312,6 @@ struct btree_iter { #endif }; -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - -struct btree_key_cache { - struct mutex lock; - struct rhashtable table; - bool table_init_done; - struct list_head freed_pcpu; - struct list_head freed_nonpcpu; - struct shrinker *shrink; - unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - - atomic_long_t nr_freed; - atomic_long_t nr_keys; - atomic_long_t nr_dirty; -}; - -struct bkey_cached_key { - u32 btree_id; - struct bpos pos; -} __packed __aligned(4); - #define BKEY_CACHED_ACCESSED 0 #define BKEY_CACHED_DIRTY 1 From c65c13f0eac61218c9ee4635c05661c0b9760e58 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 09:53:14 -0500 Subject: [PATCH 184/560] bcachefs: Run btree key cache shrinker less aggressively The btree key cache maintains lists of items that have been freed, but can't yet be reclaimed because a bch2_trans_relock() call might find them - we're waiting for SRCU readers to release. Previously, we wouldn't count these items against the number we're attempting to scan for, which would mean we'd evict more live key cache entries - doing quite a bit of potentially unecessary work. With recent work to make sure we don't hold SRCU locks for too long, it should be safe to count all the items on the freelists against number to scan - even if we can't reclaim them yet, we will be able to soon. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 23 +++++++++++++++++++---- fs/bcachefs/btree_key_cache_types.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b78f78a75b59..b3305a04d8086 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - if (ck->c.lock.readers) + if (ck->c.lock.readers) { list_move_tail(&ck->list, &bc->freed_pcpu); - else + bc->nr_freed_pcpu++; + } else { list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, { struct bkey_cached *pos; + bc->nr_freed_nonpcpu++; + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, pos->btree_trans_barrier_seq)) { @@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, #else mutex_lock(&bc->lock); list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; mutex_unlock(&bc->lock); #endif } else { @@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, f->nr < ARRAY_SIZE(f->objs) / 2) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; f->objs[f->nr++] = ck; } @@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (!list_empty(&bc->freed_nonpcpu)) { ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; } mutex_unlock(&bc->lock); #endif @@ -850,6 +858,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ + scanned += bc->nr_freed_nonpcpu; + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -859,13 +869,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_nonpcpu--; } if (scanned >= nr) goto out; + scanned += bc->nr_freed_pcpu; + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) @@ -875,8 +887,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); - scanned++; freed++; + bc->nr_freed_pcpu--; } if (scanned >= nr) @@ -982,6 +994,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } #endif + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); list_splice(&bc->freed_nonpcpu, &items); diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 0f967808d766d..290e4e57df5bb 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -11,8 +11,12 @@ struct btree_key_cache { struct mutex lock; struct rhashtable table; bool table_init_done; + struct list_head freed_pcpu; + size_t nr_freed_pcpu; struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + struct shrinker *shrink; unsigned shrink_iter; struct btree_key_cache_freelist __percpu *pcpu_freed; From 3b8c4507779691984e31e64e0b80abb03cc02d0d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 6 Nov 2023 19:49:47 -0500 Subject: [PATCH 185/560] bcachefs: btree_trans->write_locked As prep work for the next patch to fix a key cache reclaim issue, we need to start tracking whether we're currently holding write locks - so that we can release and retake the before calling into memory reclaim. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 85 ++++++++++++++++++-------------- fs/bcachefs/btree_types.h | 1 + 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index decad7b66c59c..02491f7bb8314 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, bch2_btree_init_next(trans, b); } +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ @@ -732,37 +779,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -{ - while (--i >= trans->updates) { - if (same_leaf_as_prev(trans, i)) - continue; - - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); - } - - trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -} - -static inline int trans_lock_write(struct btree_trans *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) { - if (same_leaf_as_prev(trans, i)) - continue; - - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) - return trans_lock_write_fail(trans, i); - - if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); - } - - return 0; -} - static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -838,7 +854,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (unlikely(ret)) return ret; - ret = trans_lock_write(trans); + ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -847,10 +863,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_trans_unlock_write(trans); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index be5d6027e796c..f3669fa685916 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -409,6 +409,7 @@ struct btree_trans { bool journal_transaction_names:1; bool journal_replay_not_finished:1; bool notrace_relock_fail:1; + bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; unsigned long last_begin_ip; From 09b0283ee23a02094a43a9b93146d1060c58fc3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 5 Nov 2023 15:28:44 -0500 Subject: [PATCH 186/560] bcachefs: Make sure to drop/retake btree locks before reclaim We really don't want to be invoking memory reclaim with btree locks held: even aside from (solvable, but tricky) recursion issues, it can cause painful to diagnose performance edge cases. This fixes a recently reported issue in btree_key_can_insert_cached(). Signed-off-by: Kent Overstreet Reported-by: Mateusz Guzik Fixes: https://lore.kernel.org/linux-bcachefs/CAGudoHEsb_hGRMeWeXh+UF6po0qQuuq_NKSEo+s1sEb6bDLjpA@mail.gmail.com/T/ --- fs/bcachefs/btree_trans_commit.c | 48 ++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 02491f7bb8314..55a120eb8692b 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -368,6 +368,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans, return 0; } +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, struct btree_path *path, unsigned u64s) { @@ -394,12 +433,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags return 0; new_u64s = roundup_pow_of_two(u64s); - new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) { - bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; - } + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); trans_for_each_update(trans, i) if (i->old_v == &ck->k->v) From 701ff57eb3d7c86c9a53de959e0c48fa8ca446d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 3 Nov 2023 18:38:35 -0400 Subject: [PATCH 187/560] bcachefs: Check for nonce offset inconsistency in data_update path We've rarely been seeing a nonce offset inconsistency that doesn't show up in tests: this adds some extra verification code to the data update path that prints out more relevant info when it occurs. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0771a6d880bf5..5ed66202c2265 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -239,6 +239,34 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, next_pos = insert->k.p; + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, From 4b3812d90b2c93723adf4b6ce99240d301f7d5f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 13 Nov 2023 20:49:39 -0800 Subject: [PATCH 188/560] Revert "ptp: Fixes a null pointer dereference in ptp_ioctl" This reverts commit 8a4f030dbced6fc255cbe67b2d0a129947e18493. Richard says: The test itself is harmless, but keeping it will make people think, "oh this pointer can be invalid." In fact the core stack ensures that ioctl() can't be invoked after release(), otherwise Bad Stuff happens. Fixes: 8a4f030dbced ("ptp: Fixes a null pointer dereference in ptp_ioctl") Link: https://lore.kernel.org/all/ZVAf_qdRfDAQYUt-@hoboy.vegasvil.org/ Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index e95a6ed130ef4..3f7a747888024 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -176,8 +176,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, int enable, err = 0; tsevq = pccontext->private_clkdata; - if (!tsevq) - return -EINVAL; switch (cmd) { From 73bde5a3294853947252cd9092a3517c7cb0cd2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 17:48:59 +0000 Subject: [PATCH 189/560] ptp: annotate data-race around q->head and q->tail As I was working on a syzbot report, I found that KCSAN would probably complain that reading q->head or q->tail without barriers could lead to invalid results. Add corresponding READ_ONCE() and WRITE_ONCE() to avoid load-store tearing. Fixes: d94ba80ebbea ("ptp: Added a brand new class driver for ptp clocks.") Signed-off-by: Eric Dumazet Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20231109174859.3995880-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_chardev.c | 3 ++- drivers/ptp/ptp_clock.c | 5 +++-- drivers/ptp/ptp_private.h | 8 ++++++-- drivers/ptp/ptp_sysfs.c | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 3f7a747888024..7513018c9f9ac 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -572,7 +572,8 @@ ssize_t ptp_read(struct posix_clock_context *pccontext, uint rdflags, for (i = 0; i < cnt; i++) { event[i] = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3134568af622d..15b804ba48685 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -57,10 +57,11 @@ static void enqueue_external_timestamp(struct timestamp_event_queue *queue, dst->t.sec = seconds; dst->t.nsec = remainder; + /* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */ if (!queue_free(queue)) - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); - queue->tail = (queue->tail + 1) % PTP_MAX_TIMESTAMPS; + WRITE_ONCE(queue->tail, (queue->tail + 1) % PTP_MAX_TIMESTAMPS); spin_unlock_irqrestore(&queue->lock, flags); } diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 35fde0a057460..45f9002a5dcae 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -85,9 +85,13 @@ struct ptp_vclock { * that a writer might concurrently increment the tail does not * matter, since the queue remains nonempty nonetheless. */ -static inline int queue_cnt(struct timestamp_event_queue *q) +static inline int queue_cnt(const struct timestamp_event_queue *q) { - int cnt = q->tail - q->head; + /* + * Paired with WRITE_ONCE() in enqueue_external_timestamp(), + * ptp_read(), extts_fifo_show(). + */ + int cnt = READ_ONCE(q->tail) - READ_ONCE(q->head); return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt; } diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index 7d023d9d0acbf..f7a499a1bd39e 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -94,7 +94,8 @@ static ssize_t extts_fifo_show(struct device *dev, qcnt = queue_cnt(queue); if (qcnt) { event = queue->buf[queue->head]; - queue->head = (queue->head + 1) % PTP_MAX_TIMESTAMPS; + /* Paired with READ_ONCE() in queue_cnt() */ + WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); } spin_unlock_irqrestore(&queue->lock, flags); From 3cffa2ddc4d3fcf70cde361236f5a614f81a09b2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Nov 2023 18:01:02 +0000 Subject: [PATCH 190/560] bonding: stop the device in bond_setup_by_slave() Commit 9eed321cde22 ("net: lapbether: only support ethernet devices") has been able to keep syzbot away from net/lapb, until today. In the following splat [1], the issue is that a lapbether device has been created on a bonding device without members. Then adding a non ARPHRD_ETHER member forced the bonding master to change its type. The fix is to make sure we call dev_close() in bond_setup_by_slave() so that the potential linked lapbether devices (or any other devices having assumptions on the physical device) are removed. A similar bug has been addressed in commit 40baec225765 ("bonding: fix panic on non-ARPHRD_ETHER enslave failure") [1] skbuff: skb_under_panic: text:ffff800089508810 len:44 put:40 head:ffff0000c78e7c00 data:ffff0000c78e7bea tail:0x16 end:0x140 dev:bond0 kernel BUG at net/core/skbuff.c:192 ! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 6007 Comm: syz-executor383 Not tainted 6.6.0-rc3-syzkaller-gbf6547d8715b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/04/2023 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : skb_panic net/core/skbuff.c:188 [inline] pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 lr : skb_panic net/core/skbuff.c:188 [inline] lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 sp : ffff800096a06aa0 x29: ffff800096a06ab0 x28: ffff800096a06ba0 x27: dfff800000000000 x26: ffff0000ce9b9b50 x25: 0000000000000016 x24: ffff0000c78e7bea x23: ffff0000c78e7c00 x22: 000000000000002c x21: 0000000000000140 x20: 0000000000000028 x19: ffff800089508810 x18: ffff800096a06100 x17: 0000000000000000 x16: ffff80008a629a3c x15: 0000000000000001 x14: 1fffe00036837a32 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000201 x10: 0000000000000000 x9 : cb50b496c519aa00 x8 : cb50b496c519aa00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff800096a063b8 x4 : ffff80008e280f80 x3 : ffff8000805ad11c x2 : 0000000000000001 x1 : 0000000100000201 x0 : 0000000000000086 Call trace: skb_panic net/core/skbuff.c:188 [inline] skb_under_panic+0x13c/0x140 net/core/skbuff.c:202 skb_push+0xf0/0x108 net/core/skbuff.c:2446 ip6gre_header+0xbc/0x738 net/ipv6/ip6_gre.c:1384 dev_hard_header include/linux/netdevice.h:3136 [inline] lapbeth_data_transmit+0x1c4/0x298 drivers/net/wan/lapbether.c:257 lapb_data_transmit+0x8c/0xb0 net/lapb/lapb_iface.c:447 lapb_transmit_buffer+0x178/0x204 net/lapb/lapb_out.c:149 lapb_send_control+0x220/0x320 net/lapb/lapb_subr.c:251 __lapb_disconnect_request+0x9c/0x17c net/lapb/lapb_iface.c:326 lapb_device_event+0x288/0x4e0 net/lapb/lapb_iface.c:492 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 lapbeth_device_event+0x2e4/0x958 drivers/net/wan/lapbether.c:466 notifier_call_chain+0x1a4/0x510 kernel/notifier.c:93 raw_notifier_call_chain+0x3c/0x50 kernel/notifier.c:461 call_netdevice_notifiers_info net/core/dev.c:1970 [inline] call_netdevice_notifiers_extack net/core/dev.c:2008 [inline] call_netdevice_notifiers net/core/dev.c:2022 [inline] __dev_close_many+0x1b8/0x3c4 net/core/dev.c:1508 dev_close_many+0x1e0/0x470 net/core/dev.c:1559 dev_close+0x174/0x250 net/core/dev.c:1585 bond_enslave+0x2298/0x30cc drivers/net/bonding/bond_main.c:2332 bond_do_ioctl+0x268/0xc64 drivers/net/bonding/bond_main.c:4539 dev_ifsioc+0x754/0x9ac dev_ioctl+0x4d8/0xd34 net/core/dev_ioctl.c:786 sock_do_ioctl+0x1d4/0x2d0 net/socket.c:1217 sock_ioctl+0x4e8/0x834 net/socket.c:1322 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __arm64_sys_ioctl+0x14c/0x1c8 fs/ioctl.c:857 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 el0_svc+0x58/0x16c arch/arm64/kernel/entry-common.c:678 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Code: aa1803e6 aa1903e7 a90023f5 94785b8b (d4210000) Fixes: 872254dd6b1f ("net/bonding: Enable bonding to enslave non ARPHRD_ETHER") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Jay Vosburgh Reviewed-by: Hangbin Liu Link: https://lore.kernel.org/r/20231109180102.4085183-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 51d47eda1c873..8e6cc0e133b7f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1500,6 +1500,10 @@ static void bond_compute_features(struct bonding *bond) static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { + bool was_up = !!(bond_dev->flags & IFF_UP); + + dev_close(bond_dev); + bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; @@ -1514,6 +1518,8 @@ static void bond_setup_by_slave(struct net_device *bond_dev, bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } + if (was_up) + dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress From 510e35fb931ffc3b100e5d5ae4595cd3beca9f1a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:12 +0100 Subject: [PATCH 191/560] net: ethernet: cortina: Fix max RX frame define Enumerator 3 is 1548 bytes according to the datasheet. Not 1542. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-1-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 4 ++-- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 5423fe26b4ef0..562b0917316af 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -432,8 +432,8 @@ static const struct gmac_max_framelen gmac_maxlens[] = { .val = CONFIG0_MAXLEN_1536, }, { - .max_l3_len = 1542, - .val = CONFIG0_MAXLEN_1542, + .max_l3_len = 1548, + .val = CONFIG0_MAXLEN_1548, }, { .max_l3_len = 9212, diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 9fdf77d5eb374..99efb11557436 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -787,7 +787,7 @@ union gmac_config0 { #define CONFIG0_MAXLEN_1536 0 #define CONFIG0_MAXLEN_1518 1 #define CONFIG0_MAXLEN_1522 2 -#define CONFIG0_MAXLEN_1542 3 +#define CONFIG0_MAXLEN_1548 3 #define CONFIG0_MAXLEN_9k 4 /* 9212 */ #define CONFIG0_MAXLEN_10k 5 /* 10236 */ #define CONFIG0_MAXLEN_1518__6 6 From d4d0c5b4d279bfe3585fbd806efefd3e51c82afa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:13 +0100 Subject: [PATCH 192/560] net: ethernet: cortina: Handle large frames The Gemini ethernet controller provides hardware checksumming for frames up to 1514 bytes including ethernet headers but not FCS. If we start sending bigger frames (after first bumping up the MTU on both interfaces sending and receiving the frames), truncated packets start to appear on the target such as in this tcpdump resulting from ping -s 1474: 23:34:17.241983 14:d6:4d:a8:3c:4f (oui Unknown) > bc:ae:c5:6b:a8:3d (oui Unknown), ethertype IPv4 (0x0800), length 1514: truncated-ip - 2 bytes missing! (tos 0x0, ttl 64, id 32653, offset 0, flags [DF], proto ICMP (1), length 1502) OpenWrt.lan > Fecusia: ICMP echo request, id 1672, seq 50, length 1482 If we bypass the hardware checksumming and provide a software fallback, everything starts working fine up to the max TX MTU of 2047 bytes, for example ping -s2000 192.168.1.2: 00:44:29.587598 bc:ae:c5:6b:a8:3d (oui Unknown) > 14:d6:4d:a8:3c:4f (oui Unknown), ethertype IPv4 (0x0800), length 2042: (tos 0x0, ttl 64, id 51828, offset 0, flags [none], proto ICMP (1), length 2028) Fecusia > OpenWrt.lan: ICMP echo reply, id 1683, seq 4, length 2008 The bit enabling to bypass hardware checksum (or any of the "TSS" bits) are undocumented in the hardware reference manual. The entire hardware checksum unit appears undocumented. The conclusion that we need to use the "bypass" bit was found by trial-and-error. Since no hardware checksum will happen, we slot in a software checksum fallback. Check for the condition where we need to compute checksum on the skb with either hardware or software using == CHECKSUM_PARTIAL instead of != CHECKSUM_NONE which is an incomplete check according to . On the D-Link DIR-685 router this fixes a bug on the conduit interface to the RTL8366RB DSA switch: as the switch needs to add space for its tag it increases the MTU on the conduit interface to 1504 and that means that when the router sends packages of 1500 bytes these get an extra 4 bytes of DSA tag and the transfer fails because of the erroneous hardware checksumming, affecting such basic functionality as the LuCI web interface. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-2-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 562b0917316af..e5e73dee13ac8 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1145,6 +1145,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, dma_addr_t mapping; unsigned short mtu; void *buffer; + int ret; mtu = ETH_HLEN; mtu += netdev->mtu; @@ -1159,9 +1160,30 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, word3 |= mtu; } - if (skb->ip_summed != CHECKSUM_NONE) { + if (skb->len >= ETH_FRAME_LEN) { + /* Hardware offloaded checksumming isn't working on frames + * bigger than 1514 bytes. A hypothesis about this is that the + * checksum buffer is only 1518 bytes, so when the frames get + * bigger they get truncated, or the last few bytes get + * overwritten by the FCS. + * + * Just use software checksumming and bypass on bigger frames. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + ret = skb_checksum_help(skb); + if (ret) + return ret; + } + word1 |= TSS_BYPASS_BIT; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int tcp = 0; + /* We do not switch off the checksumming on non TCP/UDP + * frames: as is shown from tests, the checksumming engine + * is smart enough to see that a frame is not actually TCP + * or UDP and then just pass it through without any changes + * to the frame. + */ if (skb->protocol == htons(ETH_P_IP)) { word1 |= TSS_IP_CHKSUM_BIT; tcp = ip_hdr(skb)->protocol == IPPROTO_TCP; From dc6c0bfbaa947dd7976e30e8c29b10c868b6fa42 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 9 Nov 2023 10:03:14 +0100 Subject: [PATCH 193/560] net: ethernet: cortina: Fix MTU max setting The RX max frame size is over 10000 for the Gemini ethernet, but the TX max frame size is actually just 2047 (0x7ff after checking the datasheet). Reflect this in what we offer to Linux, cap the MTU at the TX max frame minus ethernet headers. We delete the code disabling the hardware checksum for large MTUs as netdev->mtu can no longer be larger than netdev->max_mtu meaning the if()-clause in gmac_fix_features() is never true. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Reviewed-by: Andrew Lunn Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20231109-gemini-largeframe-fix-v4-3-6e611528db08@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 17 ++++------------- drivers/net/ethernet/cortina/gemini.h | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index e5e73dee13ac8..78287cfcbf638 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -2000,15 +2000,6 @@ static int gmac_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static netdev_features_t gmac_fix_features(struct net_device *netdev, - netdev_features_t features) -{ - if (netdev->mtu + ETH_HLEN + VLAN_HLEN > MTU_SIZE_BIT_MASK) - features &= ~GMAC_OFFLOAD_FEATURES; - - return features; -} - static int gmac_set_features(struct net_device *netdev, netdev_features_t features) { @@ -2234,7 +2225,6 @@ static const struct net_device_ops gmac_351x_ops = { .ndo_set_mac_address = gmac_set_mac_address, .ndo_get_stats64 = gmac_get_stats64, .ndo_change_mtu = gmac_change_mtu, - .ndo_fix_features = gmac_fix_features, .ndo_set_features = gmac_set_features, }; @@ -2486,11 +2476,12 @@ static int gemini_ethernet_port_probe(struct platform_device *pdev) netdev->hw_features = GMAC_OFFLOAD_FEATURES; netdev->features |= GMAC_OFFLOAD_FEATURES | NETIF_F_GRO; - /* We can handle jumbo frames up to 10236 bytes so, let's accept - * payloads of 10236 bytes minus VLAN and ethernet header + /* We can receive jumbo frames up to 10236 bytes but only + * transmit 2047 bytes so, let's accept payloads of 2047 + * bytes minus VLAN and ethernet header */ netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = 10236 - VLAN_ETH_HLEN; + netdev->max_mtu = MTU_SIZE_BIT_MASK - VLAN_ETH_HLEN; port->freeq_refill = 0; netif_napi_add(netdev, &port->napi, gmac_napi_poll); diff --git a/drivers/net/ethernet/cortina/gemini.h b/drivers/net/ethernet/cortina/gemini.h index 99efb11557436..24bb989981f23 100644 --- a/drivers/net/ethernet/cortina/gemini.h +++ b/drivers/net/ethernet/cortina/gemini.h @@ -502,7 +502,7 @@ union gmac_txdesc_3 { #define SOF_BIT 0x80000000 #define EOF_BIT 0x40000000 #define EOFIE_BIT BIT(29) -#define MTU_SIZE_BIT_MASK 0x1fff +#define MTU_SIZE_BIT_MASK 0x7ff /* Max MTU 2047 bytes */ /* GMAC Tx Descriptor */ struct gmac_txdesc { From 0ab0c45d8aaea5192328bfa6989673aceafc767c Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:33:59 +0800 Subject: [PATCH 194/560] r8169: add handling DASH when DASH is disabled For devices that support DASH, even DASH is disabled, there may still exist a default firmware that will influence device behavior. So driver needs to handle DASH for devices that support DASH, no matter the DASH status is. This patch also prepares for "fix network lost after resume on DASH systems". Fixes: ee7a1beb9759 ("r8169:call "rtl8168_driver_start" "rtl8168_driver_stop" only when hardware dash function is enabled") Cc: stable@vger.kernel.org Signed-off-by: ChunHao Lin Reviewed-by: Heiner Kallweit Link: https://lore.kernel.org/r/20231109173400.4573-2-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 36 ++++++++++++++++------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0c76c162b8a9f..cfcb40d909200 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -624,6 +624,7 @@ struct rtl8169_private { unsigned supports_gmii:1; unsigned aspm_manageable:1; + unsigned dash_enabled:1; dma_addr_t counters_phys_addr; struct rtl8169_counters *counters; struct rtl8169_tc_offsets tc_offset; @@ -1253,14 +1254,26 @@ static bool r8168ep_check_dash(struct rtl8169_private *tp) return r8168ep_ocp_read(tp, 0x128) & BIT(0); } -static enum rtl_dash_type rtl_check_dash(struct rtl8169_private *tp) +static bool rtl_dash_is_enabled(struct rtl8169_private *tp) +{ + switch (tp->dash_type) { + case RTL_DASH_DP: + return r8168dp_check_dash(tp); + case RTL_DASH_EP: + return r8168ep_check_dash(tp); + default: + return false; + } +} + +static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: - return r8168dp_check_dash(tp) ? RTL_DASH_DP : RTL_DASH_NONE; + return RTL_DASH_DP; case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53: - return r8168ep_check_dash(tp) ? RTL_DASH_EP : RTL_DASH_NONE; + return RTL_DASH_EP; default: return RTL_DASH_NONE; } @@ -1453,7 +1466,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) device_set_wakeup_enable(tp_to_dev(tp), wolopts); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, !wolopts); tp->dev->wol_enabled = wolopts ? 1 : 0; } @@ -2512,7 +2525,7 @@ static void rtl_wol_enable_rx(struct rtl8169_private *tp) static void rtl_prepare_power_down(struct rtl8169_private *tp) { - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return; if (tp->mac_version == RTL_GIGA_MAC_VER_32 || @@ -4869,7 +4882,7 @@ static int rtl8169_runtime_idle(struct device *device) { struct rtl8169_private *tp = dev_get_drvdata(device); - if (tp->dash_type != RTL_DASH_NONE) + if (tp->dash_enabled) return -EBUSY; if (!netif_running(tp->dev) || !netif_carrier_ok(tp->dev)) @@ -4895,8 +4908,7 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - if (system_state == SYSTEM_POWER_OFF && - tp->dash_type == RTL_DASH_NONE) { + if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) { pci_wake_from_d3(pdev, tp->saved_wolopts); pci_set_power_state(pdev, PCI_D3hot); } @@ -5254,7 +5266,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1); tp->aspm_manageable = !rc; - tp->dash_type = rtl_check_dash(tp); + tp->dash_type = rtl_get_dash_type(tp); + tp->dash_enabled = rtl_dash_is_enabled(tp); tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; @@ -5325,7 +5338,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* configure chip for default features */ rtl8169_set_features(dev, dev->features); - if (tp->dash_type == RTL_DASH_NONE) { + if (!tp->dash_enabled) { rtl_set_d3_pll_down(tp, true); } else { rtl_set_d3_pll_down(tp, false); @@ -5365,7 +5378,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) "ok" : "ko"); if (tp->dash_type != RTL_DASH_NONE) { - netdev_info(dev, "DASH enabled\n"); + netdev_info(dev, "DASH %s\n", + tp->dash_enabled ? "enabled" : "disabled"); rtl8168_driver_start(tp); } From 868c3b95afef4883bfb66c9397482da6840b5baf Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Fri, 10 Nov 2023 01:34:00 +0800 Subject: [PATCH 195/560] r8169: fix network lost after resume on DASH systems Device that support DASH may be reseted or powered off during suspend. So driver needs to handle DASH during system suspend and resume. Or DASH firmware will influence device behavior and causes network lost. Fixes: b646d90053f8 ("r8169: magic.") Cc: stable@vger.kernel.org Reviewed-by: Heiner Kallweit Signed-off-by: ChunHao Lin Link: https://lore.kernel.org/r/20231109173400.4573-3-hau@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index cfcb40d909200..b9bb1d2f0237e 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4661,10 +4661,16 @@ static void rtl8169_down(struct rtl8169_private *tp) rtl8169_cleanup(tp); rtl_disable_exit_l1(tp); rtl_prepare_power_down(tp); + + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_stop(tp); } static void rtl8169_up(struct rtl8169_private *tp) { + if (tp->dash_type != RTL_DASH_NONE) + rtl8168_driver_start(tp); + pci_set_master(tp->pci_dev); phy_init_hw(tp->phydev); phy_resume(tp->phydev); From b28060db7172e6d8912d88b369123eb89e0d36b4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 11:12:49 +0200 Subject: [PATCH 196/560] ovl: fix misformatted comment Remove misleading /** prefix from a regular comment. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311121628.byHp8tkv-lkp@intel.com/ Signed-off-by: Amir Goldstein --- fs/overlayfs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 50a201e9cd398..c3f020ca13a8c 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -978,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, return 0; } -/** +/* * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. */ From 37f32f52643869131ec01bb69bdf9f404f6109fb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 12 Nov 2023 10:11:25 +0200 Subject: [PATCH 197/560] ovl: fix memory leak in ovl_parse_param() On failure to parse parameters in ovl_parse_param_lowerdir(), it is necessary to update ctx->nr with the correct nr before using ovl_reset_lowerdirs() to release l->name. Reported-and-tested-by: syzbot+26eedf3631650972f17c@syzkaller.appspotmail.com Fixes: c835110b588a ("ovl: remove unused code in lowerdir param parsing") Co-authored-by: Edward Adam Davis Signed-off-by: Amir Goldstein --- fs/overlayfs/params.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index ddab9ea267d12..3fe2dde1598f9 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -430,7 +430,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; char *dup = NULL, *iter; - ssize_t nr_lower = 0, nr = 0, nr_data = 0; + ssize_t nr_lower, nr; bool data_layer = false; /* @@ -482,6 +482,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) iter = dup; l = ctx->lower; for (nr = 0; nr < nr_lower; nr++, l++) { + ctx->nr++; memset(l, 0, sizeof(*l)); err = ovl_mount_dir(iter, &l->path); @@ -498,10 +499,10 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_put; if (data_layer) - nr_data++; + ctx->nr_data++; /* Calling strchr() again would overrun. */ - if ((nr + 1) == nr_lower) + if (ctx->nr == nr_lower) break; err = -EINVAL; @@ -511,7 +512,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * This is a regular layer so we require that * there are no data layers. */ - if ((ctx->nr_data + nr_data) > 0) { + if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); goto out_put; } @@ -524,8 +525,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) data_layer = true; iter++; } - ctx->nr = nr_lower; - ctx->nr_data += nr_data; kfree(dup); return 0; From 686464514fbebb6c8de4415238319e414c3500a4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:58:05 +0200 Subject: [PATCH 198/560] xen/events: reduce externally visible helper functions get_evtchn_to_irq() has only one external user while irq_from_evtchn() provides the same functionality and is exported for a wider user base. Modify the only external user of get_evtchn_to_irq() to use irq_from_evtchn() instead and make get_evtchn_to_irq() static. evtchn_from_irq() and irq_from_virq() have a single external user and can easily be combined to a new helper irq_evtchn_from_virq() allowing to drop irq_from_virq() and to make evtchn_from_irq() static. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_2l.c | 8 ++++---- drivers/xen/events/events_base.c | 13 +++++++++---- drivers/xen/events/events_internal.h | 1 - include/xen/events.h | 4 ++-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index b8f2f971c2f0f..e3585330cf98b 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -171,11 +171,11 @@ static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) int i; struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + evtchn_port_t evtchn; /* Timer interrupt has highest priority. */ - irq = irq_from_virq(cpu, VIRQ_TIMER); + irq = irq_evtchn_from_virq(cpu, VIRQ_TIMER, &evtchn); if (irq != -1) { - evtchn_port_t evtchn = evtchn_from_irq(irq); word_idx = evtchn / BITS_PER_LONG; bit_idx = evtchn % BITS_PER_LONG; if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) @@ -328,9 +328,9 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", + printk(" %d: event %d -> irq %u%s%s%s\n", cpu_from_evtchn(i), i, - get_evtchn_to_irq(i), + irq_from_evtchn(i), sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) ? "" : " l2-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c5d86128eb733..a74ea28dcc3d6 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,7 +248,7 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -int get_evtchn_to_irq(evtchn_port_t evtchn) +static int get_evtchn_to_irq(evtchn_port_t evtchn) { if (evtchn >= xen_evtchn_max_channels()) return -1; @@ -415,7 +415,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) /* * Accessors for packed IRQ information. */ -evtchn_port_t evtchn_from_irq(unsigned irq) +static evtchn_port_t evtchn_from_irq(unsigned int irq) { const struct irq_info *info = NULL; @@ -433,9 +433,14 @@ unsigned int irq_from_evtchn(evtchn_port_t evtchn) } EXPORT_SYMBOL_GPL(irq_from_evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq) +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn) { - return per_cpu(virq_to_irq, cpu)[virq]; + int irq = per_cpu(virq_to_irq, cpu)[virq]; + + *evtchn = evtchn_from_irq(irq); + + return irq; } static enum ipi_vector ipi_from_irq(unsigned irq) diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 4d3398eff9cdf..19ae31695edcf 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -33,7 +33,6 @@ struct evtchn_ops { extern const struct evtchn_ops *evtchn_ops; -int get_evtchn_to_irq(evtchn_port_t evtchn); void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl); unsigned int cpu_from_evtchn(evtchn_port_t evtchn); diff --git a/include/xen/events.h b/include/xen/events.h index a129cafa80ed0..3b07409f80320 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -100,8 +100,8 @@ void xen_poll_irq_timeout(int irq, u64 timeout); /* Determine the IRQ which is bound to an event channel */ unsigned int irq_from_evtchn(evtchn_port_t evtchn); -int irq_from_virq(unsigned int cpu, unsigned int virq); -evtchn_port_t evtchn_from_irq(unsigned irq); +int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, + evtchn_port_t *evtchn); int xen_set_callback_via(uint64_t via); int xen_evtchn_do_upcall(void); From 3bdb0ac350fe5e6301562143e4573971dd01ae0b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 08:24:46 +0200 Subject: [PATCH 199/560] xen/events: remove some simple helpers from events_base.c The helper functions type_from_irq() and cpu_from_irq() are just one line functions used only internally. Open code them where needed. At the same time modify and rename get_evtchn_to_irq() to return a struct irq_info instead of the IRQ number. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 97 +++++++++++++------------------- 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a74ea28dcc3d6..a810e8904fbf7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -248,15 +248,6 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) return 0; } -static int get_evtchn_to_irq(evtchn_port_t evtchn) -{ - if (evtchn >= xen_evtchn_max_channels()) - return -1; - if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) - return -1; - return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); -} - /* Get info for IRQ */ static struct irq_info *info_for_irq(unsigned irq) { @@ -274,6 +265,19 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info) irq_set_chip_data(irq, info); } +static struct irq_info *evtchn_to_info(evtchn_port_t evtchn) +{ + int irq; + + if (evtchn >= xen_evtchn_max_channels()) + return NULL; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return NULL; + irq = READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); + + return (irq < 0) ? NULL : info_for_irq(irq); +} + /* Per CPU channel accounting */ static void channels_on_cpu_dec(struct irq_info *info) { @@ -429,7 +433,9 @@ static evtchn_port_t evtchn_from_irq(unsigned int irq) unsigned int irq_from_evtchn(evtchn_port_t evtchn) { - return get_evtchn_to_irq(evtchn); + struct irq_info *info = evtchn_to_info(evtchn); + + return info ? info->irq : -1; } EXPORT_SYMBOL_GPL(irq_from_evtchn); @@ -473,25 +479,11 @@ static unsigned pirq_from_irq(unsigned irq) return info->u.pirq.pirq; } -static enum xen_irq_type type_from_irq(unsigned irq) -{ - return info_for_irq(irq)->type; -} - -static unsigned cpu_from_irq(unsigned irq) -{ - return info_for_irq(irq)->cpu; -} - unsigned int cpu_from_evtchn(evtchn_port_t evtchn) { - int irq = get_evtchn_to_irq(evtchn); - unsigned ret = 0; - - if (irq != -1) - ret = cpu_from_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - return ret; + return info ? info->cpu : 0; } static void do_mask(struct irq_info *info, u8 reason) @@ -540,13 +532,12 @@ static bool pirq_needs_eoi_flag(unsigned irq) static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, bool force_affinity) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); - BUG_ON(irq == -1); + BUG_ON(info == NULL); if (IS_ENABLED(CONFIG_SMP) && force_affinity) { - struct irq_data *data = irq_get_irq_data(irq); + struct irq_data *data = irq_get_irq_data(info->irq); irq_data_update_affinity(data, cpumask_of(cpu)); irq_data_update_effective_affinity(data, cpumask_of(cpu)); @@ -979,13 +970,13 @@ static void __unbind_from_irq(unsigned int irq) } if (VALID_EVTCHN(evtchn)) { - unsigned int cpu = cpu_from_irq(irq); + unsigned int cpu = info->cpu; struct xenbus_device *dev; if (!info->is_static) xen_evtchn_close(evtchn); - switch (type_from_irq(irq)) { + switch (info->type) { case IRQT_VIRQ: per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; break; @@ -1185,15 +1176,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, { int irq; int ret; + struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) return -ENOMEM; mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); + info = evtchn_to_info(evtchn); - if (irq == -1) { + if (!info) { irq = xen_allocate_irq_dynamic(); if (irq < 0) goto out; @@ -1216,9 +1208,9 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, */ bind_evtchn_to_cpu(evtchn, 0, false); } else { - struct irq_info *info = info_for_irq(irq); - if (!WARN_ON(!info || info->type != IRQT_EVTCHN)) + if (!WARN_ON(info->type != IRQT_EVTCHN)) info->refcnt++; + irq = info->irq; } out: @@ -1556,13 +1548,7 @@ EXPORT_SYMBOL_GPL(xen_set_irq_priority); int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) { - int irq = get_evtchn_to_irq(evtchn); - struct irq_info *info; - - if (irq == -1) - return -ENOENT; - - info = info_for_irq(irq); + struct irq_info *info = evtchn_to_info(evtchn); if (!info) return -ENOENT; @@ -1578,7 +1564,6 @@ EXPORT_SYMBOL_GPL(evtchn_make_refcounted); int evtchn_get(evtchn_port_t evtchn) { - int irq; struct irq_info *info; int err = -ENOENT; @@ -1587,11 +1572,7 @@ int evtchn_get(evtchn_port_t evtchn) mutex_lock(&irq_mapping_update_lock); - irq = get_evtchn_to_irq(evtchn); - if (irq == -1) - goto done; - - info = info_for_irq(irq); + info = evtchn_to_info(evtchn); if (!info) goto done; @@ -1611,10 +1592,11 @@ EXPORT_SYMBOL_GPL(evtchn_get); void evtchn_put(evtchn_port_t evtchn) { - int irq = get_evtchn_to_irq(evtchn); - if (WARN_ON(irq == -1)) + struct irq_info *info = evtchn_to_info(evtchn); + + if (WARN_ON(!info)) return; - unbind_from_irq(irq); + unbind_from_irq(info->irq); } EXPORT_SYMBOL_GPL(evtchn_put); @@ -1644,12 +1626,10 @@ struct evtchn_loop_ctrl { void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) { - int irq; - struct irq_info *info; + struct irq_info *info = evtchn_to_info(port); struct xenbus_device *dev; - irq = get_evtchn_to_irq(port); - if (irq == -1) + if (!info) return; /* @@ -1674,7 +1654,6 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) } } - info = info_for_irq(irq); if (xchg_acquire(&info->is_active, 1)) return; @@ -1688,7 +1667,7 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) info->eoi_time = get_jiffies_64() + event_eoi_delay; } - generic_handle_irq(irq); + generic_handle_irq(info->irq); } int xen_evtchn_do_upcall(void) @@ -1746,7 +1725,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ - BUG_ON(get_evtchn_to_irq(evtchn) != -1); + BUG_ON(evtchn_to_info(evtchn)); /* Expect irq to have been bound before, so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); From 4b7b492615cf3017190f55444f7016812b66611d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Nov 2023 13:49:38 +0000 Subject: [PATCH 200/560] af_unix: fix use-after-free in unix_stream_read_actor() syzbot reported the following crash [1] After releasing unix socket lock, u->oob_skb can be changed by another thread. We must temporarily increase skb refcount to make sure this other thread will not free the skb under us. [1] BUG: KASAN: slab-use-after-free in unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 Read of size 4 at addr ffff88801f3b9cc4 by task syz-executor107/5297 CPU: 1 PID: 5297 Comm: syz-executor107 Not tainted 6.6.0-syzkaller-15910-gb8e3a87a627b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 unix_stream_read_actor+0xa7/0xc0 net/unix/af_unix.c:2866 unix_stream_recv_urg net/unix/af_unix.c:2587 [inline] unix_stream_read_generic+0x19a5/0x2480 net/unix/af_unix.c:2666 unix_stream_recvmsg+0x189/0x1b0 net/unix/af_unix.c:2903 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0xe2/0x170 net/socket.c:1066 ____sys_recvmsg+0x21f/0x5c0 net/socket.c:2803 ___sys_recvmsg+0x115/0x1a0 net/socket.c:2845 __sys_recvmsg+0x114/0x1e0 net/socket.c:2875 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fc67492c559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fc6748ab228 EFLAGS: 00000246 ORIG_RAX: 000000000000002f RAX: ffffffffffffffda RBX: 000000000000001c RCX: 00007fc67492c559 RDX: 0000000040010083 RSI: 0000000020000140 RDI: 0000000000000004 RBP: 00007fc6749b6348 R08: 00007fc6748ab6c0 R09: 00007fc6748ab6c0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fc6749b6340 R13: 00007fc6749b634c R14: 00007ffe9fac52a0 R15: 00007ffe9fac5388 Allocated by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 __kasan_slab_alloc+0x81/0x90 mm/kasan/common.c:328 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x180/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 5295: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] kmem_cache_free+0xf8/0x340 mm/slub.c:3831 kfree_skbmem+0xef/0x1b0 net/core/skbuff.c:1015 __kfree_skb net/core/skbuff.c:1073 [inline] consume_skb net/core/skbuff.c:1288 [inline] consume_skb+0xdf/0x170 net/core/skbuff.c:1282 queue_oob net/unix/af_unix.c:2178 [inline] unix_stream_sendmsg+0xd49/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b The buggy address belongs to the object at ffff88801f3b9c80 which belongs to the cache skbuff_head_cache of size 240 The buggy address is located 68 bytes inside of freed 240-byte region [ffff88801f3b9c80, ffff88801f3b9d70) The buggy address belongs to the physical page: page:ffffea00007cee40 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1f3b9 flags: 0xfff00000000800(slab|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000800 ffff888142a60640 dead000000000122 0000000000000000 raw: 0000000000000000 00000000000c000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 5299, tgid 5283 (syz-executor107), ts 103803840339, free_ts 103600093431 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x2cf/0x340 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1544 [inline] get_page_from_freelist+0xa25/0x36c0 mm/page_alloc.c:3312 __alloc_pages+0x1d0/0x4a0 mm/page_alloc.c:4568 alloc_pages_mpol+0x258/0x5f0 mm/mempolicy.c:2133 alloc_slab_page mm/slub.c:1870 [inline] allocate_slab+0x251/0x380 mm/slub.c:2017 new_slab mm/slub.c:2070 [inline] ___slab_alloc+0x8c7/0x1580 mm/slub.c:3223 __slab_alloc.constprop.0+0x56/0xa0 mm/slub.c:3322 __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] kmem_cache_alloc_node+0x132/0x3c0 mm/slub.c:3523 __alloc_skb+0x287/0x330 net/core/skbuff.c:641 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] queue_oob net/unix/af_unix.c:2147 [inline] unix_stream_sendmsg+0xb5f/0x10a0 net/unix/af_unix.c:2301 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2584 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2638 __sys_sendmsg+0x117/0x1e0 net/socket.c:2667 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1137 [inline] free_unref_page_prepare+0x4f8/0xa90 mm/page_alloc.c:2347 free_unref_page+0x33/0x3b0 mm/page_alloc.c:2487 __unfreeze_partials+0x21d/0x240 mm/slub.c:2655 qlink_free mm/kasan/quarantine.c:168 [inline] qlist_free_all+0x6a/0x170 mm/kasan/quarantine.c:187 kasan_quarantine_reduce+0x18e/0x1d0 mm/kasan/quarantine.c:294 __kasan_slab_alloc+0x65/0x90 mm/kasan/common.c:305 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc+0x15d/0x380 mm/slub.c:3502 vm_area_dup+0x21/0x2f0 kernel/fork.c:500 __split_vma+0x17d/0x1070 mm/mmap.c:2365 split_vma mm/mmap.c:2437 [inline] vma_modify+0x25d/0x450 mm/mmap.c:2472 vma_modify_flags include/linux/mm.h:3271 [inline] mprotect_fixup+0x228/0xc80 mm/mprotect.c:635 do_mprotect_pkey+0x852/0xd60 mm/mprotect.c:809 __do_sys_mprotect mm/mprotect.c:830 [inline] __se_sys_mprotect mm/mprotect.c:827 [inline] __x64_sys_mprotect+0x78/0xb0 mm/mprotect.c:827 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x3f/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Memory state around the buggy address: ffff88801f3b9b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88801f3b9c00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc >ffff88801f3b9c80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88801f3b9d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc ffff88801f3b9d80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb Fixes: 876c14ad014d ("af_unix: fix holding spinlock in oob handling") Reported-and-tested-by: syzbot+7a2d546fa43e49315ed3@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Rao Shoaib Reviewed-by: Rao shoaib Link: https://lore.kernel.org/r/20231113134938.168151-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 45506a95b25f8..a357dc5f24046 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2581,15 +2581,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) WRITE_ONCE(u->oob_skb, NULL); - + else + skb_get(oob_skb); unix_state_unlock(sk); chunk = state->recv_actor(oob_skb, 0, chunk, state); - if (!(state->flags & MSG_PEEK)) { + if (!(state->flags & MSG_PEEK)) UNIXCB(oob_skb).consumed += 1; - kfree_skb(oob_skb); - } + + consume_skb(oob_skb); mutex_unlock(&u->iolock); From 67059b61597c004e23b074547b40604222bee3a0 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 1 Nov 2023 09:33:51 +0800 Subject: [PATCH 201/560] netfilter: nft_set_rbtree: Remove unused variable nft_net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code that uses nft_net has been removed, and the nft_pernet function is merely obtaining a reference to shared data through the net pointer. The content of the net pointer is not modified or changed, so both of them should be removed. silence the warning: net/netfilter/nft_set_rbtree.c:627:26: warning: variable ‘nft_net’ set but not used Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7103 Signed-off-by: Yang Li Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 6f1186abd47b4..baa3fea4fe65c 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -624,14 +624,12 @@ static void nft_rbtree_gc(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; - struct nftables_pernet *nft_net; struct rb_node *node, *next; struct nft_trans_gc *gc; struct net *net; set = nft_set_container_of(priv); net = read_pnet(&set->net); - nft_net = nft_pernet(net); gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); if (!gc) From a44af08e3d4d7566eeea98d7a29fe06e7b9de944 Mon Sep 17 00:00:00 2001 From: Linkui Xiao Date: Wed, 1 Nov 2023 11:20:18 +0800 Subject: [PATCH 202/560] netfilter: nf_conntrack_bridge: initialize err to 0 K2CI reported a problem: consume_skb(skb); return err; [nf_br_ip_fragment() error] uninitialized symbol 'err'. err is not initialized, because returning 0 is expected, initialize err to 0. Fixes: 3c171f496ef5 ("netfilter: bridge: add connection tracking system") Reported-by: k2ci Signed-off-by: Linkui Xiao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nf_conntrack_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index b5c406a6e7654..abb090f94ed26 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -37,7 +37,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ktime_t tstamp = skb->tstamp; struct ip_frag_state state; struct iphdr *iph; - int err; + int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && From c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Nov 2023 09:42:51 +0300 Subject: [PATCH 203/560] netfilter: nf_tables: fix pointer math issue in nft_byteorder_eval() The problem is in nft_byteorder_eval() where we are iterating through a loop and writing to dst[0], dst[1], dst[2] and so on... On each iteration we are writing 8 bytes. But dst[] is an array of u32 so each element only has space for 4 bytes. That means that every iteration overwrites part of the previous element. I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter: nf_tables: prevent OOB access in nft_byteorder_eval") which is a related issue. I think that the reason we have not detected this bug in testing is that most of time we only write one element. Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nft_byteorder.c | 5 +++-- net/netfilter/nft_meta.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3bbd13ab1ecf5..b157c5cafd14c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -178,9 +178,9 @@ static inline __be32 nft_reg_load_be32(const u32 *sreg) return *(__force __be32 *)sreg; } -static inline void nft_reg_store64(u32 *dreg, u64 val) +static inline void nft_reg_store64(u64 *dreg, u64 val) { - put_unaligned(val, (u64 *)dreg); + put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e596d1a842f70..f6e791a681015 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -38,13 +38,14 @@ void nft_byteorder_eval(const struct nft_expr *expr, switch (priv->size) { case 8: { + u64 *dst64 = (void *)dst; u64 src64; switch (priv->op) { case NFT_BYTEORDER_NTOH: for (i = 0; i < priv->len / 8; i++) { src64 = nft_reg_load64(&src[i]); - nft_reg_store64(&dst[i], + nft_reg_store64(&dst64[i], be64_to_cpu((__force __be64)src64)); } break; @@ -52,7 +53,7 @@ void nft_byteorder_eval(const struct nft_expr *expr, for (i = 0; i < priv->len / 8; i++) { src64 = (__force __u64) cpu_to_be64(nft_reg_load64(&src[i])); - nft_reg_store64(&dst[i], src64); + nft_reg_store64(&dst64[i], src64); } break; } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index f7da7c43333b5..ba0d3683a45d3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key, { switch (key) { case NFT_META_TIME_NS: - nft_reg_store64(dest, ktime_get_real_ns()); + nft_reg_store64((u64 *)dest, ktime_get_real_ns()); break; case NFT_META_TIME_DAY: nft_reg_store8(dest, nft_meta_weekday()); From a7d5a955bfa854ac6b0c53aaf933394b4e6139e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 20:34:56 +0100 Subject: [PATCH 204/560] netfilter: nf_tables: bogus ENOENT when destroying element which does not exist destroy element command bogusly reports ENOENT in case a set element does not exist. ENOENT errors are skipped, however, err is still set and propagated to userspace. # nft destroy element ip raw BLACKLIST { 1.2.3.4 } Error: Could not process rule: No such file or directory destroy element ip raw BLACKLIST { 1.2.3.4 } ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Fixes: f80a612dd77c ("netfilter: nf_tables: add support to destroy operation") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a761ee6796f6f..debea1c677016 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7263,10 +7263,11 @@ static int nf_tables_delsetelem(struct sk_buff *skb, if (err < 0) { NL_SET_BAD_ATTR(extack, attr); - break; + return err; } } - return err; + + return 0; } /* From 28628fa952fefc7f2072ce6e8016968cc452b1ba Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 13 Nov 2023 21:13:23 +0100 Subject: [PATCH 205/560] netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to force ip_set_swap() to wait for all readers to finish accessing the old set pointers by calling synchronize_rcu(). The first version of the patch was written by Linkui Xiao . v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. v3: Florian Westphal pointed out that all netfilter hooks run with rcu_read_lock() held and em_ipset.c wraps the entire ip_set_test() in rcu read lock/unlock pair. So there's no need to extend the rcu read locked area in ipset itself. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.org/ Reported by: Linkui Xiao Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 35d2f9c9ada02..4c133e06be1de 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -61,6 +61,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); ip_set_dereference((inst)->ip_set_list)[id] #define ip_set_ref_netlink(inst,id) \ rcu_dereference_raw((inst)->ip_set_list)[id] +#define ip_set_dereference_nfnl(p) \ + rcu_dereference_check(p, lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* The set types are implemented in modules and registered set types * can be found in ip_set_type_list. Adding/deleting types is @@ -708,15 +710,10 @@ __ip_set_put_netlink(struct ip_set *set) static struct ip_set * ip_set_rcu_get(struct net *net, ip_set_id_t index) { - struct ip_set *set; struct ip_set_net *inst = ip_set_pernet(net); - rcu_read_lock(); - /* ip_set_list itself needs to be protected */ - set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); - - return set; + /* ip_set_list and the set pointer need to be protected */ + return ip_set_dereference_nfnl(inst->ip_set_list)[index]; } static inline void @@ -1397,6 +1394,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } From 8837ba3e58ea1e3d09ae36db80b1e80853aada95 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Nov 2023 21:13:31 +0100 Subject: [PATCH 206/560] netfilter: nf_tables: split async and sync catchall in two functions list_for_each_entry_safe() does not work for the async case which runs under RCU, therefore, split GC logic for catchall in two functions instead, one for each of the sync and async GC variants. The catchall sync GC variant never sees a _DEAD bit set on ever, thus, this handling is removed in such case, moreover, allocate GC sync batch via GFP_KERNEL. Fixes: 93995bf4af2c ("netfilter: nf_tables: remove catchall element in GC sync path") Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 55 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index debea1c677016..c0a42989b9822 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9680,16 +9680,14 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) call_rcu(&trans->rcu, nft_trans_gc_trans_free); } -static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, - unsigned int gc_seq, - bool sync) +struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, + unsigned int gc_seq) { - struct nft_set_elem_catchall *catchall, *next; + struct nft_set_elem_catchall *catchall; const struct nft_set *set = gc->set; - struct nft_elem_priv *elem_priv; struct nft_set_ext *ext; - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_expired(ext)) @@ -9699,35 +9697,42 @@ static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, nft_set_elem_dead(ext); dead_elem: - if (sync) - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - else - gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); - + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); if (!gc) return NULL; - elem_priv = catchall->elem; - if (sync) { - nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); - nft_setelem_catchall_destroy(catchall); - } - - nft_trans_gc_elem_add(gc, elem_priv); + nft_trans_gc_elem_add(gc, catchall->elem); } return gc; } -struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, - unsigned int gc_seq) -{ - return nft_trans_gc_catchall(gc, gc_seq, false); -} - struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) { - return nft_trans_gc_catchall(gc, 0, true); + struct nft_set_elem_catchall *catchall, *next; + const struct nft_set *set = gc->set; + struct nft_elem_priv *elem_priv; + struct nft_set_ext *ext; + + WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)); + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return NULL; + + elem_priv = catchall->elem; + nft_setelem_data_deactivate(gc->net, gc->set, elem_priv); + nft_setelem_catchall_destroy(catchall); + nft_trans_gc_elem_add(gc, elem_priv); + } + + return gc; } static void nf_tables_module_autoload_cleanup(struct net *net) From b944aa9d86d5f782bfe5e51336434c960304839c Mon Sep 17 00:00:00 2001 From: Matus Malych Date: Tue, 14 Nov 2023 14:35:25 +0100 Subject: [PATCH 207/560] ALSA: hda/realtek: Enable Mute LED on HP 255 G10 HP 255 G10 has a mute LED that can be made to work using quirk ALC236_FIXUP_HP_MUTE_LED_COEFBIT2. Enable already existing quirk - at correct line to keep order Signed-off-by: Matus Malych Cc: Link: https://lore.kernel.org/r/20231114133524.11340-1-matus@malych.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cdd808e02b44d..3c85b8247c110 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9868,6 +9868,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b2f, "HP 255 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8b42, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b43, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b44, "HP", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From 2e6ef8aaba6b709ce91164401fa1c12668510360 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 14 Nov 2023 07:57:53 -0600 Subject: [PATCH 208/560] Remove myself as maintainer of GFS2 I am retiring from Red Hat and will no longer be a maintainer of the gfs2 file system. Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..5c9f868e13b6e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8950,7 +8950,6 @@ S: Maintained F: scripts/get_maintainer.pl GFS2 FILE SYSTEM -M: Bob Peterson M: Andreas Gruenbacher L: gfs2@lists.linux.dev S: Supported From 782ce431613cf08c3a00dca42ad925c3b1108d09 Mon Sep 17 00:00:00 2001 From: Konstantin Runov Date: Mon, 30 Oct 2023 12:45:08 +0300 Subject: [PATCH 209/560] gcc-plugins: latent_entropy: Fix typo (args -> argc) in plugin description Fix the typo in the plugin description comment. Clearly, "argc" should be used. Signed-off-by: Konstantin Runov Link: https://lore.kernel.org/r/20231030094508.245432-1-runebone1@gmail.com Signed-off-by: Kees Cook --- scripts/gcc-plugins/latent_entropy_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index 39e86be60dd2d..ff0b192be91ff 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -17,7 +17,7 @@ * if (argc <= 1) * printf("%s: no command arguments :(\n", *argv); * else - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * after: @@ -47,7 +47,7 @@ * // perturb_local_entropy() * } else { * local_entropy ^= 3896280633962944730; - * printf("%s: %d command arguments!\n", *argv, args - 1); + * printf("%s: %d command arguments!\n", *argv, argc - 1); * } * * // latent_entropy_execute() 4. From 29954d5b1e0d67a4cd61c30c2201030c97e94b1e Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:54:12 +0000 Subject: [PATCH 210/560] cifs: fix leak of iface for primary channel My last change in this area introduced a change which accounted for primary channel in the interface ref count. However, it did not reduce this ref count on deallocation of the primary channel. i.e. during umount. Fixing this leak here, by dropping this ref count for primary channel while freeing up the session. Fixes: fa1d0508bdd4 ("cifs: account for primary channel in the interface list") Cc: stable@vger.kernel.org Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 57c2a7df34578..f896f60c924bf 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2065,6 +2065,12 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) ses->chans[i].server = NULL; } + /* we now account for primary channel in iface->refcount */ + if (ses->chans[0].iface) { + kref_put(&ses->chans[0].iface->refcount, release_iface); + ses->chans[0].server = NULL; + } + sesInfoFree(ses); cifs_put_tcp_session(server, 0); } From 5eef12c4e3230f2025dc46ad8c4a3bc19978e5d7 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 14 Nov 2023 04:58:23 +0000 Subject: [PATCH 211/560] cifs: fix lock ordering while disabling multichannel The code to handle the case of server disabling multichannel was picking iface_lock with chan_lock held. This goes against the lock ordering rules, as iface_lock is a higher order lock (even if it isn't so obvious). This change fixes the lock ordering by doing the following in that order for each secondary channel: 1. store iface and server pointers in local variable 2. remove references to iface and server in channels 3. unlock chan_lock 4. lock iface_lock 5. dec ref count for iface 6. unlock iface_lock 7. dec ref count for server 8. lock chan_lock again Since this function can only be called in smb2_reconnect, and that cannot be called by two parallel processes, we should not have races due to dropping chan_lock between steps 3 and 8. Fixes: ee1d21794e55 ("cifs: handle when server stops supporting multichannel") Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/sess.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0bb2ac9290617..8b2d7c1ca4284 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -322,28 +322,32 @@ cifs_disable_secondary_channels(struct cifs_ses *ses) iface = ses->chans[i].iface; server = ses->chans[i].server; + /* + * remove these references first, since we need to unlock + * the chan_lock here, since iface_lock is a higher lock + */ + ses->chans[i].iface = NULL; + ses->chans[i].server = NULL; + spin_unlock(&ses->chan_lock); + if (iface) { spin_lock(&ses->iface_lock); kref_put(&iface->refcount, release_iface); - ses->chans[i].iface = NULL; iface->num_channels--; if (iface->weight_fulfilled) iface->weight_fulfilled--; spin_unlock(&ses->iface_lock); } - spin_unlock(&ses->chan_lock); - if (server && !server->terminate) { - server->terminate = true; - cifs_signal_cifsd_for_reconnect(server, false); - } - spin_lock(&ses->chan_lock); - if (server) { - ses->chans[i].server = NULL; + if (!server->terminate) { + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + } cifs_put_tcp_session(server, false); } + spin_lock(&ses->chan_lock); } done: From eab03c23c2a162085b13200d7942fc5a00b5ccc8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Tue, 7 Nov 2023 17:05:07 +0800 Subject: [PATCH 212/560] sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com --- kernel/sched/fair.c | 151 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 128 insertions(+), 23 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b5..025d90925bf63 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3666,41 +3666,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* + * VRUNTIME + * ======== + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. + * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. + * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + if (avruntime != se->vruntime) { + vlag = (s64)(avruntime - se->vruntime); + vlag = div_s64(vlag * old_weight, weight); + se->vruntime = avruntime - vlag; + } + + /* + * DEADLINE + * ======== + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + vslice = (s64)(se->deadline - avruntime); + vslice = div_s64(vslice * old_weight, weight); + se->deadline = avruntime + vslice; +} + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - unsigned long old_weight = se->load.weight; + bool curr = cfs_rq->curr == se; if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) + if (curr) update_curr(cfs_rq); else - avg_vruntime_sub(cfs_rq, se); + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - update_load_set(&se->load, weight); - if (!se->on_rq) { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. */ - se->vlag = div_s64(se->vlag * old_weight, weight); + se->vlag = div_s64(se->vlag * se->load.weight, weight); } else { - s64 deadline = se->deadline - se->vruntime; - /* - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - */ - deadline = div_s64(deadline * old_weight, weight); - se->deadline = se->vruntime + deadline; - if (se != cfs_rq->curr) - min_deadline_cb_propagate(&se->run_node, NULL); + reweight_eevdf(cfs_rq, se, weight); } + update_load_set(&se->load, weight); + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (cfs_rq->curr != se) - avg_vruntime_add(cfs_rq, se); + if (!curr) { + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + __enqueue_entity(cfs_rq, se); + update_min_vruntime(cfs_rq); + } } } @@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ From 8b39d20eceeda6c4eb23df1497f9ed2fffdc8f69 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 26 Oct 2023 12:41:14 -0400 Subject: [PATCH 213/560] sched: psi: fix unprivileged polling against cgroups 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") breaks unprivileged psi polling on cgroups. Historically, we had a privilege check for polling in the open() of a pressure file in /proc, but were erroneously missing it for the open() of cgroup pressure files. When unprivileged polling was introduced in d82caa273565 ("sched/psi: Allow unprivileged polling of N*2s period"), it needed to filter privileges depending on the exact polling parameters, and as such moved the CAP_SYS_RESOURCE check from the proc open() callback to psi_trigger_create(). Both the proc files as well as cgroup files go through this during write(). This implicitly added the missing check for privileges required for HT polling for cgroups. When 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") followed right after to remove further restrictions on the RT polling window, it incorrectly assumed the cgroup privilege check was still missing and added it to the cgroup open(), mirroring what we used to do for proc files in the past. As a result, unprivileged poll requests that would be supported now get rejected when opening the cgroup pressure file for writing. Remove the cgroup open() check. psi_trigger_create() handles it. Fixes: 519fabc7aaba ("psi: remove 500ms min window size limitation for triggers") Reported-by: Luca Boccassi Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Luca Boccassi Acked-by: Suren Baghdasaryan Cc: stable@vger.kernel.org # 6.5+ Link: https://lore.kernel.org/r/20231026164114.2488682-1-hannes@cmpxchg.org --- kernel/cgroup/cgroup.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1d5b9de3b1b9d..4b9ff41ca603a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } -static int cgroup_pressure_open(struct kernfs_open_file *of) -{ - if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - return 0; -} - static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; @@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), - .open = cgroup_pressure_open, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), - .open = cgroup_pressure_open, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), - .open = cgroup_pressure_open, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), - .open = cgroup_pressure_open, .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, From 6d7e4782bcf549221b4ccfffec2cf4d1a473f1a3 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Tue, 31 Oct 2023 14:38:22 +0100 Subject: [PATCH 214/560] sched/fair: Fix the decision for load balance should_we_balance is called for the decision to do load-balancing. When sched ticks invoke this function, only one CPU should return true. However, in the current code, two CPUs can return true. The following situation, where b means busy and i means idle, is an example, because CPU 0 and CPU 2 return true. [0, 1] [2, 3] b b i b This fix checks if there exists an idle CPU with busy sibling(s) after looking for a CPU on an idle core. If some idle CPUs with busy siblings are found, just the first one should do load-balancing. Fixes: b1bfeab9b002 ("sched/fair: Consider the idle state of the whole core for load balance") Signed-off-by: Keisuke Nishimura Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Shrikanth Hegde Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231031133821.1570861-1-keisuke.nishimura@inria.fr --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 025d90925bf63..d7a3c63a2171a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11184,12 +11184,16 @@ static int should_we_balance(struct lb_env *env) continue; } - /* Are we the first idle CPU? */ + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } - if (idle_smt == env->dst_cpu) - return true; + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; From 09f12bf9f790052710bd6e48a1fc1bc4d9e17389 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Nov 2023 18:18:31 +0300 Subject: [PATCH 215/560] nouveau/gsp/r535: uninitialized variable in r535_gsp_acpi_mux_id() The if we hit the "continue" statement on the first iteration through the loop then "handle_mux" needs to be set to NULL so we continue looping. Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/1d864f6e-43e9-43d8-9d90-30e76c9c843b@moroto.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index e31f9641114b7..afa8e7377a76d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1159,7 +1159,7 @@ static void r535_gsp_acpi_mux_id(acpi_handle handle, u32 id, MUX_METHOD_DATA_ELEMENT *mode, MUX_METHOD_DATA_ELEMENT *part) { - acpi_handle iter = NULL, handle_mux; + acpi_handle iter = NULL, handle_mux = NULL; acpi_status status; unsigned long long value; From 42bd415bd8bd43721d423930b4695c565661e687 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Nov 2023 10:40:21 +0300 Subject: [PATCH 216/560] nouveau/gsp/r535: Fix a NULL vs error pointer bug The r535_gsp_cmdq_get() function returns error pointers but this code checks for NULL. Also we need to propagate the error pointer back to the callers in r535_gsp_rpc_get(). Returning NULL will lead to a NULL pointer dereference. Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Dan Carpenter Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/f71996d9-d1cb-45ea-a4b2-2dfc21312d8c@kili.mountain --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index afa8e7377a76d..dc44f5c7833f6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -689,8 +689,8 @@ r535_gsp_rpc_get(struct nvkm_gsp *gsp, u32 fn, u32 argc) struct nvfw_gsp_rpc *rpc; rpc = r535_gsp_cmdq_get(gsp, ALIGN(sizeof(*rpc) + argc, sizeof(u64))); - if (!rpc) - return NULL; + if (IS_ERR(rpc)) + return ERR_CAST(rpc); rpc->header_version = 0x03000000; rpc->signature = ('C' << 24) | ('P' << 16) | ('R' << 8) | 'V'; From a2e36cd56041e277d7d81d35638fd8d9731e21f5 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 7 Nov 2023 15:32:55 +1000 Subject: [PATCH 217/560] nouveau: use an rwlock for the event lock. This allows it to break the following circular locking dependency. Aug 10 07:01:29 dg1test kernel: ====================================================== Aug 10 07:01:29 dg1test kernel: WARNING: possible circular locking dependency detected Aug 10 07:01:29 dg1test kernel: 6.4.0-rc7+ #10 Not tainted Aug 10 07:01:29 dg1test kernel: ------------------------------------------------------ Aug 10 07:01:29 dg1test kernel: wireplumber/2236 is trying to acquire lock: Aug 10 07:01:29 dg1test kernel: ffff8fca5320da18 (&fctx->lock){-...}-{2:2}, at: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: but task is already holding lock: Aug 10 07:01:29 dg1test kernel: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: which lock already depends on the new lock. Aug 10 07:01:29 dg1test kernel: the existing dependency chain (in reverse order) is: Aug 10 07:01:29 dg1test kernel: -> #3 (&event->list_lock#2){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: -> #2 (&device->intr.lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_inth_allow+0x2c/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x181/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #1 (&event->refs_lock#4){....}-{2:2}: Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_state+0x37/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy_allow+0x63/0xd0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_uevent_mthd+0x4d/0x70 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_ioctl+0x10b/0x250 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_object_mthd+0xa8/0x1f0 [nouveau] Aug 10 07:01:29 dg1test kernel: nvif_event_allow+0x2a/0xa0 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_enable_signaling+0x78/0x80 [nouveau] Aug 10 07:01:29 dg1test kernel: __dma_fence_enable_signaling+0x5e/0x100 Aug 10 07:01:29 dg1test kernel: dma_fence_add_callback+0x4b/0xd0 Aug 10 07:01:29 dg1test kernel: nouveau_cli_work_queue+0xae/0x110 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_gem_object_close+0x1d1/0x2a0 [nouveau] Aug 10 07:01:29 dg1test kernel: drm_gem_handle_delete+0x70/0xe0 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl_kernel+0xa5/0x150 [drm] Aug 10 07:01:29 dg1test kernel: drm_ioctl+0x256/0x490 [drm] Aug 10 07:01:29 dg1test kernel: nouveau_drm_ioctl+0x5a/0xb0 [nouveau] Aug 10 07:01:29 dg1test kernel: __x64_sys_ioctl+0x91/0xd0 Aug 10 07:01:29 dg1test kernel: do_syscall_64+0x3c/0x90 Aug 10 07:01:29 dg1test kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc Aug 10 07:01:29 dg1test kernel: -> #0 (&fctx->lock){-...}-{2:2}: Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: other info that might help us debug this: Aug 10 07:01:29 dg1test kernel: Chain exists of: &fctx->lock --> &device->intr.lock --> &event->list_lock#2 Aug 10 07:01:29 dg1test kernel: Possible unsafe locking scenario: Aug 10 07:01:29 dg1test kernel: CPU0 CPU1 Aug 10 07:01:29 dg1test kernel: ---- ---- Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&device->intr.lock); Aug 10 07:01:29 dg1test kernel: lock(&event->list_lock#2); Aug 10 07:01:29 dg1test kernel: lock(&fctx->lock); Aug 10 07:01:29 dg1test kernel: *** DEADLOCK *** Aug 10 07:01:29 dg1test kernel: 2 locks held by wireplumber/2236: Aug 10 07:01:29 dg1test kernel: #0: ffff8fca53177bf8 (&device->intr.lock){-...}-{2:2}, at: nvkm_intr+0x29/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: #1: ffff8fca41208610 (&event->list_lock#2){-...}-{2:2}, at: nvkm_event_ntfy+0x50/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: stack backtrace: Aug 10 07:01:29 dg1test kernel: CPU: 6 PID: 2236 Comm: wireplumber Not tainted 6.4.0-rc7+ #10 Aug 10 07:01:29 dg1test kernel: Hardware name: Gigabyte Technology Co., Ltd. Z390 I AORUS PRO WIFI/Z390 I AORUS PRO WIFI-CF, BIOS F8 11/05/2021 Aug 10 07:01:29 dg1test kernel: Call Trace: Aug 10 07:01:29 dg1test kernel: Aug 10 07:01:29 dg1test kernel: dump_stack_lvl+0x5b/0x90 Aug 10 07:01:29 dg1test kernel: check_noncircular+0xe2/0x110 Aug 10 07:01:29 dg1test kernel: __lock_acquire+0x14e3/0x2240 Aug 10 07:01:29 dg1test kernel: lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: ? nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: ? lock_acquire+0xc8/0x2a0 Aug 10 07:01:29 dg1test kernel: _raw_spin_lock_irqsave+0x4b/0x70 Aug 10 07:01:29 dg1test kernel: ? nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nouveau_fence_wait_uevent_handler+0x2b/0x100 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_client_event+0xf/0x20 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_event_ntfy+0x9b/0xf0 [nouveau] Aug 10 07:01:29 dg1test kernel: ga100_fifo_nonstall_intr+0x24/0x30 [nouveau] Aug 10 07:01:29 dg1test kernel: nvkm_intr+0x12c/0x240 [nouveau] Aug 10 07:01:29 dg1test kernel: __handle_irq_event_percpu+0x88/0x240 Aug 10 07:01:29 dg1test kernel: handle_irq_event+0x38/0x80 Aug 10 07:01:29 dg1test kernel: handle_edge_irq+0xa3/0x240 Aug 10 07:01:29 dg1test kernel: __common_interrupt+0x72/0x160 Aug 10 07:01:29 dg1test kernel: common_interrupt+0x60/0xe0 Aug 10 07:01:29 dg1test kernel: asm_common_interrupt+0x26/0x40 Aug 10 07:01:29 dg1test kernel: RIP: 0033:0x7fb66174d700 Aug 10 07:01:29 dg1test kernel: Code: c1 e2 05 29 ca 8d 0c 10 0f be 07 84 c0 75 eb 89 c8 c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa e9 d7 0f fc ff 0f 1f 80 00 00 00 00 0f 1e fa e9 c7 0f fc> Aug 10 07:01:29 dg1test kernel: RSP: 002b:00007ffdd3c48438 EFLAGS: 00000206 Aug 10 07:01:29 dg1test kernel: RAX: 000055bb758763c0 RBX: 000055bb758752c0 RCX: 00000000000028b0 Aug 10 07:01:29 dg1test kernel: RDX: 000055bb758752c0 RSI: 000055bb75887490 RDI: 000055bb75862950 Aug 10 07:01:29 dg1test kernel: RBP: 00007ffdd3c48490 R08: 000055bb75873b10 R09: 0000000000000001 Aug 10 07:01:29 dg1test kernel: R10: 0000000000000004 R11: 000055bb7587f000 R12: 000055bb75887490 Aug 10 07:01:29 dg1test kernel: R13: 000055bb757f6280 R14: 000055bb758875c0 R15: 000055bb757f6280 Aug 10 07:01:29 dg1test kernel: Signed-off-by: Dave Airlie Tested-by: Danilo Krummrich Reviewed-by: Danilo Krummrich Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231107053255.2257079-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/include/nvkm/core/event.h | 4 ++-- drivers/gpu/drm/nouveau/nvkm/core/event.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h index 82b267c111470..460459af272d6 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/event.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/event.h @@ -14,7 +14,7 @@ struct nvkm_event { int index_nr; spinlock_t refs_lock; - spinlock_t list_lock; + rwlock_t list_lock; int *refs; struct list_head ntfy; @@ -38,7 +38,7 @@ nvkm_event_init(const struct nvkm_event_func *func, struct nvkm_subdev *subdev, int types_nr, int index_nr, struct nvkm_event *event) { spin_lock_init(&event->refs_lock); - spin_lock_init(&event->list_lock); + rwlock_init(&event->list_lock); return __nvkm_event_init(func, subdev, types_nr, index_nr, event); } diff --git a/drivers/gpu/drm/nouveau/nvkm/core/event.c b/drivers/gpu/drm/nouveau/nvkm/core/event.c index a6c877135598f..61fed7792e415 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/event.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/event.c @@ -81,17 +81,17 @@ nvkm_event_ntfy_state(struct nvkm_event_ntfy *ntfy) static void nvkm_event_ntfy_remove(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_del_init(&ntfy->head); - spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void nvkm_event_ntfy_insert(struct nvkm_event_ntfy *ntfy) { - spin_lock_irq(&ntfy->event->list_lock); + write_lock_irq(&ntfy->event->list_lock); list_add_tail(&ntfy->head, &ntfy->event->ntfy); - spin_unlock_irq(&ntfy->event->list_lock); + write_unlock_irq(&ntfy->event->list_lock); } static void @@ -176,7 +176,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) return; nvkm_trace(event->subdev, "event: ntfy %08x on %d\n", bits, id); - spin_lock_irqsave(&event->list_lock, flags); + read_lock_irqsave(&event->list_lock, flags); list_for_each_entry_safe(ntfy, ntmp, &event->ntfy, head) { if (ntfy->id == id && ntfy->bits & bits) { @@ -185,7 +185,7 @@ nvkm_event_ntfy(struct nvkm_event *event, int id, u32 bits) } } - spin_unlock_irqrestore(&event->list_lock, flags); + read_unlock_irqrestore(&event->list_lock, flags); } void From 969d90ec212bae4b45bf9d21d7daa30aa6cf055e Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 14 Nov 2023 17:25:48 -0500 Subject: [PATCH 218/560] audit: don't WARN_ON_ONCE(!current->mm) in audit_exe_compare() eBPF can end up calling into the audit code from some odd places, and some of these places don't have @current set properly so we end up tripping the `WARN_ON_ONCE(!current->mm)` near the top of `audit_exe_compare()`. While the basic `!current->mm` check is good, the `WARN_ON_ONCE()` results in some scary console messages so let's drop that and just do the regular `!current->mm` check to avoid problems. Cc: Fixes: 47846d51348d ("audit: don't take task_lock() in audit_exe_compare() code path") Reported-by: Artem Savkov Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 91e82e34b51e3..7a98cd176a127 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) if (tsk != current) return 0; - if (WARN_ON_ONCE(!current->mm)) + if (!current->mm) return 0; exe_file = get_mm_exe_file(current->mm); if (!exe_file) From 77618db346455129424fadbbaec596a09feaf3bb Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 12 Oct 2023 12:55:34 -0700 Subject: [PATCH 219/560] zstd: Fix array-index-out-of-bounds UBSAN warning Zstd used an array of length 1 to mean a flexible array for C89 compatibility. Switch to a C99 flexible array to fix the UBSAN warning. Tested locally by booting the kernel and writing to and reading from a BtrFS filesystem with zstd compression enabled. I was unable to reproduce the issue before the fix, however it is a trivial change. Link: https://lkml.kernel.org/r/20231012213428.1390905-1-nickrterrell@gmail.com Reported-by: syzbot+1f2eb3e8cd123ffce499@syzkaller.appspotmail.com Reported-by: Eric Biggers Reported-by: Kees Cook Signed-off-by: Nick Terrell Reviewed-by: Kees Cook --- lib/zstd/common/fse_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c index a0d06095be83d..8dcb8ca39767c 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -312,7 +312,7 @@ size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size typedef struct { short ncount[FSE_MAX_SYMBOL_VALUE + 1]; - FSE_DTable dtable[1]; /* Dynamically sized */ + FSE_DTable dtable[]; /* Dynamically sized */ } FSE_DecompressWksp; From c9bd1568d5462f4108417518ce1af7b924acfb6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Nov 2023 21:36:13 +0100 Subject: [PATCH 220/560] futex: Fix hardcoded flags Xi reported that commit 5694289ce183 ("futex: Flag conversion") broke glibc's robust futex tests. This was narrowed down to the change of FLAGS_SHARED from 0x01 to 0x10, at which point Florian noted that handle_futex_death() has a hardcoded flags argument of 1. Change this to: FLAGS_SIZE_32 | FLAGS_SHARED, matching how futex_to_flags() unconditionally sets FLAGS_SIZE_32 for all legacy futex ops. Reported-by: Xi Ruoyao Reported-by: Florian Weimer Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20231114201402.GA25315@noisy.programming.kicks-ass.net Fixes: 5694289ce183 ("futex: Flag conversion") Cc: --- kernel/futex/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 52695c59d0411..dad981a865b84 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -700,7 +700,8 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); return 0; } @@ -752,8 +753,10 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, * Wake robust non-PI futexes here. The wakeup of * PI futexes happens in exit_pi_state(): */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + if (!pi && (uval & FUTEX_WAITERS)) { + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + FUTEX_BITSET_MATCH_ANY); + } return 0; } From 889c58b3155ff4c8e8671c95daef63d6fabbb6b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 12:34:46 +0200 Subject: [PATCH 221/560] perf/core: Fix cpuctx refcounting Audit of the refcounting turned up that perf_pmu_migrate_context() fails to migrate the ctx refcount. Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20230612093539.085862001@infradead.org Cc: --- include/linux/perf_event.h | 13 ++++++++----- kernel/events/core.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index afb028c54f339..5547ba68e6e47 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -843,11 +843,11 @@ struct perf_event { }; /* - * ,-----------------------[1:n]----------------------. - * V V - * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event - * ^ ^ | | - * `--------[1:n]---------' `-[n:1]-> pmu <-[1:n]-' + * ,-----------------------[1:n]------------------------. + * V V + * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event + * | | + * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed @@ -865,6 +865,9 @@ struct perf_event { * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. + * + * perf_event holds a refcount on perf_event_context + * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; diff --git a/kernel/events/core.c b/kernel/events/core.c index 683dc086ef10a..b704d83a28b29 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, void *task_ctx_data = NULL; if (!ctx->task) { + /* + * perf_pmu_migrate_context() / __perf_pmu_install_event() + * relies on the fact that find_get_pmu_context() cannot fail + * for CPU contexts. + */ struct perf_cpu_pmu_context *cpc; cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); @@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu, int cpu, struct perf_event *event) { struct perf_event_pmu_context *epc; + struct perf_event_context *old_ctx = event->ctx; + + get_ctx(ctx); /* normally find_get_context() */ event->cpu = cpu; epc = find_get_pmu_context(pmu, ctx, event); @@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; perf_install_in_context(ctx, event, cpu); + + /* + * Now that event->ctx is updated and visible, put the old ctx. + */ + put_ctx(old_ctx); } static void __perf_pmu_install(struct perf_event_context *ctx, @@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) struct perf_event_context *src_ctx, *dst_ctx; LIST_HEAD(events); + /* + * Since per-cpu context is persistent, no need to grab an extra + * reference. + */ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; From fa02de9e75889915b554eda1964a631fd019973b Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:49 +0200 Subject: [PATCH 222/560] net: stmmac: fix rx budget limit check The while loop condition verifies 'count < limit'. Neither value change before the 'count >= limit' check. As is this check is dead code. But code inspection reveals a code path that modifies 'count' and then goto 'drain_data' and back to 'read_again'. So there is a need to verify count value sanity after 'read_again'. Move 'read_again' up to fix the count limit check. Fixes: ec222003bd94 ("net: stmmac: Prepare to add Split Header support") Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Link: https://lore.kernel.org/r/d9486296c3b6b12ab3a0515fcd47d56447a07bfc.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3e50fd53a6174..f28838c8cdb33 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5328,10 +5328,10 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) len = 0; } +read_again: if (count >= limit) break; -read_again: buf1_len = 0; buf2_len = 0; entry = next_entry; From b6cb4541853c7ee512111b0e7ddf3cb66c99c137 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 13 Nov 2023 19:42:50 +0200 Subject: [PATCH 223/560] net: stmmac: avoid rx queue overrun dma_rx_size can be set as low as 64. Rx budget might be higher than that. Make sure to not overrun allocated rx buffers when budget is larger. Leave one descriptor unused to avoid wrap around of 'dirty_rx' vs 'cur_rx'. Signed-off-by: Baruch Siach Reviewed-by: Serge Semin Fixes: 47dd7a540b8a ("net: add support for STMicroelectronics Ethernet controllers.") Link: https://lore.kernel.org/r/d95413e44c97d4692e72cec13a75f894abeb6998.1699897370.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f28838c8cdb33..2afb2bd25977a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5293,6 +5293,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) dma_dir = page_pool_get_dma_dir(rx_q->page_pool); buf_sz = DIV_ROUND_UP(priv->dma_conf.dma_buf_sz, PAGE_SIZE) * PAGE_SIZE; + limit = min(priv->dma_conf.dma_rx_size - 1, (unsigned int)limit); if (netif_msg_rx_status(priv)) { void *rx_head; From 907d1bdb8b2cc0357d03a1c34d2a08d9943760b1 Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:49 -0800 Subject: [PATCH 224/560] tg3: Move the [rt]x_dropped counters to tg3_napi This change moves [rt]x_dropped counters to tg3_napi so that they can be updated by a single writer, race-free. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Michael Chan Link: https://lore.kernel.org/r/20231113182350.37472-1-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 38 +++++++++++++++++++++++++---- drivers/net/ethernet/broadcom/tg3.h | 4 +-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1dee27349367e..884e9eecc471f 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -6889,7 +6889,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget) desc_idx, *post_ptr); drop_it_no_recycle: /* Other statistics kept track of by card. */ - tp->rx_dropped++; + tnapi->rx_dropped++; goto next_pkt; } @@ -8190,7 +8190,7 @@ static netdev_tx_t __tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) drop: dev_kfree_skb_any(skb); drop_nofree: - tp->tx_dropped++; + tnapi->tx_dropped++; return NETDEV_TX_OK; } @@ -9405,7 +9405,7 @@ static void __tg3_set_rx_mode(struct net_device *); /* tp->lock is held. */ static int tg3_halt(struct tg3 *tp, int kind, bool silent) { - int err; + int err, i; tg3_stop_fw(tp); @@ -9426,6 +9426,13 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent) /* And make sure the next sample is new data */ memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats)); + + for (i = 0; i < TG3_IRQ_MAX_VECS; ++i) { + struct tg3_napi *tnapi = &tp->napi[i]; + + tnapi->rx_dropped = 0; + tnapi->tx_dropped = 0; + } } return err; @@ -11975,6 +11982,9 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) { struct rtnl_link_stats64 *old_stats = &tp->net_stats_prev; struct tg3_hw_stats *hw_stats = tp->hw_stats; + unsigned long rx_dropped; + unsigned long tx_dropped; + int i; stats->rx_packets = old_stats->rx_packets + get_stat64(&hw_stats->rx_ucast_packets) + @@ -12021,8 +12031,26 @@ static void tg3_get_nstats(struct tg3 *tp, struct rtnl_link_stats64 *stats) stats->rx_missed_errors = old_stats->rx_missed_errors + get_stat64(&hw_stats->rx_discards); - stats->rx_dropped = tp->rx_dropped; - stats->tx_dropped = tp->tx_dropped; + /* Aggregate per-queue counters. The per-queue counters are updated + * by a single writer, race-free. The result computed by this loop + * might not be 100% accurate (counters can be updated in the middle of + * the loop) but the next tg3_get_nstats() will recompute the current + * value so it is acceptable. + * + * Note that these counters wrap around at 4G on 32bit machines. + */ + rx_dropped = (unsigned long)(old_stats->rx_dropped); + tx_dropped = (unsigned long)(old_stats->tx_dropped); + + for (i = 0; i < tp->irq_cnt; i++) { + struct tg3_napi *tnapi = &tp->napi[i]; + + rx_dropped += tnapi->rx_dropped; + tx_dropped += tnapi->tx_dropped; + } + + stats->rx_dropped = rx_dropped; + stats->tx_dropped = tx_dropped; } static int tg3_get_regs_len(struct net_device *dev) diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index ae5c01bd11104..5016475e50054 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3018,6 +3018,7 @@ struct tg3_napi { u16 *rx_rcb_prod_idx; struct tg3_rx_prodring_set prodring; struct tg3_rx_buffer_desc *rx_rcb; + unsigned long rx_dropped; u32 tx_prod ____cacheline_aligned; u32 tx_cons; @@ -3026,6 +3027,7 @@ struct tg3_napi { u32 prodmbox; struct tg3_tx_buffer_desc *tx_ring; struct tg3_tx_ring_info *tx_buffers; + unsigned long tx_dropped; dma_addr_t status_mapping; dma_addr_t rx_rcb_mapping; @@ -3220,8 +3222,6 @@ struct tg3 { /* begin "everything else" cacheline(s) section */ - unsigned long rx_dropped; - unsigned long tx_dropped; struct rtnl_link_stats64 net_stats_prev; struct tg3_ethtool_stats estats_prev; From 17dd5efe5f36a96bd78012594fabe21efb01186b Mon Sep 17 00:00:00 2001 From: Alex Pakhunov Date: Mon, 13 Nov 2023 10:23:50 -0800 Subject: [PATCH 225/560] tg3: Increment tx_dropped in tg3_tso_bug() tg3_tso_bug() drops a packet if it cannot be segmented for any reason. The number of discarded frames should be incremented accordingly. Signed-off-by: Alex Pakhunov Signed-off-by: Vincent Wong Reviewed-by: Pavan Chebbi Link: https://lore.kernel.org/r/20231113182350.37472-2-alexey.pakhunov@spacex.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/tg3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 884e9eecc471f..48b6191efa56c 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -7918,8 +7918,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi, segs = skb_gso_segment(skb, tp->dev->features & ~(NETIF_F_TSO | NETIF_F_TSO6)); - if (IS_ERR(segs) || !segs) + if (IS_ERR(segs) || !segs) { + tnapi->tx_dropped++; goto tg3_tso_bug_end; + } skb_list_walk_safe(segs, seg, next) { skb_mark_not_on_list(seg); From 09d4c14c6c5e6e781a3879fed7f8e116a18b8c65 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:56 -0800 Subject: [PATCH 226/560] pds_core: use correct index to mask irq Use the qcq's interrupt index, not the irq number, to mask the interrupt. Since the irq number can be out of range from the number of possible interrupts, we can end up accessing and potentially scribbling on out-of-range and/or unmapped memory, making the kernel angry. [ 3116.039364] BUG: unable to handle page fault for address: ffffbeea1c3edf84 [ 3116.047059] #PF: supervisor write access in kernel mode [ 3116.052895] #PF: error_code(0x0002) - not-present page [ 3116.058636] PGD 100000067 P4D 100000067 PUD 1001f2067 PMD 10f82e067 PTE 0 [ 3116.066221] Oops: 0002 [#1] SMP NOPTI [ 3116.092948] RIP: 0010:iowrite32+0x9/0x76 [ 3116.190452] Call Trace: [ 3116.193185] [ 3116.195430] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.200298] ? show_trace_log_lvl+0x1d6/0x2f9 [ 3116.205166] ? pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.210618] ? __die_body.cold+0x8/0xa [ 3116.214806] ? page_fault_oops+0x16d/0x1ac [ 3116.219382] ? exc_page_fault+0xbe/0x13b [ 3116.223764] ? asm_exc_page_fault+0x22/0x27 [ 3116.228440] ? iowrite32+0x9/0x76 [ 3116.232143] pdsc_adminq_isr+0x43/0x55 [pds_core] [ 3116.237627] __handle_irq_event_percpu+0x3a/0x184 [ 3116.243088] handle_irq_event+0x57/0xb0 [ 3116.247575] handle_edge_irq+0x87/0x225 [ 3116.252062] __common_interrupt+0x3e/0xbc [ 3116.256740] common_interrupt+0x7b/0x98 [ 3116.261216] [ 3116.263745] [ 3116.266268] asm_common_interrupt+0x22/0x27 Reported-by: Joao Martins Fixes: 01ba61b55b20 ("pds_core: Add adminq processing and commands") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/adminq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index 045fe133f6ee9..5beadabc21361 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -146,7 +146,7 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data) } queue_work(pdsc->wq, &qcq->work); - pds_core_intr_mask(&pdsc->intr_ctrl[irq], PDS_CORE_INTR_MASK_CLEAR); + pds_core_intr_mask(&pdsc->intr_ctrl[qcq->intx], PDS_CORE_INTR_MASK_CLEAR); return IRQ_HANDLED; } From 7c02f6ae676a954216a192612040f9a0cde3adf7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 13 Nov 2023 10:32:57 -0800 Subject: [PATCH 227/560] pds_core: fix up some format-truncation complaints Our friendly kernel test robot pointed out a couple of potential string truncation issues. None of which were we worried about, but can be relatively easily fixed to quiet the complaints. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310211736.66syyDpp-lkp@intel.com/ Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20231113183257.71110-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/core.h | 2 +- drivers/net/ethernet/amd/pds_core/dev.c | 8 ++++++-- drivers/net/ethernet/amd/pds_core/devlink.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index f3a7deda99724..e35d3e7006bfc 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -15,7 +15,7 @@ #define PDSC_DRV_DESCRIPTION "AMD/Pensando Core Driver" #define PDSC_WATCHDOG_SECS 5 -#define PDSC_QUEUE_NAME_MAX_SZ 32 +#define PDSC_QUEUE_NAME_MAX_SZ 16 #define PDSC_ADMINQ_MIN_LENGTH 16 /* must be a power of two */ #define PDSC_NOTIFYQ_LENGTH 64 /* must be a power of two */ #define PDSC_TEARDOWN_RECOVERY false diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c index 7c1b965d61a92..31940b857e0e5 100644 --- a/drivers/net/ethernet/amd/pds_core/dev.c +++ b/drivers/net/ethernet/amd/pds_core/dev.c @@ -261,10 +261,14 @@ static int pdsc_identify(struct pdsc *pdsc) struct pds_core_drv_identity drv = {}; size_t sz; int err; + int n; drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX); - snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), - "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + /* Catching the return quiets a Wformat-truncation complaint */ + n = snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str), + "%s %s", PDS_CORE_DRV_NAME, utsname()->release); + if (n > sizeof(drv.driver_ver_str)) + dev_dbg(pdsc->dev, "release name truncated, don't care\n"); /* Next let's get some info about the device * We use the devcmd_lock at this level in order to diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index 57f88c8b37def..e9948ea5bbcdb 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -104,7 +104,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, struct pds_core_fw_list_info fw_list; struct pdsc *pdsc = devlink_priv(dl); union pds_core_dev_comp comp; - char buf[16]; + char buf[32]; int listlen; int err; int i; From 278a370c1766060d2144d6cf0b06c101e1043b6d Mon Sep 17 00:00:00 2001 From: Ziwei Xiao Date: Mon, 13 Nov 2023 16:41:44 -0800 Subject: [PATCH 228/560] gve: Fixes for napi_poll when budget is 0 Netpoll will explicilty pass the polling call with a budget of 0 to indicate it's clearing the Tx path only. For the gve_rx_poll and gve_xdp_poll, they were mistakenly taking the 0 budget as the indication to do all the work. Add check to avoid the rx path and xdp path being called when budget is 0. And also avoid napi_complete_done being called when budget is 0 for netpoll. Fixes: f5cedc84a30d ("gve: Add transmit and receive support") Signed-off-by: Ziwei Xiao Link: https://lore.kernel.org/r/20231114004144.2022268-1-ziweixiao@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 8 +++++++- drivers/net/ethernet/google/gve/gve_rx.c | 4 ---- drivers/net/ethernet/google/gve/gve_tx.c | 4 ---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 276f996f95dcc..2d42e733837b0 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -254,10 +254,13 @@ static int gve_napi_poll(struct napi_struct *napi, int budget) if (block->tx) { if (block->tx->q_num < priv->tx_cfg.num_queues) reschedule |= gve_tx_poll(block, budget); - else + else if (budget) reschedule |= gve_xdp_poll(block, budget); } + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll(block, budget); reschedule |= work_done == budget; @@ -298,6 +301,9 @@ static int gve_napi_poll_dqo(struct napi_struct *napi, int budget) if (block->tx) reschedule |= gve_tx_poll_dqo(block, /*do_clean=*/true); + if (!budget) + return 0; + if (block->rx) { work_done = gve_rx_poll_dqo(block, budget); reschedule |= work_done == budget; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index e84a066aa1a40..73655347902d2 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -1007,10 +1007,6 @@ int gve_rx_poll(struct gve_notify_block *block, int budget) feat = block->napi.dev->features; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - if (budget > 0) work_done = gve_clean_rx_done(rx, budget, feat); diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 6957a865cff37..9f6ffc4a54f0b 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -925,10 +925,6 @@ bool gve_xdp_poll(struct gve_notify_block *block, int budget) bool repoll; u32 to_do; - /* If budget is 0, do all the work */ - if (budget == 0) - budget = INT_MAX; - /* Find out how much work there is to be done */ nic_done = gve_tx_load_event_counter(priv, tx); to_do = min_t(u32, (nic_done - tx->done), budget); From 9fce92f050f448a0d1ddd9083ef967d9930f1e52 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:13 +0100 Subject: [PATCH 229/560] mptcp: deal with large GSO size After the blamed commit below, the TCP sockets (and the MPTCP subflows) can build egress packets larger than 64K. That exceeds the maximum DSS data size, the length being misrepresent on the wire and the stream being corrupted, as later observed on the receiver: WARNING: CPU: 0 PID: 9696 at net/mptcp/protocol.c:705 __mptcp_move_skbs_from_subflow+0x2604/0x26e0 CPU: 0 PID: 9696 Comm: syz-executor.7 Not tainted 6.6.0-rc5-gcd8bdf563d46 #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. RIP: 0010:__mptcp_move_skbs_from_subflow+0x2604/0x26e0 net/mptcp/protocol.c:705 RSP: 0018:ffffc90000006e80 EFLAGS: 00010246 RAX: ffffffff83e9f674 RBX: ffff88802f45d870 RCX: ffff888102ad0000 netlink: 8 bytes leftover after parsing attributes in process `syz-executor.4'. RDX: 0000000080000303 RSI: 0000000000013908 RDI: 0000000000003908 RBP: ffffc90000007110 R08: ffffffff83e9e078 R09: 1ffff1100e548c8a R10: dffffc0000000000 R11: ffffed100e548c8b R12: 0000000000013908 R13: dffffc0000000000 R14: 0000000000003908 R15: 000000000031cf29 FS: 00007f239c47e700(0000) GS:ffff88811b200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f239c45cd78 CR3: 000000006a66c006 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 PKRU: 55555554 Call Trace: mptcp_data_ready+0x263/0xac0 net/mptcp/protocol.c:819 subflow_data_ready+0x268/0x6d0 net/mptcp/subflow.c:1409 tcp_data_queue+0x21a1/0x7a60 net/ipv4/tcp_input.c:5151 tcp_rcv_established+0x950/0x1d90 net/ipv4/tcp_input.c:6098 tcp_v6_do_rcv+0x554/0x12f0 net/ipv6/tcp_ipv6.c:1483 tcp_v6_rcv+0x2e26/0x3810 net/ipv6/tcp_ipv6.c:1749 ip6_protocol_deliver_rcu+0xd6b/0x1ae0 net/ipv6/ip6_input.c:438 ip6_input+0x1c5/0x470 net/ipv6/ip6_input.c:483 ipv6_rcv+0xef/0x2c0 include/linux/netfilter.h:304 __netif_receive_skb+0x1ea/0x6a0 net/core/dev.c:5532 process_backlog+0x353/0x660 net/core/dev.c:5974 __napi_poll+0xc6/0x5a0 net/core/dev.c:6536 net_rx_action+0x6a0/0xfd0 net/core/dev.c:6603 __do_softirq+0x184/0x524 kernel/softirq.c:553 do_softirq+0xdd/0x130 kernel/softirq.c:454 Address the issue explicitly bounding the maximum GSO size to what MPTCP actually allows. Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/450 Fixes: 7c4e983c4f3c ("net: allow gso_max_size to exceed 65536") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-1-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a0b8356cd8c58..66e947054945d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1230,6 +1230,8 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, mptcp_do_fallback(ssk); } +#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1256,6 +1258,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, return -EAGAIN; /* compute send limit */ + if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) + ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; From d109a7767273d1706b541c22b83a0323823dfde4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:14 +0100 Subject: [PATCH 230/560] mptcp: fix possible NULL pointer dereference on close After the blamed commit below, the MPTCP release callback can dereference the first subflow pointer via __mptcp_set_connected() and send buffer auto-tuning. Such pointer is always expected to be valid, except at socket destruction time, when the first subflow is deleted and the pointer zeroed. If the connect event is handled by the release callback while the msk socket is finally released, MPTCP hits the following splat: general protection fault, probably for non-canonical address 0xdffffc00000000f2: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000790-0x0000000000000797] CPU: 1 PID: 26719 Comm: syz-executor.2 Not tainted 6.6.0-syzkaller-10102-gff269e2cd5ad #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 RIP: 0010:mptcp_subflow_ctx net/mptcp/protocol.h:542 [inline] RIP: 0010:__mptcp_propagate_sndbuf net/mptcp/protocol.h:813 [inline] RIP: 0010:__mptcp_set_connected+0x57/0x3e0 net/mptcp/subflow.c:424 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff8a62323c RDX: 00000000000000f2 RSI: ffffffff8a630116 RDI: 0000000000000790 RBP: ffff88803334b100 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000034 R12: ffff88803334b198 R13: ffff888054f0b018 R14: 0000000000000000 R15: ffff88803334b100 FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbcb4f75198 CR3: 000000006afb5000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mptcp_release_cb+0xa2c/0xc40 net/mptcp/protocol.c:3405 release_sock+0xba/0x1f0 net/core/sock.c:3537 mptcp_close+0x32/0xf0 net/mptcp/protocol.c:3084 inet_release+0x132/0x270 net/ipv4/af_inet.c:433 inet6_release+0x4f/0x70 net/ipv6/af_inet6.c:485 __sock_release+0xae/0x260 net/socket.c:659 sock_close+0x1c/0x20 net/socket.c:1419 __fput+0x270/0xbb0 fs/file_table.c:394 task_work_run+0x14d/0x240 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa92/0x2a20 kernel/exit.c:876 do_group_exit+0xd4/0x2a0 kernel/exit.c:1026 get_signal+0x23ba/0x2790 kernel/signal.c:2900 arch_do_signal_or_restart+0x90/0x7f0 arch/x86/kernel/signal.c:309 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x11f/0x240 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:285 [inline] syscall_exit_to_user_mode+0x1d/0x60 kernel/entry/common.c:296 do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:88 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fb515e7cae9 Code: Unable to access opcode bytes at 0x7fb515e7cabf. RSP: 002b:00007fb516c560c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: 000000000000003c RBX: 00007fb515f9c120 RCX: 00007fb515e7cae9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000006 RBP: 00007fb515ec847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007fb515f9c120 R15: 00007ffc631eb968 To avoid sparkling unneeded conditionals, address the issue explicitly checking msk->first only in the critical place. Fixes: 8005184fd1ca ("mptcp: refactor sndbuf auto-tuning") Cc: stable@vger.kernel.org Reported-by: Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/454 Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iLZUA6S2a=K8GObnS62KK6Jt4B7PsAs7meMFooM8xaTgw@mail.gmail.com/ Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-2-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 66e947054945d..bc81ea53a0499 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3402,10 +3402,11 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { - /* be sure to set the current sk state before tacking actions - * depending on sk_state, that is processing MPTCP_ERROR_REPORT + /* be sure to set the current sk state before taking actions + * depending on sk_state (MPTCP_ERROR_REPORT) + * On sk release avoid actions depending on the first subflow */ - if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags) && msk->first) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); From 8df220b29282e8b450ea57be62e1eccd4996837c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 14 Nov 2023 00:16:15 +0100 Subject: [PATCH 231/560] mptcp: add validity check for sending RM_ADDR This patch adds the validity check for sending RM_ADDRs for userspace PM in mptcp_pm_remove_addrs(), only send a RM_ADDR when the address is in the anno_list or conn_list. Fixes: 8b1c94da1e48 ("mptcp: only send RM_ADDR in nl_cmd_remove") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-3-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1529ec3588155..bf4d96f6f99a6 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1515,8 +1515,9 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { - remove_anno_list_by_saddr(msk, &entry->addr); - if (alist.nr < MPTCP_RM_IDS_MAX) + if ((remove_anno_list_by_saddr(msk, &entry->addr) || + lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) && + alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; } From 7679d34f97b7a09fd565f5729f79fd61b7c55329 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:16 +0100 Subject: [PATCH 232/560] mptcp: fix setsockopt(IP_TOS) subflow locking The MPTCP implementation of the IP_TOS socket option uses the lockless variant of the TOS manipulation helper and does not hold such lock at the helper invocation time. Add the required locking. Fixes: ffcacff87cd6 ("mptcp: Support for IP_TOS for MPTCP setsockopt()") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/457 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-4-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 77f5e8932abf6..3536807337004 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -738,8 +738,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, val = READ_ONCE(inet_sk(sk)->tos); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + slow = lock_sock_fast(ssk); __ip_sock_set_tos(ssk, val); + unlock_sock_fast(ssk, slow); } release_sock(sk); From 7cefbe5e1dacc7236caa77e9d072423f21422fe2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:17 +0100 Subject: [PATCH 233/560] selftests: mptcp: fix fastclose with csum failure Running the mp_join selftest manually with the following command line: ./mptcp_join.sh -z -C leads to some failures: 002 fastclose server test # ... rtx [fail] got 1 MP_RST[s] TX expected 0 # ... rstrx [fail] got 1 MP_RST[s] RX expected 0 The problem is really in the wrong expectations for the RST checks implied by the csum validation. Note that the same check is repeated explicitly in the same test-case, with the correct expectation and pass successfully. Address the issue explicitly setting the correct expectation for the failing checks. Reported-by: Xiumei Mu Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 75a2438efdf37..3c94f2f194d68 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3240,7 +3240,7 @@ fastclose_tests() if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 - chk_join_nr 0 0 0 + chk_join_nr 0 0 0 0 0 0 1 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi From 006ccc3090e2f30f5f97857f3946312692a5279e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 Nov 2023 22:54:26 -0400 Subject: [PATCH 234/560] bcachefs: Kill journal pre-reservations This deletes the complicated and somewhat expensive journal pre-reservation machinery in favor of just using journal watermarks: when the journal is more than half full, we run journal reclaim more aggressively, and when the journal is more than 3/4s full we only allow journal reclaim to get new journal reservations. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 - fs/bcachefs/btree_key_cache.c | 14 ----- fs/bcachefs/btree_trans_commit.c | 36 +---------- fs/bcachefs/btree_types.h | 3 - fs/bcachefs/btree_update_interior.c | 30 --------- fs/bcachefs/btree_update_interior.h | 1 - fs/bcachefs/journal.c | 31 --------- fs/bcachefs/journal.h | 98 ----------------------------- fs/bcachefs/journal_reclaim.c | 42 +++++-------- fs/bcachefs/journal_types.h | 26 -------- fs/bcachefs/trace.h | 11 +--- 11 files changed, 19 insertions(+), 275 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index c2adf3fbb0b3a..6fa90bcd70168 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - kfree(trans->extra_journal_entries.data); if (trans->fs_usage_deltas) { diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index b3305a04d8086..37fbf22de8fcb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -672,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, goto out; bch2_journal_pin_drop(j, &ck->journal); - bch2_journal_preres_put(j, &ck->res); BUG_ON(!btree_node_locked(c_iter.path, 0)); @@ -770,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { - int difference; - - BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - - difference = jset_u64s(insert->k.u64s) - ck->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - ck->res.u64s += difference; - } - } - bkey_copy(ck->k, insert); ck->valid = true; @@ -1006,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); - bch2_journal_preres_put(&c->journal, &ck->res); list_del(&ck->list); kfree(ck->k); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 55a120eb8692b..12907beda98c2 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -323,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } -static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, - unsigned long trace_ip) -{ - return drop_locks_do(trans, - bch2_journal_preres_get(&trans->c->journal, - &trans->journal_preres, - trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK))); -} - static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, unsigned flags) { @@ -882,14 +871,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags } } - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, trans->journal_preres_u64s, - (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); - if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) - ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); - if (unlikely(ret)) - return ret; - ret = bch2_trans_lock_write(trans); if (unlikely(ret)) return ret; @@ -1052,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; struct btree_write_buffered_key *wb; - unsigned u64s; int ret = 0; if (!trans->nr_updates && @@ -1112,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); - memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); - trans->journal_u64s = trans->extra_journal_entries.nr; - trans->journal_preres_u64s = 0; - trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); - if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1134,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (i->key_cache_already_flushed) continue; - /* we're going to journal the key being updated: */ - u64s = jset_u64s(i->k->k.u64s); - if (i->cached && - likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) - trans->journal_preres_u64s += u64s; - if (i->flags & BTREE_UPDATE_NOJOURNAL) continue; - trans->journal_u64s += u64s; + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); /* and we're also going to log the overwrite: */ if (trans->journal_transaction_names) @@ -1175,8 +1145,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trace_and_count(c, transaction_commit, trans, _RET_IP_); out: - bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index f3669fa685916..6fbd4ef3df6b9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -327,7 +327,6 @@ struct bkey_cached { struct rhash_head hash; struct list_head list; - struct journal_preres res; struct journal_entry_pin journal; u64 seq; @@ -441,11 +440,9 @@ struct btree_trans { struct journal_entry_pin *journal_pin; struct journal_res journal_res; - struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; - unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; }; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 39c2db68123bd..76f27bc9fa24e 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * up_read(&c->gc_lock); as->took_gc_lock = false; - bch2_journal_preres_put(&c->journal, &as->journal_preres); - bch2_journal_pin_drop(&c->journal, &as->journal); bch2_journal_pin_flush(&c->journal, &as->journal); bch2_disk_reservation_put(c, &as->disk_res); @@ -734,8 +732,6 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_journal_pin_drop(&c->journal, &as->journal); - bch2_journal_preres_put(&c->journal, &as->journal_preres); - mutex_lock(&c->btree_interior_update_lock); for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; @@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; @@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, flags &= ~BCH_WATERMARK_MASK; flags |= watermark; - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) - journal_flags |= JOURNAL_RES_GET_NONBLOCK; - journal_flags |= watermark; - while (1) { nr_nodes[!!update_level] += 1 + split; update_level++; @@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret) { - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - goto err; - } - - ret = drop_locks_do(trans, - bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags)); - if (ret == -BCH_ERR_journal_preres_get_blocked) { - trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - } - if (ret) - goto err; - } - ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), c->opts.metadata_replicas, diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 4df21512d640d..031076e75fa13 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -55,7 +55,6 @@ struct btree_update { unsigned update_level; struct disk_reservation disk_res; - struct journal_preres journal_preres; /* * BTREE_INTERIOR_UPDATING_NODE: diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 5b5d69f2316b2..23a9b7845d119 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, return ret; } -/* journal_preres: */ - -static bool journal_preres_available(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); - - if (!ret && mutex_trylock(&j->reclaim_lock)) { - bch2_journal_reclaim(j); - mutex_unlock(&j->reclaim_lock); - } - - return ret; -} - -int __bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - int ret; - - closure_wait_event(&j->preres_wait, - (ret = bch2_journal_error(j)) || - journal_preres_available(j, res, new_u64s, flags)); - return ret; -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *j, @@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); - prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 011711e99c8d8..c85d01cf49484 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -395,104 +395,6 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re return 0; } -/* journal_preres: */ - -static inline void journal_set_watermark(struct journal *j) -{ - union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = BCH_WATERMARK_stripe; - - if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); - if (!s.remaining) - watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); - - if (watermark == j->watermark) - return; - - swap(watermark, j->watermark); - if (watermark > j->watermark) - journal_wake(j); -} - -static inline void bch2_journal_preres_put(struct journal *j, - struct journal_preres *res) -{ - union journal_preres_state s = { .reserved = res->u64s }; - - if (!res->u64s) - return; - - s.v = atomic64_sub_return(s.v, &j->prereserved.counter); - res->u64s = 0; - - if (unlikely(s.waiting)) { - clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), - (unsigned long *) &j->prereserved.v); - closure_wake_up(&j->preres_wait); - } - - if (s.reserved <= s.remaining && j->watermark) - journal_set_watermark(j); -} - -int __bch2_journal_preres_get(struct journal *, - struct journal_preres *, unsigned, unsigned); - -static inline int bch2_journal_preres_get_fast(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags, - bool set_waiting) -{ - int d = new_u64s - res->u64s; - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); - enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - int ret; - - do { - old.v = new.v = v; - ret = 0; - - if (watermark == BCH_WATERMARK_reclaim || - new.reserved + d < new.remaining) { - new.reserved += d; - ret = 1; - } else if (set_waiting && !new.waiting) - new.waiting = true; - else - return 0; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); - - if (ret) - res->u64s += d; - return ret; -} - -static inline int bch2_journal_preres_get(struct journal *j, - struct journal_preres *res, - unsigned new_u64s, - unsigned flags) -{ - if (new_u64s <= res->u64s) - return 0; - - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) - return 0; - - if (flags & JOURNAL_RES_GET_NONBLOCK) - return -BCH_ERR_journal_preres_get_blocked; - - return __bch2_journal_preres_get(j, res, new_u64s, flags); -} - /* journal_entry_res: */ void bch2_journal_entry_res_resize(struct journal *, diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 9a584aaaa2eba..e63c6eda86afe 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +static inline void journal_set_watermark(struct journal *j, bool low_on_space) { - union journal_preres_state old, new; - u64 v = atomic64_read(&j->prereserved.counter); + unsigned watermark = BCH_WATERMARK_stripe; - do { - old.v = new.v = v; - new.remaining = u64s_remaining; - } while ((v = atomic64_cmpxchg(&j->prereserved.counter, - old.v, new.v)) != old.v); + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static struct journal_space @@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; unsigned clean, clean_ondisk, total; - s64 u64s_remaining = 0; unsigned max_entry_size = min(j->buf[0].buf_size >> 9, j->buf[1].buf_size >> 9); unsigned i, nr_online = 0, nr_devs_want; @@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - u64s_remaining = (u64) clean << 6; - u64s_remaining -= (u64) total << 3; - u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 4; - u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); + journal_set_watermark(j, clean * 4 <= total); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; - journal_set_remaining(j, u64s_remaining); - journal_set_watermark(j); if (!ret) journal_wake(j); @@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - /* And include pre-reservations: */ - nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, - (ca->mi.bucket_size << 6) - - journal_entry_overhead(j)); - nr_buckets = min(nr_buckets, ja->nr); bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; @@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; - if (j->prereserved.reserved * 4 > j->prereserved.remaining) - min_nr = 1; - - if (fifo_free(&j->pin) <= 32) + if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) @@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - j->prereserved.reserved, - j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), c->btree_cache.used, atomic_long_read(&c->btree_key_cache.nr_dirty), diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 42504e16acb6c..a756b69582e34 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -76,14 +76,6 @@ struct journal_res { u64 seq; }; -/* - * For reserving space in the journal prior to getting a reservation on a - * particular journal entry: - */ -struct journal_preres { - unsigned u64s; -}; - union journal_res_state { struct { atomic64_t counter; @@ -104,22 +96,6 @@ union journal_res_state { }; }; -union journal_preres_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - u64 waiting:1, - reserved:31, - remaining:32; - }; -}; - /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ @@ -180,8 +156,6 @@ struct journal { union journal_res_state reservations; enum bch_watermark watermark; - union journal_preres_state prereserved; - } __aligned(SMP_CACHE_BYTES); unsigned long flags; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 893304a1f06e6..7857671159b49 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write, TRACE_EVENT(journal_reclaim_start, TP_PROTO(struct bch_fs *c, bool direct, bool kicked, u64 min_nr, u64 min_key_cache, - u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), @@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start, __field(bool, kicked ) __field(u64, min_nr ) __field(u64, min_key_cache ) - __field(u64, prereserved ) - __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) __field(u64, btree_cache_total ) __field(u64, btree_key_cache_dirty ) @@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start, __entry->kicked = kicked; __entry->min_nr = min_nr; __entry->min_key_cache = min_key_cache; - __entry->prereserved = prereserved; - __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; __entry->btree_cache_total = btree_cache_total; __entry->btree_key_cache_dirty = btree_key_cache_dirty; __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->direct, __entry->kicked, __entry->min_nr, __entry->min_key_cache, - __entry->prereserved, - __entry->prereserved_total, __entry->btree_cache_dirty, __entry->btree_cache_total, __entry->btree_key_cache_dirty, From 069749688ea4bbaeff0ca3b229b443ea96b03757 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 Nov 2023 22:15:59 -0500 Subject: [PATCH 235/560] bcachefs: Fix iterator leak in may_delete_deleted_inode() may_delete_deleted_inode() was returning without exiting a btree iterator, eventually causing propagate_key_to_snaphot_leaves() to go into an infinite loop hitting btree_trans_too_many_iters(). Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index def77f2d88024..dab12c14d1ade 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, * unlinked inodes in the snapshot leaves: */ *need_another_pass = true; - return 0; + goto out; } ret = 1; From b783fc4d1366658200bf759e1010655a9e2e145c Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Sun, 12 Nov 2023 00:38:41 +0000 Subject: [PATCH 236/560] bcachefs: Fix potential sleeping during mount During mount, bcachefs mount option processing may sleep while allocating a string buffer. Fix this by reference counting in order to take the atomic path. Signed-off-by: Daniel J Blueman Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_groups.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 1f334124055ba..4d0cb0ccff32f 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) case TARGET_DEV: { struct bch_dev *ca; + out->atomic++; rcu_read_lock(); ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) @@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) } rcu_read_unlock(); + out->atomic--; break; } case TARGET_GROUP: From 178c4873fd06c0361d260547ce70fcdc29b74809 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 14:15:35 -0500 Subject: [PATCH 237/560] bcachefs: Fix error path in bch2_mount() This fixes a bug discovered by generic/388 where sb->s_fs_info was NULL while the superblock was still active - the error path was entirely fubar, and was trying to do something unclear and unecessary. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 166d8d8abe683..8ef817304e4a2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1922,10 +1922,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, return dget(sb->s_root); err_put_super: - sb->s_fs_info = NULL; - c->vfs_sb = NULL; deactivate_locked_super(sb); - bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; - if (c) - c->vfs_sb = NULL; generic_shutdown_super(sb); - if (c) - bch2_fs_free(c); + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { From f42fa17883e73d8509fff5925781d4157db82f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 15:47:02 -0500 Subject: [PATCH 238/560] bcachefs: Fix missing transaction commit In may_delete_deleted_inode(), there's a corner case when a snapshot was taken while we had an unlinked inode: we don't want to delete the inode in the internal (shared) snapshot node, since it might have been reattached in a descendent snapshot. Instead we propagate the key to any snapshot leaves it doesn't exist in, so that it can be deleted there if necessary, and then clear the unlinked flag in the internal node. But we forgot to commit after clearing the unlinked flag, causing us to go into an infinite loop. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index dab12c14d1ade..c7849b0753e7a 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1169,8 +1169,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c) */ for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p, - &need_another_pass)); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); if (ret < 0) break; From 497c57a303590ea69ace23506e182c489e85694d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 17:02:08 -0500 Subject: [PATCH 239/560] bcachefs: Disable debug log statements The journal read path had some informational log statements preperatory for ZNS support - they're not of interest to users, so we can turn them off. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f4bc2cdbfdd79..786a092855092 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1079,6 +1079,12 @@ static void bch2_journal_read_device(struct closure *cl) if (ja->bucket_seq[ja->cur_idx] && ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); for (i = 0; i < 3; i++) { @@ -1086,6 +1092,7 @@ static void bch2_journal_read_device(struct closure *cl) bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); } +#endif ja->sectors_free = 0; } From 7125063fc6dfb77138b3a100527f3d8f9203ff2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 2 Mar 2023 23:52:57 -0500 Subject: [PATCH 240/560] bcachefs: Don't decrease BTREE_ITER_MAX when LOCKDEP=y Running with fewer max btree paths doesn't work anymore when replication is enabled - as we've added e.g. the freespace and bucket gens btrees, we naturally end up needing more btree paths. This is an issue with lockdep, we end up taking more locks than lockdep will track (the MAX_LOCKD_DEPTH constant). But bcachefs as merged does not yet support lockdep anyways, so we can leave that for later. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 6fbd4ef3df6b9..60453ba86c4b9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -363,11 +363,7 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); From db18ef1a02bc2cd924f86b2582302f2c2711b67c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 Nov 2023 21:17:19 -0500 Subject: [PATCH 241/560] bcachefs: Fix bch2_check_nlinks() for snapshots When searching the link table for the matching inode, we were searching for a specific - incorrect - snapshot ID as well, causing us to fail to find the inode. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9f3e9bd3d767a..e0c5cd119acc9 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r) const struct nlink *l = _l; const struct nlink *r = _r; - return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); + return cmp_int(l->inum, r->inum); } static void inc_link(struct bch_fs *c, struct snapshots_seen *s, From 62d73dfc44d54c97e0df6b947f0bccf6c4b8030e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 12 Nov 2023 21:46:52 -0500 Subject: [PATCH 242/560] bcachefs: Fix no_data_io mode checksum check In no_data_io mode, we expect data checksums to be wrong - don't want to spew the log with them. Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f02b3f7d26a01..d704a8f829c8a 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) * checksum: */ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return -EIO; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); From 61b85cb0d773115d9a4b20c3e67286844cf73f34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 18:52:22 -0500 Subject: [PATCH 243/560] bcachefs: six locks: Fix lost wakeup In percpu reader mode, trylock() for read had a lost wakeup: on failure to get the lock, we may have caused a writer to fail to get the lock, because we temporarily elevated the reader count. We need to check for waiters after decrementing the read count - not before. Signed-off-by: Kent Overstreet --- fs/bcachefs/six.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index b775cf0fb7cbf..97790445e67ad 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, this_cpu_sub(*lock->readers, !ret); preempt_enable(); - if (!ret && (old & SIX_LOCK_WAITING_write)) - ret = -1 - SIX_LOCK_write; + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } } else if (type == SIX_LOCK_write && lock->readers) { if (try) { atomic_add(SIX_LOCK_HELD_write, &lock->state); From 5dd9ad32d7758b1a76742f394acf0eb3ac8a636a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 27 Sep 2023 10:29:02 +0200 Subject: [PATCH 244/560] xen/events: drop xen_allocate_irqs_dynamic() Instead of having a common function for allocating a single IRQ or a consecutive number of IRQs, split up the functionality into the callers of xen_allocate_irqs_dynamic(). This allows to handle any allocation error in xen_irq_init() gracefully instead of panicing the system. Let xen_irq_init() return the irq_info pointer or NULL in case of an allocation error. Additionally set the IRQ into irq_info already at allocation time, as otherwise the IRQ would be '0' (which is a valid IRQ number) until being set. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 74 +++++++++++++++++++------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index a810e8904fbf7..75f8d1dcbbb7b 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -304,6 +304,13 @@ static void channels_on_cpu_inc(struct irq_info *info) info->is_accounted = 1; } +static void xen_irq_free_desc(unsigned int irq) +{ + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq >= nr_legacy_irqs()) + irq_free_desc(irq); +} + static void delayed_free_irq(struct work_struct *work) { struct irq_info *info = container_of(to_rcu_work(work), struct irq_info, @@ -315,9 +322,7 @@ static void delayed_free_irq(struct work_struct *work) kfree(info); - /* Legacy IRQ descriptors are managed by the arch. */ - if (irq >= nr_legacy_irqs()) - irq_free_desc(irq); + xen_irq_free_desc(irq); } /* Constructors for packed IRQ information. */ @@ -332,7 +337,6 @@ static int xen_irq_info_common_setup(struct irq_info *info, BUG_ON(info->type != IRQT_UNBOUND && info->type != type); info->type = type; - info->irq = irq; info->evtchn = evtchn; info->cpu = cpu; info->mask_reason = EVT_MASK_REASON_EXPLICIT; @@ -733,47 +737,45 @@ void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) } EXPORT_SYMBOL_GPL(xen_irq_lateeoi); -static void xen_irq_init(unsigned irq) +static struct irq_info *xen_irq_init(unsigned int irq) { struct irq_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info == NULL) - panic("Unable to allocate metadata for IRQ%d\n", irq); + if (info) { + info->irq = irq; + info->type = IRQT_UNBOUND; + info->refcnt = -1; + INIT_RCU_WORK(&info->rwork, delayed_free_irq); - info->type = IRQT_UNBOUND; - info->refcnt = -1; - INIT_RCU_WORK(&info->rwork, delayed_free_irq); + set_info_for_irq(irq, info); + /* + * Interrupt affinity setting can be immediate. No point + * in delaying it until an interrupt is handled. + */ + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_info_for_irq(irq, info); - /* - * Interrupt affinity setting can be immediate. No point - * in delaying it until an interrupt is handled. - */ - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + INIT_LIST_HEAD(&info->eoi_list); + list_add_tail(&info->list, &xen_irq_list_head); + } - INIT_LIST_HEAD(&info->eoi_list); - list_add_tail(&info->list, &xen_irq_list_head); + return info; } -static int __must_check xen_allocate_irqs_dynamic(int nvec) +static inline int __must_check xen_allocate_irq_dynamic(void) { - int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + int irq = irq_alloc_desc_from(0, -1); if (irq >= 0) { - for (i = 0; i < nvec; i++) - xen_irq_init(irq + i); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } } return irq; } -static inline int __must_check xen_allocate_irq_dynamic(void) -{ - - return xen_allocate_irqs_dynamic(1); -} - static int __must_check xen_allocate_irq_gsi(unsigned gsi) { int irq; @@ -793,7 +795,10 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - xen_irq_init(irq); + if (!xen_irq_init(irq)) { + xen_irq_free_desc(irq); + irq = -1; + } return irq; } @@ -963,6 +968,11 @@ static void __unbind_from_irq(unsigned int irq) evtchn_port_t evtchn = evtchn_from_irq(irq); struct irq_info *info = info_for_irq(irq); + if (!info) { + xen_irq_free_desc(irq); + return; + } + if (info->refcnt > 0) { info->refcnt--; if (info->refcnt != 0) @@ -1101,11 +1111,14 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irqs_dynamic(nvec); + irq = irq_alloc_descs(-1, 0, nvec, -1); if (irq < 0) goto out; for (i = 0; i < nvec; i++) { + if (!xen_irq_init(irq + i)) + goto error_irq; + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, @@ -1730,6 +1743,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); + info->irq = irq; (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); From 3fcdaf3d7634338c3f5cbfa7451eb0b6b0024844 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 09:09:52 +0200 Subject: [PATCH 245/560] xen/events: modify internal [un]bind interfaces Modify the internal bind- and unbind-interfaces to take a struct irq_info parameter. When allocating a new IRQ pass the pointer from the allocating function further up. This will reduce the number of info_for_irq() calls and make the code more efficient. Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 259 +++++++++++++++---------------- 1 file changed, 124 insertions(+), 135 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 75f8d1dcbbb7b..88f0c80d0f878 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -327,7 +327,6 @@ static void delayed_free_irq(struct work_struct *work) /* Constructors for packed IRQ information. */ static int xen_irq_info_common_setup(struct irq_info *info, - unsigned irq, enum xen_irq_type type, evtchn_port_t evtchn, unsigned short cpu) @@ -342,23 +341,22 @@ static int xen_irq_info_common_setup(struct irq_info *info, info->mask_reason = EVT_MASK_REASON_EXPLICIT; raw_spin_lock_init(&info->lock); - ret = set_evtchn_to_irq(evtchn, irq); + ret = set_evtchn_to_irq(evtchn, info->irq); if (ret < 0) return ret; - irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + irq_clear_status_flags(info->irq, IRQ_NOREQUEST | IRQ_NOAUTOEN); return xen_evtchn_port_setup(evtchn); } -static int xen_irq_info_evtchn_setup(unsigned irq, +static int xen_irq_info_evtchn_setup(struct irq_info *info, evtchn_port_t evtchn, struct xenbus_device *dev) { - struct irq_info *info = info_for_irq(irq); int ret; - ret = xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); + ret = xen_irq_info_common_setup(info, IRQT_EVTCHN, evtchn, 0); info->u.interdomain = dev; if (dev) atomic_inc(&dev->event_channels); @@ -366,50 +364,37 @@ static int xen_irq_info_evtchn_setup(unsigned irq, return ret; } -static int xen_irq_info_ipi_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - enum ipi_vector ipi) +static int xen_irq_info_ipi_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, enum ipi_vector ipi) { - struct irq_info *info = info_for_irq(irq); - info->u.ipi = ipi; - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + per_cpu(ipi_to_irq, cpu)[ipi] = info->irq; per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn; - return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_IPI, evtchn, 0); } -static int xen_irq_info_virq_setup(unsigned cpu, - unsigned irq, - evtchn_port_t evtchn, - unsigned virq) +static int xen_irq_info_virq_setup(struct irq_info *info, unsigned int cpu, + evtchn_port_t evtchn, unsigned int virq) { - struct irq_info *info = info_for_irq(irq); - info->u.virq = virq; - per_cpu(virq_to_irq, cpu)[virq] = irq; + per_cpu(virq_to_irq, cpu)[virq] = info->irq; - return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_VIRQ, evtchn, 0); } -static int xen_irq_info_pirq_setup(unsigned irq, - evtchn_port_t evtchn, - unsigned pirq, - unsigned gsi, - uint16_t domid, - unsigned char flags) +static int xen_irq_info_pirq_setup(struct irq_info *info, evtchn_port_t evtchn, + unsigned int pirq, unsigned int gsi, + uint16_t domid, unsigned char flags) { - struct irq_info *info = info_for_irq(irq); - info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.domid = domid; info->u.pirq.flags = flags; - return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); + return xen_irq_info_common_setup(info, IRQT_PIRQ, evtchn, 0); } static void xen_irq_info_cleanup(struct irq_info *info) @@ -453,20 +438,16 @@ int irq_evtchn_from_virq(unsigned int cpu, unsigned int virq, return irq; } -static enum ipi_vector ipi_from_irq(unsigned irq) +static enum ipi_vector ipi_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_IPI); return info->u.ipi; } -static unsigned virq_from_irq(unsigned irq) +static unsigned int virq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_VIRQ); @@ -533,13 +514,9 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, +static void bind_evtchn_to_cpu(struct irq_info *info, unsigned int cpu, bool force_affinity) { - struct irq_info *info = evtchn_to_info(evtchn); - - BUG_ON(info == NULL); - if (IS_ENABLED(CONFIG_SMP) && force_affinity) { struct irq_data *data = irq_get_irq_data(info->irq); @@ -547,7 +524,7 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, irq_data_update_effective_affinity(data, cpumask_of(cpu)); } - xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu); + xen_evtchn_port_bind_to_cpu(info->evtchn, cpu, info->cpu); channels_on_cpu_dec(info); info->cpu = cpu; @@ -762,23 +739,24 @@ static struct irq_info *xen_irq_init(unsigned int irq) return info; } -static inline int __must_check xen_allocate_irq_dynamic(void) +static struct irq_info *xen_allocate_irq_dynamic(void) { int irq = irq_alloc_desc_from(0, -1); + struct irq_info *info = NULL; if (irq >= 0) { - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } } - return irq; + return info; } -static int __must_check xen_allocate_irq_gsi(unsigned gsi) +static struct irq_info *xen_allocate_irq_gsi(unsigned int gsi) { int irq; + struct irq_info *info; /* * A PV guest has no concept of a GSI (since it has no ACPI @@ -795,18 +773,15 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) else irq = irq_alloc_desc_at(gsi, -1); - if (!xen_irq_init(irq)) { + info = xen_irq_init(irq); + if (!info) xen_irq_free_desc(irq); - irq = -1; - } - return irq; + return info; } -static void xen_free_irq(unsigned irq) +static void xen_free_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - if (WARN_ON(!info)) return; @@ -897,7 +872,7 @@ static unsigned int __startup_pirq(unsigned int irq) goto err; info->evtchn = evtchn; - bind_evtchn_to_cpu(evtchn, 0, false); + bind_evtchn_to_cpu(info, 0, false); rc = xen_evtchn_port_setup(evtchn); if (rc) @@ -963,10 +938,9 @@ int xen_irq_from_gsi(unsigned gsi) } EXPORT_SYMBOL_GPL(xen_irq_from_gsi); -static void __unbind_from_irq(unsigned int irq) +static void __unbind_from_irq(struct irq_info *info, unsigned int irq) { - evtchn_port_t evtchn = evtchn_from_irq(irq); - struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn; if (!info) { xen_irq_free_desc(irq); @@ -979,6 +953,8 @@ static void __unbind_from_irq(unsigned int irq) return; } + evtchn = info->evtchn; + if (VALID_EVTCHN(evtchn)) { unsigned int cpu = info->cpu; struct xenbus_device *dev; @@ -988,11 +964,11 @@ static void __unbind_from_irq(unsigned int irq) switch (info->type) { case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + per_cpu(virq_to_irq, cpu)[virq_from_irq(info)] = -1; break; case IRQT_IPI: - per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; - per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(irq)] = 0; + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(info)] = -1; + per_cpu(ipi_to_evtchn, cpu)[ipi_from_irq(info)] = 0; break; case IRQT_EVTCHN: dev = info->u.interdomain; @@ -1006,7 +982,7 @@ static void __unbind_from_irq(unsigned int irq) xen_irq_info_cleanup(info); } - xen_free_irq(irq); + xen_free_irq(info); } /* @@ -1022,24 +998,24 @@ static void __unbind_from_irq(unsigned int irq) int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { - int irq; + struct irq_info *info; struct physdev_irq irq_op; int ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_irq_from_gsi(gsi); - if (irq != -1) { + ret = xen_irq_from_gsi(gsi); + if (ret != -1) { pr_info("%s: returning irq %d for gsi %u\n", - __func__, irq, gsi); + __func__, ret, gsi); goto out; } - irq = xen_allocate_irq_gsi(gsi); - if (irq < 0) + info = xen_allocate_irq_gsi(gsi); + if (!info) goto out; - irq_op.irq = irq; + irq_op.irq = info->irq; irq_op.vector = 0; /* Only the privileged domain can do this. For non-priv, the pcifront @@ -1047,20 +1023,19 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * this in the priv domain. */ if (xen_initial_domain() && HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { - xen_free_irq(irq); - irq = -ENOSPC; + xen_free_irq(info); + ret = -ENOSPC; goto out; } - ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, + ret = xen_irq_info_pirq_setup(info, 0, pirq, gsi, DOMID_SELF, shareable ? PIRQ_SHAREABLE : 0); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } - pirq_query_unmask(irq); + pirq_query_unmask(info->irq); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. @@ -1077,16 +1052,18 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, * is the right choice either way. */ if (shareable) - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_fasteoi_irq, name); else - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + irq_set_chip_and_handler_name(info->irq, &xen_pirq_chip, handle_edge_irq, name); + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } #ifdef CONFIG_PCI_MSI @@ -1108,6 +1085,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, int pirq, int nvec, const char *name, domid_t domid) { int i, irq, ret; + struct irq_info *info; mutex_lock(&irq_mapping_update_lock); @@ -1116,12 +1094,13 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, goto out; for (i = 0; i < nvec; i++) { - if (!xen_irq_init(irq + i)) + info = xen_irq_init(irq + i); + if (!info) goto error_irq; irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); - ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + ret = xen_irq_info_pirq_setup(info, 0, pirq + i, 0, domid, i == 0 ? 0 : PIRQ_MSI_GROUP); if (ret < 0) goto error_irq; @@ -1133,9 +1112,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, out: mutex_unlock(&irq_mapping_update_lock); return irq; + error_irq: - while (nvec--) - __unbind_from_irq(irq + nvec); + while (nvec--) { + info = info_for_irq(irq + nvec); + __unbind_from_irq(info, irq + nvec); + } mutex_unlock(&irq_mapping_update_lock); return ret; } @@ -1171,7 +1153,7 @@ int xen_destroy_irq(int irq) } } - xen_free_irq(irq); + xen_free_irq(info); out: mutex_unlock(&irq_mapping_update_lock); @@ -1187,8 +1169,7 @@ EXPORT_SYMBOL_GPL(xen_pirq_from_irq); static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, struct xenbus_device *dev) { - int irq; - int ret; + int ret = -ENOMEM; struct irq_info *info; if (evtchn >= xen_evtchn_max_channels()) @@ -1199,17 +1180,16 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, info = evtchn_to_info(evtchn); if (!info) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, chip, + irq_set_chip_and_handler_name(info->irq, chip, handle_edge_irq, "event"); - ret = xen_irq_info_evtchn_setup(irq, evtchn, dev); + ret = xen_irq_info_evtchn_setup(info, evtchn, dev); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* @@ -1219,17 +1199,17 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip, * affinity setting is not invoked on them so nothing would * bind the channel. */ - bind_evtchn_to_cpu(evtchn, 0, false); - } else { - if (!WARN_ON(info->type != IRQT_EVTCHN)) - info->refcnt++; - irq = info->irq; + bind_evtchn_to_cpu(info, 0, false); + } else if (!WARN_ON(info->type != IRQT_EVTCHN)) { + info->refcnt++; } + ret = info->irq; + out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } int bind_evtchn_to_irq(evtchn_port_t evtchn) @@ -1248,18 +1228,19 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; - int ret, irq; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(ipi_to_irq, cpu)[ipi]; + ret = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "ipi"); bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1268,25 +1249,25 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + ret = xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } /* * Force the affinity mask to the target CPU so proc shows * the correct target. */ - bind_evtchn_to_cpu(evtchn, cpu, true); + bind_evtchn_to_cpu(info, cpu, true); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_IPI); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static int bind_interdomain_evtchn_to_irq_chip(struct xenbus_device *dev, @@ -1354,22 +1335,23 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn = 0; - int irq, ret; + struct irq_info *info; + int ret; mutex_lock(&irq_mapping_update_lock); - irq = per_cpu(virq_to_irq, cpu)[virq]; + ret = per_cpu(virq_to_irq, cpu)[virq]; - if (irq == -1) { - irq = xen_allocate_irq_dynamic(); - if (irq < 0) + if (ret == -1) { + info = xen_allocate_irq_dynamic(); + if (!info) goto out; if (percpu) - irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + irq_set_chip_and_handler_name(info->irq, &xen_percpu_chip, handle_percpu_irq, "virq"); else - irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(info->irq, &xen_dynamic_chip, handle_edge_irq, "virq"); bind_virq.virq = virq; @@ -1384,10 +1366,9 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) BUG_ON(ret < 0); } - ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + ret = xen_irq_info_virq_setup(info, cpu, evtchn, virq); if (ret < 0) { - __unbind_from_irq(irq); - irq = ret; + __unbind_from_irq(info, info->irq); goto out; } @@ -1395,22 +1376,26 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) * Force the affinity mask for percpu interrupts so proc * shows the correct target. */ - bind_evtchn_to_cpu(evtchn, cpu, percpu); + bind_evtchn_to_cpu(info, cpu, percpu); + ret = info->irq; } else { - struct irq_info *info = info_for_irq(irq); + info = info_for_irq(ret); WARN_ON(info == NULL || info->type != IRQT_VIRQ); } out: mutex_unlock(&irq_mapping_update_lock); - return irq; + return ret; } static void unbind_from_irq(unsigned int irq) { + struct irq_info *info; + mutex_lock(&irq_mapping_update_lock); - __unbind_from_irq(irq); + info = info_for_irq(irq); + __unbind_from_irq(info, irq); mutex_unlock(&irq_mapping_update_lock); } @@ -1744,11 +1729,11 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) BUG_ON(info->type == IRQT_UNBOUND); info->irq = irq; - (void)xen_irq_info_evtchn_setup(irq, evtchn, NULL); + (void)xen_irq_info_evtchn_setup(info, evtchn, NULL); mutex_unlock(&irq_mapping_update_lock); - bind_evtchn_to_cpu(evtchn, info->cpu, false); + bind_evtchn_to_cpu(info, info->cpu, false); /* Unmask the event channel. */ enable_irq(irq); @@ -1782,7 +1767,7 @@ static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) * it, but don't do the xenlinux-level rebind in that case. */ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu, false); + bind_evtchn_to_cpu(info, tcpu, false); do_unmask(info, EVT_MASK_REASON_TEMPORARY); @@ -1933,7 +1918,7 @@ static void restore_pirqs(void) if (rc) { pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - xen_free_irq(irq); + xen_free_irq(info); continue; } @@ -1947,13 +1932,15 @@ static void restore_cpu_virqs(unsigned int cpu) { struct evtchn_bind_virq bind_virq; evtchn_port_t evtchn; + struct irq_info *info; int virq, irq; for (virq = 0; virq < NR_VIRQS; virq++) { if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(virq_from_irq(irq) != virq); + BUG_ON(virq_from_irq(info) != virq); /* Get a new binding from Xen. */ bind_virq.virq = virq; @@ -1964,9 +1951,9 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + xen_irq_info_virq_setup(info, cpu, evtchn, virq); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } @@ -1974,13 +1961,15 @@ static void restore_cpu_ipis(unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; evtchn_port_t evtchn; + struct irq_info *info; int ipi, irq; for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) continue; + info = info_for_irq(irq); - BUG_ON(ipi_from_irq(irq) != ipi); + BUG_ON(ipi_from_irq(info) != ipi); /* Get a new binding from Xen. */ bind_ipi.vcpu = xen_vcpu_nr(cpu); @@ -1990,9 +1979,9 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + xen_irq_info_ipi_setup(info, cpu, evtchn, ipi); /* The affinity mask is still valid */ - bind_evtchn_to_cpu(evtchn, cpu, false); + bind_evtchn_to_cpu(info, cpu, false); } } From cee96422e863f0b0e9d3d0c2d617271ef2255858 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 28 Sep 2023 11:25:32 +0200 Subject: [PATCH 246/560] xen/events: remove some info_for_irq() calls in pirq handling Instead of the IRQ number user the struct irq_info pointer as parameter in the internal pirq related functions. This allows to drop some calls of info_for_irq(). Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 117 ++++++++++++++++++------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 88f0c80d0f878..f5edb9e27e3ca 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -174,7 +174,7 @@ static int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; #endif -static bool (*pirq_needs_eoi)(unsigned irq); +static bool (*pirq_needs_eoi)(struct irq_info *info); #define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) #define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) @@ -190,7 +190,6 @@ static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); -static void disable_dynirq(struct irq_data *data); static DEFINE_PER_CPU(unsigned int, irq_epoch); @@ -454,10 +453,8 @@ static unsigned int virq_from_irq(struct irq_info *info) return info->u.virq; } -static unsigned pirq_from_irq(unsigned irq) +static unsigned int pirq_from_irq(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); - BUG_ON(info == NULL); BUG_ON(info->type != IRQT_PIRQ); @@ -500,15 +497,14 @@ static void do_unmask(struct irq_info *info, u8 reason) } #ifdef CONFIG_X86 -static bool pirq_check_eoi_map(unsigned irq) +static bool pirq_check_eoi_map(struct irq_info *info) { - return test_bit(pirq_from_irq(irq), pirq_eoi_map); + return test_bit(pirq_from_irq(info), pirq_eoi_map); } #endif -static bool pirq_needs_eoi_flag(unsigned irq) +static bool pirq_needs_eoi_flag(struct irq_info *info) { - struct irq_info *info = info_for_irq(irq); BUG_ON(info->type != IRQT_PIRQ); return info->u.pirq.flags & PIRQ_NEEDS_EOI; @@ -802,14 +798,11 @@ static void event_handler_exit(struct irq_info *info) clear_evtchn(info->evtchn); } -static void pirq_query_unmask(int irq) +static void pirq_query_unmask(struct irq_info *info) { struct physdev_irq_status_query irq_status; - struct irq_info *info = info_for_irq(irq); - BUG_ON(info->type != IRQT_PIRQ); - - irq_status.irq = pirq_from_irq(irq); + irq_status.irq = pirq_from_irq(info); if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) irq_status.flags = 0; @@ -818,56 +811,76 @@ static void pirq_query_unmask(int irq) info->u.pirq.flags |= PIRQ_NEEDS_EOI; } -static void eoi_pirq(struct irq_data *data) +static void do_eoi_pirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; - struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + struct physdev_eoi eoi = { .irq = pirq_from_irq(info) }; int rc = 0; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(info->evtchn)) return; event_handler_exit(info); - if (pirq_needs_eoi(data->irq)) { + if (pirq_needs_eoi(info)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); WARN_ON(rc); } } +static void eoi_pirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + do_eoi_pirq(info); +} + +static void do_disable_dynirq(struct irq_info *info) +{ + if (VALID_EVTCHN(info->evtchn)) + do_mask(info, EVT_MASK_REASON_EXPLICIT); +} + +static void disable_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + + if (info) + do_disable_dynirq(info); +} + static void mask_ack_pirq(struct irq_data *data) { - disable_dynirq(data); - eoi_pirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_eoi_pirq(info); + } } -static unsigned int __startup_pirq(unsigned int irq) +static unsigned int __startup_pirq(struct irq_info *info) { struct evtchn_bind_pirq bind_pirq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + evtchn_port_t evtchn = info->evtchn; int rc; - BUG_ON(info->type != IRQT_PIRQ); - if (VALID_EVTCHN(evtchn)) goto out; - bind_pirq.pirq = pirq_from_irq(irq); + bind_pirq.pirq = pirq_from_irq(info); /* NB. We are happy to share unless we are probing. */ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? BIND_PIRQ__WILL_SHARE : 0; rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (rc != 0) { - pr_warn("Failed to obtain physical IRQ %d\n", irq); + pr_warn("Failed to obtain physical IRQ %d\n", info->irq); return 0; } evtchn = bind_pirq.port; - pirq_query_unmask(irq); + pirq_query_unmask(info); - rc = set_evtchn_to_irq(evtchn, irq); + rc = set_evtchn_to_irq(evtchn, info->irq); if (rc) goto err; @@ -881,26 +894,28 @@ static unsigned int __startup_pirq(unsigned int irq) out: do_unmask(info, EVT_MASK_REASON_EXPLICIT); - eoi_pirq(irq_get_irq_data(irq)); + do_eoi_pirq(info); return 0; err: - pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", info->irq, + rc); xen_evtchn_close(evtchn); return 0; } static unsigned int startup_pirq(struct irq_data *data) { - return __startup_pirq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + + return __startup_pirq(info); } static void shutdown_pirq(struct irq_data *data) { - unsigned int irq = data->irq; - struct irq_info *info = info_for_irq(irq); - evtchn_port_t evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info->evtchn; BUG_ON(info->type != IRQT_PIRQ); @@ -1035,7 +1050,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - pirq_query_unmask(info->irq); + pirq_query_unmask(info); /* We try to use the handler with the appropriate semantic for the * type of interrupt: if the interrupt is an edge triggered * interrupt we use handle_edge_irq. @@ -1162,7 +1177,9 @@ int xen_destroy_irq(int irq) int xen_pirq_from_irq(unsigned irq) { - return pirq_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + + return pirq_from_irq(info); } EXPORT_SYMBOL_GPL(xen_pirq_from_irq); @@ -1824,28 +1841,30 @@ static void enable_dynirq(struct irq_data *data) do_unmask(info, EVT_MASK_REASON_EXPLICIT); } -static void disable_dynirq(struct irq_data *data) +static void do_ack_dynirq(struct irq_info *info) { - struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; + evtchn_port_t evtchn = info->evtchn; if (VALID_EVTCHN(evtchn)) - do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); } static void ack_dynirq(struct irq_data *data) { struct irq_info *info = info_for_irq(data->irq); - evtchn_port_t evtchn = info ? info->evtchn : 0; - if (VALID_EVTCHN(evtchn)) - event_handler_exit(info); + if (info) + do_ack_dynirq(info); } static void mask_ack_dynirq(struct irq_data *data) { - disable_dynirq(data); - ack_dynirq(data); + struct irq_info *info = info_for_irq(data->irq); + + if (info) { + do_disable_dynirq(info); + do_ack_dynirq(info); + } } static void lateeoi_ack_dynirq(struct irq_data *data) @@ -1924,7 +1943,7 @@ static void restore_pirqs(void) printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); - __startup_pirq(irq); + __startup_pirq(info); } } From 9e2e7efbbbff69d8340abb56d375dd79d1f5770f Mon Sep 17 00:00:00 2001 From: Johnathan Mantey Date: Mon, 13 Nov 2023 08:30:29 -0800 Subject: [PATCH 247/560] Revert ncsi: Propagate carrier gain/loss events to the NCSI controller This reverts commit 3780bb29311eccb7a1c9641032a112eed237f7e3. The cited commit introduced unwanted behavior. The intent for the commit was to be able to detect carrier loss/gain for just the NIC connected to the BMC. The unwanted effect is a carrier loss for auxiliary paths also causes the BMC to lose carrier. The BMC never regains carrier despite the secondary NIC regaining a link. This change, when merged, needs to be backported to stable kernels. 5.4-stable, 5.10-stable, 5.15-stable, 6.1-stable, 6.5-stable Fixes: 3780bb29311e ("ncsi: Propagate carrier gain/loss events to the NCSI controller") CC: stable@vger.kernel.org Signed-off-by: Johnathan Mantey Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f8854bff286cb..62fb1031763d1 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -89,11 +89,6 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, if ((had_link == has_link) || chained) return 0; - if (had_link) - netif_carrier_off(ndp->ndev.dev); - else - netif_carrier_on(ndp->ndev.dev); - if (!ndp->multi_package && !nc->package->multi_channel) { if (had_link) { ndp->flags |= NCSI_DEV_RESHUFFLE; From efc0c8363bc6dab2cd540acc886e6097deee8bb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Mon, 13 Nov 2023 17:44:12 +0100 Subject: [PATCH 248/560] dt-bindings: net: ethernet-controller: Fix formatting error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When moving the *-internal-delay-ps properties to only apply for RGMII interface modes there where a typo in the text formatting. Signed-off-by: Niklas Söderlund Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- .../devicetree/bindings/net/ethernet-controller.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 9f6a5ccbcefef..d14d123ad7a02 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -275,12 +275,12 @@ allOf: properties: rx-internal-delay-ps: description: - RGMII Receive Clock Delay defined in pico seconds.This is used for + RGMII Receive Clock Delay defined in pico seconds. This is used for controllers that have configurable RX internal delays. If this property is present then the MAC applies the RX delay. tx-internal-delay-ps: description: - RGMII Transmit Clock Delay defined in pico seconds.This is used for + RGMII Transmit Clock Delay defined in pico seconds. This is used for controllers that have configurable TX internal delays. If this property is present then the MAC applies the TX delay. From 00a614fc3527ba8846e6d823013bcf724e9a68f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 Oct 2023 17:26:23 +0200 Subject: [PATCH 249/560] accel/ivpu: avoid build failure with CONFIG_PM=n The usage count of struct dev_pm_info is an implementation detail that is only available if CONFIG_PM is enabled, so printing it in a debug message causes a build failure in configurations without PM: In file included from include/linux/device.h:15, from include/linux/pci.h:37, from drivers/accel/ivpu/ivpu_pm.c:8: drivers/accel/ivpu/ivpu_pm.c: In function 'ivpu_rpm_get_if_active': drivers/accel/ivpu/ivpu_pm.c:254:51: error: 'struct dev_pm_info' has no member named 'usage_count' 254 | atomic_read(&vdev->drm.dev->power.usage_count)); | ^ include/linux/dev_printk.h:129:48: note: in definition of macro 'dev_printk' 129 | _dev_printk(level, dev, fmt, ##__VA_ARGS__); \ | ^~~~~~~~~~~ drivers/accel/ivpu/ivpu_drv.h:75:17: note: in expansion of macro 'dev_dbg' 75 | dev_dbg((vdev)->drm.dev, "[%s] " fmt, #type, ##args); \ | ^~~~~~~ drivers/accel/ivpu/ivpu_pm.c:253:9: note: in expansion of macro 'ivpu_dbg' 253 | ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", | ^~~~~~~~ The print message does not seem essential, so the easiest workaround is to just remove it. Fixes: c39dc15191c4 ("accel/ivpu: Read clock rate only if device is up") Signed-off-by: Arnd Bergmann Reviewed-by: Stanislaw Gruszka Signed-off-by: Stanislaw Gruszka Link: https://patchwork.freedesktop.org/patch/msgid/20231027152633.528490-1-arnd@kernel.org (cherry picked from commit 1470acbef122c7e2e588f6346ce459c26d0568a2) Signed-off-by: Jacek Lawrynowicz --- drivers/accel/ivpu/ivpu_pm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 0ace218783c81..e9b16cbc26f49 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -250,9 +250,6 @@ int ivpu_rpm_get_if_active(struct ivpu_device *vdev) { int ret; - ivpu_dbg(vdev, RPM, "rpm_get_if_active count %d\n", - atomic_read(&vdev->drm.dev->power.usage_count)); - ret = pm_runtime_get_if_active(vdev->drm.dev, false); drm_WARN_ON(&vdev->drm, ret < 0); From 674e318089468ece99aef4796eaef7add57f36b2 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 14 Nov 2023 09:56:18 +0200 Subject: [PATCH 250/560] net: Fix undefined behavior in netdev name allocation Cited commit removed the strscpy() call and kept the snprintf() only. It is common to use 'dev->name' as the format string before a netdev is registered, this results in 'res' and 'name' pointers being equal. According to POSIX, if copying takes place between objects that overlap as a result of a call to sprintf() or snprintf(), the results are undefined. Add back the strscpy() and use 'buf' as an intermediate buffer. Fixes: 7ad17b04dc7b ("net: trust the bitmap in __dev_alloc_name()") Cc: Jakub Kicinski Reviewed-by: Vlad Buslov Signed-off-by: Gal Pressman Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0d548431f3fad..af53f6d838ce9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1119,7 +1119,9 @@ static int __dev_alloc_name(struct net *net, const char *name, char *res) if (i == max_netdevices) return -ENFILE; - snprintf(res, IFNAMSIZ, name, i); + /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */ + strscpy(buf, name, IFNAMSIZ); + snprintf(res, IFNAMSIZ, buf, i); return i; } From 9fadd4509966e375952f31ae954ab5eae76f90fe Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Thu, 2 Nov 2023 12:52:18 -0700 Subject: [PATCH 251/560] MAINTAINERS: Remove stale entry for SBL platform driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maurice is no longer with Intel and his e-mail address is no longer active. Remove the stale entry from Slim boot loader section. Signed-off-by: Jithu Joseph Link: https://lore.kernel.org/r/20231102195218.143440-1-jithu.joseph@intel.com Signed-off-by: Ilpo Järvinen --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..0666ddb655521 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11025,7 +11025,6 @@ F: drivers/net/wireless/intel/iwlwifi/ INTEL WMI SLIM BOOTLOADER (SBL) FIRMWARE UPDATE DRIVER M: Jithu Joseph -R: Maurice Ma S: Maintained W: https://slimbootloader.github.io/security/firmware-update.html F: drivers/platform/x86/intel/wmi/sbl-fw-update.c From 7a3c36eef9a5d13b16aa954da54224c9c6bed339 Mon Sep 17 00:00:00 2001 From: Stuart Hayhurst Date: Tue, 14 Nov 2023 11:38:08 +0000 Subject: [PATCH 252/560] platform/x86: ideapad-laptop: Set max_brightness before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit max_brightness is used in ideapad_kbd_bl_brightness_get() before it's set, causing ideapad_kbd_bl_brightness_get() to return -EINVAL sometimes. Fixes: ecaa1867b524 ("platform/x86: ideapad-laptop: Add support for keyboard backlights using KBLC ACPI symbol") Signed-off-by: Stuart Hayhurst Cc: stable@vger.kernel.org Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231114114055.6220-2-stuart.a.hayhurst@gmail.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index ac037540acfc6..88eefccb6ed27 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1425,18 +1425,17 @@ static int ideapad_kbd_bl_init(struct ideapad_private *priv) if (WARN_ON(priv->kbd_bl.initialized)) return -EEXIST; - brightness = ideapad_kbd_bl_brightness_get(priv); - if (brightness < 0) - return brightness; - - priv->kbd_bl.last_brightness = brightness; - if (ideapad_kbd_bl_check_tristate(priv->kbd_bl.type)) { priv->kbd_bl.led.max_brightness = 2; } else { priv->kbd_bl.led.max_brightness = 1; } + brightness = ideapad_kbd_bl_brightness_get(priv); + if (brightness < 0) + return brightness; + + priv->kbd_bl.last_brightness = brightness; priv->kbd_bl.led.name = "platform::" LED_FUNCTION_KBD_BACKLIGHT; priv->kbd_bl.led.brightness_get = ideapad_kbd_bl_led_cdev_brightness_get; priv->kbd_bl.led.brightness_set_blocking = ideapad_kbd_bl_led_cdev_brightness_set; From c5dbf04160005e07e8ca7232a7faa77ab1547ae0 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 13 Nov 2023 12:07:37 -0800 Subject: [PATCH 253/560] platform/x86: hp-bioscfg: Simplify return check in hp_add_other_attributes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All cases in switch-case have a same goto on error, move the return check out of the switch. This is a cleanup. Signed-off-by: Harshit Mogalapalli Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231113200742.3593548-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-bioscfg/bioscfg.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c index 5798b49ddaba9..10676e1abc28c 100644 --- a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c +++ b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c @@ -630,21 +630,19 @@ static int hp_add_other_attributes(int attr_type) switch (attr_type) { case HPWMI_SECURE_PLATFORM_TYPE: ret = hp_populate_secure_platform_data(attr_name_kobj); - if (ret) - goto err_other_attr_init; break; case HPWMI_SURE_START_TYPE: ret = hp_populate_sure_start_data(attr_name_kobj); - if (ret) - goto err_other_attr_init; break; default: ret = -EINVAL; - goto err_other_attr_init; } + if (ret) + goto err_other_attr_init; + mutex_unlock(&bioscfg_drv.mutex); return 0; From 5736aa9537c9b8927dec32d3d47c8c31fe560f62 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 13 Nov 2023 12:07:38 -0800 Subject: [PATCH 254/560] platform/x86: hp-bioscfg: move mutex_lock() down in hp_add_other_attributes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit attr_name_kobj's memory allocation is done with mutex_lock() held, this is not needed. Move allocation outside of mutex_lock() so unlock is not needed when allocation fails. Suggested-by: Ilpo Järvinen Signed-off-by: Harshit Mogalapalli Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231113200742.3593548-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-bioscfg/bioscfg.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c index 10676e1abc28c..a3599498c4e81 100644 --- a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c +++ b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c @@ -592,13 +592,11 @@ static int hp_add_other_attributes(int attr_type) int ret; char *attr_name; - mutex_lock(&bioscfg_drv.mutex); - attr_name_kobj = kzalloc(sizeof(*attr_name_kobj), GFP_KERNEL); - if (!attr_name_kobj) { - ret = -ENOMEM; - goto err_other_attr_init; - } + if (!attr_name_kobj) + return -ENOMEM; + + mutex_lock(&bioscfg_drv.mutex); /* Check if attribute type is supported */ switch (attr_type) { From 9e88b493157a9901fa498f23cc3c9ab82b43ce83 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 15 Nov 2023 13:36:25 +0100 Subject: [PATCH 255/560] ALSA: hda: i915: Alays handle -EPROBE_DEFER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that even if the comment says that the driver can load fine, it's not really the case and no codecs are detected. Specifically for -EPROBE_DEFER, always fail the probe. This fixes a regression when HDA-intel is loaded before i915. Reported-by: Ville Syrjälä Closes: https://lore.kernel.org/r/ZVNUxZzCGcxQzqJX@intel.com Signed-off-by: Maarten Lankhorst Tested-by: Kai Vehmanen Fixes: e6d0c13e9f46 ("ALSA: hda: i915: Remove extra argument from snd_hdac_i915_init") Link: https://gitlab.freedesktop.org/drm/intel/-/issues/9671 Link: https://lore.kernel.org/r/20231115123625.74286-1-maarten.lankhorst@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e871afeeb383f..e79508002bb1a 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2141,6 +2141,9 @@ static int azx_probe(struct pci_dev *pci, if (chip->driver_caps & AZX_DCAPS_I915_COMPONENT) { err = snd_hdac_i915_init(azx_bus(chip)); if (err < 0) { + if (err == -EPROBE_DEFER) + goto out_free; + /* if the controller is bound only with HDMI/DP * (for HSW and BDW), we need to abort the probe; * for other chips, still continue probing as other From f40f939917b2b4cbf18450096c0ce1c58ed59fae Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 13 Nov 2023 12:07:39 -0800 Subject: [PATCH 256/560] platform/x86: hp-bioscfg: Fix error handling in hp_add_other_attributes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'attr_name_kobj' is allocated using kzalloc, but on all the error paths it is not freed, hence we have a memory leak. Fix the error path before kobject_init_and_add() by adding kfree(). kobject_put() must be always called after passing the object to kobject_init_and_add(). Only the error path which is immediately next to kobject_init_and_add() calls kobject_put() and not any other error path after it. Fix the error handling after kobject_init_and_add() by moving the kobject_put() into the goto label err_other_attr_init that is already used by all the error paths after kobject_init_and_add(). Fixes: a34fc329b189 ("platform/x86: hp-bioscfg: bioscfg") Cc: stable@vger.kernel.org # 6.6.x: c5dbf0416000: platform/x86: hp-bioscfg: Simplify return check in hp_add_other_attributes() Cc: stable@vger.kernel.org # 6.6.x: 5736aa9537c9: platform/x86: hp-bioscfg: move mutex_lock() down in hp_add_other_attributes() Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202309201412.on0VXJGo-lkp@intel.com/ Signed-off-by: Harshit Mogalapalli [ij: Added the stable dep tags] Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231113200742.3593548-3-harshit.m.mogalapalli@oracle.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-bioscfg/bioscfg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c index a3599498c4e81..6ddca857cc4d1 100644 --- a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c +++ b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c @@ -613,14 +613,14 @@ static int hp_add_other_attributes(int attr_type) default: pr_err("Error: Unknown attr_type: %d\n", attr_type); ret = -EINVAL; - goto err_other_attr_init; + kfree(attr_name_kobj); + goto unlock_drv_mutex; } ret = kobject_init_and_add(attr_name_kobj, &attr_name_ktype, NULL, "%s", attr_name); if (ret) { pr_err("Error encountered [%d]\n", ret); - kobject_put(attr_name_kobj); goto err_other_attr_init; } @@ -645,6 +645,8 @@ static int hp_add_other_attributes(int attr_type) return 0; err_other_attr_init: + kobject_put(attr_name_kobj); +unlock_drv_mutex: mutex_unlock(&bioscfg_drv.mutex); kfree(obj); return ret; From 92c47597db7d8fb500a4b04ebd457ec7360279cc Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 13 Nov 2023 12:07:40 -0800 Subject: [PATCH 257/560] platform/x86: hp-bioscfg: Remove unused obj in hp_add_other_attributes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit acpi_object *obj is unused in this function, so delete it, also delete a unnecessary kfree(obj); Signed-off-by: Harshit Mogalapalli Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231113200742.3593548-4-harshit.m.mogalapalli@oracle.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-bioscfg/bioscfg.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c index 6ddca857cc4d1..8c9f4f3227fc6 100644 --- a/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c +++ b/drivers/platform/x86/hp/hp-bioscfg/bioscfg.c @@ -588,7 +588,6 @@ static void release_attributes_data(void) static int hp_add_other_attributes(int attr_type) { struct kobject *attr_name_kobj; - union acpi_object *obj = NULL; int ret; char *attr_name; @@ -648,7 +647,6 @@ static int hp_add_other_attributes(int attr_type) kobject_put(attr_name_kobj); unlock_drv_mutex: mutex_unlock(&bioscfg_drv.mutex); - kfree(obj); return ret; } From a0d45c3f596be53c1bd8822a1984532d14fdcea9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Nov 2023 09:55:50 -0700 Subject: [PATCH 258/560] io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval A previous commit added a trylock for getting the SQPOLL thread info via fdinfo, but this introduced a regression where we often fail to get it if the thread is busy. For that case, we end up not printing the current CPU and PID info. Rather than rely on this lock, just print the pid we already stored in the io_sq_data struct, and ensure we update the current CPU every time we've slept or potentially rescheduled. The latter won't potentially be 100% accurate, but that wasn't the case before either as the task can get migrated at any time unless it has been pinned at creation time. We retain keeping the io_sq_data dereference inside the ctx->uring_lock, as it has always been, as destruction of the thread and data happen below that. We could make this RCU safe, but there's little point in doing that. With this, we always print the last valid information we had, rather than have spurious outputs with missing information. Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid") Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 9 ++------- io_uring/sqpoll.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f04a43044d917..976e9500f6518 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -145,13 +145,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { struct io_sq_data *sq = ctx->sq_data; - if (mutex_trylock(&sq->lock)) { - if (sq->thread) { - sq_pid = task_pid_nr(sq->thread); - sq_cpu = task_cpu(sq->thread); - } - mutex_unlock(&sq->lock); - } + sq_pid = sq->task_pid; + sq_cpu = sq->sq_cpu; } seq_printf(m, "SqThread:\t%d\n", sq_pid); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index bd6c2c7959a5b..65b5dbe3c850e 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -214,6 +214,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) did_sig = get_signal(&ksig); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); } @@ -229,10 +230,15 @@ static int io_sq_thread(void *data) snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); - if (sqd->sq_cpu != -1) + /* reset to our pid after we've set task_comm, for fdinfo */ + sqd->task_pid = current->pid; + + if (sqd->sq_cpu != -1) { set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else + } else { set_cpus_allowed_ptr(current, cpu_online_mask); + sqd->sq_cpu = raw_smp_processor_id(); + } mutex_lock(&sqd->lock); while (1) { @@ -261,6 +267,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); cond_resched(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } continue; } @@ -294,6 +301,7 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); mutex_lock(&sqd->lock); + sqd->sq_cpu = raw_smp_processor_id(); } list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_andnot(IORING_SQ_NEED_WAKEUP, From 1fda5bb66ad8fb24ecb3858e61a13a6548428898 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 10 Nov 2023 17:39:28 -0800 Subject: [PATCH 259/560] bpf: Do not allocate percpu memory at init stage Kirill Shutemov reported significant percpu memory consumption increase after booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation"). The percpu memory consumption is increased from 111MB to 969MB. The number is from /proc/meminfo. I tried to reproduce the issue with my local VM which at most supports upto 255 cpus. With 252 cpus, without the above commit, the percpu memory consumption immediately after boot is 57MB while with the above commit the percpu memory consumption is 231MB. This is not good since so far percpu memory from bpf memory allocator is not widely used yet. Let us change pre-allocation in init stage to on-demand allocation when verifier detects there is a need of percpu memory for bpf program. With this change, percpu memory consumption after boot can be reduced signicantly. [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/ Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation") Reported-and-tested-by: Kirill A. Shutemov Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231111013928.948838-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 8 +++----- kernel/bpf/verifier.c | 20 ++++++++++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35bff17396c0b..6762dac3ef761 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 08626b519ce23..cd3afe57ece3c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; -bool bpf_global_ma_set, bpf_global_percpu_ma_set; +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; /* No hurry in this branch * @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); - bpf_global_percpu_ma_set = !ret; - return !bpf_global_ma_set || !bpf_global_percpu_ma_set; + return ret; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2267d5ed14ed..6da370a047feb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "disasm.h" @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +struct bpf_mem_alloc bpf_global_percpu_ma; +static bool bpf_global_percpu_ma_set; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *btf_vmlinux; static DEFINE_MUTEX(bpf_verifier_lock); +static DEFINE_MUTEX(bpf_percpu_ma_lock); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) return -ENOMEM; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set) - return -ENOMEM; + if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + } if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); From 430143b0d3611f4a9c8434319e5e504244749e79 Mon Sep 17 00:00:00 2001 From: Brenton Simpson Date: Tue, 14 Nov 2023 23:38:59 +0000 Subject: [PATCH 260/560] drm: panel-orientation-quirks: Add quirk for Lenovo Legion Go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Legion Go has a 2560x1600 portrait screen, with the native "up" facing the right controller (90° CW from the rest of the device). Signed-off-by: Brenton Simpson Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20231114233859.274189-1-appsforartists@google.com --- drivers/gpu/drm/drm_panel_orientation_quirks.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/drm_panel_orientation_quirks.c b/drivers/gpu/drm/drm_panel_orientation_quirks.c index d5c15292ae937..3d92f66e550c3 100644 --- a/drivers/gpu/drm/drm_panel_orientation_quirks.c +++ b/drivers/gpu/drm/drm_panel_orientation_quirks.c @@ -336,6 +336,12 @@ static const struct dmi_system_id orientation_data[] = { DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "IdeaPad Duet 3 10IGL5"), }, .driver_data = (void *)&lcd1200x1920_rightside_up, + }, { /* Lenovo Legion Go 8APU1 */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "Legion Go 8APU1"), + }, + .driver_data = (void *)&lcd1600x2560_leftside_up, }, { /* Lenovo Yoga Book X90F / X90L */ .matches = { DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), From ae1aadb1eb8d3cbc52e42bee71d67bd4a71f9f07 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 16 Nov 2023 00:39:33 +1000 Subject: [PATCH 261/560] nouveau: don't fail driver load if no display hw present. If we get back ENODEV don't fail load. There are nvidia devices that don't have display blocks and the driver should work on those. Fixes: 15740541e8f0 ("drm/nouveau/devinit/tu102-: prepare for GSP-RM") Link: https://gitlab.freedesktop.org/drm/nouveau/-/issues/270 Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231115143933.261287-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_display.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index d8c92521226d9..f28f9a8574586 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -726,6 +726,11 @@ nouveau_display_create(struct drm_device *dev) if (nouveau_modeset != 2) { ret = nvif_disp_ctor(&drm->client.device, "kmsDisp", 0, &disp->disp); + /* no display hw */ + if (ret == -ENODEV) { + ret = 0; + goto disp_create_err; + } if (!ret && (disp->disp.outp_mask || drm->vbios.dcb.entries)) { nouveau_display_create_properties(dev); From 61cbc08fdb04fd445458b0f4cba7e6929afdfaef Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:15 +0000 Subject: [PATCH 262/560] ALSA: hda/realtek: Add quirks for ASUS 2024 Zenbooks These ASUS Zenbook laptops use Realtek HDA codec combined with 2xCS35L41 Amplifiers using SPI or I2C with External Boost or Internal Boost. Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-2-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3c85b8247c110..a1e124370283a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9947,13 +9947,17 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x19e1, "ASUS UX581LV", ALC295_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1a13, "Asus G73Jw", ALC269_FIXUP_ASUS_G73JW), SND_PCI_QUIRK(0x1043, 0x1a30, "ASUS X705UD", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x1a63, "ASUS UX3405MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1a83, "ASUS UM5302LA", ALC294_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1a8f, "ASUS UX582ZS", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1b11, "ASUS UX431DA", ALC294_FIXUP_ASUS_COEF_1B), SND_PCI_QUIRK(0x1043, 0x1b13, "Asus U41SV", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x1043, 0x1b93, "ASUS G614JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1bbd, "ASUS Z550MA", ALC255_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1c03, "ASUS UM3406HA", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1c23, "Asus X55U", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x1043, 0x1c33, "ASUS UX5304MA", ALC245_FIXUP_CS35L41_SPI_2), + SND_PCI_QUIRK(0x1043, 0x1c43, "ASUS UX8406MA", ALC245_FIXUP_CS35L41_SPI_2), SND_PCI_QUIRK(0x1043, 0x1c62, "ASUS GU603", ALC289_FIXUP_ASUS_GA401), SND_PCI_QUIRK(0x1043, 0x1c92, "ASUS ROG Strix G15", ALC285_FIXUP_ASUS_G533Z_PINS), SND_PCI_QUIRK(0x1043, 0x1c9f, "ASUS G614JI", ALC285_FIXUP_ASUS_HEADSET_MIC), From 5d639b60971f003d3a9b2b31f8ec73b0718b5d57 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Wed, 15 Nov 2023 16:21:16 +0000 Subject: [PATCH 263/560] ALSA: hda/realtek: Add quirks for HP Laptops These HP laptops use Realtek HDA codec combined with 2 or 4 CS35L41 Amplifiers using SPI with Internal Boost. Signed-off-by: Stefan Binding Cc: Link: https://lore.kernel.org/r/20231115162116.494968-3-sbinding@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a1e124370283a..5618b1d9bfd13 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9902,6 +9902,9 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), From 85c2ceaafbd306814a3a4740bf4d95ac26a8b36a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:07:40 +0300 Subject: [PATCH 264/560] mm/damon/sysfs: eliminate potential uninitialized variable warning The "err" variable is not initialized if damon_target_has_pid(ctx) is false and sys_target->regions->nr is zero. Link: https://lkml.kernel.org/r/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mountain Fixes: 0bcd216c4741 ("mm/damon/sysfs: update monitoring target regions for online input commit") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e27846708b5a2..1dfa96d4de99b 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); From 019b277b680f5b95135c042c78dd79318d8f9e3c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:41 +0500 Subject: [PATCH 265/560] selftests: mm: skip whole test instead of failure Some architectures don't support userfaultfd. Skip running the whole test on them instead of registering the failure. Link: https://lkml.kernel.org/r/20231103182343.2874015-1-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/f8463381-2697-49e9-9460-9dc73452830d@arm.com Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0161fb49fc6ef..f8685a2ea07e6 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -94,19 +94,19 @@ int init_uffd(void) uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd == -1) - ksft_exit_fail_msg("uffd syscall failed\n"); + return uffd; uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC | UFFD_FEATURE_WP_HUGETLBFS_SHMEM; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) - ksft_exit_fail_msg("UFFDIO_API\n"); + return -1; if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) || !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) || !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) || !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) - ksft_exit_fail_msg("UFFDIO_API error %llu\n", uffdio_api.api); + return -1; return 0; } @@ -1479,6 +1479,10 @@ int main(void) struct stat sbuf; ksft_print_header(); + + if (init_uffd()) + return ksft_exit_pass(); + ksft_set_plan(115); page_size = getpagesize(); @@ -1488,9 +1492,6 @@ int main(void) if (pagemap_fd < 0) return -EINVAL; - if (init_uffd()) - ksft_exit_fail_msg("uffd init failed\n"); - /* 1. Sanity testing */ sanity_tests_sd(); From 9297e5360c3bd777f95d5146dbeda7fb9ba4273a Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 3 Nov 2023 23:23:42 +0500 Subject: [PATCH 266/560] selftests: mm: fix some build warnings Fix build warnings: pagemap_ioctl.c:1154:38: warning: format `%s' expects a matching `char *' argument [-Wformat=] pagemap_ioctl.c:1162:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1192:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1600:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] pagemap_ioctl.c:1628:51: warning: format `%ld' expects argument of type `long int', but argument 2 has type `int' [-Wformat=] Link: https://lkml.kernel.org/r/20231103182343.2874015-2-usama.anjum@collabora.com Fixes: 46fd75d4a3c9 ("selftests: mm: add pagemap ioctl tests") Signed-off-by: Muhammad Usama Anjum Cc: Shuah Khan Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index f8685a2ea07e6..befab43719bad 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1151,7 +1151,7 @@ int sanity_tests(void) /* 9. Memory mapped file */ fd = open(__FILE__, O_RDONLY); if (fd < 0) - ksft_exit_fail_msg("%s Memory mapped file\n"); + ksft_exit_fail_msg("%s Memory mapped file\n", __func__); ret = stat(__FILE__, &sbuf); if (ret < 0) @@ -1159,7 +1159,7 @@ int sanity_tests(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); tmp_buf = malloc(sbuf.st_size); memcpy(tmp_buf, fmem, sbuf.st_size); @@ -1189,7 +1189,7 @@ int sanity_tests(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); @@ -1596,7 +1596,7 @@ int main(void) fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, sbuf.st_size); wp_addr_range(fmem, sbuf.st_size); @@ -1624,7 +1624,7 @@ int main(void) fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) - ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + ksft_exit_fail_msg("error nomem %d %s\n", errno, strerror(errno)); wp_init(fmem, buf_size); wp_addr_range(fmem, buf_size); From dd9b35efd719be242e227f9eebad1e50ea5c914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:33:59 -0700 Subject: [PATCH 267/560] selftests/mm: restore number of hugepages The test mm `hugetlb_fault_after_madv` selftest needs one and only one huge page to run, thus it sets `/proc/sys/vm/nr_hugepages` to 1. The problem is that further tests require the previous number of hugepages allocated in order to succeed. Save the number of huge pages before changing it, and restore it once the test finishes, so, further tests could run successfully. Link: https://lkml.kernel.org/r/20231103173400.1608403-1-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index cc16f6ca85333..00757445278ed 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -223,9 +223,12 @@ CATEGORY="hugetlb" run_test ./hugepage-mremap CATEGORY="hugetlb" run_test ./hugepage-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise +nr_hugepages_tmp=$(cat /proc/sys/vm/nr_hugepages) # For this test, we need one and just one huge page echo 1 > /proc/sys/vm/nr_hugepages CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv +# Restore the previous number of huge pages, since further tests rely on it +echo "$nr_hugepages_tmp" > /proc/sys/vm/nr_hugepages if test_selected "hugetlb"; then echo "NOTE: These hugetlb tests provide minimal coverage. Use" From edf14544324dd036183fafe372fe5709708bdddd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 3 Nov 2023 10:34:00 -0700 Subject: [PATCH 268/560] selftests/mm: add hugetlb_fault_after_madv to .gitignore commit 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") added a new test case, but, it didn't add the binary name in tools/testing/selftests/mm/.gitignore. Add hugetlb_fault_after_madv to tools/testing/selftests/mm/.gitignore. Link: https://lkml.kernel.org/r/20231103173400.1608403-2-leitao@debian.org Fixes: 116d57303a05 ("selftests/mm: add a new test for madv and hugetlb") Signed-off-by: Breno Leitao Reported-by: Ryan Roberts Closes: https://lore.kernel.org/all/662df57e-47f1-4c15-9b84-f2f2d587fc5c@arm.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index cc920c79ff1c3..4ff10ea614617 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -45,3 +45,4 @@ mdwe_test gup_longterm mkdirty va_high_addr_switch +hugetlb_fault_after_madv From a48d5bdc877b85201e42cef9c2fdf5378164c23a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 6 Nov 2023 10:19:18 -0800 Subject: [PATCH 269/560] mm: fix for negative counter: nr_file_hugepages While qualifiying the 6.4 release, the following warning was detected in messages: vmstat_refresh: nr_file_hugepages -15664 The warning is caused by the incorrect updating of the NR_FILE_THPS counter in the function split_huge_page_to_list. The if case is checking for folio_test_swapbacked, but the else case is missing the check for folio_test_pmd_mappable. The other functions that manipulate the counter like __filemap_add_folio and filemap_unaccount_folio have the corresponding check. I have a test case, which reproduces the problem. It can be found here: https://github.com/sroeschus/testcase/blob/main/vmstat_refresh/madv.c The test case reproduces on an XFS filesystem. Running the same test case on a BTRFS filesystem does not reproduce the problem. AFAIK version 6.1 until 6.6 are affected by this problem. [akpm@linux-foundation.org: whitespace fix] [shr@devkernel.io: test for folio_test_pmd_mappable()] Link: https://lkml.kernel.org/r/20231108171517.2436103-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20231106181918.1091043-1-shr@devkernel.io Signed-off-by: Stefan Roesch Co-debugged-by: Johannes Weiner Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f31f02472396e..4f542444a91f2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2769,13 +2769,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); - if (folio_test_swapbacked(folio)) { - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, - -nr); - } else { - __lruvec_stat_mod_folio(folio, NR_FILE_THPS, - -nr); - filemap_nr_thps_dec(mapping); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } } } From b4936b544b08ed44949055b92bd25f77759ebafc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:06 +0000 Subject: [PATCH 270/560] mm/damon/sysfs: check error from damon_sysfs_update_target() Patch series "mm/damon/sysfs: fix unhandled return values". Some of DAMON sysfs interface code is not handling return values from some functions. As a result, confusing user input handling or NULL-dereference is possible. Check those properly. This patch (of 3): damon_sysfs_update_target() returns error code for failures, but its caller, damon_sysfs_set_targets() is ignoring that. The update function seems making no critical change in case of such failures, but the behavior will look like DAMON sysfs is silently ignoring or only partially accepting the user input. Fix it. Link: https://lkml.kernel.org/r/20231106233408.51159-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231106233408.51159-2-sj@kernel.org Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1dfa96d4de99b..7472404456aa8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); From 84055688b6bc075c92a88e2d6c3ad26ab93919f9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:07 +0000 Subject: [PATCH 271/560] mm/damon/sysfs-schemes: handle tried regions sysfs directory allocation failure DAMOS tried regions sysfs directory allocation function (damon_sysfs_scheme_regions_alloc()) is not handling the memory allocation failure. In the case, the code will dereference NULL pointer. Handle the failure to avoid such invalid access. Link: https://lkml.kernel.org/r/20231106233408.51159-3-sj@kernel.org Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 45bd0fd4a8b16..7413cb35c5a9c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; From ae636ae2bbfd9279f5681dbf320d1da817e52b68 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:08 +0000 Subject: [PATCH 272/560] mm/damon/sysfs-schemes: handle tried region directory allocation failure DAMON sysfs interface's before_damos_apply callback (damon_sysfs_before_damos_apply()), which creates the DAMOS tried regions for each DAMOS action applied region, is not handling the allocation failure for the sysfs directory data. As a result, NULL pointer derefeence is possible. Fix it by handling the case. Link: https://lkml.kernel.org/r/20231106233408.51159-4-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 7413cb35c5a9c..be667236b8e6e 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1826,6 +1826,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(®ion->kobj, From 24948e3b7b12e0031a6edb4f49bbb9fb2ad1e4e9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 Nov 2023 09:18:02 -0800 Subject: [PATCH 273/560] mm: kmem: drop __GFP_NOFAIL when allocating objcg vectors Objcg vectors attached to slab pages to store slab object ownership information are allocated using gfp flags for the original slab allocation. Depending on slab page order and the size of slab objects, objcg vector can take several pages. If the original allocation was done with the __GFP_NOFAIL flag, it triggered a warning in the page allocation code. Indeed, order > 1 pages should not been allocated with the __GFP_NOFAIL flag. Fix this by simply dropping the __GFP_NOFAIL flag when allocating the objcg vector. It effectively allows to skip the accounting of a single slab object under a heavy memory pressure. An alternative would be to implement the mechanism to fallback to order-0 allocations for accounting metadata, which is also not perfect because it will increase performance penalty and memory footprint of the kernel memory accounting under memory pressure. Link: https://lkml.kernel.org/r/ZUp8ZFGxwmCx4ZFr@P9FQF9L96D.corp.robot.car Signed-off-by: Roman Gushchin Reported-by: Christoph Lameter Closes: https://lkml.kernel.org/r/6b42243e-f197-600a-5d22-56bd728a5ad8@gentwo.org Acked-by: Shakeel Butt Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e278..1c1061df9cd17 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2936,7 +2936,8 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) * Moreover, it should not come from DMA buffer and is not readily * reclaimable. So those GFP bits should be masked off. */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) /* * mod_objcg_mlstate() may be called with irq enabled, so From 13b2a4b22e98ff80b888a160a2acd92d81b05925 Mon Sep 17 00:00:00 2001 From: Hyeongtak Ji Date: Fri, 10 Nov 2023 14:37:09 +0900 Subject: [PATCH 274/560] mm/damon/core.c: avoid unintentional filtering out of schemes The function '__damos_filter_out()' causes DAMON to always filter out schemes whose filter type is anon or memcg if its matching value is set to false. This commit addresses the issue by ensuring that '__damos_filter_out()' no longer applies to filters whose type is 'anon' or 'memcg'. Link: https://lkml.kernel.org/r/1699594629-3816-1-git-send-email-hyeongtak.ji@gmail.com Fixes: ab9bda001b681 ("mm/damon/core: introduce address range type damos filter") Signed-off-by: Hyeongtak Ji Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 630077d95dc60..6262d55904e74 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -924,7 +924,7 @@ static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, matched = true; break; default: - break; + return false; } return matched == filter->matching; From 5f74f820f6fc844b95f9e5e406e0a07d97510420 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 13 Nov 2023 11:12:57 +0100 Subject: [PATCH 275/560] parisc: fix mmap_base calculation when stack grows upwards Matoro reported various userspace crashes on the parisc platform with kernel 6.6 and bisected it to commit 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization"). That commit switched parisc to use the common infrastructure to calculate mmap_base, but missed that the mmap_base() function takes care for architectures where the stack grows downwards only. Fix the mmap_base() calculation to include the stack-grows-upwards case and thus fix the userspace crashes on parisc. Link: https://lkml.kernel.org/r/ZVH2qeS1bG7/1J/l@p100 Fixes: 3033cd430768 ("parisc: Use generic mmap top-down layout and brk randomization") Signed-off-by: Helge Deller Reported-by: matoro Tested-by: matoro Cc: [6.6+] Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 6 +++--- arch/parisc/include/asm/elf.h | 10 +--------- arch/parisc/include/asm/processor.h | 2 ++ arch/parisc/kernel/sys_parisc.c | 2 +- mm/util.c | 10 ++++++++++ 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index fd69dfa0cdabb..a7c9c0e69e5ab 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -140,11 +140,11 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN default 8 config ARCH_MMAP_RND_BITS_MAX - default 24 if 64BIT - default 17 + default 18 if 64BIT + default 13 config ARCH_MMAP_RND_COMPAT_BITS_MAX - default 17 + default 13 # unless you want to implement ACPI on PA-RISC ... ;-) config PM diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h index 140eaa97bf215..2d73d3c3cd37f 100644 --- a/arch/parisc/include/asm/elf.h +++ b/arch/parisc/include/asm/elf.h @@ -349,15 +349,7 @@ struct pt_regs; /* forward declaration... */ #define ELF_HWCAP 0 -/* Masks for stack and mmap randomization */ -#define BRK_RND_MASK (is_32bit_task() ? 0x07ffUL : 0x3ffffUL) -#define MMAP_RND_MASK (is_32bit_task() ? 0x1fffUL : 0x3ffffUL) -#define STACK_RND_MASK MMAP_RND_MASK - -struct mm_struct; -extern unsigned long arch_randomize_brk(struct mm_struct *); -#define arch_randomize_brk arch_randomize_brk - +#define STACK_RND_MASK 0x7ff /* 8MB of VA */ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 struct linux_binprm; diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index c05d121cf5d0f..982aca20f56f5 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -47,6 +47,8 @@ #ifndef __ASSEMBLY__ +struct rlimit; +unsigned long mmap_upper_limit(struct rlimit *rlim_stack); unsigned long calc_max_stack_size(unsigned long stack_max); /* diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index ab896eff7a1de..98af719d5f85b 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -77,7 +77,7 @@ unsigned long calc_max_stack_size(unsigned long stack_max) * indicating that "current" should be used instead of a passed-in * value from the exec bprm as done with arch_pick_mmap_layout(). */ -static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) +unsigned long mmap_upper_limit(struct rlimit *rlim_stack) { unsigned long stack_base; diff --git a/mm/util.c b/mm/util.c index aa01f6ea5a75b..744b4d7e3fae2 100644 --- a/mm/util.c +++ b/mm/util.c @@ -414,6 +414,15 @@ static int mmap_is_legacy(struct rlimit *rlim_stack) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. + */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; @@ -431,6 +440,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) From afccb0804fc74ac2f6737af6a139632606cb461d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 14 Nov 2023 15:49:45 +0000 Subject: [PATCH 276/560] mm: more ptep_get() conversion Commit c33c794828f2 ("mm: ptep_get() conversion") converted all (non-arch) call sites to use ptep_get() instead of doing a direct dereference of the pte. Full rationale can be found in that commit's log. Since then, three new call sites have snuck in, which directly dereference the pte, so let's fix those up. Unfortunately there is no reliable automated mechanism to catch these; I'm relying on a combination of Coccinelle (which throws up a lot of false positives) and some compiler magic to force a compiler error on dereference (While this approach finds dereferences, it also yields a non-booting kernel so can't be committed). Link: https://lkml.kernel.org/r/20231114154945.490401-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/ksm.c | 2 +- mm/userfaultfd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 9710f43a89acd..32eedf3afd458 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3443,7 +3443,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(vmf->pte[count])) + if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; diff --git a/mm/ksm.c b/mm/ksm.c index 7efcc68ccc6ea..6a831009b4cbf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -468,7 +468,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex page = pfn_swap_entry_to_page(entry); } /* return 1 if the page is an normal ksm page or KSM-placed zero page */ - ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); + ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent); pte_unmap_unlock(pte, ptl); return ret; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc8..0b6ca553bebec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -312,7 +312,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ - if (!pte_none(*dst_pte)) + if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); From df3aafe501853c92bc9e25b05dcb030fee072962 Mon Sep 17 00:00:00 2001 From: Itamar Gozlan Date: Tue, 14 Nov 2023 13:58:32 -0800 Subject: [PATCH 277/560] Revert "net/mlx5: DR, Supporting inline WQE when possible" This reverts commit 95c337cce0e11d06a715da73e6796ade9216637f. The revert is required due to the suspicion it cause some tests fail and will be moved to further investigation. Fixes: 95c337cce0e1 ("net/mlx5: DR, Supporting inline WQE when possible") Signed-off-by: Itamar Gozlan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/dr_send.c | 115 ++---------------- 1 file changed, 13 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 4e8527a724f50..6fa06ba2d3465 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -52,7 +52,6 @@ struct dr_qp_init_attr { u32 cqn; u32 pdn; u32 max_send_wr; - u32 max_send_sge; struct mlx5_uars_page *uar; u8 isolate_vl_tc:1; }; @@ -247,37 +246,6 @@ static int dr_poll_cq(struct mlx5dr_cq *dr_cq, int ne) return err == CQ_POLL_ERR ? err : npolled; } -static int dr_qp_get_args_update_send_wqe_size(struct dr_qp_init_attr *attr) -{ - return roundup_pow_of_two(sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_flow_update_ctrl_seg) + - sizeof(struct mlx5_wqe_header_modify_argument_update_seg)); -} - -/* We calculate for specific RC QP with the required functionality */ -static int dr_qp_calc_rc_send_wqe(struct dr_qp_init_attr *attr) -{ - int update_arg_size; - int inl_size = 0; - int tot_size; - int size; - - update_arg_size = dr_qp_get_args_update_send_wqe_size(attr); - - size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg); - inl_size = size + ALIGN(sizeof(struct mlx5_wqe_inline_seg) + - DR_STE_SIZE, 16); - - size += attr->max_send_sge * sizeof(struct mlx5_wqe_data_seg); - - size = max(size, update_arg_size); - - tot_size = max(size, inl_size); - - return ALIGN(tot_size, MLX5_SEND_WQE_BB); -} - static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, struct dr_qp_init_attr *attr) { @@ -285,7 +253,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {}; struct mlx5_wq_param wqp; struct mlx5dr_qp *dr_qp; - int wqe_size; int inlen; void *qpc; void *in; @@ -365,15 +332,6 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, if (err) goto err_in; dr_qp->uar = attr->uar; - wqe_size = dr_qp_calc_rc_send_wqe(attr); - dr_qp->max_inline_data = min(wqe_size - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)), - (2 * MLX5_SEND_WQE_BB - - (sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg)))); return dr_qp; @@ -437,48 +395,8 @@ dr_rdma_handle_flow_access_arg_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, MLX5_SEND_WQE_DS; } -static int dr_set_data_inl_seg(struct mlx5dr_qp *dr_qp, - struct dr_data_seg *data_seg, void *wqe) -{ - int inline_header_size = sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_inline_seg); - struct mlx5_wqe_inline_seg *seg; - int left_space; - int inl = 0; - void *addr; - int len; - int idx; - - seg = wqe; - wqe += sizeof(*seg); - addr = (void *)(unsigned long)(data_seg->addr); - len = data_seg->length; - inl += len; - left_space = MLX5_SEND_WQE_BB - inline_header_size; - - if (likely(len > left_space)) { - memcpy(wqe, addr, left_space); - len -= left_space; - addr += left_space; - idx = (dr_qp->sq.pc + 1) & (dr_qp->sq.wqe_cnt - 1); - wqe = mlx5_wq_cyc_get_wqe(&dr_qp->wq.sq, idx); - } - - memcpy(wqe, addr, len); - - if (likely(inl)) { - seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); - return DIV_ROUND_UP(inl + sizeof(seg->byte_count), - MLX5_SEND_WQE_DS); - } else { - return 0; - } -} - static void -dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, - struct mlx5_wqe_ctrl_seg *wq_ctrl, +dr_rdma_handle_icm_write_segments(struct mlx5_wqe_ctrl_seg *wq_ctrl, u64 remote_addr, u32 rkey, struct dr_data_seg *data_seg, @@ -494,17 +412,15 @@ dr_rdma_handle_icm_write_segments(struct mlx5dr_qp *dr_qp, wq_raddr->reserved = 0; wq_dseg = (void *)(wq_raddr + 1); - /* WQE ctrl segment + WQE remote addr segment */ - *size = (sizeof(*wq_ctrl) + sizeof(*wq_raddr)) / MLX5_SEND_WQE_DS; - if (data_seg->send_flags & IB_SEND_INLINE) { - *size += dr_set_data_inl_seg(dr_qp, data_seg, wq_dseg); - } else { - wq_dseg->byte_count = cpu_to_be32(data_seg->length); - wq_dseg->lkey = cpu_to_be32(data_seg->lkey); - wq_dseg->addr = cpu_to_be64(data_seg->addr); - *size += sizeof(*wq_dseg) / MLX5_SEND_WQE_DS; /* WQE data segment */ - } + wq_dseg->byte_count = cpu_to_be32(data_seg->length); + wq_dseg->lkey = cpu_to_be32(data_seg->lkey); + wq_dseg->addr = cpu_to_be64(data_seg->addr); + + *size = (sizeof(*wq_ctrl) + /* WQE ctrl segment */ + sizeof(*wq_dseg) + /* WQE data segment */ + sizeof(*wq_raddr)) / /* WQE remote addr segment */ + MLX5_SEND_WQE_DS; } static void dr_set_ctrl_seg(struct mlx5_wqe_ctrl_seg *wq_ctrl, @@ -535,7 +451,7 @@ static void dr_rdma_segments(struct mlx5dr_qp *dr_qp, u64 remote_addr, switch (opcode) { case MLX5_OPCODE_RDMA_READ: case MLX5_OPCODE_RDMA_WRITE: - dr_rdma_handle_icm_write_segments(dr_qp, wq_ctrl, remote_addr, + dr_rdma_handle_icm_write_segments(wq_ctrl, remote_addr, rkey, data_seg, &size); break; case MLX5_OPCODE_FLOW_TBL_ACCESS: @@ -656,7 +572,7 @@ static void dr_fill_write_args_segs(struct mlx5dr_send_ring *send_ring, if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; + send_info->write.send_flags = 0; } static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, @@ -680,13 +596,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, } send_ring->pending_wqe++; - if (!send_info->write.lkey) - send_info->write.send_flags |= IB_SEND_INLINE; if (send_ring->pending_wqe % send_ring->signal_th == 0) send_info->write.send_flags |= IB_SEND_SIGNALED; - else - send_info->write.send_flags &= ~IB_SEND_SIGNALED; send_ring->pending_wqe++; send_info->read.length = send_info->write.length; @@ -696,9 +608,9 @@ static void dr_fill_write_icm_segs(struct mlx5dr_domain *dmn, send_info->read.lkey = send_ring->sync_mr->mkey; if (send_ring->pending_wqe % send_ring->signal_th == 0) - send_info->read.send_flags |= IB_SEND_SIGNALED; + send_info->read.send_flags = IB_SEND_SIGNALED; else - send_info->read.send_flags &= ~IB_SEND_SIGNALED; + send_info->read.send_flags = 0; } static void dr_fill_data_segs(struct mlx5dr_domain *dmn, @@ -1345,7 +1257,6 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) dmn->send_ring->cq->qp = dmn->send_ring->qp; dmn->info.max_send_wr = QUEUE_SIZE; - init_attr.max_send_sge = 1; dmn->info.max_inline_size = min(dmn->send_ring->qp->max_inline_data, DR_STE_SIZE); From 7d2f74d1d4385a5bcf90618537f16a45121c30ae Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 14 Nov 2023 13:58:33 -0800 Subject: [PATCH 278/560] net/mlx5: Free used cpus mask when an IRQ is released Each EQ table maintains a cpumask of the already used CPUs that are mapped to IRQs to ensure that each IRQ gets mapped to a unique CPU. However, on IRQ release, the said cpumask is not updated by clearing the CPU from the mask to allow future IRQ request, causing the following error when a SF is reloaded after it has utilized all CPUs for its IRQs: mlx5_irq_affinity_request:135:(pid 306010): Didn't find a matching IRQ. err = -28 Thus, when releasing an IRQ, clear its mapped CPU from the used CPUs mask, to prevent the case described above. While at it, move the used cpumask update to the EQ layer as it is more fitting and preserves symmetricity of the IRQ request/release API. Fixes: a1772de78d73 ("net/mlx5: Refactor completion IRQ request/release API") Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Reviewed-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 25 ++++++++--- .../mellanox/mlx5/core/irq_affinity.c | 42 ------------------- 2 files changed, 19 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea0405e0a43fa..40a6cb052a2da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -885,11 +885,14 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_irq *irq; + int cpu; irq = xa_load(&table->comp_irqs, vecidx); if (!irq) return; + cpu = cpumask_first(mlx5_irq_get_affinity_mask(irq)); + cpumask_clear_cpu(cpu, &table->used_cpus); xa_erase(&table->comp_irqs, vecidx); mlx5_irq_affinity_irq_release(dev, irq); } @@ -897,16 +900,26 @@ static void comp_irq_release_sf(struct mlx5_core_dev *dev, u16 vecidx) static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) { struct mlx5_eq_table *table = dev->priv.eq_table; + struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); + struct irq_affinity_desc af_desc = {}; struct mlx5_irq *irq; - irq = mlx5_irq_affinity_irq_request_auto(dev, &table->used_cpus, vecidx); - if (IS_ERR(irq)) { - /* In case SF irq pool does not exist, fallback to the PF irqs*/ - if (PTR_ERR(irq) == -ENOENT) - return comp_irq_request_pci(dev, vecidx); + /* In case SF irq pool does not exist, fallback to the PF irqs*/ + if (!mlx5_irq_pool_is_sf_pool(pool)) + return comp_irq_request_pci(dev, vecidx); + af_desc.is_managed = 1; + cpumask_copy(&af_desc.mask, cpu_online_mask); + cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus); + irq = mlx5_irq_affinity_request(pool, &af_desc); + if (IS_ERR(irq)) return PTR_ERR(irq); - } + + cpumask_or(&table->used_cpus, &table->used_cpus, mlx5_irq_get_affinity_mask(irq)); + mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", + pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), + cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), + mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); return xa_err(xa_store(&table->comp_irqs, vecidx, irq, GFP_KERNEL)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index 047d5fed5f89e..612e666ec2635 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -168,45 +168,3 @@ void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *i if (pool->irqs_per_cpu) cpu_put(pool, cpu); } - -/** - * mlx5_irq_affinity_irq_request_auto - request one IRQ for mlx5 device. - * @dev: mlx5 device that is requesting the IRQ. - * @used_cpus: cpumask of bounded cpus by the device - * @vecidx: vector index to request an IRQ for. - * - * Each IRQ is bounded to at most 1 CPU. - * This function is requesting an IRQ according to the default assignment. - * The default assignment policy is: - * - request the least loaded IRQ which is not bound to any - * CPU of the previous IRQs requested. - * - * On success, this function updates used_cpus mask and returns an irq pointer. - * In case of an error, an appropriate error pointer is returned. - */ -struct mlx5_irq *mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev, - struct cpumask *used_cpus, u16 vecidx) -{ - struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev); - struct irq_affinity_desc af_desc = {}; - struct mlx5_irq *irq; - - if (!mlx5_irq_pool_is_sf_pool(pool)) - return ERR_PTR(-ENOENT); - - af_desc.is_managed = 1; - cpumask_copy(&af_desc.mask, cpu_online_mask); - cpumask_andnot(&af_desc.mask, &af_desc.mask, used_cpus); - irq = mlx5_irq_affinity_request(pool, &af_desc); - - if (IS_ERR(irq)) - return irq; - - cpumask_or(used_cpus, used_cpus, mlx5_irq_get_affinity_mask(irq)); - mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n", - pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)), - cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)), - mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ); - - return irq; -} From ad4d82c3eacdd500a246af736e6e01d96484e35e Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Nov 2023 13:58:34 -0800 Subject: [PATCH 279/560] net/mlx5: DR, Allow old devices to use multi destination FTE The current check isn't aware of old devices that don't have the relevant FW capability. This patch allows multi destination FTE in old cards, as it was before this check. Fixes: f6f46e7173cb ("net/mlx5: DR, Add check for multi destination FTE") Signed-off-by: Erez Shitrit Reviewed-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-4-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c index 6ea88a5818047..e3ec559369fa0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c @@ -57,7 +57,8 @@ static const char *dr_action_id_to_str(enum mlx5dr_action_type action_id) static bool mlx5dr_action_supp_fwd_fdb_multi_ft(struct mlx5_core_dev *dev) { - return (MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || + return (MLX5_CAP_GEN(dev, steering_format_version) < MLX5_STEERING_FORMAT_CONNECTX_6DX || + MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table_limit_regc) || MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_any_table)); } From fd64fd13c49a53012ce9170449dcd9eb71c11284 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:35 -0800 Subject: [PATCH 280/560] net/mlx5: Decouple PHC .adjtime and .adjphase implementations When running a phase adjustment operation, the free running clock should not be modified at all. The phase control keyword is intended to trigger an internal servo on the device that will converge to the provided delta. A free running counter cannot implement phase adjustment. Fixes: 8e11a68e2e8a ("net/mlx5: Add adjphase function to support hardware-only offset control") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-5-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index aa29f09e83564..0c83ef174275a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -384,7 +384,12 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) static int mlx5_ptp_adjphase(struct ptp_clock_info *ptp, s32 delta) { - return mlx5_ptp_adjtime(ptp, delta); + struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock, ptp_info); + struct mlx5_core_dev *mdev; + + mdev = container_of(clock, struct mlx5_core_dev, clock); + + return mlx5_ptp_adjtime_real_time(mdev, delta); } static int mlx5_ptp_freq_adj_real_time(struct mlx5_core_dev *mdev, long scaled_ppm) From 6f9b1a0731662648949a1c0587f6acb3b7f8acf1 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 14 Nov 2023 13:58:36 -0800 Subject: [PATCH 281/560] net/mlx5e: fix double free of encap_header When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_create_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This happens when cmd: MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT fail. This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. Fixes: d589e785baf5e ("net/mlx5e: Allow concurrent creation of encap entries") Reported-by: Cruz Zhao Reported-by: Tianchen Ding Signed-off-by: Dust Li Reviewed-by: Wojciech Drewek Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 00a04fdd756f5..8bca696b6658c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -300,9 +300,6 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv4_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -322,6 +319,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv4_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -568,9 +567,6 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, if (err) goto destroy_neigh_entry; - e->encap_size = ipv6_encap_size; - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event @@ -590,6 +586,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, goto destroy_neigh_entry; } + e->encap_size = ipv6_encap_size; + e->encap_header = encap_header; e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); From 3a4aa3cb83563df942be49d145ee3b7ddf17d6bb Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Tue, 14 Nov 2023 13:58:37 -0800 Subject: [PATCH 282/560] net/mlx5e: fix double free of encap_header in update funcs Follow up to the previous patch to fix the same issue for mlx5e_tc_tun_update_header_ipv4{6} when mlx5_packet_reformat_alloc() fails. When mlx5_packet_reformat_alloc() fails, the encap_header allocated in mlx5e_tc_tun_update_header_ipv4{6} will be released within it. However, e->encap_header is already set to the previously freed encap_header before mlx5_packet_reformat_alloc(). As a result, the later mlx5e_encap_put() will free e->encap_header again, causing a double free issue. mlx5e_encap_put() --> mlx5e_encap_dealloc() --> kfree(e->encap_header) This patch fix it by not setting e->encap_header until mlx5_packet_reformat_alloc() success. Fixes: a54e20b4fcae ("net/mlx5e: Add basic TC tunnel set action for SRIOV offloads") Signed-off-by: Gavin Li Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-7-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 8bca696b6658c..668da5c70e63d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -403,16 +403,12 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv4_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -426,6 +422,10 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv4_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv4_put(&attr); @@ -669,16 +669,12 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, if (err) goto free_encap; - e->encap_size = ipv6_encap_size; - kfree(e->encap_header); - e->encap_header = encap_header; - if (!(nud_state & NUD_VALID)) { neigh_event_send(attr.n, NULL); /* the encap entry will be made valid on neigh update event * and not used before that. */ - goto release_neigh; + goto free_encap; } memset(&reformat_params, 0, sizeof(reformat_params)); @@ -692,6 +688,10 @@ int mlx5e_tc_tun_update_header_ipv6(struct mlx5e_priv *priv, goto free_encap; } + e->encap_size = ipv6_encap_size; + kfree(e->encap_header); + e->encap_header = encap_header; + e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(attr.out_dev)); mlx5e_route_lookup_ipv6_put(&attr); From 0c101a23ca7eaf00eef1328eefb04b3a93401cc8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 13:58:38 -0800 Subject: [PATCH 283/560] net/mlx5e: Fix pedit endianness Referenced commit addressed endianness issue in mlx5 pedit implementation in ad hoc manner instead of systematically treating integer values according to their types which left pedit fields of sizes not equal to 4 and where the bytes being modified are not least significant ones broken on big endian machines since wrong bits will be consumed during parsing which leads to following example error when applying pedit to source and destination MAC addresses: [Wed Oct 18 12:52:42 2023] mlx5_core 0001:00:00.1 p1v3_r: attempt to offload an unsupported field (cmd 0) [Wed Oct 18 12:52:42 2023] mask: 00000000330c5b68: 00 00 00 00 ff ff 00 00 00 00 ff ff 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000017d22fd9: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000008186d717: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 0000000029eb6149: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 000000007ed103e4: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 00000000db8101a6: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [Wed Oct 18 12:52:42 2023] mask: 00000000ec3c08a9: 00 00 00 00 00 00 00 00 00 00 00 00 ............ Treat masks and values of pedit and filter match as network byte order, refactor pointers to them to void pointers instead of confusing u32 pointers and only cast to pointer-to-integer when reading a value from them. Treat pedit mlx5_fields->field_mask as host byte order according to its type u32, change the constants in fields array accordingly. Fixes: 82198d8bcdef ("net/mlx5e: Fix endianness when calculating pedit mask first bit") Signed-off-by: Vlad Buslov Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-8-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_tc.c | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 9a5a5c2c7da9e..7ca9e5b86778e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3147,7 +3147,7 @@ static struct mlx5_fields fields[] = { OFFLOAD(DIPV6_31_0, 32, U32_MAX, ip6.daddr.s6_addr32[3], 0, dst_ipv4_dst_ipv6.ipv6_layout.ipv6[12]), OFFLOAD(IPV6_HOPLIMIT, 8, U8_MAX, ip6.hop_limit, 0, ttl_hoplimit), - OFFLOAD(IP_DSCP, 16, 0xc00f, ip6, 0, ip_dscp), + OFFLOAD(IP_DSCP, 16, 0x0fc0, ip6, 0, ip_dscp), OFFLOAD(TCP_SPORT, 16, U16_MAX, tcp.source, 0, tcp_sport), OFFLOAD(TCP_DPORT, 16, U16_MAX, tcp.dest, 0, tcp_dport), @@ -3158,21 +3158,31 @@ static struct mlx5_fields fields[] = { OFFLOAD(UDP_DPORT, 16, U16_MAX, udp.dest, 0, udp_dport), }; -static unsigned long mask_to_le(unsigned long mask, int size) +static u32 mask_field_get(void *mask, struct mlx5_fields *f) { - __be32 mask_be32; - __be16 mask_be16; - - if (size == 32) { - mask_be32 = (__force __be32)(mask); - mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32)); - } else if (size == 16) { - mask_be32 = (__force __be32)(mask); - mask_be16 = *(__be16 *)&mask_be32; - mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16)); + switch (f->field_bsize) { + case 32: + return be32_to_cpu(*(__be32 *)mask) & f->field_mask; + case 16: + return be16_to_cpu(*(__be16 *)mask) & (u16)f->field_mask; + default: + return *(u8 *)mask & (u8)f->field_mask; } +} - return mask; +static void mask_field_clear(void *mask, struct mlx5_fields *f) +{ + switch (f->field_bsize) { + case 32: + *(__be32 *)mask &= ~cpu_to_be32(f->field_mask); + break; + case 16: + *(__be16 *)mask &= ~cpu_to_be16((u16)f->field_mask); + break; + default: + *(u8 *)mask &= ~(u8)f->field_mask; + break; + } } static int offload_pedit_fields(struct mlx5e_priv *priv, @@ -3184,11 +3194,12 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals; struct pedit_headers_action *hdrs = parse_attr->hdrs; void *headers_c, *headers_v, *action, *vals_p; - u32 *s_masks_p, *a_masks_p, s_mask, a_mask; struct mlx5e_tc_mod_hdr_acts *mod_acts; - unsigned long mask, field_mask; + void *s_masks_p, *a_masks_p; int i, first, last, next_z; struct mlx5_fields *f; + unsigned long mask; + u32 s_mask, a_mask; u8 cmd; mod_acts = &parse_attr->mod_hdr_acts; @@ -3204,15 +3215,11 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, bool skip; f = &fields[i]; - /* avoid seeing bits set from previous iterations */ - s_mask = 0; - a_mask = 0; - s_masks_p = (void *)set_masks + f->offset; a_masks_p = (void *)add_masks + f->offset; - s_mask = *s_masks_p & f->field_mask; - a_mask = *a_masks_p & f->field_mask; + s_mask = mask_field_get(s_masks_p, f); + a_mask = mask_field_get(a_masks_p, f); if (!s_mask && !a_mask) /* nothing to offload here */ continue; @@ -3239,22 +3246,20 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, match_mask, f->field_bsize)) skip = true; /* clear to denote we consumed this field */ - *s_masks_p &= ~f->field_mask; + mask_field_clear(s_masks_p, f); } else { cmd = MLX5_ACTION_TYPE_ADD; mask = a_mask; vals_p = (void *)add_vals + f->offset; /* add 0 is no change */ - if ((*(u32 *)vals_p & f->field_mask) == 0) + if (!mask_field_get(vals_p, f)) skip = true; /* clear to denote we consumed this field */ - *a_masks_p &= ~f->field_mask; + mask_field_clear(a_masks_p, f); } if (skip) continue; - mask = mask_to_le(mask, f->field_bsize); - first = find_first_bit(&mask, f->field_bsize); next_z = find_next_zero_bit(&mask, f->field_bsize, first); last = find_last_bit(&mask, f->field_bsize); @@ -3281,10 +3286,9 @@ static int offload_pedit_fields(struct mlx5e_priv *priv, MLX5_SET(set_action_in, action, field, f->field); if (cmd == MLX5_ACTION_TYPE_SET) { + unsigned long field_mask = f->field_mask; int start; - field_mask = mask_to_le(f->field_mask, f->field_bsize); - /* if field is bit sized it can start not from first bit */ start = find_first_bit(&field_mask, f->field_bsize); From bdf788cf224f61c20a01c58c00685d394d57887f Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 14 Nov 2023 13:58:39 -0800 Subject: [PATCH 284/560] net/mlx5e: Don't modify the peer sent-to-vport rules for IPSec offload As IPSec packet offload in switchdev mode is not supported with LAG, it's unnecessary to modify those sent-to-vport rules to the peer eswitch. Fixes: c6c2bf5db4ea ("net/mlx5e: Support IPsec packet offload for TX in switchdev mode") Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-9-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index b296ac52a4397..88236e75fd901 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -984,7 +984,8 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - if (rep->vport == MLX5_VPORT_UPLINK && on_esw->offloads.ft_ipsec_tx_pol) { + if (rep->vport == MLX5_VPORT_UPLINK && + on_esw == from_esw && on_esw->offloads.ft_ipsec_tx_pol) { dest.ft = on_esw->offloads.ft_ipsec_tx_pol; flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; From 64f14d16eef1f939000f2617b50c7c996b5117d4 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:40 -0800 Subject: [PATCH 285/560] net/mlx5e: Avoid referencing skb after free-ing in drop path of mlx5e_sq_xmit_wqe When SQ is a port timestamping SQ for PTP, do not access tx flags of skb after free-ing the skb. Free the skb only after all references that depend on it have been handled in the dropped WQE path. Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-10-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index d41435c22ce56..19f2c25b05a0b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -494,10 +494,10 @@ mlx5e_sq_xmit_wqe(struct mlx5e_txqsq *sq, struct sk_buff *skb, err_drop: stats->dropped++; - dev_kfree_skb_any(skb); if (unlikely(sq->ptpsq && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) mlx5e_ptp_metadata_fifo_push(&sq->ptpsq->metadata_freelist, be32_to_cpu(eseg->flow_table_metadata)); + dev_kfree_skb_any(skb); mlx5e_tx_flush(sq); } From 7e3f3ba97e6cc6fce5bf62df2ca06c8e59040167 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:41 -0800 Subject: [PATCH 286/560] net/mlx5e: Track xmit submission to PTP WQ after populating metadata map Ensure the skb is available in metadata mapping to skbs before tracking the metadata index for detecting undelivered CQEs. If the metadata index is put in the tracking list before putting the skb in the map, the metadata index might be used for detecting undelivered CQEs before the relevant skb is available in the map, which can lead to a null-ptr-deref. Log: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 0 PID: 1243 Comm: kworker/0:2 Not tainted 6.6.0-rc4+ #108 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: events mlx5e_rx_dim_work [mlx5_core] RIP: 0010:mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] Code: 8c 24 38 cc ff ff 4c 8d 3c c1 4c 89 f9 48 c1 e9 03 42 80 3c 31 00 0f 85 97 0f 00 00 4d 8b 3f 49 8d 7f 28 48 89 f9 48 c1 e9 03 <42> 80 3c 31 00 0f 85 8b 0f 00 00 49 8b 47 28 48 85 c0 0f 84 05 07 RSP: 0018:ffff8884d3c09c88 EFLAGS: 00010206 RAX: 0000000000000069 RBX: ffff8881160349d8 RCX: 0000000000000005 RDX: ffffed10218f48cf RSI: 0000000000000004 RDI: 0000000000000028 RBP: ffff888122707700 R08: 0000000000000001 R09: ffffed109a781383 R10: 0000000000000003 R11: 0000000000000003 R12: ffff88810c7a7a40 R13: ffff888122707700 R14: dffffc0000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8884d3c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f878dd6e0 CR3: 000000014d108002 CR4: 0000000000370eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? die_addr+0x3c/0xa0 ? exc_general_protection+0x144/0x210 ? asm_exc_general_protection+0x22/0x30 ? mlx5e_ptp_napi_poll+0x9a4/0x2290 [mlx5_core] ? mlx5e_ptp_napi_poll+0x8f6/0x2290 [mlx5_core] __napi_poll.constprop.0+0xa4/0x580 net_rx_action+0x460/0xb80 ? _raw_spin_unlock_irqrestore+0x32/0x60 ? __napi_poll.constprop.0+0x580/0x580 ? tasklet_action_common.isra.0+0x2ef/0x760 __do_softirq+0x26c/0x827 irq_exit_rcu+0xc2/0x100 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x22/0x40 RIP: 0010:__kmem_cache_alloc_node+0xb/0x330 Code: 41 5d 41 5e 41 5f c3 8b 44 24 14 8b 4c 24 10 09 c8 eb d5 e8 b7 43 ca 01 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 57 <41> 56 41 89 d6 41 55 41 89 f5 41 54 49 89 fc 53 48 83 e4 f0 48 83 RSP: 0018:ffff88812c4079c0 EFLAGS: 00000246 RAX: 1ffffffff083c7fe RBX: ffff888100042dc0 RCX: 0000000000000218 RDX: 00000000ffffffff RSI: 0000000000000dc0 RDI: ffff888100042dc0 RBP: ffff88812c4079c8 R08: ffffffffa0289f96 R09: ffffed1025880ea9 R10: ffff888138839f80 R11: 0000000000000002 R12: 0000000000000dc0 R13: 0000000000000100 R14: 000000000000008c R15: ffff8881271fc450 ? cmd_exec+0x796/0x2200 [mlx5_core] kmalloc_trace+0x26/0xc0 cmd_exec+0x796/0x2200 [mlx5_core] mlx5_cmd_do+0x22/0xc0 [mlx5_core] mlx5_cmd_exec+0x17/0x30 [mlx5_core] mlx5_core_modify_cq_moderation+0x139/0x1b0 [mlx5_core] ? mlx5_add_cq_to_tasklet+0x280/0x280 [mlx5_core] ? lockdep_set_lock_cmp_fn+0x190/0x190 ? process_one_work+0x659/0x1220 mlx5e_rx_dim_work+0x9d/0x100 [mlx5_core] process_one_work+0x730/0x1220 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? max_active_store+0xf0/0xf0 ? assign_work+0x168/0x240 worker_thread+0x70f/0x12d0 ? __kthread_parkme+0xd1/0x1d0 ? process_one_work+0x1220/0x1220 kthread+0x2d9/0x3b0 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x2d/0x70 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_ib ib_uverbs ib_core zram zsmalloc mlx5_core fuse ---[ end trace 0000000000000000 ]--- Fixes: 3178308ad4ca ("net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs") Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-11-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 19f2c25b05a0b..f0b506e562df3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -399,9 +399,9 @@ mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb, u8 metadata_index = be32_to_cpu(eseg->flow_table_metadata); mlx5e_skb_cb_hwtstamp_init(skb); - mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); mlx5e_ptp_metadata_map_put(&sq->ptpsq->metadata_map, skb, metadata_index); + mlx5e_ptpsq_track_metadata(sq->ptpsq, metadata_index); if (!netif_tx_queue_stopped(sq->txq) && mlx5e_ptpsq_metadata_freelist_empty(sq->ptpsq)) { netif_tx_stop_queue(sq->txq); From 92214be5979c0961a471b7eaaaeacab41bdf456c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:42 -0800 Subject: [PATCH 287/560] net/mlx5e: Update doorbell for port timestamping CQ before the software counter Previously, mlx5e_ptp_poll_ts_cq would update the device doorbell with the incremented consumer index after the relevant software counters in the kernel were updated. In the mlx5e_sq_xmit_wqe context, this would lead to either overrunning the device CQ or exceeding the expected software buffer size in the device CQ if the device CQ size was greater than the software buffer size. Update the relevant software counter only after updating the device CQ consumer index in the port timestamping napi_poll context. Log: mlx5_core 0000:08:00.0: cq_err_event_notifier:517:(pid 0): CQ error on CQN 0x487, syndrome 0x1 mlx5_core 0000:08:00.0 eth2: mlx5e_cq_error_event: cqn=0x000487 event=0x04 Fixes: 1880bc4e4a96 ("net/mlx5e: Add TX port timestamp support") Signed-off-by: Rahul Rameshbabu Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-12-saeed@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/ptp.c | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index bb11e644d24f7..af3928eddafd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -177,6 +177,8 @@ static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq, static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, struct mlx5_cqe64 *cqe, + u8 *md_buff, + u8 *md_buff_sz, int budget) { struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list; @@ -211,19 +213,24 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); out: napi_consume_skb(skb, budget); - mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id); + md_buff[*md_buff_sz++] = metadata_id; if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); } -static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) +static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int napi_budget) { struct mlx5e_ptpsq *ptpsq = container_of(cq, struct mlx5e_ptpsq, ts_cq); - struct mlx5_cqwq *cqwq = &cq->wq; + int budget = min(napi_budget, MLX5E_TX_CQ_POLL_BUDGET); + u8 metadata_buff[MLX5E_TX_CQ_POLL_BUDGET]; + u8 metadata_buff_sz = 0; + struct mlx5_cqwq *cqwq; struct mlx5_cqe64 *cqe; int work_done = 0; + cqwq = &cq->wq; + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))) return false; @@ -234,7 +241,8 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) do { mlx5_cqwq_pop(cqwq); - mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, budget); + mlx5e_ptp_handle_ts_cqe(ptpsq, cqe, + metadata_buff, &metadata_buff_sz, napi_budget); } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); mlx5_cqwq_update_db_record(cqwq); @@ -242,6 +250,10 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) /* ensure cq space is freed before enabling more cqes */ wmb(); + while (metadata_buff_sz > 0) + mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, + metadata_buff[--metadata_buff_sz]); + mlx5e_txqsq_wake(&ptpsq->txqsq); return work_done == budget; From 3338bebfc26a1e2cebbba82a1cf12c0159608e73 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:43 -0800 Subject: [PATCH 288/560] net/mlx5: Increase size of irq name buffer Without increased buffer size, will trigger -Wformat-truncation with W=1 for the snprintf operation writing to the buffer. drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc': drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=] 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 653648216730a..4dcf995cb1a20 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -28,7 +28,7 @@ struct mlx5_irq { struct atomic_notifier_head nh; cpumask_var_t mask; - char name[MLX5_MAX_IRQ_NAME]; + char name[MLX5_MAX_IRQ_FORMATTED_NAME]; struct mlx5_irq_pool *pool; int refcount; struct msi_map map; @@ -292,8 +292,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, else irq_sf_set_name(pool, name, i); ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); - snprintf(irq->name, MLX5_MAX_IRQ_NAME, - "%s@pci:%s", name, pci_name(dev->pdev)); + snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME, + MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev)); err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name, &irq->nh); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h index d3a77a0ab8488..c4d377f8df308 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h @@ -7,6 +7,9 @@ #include #define MLX5_MAX_IRQ_NAME (32) +#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s") +#define MLX5_MAX_IRQ_FORMATTED_NAME \ + (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR)) /* max irq_index is 2047, so four chars */ #define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_EQ_REFS_PER_IRQ (2) From dce94142842e119b982c27c1b62bd20890c7fd21 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 14 Nov 2023 13:58:44 -0800 Subject: [PATCH 289/560] net/mlx5e: Reduce the size of icosq_str icosq_str size is unnecessarily too long, and it causes a build warning -Wformat-truncation with W=1. Looking closely, It doesn't need to be 255B, hence this patch reduces the size to 32B which should be more than enough to host the string: "ICOSQ: 0x%x, ". While here, add a missing space in the formatted string. This fixes the following build warning: $ KCFLAGS='-Wall -Werror' $ make O=/tmp/kbuild/linux W=1 -s -j12 drivers/net/ethernet/mellanox/mlx5/core/ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c: In function 'mlx5e_reporter_rx_timeout': drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:718:56: error: ', CQ: 0x' directive output may be truncated writing 8 bytes into a region of size between 0 and 255 [-Werror=format-truncation=] 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ^~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c:717:9: note: 'snprintf' output between 43 and 322 bytes into a destination of size 288 717 | snprintf(err_str, sizeof(err_str), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 718 | "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 719 | rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 521f31af004a ("net/mlx5e: Allow RQ outside of channel context") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-14-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index fea8c0a5fe893..4358798d6ce14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -492,11 +492,11 @@ static int mlx5e_rx_reporter_dump(struct devlink_health_reporter *reporter, void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) { - char icosq_str[MLX5E_REPORTER_PER_Q_MAX_LEN] = {}; char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; struct mlx5e_icosq *icosq = rq->icosq; struct mlx5e_priv *priv = rq->priv; struct mlx5e_err_ctx err_ctx = {}; + char icosq_str[32] = {}; err_ctx.ctx = rq; err_ctx.recover = mlx5e_rx_reporter_timeout_recover; @@ -505,7 +505,7 @@ void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) if (icosq) snprintf(icosq_str, sizeof(icosq_str), "ICOSQ: 0x%x, ", icosq->sqn); snprintf(err_str, sizeof(err_str), - "RX timeout on channel: %d, %sRQ: 0x%x, CQ: 0x%x", + "RX timeout on channel: %d, %s RQ: 0x%x, CQ: 0x%x", rq->ix, icosq_str, rq->rqn, rq->cq.mcq.cqn); mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); From 41e63c2baa11dc2aa71df5dd27a5bd87d11b6bbb Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:45 -0800 Subject: [PATCH 290/560] net/mlx5e: Check return value of snprintf writing to fw_version buffer Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. Without check, will trigger -Wformat-truncation with W=1. drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c: In function 'mlx5e_ethtool_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:49:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 49 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c:48:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 48 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 50 | fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 51 | mdev->board_id); | ~~~~~~~~~~~~~~~ Fixes: 84e11edb71de ("net/mlx5e: Show board id in ethtool driver information") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 215261a692550..792a0ea544cd3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -43,12 +43,17 @@ void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv, struct ethtool_drvinfo *drvinfo) { struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev), - mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); + strscpy(drvinfo->bus_info, dev_name(mdev->device), sizeof(drvinfo->bus_info)); } From 1b2bd0c0264febcd8d47209079a6671c38e6558b Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:46 -0800 Subject: [PATCH 291/560] net/mlx5e: Check return value of snprintf writing to fw_version buffer for representors Treat the operation as an error case when the return value is equivalent to the size of the name buffer. Failed to write null terminator to the name buffer, making the string malformed and should not be used. Provide a string with only the firmware version when forming the string with the board id fails. This logic for representors is identical to normal flow with ethtool. Without check, will trigger -Wformat-truncation with W=1. drivers/net/ethernet/mellanox/mlx5/core/en_rep.c: In function 'mlx5e_rep_get_drvinfo': drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:78:31: warning: '%.16s' directive output may be truncated writing up to 16 bytes into a region of size between 13 and 22 [-Wformat-truncation=] 78 | "%d.%d.%04d (%.16s)", | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:77:9: note: 'snprintf' output between 12 and 37 bytes into a destination of size 32 77 | snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | "%d.%d.%04d (%.16s)", | ~~~~~~~~~~~~~~~~~~~~~ 79 | fw_rev_maj(mdev), fw_rev_min(mdev), | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 80 | fw_rev_sub(mdev), mdev->board_id); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: cf83c8fdcd47 ("net/mlx5e: Add missing ethtool driver info for representors") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-16-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 693e55b010d9e..3ab682bbcf867 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -71,13 +71,17 @@ static void mlx5e_rep_get_drvinfo(struct net_device *dev, { struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; + int count; strscpy(drvinfo->driver, mlx5e_rep_driver_name, sizeof(drvinfo->driver)); - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%04d (%.16s)", - fw_rev_maj(mdev), fw_rev_min(mdev), - fw_rev_sub(mdev), mdev->board_id); + count = snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d (%.16s)", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev), mdev->board_id); + if (count == sizeof(drvinfo->fw_version)) + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%04d", fw_rev_maj(mdev), + fw_rev_min(mdev), fw_rev_sub(mdev)); } static const struct counter_desc sw_rep_stats_desc[] = { From 7cd5af0e937a197295f3aa3721031f0fbae49cff Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 13 Nov 2023 12:53:28 -0500 Subject: [PATCH 292/560] net: sched: do not offload flows with a helper in act_ct There is no hardware supporting ct helper offload. However, prior to this patch, a flower filter with a helper in the ct action can be successfully set into the HW, for example (eth1 is a bnxt NIC): # tc qdisc add dev eth1 ingress_block 22 ingress # tc filter add block 22 proto ip flower skip_sw ip_proto tcp \ dst_port 21 ct_state -trk action ct helper ipv4-tcp-ftp # tc filter show dev eth1 ingress filter block 22 protocol ip pref 49152 flower chain 0 handle 0x1 eth_type ipv4 ip_proto tcp dst_port 21 ct_state -trk skip_sw in_hw in_hw_count 1 <---- action order 1: ct zone 0 helper ipv4-tcp-ftp pipe index 2 ref 1 bind 1 used_hw_stats delayed This might cause the flower filter not to work as expected in the HW. This patch avoids this problem by simply returning -EOPNOTSUPP in tcf_ct_offload_act_setup() to not allow to offload flows with a helper in act_ct. Fixes: a21b06e73191 ("net: sched: add helper support in act_ct") Signed-off-by: Xin Long Reviewed-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/f8685ec7702c4a448a1371a8b34b43217b583b9d.1699898008.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 9 +++++++++ net/sched/act_ct.c | 3 +++ 2 files changed, 12 insertions(+) diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8a6dbfb233362..77f87c622a2ef 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -58,6 +58,11 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) return to_ct_params(a)->nf_ft; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return to_ct_params(a)->helper; +} + #else static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; } static inline int tcf_ct_action(const struct tc_action *a) { return 0; } @@ -65,6 +70,10 @@ static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a) { return NULL; } +static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a) +{ + return NULL; +} #endif /* CONFIG_NF_CONNTRACK */ #if IS_ENABLED(CONFIG_NET_ACT_CT) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0db0ecf1d1103..b3f4a503ee2ba 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1549,6 +1549,9 @@ static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data, if (bind) { struct flow_action_entry *entry = entry_data; + if (tcf_ct_helper(act)) + return -EOPNOTSUPP; + entry->id = FLOW_ACTION_CT; entry->ct.action = tcf_ct_action(act); entry->ct.zone = tcf_ct_zone(act); From 7e1caeace0418381f36b3aa8403dfd82fc57fc53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 14 Nov 2023 18:59:15 +0100 Subject: [PATCH 293/560] macvlan: Don't propagate promisc change to lower dev in passthru Macvlan device in passthru mode sets its lower device promiscuous mode according to its MACVLAN_FLAG_NOPROMISC flag instead of synchronizing it to its own promiscuity setting. However, macvlan_change_rx_flags() function doesn't check the mode before propagating such changes to the lower device which can cause net_device->promiscuity counter overflow as illustrated by reproduction example [0] and resulting dmesg log [1]. Fix the issue by first verifying the mode in macvlan_change_rx_flags() function before propagating promiscuous mode change to the lower device. [0]: ip link add macvlan1 link enp8s0f0 type macvlan mode passthru ip link set macvlan1 promisc on ip l set dev macvlan1 up ip link set macvlan1 promisc off ip l set dev macvlan1 down ip l set dev macvlan1 up [1]: [ 5156.281724] macvlan1: entered promiscuous mode [ 5156.285467] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.287639] macvlan1: left promiscuous mode [ 5156.288339] mlx5_core 0000:08:00.0 enp8s0f0: left promiscuous mode [ 5156.290907] mlx5_core 0000:08:00.0 enp8s0f0: entered promiscuous mode [ 5156.317197] mlx5_core 0000:08:00.0 enp8s0f0: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken. Fixes: efdbd2b30caa ("macvlan: Propagate promiscuity setting to lower devices.") Reviewed-by: Gal Pressman Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231114175915.1649154-1-vladbu@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 02bd201bc7e58..c8da94af4161a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -780,7 +780,7 @@ static void macvlan_change_rx_flags(struct net_device *dev, int change) if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) + if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); From 1c4a7587d1bbee0fd53b63af60e4244a62775f57 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 1 Nov 2023 02:46:27 +0900 Subject: [PATCH 294/560] modpost: fix section mismatch message for RELA The section mismatch check prints a bogus symbol name on some architectures. [test code] #include int __initdata foo; int get_foo(void) { return foo; } If you compile it with GCC for riscv or loongarch, modpost will show an incorrect symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> done (section: .init.data) To get the correct symbol address, the st_value must be added. This issue has never been noticed since commit 93684d3b8062 ("kbuild: include symbol names in section mismatch warnings") presumably because st_value becomes zero on most architectures when the referenced symbol is looked up. It is not true for riscv or loongarch, at least. With this fix, modpost will show the correct symbol name: WARNING: modpost: vmlinux: section mismatch in reference: get_foo+0x8 (section: .text) -> foo (section: .init.data) Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/mod/modpost.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 973b5e5ae2ddd..cb6406f485a96 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1383,13 +1383,15 @@ static void section_rela(struct module *mod, struct elf_info *elf, const Elf_Rela *rela; for (rela = start; rela < stop; rela++) { + Elf_Sym *tsym; Elf_Addr taddr, r_offset; unsigned int r_type, r_sym; r_offset = TO_NATIVE(rela->r_offset); get_rel_type_and_sym(elf, rela->r_info, &r_type, &r_sym); - taddr = TO_NATIVE(rela->r_addend); + tsym = elf->symtab_start + r_sym; + taddr = tsym->st_value + TO_NATIVE(rela->r_addend); switch (elf->hdr->e_machine) { case EM_RISCV: @@ -1404,7 +1406,7 @@ static void section_rela(struct module *mod, struct elf_info *elf, break; } - check_section_mismatch(mod, elf, elf->symtab_start + r_sym, + check_section_mismatch(mod, elf, tsym, fsecndx, fromsec, r_offset, taddr); } } From b3e0f94d15700ac8e8c1c2355834f5d5c753c41d Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Thu, 9 Nov 2023 19:02:14 -0500 Subject: [PATCH 295/560] drm/msm/dsi: use the correct VREG_CTRL_1 value for 4nm cphy Use the same value as the downstream driver. This change is needed for CPHY mode to work correctly. Fixes: 8b034e677111 ("drm/msm/dsi: add support for DSI-PHY on SM8550") Signed-off-by: Jonathan Marek Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/566987/ Link: https://lore.kernel.org/r/20231110000216.29979-1-jonathan@marek.ca Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c index 3b1ed02f644d2..89a6344bc8653 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c @@ -918,7 +918,7 @@ static int dsi_7nm_phy_enable(struct msm_dsi_phy *phy, if ((phy->cfg->quirks & DSI_PHY_7NM_QUIRK_V5_2)) { if (phy->cphy_mode) { vreg_ctrl_0 = 0x45; - vreg_ctrl_1 = 0x45; + vreg_ctrl_1 = 0x41; glbl_rescode_top_ctrl = 0x00; glbl_rescode_bot_ctrl = 0x00; } else { From 3944e343e54b93d3fef30eacc4738e77fdf5444e Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 7 Nov 2023 13:14:13 +0200 Subject: [PATCH 296/560] drm/msm: remove exra drm_kms_helper_poll_init() call It seems during rebases I have left a call to drm_kms_helper_poll_init() which is not guarded by the (priv->kms_init) check. This leads to the crash for the boards which don't have KMS output. Drop this call, as there is a correctly guarded one next to the one being removed. Fixes: 506efcba3129 ("drm/msm: carve out KMS code from msm_drv.c") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/566299/ Link: https://lore.kernel.org/r/20231107111413.2212942-1-dmitry.baryshkov@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/msm_drv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index b61ccea053276..841f9c102b28c 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -286,8 +286,6 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) if (ret) goto err_msm_uninit; - drm_kms_helper_poll_init(ddev); - if (priv->kms_init) { drm_kms_helper_poll_init(ddev); msm_fbdev_setup(ddev); From a33b2431d11b4df137bbcfdd5a5adfa054c2479e Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 30 Oct 2023 16:23:20 -0700 Subject: [PATCH 297/560] drm/msm/dpu: Add missing safe_lut_tbl in sc8280xp catalog During USB transfers on the SC8280XP __arm_smmu_tlb_sync() is seen to typically take 1-2ms to complete. As expected this results in poor performance, something that has been mitigated by proposing running the iommu in non-strict mode (boot with iommu.strict=0). This turns out to be related to the SAFE logic, and programming the QOS SAFE values in the DPU (per suggestion from Rob and Doug) reduces the TLB sync time to below 10us, which means significant less time spent with interrupts disabled and a significant boost in throughput. Fixes: 4a352c2fc15a ("drm/msm/dpu: Introduce SC8280XP") Cc: stable@vger.kernel.org Suggested-by: Doug Anderson Suggested-by: Rob Clark Signed-off-by: Bjorn Andersson Tested-by: Johan Hovold Tested-by: Steev Klimaszewski Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/565094/ Link: https://lore.kernel.org/r/20231030-sc8280xp-dpu-safe-lut-v1-1-6d485d7b428f@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h index 1ccd1edd693c5..4c0528794e7a7 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h @@ -406,6 +406,7 @@ static const struct dpu_perf_cfg sc8280xp_perf_data = { .min_llcc_ib = 0, .min_dram_ib = 800000, .danger_lut_tbl = {0xf, 0xffff, 0x0}, + .safe_lut_tbl = {0xfe00, 0xfe00, 0xffff}, .qos_lut_tbl = { {.nentry = ARRAY_SIZE(sc8180x_qos_linear), .entries = sc8180x_qos_linear From ebfa85c504cb34f2fd3257c6f5d54158a0ff1bf6 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 25 Oct 2023 12:23:09 +0300 Subject: [PATCH 298/560] drm/msm/dp: don't touch DP subconnector property in eDP case In case of the eDP connection there is no subconnetor and as such no subconnector property. Put drm_dp_set_subconnector_property() calls under the !is_edp condition. Fixes: bfcc3d8f94f4 ("drm/msm/dp: support setting the DP subconnector type") Signed-off-by: Abel Vesa Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Tested-by: Jessica Zhang # SC7280 Reviewed-by: Johan Hovold Tested-by: Johan Hovold Patchwork: https://patchwork.freedesktop.org/patch/564284/ Link: https://lore.kernel.org/r/20231025092711.851168-2-dmitry.baryshkov@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 17bfa72727aac..165e4a51d52fe 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -365,9 +365,11 @@ static int dp_display_send_hpd_notification(struct dp_display_private *dp, /* reset video pattern flag on disconnect */ if (!hpd) { dp->panel->video_test = false; - drm_dp_set_subconnector_property(dp->dp_display.connector, - connector_status_disconnected, - dp->panel->dpcd, dp->panel->downstream_ports); + if (!dp->dp_display.is_edp) + drm_dp_set_subconnector_property(dp->dp_display.connector, + connector_status_disconnected, + dp->panel->dpcd, + dp->panel->downstream_ports); } dp->dp_display.is_connected = hpd; @@ -396,8 +398,11 @@ static int dp_display_process_hpd_high(struct dp_display_private *dp) dp_link_process_request(dp->link); - drm_dp_set_subconnector_property(dp->dp_display.connector, connector_status_connected, - dp->panel->dpcd, dp->panel->downstream_ports); + if (!dp->dp_display.is_edp) + drm_dp_set_subconnector_property(dp->dp_display.connector, + connector_status_connected, + dp->panel->dpcd, + dp->panel->downstream_ports); edid = dp->panel->edid; From 21133266ca12f82b6e59c9711258cca2097c167c Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Wed, 25 Oct 2023 12:23:10 +0300 Subject: [PATCH 299/560] drm/msm/dp: attach the DP subconnector property While developing and testing the commit bfcc3d8f94f4 ("drm/msm/dp: support setting the DP subconnector type") I had the patch [1] in my tree. I haven't noticed that it was a dependency for the commit in question. Mea culpa. Since the patch has not landed yet (and even was not reviewed) and since one of the bridges erroneously uses USB connector type instead of DP, attach the property directly from the MSM DP driver. This fixes the following oops on DP HPD event: drm_object_property_set_value (drivers/gpu/drm/drm_mode_object.c:288) dp_display_process_hpd_high (drivers/gpu/drm/msm/dp/dp_display.c:402) dp_hpd_plug_handle.isra.0 (drivers/gpu/drm/msm/dp/dp_display.c:604) hpd_event_thread (drivers/gpu/drm/msm/dp/dp_display.c:1110) kthread (kernel/kthread.c:388) ret_from_fork (arch/arm64/kernel/entry.S:858) [1] https://patchwork.freedesktop.org/patch/555530/ Fixes: bfcc3d8f94f4 ("drm/msm/dp: support setting the DP subconnector type") Reviewed-by: Abhinav Kumar Signed-off-by: Dmitry Baryshkov Tested-by: Jessica Zhang # SC7280 Reviewed-by: Johan Hovold Tested-by: Johan Hovold Patchwork: https://patchwork.freedesktop.org/patch/564286/ Link: https://lore.kernel.org/r/20231025092711.851168-3-dmitry.baryshkov@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_drm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/dp/dp_drm.c b/drivers/gpu/drm/msm/dp/dp_drm.c index 40e7344180e3e..e3bdd7dd4cdc7 100644 --- a/drivers/gpu/drm/msm/dp/dp_drm.c +++ b/drivers/gpu/drm/msm/dp/dp_drm.c @@ -345,6 +345,9 @@ struct drm_connector *dp_drm_connector_init(struct msm_dp *dp_display, struct dr if (IS_ERR(connector)) return connector; + if (!dp_display->is_edp) + drm_connector_attach_dp_subconnector_property(connector); + drm_connector_attach_encoder(connector, encoder); return connector; From ba276ce5865b5a22ee96c4c5664bfefd9c1bb593 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 Nov 2023 19:11:04 -0500 Subject: [PATCH 300/560] bcachefs: Fix missing locking for dentry->d_parent access Reported-by: Al Viro Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index a39ff0c296ecf..79d982674c180 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, s.v = v + 1; s.defined = true; } else { + /* + * Check if this option was set on the parent - if so, switched + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { struct bch_inode_info *dir = to_bch_ei(d_inode(dentry->d_parent)); @@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } else { s.v = 0; } + spin_unlock(&dentry->d_lock); s.defined = false; } From ccab434e674ca95d483788b1895a70c21b7f016a Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 15 Nov 2023 11:08:57 +0100 Subject: [PATCH 301/560] usb: aqc111: check packet for fixup for true limit If a device sends a packet that is inbetween 0 and sizeof(u64) the value passed to skb_trim() as length will wrap around ending up as some very large value. The driver will then proceed to parse the header located at that position, which will either oops or process some random value. The fix is to check against sizeof(u64) rather than 0, which the driver currently does. The issue exists since the introduction of the driver. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/aqc111.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index a017e9de2119d..7b8afa589a53c 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -1079,17 +1079,17 @@ static int aqc111_rx_fixup(struct usbnet *dev, struct sk_buff *skb) u16 pkt_count = 0; u64 desc_hdr = 0; u16 vlan_tag = 0; - u32 skb_len = 0; + u32 skb_len; if (!skb) goto err; - if (skb->len == 0) + skb_len = skb->len; + if (skb_len < sizeof(desc_hdr)) goto err; - skb_len = skb->len; /* RX Descriptor Header */ - skb_trim(skb, skb->len - sizeof(desc_hdr)); + skb_trim(skb, skb_len - sizeof(desc_hdr)); desc_hdr = le64_to_cpup((u64 *)skb_tail_pointer(skb)); /* Check these packets */ From 7fbd5fc2b35a8f559a6b380dfa9bcd964a758186 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 15 Nov 2023 11:53:31 +0100 Subject: [PATCH 302/560] stmmac: dwmac-loongson: Add architecture dependency Only present the DWMAC_LOONGSON option on architectures where it can actually be used. This follows the same logic as the DWMAC_INTEL option. Signed-off-by: Jean Delvare Cc: Keguang Zhang Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index a2b9e289aa36a..85dcda51df052 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -280,7 +280,7 @@ config DWMAC_INTEL config DWMAC_LOONGSON tristate "Loongson PCI DWMAC support" default MACH_LOONGSON64 - depends on STMMAC_ETH && PCI + depends on (MACH_LOONGSON64 || COMPILE_TEST) && STMMAC_ETH && PCI depends on COMMON_CLK help This selects the LOONGSON PCI bus support for the stmmac driver, From 0c3bd086d12d185650d095a906662593ec607bd0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Nov 2023 17:15:40 +0000 Subject: [PATCH 303/560] rxrpc: Fix some minor issues with bundle tracing Fix some superficial issues with the tracing of rxrpc_bundle structs, including: (1) Set the debug_id when the bundle is allocated rather than when it is set up so that the "NEW" trace line displays the correct bundle ID. (2) Show the refcount when emitting the "FREE" traceline. Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/conn_client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 981ca5b98bcb9..1d95f8bc769fa 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -73,6 +73,7 @@ static void rxrpc_destroy_client_conn_ids(struct rxrpc_local *local) static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, gfp_t gfp) { + static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle; bundle = kzalloc(sizeof(*bundle), gfp); @@ -85,6 +86,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_call *call, bundle->upgrade = test_bit(RXRPC_CALL_UPGRADE, &call->flags); bundle->service_id = call->dest_srx.srx_service; bundle->security_level = call->security_level; + bundle->debug_id = atomic_inc_return(&rxrpc_bundle_id); refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); INIT_LIST_HEAD(&bundle->waiting_calls); @@ -105,7 +107,8 @@ struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle, static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { - trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); + trace_rxrpc_bundle(bundle->debug_id, refcount_read(&bundle->ref), + rxrpc_bundle_free); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); key_put(bundle->key); kfree(bundle); @@ -239,7 +242,6 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) */ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) { - static atomic_t rxrpc_bundle_id; struct rxrpc_bundle *bundle, *candidate; struct rxrpc_local *local = call->local; struct rb_node *p, **pp, *parent; @@ -306,7 +308,6 @@ int rxrpc_look_up_bundle(struct rxrpc_call *call, gfp_t gfp) } _debug("new bundle"); - candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); call->bundle = rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); From d565fa4300d9ebd5ba3bbd259ce841f8dab609d6 Mon Sep 17 00:00:00 2001 From: Gerd Bayer Date: Wed, 15 Nov 2023 16:59:58 +0100 Subject: [PATCH 304/560] s390/ism: ism driver implies smc protocol Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") you can build the ism code without selecting the SMC network protocol. That leaves some ism functions be reported as unused. Move these functions under the conditional compile with CONFIG_SMC. Also codify the suggestion to also configure the SMC protocol in ism's Kconfig - but with an "imply" rather than a "select" as SMC depends on other config options and allow for a deliberate decision not to build SMC. Also, mention that in ISM's help. Fixes: a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") Reported-by: Randy Dunlap Closes: https://lore.kernel.org/netdev/afd142a2-1fa0-46b9-8b2d-7652d41d3ab8@infradead.org/ Signed-off-by: Gerd Bayer Reviewed-by: Wenjia Zhang Reviewed-by: Simon Horman Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- drivers/s390/net/Kconfig | 3 +- drivers/s390/net/ism_drv.c | 93 +++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 4902d45e929ce..c61e6427384c3 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -103,10 +103,11 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI + imply SMC default n help Select this option if you want to use the Internal Shared Memory - vPCI Adapter. + vPCI Adapter. The adapter can be used with the SMC network protocol. To compile as a module choose M. The module name is ism. If unsure, choose N. diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 6df7f377d2f90..81aabbfbbe2ca 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -30,7 +30,6 @@ static const struct pci_device_id ism_device_table[] = { MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -static const struct smcd_ops ism_ops; #define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ @@ -289,22 +288,6 @@ static int ism_read_local_gid(struct ism_dev *ism) return ret; } -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -429,23 +412,6 @@ static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -503,14 +469,6 @@ u8 *ism_get_seid(void) } EXPORT_SYMBOL_GPL(ism_get_seid); -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -569,11 +527,6 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -774,6 +727,22 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) +static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, + u32 vid) +{ + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + cmd.request.rgid = rgid; + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static int smcd_query_rgid(struct smcd_dev *smcd, u64 rgid, u32 vid_valid, u32 vid) { @@ -811,6 +780,23 @@ static int smcd_reset_vlan_required(struct smcd_dev *smcd) return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); } +static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info) +{ + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + cmd.request.rgid = rgid; + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static int smcd_signal_ieq(struct smcd_dev *smcd, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -830,11 +816,24 @@ static int smcd_supports_v2(void) SYSTEM_EID.type[0] != '0'; } +static u64 ism_get_local_gid(struct ism_dev *ism) +{ + return ism->local_gid; +} + static u64 smcd_get_local_gid(struct smcd_dev *smcd) { return ism_get_local_gid(smcd->priv); } +static u16 ism_get_chid(struct ism_dev *ism) +{ + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static u16 smcd_get_chid(struct smcd_dev *smcd) { return ism_get_chid(smcd->priv); From 75a50c4f5b957a34dedb7c8cce52b582a9162828 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 16 Nov 2023 18:01:41 +0100 Subject: [PATCH 305/560] kselftest: rtnetlink: fix ip route command typo The blamed commit below introduced a typo causing 'gretap' test-case failures: ./rtnetlink.sh -t kci_test_gretap -v COMMAND: ip link add name test-dummy0 type dummy COMMAND: ip link set test-dummy0 up COMMAND: ip netns add testns COMMAND: ip link help gretap 2>&1 | grep -q '^Usage:' COMMAND: ip -netns testns link add dev gretap00 type gretap seq key 102 local 172.16.1.100 remote 172.16.1.200 COMMAND: ip -netns testns addr add dev gretap00 10.1.1.100/24 COMMAND: ip -netns testns link set dev gretap00 ups Error: either "dev" is duplicate, or "ups" is a garbage. COMMAND: ip -netns testns link del gretap00 COMMAND: ip -netns testns link add dev gretap00 type gretap external COMMAND: ip -netns testns link del gretap00 FAIL: gretap Fix it by using the correct keyword. Fixes: 9c2a19f71515 ("kselftest: rtnetlink.sh: add verbose flag") Signed-off-by: Paolo Abeni Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/rtnetlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 5f2b3f6c0d749..38be9706c45f1 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -859,7 +859,7 @@ kci_test_gretap() run_cmd ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 - run_cmd ip -netns "$testns" link set dev $DEV_NS ups + run_cmd ip -netns "$testns" link set dev $DEV_NS up run_cmd ip -netns "$testns" link del "$DEV_NS" # test external mode From 3798680f2fbbe0ca3ab6138b34e0d161c36497ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Nov 2023 13:12:58 +0000 Subject: [PATCH 306/560] rxrpc: Fix RTT determination to use any ACK as a source Fix RTT determination to be able to use any type of ACK as the response from which RTT can be calculated provided its ack.serial is non-zero and matches the serial number of an outgoing DATA or ACK packet. This shouldn't be limited to REQUESTED-type ACKs as these can have other types substituted for them for things like duplicate or out-of-order packets. Fixes: 4700c4d80b7b ("rxrpc: Fix loss of RTT samples due to interposed ACK") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- include/trace/events/rxrpc.h | 2 +- net/rxrpc/input.c | 35 ++++++++++++++++------------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4c53a5ef6257b..f7e537f64db45 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -328,7 +328,7 @@ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ - EM(rxrpc_rtt_rx_cancel, "CNCL") \ + EM(rxrpc_rtt_rx_other_ack, "OACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ EM(rxrpc_rtt_rx_ping_response, "PONG") \ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 030d64f282f37..3f9594d125192 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -643,12 +643,8 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - if (type != rxrpc_rtt_rx_cancel) - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, - sent_at, resp_time); - else - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_cancel, i, - orig_serial, acked_serial, 0, 0); + rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + sent_at, resp_time); matched = true; } @@ -801,20 +797,21 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) summary.ack_reason, nr_acks); rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); - switch (ack.reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - if (acked_serial != 0) + if (acked_serial != 0) { + switch (ack.reason) { + case RXRPC_ACK_PING_RESPONSE: rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_cancel); - break; + rxrpc_rtt_rx_ping_response); + break; + case RXRPC_ACK_REQUESTED: + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_requested_ack); + break; + default: + rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, + rxrpc_rtt_rx_other_ack); + break; + } } if (ack.reason == RXRPC_ACK_PING) { From 1a01319feef7047aa2ba400ffa3e047776aa29ca Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Nov 2023 13:12:59 +0000 Subject: [PATCH 307/560] rxrpc: Defer the response to a PING ACK until we've parsed it Defer the generation of a PING RESPONSE ACK in response to a PING ACK until we've parsed the PING ACK so that we pick up any changes to the packet queue so that we can update ackinfo. This is also applied to an ACK generated in response to an ACK with the REQUEST_ACK flag set. Note that whilst the problem was added in commit 248f219cb8bc, it didn't really matter at that point because the ACK was proposed in softirq mode and generated asynchronously later in process context, taking the latest values at the time. But this fix is only needed since the move to parse incoming packets in an I/O thread rather than in softirq and generate the ACK at point of proposal (b0346843b1076b34a0278ff601f8f287535cb064). Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells cc: Marc Dionne cc: "David S. Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/rxrpc/input.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3f9594d125192..92495e73b8699 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -814,14 +814,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } } - if (ack.reason == RXRPC_ACK_PING) { - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, - rxrpc_propose_ack_respond_to_ping); - } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, - rxrpc_propose_ack_respond_to_ack); - } - /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. @@ -832,7 +824,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto send_response; } /* If we get an OUT_OF_SEQUENCE ACK from the server, that can also @@ -846,7 +838,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto send_response; } /* Discard any out-of-order or duplicate ACKs (outside lock). */ @@ -854,7 +846,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->acks_first_seq, prev_pkt, call->acks_prev_seq); - return; + goto send_response; } info.rxMTU = 0; @@ -894,7 +886,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_SERVER_AWAIT_ACK: break; default: - return; + goto send_response; } if (before(hard_ack, call->acks_hard_ack) || @@ -906,7 +898,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); - return; + goto send_response; } } @@ -924,6 +916,14 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, skb, &summary, acked_serial); + +send_response: + if (ack.reason == RXRPC_ACK_PING) + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_propose_ack_respond_to_ping); + else if (sp->hdr.flags & RXRPC_REQUEST_ACK) + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_propose_ack_respond_to_ack); } /* From 76df934c6d5f5c93ba7a0112b1818620ddc10b19 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Nov 2023 12:11:51 -0800 Subject: [PATCH 308/560] MAINTAINERS: Add netdev subsystem profile link The netdev subsystem has had a subsystem process document for a while now. Link it appropriately in MAINTAINERS with the P: tag. Cc: Jakub Kicinski Cc: "David S. Miller" Cc: Eric Dumazet Cc: Paolo Abeni Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 482d428472e76..80a95878c61d7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14994,6 +14994,7 @@ M: Jakub Kicinski M: Paolo Abeni L: netdev@vger.kernel.org S: Maintained +P: Documentation/process/maintainer-netdev.rst Q: https://patchwork.kernel.org/project/netdevbpf/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git @@ -15045,6 +15046,7 @@ M: Jakub Kicinski M: Paolo Abeni L: netdev@vger.kernel.org S: Maintained +P: Documentation/process/maintainer-netdev.rst Q: https://patchwork.kernel.org/project/netdevbpf/list/ B: mailto:netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git From 9e0be3f50c0e8517d0238b62409c20bcb8cd8785 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 10 Nov 2023 13:07:22 +0100 Subject: [PATCH 309/560] linux/export: clean up the IA-64 KSYM_FUNC macro With commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), there is no need to keep the IA-64 definition of the KSYM_FUNC macro. Clean up the IA-64 definition of the KSYM_FUNC macro. Signed-off-by: Lukas Bulwahn Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- include/linux/export-internal.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 45fca09b23194..69501e0ec239f 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -50,9 +50,7 @@ " .previous" "\n" \ ) -#ifdef CONFIG_IA64 -#define KSYM_FUNC(name) @fptr(name) -#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT) +#if defined(CONFIG_PARISC) && defined(CONFIG_64BIT) #define KSYM_FUNC(name) P%name #else #define KSYM_FUNC(name) name From 76020731d4ee897411ce4a73916ed805ea15d946 Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Fri, 10 Nov 2023 17:28:01 -0700 Subject: [PATCH 310/560] kbuild: Move the single quotes for image name Add quotes where UIMAGE_NAME is used, rather than where it is defined. This allows the UIMAGE_NAME variable to be set by the user. Signed-off-by: Simon Glass Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 68d0134bdbf9d..1a965fe68e011 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -487,14 +487,14 @@ UIMAGE_OPTS-y ?= UIMAGE_TYPE ?= kernel UIMAGE_LOADADDR ?= arch_must_set_this UIMAGE_ENTRYADDR ?= $(UIMAGE_LOADADDR) -UIMAGE_NAME ?= 'Linux-$(KERNELRELEASE)' +UIMAGE_NAME ?= Linux-$(KERNELRELEASE) quiet_cmd_uimage = UIMAGE $@ cmd_uimage = $(BASH) $(MKIMAGE) -A $(UIMAGE_ARCH) -O linux \ -C $(UIMAGE_COMPRESSION) $(UIMAGE_OPTS-y) \ -T $(UIMAGE_TYPE) \ -a $(UIMAGE_LOADADDR) -e $(UIMAGE_ENTRYADDR) \ - -n $(UIMAGE_NAME) -d $< $@ + -n '$(UIMAGE_NAME)' -d $< $@ # XZ # --------------------------------------------------------------------------- From ae1eff0349f2e908fc083630e8441ea6dc434dc0 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 15 Nov 2023 13:16:53 +0900 Subject: [PATCH 311/560] kconfig: fix memory leak from range properties Currently, sym_validate_range() duplicates the range string using xstrdup(), which is overwritten by a subsequent sym_calc_value() call. It results in a memory leak. Instead, only the pointer should be copied. Below is a test case, with a summary from Valgrind. [Test Kconfig] config FOO int "foo" range 10 20 [Test .config] CONFIG_FOO=0 [Before] LEAK SUMMARY: definitely lost: 3 bytes in 1 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,465 bytes in 21 blocks suppressed: 0 bytes in 0 blocks [After] LEAK SUMMARY: definitely lost: 0 bytes in 0 blocks indirectly lost: 0 bytes in 0 blocks possibly lost: 0 bytes in 0 blocks still reachable: 17,462 bytes in 20 blocks suppressed: 0 bytes in 0 blocks Signed-off-by: Masahiro Yamada --- scripts/kconfig/symbol.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c index 0572330bf8a78..a76925b46ce63 100644 --- a/scripts/kconfig/symbol.c +++ b/scripts/kconfig/symbol.c @@ -122,9 +122,9 @@ static long long sym_get_range_val(struct symbol *sym, int base) static void sym_validate_range(struct symbol *sym) { struct property *prop; + struct symbol *range_sym; int base; long long val, val2; - char str[64]; switch (sym->type) { case S_INT: @@ -140,17 +140,15 @@ static void sym_validate_range(struct symbol *sym) if (!prop) return; val = strtoll(sym->curr.val, NULL, base); - val2 = sym_get_range_val(prop->expr->left.sym, base); + range_sym = prop->expr->left.sym; + val2 = sym_get_range_val(range_sym, base); if (val >= val2) { - val2 = sym_get_range_val(prop->expr->right.sym, base); + range_sym = prop->expr->right.sym; + val2 = sym_get_range_val(range_sym, base); if (val <= val2) return; } - if (sym->type == S_INT) - sprintf(str, "%lld", val2); - else - sprintf(str, "0x%llx", val2); - sym->curr.val = xstrdup(str); + sym->curr.val = range_sym->curr.val; } static void sym_set_changed(struct symbol *sym) From 1ffa8602e39b89469dc703ebab7a7e44c33da0f7 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 13 Sep 2023 16:18:44 -0400 Subject: [PATCH 312/560] drm/amd/display: Guard against invalid RPTR/WPTR being set [WHY] HW can return invalid values on register read, guard against these being set and causing us to access memory out of range and page fault. [HOW] Guard at sync_inbox1 and guard at pushing commands. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Hansen Dsouza Acked-by: Alex Hung Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dmub/src/dmub_srv.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index e43e8d4bfe375..5d36f3e5dc2bd 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -707,9 +707,16 @@ enum dmub_status dmub_srv_sync_inbox1(struct dmub_srv *dmub) return DMUB_STATUS_INVALID; if (dmub->hw_funcs.get_inbox1_rptr && dmub->hw_funcs.get_inbox1_wptr) { - dmub->inbox1_rb.rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); - dmub->inbox1_rb.wrpt = dmub->hw_funcs.get_inbox1_wptr(dmub); - dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + uint32_t rptr = dmub->hw_funcs.get_inbox1_rptr(dmub); + uint32_t wptr = dmub->hw_funcs.get_inbox1_wptr(dmub); + + if (rptr > dmub->inbox1_rb.capacity || wptr > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } else { + dmub->inbox1_rb.rptr = rptr; + dmub->inbox1_rb.wrpt = wptr; + dmub->inbox1_last_wptr = dmub->inbox1_rb.wrpt; + } } return DMUB_STATUS_OK; @@ -743,6 +750,11 @@ enum dmub_status dmub_srv_cmd_queue(struct dmub_srv *dmub, if (!dmub->hw_init) return DMUB_STATUS_INVALID; + if (dmub->inbox1_rb.rptr > dmub->inbox1_rb.capacity || + dmub->inbox1_rb.wrpt > dmub->inbox1_rb.capacity) { + return DMUB_STATUS_HW_FAILURE; + } + if (dmub_rb_push_front(&dmub->inbox1_rb, cmd)) return DMUB_STATUS_OK; From 0288603040c38ccfeb5342f34a52673366d90038 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Wed, 4 Oct 2023 14:24:15 -0400 Subject: [PATCH 313/560] drm/amdgpu: Do not program VF copy regs in mmhub v1.8 under SRIOV (v2) MC_VM_AGP_* registers should not be programmed by guest driver. v2: move early return outside of loop Signed-off-by: Victor Lu Reviewed-by: Samir Dhume Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index ea142611be1c0..9b0146732e13c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -130,6 +130,9 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) uint64_t value; int i; + if (amdgpu_sriov_vf(adev)) + return; + inst_mask = adev->aid_mask; for_each_inst(i, inst_mask) { /* Program the AGP BAR */ @@ -139,9 +142,6 @@ static void mmhub_v1_8_init_system_aperture_regs(struct amdgpu_device *adev) WREG32_SOC15(MMHUB, i, regMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); - if (amdgpu_sriov_vf(adev)) - return; - /* Program the system aperture low logical page number. */ WREG32_SOC15(MMHUB, i, regMC_VM_SYSTEM_APERTURE_LOW_ADDR, min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18); From bdb72185d310fc8049c7ea95221d640e9e7165e5 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Mon, 13 Nov 2023 18:05:34 +0800 Subject: [PATCH 314/560] drm/amdgpu: finalizing mem_partitions at the end of GMC v9 sw_fini The valid num_mem_partitions is required during ttm pool fini, thus move the cleanup at the end of the function. Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index bde25eb4ed8e2..c1f2f166f0640 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -2170,8 +2170,6 @@ static int gmc_v9_0_sw_fini(void *handle) if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) amdgpu_gmc_sysfs_fini(adev); - adev->gmc.num_mem_partitions = 0; - kfree(adev->gmc.mem_partitions); amdgpu_gmc_ras_fini(adev); amdgpu_gem_force_release(adev); @@ -2185,6 +2183,9 @@ static int gmc_v9_0_sw_fini(void *handle) amdgpu_bo_free_kernel(&adev->gmc.pdb0_bo, NULL, &adev->gmc.ptr_pdb0); amdgpu_bo_fini(adev); + adev->gmc.num_mem_partitions = 0; + kfree(adev->gmc.mem_partitions); + return 0; } From 8a0173cd90984835645022bf1997abd1bcd81aae Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Sun, 12 Nov 2023 09:51:19 +0530 Subject: [PATCH 315/560] drm/amdgpu: Address member 'ring' not described in 'amdgpu_ vce, uvd_entity_init()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following: drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c:237: warning: Function parameter or member 'ring' not described in 'amdgpu_vce_entity_init' drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c:405: warning: Function parameter or member 'ring' not described in 'amdgpu_uvd_entity_init' Cc: Christian König Cc: Alex Deucher Cc: "Pan, Xinhui" Signed-off-by: Srinivasan Shanmugam Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 65949cc7abb93..07d930339b078 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -398,6 +398,7 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) * amdgpu_uvd_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 0954447f689d9..59acf424a078f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -230,6 +230,7 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) * amdgpu_vce_entity_init - init entity * * @adev: amdgpu_device pointer + * @ring: amdgpu_ring pointer to check * * Initialize the entity used for handle management in the kernel driver. */ From a58555359a9f870543aaddef277c3396159895ce Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Mon, 23 Oct 2023 13:57:32 -0400 Subject: [PATCH 316/560] drm/amd/display: Fix DSC not Enabled on Direct MST Sink [WHY & HOW] For the scenario when a dsc capable MST sink device is directly connected, it needs to use max dsc compression as the link bw constraint. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Roman Li Acked-by: Alex Hung Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index d3b13d362edac..11da0eebee6c4 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -1604,31 +1604,31 @@ enum dc_status dm_dp_mst_is_port_support_mode( unsigned int upper_link_bw_in_kbps = 0, down_link_bw_in_kbps = 0; unsigned int max_compressed_bw_in_kbps = 0; struct dc_dsc_bw_range bw_range = {0}; - struct drm_dp_mst_topology_mgr *mst_mgr; + uint16_t full_pbn = aconnector->mst_output_port->full_pbn; /* - * check if the mode could be supported if DSC pass-through is supported - * AND check if there enough bandwidth available to support the mode - * with DSC enabled. + * Consider the case with the depth of the mst topology tree is equal or less than 2 + * A. When dsc bitstream can be transmitted along the entire path + * 1. dsc is possible between source and branch/leaf device (common dsc params is possible), AND + * 2. dsc passthrough supported at MST branch, or + * 3. dsc decoding supported at leaf MST device + * Use maximum dsc compression as bw constraint + * B. When dsc bitstream cannot be transmitted along the entire path + * Use native bw as bw constraint */ if (is_dsc_common_config_possible(stream, &bw_range) && - aconnector->mst_output_port->passthrough_aux) { - mst_mgr = aconnector->mst_output_port->mgr; - mutex_lock(&mst_mgr->lock); - + (aconnector->mst_output_port->passthrough_aux || + aconnector->dsc_aux == &aconnector->mst_output_port->aux)) { cur_link_settings = stream->link->verified_link_cap; upper_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, - &cur_link_settings - ); - down_link_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn); + &cur_link_settings); + down_link_bw_in_kbps = kbps_from_pbn(full_pbn); /* pick the bottleneck */ end_to_end_bw_in_kbps = min(upper_link_bw_in_kbps, down_link_bw_in_kbps); - mutex_unlock(&mst_mgr->lock); - /* * use the maximum dsc compression bandwidth as the required * bandwidth for the mode @@ -1643,8 +1643,7 @@ enum dc_status dm_dp_mst_is_port_support_mode( /* check if mode could be supported within full_pbn */ bpp = convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3; pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, bpp, false); - - if (pbn > aconnector->mst_output_port->full_pbn) + if (pbn > full_pbn) return DC_FAIL_BANDWIDTH_VALIDATE; } From 50d51374b498457c4dea26779d32ccfed12ddaff Mon Sep 17 00:00:00 2001 From: YuanShang Date: Tue, 31 Oct 2023 10:32:37 +0800 Subject: [PATCH 317/560] drm/amdgpu: correct chunk_ptr to a pointer to chunk. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable "chunk_ptr" should be a pointer pointing to a struct drm_amdgpu_cs_chunk instead of to a pointer of that. Signed-off-by: YuanShang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index df3ecfa9e13f5..e50be65000303 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -207,7 +207,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } for (i = 0; i < p->nchunks; i++) { - struct drm_amdgpu_cs_chunk __user **chunk_ptr = NULL; + struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL; struct drm_amdgpu_cs_chunk user_chunk; uint32_t __user *cdata; From 786c355797b3942725829d02ce9e2e6a9eba11fe Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 31 Oct 2023 03:14:02 +0800 Subject: [PATCH 318/560] drm/amd/pm: Update metric table for smu v13_0_6 Update pmfw metric table to include pcie instantaneous bandwidth & pcie error counters Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- .../drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h index dab35d878a905..fef2d290f3f25 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h @@ -123,7 +123,7 @@ typedef enum { VOLTAGE_GUARDBAND_COUNT } GFX_GUARDBAND_e; -#define SMU_METRICS_TABLE_VERSION 0x8 +#define SMU_METRICS_TABLE_VERSION 0x9 typedef struct __attribute__((packed, aligned(4))) { uint32_t AccumulationCounter; @@ -211,6 +211,14 @@ typedef struct __attribute__((packed, aligned(4))) { //XGMI Data tranfser size uint64_t XgmiReadDataSizeAcc[8];//in KByte uint64_t XgmiWriteDataSizeAcc[8];//in KByte + + //PCIE BW Data and error count + uint32_t PcieBandwidth[4]; + uint32_t PCIeL0ToRecoveryCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayAAcc; // The Pcie counter itself is accumulated + uint32_t PCIenReplayARolloverCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKSentCountAcc; // The Pcie counter itself is accumulated + uint32_t PCIeNAKReceivedCountAcc; // The Pcie counter itself is accumulated } MetricsTable_t; #define SMU_VF_METRICS_TABLE_VERSION 0x3 From e4d0be18243ca006258b5c7c148796c0b43505c4 Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Tue, 14 Nov 2023 16:17:17 +0800 Subject: [PATCH 319/560] drm/amd/pm: Fill pcie error counters for gpu v1_4 Fill PCIE error counters & instantaneous bandwidth in gpu metrics v1_4 for smu v_13_0_6 Signed-off-by: Asad Kamal Reviewed-by: Le Ma Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 891605d4975f4..84fac499cf0cb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2095,6 +2095,14 @@ static ssize_t smu_v13_0_6_get_gpu_metrics(struct smu_context *smu, void **table smu_v13_0_6_get_current_pcie_link_speed(smu); gpu_metrics->pcie_bandwidth_acc = SMUQ10_ROUND(metrics->PcieBandwidthAcc[0]); + gpu_metrics->pcie_bandwidth_inst = + SMUQ10_ROUND(metrics->PcieBandwidth[0]); + gpu_metrics->pcie_l0_to_recov_count_acc = + metrics->PCIeL0ToRecoveryCountAcc; + gpu_metrics->pcie_replay_count_acc = + metrics->PCIenReplayAAcc; + gpu_metrics->pcie_replay_rover_count_acc = + metrics->PCIenReplayARolloverCountAcc; } gpu_metrics->system_clock_counter = ktime_get_boottime_ns(); From 9725a4f9eb495bfa6c7f5ccdb49440ff06dba0a1 Mon Sep 17 00:00:00 2001 From: Muhammad Ahmed Date: Tue, 31 Oct 2023 16:03:21 -0400 Subject: [PATCH 320/560] drm/amd/display: Add null checks for 8K60 lightup [WHY & HOW] Add some null checks to fix an issue where 8k60 tiled display fails to light up. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Muhammad Ahmed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/gpu/drm/amd/display/dc/core/dc_resource.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 7b9bf5cb45299..d8f4347382123 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3178,7 +3178,7 @@ static bool update_planes_and_stream_state(struct dc *dc, struct pipe_ctx *otg_master = resource_get_otg_master_for_stream(&context->res_ctx, context->streams[i]); - if (otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) + if (otg_master && otg_master->stream->test_pattern.type != DP_TEST_PATTERN_VIDEO_MODE) resource_build_test_pattern_params(&context->res_ctx, otg_master); } } diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c index 1d48278cba96c..a1f1d10039927 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c @@ -5190,6 +5190,9 @@ bool dc_resource_acquire_secondary_pipe_for_mpc_odm_legacy( sec_next = sec_pipe->next_odm_pipe; sec_prev = sec_pipe->prev_odm_pipe; + if (pri_pipe == NULL) + return false; + *sec_pipe = *pri_pipe; sec_pipe->top_pipe = sec_top; From b71f4ade1b8900d30c661d6c27f87c35214c398c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 8 Nov 2023 13:31:57 -0600 Subject: [PATCH 321/560] drm/amd/display: fix a NULL pointer dereference in amdgpu_dm_i2c_xfer() When ddc_service_construct() is called, it explicitly checks both the link type and whether there is something on the link which will dictate whether the pin is marked as hw_supported. If the pin isn't set or the link is not set (such as from unloading/reloading amdgpu in an IGT test) then fail the amdgpu_dm_i2c_xfer() call. Cc: stable@vger.kernel.org Fixes: 22676bc500c2 ("drm/amd/display: Fix dmub soft hang for PSR 1") Link: https://github.com/fwupd/fwupd/issues/6327 Signed-off-by: Mario Limonciello Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 6f99f6754c119..5ec7acf65ee1c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7481,6 +7481,9 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap, int i; int result = -EIO; + if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported) + return result; + cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL); if (!cmd.payloads) From 270b301beca58e427a0fda7523a71a9562e644bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Pekkarinen?= Date: Tue, 14 Nov 2023 17:27:51 +0200 Subject: [PATCH 322/560] drm/amd/display: fix NULL dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will fix a minor issue where a debug message is referencing an struct that has just being checked whether is null or not. This has been noticed by using coccinelle, in the following output: drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c:540:25-29: ERROR: aconnector is NULL but dereferenced. Fixes: 5d72e247e58c ("drm/amd/display: switch DC over to the new DRM logging macros") Signed-off-by: José Pekkarinen Signed-off-by: Hamza Mahfooz Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index ed784cf27d396..c7a29bb737e24 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -536,11 +536,8 @@ bool dm_helpers_dp_read_dpcd( struct amdgpu_dm_connector *aconnector = link->priv; - if (!aconnector) { - drm_dbg_dp(aconnector->base.dev, - "Failed to find connector for link!\n"); + if (!aconnector) return false; - } return drm_dp_dpcd_read(&aconnector->dm_dp_aux.aux, address, data, size) == size; From 435f5b369657cffee4b04db1f5805b48599f4dbe Mon Sep 17 00:00:00 2001 From: Tianci Yin Date: Wed, 1 Nov 2023 09:47:13 +0800 Subject: [PATCH 323/560] drm/amd/display: Enable fast plane updates on DCN3.2 and above [WHY] When cursor moves across screen boarder, lag cursor observed, since subvp settings need to sync up with vblank that causes cursor updates being delayed. [HOW] Enable fast plane updates on DCN3.2 to fix it. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Aurabindo Pillai Acked-by: Alex Hung Signed-off-by: Tianci Yin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5ec7acf65ee1c..9581510d37401 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9606,14 +9606,14 @@ static bool should_reset_plane(struct drm_atomic_state *state, struct drm_plane *other; struct drm_plane_state *old_other_state, *new_other_state; struct drm_crtc_state *new_crtc_state; + struct amdgpu_device *adev = drm_to_adev(plane->dev); int i; /* - * TODO: Remove this hack once the checks below are sufficient - * enough to determine when we need to reset all the planes on - * the stream. + * TODO: Remove this hack for all asics once it proves that the + * fast updates works fine on DCN3.2+. */ - if (state->allow_modeset) + if (adev->ip_versions[DCE_HWIP][0] < IP_VERSION(3, 2, 0) && state->allow_modeset) return true; /* Exit early if we know that we're adding or removing the plane. */ From 923bbfe6c888812db1088d684bd30c24036226d2 Mon Sep 17 00:00:00 2001 From: Paul Hsieh Date: Wed, 25 Oct 2023 10:53:35 +0800 Subject: [PATCH 324/560] drm/amd/display: Clear dpcd_sink_ext_caps if not set [WHY] Some eDP panels' ext caps don't set initial values and the value of dpcd_addr (0x317) is random. It means that sometimes the eDP can be OLED, miniLED and etc, and cause incorrect backlight control interface. [HOW] Add remove_sink_ext_caps to remove sink ext caps (HDR, OLED and etc) Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Anthony Koo Acked-by: Alex Hung Signed-off-by: Paul Hsieh Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_types.h | 1 + drivers/gpu/drm/amd/display/dc/link/link_detection.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index cea666ea66c61..fcb825e4f1bb8 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -177,6 +177,7 @@ struct dc_panel_patch { unsigned int disable_fams; unsigned int skip_avmute; unsigned int mst_start_top_delay; + unsigned int remove_sink_ext_caps; }; struct dc_edid_caps { diff --git a/drivers/gpu/drm/amd/display/dc/link/link_detection.c b/drivers/gpu/drm/amd/display/dc/link/link_detection.c index d6f0f857c05af..f2fe523f914f1 100644 --- a/drivers/gpu/drm/amd/display/dc/link/link_detection.c +++ b/drivers/gpu/drm/amd/display/dc/link/link_detection.c @@ -1088,6 +1088,9 @@ static bool detect_link_and_local_sink(struct dc_link *link, if (sink->edid_caps.panel_patch.skip_scdc_overwrite) link->ctx->dc->debug.hdmi20_disable = true; + if (sink->edid_caps.panel_patch.remove_sink_ext_caps) + link->dpcd_sink_ext_caps.raw = 0; + if (dc_is_hdmi_signal(link->connector_signal)) read_scdc_caps(link->ddc, link->local_sink); From 07ee43faeb7eb088e49a7549fcabcae94c443d3b Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Mon, 13 Nov 2023 14:34:55 +0800 Subject: [PATCH 325/560] drm/amdgpu: fix ras err_data null pointer issue in amdgpu_ras.c fix ras err_data null pointer issue in amdgpu_ras.c Fixes: 8cc0f5669eb6 ("drm/amdgpu: Support multiple error query modes") Signed-off-by: Yang Wang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 84e5987b14e05..a3dc68e989108 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1188,7 +1188,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, } if (block_obj->hw_ops->query_ras_error_count) - block_obj->hw_ops->query_ras_error_count(adev, &err_data); + block_obj->hw_ops->query_ras_error_count(adev, err_data); if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || (info->head.block == AMDGPU_RAS_BLOCK__GFX) || From 0f216364625cb453b4f933deacfa92df7f2a2fc9 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 10 Nov 2023 13:15:39 +0530 Subject: [PATCH 326/560] drm/amd/pm: Don't send unload message for reset No need to notify about unload during reset. Also remove the FW version check. Signed-off-by: Lijo Lazar Reviewed-by: Yang Wang Reviewed-by: Asad Kamal Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 84fac499cf0cb..0e5a77c3c2e21 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1454,7 +1454,7 @@ static int smu_v13_0_6_register_irq_handler(struct smu_context *smu) static int smu_v13_0_6_notify_unload(struct smu_context *smu) { - if (smu->smc_fw_version <= 0x553500) + if (amdgpu_in_reset(smu->adev)) return 0; dev_dbg(smu->adev->dev, "Notify PMFW about driver unload"); From 5e8a0d3598b47ee5a57708072bdef08816264538 Mon Sep 17 00:00:00 2001 From: Duncan Ma Date: Wed, 25 Oct 2023 19:07:21 -0400 Subject: [PATCH 327/560] drm/amd/display: Negate IPS allow and commit bits [WHY] On s0i3, IPS mask isn't saved and restored. It is reset to zero on exit. If it is cleared unexpectedly, driver will proceed operations while DCN is in IPS2 and cause a hang. [HOW] Negate the bit logic. Default value of zero indicates it is still in IPS2. Driver must poll for the bit to assert. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Duncan Ma Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c | 18 +++++++++--------- drivers/gpu/drm/amd/display/dc/core/dc.c | 4 ++-- drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c index 0fa4fcd00de2c..507a7cf56711f 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c @@ -820,22 +820,22 @@ static void dcn35_set_idle_state(struct clk_mgr *clk_mgr_base, bool allow_idle) if (dc->config.disable_ips == DMUB_IPS_ENABLE || dc->config.disable_ips == DMUB_IPS_DISABLE_DYNAMIC) { - val |= DMUB_IPS1_ALLOW_MASK; - val |= DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val = val & ~DMUB_IPS1_ALLOW_MASK; val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { - val |= DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; - } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS1) { val |= DMUB_IPS1_ALLOW_MASK; val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; + } else if (dc->config.disable_ips == DMUB_IPS_DISABLE_IPS2_Z10) { + val = val & ~DMUB_IPS1_ALLOW_MASK; + val = val & ~DMUB_IPS2_ALLOW_MASK; } if (!allow_idle) { - val = val & ~DMUB_IPS1_ALLOW_MASK; - val = val & ~DMUB_IPS2_ALLOW_MASK; + val |= DMUB_IPS1_ALLOW_MASK; + val |= DMUB_IPS2_ALLOW_MASK; } dcn35_smu_write_ips_scratch(clk_mgr, val); diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index d8f4347382123..76b47f1781279 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4934,8 +4934,8 @@ bool dc_dmub_is_ips_idle_state(struct dc *dc) if (dc->hwss.get_idle_state) idle_state = dc->hwss.get_idle_state(dc); - if ((idle_state & DMUB_IPS1_ALLOW_MASK) || - (idle_state & DMUB_IPS2_ALLOW_MASK)) + if (!(idle_state & DMUB_IPS1_ALLOW_MASK) || + !(idle_state & DMUB_IPS2_ALLOW_MASK)) return true; return false; diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c index e4c007203318b..0e07699c1e835 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c @@ -1202,11 +1202,11 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) allow_state = dc->hwss.get_idle_state(dc); dc->hwss.set_idle_state(dc, false); - if (allow_state & DMUB_IPS2_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS2_ALLOW_MASK)) { // Wait for evaluation time udelay(dc->debug.ips2_eval_delay_us); commit_state = dc->hwss.get_idle_state(dc); - if (commit_state & DMUB_IPS2_COMMIT_MASK) { + if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) { // Tell PMFW to exit low power state dc->clk_mgr->funcs->exit_low_power_state(dc->clk_mgr); @@ -1216,7 +1216,7 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS2_COMMIT_MASK)) + if (commit_state & DMUB_IPS2_COMMIT_MASK) break; udelay(1); @@ -1235,10 +1235,10 @@ void dc_dmub_srv_exit_low_power_state(const struct dc *dc) } dc_dmub_srv_notify_idle(dc, false); - if (allow_state & DMUB_IPS1_ALLOW_MASK) { + if (!(allow_state & DMUB_IPS1_ALLOW_MASK)) { for (i = 0; i < max_num_polls; ++i) { commit_state = dc->hwss.get_idle_state(dc); - if (!(commit_state & DMUB_IPS1_COMMIT_MASK)) + if (commit_state & DMUB_IPS1_COMMIT_MASK) break; udelay(1); From 9ddea8c9775d9379d71e6ac1519c552461b90b07 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Tue, 31 Oct 2023 11:02:49 +0800 Subject: [PATCH 328/560] drm/amdgpu: add and populate the port num into xgmi topology info The port num info is firstly introduced with 20.00.01.13 xgmi ta and make them as part of topology info. Signed-off-by: Shiwu Zhang Reviewed-by: Le Ma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 32b701cc0376d..a21045d018f2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1473,6 +1473,11 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, topology->nodes[i].num_links = (requires_reflection && topology->nodes[i].num_links) ? topology->nodes[i].num_links : node_num_links; } + /* popluate the connected port num info if supported and available */ + if (ta_port_num_support && topology->nodes[i].num_links) { + memcpy(topology->nodes[i].port_num, link_extend_info_output->nodes[i].port_num, + sizeof(struct xgmi_connected_port_num) * TA_XGMI__MAX_PORT_NUM); + } /* reflect the topology information for bi-directionality */ if (requires_reflection && topology->nodes[i].num_hops) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 5d36ad3f48c74..c4d9cbde55b9b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -150,6 +150,7 @@ struct psp_xgmi_node_info { uint8_t is_sharing_enabled; enum ta_xgmi_assigned_sdma_engine sdma_engine; uint8_t num_links; + struct xgmi_connected_port_num port_num[TA_XGMI__MAX_PORT_NUM]; }; struct psp_xgmi_topology_info { From 5911d02cac70d7fb52009fbd37423e63f8f6f9bc Mon Sep 17 00:00:00 2001 From: Lewis Huang Date: Thu, 19 Oct 2023 17:22:21 +0800 Subject: [PATCH 329/560] drm/amd/display: Change the DMCUB mailbox memory location from FB to inbox [WHY] Flush command sent to DMCUB spends more time for execution on a dGPU than on an APU. This causes cursor lag when using high refresh rate mouses. [HOW] 1. Change the DMCUB mailbox memory location from FB to inbox. 2. Only change windows memory to inbox. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Lewis Huang Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 13 ++++---- drivers/gpu/drm/amd/display/dmub/dmub_srv.h | 22 ++++++++----- .../gpu/drm/amd/display/dmub/src/dmub_srv.c | 32 ++++++++++++++----- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 9581510d37401..ee97814ebd994 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2079,7 +2079,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) struct dmub_srv_create_params create_params; struct dmub_srv_region_params region_params; struct dmub_srv_region_info region_info; - struct dmub_srv_fb_params fb_params; + struct dmub_srv_memory_params memory_params; struct dmub_srv_fb_info *fb_info; struct dmub_srv *dmub_srv; const struct dmcub_firmware_header_v1_0 *hdr; @@ -2182,6 +2182,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) adev->dm.dmub_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes) + PSP_HEADER_BYTES; + region_params.is_mailbox_in_inbox = false; status = dmub_srv_calc_region_info(dmub_srv, ®ion_params, ®ion_info); @@ -2205,10 +2206,10 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return r; /* Rebase the regions on the framebuffer address. */ - memset(&fb_params, 0, sizeof(fb_params)); - fb_params.cpu_addr = adev->dm.dmub_bo_cpu_addr; - fb_params.gpu_addr = adev->dm.dmub_bo_gpu_addr; - fb_params.region_info = ®ion_info; + memset(&memory_params, 0, sizeof(memory_params)); + memory_params.cpu_fb_addr = adev->dm.dmub_bo_cpu_addr; + memory_params.gpu_fb_addr = adev->dm.dmub_bo_gpu_addr; + memory_params.region_info = ®ion_info; adev->dm.dmub_fb_info = kzalloc(sizeof(*adev->dm.dmub_fb_info), GFP_KERNEL); @@ -2220,7 +2221,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev) return -ENOMEM; } - status = dmub_srv_calc_fb_info(dmub_srv, &fb_params, fb_info); + status = dmub_srv_calc_mem_info(dmub_srv, &memory_params, fb_info); if (status != DMUB_STATUS_OK) { DRM_ERROR("Error calculating DMUB FB info: %d\n", status); return -EINVAL; diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h index 9665ada0f894b..df63aa8f01e98 100644 --- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h +++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h @@ -195,6 +195,7 @@ struct dmub_srv_region_params { uint32_t vbios_size; const uint8_t *fw_inst_const; const uint8_t *fw_bss_data; + bool is_mailbox_in_inbox; }; /** @@ -214,20 +215,25 @@ struct dmub_srv_region_params { */ struct dmub_srv_region_info { uint32_t fb_size; + uint32_t inbox_size; uint8_t num_regions; struct dmub_region regions[DMUB_WINDOW_TOTAL]; }; /** - * struct dmub_srv_fb_params - parameters used for driver fb setup + * struct dmub_srv_memory_params - parameters used for driver fb setup * @region_info: region info calculated by dmub service - * @cpu_addr: base cpu address for the framebuffer - * @gpu_addr: base gpu virtual address for the framebuffer + * @cpu_fb_addr: base cpu address for the framebuffer + * @cpu_inbox_addr: base cpu address for the gart + * @gpu_fb_addr: base gpu virtual address for the framebuffer + * @gpu_inbox_addr: base gpu virtual address for the gart */ -struct dmub_srv_fb_params { +struct dmub_srv_memory_params { const struct dmub_srv_region_info *region_info; - void *cpu_addr; - uint64_t gpu_addr; + void *cpu_fb_addr; + void *cpu_inbox_addr; + uint64_t gpu_fb_addr; + uint64_t gpu_inbox_addr; }; /** @@ -563,8 +569,8 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, * DMUB_STATUS_OK - success * DMUB_STATUS_INVALID - unspecified error */ -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out); /** diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c index 5d36f3e5dc2bd..22fc4ba96defd 100644 --- a/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c +++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_srv.c @@ -434,7 +434,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, uint32_t fw_state_size = DMUB_FW_STATE_SIZE; uint32_t trace_buffer_size = DMUB_TRACE_BUFFER_SIZE; uint32_t scratch_mem_size = DMUB_SCRATCH_MEM_SIZE; - + uint32_t previous_top = 0; if (!dmub->sw_init) return DMUB_STATUS_INVALID; @@ -459,8 +459,15 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, bios->base = dmub_align(stack->top, 256); bios->top = bios->base + params->vbios_size; - mail->base = dmub_align(bios->top, 256); - mail->top = mail->base + DMUB_MAILBOX_SIZE; + if (params->is_mailbox_in_inbox) { + mail->base = 0; + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = bios->top; + } else { + mail->base = dmub_align(bios->top, 256); + mail->top = mail->base + DMUB_MAILBOX_SIZE; + previous_top = mail->top; + } fw_info = dmub_get_fw_meta_info(params); @@ -479,7 +486,7 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, dmub->fw_version = fw_info->fw_version; } - trace_buff->base = dmub_align(mail->top, 256); + trace_buff->base = dmub_align(previous_top, 256); trace_buff->top = trace_buff->base + dmub_align(trace_buffer_size, 64); fw_state->base = dmub_align(trace_buff->top, 256); @@ -490,11 +497,14 @@ dmub_srv_calc_region_info(struct dmub_srv *dmub, out->fb_size = dmub_align(scratch_mem->top, 4096); + if (params->is_mailbox_in_inbox) + out->inbox_size = dmub_align(mail->top, 4096); + return DMUB_STATUS_OK; } -enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, - const struct dmub_srv_fb_params *params, +enum dmub_status dmub_srv_calc_mem_info(struct dmub_srv *dmub, + const struct dmub_srv_memory_params *params, struct dmub_srv_fb_info *out) { uint8_t *cpu_base; @@ -509,8 +519,8 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, if (params->region_info->num_regions != DMUB_NUM_WINDOWS) return DMUB_STATUS_INVALID; - cpu_base = (uint8_t *)params->cpu_addr; - gpu_base = params->gpu_addr; + cpu_base = (uint8_t *)params->cpu_fb_addr; + gpu_base = params->gpu_fb_addr; for (i = 0; i < DMUB_NUM_WINDOWS; ++i) { const struct dmub_region *reg = @@ -518,6 +528,12 @@ enum dmub_status dmub_srv_calc_fb_info(struct dmub_srv *dmub, out->fb[i].cpu_addr = cpu_base + reg->base; out->fb[i].gpu_addr = gpu_base + reg->base; + + if (i == DMUB_WINDOW_4_MAILBOX && params->cpu_inbox_addr != 0) { + out->fb[i].cpu_addr = (uint8_t *)params->cpu_inbox_addr + reg->base; + out->fb[i].gpu_addr = params->gpu_inbox_addr + reg->base; + } + out->fb[i].size = reg->top - reg->base; } From 0ee057e66c4b782809a0a9265cdac5542e646706 Mon Sep 17 00:00:00 2001 From: Nicholas Susanto Date: Wed, 1 Nov 2023 15:30:10 -0400 Subject: [PATCH 330/560] drm/amd/display: Fix encoder disable logic [WHY] DENTIST hangs when OTG is off and encoder is on. We were not disabling the encoder properly when switching from extended mode to external monitor only. [HOW] Disable the encoder using an existing enable/disable fifo helper instead of enc35_stream_encoder_enable. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Acked-by: Alex Hung Signed-off-by: Nicholas Susanto Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/dcn35/dcn35_dio_stream_encoder.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c index 001f9eb669207..62a8f0b560062 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/dcn35/dcn35_dio_stream_encoder.c @@ -261,12 +261,6 @@ static void enc35_stream_encoder_enable( /* invalid mode ! */ ASSERT_CRITICAL(false); } - - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); - } else { - REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); - REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } } @@ -436,6 +430,8 @@ static void enc35_disable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_ENABLE, 0); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 0); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 0); } static void enc35_enable_fifo(struct stream_encoder *enc) @@ -443,6 +439,8 @@ static void enc35_enable_fifo(struct stream_encoder *enc) struct dcn10_stream_encoder *enc1 = DCN10STRENC_FROM_STRENC(enc); REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7); + REG_UPDATE(DIG_FE_CLK_CNTL, DIG_FE_CLK_EN, 1); + REG_UPDATE(DIG_FE_EN_CNTL, DIG_FE_ENABLE, 1); enc35_reset_fifo(enc, true); enc35_reset_fifo(enc, false); From 564ca1b53ece166b5915c2ac90f3e9313100f4ea Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 14 Nov 2023 11:36:56 -0500 Subject: [PATCH 331/560] drm/amdgpu/gmc11: fix logic typo in AGP check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Should be && rather than ||. Fixes: b2e1cbe6281f ("drm/amdgpu/gmc11: disable AGP on GC 11.5") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 6dce9b29f6756..ba4c82f5e6178 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -640,7 +640,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); - if (!amdgpu_sriov_vf(adev) || + if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) amdgpu_gmc_agp_location(adev, mc); From 6ba5b613837c5d997ad8297b22fc46cd0be58d76 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:31:00 -0500 Subject: [PATCH 332/560] drm/amdgpu: add a module parameter to control the AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a module parameter to control the AGP aperture. The AGP aperture is an aperture in the GPU's internal address space which provides direct non-paged access to the platform address space. This access is non-snooped so only uncached memory can be accessed. Add a knob so that we can toggle this for debugging. Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 5 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index afec09930efa9..9d92ca1576771 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -248,6 +248,7 @@ extern int amdgpu_umsch_mm; extern int amdgpu_seamless; extern int amdgpu_user_partt_mode; +extern int amdgpu_agp; #define AMDGPU_VM_MAX_NUM_CTX 4096 #define AMDGPU_SG_THRESHOLD (256*1024*1024) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 3095a3a864af7..8f24cabe21554 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -207,6 +207,7 @@ int amdgpu_user_partt_mode = AMDGPU_AUTO_COMPUTE_PARTITION_MODE; int amdgpu_umsch_mm; int amdgpu_seamless = -1; /* auto */ uint amdgpu_debug_mask; +int amdgpu_agp = -1; /* auto */ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); @@ -961,6 +962,15 @@ module_param_named(seamless, amdgpu_seamless, int, 0444); MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default"); module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444); +/** + * DOC: agp (int) + * Enable the AGP aperture. This provides an aperture in the GPU's internal + * address space for direct access to system memory. Note that these accesses + * are non-snooped, so they are only used for access to uncached memory. + */ +MODULE_PARM_DESC(agp, "AGP (-1 = auto (default), 0 = disable, 1 = enable)"); +module_param_named(agp, amdgpu_agp, int, 0444); + /* These devices are not supported by amdgpu. * They are supported by the mach64, r128, radeon drivers */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 0ec7b061d7c20..23483bffa1c78 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ba4c82f5e6178..e1078b53e942d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -641,7 +641,8 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && - (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0))) + (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && + (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index c1f2f166f0640..1638c31777992 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ From 0db062eac3e0846c6f120867a79df83b4c3db46f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:34:19 -0500 Subject: [PATCH 333/560] drm/amdgpu/gmc11: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Fixes: 67318cb84341 ("drm/amdgpu/gmc11: set gart placement GC11") Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e1078b53e942d..23d7b548d13f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -642,7 +642,7 @@ static void gmc_v11_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_HIGH); if (!amdgpu_sriov_vf(adev) && (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(11, 5, 0)) && - (amdgpu_agp != 0)) + (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ From 61fc93695bbfde218d5f9f0b8051ce36eb649669 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:38:54 -0500 Subject: [PATCH 334/560] drm/amdgpu/gmc10: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 23483bffa1c78..a5a05c16c10d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -675,7 +675,7 @@ static void gmc_v10_0_vram_gtt_location(struct amdgpu_device *adev, amdgpu_gmc_set_agp_default(adev, mc); amdgpu_gmc_vram_location(adev, &adev->gmc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); /* base offset of vram pages */ From e8c2d3e25b844ad8f7c8b269a7cfd65285329264 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 9 Nov 2023 15:40:00 -0500 Subject: [PATCH 335/560] drm/amdgpu/gmc9: disable AGP aperture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've had misc reports of random IOMMU page faults when this is used. It's just a rarely used optimization anyway, so let's just disable it. It can still be toggled via the module parameter for testing. v2: leave it configurable via module parameter Reviewed-by: Yang Wang (v1) Acked-by: Christian König Reviewed-by: Mario Limonciello Tested-by: Mario Limonciello # PHX & Navi33 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 1638c31777992..2ac5820e9c924 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1630,7 +1630,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, } else { amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc, AMDGPU_GART_PLACEMENT_BEST_FIT); - if (!amdgpu_sriov_vf(adev) && (amdgpu_agp != 0)) + if (!amdgpu_sriov_vf(adev) && (amdgpu_agp == 1)) amdgpu_gmc_agp_location(adev, mc); } /* base offset of vram pages */ From e6bace7313d61e31f2b16fa3d774fd8cb3cb869e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2023 16:26:59 +0000 Subject: [PATCH 336/560] afs: Fix afs_server_list to be cleaned up with RCU afs_server_list is accessed with the rcu_read_lock() held from volume->servers, so it needs to be cleaned up correctly. Fix this by using kfree_rcu() instead of kfree(). Fixes: 8a070a964877 ("afs: Detect cell aliases 1 - Cells with root volumes") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- fs/afs/internal.h | 1 + fs/afs/server_list.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index c9cef3782b4ae..a812952be1c94 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -553,6 +553,7 @@ struct afs_server_entry { }; struct afs_server_list { + struct rcu_head rcu; afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ refcount_t usage; unsigned char nr_servers; diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index ed9056703505f..b59896b1de0af 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, afs_server_trace_put_slist); - kfree(slist); + kfree_rcu(slist, rcu); } } From 2a4ca1b4b77850544408595e2433f5d7811a9daa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 8 Jun 2023 09:43:54 +0100 Subject: [PATCH 337/560] afs: Make error on cell lookup failure consistent with OpenAFS When kafs tries to look up a cell in the DNS or the local config, it will translate a lookup failure into EDESTADDRREQ whereas OpenAFS translates it into ENOENT. Applications such as West expect the latter behaviour and fail if they see the former. This can be seen by trying to mount an unknown cell: # mount -t afs %example.com:cell.root /mnt mount: /mnt: mount(2) system call failed: Destination address required. Fixes: 4d673da14533 ("afs: Support the AFS dynamic root") Reported-by: Markus Suvanto Link: https://bugzilla.kernel.org/show_bug.cgi?id=216637 Signed-off-by: David Howells Reviewed-by: Jeffrey Altman cc: Marc Dionne cc: linux-afs@lists.infradead.org --- fs/afs/dynroot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 4d04ef2d3ae7b..1fa8cf23bd360 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -132,8 +132,8 @@ static int afs_probe_cell_name(struct dentry *dentry) ret = dns_query(net->net, "afsdb", name, len, "srv=1", NULL, NULL, false); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; + if (ret == -ENODATA || ret == -ENOKEY) + ret = -ENOENT; return ret; } From bff2a2d453a1b683378b4508b86b84389f551a00 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 6 Nov 2023 18:12:30 +0100 Subject: [PATCH 338/560] swiotlb-xen: provide the "max_mapping_size" method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a bug that when using the XEN hypervisor with bios with large multi-page bio vectors on NVMe, the kernel deadlocks [1]. The deadlocks are caused by inability to map a large bio vector - dma_map_sgtable always returns an error, this gets propagated to the block layer as BLK_STS_RESOURCE and the block layer retries the request indefinitely. XEN uses the swiotlb framework to map discontiguous pages into contiguous runs that are submitted to the PCIe device. The swiotlb framework has a limitation on the length of a mapping - this needs to be announced with the max_mapping_size method to make sure that the hardware drivers do not create larger mappings. Without max_mapping_size, the NVMe block driver would create large mappings that overrun the maximum mapping size. Reported-by: Marek Marczykowski-Górecki Link: https://lore.kernel.org/stable/ZTNH0qtmint%2FzLJZ@mail-itl/ [1] Tested-by: Marek Marczykowski-Górecki Suggested-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Keith Busch Signed-off-by: Mikulas Patocka Acked-by: Stefano Stabellini Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/151bef41-e817-aea9-675-a35fdac4ed@redhat.com Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 946bd56f0ac53..0e6c6c25d154f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -405,4 +405,5 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { .get_sgtable = dma_common_get_sgtable, .alloc_pages = dma_common_alloc_pages, .free_pages = dma_common_free_pages, + .max_mapping_size = swiotlb_max_mapping_size, }; From 295b202227e98edb2fb5cc29b6ec4b96b2792d9c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 16 Nov 2023 12:54:59 -0600 Subject: [PATCH 339/560] xen: privcmd: Replace zero-length array with flex-array member and use __counted_by Fake flexible arrays (zero-length and one-element arrays) are deprecated, and should be replaced by flexible-array members. So, replace zero-length array with a flexible-array member in `struct privcmd_kernel_ioreq`. Also annotate array `ports` with `__counted_by()` to prepare for the coming implementation by GCC and Clang of the `__counted_by` attribute. Flexible array members annotated with `__counted_by` can have their accesses bounds-checked at run-time via `CONFIG_UBSAN_BOUNDS` (for array indexing) and `CONFIG_FORTIFY_SOURCE` (for strcpy/memcpy-family functions). This fixes multiple -Warray-bounds warnings: drivers/xen/privcmd.c:1239:30: warning: array subscript i is outside array bounds of 'struct ioreq_port[0]' [-Warray-bounds=] drivers/xen/privcmd.c:1240:30: warning: array subscript i is outside array bounds of 'struct ioreq_port[0]' [-Warray-bounds=] drivers/xen/privcmd.c:1241:30: warning: array subscript i is outside array bounds of 'struct ioreq_port[0]' [-Warray-bounds=] drivers/xen/privcmd.c:1245:33: warning: array subscript i is outside array bounds of 'struct ioreq_port[0]' [-Warray-bounds=] drivers/xen/privcmd.c:1258:67: warning: array subscript i is outside array bounds of 'struct ioreq_port[0]' [-Warray-bounds=] This results in no differences in binary output. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/ZVZlg3tPMPCRdteh@work Signed-off-by: Juergen Gross --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1ce7f3c7a9509..0eb337a8ec0fa 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1115,7 +1115,7 @@ struct privcmd_kernel_ioreq { spinlock_t lock; /* Protects ioeventfds list */ struct list_head ioeventfds; struct list_head list; - struct ioreq_port ports[0]; + struct ioreq_port ports[] __counted_by(vcpus); }; static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) From c6ea14d557343cd3af6c6be2f5a78c98bdb281bb Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Thu, 16 Nov 2023 22:31:21 +0530 Subject: [PATCH 340/560] platform/x86/amd/pmc: adjust getting DRAM size behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit amd_pmc_get_dram_size() is used to get the DRAM size information. But in the current code, mailbox command to get the DRAM size info is sent based on the values of dev->major and dev->minor. But dev->major and dev->minor will have either junk or zero assigned to them until at least once a call to amd_pmc_get_smu_version() is made which ideally populates dev->major and dev->minor. However, adding a amd_pmc_get_smu_version() call to amd_pmc_get_dram_size() has a downside of elevating the boot times. After talking to the PMFW team, it's understood that the "get dram size" mbox command would only be supported on specific platforms (like Mendocino) and not all. So, adjust getting DRAM size behavior such that, - if running on Rembrandt or Mendocino and the underlying PMFW knows how to execute the "get dram size" command it shall give the custom dram size. - if the underlying FW does not report the dram size, we just proceed further and assign the default dram size. The simplest way to address this is to remove amd_pmc_get_dram_size() function and directly call the "get dram size" command in the amd_pmc_s2d_init(). Reported-by: Mark Hasemeyer Fixes: be8325fb3d8c ("platform/x86/amd: pmc: Get STB DRAM size from PMFW") Cc: stable@vger.kernel.org Suggested-by: Sanket Goswami Signed-off-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20231116170121.3372222-1-Shyam-sundar.S-k@amd.com Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 31 ++---------------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index cd6ac04c14680..c3104714b4802 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -964,33 +964,6 @@ static const struct pci_device_id pmc_pci_ids[] = { { } }; -static int amd_pmc_get_dram_size(struct amd_pmc_dev *dev) -{ - int ret; - - switch (dev->cpu_id) { - case AMD_CPU_ID_YC: - if (!(dev->major > 90 || (dev->major == 90 && dev->minor > 39))) { - ret = -EINVAL; - goto err_dram_size; - } - break; - default: - ret = -EINVAL; - goto err_dram_size; - } - - ret = amd_pmc_send_cmd(dev, S2D_DRAM_SIZE, &dev->dram_size, dev->s2d_msg_id, true); - if (ret || !dev->dram_size) - goto err_dram_size; - - return 0; - -err_dram_size: - dev_err(dev->dev, "DRAM size command not supported for this platform\n"); - return ret; -} - static int amd_pmc_s2d_init(struct amd_pmc_dev *dev) { u32 phys_addr_low, phys_addr_hi; @@ -1009,8 +982,8 @@ static int amd_pmc_s2d_init(struct amd_pmc_dev *dev) return -EIO; /* Get DRAM size */ - ret = amd_pmc_get_dram_size(dev); - if (ret) + ret = amd_pmc_send_cmd(dev, S2D_DRAM_SIZE, &dev->dram_size, dev->s2d_msg_id, true); + if (ret || !dev->dram_size) dev->dram_size = S2D_TELEMETRY_DRAMBYTES_MAX; /* Get STB DRAM address */ From 24d85bb3be373b5831699bddf698b392bd2b904d Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Tue, 7 Nov 2023 12:22:40 +0200 Subject: [PATCH 341/560] thunderbolt: Set lane bonding bit only for downstream port Fix the lane bonding procedure to follow the steps described in USB4 Connection Manager guide. Hence, set the lane bonding bit only for downstream port. This is needed for certain ASMedia device, otherwise lane bonding fails and the device disconnects. Cc: stable@vger.kernel.org Signed-off-by: Gil Fine Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 1e15ffa792955..9e5cc285cc8d3 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -1143,7 +1143,7 @@ int tb_port_lane_bonding_enable(struct tb_port *port) * Only set bonding if the link was not already bonded. This * avoids the lane adapter to re-enter bonding state. */ - if (width == TB_LINK_WIDTH_SINGLE) { + if (width == TB_LINK_WIDTH_SINGLE && !tb_is_upstream_port(port)) { ret = tb_port_set_lane_bonding(port, true); if (ret) goto err_lane1; From 5391bcfa56c79a891734e4d22aa0ca3217b86491 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 7 Nov 2023 14:34:27 +0200 Subject: [PATCH 342/560] thunderbolt: Send uevent after asymmetric/symmetric switch We should send uevent to userspace whenever the link speed or width changes but tb_switch_asym_enable() and tb_switch_asym_disable() set the sw->link_width already so tb_switch_update_link_attributes() never noticed the change. Fix this so that we let tb_switch_update_link_attributes() update the fields accordingly. Fixes: 81af2952e606 ("thunderbolt: Add support for asymmetric link") Reported-by: Pengfei Xu Tested-by: Pengfei Xu Signed-off-by: Mika Westerberg --- drivers/thunderbolt/switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index 9e5cc285cc8d3..44e9b09de47a5 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -2880,6 +2880,7 @@ static int tb_switch_lane_bonding_disable(struct tb_switch *sw) return tb_port_wait_for_link_width(down, TB_LINK_WIDTH_SINGLE, 100); } +/* Note updating sw->link_width done in tb_switch_update_link_attributes() */ static int tb_switch_asym_enable(struct tb_switch *sw, enum tb_link_width width) { struct tb_port *up, *down, *port; @@ -2919,10 +2920,10 @@ static int tb_switch_asym_enable(struct tb_switch *sw, enum tb_link_width width) return ret; } - sw->link_width = width; return 0; } +/* Note updating sw->link_width done in tb_switch_update_link_attributes() */ static int tb_switch_asym_disable(struct tb_switch *sw) { struct tb_port *up, *down; @@ -2957,7 +2958,6 @@ static int tb_switch_asym_disable(struct tb_switch *sw) return ret; } - sw->link_width = TB_LINK_WIDTH_DUAL; return 0; } From 480713b1ba8eac4617936f8404da34bda991c30e Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 13 Nov 2023 12:49:13 +0200 Subject: [PATCH 343/560] thunderbolt: Only add device router DP IN to the head of the DP resource list When pairing DP IN and DP OUT adapters for DisplayPort tunneling, we should prioritize the possible external GPU DP IN adapters to take advantage of the its capabilities. However the commit in question did this for host router DP IN adapters too and that changes ordering of the initial DP IN resources in such way that resuming from suspend may end up using different resource and that may confuse the user. Fix this so that we only put DP IN adapters of device routers to the top of the resource list and leave host routers as is. Fixes: 274baf695b08 ("thunderbolt: Add DP IN added last in the head of the list of DP resources") Reported-by: Pengfei Xu Signed-off-by: Mika Westerberg --- drivers/thunderbolt/tb.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 5acdeb766860d..fd49f86e03532 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -213,7 +213,17 @@ static void tb_add_dp_resources(struct tb_switch *sw) if (!tb_switch_query_dp_resource(sw, port)) continue; - list_add(&port->list, &tcm->dp_resources); + /* + * If DP IN on device router exist, position it at the + * beginning of the DP resources list, so that it is used + * before DP IN of the host router. This way external GPU(s) + * will be prioritized when pairing DP IN to a DP OUT. + */ + if (tb_route(sw)) + list_add(&port->list, &tcm->dp_resources); + else + list_add_tail(&port->list, &tcm->dp_resources); + tb_port_dbg(port, "DP IN resource available\n"); } } From 914fa861e3d7803c9bbafc229652c2a69edb8b60 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Thu, 9 Nov 2023 19:18:22 +0800 Subject: [PATCH 344/560] erofs: simplify erofs_read_inode() After commit 1c7f49a76773 ("erofs: tidy up EROFS on-disk naming"), there is a unique `union erofs_inode_i_u` so that we could parse the union directly. Besides, it also replaces `inode->i_sb` with `sb` for simplicity. Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20231109111822.17944-1-mengferry@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 98 +++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 63 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index b8ad05b4509d5..14a79d3226abf 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -15,11 +15,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; + union erofs_inode_i_u iu; unsigned int ifmt; int err; @@ -35,9 +35,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, dic = kaddr + *ofs; ifmt = le16_to_cpu(dic->i_format); - if (ifmt & ~EROFS_I_ALL) { - erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + erofs_err(sb, "unsupported i_format %u of nid %llu", ifmt, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -45,7 +44,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { - erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + erofs_err(sb, "unsupported datalayout %u of nid %llu", vi->datalayout, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -82,40 +81,15 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(die->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - - /* extended inode has its own timestamp */ + /* each extended inode has its own timestamp */ inode_set_ctime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(die->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - /* fill chunked inode summary info */ - vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); copied = NULL; break; @@ -125,49 +99,51 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(dic->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); inode->i_size = le32_to_cpu(dic->i_size); - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(dic->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: - erofs_err(inode->i_sb, - "unsupported on-disk inode version %u of nid %llu", + erofs_err(sb, "unsupported on-disk inode version %u of nid %llu", erofs_inode_version(ifmt), vi->nid); err = -EOPNOTSUPP; goto err_out; } - if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode, + vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) { + nblks = le32_to_cpu(iu.compressed_blocks); + } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(iu.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { - erofs_err(inode->i_sb, - "unsupported chunk format %x of nid %llu", + erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -191,10 +167,6 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); return kaddr; -bogusimode: - erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", - inode->i_mode, vi->nid); - err = -EFSCORRUPTED; err_out: DBG_BUGON(1); kfree(copied); From 8bd90b6ae7856dd5000b75691d905b39b9ea5d6b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 14 Nov 2023 15:07:04 +0800 Subject: [PATCH 345/560] erofs: fix NULL dereference of dif->bdev_handle in fscache mode Avoid NULL dereference of dif->bdev_handle, as dif->bdev_handle is NULL in fscache mode. BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:erofs_map_dev+0xbd/0x1c0 Call Trace: erofs_fscache_data_read_slice+0xa7/0x340 erofs_fscache_data_read+0x11/0x30 erofs_fscache_readahead+0xd9/0x100 read_pages+0x47/0x1f0 page_cache_ra_order+0x1e5/0x270 filemap_get_pages+0xf2/0x5f0 filemap_read+0xb8/0x2e0 vfs_read+0x18d/0x2b0 ksys_read+0x53/0xd0 do_syscall_64+0x42/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Reported-by: Yiqun Leng Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7245 Fixes: 49845720080d ("erofs: Convert to use bdev_open_by_path()") Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20231114070704.23398-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 029c761670bfc..c98aeda8abb21 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev_handle->bdev; + map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -238,7 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev_handle->bdev; + map->m_bdev = dif->bdev_handle ? + dif->bdev_handle->bdev : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; From 62b241efff99fc4d88a86f1c67c7516e31f432a3 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 17 Nov 2023 16:53:29 +0800 Subject: [PATCH 346/560] MAINTAINERS: erofs: add EROFS webpage Add a new `W:` field of the EROFS entry points to the documentation site at . In addition, update the in-tree documentation and Kconfig too. Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231117085329.1624223-1-hsiangkao@linux.alibaba.com --- Documentation/filesystems/erofs.rst | 4 ++++ MAINTAINERS | 1 + fs/erofs/Kconfig | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 57c6ae23b3fcf..cc4626d6ee4f8 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -91,6 +91,10 @@ compatibility checking tool (fsck.erofs), and a debugging tool (dump.erofs): - git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git +For more information, please also refer to the documentation site: + +- https://erofs.docs.kernel.org + Bugs and patches are welcome, please kindly help us and send to the following linux-erofs mailing list: diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cfd..cf39d16ad22ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7855,6 +7855,7 @@ R: Yue Hu R: Jeffle Xu L: linux-erofs@lists.ozlabs.org S: Maintained +W: https://erofs.docs.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git F: Documentation/ABI/testing/sysfs-fs-erofs F: Documentation/filesystems/erofs.rst diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index e540648dedc28..1d318f85232de 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -21,7 +21,7 @@ config EROFS_FS performance under extremely memory pressure without extra cost. See the documentation at - for more details. + and the web pages at for more details. If unsure, say N. From 27b13e209ddca5979847a1b57890e0372c1edcee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:22 +0800 Subject: [PATCH 347/560] blk-throttle: fix lockdep warning of "cgroup_mutex or RCU read lock required!" Inside blkg_for_each_descendant_pre(), both css_for_each_descendant_pre() and blkg_lookup() requires RCU read lock, and either cgroup_assert_mutex_or_rcu_locked() or rcu_read_lock_held() is called. Fix the warning by adding rcu read lock. Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 13e4377a8b286..16f5766620a41 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1320,6 +1320,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); + rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its @@ -1347,6 +1348,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) this_tg->latency_target = max(this_tg->latency_target, parent_tg->latency_target); } + rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's From 35a99d6557cacbc177314735342f77a2dda41872 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:23 +0800 Subject: [PATCH 348/560] blk-cgroup: avoid to warn !rcu_read_lock_held() in blkg_lookup() So far, all callers either holds spin lock or rcu read explicitly, and most of the caller has added WARN_ON_ONCE(!rcu_read_lock_held()) or lockdep_assert_held(&disk->queue->queue_lock). Remove WARN_ON_ONCE(!rcu_read_lock_held()) from blkg_lookup() for killing the false positive warning from blkg_conf_prep(). Reported-by: Changhui Zhong Fixes: 83462a6c971c ("blkcg: Drop unnecessary RCU read [un]locks from blkg_conf_prep/finish()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 624c03c8fe64e..fd482439afbc9 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -249,8 +249,6 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; - WARN_ON_ONCE(!rcu_read_lock_held()); - if (blkcg == &blkcg_root) return q->root_blkg; From e63a57303599b17290cd8bc48e6f20b24289a8bc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Nov 2023 10:35:24 +0800 Subject: [PATCH 349/560] blk-cgroup: bypass blkcg_deactivate_policy after destroying blkcg_deactivate_policy() can be called after blkg_destroy_all() returns, and it isn't necessary since blkg_destroy_all has covered policy deactivation. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20231117023527.3188627-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4a42ea2972ad8..4b48c2c440981 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -577,6 +577,7 @@ static void blkg_destroy_all(struct gendisk *disk) struct request_queue *q = disk->queue; struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; + int i; restart: spin_lock_irq(&q->queue_lock); @@ -602,6 +603,18 @@ static void blkg_destroy_all(struct gendisk *disk) } } + /* + * Mark policy deactivated since policy offline has been done, and + * the free is scheduled, so future blkcg_deactivate_policy() can + * be bypassed + */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (pol) + __clear_bit(pol->plid, q->blkcg_pols); + } + q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } From 6fc45b6ed921dc00dfb264dc08c7d67ee63d2656 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:21:14 +0100 Subject: [PATCH 350/560] dm-delay: fix a race between delay_presuspend and delay_bio In delay_presuspend, we set the atomic variable may_delay and then stop the timer and flush pending bios. The intention here is to prevent the delay target from re-arming the timer again. However, this test is racy. Suppose that one thread goes to delay_bio, sees that dc->may_delay is one and proceeds; now, another thread executes delay_presuspend, it sets dc->may_delay to zero, deletes the timer and flushes pending bios. Then, the first thread continues and adds the bio to delayed->list despite the fact that dc->may_delay is false. Fix this bug by changing may_delay's type from atomic_t to bool and only access it while holding the delayed_bios_lock mutex. Note that we don't have to grab the mutex in delay_resume because there are no bios in flight at this point. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index efd510984e259..2d6b900e4353c 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -33,7 +33,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; - atomic_t may_delay; + bool may_delay; struct delay_class read; struct delay_class write; @@ -236,7 +236,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; dc->argc = argc; ret = delay_class_ctr(ti, &dc->read, argv); @@ -312,7 +312,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) struct dm_delay_info *delayed; unsigned long expires = 0; - if (!c->delay || !atomic_read(&dc->may_delay)) + if (!c->delay) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); @@ -321,6 +321,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); + if (unlikely(!dc->may_delay)) { + mutex_unlock(&delayed_bios_lock); + return DM_MAPIO_REMAPPED; + } c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); mutex_unlock(&delayed_bios_lock); @@ -337,7 +341,9 @@ static void delay_presuspend(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 0); + mutex_lock(&delayed_bios_lock); + dc->may_delay = false; + mutex_unlock(&delayed_bios_lock); if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, true); @@ -351,7 +357,7 @@ static void delay_resume(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; } static int delay_map(struct dm_target *ti, struct bio *bio) From 38cfff568169ff9f99784948f79f62ca1af5a187 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:22:47 +0100 Subject: [PATCH 351/560] dm-delay: fix bugs introduced by kthread mode This commit fixes the following bugs introduced by commit 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq"): * the function flush_worker_fn has no exit path - on unload, this function will just loop and consume 100% CPU without any progress * the wake-up mechanism in flush_worker_fn is racy - a wake up will be missed if the process adds entries to the delayed_bios list just before set_current_state(TASK_INTERRUPTIBLE) * flush_delayed_bios_fast submits a bio while holding a global mutex; this may deadlock if we have multiple stacked dm-delay devices and the underlying device attempts to acquire the mutex too * if the target constructor fails, it will call delay_dtr. delay_dtr would attempt to free dc->timer_lock without it being initialized by the constructor. * if the target constructor's kthread allocation fails, delay_dtr would crash trying to dereference dc->worker because it is non-NULL due to ERR_PTR. Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 61 +++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2d6b900e4353c..00c09d9fffebf 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -73,9 +73,23 @@ static inline bool delay_is_fast(struct delay_c *dc) return !!dc->worker; } +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + dm_submit_bio_remap(bio, NULL); + bio = n; + } +} + static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -83,47 +97,42 @@ static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - dm_submit_bio_remap(bio, NULL); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; } } mutex_unlock(&delayed_bios_lock); + + flush_bios(bio_list_get(&flush_bio_list)); } static int flush_worker_fn(void *data) { struct delay_c *dc = data; - while (1) { + while (!kthread_should_stop()) { flush_delayed_bios_fast(dc, false); + mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); + mutex_unlock(&delayed_bios_lock); schedule(); - } else + } else { + mutex_unlock(&delayed_bios_lock); cond_resched(); + } } return 0; } -static void flush_bios(struct bio *bio) -{ - struct bio *n; - - while (bio) { - n = bio->bi_next; - bio->bi_next = NULL; - dm_submit_bio_remap(bio, NULL); - bio = n; - } -} - -static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; unsigned long next_expires = 0; unsigned long start_timer = 0; - struct bio_list flush_bios = { }; + struct bio_list flush_bio_list; + bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { @@ -131,7 +140,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, bio); + bio_list_add(&flush_bio_list, bio); delayed->class->ops--; continue; } @@ -147,7 +156,7 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all) if (start_timer) queue_timeout(dc, next_expires); - return bio_list_get(&flush_bios); + flush_bios(bio_list_get(&flush_bio_list)); } static void flush_expired_bios(struct work_struct *work) @@ -158,7 +167,7 @@ static void flush_expired_bios(struct work_struct *work) if (delay_is_fast(dc)) flush_delayed_bios_fast(dc, false); else - flush_bios(flush_delayed_bios(dc, false)); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -177,8 +186,7 @@ static void delay_dtr(struct dm_target *ti) if (dc->worker) kthread_stop(dc->worker); - if (!delay_is_fast(dc)) - mutex_destroy(&dc->timer_lock); + mutex_destroy(&dc->timer_lock); kfree(dc); } @@ -236,6 +244,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); + mutex_init(&dc->timer_lock); dc->may_delay = true; dc->argc = argc; @@ -282,12 +291,12 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) "dm-delay-flush-worker"); if (IS_ERR(dc->worker)) { ret = PTR_ERR(dc->worker); + dc->worker = NULL; goto bad; } } else { timer_setup(&dc->delay_timer, handle_delayed_timer, 0); INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); - mutex_init(&dc->timer_lock); dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); if (!dc->kdelayd_wq) { ret = -EINVAL; @@ -345,11 +354,11 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) + if (delay_is_fast(dc)) { flush_delayed_bios_fast(dc, true); - else { + } else { del_timer_sync(&dc->delay_timer); - flush_bios(flush_delayed_bios(dc, true)); + flush_delayed_bios(dc, true); } } From ccadc8a21ef13eb80006ecff6a7466810e4f0dd6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:24:04 +0100 Subject: [PATCH 352/560] dm-delay: avoid duplicate logic This is small refactoring of dm-delay - we avoid duplicate logic in flush_delayed_bios and flush_delayed_bios_fast and join these two functions into one. We also add cond_resched() to flush_delayed_bios because the list may have unbounded number of entries. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 65 ++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 00c09d9fffebf..5eabdb06c6498 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -85,24 +85,40 @@ static void flush_bios(struct bio *bio) } } -static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all) +static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; struct bio_list flush_bio_list; + unsigned long next_expires = 0; + bool start_timer = false; bio_list_init(&flush_bio_list); mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + cond_resched(); if (flush_all || time_after_eq(jiffies, delayed->expires)) { struct bio *bio = dm_bio_from_per_bio_data(delayed, sizeof(struct dm_delay_info)); list_del(&delayed->list); bio_list_add(&flush_bio_list, bio); delayed->class->ops--; + continue; + } + + if (!delay_is_fast(dc)) { + if (!start_timer) { + start_timer = true; + next_expires = delayed->expires; + } else { + next_expires = min(next_expires, delayed->expires); + } } } mutex_unlock(&delayed_bios_lock); + if (start_timer) + queue_timeout(dc, next_expires); + flush_bios(bio_list_get(&flush_bio_list)); } @@ -111,7 +127,7 @@ static int flush_worker_fn(void *data) struct delay_c *dc = data; while (!kthread_should_stop()) { - flush_delayed_bios_fast(dc, false); + flush_delayed_bios(dc, false); mutex_lock(&delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); @@ -126,48 +142,12 @@ static int flush_worker_fn(void *data) return 0; } -static void flush_delayed_bios(struct delay_c *dc, bool flush_all) -{ - struct dm_delay_info *delayed, *next; - unsigned long next_expires = 0; - unsigned long start_timer = 0; - struct bio_list flush_bio_list; - bio_list_init(&flush_bio_list); - - mutex_lock(&delayed_bios_lock); - list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { - if (flush_all || time_after_eq(jiffies, delayed->expires)) { - struct bio *bio = dm_bio_from_per_bio_data(delayed, - sizeof(struct dm_delay_info)); - list_del(&delayed->list); - bio_list_add(&flush_bio_list, bio); - delayed->class->ops--; - continue; - } - - if (!start_timer) { - start_timer = 1; - next_expires = delayed->expires; - } else - next_expires = min(next_expires, delayed->expires); - } - mutex_unlock(&delayed_bios_lock); - - if (start_timer) - queue_timeout(dc, next_expires); - - flush_bios(bio_list_get(&flush_bio_list)); -} - static void flush_expired_bios(struct work_struct *work) { struct delay_c *dc; dc = container_of(work, struct delay_c, flush_expired_bios); - if (delay_is_fast(dc)) - flush_delayed_bios_fast(dc, false); - else - flush_delayed_bios(dc, false); + flush_delayed_bios(dc, false); } static void delay_dtr(struct dm_target *ti) @@ -354,12 +334,9 @@ static void delay_presuspend(struct dm_target *ti) dc->may_delay = false; mutex_unlock(&delayed_bios_lock); - if (delay_is_fast(dc)) { - flush_delayed_bios_fast(dc, true); - } else { + if (!delay_is_fast(dc)) del_timer_sync(&dc->delay_timer); - flush_delayed_bios(dc, true); - } + flush_delayed_bios(dc, true); } static void delay_resume(struct dm_target *ti) From 2a695062a5a42aead8c539a344168d4806b3fda2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:36:34 +0100 Subject: [PATCH 353/560] dm-bufio: fix no-sleep mode dm-bufio has a no-sleep mode. When activated (with the DM_BUFIO_CLIENT_NO_SLEEP flag), the bufio client is read-only and we could call dm_bufio_get from tasklets. This is used by dm-verity. Unfortunately, commit 450e8dee51aa ("dm bufio: improve concurrent IO performance") broke this and the kernel would warn that cache_get() was calling down_read() from no-sleeping context. The bug can be reproduced by using "veritysetup open" with the "--use-tasklets" flag. This commit fixes dm-bufio, so that the tasklet mode works again, by expanding use of the 'no_sleep_enabled' static_key to conditionally use either a rw_semaphore or rwlock_t (which are colocated in the buffer_tree structure using a union). Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.4 Fixes: 450e8dee51aa ("dm bufio: improve concurrent IO performance") Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 87 ++++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 62eb27639c9b8..f03d7dba270c5 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -254,7 +254,7 @@ enum evict_result { typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context); -static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context) +static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *context, bool no_sleep) { unsigned long tested = 0; struct list_head *h = lru->cursor; @@ -295,7 +295,8 @@ static struct lru_entry *lru_evict(struct lru *lru, le_predicate pred, void *con h = h->next; - cond_resched(); + if (!no_sleep) + cond_resched(); } return NULL; @@ -382,7 +383,10 @@ struct dm_buffer { */ struct buffer_tree { - struct rw_semaphore lock; + union { + struct rw_semaphore lock; + rwlock_t spinlock; + } u; struct rb_root root; } ____cacheline_aligned_in_smp; @@ -393,9 +397,12 @@ struct dm_buffer_cache { * on the locks. */ unsigned int num_locks; + bool no_sleep; struct buffer_tree trees[]; }; +static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); + static inline unsigned int cache_index(sector_t block, unsigned int num_locks) { return dm_hash_locks_index(block, num_locks); @@ -403,22 +410,34 @@ static inline unsigned int cache_index(sector_t block, unsigned int num_locks) static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block) { - down_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_read(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + read_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_read(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block) { - down_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_lock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + down_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block) { - up_write(&bc->trees[cache_index(block, bc->num_locks)].lock); + if (static_branch_unlikely(&no_sleep_enabled) && bc->no_sleep) + write_unlock_bh(&bc->trees[cache_index(block, bc->num_locks)].u.spinlock); + else + up_write(&bc->trees[cache_index(block, bc->num_locks)].u.lock); } /* @@ -442,18 +461,32 @@ static void lh_init(struct lock_history *lh, struct dm_buffer_cache *cache, bool static void __lh_lock(struct lock_history *lh, unsigned int index) { - if (lh->write) - down_write(&lh->cache->trees[index].lock); - else - down_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_lock_bh(&lh->cache->trees[index].u.spinlock); + else + down_read(&lh->cache->trees[index].u.lock); + } } static void __lh_unlock(struct lock_history *lh, unsigned int index) { - if (lh->write) - up_write(&lh->cache->trees[index].lock); - else - up_read(&lh->cache->trees[index].lock); + if (lh->write) { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + write_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_write(&lh->cache->trees[index].u.lock); + } else { + if (static_branch_unlikely(&no_sleep_enabled) && lh->cache->no_sleep) + read_unlock_bh(&lh->cache->trees[index].u.spinlock); + else + up_read(&lh->cache->trees[index].u.lock); + } } /* @@ -502,14 +535,18 @@ static struct dm_buffer *list_to_buffer(struct list_head *l) return le_to_buffer(le); } -static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks) +static void cache_init(struct dm_buffer_cache *bc, unsigned int num_locks, bool no_sleep) { unsigned int i; bc->num_locks = num_locks; + bc->no_sleep = no_sleep; for (i = 0; i < bc->num_locks; i++) { - init_rwsem(&bc->trees[i].lock); + if (no_sleep) + rwlock_init(&bc->trees[i].u.spinlock); + else + init_rwsem(&bc->trees[i].u.lock); bc->trees[i].root = RB_ROOT; } @@ -648,7 +685,7 @@ static struct dm_buffer *__cache_evict(struct dm_buffer_cache *bc, int list_mode struct lru_entry *le; struct dm_buffer *b; - le = lru_evict(&bc->lru[list_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[list_mode], __evict_pred, &w, bc->no_sleep); if (!le) return NULL; @@ -702,7 +739,7 @@ static void __cache_mark_many(struct dm_buffer_cache *bc, int old_mode, int new_ struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context}; while (true) { - le = lru_evict(&bc->lru[old_mode], __evict_pred, &w); + le = lru_evict(&bc->lru[old_mode], __evict_pred, &w, bc->no_sleep); if (!le) break; @@ -915,10 +952,11 @@ static void cache_remove_range(struct dm_buffer_cache *bc, { unsigned int i; + BUG_ON(bc->no_sleep); for (i = 0; i < bc->num_locks; i++) { - down_write(&bc->trees[i].lock); + down_write(&bc->trees[i].u.lock); __remove_range(bc, &bc->trees[i].root, begin, end, pred, release); - up_write(&bc->trees[i].lock); + up_write(&bc->trees[i].u.lock); } } @@ -979,8 +1017,6 @@ struct dm_bufio_client { struct dm_buffer_cache cache; /* must be last member */ }; -static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled); - /*----------------------------------------------------------------*/ #define dm_bufio_in_request() (!!current->bio_list) @@ -1871,7 +1907,8 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, if (need_submit) submit_io(b, REQ_OP_READ, read_endio); - wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); + if (nf != NF_GET) /* we already tested this condition above */ + wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { int error = blk_status_to_errno(b->read_error); @@ -2421,7 +2458,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - cache_init(&c->cache, num_locks); + cache_init(&c->cache, num_locks, (flags & DM_BUFIO_CLIENT_NO_SLEEP) != 0); c->bdev = bdev; c->block_size = block_size; From 28f07f2ab4b3a2714f1fefcc58ada4bcc195f806 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:37:25 +0100 Subject: [PATCH 354/560] dm-verity: don't use blocking calls from tasklets The commit 5721d4e5a9cd enhanced dm-verity, so that it can verify blocks from tasklets rather than from workqueues. This reportedly improves performance significantly. However, dm-verity was using the flag CRYPTO_TFM_REQ_MAY_SLEEP from tasklets which resulted in warnings about sleeping function being called from non-sleeping context. BUG: sleeping function called from invalid context at crypto/internal.h:206 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 14, name: ksoftirqd/0 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 CPU: 0 PID: 14 Comm: ksoftirqd/0 Tainted: G W 6.7.0-rc1 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl+0x32/0x50 __might_resched+0x110/0x160 crypto_hash_walk_done+0x54/0xb0 shash_ahash_update+0x51/0x60 verity_hash_update.isra.0+0x4a/0x130 [dm_verity] verity_verify_io+0x165/0x550 [dm_verity] ? free_unref_page+0xdf/0x170 ? psi_group_change+0x113/0x390 verity_tasklet+0xd/0x70 [dm_verity] tasklet_action_common.isra.0+0xb3/0xc0 __do_softirq+0xaf/0x1ec ? smpboot_thread_fn+0x1d/0x200 ? sort_range+0x20/0x20 run_ksoftirqd+0x15/0x30 smpboot_thread_fn+0xed/0x200 kthread+0xdc/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x28/0x40 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 This commit fixes dm-verity so that it doesn't use the flags CRYPTO_TFM_REQ_MAY_SLEEP and CRYPTO_TFM_REQ_MAY_BACKLOG from tasklets. The crypto API would do GFP_ATOMIC allocation instead, it could return -ENOMEM and we catch -ENOMEM in verity_tasklet and requeue the request to the workqueue. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v6.0+ Fixes: 5721d4e5a9cd ("dm verity: Add optional "try_verify_in_tasklet" feature") Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-fec.c | 4 ++-- drivers/md/dm-verity-target.c | 23 ++++++++++++----------- drivers/md/dm-verity.h | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ef9f018da60c..2099c755119e3 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -185,7 +185,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, { if (unlikely(verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)))) + verity_io_real_digest(v, io), true))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -386,7 +386,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, verity_io_hash_req(v, io), fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), true); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 26adcfea03022..e115fcfe723c9 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -135,20 +135,21 @@ static int verity_hash_update(struct dm_verity *v, struct ahash_request *req, * Wrapper for crypto_ahash_init, which handles verity salting. */ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait) + struct crypto_wait *wait, bool may_sleep) { int r; ahash_request_set_tfm(req, v->tfm); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, (void *)wait); + ahash_request_set_callback(req, + may_sleep ? CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, + crypto_req_done, (void *)wait); crypto_init_wait(wait); r = crypto_wait_req(crypto_ahash_init(req), wait); if (unlikely(r < 0)) { - DMERR("crypto_ahash_init failed: %d", r); + if (r != -ENOMEM) + DMERR("crypto_ahash_init failed: %d", r); return r; } @@ -179,12 +180,12 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req, } int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest) + const u8 *data, size_t len, u8 *digest, bool may_sleep) { int r; struct crypto_wait wait; - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, may_sleep); if (unlikely(r < 0)) goto out; @@ -322,7 +323,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = verity_hash(v, verity_io_hash_req(v, io), data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io)); + verity_io_real_digest(v, io), !io->in_tasklet); if (unlikely(r < 0)) goto release_ret_r; @@ -556,7 +557,7 @@ static int verity_verify_io(struct dm_verity_io *io) continue; } - r = verity_hash_init(v, req, &wait); + r = verity_hash_init(v, req, &wait, !io->in_tasklet); if (unlikely(r < 0)) return r; @@ -652,7 +653,7 @@ static void verity_tasklet(unsigned long data) io->in_tasklet = true; err = verity_verify_io(io); - if (err == -EAGAIN) { + if (err == -EAGAIN || err == -ENOMEM) { /* fallback to retrying with work-queue */ INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -1033,7 +1034,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, req, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest); + v->zero_digest, true); out: kfree(req); diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 2f555b4203679..f96f4e281ee4a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -128,7 +128,7 @@ extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, u8 *data, size_t len)); extern int verity_hash(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, u8 *digest); + const u8 *data, size_t len, u8 *digest, bool may_sleep); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); From 13648e04a9b831b3dfa5cf3887dfa6cf8fe5fe69 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 17 Nov 2023 18:38:33 +0100 Subject: [PATCH 355/560] dm-crypt: start allocating with MAX_ORDER Commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") changed the meaning of MAX_ORDER from exclusive to inclusive. So, we can allocate compound pages with up to 1 << MAX_ORDER pages. Reflect this change in dm-crypt and start trying to allocate compound pages with MAX_ORDER. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 6de107aff3319..2ae8560b6a14a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1673,7 +1673,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned int remaining_size; - unsigned int order = MAX_ORDER - 1; + unsigned int order = MAX_ORDER; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) From bc1b5acb40201a0746d68a7d7cfc141899937f4f Mon Sep 17 00:00:00 2001 From: Mahmoud Adam Date: Fri, 10 Nov 2023 19:21:04 +0100 Subject: [PATCH 356/560] nfsd: fix file memleak on client_opens_release seq_release should be called to free the allocated seq_file Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Mahmoud Adam Reviewed-by: Jeff Layton Fixes: 78599c42ae3c ("nfsd4: add file to display list of client's opens") Reviewed-by: NeilBrown Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65fd5510323a3..3709e58f0a4a6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2804,7 +2804,7 @@ static int client_opens_release(struct inode *inode, struct file *file) /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); - return 0; + return seq_release(inode, file); } static const struct file_operations client_states_fops = { From 49cecd8628a9855cd993792a0377559ea32d5e7c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:39 -0500 Subject: [PATCH 357/560] NFSD: Update nfsd_cache_append() to use xdr_stream When inserting a DRC-cached response into the reply buffer, ensure that the reply buffer's xdr_stream is updated properly. Otherwise the server will send a garbage response. Cc: stable@vger.kernel.org # v6.3+ Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80621a7095107..abb453be71ca3 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -640,24 +640,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, return; } -/* - * Copy cached reply to current reply buffer. Should always fit. - * FIXME as reply is in a page, we should just attach the page, and - * keep a refcount.... - */ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) { - struct kvec *vec = &rqstp->rq_res.head[0]; - - if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", - data->iov_len); - return 0; - } - memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); - vec->iov_len += data->iov_len; - return 1; + __be32 *p; + + p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len); + if (unlikely(!p)) + return false; + memcpy(p, data->iov_base, data->iov_len); + xdr_commit_encode(&rqstp->rq_res_stream); + return true; } /* From 1caf5f61dd8430ae5a0b4538afe4953ce7517cbb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:33 -0500 Subject: [PATCH 358/560] NFSD: Fix "start of NFS reply" pointer passed to nfsd_cache_update() The "statp + 1" pointer that is passed to nfsd_cache_update() is supposed to point to the start of the egress NFS Reply header. In fact, it does point there for AUTH_SYS and RPCSEC_GSS_KRB5 requests. But both krb5i and krb5p add fields between the RPC header's accept_stat field and the start of the NFS Reply header. In those cases, "statp + 1" points at the extra fields instead of the Reply. The result is that nfsd_cache_update() caches what looks to the client like garbage. A connection break can occur for a number of reasons, but the most common reason when using krb5i/p is a GSS sequence number window underrun. When an underrun is detected, the server is obliged to drop the RPC and the connection to force a retransmit with a fresh GSS sequence number. The client presents the same XID, it hits in the server's DRC, and the server returns the garbage cache entry. The "statp + 1" argument has been used since the oldest changeset in the kernel history repo, so it has been in nfsd_dispatch() literally since before history began. The problem arose only when the server-side GSS implementation was added twenty years ago. Reviewed-by: Jeff Layton Tested-by: Jeff Layton --- fs/nfsd/nfssvc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d6122bb2d167b..b4e4e04f99317 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants @@ -1010,6 +1011,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) goto out_dropit; } + nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; @@ -1023,7 +1025,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); - nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; From bf51c52a1f3c238d72c64e14d5e7702d3a245b82 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 10 Nov 2023 11:28:45 -0500 Subject: [PATCH 359/560] NFSD: Fix checksum mismatches in the duplicate reply cache nfsd_cache_csum() currently assumes that the server's RPC layer has been advancing rq_arg.head[0].iov_base as it decodes an incoming request, because that's the way it used to work. On entry, it expects that buf->head[0].iov_base points to the start of the NFS header, and excludes the already-decoded RPC header. These days however, head[0].iov_base now points to the start of the RPC header during all processing. It no longer points at the NFS Call header when execution arrives at nfsd_cache_csum(). In a retransmitted RPC the XID and the NFS header are supposed to be the same as the original message, but the contents of the retransmitted RPC header can be different. For example, for krb5, the GSS sequence number will be different between the two. Thus if the RPC header is always included in the DRC checksum computation, the checksum of the retransmitted message might not match the checksum of the original message, even though the NFS part of these messages is identical. The result is that, even if a matching XID is found in the DRC, the checksum mismatch causes the server to execute the retransmitted RPC transaction again. Reviewed-by: Jeff Layton Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/cache.h | 4 +-- fs/nfsd/nfscache.c | 64 +++++++++++++++++++++++++++++++--------------- fs/nfsd/nfssvc.c | 10 +++++++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 929248c6ca84c..4cbe0434cbb8c 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -84,8 +84,8 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *rqstp, - struct nfsd_cacherep **cacherep); +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep); void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index abb453be71ca3..6cd36af2f97e1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -368,33 +368,52 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) return freed; } -/* - * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes +/** + * nfsd_cache_csum - Checksum incoming NFS Call arguments + * @buf: buffer containing a whole RPC Call message + * @start: starting byte of the NFS Call header + * @remaining: size of the NFS Call header, in bytes + * + * Compute a weak checksum of the leading bytes of an NFS procedure + * call header to help verify that a retransmitted Call matches an + * entry in the duplicate reply cache. + * + * To avoid assumptions about how the RPC message is laid out in + * @buf and what else it might contain (eg, a GSS MIC suffix), the + * caller passes us the exact location and length of the NFS Call + * header. + * + * Returns a 32-bit checksum value, as defined in RFC 793. */ -static __wsum -nfsd_cache_csum(struct svc_rqst *rqstp) +static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start, + unsigned int remaining) { + unsigned int base, len; + struct xdr_buf subbuf; + __wsum csum = 0; + void *p; int idx; - unsigned int base; - __wsum csum; - struct xdr_buf *buf = &rqstp->rq_arg; - const unsigned char *p = buf->head[0].iov_base; - size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, - RC_CSUMLEN); - size_t len = min(buf->head[0].iov_len, csum_len); + + if (remaining > RC_CSUMLEN) + remaining = RC_CSUMLEN; + if (xdr_buf_subsegment(buf, &subbuf, start, remaining)) + return csum; /* rq_arg.head first */ - csum = csum_partial(p, len, 0); - csum_len -= len; + if (subbuf.head[0].iov_len) { + len = min_t(unsigned int, subbuf.head[0].iov_len, remaining); + csum = csum_partial(subbuf.head[0].iov_base, len, csum); + remaining -= len; + } /* Continue into page array */ - idx = buf->page_base / PAGE_SIZE; - base = buf->page_base & ~PAGE_MASK; - while (csum_len) { - p = page_address(buf->pages[idx]) + base; - len = min_t(size_t, PAGE_SIZE - base, csum_len); + idx = subbuf.page_base / PAGE_SIZE; + base = subbuf.page_base & ~PAGE_MASK; + while (remaining) { + p = page_address(subbuf.pages[idx]) + base; + len = min_t(unsigned int, PAGE_SIZE - base, remaining); csum = csum_partial(p, len, csum); - csum_len -= len; + remaining -= len; base = 0; ++idx; } @@ -465,6 +484,8 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @start: starting byte in @rqstp->rq_arg of the NFS Call header + * @len: size of the NFS Call header, in bytes * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none @@ -478,7 +499,8 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; struct nfsd_cacherep *rp, *found; @@ -494,7 +516,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep) goto out; } - csum = nfsd_cache_csum(rqstp); + csum = nfsd_cache_csum(&rqstp->rq_arg, start, len); /* * Since the common case is a cache miss followed by an insert, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index b4e4e04f99317..fe61d9bbcc1fa 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -981,6 +981,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; struct nfsd_cacherep *rp; + unsigned int start, len; __be32 *nfs_reply; /* @@ -989,6 +990,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ rqstp->rq_cachetype = proc->pc_cachetype; + /* + * ->pc_decode advances the argument stream past the NFS + * Call header, so grab the header's starting location and + * size now for the call to nfsd_cache_lookup(). + */ + start = xdr_stream_pos(&rqstp->rq_arg_stream); + len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; @@ -1002,7 +1010,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); rp = NULL; - switch (nfsd_cache_lookup(rqstp, &rp)) { + switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: From 6965809e526917b73c8f9178173184dcf13cec4b Mon Sep 17 00:00:00 2001 From: Xuxin Xiong Date: Tue, 14 Nov 2023 12:42:05 +0800 Subject: [PATCH 360/560] drm/panel: auo,b101uan08.3: Fine tune the panel power sequence For "auo,b101uan08.3" this panel, it is stipulated in the panel spec that MIPI needs to keep the LP11 state before the lcm_reset pin is pulled high. Fixes: 56ad624b4cb5 ("drm/panel: support for auo, b101uan08.3 wuxga dsi video mode panel") Signed-off-by: Xuxin Xiong Reviewed-by: Douglas Anderson Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20231114044205.613421-1-xuxinxiong@huaqin.corp-partner.google.com --- drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c index 9323e7b9e3849..a287be1aaf70f 100644 --- a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c @@ -1709,6 +1709,7 @@ static const struct panel_desc auo_b101uan08_3_desc = { .mode_flags = MIPI_DSI_MODE_VIDEO | MIPI_DSI_MODE_VIDEO_SYNC_PULSE | MIPI_DSI_MODE_LPM, .init_cmds = auo_b101uan08_3_init_cmd, + .lp11_before_reset = true, }; static const struct drm_display_mode boe_tv105wum_nw0_default_mode = { From 56466f653cb59a8f46e991ad1e285f43afdca7d4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 13 Oct 2023 11:25:15 +0300 Subject: [PATCH 361/560] drm/msm: remove unnecessary NULL check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This NULL check was required when it was added, but we shuffled the code around and now it's not. The inconsistent NULL checking triggers a Smatch warning: drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c:847 mdp5_init() warn: variable dereferenced before check 'mdp5_kms' (see line 782) Fixes: 1f50db2f3e1e ("drm/msm/mdp5: move resource allocation to the _probe function") Signed-off-by: Dan Carpenter Reviewed-by: Uwe Kleine-König Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/562559/ Link: https://lore.kernel.org/r/ZSj+6/J6YsoSpLak@kadam Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c index a28fbcd096842..40cda31c8ed1d 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_kms.c @@ -844,8 +844,7 @@ static int mdp5_init(struct platform_device *pdev, struct drm_device *dev) return 0; fail: - if (mdp5_kms) - mdp5_destroy(mdp5_kms); + mdp5_destroy(mdp5_kms); return ret; } From 8a924db2d7b5eb69ba08b1a0af46e9f1359a9bdf Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Mon, 2 Oct 2023 08:57:33 -0400 Subject: [PATCH 362/560] fs: Pass AT_GETATTR_NOSEC flag to getattr interface function When vfs_getattr_nosec() calls a filesystem's getattr interface function then the 'nosec' should propagate into this function so that vfs_getattr_nosec() can again be called from the filesystem's gettattr rather than vfs_getattr(). The latter would add unnecessary security checks that the initial vfs_getattr_nosec() call wanted to avoid. Therefore, introduce the getattr flag GETATTR_NOSEC and allow to pass with the new getattr_flags parameter to the getattr interface function. In overlayfs and ecryptfs use this flag to determine which one of the two functions to call. In a recent code change introduced to IMA vfs_getattr_nosec() ended up calling vfs_getattr() in overlayfs, which in turn called security_inode_getattr() on an exiting process that did not have current->fs set anymore, which then caused a kernel NULL pointer dereference. With this change the call to security_inode_getattr() can be avoided, thus avoiding the NULL pointer dereference. Reported-by: Fixes: db1d1e8b9867 ("IMA: use vfs_getattr_nosec to get the i_version") Cc: Alexander Viro Cc: Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Tyler Hicks Cc: Mimi Zohar Suggested-by: Christian Brauner Co-developed-by: Amir Goldstein Signed-off-by: Stefan Berger Link: https://lore.kernel.org/r/20231002125733.1251467-1-stefanb@linux.vnet.ibm.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 12 ++++++++++-- fs/overlayfs/inode.c | 10 +++++----- fs/overlayfs/overlayfs.h | 8 ++++++++ fs/stat.c | 6 +++++- include/uapi/linux/fcntl.h | 3 +++ 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a25dd3d20008b..b0e8774c435a4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -998,6 +998,14 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, return rc; } +static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) @@ -1006,8 +1014,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, - request_mask, flags); + rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), + &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 345b8f161ca4c..c63b31a460bef 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -171,7 +171,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); + err = ovl_do_getattr(&realpath, stat, request_mask, flags); if (err) goto out; @@ -196,8 +196,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = vfs_getattr(&realpath, &lowerstat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerstat, lowermask, + flags); if (err) goto out; @@ -249,8 +249,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = vfs_getattr(&realpath, &lowerdatastat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerdatastat, + lowermask, flags); if (err) goto out; } else { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ca88b2636a572..05c3dd597fa8d 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -408,6 +408,14 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } +static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); diff --git a/fs/stat.c b/fs/stat.c index 24bb0209e4599..f721d26ec3f7e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -133,7 +133,8 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, - request_mask, query_flags); + request_mask, + query_flags | AT_GETATTR_NOSEC); generic_fillattr(idmap, request_mask, inode, stat); return 0; @@ -166,6 +167,9 @@ int vfs_getattr(const struct path *path, struct kstat *stat, { int retval; + if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) + return -EPERM; + retval = security_inode_getattr(path); if (retval) return retval; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6c80f96049bd0..282e90aeb163c 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -116,5 +116,8 @@ #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to compare object identity and may not be usable to open_by_handle_at(2) */ +#if defined(__KERNEL__) +#define AT_GETATTR_NOSEC 0x80000000 +#endif #endif /* _UAPI_LINUX_FCNTL_H */ From fe2c34bab6d46469ad3095955dc37e984dc24e38 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 17 Nov 2023 13:38:46 -0800 Subject: [PATCH 363/560] iov_iter: fix copy_page_to_iter_nofault() The recent conversion to inline functions made two mistakes: 1. It tries to copy the full amount requested (bytes), not just what's available in the kmap'd page (n). 2. It's not applying the offset in the first page. Note that copy_page_to_iter_nofault() is only used by /proc/kcore. This was detected by drgn's test suite. Fixes: f1982740f5e7 ("iov_iter: Convert iterate*() to inline funcs") Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/c1616e06b5248013cbbb1881bb4fef85a7a69ccb.1700257019.git.osandov@fb.com Acked-by: David Howells Signed-off-by: Christian Brauner --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index de7d11cf4c635..8ff6824a10053 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -409,7 +409,7 @@ size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t byte void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); - n = iterate_and_advance(i, bytes, kaddr, + n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); From 721d28f3dfb3e40c45ce45fbeeff47b72c230bc9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 16 Nov 2023 11:13:40 -0800 Subject: [PATCH 364/560] parisc: Replace strlcpy() with strscpy() strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated[1]. Additionally, it returns the size of the source string, not the resulting size of the destination string. In an effort to remove strlcpy() completely[2], replace strlcpy() here with strscpy(). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [1] Link: https://github.com/KSPP/linux/issues/89 [2] Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Azeem Shaikh Cc: linux-parisc@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Helge Deller --- arch/parisc/kernel/processor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 29e2750f86a41..e95a977ba5f37 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -383,7 +383,7 @@ show_cpuinfo (struct seq_file *m, void *v) char cpu_name[60], *p; /* strip PA path from CPU name to not confuse lscpu */ - strlcpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); + strscpy(cpu_name, per_cpu(cpu_data, 0).dev->name, sizeof(cpu_name)); p = strrchr(cpu_name, '['); if (p) *(--p) = 0; From 6ad6e15a9c46b8f0932cd99724f26f3db4db1cdf Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 17 Nov 2023 16:43:52 +0100 Subject: [PATCH 365/560] parisc/power: Fix power soft-off when running on qemu Firmware returns the physical address of the power switch, so need to use gsc_writel() instead of direct memory access. Fixes: d0c219472980 ("parisc/power: Add power soft-off when running on qemu") Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- drivers/parisc/power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 539d8920c2029..bb0d92461b08b 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -176,7 +176,7 @@ static struct notifier_block parisc_panic_block = { static int qemu_power_off(struct sys_off_data *data) { /* this turns the system off via SeaBIOS */ - *(int *)data->cb_data = 0; + gsc_writel(0, (unsigned long) data->cb_data); pdc_soft_power_button(1); return NOTIFY_DONE; } From 793838138c157d4c49f4fb744b170747e3dabf58 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 18 Nov 2023 19:33:35 +0100 Subject: [PATCH 366/560] prctl: Disable prctl(PR_SET_MDWE) on parisc systemd-254 tries to use prctl(PR_SET_MDWE) for it's MemoryDenyWriteExecute functionality, but fails on parisc which still needs executable stacks in certain combinations of gcc/glibc/kernel. Disable prctl(PR_SET_MDWE) by returning -EINVAL for now on parisc, until userspace has catched up. Signed-off-by: Helge Deller Co-developed-by: Linus Torvalds Reported-by: Sam James Closes: https://github.com/systemd/systemd/issues/29775 Tested-by: Sam James Link: https://lore.kernel.org/all/875y2jro9a.fsf@gentoo.org/ Cc: # v6.3+ --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index 420d9cb9cc8e2..e219fcfa112d8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + /* PARISC cannot allow mdwe as it needs writable stacks */ + if (IS_ENABLED(CONFIG_PARISC)) + return -EINVAL; + current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */ From 06fc41b09cfbc02977acd9189473593a37d82d9b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 9 Oct 2023 00:33:15 +0200 Subject: [PATCH 367/560] drm/panel: simple: Fix Innolux G101ICE-L01 bus flags Add missing .bus_flags = DRM_BUS_FLAG_DE_HIGH to this panel description, ones which match both the datasheet and the panel display_timing flags . Fixes: 1e29b840af9f ("drm/panel: simple: Add Innolux G101ICE-L01 panel") Signed-off-by: Marek Vasut Reviewed-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20231008223315.279215-1-marex@denx.de --- drivers/gpu/drm/panel/panel-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 6cd32b9090876..086750f99d27e 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -2402,6 +2402,7 @@ static const struct panel_desc innolux_g101ice_l01 = { .disable = 200, }, .bus_format = MEDIA_BUS_FMT_RGB888_1X7X4_SPWG, + .bus_flags = DRM_BUS_FLAG_DE_HIGH, .connector_type = DRM_MODE_CONNECTOR_LVDS, }; From 3f9a91b6c00e655d27bd785dcda1742dbdc31bda Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 9 Oct 2023 00:32:56 +0200 Subject: [PATCH 368/560] drm/panel: simple: Fix Innolux G101ICE-L01 timings The Innolux G101ICE-L01 datasheet [1] page 17 table 6.1 INPUT SIGNAL TIMING SPECIFICATIONS indicates that maximum vertical blanking time is 40 lines. Currently the driver uses 29 lines. Fix it, and since this panel is a DE panel, adjust the timings to make them less hostile to controllers which cannot do 1 px HSA/VSA, distribute the delays evenly between all three parts. [1] https://www.data-modul.com/sites/default/files/products/G101ICE-L01-C2-specification-12042389.pdf Fixes: 1e29b840af9f ("drm/panel: simple: Add Innolux G101ICE-L01 panel") Signed-off-by: Marek Vasut Reviewed-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20231008223256.279196-1-marex@denx.de --- drivers/gpu/drm/panel/panel-simple.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 086750f99d27e..9367a4572dcf6 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -2379,13 +2379,13 @@ static const struct panel_desc innolux_g070y2_t02 = { static const struct display_timing innolux_g101ice_l01_timing = { .pixelclock = { 60400000, 71100000, 74700000 }, .hactive = { 1280, 1280, 1280 }, - .hfront_porch = { 41, 80, 100 }, - .hback_porch = { 40, 79, 99 }, - .hsync_len = { 1, 1, 1 }, + .hfront_porch = { 30, 60, 70 }, + .hback_porch = { 30, 60, 70 }, + .hsync_len = { 22, 40, 60 }, .vactive = { 800, 800, 800 }, - .vfront_porch = { 5, 11, 14 }, - .vback_porch = { 4, 11, 14 }, - .vsync_len = { 1, 1, 1 }, + .vfront_porch = { 3, 8, 14 }, + .vback_porch = { 3, 8, 14 }, + .vsync_len = { 4, 7, 12 }, .flags = DISPLAY_FLAGS_DE_HIGH, }; From 8ba2c459668cfe2aaacc5ebcd35b4b9ef8643013 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Fri, 17 Nov 2023 18:11:08 +0800 Subject: [PATCH 369/560] net: wangxun: fix kernel panic due to null pointer When the device uses a custom subsystem vendor ID, the function wx_sw_init() returns before the memory of 'wx->mac_table' is allocated. The null pointer will causes the kernel panic. Fixes: 79625f45ca73 ("net: wangxun: Move MAC address handling to libwx") Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 8 +++++--- drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 4 +--- drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index a3c5de9d547a4..533e912af0893 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1769,10 +1769,12 @@ int wx_sw_init(struct wx *wx) wx->subsystem_device_id = pdev->subsystem_device; } else { err = wx_flash_read_dword(wx, 0xfffdc, &ssid); - if (!err) - wx->subsystem_device_id = swab16((u16)ssid); + if (err < 0) { + wx_err(wx, "read of internal subsystem device id failed\n"); + return err; + } - return err; + wx->subsystem_device_id = swab16((u16)ssid); } wx->mac_table = kcalloc(wx->mac.num_rar_entries, diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index 3d43f808c86b7..8db804543e66d 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -121,10 +121,8 @@ static int ngbe_sw_init(struct wx *wx) /* PCI config space info */ err = wx_sw_init(wx); - if (err < 0) { - wx_err(wx, "read of internal subsystem device id failed\n"); + if (err < 0) return err; - } /* mac type, phy type , oem type */ ngbe_init_type_code(wx); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index 70f0b5c01dacf..526250102db27 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -364,10 +364,8 @@ static int txgbe_sw_init(struct wx *wx) /* PCI config space info */ err = wx_sw_init(wx); - if (err < 0) { - wx_err(wx, "read of internal subsystem device id failed\n"); + if (err < 0) return err; - } txgbe_init_type_code(wx); From 93da8d75a66568ba4bb5b14ad2833acd7304cd02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Nov 2023 14:17:33 +0000 Subject: [PATCH 370/560] wireguard: use DEV_STATS_INC() wg_xmit() can be called concurrently, KCSAN reported [1] some device stats updates can be lost. Use DEV_STATS_INC() for this unlikely case. [1] BUG: KCSAN: data-race in wg_xmit / wg_xmit read-write to 0xffff888104239160 of 8 bytes by task 1375 on cpu 0: wg_xmit+0x60f/0x680 drivers/net/wireguard/device.c:231 __netdev_start_xmit include/linux/netdevice.h:4918 [inline] netdev_start_xmit include/linux/netdevice.h:4932 [inline] xmit_one net/core/dev.c:3543 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3559 ... read-write to 0xffff888104239160 of 8 bytes by task 1378 on cpu 1: wg_xmit+0x60f/0x680 drivers/net/wireguard/device.c:231 __netdev_start_xmit include/linux/netdevice.h:4918 [inline] netdev_start_xmit include/linux/netdevice.h:4932 [inline] xmit_one net/core/dev.c:3543 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3559 ... v2: also change wg_packet_consume_data_done() (Hangbin Liu) and wg_packet_purge_staged_packets() Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Jason A. Donenfeld Cc: Hangbin Liu Signed-off-by: Jason A. Donenfeld Reviewed-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 4 ++-- drivers/net/wireguard/receive.c | 12 ++++++------ drivers/net/wireguard/send.c | 3 ++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 258dcc1039216..deb9636b0ecf8 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -210,7 +210,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) */ while (skb_queue_len(&peer->staged_packet_queue) > MAX_STAGED_PACKETS) { dev_kfree_skb(__skb_dequeue(&peer->staged_packet_queue)); - ++dev->stats.tx_dropped; + DEV_STATS_INC(dev, tx_dropped); } skb_queue_splice_tail(&packets, &peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); @@ -228,7 +228,7 @@ static netdev_tx_t wg_xmit(struct sk_buff *skb, struct net_device *dev) else if (skb->protocol == htons(ETH_P_IPV6)) icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); err: - ++dev->stats.tx_errors; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return ret; } diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 0b3f0c8435509..a176653c88616 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -416,20 +416,20 @@ static void wg_packet_consume_data_done(struct wg_peer *peer, net_dbg_skb_ratelimited("%s: Packet has unallowed src IP (%pISc) from peer %llu (%pISpfsc)\n", dev->name, skb, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_frame_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_type: net_dbg_ratelimited("%s: Packet is neither ipv4 nor ipv6 from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_frame_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_frame_errors); goto packet_processed; dishonest_packet_size: net_dbg_ratelimited("%s: Packet has incorrect size from peer %llu (%pISpfsc)\n", dev->name, peer->internal_id, &peer->endpoint.addr); - ++dev->stats.rx_errors; - ++dev->stats.rx_length_errors; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_length_errors); goto packet_processed; packet_processed: dev_kfree_skb(skb); diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 95c853b59e1da..0d48e0f4a1ba3 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -333,7 +333,8 @@ static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first) void wg_packet_purge_staged_packets(struct wg_peer *peer) { spin_lock_bh(&peer->staged_packet_queue.lock); - peer->device->dev->stats.tx_dropped += peer->staged_packet_queue.qlen; + DEV_STATS_ADD(peer->device->dev, tx_dropped, + peer->staged_packet_queue.qlen); __skb_queue_purge(&peer->staged_packet_queue); spin_unlock_bh(&peer->staged_packet_queue.lock); } From 5f228d7c8a539714c1e9b7e7534f76bb7979f268 Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Fri, 17 Nov 2023 16:10:18 +0530 Subject: [PATCH 371/560] octeontx2-pf: Fix memory leak during interface down During 'ifconfig down' one RSS memory was not getting freed. This patch fixes the same. Fixes: 81a4362016e7 ("octeontx2-pf: Add RSS multi group support") Signed-off-by: Suman Ghosh Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 91b99fd703616..ba95ac9132746 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1934,6 +1934,8 @@ int otx2_stop(struct net_device *netdev) /* Clear RSS enable flag */ rss = &pf->hw.rss_info; rss->enable = false; + if (!netif_is_rxfh_configured(netdev)) + kfree(rss->rss_ctx[DEFAULT_RSS_CONTEXT_GROUP]); /* Cleanup Queue IRQ */ vec = pci_irq_vector(pf->pdev, From 938dbead34cd139c50c5d51cc58e18ef2f1c3d2c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 19:30:06 -0800 Subject: [PATCH 372/560] net: fill in MODULE_DESCRIPTION()s for SOCK_DIAG modules W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to all the sock diag modules in one fell swoop. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 1 + net/ipv4/raw_diag.c | 1 + net/ipv4/tcp_diag.c | 1 + net/ipv4/udp_diag.c | 1 + net/mptcp/mptcp_diag.c | 1 + net/packet/diag.c | 1 + net/sctp/diag.c | 1 + net/smc/smc_diag.c | 1 + net/tipc/diag.c | 1 + net/unix/diag.c | 1 + net/vmw_vsock/diag.c | 1 + net/xdp/xsk_diag.c | 1 + 12 files changed, 12 insertions(+) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f01aee832aab2..7d0e7aaa71e0a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1481,5 +1481,6 @@ static void __exit inet_diag_exit(void) module_init(inet_diag_init); module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 63a40e4b678f5..fe2140c8375c8 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -257,5 +257,6 @@ static void __exit raw_diag_exit(void) module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 01b50fa791898..4cbe4b44425a6 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -247,4 +247,5 @@ static void __exit tcp_diag_exit(void) module_init(tcp_diag_init); module_exit(tcp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index de3f2d31f510a..dc41a22ee80e8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -296,5 +296,6 @@ static void __exit udp_diag_exit(void) module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */); diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 8df1bdb647e29..5409c2ea3f572 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -245,4 +245,5 @@ static void __exit mptcp_diag_exit(void) module_init(mptcp_diag_init); module_exit(mptcp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MPTCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-262 /* AF_INET - IPPROTO_MPTCP */); diff --git a/net/packet/diag.c b/net/packet/diag.c index f6b200cb3c066..9a7980e3309d6 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -262,4 +262,5 @@ static void __exit packet_diag_exit(void) module_init(packet_diag_init); module_exit(packet_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PACKET socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */); diff --git a/net/sctp/diag.c b/net/sctp/diag.c index c3d6b92dd3862..eb05131ff1dd6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -527,4 +527,5 @@ static void __exit sctp_diag_exit(void) module_init(sctp_diag_init); module_exit(sctp_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SCTP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 7ff2152971a5b..a584613aca125 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -268,5 +268,6 @@ static void __exit smc_diag_exit(void) module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); diff --git a/net/tipc/diag.c b/net/tipc/diag.c index 73137f4aeb68f..18733451c9e0c 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -113,4 +113,5 @@ module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); diff --git a/net/unix/diag.c b/net/unix/diag.c index 616b55c5b8908..bec09a3a1d44c 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -339,4 +339,5 @@ static void __exit unix_diag_exit(void) module_init(unix_diag_init); module_exit(unix_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("UNIX socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 1 /* AF_LOCAL */); diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c index a2823b1c5e28b..2e29994f92ffa 100644 --- a/net/vmw_vsock/diag.c +++ b/net/vmw_vsock/diag.c @@ -174,5 +174,6 @@ static void __exit vsock_diag_exit(void) module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 22b36c8143cfd..9f8955367275e 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -211,4 +211,5 @@ static void __exit xsk_diag_exit(void) module_init(xsk_diag_init); module_exit(xsk_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("XDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); From 98b1cc82c4affc16f5598d4fa14b1858671b2263 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Nov 2023 15:02:14 -0800 Subject: [PATCH 373/560] Linux 6.7-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ede0bd2410560..724c79bebe727 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From a6925165ea82b7765269ddd8dcad57c731aa00de Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 31 Oct 2023 04:00:07 +0000 Subject: [PATCH 374/560] ata: pata_isapnp: Add missing error check for devm_ioport_map() Add missing error return check for devm_ioport_map() and return the error if this function call fails. Fixes: 0d5ff566779f ("libata: convert to iomap") Signed-off-by: Chen Ni Reviewed-by: Sergey Shtylyov Signed-off-by: Damien Le Moal --- drivers/ata/pata_isapnp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/ata/pata_isapnp.c b/drivers/ata/pata_isapnp.c index 25a63d043c8e1..0f77e04240661 100644 --- a/drivers/ata/pata_isapnp.c +++ b/drivers/ata/pata_isapnp.c @@ -82,6 +82,9 @@ static int isapnp_init_one(struct pnp_dev *idev, const struct pnp_device_id *dev if (pnp_port_valid(idev, 1)) { ctl_addr = devm_ioport_map(&idev->dev, pnp_port_start(idev, 1), 1); + if (!ctl_addr) + return -ENOMEM; + ap->ioaddr.altstatus_addr = ctl_addr; ap->ioaddr.ctl_addr = ctl_addr; ap->ops = &isapnp_port_ops; From 45b478951b2ba5aea70b2850c49c1aa83aedd0d2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 17 Nov 2023 15:56:30 -0800 Subject: [PATCH 375/560] md: fix bi_status reporting in md_end_clone_io md_end_clone_io() may overwrite error status in orig_bio->bi_status with BLK_STS_OK. This could happen when orig_bio has BIO_CHAIN (split by md_submit_bio => bio_split_to_limits, for example). As a result, upper layer may miss error reported from md (or the device) and consider the failed IO was successful. Fix this by only update orig_bio->bi_status when current bio reports error and orig_bio is BLK_STS_OK. This is the same behavior as __bio_chain_endio(). Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5") Cc: stable@vger.kernel.org # v5.14+ Reported-by: Bhanu Victor DiCara <00bvd0+linux@gmail.com> Closes: https://lore.kernel.org/regressions/5727380.DvuYhMxLoT@bvd0/ Signed-off-by: Song Liu Tested-by: Xiao Ni Reviewed-by: Yu Kuai Acked-by: Guoqing Jiang --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4ee4593c874a7..c94373d64f2cd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8666,7 +8666,8 @@ static void md_end_clone_io(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - orig_bio->bi_status = bio->bi_status; + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; if (md_io_clone->start_time) bio_end_io_acct(orig_bio, md_io_clone->start_time); From 018903e1cec3421a6198589fabd30682eb277904 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 30 Oct 2023 01:13:08 +0200 Subject: [PATCH 376/560] drm/i915/dp_mst: Fix race between connector registration and setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After drm_connector_init() is called the connector is visible to the rest of the kernel via the drm_mode_config::connector_list. Make sure that the DSC AUX device and capabilities are setup by that time. Another race condition is adding the connector to the connector list before drm_connector_helper_add() sets the connector helper functions. That's an unrelated issue, for which the fix is for a follow-up. One solution would be adding the connector to the connector list only during its registration in drm_connector_register(). Cc: Stanislav Lisovskiy Cc: Ville Syrjälä Fixes: 808b43fa7e56 ("drm/i915/dp_mst: Set connector DSC capabilities and decompression AUX") Reviewed-by: Stanislav Lisovskiy Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20231030155843.2251023-2-imre.deak@intel.com (cherry picked from commit 560ea72c76eb6d0c59f77580414e64cc09f1093d) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp_mst.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 7b4628f4f1240..851b312bd8449 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -1161,6 +1161,14 @@ static struct drm_connector *intel_dp_add_mst_connector(struct drm_dp_mst_topolo intel_connector->port = port; drm_dp_mst_get_port_malloc(port); + /* + * TODO: set the AUX for the actual MST port decompressing the stream. + * At the moment the driver only supports enabling this globally in the + * first downstream MST branch, via intel_dp's (root port) AUX. + */ + intel_connector->dp.dsc_decompression_aux = &intel_dp->aux; + intel_dp_mst_read_decompression_port_dsc_caps(intel_dp, intel_connector); + connector = &intel_connector->base; ret = drm_connector_init(dev, connector, &intel_dp_mst_connector_funcs, DRM_MODE_CONNECTOR_DisplayPort); @@ -1172,14 +1180,6 @@ static struct drm_connector *intel_dp_add_mst_connector(struct drm_dp_mst_topolo drm_connector_helper_add(connector, &intel_dp_mst_connector_helper_funcs); - /* - * TODO: set the AUX for the actual MST port decompressing the stream. - * At the moment the driver only supports enabling this globally in the - * first downstream MST branch, via intel_dp's (root port) AUX. - */ - intel_connector->dp.dsc_decompression_aux = &intel_dp->aux; - intel_dp_mst_read_decompression_port_dsc_caps(intel_dp, intel_connector); - for_each_pipe(dev_priv, pipe) { struct drm_encoder *enc = &intel_dp->mst_encoders[pipe]->base.base; From 0561794b6b642b84b879bf97061c4b4fa692839e Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Wed, 15 Nov 2023 11:54:03 +0100 Subject: [PATCH 377/560] drm/i915: do not clean GT table on error path The only task of intel_gt_release_all is to zero gt table. Calling it on error path prevents intel_gt_driver_late_release_all (called from i915_driver_late_release) to cleanup GTs, causing leakage. After i915_driver_late_release GT array is not used anymore so it does not need cleaning at all. Sample leak report: BUG i915_request (...): Objects remaining in i915_request on __kmem_cache_shutdown() ... Object 0xffff888113420040 @offset=64 Allocated in __i915_request_create+0x75/0x610 [i915] age=18339 cpu=1 pid=1454 kmem_cache_alloc+0x25b/0x270 __i915_request_create+0x75/0x610 [i915] i915_request_create+0x109/0x290 [i915] __engines_record_defaults+0xca/0x440 [i915] intel_gt_init+0x275/0x430 [i915] i915_gem_init+0x135/0x2c0 [i915] i915_driver_probe+0x8d1/0xdc0 [i915] v2: removed whole intel_gt_release_all Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8489 Fixes: bec68cc9ea42 ("drm/i915: Prepare for multiple GTs") Signed-off-by: Andrzej Hajda Reviewed-by: Tvrtko Ursulin Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20231115-dont_clean_gt_on_error_path-v2-1-54250125470a@intel.com (cherry picked from commit e899505533852bf1da133f2f4c9a9655ff77f7e5) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_gt.c | 11 ----------- drivers/gpu/drm/i915/i915_driver.c | 4 +--- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index ed32bf5b15464..ba1186fc524f8 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -982,8 +982,6 @@ int intel_gt_probe_all(struct drm_i915_private *i915) err: i915_probe_error(i915, "Failed to initialize %s! (%d)\n", gtdef->name, ret); - intel_gt_release_all(i915); - return ret; } @@ -1002,15 +1000,6 @@ int intel_gt_tiles_init(struct drm_i915_private *i915) return 0; } -void intel_gt_release_all(struct drm_i915_private *i915) -{ - struct intel_gt *gt; - unsigned int id; - - for_each_gt(gt, i915, id) - i915->gt[id] = NULL; -} - void intel_gt_info_print(const struct intel_gt_info *info, struct drm_printer *p) { diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 8a0e2c745e1f9..802de2c6decb7 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -782,7 +782,7 @@ int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ret = i915_driver_mmio_probe(i915); if (ret < 0) - goto out_tiles_cleanup; + goto out_runtime_pm_put; ret = i915_driver_hw_probe(i915); if (ret < 0) @@ -842,8 +842,6 @@ int i915_driver_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i915_ggtt_driver_late_release(i915); out_cleanup_mmio: i915_driver_mmio_release(i915); -out_tiles_cleanup: - intel_gt_release_all(i915); out_runtime_pm_put: enable_rpm_wakeref_asserts(&i915->runtime_pm); i915_driver_late_release(i915); From 66917f85db6002ed09cd24186258892fcfca64b6 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 20 Nov 2023 06:53:19 +0800 Subject: [PATCH 378/560] autofs: add: new_inode check in autofs_fill_super() Add missing NULL check of root_inode in autofs_fill_super(). While we are at it simplify the logic by taking advantage of the VFS cleanup procedures and get rid of the goto error handling, as suggested by Al Viro. Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/20231119225319.331156-1-raven@themaw.net Reviewed-by: Bill O'Donnell Cc: Al Viro Cc: Christian Brauner Cc: Bill O'Donnell Reported-by: Signed-off-by: Christian Brauner --- fs/autofs/inode.c | 56 ++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index a5083d447a62f..1f5db68636631 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -309,9 +309,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc) struct autofs_fs_context *ctx = fc->fs_private; struct autofs_sb_info *sbi = s->s_fs_info; struct inode *root_inode; - struct dentry *root; struct autofs_info *ino; - int ret = -ENOMEM; pr_debug("starting up, sbi = %p\n", sbi); @@ -328,56 +326,44 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc) */ ino = autofs_new_ino(sbi); if (!ino) - goto fail; + return -ENOMEM; root_inode = autofs_get_inode(s, S_IFDIR | 0755); + if (!root_inode) + return -ENOMEM; + root_inode->i_uid = ctx->uid; root_inode->i_gid = ctx->gid; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; - root = d_make_root(root_inode); - if (!root) - goto fail_ino; - - root->d_fsdata = ino; + s->s_root = d_make_root(root_inode); + if (unlikely(!s->s_root)) { + autofs_free_ino(ino); + return -ENOMEM; + } + s->s_root->d_fsdata = ino; if (ctx->pgrp_set) { sbi->oz_pgrp = find_get_pid(ctx->pgrp); - if (!sbi->oz_pgrp) { - ret = invalf(fc, "Could not find process group %d", - ctx->pgrp); - goto fail_dput; - } - } else { + if (!sbi->oz_pgrp) + return invalf(fc, "Could not find process group %d", + ctx->pgrp); + } else sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); - } if (autofs_type_trigger(sbi->type)) - __managed_dentry_set_managed(root); - - root_inode->i_fop = &autofs_root_operations; - root_inode->i_op = &autofs_dir_inode_operations; + /* s->s_root won't be contended so there's little to + * be gained by not taking the d_lock when setting + * d_flags, even when a lot mounts are being done. + */ + managed_dentry_set_managed(s->s_root); pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); sbi->flags &= ~AUTOFS_SBI_CATATONIC; - - /* - * Success! Install the root dentry now to indicate completion. - */ - s->s_root = root; return 0; - - /* - * Failure ... clean up. - */ -fail_dput: - dput(root); - goto fail; -fail_ino: - autofs_free_ino(ino); -fail: - return ret; } /* From 762321dab9a72760bf9aec48362f932717c9424d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:17 +0200 Subject: [PATCH 379/560] filemap: add a per-mapping stable writes flag folio_wait_stable waits for writeback to finish before modifying the contents of a folio again, e.g. to support check summing of the data in the block integrity code. Currently this behavior is controlled by the SB_I_STABLE_WRITES flag on the super_block, which means it is uniform for the entire file system. This is wrong for the block device pseudofs which is shared by all block devices, or file systems that can use multiple devices like XFS witht the RT subvolume or btrfs (although btrfs currently reimplements folio_wait_stable anyway). Add a per-address_space AS_STABLE_WRITES flag to control the behavior in a more fine grained way. The existing SB_I_STABLE_WRITES is kept to initialize AS_STABLE_WRITES to the existing default which covers most cases. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de Tested-by: Ilya Dryomov Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/inode.c | 2 ++ include/linux/pagemap.h | 17 +++++++++++++++++ mm/page-writeback.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index edcd8a61975f3..f238d987dec90 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); + if (sb->s_iflags & SB_I_STABLE_WRITES) + mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcc1ea44b4e85..06142ff7f9ce0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -204,6 +204,8 @@ enum mapping_flags { AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES, /* must wait for writeback before modifying + folio contents */ }; /** @@ -289,6 +291,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping) clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } +static inline bool mapping_stable_writes(const struct address_space *mapping) +{ + return test_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_set_stable_writes(struct address_space *mapping) +{ + set_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_clear_stable_writes(struct address_space *mapping) +{ + clear_bit(AS_STABLE_WRITES, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 46f2f5d3d183b..ee2fd6a6af407 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3107,7 +3107,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); */ void folio_wait_stable(struct folio *folio) { - if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) + if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); From 1898efcdbed32bb1c67269c985a50bab0dbc9493 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:18 +0200 Subject: [PATCH 380/560] block: update the stable_writes flag in bdev_add Propagate the per-queue stable_write flags into each bdev inode in bdev_add. This makes sure devices that require stable writes have it set for I/O on the block device node as well. Note that this doesn't cover the case of a flag changing on a live device yet. We should handle that as well, but I plan to cover it as part of a more general rework of how changing runtime paramters on block devices works. Fixes: 1cb039f3dc16 ("bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag") Reported-by: Ilya Dryomov Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-3-hch@lst.de Tested-by: Ilya Dryomov Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- block/bdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bdev.c b/block/bdev.c index e4cfb7adb6458..750aec178b6ab 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -425,6 +425,8 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) void bdev_add(struct block_device *bdev, dev_t dev) { + if (bdev_stable_writes(bdev)) + mapping_set_stable_writes(bdev->bd_inode->i_mapping); bdev->bd_dev = dev; bdev->bd_inode->i_rdev = dev; bdev->bd_inode->i_ino = dev; From c421df0b19430417a04f68919fc3d1943d20ac04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:19 +0200 Subject: [PATCH 381/560] xfs: clean up FS_XFLAG_REALTIME handling in xfs_ioctl_setattr_xflags Introduce a local boolean variable if FS_XFLAG_REALTIME to make the checks for it more obvious, and de-densify a few of the conditionals using it to make them more readable while at it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-4-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_ioctl.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index a82470e027f72..022dc2dc78d2f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1121,23 +1121,25 @@ xfs_ioctl_setattr_xflags( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; - /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) - return -EINVAL; + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. */ + if (ip->i_df.if_nextents || ip->i_delayed_blks) + return -EINVAL; + } - /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { + if (rtflag) { + /* If realtime flag is set then must have realtime device */ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - } - /* Clear reflink if we are actually able to set the rt flag. */ - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + /* Clear reflink if we are actually able to set the rt flag. */ + if (xfs_is_reflink_inode(ip)) + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + } /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); From 9c04138414c00ae61421f36ada002712c4bac94a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:20 +0200 Subject: [PATCH 382/560] xfs: respect the stable writes flag on the RT device Update the per-folio stable writes flag dependening on which device an inode resides on. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-5-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_inode.h | 8 ++++++++ fs/xfs/xfs_ioctl.c | 8 ++++++++ fs/xfs/xfs_iops.c | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3dc47937da5d1..3beb470f18920 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -569,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +static inline void xfs_update_stable_writes(struct xfs_inode *ip) +{ + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) + mapping_set_stable_writes(VFS_I(ip)->i_mapping); + else + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); +} + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 022dc2dc78d2f..6c3919687ea6b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1150,6 +1150,14 @@ xfs_ioctl_setattr_xflags( ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); + + /* + * Make the stable writes flag match that of the device the inode + * resides on when flipping the RT flag. + */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) + xfs_update_stable_writes(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fdfda4fba12b1..a0d77f5f512e2 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1298,6 +1298,13 @@ xfs_setup_inode( gfp_mask = mapping_gfp_mask(inode->i_mapping); mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. From 796432efab1e372d404e7a71cc6891a53f105051 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 19 Nov 2023 18:56:17 -0500 Subject: [PATCH 383/560] libfs: getdents() should return 0 after reaching EOD The new directory offset helpers don't conform with the convention of getdents() returning no more entries once a directory file descriptor has reached the current end-of-directory. To address this, copy the logic from dcache_readdir() to mark the open directory file descriptor once EOD has been reached. Seeking resets the mark. Reported-by: Tavian Barnes Closes: https://lore.kernel.org/linux-fsdevel/20231113180616.2831430-1-tavianator@tavianator.com/ Fixes: 6faddda69f62 ("libfs: Add directory operations for stable offsets") Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170043792492.4628.15646203084646716134.stgit@bazille.1015granger.net Signed-off-by: Christian Brauner --- fs/libfs.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index e9440d55073c5..c2aa6fd4795c4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -399,6 +399,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } + /* In this case, ->private_data is protected by f_pos_lock */ + file->private_data = NULL; return vfs_setpos(file, offset, U32_MAX); } @@ -428,7 +430,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); XA_STATE(xas, &so_ctx->xa, ctx->pos); @@ -437,7 +439,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(&xas); if (!dentry) - break; + return ERR_PTR(-ENOENT); if (!offset_dir_emit(ctx, dentry)) { dput(dentry); @@ -447,6 +449,7 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) dput(dentry); ctx->pos = xas.xa_index + 1; } + return NULL; } /** @@ -479,7 +482,12 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - offset_iterate_dir(d_inode(dir), ctx); + /* In this case, ->private_data is protected by f_pos_lock */ + if (ctx->pos == 2) + file->private_data = NULL; + else if (file->private_data == ERR_PTR(-ENOENT)) + return 0; + file->private_data = offset_iterate_dir(d_inode(dir), ctx); return 0; } From 8479063f1fbee201a8739130e816cc331b675838 Mon Sep 17 00:00:00 2001 From: Charles Mirabile Date: Mon, 20 Nov 2023 05:55:45 -0500 Subject: [PATCH 384/560] io_uring/fs: consider link->flags when getting path for LINKAT In order for `AT_EMPTY_PATH` to work as expected, the fact that the user wants that behavior needs to make it to `getname_flags` or it will return ENOENT. Fixes: cf30da90bc3a ("io_uring: add support for IORING_OP_LINKAT") Cc: Link: https://github.com/axboe/liburing/issues/995 Signed-off-by: Charles Mirabile Link: https://lore.kernel.org/r/20231120105545.1209530-1-cmirabil@redhat.com Signed-off-by: Jens Axboe --- io_uring/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/fs.c b/io_uring/fs.c index 08e3b175469c6..eccea851dd5a2 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -254,7 +254,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); lnk->flags = READ_ONCE(sqe->hardlink_flags); - lnk->oldpath = getname(oldf); + lnk->oldpath = getname_uflags(oldf, lnk->flags); if (IS_ERR(lnk->oldpath)) return PTR_ERR(lnk->oldpath); From bb0a05acd6121ff0e810b44fdc24dbdfaa46b642 Mon Sep 17 00:00:00 2001 From: Jonas Karlman Date: Thu, 26 Oct 2023 19:14:58 +0000 Subject: [PATCH 385/560] drm/rockchip: vop: Fix color for RGB888/BGR888 format on VOP full Use of DRM_FORMAT_RGB888 and DRM_FORMAT_BGR888 on e.g. RK3288, RK3328 and RK3399 result in wrong colors being displayed. The issue can be observed using modetest: modetest -s @:1920x1080-60@RG24 modetest -s @:1920x1080-60@BG24 Vendor 4.4 kernel apply an inverted rb swap for these formats on VOP full framework (IP version 3.x) compared to VOP little framework (2.x). Fix colors by applying different rb swap for VOP full framework (3.x) and VOP little framework (2.x) similar to vendor 4.4 kernel. Fixes: 85a359f25388 ("drm/rockchip: Add BGR formats to VOP") Signed-off-by: Jonas Karlman Tested-by: Diederik de Haas Reviewed-by: Christopher Obbard Tested-by: Christopher Obbard Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20231026191500.2994225-1-jonas@kwiboo.se --- drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c index 066299894d048..a13473b2d54c4 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c @@ -247,14 +247,22 @@ static inline void vop_cfg_done(struct vop *vop) VOP_REG_SET(vop, common, cfg_done, 1); } -static bool has_rb_swapped(uint32_t format) +static bool has_rb_swapped(uint32_t version, uint32_t format) { switch (format) { case DRM_FORMAT_XBGR8888: case DRM_FORMAT_ABGR8888: - case DRM_FORMAT_BGR888: case DRM_FORMAT_BGR565: return true; + /* + * full framework (IP version 3.x) only need rb swapped for RGB888 and + * little framework (IP version 2.x) only need rb swapped for BGR888, + * check for 3.x to also only rb swap BGR888 for unknown vop version + */ + case DRM_FORMAT_RGB888: + return VOP_MAJOR(version) == 3; + case DRM_FORMAT_BGR888: + return VOP_MAJOR(version) != 3; default: return false; } @@ -1030,7 +1038,7 @@ static void vop_plane_atomic_update(struct drm_plane *plane, VOP_WIN_SET(vop, win, dsp_info, dsp_info); VOP_WIN_SET(vop, win, dsp_st, dsp_st); - rb_swap = has_rb_swapped(fb->format->format); + rb_swap = has_rb_swapped(vop->data->version, fb->format->format); VOP_WIN_SET(vop, win, rb_swap, rb_swap); /* From baf8fb7e0e5ec54ea0839f0c534f2cdcd79bea9c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:24:54 +0800 Subject: [PATCH 386/560] bcache: avoid oversize memory allocation by small stripe_size Arraies bcache->stripe_sectors_dirty and bcache->full_dirty_stripes are used for dirty data writeback, their sizes are decided by backing device capacity and stripe size. Larger backing device capacity or smaller stripe size make these two arraies occupies more dynamic memory space. Currently bcache->stripe_size is directly inherited from queue->limits.io_opt of underlying storage device. For normal hard drives, its limits.io_opt is 0, and bcache sets the corresponding stripe_size to 1TB (1<<31 sectors), it works fine 10+ years. But for devices do declare value for queue->limits.io_opt, small stripe_size (comparing to 1TB) becomes an issue for oversize memory allocations of bcache->stripe_sectors_dirty and bcache->full_dirty_stripes, while the capacity of hard drives gets much larger in recent decade. For example a raid5 array assembled by three 20TB hardrives, the raid device capacity is 40TB with typical 512KB limits.io_opt. After the math calculation in bcache code, these two arraies will occupy 400MB dynamic memory. Even worse Andrea Tomassetti reports that a 4KB limits.io_opt is declared on a new 2TB hard drive, then these two arraies request 2GB and 512MB dynamic memory from kzalloc(). The result is that bcache device always fails to initialize on his system. To avoid the oversize memory allocation, bcache->stripe_size should not directly inherited by queue->limits.io_opt from the underlying device. This patch defines BCH_MIN_STRIPE_SZ (4MB) as minimal bcache stripe size and set bcache device's stripe size against the declared limits.io_opt value from the underlying storage device, - If the declared limits.io_opt > BCH_MIN_STRIPE_SZ, bcache device will set its stripe size directly by this limits.io_opt value. - If the declared limits.io_opt < BCH_MIN_STRIPE_SZ, bcache device will set its stripe size by a value multiplying limits.io_opt and euqal or large than BCH_MIN_STRIPE_SZ. Then the minimal stripe size of a bcache device will always be >= 4MB. For a 40TB raid5 device with 512KB limits.io_opt, memory occupied by bcache->stripe_sectors_dirty and bcache->full_dirty_stripes will be 50MB in total. For a 2TB hard drive with 4KB limits.io_opt, memory occupied by these two arraies will be 2.5MB in total. Such mount of memory allocated for bcache->stripe_sectors_dirty and bcache->full_dirty_stripes is reasonable for most of storage devices. Reported-by: Andrea Tomassetti Signed-off-by: Coly Li Reviewed-by: Eric Wheeler Link: https://lore.kernel.org/r/20231120052503.6122-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 05be59ae21b29..6ae2329052c92 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,6 +265,7 @@ struct bcache_device { #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 int nr_stripes; +#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8bd899766372a..c7ecc7058d77e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -905,6 +905,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->stripe_size) d->stripe_size = 1 << 31; + else if (d->stripe_size < BCH_MIN_STRIPE_SZ) + d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size); n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!n || n > max_stripes) { From 777967e7e9f6f5f3e153abffb562bffaf4430d26 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:24:55 +0800 Subject: [PATCH 387/560] bcache: check return value from btree_node_alloc_replacement() In btree_gc_rewrite_node(), pointer 'n' is not checked after it returns from btree_gc_rewrite_node(). There is potential possibility that 'n' is a non NULL ERR_PTR(), referencing such error code is not permitted in following code. Therefore a return value checking is necessary after 'n' is back from btree_node_alloc_replacement(). Signed-off-by: Coly Li Reported-by: Dan Carpenter Cc: Link: https://lore.kernel.org/r/20231120052503.6122-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ae5cbb55861fd..de8d552201dc3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1532,6 +1532,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return 0; n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; /* recheck reserve after allocating replacement node */ if (btree_check_reserve(b, NULL)) { From be93825f0e6428c2d3f03a6e4d447dc48d33d7ff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 20 Nov 2023 13:24:56 +0800 Subject: [PATCH 388/560] bcache: remove redundant assignment to variable cur_idx Variable cur_idx is being initialized with a value that is never read, it is being re-assigned later in a while-loop. Remove the redundant assignment. Cleans up clang scan build warning: drivers/md/bcache/writeback.c:916:2: warning: Value stored to 'cur_idx' is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Coly Li Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-4-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 24c049067f61a..c3e872e0a6f2c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -913,7 +913,7 @@ static int bch_dirty_init_thread(void *arg) int cur_idx, prev_idx, skip_nr; k = p = NULL; - cur_idx = prev_idx = 0; + prev_idx = 0; bch_btree_iter_init(&c->root->keys, &iter, NULL); k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); From 2c7f497ac274a14330208b18f6f734000868ebf9 Mon Sep 17 00:00:00 2001 From: Rand Deeb Date: Mon, 20 Nov 2023 13:24:57 +0800 Subject: [PATCH 389/560] bcache: prevent potential division by zero error In SHOW(), the variable 'n' is of type 'size_t.' While there is a conditional check to verify that 'n' is not equal to zero before executing the 'do_div' macro, concerns arise regarding potential division by zero error in 64-bit environments. The concern arises when 'n' is 64 bits in size, greater than zero, and the lower 32 bits of it are zeros. In such cases, the conditional check passes because 'n' is non-zero, but the 'do_div' macro casts 'n' to 'uint32_t,' effectively truncating it to its lower 32 bits. Consequently, the 'n' value becomes zero. To fix this potential division by zero error and ensure precise division handling, this commit replaces the 'do_div' macro with div64_u64(). div64_u64() is designed to work with 64-bit operands, guaranteeing that division is performed correctly. This change enhances the robustness of the code, ensuring that division operations yield accurate results in all scenarios, eliminating the possibility of division by zero, and improving compatibility across different 64-bit environments. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Rand Deeb Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 45d8af755de6d..a438efb660699 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1104,7 +1104,7 @@ SHOW(__bch_cache) sum += INITIAL_PRIO - cached[i]; if (n) - do_div(sum, n); + sum = div64_u64(sum, n); for (i = 0; i < ARRAY_SIZE(q); i++) q[i] = INITIAL_PRIO - cached[n * (i + 1) / From 7cc47e64d3d69786a2711a4767e26b26ba63d7ed Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:24:58 +0800 Subject: [PATCH 390/560] bcache: fixup init dirty data errors We found that after long run, the dirty_data of the bcache device will have errors. This error cannot be eliminated unless re-register. We also found that reattach after detach, this error can accumulate. In bch_sectors_dirty_init(), all inode <= d->id keys will be recounted again. This is wrong, we only need to count the keys of the current device. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-6-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c3e872e0a6f2c..77fb72ac6b81e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -991,8 +991,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) op.count = 0; for_each_key_filter(&c->root->keys, - k, &iter, bch_ptr_invalid) + k, &iter, bch_ptr_invalid) { + if (KEY_INODE(k) != op.inode) + continue; sectors_dirty_init_fn(&op.op, c->root, k); + } rw_unlock(0, c->root); return; From e34820f984512b433ee1fc291417e60c47d56727 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:24:59 +0800 Subject: [PATCH 391/560] bcache: fixup lock c->root error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We had a problem with io hung because it was waiting for c->root to release the lock. crash> cache_set.root -l cache_set.list ffffa03fde4c0050 root = 0xffff802ef454c800 crash> btree -o 0xffff802ef454c800 | grep rw_semaphore [ffff802ef454c858] struct rw_semaphore lock; crash> struct rw_semaphore ffff802ef454c858 struct rw_semaphore { count = { counter = -4294967297 }, wait_list = { next = 0xffff00006786fc28, prev = 0xffff00005d0efac8 }, wait_lock = { raw_lock = { { val = { counter = 0 }, { locked = 0 '\000', pending = 0 '\000' }, { locked_pending = 0, tail = 0 } } } }, osq = { tail = { counter = 0 } }, owner = 0xffffa03fdc586603 } The "counter = -4294967297" means that lock count is -1 and a write lock is being attempted. Then, we found that there is a btree with a counter of 1 in btree_cache_freeable. crash> cache_set -l cache_set.list ffffa03fde4c0050 -o|grep btree_cache [ffffa03fde4c1140] struct list_head btree_cache; [ffffa03fde4c1150] struct list_head btree_cache_freeable; [ffffa03fde4c1160] struct list_head btree_cache_freed; [ffffa03fde4c1170] unsigned int btree_cache_used; [ffffa03fde4c1178] wait_queue_head_t btree_cache_wait; [ffffa03fde4c1190] struct task_struct *btree_cache_alloc_lock; crash> list -H ffffa03fde4c1140|wc -l 973 crash> list -H ffffa03fde4c1150|wc -l 1123 crash> cache_set.btree_cache_used -l cache_set.list ffffa03fde4c0050 btree_cache_used = 2097 crash> list -s btree -l btree.list -H ffffa03fde4c1140|grep -E -A2 "^ lock = {" > btree_cache.txt crash> list -s btree -l btree.list -H ffffa03fde4c1150|grep -E -A2 "^ lock = {" > btree_cache_freeable.txt [root@node-3 127.0.0.1-2023-08-04-16:40:28]# pwd /var/crash/127.0.0.1-2023-08-04-16:40:28 [root@node-3 127.0.0.1-2023-08-04-16:40:28]# cat btree_cache.txt|grep counter|grep -v "counter = 0" [root@node-3 127.0.0.1-2023-08-04-16:40:28]# cat btree_cache_freeable.txt|grep counter|grep -v "counter = 0" counter = 1 We found that this is a bug in bch_sectors_dirty_init() when locking c->root: (1). Thread X has locked c->root(A) write. (2). Thread Y failed to lock c->root(A), waiting for the lock(c->root A). (3). Thread X bch_btree_set_root() changes c->root from A to B. (4). Thread X releases the lock(c->root A). (5). Thread Y successfully locks c->root(A). (6). Thread Y releases the lock(c->root B). down_write locked ---(1)----------------------┐ | | | down_read waiting ---(2)----┐ | | | ┌-------------┐ ┌-------------┐ bch_btree_set_root ===(3)========>> | c->root A | | c->root B | | | └-------------┘ └-------------┘ up_write ---(4)---------------------┘ | | | | | down_read locked ---(5)-----------┘ | | | up_read ---(6)-----------------------------┘ Since c->root may change, the correct steps to lock c->root should be the same as bch_root_usage(), compare after locking. static unsigned int bch_root_usage(struct cache_set *c) { unsigned int bytes = 0; struct bkey *k; struct btree *b; struct btree_iter iter; goto lock_root; do { rw_unlock(false, b); lock_root: b = c->root; rw_lock(false, b, b->level); } while (b != c->root); for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) bytes += bkey_bytes(k); rw_unlock(false, b); return (bytes * 100) / btree_bytes(c); } Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-7-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 77fb72ac6b81e..a1d7609162468 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -977,14 +977,22 @@ static int bch_btre_dirty_init_thread_nr(void) void bch_sectors_dirty_init(struct bcache_device *d) { int i; + struct btree *b = NULL; struct bkey *k = NULL; struct btree_iter iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; +retry_lock: + b = c->root; + rw_lock(0, b, b->level); + if (b != c->root) { + rw_unlock(0, b); + goto retry_lock; + } + /* Just count root keys if no leaf node */ - rw_lock(0, c->root, c->root->level); if (c->root->level == 0) { bch_btree_op_init(&op.op, -1); op.inode = d->id; @@ -997,7 +1005,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) sectors_dirty_init_fn(&op.op, c->root, k); } - rw_unlock(0, c->root); + rw_unlock(0, b); return; } @@ -1033,7 +1041,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) out: /* Must wait for all threads to stop. */ wait_event(state.wait, atomic_read(&state.started) == 0); - rw_unlock(0, c->root); + rw_unlock(0, b); } void bch_cached_dev_writeback_init(struct cached_dev *dc) From 2faac25d7958c4761bb8cec54adb79f806783ad6 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Mon, 20 Nov 2023 13:25:00 +0800 Subject: [PATCH 392/560] bcache: fixup multi-threaded bch_sectors_dirty_init() wake-up race We get a kernel crash about "unable to handle kernel paging request": ```dmesg [368033.032005] BUG: unable to handle kernel paging request at ffffffffad9ae4b5 [368033.032007] PGD fc3a0d067 P4D fc3a0d067 PUD fc3a0e063 PMD 8000000fc38000e1 [368033.032012] Oops: 0003 [#1] SMP PTI [368033.032015] CPU: 23 PID: 55090 Comm: bch_dirtcnt[0] Kdump: loaded Tainted: G OE --------- - - 4.18.0-147.5.1.es8_24.x86_64 #1 [368033.032017] Hardware name: Tsinghua Tongfang THTF Chaoqiang Server/072T6D, BIOS 2.4.3 01/17/2017 [368033.032027] RIP: 0010:native_queued_spin_lock_slowpath+0x183/0x1d0 [368033.032029] Code: 8b 02 48 85 c0 74 f6 48 89 c1 eb d0 c1 e9 12 83 e0 03 83 e9 01 48 c1 e0 05 48 63 c9 48 05 c0 3d 02 00 48 03 04 cd 60 68 93 ad <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 02 [368033.032031] RSP: 0018:ffffbb48852abe00 EFLAGS: 00010082 [368033.032032] RAX: ffffffffad9ae4b5 RBX: 0000000000000246 RCX: 0000000000003bf3 [368033.032033] RDX: ffff97b0ff8e3dc0 RSI: 0000000000600000 RDI: ffffbb4884743c68 [368033.032034] RBP: 0000000000000001 R08: 0000000000000000 R09: 000007ffffffffff [368033.032035] R10: ffffbb486bb01000 R11: 0000000000000001 R12: ffffffffc068da70 [368033.032036] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000000 [368033.032038] FS: 0000000000000000(0000) GS:ffff97b0ff8c0000(0000) knlGS:0000000000000000 [368033.032039] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [368033.032040] CR2: ffffffffad9ae4b5 CR3: 0000000fc3a0a002 CR4: 00000000003626e0 [368033.032042] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [368033.032043] bcache: bch_cached_dev_attach() Caching rbd479 as bcache462 on set 8cff3c36-4a76-4242-afaa-7630206bc70b [368033.032045] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [368033.032046] Call Trace: [368033.032054] _raw_spin_lock_irqsave+0x32/0x40 [368033.032061] __wake_up_common_lock+0x63/0xc0 [368033.032073] ? bch_ptr_invalid+0x10/0x10 [bcache] [368033.033502] bch_dirty_init_thread+0x14c/0x160 [bcache] [368033.033511] ? read_dirty_submit+0x60/0x60 [bcache] [368033.033516] kthread+0x112/0x130 [368033.033520] ? kthread_flush_work_fn+0x10/0x10 [368033.034505] ret_from_fork+0x35/0x40 ``` The crash occurred when call wake_up(&state->wait), and then we want to look at the value in the state. However, bch_sectors_dirty_init() is not found in the stack of any task. Since state is allocated on the stack, we guess that bch_sectors_dirty_init() has exited, causing bch_dirty_init_thread() to be unable to handle kernel paging request. In order to verify this idea, we added some printing information during wake_up(&state->wait). We find that "wake up" is printed twice, however we only expect the last thread to wake up once. ```dmesg [ 994.641004] alcache: bch_dirty_init_thread() wake up [ 994.641018] alcache: bch_dirty_init_thread() wake up [ 994.641523] alcache: bch_sectors_dirty_init() init exit ``` There is a race. If bch_sectors_dirty_init() exits after the first wake up, the second wake up will trigger this bug("unable to handle kernel paging request"). Proceed as follows: bch_sectors_dirty_init kthread_run ==============> bch_dirty_init_thread(bch_dirtcnt[0]) ... ... atomic_inc(&state.started) ... ... ... atomic_read(&state.enough) ... ... atomic_set(&state->enough, 1) kthread_run ======================================================> bch_dirty_init_thread(bch_dirtcnt[1]) ... atomic_dec_and_test(&state->started) ... atomic_inc(&state.started) ... ... ... wake_up(&state->wait) ... atomic_read(&state.enough) atomic_dec_and_test(&state->started) ... ... wait_event(state.wait, atomic_read(&state.started) == 0) ... return ... wake_up(&state->wait) We believe it is very common to wake up twice if there is no dirty, but crash is an extremely low probability event. It's hard for us to reproduce this issue. We attached and detached continuously for a week, with a total of more than one million attaches and only one crash. Putting atomic_inc(&state.started) before kthread_run() can avoid waking up twice. Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-8-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index a1d7609162468..3accfdaee6b19 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1025,17 +1025,18 @@ void bch_sectors_dirty_init(struct bcache_device *d) if (atomic_read(&state.enough)) break; + atomic_inc(&state.started); state.infos[i].state = &state; state.infos[i].thread = kthread_run(bch_dirty_init_thread, &state.infos[i], "bch_dirtcnt[%d]", i); if (IS_ERR(state.infos[i].thread)) { pr_err("fails to run thread bch_dirty_init[%d]\n", i); + atomic_dec(&state.started); for (--i; i >= 0; i--) kthread_stop(state.infos[i].thread); goto out; } - atomic_inc(&state.started); } out: From f72f4312d4388376fc8a1f6cf37cb21a0d41758b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:01 +0800 Subject: [PATCH 393/560] bcache: replace a mistaken IS_ERR() by IS_ERR_OR_NULL() in btree_gc_coalesce() Commit 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") do the following change inside btree_gc_coalesce(), 31 @@ -1340,7 +1340,7 @@ static int btree_gc_coalesce( 32 memset(new_nodes, 0, sizeof(new_nodes)); 33 closure_init_stack(&cl); 34 35 - while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 36 + while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) 37 keys += r[nodes++].keys; 38 39 blocks = btree_default_blocks(b->c) * 2 / 3; At line 35 the original r[nodes].b is not always allocatored from __bch_btree_node_alloc(), and possibly initialized as NULL pointer by caller of btree_gc_coalesce(). Therefore the change at line 36 is not correct. This patch replaces the mistaken IS_ERR() by IS_ERR_OR_NULL() to avoid potential issue. Fixes: 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") Cc: # 6.5+ Cc: Zheng Wang Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-9-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index de8d552201dc3..79f1fa4a0d551 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1368,7 +1368,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; From 31f5b956a197d4ec25c8a07cb3a2ab69d0c0b82f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:02 +0800 Subject: [PATCH 394/560] bcache: add code comments for bch_btree_node_get() and __bch_btree_node_alloc() This patch adds code comments to bch_btree_node_get() and __bch_btree_node_alloc() that NULL pointer will not be returned and it is unnecessary to check NULL pointer by the callers of these routines. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-10-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 79f1fa4a0d551..de3019972b355 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1000,6 +1000,9 @@ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, * * The btree node will have either a read or a write lock held, depending on * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, @@ -1111,6 +1114,10 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent) From 3eba5e0b2422aec3c9e79822029599961fdcab97 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 20 Nov 2023 13:25:03 +0800 Subject: [PATCH 395/560] bcache: avoid NULL checking to c->root in run_cache_set() In run_cache_set() after c->root returned from bch_btree_node_get(), it is checked by IS_ERR_OR_NULL(). Indeed it is unncessary to check NULL because bch_btree_node_get() will not return NULL pointer to caller. This patch replaces IS_ERR_OR_NULL() by IS_ERR() for the above reason. Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20231120052503.6122-11-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c7ecc7058d77e..bfe1685dbae57 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2018,7 +2018,7 @@ static int run_cache_set(struct cache_set *c) c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; list_del_init(&c->root->list); From 9bb69ba4c177dccaa1f5b5cbdf80b67813328348 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 27 Oct 2023 19:36:51 +0100 Subject: [PATCH 396/560] ACPI: processor_idle: use raw_safe_halt() in acpi_idle_play_dead() Xen HVM guests were observed taking triple-faults when attempting to online a previously offlined vCPU. Investigation showed that the fault was coming from a failing call to lockdep_assert_irqs_disabled(), in load_current_idt() which was too early in the CPU bringup to actually catch the exception and report the failure cleanly. This was a false positive, caused by acpi_idle_play_dead() setting the per-cpu hardirqs_enabled flag by calling safe_halt(). Switch it to use raw_safe_halt() instead, which doesn't do so. Signed-off-by: David Woodhouse Acked-by: Peter Zijlstra (Intel) Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 3a34a8c425fe4..55437f5e0c3ae 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -592,7 +592,7 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) while (1) { if (cx->entry_method == ACPI_CSTATE_HALT) - safe_halt(); + raw_safe_halt(); else if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { io_idle(cx->address); } else From 37ba91a82e3b9de35f64348c62b5ec7d74e3a41c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 12 Nov 2023 21:36:26 +0100 Subject: [PATCH 397/560] ACPI: PM: Add acpi_device_fix_up_power_children() function In some cases it is necessary to fix-up the power-state of an ACPI device's children without touching the ACPI device itself add a new acpi_device_fix_up_power_children() function for this. Signed-off-by: Hans de Goede Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/device_pm.c | 13 +++++++++++++ include/acpi/acpi_bus.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index f007116a84276..3b4d048c49417 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -397,6 +397,19 @@ void acpi_device_fix_up_power_extended(struct acpi_device *adev) } EXPORT_SYMBOL_GPL(acpi_device_fix_up_power_extended); +/** + * acpi_device_fix_up_power_children - Force a device's children into D0. + * @adev: Parent device object whose children's power state is to be fixed up. + * + * Call acpi_device_fix_up_power() for @adev's children so long as they + * are reported as present and enabled. + */ +void acpi_device_fix_up_power_children(struct acpi_device *adev) +{ + acpi_dev_for_each_child(adev, fix_up_power_if_applicable, NULL); +} +EXPORT_SYMBOL_GPL(acpi_device_fix_up_power_children); + int acpi_device_update_power(struct acpi_device *device, int *state_p) { int state; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index afeed6e72049e..1216d72c650fa 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -542,6 +542,7 @@ int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); void acpi_device_fix_up_power_extended(struct acpi_device *adev); +void acpi_device_fix_up_power_children(struct acpi_device *adev); int acpi_bus_update_power(acpi_handle handle, int *state_p); int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); From c93695494606326d7fd72b46a2a657139ccb0dec Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 12 Nov 2023 21:36:27 +0100 Subject: [PATCH 398/560] ACPI: video: Use acpi_device_fix_up_power_children() Commit 89c290ea7589 ("ACPI: video: Put ACPI video and its child devices into D0 on boot") introduced calling acpi_device_fix_up_power_extended() on the video card for which the ACPI video bus is the companion device. This unnecessarily touches the power-state of the GPU itself, while the issue it tries to address only requires calling _PS0 on the child devices. Touching the power-state of the GPU itself is causing suspend / resume issues on e.g. a Lenovo ThinkPad W530. Instead use acpi_device_fix_up_power_children(), which only touches the child devices, to fix this. Fixes: 89c290ea7589 ("ACPI: video: Put ACPI video and its child devices into D0 on boot") Reported-by: Owen T. Heisler Closes: https://lore.kernel.org/regressions/9f36fb06-64c4-4264-aaeb-4e1289e764c4@owenh.net/ Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/273 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218124 Tested-by: Kai-Heng Feng Tested-by: Owen T. Heisler Signed-off-by: Hans de Goede Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 0b7a01f38b65c..d321ca7160d97 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -2031,7 +2031,7 @@ static int acpi_video_bus_add(struct acpi_device *device) * HP ZBook Fury 16 G10 requires ACPI video's child devices have _PS0 * evaluated to have functional panel brightness control. */ - acpi_device_fix_up_power_extended(device); + acpi_device_fix_up_power_children(device); pr_info("%s [%s] (multi-head: %s rom: %s post: %s)\n", ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device), From bd911485294a6f0596e4592ed442438015cffc8a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 15 Nov 2023 19:02:22 +0100 Subject: [PATCH 399/560] ACPI: resource: Skip IRQ override on ASUS ExpertBook B1402CVA Like various other ASUS ExpertBook-s, the ASUS ExpertBook B1402CVA has an ACPI DSDT table that describes IRQ 1 as ActiveLow while the kernel overrides it to EdgeHigh. This prevents the keyboard from working. To fix this issue, add this laptop to the skip_override_table so that the kernel does not override IRQ 1. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218114 Cc: All applicable Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 15a3bdbd0755d..9bd9f79cd4099 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -447,6 +447,13 @@ static const struct dmi_system_id irq1_level_low_skip_override[] = { DMI_MATCH(DMI_BOARD_NAME, "B1402CBA"), }, }, + { + /* Asus ExpertBook B1402CVA */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_BOARD_NAME, "B1402CVA"), + }, + }, { /* Asus ExpertBook B1502CBA */ .matches = { From b85e2dab33ce467e8dcf1cb6c0c587132ff17f56 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 15 Nov 2023 11:47:51 -0500 Subject: [PATCH 400/560] PM: tools: Fix sleepgraph syntax error The sleepgraph tool currently fails: File "/usr/bin/sleepgraph", line 4155 or re.match('psci: CPU(?P[0-9]*) killed.*', msg)): ^ SyntaxError: unmatched ')' Fixes: 34ea427e01ea ("PM: tools: sleepgraph: Recognize "CPU killed" messages") Signed-off-by: David Woodhouse Reviewed-by: Wolfram Sang Signed-off-by: Rafael J. Wysocki --- tools/power/pm-graph/sleepgraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py index 4a356a7067855..40ad221e88811 100755 --- a/tools/power/pm-graph/sleepgraph.py +++ b/tools/power/pm-graph/sleepgraph.py @@ -4151,7 +4151,7 @@ def parseKernelLog(data): elif(re.match('Enabling non-boot CPUs .*', msg)): # start of first cpu resume cpu_start = ktime - elif(re.match('smpboot: CPU (?P[0-9]*) is now offline', msg)) \ + elif(re.match('smpboot: CPU (?P[0-9]*) is now offline', msg) \ or re.match('psci: CPU(?P[0-9]*) killed.*', msg)): # end of a cpu suspend, start of the next m = re.match('smpboot: CPU (?P[0-9]*) is now offline', msg) From 1b59860540a4018e8071dc18d4893ec389506b7d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:14 +0800 Subject: [PATCH 401/560] nbd: fold nbd config initialization into nbd_alloc_config() There are no functional changes, make the code cleaner and prepare to fix null-ptr-dereference while accessing 'nbd->config'. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-2-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 855fdf5c3b4ea..02f844832d912 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1530,17 +1530,20 @@ static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode, return error; } -static struct nbd_config *nbd_alloc_config(void) +static int nbd_alloc_and_init_config(struct nbd_device *nbd) { struct nbd_config *config; + if (WARN_ON(nbd->config)) + return -EINVAL; + if (!try_module_get(THIS_MODULE)) - return ERR_PTR(-ENODEV); + return -ENODEV; config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); if (!config) { module_put(THIS_MODULE); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } atomic_set(&config->recv_threads, 0); @@ -1548,7 +1551,10 @@ static struct nbd_config *nbd_alloc_config(void) init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); - return config; + nbd->config = config; + refcount_set(&nbd->config_refs, 1); + + return 0; } static int nbd_open(struct gendisk *disk, blk_mode_t mode) @@ -1567,21 +1573,17 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) goto out; } if (!refcount_inc_not_zero(&nbd->config_refs)) { - struct nbd_config *config; - mutex_lock(&nbd->config_lock); if (refcount_inc_not_zero(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); goto out; } - config = nbd_alloc_config(); - if (IS_ERR(config)) { - ret = PTR_ERR(config); + ret = nbd_alloc_and_init_config(nbd); + if (ret) { mutex_unlock(&nbd->config_lock); goto out; } - nbd->config = config; - refcount_set(&nbd->config_refs, 1); + refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); if (max_part) @@ -1990,22 +1992,17 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) pr_err("nbd%d already in use\n", index); return -EBUSY; } - if (WARN_ON(nbd->config)) { - mutex_unlock(&nbd->config_lock); - nbd_put(nbd); - return -EINVAL; - } - config = nbd_alloc_config(); - if (IS_ERR(config)) { + + ret = nbd_alloc_and_init_config(nbd); + if (ret) { mutex_unlock(&nbd->config_lock); nbd_put(nbd); pr_err("couldn't allocate config\n"); - return PTR_ERR(config); + return ret; } - nbd->config = config; - refcount_set(&nbd->config_refs, 1); - set_bit(NBD_RT_BOUND, &config->runtime_flags); + config = nbd->config; + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) goto out; From 3123ac77923341774ca3ad1196ad20bb0732bf70 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:15 +0800 Subject: [PATCH 402/560] nbd: factor out a helper to get nbd_config without holding 'config_lock' There are no functional changes, just to make code cleaner and prepare to fix null-ptr-dereference while accessing 'nbd->config'. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-3-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 02f844832d912..daaf8805e876c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -395,6 +395,14 @@ static u32 req_to_nbd_cmd_type(struct request *req) } } +static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd) +{ + if (refcount_inc_not_zero(&nbd->config_refs)) + return nbd->config; + + return NULL; +} + static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); @@ -409,13 +417,13 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) return BLK_EH_DONE; } - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { cmd->status = BLK_STS_TIMEOUT; __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); mutex_unlock(&cmd->lock); goto done; } - config = nbd->config; if (config->num_connections > 1 || (config->num_connections == 1 && nbd->tag_set.timeout)) { @@ -977,12 +985,12 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) struct nbd_sock *nsock; int ret; - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); return -EINVAL; } - config = nbd->config; if (index >= config->num_connections) { dev_err_ratelimited(disk_to_dev(nbd->disk), @@ -1560,6 +1568,7 @@ static int nbd_alloc_and_init_config(struct nbd_device *nbd) static int nbd_open(struct gendisk *disk, blk_mode_t mode) { struct nbd_device *nbd; + struct nbd_config *config; int ret = 0; mutex_lock(&nbd_index_mutex); @@ -1572,7 +1581,9 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) ret = -ENXIO; goto out; } - if (!refcount_inc_not_zero(&nbd->config_refs)) { + + config = nbd_get_config_unlocked(nbd); + if (!config) { mutex_lock(&nbd->config_lock); if (refcount_inc_not_zero(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); @@ -1588,7 +1599,7 @@ static int nbd_open(struct gendisk *disk, blk_mode_t mode) mutex_unlock(&nbd->config_lock); if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); - } else if (nbd_disconnected(nbd->config)) { + } else if (nbd_disconnected(config)) { if (max_part) set_bit(GD_NEED_PART_SCAN, &disk->state); } @@ -2205,7 +2216,8 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } mutex_unlock(&nbd_index_mutex); - if (!refcount_inc_not_zero(&nbd->config_refs)) { + config = nbd_get_config_unlocked(nbd); + if (!config) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); nbd_put(nbd); @@ -2213,7 +2225,6 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) } mutex_lock(&nbd->config_lock); - config = nbd->config; if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->pid) { dev_err(nbd_to_dev(nbd), From c2da049f419417808466c529999170f5c3ef7d3d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 17 Nov 2023 00:23:16 +0800 Subject: [PATCH 403/560] nbd: fix null-ptr-dereference while accessing 'nbd->config' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory reordering may occur in nbd_genl_connect(), causing config_refs to be set to 1 while nbd->config is still empty. Opening nbd at this time will cause null-ptr-dereference. T1 T2 nbd_open nbd_get_config_unlocked nbd_genl_connect nbd_alloc_and_init_config //memory reordered refcount_set(&nbd->config_refs, 1) // 2 nbd->config ->null point nbd->config = config // 1 Fix it by adding smp barrier to guarantee the execution sequence. Signed-off-by: Li Nan Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20231116162316.1740402-4-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index daaf8805e876c..3f03cb3dc33cc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -397,8 +397,16 @@ static u32 req_to_nbd_cmd_type(struct request *req) static struct nbd_config *nbd_get_config_unlocked(struct nbd_device *nbd) { - if (refcount_inc_not_zero(&nbd->config_refs)) + if (refcount_inc_not_zero(&nbd->config_refs)) { + /* + * Add smp_mb__after_atomic to ensure that reading nbd->config_refs + * and reading nbd->config is ordered. The pair is the barrier in + * nbd_alloc_and_init_config(), avoid nbd->config_refs is set + * before nbd->config. + */ + smp_mb__after_atomic(); return nbd->config; + } return NULL; } @@ -1559,7 +1567,15 @@ static int nbd_alloc_and_init_config(struct nbd_device *nbd) init_waitqueue_head(&config->conn_wait); config->blksize_bits = NBD_DEF_BLKSIZE_BITS; atomic_set(&config->live_connections, 0); + nbd->config = config; + /* + * Order refcount_set(&nbd->config_refs, 1) and nbd->config assignment, + * its pair is the barrier in nbd_get_config_unlocked(). + * So nbd_get_config_unlocked() won't see nbd->config as null after + * refcount_inc_not_zero() succeed. + */ + smp_mb__before_atomic(); refcount_set(&nbd->config_refs, 1); return 0; From c96b8175522a2c52e297ee0a49827a668f95e1e8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 20 Nov 2023 16:06:11 +0900 Subject: [PATCH 404/560] block: Remove blk_set_runtime_active() The function blk_set_runtime_active() is called only from blk_post_runtime_resume(), so there is no need for that function to be exported. Open-code this function directly in blk_post_runtime_resume() and remove it. Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20231120070611.33951-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-pm.c | 33 +++++---------------------------- include/linux/blk-pm.h | 1 - 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/block/blk-pm.c b/block/blk-pm.c index 6b72b2e03fc8a..42e8420747153 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -163,38 +163,15 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); * @q: the queue of the device * * Description: - * For historical reasons, this routine merely calls blk_set_runtime_active() - * to do the real work of restarting the queue. It does this regardless of - * whether the device's runtime-resume succeeded; even if it failed the + * Restart the queue of a runtime suspended device. It does this regardless + * of whether the device's runtime-resume succeeded; even if it failed the * driver or error handler will need to communicate with the device. * * This function should be called near the end of the device's - * runtime_resume callback. + * runtime_resume callback to correct queue runtime PM status and re-enable + * peeking requests from the queue. */ void blk_post_runtime_resume(struct request_queue *q) -{ - blk_set_runtime_active(q); -} -EXPORT_SYMBOL(blk_post_runtime_resume); - -/** - * blk_set_runtime_active - Force runtime status of the queue to be active - * @q: the queue of the device - * - * If the device is left runtime suspended during system suspend the resume - * hook typically resumes the device and corrects runtime status - * accordingly. However, that does not affect the queue runtime PM status - * which is still "suspended". This prevents processing requests from the - * queue. - * - * This function can be used in driver's resume hook to correct queue - * runtime PM status and re-enable peeking requests from the queue. It - * should be called before first request is added to the queue. - * - * This function is also called by blk_post_runtime_resume() for - * runtime resumes. It does everything necessary to restart the queue. - */ -void blk_set_runtime_active(struct request_queue *q) { int old_status; @@ -211,4 +188,4 @@ void blk_set_runtime_active(struct request_queue *q) if (old_status != RPM_ACTIVE) blk_clear_pm_only(q); } -EXPORT_SYMBOL(blk_set_runtime_active); +EXPORT_SYMBOL(blk_post_runtime_resume); diff --git a/include/linux/blk-pm.h b/include/linux/blk-pm.h index 2580e05a8ab67..004b38a538ffe 100644 --- a/include/linux/blk-pm.h +++ b/include/linux/blk-pm.h @@ -15,7 +15,6 @@ extern int blk_pre_runtime_suspend(struct request_queue *q); extern void blk_post_runtime_suspend(struct request_queue *q, int err); extern void blk_pre_runtime_resume(struct request_queue *q); extern void blk_post_runtime_resume(struct request_queue *q); -extern void blk_set_runtime_active(struct request_queue *q); #else static inline void blk_pm_runtime_init(struct request_queue *q, struct device *dev) {} From 616add70bfdc0274a253e84fc78155c27aacde91 Mon Sep 17 00:00:00 2001 From: Mark O'Donovan Date: Wed, 11 Oct 2023 08:45:11 +0000 Subject: [PATCH 405/560] nvme-auth: unlock mutex in one place only Signed-off-by: Mark O'Donovan Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 48328e36e93bc..0f5ea63d3c8d2 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -757,12 +757,11 @@ static void nvme_queue_auth_work(struct work_struct *work) __func__, chap->qid); mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); + mutex_unlock(&ctrl->dhchap_auth_mutex); if (ret) { - mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } - mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", From 38ce1570e2c46e7e9af983aa337edd7e43723aa2 Mon Sep 17 00:00:00 2001 From: Mark O'Donovan Date: Wed, 11 Oct 2023 08:45:12 +0000 Subject: [PATCH 406/560] nvme-auth: set explanation code for failure2 msgs Some error cases were not setting an auth-failure-reason-code-explanation. This means an AUTH_Failure2 message will be sent with an explanation value of 0 which is a reserved value. Signed-off-by: Mark O'Donovan Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0f5ea63d3c8d2..72c0525c75f50 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -838,6 +838,8 @@ static void nvme_queue_auth_work(struct work_struct *work) } fail2: + if (chap->status == 0) + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n", __func__, chap->qid, chap->status); tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap); From 23441536b63677cb2ed9b1637d8ca70315e44bd0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 14 Nov 2023 14:18:21 +0100 Subject: [PATCH 407/560] nvme-tcp: only evaluate 'tls' option if TLS is selected We only need to evaluate the 'tls' connect option if TLS is enabled; otherwise we might be getting a link error. Fixes: 706add13676d ("nvme: keyring: fix conditional compilation") Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202311140426.0eHrTXBr-lkp@intel.com/ Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 89661a9cf850d..6ed7948155174 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1916,7 +1916,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (ctrl->opts->tls) { + if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && ctrl->opts->tls) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); else From cd9aed606088d36a7ffff3e808db4e76b1854285 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 14 Nov 2023 14:27:01 +0100 Subject: [PATCH 408/560] nvme: catch errors from nvme_configure_metadata() nvme_configure_metadata() is issuing I/O, so we might incur an I/O error which will cause the connection to be reset. But in that case any further probing will race with reset and cause UAF errors. So return a status from nvme_configure_metadata() and abort probing if there was an I/O error. Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88b54cdcbd683..fd28e6b6574c0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1814,16 +1814,18 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id) return ret; } -static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) +static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; + int ret; - if (nvme_init_ms(ns, id)) - return; + ret = nvme_init_ms(ns, id); + if (ret) + return ret; ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return; + return 0; if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -1832,7 +1834,7 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) * remap the separate metadata buffer from the block layer. */ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return; + return 0; ns->features |= NVME_NS_EXT_LBAS; @@ -1859,6 +1861,7 @@ static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id) else ns->features |= NVME_NS_METADATA_SUPPORTED; } + return 0; } static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, @@ -2032,7 +2035,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->lba_shift = id->lbaf[lbaf].ds; nvme_set_queue_limits(ns->ctrl, ns->queue); - nvme_configure_metadata(ns, id); + ret = nvme_configure_metadata(ns, id); + if (ret < 0) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } nvme_set_chunk_sectors(ns, id); nvme_update_disk_info(ns->disk, ns, id); From c7ca9757bda35ff9ce27ab42f2cb8b84d983e6ad Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 16 Nov 2023 13:14:35 +0100 Subject: [PATCH 409/560] nvme: blank out authentication fabrics options if not configured If the config option NVME_HOST_AUTH is not selected we should not accept the corresponding fabrics options. This allows userspace to detect if NVMe authentication has been enabled for the kernel. Cc: Shin'ichiro Kawasaki Fixes: f50fff73d620 ("nvme: implement In-Band authentication") Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Daniel Wagner Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4673ead69c5f9..aa88606a44c40 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -667,8 +667,10 @@ static const match_table_t opt_tokens = { #endif { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_DISCOVERY, "discovery" }, +#ifdef CONFIG_NVME_HOST_AUTH { NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" }, { NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" }, +#endif #ifdef CONFIG_NVME_TCP_TLS { NVMF_OPT_TLS, "tls" }, #endif From 1c22e0295a5eb571c27b53c7371f95699ef705ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Nov 2023 08:13:36 -0500 Subject: [PATCH 410/560] nvmet: nul-terminate the NQNs passed in the connect command The host and subsystem NQNs are passed in the connect command payload and interpreted as nul-terminated strings. Ensure they actually are nul-terminated before using them. Fixes: a07b4970f464 "nvmet: add a generic NVMe target") Reported-by: Alon Zahavi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 43b5bd8bb6a52..d8da840a1c0ed 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -244,6 +244,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) goto out; } + d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; + d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); if (status) @@ -313,6 +315,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out; } + d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; + d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, le16_to_cpu(d->cntlid), req); if (!ctrl) { From 11b9d0b4999705105d1fb7f0e8ac969e0f41b1b8 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 20 Oct 2023 07:06:06 +0200 Subject: [PATCH 411/560] nvmet-tcp: always initialize tls_handshake_tmo_work The TLS handshake timeout work item should always be initialized to avoid a crash when cancelling the workqueue. Fixes: 675b453e0241 ("nvmet-tcp: enable TLS handshake upcall") Suggested-by: Maurizio Lombardi Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Tested-by: Yi Zhang Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 92b74d0b8686a..4cc27856aa8fe 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1854,6 +1854,8 @@ static int nvmet_tcp_tls_handshake(struct nvmet_tcp_queue *queue) } return ret; } +#else +static void nvmet_tcp_tls_handshake_timeout(struct work_struct *w) {} #endif static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, @@ -1911,9 +1913,9 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); mutex_unlock(&nvmet_tcp_queue_mutex); -#ifdef CONFIG_NVME_TARGET_TCP_TLS INIT_DELAYED_WORK(&queue->tls_handshake_tmo_work, nvmet_tcp_tls_handshake_timeout); +#ifdef CONFIG_NVME_TARGET_TCP_TLS if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { struct sock *sk = queue->sock->sk; From 53f2bca2609237f910531f2c1a7779b16ce7952d Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 20 Nov 2023 03:25:21 +0000 Subject: [PATCH 412/560] block/null_blk: Fix double blk_mq_start_request() warning When CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION is enabled, null_queue_rq() would return BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE for the request, which has been marked as MQ_RQ_IN_FLIGHT by blk_mq_start_request(). Then null_queue_rqs() put these requests in the rqlist, return back to the block layer core, which would try to queue them individually again, so the warning in blk_mq_start_request() triggered. Fix it by splitting the null_queue_rq() into two parts: the first is the preparation of request, the second is the handling of request. We put the blk_mq_start_request() after the preparation part, which may fail and return back to the block layer core. The throttling also belongs to the preparation part, so move it before blk_mq_start_request(). And change the return type of null_handle_cmd() to void, since it always return BLK_STS_OK now. Reported-by: Closes: https://lore.kernel.org/all/0000000000000e6aac06098aee0c@google.com/ Fixes: d78bfa1346ab ("block/null_blk: add queue_rqs() support") Suggested-by: Bart Van Assche Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/r/20231120032521.1012037-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 22a3cf7f32e23..3021d58ca51c1 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1464,19 +1464,13 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, return BLK_STS_OK; } -static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_op op) +static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, + sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts; - if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { - sts = null_handle_throttled(cmd); - if (sts != BLK_STS_OK) - return sts; - } - if (op == REQ_OP_FLUSH) { cmd->error = errno_to_blk_status(null_handle_flush(nullb)); goto out; @@ -1493,7 +1487,6 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, out: nullb_complete_cmd(cmd); - return BLK_STS_OK; } static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) @@ -1724,8 +1717,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->fake_timeout = should_timeout_request(rq) || blk_should_fake_timeout(rq->q); - blk_mq_start_request(rq); - if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the @@ -1738,6 +1729,15 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } + if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) { + blk_status_t sts = null_handle_throttled(cmd); + + if (sts != BLK_STS_OK) + return sts; + } + + blk_mq_start_request(rq); + if (is_poll) { spin_lock(&nq->poll_lock); list_add_tail(&rq->queuelist, &nq->poll_list); @@ -1747,7 +1747,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, if (cmd->fake_timeout) return BLK_STS_OK; - return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); + null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); + return BLK_STS_OK; } static void null_queue_rqs(struct request **rqlist) From 79e0c5be8c73a674c92bd4ba77b75f4f8c91d32e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:13 +0100 Subject: [PATCH 413/560] net, vrf: Move dstats structure to core Just move struct pcpu_dstats out of the vrf into the core, and streamline the field names slightly, so they better align with the {t,l}stats ones. No functional change otherwise. A conversion of the u64s to u64_stats_t could be done at a separate point in future. This move is needed as we are moving the {t,l,d}stats allocation/freeing to the core. Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Cc: Jakub Kicinski Cc: David Ahern Link: https://lore.kernel.org/r/20231114004220.6495-2-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/vrf.c | 24 +++++++----------------- include/linux/netdevice.h | 10 ++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index db766941b78f6..3e6e0fdc3ba7f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -121,22 +121,12 @@ struct net_vrf { int ifindex; }; -struct pcpu_dstats { - u64 tx_pkts; - u64 tx_bytes; - u64 tx_drps; - u64 rx_pkts; - u64 rx_bytes; - u64 rx_drps; - struct u64_stats_sync syncp; -}; - static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); - dstats->rx_pkts++; + dstats->rx_packets++; dstats->rx_bytes += len; u64_stats_update_end(&dstats->syncp); } @@ -161,10 +151,10 @@ static void vrf_get_stats64(struct net_device *dev, do { start = u64_stats_fetch_begin(&dstats->syncp); tbytes = dstats->tx_bytes; - tpkts = dstats->tx_pkts; - tdrops = dstats->tx_drps; + tpkts = dstats->tx_packets; + tdrops = dstats->tx_drops; rbytes = dstats->rx_bytes; - rpkts = dstats->rx_pkts; + rpkts = dstats->rx_packets; } while (u64_stats_fetch_retry(&dstats->syncp, start)); stats->tx_bytes += tbytes; stats->tx_packets += tpkts; @@ -421,7 +411,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) vrf_rx_stats(dev, len); else - this_cpu_inc(dev->dstats->rx_drps); + this_cpu_inc(dev->dstats->rx_drops); return NETDEV_TX_OK; } @@ -616,11 +606,11 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); - dstats->tx_pkts++; + dstats->tx_packets++; dstats->tx_bytes += len; u64_stats_update_end(&dstats->syncp); } else { - this_cpu_inc(dev->dstats->tx_drps); + this_cpu_inc(dev->dstats->tx_drops); } return ret; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a16c9cc063fe0..98082113156e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2755,6 +2755,16 @@ struct pcpu_sw_netstats { struct u64_stats_sync syncp; } __aligned(4 * sizeof(u64)); +struct pcpu_dstats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + struct u64_stats_sync syncp; +} __aligned(8 * sizeof(u64)); + struct pcpu_lstats { u64_stats_t packets; u64_stats_t bytes; From 34d21de99cea9cb17967874313e5b0262527833c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:14 +0100 Subject: [PATCH 414/560] net: Move {l,t,d}stats allocation to core and convert veth & vrf Move {l,t,d}stats allocation to the core and let netdevs pick the stats type they need. That way the driver doesn't have to bother with error handling (allocation failure checking, making sure free happens in the right spot, etc) - all happening in the core. Co-developed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Cc: David Ahern Link: https://lore.kernel.org/r/20231114004220.6495-3-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/veth.c | 16 ++----------- drivers/net/vrf.c | 14 +++-------- include/linux/netdevice.h | 20 ++++++++++++---- net/core/dev.c | 49 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 69 insertions(+), 30 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 9980517ed8b0d..ac030c241d1a5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1506,25 +1506,12 @@ static void veth_free_queues(struct net_device *dev) static int veth_dev_init(struct net_device *dev) { - int err; - - dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); - if (!dev->lstats) - return -ENOMEM; - - err = veth_alloc_queues(dev); - if (err) { - free_percpu(dev->lstats); - return err; - } - - return 0; + return veth_alloc_queues(dev); } static void veth_dev_free(struct net_device *dev) { veth_free_queues(dev); - free_percpu(dev->lstats); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -1796,6 +1783,7 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3e6e0fdc3ba7f..bb95ce43cd97d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1164,22 +1164,15 @@ static void vrf_dev_uninit(struct net_device *dev) vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); - - free_percpu(dev->dstats); - dev->dstats = NULL; } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); - dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); - if (!dev->dstats) - goto out_nomem; - /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) - goto out_stats; + goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; @@ -1193,9 +1186,6 @@ static int vrf_dev_init(struct net_device *dev) out_rth: vrf_rtable_release(dev, vrf); -out_stats: - free_percpu(dev->dstats); - dev->dstats = NULL; out_nomem: return -ENOMEM; } @@ -1694,6 +1684,8 @@ static void vrf_setup(struct net_device *dev) dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 98082113156e5..2564e209465ea 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1797,6 +1797,13 @@ enum netdev_ml_priv_type { ML_PRIV_CAN, }; +enum netdev_stat_type { + NETDEV_PCPU_STAT_NONE, + NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ + NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ + NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ +}; + /** * struct net_device - The DEVICE structure. * @@ -1991,10 +1998,14 @@ enum netdev_ml_priv_type { * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type - * @lstats: Loopback statistics - * @tstats: Tunnel statistics - * @dstats: Dummy statistics - * @vstats: Virtual ethernet statistics + * + * @pcpu_stat_type: Type of device statistics which the core should + * allocate/free: none, lstats, tstats, dstats. none + * means the driver is handling statistics allocation/ + * freeing internally. + * @lstats: Loopback statistics: packets, bytes + * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes + * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP @@ -2354,6 +2365,7 @@ struct net_device { void *ml_priv; enum netdev_ml_priv_type ml_priv_type; + enum netdev_stat_type pcpu_stat_type:8; union { struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; diff --git a/net/core/dev.c b/net/core/dev.c index af53f6d838ce9..0cc6e283edba9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10051,6 +10051,46 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); +static int netdev_do_alloc_pcpu_stats(struct net_device *dev) +{ + void __percpu *v; + + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return 0; + case NETDEV_PCPU_STAT_LSTATS: + v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); + break; + default: + return -EINVAL; + } + + return v ? 0 : -ENOMEM; +} + +static void netdev_do_free_pcpu_stats(struct net_device *dev) +{ + switch (dev->pcpu_stat_type) { + case NETDEV_PCPU_STAT_NONE: + return; + case NETDEV_PCPU_STAT_LSTATS: + free_percpu(dev->lstats); + break; + case NETDEV_PCPU_STAT_TSTATS: + free_percpu(dev->tstats); + break; + case NETDEV_PCPU_STAT_DSTATS: + free_percpu(dev->dstats); + break; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10111,9 +10151,13 @@ int register_netdevice(struct net_device *dev) goto err_uninit; } + ret = netdev_do_alloc_pcpu_stats(dev); + if (ret) + goto err_uninit; + ret = dev_index_reserve(net, dev->ifindex); if (ret < 0) - goto err_uninit; + goto err_free_pcpu; dev->ifindex = ret; /* Transfer changeable features to wanted_features and enable @@ -10219,6 +10263,8 @@ int register_netdevice(struct net_device *dev) call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_ifindex_release: dev_index_release(net, dev->ifindex); +err_free_pcpu: + netdev_do_free_pcpu_stats(dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10471,6 +10517,7 @@ void netdev_run_todo(void) WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + netdev_do_free_pcpu_stats(dev); if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) From ae1658272c6491a31ac968e39882fc569f312ac3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:15 +0100 Subject: [PATCH 415/560] netkit: Add tstats per-CPU traffic counters Add dev->tstats traffic accounting to netkit. The latter contains per-CPU RX and TX counters. The dev's TX counters are bumped upon pass/unspec as well as redirect verdicts, in other words, on everything except for drops. The dev's RX counters are bumped upon successful __netif_rx(), as well as from skb_do_redirect() (not part of this commit here). Using dev->lstats with having just a single packets/bytes counter and inferring one another's RX counters from the peer dev's lstats is not possible given skb_do_redirect() can also bump the device's stats. Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-4-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 5a0f86f38f093..99de11f9cde5c 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -68,6 +68,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_t ret_dev = NET_XMIT_SUCCESS; const struct bpf_mprog_entry *entry; struct net_device *peer; + int len = skb->len; rcu_read_lock(); peer = rcu_dereference(nk->peer); @@ -85,15 +86,22 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) case NETKIT_PASS: skb->protocol = eth_type_trans(skb, skb->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - __netif_rx(skb); + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { + dev_sw_netstats_tx_add(dev, 1, len); + dev_sw_netstats_rx_add(peer, len); + } else { + goto drop_stats; + } break; case NETKIT_REDIRECT: + dev_sw_netstats_tx_add(dev, 1, len); skb_do_redirect(skb); break; case NETKIT_DROP: default: drop: kfree_skb(skb); +drop_stats: dev_core_stats_tx_dropped_inc(dev); ret_dev = NET_XMIT_DROP; break; @@ -174,6 +182,13 @@ static struct net_device *netkit_peer_dev(struct net_device *dev) return rcu_dereference(netkit_priv(dev)->peer); } +static void netkit_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *stats) +{ + dev_fetch_sw_netstats(stats, dev->tstats); + stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); +} + static void netkit_uninit(struct net_device *dev); static const struct net_device_ops netkit_netdev_ops = { @@ -184,6 +199,7 @@ static const struct net_device_ops netkit_netdev_ops = { .ndo_set_rx_headroom = netkit_set_headroom, .ndo_get_iflink = netkit_get_iflink, .ndo_get_peer_dev = netkit_peer_dev, + .ndo_get_stats64 = netkit_get_stats, .ndo_uninit = netkit_uninit, .ndo_features_check = passthru_features_check, }; @@ -218,6 +234,7 @@ static void netkit_setup(struct net_device *dev) ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->flags |= IFF_NOARP; dev->priv_flags &= ~IFF_TX_SKB_SHARING; From 6f2684bf2b4460c84d0d34612a939f78b96b03fc Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 14 Nov 2023 01:42:16 +0100 Subject: [PATCH 416/560] veth: Use tstats per-CPU traffic counters Currently veth devices use the lstats per-CPU traffic counters, which only cover TX traffic. veth_get_stats64() actually populates RX stats of a veth device from its peer's TX counters, based on the assumption that a veth device can _only_ receive packets from its peer, which is no longer true: For example, recent CNIs (like Cilium) can use the bpf_redirect_peer() BPF helper to redirect traffic from NIC's tc ingress to veth's tc ingress (in a different netns), skipping veth's peer device. Unfortunately, this kind of traffic isn't currently accounted for in veth's RX stats. In preparation for the fix, use tstats (instead of lstats) to maintain both RX and TX counters for each veth device. We'll use RX counters for bpf_redirect_peer() traffic, and keep using TX counters for the usual "peer-to-peer" traffic. In veth_get_stats64(), calculate RX stats by _adding_ RX count to peer's TX count, in order to cover both kinds of traffic. veth_stats_rx() might need a name change (perhaps to "veth_stats_xdp()") for less confusion, but let's leave it to another patch to keep the fix minimal. Signed-off-by: Peilin Ye Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-5-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/veth.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ac030c241d1a5..6cc352296c672 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -373,7 +373,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) skb_tx_timestamp(skb); if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { if (!use_napi) - dev_lstats_add(dev, length); + dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); } else { @@ -387,14 +387,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) return ret; } -static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes) -{ - struct veth_priv *priv = netdev_priv(dev); - - dev_lstats_read(dev, packets, bytes); - return atomic64_read(&priv->dropped); -} - static void veth_stats_rx(struct veth_stats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -432,24 +424,24 @@ static void veth_get_stats64(struct net_device *dev, struct veth_priv *priv = netdev_priv(dev); struct net_device *peer; struct veth_stats rx; - u64 packets, bytes; - tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes); - tot->tx_bytes = bytes; - tot->tx_packets = packets; + tot->tx_dropped = atomic64_read(&priv->dropped); + dev_fetch_sw_netstats(tot, dev->tstats); veth_stats_rx(&rx, dev); tot->tx_dropped += rx.xdp_tx_err; tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err; - tot->rx_bytes = rx.xdp_bytes; - tot->rx_packets = rx.xdp_packets; + tot->rx_bytes += rx.xdp_bytes; + tot->rx_packets += rx.xdp_packets; rcu_read_lock(); peer = rcu_dereference(priv->peer); if (peer) { - veth_stats_tx(peer, &packets, &bytes); - tot->rx_bytes += bytes; - tot->rx_packets += packets; + struct rtnl_link_stats64 tot_peer = {}; + + dev_fetch_sw_netstats(&tot_peer, peer->tstats); + tot->rx_bytes += tot_peer.tx_bytes; + tot->rx_packets += tot_peer.tx_packets; veth_stats_rx(&rx, peer); tot->tx_dropped += rx.peer_tq_xdp_xmit_err; @@ -1783,7 +1775,7 @@ static void veth_setup(struct net_device *dev) NETIF_F_HW_VLAN_STAG_RX); dev->needs_free_netdev = true; dev->priv_destructor = veth_dev_free; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->max_mtu = ETH_MAX_MTU; dev->hw_features = VETH_FEATURES; From 024ee930cb3c9ae49e4266aee89cfde0ebb407e1 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 14 Nov 2023 01:42:17 +0100 Subject: [PATCH 417/560] bpf: Fix dev's rx stats for bpf_redirect_peer traffic Traffic redirected by bpf_redirect_peer() (used by recent CNIs like Cilium) is not accounted for in the RX stats of supported devices (that is, veth and netkit), confusing user space metrics collectors such as cAdvisor [0], as reported by Youlun. Fix it by calling dev_sw_netstats_rx_add() in skb_do_redirect(), to update RX traffic counters. Devices that support ndo_get_peer_dev _must_ use the @tstats per-CPU counters (instead of @lstats, or @dstats). To make this more fool-proof, error out when ndo_get_peer_dev is set but @tstats are not selected. [0] Specifically, the "container_network_receive_{byte,packet}s_total" counters are affected. Fixes: 9aa1206e8f48 ("bpf: Add redirect_peer helper") Reported-by: Youlun Zhang Signed-off-by: Peilin Ye Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-6-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- net/core/dev.c | 8 ++++++++ net/core/filter.c | 1 + 2 files changed, 9 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 0cc6e283edba9..c879246be48d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10055,6 +10055,14 @@ static int netdev_do_alloc_pcpu_stats(struct net_device *dev) { void __percpu *v; + /* Drivers implementing ndo_get_peer_dev must support tstat + * accounting, so that skb_do_redirect() can bump the dev's + * RX stats upon network namespace switch. + */ + if (dev->netdev_ops->ndo_get_peer_dev && + dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS) + return -EOPNOTSUPP; + switch (dev->pcpu_stat_type) { case NETDEV_PCPU_STAT_NONE: return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 383f96b0a1c78..cca810987c8dc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2492,6 +2492,7 @@ int skb_do_redirect(struct sk_buff *skb) net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; + dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? From 2c225425704078282e152ba692649237f78b3d7a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:18 +0100 Subject: [PATCH 418/560] bpf, netkit: Add indirect call wrapper for fetching peer dev ndo_get_peer_dev is used in tcx BPF fast path, therefore make use of indirect call wrapper and therefore optimize the bpf_redirect_peer() internal handling a bit. Add a small skb_get_peer_dev() wrapper which utilizes the INDIRECT_CALL_1() macro instead of open coding. Future work could potentially add a peer pointer directly into struct net_device in future and convert veth and netkit over to use it so that eventually ndo_get_peer_dev can be removed. Co-developed-by: Nikolay Aleksandrov Signed-off-by: Nikolay Aleksandrov Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231114004220.6495-7-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 3 ++- include/net/netkit.h | 6 ++++++ net/core/filter.c | 18 +++++++++++++----- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 99de11f9cde5c..97bd6705c2411 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -177,7 +178,7 @@ static void netkit_set_headroom(struct net_device *dev, int headroom) rcu_read_unlock(); } -static struct net_device *netkit_peer_dev(struct net_device *dev) +INDIRECT_CALLABLE_SCOPE struct net_device *netkit_peer_dev(struct net_device *dev) { return rcu_dereference(netkit_priv(dev)->peer); } diff --git a/include/net/netkit.h b/include/net/netkit.h index 0ba2e6b847ca5..9ec0163739f45 100644 --- a/include/net/netkit.h +++ b/include/net/netkit.h @@ -10,6 +10,7 @@ int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog); int netkit_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +INDIRECT_CALLABLE_DECLARE(struct net_device *netkit_peer_dev(struct net_device *dev)); #else static inline int netkit_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -34,5 +35,10 @@ static inline int netkit_prog_query(const union bpf_attr *attr, { return -EINVAL; } + +static inline struct net_device *netkit_peer_dev(struct net_device *dev) +{ + return NULL; +} #endif /* CONFIG_NETKIT */ #endif /* __NET_NETKIT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index cca810987c8dc..7e4d7c3bcc849 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include "dev.h" @@ -2468,6 +2469,16 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); +static struct net_device *skb_get_peer_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (likely(ops->ndo_get_peer_dev)) + return INDIRECT_CALL_1(ops->ndo_get_peer_dev, + netkit_peer_dev, dev); + return NULL; +} + int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -2481,12 +2492,9 @@ int skb_do_redirect(struct sk_buff *skb) if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (unlikely(!ops->ndo_get_peer_dev || - !skb_at_tc_ingress(skb))) + if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; - dev = ops->ndo_get_peer_dev(dev); + dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) From eee82da79f036bb49ff80d3088b9530e3c2e57eb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:19 +0100 Subject: [PATCH 419/560] selftests/bpf: De-veth-ize the tc_redirect test case No functional changes to the test case, but just renaming various functions, variables, etc, to remove veth part of their name for making it more generic and reusable later on (e.g. for netkit). Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-8-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/tc_redirect.c | 263 +++++++++--------- 1 file changed, 137 insertions(+), 126 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 6ee22c3b251ad..407ff4e9bc788 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -110,11 +110,16 @@ static void netns_setup_namespaces_nofail(const char *verb) } } +enum dev_mode { + MODE_VETH, +}; + struct netns_setup_result { - int ifindex_veth_src; - int ifindex_veth_src_fwd; - int ifindex_veth_dst; - int ifindex_veth_dst_fwd; + enum dev_mode dev_mode; + int ifindex_src; + int ifindex_src_fwd; + int ifindex_dst; + int ifindex_dst_fwd; }; static int get_ifaddr(const char *name, char *ifaddr) @@ -140,55 +145,59 @@ static int get_ifaddr(const char *name, char *ifaddr) static int netns_setup_links_and_routes(struct netns_setup_result *result) { struct nstoken *nstoken = NULL; - char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {}; + char src_fwd_addr[IFADDR_STR_LEN+1] = {}; - SYS(fail, "ip link add veth_src type veth peer name veth_src_fwd"); - SYS(fail, "ip link add veth_dst type veth peer name veth_dst_fwd"); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip link add src type veth peer name src_fwd"); + SYS(fail, "ip link add dst type veth peer name dst_fwd"); - SYS(fail, "ip link set veth_dst_fwd address " MAC_DST_FWD); - SYS(fail, "ip link set veth_dst address " MAC_DST); + SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD); + SYS(fail, "ip link set dst address " MAC_DST); + } - if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr)) + if (get_ifaddr("src_fwd", src_fwd_addr)) goto fail; - result->ifindex_veth_src = if_nametoindex("veth_src"); - if (!ASSERT_GT(result->ifindex_veth_src, 0, "ifindex_veth_src")) + result->ifindex_src = if_nametoindex("src"); + if (!ASSERT_GT(result->ifindex_src, 0, "ifindex_src")) goto fail; - result->ifindex_veth_src_fwd = if_nametoindex("veth_src_fwd"); - if (!ASSERT_GT(result->ifindex_veth_src_fwd, 0, "ifindex_veth_src_fwd")) + result->ifindex_src_fwd = if_nametoindex("src_fwd"); + if (!ASSERT_GT(result->ifindex_src_fwd, 0, "ifindex_src_fwd")) goto fail; - result->ifindex_veth_dst = if_nametoindex("veth_dst"); - if (!ASSERT_GT(result->ifindex_veth_dst, 0, "ifindex_veth_dst")) + result->ifindex_dst = if_nametoindex("dst"); + if (!ASSERT_GT(result->ifindex_dst, 0, "ifindex_dst")) goto fail; - result->ifindex_veth_dst_fwd = if_nametoindex("veth_dst_fwd"); - if (!ASSERT_GT(result->ifindex_veth_dst_fwd, 0, "ifindex_veth_dst_fwd")) + result->ifindex_dst_fwd = if_nametoindex("dst_fwd"); + if (!ASSERT_GT(result->ifindex_dst_fwd, 0, "ifindex_dst_fwd")) goto fail; - SYS(fail, "ip link set veth_src netns " NS_SRC); - SYS(fail, "ip link set veth_src_fwd netns " NS_FWD); - SYS(fail, "ip link set veth_dst_fwd netns " NS_FWD); - SYS(fail, "ip link set veth_dst netns " NS_DST); + SYS(fail, "ip link set src netns " NS_SRC); + SYS(fail, "ip link set src_fwd netns " NS_FWD); + SYS(fail, "ip link set dst_fwd netns " NS_FWD); + SYS(fail, "ip link set dst netns " NS_DST); /** setup in 'src' namespace */ nstoken = open_netns(NS_SRC); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto fail; - SYS(fail, "ip addr add " IP4_SRC "/32 dev veth_src"); - SYS(fail, "ip addr add " IP6_SRC "/128 dev veth_src nodad"); - SYS(fail, "ip link set dev veth_src up"); + SYS(fail, "ip addr add " IP4_SRC "/32 dev src"); + SYS(fail, "ip addr add " IP6_SRC "/128 dev src nodad"); + SYS(fail, "ip link set dev src up"); - SYS(fail, "ip route add " IP4_DST "/32 dev veth_src scope global"); - SYS(fail, "ip route add " IP4_NET "/16 dev veth_src scope global"); - SYS(fail, "ip route add " IP6_DST "/128 dev veth_src scope global"); + SYS(fail, "ip route add " IP4_DST "/32 dev src scope global"); + SYS(fail, "ip route add " IP4_NET "/16 dev src scope global"); + SYS(fail, "ip route add " IP6_DST "/128 dev src scope global"); - SYS(fail, "ip neigh add " IP4_DST " dev veth_src lladdr %s", - veth_src_fwd_addr); - SYS(fail, "ip neigh add " IP6_DST " dev veth_src lladdr %s", - veth_src_fwd_addr); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip neigh add " IP4_DST " dev src lladdr %s", + src_fwd_addr); + SYS(fail, "ip neigh add " IP6_DST " dev src lladdr %s", + src_fwd_addr); + } close_netns(nstoken); @@ -201,15 +210,15 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) * needs v4 one in order to start ARP probing. IP4_NET route is added * to the endpoints so that the ARP processing will reply. */ - SYS(fail, "ip addr add " IP4_SLL "/32 dev veth_src_fwd"); - SYS(fail, "ip addr add " IP4_DLL "/32 dev veth_dst_fwd"); - SYS(fail, "ip link set dev veth_src_fwd up"); - SYS(fail, "ip link set dev veth_dst_fwd up"); + SYS(fail, "ip addr add " IP4_SLL "/32 dev src_fwd"); + SYS(fail, "ip addr add " IP4_DLL "/32 dev dst_fwd"); + SYS(fail, "ip link set dev src_fwd up"); + SYS(fail, "ip link set dev dst_fwd up"); - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_src_fwd scope global"); - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_src_fwd scope global"); - SYS(fail, "ip route add " IP4_DST "/32 dev veth_dst_fwd scope global"); - SYS(fail, "ip route add " IP6_DST "/128 dev veth_dst_fwd scope global"); + SYS(fail, "ip route add " IP4_SRC "/32 dev src_fwd scope global"); + SYS(fail, "ip route add " IP6_SRC "/128 dev src_fwd scope global"); + SYS(fail, "ip route add " IP4_DST "/32 dev dst_fwd scope global"); + SYS(fail, "ip route add " IP6_DST "/128 dev dst_fwd scope global"); close_netns(nstoken); @@ -218,16 +227,18 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) if (!ASSERT_OK_PTR(nstoken, "setns dst")) goto fail; - SYS(fail, "ip addr add " IP4_DST "/32 dev veth_dst"); - SYS(fail, "ip addr add " IP6_DST "/128 dev veth_dst nodad"); - SYS(fail, "ip link set dev veth_dst up"); + SYS(fail, "ip addr add " IP4_DST "/32 dev dst"); + SYS(fail, "ip addr add " IP6_DST "/128 dev dst nodad"); + SYS(fail, "ip link set dev dst up"); - SYS(fail, "ip route add " IP4_SRC "/32 dev veth_dst scope global"); - SYS(fail, "ip route add " IP4_NET "/16 dev veth_dst scope global"); - SYS(fail, "ip route add " IP6_SRC "/128 dev veth_dst scope global"); + SYS(fail, "ip route add " IP4_SRC "/32 dev dst scope global"); + SYS(fail, "ip route add " IP4_NET "/16 dev dst scope global"); + SYS(fail, "ip route add " IP6_SRC "/128 dev dst scope global"); - SYS(fail, "ip neigh add " IP4_SRC " dev veth_dst lladdr " MAC_DST_FWD); - SYS(fail, "ip neigh add " IP6_SRC " dev veth_dst lladdr " MAC_DST_FWD); + if (result->dev_mode == MODE_VETH) { + SYS(fail, "ip neigh add " IP4_SRC " dev dst lladdr " MAC_DST_FWD); + SYS(fail, "ip neigh add " IP6_SRC " dev dst lladdr " MAC_DST_FWD); + } close_netns(nstoken); @@ -293,23 +304,23 @@ static int netns_load_bpf(const struct bpf_program *src_prog, const struct bpf_program *chk_prog, const struct netns_setup_result *setup_result) { - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); int err; - /* tc qdisc add dev veth_src_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); - /* tc filter add dev veth_src_fwd ingress bpf da src_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, src_prog, 0); - /* tc filter add dev veth_src_fwd egress bpf da chk_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, chk_prog, 0); + /* tc qdisc add dev src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); + /* tc filter add dev src_fwd ingress bpf da src_prog */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, src_prog, 0); + /* tc filter add dev src_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, chk_prog, 0); - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress bpf da dst_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); - /* tc filter add dev veth_dst_fwd egress bpf da chk_prog */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress bpf da dst_prog */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, dst_prog, 0); + /* tc filter add dev dst_fwd egress bpf da chk_prog */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, chk_prog, 0); return 0; fail: @@ -539,10 +550,10 @@ static void test_inet_dtime(int family, int type, const char *addr, __u16 port) static int netns_load_dtime_bpf(struct test_tc_dtime *skel, const struct netns_setup_result *setup_result) { - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_src); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_src); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst); struct nstoken *nstoken; int err; @@ -550,58 +561,58 @@ static int netns_load_dtime_bpf(struct test_tc_dtime *skel, nstoken = open_netns(NS_SRC); if (!ASSERT_OK_PTR(nstoken, "setns " NS_SRC)) return -1; - /* tc qdisc add dev veth_src clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src, setup_result->ifindex_veth_src); - /* tc filter add dev veth_src ingress bpf da ingress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); - /* tc filter add dev veth_src egress bpf da egress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); + /* tc qdisc add dev src clsact */ + QDISC_CLSACT_CREATE(&qdisc_src, setup_result->ifindex_src); + /* tc filter add dev src ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev src egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_src, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_dst tc progs */ nstoken = open_netns(NS_DST); if (!ASSERT_OK_PTR(nstoken, "setns " NS_DST)) return -1; - /* tc qdisc add dev veth_dst clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst, setup_result->ifindex_veth_dst); - /* tc filter add dev veth_dst ingress bpf da ingress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); - /* tc filter add dev veth_dst egress bpf da egress_host */ - XGRESS_FILTER_ADD(&qdisc_veth_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); + /* tc qdisc add dev dst clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst, setup_result->ifindex_dst); + /* tc filter add dev dst ingress bpf da ingress_host */ + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_INGRESS, skel->progs.ingress_host, 0); + /* tc filter add dev dst egress bpf da egress_host */ + XGRESS_FILTER_ADD(&qdisc_dst, BPF_TC_EGRESS, skel->progs.egress_host, 0); close_netns(nstoken); /* setup ns_fwd tc progs */ nstoken = open_netns(NS_FWD); if (!ASSERT_OK_PTR(nstoken, "setns " NS_FWD)) return -1; - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio100, 100); - /* tc filter add dev veth_dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, + /* tc filter add dev dst_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio101, 101); - /* tc filter add dev veth_dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + /* tc filter add dev dst_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio100, 100); - /* tc filter add dev veth_dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, + /* tc filter add dev dst_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio101, 101); - /* tc qdisc add dev veth_src_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_src_fwd, setup_result->ifindex_veth_src_fwd); - /* tc filter add dev veth_src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + /* tc qdisc add dev src_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_src_fwd, setup_result->ifindex_src_fwd); + /* tc filter add dev src_fwd ingress prio 100 bpf da ingress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio100, 100); - /* tc filter add dev veth_src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_INGRESS, + /* tc filter add dev src_fwd ingress prio 101 bpf da ingress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_INGRESS, skel->progs.ingress_fwdns_prio101, 101); - /* tc filter add dev veth_src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + /* tc filter add dev src_fwd egress prio 100 bpf da egress_fwdns_prio100 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio100, 100); - /* tc filter add dev veth_src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ - XGRESS_FILTER_ADD(&qdisc_veth_src_fwd, BPF_TC_EGRESS, + /* tc filter add dev src_fwd egress prio 101 bpf da egress_fwdns_prio101 */ + XGRESS_FILTER_ADD(&qdisc_src_fwd, BPF_TC_EGRESS, skel->progs.egress_fwdns_prio101, 101); close_netns(nstoken); return 0; @@ -777,8 +788,8 @@ static void test_tc_redirect_dtime(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_dtime__open")) return; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_dtime__load(skel); if (!ASSERT_OK(err, "test_tc_dtime__load")) @@ -868,8 +879,8 @@ static void test_tc_redirect_neigh(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open")) goto done; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_neigh__load(skel); if (!ASSERT_OK(err, "test_tc_neigh__load")) @@ -904,8 +915,8 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result) if (!ASSERT_OK_PTR(skel, "test_tc_peer__open")) goto done; - skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_peer__load(skel); if (!ASSERT_OK(err, "test_tc_peer__load")) @@ -996,7 +1007,7 @@ static int tun_relay_loop(int src_fd, int target_fd) static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) { LIBBPF_OPTS(bpf_tc_hook, qdisc_tun_fwd); - LIBBPF_OPTS(bpf_tc_hook, qdisc_veth_dst_fwd); + LIBBPF_OPTS(bpf_tc_hook, qdisc_dst_fwd); struct test_tc_peer *skel = NULL; struct nstoken *nstoken = NULL; int err; @@ -1045,7 +1056,7 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) goto fail; skel->rodata->IFINDEX_SRC = ifindex; - skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd; + skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd; err = test_tc_peer__load(skel); if (!ASSERT_OK(err, "test_tc_peer__load")) @@ -1053,19 +1064,19 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) /* Load "tc_src_l3" to the tun_fwd interface to redirect packets * towards dst, and "tc_dst" to redirect packets - * and "tc_chk" on veth_dst_fwd to drop non-redirected packets. + * and "tc_chk" on dst_fwd to drop non-redirected packets. */ /* tc qdisc add dev tun_fwd clsact */ QDISC_CLSACT_CREATE(&qdisc_tun_fwd, ifindex); /* tc filter add dev tun_fwd ingress bpf da tc_src_l3 */ XGRESS_FILTER_ADD(&qdisc_tun_fwd, BPF_TC_INGRESS, skel->progs.tc_src_l3, 0); - /* tc qdisc add dev veth_dst_fwd clsact */ - QDISC_CLSACT_CREATE(&qdisc_veth_dst_fwd, setup_result->ifindex_veth_dst_fwd); - /* tc filter add dev veth_dst_fwd ingress bpf da tc_dst_l3 */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); - /* tc filter add dev veth_dst_fwd egress bpf da tc_chk */ - XGRESS_FILTER_ADD(&qdisc_veth_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); + /* tc qdisc add dev dst_fwd clsact */ + QDISC_CLSACT_CREATE(&qdisc_dst_fwd, setup_result->ifindex_dst_fwd); + /* tc filter add dev dst_fwd ingress bpf da tc_dst_l3 */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_INGRESS, skel->progs.tc_dst_l3, 0); + /* tc filter add dev dst_fwd egress bpf da tc_chk */ + XGRESS_FILTER_ADD(&qdisc_dst_fwd, BPF_TC_EGRESS, skel->progs.tc_chk, 0); /* Setup route and neigh tables */ SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP4_TUN_SRC "/24"); @@ -1074,17 +1085,17 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) SYS(fail, "ip -netns " NS_SRC " addr add dev tun_src " IP6_TUN_SRC "/64 nodad"); SYS(fail, "ip -netns " NS_FWD " addr add dev tun_fwd " IP6_TUN_FWD "/64 nodad"); - SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev veth_src scope global"); + SYS(fail, "ip -netns " NS_SRC " route del " IP4_DST "/32 dev src scope global"); SYS(fail, "ip -netns " NS_SRC " route add " IP4_DST "/32 via " IP4_TUN_FWD " dev tun_src scope global"); - SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev veth_dst scope global"); - SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev veth_src scope global"); + SYS(fail, "ip -netns " NS_DST " route add " IP4_TUN_SRC "/32 dev dst scope global"); + SYS(fail, "ip -netns " NS_SRC " route del " IP6_DST "/128 dev src scope global"); SYS(fail, "ip -netns " NS_SRC " route add " IP6_DST "/128 via " IP6_TUN_FWD " dev tun_src scope global"); - SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev veth_dst scope global"); + SYS(fail, "ip -netns " NS_DST " route add " IP6_TUN_SRC "/128 dev dst scope global"); - SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); - SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev veth_dst lladdr " MAC_DST_FWD); + SYS(fail, "ip -netns " NS_DST " neigh add " IP4_TUN_SRC " dev dst lladdr " MAC_DST_FWD); + SYS(fail, "ip -netns " NS_DST " neigh add " IP6_TUN_SRC " dev dst lladdr " MAC_DST_FWD); if (!ASSERT_OK(set_forwarding(false), "disable forwarding")) goto fail; @@ -1106,9 +1117,9 @@ static void test_tc_redirect_peer_l3(struct netns_setup_result *setup_result) close_netns(nstoken); } -#define RUN_TEST(name) \ +#define RUN_TEST(name, mode) \ ({ \ - struct netns_setup_result setup_result; \ + struct netns_setup_result setup_result = { .dev_mode = mode, }; \ if (test__start_subtest(#name)) \ if (ASSERT_OK(netns_setup_namespaces("add"), "setup namespaces")) { \ if (ASSERT_OK(netns_setup_links_and_routes(&setup_result), \ @@ -1122,11 +1133,11 @@ static void *test_tc_redirect_run_tests(void *arg) { netns_setup_namespaces_nofail("delete"); - RUN_TEST(tc_redirect_peer); - RUN_TEST(tc_redirect_peer_l3); - RUN_TEST(tc_redirect_neigh); - RUN_TEST(tc_redirect_neigh_fib); - RUN_TEST(tc_redirect_dtime); + RUN_TEST(tc_redirect_peer, MODE_VETH); + RUN_TEST(tc_redirect_peer_l3, MODE_VETH); + RUN_TEST(tc_redirect_neigh, MODE_VETH); + RUN_TEST(tc_redirect_neigh_fib, MODE_VETH); + RUN_TEST(tc_redirect_dtime, MODE_VETH); return NULL; } From adfeae2d243d9e5b83d094af481d189156b11779 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 14 Nov 2023 01:42:20 +0100 Subject: [PATCH 420/560] selftests/bpf: Add netkit to tc_redirect selftest Extend the existing tc_redirect selftest to also cover netkit devices for exercising the bpf_redirect_peer() code paths, so that we have both veth as well as netkit covered, all tests still pass after this change. Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20231114004220.6495-9-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/tc_redirect.c | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index 407ff4e9bc788..518f143c5b0fe 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -24,6 +24,7 @@ #include "test_progs.h" #include "network_helpers.h" +#include "netlink_helpers.h" #include "test_tc_neigh_fib.skel.h" #include "test_tc_neigh.skel.h" #include "test_tc_peer.skel.h" @@ -112,6 +113,7 @@ static void netns_setup_namespaces_nofail(const char *verb) enum dev_mode { MODE_VETH, + MODE_NETKIT, }; struct netns_setup_result { @@ -142,10 +144,51 @@ static int get_ifaddr(const char *name, char *ifaddr) return 0; } +static int create_netkit(int mode, char *prim, char *peer) +{ + struct rtattr *linkinfo, *data, *peer_info; + struct rtnl_handle rth = { .fd = -1 }; + const char *type = "netkit"; + struct { + struct nlmsghdr n; + struct ifinfomsg i; + char buf[1024]; + } req = {}; + int err; + + err = rtnl_open(&rth, 0); + if (!ASSERT_OK(err, "open_rtnetlink")) + return err; + + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + req.n.nlmsg_type = RTM_NEWLINK; + req.i.ifi_family = AF_UNSPEC; + + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, prim, strlen(prim)); + linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO); + addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type)); + data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA); + addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); + peer_info = addattr_nest(&req.n, sizeof(req), IFLA_NETKIT_PEER_INFO); + req.n.nlmsg_len += sizeof(struct ifinfomsg); + addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer)); + addattr_nest_end(&req.n, peer_info); + addattr_nest_end(&req.n, data); + addattr_nest_end(&req.n, linkinfo); + + err = rtnl_talk(&rth, &req.n, NULL); + ASSERT_OK(err, "talk_rtnetlink"); + rtnl_close(&rth); + return err; +} + static int netns_setup_links_and_routes(struct netns_setup_result *result) { struct nstoken *nstoken = NULL; char src_fwd_addr[IFADDR_STR_LEN+1] = {}; + int err; if (result->dev_mode == MODE_VETH) { SYS(fail, "ip link add src type veth peer name src_fwd"); @@ -153,6 +196,13 @@ static int netns_setup_links_and_routes(struct netns_setup_result *result) SYS(fail, "ip link set dst_fwd address " MAC_DST_FWD); SYS(fail, "ip link set dst address " MAC_DST); + } else if (result->dev_mode == MODE_NETKIT) { + err = create_netkit(NETKIT_L3, "src", "src_fwd"); + if (!ASSERT_OK(err, "create_ifindex_src")) + goto fail; + err = create_netkit(NETKIT_L3, "dst", "dst_fwd"); + if (!ASSERT_OK(err, "create_ifindex_dst")) + goto fail; } if (get_ifaddr("src_fwd", src_fwd_addr)) @@ -1134,7 +1184,9 @@ static void *test_tc_redirect_run_tests(void *arg) netns_setup_namespaces_nofail("delete"); RUN_TEST(tc_redirect_peer, MODE_VETH); + RUN_TEST(tc_redirect_peer, MODE_NETKIT); RUN_TEST(tc_redirect_peer_l3, MODE_VETH); + RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT); RUN_TEST(tc_redirect_neigh, MODE_VETH); RUN_TEST(tc_redirect_neigh_fib, MODE_VETH); RUN_TEST(tc_redirect_dtime, MODE_VETH); From 5029c5e4f20d8d6b41cefbde4b3eeadaec4662c6 Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Wed, 25 Oct 2023 15:24:36 +0200 Subject: [PATCH 421/560] s390/dasd: resolve spelling mistake resolve typing mistake from pimary to primary Signed-off-by: Muhammad Muzammil Link: https://lore.kernel.org/r/20231010043140.28416-1-m.muzzammilashraf@gmail.com Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20231025132437.1223363-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 2e663131adaf6..1b1b8a41c4d42 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -283,7 +283,7 @@ struct dasd_pprc_dev_info { __u8 secondary; /* 7 Secondary device address */ __u16 pprc_id; /* 8-9 Peer-to-Peer Remote Copy ID */ __u8 reserved2[12]; /* 10-21 reserved */ - __u16 prim_cu_ssid; /* 22-23 Pimary Control Unit SSID */ + __u16 prim_cu_ssid; /* 22-23 Primary Control Unit SSID */ __u8 reserved3[12]; /* 24-35 reserved */ __u16 sec_cu_ssid; /* 36-37 Secondary Control Unit SSID */ __u8 reserved4[90]; /* 38-127 reserved */ From db46cd1e0426f52999d50fa72cfa97fa39952885 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Wed, 25 Oct 2023 15:24:37 +0200 Subject: [PATCH 422/560] s390/dasd: protect device queue against concurrent access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In dasd_profile_start() the amount of requests on the device queue are counted. The access to the device queue is unprotected against concurrent access. With a lot of parallel I/O, especially with alias devices enabled, the device queue can change while dasd_profile_start() is accessing the queue. In the worst case this leads to a kernel panic due to incorrect pointer accesses. Fix this by taking the device lock before accessing the queue and counting the requests. Additionally the check for a valid profile data pointer can be done earlier to avoid unnecessary locking in a hot path. Cc: Fixes: 4fa52aa7a82f ("[S390] dasd: add enhanced DASD statistics interface") Reviewed-by: Stefan Haberland Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20231025132437.1223363-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index d440319a7945b..833cfab7d8776 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -676,18 +676,20 @@ static void dasd_profile_start(struct dasd_block *block, * we count each request only once. */ device = cqr->startdev; - if (device->profile.data) { - counter = 1; /* request is not yet queued on the start device */ - list_for_each(l, &device->ccw_queue) - if (++counter >= 31) - break; - } + if (!device->profile.data) + return; + + spin_lock(get_ccwdev_lock(device->cdev)); + counter = 1; /* request is not yet queued on the start device */ + list_for_each(l, &device->ccw_queue) + if (++counter >= 31) + break; + spin_unlock(get_ccwdev_lock(device->cdev)); + spin_lock(&device->profile.lock); - if (device->profile.data) { - device->profile.data->dasd_io_nr_req[counter]++; - if (rq_data_dir(req) == READ) - device->profile.data->dasd_read_nr_req[counter]++; - } + device->profile.data->dasd_io_nr_req[counter]++; + if (rq_data_dir(req) == READ) + device->profile.data->dasd_read_nr_req[counter]++; spin_unlock(&device->profile.lock); } From d6fef34ee4d102be448146f24caf96d7b4a05401 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Nov 2023 14:18:31 -0800 Subject: [PATCH 423/560] io_uring: fix off-by one bvec index If the offset equals the bv_len of the first registered bvec, then the request does not include any of that first bvec. Skip it so that drivers don't have to deal with a zero length bvec, which was observed to break NVMe's PRP list creation. Cc: stable@vger.kernel.org Fixes: bd11b3a391e3 ("io_uring: don't use iov_iter_advance() for fixed buffers") Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20231120221831.2646460-1-kbusch@meta.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7034be555334d..f521c5965a933 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1258,7 +1258,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, */ const struct bio_vec *bvec = imu->bvec; - if (offset <= bvec->bv_len) { + if (offset < bvec->bv_len) { /* * Note, huge pages buffers consists of one large * bvec entry and should always go this way. The other From 88903daecacf03b1e5636e1b5f18bda5b07030fc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 20 Nov 2023 18:51:06 -0500 Subject: [PATCH 424/560] eventfs: Remove expectation that ei->is_freed means ei->dentry == NULL The logic to free the eventfs_inode (ei) use to set is_freed and clear the "dentry" field under the eventfs_mutex. But that changed when a race was found where the ei->dentry needed to be cleared when the last dput() was called on it. But there was still logic that checked if ei->dentry was not NULL and is_freed is set, and would warn if it was. But since that situation was changed and the ei->dentry isn't cleared until the last dput() is called on it while the ei->is_freed is set, do not test for that condition anymore, and change the comments to reflect that. Link: https://lkml.kernel.org/r/20231120235154.265826243@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: 020010fbfa20 ("eventfs: Delete eventfs_inode when the last dentry is freed") Reported-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f8a594a50ae62..f239b2b507a4a 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -27,16 +27,16 @@ /* * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access * to the ei->dentry must be done under this mutex and after checking - * if ei->is_freed is not set. The ei->dentry is released under the - * mutex at the same time ei->is_freed is set. If ei->is_freed is set - * then the ei->dentry is invalid. + * if ei->is_freed is not set. When ei->is_freed is set, the dentry + * is on its way to being freed after the last dput() is made on it. */ static DEFINE_MUTEX(eventfs_mutex); /* * The eventfs_inode (ei) itself is protected by SRCU. It is released from * its parent's list and will have is_freed set (under eventfs_mutex). - * After the SRCU grace period is over, the ei may be freed. + * After the SRCU grace period is over and the last dput() is called + * the ei is freed. */ DEFINE_STATIC_SRCU(eventfs_srcu); @@ -365,12 +365,14 @@ create_file_dentry(struct eventfs_inode *ei, int idx, * created the dentry for this e_dentry. In which case * use that one. * - * Note, with the mutex held, the e_dentry cannot have content - * and the ei->is_freed be true at the same time. + * If ei->is_freed is set, the e_dentry is currently on its + * way to being freed, don't return it. If e_dentry is NULL + * it means it was already freed. */ - dentry = *e_dentry; - if (WARN_ON_ONCE(dentry && ei->is_freed)) + if (ei->is_freed) dentry = NULL; + else + dentry = *e_dentry; /* The lookup does not need to up the dentry refcount */ if (dentry && !lookup) dget(dentry); @@ -473,8 +475,8 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, * created the dentry for this e_dentry. In which case * use that one. * - * Note, with the mutex held, the e_dentry cannot have content - * and the ei->is_freed be true at the same time. + * If ei->is_freed is set, the e_dentry is currently on its + * way to being freed. */ dentry = ei->dentry; if (dentry && !lookup) From 71cade82f2b553a74d046c015c986f2df165696f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Mon, 20 Nov 2023 18:51:07 -0500 Subject: [PATCH 425/560] eventfs: Do not invalidate dentry in create_file/dir_dentry() With the call to simple_recursive_removal() on the entire eventfs sub system when the directory is removed, it performs the d_invalidate on all the dentries when it is removed. There's no need to do clean ups when a dentry is being created while the directory is being deleted. As dentries are cleaned up by the simpler_recursive_removal(), trying to do d_invalidate() in these functions will cause the dentry to be invalidated twice, and crash the kernel. Link: https://lore.kernel.org/all/20231116123016.140576-1-naresh.kamboju@linaro.org/ Link: https://lkml.kernel.org/r/20231120235154.422970988@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: 407c6726ca71 ("eventfs: Use simple_recursive_removal() to clean up dentries") Reported-by: Mark Rutland Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index f239b2b507a4a..3eb6c622a74d2 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -326,7 +326,6 @@ create_file_dentry(struct eventfs_inode *ei, int idx, struct eventfs_attr *attr = NULL; struct dentry **e_dentry = &ei->d_children[idx]; struct dentry *dentry; - bool invalidate = false; mutex_lock(&eventfs_mutex); if (ei->is_freed) { @@ -389,17 +388,14 @@ create_file_dentry(struct eventfs_inode *ei, int idx, * Otherwise it means two dentries exist with the same name. */ WARN_ON_ONCE(!ei->is_freed); - invalidate = true; + dentry = NULL; } mutex_unlock(&eventfs_mutex); - if (invalidate) - d_invalidate(dentry); - - if (lookup || invalidate) + if (lookup) dput(dentry); - return invalidate ? NULL : dentry; + return dentry; } /** @@ -439,7 +435,6 @@ static struct dentry * create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, struct dentry *parent, bool lookup) { - bool invalidate = false; struct dentry *dentry = NULL; mutex_lock(&eventfs_mutex); @@ -495,16 +490,14 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, * Otherwise it means two dentries exist with the same name. */ WARN_ON_ONCE(!ei->is_freed); - invalidate = true; + dentry = NULL; } mutex_unlock(&eventfs_mutex); - if (invalidate) - d_invalidate(dentry); - if (lookup || invalidate) + if (lookup) dput(dentry); - return invalidate ? NULL : dentry; + return dentry; } /** From 977bc146d4eb7070118d8a974919b33bb52732b4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:51 +0200 Subject: [PATCH 426/560] selftests/bpf: track tcp payload offset as scalar in xdp_synproxy This change prepares syncookie_{tc,xdp} for update in callbakcs verification logic. To allow bpf_loop() verification converge when multiple callback itreations are considered: - track offset inside TCP payload explicitly, not as a part of the pointer; - make sure that offset does not exceed MAX_PACKET_OFF enforced by verifier; - make sure that offset is tracked as unbound scalar between iterations, otherwise verifier won't be able infer that bpf_loop callback reaches identical states. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/xdp_synproxy_kern.c | 84 ++++++++++++------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c index e959336c7a730..80f620602d50f 100644 --- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -53,6 +53,8 @@ #define DEFAULT_TTL 64 #define MAX_ALLOWED_PORTS 8 +#define MAX_PACKET_OFF 0xffff + #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) @@ -183,63 +185,76 @@ static __always_inline __u32 tcp_clock_ms(void) } struct tcpopt_context { - __u8 *ptr; - __u8 *end; + void *data; void *data_end; __be32 *tsecr; __u8 wscale; bool option_timestamp; bool option_sack; + __u32 off; }; -static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) { - __u8 opcode, opsize; + __u64 off = ctx->off; + __u8 *data; - if (ctx->ptr >= ctx->end) - return 1; - if (ctx->ptr >= ctx->data_end) - return 1; + /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ + if (off > MAX_PACKET_OFF - sz) + return NULL; - opcode = ctx->ptr[0]; + data = ctx->data + off; + barrier_var(data); + if (data + sz >= ctx->data_end) + return NULL; - if (opcode == TCPOPT_EOL) - return 1; - if (opcode == TCPOPT_NOP) { - ++ctx->ptr; - return 0; - } + ctx->off += sz; + return data; +} - if (ctx->ptr + 1 >= ctx->end) - return 1; - if (ctx->ptr + 1 >= ctx->data_end) +static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) +{ + __u8 *opcode, *opsize, *wscale, *tsecr; + __u32 off = ctx->off; + + opcode = next(ctx, 1); + if (!opcode) return 1; - opsize = ctx->ptr[1]; - if (opsize < 2) + + if (*opcode == TCPOPT_EOL) return 1; + if (*opcode == TCPOPT_NOP) + return 0; - if (ctx->ptr + opsize > ctx->end) + opsize = next(ctx, 1); + if (!opsize || *opsize < 2) return 1; - switch (opcode) { + switch (*opcode) { case TCPOPT_WINDOW: - if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end) - ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE; + wscale = next(ctx, 1); + if (!wscale) + return 1; + if (*opsize == TCPOLEN_WINDOW) + ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; break; case TCPOPT_TIMESTAMP: - if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) { + tsecr = next(ctx, 4); + if (!tsecr) + return 1; + if (*opsize == TCPOLEN_TIMESTAMP) { ctx->option_timestamp = true; /* Client's tsval becomes our tsecr. */ - *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2)); + *ctx->tsecr = get_unaligned((__be32 *)tsecr); } break; case TCPOPT_SACK_PERM: - if (opsize == TCPOLEN_SACK_PERM) + if (*opsize == TCPOLEN_SACK_PERM) ctx->option_sack = true; break; } - ctx->ptr += opsize; + ctx->off = off + *opsize; return 0; } @@ -256,16 +271,21 @@ static int tscookie_tcpopt_parse_batch(__u32 index, void *context) static __always_inline bool tscookie_init(struct tcphdr *tcp_header, __u16 tcp_len, __be32 *tsval, - __be32 *tsecr, void *data_end) + __be32 *tsecr, void *data, void *data_end) { struct tcpopt_context loop_ctx = { - .ptr = (__u8 *)(tcp_header + 1), - .end = (__u8 *)tcp_header + tcp_len, + .data = data, .data_end = data_end, .tsecr = tsecr, .wscale = TS_OPT_WSCALE_MASK, .option_timestamp = false, .option_sack = false, + /* Note: currently verifier would track .off as unbound scalar. + * In case if verifier would at some point get smarter and + * compute bounded value for this var, beware that it might + * hinder bpf_loop() convergence validation. + */ + .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, }; u32 cookie; @@ -635,7 +655,7 @@ static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, cookie = (__u32)value; if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, - &tsopt_buf[0], &tsopt_buf[1], data_end)) + &tsopt_buf[0], &tsopt_buf[1], data, data_end)) tsopt = tsopt_buf; /* Check that there is enough space for a SYNACK. It also covers From 87eb0152bcc102ecbda866978f4e54db5a3be1ef Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:52 +0200 Subject: [PATCH 427/560] selftests/bpf: track string payload offset as scalar in strobemeta This change prepares strobemeta for update in callbacks verification logic. To allow bpf_loop() verification converge when multiple callback iterations are considered: - track offset inside strobemeta_payload->payload directly as scalar value; - at each iteration make sure that remaining strobemeta_payload->payload capacity is sufficient for execution of read_{map,str}_var functions; - make sure that offset is tracked as unbound scalar between iterations, otherwise verifier won't be able infer that bpf_loop callback reaches identical states. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/strobemeta.h | 78 ++++++++++++------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h index e02cfd3807469..40df2cc26eaf9 100644 --- a/tools/testing/selftests/bpf/progs/strobemeta.h +++ b/tools/testing/selftests/bpf/progs/strobemeta.h @@ -24,9 +24,11 @@ struct task_struct {}; #define STACK_TABLE_EPOCH_SHIFT 20 #define STROBE_MAX_STR_LEN 1 #define STROBE_MAX_CFGS 32 +#define READ_MAP_VAR_PAYLOAD_CAP \ + ((1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) #define STROBE_MAX_PAYLOAD \ (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \ - STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN) + STROBE_MAX_MAPS * READ_MAP_VAR_PAYLOAD_CAP) struct strobe_value_header { /* @@ -355,7 +357,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base, struct strobe_value_generic *value, struct strobemeta_payload *data, - void *payload) + size_t off) { void *location; uint64_t len; @@ -366,7 +368,7 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, return 0; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr); + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, value->ptr); /* * if bpf_probe_read_user_str returns error (<0), due to casting to * unsinged int, it will become big number, so next check is @@ -378,14 +380,14 @@ static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg, return 0; data->str_lens[idx] = len; - return len; + return off + len; } -static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, - size_t idx, void *tls_base, - struct strobe_value_generic *value, - struct strobemeta_payload *data, - void *payload) +static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg, + size_t idx, void *tls_base, + struct strobe_value_generic *value, + struct strobemeta_payload *data, + size_t off) { struct strobe_map_descr* descr = &data->map_descrs[idx]; struct strobe_map_raw map; @@ -397,11 +399,11 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, location = calc_location(&cfg->map_locs[idx], tls_base); if (!location) - return payload; + return off; bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location); if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr)) - return payload; + return off; descr->id = map.id; descr->cnt = map.cnt; @@ -410,10 +412,10 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, data->req_meta_valid = 1; } - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag); + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.tag); if (len <= STROBE_MAX_STR_LEN) { descr->tag_len = len; - payload += len; + off += len; } #ifdef NO_UNROLL @@ -426,22 +428,22 @@ static __always_inline void *read_map_var(struct strobemeta_cfg *cfg, break; descr->key_lens[i] = 0; - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.entries[i].key); if (len <= STROBE_MAX_STR_LEN) { descr->key_lens[i] = len; - payload += len; + off += len; } descr->val_lens[i] = 0; - len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, + len = bpf_probe_read_user_str(&data->payload[off], STROBE_MAX_STR_LEN, map.entries[i].val); if (len <= STROBE_MAX_STR_LEN) { descr->val_lens[i] = len; - payload += len; + off += len; } } - return payload; + return off; } #ifdef USE_BPF_LOOP @@ -455,14 +457,20 @@ struct read_var_ctx { struct strobemeta_payload *data; void *tls_base; struct strobemeta_cfg *cfg; - void *payload; + size_t payload_off; /* value gets mutated */ struct strobe_value_generic *value; enum read_type type; }; -static int read_var_callback(__u32 index, struct read_var_ctx *ctx) +static int read_var_callback(__u64 index, struct read_var_ctx *ctx) { + /* lose precision info for ctx->payload_off, verifier won't track + * double xor, barrier_var() is needed to force clang keep both xors. + */ + ctx->payload_off ^= index; + barrier_var(ctx->payload_off); + ctx->payload_off ^= index; switch (ctx->type) { case READ_INT_VAR: if (index >= STROBE_MAX_INTS) @@ -472,14 +480,18 @@ static int read_var_callback(__u32 index, struct read_var_ctx *ctx) case READ_MAP_VAR: if (index >= STROBE_MAX_MAPS) return 1; - ctx->payload = read_map_var(ctx->cfg, index, ctx->tls_base, - ctx->value, ctx->data, ctx->payload); + if (ctx->payload_off > sizeof(ctx->data->payload) - READ_MAP_VAR_PAYLOAD_CAP) + return 1; + ctx->payload_off = read_map_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload_off); break; case READ_STR_VAR: if (index >= STROBE_MAX_STRS) return 1; - ctx->payload += read_str_var(ctx->cfg, index, ctx->tls_base, - ctx->value, ctx->data, ctx->payload); + if (ctx->payload_off > sizeof(ctx->data->payload) - STROBE_MAX_STR_LEN) + return 1; + ctx->payload_off = read_str_var(ctx->cfg, index, ctx->tls_base, + ctx->value, ctx->data, ctx->payload_off); break; } return 0; @@ -501,7 +513,8 @@ static void *read_strobe_meta(struct task_struct *task, pid_t pid = bpf_get_current_pid_tgid() >> 32; struct strobe_value_generic value = {0}; struct strobemeta_cfg *cfg; - void *tls_base, *payload; + size_t payload_off; + void *tls_base; cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid); if (!cfg) @@ -509,7 +522,7 @@ static void *read_strobe_meta(struct task_struct *task, data->int_vals_set_mask = 0; data->req_meta_valid = 0; - payload = data->payload; + payload_off = 0; /* * we don't have struct task_struct definition, it should be: * tls_base = (void *)task->thread.fsbase; @@ -522,7 +535,7 @@ static void *read_strobe_meta(struct task_struct *task, .tls_base = tls_base, .value = &value, .data = data, - .payload = payload, + .payload_off = 0, }; int err; @@ -540,6 +553,11 @@ static void *read_strobe_meta(struct task_struct *task, err = bpf_loop(STROBE_MAX_MAPS, read_var_callback, &ctx, 0); if (err != STROBE_MAX_MAPS) return NULL; + + payload_off = ctx.payload_off; + /* this should not really happen, here only to satisfy verifer */ + if (payload_off > sizeof(data->payload)) + payload_off = sizeof(data->payload); #else #ifdef NO_UNROLL #pragma clang loop unroll(disable) @@ -555,7 +573,7 @@ static void *read_strobe_meta(struct task_struct *task, #pragma unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_STRS; ++i) { - payload += read_str_var(cfg, i, tls_base, &value, data, payload); + payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off); } #ifdef NO_UNROLL #pragma clang loop unroll(disable) @@ -563,7 +581,7 @@ static void *read_strobe_meta(struct task_struct *task, #pragma unroll #endif /* NO_UNROLL */ for (int i = 0; i < STROBE_MAX_MAPS; ++i) { - payload = read_map_var(cfg, i, tls_base, &value, data, payload); + payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off); } #endif /* USE_BPF_LOOP */ @@ -571,7 +589,7 @@ static void *read_strobe_meta(struct task_struct *task, * return pointer right after end of payload, so it's possible to * calculate exact amount of useful data that needs to be sent */ - return payload; + return &data->payload[payload_off]; } SEC("raw_tracepoint/kfree_skb") From f40bfd1679446b22d321e64a1fa98b7d07d2be08 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:53 +0200 Subject: [PATCH 428/560] selftests/bpf: fix bpf_loop_bench for new callback verification scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a preparatory change. A follow-up patch "bpf: verify callbacks as if they are called unknown number of times" changes logic for callbacks handling. While previously callbacks were verified as a single function call, new scheme takes into account that callbacks could be executed unknown number of times. This has dire implications for bpf_loop_bench: SEC("fentry/" SYS_PREFIX "sys_getpgid") int benchmark(void *ctx) { for (int i = 0; i < 1000; i++) { bpf_loop(nr_loops, empty_callback, NULL, 0); __sync_add_and_fetch(&hits, nr_loops); } return 0; } W/o callbacks change verifier sees it as a 1000 calls to empty_callback(). However, with callbacks change things become exponential: - i=0: state exploring empty_callback is scheduled with i=0 (a); - i=1: state exploring empty_callback is scheduled with i=1; ... - i=999: state exploring empty_callback is scheduled with i=999; - state (a) is popped from stack; - i=1: state exploring empty_callback is scheduled with i=1; ... Avoid this issue by rewriting outer loop as bpf_loop(). Unfortunately, this adds a function call to a loop at runtime, which negatively affects performance: throughput latency before: 149.919 ± 0.168 M ops/s, 6.670 ns/op after : 137.040 ± 0.187 M ops/s, 7.297 ns/op Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_loop_bench.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c index 4ce76eb064c41..d461746fd3c1e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_loop_bench.c +++ b/tools/testing/selftests/bpf/progs/bpf_loop_bench.c @@ -15,13 +15,16 @@ static int empty_callback(__u32 index, void *data) return 0; } +static int outer_loop(__u32 index, void *data) +{ + bpf_loop(nr_loops, empty_callback, NULL, 0); + __sync_add_and_fetch(&hits, nr_loops); + return 0; +} + SEC("fentry/" SYS_PREFIX "sys_getpgid") int benchmark(void *ctx) { - for (int i = 0; i < 1000; i++) { - bpf_loop(nr_loops, empty_callback, NULL, 0); - - __sync_add_and_fetch(&hits, nr_loops); - } + bpf_loop(1000, outer_loop, NULL, 0); return 0; } From 683b96f9606ab7308ffb23c46ab43cecdef8a241 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:54 +0200 Subject: [PATCH 429/560] bpf: extract __check_reg_arg() utility function Split check_reg_arg() into two utility functions: - check_reg_arg() operating on registers from current verifier state; - __check_reg_arg() operating on a specific set of registers passed as a parameter; The __check_reg_arg() function would be used by a follow-up change for callbacks handling. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6da370a047feb..e6e1bcfe00f54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3439,13 +3439,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env, reg->subreg_def = DEF_NOT_SUBREG; } -static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) +static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, + enum reg_arg_type t) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *reg; bool rw64; if (regno >= MAX_BPF_REG) { @@ -3486,6 +3484,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, + enum reg_arg_type t) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return __check_reg_arg(env, state->regs, regno, t); +} + static void mark_jmp_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].jmp_point = true; @@ -9350,7 +9357,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); - check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK); } } From 58124a98cb8eda69d248d7f1de954c8b2767c945 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:55 +0200 Subject: [PATCH 430/560] bpf: extract setup_func_entry() utility function Move code for simulated stack frame creation to a separate utility function. This function would be used in the follow-up change for callbacks handling. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 84 ++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6e1bcfe00f54..68ee4803d3a26 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9370,11 +9370,10 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite, + set_callee_state_fn set_callee_state_cb, + struct bpf_verifier_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; int err; @@ -9384,13 +9383,53 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -E2BIG; } - caller = state->frame[state->curframe]; if (state->frame[state->curframe + 1]) { verbose(env, "verifier bug. Frame %d already allocated\n", state->curframe + 1); return -EFAULT; } + caller = state->frame[state->curframe]; + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + callsite, + state->curframe + 1 /* frameno within this callchain */, + subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = copy_reference_state(callee, caller); + err = err ?: set_callee_state_cb(env, caller, callee, callsite); + if (err) + goto err_out; + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; +} + +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int err; + + caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; @@ -9460,35 +9499,12 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } - callee = kzalloc(sizeof(*callee), GFP_KERNEL); - if (!callee) - return -ENOMEM; - state->frame[state->curframe + 1] = callee; - - /* callee cannot access r0, r6 - r9 for reading and has to write - * into its own stack before reading from it. - * callee can read/write into caller's stack - */ - init_func_state(env, callee, - /* remember the callsite, it will be used by bpf_exit */ - *insn_idx /* callsite */, - state->curframe + 1 /* frameno within this callchain */, - subprog /* subprog number within this prog */); - - /* Transfer references to the callee */ - err = copy_reference_state(callee, caller); + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state); if (err) - goto err_out; - - err = set_callee_state_cb(env, caller, callee, *insn_idx); - if (err) - goto err_out; + return err; clear_caller_saved_regs(env, caller->regs); - /* only increment it after check_reg_arg() finished */ - state->curframe++; - /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; @@ -9496,14 +9512,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "caller:\n"); print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee, true); + print_verifier_state(env, state->frame[state->curframe], true); } - return 0; -err_out: - free_func_state(callee); - state->frame[state->curframe + 1] = NULL; - return err; + return 0; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, From ab5cfac139ab8576fb54630d4cca23c3e690ee90 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:56 +0200 Subject: [PATCH 431/560] bpf: verify callbacks as if they are called unknown number of times Prior to this patch callbacks were handled as regular function calls, execution of callback body was modeled exactly once. This patch updates callbacks handling logic as follows: - introduces a function push_callback_call() that schedules callback body verification in env->head stack; - updates prepare_func_exit() to reschedule callback body verification upon BPF_EXIT; - as calls to bpf_*_iter_next(), calls to callback invoking functions are marked as checkpoints; - is_state_visited() is updated to stop callback based iteration when some identical parent state is found. Paths with callback function invoked zero times are now verified first, which leads to necessity to modify some selftests: - the following negative tests required adding release/unlock/drop calls to avoid previously masked unrelated error reports: - cb_refs.c:underflow_prog - exceptions_fail.c:reject_rbtree_add_throw - exceptions_fail.c:reject_with_cp_reference - the following precision tracking selftests needed change in expected log trace: - verifier_subprog_precision.c:callback_result_precise (note: r0 precision is no longer propagated inside callback and I think this is a correct behavior) - verifier_subprog_precision.c:parent_callee_saved_reg_precise_with_callback - verifier_subprog_precision.c:parent_stack_slot_precise_with_callback Reported-by: Andrew Werner Closes: https://lore.kernel.org/bpf/CA+vRuzPChFNXmouzGG+wsy=6eMcfr1mFG0F3g7rbg-sedGKW3w@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 + kernel/bpf/verifier.c | 274 +++++++++++------- tools/testing/selftests/bpf/progs/cb_refs.c | 1 + .../selftests/bpf/progs/exceptions_fail.c | 2 + .../bpf/progs/verifier_subprog_precision.c | 71 ++++- 5 files changed, 240 insertions(+), 113 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79d..dd326936dd6ff 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -400,6 +400,7 @@ struct bpf_verifier_state { struct bpf_idx_pair *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; + u32 callback_unroll_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ @@ -511,6 +512,10 @@ struct bpf_insn_aux_data { * this instruction, regardless of any heuristics */ bool force_checkpoint; + /* true if instruction is a call to a helper function that + * accepts callback function as a parameter. + */ + bool calls_callback; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68ee4803d3a26..a60dfa56ebb3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -547,13 +547,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } -static bool is_callback_calling_kfunc(u32 btf_id); +static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_callback_calling_function(enum bpf_func_id func_id) +static bool is_sync_callback_calling_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_for_each_map_elem || - func_id == BPF_FUNC_timer_set_callback || func_id == BPF_FUNC_find_vma || func_id == BPF_FUNC_loop || func_id == BPF_FUNC_user_ringbuf_drain; @@ -564,6 +563,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_timer_set_callback; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return is_sync_callback_calling_function(func_id) || + is_async_callback_calling_function(func_id); +} + +static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +{ + return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || + (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); +} + static bool is_storage_get_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_storage_get || @@ -1808,6 +1819,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; dst_state->dfs_depth = src->dfs_depth; + dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -3731,6 +3743,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) } } +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -3906,16 +3920,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return -EFAULT; return 0; } - } else if ((bpf_helper_call(insn) && - is_callback_calling_function(insn->imm) && - !is_async_callback_calling_function(insn->imm)) || - (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) { - /* callback-calling helper or kfunc call, which means - * we are exiting from subprog, but unlike the subprog - * call handling above, we shouldn't propagate - * precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback - * subprogs + } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { + /* exit from callback subprog to callback-calling helper or + * kfunc call. Use idx/subseq_idx check to discern it from + * straight line code backtracking. + * Unlike the subprog call handling above, we shouldn't + * propagate precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); @@ -3950,10 +3961,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, } else if (opcode == BPF_EXIT) { bool r0_precise; + /* Backtracking to a nested function call, 'idx' is a part of + * the inner frame 'subseq_idx' is a part of the outer frame. + * In case of a regular function call, instructions giving + * precision to registers R1-R5 should have been found already. + * In case of a callback, it is ok to have R1-R5 marked for + * backtracking, as these registers are set by the function + * invoking callback. + */ + if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 - * they should have been found already. - */ verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); WARN_ONCE(1, "verifier backtracking bug"); return -EFAULT; @@ -9421,11 +9440,11 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls return err; } -static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx, int subprog, - set_callee_state_fn set_callee_state_cb) +static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int insn_idx, int subprog, + set_callee_state_fn set_callee_state_cb) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *callback_state; struct bpf_func_state *caller, *callee; int err; @@ -9433,44 +9452,22 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (subprog_is_global(env, subprog)) { - if (err) { - verbose(env, "Caller passes invalid args into func#%d\n", - subprog); - return err; - } else { - if (env->log.level & BPF_LOG_LEVEL) - verbose(env, - "Func#%d is global and valid. Skipping.\n", - subprog); - clear_caller_saved_regs(env, caller->regs); - - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; - - /* continue with next insn after call */ - return 0; - } - } /* set_callee_state is used for direct subprog calls, but we are * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state) { - env->subprog_info[subprog].is_cb = true; - if (bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } else if (!bpf_pseudo_kfunc_call(insn) && - !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } + env->subprog_info[subprog].is_cb = true; + if (bpf_pseudo_kfunc_call(insn) && + !is_sync_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -9481,25 +9478,76 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* there is no real recursion here. timer callbacks are async */ env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, - *insn_idx, subprog); + insn_idx, subprog); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; callee->async_entry_cnt = caller->async_entry_cnt + 1; /* Convert bpf_timer_set_callback() args into timer callback args */ - err = set_callee_state_cb(env, caller, callee, *insn_idx); + err = set_callee_state_cb(env, caller, callee, insn_idx); if (err) return err; + return 0; + } + + /* for callback functions enqueue entry to callback and + * proceed with next instruction within current frame. + */ + callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false); + if (!callback_state) + return -ENOMEM; + + err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb, + callback_state); + if (err) + return err; + + callback_state->callback_unroll_depth++; + return 0; +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller; + int err, subprog, target_insn; + + target_insn = *insn_idx + insn->imm + 1; + subprog = find_subprog(env, target_insn); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + err = btf_check_subprog_call(env, subprog, caller->regs); + if (err == -EFAULT) + return err; + if (subprog_is_global(env, subprog)) { + if (err) { + verbose(env, "Caller passes invalid args into func#%d\n", subprog); + return err; + } + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "Func#%d is global and valid. Skipping.\n", subprog); clear_caller_saved_regs(env, caller->regs); + + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* continue with next insn after call */ return 0; } - err = setup_func_entry(env, subprog, *insn_idx, set_callee_state_cb, state); + /* for regular function entry setup new frame and continue + * from that frame. + */ + err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state); if (err) return err; @@ -9559,22 +9607,6 @@ static int set_callee_state(struct bpf_verifier_env *env, return 0; } -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx) -{ - int subprog, target_insn; - - target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", - target_insn); - return -EFAULT; - } - - return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); -} - static int set_map_elem_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9798,6 +9830,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); return -EINVAL; } + if (!calls_callback(env, callee->callsite)) { + verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", + *insn_idx, callee->callsite); + return -EFAULT; + } } else { /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; @@ -9815,7 +9852,15 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; } - *insn_idx = callee->callsite + 1; + /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, + * there function call logic would reschedule callback visit. If iteration + * converges is_state_visited() would prune that visit eventually. + */ + if (callee->in_callback_fn) + *insn_idx = callee->callsite; + else + *insn_idx = callee->callsite + 1; + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee, true); @@ -10228,24 +10273,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_for_each_map_elem: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_map_elem_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_map_elem_callback_state); break; case BPF_FUNC_timer_set_callback: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_timer_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_timer_callback_state); break; case BPF_FUNC_find_vma: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_find_vma_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_find_vma_callback_state); break; case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_loop_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { @@ -10341,8 +10386,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; } case BPF_FUNC_user_ringbuf_drain: - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_user_ringbuf_callback_state); + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_user_ringbuf_callback_state); break; } @@ -11230,7 +11275,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } -static bool is_callback_calling_kfunc(u32 btf_id) +static bool is_sync_callback_calling_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; } @@ -11982,6 +12027,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } + /* Check the arguments */ + err = check_kfunc_args(env, &meta, insn_idx); + if (err < 0) + return err; + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); @@ -12017,10 +12077,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - /* Check the arguments */ - err = check_kfunc_args(env, &meta, insn_idx); - if (err < 0) - return err; /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -12054,16 +12110,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, - set_rbtree_add_callback_state); - if (err) { - verbose(env, "kfunc %s#%d failed callback verification\n", - func_name, meta.func_id); - return err; - } - } - if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { if (!bpf_jit_supports_exceptions()) { verbose(env, "JIT does not support calling kfunc %s#%d\n", @@ -15427,6 +15473,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].force_checkpoint; } +static void mark_calls_callback(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].calls_callback = true; +} + +static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].calls_callback; +} enum { DONE_EXPLORING = 0, @@ -15540,6 +15595,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + /* For functions that invoke callbacks it is not known how many times + * callback would be called. Verifier models callback calling functions + * by repeatedly visiting callback bodies and returning to origin call + * instruction. + * In order to stop such iteration verifier needs to identify when a + * state identical some state from a previous iteration is reached. + * Check below forces creation of checkpoint before callback calling + * instruction to allow search for such identical states. + */ + if (is_sync_callback_calling_insn(insn)) { + mark_calls_callback(env, t); + mark_force_checkpoint(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -17009,10 +17079,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } goto skip_inf_loop_check; } + if (calls_callback(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, true)) + goto hit; + goto skip_inf_loop_check; + } /* attempt to detect infinite loop to avoid unnecessary doomed work */ if (states_maybe_looping(&sl->state, cur) && states_equal(env, &sl->state, cur, false) && - !iter_active_depths_differ(&sl->state, cur)) { + !iter_active_depths_differ(&sl->state, cur) && + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); verbose(env, "cur state:"); diff --git a/tools/testing/selftests/bpf/progs/cb_refs.c b/tools/testing/selftests/bpf/progs/cb_refs.c index 76d661b20e87d..56c764df81967 100644 --- a/tools/testing/selftests/bpf/progs/cb_refs.c +++ b/tools/testing/selftests/bpf/progs/cb_refs.c @@ -33,6 +33,7 @@ int underflow_prog(void *ctx) if (!p) return 0; bpf_for_each_map_elem(&array_map, cb1, &p, 0); + bpf_kfunc_call_test_release(p); return 0; } diff --git a/tools/testing/selftests/bpf/progs/exceptions_fail.c b/tools/testing/selftests/bpf/progs/exceptions_fail.c index 4c39e920dac22..8c0ef2742208a 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_fail.c +++ b/tools/testing/selftests/bpf/progs/exceptions_fail.c @@ -171,6 +171,7 @@ int reject_with_rbtree_add_throw(void *ctx) return 0; bpf_spin_lock(&lock); bpf_rbtree_add(&rbtree, &f->node, rbless); + bpf_spin_unlock(&lock); return 0; } @@ -214,6 +215,7 @@ int reject_with_cb_reference(void *ctx) if (!f) return 0; bpf_loop(5, subprog_cb_ref, NULL, 0); + bpf_obj_drop(f); return 0; } diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index db6b3143338b6..da803cffb5efb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -119,15 +119,26 @@ __naked int global_subprog_result_precise(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("14: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 14 first_idx 10") +__msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (25) if r6 > 0x3 goto pc+4") __msg("mark_precise: frame0: regs=r6 stack= before 10: (bf) r6 = r0") -__msg("mark_precise: frame0: parent state regs=r0 stack=:") -__msg("mark_precise: frame0: last_idx 18 first_idx 0") -__msg("mark_precise: frame0: regs=r0 stack= before 18: (95) exit") +__msg("mark_precise: frame0: regs=r0 stack= before 9: (85) call bpf_loop") +/* State entering callback body popped from states stack */ +__msg("from 9 to 17: frame1:") +__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("17: (b7) r0 = 0") +__msg("18: (95) exit") +__msg("returning from callee:") +__msg("to caller at 9:") +/* r4 (flags) is always precise for bpf_loop() */ +__msg("frame 0: propagating r4") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit") +__msg("from 18 to 9: safe") __naked int callback_result_precise(void) { asm volatile ( @@ -233,20 +244,36 @@ __naked int parent_callee_saved_reg_precise_global(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("12: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 12 first_idx 10") +__msg("mark_precise: frame0: last_idx 12 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 11: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 10: (27) r6 *= 4") +__msg("mark_precise: frame0: regs=r6 stack= before 9: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs=r6 stack=:") -__msg("mark_precise: frame0: last_idx 16 first_idx 0") -__msg("mark_precise: frame0: regs=r6 stack= before 16: (95) exit") -__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") -__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 8: (b7) r4 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 7: (b7) r3 = 0") __msg("mark_precise: frame0: regs=r6 stack= before 6: (bf) r2 = r8") __msg("mark_precise: frame0: regs=r6 stack= before 5: (b7) r1 = 1") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* State entering callback body popped from states stack */ +__msg("from 9 to 15: frame1:") +__msg("15: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("15: (b7) r0 = 0") +__msg("16: (95) exit") +__msg("returning from callee:") +__msg("to caller at 9:") +/* r4 (flags) is always precise for bpf_loop(), + * r6 was marked before backtracking to callback body. + */ +__msg("frame 0: propagating r4,r6") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit") +__msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") +__msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop") +__msg("mark_precise: frame0: parent state regs= stack=:") +__msg("from 16 to 9: safe") __naked int parent_callee_saved_reg_precise_with_callback(void) { asm volatile ( @@ -373,22 +400,38 @@ __naked int parent_stack_slot_precise_global(void) SEC("?raw_tp") __success __log_level(2) +/* First simulated path does not include callback body */ __msg("14: (0f) r1 += r6") -__msg("mark_precise: frame0: last_idx 14 first_idx 11") +__msg("mark_precise: frame0: last_idx 14 first_idx 10") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") __msg("mark_precise: frame0: regs=r6 stack= before 12: (27) r6 *= 4") __msg("mark_precise: frame0: regs=r6 stack= before 11: (79) r6 = *(u64 *)(r10 -8)") +__msg("mark_precise: frame0: regs= stack=-8 before 10: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs= stack=-8:") -__msg("mark_precise: frame0: last_idx 18 first_idx 0") -__msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") -__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") -__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 9 first_idx 0 subseq_idx 10") __msg("mark_precise: frame0: regs= stack=-8 before 9: (b7) r4 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 8: (b7) r3 = 0") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r2 = r8") __msg("mark_precise: frame0: regs= stack=-8 before 6: (bf) r1 = r6") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -8) = r6") __msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* State entering callback body popped from states stack */ +__msg("from 10 to 17: frame1:") +__msg("17: frame1: R1=scalar() R2=0 R10=fp0 cb") +__msg("17: (b7) r0 = 0") +__msg("18: (95) exit") +__msg("returning from callee:") +__msg("to caller at 10:") +/* r4 (flags) is always precise for bpf_loop(), + * fp-8 was marked before backtracking to callback body. + */ +__msg("frame 0: propagating r4,fp-8") +__msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1") +__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit") +__msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") +__msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") +__msg("mark_precise: frame0: parent state regs= stack=:") +__msg("from 18 to 10: safe") __naked int parent_stack_slot_precise_with_callback(void) { asm volatile ( From 958465e217dbf5fc6677d42d8827fb3073d86afd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:57 +0200 Subject: [PATCH 432/560] selftests/bpf: tests for iterating callbacks A set of test cases to check behavior of callback handling logic, check if verifier catches the following situations: - program not safe on second callback iteration; - program not safe on zero callback iterations; - infinite loop inside a callback. Verify that callback logic works for bpf_loop, bpf_for_each_map_elem, bpf_user_ringbuf_drain, bpf_find_vma. Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_iterating_callbacks.c | 147 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index e5c61aa6604ac..5cfa7a6316b63 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -31,6 +31,7 @@ #include "verifier_helper_restricted.skel.h" #include "verifier_helper_value_access.skel.h" #include "verifier_int_ptr.skel.h" +#include "verifier_iterating_callbacks.skel.h" #include "verifier_jeq_infer_not_null.skel.h" #include "verifier_ld_ind.skel.h" #include "verifier_ldsx.skel.h" @@ -139,6 +140,7 @@ void test_verifier_helper_packet_access(void) { RUN(verifier_helper_packet_acces void test_verifier_helper_restricted(void) { RUN(verifier_helper_restricted); } void test_verifier_helper_value_access(void) { RUN(verifier_helper_value_access); } void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } +void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c new file mode 100644 index 0000000000000..fa9429f77a817 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 8); + __type(key, __u32); + __type(value, __u64); +} map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_USER_RINGBUF); + __uint(max_entries, 8); +} ringbuf SEC(".maps"); + +struct vm_area_struct; +struct bpf_map; + +struct buf_context { + char *buf; +}; + +struct num_context { + __u64 i; +}; + +__u8 choice_arr[2] = { 0, 1 }; + +static int unsafe_on_2nd_iter_cb(__u32 idx, struct buf_context *ctx) +{ + if (idx == 0) { + ctx->buf = (char *)(0xDEAD); + return 0; + } + + if (bpf_probe_read_user(ctx->buf, 8, (void *)(0xBADC0FFEE))) + return 1; + + return 0; +} + +SEC("?raw_tp") +__failure __msg("R1 type=scalar expected=fp") +int unsafe_on_2nd_iter(void *unused) +{ + char buf[4]; + struct buf_context loop_ctx = { .buf = buf }; + + bpf_loop(100, unsafe_on_2nd_iter_cb, &loop_ctx, 0); + return 0; +} + +static int unsafe_on_zero_iter_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i = 0; + return 0; +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_on_zero_iter(void *unused) +{ + struct num_context loop_ctx = { .i = 32 }; + + bpf_loop(100, unsafe_on_zero_iter_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static int loop_detection_cb(__u32 idx, struct num_context *ctx) +{ + for (;;) {} + return 0; +} + +SEC("?raw_tp") +__failure __msg("infinite loop detected") +int loop_detection(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_loop(100, loop_detection_cb, &loop_ctx, 0); + return 0; +} + +static __always_inline __u64 oob_state_machine(struct num_context *ctx) +{ + switch (ctx->i) { + case 0: + ctx->i = 1; + break; + case 1: + ctx->i = 32; + break; + } + return 0; +} + +static __u64 for_each_map_elem_cb(struct bpf_map *map, __u32 *key, __u64 *val, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_for_each_map_elem(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_for_each_map_elem(&map, for_each_map_elem_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static __u64 ringbuf_drain_cb(struct bpf_dynptr *dynptr, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_ringbuf_drain(void *unused) +{ + struct num_context loop_ctx = { .i = 0 }; + + bpf_user_ringbuf_drain(&ringbuf, ringbuf_drain_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +static __u64 find_vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data) +{ + return oob_state_machine(data); +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=32 size=1") +int unsafe_find_vma(void *unused) +{ + struct task_struct *task = bpf_get_current_task_btf(); + struct num_context loop_ctx = { .i = 0 }; + + bpf_find_vma(task, 0, find_vma_cb, &loop_ctx, 0); + return choice_arr[loop_ctx.i]; +} + +char _license[] SEC("license") = "GPL"; From cafe2c21508a38cdb3ed22708842e957b2572c3e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:58 +0200 Subject: [PATCH 433/560] bpf: widening for callback iterators Callbacks are similar to open coded iterators, so add imprecise widening logic for callback body processing. This makes callback based loops behave identically to open coded iterators, e.g. allowing to verify programs like below: struct ctx { u32 i; }; int cb(u32 idx, struct ctx* ctx) { ++ctx->i; return 0; } ... struct ctx ctx = { .i = 0 }; bpf_loop(100, cb, &ctx, 0); ... Acked-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a60dfa56ebb3f..2f03e6b11bb9d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9799,9 +9799,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *state = env->cur_state, *prev_st; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + bool in_callback_fn; int err; callee = state->frame[state->curframe]; @@ -9856,7 +9857,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * there function call logic would reschedule callback visit. If iteration * converges is_state_visited() would prune that visit eventually. */ - if (callee->in_callback_fn) + in_callback_fn = callee->in_callback_fn; + if (in_callback_fn) *insn_idx = callee->callsite; else *insn_idx = callee->callsite + 1; @@ -9871,6 +9873,24 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + + /* for callbacks widen imprecise scalars to make programs like below verify: + * + * struct ctx { int i; } + * void cb(int idx, struct ctx *ctx) { ctx->i++; ... } + * ... + * struct ctx = { .i = 0; } + * bpf_loop(100, cb, &ctx, 0); + * + * This is similar to what is done in process_iter_next_call() for open + * coded iterators. + */ + prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL; + if (prev_st) { + err = widen_imprecise_scalars(env, prev_st, state); + if (err) + return err; + } return 0; } From 9f3330aa644d6d979eb064c46e85c62d4b4eac75 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:06:59 +0200 Subject: [PATCH 434/560] selftests/bpf: test widening for iterating callbacks A test case to verify that imprecise scalars widening is applied to callback entering state, when callback call is simulated repeatedly. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_iterating_callbacks.c | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index fa9429f77a817..598c1e984b267 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -25,6 +25,7 @@ struct buf_context { struct num_context { __u64 i; + __u64 j; }; __u8 choice_arr[2] = { 0, 1 }; @@ -69,6 +70,25 @@ int unsafe_on_zero_iter(void *unused) return choice_arr[loop_ctx.i]; } +static int widening_cb(__u32 idx, struct num_context *ctx) +{ + ++ctx->i; + return 0; +} + +SEC("?raw_tp") +__success +int widening(void *unused) +{ + struct num_context loop_ctx = { .i = 0, .j = 1 }; + + bpf_loop(100, widening_cb, &loop_ctx, 0); + /* loop_ctx.j is not changed during callback iteration, + * verifier should not apply widening to it. + */ + return choice_arr[loop_ctx.j]; +} + static int loop_detection_cb(__u32 idx, struct num_context *ctx) { for (;;) {} From bb124da69c47dd98d69361ec13244ece50bec63e Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:07:00 +0200 Subject: [PATCH 435/560] bpf: keep track of max number of bpf_loop callback iterations In some cases verifier can't infer convergence of the bpf_loop() iteration. E.g. for the following program: static int cb(__u32 idx, struct num_context* ctx) { ctx->i++; return 0; } SEC("?raw_tp") int prog(void *_) { struct num_context ctx = { .i = 0 }; __u8 choice_arr[2] = { 0, 1 }; bpf_loop(2, cb, &ctx, 0); return choice_arr[ctx.i]; } Each 'cb' simulation would eventually return to 'prog' and reach 'return choice_arr[ctx.i]' statement. At which point ctx.i would be marked precise, thus forcing verifier to track multitude of separate states with {.i=0}, {.i=1}, ... at bpf_loop() callback entry. This commit allows "brute force" handling for such cases by limiting number of callback body simulations using 'umax' value of the first bpf_loop() parameter. For this, extend bpf_func_state with 'callback_depth' field. Increment this field when callback visiting state is pushed to states traversal stack. For frame #N it's 'callback_depth' field counts how many times callback with frame depth N+1 had been executed. Use bpf_func_state specifically to allow independent tracking of callback depths when multiple nested bpf_loop() calls are present. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-11-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 ++++++ kernel/bpf/verifier.c | 19 ++++++++-- .../bpf/progs/verifier_subprog_precision.c | 35 +++++++++++++------ 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dd326936dd6ff..aa4d19d0bc94b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -301,6 +301,17 @@ struct bpf_func_state { struct tnum callback_ret_range; bool in_async_callback_fn; bool in_exception_callback_fn; + /* For callback calling functions that limit number of possible + * callback executions (e.g. bpf_loop) keeps track of current + * simulated iteration number. + * Value in frame N refers to number of times callback with frame + * N+1 was simulated, e.g. for the following call: + * + * bpf_loop(..., fn, ...); | suppose current frame is N + * | fn would be simulated in frame N+1 + * | number of simulations is tracked in frame N + */ + u32 callback_depth; /* The following fields should be last. See copy_func_state() */ int acquired_refs; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2f03e6b11bb9d..af2819d5c8ee7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9505,6 +9505,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return err; callback_state->callback_unroll_depth++; + callback_state->frame[callback_state->curframe - 1]->callback_depth++; + caller->callback_depth = 0; return 0; } @@ -10309,8 +10311,21 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn break; case BPF_FUNC_loop: update_loop_inline_state(env, meta.subprogno); - err = push_callback_call(env, insn, insn_idx, meta.subprogno, - set_loop_callback_state); + /* Verifier relies on R1 value to determine if bpf_loop() iteration + * is finished, thus mark it precise. + */ + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; + if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_loop_callback_state); + } else { + cur_func(env)->callback_depth = 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame%d bpf_loop iteration limit reached\n", + env->cur_state->curframe); + } break; case BPF_FUNC_dynptr_from_mem: if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) { diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index da803cffb5efb..f61d623b1ce8d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -119,7 +119,23 @@ __naked int global_subprog_result_precise(void) SEC("?raw_tp") __success __log_level(2) -/* First simulated path does not include callback body */ +/* First simulated path does not include callback body, + * r1 and r4 are always precise for bpf_loop() calls. + */ +__msg("9: (85) call bpf_loop#181") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: parent state regs=r4 stack=:") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") +__msg("mark_precise: frame0: regs=r4 stack= before 8: (b7) r4 = 0") +__msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") +__msg("mark_precise: frame0: parent state regs=r1 stack=:") +__msg("mark_precise: frame0: last_idx 8 first_idx 0 subseq_idx 9") +__msg("mark_precise: frame0: regs=r1 stack= before 8: (b7) r4 = 0") +__msg("mark_precise: frame0: regs=r1 stack= before 7: (b7) r3 = 0") +__msg("mark_precise: frame0: regs=r1 stack= before 6: (bf) r2 = r8") +__msg("mark_precise: frame0: regs=r1 stack= before 5: (bf) r1 = r6") +__msg("mark_precise: frame0: regs=r6 stack= before 4: (b7) r6 = 3") +/* r6 precision propagation */ __msg("14: (0f) r1 += r6") __msg("mark_precise: frame0: last_idx 14 first_idx 9") __msg("mark_precise: frame0: regs=r6 stack= before 13: (bf) r1 = r7") @@ -134,10 +150,9 @@ __msg("17: (b7) r0 = 0") __msg("18: (95) exit") __msg("returning from callee:") __msg("to caller at 9:") -/* r4 (flags) is always precise for bpf_loop() */ -__msg("frame 0: propagating r4") +__msg("frame 0: propagating r1,r4") __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4 stack= before 18: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4 stack= before 18: (95) exit") __msg("from 18 to 9: safe") __naked int callback_result_precise(void) { @@ -264,12 +279,12 @@ __msg("15: (b7) r0 = 0") __msg("16: (95) exit") __msg("returning from callee:") __msg("to caller at 9:") -/* r4 (flags) is always precise for bpf_loop(), +/* r1, r4 are always precise for bpf_loop(), * r6 was marked before backtracking to callback body. */ -__msg("frame 0: propagating r4,r6") +__msg("frame 0: propagating r1,r4,r6") __msg("mark_precise: frame0: last_idx 9 first_idx 9 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4,r6 stack= before 16: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4,r6 stack= before 16: (95) exit") __msg("mark_precise: frame1: regs= stack= before 15: (b7) r0 = 0") __msg("mark_precise: frame1: regs= stack= before 9: (85) call bpf_loop") __msg("mark_precise: frame0: parent state regs= stack=:") @@ -422,12 +437,12 @@ __msg("17: (b7) r0 = 0") __msg("18: (95) exit") __msg("returning from callee:") __msg("to caller at 10:") -/* r4 (flags) is always precise for bpf_loop(), +/* r1, r4 are always precise for bpf_loop(), * fp-8 was marked before backtracking to callback body. */ -__msg("frame 0: propagating r4,fp-8") +__msg("frame 0: propagating r1,r4,fp-8") __msg("mark_precise: frame0: last_idx 10 first_idx 10 subseq_idx -1") -__msg("mark_precise: frame0: regs=r4 stack=-8 before 18: (95) exit") +__msg("mark_precise: frame0: regs=r1,r4 stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (b7) r0 = 0") __msg("mark_precise: frame1: regs= stack= before 10: (85) call bpf_loop#181") __msg("mark_precise: frame0: parent state regs= stack=:") From 57e2a52deeb12ab84c15c6d0fb93638b5b94001b Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 21 Nov 2023 04:07:01 +0200 Subject: [PATCH 436/560] selftests/bpf: check if max number of bpf_loop iterations is tracked Check that even if bpf_loop() callback simulation does not converge to a specific state, verification could proceed via "brute force" simulation of maximal number of callback calls. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231121020701.26440-12-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_iterating_callbacks.c | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index 598c1e984b267..5905e036e0eac 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -164,4 +164,79 @@ int unsafe_find_vma(void *unused) return choice_arr[loop_ctx.i]; } +static int iter_limit_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i++; + return 0; +} + +SEC("?raw_tp") +__success +int bpf_loop_iter_limit_ok(void *unused) +{ + struct num_context ctx = { .i = 0 }; + + bpf_loop(1, iter_limit_cb, &ctx, 0); + return choice_arr[ctx.i]; +} + +SEC("?raw_tp") +__failure __msg("invalid access to map value, value_size=2 off=2 size=1") +int bpf_loop_iter_limit_overflow(void *unused) +{ + struct num_context ctx = { .i = 0 }; + + bpf_loop(2, iter_limit_cb, &ctx, 0); + return choice_arr[ctx.i]; +} + +static int iter_limit_level2a_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 100; + return 0; +} + +static int iter_limit_level2b_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 10; + return 0; +} + +static int iter_limit_level1_cb(__u32 idx, struct num_context *ctx) +{ + ctx->i += 1; + bpf_loop(1, iter_limit_level2a_cb, ctx, 0); + bpf_loop(1, iter_limit_level2b_cb, ctx, 0); + return 0; +} + +/* Check that path visiting every callback function once had been + * reached by verifier. Variables 'ctx{1,2}i' below serve as flags, + * with each decimal digit corresponding to a callback visit marker. + */ +SEC("socket") +__success __retval(111111) +int bpf_loop_iter_limit_nested(void *unused) +{ + struct num_context ctx1 = { .i = 0 }; + struct num_context ctx2 = { .i = 0 }; + __u64 a, b, c; + + bpf_loop(1, iter_limit_level1_cb, &ctx1, 0); + bpf_loop(1, iter_limit_level1_cb, &ctx2, 0); + a = ctx1.i; + b = ctx2.i; + /* Force 'ctx1.i' and 'ctx2.i' precise. */ + c = choice_arr[(a + b) % 2]; + /* This makes 'c' zero, but neither clang nor verifier know it. */ + c /= 10; + /* Make sure that verifier does not visit 'impossible' states: + * enumerate all possible callback visit masks. + */ + if (a != 0 && a != 1 && a != 11 && a != 101 && a != 111 && + b != 0 && b != 1 && b != 11 && b != 101 && b != 111) + asm volatile ("r0 /= 0;" ::: "r0"); + return 1000 * a + b + c; +} + char _license[] SEC("license") = "GPL"; From d3ec75bc635cb0cb8185b63293d33a3d1b942d22 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 437/560] LoongArch: Add dependency between vmlinuz.efi and vmlinux.efi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common issue in Makefile is a race in parallel building. You need to be careful to prevent multiple threads from writing to the same file simultaneously. Commit 3939f3345050 ("ARM: 8418/1: add boot image dependencies to not generate invalid images") addressed such a bad scenario. A similar symptom occurs with the following command: $ make -j$(nproc) ARCH=loongarch vmlinux.efi vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/loongarch/boot/vmlinux.efi OBJCOPY arch/loongarch/boot/vmlinux.efi PAD arch/loongarch/boot/vmlinux.bin GZIP arch/loongarch/boot/vmlinuz OBJCOPY arch/loongarch/boot/vmlinuz.o LD arch/loongarch/boot/vmlinuz.efi.elf OBJCOPY arch/loongarch/boot/vmlinuz.efi The log "OBJCOPY arch/loongarch/boot/vmlinux.efi" is displayed twice. It indicates that two threads simultaneously enter arch/loongarch/boot/ and write to arch/loongarch/boot/vmlinux.efi. It occasionally leads to a build failure: $ make -j$(nproc) ARCH=loongarch vmlinux.efi vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/loongarch/boot/vmlinux.efi PAD arch/loongarch/boot/vmlinux.bin truncate: Invalid number: ‘arch/loongarch/boot/vmlinux.bin’ make[2]: *** [drivers/firmware/efi/libstub/Makefile.zboot:13: arch/loongarch/boot/vmlinux.bin] Error 1 make[2]: *** Deleting file 'arch/loongarch/boot/vmlinux.bin' make[1]: *** [arch/loongarch/Makefile:146: vmlinuz.efi] Error 2 make[1]: *** Waiting for unfinished jobs.... make: *** [Makefile:234: __sub-make] Error 2 vmlinuz.efi depends on vmlinux.efi, but such a dependency is not specified in arch/loongarch/Makefile. Signed-off-by: Masahiro Yamada Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 9eeb0c05f3f4d..a8d65eaf1211f 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -142,6 +142,8 @@ vdso-install-y += arch/loongarch/vdso/vdso.so.dbg all: $(notdir $(KBUILD_IMAGE)) +vmlinuz.efi: vmlinux.efi + vmlinux.elf vmlinux.efi vmlinuz.efi: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(bootvars-y) $(boot)/$@ From cbfd44bd5c6eec0aada0c39130f0b8d7ecba0529 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 438/560] LoongArch: Explicitly set -fdirect-access-external-data for vmlinux After this llvm commit [1], The -fno-pic does not imply direct access external data. Explicitly set -fdirect-access-external-data for vmlinux that can avoids GOT entries. Link: https://github.com/llvm/llvm-project/commit/47eeee297775347cbdb7624d6a766c2a3eec4a59 Suggested-by: Xi Ruoyao Signed-off-by: WANG Rui Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index a8d65eaf1211f..204b94b2e6aaa 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -68,6 +68,7 @@ LDFLAGS_vmlinux += -static -n -nostdlib ifdef CONFIG_AS_HAS_EXPLICIT_RELOCS cflags-y += $(call cc-option,-mexplicit-relocs) KBUILD_CFLAGS_KERNEL += $(call cc-option,-mdirect-extern-access) +KBUILD_CFLAGS_KERNEL += $(call cc-option,-fdirect-access-external-data) KBUILD_AFLAGS_MODULE += $(call cc-option,-fno-direct-access-external-data) KBUILD_CFLAGS_MODULE += $(call cc-option,-fno-direct-access-external-data) KBUILD_AFLAGS_MODULE += $(call cc-option,-mno-relax) $(call cc-option,-Wa$(comma)-mno-relax) From aa0cbc1b506b090c3a775b547c693ada108cc0d7 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 439/560] LoongArch: Record pc instead of offset in la_abs relocation To clarify, the previous version functioned flawlessly. However, it's worth noting that the LLVM's LoongArch backend currently lacks support for cross-section label calculations. With this patch, we enable the use of clang to compile relocatable kernels. Tested-by: Nathan Chancellor Signed-off-by: WANG Rui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/asmmacro.h | 3 +-- arch/loongarch/include/asm/setup.h | 2 +- arch/loongarch/kernel/relocate.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h index c9544f358c339..655db7d7a4279 100644 --- a/arch/loongarch/include/asm/asmmacro.h +++ b/arch/loongarch/include/asm/asmmacro.h @@ -609,8 +609,7 @@ lu32i.d \reg, 0 lu52i.d \reg, \reg, 0 .pushsection ".la_abs", "aw", %progbits - 768: - .dword 768b-766b + .dword 766b .dword \sym .popsection #endif diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h index a0bc159ce8bdc..ee52fb1e99631 100644 --- a/arch/loongarch/include/asm/setup.h +++ b/arch/loongarch/include/asm/setup.h @@ -25,7 +25,7 @@ extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len #ifdef CONFIG_RELOCATABLE struct rela_la_abs { - long offset; + long pc; long symvalue; }; diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c index 6c3eff9af9fb1..288b739ca88dd 100644 --- a/arch/loongarch/kernel/relocate.c +++ b/arch/loongarch/kernel/relocate.c @@ -52,7 +52,7 @@ static inline void __init relocate_absolute(long random_offset) for (p = begin; (void *)p < end; p++) { long v = p->symvalue; uint32_t lu12iw, ori, lu32id, lu52id; - union loongarch_instruction *insn = (void *)p - p->offset; + union loongarch_instruction *insn = (void *)p->pc; lu12iw = (v >> 12) & 0xfffff; ori = v & 0xfff; From ee2daf7102f42007b7bf5dbd1ae76478ee62c512 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 440/560] LoongArch: Add __percpu annotation for __percpu_read()/__percpu_write() When build kernel with C=1, we get: arch/loongarch/kernel/process.c:234:46: warning: incorrect type in argument 1 (different address spaces) arch/loongarch/kernel/process.c:234:46: expected void *ptr arch/loongarch/kernel/process.c:234:46: got unsigned long [noderef] __percpu * arch/loongarch/kernel/process.c:234:46: warning: incorrect type in argument 1 (different address spaces) arch/loongarch/kernel/process.c:234:46: expected void *ptr arch/loongarch/kernel/process.c:234:46: got unsigned long [noderef] __percpu * arch/loongarch/kernel/process.c:234:46: warning: incorrect type in argument 1 (different address spaces) arch/loongarch/kernel/process.c:234:46: expected void *ptr arch/loongarch/kernel/process.c:234:46: got unsigned long [noderef] __percpu * arch/loongarch/kernel/process.c:234:46: warning: incorrect type in argument 1 (different address spaces) arch/loongarch/kernel/process.c:234:46: expected void *ptr arch/loongarch/kernel/process.c:234:46: got unsigned long [noderef] __percpu * Add __percpu annotation for __percpu_read()/__percpu_write() can avoid such warnings. __percpu_xchg() and other functions don't need annotation because their wrapper, i.e. _pcp_protect(), already suppresses warnings. Also adjust the indentations in this file. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311080409.LlOfTR3m-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311080840.Vc2kXhfp-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311081340.3k72KKdg-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311120926.cjYHyoYw-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311152142.g6UyNx1R-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311160339.DbhaH8LX-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202311181454.CTPrSYmQ-lkp@intel.com/ Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/percpu.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h index ed5da02b1cf6f..9b36ac003f890 100644 --- a/arch/loongarch/include/asm/percpu.h +++ b/arch/loongarch/include/asm/percpu.h @@ -40,13 +40,13 @@ static __always_inline unsigned long __percpu_##op(void *ptr, \ switch (size) { \ case 4: \ __asm__ __volatile__( \ - "am"#asm_op".w" " %[ret], %[val], %[ptr] \n" \ + "am"#asm_op".w" " %[ret], %[val], %[ptr] \n" \ : [ret] "=&r" (ret), [ptr] "+ZB"(*(u32 *)ptr) \ : [val] "r" (val)); \ break; \ case 8: \ __asm__ __volatile__( \ - "am"#asm_op".d" " %[ret], %[val], %[ptr] \n" \ + "am"#asm_op".d" " %[ret], %[val], %[ptr] \n" \ : [ret] "=&r" (ret), [ptr] "+ZB"(*(u64 *)ptr) \ : [val] "r" (val)); \ break; \ @@ -63,7 +63,7 @@ PERCPU_OP(and, and, &) PERCPU_OP(or, or, |) #undef PERCPU_OP -static __always_inline unsigned long __percpu_read(void *ptr, int size) +static __always_inline unsigned long __percpu_read(void __percpu *ptr, int size) { unsigned long ret; @@ -100,7 +100,7 @@ static __always_inline unsigned long __percpu_read(void *ptr, int size) return ret; } -static __always_inline void __percpu_write(void *ptr, unsigned long val, int size) +static __always_inline void __percpu_write(void __percpu *ptr, unsigned long val, int size) { switch (size) { case 1: @@ -132,8 +132,7 @@ static __always_inline void __percpu_write(void *ptr, unsigned long val, int siz } } -static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, - int size) +static __always_inline unsigned long __percpu_xchg(void *ptr, unsigned long val, int size) { switch (size) { case 1: From 902d75cdf0cf0a3fb58550089ee519abf12566f5 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 441/560] LoongArch: Silence the boot warning about 'nokaslr' The kernel parameter 'nokaslr' is handled before start_kernel(), so we don't need early_param() to mark it technically. But it can cause a boot warning as follows: Unknown kernel command line parameters "nokaslr", will be passed to user space. When we use 'init=/bin/bash', 'nokaslr' which passed to user space will even cause a kernel panic. So we use early_param() to mark 'nokaslr', simply print a notice and silence the boot warning (also fix a potential panic). This logic is similar to RISC-V. Signed-off-by: Huacai Chen --- arch/loongarch/kernel/relocate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c index 288b739ca88dd..1acfa704c8d09 100644 --- a/arch/loongarch/kernel/relocate.c +++ b/arch/loongarch/kernel/relocate.c @@ -102,6 +102,14 @@ static inline __init unsigned long get_random_boot(void) return hash; } +static int __init nokaslr(char *p) +{ + pr_info("KASLR is disabled.\n"); + + return 0; /* Print a notice and silence the boot warning */ +} +early_param("nokaslr", nokaslr); + static inline __init bool kaslr_disabled(void) { char *str; From 19d86a496233731882aea7ec24505ce6641b1c0c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 442/560] LoongArch: Mark {dmw,tlb}_virt_to_page() exports as non-GPL Mark {dmw,tlb}_virt_to_page() exports as non-GPL, in order to let out-of-tree modules (e.g. OpenZFS) be built without errors. Otherwise we get: ERROR: modpost: GPL-incompatible module zfs.ko uses GPL-only symbol 'dmw_virt_to_page' ERROR: modpost: GPL-incompatible module zfs.ko uses GPL-only symbol 'tlb_virt_to_page' Reported-by: Haowu Ge Signed-off-by: Huacai Chen --- arch/loongarch/mm/pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 71d0539e2d0b0..2aae72e638713 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -13,13 +13,13 @@ struct page *dmw_virt_to_page(unsigned long kaddr) { return pfn_to_page(virt_to_pfn(kaddr)); } -EXPORT_SYMBOL_GPL(dmw_virt_to_page); +EXPORT_SYMBOL(dmw_virt_to_page); struct page *tlb_virt_to_page(unsigned long kaddr) { return pfn_to_page(pte_pfn(*virt_to_kpte(kaddr))); } -EXPORT_SYMBOL_GPL(tlb_virt_to_page); +EXPORT_SYMBOL(tlb_virt_to_page); pgd_t *pgd_alloc(struct mm_struct *mm) { From d43f37b73468c172bc89ac4824a1511b411f0778 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 443/560] LoongArch: Implement constant timer shutdown interface When a cpu is hot-unplugged, it is put in idle state and the function arch_cpu_idle_dead() is called. The timer interrupt for this processor should be disabled, otherwise there will be pending timer interrupt for the unplugged cpu, so that vcpu is prevented from giving up scheduling when system is running in vm mode. This patch implements the timer shutdown interface so that the constant timer will be properly disabled when a CPU is hot-unplugged. Reviewed-by: WANG Xuerui Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kernel/time.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index 3064af94db9c2..e7015f7b70e37 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -58,14 +58,16 @@ static int constant_set_state_oneshot(struct clock_event_device *evt) return 0; } -static int constant_set_state_oneshot_stopped(struct clock_event_device *evt) +static int constant_set_state_periodic(struct clock_event_device *evt) { + unsigned long period; unsigned long timer_config; raw_spin_lock(&state_lock); - timer_config = csr_read64(LOONGARCH_CSR_TCFG); - timer_config &= ~CSR_TCFG_EN; + period = const_clock_freq / HZ; + timer_config = period & CSR_TCFG_VAL; + timer_config |= (CSR_TCFG_PERIOD | CSR_TCFG_EN); csr_write64(timer_config, LOONGARCH_CSR_TCFG); raw_spin_unlock(&state_lock); @@ -73,16 +75,14 @@ static int constant_set_state_oneshot_stopped(struct clock_event_device *evt) return 0; } -static int constant_set_state_periodic(struct clock_event_device *evt) +static int constant_set_state_shutdown(struct clock_event_device *evt) { - unsigned long period; unsigned long timer_config; raw_spin_lock(&state_lock); - period = const_clock_freq / HZ; - timer_config = period & CSR_TCFG_VAL; - timer_config |= (CSR_TCFG_PERIOD | CSR_TCFG_EN); + timer_config = csr_read64(LOONGARCH_CSR_TCFG); + timer_config &= ~CSR_TCFG_EN; csr_write64(timer_config, LOONGARCH_CSR_TCFG); raw_spin_unlock(&state_lock); @@ -90,11 +90,6 @@ static int constant_set_state_periodic(struct clock_event_device *evt) return 0; } -static int constant_set_state_shutdown(struct clock_event_device *evt) -{ - return 0; -} - static int constant_timer_next_event(unsigned long delta, struct clock_event_device *evt) { unsigned long timer_config; @@ -161,7 +156,7 @@ int constant_clockevent_init(void) cd->rating = 320; cd->cpumask = cpumask_of(cpu); cd->set_state_oneshot = constant_set_state_oneshot; - cd->set_state_oneshot_stopped = constant_set_state_oneshot_stopped; + cd->set_state_oneshot_stopped = constant_set_state_shutdown; cd->set_state_periodic = constant_set_state_periodic; cd->set_state_shutdown = constant_set_state_shutdown; cd->set_next_event = constant_timer_next_event; From 10301780c96e70b694eca86a3ccfa1893a4a6d3e Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 21 Nov 2023 15:03:25 +0800 Subject: [PATCH 444/560] Docs/LoongArch: Update links in LoongArch introduction.rst LoongArch-Vol1 has been updated to v1.10, the links in the documentation are out of date, let's update it. Signed-off-by: Yanteng Si Signed-off-by: Huacai Chen --- Documentation/arch/loongarch/introduction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/loongarch/introduction.rst b/Documentation/arch/loongarch/introduction.rst index 8c568cfc21079..5e6db78abeaf5 100644 --- a/Documentation/arch/loongarch/introduction.rst +++ b/Documentation/arch/loongarch/introduction.rst @@ -375,9 +375,9 @@ Developer web site of Loongson and LoongArch (Software and Documentation): Documentation of LoongArch ISA: - https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.02-CN.pdf (in Chinese) + https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.10-CN.pdf (in Chinese) - https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.02-EN.pdf (in English) + https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.10-EN.pdf (in English) Documentation of LoongArch ELF psABI: From c517fd2738f472eb0d1db60a70d91629349a9bf8 Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Tue, 21 Nov 2023 15:03:26 +0800 Subject: [PATCH 445/560] Docs/zh_CN/LoongArch: Update links in LoongArch introduction.rst LoongArch-Vol1 has been updated to v1.10, the links in the documentation are out of date, let's update it. Signed-off-by: Yanteng Si Signed-off-by: Huacai Chen --- .../translations/zh_CN/arch/loongarch/introduction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/translations/zh_CN/arch/loongarch/introduction.rst b/Documentation/translations/zh_CN/arch/loongarch/introduction.rst index 59d6bf33050cb..bf463c5a4c514 100644 --- a/Documentation/translations/zh_CN/arch/loongarch/introduction.rst +++ b/Documentation/translations/zh_CN/arch/loongarch/introduction.rst @@ -338,9 +338,9 @@ Loongson与LoongArch的开发者网站(软件与文档资源): LoongArch指令集架构的文档: - https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.02-CN.pdf (中文版) + https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.10-CN.pdf (中文版) - https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.02-EN.pdf (英文版) + https://github.com/loongson/LoongArch-Documentation/releases/latest/download/LoongArch-Vol1-v1.10-EN.pdf (英文版) LoongArch的ELF psABI文档: From 8d9ce3e53bbd2d6241213e9a4deb310499c929ff Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 20 Nov 2023 16:45:45 +0100 Subject: [PATCH 446/560] MAINTAINERS: Drop Mark Gross as maintainer for x86 platform drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark has not really been active as maintainer for x86 platform drivers lately, drop Mark from the MAINTAINERS entries for drivers/platform/x86, drivers/platform/mellanox and drivers/platform/surface. Cc: Mark Gross Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120154548.611041-1-hdegoede@redhat.com Signed-off-by: Ilpo Järvinen --- MAINTAINERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0666ddb655521..e972ce5f54e70 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13778,7 +13778,6 @@ F: drivers/net/ethernet/mellanox/mlxfw/ MELLANOX HARDWARE PLATFORM SUPPORT M: Hans de Goede M: Ilpo Järvinen -M: Mark Gross M: Vadim Pasternak L: platform-driver-x86@vger.kernel.org S: Supported @@ -14387,7 +14386,6 @@ F: drivers/platform/surface/surface_gpe.c MICROSOFT SURFACE HARDWARE PLATFORM SUPPORT M: Hans de Goede M: Ilpo Järvinen -M: Mark Gross M: Maximilian Luz L: platform-driver-x86@vger.kernel.org S: Maintained @@ -23653,7 +23651,6 @@ F: drivers/platform/x86/x86-android-tablets/ X86 PLATFORM DRIVERS M: Hans de Goede M: Ilpo Järvinen -M: Mark Gross L: platform-driver-x86@vger.kernel.org S: Maintained Q: https://patchwork.kernel.org/project/platform-driver-x86/list/ From a6584711e64d9d12ab79a450ec3628fd35e4f476 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:07:56 +0200 Subject: [PATCH 447/560] platform/x86: intel_telemetry: Fix kernel doc descriptions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LKP found issues with a kernel doc in the driver: core.c:116: warning: Function parameter or member 'ioss_evtconfig' not described in 'telemetry_update_events' core.c:188: warning: Function parameter or member 'ioss_evtconfig' not described in 'telemetry_get_eventconfig' It looks like it were copy'n'paste typos when these descriptions had been introduced. Fix the typos. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202310070743.WALmRGSY-lkp@intel.com/ Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120150756.1661425-1-andriy.shevchenko@linux.intel.com Reviewed-by: Rajneesh Bhardwaj Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/telemetry/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/telemetry/core.c b/drivers/platform/x86/intel/telemetry/core.c index fdf55b5d69480..e4be40f73eebf 100644 --- a/drivers/platform/x86/intel/telemetry/core.c +++ b/drivers/platform/x86/intel/telemetry/core.c @@ -102,7 +102,7 @@ static const struct telemetry_core_ops telm_defpltops = { /** * telemetry_update_events() - Update telemetry Configuration * @pss_evtconfig: PSS related config. No change if num_evts = 0. - * @pss_evtconfig: IOSS related config. No change if num_evts = 0. + * @ioss_evtconfig: IOSS related config. No change if num_evts = 0. * * This API updates the IOSS & PSS Telemetry configuration. Old config * is overwritten. Call telemetry_reset_events when logging is over @@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(telemetry_reset_events); /** * telemetry_get_eventconfig() - Returns the pss and ioss events enabled * @pss_evtconfig: Pointer to PSS related configuration. - * @pss_evtconfig: Pointer to IOSS related configuration. + * @ioss_evtconfig: Pointer to IOSS related configuration. * @pss_len: Number of u32 elements allocated for pss_evtconfig array * @ioss_len: Number of u32 elements allocated for ioss_evtconfig array * From 3f7c0634926daf48cd2f6db6c1197a1047074088 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Wed, 15 Nov 2023 12:10:04 +0100 Subject: [PATCH 448/560] accel/ivpu/37xx: Fix hangs related to MMIO reset There is no need to call MMIO reset using VPU_37XX_BUTTRESS_VPU_IP_RESET register. IP will be reset by FLR or by entering d0i3. Also IP reset during power_up is not needed as the VPU is already in reset. Removing MMIO reset improves stability as it a partial device reset that is not safe in some corner cases. This change also brings back ivpu_boot_pwr_domain_disable() that helps to properly power down VPU when it is hung by a buggy workload. Signed-off-by: Jacek Lawrynowicz Fixes: 828d63042aec ("accel/ivpu: Don't enter d0i3 during FLR") Reviewed-by: Jeffrey Hugo Link: https://patchwork.freedesktop.org/patch/msgid/20231115111004.1304092-1-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_hw_37xx.c | 46 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c index 5c0246b9e5228..4ccf1994b97ad 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ b/drivers/accel/ivpu/ivpu_hw_37xx.c @@ -502,6 +502,16 @@ static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev) return ret; } +static int ivpu_boot_pwr_domain_disable(struct ivpu_device *vdev) +{ + ivpu_boot_dpu_active_drive(vdev, false); + ivpu_boot_pwr_island_isolation_drive(vdev, true); + ivpu_boot_pwr_island_trickle_drive(vdev, false); + ivpu_boot_pwr_island_drive(vdev, false); + + return ivpu_boot_wait_for_pwr_island_status(vdev, 0x0); +} + static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev) { u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES); @@ -600,25 +610,17 @@ static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev) static int ivpu_hw_37xx_reset(struct ivpu_device *vdev) { - int ret; - u32 val; - - if (IVPU_WA(punit_disabled)) - return 0; + int ret = 0; - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n"); - return ret; + if (ivpu_boot_pwr_domain_disable(vdev)) { + ivpu_err(vdev, "Failed to disable power domain\n"); + ret = -EIO; } - val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET); - val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val); - REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val); - - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) - ivpu_err(vdev, "Timed out waiting for RESET completion\n"); + if (ivpu_pll_disable(vdev)) { + ivpu_err(vdev, "Failed to disable PLL\n"); + ret = -EIO; + } return ret; } @@ -651,10 +653,6 @@ static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev) { int ret; - ret = ivpu_hw_37xx_reset(vdev); - if (ret) - ivpu_warn(vdev, "Failed to reset HW: %d\n", ret); - ret = ivpu_hw_37xx_d0i3_disable(vdev); if (ret) ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret); @@ -722,11 +720,11 @@ static int ivpu_hw_37xx_power_down(struct ivpu_device *vdev) { int ret = 0; - if (!ivpu_hw_37xx_is_idle(vdev) && ivpu_hw_37xx_reset(vdev)) - ivpu_err(vdev, "Failed to reset the VPU\n"); + if (!ivpu_hw_37xx_is_idle(vdev)) + ivpu_warn(vdev, "VPU not idle during power down\n"); - if (ivpu_pll_disable(vdev)) { - ivpu_err(vdev, "Failed to disable PLL\n"); + if (ivpu_hw_37xx_reset(vdev)) { + ivpu_err(vdev, "Failed to reset VPU\n"); ret = -EIO; } From f2d4a5834638bbc967371b9168c0b481519f7c5e Mon Sep 17 00:00:00 2001 From: Hamish Martin Date: Wed, 25 Oct 2023 16:55:10 +1300 Subject: [PATCH 449/560] HID: mcp2221: Set driver data before I2C adapter add The process of adding an I2C adapter can invoke I2C accesses on that new adapter (see i2c_detect()). Ensure we have set the adapter's driver data to avoid null pointer dereferences in the xfer functions during the adapter add. This has been noted in the past and the same fix proposed but not completed. See: https://lore.kernel.org/lkml/ef597e73-ed71-168e-52af-0d19b03734ac@vigem.de/ Signed-off-by: Hamish Martin Signed-off-by: Jiri Kosina --- drivers/hid/hid-mcp2221.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c index 72883e0ce7575..b95f31cf0fa21 100644 --- a/drivers/hid/hid-mcp2221.c +++ b/drivers/hid/hid-mcp2221.c @@ -1157,12 +1157,12 @@ static int mcp2221_probe(struct hid_device *hdev, snprintf(mcp->adapter.name, sizeof(mcp->adapter.name), "MCP2221 usb-i2c bridge"); + i2c_set_adapdata(&mcp->adapter, mcp); ret = devm_i2c_add_adapter(&hdev->dev, &mcp->adapter); if (ret) { hid_err(hdev, "can't add usb-i2c adapter: %d\n", ret); return ret; } - i2c_set_adapdata(&mcp->adapter, mcp); #if IS_REACHABLE(CONFIG_GPIOLIB) /* Setup GPIO chip */ From 73ce9f1f2741a38f5d27393e627702ae2c46e6f2 Mon Sep 17 00:00:00 2001 From: Hamish Martin Date: Wed, 25 Oct 2023 16:55:11 +1300 Subject: [PATCH 450/560] HID: mcp2221: Allow IO to start during probe During the probe we add an I2C adapter and as soon as we add that adapter it may be used for a transfer (e.g via the code in i2cdetect()). Those transfers are not able to complete and time out. This is because the HID raw_event callback (mcp2221_raw_event) will not be invoked until the HID device's 'driver_input_lock' is marked up at the completion of the probe in hid_device_probe(). This starves the driver of the responses it is waiting for. In order to allow the I2C transfers to complete while we are still in the probe, start the IO once we have completed init of the HID device. This issue seems to have been seen before and a patch was submitted but it seems it was never accepted. See: https://lore.kernel.org/all/20221103222714.21566-3-Enrik.Berkhan@inka.de/ Signed-off-by: Hamish Martin Signed-off-by: Jiri Kosina --- drivers/hid/hid-mcp2221.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c index b95f31cf0fa21..aef0785c91cc2 100644 --- a/drivers/hid/hid-mcp2221.c +++ b/drivers/hid/hid-mcp2221.c @@ -1142,6 +1142,8 @@ static int mcp2221_probe(struct hid_device *hdev, if (ret) return ret; + hid_device_io_start(hdev); + /* Set I2C bus clock diviser */ if (i2c_clk_freq > 400) i2c_clk_freq = 400; From 113f736655e4f20633e107d731dd5bd097d5938c Mon Sep 17 00:00:00 2001 From: Yihong Cao Date: Mon, 30 Oct 2023 01:05:38 +0800 Subject: [PATCH 451/560] HID: apple: add Jamesdonkey and A3R to non-apple keyboards list Jamesdonkey A3R keyboard is identified as "Jamesdonkey A3R" in wired mode, "A3R-U" in wireless mode and "A3R" in bluetooth mode. Adding them to non-apple keyboards fixes function key. Signed-off-by: Yihong Cao Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 3ca45975c686e..d9e9829b22001 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -345,6 +345,8 @@ static const struct apple_non_apple_keyboard non_apple_keyboards[] = { { "AONE" }, { "GANSS" }, { "Hailuck" }, + { "Jamesdonkey" }, + { "A3R" }, }; static bool apple_is_non_apple_keyboard(struct hid_device *hdev) From fc43e9c857b7aa55efba9398419b14d9e35dcc7d Mon Sep 17 00:00:00 2001 From: Charles Yi Date: Tue, 31 Oct 2023 12:32:39 +0800 Subject: [PATCH 452/560] HID: fix HID device resource race between HID core and debugging support hid_debug_events_release releases resources bound to the HID device instance. hid_device_release releases the underlying HID device instance potentially before hid_debug_events_release has completed releasing debug resources bound to the same HID device instance. Reference count to prevent the HID device instance from being torn down preemptively when HID debugging support is used. When count reaches zero, release core resources of HID device instance using hiddev_free. The crash: [ 120.728477][ T4396] kernel BUG at lib/list_debug.c:53! [ 120.728505][ T4396] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 120.739806][ T4396] Modules linked in: bcmdhd dhd_static_buf 8822cu pcie_mhi r8168 [ 120.747386][ T4396] CPU: 1 PID: 4396 Comm: hidt_bridge Not tainted 5.10.110 #257 [ 120.754771][ T4396] Hardware name: Rockchip RK3588 EVB4 LP4 V10 Board (DT) [ 120.761643][ T4396] pstate: 60400089 (nZCv daIf +PAN -UAO -TCO BTYPE=--) [ 120.768338][ T4396] pc : __list_del_entry_valid+0x98/0xac [ 120.773730][ T4396] lr : __list_del_entry_valid+0x98/0xac [ 120.779120][ T4396] sp : ffffffc01e62bb60 [ 120.783126][ T4396] x29: ffffffc01e62bb60 x28: ffffff818ce3a200 [ 120.789126][ T4396] x27: 0000000000000009 x26: 0000000000980000 [ 120.795126][ T4396] x25: ffffffc012431000 x24: ffffff802c6d4e00 [ 120.801125][ T4396] x23: ffffff8005c66f00 x22: ffffffc01183b5b8 [ 120.807125][ T4396] x21: ffffff819df2f100 x20: 0000000000000000 [ 120.813124][ T4396] x19: ffffff802c3f0700 x18: ffffffc01d2cd058 [ 120.819124][ T4396] x17: 0000000000000000 x16: 0000000000000000 [ 120.825124][ T4396] x15: 0000000000000004 x14: 0000000000003fff [ 120.831123][ T4396] x13: ffffffc012085588 x12: 0000000000000003 [ 120.837123][ T4396] x11: 00000000ffffbfff x10: 0000000000000003 [ 120.843123][ T4396] x9 : 455103d46b329300 x8 : 455103d46b329300 [ 120.849124][ T4396] x7 : 74707572726f6320 x6 : ffffffc0124b8cb5 [ 120.855124][ T4396] x5 : ffffffffffffffff x4 : 0000000000000000 [ 120.861123][ T4396] x3 : ffffffc011cf4f90 x2 : ffffff81fee7b948 [ 120.867122][ T4396] x1 : ffffffc011cf4f90 x0 : 0000000000000054 [ 120.873122][ T4396] Call trace: [ 120.876259][ T4396] __list_del_entry_valid+0x98/0xac [ 120.881304][ T4396] hid_debug_events_release+0x48/0x12c [ 120.886617][ T4396] full_proxy_release+0x50/0xbc [ 120.891323][ T4396] __fput+0xdc/0x238 [ 120.895075][ T4396] ____fput+0x14/0x24 [ 120.898911][ T4396] task_work_run+0x90/0x148 [ 120.903268][ T4396] do_exit+0x1bc/0x8a4 [ 120.907193][ T4396] do_group_exit+0x8c/0xa4 [ 120.911458][ T4396] get_signal+0x468/0x744 [ 120.915643][ T4396] do_signal+0x84/0x280 [ 120.919650][ T4396] do_notify_resume+0xd0/0x218 [ 120.924262][ T4396] work_pending+0xc/0x3f0 [ Rahul Rameshbabu : rework changelog ] Fixes: cd667ce24796 ("HID: use debugfs for events/reports dumping") Signed-off-by: Charles Yi Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 12 ++++++++++-- drivers/hid/hid-debug.c | 3 +++ include/linux/hid.h | 3 +++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 8992e3c1e7698..e0181218ad857 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -702,15 +702,22 @@ static void hid_close_report(struct hid_device *device) * Free a device structure, all reports, and all fields. */ -static void hid_device_release(struct device *dev) +void hiddev_free(struct kref *ref) { - struct hid_device *hid = to_hid_device(dev); + struct hid_device *hid = container_of(ref, struct hid_device, ref); hid_close_report(hid); kfree(hid->dev_rdesc); kfree(hid); } +static void hid_device_release(struct device *dev) +{ + struct hid_device *hid = to_hid_device(dev); + + kref_put(&hid->ref, hiddev_free); +} + /* * Fetch a report description item from the data stream. We support long * items, though they are not used yet. @@ -2846,6 +2853,7 @@ struct hid_device *hid_allocate_device(void) spin_lock_init(&hdev->debug_list_lock); sema_init(&hdev->driver_input_lock, 1); mutex_init(&hdev->ll_open_lock); + kref_init(&hdev->ref); hid_bpf_device_init(hdev); diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index e7ef1ea107c9e..7dd83ec74f8a9 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -1135,6 +1135,7 @@ static int hid_debug_events_open(struct inode *inode, struct file *file) goto out; } list->hdev = (struct hid_device *) inode->i_private; + kref_get(&list->hdev->ref); file->private_data = list; mutex_init(&list->read_mutex); @@ -1227,6 +1228,8 @@ static int hid_debug_events_release(struct inode *inode, struct file *file) list_del(&list->node); spin_unlock_irqrestore(&list->hdev->debug_list_lock, flags); kfifo_free(&list->hid_debug_fifo); + + kref_put(&list->hdev->ref, hiddev_free); kfree(list); return 0; diff --git a/include/linux/hid.h b/include/linux/hid.h index 5a8387a4a7126..bf43f3ff66640 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -679,6 +679,7 @@ struct hid_device { /* device report descriptor */ struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; + struct kref ref; unsigned int id; /* system unique id */ @@ -687,6 +688,8 @@ struct hid_device { /* device report descriptor */ #endif /* CONFIG_BPF */ }; +void hiddev_free(struct kref *ref); + #define to_hid_device(pdev) \ container_of(pdev, struct hid_device, dev) From a5e913c25b6b2b6ae02acef6d9400645ac03dfdf Mon Sep 17 00:00:00 2001 From: Brett Raye Date: Thu, 2 Nov 2023 18:10:38 -0700 Subject: [PATCH 453/560] HID: glorious: fix Glorious Model I HID report The Glorious Model I mouse has a buggy HID report descriptor for its keyboard endpoint (used for programmable buttons). For report ID 2, there is a mismatch between Logical Minimum and Usage Minimum in the array that reports keycodes. The offending portion of the descriptor: (from hid-decode) 0x95, 0x05, // Report Count (5) 30 0x75, 0x08, // Report Size (8) 32 0x15, 0x00, // Logical Minimum (0) 34 0x25, 0x65, // Logical Maximum (101) 36 0x05, 0x07, // Usage Page (Keyboard) 38 0x19, 0x01, // Usage Minimum (1) 40 0x29, 0x65, // Usage Maximum (101) 42 0x81, 0x00, // Input (Data,Arr,Abs) 44 This bug shifts all programmed keycodes up by 1. Importantly, this causes "empty" array indexes of 0x00 to be interpreted as 0x01, ErrorRollOver. The presence of ErrorRollOver causes the system to ignore all keypresses from the endpoint and breaks the ability to use the programmable buttons. Setting byte 41 to 0x00 fixes this, and causes keycodes to be interpreted correctly. Also, USB_VENDOR_ID_GLORIOUS is changed to USB_VENDOR_ID_SINOWEALTH, and a new ID for Laview Technology is added. Glorious seems to be white-labeling controller boards or mice from these vendors. There isn't a single canonical vendor ID for Glorious products. Signed-off-by: Brett Raye Signed-off-by: Jiri Kosina --- drivers/hid/hid-glorious.c | 16 ++++++++++++++-- drivers/hid/hid-ids.h | 11 +++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-glorious.c b/drivers/hid/hid-glorious.c index 558eb08c19ef9..281b3a7187cec 100644 --- a/drivers/hid/hid-glorious.c +++ b/drivers/hid/hid-glorious.c @@ -21,6 +21,10 @@ MODULE_DESCRIPTION("HID driver for Glorious PC Gaming Race mice"); * Glorious Model O and O- specify the const flag in the consumer input * report descriptor, which leads to inputs being ignored. Fix this * by patching the descriptor. + * + * Glorious Model I incorrectly specifes the Usage Minimum for its + * keyboard HID report, causing keycodes to be misinterpreted. + * Fix this by setting Usage Minimum to 0 in that report. */ static __u8 *glorious_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) @@ -32,6 +36,10 @@ static __u8 *glorious_report_fixup(struct hid_device *hdev, __u8 *rdesc, rdesc[85] = rdesc[113] = rdesc[141] = \ HID_MAIN_ITEM_VARIABLE | HID_MAIN_ITEM_RELATIVE; } + if (*rsize == 156 && rdesc[41] == 1) { + hid_info(hdev, "patching Glorious Model I keyboard report descriptor\n"); + rdesc[41] = 0; + } return rdesc; } @@ -44,6 +52,8 @@ static void glorious_update_name(struct hid_device *hdev) model = "Model O"; break; case USB_DEVICE_ID_GLORIOUS_MODEL_D: model = "Model D"; break; + case USB_DEVICE_ID_GLORIOUS_MODEL_I: + model = "Model I"; break; } snprintf(hdev->name, sizeof(hdev->name), "%s %s", "Glorious", model); @@ -66,10 +76,12 @@ static int glorious_probe(struct hid_device *hdev, } static const struct hid_device_id glorious_devices[] = { - { HID_USB_DEVICE(USB_VENDOR_ID_GLORIOUS, + { HID_USB_DEVICE(USB_VENDOR_ID_SINOWEALTH, USB_DEVICE_ID_GLORIOUS_MODEL_O) }, - { HID_USB_DEVICE(USB_VENDOR_ID_GLORIOUS, + { HID_USB_DEVICE(USB_VENDOR_ID_SINOWEALTH, USB_DEVICE_ID_GLORIOUS_MODEL_D) }, + { HID_USB_DEVICE(USB_VENDOR_ID_LAVIEW, + USB_DEVICE_ID_GLORIOUS_MODEL_I) }, { } }; MODULE_DEVICE_TABLE(hid, glorious_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index f7973ccd84a28..125c73a200517 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -511,10 +511,6 @@ #define USB_DEVICE_ID_GENERAL_TOUCH_WIN8_PIT_010A 0x010a #define USB_DEVICE_ID_GENERAL_TOUCH_WIN8_PIT_E100 0xe100 -#define USB_VENDOR_ID_GLORIOUS 0x258a -#define USB_DEVICE_ID_GLORIOUS_MODEL_D 0x0033 -#define USB_DEVICE_ID_GLORIOUS_MODEL_O 0x0036 - #define I2C_VENDOR_ID_GOODIX 0x27c6 #define I2C_DEVICE_ID_GOODIX_01F0 0x01f0 @@ -745,6 +741,9 @@ #define USB_VENDOR_ID_LABTEC 0x1020 #define USB_DEVICE_ID_LABTEC_WIRELESS_KEYBOARD 0x0006 +#define USB_VENDOR_ID_LAVIEW 0x22D4 +#define USB_DEVICE_ID_GLORIOUS_MODEL_I 0x1503 + #define USB_VENDOR_ID_LCPOWER 0x1241 #define USB_DEVICE_ID_LCPOWER_LC1000 0xf767 @@ -1160,6 +1159,10 @@ #define USB_VENDOR_ID_SIGMATEL 0x066F #define USB_DEVICE_ID_SIGMATEL_STMP3780 0x3780 +#define USB_VENDOR_ID_SINOWEALTH 0x258a +#define USB_DEVICE_ID_GLORIOUS_MODEL_D 0x0033 +#define USB_DEVICE_ID_GLORIOUS_MODEL_O 0x0036 + #define USB_VENDOR_ID_SIS_TOUCH 0x0457 #define USB_DEVICE_ID_SIS9200_TOUCH 0x9200 #define USB_DEVICE_ID_SIS817_TOUCH 0x0817 From 8d6ef26501b97243ee6c16b8187c5b38cb69b77d Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 16 Nov 2023 14:02:12 +0100 Subject: [PATCH 454/560] drm/ast: Disconnect BMC if physical connector is connected Many user-space compositors fail with mode setting if a CRTC has more than one connected connector. This is the case with the BMC on Aspeed systems. Work around this problem by setting the BMC's connector status to disconnected when the physical connector has a display attached. This way compositors will only see one connected connector at a time; either the physical one or the BMC. Suggested-by: Jocelyn Falempe Fixes: e329cb53b45d ("drm/ast: Add BMC virtual connector") Signed-off-by: Thomas Zimmermann Cc: # v6.6+ Reviewed-by: Jocelyn Falempe Link: https://patchwork.freedesktop.org/patch/msgid/20231116130217.22931-1-tzimmermann@suse.de --- drivers/gpu/drm/ast/ast_drv.h | 13 ++++++- drivers/gpu/drm/ast/ast_mode.c | 62 ++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ast/ast_drv.h b/drivers/gpu/drm/ast/ast_drv.h index 2aee32344f4a2..772f3b049c169 100644 --- a/drivers/gpu/drm/ast/ast_drv.h +++ b/drivers/gpu/drm/ast/ast_drv.h @@ -174,6 +174,17 @@ to_ast_sil164_connector(struct drm_connector *connector) return container_of(connector, struct ast_sil164_connector, base); } +struct ast_bmc_connector { + struct drm_connector base; + struct drm_connector *physical_connector; +}; + +static inline struct ast_bmc_connector * +to_ast_bmc_connector(struct drm_connector *connector) +{ + return container_of(connector, struct ast_bmc_connector, base); +} + /* * Device */ @@ -218,7 +229,7 @@ struct ast_device { } astdp; struct { struct drm_encoder encoder; - struct drm_connector connector; + struct ast_bmc_connector bmc_connector; } bmc; } output; diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index cb96149842851..c20534d0ef7c8 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -1767,6 +1767,30 @@ static const struct drm_encoder_funcs ast_bmc_encoder_funcs = { .destroy = drm_encoder_cleanup, }; +static int ast_bmc_connector_helper_detect_ctx(struct drm_connector *connector, + struct drm_modeset_acquire_ctx *ctx, + bool force) +{ + struct ast_bmc_connector *bmc_connector = to_ast_bmc_connector(connector); + struct drm_connector *physical_connector = bmc_connector->physical_connector; + + /* + * Most user-space compositors cannot handle more than one connected + * connector per CRTC. Hence, we only mark the BMC as connected if the + * physical connector is disconnected. If the physical connector's status + * is connected or unknown, the BMC remains disconnected. This has no + * effect on the output of the BMC. + * + * FIXME: Remove this logic once user-space compositors can handle more + * than one connector per CRTC. The BMC should always be connected. + */ + + if (physical_connector && physical_connector->status == connector_status_disconnected) + return connector_status_connected; + + return connector_status_disconnected; +} + static int ast_bmc_connector_helper_get_modes(struct drm_connector *connector) { return drm_add_modes_noedid(connector, 4096, 4096); @@ -1774,6 +1798,7 @@ static int ast_bmc_connector_helper_get_modes(struct drm_connector *connector) static const struct drm_connector_helper_funcs ast_bmc_connector_helper_funcs = { .get_modes = ast_bmc_connector_helper_get_modes, + .detect_ctx = ast_bmc_connector_helper_detect_ctx, }; static const struct drm_connector_funcs ast_bmc_connector_funcs = { @@ -1784,12 +1809,33 @@ static const struct drm_connector_funcs ast_bmc_connector_funcs = { .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; -static int ast_bmc_output_init(struct ast_device *ast) +static int ast_bmc_connector_init(struct drm_device *dev, + struct ast_bmc_connector *bmc_connector, + struct drm_connector *physical_connector) +{ + struct drm_connector *connector = &bmc_connector->base; + int ret; + + ret = drm_connector_init(dev, connector, &ast_bmc_connector_funcs, + DRM_MODE_CONNECTOR_VIRTUAL); + if (ret) + return ret; + + drm_connector_helper_add(connector, &ast_bmc_connector_helper_funcs); + + bmc_connector->physical_connector = physical_connector; + + return 0; +} + +static int ast_bmc_output_init(struct ast_device *ast, + struct drm_connector *physical_connector) { struct drm_device *dev = &ast->base; struct drm_crtc *crtc = &ast->crtc; struct drm_encoder *encoder = &ast->output.bmc.encoder; - struct drm_connector *connector = &ast->output.bmc.connector; + struct ast_bmc_connector *bmc_connector = &ast->output.bmc.bmc_connector; + struct drm_connector *connector = &bmc_connector->base; int ret; ret = drm_encoder_init(dev, encoder, @@ -1799,13 +1845,10 @@ static int ast_bmc_output_init(struct ast_device *ast) return ret; encoder->possible_crtcs = drm_crtc_mask(crtc); - ret = drm_connector_init(dev, connector, &ast_bmc_connector_funcs, - DRM_MODE_CONNECTOR_VIRTUAL); + ret = ast_bmc_connector_init(dev, bmc_connector, physical_connector); if (ret) return ret; - drm_connector_helper_add(connector, &ast_bmc_connector_helper_funcs); - ret = drm_connector_attach_encoder(connector, encoder); if (ret) return ret; @@ -1864,6 +1907,7 @@ static const struct drm_mode_config_funcs ast_mode_config_funcs = { int ast_mode_config_init(struct ast_device *ast) { struct drm_device *dev = &ast->base; + struct drm_connector *physical_connector = NULL; int ret; ret = drmm_mode_config_init(dev); @@ -1904,23 +1948,27 @@ int ast_mode_config_init(struct ast_device *ast) ret = ast_vga_output_init(ast); if (ret) return ret; + physical_connector = &ast->output.vga.vga_connector.base; } if (ast->tx_chip_types & AST_TX_SIL164_BIT) { ret = ast_sil164_output_init(ast); if (ret) return ret; + physical_connector = &ast->output.sil164.sil164_connector.base; } if (ast->tx_chip_types & AST_TX_DP501_BIT) { ret = ast_dp501_output_init(ast); if (ret) return ret; + physical_connector = &ast->output.dp501.connector; } if (ast->tx_chip_types & AST_TX_ASTDP_BIT) { ret = ast_astdp_output_init(ast); if (ret) return ret; + physical_connector = &ast->output.astdp.connector; } - ret = ast_bmc_output_init(ast); + ret = ast_bmc_output_init(ast, physical_connector); if (ret) return ret; From c55092187d9ad7b2f8f5a8645286fa03997d442f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 14 Nov 2023 15:54:30 +0100 Subject: [PATCH 455/560] HID: add ALWAYS_POLL quirk for Apple kb These devices disconnect if suspended without remote wakeup. They can operate with the standard driver. Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 5a48fcaa32f00..ea472923fab07 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -33,6 +33,7 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_AKAI, USB_DEVICE_ID_AKAI_MPKMINI2), HID_QUIRK_NO_INIT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_ALPS, USB_DEVICE_ID_IBM_GAMEPAD), HID_QUIRK_BADPAD }, { HID_USB_DEVICE(USB_VENDOR_ID_AMI, USB_DEVICE_ID_AMI_VIRT_KEYBOARD_AND_MOUSE), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_ALU_REVB_ANSI), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_ATEN, USB_DEVICE_ID_ATEN_2PORTKVM), HID_QUIRK_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_ATEN, USB_DEVICE_ID_ATEN_4PORTKVMC), HID_QUIRK_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_ATEN, USB_DEVICE_ID_ATEN_4PORTKVM), HID_QUIRK_NOGET }, From c0e2926266af3b5acf28df0a8fc6e4d90effe0bb Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Sun, 19 Nov 2023 22:17:59 +0800 Subject: [PATCH 456/560] ipv4: Correct/silence an endian warning in __ip_do_redirect net/ipv4/route.c:783:46: warning: incorrect type in argument 2 (different base types) net/ipv4/route.c:783:46: expected unsigned int [usertype] key net/ipv4/route.c:783:46: got restricted __be32 [usertype] new_gw Fixes: 969447f226b4 ("ipv4: use new_gw for redirect neigh lookup") Suggested-by: Eric Dumazet Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20231119141759.420477-1-chentao@kylinos.cn Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3290a4442b4ac..16615d107cf06 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -780,7 +780,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { From d30fb712e52964f2cf9a9c14cf67078394044837 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Sun, 19 Nov 2023 08:23:41 -0800 Subject: [PATCH 457/560] hv_netvsc: fix race of netvsc and VF register_netdevice The rtnl lock also needs to be held before rndis_filter_device_add() which advertises nvsp_2_vsc_capability / sriov bit, and triggers VF NIC offering and registering. If VF NIC finished register_netdev() earlier it may cause name based config failure. To fix this issue, move the call to rtnl_lock() before rndis_filter_device_add(), so VF will be registered later than netvsc / synthetic NIC, and gets a name numbered (ethX) after netvsc. Cc: stable@vger.kernel.org Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()") Reported-by: Dexuan Cui Signed-off-by: Haiyang Zhang Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Reviewed-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 3ba3c8fb28a5d..5e528a76f5f57 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2531,15 +2531,6 @@ static int netvsc_probe(struct hv_device *dev, goto devinfo_failed; } - nvdev = rndis_filter_device_add(dev, device_info); - if (IS_ERR(nvdev)) { - ret = PTR_ERR(nvdev); - netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); - goto rndis_failed; - } - - eth_hw_addr_set(net, device_info->mac_adr); - /* We must get rtnl lock before scheduling nvdev->subchan_work, * otherwise netvsc_subchan_work() can get rtnl lock first and wait * all subchannels to show up, but that may not happen because @@ -2547,9 +2538,23 @@ static int netvsc_probe(struct hv_device *dev, * -> ... -> device_add() -> ... -> __device_attach() can't get * the device lock, so all the subchannels can't be processed -- * finally netvsc_subchan_work() hangs forever. + * + * The rtnl lock also needs to be held before rndis_filter_device_add() + * which advertises nvsp_2_vsc_capability / sriov bit, and triggers + * VF NIC offering and registering. If VF NIC finished register_netdev() + * earlier it may cause name based config failure. */ rtnl_lock(); + nvdev = rndis_filter_device_add(dev, device_info); + if (IS_ERR(nvdev)) { + ret = PTR_ERR(nvdev); + netdev_err(net, "unable to add netvsc device (ret %d)\n", ret); + goto rndis_failed; + } + + eth_hw_addr_set(net, device_info->mac_adr); + if (nvdev->num_chn > 1) schedule_work(&nvdev->subchan_work); @@ -2586,9 +2591,9 @@ static int netvsc_probe(struct hv_device *dev, return 0; register_failed: - rtnl_unlock(); rndis_filter_device_remove(dev, nvdev); rndis_failed: + rtnl_unlock(); netvsc_devinfo_put(device_info); devinfo_failed: free_percpu(net_device_ctx->vf_stats); From 85520856466ed6bc3b1ccb013cddac70ceb437db Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Sun, 19 Nov 2023 08:23:42 -0800 Subject: [PATCH 458/560] hv_netvsc: Fix race of register_netdevice_notifier and VF register If VF NIC is registered earlier, NETDEV_REGISTER event is replayed, but NETDEV_POST_INIT is not. Move register_netdevice_notifier() earlier, so the call back function is set before probing. Cc: stable@vger.kernel.org Fixes: e04e7a7bbd4b ("hv_netvsc: Fix a deadlock by getting rtnl lock earlier in netvsc_probe()") Reported-by: Dexuan Cui Signed-off-by: Haiyang Zhang Reviewed-by: Wojciech Drewek Reviewed-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5e528a76f5f57..b7dfd51f09e60 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2793,12 +2793,17 @@ static int __init netvsc_drv_init(void) } netvsc_ring_bytes = ring_size * PAGE_SIZE; + register_netdevice_notifier(&netvsc_netdev_notifier); + ret = vmbus_driver_register(&netvsc_drv); if (ret) - return ret; + goto err_vmbus_reg; - register_netdevice_notifier(&netvsc_netdev_notifier); return 0; + +err_vmbus_reg: + unregister_netdevice_notifier(&netvsc_netdev_notifier); + return ret; } MODULE_LICENSE("GPL"); From c807d6cd089d2f4951baa838081ec5ae3e2360f8 Mon Sep 17 00:00:00 2001 From: Long Li Date: Sun, 19 Nov 2023 08:23:43 -0800 Subject: [PATCH 459/560] hv_netvsc: Mark VF as slave before exposing it to user-mode When a VF is being exposed form the kernel, it should be marked as "slave" before exposing to the user-mode. The VF is not usable without netvsc running as master. The user-mode should never see a VF without the "slave" flag. This commit moves the code of setting the slave flag to the time before VF is exposed to user-mode. Cc: stable@vger.kernel.org Fixes: 0c195567a8f6 ("netvsc: transparent VF management") Signed-off-by: Long Li Signed-off-by: Haiyang Zhang Acked-by: Stephen Hemminger Acked-by: Dexuan Cui Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index b7dfd51f09e60..706ea5263e879 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2206,9 +2206,6 @@ static int netvsc_vf_join(struct net_device *vf_netdev, goto upper_link_failed; } - /* set slave flag before open to prevent IPv6 addrconf */ - vf_netdev->flags |= IFF_SLAVE; - schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); @@ -2315,16 +2312,18 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) } - /* Fallback path to check synthetic vf with - * help of mac addr + /* Fallback path to check synthetic vf with help of mac addr. + * Because this function can be called before vf_netdev is + * initialized (NETDEV_POST_INIT) when its perm_addr has not been copied + * from dev_addr, also try to match to its dev_addr. + * Note: On Hyper-V and Azure, it's not possible to set a MAC address + * on a VF that matches to the MAC of a unrelated NETVSC device. */ list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) { ndev = hv_get_drvdata(ndev_ctx->device_ctx); - if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) { - netdev_notice(vf_netdev, - "falling back to mac addr based matching\n"); + if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr) || + ether_addr_equal(vf_netdev->dev_addr, ndev->perm_addr)) return ndev; - } } netdev_notice(vf_netdev, @@ -2332,6 +2331,19 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev) return NULL; } +static int netvsc_prepare_bonding(struct net_device *vf_netdev) +{ + struct net_device *ndev; + + ndev = get_netvsc_byslot(vf_netdev); + if (!ndev) + return NOTIFY_DONE; + + /* set slave flag before open to prevent IPv6 addrconf */ + vf_netdev->flags |= IFF_SLAVE; + return NOTIFY_DONE; +} + static int netvsc_register_vf(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; @@ -2758,6 +2770,8 @@ static int netvsc_netdev_event(struct notifier_block *this, return NOTIFY_DONE; switch (event) { + case NETDEV_POST_INIT: + return netvsc_prepare_bonding(event_dev); case NETDEV_REGISTER: return netvsc_register_vf(event_dev); case NETDEV_UNREGISTER: From e8df9d9f4209c04161321d8c12640ae560f65939 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 21 Nov 2023 09:46:28 +0800 Subject: [PATCH 460/560] perf/x86/intel: Correct incorrect 'or' operation for PMU capabilities When running perf-stat command on Intel hybrid platform, perf-stat reports the following errors: sudo taskset -c 7 ./perf stat -vvvv -e cpu_atom/instructions/ sleep 1 Opening: cpu/cycles/:HG ------------------------------------------------------------ perf_event_attr: type 0 (PERF_TYPE_HARDWARE) config 0xa00000000 disabled 1 ------------------------------------------------------------ sys_perf_event_open: pid 0 cpu -1 group_fd -1 flags 0x8 sys_perf_event_open failed, error -16 Performance counter stats for 'sleep 1': cpu_atom/instructions/ It looks the cpu_atom/instructions/ event can't be enabled on atom PMU even when the process is pinned on atom core. Investigation shows that exclusive_event_init() helper always returns -EBUSY error in the perf event creation. That's strange since the atom PMU should not be an exclusive PMU. Further investigation shows the issue was introduced by commit: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") The commit originally intents to clear the bit PERF_PMU_CAP_AUX_OUTPUT from PMU capabilities if intel_cap.pebs_output_pt_available is not set, but it incorrectly uses 'or' operation and leads to all PMU capabilities bits are set to 1 except bit PERF_PMU_CAP_AUX_OUTPUT. Testing this fix on Intel hybrid platforms, the observed issues disappear. Fixes: 97588df87b56 ("perf/x86/intel: Add common intel_pmu_init_hybrid()") Signed-off-by: Dapeng Mi Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231121014628.729989-1-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a08f794a0e79a..ce1c777227b4e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4660,7 +4660,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) if (pmu->intel_cap.pebs_output_pt_available) pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; else - pmu->pmu.capabilities |= ~PERF_PMU_CAP_AUX_OUTPUT; + pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; intel_pmu_check_event_constraints(pmu->event_constraints, pmu->num_counters, From 8771127e25d6c20d458ad27cf32f7fcfc1755e05 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Sat, 18 Nov 2023 00:19:17 +0100 Subject: [PATCH 461/560] USB: serial: option: don't claim interface 4 for ZTE MF290 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface 4 is used by for QMI interface in stock firmware of MF28D, the router which uses MF290 modem. Free the interface up, to rebind it to qmi_wwan driver. The proper configuration is: Interface mapping is: 0: QCDM, 1: (unknown), 2: AT (PCUI), 2: AT (Modem), 4: QMI T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=0189 Rev= 0.00 S: Manufacturer=ZTE, Incorporated S: Product=ZTE LTE Technologies MSM C:* #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms Cc: Bjørn Mork Signed-off-by: Lech Perczak Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 45dcfaadaf98e..ff9049db6e65f 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1546,7 +1546,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0165, 0xff, 0xff, 0xff) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0167, 0xff, 0xff, 0xff), .driver_info = RSVD(4) }, - { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff) }, + { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0189, 0xff, 0xff, 0xff), + .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0191, 0xff, 0xff, 0xff), /* ZTE EuFi890 */ .driver_info = RSVD(4) }, { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, 0x0196, 0xff, 0xff, 0xff) }, From a1092619dd28ac0fcf23016160a2fdccd98ef935 Mon Sep 17 00:00:00 2001 From: Puliang Lu Date: Thu, 26 Oct 2023 20:35:06 +0800 Subject: [PATCH 462/560] USB: serial: option: fix FM101R-GL defines Modify the definition of the two Fibocom FM101R-GL PID macros, which had their PIDs switched. The correct PIDs are: - VID:PID 413C:8213, FM101R-GL ESIM are laptop M.2 cards (with MBIM interfaces for Linux) - VID:PID 413C:8215, FM101R-GL are laptop M.2 cards (with MBIM interface for Linux) 0x8213: mbim, tty 0x8215: mbim, tty Signed-off-by: Puliang Lu Fixes: 52480e1f1a25 ("USB: serial: option: add Fibocom to DELL custom modem FM101R-GL") Link: https://lore.kernel.org/lkml/TYZPR02MB508845BAD7936A62A105CE5D89DFA@TYZPR02MB5088.apcprd02.prod.outlook.com/ Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ff9049db6e65f..9c76095ebfe14 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -203,8 +203,8 @@ static void option_instat_callback(struct urb *urb); #define DELL_PRODUCT_5829E_ESIM 0x81e4 #define DELL_PRODUCT_5829E 0x81e6 -#define DELL_PRODUCT_FM101R 0x8213 -#define DELL_PRODUCT_FM101R_ESIM 0x8215 +#define DELL_PRODUCT_FM101R_ESIM 0x8213 +#define DELL_PRODUCT_FM101R 0x8215 #define KYOCERA_VENDOR_ID 0x0c88 #define KYOCERA_PRODUCT_KPC650 0x17da From 5b4ffb176d7979ac66b349addf3f7de433335e00 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 21 Nov 2023 14:38:11 +0100 Subject: [PATCH 463/560] Revert "HID: logitech-dj: Add support for a new lightspeed receiver iteration" This reverts commit 9d1bd9346241cd6963b58da7ffb7ed303285f684. Multiple people reported misbehaving devices and reverting this commit fixes the problem for them. As soon as the original commit author starts reacting again, we can try to figure out why he hasn't seen the issues (mismatching report descriptors?), but for the time being, fix for 6.7 by reverting. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218172 Link: https://bugzilla.kernel.org/show_bug.cgi?id=218094 Cc: Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 - drivers/hid/hid-logitech-dj.c | 11 +++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 125c73a200517..c6e4e0d1f2147 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -868,7 +868,6 @@ #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2 0xc534 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1 0xc539 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1 0xc53f -#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_2 0xc547 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY 0xc53a #define USB_DEVICE_ID_SPACETRAVELLER 0xc623 #define USB_DEVICE_ID_SPACENAVIGATOR 0xc626 diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index 8afe3be683ba2..e6a8b6d8eab70 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -1695,12 +1695,11 @@ static int logi_dj_raw_event(struct hid_device *hdev, } /* * Mouse-only receivers send unnumbered mouse data. The 27 MHz - * receiver uses 6 byte packets, the nano receiver 8 bytes, - * the lightspeed receiver (Pro X Superlight) 13 bytes. + * receiver uses 6 byte packets, the nano receiver 8 bytes. */ if (djrcv_dev->unnumbered_application == HID_GD_MOUSE && - size <= 13){ - u8 mouse_report[14]; + size <= 8) { + u8 mouse_report[9]; /* Prepend report id */ mouse_report[0] = REPORT_TYPE_MOUSE; @@ -1984,10 +1983,6 @@ static const struct hid_device_id logi_dj_receivers[] = { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1), .driver_data = recvr_type_gaming_hidpp}, - { /* Logitech lightspeed receiver (0xc547) */ - HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, - USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_2), - .driver_data = recvr_type_gaming_hidpp}, { /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER), From 9c6dc13106f2dd2d6819d66618b25a6f41f0ee6a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 20 Nov 2023 08:28:40 +0000 Subject: [PATCH 464/560] MAINTAINERS: Add indirect_call_wrapper.h to NETWORKING [GENERAL] indirect_call_wrapper.h is not, strictly speaking, networking specific. However, it's git history indicates that in practice changes go through netdev and thus the netdev maintainers have effectively been taking responsibility for it. Formalise this by adding it to the NETWORKING [GENERAL] section in the MAINTAINERS file. It is not clear how many other files under include/linux fall into this category and it would be interesting, as a follow-up, to audit that and propose further updates to the MAINTAINERS file as appropriate. Link: https://lore.kernel.org/netdev/20231116010310.4664dd38@kernel.org/ Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20231120-indirect_call_wrapper-maintainer-v1-1-0a6bb1f7363e@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 80a95878c61d7..6cc0708f77b4e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15057,6 +15057,7 @@ F: Documentation/networking/ F: Documentation/process/maintainer-netdev.rst F: Documentation/userspace-api/netlink/ F: include/linux/in.h +F: include/linux/indirect_call_wrapper.h F: include/linux/net.h F: include/linux/netdevice.h F: include/net/ From 41058707bea93b979c4854bdb857e46f2b85df92 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 10 Nov 2023 14:48:02 +0100 Subject: [PATCH 465/560] dt-bindings: usb: hcd: add missing phy name to example The example host controller node has two PHYs and therefore needs two PHY names. Fixes: 3aa3c66aedef ("dt-bindings: usb: Bring back phy-names") Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20231110134802.32060-1-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/usb-hcd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/usb-hcd.yaml b/Documentation/devicetree/bindings/usb/usb-hcd.yaml index 692dd60e3f73f..45a19d4928afa 100644 --- a/Documentation/devicetree/bindings/usb/usb-hcd.yaml +++ b/Documentation/devicetree/bindings/usb/usb-hcd.yaml @@ -41,7 +41,7 @@ examples: - | usb { phys = <&usb2_phy1>, <&usb3_phy1>; - phy-names = "usb"; + phy-names = "usb2", "usb3"; #address-cells = <1>; #size-cells = <0>; From a6fe37f428c19dd164c2111157d4a1029bd853aa Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 1 Nov 2023 02:19:09 +0000 Subject: [PATCH 466/560] usb: typec: tcpm: Skip hard reset when in error recovery Hard reset queued prior to error recovery (or) received during error recovery will make TCPM to prematurely exit error recovery sequence. Ignore hard resets received during error recovery (or) port reset sequence. ``` [46505.459688] state change SNK_READY -> ERROR_RECOVERY [rev3 NONE_AMS] [46505.459706] state change ERROR_RECOVERY -> PORT_RESET [rev3 NONE_AMS] [46505.460433] disable vbus discharge ret:0 [46505.461226] Setting usb_comm capable false [46505.467244] Setting voltage/current limit 0 mV 0 mA [46505.467262] polarity 0 [46505.470695] Requesting mux state 0, usb-role 0, orientation 0 [46505.475621] cc:=0 [46505.476012] pending state change PORT_RESET -> PORT_RESET_WAIT_OFF @ 100 ms [rev3 NONE_AMS] [46505.476020] Received hard reset [46505.476024] state change PORT_RESET -> HARD_RESET_START [rev3 HARD_RESET] ``` Cc: stable@vger.kernel.org Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)") Signed-off-by: Badhri Jagan Sridharan Acked-by: Heikki Krogeus Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20231101021909.2962679-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 058d5b853b574..b386102f7a3a4 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5391,6 +5391,15 @@ static void _tcpm_pd_hard_reset(struct tcpm_port *port) if (port->bist_request == BDO_MODE_TESTDATA && port->tcpc->set_bist_data) port->tcpc->set_bist_data(port->tcpc, false); + switch (port->state) { + case ERROR_RECOVERY: + case PORT_RESET: + case PORT_RESET_WAIT_OFF: + return; + default: + break; + } + if (port->ams != NONE_AMS) port->ams = NONE_AMS; if (port->hard_reset_count < PD_N_HARD_RESET_COUNT) From cdd0cde8d8837de3d234bb08115d5f196e0ac8dd Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 30 Oct 2023 07:56:40 +0100 Subject: [PATCH 467/560] USB: typec: tps6598x: Fix a memory leak in an error handling path All error handling end to the error handling path, except these ones. Go to 'release_fw' as well here, otherwise 'fw' is leaking. Fixes: 7e7a3c815d22 ("USB: typec: tps6598x: Add TPS25750 support") Signed-off-by: Christophe JAILLET Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/23168336f18a9f6cb1a5b47130fc134dc0510d7f.1698648980.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tipd/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index 0e867f531d344..b0184be06c3d0 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -968,16 +968,17 @@ static int tps25750_start_patch_burst_mode(struct tps6598x *tps) ret = of_property_match_string(np, "reg-names", "patch-address"); if (ret < 0) { dev_err(tps->dev, "failed to get patch-address %d\n", ret); - return ret; + goto release_fw; } ret = of_property_read_u32_index(np, "reg", ret, &addr); if (ret) - return ret; + goto release_fw; if (addr == 0 || (addr >= 0x20 && addr <= 0x23)) { dev_err(tps->dev, "wrong patch address %u\n", addr); - return -EINVAL; + ret = -EINVAL; + goto release_fw; } bpms_data.addr = (u8)addr; From 10d510abd096d620b9fda2dd3e0047c5efc4ad2b Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Wed, 25 Oct 2023 11:51:10 +0200 Subject: [PATCH 468/560] usb: dwc3: Fix default mode initialization The default mode, configurable by DT, shall be set before usb role switch driver is registered. Otherwise there is a race between default mode and mode set by usb role switch driver. Fixes: 98ed256a4dbad ("usb: dwc3: Add support for role-switch-default-mode binding") Cc: stable Signed-off-by: Alexander Stein Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20231025095110.2405281-1-alexander.stein@ew.tq-group.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/drd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c index 039bf241769af..57ddd2e43022e 100644 --- a/drivers/usb/dwc3/drd.c +++ b/drivers/usb/dwc3/drd.c @@ -505,6 +505,7 @@ static int dwc3_setup_role_switch(struct dwc3 *dwc) dwc->role_switch_default_mode = USB_DR_MODE_PERIPHERAL; mode = DWC3_GCTL_PRTCAP_DEVICE; } + dwc3_set_mode(dwc, mode); dwc3_role_switch.fwnode = dev_fwnode(dwc->dev); dwc3_role_switch.set = dwc3_usb_role_switch_set; @@ -526,7 +527,6 @@ static int dwc3_setup_role_switch(struct dwc3 *dwc) } } - dwc3_set_mode(dwc, mode); return 0; } #else From 187fb003c57c964ea61ac9fbfe41abf3ca9973eb Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 1 Nov 2023 01:28:45 +0000 Subject: [PATCH 469/560] usb: typec: tcpm: Fix sink caps op current check TCPM checks for sink caps operational current even when PD is disabled. This incorrectly sets tcpm_set_charge() when PD is disabled. Check for sink caps only when PD is enabled. [ 97.572342] Start toggling [ 97.578949] CC1: 0 -> 0, CC2: 0 -> 0 [state TOGGLING, polarity 0, disconnected] [ 99.571648] CC1: 0 -> 0, CC2: 0 -> 4 [state TOGGLING, polarity 0, connected] [ 99.571658] state change TOGGLING -> SNK_ATTACH_WAIT [rev3 NONE_AMS] [ 99.571673] pending state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED @ 170 ms [rev3 NONE_AMS] [ 99.741778] state change SNK_ATTACH_WAIT -> SNK_DEBOUNCED [delayed 170 ms] [ 99.789283] CC1: 0 -> 0, CC2: 4 -> 5 [state SNK_DEBOUNCED, polarity 0, connected] [ 99.789306] state change SNK_DEBOUNCED -> SNK_DEBOUNCED [rev3 NONE_AMS] [ 99.903584] VBUS on [ 99.903591] state change SNK_DEBOUNCED -> SNK_ATTACHED [rev3 NONE_AMS] [ 99.903600] polarity 1 [ 99.910155] enable vbus discharge ret:0 [ 99.910160] Requesting mux state 1, usb-role 2, orientation 2 [ 99.946791] state change SNK_ATTACHED -> SNK_STARTUP [rev3 NONE_AMS] [ 99.946798] state change SNK_STARTUP -> SNK_DISCOVERY [rev3 NONE_AMS] [ 99.946800] Setting voltage/current limit 5000 mV 500 mA [ 99.946803] vbus=0 charge:=1 [ 100.027139] state change SNK_DISCOVERY -> SNK_READY [rev3 NONE_AMS] [ 100.027145] Setting voltage/current limit 5000 mV 3000 mA [ 100.466830] VBUS on Cc: stable@vger.kernel.org Fixes: 803b1c8a0cea ("usb: typec: tcpm: not sink vbus if operational current is 0mA") Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Guenter Roeck Acked-by: Heikki Krogerus Tested-by: Will McVicker Link: https://lore.kernel.org/r/20231101012845.2701348-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index b386102f7a3a4..bfb6f9481e87f 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -4273,7 +4273,8 @@ static void run_state_machine(struct tcpm_port *port) current_lim = PD_P_SNK_STDBY_MW / 5; tcpm_set_current_limit(port, current_lim, 5000); /* Not sink vbus if operational current is 0mA */ - tcpm_set_charge(port, !!pdo_max_current(port->snk_pdo[0])); + tcpm_set_charge(port, !port->pd_supported || + pdo_max_current(port->snk_pdo[0])); if (!port->pd_supported) tcpm_set_state(port, SNK_READY, 0); From 58f2fcb3a845fcbbad2f3196bb37d744e0506250 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Wed, 8 Nov 2023 10:31:25 +0100 Subject: [PATCH 470/560] usb: cdnsp: Fix deadlock issue during using NCM gadget The interrupt service routine registered for the gadget is a primary handler which mask the interrupt source and a threaded handler which handles the source of the interrupt. Since the threaded handler is voluntary threaded, the IRQ-core does not disable bottom halves before invoke the handler like it does for the forced-threaded handler. Due to changes in networking it became visible that a network gadget's completions handler may schedule a softirq which remains unprocessed. The gadget's completion handler is usually invoked either in hard-IRQ or soft-IRQ context. In this context it is enough to just raise the softirq because the softirq itself will be handled once that context is left. In the case of the voluntary threaded handler, there is nothing that will process pending softirqs. Which means it remain queued until another random interrupt (on this CPU) fires and handles it on its exit path or another thread locks and unlocks a lock with the bh suffix. Worst case is that the CPU goes idle and the NOHZ complains about unhandled softirqs. Disable bottom halves before acquiring the lock (and disabling interrupts) and enable them after dropping the lock. This ensures that any pending softirqs will handled right away. cc: stable@vger.kernel.org Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") Signed-off-by: Pawel Laszczak Acked-by: Peter Chen Link: https://lore.kernel.org/r/20231108093125.224963-1-pawell@cadence.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdnsp-ring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/cdns3/cdnsp-ring.c b/drivers/usb/cdns3/cdnsp-ring.c index af981778382df..02f297f5637d7 100644 --- a/drivers/usb/cdns3/cdnsp-ring.c +++ b/drivers/usb/cdns3/cdnsp-ring.c @@ -1529,6 +1529,7 @@ irqreturn_t cdnsp_thread_irq_handler(int irq, void *data) unsigned long flags; int counter = 0; + local_bh_disable(); spin_lock_irqsave(&pdev->lock, flags); if (pdev->cdnsp_state & (CDNSP_STATE_HALTED | CDNSP_STATE_DYING)) { @@ -1541,6 +1542,7 @@ irqreturn_t cdnsp_thread_irq_handler(int irq, void *data) cdnsp_died(pdev); spin_unlock_irqrestore(&pdev->lock, flags); + local_bh_enable(); return IRQ_HANDLED; } @@ -1557,6 +1559,7 @@ irqreturn_t cdnsp_thread_irq_handler(int irq, void *data) cdnsp_update_erst_dequeue(pdev, event_ring_deq, 1); spin_unlock_irqrestore(&pdev->lock, flags); + local_bh_enable(); return IRQ_HANDLED; } From 30ce1c03a083c9dc131d09d28ba1bcaafa3d8df2 Mon Sep 17 00:00:00 2001 From: Wentong Wu Date: Tue, 14 Nov 2023 15:25:31 +0800 Subject: [PATCH 471/560] usb: misc: ljca: Drop _ADR support to get ljca children devices Currently the shipped platforms use only _HID to distinguish ljca children devices. The _ADR support here is for future HW. This patch is to drop _ADR support and we can then re-introduce it (revert this patch) if future HW actually starts using _ADR to distinguish children devices. Signed-off-by: Wentong Wu Reviewed-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231114072531.1366753-1-wentong.wu@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb-ljca.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/usb/misc/usb-ljca.c b/drivers/usb/misc/usb-ljca.c index c9decd0396d49..7f0deebebc130 100644 --- a/drivers/usb/misc/usb-ljca.c +++ b/drivers/usb/misc/usb-ljca.c @@ -457,8 +457,8 @@ static void ljca_auxdev_acpi_bind(struct ljca_adapter *adap, u64 adr, u8 id) { struct ljca_match_ids_walk_data wd = { 0 }; - struct acpi_device *parent, *adev; struct device *dev = adap->dev; + struct acpi_device *parent; char uid[4]; parent = ACPI_COMPANION(dev); @@ -466,17 +466,7 @@ static void ljca_auxdev_acpi_bind(struct ljca_adapter *adap, return; /* - * get auxdev ACPI handle from the ACPI device directly - * under the parent that matches _ADR. - */ - adev = acpi_find_child_device(parent, adr, false); - if (adev) { - ACPI_COMPANION_SET(&auxdev->dev, adev); - return; - } - - /* - * _ADR is a grey area in the ACPI specification, some + * Currently LJCA hw doesn't use _ADR instead the shipped * platforms use _HID to distinguish children devices. */ switch (adr) { From 0583bc776ca5b5a3f5752869fc31cf7322df2b35 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 15 Nov 2023 15:45:07 +0100 Subject: [PATCH 472/560] USB: dwc2: write HCINT with INTMASK applied dwc2_hc_n_intr() writes back INTMASK as read but evaluates it with intmask applied. In stress testing this causes spurious interrupts like this: [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001 [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001 [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001 [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length Applying INTMASK prevents this. The issue exists in all versions of the driver. Signed-off-by: Oliver Neukum Tested-by: Ivan Ivanov Tested-by: Andrea della Porta Link: https://lore.kernel.org/r/20231115144514.15248-1-oneukum@suse.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/hcd_intr.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c index 0144ca8350c31..5c7538d498dd1 100644 --- a/drivers/usb/dwc2/hcd_intr.c +++ b/drivers/usb/dwc2/hcd_intr.c @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum) { struct dwc2_qtd *qtd; struct dwc2_host_chan *chan; - u32 hcint, hcintmsk; + u32 hcint, hcintraw, hcintmsk; chan = hsotg->hc_ptr_array[chnum]; - hcint = dwc2_readl(hsotg, HCINT(chnum)); + hcintraw = dwc2_readl(hsotg, HCINT(chnum)); hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum)); + hcint = hcintraw & hcintmsk; + dwc2_writel(hsotg, hcint, HCINT(chnum)); + if (!chan) { dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n"); - dwc2_writel(hsotg, hcint, HCINT(chnum)); return; } @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum) chnum); dev_vdbg(hsotg->dev, " hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n", - hcint, hcintmsk, hcint & hcintmsk); + hcintraw, hcintmsk, hcint); } - dwc2_writel(hsotg, hcint, HCINT(chnum)); - /* * If we got an interrupt after someone called * dwc2_hcd_endpoint_disable() we don't want to crash below @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum) return; } - chan->hcint = hcint; - hcint &= hcintmsk; + chan->hcint = hcintraw; /* * If the channel was halted due to a dequeue, the qtd list might From 791cd7afe51b0c770264836ab0607766e3e80f52 Mon Sep 17 00:00:00 2001 From: Stanley Chang Date: Fri, 17 Nov 2023 15:03:05 +0800 Subject: [PATCH 473/560] usb: dwc3: add missing of_node_put and platform_device_put of_get_compatible_child performs an of_node_get, so an of_node_put is required. Add platform_device_put to match with of_find_device_by_node. Fixes: 34c200483569 ("usb: dwc3: add Realtek DHC RTD SoC dwc3 glue layer driver") Signed-off-by: Stanley Chang Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20231117070311.32502-1-stanley_chang@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-rtk.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-rtk.c b/drivers/usb/dwc3/dwc3-rtk.c index 590028e8fdcb2..3cd6b184551ce 100644 --- a/drivers/usb/dwc3/dwc3-rtk.c +++ b/drivers/usb/dwc3/dwc3-rtk.c @@ -183,10 +183,13 @@ static enum usb_device_speed __get_dwc3_maximum_speed(struct device_node *np) ret = of_property_read_string(dwc3_np, "maximum-speed", &maximum_speed); if (ret < 0) - return USB_SPEED_UNKNOWN; + goto out; ret = match_string(speed_names, ARRAY_SIZE(speed_names), maximum_speed); +out: + of_node_put(dwc3_np); + return (ret < 0) ? USB_SPEED_UNKNOWN : ret; } @@ -339,6 +342,9 @@ static int dwc3_rtk_probe_dwc3_core(struct dwc3_rtk *rtk) switch_usb2_role(rtk, rtk->cur_role); + platform_device_put(dwc3_pdev); + of_node_put(dwc3_node); + return 0; err_pdev_put: From 974bba5c118f4c2baf00de0356e3e4f7928b4cbc Mon Sep 17 00:00:00 2001 From: Niklas Neronin Date: Wed, 15 Nov 2023 14:13:25 +0200 Subject: [PATCH 474/560] usb: config: fix iteration issue in 'usb_get_bos_descriptor()' The BOS descriptor defines a root descriptor and is the base descriptor for accessing a family of related descriptors. Function 'usb_get_bos_descriptor()' encounters an iteration issue when skipping the 'USB_DT_DEVICE_CAPABILITY' descriptor type. This results in the same descriptor being read repeatedly. To address this issue, a 'goto' statement is introduced to ensure that the pointer and the amount read is updated correctly. This ensures that the function iterates to the next descriptor instead of reading the same descriptor repeatedly. Cc: stable@vger.kernel.org Fixes: 3dd550a2d365 ("USB: usbcore: Fix slab-out-of-bounds bug during device reset") Signed-off-by: Niklas Neronin Acked-by: Mathias Nyman Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20231115121325.471454-1-niklas.neronin@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index b19e38d5fd10c..7f8d33f92ddb5 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -1047,7 +1047,7 @@ int usb_get_bos_descriptor(struct usb_device *dev) if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_notice(ddev, "descriptor type invalid, skip\n"); - continue; + goto skip_to_next_descriptor; } switch (cap_type) { @@ -1078,6 +1078,7 @@ int usb_get_bos_descriptor(struct usb_device *dev) break; } +skip_to_next_descriptor: total_len -= length; buffer += length; } From 8bbae288a85abed6a1cf7d185d8b9dc2f5dcb12c Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Fri, 27 Oct 2023 11:28:20 +0000 Subject: [PATCH 475/560] usb: dwc3: set the dma max_seg_size Allow devices to have dma operations beyond 4K, and avoid warnings such as: DMA-API: dwc3 a600000.usb: mapping sg segment longer than device claims to support [len=86016] [max=65536] Cc: stable@vger.kernel.org Fixes: 72246da40f37 ("usb: Introduce DesignWare USB3 DRD Driver") Reported-by: Zubin Mithra Signed-off-by: Ricardo Ribalda Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20231026-dwc3-v2-1-1d4fd5c3e067@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 0328c86ef8061..b101dbf8c5dcc 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -2034,6 +2034,8 @@ static int dwc3_probe(struct platform_device *pdev) pm_runtime_put(dev); + dma_set_max_seg_size(dev, UINT_MAX); + return 0; err_exit_debugfs: From 61d2cf0db741827724d33079b4a54bf99a32b8e5 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Sat, 18 Nov 2023 11:30:11 +0800 Subject: [PATCH 476/560] usb: xhci-mtk: fix in-ep's start-split check failure It's wrong to use the data length in a CS (in uframe x) to check whether there is a SS (in uframe x-2), because for a isoc-in ep, it may need some CS to receive data; Save the count of SS in a uframe for isoc/intr in-eps to fix the issue. Fixes: 5c954e030f55 ("usb: xhci-mtk: improve split scheduling by separate IN/OUT budget") Signed-off-by: Chunfeng Yun Link: https://lore.kernel.org/r/20231118033011.22033-1-chunfeng.yun@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mtk-sch.c | 13 ++++++++++--- drivers/usb/host/xhci-mtk.h | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-mtk-sch.c b/drivers/usb/host/xhci-mtk-sch.c index 5b3cd455adecc..61f3f8bbdcead 100644 --- a/drivers/usb/host/xhci-mtk-sch.c +++ b/drivers/usb/host/xhci-mtk-sch.c @@ -650,9 +650,8 @@ static int check_isoc_ss_overlap(struct mu3h_sch_ep_info *sch_ep, u32 offset) if (sch_ep->ep_type == ISOC_OUT_EP) { for (j = 0; j < sch_ep->num_budget_microframes; j++) { - k = XHCI_MTK_BW_INDEX(base + j + CS_OFFSET); - /* use cs to indicate existence of in-ss @(base+j) */ - if (tt->fs_bus_bw_in[k]) + k = XHCI_MTK_BW_INDEX(base + j); + if (tt->in_ss_cnt[k]) return -ESCH_SS_OVERLAP; } } else if (sch_ep->ep_type == ISOC_IN_EP || sch_ep->ep_type == INT_IN_EP) { @@ -769,6 +768,14 @@ static void update_sch_tt(struct mu3h_sch_ep_info *sch_ep, bool used) tt->fs_frame_bw[f] -= (u16)sch_ep->bw_budget_table[j]; } } + + if (sch_ep->ep_type == ISOC_IN_EP || sch_ep->ep_type == INT_IN_EP) { + k = XHCI_MTK_BW_INDEX(base); + if (used) + tt->in_ss_cnt[k]++; + else + tt->in_ss_cnt[k]--; + } } if (used) diff --git a/drivers/usb/host/xhci-mtk.h b/drivers/usb/host/xhci-mtk.h index 865b55e23b159..39f7ae7d30871 100644 --- a/drivers/usb/host/xhci-mtk.h +++ b/drivers/usb/host/xhci-mtk.h @@ -38,6 +38,7 @@ * @fs_bus_bw_in: save bandwidth used by FS/LS IN eps in each uframes * @ls_bus_bw: save bandwidth used by LS eps in each uframes * @fs_frame_bw: save bandwidth used by FS/LS eps in each FS frames + * @in_ss_cnt: the count of Start-Split for IN eps * @ep_list: Endpoints using this TT */ struct mu3h_sch_tt { @@ -45,6 +46,7 @@ struct mu3h_sch_tt { u16 fs_bus_bw_in[XHCI_MTK_MAX_ESIT]; u8 ls_bus_bw[XHCI_MTK_MAX_ESIT]; u16 fs_frame_bw[XHCI_MTK_FRAMES_CNT]; + u8 in_ss_cnt[XHCI_MTK_MAX_ESIT]; struct list_head ep_list; }; From 4b435764f7c2922822962e7f6343cce645d502f1 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 21 Nov 2023 13:46:47 +0200 Subject: [PATCH 477/560] usb: typec: tipd: Supply also I2C driver data If there is no fwnode, device_get_match_data() does not return anything making the probe to always fail. Using i2c_get_match_data() when there is no fwnode to fix that. Fixes: 5bd4853da049 ("USB: typec: tps6598x: Add device data to of_device_id") Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20231121114647.2005011-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tipd/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/tipd/core.c b/drivers/usb/typec/tipd/core.c index b0184be06c3d0..196535ad996d0 100644 --- a/drivers/usb/typec/tipd/core.c +++ b/drivers/usb/typec/tipd/core.c @@ -1227,7 +1227,10 @@ static int tps6598x_probe(struct i2c_client *client) TPS_REG_INT_PLUG_EVENT; } - tps->data = device_get_match_data(tps->dev); + if (dev_fwnode(tps->dev)) + tps->data = device_get_match_data(tps->dev); + else + tps->data = i2c_get_match_data(client); if (!tps->data) return -EINVAL; @@ -1426,7 +1429,7 @@ static const struct of_device_id tps6598x_of_match[] = { MODULE_DEVICE_TABLE(of, tps6598x_of_match); static const struct i2c_device_id tps6598x_id[] = { - { "tps6598x" }, + { "tps6598x", (kernel_ulong_t)&tps6598x_data }, { } }; MODULE_DEVICE_TABLE(i2c, tps6598x_id); From 16b7e0cccb243033de4406ffb4d892365041a1e7 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 3 Nov 2023 17:43:23 +0100 Subject: [PATCH 478/560] USB: xhci-plat: fix legacy PHY double init Commits 7b8ef22ea547 ("usb: xhci: plat: Add USB phy support") and 9134c1fd0503 ("usb: xhci: plat: Add USB 3.0 phy support") added support for looking up legacy PHYs from the sysdev devicetree node and initialising them. This broke drivers such as dwc3 which manages PHYs themself as the PHYs would now be initialised twice, something which specifically can lead to resources being left enabled during suspend (e.g. with the usb_phy_generic PHY driver). As the dwc3 driver uses driver-name matching for the xhci platform device, fix this by only looking up and initialising PHYs for devices that have been matched using OF. Note that checking that the platform device has a devicetree node would currently be sufficient, but that could lead to subtle breakages in case anyone ever tries to reuse an ancestor's node. Fixes: 7b8ef22ea547 ("usb: xhci: plat: Add USB phy support") Fixes: 9134c1fd0503 ("usb: xhci: plat: Add USB 3.0 phy support") Cc: stable@vger.kernel.org # 4.1 Cc: Maxime Ripard Cc: Stanley Chang Signed-off-by: Johan Hovold Tested-by: Stefan Eichenberger Tested-by: Stanley Chang Link: https://lore.kernel.org/r/20231103164323.14294-1-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-plat.c | 50 +++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index b93161374293b..732cdeb739202 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -148,7 +149,7 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s int ret; int irq; struct xhci_plat_priv *priv = NULL; - + bool of_match; if (usb_disabled()) return -ENODEV; @@ -253,16 +254,23 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s &xhci->imod_interval); } - hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, "usb-phy", 0); - if (IS_ERR(hcd->usb_phy)) { - ret = PTR_ERR(hcd->usb_phy); - if (ret == -EPROBE_DEFER) - goto disable_clk; - hcd->usb_phy = NULL; - } else { - ret = usb_phy_init(hcd->usb_phy); - if (ret) - goto disable_clk; + /* + * Drivers such as dwc3 manages PHYs themself (and rely on driver name + * matching for the xhci platform device). + */ + of_match = of_match_device(pdev->dev.driver->of_match_table, &pdev->dev); + if (of_match) { + hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, "usb-phy", 0); + if (IS_ERR(hcd->usb_phy)) { + ret = PTR_ERR(hcd->usb_phy); + if (ret == -EPROBE_DEFER) + goto disable_clk; + hcd->usb_phy = NULL; + } else { + ret = usb_phy_init(hcd->usb_phy); + if (ret) + goto disable_clk; + } } hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node); @@ -285,15 +293,17 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s goto dealloc_usb2_hcd; } - xhci->shared_hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, - "usb-phy", 1); - if (IS_ERR(xhci->shared_hcd->usb_phy)) { - xhci->shared_hcd->usb_phy = NULL; - } else { - ret = usb_phy_init(xhci->shared_hcd->usb_phy); - if (ret) - dev_err(sysdev, "%s init usb3phy fail (ret=%d)\n", - __func__, ret); + if (of_match) { + xhci->shared_hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, + "usb-phy", 1); + if (IS_ERR(xhci->shared_hcd->usb_phy)) { + xhci->shared_hcd->usb_phy = NULL; + } else { + ret = usb_phy_init(xhci->shared_hcd->usb_phy); + if (ret) + dev_err(sysdev, "%s init usb3phy fail (ret=%d)\n", + __func__, ret); + } } xhci->shared_hcd->tpl_support = hcd->tpl_support; From 98c598afc22d4e43c2ad91860b65996d0c099a5d Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 11 Sep 2023 10:33:08 +0800 Subject: [PATCH 479/560] nbd: pass nbd_sock to nbd_read_reply() instead of index If a socket is processing ioctl 'NBD_SET_SOCK', config->socks might be krealloc in nbd_add_socket(), and a garbage request is received now, a UAF may occurs. T1 nbd_ioctl __nbd_ioctl nbd_add_socket blk_mq_freeze_queue T2 recv_work nbd_read_reply sock_xmit krealloc config->socks def config->socks Pass nbd_sock to nbd_read_reply(). And introduce a new function sock_xmit_recv(), which differs from sock_xmit only in the way it get socket. ================================================================== BUG: KASAN: use-after-free in sock_xmit+0x525/0x550 Read of size 8 at addr ffff8880188ec428 by task kworker/u12:1/18779 Workqueue: knbd4-recv recv_work Call Trace: __dump_stack dump_stack+0xbe/0xfd print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 sock_xmit+0x525/0x550 nbd_read_reply+0xfe/0x2c0 recv_work+0x1c2/0x750 process_one_work+0x6b6/0xf10 worker_thread+0xdd/0xd80 kthread+0x30a/0x410 ret_from_fork+0x22/0x30 Allocated by task 18784: kasan_save_stack+0x1b/0x40 kasan_set_track set_alloc_info __kasan_kmalloc __kasan_kmalloc.constprop.0+0xf0/0x130 slab_post_alloc_hook slab_alloc_node slab_alloc __kmalloc_track_caller+0x157/0x550 __do_krealloc krealloc+0x37/0xb0 nbd_add_socket +0x2d3/0x880 __nbd_ioctl nbd_ioctl+0x584/0x8e0 __blkdev_driver_ioctl blkdev_ioctl+0x2a0/0x6e0 block_ioctl+0xee/0x130 vfs_ioctl __do_sys_ioctl __se_sys_ioctl+0x138/0x190 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Freed by task 18784: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x40 __kasan_slab_free.part.0+0x13f/0x1b0 slab_free_hook slab_free_freelist_hook slab_free kfree+0xcb/0x6c0 krealloc+0x56/0xb0 nbd_add_socket+0x2d3/0x880 __nbd_ioctl nbd_ioctl+0x584/0x8e0 __blkdev_driver_ioctl blkdev_ioctl+0x2a0/0x6e0 block_ioctl+0xee/0x130 vfs_ioctl __do_sys_ioctl __se_sys_ioctl+0x138/0x190 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Signed-off-by: Li Nan Reviewed-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230911023308.3467802-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3f03cb3dc33cc..b6414e1e645b7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -67,6 +67,7 @@ struct nbd_sock { struct recv_thread_args { struct work_struct work; struct nbd_device *nbd; + struct nbd_sock *nsock; int index; }; @@ -505,15 +506,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) return BLK_EH_DONE; } -/* - * Send or receive packet. Return a positive value on success and - * negtive value on failue, and never return 0. - */ -static int sock_xmit(struct nbd_device *nbd, int index, int send, - struct iov_iter *iter, int msg_flags, int *sent) +static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, + struct iov_iter *iter, int msg_flags, int *sent) { - struct nbd_config *config = nbd->config; - struct socket *sock = config->socks[index]->sock; int result; struct msghdr msg; unsigned int noreclaim_flag; @@ -556,6 +551,19 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, return result; } +/* + * Send or receive packet. Return a positive value on success and + * negtive value on failure, and never return 0. + */ +static int sock_xmit(struct nbd_device *nbd, int index, int send, + struct iov_iter *iter, int msg_flags, int *sent) +{ + struct nbd_config *config = nbd->config; + struct socket *sock = config->socks[index]->sock; + + return __sock_xmit(nbd, sock, send, iter, msg_flags, sent); +} + /* * Different settings for sk->sk_sndtimeo can result in different return values * if there is a signal pending when we enter sendmsg, because reasons? @@ -712,7 +720,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) return 0; } -static int nbd_read_reply(struct nbd_device *nbd, int index, +static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, struct nbd_reply *reply) { struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)}; @@ -721,7 +729,7 @@ static int nbd_read_reply(struct nbd_device *nbd, int index, reply->magic = 0; iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply)); - result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); + result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL); if (result < 0) { if (!nbd_disconnected(nbd->config)) dev_err(disk_to_dev(nbd->disk), @@ -845,14 +853,14 @@ static void recv_work(struct work_struct *work) struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; struct request_queue *q = nbd->disk->queue; - struct nbd_sock *nsock; + struct nbd_sock *nsock = args->nsock; struct nbd_cmd *cmd; struct request *rq; while (1) { struct nbd_reply reply; - if (nbd_read_reply(nbd, args->index, &reply)) + if (nbd_read_reply(nbd, nsock->sock, &reply)) break; /* @@ -887,7 +895,6 @@ static void recv_work(struct work_struct *work) percpu_ref_put(&q->q_usage_counter); } - nsock = config->socks[args->index]; mutex_lock(&nsock->tx_lock); nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); @@ -1231,6 +1238,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) INIT_WORK(&args->work, recv_work); args->index = i; args->nbd = nbd; + args->nsock = nsock; nsock->cookie++; mutex_unlock(&nsock->tx_lock); sockfd_put(old); @@ -1413,6 +1421,7 @@ static int nbd_start_device(struct nbd_device *nbd) refcount_inc(&nbd->config_refs); INIT_WORK(&args->work, recv_work); args->nbd = nbd; + args->nsock = config->socks[i]; args->index = i; queue_work(nbd->recv_workq, &args->work); } From 2e569ada424c40ce27c99bfab4f9780619061c83 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 15 Nov 2023 22:02:11 +0100 Subject: [PATCH 480/560] x86/microcode: Remove the driver announcement and version First of all, the print is useless. The driver will either load and say which microcode revision the machine has or issue an error. Then, the version number is meaningless and actively confusing, as Yazen mentioned recently: when a subset of patches are backported to a distro kernel, one can't assume the driver version is the same as the upstream one. And besides, the version number of the loader hasn't been used and incremented for a long time. So drop it. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231115210212.9981-2-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 666d25bbc5ad2..b4be3a2c79df5 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -41,8 +41,6 @@ #include "internal.h" -#define DRIVER_VERSION "2.2" - static struct microcode_ops *microcode_ops; bool dis_ucode_ldr = true; @@ -846,8 +844,6 @@ static int __init microcode_init(void) cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); - return 0; out_pdev: From 080990aa3344123673f686cda2df0d1b0deee046 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 15 Nov 2023 22:02:12 +0100 Subject: [PATCH 481/560] x86/microcode: Rework early revisions reporting The AMD side of the loader issues the microcode revision for each logical thread on the system, which can become really noisy on huge machines. And doing that doesn't make a whole lot of sense - the microcode revision is already in /proc/cpuinfo. So in case one is interested in the theoretical support of mixed silicon steppings on AMD, one can check there. What is also missing on the AMD side - something which people have requested before - is showing the microcode revision the CPU had *before* the early update. So abstract that up in the main code and have the BSP on each vendor provide those revision numbers. Then, dump them only once on driver init. On Intel, do not dump the patch date - it is not needed. Reported-by: Linus Torvalds Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/CAHk-=wg=%2B8rceshMkB4VnKxmRccVLtBLPBawnewZuuqyx5U=3A@mail.gmail.com --- arch/x86/kernel/cpu/microcode/amd.c | 39 +++++++----------------- arch/x86/kernel/cpu/microcode/core.c | 11 +++++-- arch/x86/kernel/cpu/microcode/intel.c | 17 +++++------ arch/x86/kernel/cpu/microcode/internal.h | 14 ++++++--- 4 files changed, 37 insertions(+), 44 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9373ec01c5ae1..13b45b9c806da 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -104,8 +104,6 @@ struct cont_desc { size_t size; }; -static u32 ucode_new_rev; - /* * Microcode patch container file is prepended to the initrd in cpio * format. See Documentation/arch/x86/microcode.rst @@ -442,12 +440,11 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. */ -static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) +static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) { struct cont_desc desc = { 0 }; struct microcode_amd *mc; bool ret = false; - u32 rev, dummy; desc.cpuid_1_eax = cpuid_1_eax; @@ -457,22 +454,15 @@ static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size) if (!mc) return ret; - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* * Allow application of the same revision to pick up SMT-specific * changes even if the revision of the other SMT thread is already * up-to-date. */ - if (rev > mc->hdr.patch_id) + if (old_rev > mc->hdr.patch_id) return ret; - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - ret = true; - } - - return ret; + return !__apply_microcode_amd(mc); } static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) @@ -506,9 +496,12 @@ static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpi *ret = cp; } -void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) +void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { struct cpio_data cp = { }; + u32 dummy; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; @@ -517,7 +510,8 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) if (!(cp.data && cp.size)) return; - early_apply_microcode(cpuid_1_eax, cp.data, cp.size); + if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) + native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); } static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); @@ -625,10 +619,8 @@ void reload_ucode_amd(unsigned int cpu) rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("reload patch_level=0x%08x\n", ucode_new_rev); - } + if (!__apply_microcode_amd(mc)) + pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); } } @@ -649,8 +641,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) if (p && (p->patch_id == csig->rev)) uci->mc = p->data; - pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); - return 0; } @@ -691,8 +681,6 @@ static enum ucode_state apply_microcode_amd(int cpu) rev = mc_amd->hdr.patch_id; ret = UCODE_UPDATED; - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); - out: uci->cpu_sig.rev = rev; c->microcode = rev; @@ -935,11 +923,6 @@ struct microcode_ops * __init init_amd_microcode(void) pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } - - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - return µcode_amd_ops; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b4be3a2c79df5..232026a239a68 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -75,6 +75,8 @@ static u32 final_levels[] = { 0, /* T-101 terminator */ }; +struct early_load_data early_data; + /* * Check the current patch level on this CPU. * @@ -153,9 +155,9 @@ void __init load_ucode_bsp(void) return; if (intel) - load_ucode_intel_bsp(); + load_ucode_intel_bsp(&early_data); else - load_ucode_amd_bsp(cpuid_1_eax); + load_ucode_amd_bsp(&early_data, cpuid_1_eax); } void load_ucode_ap(void) @@ -826,6 +828,11 @@ static int __init microcode_init(void) if (!microcode_ops) return -ENODEV; + pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev)); + + if (early_data.new_rev) + pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev); + microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 6024feb98d29d..070426b9895fe 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -339,16 +339,9 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci) { struct microcode_intel *mc = uci->mc; - enum ucode_state ret; - u32 cur_rev, date; + u32 cur_rev; - ret = __apply_microcode(uci, mc, &cur_rev); - if (ret == UCODE_UPDATED) { - date = mc->hdr.date; - pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", - cur_rev, mc->hdr.rev, date & 0xffff, date >> 24, (date >> 16) & 0xff); - } - return ret; + return __apply_microcode(uci, mc, &cur_rev); } static __init bool load_builtin_intel_microcode(struct cpio_data *cp) @@ -413,13 +406,17 @@ static int __init save_builtin_microcode(void) early_initcall(save_builtin_microcode); /* Load microcode on BSP from initrd or builtin blobs */ -void __init load_ucode_intel_bsp(void) +void __init load_ucode_intel_bsp(struct early_load_data *ed) { struct ucode_cpu_info uci; + ed->old_rev = intel_get_microcode_revision(); + uci.mc = get_microcode_blob(&uci, false); if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) ucode_patch_va = UCODE_BSP_LOADED; + + ed->new_rev = uci.cpu_sig.rev; } void load_ucode_intel_ap(void) diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index f8047b12329a9..21776c529fa97 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -37,6 +37,12 @@ struct microcode_ops { use_nmi : 1; }; +struct early_load_data { + u32 old_rev; + u32 new_rev; +}; + +extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; struct cpio_data find_microcode_in_initrd(const char *path); @@ -92,14 +98,14 @@ extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD -void load_ucode_amd_bsp(unsigned int family); +void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ -static inline void load_ucode_amd_bsp(unsigned int family) { } +static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } @@ -108,12 +114,12 @@ static inline void exit_amd_microcode(void) { } #endif /* !CONFIG_CPU_SUP_AMD */ #ifdef CONFIG_CPU_SUP_INTEL -void load_ucode_intel_bsp(void); +void load_ucode_intel_bsp(struct early_load_data *ed); void load_ucode_intel_ap(void); void reload_ucode_intel(void); struct microcode_ops *init_intel_microcode(void); #else /* CONFIG_CPU_SUP_INTEL */ -static inline void load_ucode_intel_bsp(void) { } +static inline void load_ucode_intel_bsp(struct early_load_data *ed) { } static inline void load_ucode_intel_ap(void) { } static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } From cea7008190ad65b4aaae6e94667a358d2c10a696 Mon Sep 17 00:00:00 2001 From: Cong Yang Date: Mon, 20 Nov 2023 10:01:09 +0800 Subject: [PATCH 482/560] drm/panel: boe-tv101wum-nl6: Fine tune Himax83102-j02 panel HFP and HBP The refresh reported by modetest is 60.46Hz, and the actual measurement is 60.01Hz, which is outside the expected tolerance. Adjust hporch and pixel clock to fix it. After repair, modetest and actual measurement were all 60.01Hz. Modetest refresh = Pixel CLK/ htotal* vtotal, but measurement frame rate is HS->LP cycle time(Vblanking). Measured frame rate is not only affecte by Htotal/Vtotal/pixel clock, also affected by Lane-num/PixelBit/LineTime /DSI CLK. Assume that the DSI controller could not make the mode that we requested(presumably it's PLL couldn't generate the exact pixel clock?). If you use a different DSI controller, you may need to readjust these parameters. Now this panel looks like it's only used by me on the MTK platform, so let's change this set of parameters. Fixes: 1bc2ef065f13 ("drm/panel: Support for Starry-himax83102-j02 TDDI MIPI-DSI panel") Signed-off-by: Cong Yang Reviewed-by: Douglas Anderson Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20231120020109.3216343-1-yangcong5@huaqin.corp-partner.google.com --- drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c index a287be1aaf70f..be8f48e3c1db8 100644 --- a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c @@ -1767,11 +1767,11 @@ static const struct panel_desc starry_qfh032011_53g_desc = { }; static const struct drm_display_mode starry_himax83102_j02_default_mode = { - .clock = 161600, + .clock = 162850, .hdisplay = 1200, - .hsync_start = 1200 + 40, - .hsync_end = 1200 + 40 + 20, - .htotal = 1200 + 40 + 20 + 40, + .hsync_start = 1200 + 50, + .hsync_end = 1200 + 50 + 20, + .htotal = 1200 + 50 + 20 + 50, .vdisplay = 1920, .vsync_start = 1920 + 116, .vsync_end = 1920 + 116 + 8, From ab93edb2f94c3c0d5965be3815782472adbe3f52 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 22 Nov 2023 06:11:09 +1000 Subject: [PATCH 483/560] nouveau/gsp: allocate enough space for all channel ids. This probably isn't the ideal fix, but we ended up using chids sparsely, and lots of things rely on indexing into the full range, so just allocate the full range up front. The GSP code fixes 8 channels into a userd page, but we end up using a single userd page per channel so end up sparsely using the range. Fixes a few crashes seen with multiple channels. Link: https://gitlab.freedesktop.org/drm/nouveau/-/issues/277 Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20231121201109.2988516-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/engine/fifo/r535.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/r535.c b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/r535.c index 3adbb05ff587b..d088e636edc31 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/r535.c @@ -539,7 +539,7 @@ r535_fifo_runl_ctor(struct nvkm_fifo *fifo) struct nvkm_runl *runl; struct nvkm_engn *engn; u32 cgids = 2048; - u32 chids = 2048 / CHID_PER_USERD; + u32 chids = 2048; int ret; NV2080_CTRL_FIFO_GET_DEVICE_INFO_TABLE_PARAMS *ctrl; From 0739af07d1d947af27c877f797cb82ceee702515 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Mon, 20 Nov 2023 13:06:29 +0100 Subject: [PATCH 484/560] net: usb: ax88179_178a: fix failed operations during ax88179_reset Using generic ASIX Electronics Corp. AX88179 Gigabit Ethernet device, the following test cycle has been implemented: - power on - check logs - shutdown - after detecting the system shutdown, disconnect power - after approximately 60 seconds of sleep, power is restored Running some cycles, sometimes error logs like this appear: kernel: ax88179_178a 2-9:1.0 (unnamed net_device) (uninitialized): Failed to write reg index 0x0001: -19 kernel: ax88179_178a 2-9:1.0 (unnamed net_device) (uninitialized): Failed to read reg index 0x0001: -19 ... These failed operation are happening during ax88179_reset execution, so the initialization could not be correct. In order to avoid this, we need to increase the delay after reset and clock initial operations. By using these larger values, many cycles have been run and no failed operations appear. It would be better to check some status register to verify when the operation has finished, but I do not have found any available information (neither in the public datasheets nor in the manufacturer's driver). The only available information for the necessary delays is the maufacturer's driver (original values) but the proposed values are not enough for the tested devices. Fixes: e2ca90c276e1f ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Reported-by: Herb Wei Tested-by: Herb Wei Signed-off-by: Jose Ignacio Tornos Martinez Link: https://lore.kernel.org/r/20231120120642.54334-1-jtornosm@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/ax88179_178a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index aff39bf3161de..4ea0e155bb0d5 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1583,11 +1583,11 @@ static int ax88179_reset(struct usbnet *dev) *tmp16 = AX_PHYPWR_RSTCTL_IPRL; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_PHYPWR_RSTCTL, 2, 2, tmp16); - msleep(200); + msleep(500); *tmp = AX_CLK_SELECT_ACS | AX_CLK_SELECT_BCS; ax88179_write_cmd(dev, AX_ACCESS_MAC, AX_CLK_SELECT, 1, 1, tmp); - msleep(100); + msleep(200); /* Ethernet PHY Auto Detach*/ ax88179_auto_detach(dev); From 495ec91b48e489afefb2ad714f0d9b68c3016c6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 12:01:09 -0800 Subject: [PATCH 485/560] docs: netdev: try to guide people on dealing with silence There has been more than a few threads which went idle before the merge window and now people came back to them and started asking about next steps. We currently tell people to be patient and not to repost too often. Our "not too often", however, is still a few orders of magnitude faster than other subsystems. Or so I feel after hearing people talk about review rates at LPC. Clarify in the doc that if the discussion went idle for a week on netdev, 95% of the time there's no point waiting longer. Link: https://lore.kernel.org/r/20231120200109.620392-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/process/maintainer-netdev.rst | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/process/maintainer-netdev.rst b/Documentation/process/maintainer-netdev.rst index 7feacc20835e4..84ee60fceef24 100644 --- a/Documentation/process/maintainer-netdev.rst +++ b/Documentation/process/maintainer-netdev.rst @@ -193,9 +193,23 @@ Review timelines Generally speaking, the patches get triaged quickly (in less than 48h). But be patient, if your patch is active in patchwork (i.e. it's listed on the project's patch list) the chances it was missed are close to zero. -Asking the maintainer for status updates on your -patch is a good way to ensure your patch is ignored or pushed to the -bottom of the priority list. + +The high volume of development on netdev makes reviewers move on +from discussions relatively quickly. New comments and replies +are very unlikely to arrive after a week of silence. If a patch +is no longer active in patchwork and the thread went idle for more +than a week - clarify the next steps and/or post the next version. + +For RFC postings specifically, if nobody responded in a week - reviewers +either missed the posting or have no strong opinions. If the code is ready, +repost as a PATCH. + +Emails saying just "ping" or "bump" are considered rude. If you can't figure +out the status of the patch from patchwork or where the discussion has +landed - describe your best guess and ask if it's correct. For example:: + + I don't understand what the next steps are. Person X seems to be unhappy + with A, should I do B and repost the patches? .. _Changes requested: From b6fe6f03716da246b453369f98a553d4ab21447c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 21 Nov 2023 09:37:09 +0800 Subject: [PATCH 486/560] dpll: Fix potential msg memleak when genlmsg_put_reply failed We should clean the skb resource if genlmsg_put_reply failed. Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") Signed-off-by: Hao Ge Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20231121013709.73323-1-gehao@kylinos.cn Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index a6dc3997bf5c5..442a0ebeb953e 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -1093,9 +1093,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_PIN_ID_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; - + } pin = dpll_pin_find_from_nlattr(info); if (!IS_ERR(pin)) { ret = dpll_msg_add_pin_handle(msg, pin); @@ -1123,8 +1124,10 @@ int dpll_nl_pin_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_PIN_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } ret = dpll_cmd_pin_get_one(msg, pin, info->extack); if (ret) { nlmsg_free(msg); @@ -1256,8 +1259,10 @@ int dpll_nl_device_id_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_DEVICE_ID_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } dpll = dpll_device_find_from_nlattr(info); if (!IS_ERR(dpll)) { @@ -1284,8 +1289,10 @@ int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; hdr = genlmsg_put_reply(msg, info, &dpll_nl_family, 0, DPLL_CMD_DEVICE_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(msg); return -EMSGSIZE; + } ret = dpll_device_get_one(dpll, msg, info->extack); if (ret) { From 18286883e779fb79b413a7462968ee3f6768f19c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 14 Nov 2023 17:59:28 +0100 Subject: [PATCH 487/560] x86/hyperv: Use atomic_try_cmpxchg() to micro-optimize hv_nmi_unknown() Use atomic_try_cmpxchg() instead of atomic_cmpxchg(*ptr, old, new) == old in hv_nmi_unknown(). On x86 the CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG. The generated asm code improves from: 3e: 65 8b 15 00 00 00 00 mov %gs:0x0(%rip),%edx 45: b8 ff ff ff ff mov $0xffffffff,%eax 4a: f0 0f b1 15 00 00 00 lock cmpxchg %edx,0x0(%rip) 51: 00 52: 83 f8 ff cmp $0xffffffff,%eax 55: 0f 95 c0 setne %al to: 3e: 65 8b 15 00 00 00 00 mov %gs:0x0(%rip),%edx 45: b8 ff ff ff ff mov $0xffffffff,%eax 4a: f0 0f b1 15 00 00 00 lock cmpxchg %edx,0x0(%rip) 51: 00 52: 0f 95 c0 setne %al No functional change intended. Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Signed-off-by: Uros Bizjak Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20231114170038.381634-1-ubizjak@gmail.com Signed-off-by: Wei Liu Message-ID: <20231114170038.381634-1-ubizjak@gmail.com> --- arch/x86/kernel/cpu/mshyperv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e6bba12c759cb..01fa06dd06b66 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -262,11 +262,14 @@ static uint32_t __init ms_hyperv_platform(void) static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) { static atomic_t nmi_cpu = ATOMIC_INIT(-1); + unsigned int old_cpu, this_cpu; if (!unknown_nmi_panic) return NMI_DONE; - if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1) + old_cpu = -1; + this_cpu = raw_smp_processor_id(); + if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu)) return NMI_HANDLED; return NMI_DONE; From e389fe8b68137344562fb6e4d53d8a89ef6212dd Mon Sep 17 00:00:00 2001 From: Victor Fragoso Date: Tue, 21 Nov 2023 21:05:56 +0000 Subject: [PATCH 488/560] USB: serial: option: add Fibocom L7xx modules Add support for Fibocom L716-EU module series. L716-EU is a Fibocom module based on ZTE's V3E/V3T chipset. Device creates multiple interfaces when connected to PC as follows: - Network Interface: ECM or RNDIS (set by FW or AT Command) - ttyUSB0: AT port - ttyUSB1: Modem port - ttyUSB2: AT2 port - ttyUSB3: Trace port for log information - ADB: ADB port for debugging. ("Driver=usbfs" when ADB server enabled) Here are the outputs of lsusb and usb-devices: $ ls /dev/ttyUSB* /dev/ttyUSB0 /dev/ttyUSB1 /dev/ttyUSB2 /dev/ttyUSB3 usb-devices: L716-EU (ECM mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 51 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=06 Prot=00 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=06 Prot=00 Driver=cdc_ether E: Ad=87(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms L716-EU (RNDIS mode): T: Bus=03 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 49 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0001 Rev= 1.00 S: Manufacturer=Fibocom,Incorporated S: Product=Fibocom Mobile Boardband S: SerialNumber=1234567890ABCDEF C:* #Ifs= 7 Cfg#= 1 Atr=e0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=02 Prot=ff Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Victor Fragoso Reviewed-by: Lars Melin Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 9c76095ebfe14..06b9b04c022a6 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2250,6 +2250,7 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(4) | RSVD(5) | RSVD(6) }, { USB_DEVICE(0x1782, 0x4d10) }, /* Fibocom L610 (AT mode) */ { USB_DEVICE_INTERFACE_CLASS(0x1782, 0x4d11, 0xff) }, /* Fibocom L610 (ECM/RNDIS mode) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2cb7, 0x0001, 0xff, 0xff, 0xff) }, /* Fibocom L716-EU (ECM/RNDIS mode) */ { USB_DEVICE(0x2cb7, 0x0104), /* Fibocom NL678 series */ .driver_info = RSVD(4) | RSVD(5) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ From 06ae5afce8cc1f7621cc5c7751e449ce20d68af7 Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Fri, 17 Nov 2023 14:15:55 +1300 Subject: [PATCH 489/560] HID: hid-asus: add const to read-only outgoing usb buffer In the function asus_kbd_set_report the parameter buf is read-only as it gets copied in a memory portion suitable for USB transfer, but the parameter is not marked as const: add the missing const and mark const immutable buffers passed to that function. Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Signed-off-by: Jiri Kosina --- drivers/hid/hid-asus.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index fd61dba882338..b70673a929a1e 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -381,7 +381,7 @@ static int asus_raw_event(struct hid_device *hdev, return 0; } -static int asus_kbd_set_report(struct hid_device *hdev, u8 *buf, size_t buf_size) +static int asus_kbd_set_report(struct hid_device *hdev, const u8 *buf, size_t buf_size) { unsigned char *dmabuf; int ret; @@ -404,7 +404,7 @@ static int asus_kbd_set_report(struct hid_device *hdev, u8 *buf, size_t buf_size static int asus_kbd_init(struct hid_device *hdev) { - u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x41, 0x53, 0x55, 0x53, 0x20, 0x54, + const u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x41, 0x53, 0x55, 0x53, 0x20, 0x54, 0x65, 0x63, 0x68, 0x2e, 0x49, 0x6e, 0x63, 0x2e, 0x00 }; int ret; @@ -418,7 +418,7 @@ static int asus_kbd_init(struct hid_device *hdev) static int asus_kbd_get_functions(struct hid_device *hdev, unsigned char *kbd_func) { - u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x05, 0x20, 0x31, 0x00, 0x08 }; + const u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x05, 0x20, 0x31, 0x00, 0x08 }; u8 *readbuf; int ret; @@ -449,7 +449,7 @@ static int asus_kbd_get_functions(struct hid_device *hdev, static int rog_nkey_led_init(struct hid_device *hdev) { - u8 buf_init_start[] = { FEATURE_KBD_LED_REPORT_ID1, 0xB9 }; + const u8 buf_init_start[] = { FEATURE_KBD_LED_REPORT_ID1, 0xB9 }; u8 buf_init2[] = { FEATURE_KBD_LED_REPORT_ID1, 0x41, 0x53, 0x55, 0x53, 0x20, 0x54, 0x65, 0x63, 0x68, 0x2e, 0x49, 0x6e, 0x63, 0x2e, 0x00 }; u8 buf_init3[] = { FEATURE_KBD_LED_REPORT_ID1, From 546edbd26cff7ae990e480a59150e801a06f77b1 Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Fri, 17 Nov 2023 14:15:56 +1300 Subject: [PATCH 490/560] HID: hid-asus: reset the backlight brightness level on resume Some devices managed by this driver automatically set brightness to 0 before entering a suspended state and reset it back to a default brightness level after the resume: this has the effect of having the kernel report wrong brightness status after a sleep, and on some devices (like the Asus RC71L) that brightness is the intensity of LEDs directly facing the user. Fix the above issue by setting back brightness to the level it had before entering a sleep state. Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Signed-off-by: Jiri Kosina --- drivers/hid/hid-asus.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index b70673a929a1e..78cdfb8b9a7ae 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -1000,6 +1000,24 @@ static int asus_start_multitouch(struct hid_device *hdev) return 0; } +static int __maybe_unused asus_resume(struct hid_device *hdev) { + struct asus_drvdata *drvdata = hid_get_drvdata(hdev); + int ret = 0; + + if (drvdata->kbd_backlight) { + const u8 buf[] = { FEATURE_KBD_REPORT_ID, 0xba, 0xc5, 0xc4, + drvdata->kbd_backlight->cdev.brightness }; + ret = asus_kbd_set_report(hdev, buf, sizeof(buf)); + if (ret < 0) { + hid_err(hdev, "Asus failed to set keyboard backlight: %d\n", ret); + goto asus_resume_err; + } + } + +asus_resume_err: + return ret; +} + static int __maybe_unused asus_reset_resume(struct hid_device *hdev) { struct asus_drvdata *drvdata = hid_get_drvdata(hdev); @@ -1294,6 +1312,7 @@ static struct hid_driver asus_driver = { .input_configured = asus_input_configured, #ifdef CONFIG_PM .reset_resume = asus_reset_resume, + .resume = asus_resume, #endif .event = asus_event, .raw_event = asus_raw_event From 9ffccb691adb854e7b7f3ee57fbbda12ff70533f Mon Sep 17 00:00:00 2001 From: Aoba K Date: Tue, 21 Nov 2023 20:23:11 +0800 Subject: [PATCH 491/560] HID: multitouch: Add quirk for HONOR GLO-GXXX touchpad Honor MagicBook 13 2023 has a touchpad which do not switch to the multitouch mode until the input mode feature is written by the host. The touchpad do report the input mode at touchpad(3), while itself working under mouse mode. As a workaround, it is possible to call MT_QUIRE_FORCE_GET_FEATURE to force set feature in mt_set_input_mode for such device. The touchpad reports as BLTP7853, which cannot retrive any useful manufacture information on the internel by this string at present. As the serial number of the laptop is GLO-G52, while DMI info reports the laptop serial number as GLO-GXXX, this workaround should applied to all models which has the GLO-GXXX. Signed-off-by: Aoba K Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index e098cc7b39443..fd5b0637dad68 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -2046,6 +2046,11 @@ static const struct hid_device_id mt_devices[] = { MT_USB_DEVICE(USB_VENDOR_ID_HANVON_ALT, USB_DEVICE_ID_HANVON_ALT_MULTITOUCH) }, + /* HONOR GLO-GXXX panel */ + { .driver_data = MT_CLS_VTL, + HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, + 0x347d, 0x7853) }, + /* Ilitek dual touch panel */ { .driver_data = MT_CLS_NSMU, MT_USB_DEVICE(USB_VENDOR_ID_ILITEK, From 84d2db91f14a32dc856a5972e3f0907089093c7a Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Tue, 21 Nov 2023 15:53:57 +0800 Subject: [PATCH 492/560] nfc: virtual_ncidev: Add variable to check if ndev is running syzbot reported an memory leak that happens when an skb is add to send_buff after virtual nci closed. This patch adds a variable to track if the ndev is running before handling new skb in send function. Signed-off-by: Nguyen Dinh Phi Reported-by: syzbot+6eb09d75211863f15e3e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/00000000000075472b06007df4fb@google.com Reviewed-by: Bongsu Jeon Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/virtual_ncidev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/virtual_ncidev.c b/drivers/nfc/virtual_ncidev.c index b027be0b0b6ff..590b038e449e5 100644 --- a/drivers/nfc/virtual_ncidev.c +++ b/drivers/nfc/virtual_ncidev.c @@ -26,10 +26,14 @@ struct virtual_nci_dev { struct mutex mtx; struct sk_buff *send_buff; struct wait_queue_head wq; + bool running; }; static int virtual_nci_open(struct nci_dev *ndev) { + struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); + + vdev->running = true; return 0; } @@ -40,6 +44,7 @@ static int virtual_nci_close(struct nci_dev *ndev) mutex_lock(&vdev->mtx); kfree_skb(vdev->send_buff); vdev->send_buff = NULL; + vdev->running = false; mutex_unlock(&vdev->mtx); return 0; @@ -50,7 +55,7 @@ static int virtual_nci_send(struct nci_dev *ndev, struct sk_buff *skb) struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); - if (vdev->send_buff) { + if (vdev->send_buff || !vdev->running) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; From e6d71b437abc2f249e3b6a1ae1a7228e09c6e563 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 22 Nov 2023 10:37:05 +0800 Subject: [PATCH 493/560] net/smc: avoid data corruption caused by decline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found a data corruption issue during testing of SMC-R on Redis applications. The benchmark has a low probability of reporting a strange error as shown below. "Error: Protocol error, got "\xe2" as reply type byte" Finally, we found that the retrieved error data was as follows: 0xE2 0xD4 0xC3 0xD9 0x04 0x00 0x2C 0x20 0xA6 0x56 0x00 0x16 0x3E 0x0C 0xCB 0x04 0x02 0x01 0x00 0x00 0x20 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0xE2 It is quite obvious that this is a SMC DECLINE message, which means that the applications received SMC protocol message. We found that this was caused by the following situations: client server ¦ clc proposal -------------> ¦ clc accept <------------- ¦ clc confirm -------------> wait llc confirm send llc confirm ¦failed llc confirm ¦ x------ (after 2s)timeout wait llc confirm rsp wait decline (after 1s) timeout (after 2s) timeout ¦ decline --------------> ¦ decline <-------------- As a result, a decline message was sent in the implementation, and this message was read from TCP by the already-fallback connection. This patch double the client timeout as 2x of the server value, With this simple change, the Decline messages should never cross or collide (during Confirm link timeout). This issue requires an immediate solution, since the protocol updates involve a more long-term solution. Fixes: 0fb0b02bd6fd ("net/smc: adapt SMC client code to use the LLC flow") Signed-off-by: D. Wythe Reviewed-by: Wen Gu Reviewed-by: Wenjia Zhang Signed-off-by: David S. Miller --- net/smc/af_smc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da97f946b79b0..2a1388841951e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -598,8 +598,12 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - /* receive CONFIRM LINK request from server over RoCE fabric */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + /* Receive CONFIRM LINK request from server over RoCE fabric. + * Increasing the client's timeout by twice as much as the server's + * timeout by default can temporarily avoid decline messages of + * both sides crossing or colliding + */ + qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; From 372ee6a3368ec6ff46ee4e6ff4ffe2fe1e059dbb Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 21 Nov 2023 21:32:05 +0100 Subject: [PATCH 494/560] usb: misc: ljca: Fix enumeration error on Dell Latitude 9420 Not all LJCA chips implement SPI and on chips without SPI reading the SPI descriptors will timeout. On laptop models like the Dell Latitude 9420, this is expected behavior and not an error. Modify the driver to continue without instantiating a SPI auxbus child, instead of failing to probe() the whole LJCA chip. Fixes: acd6199f195d ("usb: Add support for Intel LJCA device") Signed-off-by: Hans de Goede Reviewed-by: Wentong Wu Link: https://lore.kernel.org/r/20231104175104.38786-1-hdegoede@redhat.com Link: https://lore.kernel.org/r/20231121203205.223047-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/usb-ljca.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/usb-ljca.c b/drivers/usb/misc/usb-ljca.c index 7f0deebebc130..35770e608c649 100644 --- a/drivers/usb/misc/usb-ljca.c +++ b/drivers/usb/misc/usb-ljca.c @@ -646,10 +646,11 @@ static int ljca_enumerate_spi(struct ljca_adapter *adap) unsigned int i; int ret; + /* Not all LJCA chips implement SPI, a timeout reading the descriptors is normal */ ret = ljca_send(adap, LJCA_CLIENT_MNG, LJCA_MNG_ENUM_SPI, NULL, 0, buf, sizeof(buf), true, LJCA_ENUM_CLIENT_TIMEOUT_MS); if (ret < 0) - return ret; + return (ret == -ETIMEDOUT) ? 0 : ret; /* check firmware response */ desc = (struct ljca_spi_descriptor *)buf; From 6a26310273c323380da21eb23fcfd50e31140913 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 21 Nov 2023 09:09:33 +0100 Subject: [PATCH 495/560] Revert "net: r8169: Disable multicast filter for RTL8168H and RTL8107E" This reverts commit efa5f1311c4998e9e6317c52bc5ee93b3a0f36df. I couldn't reproduce the reported issue. What I did, based on a pcap packet log provided by the reporter: - Used same chip version (RTL8168h) - Set MAC address to the one used on the reporters system - Replayed the EAPOL unicast packet that, according to the reporter, was filtered out by the mc filter. The packet was properly received. Therefore the root cause of the reported issue seems to be somewhere else. Disabling mc filtering completely for the most common chip version is a quite big hammer. Therefore revert the change and wait for further analysis results from the reporter. Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index b9bb1d2f0237e..295366a85c630 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2599,9 +2599,7 @@ static void rtl_set_rx_mode(struct net_device *dev) rx_mode &= ~AcceptMulticast; } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || dev->flags & IFF_ALLMULTI || - tp->mac_version == RTL_GIGA_MAC_VER_35 || - tp->mac_version == RTL_GIGA_MAC_VER_46 || - tp->mac_version == RTL_GIGA_MAC_VER_48) { + tp->mac_version == RTL_GIGA_MAC_VER_35) { /* accept all multicasts */ } else if (netdev_mc_empty(dev)) { rx_mode &= ~AcceptMulticast; From d0c930b745cafde8e7d25d0356c648bca669556a Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 13 Nov 2023 15:59:20 +0100 Subject: [PATCH 496/560] dt-bindings: usb: microchip,usb5744: Add second supply The USB5744 has two power supplies one for 3V3 and one for 1V2. Add the second supply to the USB5744 DT binding. Signed-off-by: Stefan Eichenberger Signed-off-by: Francesco Dolcini Acked-by: Conor Dooley Cc: stable Link: https://lore.kernel.org/r/20231113145921.30104-2-francesco@dolcini.it Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/usb/microchip,usb5744.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/microchip,usb5744.yaml b/Documentation/devicetree/bindings/usb/microchip,usb5744.yaml index ff3a1707ef570..6d4cfd943f584 100644 --- a/Documentation/devicetree/bindings/usb/microchip,usb5744.yaml +++ b/Documentation/devicetree/bindings/usb/microchip,usb5744.yaml @@ -36,7 +36,11 @@ properties: vdd-supply: description: - VDD power supply to the hub + 3V3 power supply to the hub + + vdd2-supply: + description: + 1V2 power supply to the hub peer-hub: $ref: /schemas/types.yaml#/definitions/phandle @@ -62,6 +66,7 @@ allOf: properties: reset-gpios: false vdd-supply: false + vdd2-supply: false peer-hub: false i2c-bus: false else: From 6972b38ca05235f6142715db7062ecc87a422e22 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 13 Nov 2023 15:59:21 +0100 Subject: [PATCH 497/560] usb: misc: onboard-hub: add support for Microchip USB5744 Add support for the Microchip USB5744 USB3.0 and USB2.0 Hub. The Microchip USB5744 supports two power supplies, one for 1V2 and one for 3V3. According to the datasheet there is no need for a delay between power on and reset, so this value is set to 0. Signed-off-by: Stefan Eichenberger Signed-off-by: Francesco Dolcini Cc: stable Acked-by: Matthias Kaehlcke Link: https://lore.kernel.org/r/20231113145921.30104-3-francesco@dolcini.it Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/onboard_usb_hub.c | 2 ++ drivers/usb/misc/onboard_usb_hub.h | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c index a341b2fbb7b44..2b45404e9732c 100644 --- a/drivers/usb/misc/onboard_usb_hub.c +++ b/drivers/usb/misc/onboard_usb_hub.c @@ -432,6 +432,8 @@ static const struct usb_device_id onboard_hub_id_table[] = { { USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2412) }, /* USB2412 USB 2.0 */ { USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2514) }, /* USB2514B USB 2.0 */ { USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2517) }, /* USB2517 USB 2.0 */ + { USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2744) }, /* USB5744 USB 2.0 */ + { USB_DEVICE(VENDOR_ID_MICROCHIP, 0x5744) }, /* USB5744 USB 3.0 */ { USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.1 */ { USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.1 */ { USB_DEVICE(VENDOR_ID_REALTEK, 0x0414) }, /* RTS5414 USB 3.2 */ diff --git a/drivers/usb/misc/onboard_usb_hub.h b/drivers/usb/misc/onboard_usb_hub.h index c4e24a7b92904..292110e64a1d9 100644 --- a/drivers/usb/misc/onboard_usb_hub.h +++ b/drivers/usb/misc/onboard_usb_hub.h @@ -16,6 +16,11 @@ static const struct onboard_hub_pdata microchip_usb424_data = { .num_supplies = 1, }; +static const struct onboard_hub_pdata microchip_usb5744_data = { + .reset_us = 0, + .num_supplies = 2, +}; + static const struct onboard_hub_pdata realtek_rts5411_data = { .reset_us = 0, .num_supplies = 1, @@ -50,6 +55,8 @@ static const struct of_device_id onboard_hub_match[] = { { .compatible = "usb424,2412", .data = µchip_usb424_data, }, { .compatible = "usb424,2514", .data = µchip_usb424_data, }, { .compatible = "usb424,2517", .data = µchip_usb424_data, }, + { .compatible = "usb424,2744", .data = µchip_usb5744_data, }, + { .compatible = "usb424,5744", .data = µchip_usb5744_data, }, { .compatible = "usb451,8140", .data = &ti_tusb8041_data, }, { .compatible = "usb451,8142", .data = &ti_tusb8041_data, }, { .compatible = "usb4b4,6504", .data = &cypress_hx3_data, }, From 0c2671f33a9c975d752216739d6a05cb88e98aa4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 20 Nov 2023 17:16:05 +0100 Subject: [PATCH 498/560] dt-bindings: usb: qcom,dwc3: fix example wakeup interrupt types The DP/DM wakeup interrupts are edge triggered and which edge to trigger on depends on use-case and whether a Low speed or Full/High speed device is connected. Fixes: 3828026c9ec8 ("dt-bindings: usb: qcom,dwc3: Convert USB DWC3 bindings") Signed-off-by: Johan Hovold Acked-by: Krzysztof Kozlowski Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/20231120161607.7405-2-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/qcom,dwc3.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/qcom,dwc3.yaml b/Documentation/devicetree/bindings/usb/qcom,dwc3.yaml index e889158ca2057..915c8205623b3 100644 --- a/Documentation/devicetree/bindings/usb/qcom,dwc3.yaml +++ b/Documentation/devicetree/bindings/usb/qcom,dwc3.yaml @@ -521,8 +521,8 @@ examples: interrupts = , , - , - ; + , + ; interrupt-names = "hs_phy_irq", "ss_phy_irq", "dm_hs_phy_irq", "dp_hs_phy_irq"; From 41f5a0973259db9e4e3c9963d36505f80107d1a0 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 20 Nov 2023 17:16:06 +0100 Subject: [PATCH 499/560] USB: dwc3: qcom: fix wakeup after probe deferral The Qualcomm glue driver is overriding the interrupt trigger types defined by firmware when requesting the wakeup interrupts during probe. This can lead to a failure to map the DP/DM wakeup interrupts after a probe deferral as the firmware defined trigger types do not match the type used for the initial mapping: irq: type mismatch, failed to map hwirq-14 for interrupt-controller@b220000! irq: type mismatch, failed to map hwirq-15 for interrupt-controller@b220000! Fix this by not overriding the firmware provided trigger types when requesting the wakeup interrupts. Fixes: a4333c3a6ba9 ("usb: dwc3: Add Qualcomm DWC3 glue driver") Cc: stable@vger.kernel.org # 4.18 Signed-off-by: Johan Hovold Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/20231120161607.7405-3-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index 3de43df6bbe81..cf200342273af 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -549,7 +549,7 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + IRQF_ONESHOT, "qcom_dwc3 HS", qcom); if (ret) { dev_err(qcom->dev, "hs_phy_irq failed: %d\n", ret); @@ -564,7 +564,7 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + IRQF_ONESHOT, "qcom_dwc3 DP_HS", qcom); if (ret) { dev_err(qcom->dev, "dp_hs_phy_irq failed: %d\n", ret); @@ -579,7 +579,7 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + IRQF_ONESHOT, "qcom_dwc3 DM_HS", qcom); if (ret) { dev_err(qcom->dev, "dm_hs_phy_irq failed: %d\n", ret); @@ -594,7 +594,7 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_TRIGGER_HIGH | IRQF_ONESHOT, + IRQF_ONESHOT, "qcom_dwc3 SS", qcom); if (ret) { dev_err(qcom->dev, "ss_phy_irq failed: %d\n", ret); From aee70a1d711327dae409671035a0368c1dc4a2ea Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 20 Nov 2023 17:16:07 +0100 Subject: [PATCH 500/560] USB: dwc3: qcom: simplify wakeup interrupt setup Use the IRQF_NO_AUTOEN irq flag when requesting the wakeup interrupts instead of setting it separately. No functional change intended. Signed-off-by: Johan Hovold Reviewed-by: Andrew Halaney Link: https://lore.kernel.org/r/20231120161607.7405-4-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index cf200342273af..8a76973f1fa2b 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -546,10 +546,9 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) pdata ? pdata->hs_phy_irq_index : -1); if (irq > 0) { /* Keep wakeup interrupts disabled until suspend */ - irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "qcom_dwc3 HS", qcom); if (ret) { dev_err(qcom->dev, "hs_phy_irq failed: %d\n", ret); @@ -561,10 +560,9 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq = dwc3_qcom_get_irq(pdev, "dp_hs_phy_irq", pdata ? pdata->dp_hs_phy_irq_index : -1); if (irq > 0) { - irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "qcom_dwc3 DP_HS", qcom); if (ret) { dev_err(qcom->dev, "dp_hs_phy_irq failed: %d\n", ret); @@ -576,10 +574,9 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq = dwc3_qcom_get_irq(pdev, "dm_hs_phy_irq", pdata ? pdata->dm_hs_phy_irq_index : -1); if (irq > 0) { - irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "qcom_dwc3 DM_HS", qcom); if (ret) { dev_err(qcom->dev, "dm_hs_phy_irq failed: %d\n", ret); @@ -591,10 +588,9 @@ static int dwc3_qcom_setup_irq(struct platform_device *pdev) irq = dwc3_qcom_get_irq(pdev, "ss_phy_irq", pdata ? pdata->ss_phy_irq_index : -1); if (irq > 0) { - irq_set_status_flags(irq, IRQ_NOAUTOEN); ret = devm_request_threaded_irq(qcom->dev, irq, NULL, qcom_dwc3_resume_irq, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "qcom_dwc3 SS", qcom); if (ret) { dev_err(qcom->dev, "ss_phy_irq failed: %d\n", ret); From 51392a1879ff06dc21b68aef4825f6ef68a7be42 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Nov 2023 18:36:48 +0100 Subject: [PATCH 501/560] USB: dwc3: qcom: fix resource leaks on probe deferral The driver needs to deregister and free the newly allocated dwc3 core platform device on ACPI probe errors (e.g. probe deferral) and on driver unbind but instead it leaked those resources while erroneously dropping a reference to the parent platform device which is still in use. For OF probing the driver takes a reference to the dwc3 core platform device which has also always been leaked. Fix the broken ACPI tear down and make sure to drop the dwc3 core reference for both OF and ACPI. Fixes: 8fd95da2cfb5 ("usb: dwc3: qcom: Release the correct resources in dwc3_qcom_remove()") Fixes: 2bc02355f8ba ("usb: dwc3: qcom: Add support for booting with ACPI") Fixes: a4333c3a6ba9 ("usb: dwc3: Add Qualcomm DWC3 glue driver") Cc: stable@vger.kernel.org # 4.18 Cc: Christophe JAILLET Cc: Lee Jones Signed-off-by: Johan Hovold Acked-by: Andrew Halaney Link: https://lore.kernel.org/r/20231117173650.21161-2-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index 8a76973f1fa2b..313a8ac2bd601 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -754,6 +754,7 @@ static int dwc3_qcom_of_register_core(struct platform_device *pdev) if (!qcom->dwc3) { ret = -ENODEV; dev_err(dev, "failed to get dwc3 platform device\n"); + of_platform_depopulate(dev); } node_put: @@ -895,7 +896,7 @@ static int dwc3_qcom_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "failed to register DWC3 Core, err=%d\n", ret); - goto depopulate; + goto clk_disable; } ret = dwc3_qcom_interconnect_init(qcom); @@ -930,7 +931,8 @@ static int dwc3_qcom_probe(struct platform_device *pdev) if (np) of_platform_depopulate(&pdev->dev); else - platform_device_put(pdev); + platform_device_del(qcom->dwc3); + platform_device_put(qcom->dwc3); clk_disable: for (i = qcom->num_clocks - 1; i >= 0; i--) { clk_disable_unprepare(qcom->clks[i]); @@ -953,7 +955,8 @@ static void dwc3_qcom_remove(struct platform_device *pdev) if (np) of_platform_depopulate(&pdev->dev); else - platform_device_put(pdev); + platform_device_del(qcom->dwc3); + platform_device_put(qcom->dwc3); for (i = qcom->num_clocks - 1; i >= 0; i--) { clk_disable_unprepare(qcom->clks[i]); From 9feefbf57d92e8ee293dad67585d351c7d0b6e37 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Nov 2023 18:36:49 +0100 Subject: [PATCH 502/560] USB: dwc3: qcom: fix software node leak on probe errors Make sure to remove the software node also on (ACPI) probe errors to avoid leaking the underlying resources. Note that the software node is only used for ACPI probe so the driver unbind tear down is updated to match probe. Fixes: 8dc6e6dd1bee ("usb: dwc3: qcom: Constify the software node") Cc: stable@vger.kernel.org # 5.12 Cc: Heikki Krogerus Signed-off-by: Johan Hovold Acked-by: Heikki Krogerus Acked-by: Andrew Halaney Link: https://lore.kernel.org/r/20231117173650.21161-3-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index 313a8ac2bd601..a9bce4548019f 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -928,10 +928,12 @@ static int dwc3_qcom_probe(struct platform_device *pdev) interconnect_exit: dwc3_qcom_interconnect_exit(qcom); depopulate: - if (np) + if (np) { of_platform_depopulate(&pdev->dev); - else + } else { + device_remove_software_node(&qcom->dwc3->dev); platform_device_del(qcom->dwc3); + } platform_device_put(qcom->dwc3); clk_disable: for (i = qcom->num_clocks - 1; i >= 0; i--) { @@ -951,11 +953,12 @@ static void dwc3_qcom_remove(struct platform_device *pdev) struct device *dev = &pdev->dev; int i; - device_remove_software_node(&qcom->dwc3->dev); - if (np) + if (np) { of_platform_depopulate(&pdev->dev); - else + } else { + device_remove_software_node(&qcom->dwc3->dev); platform_device_del(qcom->dwc3); + } platform_device_put(qcom->dwc3); for (i = qcom->num_clocks - 1; i >= 0; i--) { From 9cf87666fc6e08572341fe08ecd909935998fbbd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Nov 2023 18:36:50 +0100 Subject: [PATCH 503/560] USB: dwc3: qcom: fix ACPI platform device leak Make sure to free the "urs" platform device, which is created for some ACPI platforms, on probe errors and on driver unbind. Compile-tested only. Fixes: c25c210f590e ("usb: dwc3: qcom: add URS Host support for sdm845 ACPI boot") Cc: Shawn Guo Signed-off-by: Johan Hovold Acked-by: Andrew Halaney Acked-by: Shawn Guo Link: https://lore.kernel.org/r/20231117173650.21161-4-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-qcom.c | 37 +++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c index a9bce4548019f..fdf6d5d3c2ada 100644 --- a/drivers/usb/dwc3/dwc3-qcom.c +++ b/drivers/usb/dwc3/dwc3-qcom.c @@ -763,9 +763,9 @@ static int dwc3_qcom_of_register_core(struct platform_device *pdev) return ret; } -static struct platform_device * -dwc3_qcom_create_urs_usb_platdev(struct device *dev) +static struct platform_device *dwc3_qcom_create_urs_usb_platdev(struct device *dev) { + struct platform_device *urs_usb = NULL; struct fwnode_handle *fwh; struct acpi_device *adev; char name[8]; @@ -785,9 +785,26 @@ dwc3_qcom_create_urs_usb_platdev(struct device *dev) adev = to_acpi_device_node(fwh); if (!adev) - return NULL; + goto err_put_handle; + + urs_usb = acpi_create_platform_device(adev, NULL); + if (IS_ERR_OR_NULL(urs_usb)) + goto err_put_handle; + + return urs_usb; - return acpi_create_platform_device(adev, NULL); +err_put_handle: + fwnode_handle_put(fwh); + + return urs_usb; +} + +static void dwc3_qcom_destroy_urs_usb_platdev(struct platform_device *urs_usb) +{ + struct fwnode_handle *fwh = urs_usb->dev.fwnode; + + platform_device_unregister(urs_usb); + fwnode_handle_put(fwh); } static int dwc3_qcom_probe(struct platform_device *pdev) @@ -871,13 +888,13 @@ static int dwc3_qcom_probe(struct platform_device *pdev) qcom->qscratch_base = devm_ioremap_resource(dev, parent_res); if (IS_ERR(qcom->qscratch_base)) { ret = PTR_ERR(qcom->qscratch_base); - goto clk_disable; + goto free_urs; } ret = dwc3_qcom_setup_irq(pdev); if (ret) { dev_err(dev, "failed to setup IRQs, err=%d\n", ret); - goto clk_disable; + goto free_urs; } /* @@ -896,7 +913,7 @@ static int dwc3_qcom_probe(struct platform_device *pdev) if (ret) { dev_err(dev, "failed to register DWC3 Core, err=%d\n", ret); - goto clk_disable; + goto free_urs; } ret = dwc3_qcom_interconnect_init(qcom); @@ -935,6 +952,9 @@ static int dwc3_qcom_probe(struct platform_device *pdev) platform_device_del(qcom->dwc3); } platform_device_put(qcom->dwc3); +free_urs: + if (qcom->urs_usb) + dwc3_qcom_destroy_urs_usb_platdev(qcom->urs_usb); clk_disable: for (i = qcom->num_clocks - 1; i >= 0; i--) { clk_disable_unprepare(qcom->clks[i]); @@ -961,6 +981,9 @@ static void dwc3_qcom_remove(struct platform_device *pdev) } platform_device_put(qcom->dwc3); + if (qcom->urs_usb) + dwc3_qcom_destroy_urs_usb_platdev(qcom->urs_usb); + for (i = qcom->num_clocks - 1; i >= 0; i--) { clk_disable_unprepare(qcom->clks[i]); clk_put(qcom->clks[i]); From 4711b7b8f99583f6105a33e91f106125134beacb Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 30 Oct 2023 11:41:33 +0100 Subject: [PATCH 504/560] s390/pai: cleanup event initialization Setting event::hw.last_tag to zero is not necessary. The memory for each event is dynamically allocated by the kernel common code and initialized to zero already. Remove this unnecessary assignment. Move the comment to function paicrypt_start() for clarification. Suggested-by: Sumanth Korikkar Acked-by: Sumanth Korikkar Signed-off-by: Thomas Richter Signed-off-by: Alexander Gordeev --- arch/s390/kernel/perf_pai_crypto.c | 11 +++++------ arch/s390/kernel/perf_pai_ext.c | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index 77fd24e6cbb64..39a91b00438a7 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -279,12 +279,6 @@ static int paicrypt_event_init(struct perf_event *event) if (IS_ERR(cpump)) return PTR_ERR(cpump); - /* Event initialization sets last_tag to 0. When later on the events - * are deleted and re-added, do not reset the event count value to zero. - * Events are added, deleted and re-added when 2 or more events - * are active at the same time. - */ - event->hw.last_tag = 0; event->destroy = paicrypt_event_destroy; if (a->sample_period) { @@ -318,6 +312,11 @@ static void paicrypt_start(struct perf_event *event, int flags) { u64 sum; + /* Event initialization sets last_tag to 0. When later on the events + * are deleted and re-added, do not reset the event count value to zero. + * Events are added, deleted and re-added when 2 or more events + * are active at the same time. + */ if (!event->hw.last_tag) { event->hw.last_tag = 1; sum = paicrypt_getall(event); /* Get current value */ diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 8ba0f1a3a39dc..e7013a2e89605 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -260,7 +260,6 @@ static int paiext_event_init(struct perf_event *event) rc = paiext_alloc(a, event); if (rc) return rc; - event->hw.last_tag = 0; event->destroy = paiext_event_destroy; if (a->sample_period) { From 673752a839694133a328610fcbc54f3d59ae87f3 Mon Sep 17 00:00:00 2001 From: Mikhail Zaslonko Date: Wed, 8 Nov 2023 18:18:52 +0100 Subject: [PATCH 505/560] s390/ipl: add missing IPL_TYPE_ECKD_DUMP case to ipl_init() Add missing IPL_TYPE_ECKD_DUMP case to ipl_init() creating ECKD ipl device attribute group similar to IPL_TYPE_ECKD case. Commit e2d2a2968f2a ("s390/ipl: add eckd dump support") should have had it from the beginning. Fixes: e2d2a2968f2a ("s390/ipl: add eckd dump support") Signed-off-by: Mikhail Zaslonko Reviewed-by: Sven Schnelle Signed-off-by: Alexander Gordeev --- arch/s390/kernel/ipl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index cc364fce6aa96..ba75f6bee7742 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -666,6 +666,7 @@ static int __init ipl_init(void) &ipl_ccw_attr_group_lpar); break; case IPL_TYPE_ECKD: + case IPL_TYPE_ECKD_DUMP: rc = sysfs_create_group(&ipl_kset->kobj, &ipl_eckd_attr_group); break; case IPL_TYPE_FCP: From 0a9ace1117bbaa25687468af703b472235f5c210 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 15 Nov 2023 11:39:02 +0100 Subject: [PATCH 506/560] s390: remove odd comment In the meantime hopefully most people got used to forward declarations, therefore remove the explanation. Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/processor.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index dc17896a001a9..c15eadbb99834 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -228,7 +228,6 @@ typedef struct thread_struct thread_struct; execve_tail(); \ } while (0) -/* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct seq_file; From aab1f809d7540def24498e81347740a7239a74d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 20 Nov 2023 13:00:00 +0100 Subject: [PATCH 507/560] scripts/checkstack.pl: match all stack sizes for s390 For some unknown reason the regular expression for checkstack only matches three digit numbers starting with the number "3", or any higher number. Which means that it skips any stack sizes smaller than 304 bytes. This makes the checkstack script a bit less useful than it could be. Change the script to match any number. To be filtered out stack sizes can be configured with the min_stack variable, which omits any stack frame sizes smaller than 100 bytes by default. Tested-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- scripts/checkstack.pl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index 84f5fb7f1cecc..d83ba5d8f3f49 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -97,8 +97,7 @@ # 11160: a7 fb ff 60 aghi %r15,-160 # or # 100092: e3 f0 ff c8 ff 71 lay %r15,-56(%r15) - $re = qr/.*(?:lay|ag?hi).*\%r15,-(([0-9]{2}|[3-9])[0-9]{2}) - (?:\(\%r15\))?$/ox; + $re = qr/.*(?:lay|ag?hi).*\%r15,-([0-9]+)(?:\(\%r15\))?$/o; } elsif ($arch eq 'sparc' || $arch eq 'sparc64') { # f0019d10: 9d e3 bf 90 save %sp, -112, %sp $re = qr/.*save.*%sp, -(([0-9]{2}|[3-9])[0-9]{2}), %sp/o; From 3af755a46881c32fecaecfdeaf3a8f0a869deca5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 21 Nov 2023 09:01:03 +0100 Subject: [PATCH 508/560] nvme: move nvme_stop_keep_alive() back to original position Stopping keep-alive not only stops the keep-alive workqueue, but also needs to be synchronized with I/O termination as we must not send a keep-alive command after all I/O had been terminated. So to avoid any regressions move the call to stop_keep_alive() back to its original position and ensure that keep-alive is correctly stopped failing to setup the admin queue. Fixes: 4733b65d82bd ("nvme: start keep-alive after admin queue setup") Suggested-by: Sagi Grimberg Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 19 ++++++++----------- drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd28e6b6574c0..46a4c9c5ea962 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -482,7 +482,6 @@ EXPORT_SYMBOL_GPL(nvme_cancel_tagset); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl) { - nvme_stop_keep_alive(ctrl); if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); @@ -4355,6 +4354,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_auth_stop(ctrl); + nvme_stop_keep_alive(ctrl); nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 49c3e46eaa1ee..9f9a3b35dc64d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2530,12 +2530,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * clean up the admin queue. Same thing as above. */ nvme_quiesce_admin_queue(&ctrl->ctrl); - - /* - * Open-coding nvme_cancel_admin_tagset() as fc - * is not using nvme_cancel_request(). - */ - nvme_stop_keep_alive(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -3138,11 +3132,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) nvme_unquiesce_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); - if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) - ret = -EIO; if (ret) goto out_disconnect_admin_queue; - + if (test_bit(ASSOC_FAILED, &ctrl->flags)) { + ret = -EIO; + goto out_stop_keep_alive; + } /* sanity checks */ /* FC-NVME does not have other data in the capsule */ @@ -3150,7 +3145,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", ctrl->ctrl.icdoff); ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_disconnect_admin_queue; + goto out_stop_keep_alive; } /* FC-NVME supports normal SGL Data Block Descriptors */ @@ -3158,7 +3153,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) dev_err(ctrl->ctrl.device, "Mandatory sgls are not supported!\n"); ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_disconnect_admin_queue; + goto out_stop_keep_alive; } if (opts->queue_size > ctrl->ctrl.maxcmd) { @@ -3205,6 +3200,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) out_term_aen_ops: nvme_fc_term_aen_ops(ctrl); +out_stop_keep_alive: + nvme_stop_keep_alive(&ctrl->ctrl); out_disconnect_admin_queue: dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: create_assoc failed, assoc_id %llx ret %d\n", diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a7fea4cbacd75..6d178d5559204 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1080,6 +1080,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) nvme_rdma_free_io_queues(ctrl); } destroy_admin: + nvme_stop_keep_alive(&ctrl->ctrl); nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6ed7948155174..ddcd23fb8b75d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2237,6 +2237,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: + nvme_stop_keep_alive(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); return ret; } From 125b0bb95dd6bec81b806b997a4ccb026eeecf8f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Nov 2023 22:22:13 -0800 Subject: [PATCH 509/560] asm-generic: qspinlock: fix queued_spin_value_unlocked() implementation We really don't want to do atomic_read() or anything like that, since we already have the value, not the lock. The whole point of this is that we've loaded the lock from memory, and we want to check whether the value we loaded was a locked one or not. The main use of this is the lockref code, which loads both the lock and the reference count in one atomic operation, and then works on that combined value. With the atomic_read(), the compiler would pointlessly spill the value to the stack, in order to then be able to read it back "atomically". This is the qspinlock version of commit c6f4a9002252 ("asm-generic: ticket-lock: Optimize arch_spin_value_unlocked()") which fixed this same bug for ticket locks. Cc: Guo Ren Cc: Ingo Molnar Cc: Waiman Long Link: https://lore.kernel.org/all/CAHk-=whNRv0v6kQiV5QO6DJhjH4KEL36vWQ6Re8Csrnh4zbRkQ@mail.gmail.com/ Signed-off-by: Linus Torvalds --- include/asm-generic/qspinlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 995513fa26904..0655aa5b57b29 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -70,7 +70,7 @@ static __always_inline int queued_spin_is_locked(struct qspinlock *lock) */ static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { - return !atomic_read(&lock.val); + return !lock.val.counter; } /** From ed17f7da5f0c8b65b7b5f7c98beb0aadbc0546ee Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 20 Nov 2023 10:31:38 -0800 Subject: [PATCH 510/560] xfs: clean up dqblk extraction Since the introduction of xfs_dqblk in V5, xfs really ought to find the dqblk pointer from the dquot buffer, then compute the xfs_disk_dquot pointer from the dqblk pointer. Fix the open-coded xfs_buf_offset calls and do the type checking in the correct order. Note that this has made no practical difference since the start of the xfs_disk_dquot is coincident with the start of the xfs_dqblk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_dquot.c | 5 +++-- fs/xfs/xfs_dquot_item_recover.c | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ac6ba646624df..a013b87ab8d5e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -562,7 +562,8 @@ xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { - struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); + struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. @@ -1250,7 +1251,7 @@ xfs_qm_dqflush( } /* Flush the incore dquot to the ondisk buffer. */ - dqblk = bp->b_addr + dqp->q_bufoffset; + dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 8966ba842395b..db2cb5e4197b9 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -65,6 +65,7 @@ xlog_recover_dquot_commit_pass2( { struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddq, *recddq; struct xfs_dq_logformat *dq_f; xfs_failaddr_t fa; @@ -130,14 +131,14 @@ xlog_recover_dquot_commit_pass2( return error; ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + dqb = xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = &dqb->dd_diskdq; /* * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ if (xfs_has_crc(mp)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { @@ -147,7 +148,7 @@ xlog_recover_dquot_commit_pass2( memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_has_crc(mp)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } From 9c235dfc3d3f901fe22acb20f2ab37ff39f2ce02 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 20 Nov 2023 10:31:44 -0800 Subject: [PATCH 511/560] xfs: dquot recovery does not validate the recovered dquot When we're recovering ondisk quota records from the log, we need to validate the recovered buffer contents before writing them to disk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_dquot_item_recover.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index db2cb5e4197b9..2c2720ce69238 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_error.h" STATIC void xlog_recover_dquot_ra_pass2( @@ -152,6 +153,19 @@ xlog_recover_dquot_commit_pass2( XFS_DQUOT_CRC_OFF); } + /* Validate the recovered dquot. */ + fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id); + if (fa) { + XFS_CORRUPTION_ERROR("Bad dquot after recovery", + XFS_ERRLEVEL_LOW, mp, dqb, + sizeof(struct xfs_dqblk)); + xfs_alert(mp, + "Metadata corruption detected at %pS, dquot 0x%x", + fa, dq_f->qlf_id); + error = -EFSCORRUPTED; + goto out_release; + } + ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; From acfa60dbe03802d6afd28401aa47801270e82021 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 17 Nov 2023 13:14:22 +0000 Subject: [PATCH 512/560] arm64: mm: Fix "rodata=on" when CONFIG_RODATA_FULL_DEFAULT_ENABLED=y When CONFIG_RODATA_FULL_DEFAULT_ENABLED=y, passing "rodata=on" on the kernel command-line (rather than "rodata=full") should turn off the "full" behaviour, leaving writable linear aliases of read-only kernel memory. Unfortunately, the option has no effect in this situation and the only way to disable the "rodata=full" behaviour is to disable rodata protection entirely by passing "rodata=off". Fix this by parsing the "on" and "off" options in the arch code, additionally enforcing that 'rodata_full' cannot be set without also setting 'rodata_enabled', allowing us to simplify a couple of checks in the process. Fixes: 2e8cff0a0eee ("arm64: fix rodata=full") Cc: Ard Biesheuvel Cc: Mark Rutland Signed-off-by: Will Deacon Reviewed-by: "Russell King (Oracle)" Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231117131422.29663-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/setup.h | 17 +++++++++++++++-- arch/arm64/mm/pageattr.c | 7 +++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index f4af547ef54ca..2e4d7da74fb87 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -21,9 +21,22 @@ static inline bool arch_parse_debug_rodata(char *arg) extern bool rodata_enabled; extern bool rodata_full; - if (arg && !strcmp(arg, "full")) { + if (!arg) + return false; + + if (!strcmp(arg, "full")) { + rodata_enabled = rodata_full = true; + return true; + } + + if (!strcmp(arg, "off")) { + rodata_enabled = rodata_full = false; + return true; + } + + if (!strcmp(arg, "on")) { rodata_enabled = true; - rodata_full = true; + rodata_full = false; return true; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 8e2017ba5f1b1..924843f1f661b 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -29,8 +29,8 @@ bool can_set_direct_map(void) * * KFENCE pool requires page-granular mapping if initialized late. */ - return (rodata_enabled && rodata_full) || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + return rodata_full || debug_pagealloc_enabled() || + arm64_kfence_can_set_direct_map(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -105,8 +105,7 @@ static int change_memory_common(unsigned long addr, int numpages, * If we are manipulating read-only permissions, apply the same * change to the linear mapping of the pages that back this VM area. */ - if (rodata_enabled && - rodata_full && (pgprot_val(set_mask) == PTE_RDONLY || + if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY || pgprot_val(clear_mask) == PTE_RDONLY)) { for (i = 0; i < area->nr_pages; i++) { __change_memory_common((u64)page_address(area->pages[i]), From 4763d635c907baed212664dc579dde1663bb2676 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 21 Nov 2023 18:10:04 -0500 Subject: [PATCH 513/560] eventfs: Use GFP_NOFS for allocation when eventfs_mutex is held If memory reclaim happens, it can reclaim file system pages. The file system pages from eventfs may take the eventfs_mutex on reclaim. This means that allocation while holding the eventfs_mutex must not call into filesystem reclaim. A lockdep splat uncovered this. Link: https://lkml.kernel.org/r/20231121231112.373501894@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: 28e12c09f5aa0 ("eventfs: Save ownership and mode") Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reported-by: Mark Rutland Reviewed-by: Josef Bacik Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 3eb6c622a74d2..56d192f0ead89 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -95,7 +95,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, if (!(dentry->d_inode->i_mode & S_IFDIR)) { if (!ei->entry_attrs) { ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, - GFP_KERNEL); + GFP_NOFS); if (!ei->entry_attrs) { ret = -ENOMEM; goto out; @@ -627,7 +627,7 @@ static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) { struct dentry **tmp; - tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS); if (!tmp) return -1; tmp[cnt] = d; From bcae32c5632fc0a0dbce46fa731cd23403117e66 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 21 Nov 2023 18:10:05 -0500 Subject: [PATCH 514/560] eventfs: Move taking of inode_lock into dcache_dir_open_wrapper() The both create_file_dentry() and create_dir_dentry() takes a boolean parameter "lookup", as on lookup the inode_lock should already be taken, but for dcache_dir_open_wrapper() it is not taken. There's no reason that the dcache_dir_open_wrapper() can't take the inode_lock before calling these functions. In fact, it's better if it does, as the lock can be held throughout both directory and file creations. This also simplifies the code, and possibly prevents unexpected race conditions when the lock is released. Link: https://lkml.kernel.org/r/20231121231112.528544825@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode") Reviewed-by: Josef Bacik Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 56d192f0ead89..590e8176449b9 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -347,15 +347,8 @@ create_file_dentry(struct eventfs_inode *ei, int idx, mutex_unlock(&eventfs_mutex); - /* The lookup already has the parent->d_inode locked */ - if (!lookup) - inode_lock(parent->d_inode); - dentry = create_file(name, mode, attr, parent, data, fops); - if (!lookup) - inode_unlock(parent->d_inode); - mutex_lock(&eventfs_mutex); if (IS_ERR_OR_NULL(dentry)) { @@ -453,15 +446,8 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, } mutex_unlock(&eventfs_mutex); - /* The lookup already has the parent->d_inode locked */ - if (!lookup) - inode_lock(parent->d_inode); - dentry = create_dir(ei, parent); - if (!lookup) - inode_unlock(parent->d_inode); - mutex_lock(&eventfs_mutex); if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { @@ -693,6 +679,7 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) return -ENOMEM; } + inode_lock(parent->d_inode); list_for_each_entry_srcu(ei_child, &ei->children, list, srcu_read_lock_held(&eventfs_srcu)) { d = create_dir_dentry(ei, ei_child, parent, false); @@ -725,6 +712,7 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) cnt++; } } + inode_unlock(parent->d_inode); srcu_read_unlock(&eventfs_srcu, idx); ret = dcache_dir_open(inode, file); From fc4561226feaad5fcdcb55646c348d77b8ee69c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 21 Nov 2023 18:10:06 -0500 Subject: [PATCH 515/560] eventfs: Do not allow NULL parent to eventfs_start_creating() The eventfs directory is dynamically created via the meta data supplied by the existing trace events. All files and directories in eventfs has a parent. Do not allow NULL to be passed into eventfs_start_creating() as the parent because that should never happen. Warn if it does. Link: https://lkml.kernel.org/r/20231121231112.693841807@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Reviewed-by: Josef Bacik Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 5b54948514fe2..ae648deed019c 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -509,20 +509,15 @@ struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) struct dentry *dentry; int error; + /* Must always have a parent. */ + if (WARN_ON_ONCE(!parent)) + return ERR_PTR(-EINVAL); + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); if (error) return ERR_PTR(error); - /* - * If the parent is not specified, we create it in the root. - * We need the root dentry to do this, which is in the super - * block. A pointer to that is in the struct vfsmount that we - * have around. - */ - if (!parent) - parent = tracefs_mount->mnt_root; - if (unlikely(IS_DEADDIR(parent->d_inode))) dentry = ERR_PTR(-ENOENT); else From f49f950c217bfb40f11662bab39cb388d41e4cfb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 21 Nov 2023 18:10:07 -0500 Subject: [PATCH 516/560] eventfs: Make sure that parent->d_inode is locked in creating files/dirs Since the locking of the parent->d_inode has been moved outside the creation of the files and directories (as it use to be locked via a conditional), add a WARN_ON_ONCE() to the case that it's not locked. Link: https://lkml.kernel.org/r/20231121231112.853962542@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Reviewed-by: Josef Bacik Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 590e8176449b9..0b90869fd805c 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -327,6 +327,8 @@ create_file_dentry(struct eventfs_inode *ei, int idx, struct dentry **e_dentry = &ei->d_children[idx]; struct dentry *dentry; + WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); + mutex_lock(&eventfs_mutex); if (ei->is_freed) { mutex_unlock(&eventfs_mutex); @@ -430,6 +432,8 @@ create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, { struct dentry *dentry = NULL; + WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); + mutex_lock(&eventfs_mutex); if (pei->is_freed || ei->is_freed) { mutex_unlock(&eventfs_mutex); From 76d9eafff4484547ed9e606c8227ac9799a9f2da Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 15 Nov 2023 10:50:18 -0500 Subject: [PATCH 517/560] MAINTAINERS: TRACING: Add Mathieu Desnoyers as Reviewer In order to make sure I get CC'd on tracing changes for which my input would be relevant, add my name as reviewer of the TRACING subsystem. Link: https://lore.kernel.org/linux-trace-kernel/20231115155018.8236-1-mathieu.desnoyers@efficios.com Acked-by: Masami Hiramatsu (Google) Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index ea790149af795..a2d4ef4d90f6d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22078,6 +22078,7 @@ F: drivers/watchdog/tqmx86_wdt.c TRACING M: Steven Rostedt M: Masami Hiramatsu +R: Mathieu Desnoyers L: linux-kernel@vger.kernel.org L: linux-trace-kernel@vger.kernel.org S: Maintained From d78abcbabe7e98bb4baa4dea87550806944790ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:17 +0100 Subject: [PATCH 518/560] nvme: target: fix nvme_keyring_id() references In configurations without CONFIG_NVME_TARGET_TCP_TLS, the keyring code might not be available, or using it will result in a runtime failure: x86_64-linux-ld: vmlinux.o: in function `nvmet_ports_make': configfs.c:(.text+0x100a211): undefined reference to `nvme_keyring_id' Add a check to ensure we only check the keyring if there is a chance of it being used, which avoids both the runtime and link-time problems. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-2-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/target/configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9eed6e6765eaa..e307a044b1a1b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1893,7 +1893,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, return ERR_PTR(-ENOMEM); } - if (nvme_keyring_id()) { + if (IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS) && nvme_keyring_id()) { port->keyring = key_lookup(nvme_keyring_id()); if (IS_ERR(port->keyring)) { pr_warn("NVMe keyring not available, disabling TLS\n"); From 65e2a74c44ddfa174b700f5da2d1d29b4ba6639b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:18 +0100 Subject: [PATCH 519/560] nvme: target: fix Kconfig select statements When the NVME target code is built-in but its TCP frontend is a loadable module, enabling keyring support causes a link failure: x86_64-linux-ld: vmlinux.o: in function `nvmet_ports_make': configfs.c:(.text+0x100a211): undefined reference to `nvme_keyring_id' The problem is that CONFIG_NVME_TARGET_TCP_TLS is a 'bool' symbol that depends on the tristate CONFIG_NVME_TARGET_TCP, so any 'select' from it inherits the state of the tristate symbol rather than the intended CONFIG_NVME_TARGET one that contains the actual call. The same thing is true for CONFIG_KEYS, which itself is required for NVME_KEYRING. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-3-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/target/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 31633da9427c7..e1ebc73f3e5e0 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -4,6 +4,8 @@ config NVME_TARGET tristate "NVMe Target support" depends on BLOCK depends on CONFIGFS_FS + select NVME_KEYRING if NVME_TARGET_TCP_TLS + select KEYS if NVME_TARGET_TCP_TLS select BLK_DEV_INTEGRITY_T10 if BLK_DEV_INTEGRITY select SGL_ALLOC help @@ -87,9 +89,7 @@ config NVME_TARGET_TCP config NVME_TARGET_TCP_TLS bool "NVMe over Fabrics TCP target TLS encryption support" depends on NVME_TARGET_TCP - select NVME_KEYRING select NET_HANDSHAKE - select KEYS help Enables TLS encryption for the NVMe TCP target using the netlink handshake API. From 0e6c4fe782e683ff55a27fbb10e9c6b5c241533b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:47:19 +0100 Subject: [PATCH 520/560] nvme: tcp: fix compile-time checks for TLS mode When CONFIG_NVME_KEYRING is enabled as a loadable module, but the TCP host code is built-in, it fails to link: arm-linux-gnueabi-ld: drivers/nvme/host/tcp.o: in function `nvme_tcp_setup_ctrl': tcp.c:(.text+0x1940): undefined reference to `nvme_tls_psk_default' The problem is that the compile-time conditionals are inconsistent here, using a mix of #ifdef CONFIG_NVME_TCP_TLS, IS_ENABLED(CONFIG_NVME_TCP_TLS) and IS_ENABLED(CONFIG_NVME_KEYRING) checks, with CONFIG_NVME_KEYRING controlling whether the implementation is actually built. Change it to use IS_ENABLED(CONFIG_NVME_KEYRING) checks consistently, which should help readability and make it less error-prone. Combining it with the check for the ctrl->opts->tls flag lets the compiler drop all the TLS code in configurations without this feature, which also helps runtime behavior in addition to avoiding the link failure. To make it possible for the compiler to build the dead code, both the tls_handshake_timeout variable and the TLS specific members of nvme_tcp_queue need to be moved out of the #ifdef block as well, but at least the former of these gets optimized out again. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231122224719.4042108-4-arnd@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/tcp.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index ddcd23fb8b75d..d79811cfa0ce8 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -36,11 +36,11 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); -#ifdef CONFIG_NVME_TCP_TLS /* * TLS handshake timeout */ static int tls_handshake_timeout = 10; +#ifdef CONFIG_NVME_TCP_TLS module_param(tls_handshake_timeout, int, 0644); MODULE_PARM_DESC(tls_handshake_timeout, "nvme TLS handshake timeout in seconds (default 10)"); @@ -161,10 +161,8 @@ struct nvme_tcp_queue { struct ahash_request *snd_hash; __le32 exp_ddgst; __le32 recv_ddgst; -#ifdef CONFIG_NVME_TCP_TLS struct completion tls_complete; int tls_err; -#endif struct page_frag_cache pf_cache; void (*state_change)(struct sock *); @@ -207,6 +205,14 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return ctrl->opts->tls; +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -1412,7 +1418,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); - if (queue->ctrl->ctrl.opts->tls) { + if (nvme_tcp_tls(&queue->ctrl->ctrl)) { msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } @@ -1424,7 +1430,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } ret = -ENOTCONN; - if (queue->ctrl->ctrl.opts->tls) { + if (nvme_tcp_tls(&queue->ctrl->ctrl)) { ctype = tls_get_record_type(queue->sock->sk, (struct cmsghdr *)cbuf); if (ctype != TLS_RECORD_TYPE_DATA) { @@ -1548,7 +1554,6 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } -#ifdef CONFIG_NVME_TCP_TLS static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) { struct nvme_tcp_queue *queue = data; @@ -1625,14 +1630,6 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, } return ret; } -#else -static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, - struct nvme_tcp_queue *queue, - key_serial_t pskid) -{ - return -EPROTONOSUPPORT; -} -#endif static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, key_serial_t pskid) @@ -1759,7 +1756,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, } /* If PSKs are configured try to start TLS */ - if (pskid) { + if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { ret = nvme_tcp_start_tls(nctrl, queue, pskid); if (ret) goto err_init_connect; @@ -1916,7 +1913,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) int ret; key_serial_t pskid = 0; - if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && ctrl->opts->tls) { + if (nvme_tcp_tls(ctrl)) { if (ctrl->opts->tls_key) pskid = key_serial(ctrl->opts->tls_key); else @@ -1949,7 +1946,7 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; - if (ctrl->opts->tls && !ctrl->tls_key) { + if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { dev_err(ctrl->device, "no PSK negotiated\n"); return -ENOKEY; } From 99360d9620f09fb8bc15548d855011bbb198c680 Mon Sep 17 00:00:00 2001 From: Lech Perczak Date: Sat, 18 Nov 2023 00:19:18 +0100 Subject: [PATCH 521/560] net: usb: qmi_wwan: claim interface 4 for ZTE MF290 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface 4 is used by for QMI interface in stock firmware of MF28D, the router which uses MF290 modem. Rebind it to qmi_wwan after freeing it up from option driver. The proper configuration is: Interface mapping is: 0: QCDM, 1: (unknown), 2: AT (PCUI), 2: AT (Modem), 4: QMI T: Bus=01 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 4 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=19d2 ProdID=0189 Rev= 0.00 S: Manufacturer=ZTE, Incorporated S: Product=ZTE LTE Technologies MSM C:* #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 64 Ivl=2ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=4ms Cc: Bjørn Mork Signed-off-by: Lech Perczak Link: https://lore.kernel.org/r/20231117231918.100278-3-lech.perczak@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 344af3c5c8366..e2e181378f412 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1289,6 +1289,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x0168, 4)}, {QMI_FIXED_INTF(0x19d2, 0x0176, 3)}, {QMI_FIXED_INTF(0x19d2, 0x0178, 3)}, + {QMI_FIXED_INTF(0x19d2, 0x0189, 4)}, /* ZTE MF290 */ {QMI_FIXED_INTF(0x19d2, 0x0191, 4)}, /* ZTE EuFi890 */ {QMI_FIXED_INTF(0x19d2, 0x0199, 1)}, /* ZTE MF820S */ {QMI_FIXED_INTF(0x19d2, 0x0200, 1)}, From 7bf9a6b46549852a37e6d07e52c601c3c706b562 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 22 Nov 2023 15:07:41 -0800 Subject: [PATCH 522/560] arm/xen: fix xen_vcpu_info allocation alignment xen_vcpu_info is a percpu area than needs to be mapped by Xen. Currently, it could cross a page boundary resulting in Xen being unable to map it: [ 0.567318] kernel BUG at arch/arm64/xen/../../arm/xen/enlighten.c:164! [ 0.574002] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Fix the issue by using __alloc_percpu and requesting alignment for the memory allocation. Signed-off-by: Stefano Stabellini Link: https://lore.kernel.org/r/alpine.DEB.2.22.394.2311221501340.2053963@ubuntu-linux-20-04-desktop Fixes: 24d5373dda7c ("arm/xen: Use alloc_percpu rather than __alloc_percpu") Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- arch/arm/xen/enlighten.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 9afdc4c4a5dc1..a395b6c0aae2a 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -484,7 +484,8 @@ static int __init xen_guest_init(void) * for secondary CPUs as they are brought up. * For uniformity we use VCPUOP_register_vcpu_info even on cpu0. */ - xen_vcpu_info = alloc_percpu(struct vcpu_info); + xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info), + 1 << fls(sizeof(struct vcpu_info) - 1)); if (xen_vcpu_info == NULL) return -ENOMEM; From 4aa1d8f89b10cdc25a231dabf808d8935e0b137a Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Tue, 21 Nov 2023 22:26:24 +0530 Subject: [PATCH 523/560] octeontx2-pf: Fix ntuple rule creation to direct packet to VF with higher Rx queue than its PF It is possible to add a ntuple rule which would like to direct packet to a VF whose number of queues are greater/less than its PF's queue numbers. For example a PF can have 2 Rx queues but a VF created on that PF can have 8 Rx queues. As of today, ntuple rule will reject rule because it is checking the requested queue number against PF's number of Rx queues. As a part of this fix if the action of a ntuple rule is to move a packet to a VF's queue then the check is removed. Also, a debug information is printed to aware user that it is user's responsibility to cross check if the requested queue number on that VF is a valid one. Fixes: f0a1913f8a6f ("octeontx2-pf: Add support for ethtool ntuple filters") Signed-off-by: Suman Ghosh Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231121165624.3664182-1-sumang@marvell.com Signed-off-by: Paolo Abeni --- .../marvell/octeontx2/nic/otx2_flows.c | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c index 4762dbea64a12..97a71e9b85637 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c @@ -1088,6 +1088,7 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) struct ethhdr *eth_hdr; bool new = false; int err = 0; + u64 vf_num; u32 ring; if (!flow_cfg->max_flows) { @@ -1100,7 +1101,21 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) if (!(pfvf->flags & OTX2_FLAG_NTUPLE_SUPPORT)) return -ENOMEM; - if (ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) + /* Number of queues on a VF can be greater or less than + * the PF's queue. Hence no need to check for the + * queue count. Hence no need to check queue count if PF + * is installing for its VF. Below is the expected vf_num value + * based on the ethtool commands. + * + * e.g. + * 1. ethtool -U ... action -1 ==> vf_num:255 + * 2. ethtool -U ... action ==> vf_num:0 + * 3. ethtool -U ... vf queue ==> + * vf_num:vf_idx+1 + */ + vf_num = ethtool_get_flow_spec_ring_vf(fsp->ring_cookie); + if (!is_otx2_vf(pfvf->pcifunc) && !vf_num && + ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC) return -EINVAL; if (fsp->location >= otx2_get_maxflows(flow_cfg)) @@ -1182,6 +1197,9 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc) flow_cfg->nr_flows++; } + if (flow->is_vf) + netdev_info(pfvf->netdev, + "Make sure that VF's queue number is within its queue limit\n"); return 0; } From 818ad9cc90d4a7165caaee7e32800c50d0564ec3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 21 Nov 2023 20:08:44 +0100 Subject: [PATCH 524/560] net: veth: fix ethtool stats reporting Fix a possible misalignment between page_pool stats and tx xdp_stats reported in veth_get_ethtool_stats routine. The issue can be reproduced configuring the veth pair with the following tx/rx queues: $ip link add v0 numtxqueues 2 numrxqueues 4 type veth peer name v1 \ numtxqueues 1 numrxqueues 1 and loading a simple XDP program on v0 that just returns XDP_PASS. In this case on v0 the page_pool stats overwrites tx xdp_stats for queue 1. Fix the issue incrementing pp_idx of dev->real_num_tx_queues * VETH_TQ_STATS_LEN since we always report xdp_stats for all tx queues in ethtool. Fixes: 4fc418053ec7 ("net: veth: add page_pool stats") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c5b5d0485016836448453f12846c7c4ab75b094a.1700593593.git.lorenzo@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 6cc352296c672..57efb3454c57a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -236,8 +236,8 @@ static void veth_get_ethtool_stats(struct net_device *dev, data[tx_idx + j] += *(u64 *)(base + offset); } } while (u64_stats_fetch_retry(&rq_stats->syncp, start)); - pp_idx = tx_idx + VETH_TQ_STATS_LEN; } + pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN; page_pool_stats: veth_get_page_pool_stats(dev, &data[pp_idx]); From 676ec53844cbdf2f47e68a076cdff7f0ec6cbe3f Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:33 +0530 Subject: [PATCH 525/560] amd-xgbe: handle corner-case during sfp hotplug Force the mode change for SFI in Fixed PHY configurations. Fixed PHY configurations needs PLL to be enabled while doing mode set. When the SFP module isn't connected during boot, driver assumes AN is ON and attempts auto-negotiation. However, if the connected SFP comes up in Fixed PHY configuration the link will not come up as PLL isn't enabled while the initial mode set command is issued. So, force the mode change for SFI in Fixed PHY configuration to fix link issues. Fixes: e57f7a3feaef ("amd-xgbe: Prepare for working with more than one type of phy") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-mdio.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 32d2c6fac6526..4a2dc705b5280 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1193,7 +1193,19 @@ static int xgbe_phy_config_fixed(struct xgbe_prv_data *pdata) if (pdata->phy.duplex != DUPLEX_FULL) return -EINVAL; - xgbe_set_mode(pdata, mode); + /* Force the mode change for SFI in Fixed PHY config. + * Fixed PHY configs needs PLL to be enabled while doing mode set. + * When the SFP module isn't connected during boot, driver assumes + * AN is ON and attempts autonegotiation. However, if the connected + * SFP comes up in Fixed PHY config, the link will not come up as + * PLL isn't enabled while the initial mode set command is issued. + * So, force the mode change for SFI in Fixed PHY configuration to + * fix link issues. + */ + if (mode == XGBE_MODE_SFI) + xgbe_change_mode(pdata, mode); + else + xgbe_set_mode(pdata, mode); return 0; } From 7121205d5330c6a3cb3379348886d47c77b78d06 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:34 +0530 Subject: [PATCH 526/560] amd-xgbe: handle the corner-case during tx completion The existing implementation uses software logic to accumulate tx completions until the specified time (1ms) is met and then poll them. However, there exists a tiny gap which leads to a race between resetting and checking the tx_activate flag. Due to this the tx completions are not reported to upper layer and tx queue timeout kicks-in restarting the device. To address this, introduce a tx cleanup mechanism as part of the periodic maintenance process. Fixes: c5aa9e3b8156 ("amd-xgbe: Initial AMD 10GbE platform driver") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 614c0278419bc..6b73648b37793 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -682,10 +682,24 @@ static void xgbe_service(struct work_struct *work) static void xgbe_service_timer(struct timer_list *t) { struct xgbe_prv_data *pdata = from_timer(pdata, t, service_timer); + struct xgbe_channel *channel; + unsigned int i; queue_work(pdata->dev_workqueue, &pdata->service_work); mod_timer(&pdata->service_timer, jiffies + HZ); + + if (!pdata->tx_usecs) + return; + + for (i = 0; i < pdata->channel_count; i++) { + channel = pdata->channel[i]; + if (!channel->tx_ring || channel->tx_timer_active) + break; + channel->tx_timer_active = 1; + mod_timer(&channel->tx_timer, + jiffies + usecs_to_jiffies(pdata->tx_usecs)); + } } static void xgbe_init_timers(struct xgbe_prv_data *pdata) From 7a2323ac24a50311f64a3a9b54ed5bef5821ecae Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Wed, 22 Nov 2023 00:44:35 +0530 Subject: [PATCH 527/560] amd-xgbe: propagate the correct speed and duplex status xgbe_get_link_ksettings() does not propagate correct speed and duplex information to ethtool during cable unplug. Due to which ethtool reports incorrect values for speed and duplex. Address this by propagating correct information. Fixes: 7c12aa08779c ("amd-xgbe: Move the PHY support into amd-xgbe") Acked-by: Shyam Sundar S K Signed-off-by: Raju Rangoju Reviewed-by: Wojciech Drewek Signed-off-by: Paolo Abeni --- drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c index 6e83ff59172a3..32fab5e772462 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c @@ -314,10 +314,15 @@ static int xgbe_get_link_ksettings(struct net_device *netdev, cmd->base.phy_address = pdata->phy.address; - cmd->base.autoneg = pdata->phy.autoneg; - cmd->base.speed = pdata->phy.speed; - cmd->base.duplex = pdata->phy.duplex; + if (netif_carrier_ok(netdev)) { + cmd->base.speed = pdata->phy.speed; + cmd->base.duplex = pdata->phy.duplex; + } else { + cmd->base.speed = SPEED_UNKNOWN; + cmd->base.duplex = DUPLEX_UNKNOWN; + } + cmd->base.autoneg = pdata->phy.autoneg; cmd->base.port = PORT_NONE; XGBE_LM_COPY(cmd, supported, lks, supported); From 460e462d22542adfafd8a5bc979437df73f1cbf3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 16 Nov 2023 12:52:29 +0000 Subject: [PATCH 528/560] kselftest/arm64: Fix output formatting for za-fork The za-fork test does not output a newline when reporting the result of the one test it runs, causing the counts printed by kselftest to be included in the test name. Add the newline. Fixes: 266679ffd867 ("kselftest/arm64: Convert za-fork to use kselftest.h") Cc: # 6.4.x Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231116-arm64-fix-za-fork-output-v1-1-42c03d4f5759@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/za-fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/za-fork.c b/tools/testing/selftests/arm64/fp/za-fork.c index b86cb1049497f..587b946482226 100644 --- a/tools/testing/selftests/arm64/fp/za-fork.c +++ b/tools/testing/selftests/arm64/fp/za-fork.c @@ -85,7 +85,7 @@ int main(int argc, char **argv) */ ret = open("/proc/sys/abi/sme_default_vector_length", O_RDONLY, 0); if (ret >= 0) { - ksft_test_result(fork_test(), "fork_test"); + ksft_test_result(fork_test(), "fork_test\n"); } else { ksft_print_msg("SME not supported\n"); From 0ffb08b1a45bd6b7694e01da0e1d9e3e788418fb Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:55 -0800 Subject: [PATCH 529/560] ice: remove ptp_tx ring parameter flag Before performing a Tx timestamp in ice_stamp(), the driver checks a ptp_tx ring variable to see if timestamping is enabled on that ring. This value is set for all rings whenever userspace configures Tx timestamping. Ostensibly this was done to avoid wasting cycles checking other fields when timestamping has not been enabled. However, for Tx timestamps we already get an individual per-SKB flag indicating whether userspace wants to request a timestamp on that packet. We do not gain much by also having a separate flag to check for whether timestamping was enabled. In fact, the driver currently fails to restore the field after a PF reset. Because of this, if a PF reset occurs, timestamps will be disabled. Since this flag doesn't add value in the hotpath, remove it and always provide a timestamp if the SKB flag has been set. A following change will fix the reset path to properly restore user timestamping configuration completely. This went unnoticed for some time because one of the most common applications using Tx timestamps, ptp4l, will reconfigure the socket as part of its fault recovery logic. Fixes: ea9b847cda64 ("ice: enable transmit timestamps for E810 devices") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_ptp.c | 14 -------------- drivers/net/ethernet/intel/ice/ice_txrx.c | 3 --- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - 3 files changed, 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 1eddcbe89b0c4..affd90622a684 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -280,20 +280,6 @@ static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) */ static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) { - struct ice_vsi *vsi; - u16 i; - - vsi = ice_get_main_vsi(pf); - if (!vsi) - return; - - /* Set the timestamp enable flag for all the Tx rings */ - ice_for_each_txq(vsi, i) { - if (!vsi->tx_rings[i]) - continue; - vsi->tx_rings[i]->ptp_tx = on; - } - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) ice_ptp_configure_tx_tstamp(pf, on); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 52d0a126eb616..9e97ea8630686 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -2306,9 +2306,6 @@ ice_tstamp(struct ice_tx_ring *tx_ring, struct sk_buff *skb, if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))) return; - if (!tx_ring->ptp_tx) - return; - /* Tx timestamps cannot be sampled when doing TSO */ if (first->tx_flags & ICE_TX_FLAGS_TSO) return; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 166413fc33f48..daf7b9dbb1435 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -380,7 +380,6 @@ struct ice_tx_ring { #define ICE_TX_FLAGS_RING_VLAN_L2TAG2 BIT(2) u8 flags; u8 dcb_tc; /* Traffic class of ring */ - u8 ptp_tx; } ____cacheline_internodealigned_in_smp; static inline bool ice_ring_uses_build_skb(struct ice_rx_ring *ring) From 7d606a1e2d0575b6c3a2600f43f90d1e409f9661 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:56 -0800 Subject: [PATCH 530/560] ice: unify logic for programming PFINT_TSYN_MSK Commit d938a8cca88a ("ice: Auxbus devices & driver for E822 TS") modified how Tx timestamps are handled for E822 devices. On these devices, only the clock owner handles reading the Tx timestamp data from firmware. To do this, the PFINT_TSYN_MSK register is modified from the default value to one which enables reacting to a Tx timestamp on all PHY ports. The driver currently programs PFINT_TSYN_MSK in different places depending on whether the port is the clock owner or not. For the clock owner, the PFINT_TSYN_MSK value is programmed during ice_ptp_init_owner just before calling ice_ptp_tx_ena_intr to program the PHY ports. For the non-clock owner ports, the PFINT_TSYN_MSK is programmed during ice_ptp_init_port. If a large enough device reset occurs, the PFINT_TSYN_MSK register will be reset to the default value in which only the PHY associated directly with the PF will cause the Tx timestamp interrupt to trigger. The driver lacks logic to reprogram the PFINT_TSYN_MSK register after a device reset. For the E822 device, this results in the PF no longer responding to interrupts for other ports. This results in failure to deliver Tx timestamps to user space applications. Rename ice_ptp_configure_tx_tstamp to ice_ptp_cfg_tx_interrupt, and unify the logic for programming PFINT_TSYN_MSK and PFINT_OICR_ENA into one place. This function will program both registers according to the combination of user configuration and device requirements. This ensures that PFINT_TSYN_MSK is always restored when we configure the Tx timestamp interrupt. Fixes: d938a8cca88a ("ice: Auxbus devices & driver for E822 TS") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_ptp.c | 60 ++++++++++++++---------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index affd90622a684..624d05b4bbd9b 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -256,21 +256,42 @@ ice_verify_pin_e810t(struct ptp_clock_info *info, unsigned int pin, } /** - * ice_ptp_configure_tx_tstamp - Enable or disable Tx timestamp interrupt - * @pf: The PF pointer to search in - * @on: bool value for whether timestamp interrupt is enabled or disabled + * ice_ptp_cfg_tx_interrupt - Configure Tx timestamp interrupt for the device + * @pf: Board private structure + * + * Program the device to respond appropriately to the Tx timestamp interrupt + * cause. */ -static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) +static void ice_ptp_cfg_tx_interrupt(struct ice_pf *pf) { + struct ice_hw *hw = &pf->hw; + bool enable; u32 val; + switch (pf->ptp.tx_interrupt_mode) { + case ICE_PTP_TX_INTERRUPT_ALL: + /* React to interrupts across all quads. */ + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); + enable = true; + break; + case ICE_PTP_TX_INTERRUPT_NONE: + /* Do not react to interrupts on any quad. */ + wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); + enable = false; + break; + case ICE_PTP_TX_INTERRUPT_SELF: + default: + enable = pf->ptp.tstamp_config.tx_type == HWTSTAMP_TX_ON; + break; + } + /* Configure the Tx timestamp interrupt */ - val = rd32(&pf->hw, PFINT_OICR_ENA); - if (on) + val = rd32(hw, PFINT_OICR_ENA); + if (enable) val |= PFINT_OICR_TSYN_TX_M; else val &= ~PFINT_OICR_TSYN_TX_M; - wr32(&pf->hw, PFINT_OICR_ENA, val); + wr32(hw, PFINT_OICR_ENA, val); } /** @@ -280,10 +301,9 @@ static void ice_ptp_configure_tx_tstamp(struct ice_pf *pf, bool on) */ static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) { - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) - ice_ptp_configure_tx_tstamp(pf, on); - pf->ptp.tstamp_config.tx_type = on ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; + + ice_ptp_cfg_tx_interrupt(pf); } /** @@ -2789,15 +2809,7 @@ static int ice_ptp_init_owner(struct ice_pf *pf) /* Release the global hardware lock */ ice_ptp_unlock(hw); - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) { - /* The clock owner for this device type handles the timestamp - * interrupt for all ports. - */ - ice_ptp_configure_tx_tstamp(pf, true); - - /* React on all quads interrupts for E82x */ - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x1f); - + if (!ice_is_e810(hw)) { /* Enable quad interrupts */ err = ice_ptp_tx_ena_intr(pf, true, itr); if (err) @@ -2867,13 +2879,6 @@ static int ice_ptp_init_port(struct ice_pf *pf, struct ice_ptp_port *ptp_port) case ICE_PHY_E810: return ice_ptp_init_tx_e810(pf, &ptp_port->tx); case ICE_PHY_E822: - /* Non-owner PFs don't react to any interrupts on E82x, - * neither on own quad nor on others - */ - if (!ice_ptp_pf_handles_tx_interrupt(pf)) { - ice_ptp_configure_tx_tstamp(pf, false); - wr32(hw, PFINT_TSYN_MSK + (0x4 * hw->pf_id), (u32)0x0); - } kthread_init_delayed_work(&ptp_port->ov_work, ice_ptp_wait_for_offsets); @@ -3018,6 +3023,9 @@ void ice_ptp_init(struct ice_pf *pf) /* Start the PHY timestamping block */ ice_ptp_reset_phy_timestamping(pf); + /* Configure initial Tx interrupt settings */ + ice_ptp_cfg_tx_interrupt(pf); + set_bit(ICE_FLAG_PTP, pf->flags); err = ice_ptp_init_work(pf, ptp); if (err) From 7758017911a4f2578d54c318e8fe77bcb5899054 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 21 Nov 2023 13:12:57 -0800 Subject: [PATCH 531/560] ice: restore timestamp configuration after device reset The driver calls ice_ptp_cfg_timestamp() during ice_ptp_prepare_for_reset() to disable timestamping while the device is resetting. This operation destroys the user requested configuration. While the driver does call ice_ptp_cfg_timestamp in ice_rebuild() to restore some hardware settings after a reset, it unconditionally passes true or false, resulting in failure to restore previous user space configuration. This results in a device reset forcibly disabling timestamp configuration regardless of current user settings. This was not detected previously due to a quirk of the LinuxPTP ptp4l application. If ptp4l detects a missing timestamp, it enters a fault state and performs recovery logic which includes executing SIOCSHWTSTAMP again, restoring the now accidentally cleared configuration. Not every application does this, and for these applications, timestamps will mysteriously stop after a PF reset, without being restored until an application restart. Fix this by replacing ice_ptp_cfg_timestamp() with two new functions: 1) ice_ptp_disable_timestamp_mode() which unconditionally disables the timestamping logic in ice_ptp_prepare_for_reset() and ice_ptp_release() 2) ice_ptp_restore_timestamp_mode() which calls ice_ptp_restore_tx_interrupt() to restore Tx timestamping configuration, calls ice_set_rx_tstamp() to restore Rx timestamping configuration, and issues an immediate TSYN_TX interrupt to ensure that timestamps which may have occurred during the device reset get processed. Modify the ice_ptp_set_timestamp_mode to directly save the user configuration and then call ice_ptp_restore_timestamp_mode. This way, reset no longer destroys the saved user configuration. This obsoletes the ice_set_tx_tstamp() function which can now be safely removed. With this change, all devices should now restore Tx and Rx timestamping functionality correctly after a PF reset without application intervention. Fixes: 77a781155a65 ("ice: enable receive hardware timestamping") Fixes: ea9b847cda64 ("ice: enable transmit timestamps for E810 devices") Signed-off-by: Jacob Keller Reviewed-by: Jesse Brandeburg Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/intel/ice/ice_main.c | 12 +--- drivers/net/ethernet/intel/ice/ice_ptp.c | 74 ++++++++++++++--------- drivers/net/ethernet/intel/ice/ice_ptp.h | 5 +- 3 files changed, 51 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6607fa6fe5562..fb9c93f37e84f 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -7401,15 +7401,6 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) goto err_vsi_rebuild; } - /* configure PTP timestamping after VSI rebuild */ - if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags)) { - if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_SELF) - ice_ptp_cfg_timestamp(pf, false); - else if (pf->ptp.tx_interrupt_mode == ICE_PTP_TX_INTERRUPT_ALL) - /* for E82x PHC owner always need to have interrupts */ - ice_ptp_cfg_timestamp(pf, true); - } - err = ice_vsi_rebuild_by_type(pf, ICE_VSI_SWITCHDEV_CTRL); if (err) { dev_err(dev, "Switchdev CTRL VSI rebuild failed: %d\n", err); @@ -7461,6 +7452,9 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) ice_plug_aux_dev(pf); if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG)) ice_lag_rebuild(pf); + + /* Restore timestamp mode settings after VSI rebuild */ + ice_ptp_restore_timestamp_mode(pf); return; err_vsi_rebuild: diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 624d05b4bbd9b..71f405f8a6fee 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -294,18 +294,6 @@ static void ice_ptp_cfg_tx_interrupt(struct ice_pf *pf) wr32(hw, PFINT_OICR_ENA, val); } -/** - * ice_set_tx_tstamp - Enable or disable Tx timestamping - * @pf: The PF pointer to search in - * @on: bool value for whether timestamps are enabled or disabled - */ -static void ice_set_tx_tstamp(struct ice_pf *pf, bool on) -{ - pf->ptp.tstamp_config.tx_type = on ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF; - - ice_ptp_cfg_tx_interrupt(pf); -} - /** * ice_set_rx_tstamp - Enable or disable Rx timestamping * @pf: The PF pointer to search in @@ -317,7 +305,7 @@ static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) u16 i; vsi = ice_get_main_vsi(pf); - if (!vsi) + if (!vsi || !vsi->rx_rings) return; /* Set the timestamp flag for all the Rx rings */ @@ -326,23 +314,50 @@ static void ice_set_rx_tstamp(struct ice_pf *pf, bool on) continue; vsi->rx_rings[i]->ptp_rx = on; } +} + +/** + * ice_ptp_disable_timestamp_mode - Disable current timestamp mode + * @pf: Board private structure + * + * Called during preparation for reset to temporarily disable timestamping on + * the device. Called during remove to disable timestamping while cleaning up + * driver resources. + */ +static void ice_ptp_disable_timestamp_mode(struct ice_pf *pf) +{ + struct ice_hw *hw = &pf->hw; + u32 val; + + val = rd32(hw, PFINT_OICR_ENA); + val &= ~PFINT_OICR_TSYN_TX_M; + wr32(hw, PFINT_OICR_ENA, val); - pf->ptp.tstamp_config.rx_filter = on ? HWTSTAMP_FILTER_ALL : - HWTSTAMP_FILTER_NONE; + ice_set_rx_tstamp(pf, false); } /** - * ice_ptp_cfg_timestamp - Configure timestamp for init/deinit + * ice_ptp_restore_timestamp_mode - Restore timestamp configuration * @pf: Board private structure - * @ena: bool value to enable or disable time stamp * - * This function will configure timestamping during PTP initialization - * and deinitialization + * Called at the end of rebuild to restore timestamp configuration after + * a device reset. */ -void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) +void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) { - ice_set_tx_tstamp(pf, ena); - ice_set_rx_tstamp(pf, ena); + struct ice_hw *hw = &pf->hw; + bool enable_rx; + + ice_ptp_cfg_tx_interrupt(pf); + + enable_rx = pf->ptp.tstamp_config.rx_filter == HWTSTAMP_FILTER_ALL; + ice_set_rx_tstamp(pf, enable_rx); + + /* Trigger an immediate software interrupt to ensure that timestamps + * which occurred during reset are handled now. + */ + wr32(hw, PFINT_OICR, PFINT_OICR_TSYN_TX_M); + ice_flush(hw); } /** @@ -2043,10 +2058,10 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) { switch (config->tx_type) { case HWTSTAMP_TX_OFF: - ice_set_tx_tstamp(pf, false); + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_OFF; break; case HWTSTAMP_TX_ON: - ice_set_tx_tstamp(pf, true); + pf->ptp.tstamp_config.tx_type = HWTSTAMP_TX_ON; break; default: return -ERANGE; @@ -2054,7 +2069,7 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) switch (config->rx_filter) { case HWTSTAMP_FILTER_NONE: - ice_set_rx_tstamp(pf, false); + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_NONE; break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: @@ -2070,12 +2085,15 @@ ice_ptp_set_timestamp_mode(struct ice_pf *pf, struct hwtstamp_config *config) case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: case HWTSTAMP_FILTER_ALL: - ice_set_rx_tstamp(pf, true); + pf->ptp.tstamp_config.rx_filter = HWTSTAMP_FILTER_ALL; break; default: return -ERANGE; } + /* Immediately update the device timestamping mode */ + ice_ptp_restore_timestamp_mode(pf); + return 0; } @@ -2743,7 +2761,7 @@ void ice_ptp_prepare_for_reset(struct ice_pf *pf) clear_bit(ICE_FLAG_PTP, pf->flags); /* Disable timestamping for both Tx and Rx */ - ice_ptp_cfg_timestamp(pf, false); + ice_ptp_disable_timestamp_mode(pf); kthread_cancel_delayed_work_sync(&ptp->work); @@ -3061,7 +3079,7 @@ void ice_ptp_release(struct ice_pf *pf) return; /* Disable timestamping for both Tx and Rx */ - ice_ptp_cfg_timestamp(pf, false); + ice_ptp_disable_timestamp_mode(pf); ice_ptp_remove_auxbus_device(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.h b/drivers/net/ethernet/intel/ice/ice_ptp.h index 8f6f943927564..06a330867fc9b 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp.h @@ -292,7 +292,7 @@ int ice_ptp_clock_index(struct ice_pf *pf); struct ice_pf; int ice_ptp_set_ts_config(struct ice_pf *pf, struct ifreq *ifr); int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr); -void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena); +void ice_ptp_restore_timestamp_mode(struct ice_pf *pf); void ice_ptp_extts_event(struct ice_pf *pf); s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb); @@ -317,8 +317,7 @@ static inline int ice_ptp_get_ts_config(struct ice_pf *pf, struct ifreq *ifr) return -EOPNOTSUPP; } -static inline void ice_ptp_cfg_timestamp(struct ice_pf *pf, bool ena) { } - +static inline void ice_ptp_restore_timestamp_mode(struct ice_pf *pf) { } static inline void ice_ptp_extts_event(struct ice_pf *pf) { } static inline s8 ice_ptp_request_ts(struct ice_ptp_tx *tx, struct sk_buff *skb) From 4e20655e503e3a478cd1682bf25e3202dd823da8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 21 Nov 2023 13:13:36 -0800 Subject: [PATCH 532/560] i40e: Fix adding unsupported cloud filters If a VF tries to add unsupported cloud filter through virtchnl then i40e_add_del_cloud_filter(_big_buf) returns -ENOTSUPP but this error code is stored in 'ret' instead of 'aq_ret' that is used as error code sent back to VF. In this scenario where one of the mentioned functions fails the value of 'aq_ret' is zero so the VF will incorrectly receive a 'success'. Use 'aq_ret' to store return value and remove 'ret' local variable. Additionally fix the issue when filter allocation fails, in this case no notification is sent back to the VF. Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Reviewed-by: Simon Horman Signed-off-by: Ivan Vecera Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20231121211338.3348677-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 08d7edccfb8dd..3f99eb1982452 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3844,7 +3844,7 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; int aq_ret = 0; - int i, ret; + int i; if (!i40e_sync_vf_state(vf, I40E_VF_STATE_ACTIVE)) { aq_ret = -EINVAL; @@ -3868,8 +3868,10 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) } cfilter = kzalloc(sizeof(*cfilter), GFP_KERNEL); - if (!cfilter) - return -ENOMEM; + if (!cfilter) { + aq_ret = -ENOMEM; + goto err_out; + } /* parse destination mac address */ for (i = 0; i < ETH_ALEN; i++) @@ -3917,13 +3919,13 @@ static int i40e_vc_add_cloud_filter(struct i40e_vf *vf, u8 *msg) /* Adding cloud filter programmed as TC filter */ if (tcf.dst_port) - ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); + aq_ret = i40e_add_del_cloud_filter_big_buf(vsi, cfilter, true); else - ret = i40e_add_del_cloud_filter(vsi, cfilter, true); - if (ret) { + aq_ret = i40e_add_del_cloud_filter(vsi, cfilter, true); + if (aq_ret) { dev_err(&pf->pdev->dev, "VF %d: Failed to add cloud filter, err %pe aq_err %s\n", - vf->vf_id, ERR_PTR(ret), + vf->vf_id, ERR_PTR(aq_ret), i40e_aq_str(&pf->hw, pf->hw.aq.asq_last_status)); goto err_free; } From f0863888f6cfef33e3117dccfe94fa78edf76be4 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Wed, 22 Nov 2023 00:16:42 +0300 Subject: [PATCH 533/560] vsock/test: fix SEQPACKET message bounds test Tune message length calculation to make this test work on machines where 'getpagesize()' returns >32KB. Now maximum message length is not hardcoded (on machines above it was smaller than 'getpagesize()' return value, thus we get negative value and test fails), but calculated at runtime and always bigger than 'getpagesize()' result. Reproduced on aarch64 with 64KB page size. Fixes: 5c338112e48a ("test/vsock: rework message bounds test") Signed-off-by: Arseniy Krasnov Reported-by: Bogdan Marcynkov Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20231121211642.163474-1-avkrasnov@salutedevices.com Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index 5b0e93f9996cb..01fa816868bc4 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -353,11 +353,12 @@ static void test_stream_msg_peek_server(const struct test_opts *opts) } #define SOCK_BUF_SIZE (2 * 1024 * 1024) -#define MAX_MSG_SIZE (32 * 1024) +#define MAX_MSG_PAGES 4 static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) { unsigned long curr_hash; + size_t max_msg_size; int page_size; int msg_count; int fd; @@ -373,7 +374,8 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) curr_hash = 0; page_size = getpagesize(); - msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE; + max_msg_size = MAX_MSG_PAGES * page_size; + msg_count = SOCK_BUF_SIZE / max_msg_size; for (int i = 0; i < msg_count; i++) { size_t buf_size; @@ -383,7 +385,7 @@ static void test_seqpacket_msg_bounds_client(const struct test_opts *opts) /* Use "small" buffers and "big" buffers. */ if (i & 1) buf_size = page_size + - (rand() % (MAX_MSG_SIZE - page_size)); + (rand() % (max_msg_size - page_size)); else buf_size = 1 + (rand() % page_size); @@ -429,7 +431,6 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) unsigned long remote_hash; unsigned long curr_hash; int fd; - char buf[MAX_MSG_SIZE]; struct msghdr msg = {0}; struct iovec iov = {0}; @@ -457,8 +458,13 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) control_writeln("SRVREADY"); /* Wait, until peer sends whole data. */ control_expectln("SENDDONE"); - iov.iov_base = buf; - iov.iov_len = sizeof(buf); + iov.iov_len = MAX_MSG_PAGES * getpagesize(); + iov.iov_base = malloc(iov.iov_len); + if (!iov.iov_base) { + perror("malloc"); + exit(EXIT_FAILURE); + } + msg.msg_iov = &iov; msg.msg_iovlen = 1; @@ -483,6 +489,7 @@ static void test_seqpacket_msg_bounds_server(const struct test_opts *opts) curr_hash += hash_djb2(msg.msg_iov[0].iov_base, recv_size); } + free(iov.iov_base); close(fd); remote_hash = control_readulong(); From fd0413bbf8b11f56e8aa842783b0deda0dfe2926 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Tue, 21 Nov 2023 16:42:17 -0800 Subject: [PATCH 534/560] net: axienet: Fix check for partial TX checksum Due to a typo, the code checked the RX checksum feature in the TX path. Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver") Signed-off-by: Samuel Holland Reviewed-by: Andrew Lunn Reviewed-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/20231122004219.3504219-1-samuel.holland@sifive.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 82d0d44b2b02f..bf6e339904909 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -822,7 +822,7 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev) if (lp->features & XAE_FEATURE_FULL_TX_CSUM) { /* Tx Full Checksum Offload Enabled */ cur_p->app0 |= 2; - } else if (lp->features & XAE_FEATURE_PARTIAL_RX_CSUM) { + } else if (lp->features & XAE_FEATURE_PARTIAL_TX_CSUM) { csum_start_off = skb_transport_offset(skb); csum_index_off = csum_start_off + skb->csum_offset; /* Tx Partial Checksum Offload Enabled */ From 53f2cb491b500897a619ff6abd72f565933760f0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 22 Nov 2023 22:44:47 +0100 Subject: [PATCH 535/560] tls: fix NULL deref on tls_sw_splice_eof() with empty record syzkaller discovered that if tls_sw_splice_eof() is executed as part of sendfile() when the plaintext/ciphertext sk_msg are empty, the send path gets confused because the empty ciphertext buffer does not have enough space for the encryption overhead. This causes tls_push_record() to go on the `split = true` path (which is only supposed to be used when interacting with an attached BPF program), and then get further confused and hit the tls_merge_open_record() path, which then assumes that there must be at least one populated buffer element, leading to a NULL deref. It is possible to have empty plaintext/ciphertext buffers if we previously bailed from tls_sw_sendmsg_locked() via the tls_trim_both_msgs() path. tls_sw_push_pending_record() already handles this case correctly; let's do the same check in tls_sw_splice_eof(). Fixes: df720d288dbb ("tls/sw: Use splice_eof() to flush") Cc: stable@vger.kernel.org Reported-by: syzbot+40d43509a099ea756317@syzkaller.appspotmail.com Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20231122214447.675768-1-jannh@google.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a78e8e7224091..316f761879624 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1232,11 +1232,14 @@ void tls_sw_splice_eof(struct socket *sock) lock_sock(sk); retry: + /* same checks as in tls_sw_push_pending_record() */ rec = ctx->open_rec; if (!rec) goto unlock; msg_pl = &rec->msg_plaintext; + if (msg_pl->sg.size == 0) + goto unlock; /* Check the BPF advisor and perform transmission. */ ret = bpf_exec_tx_verdict(msg_pl, sk, false, TLS_RECORD_TYPE_DATA, From 37f0205538baf70beb57cdcb6c7d14aa13257926 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 22 Nov 2023 17:17:08 -0600 Subject: [PATCH 536/560] net: ipa: fix one GSI register field width The width of the R_LENGTH field of the EV_CH_E_CNTXT_1 GSI register is 24 bits (not 20 bits) starting with IPA v5.0. Fix this. Fixes: faf0678ec8a0 ("net: ipa: add IPA v5.0 GSI register definitions") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20231122231708.896632-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/reg/gsi_reg-v5.0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/reg/gsi_reg-v5.0.c b/drivers/net/ipa/reg/gsi_reg-v5.0.c index d7b81a36d673b..145eb0bd096d6 100644 --- a/drivers/net/ipa/reg/gsi_reg-v5.0.c +++ b/drivers/net/ipa/reg/gsi_reg-v5.0.c @@ -78,7 +78,7 @@ REG_STRIDE_FIELDS(EV_CH_E_CNTXT_0, ev_ch_e_cntxt_0, 0x0001c000 + 0x12000 * GSI_EE_AP, 0x80); static const u32 reg_ev_ch_e_cntxt_1_fmask[] = { - [R_LENGTH] = GENMASK(19, 0), + [R_LENGTH] = GENMASK(23, 0), }; REG_STRIDE_FIELDS(EV_CH_E_CNTXT_1, ev_ch_e_cntxt_1, From 2be35a619482c1f4e5bc7a2d84049b8d7d171882 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Nov 2023 19:06:24 -0800 Subject: [PATCH 537/560] tools: ynl: fix header path for nfsd The makefile dependency is trying to include the wrong header: : fatal error: ../../../../include/uapi//linux/nfsd.h: No such file or directory The guard also looks wrong. Fixes: f14122b2c2ac ("tools: ynl: Add source files for nfsd netlink protocol") Reviewed-by: Chuck Lever Link: https://lore.kernel.org/r/20231123030624.1611925-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 64d139400db10..3110f84dd0294 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -18,4 +18,4 @@ CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) -CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_H,nfsd.h) +CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) From 39f04b1406b23fcc129a67e70d6205d5a7322f38 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Nov 2023 19:05:58 -0800 Subject: [PATCH 538/560] tools: ynl: fix duplicate op name in devlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't support CRUD-inspired message types in YNL too well. One aspect that currently trips us up is the fact that single message ID can be used in multiple commands (as the response). This leads to duplicate entries in the id-to-string tables: devlink-user.c:19:34: warning: initialized field overwritten [-Woverride-init] 19 | [DEVLINK_CMD_PORT_NEW] = "port-new", | ^~~~~~~~~~ devlink-user.c:19:34: note: (near initialization for ‘devlink_op_strmap[7]’) Fixes tag points at where the code was generated, the "real" problem is that the code generator does not support CRUD. Fixes: f2f9dd164db0 ("netlink: specs: devlink: add the remaining command to generate complete split_ops") Link: https://lore.kernel.org/r/20231123030558.1611831-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/generated/devlink-user.c | 2 +- tools/net/ynl/ynl-gen-c.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/generated/devlink-user.c b/tools/net/ynl/generated/devlink-user.c index bc5065bd99b2f..c12ca87ca2bb3 100644 --- a/tools/net/ynl/generated/devlink-user.c +++ b/tools/net/ynl/generated/devlink-user.c @@ -15,7 +15,7 @@ /* Enums */ static const char * const devlink_op_strmap[] = { [3] = "get", - [7] = "port-get", + // skip "port-get", duplicate reply value [DEVLINK_CMD_PORT_NEW] = "port-new", [13] = "sb-get", [17] = "sb-pool-get", diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index c4003a83cd5d8..3bd6b928c14ff 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -1505,6 +1505,12 @@ def put_op_name(family, cw): cw.block_start(line=f"static const char * const {map_name}[] =") for op_name, op in family.msgs.items(): if op.rsp_value: + # Make sure we don't add duplicated entries, if multiple commands + # produce the same response in legacy families. + if family.rsp_by_value[op.rsp_value] != op: + cw.p(f'// skip "{op_name}", duplicate reply value') + continue + if op.req_value == op.rsp_value: cw.p(f'[{op.enum_name}] = "{op_name}",') else: From da90e45d5afc4da2de7cd3ea7943d0f1baa47cc2 Mon Sep 17 00:00:00 2001 From: Asuna Yang Date: Wed, 22 Nov 2023 22:18:03 +0800 Subject: [PATCH 539/560] USB: serial: option: add Luat Air72*U series products Update the USB serial option driver support for Luat Air72*U series products. ID 1782:4e00 Spreadtrum Communications Inc. UNISOC-8910 T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 13 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1782 ProdID=4e00 Rev=00.00 S: Manufacturer=UNISOC S: Product=UNISOC-8910 C: #Ifs= 5 Cfg#= 1 Atr=e0 MxPwr=400mA I: If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=4096ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms If#= 2: AT If#= 3: PPP + AT If#= 4: Debug Co-developed-by: Yangyu Chen Signed-off-by: Yangyu Chen Signed-off-by: Asuna Yang Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 06b9b04c022a6..4dffcfefd62da 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -609,6 +609,8 @@ static void option_instat_callback(struct urb *urb); #define UNISOC_VENDOR_ID 0x1782 /* TOZED LT70-C based on UNISOC SL8563 uses UNISOC's vendor ID */ #define TOZED_PRODUCT_LT70C 0x4055 +/* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ +#define LUAT_PRODUCT_AIR720U 0x4e00 /* Device flags */ @@ -2273,6 +2275,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, option_ids); From a15ccef82d3de9a37dc25898c60a394209368dc8 Mon Sep 17 00:00:00 2001 From: Ritvik Budhiraja Date: Tue, 21 Nov 2023 19:13:47 +0530 Subject: [PATCH 540/560] cifs: fix use after free for iface while disabling secondary channels We were deferencing iface after it has been released. Fix is to release after all dereference instances have been encountered. Signed-off-by: Ritvik Budhiraja Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202311110815.UJaeU3Tt-lkp@intel.com/ Signed-off-by: Steve French --- fs/smb/client/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 8b2d7c1ca4284..816e01c5589b4 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -332,10 +332,10 @@ cifs_disable_secondary_channels(struct cifs_ses *ses) if (iface) { spin_lock(&ses->iface_lock); - kref_put(&iface->refcount, release_iface); iface->num_channels--; if (iface->weight_fulfilled) iface->weight_fulfilled--; + kref_put(&iface->refcount, release_iface); spin_unlock(&ses->iface_lock); } From ed3e0a149b58ea8cfd10cc4f7cefb39877ff07ac Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Nov 2023 20:12:52 -0300 Subject: [PATCH 541/560] smb: client: implement ->query_reparse_point() for SMB1 Reparse points are not limited to symlinks, so implement ->query_reparse_point() in order to handle different file types. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifspdu.h | 2 +- fs/smb/client/cifsproto.h | 9 ++ fs/smb/client/cifssmb.c | 193 +++++++++++++++----------------------- fs/smb/client/smb1ops.c | 49 ++-------- fs/smb/client/smb2ops.c | 35 ++++--- 5 files changed, 113 insertions(+), 175 deletions(-) diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index a75220db5c1e1..2a90134331a48 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1356,7 +1356,7 @@ typedef struct smb_com_transaction_ioctl_rsp { __le32 DataDisplacement; __u8 SetupCount; /* 1 */ __le16 ReturnedDataLen; - __u16 ByteCount; + __le16 ByteCount; } __attribute__((packed)) TRANSACT_IOCTL_RSP; #define CIFS_ACL_OWNER 1 diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index d87e2c26cce2b..8a739c10d6345 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -458,6 +458,12 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage, int remap); +extern int cifs_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); @@ -659,6 +665,9 @@ void cifs_put_tcp_super(struct super_block *sb); int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix); char *extract_hostname(const char *unc); char *extract_sharename(const char *unc); +int parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, struct cifs_sb_info *cifs_sb, + bool unicode, char **target_path); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 25503f1a4fd21..bad91ba6c3a9c 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2690,136 +2690,97 @@ CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -/* - * Recent Windows versions now create symlinks more frequently - * and they use the "reparse point" mechanism below. We can of course - * do symlinks nicely to Samba and other servers which support the - * CIFS Unix Extensions and we can also do SFU symlinks and "client only" - * "MF" symlinks optionally, but for recent Windows we really need to - * reenable the code below and fix the cifs_symlink callers to handle this. - * In the interim this code has been moved to its own config option so - * it is not compiled in by default until callers fixed up and more tested. - */ -int -CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, char **symlinkinfo, - const struct nls_table *nls_codepage) +int cifs_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) { - int rc = 0; - int bytes_returned; - struct smb_com_transaction_ioctl_req *pSMB; - struct smb_com_transaction_ioctl_rsp *pSMBr; - bool is_unicode; - unsigned int sub_len; - char *sub_start; - struct reparse_symlink_data *reparse_buf; - struct reparse_posix_data *posix_buf; + struct cifs_open_parms oparms; + TRANSACT_IOCTL_REQ *io_req = NULL; + TRANSACT_IOCTL_RSP *io_rsp = NULL; + struct cifs_fid fid; __u32 data_offset, data_count; - char *end_of_smb; + __u8 *start, *end; + int io_rsp_len; + int oplock = 0; + int rc; - cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid); - rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, - (void **) &pSMBr); + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); + + if (cap_unix(tcon->ses)) + return -EOPNOTSUPP; + + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) return rc; - pSMB->TotalParameterCount = 0 ; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - pSMB->MaxSetupCount = 4; - pSMB->Reserved = 0; - pSMB->ParameterOffset = 0; - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 4; - pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); - pSMB->IsFsctl = 1; /* FSCTL */ - pSMB->IsRootFlag = 0; - pSMB->Fid = fid; /* file handle always le */ - pSMB->ByteCount = 0; + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, + (void **)&io_req, (void **)&io_rsp); + if (rc) + goto error; - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) { - cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); - goto qreparse_out; - } + io_req->TotalParameterCount = 0; + io_req->TotalDataCount = 0; + io_req->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); + io_req->MaxSetupCount = 4; + io_req->Reserved = 0; + io_req->ParameterOffset = 0; + io_req->DataCount = 0; + io_req->DataOffset = 0; + io_req->SetupCount = 4; + io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + io_req->ParameterCount = io_req->TotalParameterCount; + io_req->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); + io_req->IsFsctl = 1; + io_req->IsRootFlag = 0; + io_req->Fid = fid.netfid; + io_req->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)io_req, + (struct smb_hdr *)io_rsp, &io_rsp_len, 0); + if (rc) + goto error; - data_offset = le32_to_cpu(pSMBr->DataOffset); - data_count = le32_to_cpu(pSMBr->DataCount); - if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { - /* BB also check enough total bytes returned */ - rc = -EIO; /* bad smb */ - goto qreparse_out; - } - if (!data_count || (data_count > 2048)) { + data_offset = le32_to_cpu(io_rsp->DataOffset); + data_count = le32_to_cpu(io_rsp->DataCount); + if (get_bcc(&io_rsp->hdr) < 2 || data_offset > 512 || + !data_count || data_count > 2048) { rc = -EIO; - cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); - goto qreparse_out; - } - end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - reparse_buf = (struct reparse_symlink_data *) - ((char *)&pSMBr->hdr.Protocol + data_offset); - if ((char *)reparse_buf >= end_of_smb) { - rc = -EIO; - goto qreparse_out; - } - if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { - cifs_dbg(FYI, "NFS style reparse tag\n"); - posix_buf = (struct reparse_posix_data *)reparse_buf; - - if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { - cifs_dbg(FYI, "unsupported file type 0x%llx\n", - le64_to_cpu(posix_buf->InodeType)); - rc = -EOPNOTSUPP; - goto qreparse_out; - } - is_unicode = true; - sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); - if (posix_buf->PathBuffer + sub_len > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); - rc = -EIO; - goto qreparse_out; - } - *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, - sub_len, is_unicode, nls_codepage); - goto qreparse_out; - } else if (reparse_buf->ReparseTag != - cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { - rc = -EOPNOTSUPP; - goto qreparse_out; + goto error; } - /* Reparse tag is NTFS symlink */ - sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + - reparse_buf->PathBuffer; - sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); - if (sub_start + sub_len > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; + start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; + if (start >= end) { rc = -EIO; - goto qreparse_out; + goto error; } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - is_unicode = true; - else - is_unicode = false; - - /* BB FIXME investigate remapping reserved chars here */ - *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode, - nls_codepage); - if (!*symlinkinfo) - rc = -ENOMEM; -qreparse_out: - cifs_buf_release(pSMB); - /* - * Note: On -EAGAIN error only caller can retry on handle based calls - * since file handle passed in no longer valid. - */ + *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag); + rsp->iov_base = io_rsp; + rsp->iov_len = io_rsp_len; + *rsp_buftype = CIFS_LARGE_BUFFER; + CIFSSMBClose(xid, tcon, fid.netfid); + return 0; + +error: + cifs_buf_release(io_req); + CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 9bf8735cdd1e8..6b4d8effa79df 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -979,18 +979,13 @@ static int cifs_query_symlink(const unsigned int xid, char **target_path, struct kvec *rsp_iov) { + struct reparse_data_buffer *buf; + TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; + bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE); + u32 plen = le16_to_cpu(io->ByteCount); int rc; - int oplock = 0; - bool is_reparse_point = !!rsp_iov; - struct cifs_fid fid; - struct cifs_open_parms oparms; - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - - if (is_reparse_point) { - cifs_dbg(VFS, "reparse points not handled for SMB1 symlinks\n"); - return -EOPNOTSUPP; - } + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); /* Check for unix extensions */ if (cap_unix(tcon->ses)) { @@ -1001,37 +996,12 @@ static int cifs_query_symlink(const unsigned int xid, rc = cifs_unix_dfs_readlink(xid, tcon, full_path, target_path, cifs_sb->local_nls); - - goto out; + return rc; } - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = FILE_READ_ATTRIBUTES, - .create_options = cifs_create_options(cifs_sb, - OPEN_REPARSE_POINT), - .disposition = FILE_OPEN, - .path = full_path, - .fid = &fid, - }; - - rc = CIFS_open(xid, &oparms, &oplock, NULL); - if (rc) - goto out; - - rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, - cifs_sb->local_nls); - if (rc) - goto out_close; - - convert_delimiter(*target_path, '/'); -out_close: - CIFSSMBClose(xid, tcon, fid.netfid); -out: - if (!rc) - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); - return rc; + buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + + le32_to_cpu(io->DataOffset)); + return parse_reparse_point(buf, plen, cifs_sb, unicode, target_path); } static bool @@ -1214,6 +1184,7 @@ struct smb_version_operations smb1_operations = { .is_path_accessible = cifs_is_path_accessible, .can_echo = cifs_can_echo, .query_path_info = cifs_query_path_info, + .query_reparse_point = cifs_query_reparse_point, .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, .set_path_size = CIFSSMBSetEOF, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a959ed2c9b22e..e9c8cff0b1d28 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2894,27 +2894,26 @@ parse_reparse_posix(struct reparse_posix_data *symlink_buf, return 0; } -static int -parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, + u32 plen, bool unicode, char **target_path, + struct cifs_sb_info *cifs_sb) { unsigned int sub_len; unsigned int sub_offset; /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ - sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset); - sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength); + sub_offset = le16_to_cpu(sym->SubstituteNameOffset); + sub_len = le16_to_cpu(sym->SubstituteNameLength); if (sub_offset + 20 > plen || sub_offset + sub_len + 20 > plen) { cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); return -EIO; } - *target_path = cifs_strndup_from_utf16( - symlink_buf->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); + *target_path = cifs_strndup_from_utf16(sym->PathBuffer + sub_offset, + sub_len, unicode, + cifs_sb->local_nls); if (!(*target_path)) return -ENOMEM; @@ -2924,19 +2923,17 @@ parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, return 0; } -static int -parse_reparse_point(struct reparse_data_buffer *buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +int parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, struct cifs_sb_info *cifs_sb, + bool unicode, char **target_path) { - if (plen < sizeof(struct reparse_data_buffer)) { + if (plen < sizeof(*buf)) { cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n", plen); return -EIO; } - if (plen < le16_to_cpu(buf->ReparseDataLength) + - sizeof(struct reparse_data_buffer)) { + if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) { cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n", plen); return -EIO; @@ -2951,7 +2948,7 @@ parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, target_path, cifs_sb); + plen, unicode, target_path, cifs_sb); default: cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n", le32_to_cpu(buf->ReparseTag)); @@ -2970,11 +2967,11 @@ static int smb2_query_symlink(const unsigned int xid, struct smb2_ioctl_rsp *io = rsp_iov->iov_base; u32 plen = le32_to_cpu(io->OutputCount); - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + cifs_tcon_dbg(FYI, "%s: path: %s\n", __func__, full_path); buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, target_path, cifs_sb); + return parse_reparse_point(buf, plen, cifs_sb, true, target_path); } static int smb2_query_reparse_point(const unsigned int xid, From 539aad7f14dab7f947e5ab81901c0b20513a50db Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Nov 2023 20:12:53 -0300 Subject: [PATCH 542/560] smb: client: introduce ->parse_reparse_point() Parse reparse point into cifs_open_info_data structure and feed it through cifs_open_info_to_fattr(). Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 6 ++++-- fs/smb/client/inode.c | 23 +++++++++++++--------- fs/smb/client/smb1ops.c | 41 ++++++++++++++++++++++------------------ fs/smb/client/smb2ops.c | 28 ++++++++++++++------------- 4 files changed, 56 insertions(+), 42 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 6ffbd81bd109a..111daa4ff2610 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -395,8 +395,7 @@ struct smb_version_operations { struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, - struct kvec *rsp_iov); + char **target_path); /* open a file for non-posix mounts */ int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf); @@ -551,6 +550,9 @@ struct smb_version_operations { bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); + int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data); }; struct smb_version_values { diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 86fbd3f847d65..dd482de3dc3f3 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -459,8 +459,7 @@ static int cifs_get_unix_fattr(const unsigned char *full_path, return -EOPNOTSUPP; rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &fattr->cf_symlink_target, - NULL); + &fattr->cf_symlink_target); cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); } return rc; @@ -1035,22 +1034,28 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, if (!rc) iov = &rsp_iov; } + + rc = -EOPNOTSUPP; switch ((data->reparse_tag = tag)) { case 0: /* SMB1 symlink */ - iov = NULL; - fallthrough; - case IO_REPARSE_TAG_NFS: - case IO_REPARSE_TAG_SYMLINK: - if (!data->symlink_target && server->ops->query_symlink) { + if (server->ops->query_symlink) { rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &data->symlink_target, - iov); + &data->symlink_target); } break; case IO_REPARSE_TAG_MOUNT_POINT: cifs_create_junction_fattr(fattr, sb); + rc = 0; goto out; + default: + if (data->symlink_target) { + rc = 0; + } else if (server->ops->parse_reparse_point) { + rc = server->ops->parse_reparse_point(cifs_sb, + iov, data); + } + break; } cifs_open_info_to_fattr(fattr, data, sb); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 6b4d8effa79df..0dd599004e042 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -976,32 +976,36 @@ static int cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, - struct kvec *rsp_iov) + char **target_path) { - struct reparse_data_buffer *buf; - TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; - bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE); - u32 plen = le16_to_cpu(io->ByteCount); int rc; cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); - /* Check for unix extensions */ - if (cap_unix(tcon->ses)) { - rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc == -EREMOTE) - rc = cifs_unix_dfs_readlink(xid, tcon, full_path, - target_path, - cifs_sb->local_nls); - return rc; - } + if (!cap_unix(tcon->ses)) + return -EOPNOTSUPP; + + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, cifs_sb->local_nls); + return rc; +} + +static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data) +{ + struct reparse_data_buffer *buf; + TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; + bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE); + u32 plen = le16_to_cpu(io->ByteCount); buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, unicode, target_path); + return parse_reparse_point(buf, plen, cifs_sb, unicode, + &data->symlink_target); } static bool @@ -1200,6 +1204,7 @@ struct smb_version_operations smb1_operations = { .rename = CIFSSMBRename, .create_hardlink = CIFSCreateHardLink, .query_symlink = cifs_query_symlink, + .parse_reparse_point = cifs_parse_reparse_point, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e9c8cff0b1d28..2955eaa51d4da 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2949,6 +2949,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, plen, unicode, target_path, cifs_sb); + case IO_REPARSE_TAG_LX_SYMLINK: + case IO_REPARSE_TAG_AF_UNIX: + case IO_REPARSE_TAG_LX_FIFO: + case IO_REPARSE_TAG_LX_CHR: + case IO_REPARSE_TAG_LX_BLK: + return 0; default: cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n", le32_to_cpu(buf->ReparseTag)); @@ -2956,22 +2962,18 @@ int parse_reparse_point(struct reparse_data_buffer *buf, } } -static int smb2_query_symlink(const unsigned int xid, - struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - const char *full_path, - char **target_path, - struct kvec *rsp_iov) +static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data) { struct reparse_data_buffer *buf; struct smb2_ioctl_rsp *io = rsp_iov->iov_base; u32 plen = le32_to_cpu(io->OutputCount); - cifs_tcon_dbg(FYI, "%s: path: %s\n", __func__, full_path); - buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, true, target_path); + return parse_reparse_point(buf, plen, cifs_sb, + true, &data->symlink_target); } static int smb2_query_reparse_point(const unsigned int xid, @@ -5206,7 +5208,7 @@ struct smb_version_operations smb20_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5308,7 +5310,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5413,7 +5415,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5527,7 +5529,7 @@ struct smb_version_operations smb311_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, From 45e724022e2704b5a5193fd96f378822b0448e07 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Nov 2023 20:12:54 -0300 Subject: [PATCH 543/560] smb: client: set correct file type from NFS reparse points Handle all file types in NFS reparse points as specified in MS-FSCC 2.1.2.6 Network File System (NFS) Reparse Data Buffer. The client is now able to set all file types based on the parsed NFS reparse point, which used to support only symlinks. This works for SMB1+. Before patch: $ mount.cifs //srv/share /mnt -o ... $ ls -l /mnt ls: cannot access 'block': Operation not supported ls: cannot access 'char': Operation not supported ls: cannot access 'fifo': Operation not supported ls: cannot access 'sock': Operation not supported total 1 l????????? ? ? ? ? ? block l????????? ? ? ? ? ? char -rwxr-xr-x 1 root root 5 Nov 18 23:22 f0 l????????? ? ? ? ? ? fifo l--------- 1 root root 0 Nov 18 23:23 link -> f0 l????????? ? ? ? ? ? sock After patch: $ mount.cifs //srv/share /mnt -o ... $ ls -l /mnt total 1 brwxr-xr-x 1 root root 123, 123 Nov 18 00:34 block crwxr-xr-x 1 root root 1234, 1234 Nov 18 00:33 char -rwxr-xr-x 1 root root 5 Nov 18 23:22 f0 prwxr-xr-x 1 root root 0 Nov 18 23:23 fifo lrwxr-xr-x 1 root root 0 Nov 18 23:23 link -> f0 srwxr-xr-x 1 root root 0 Nov 19 2023 sock Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 8 ++- fs/smb/client/cifspdu.h | 2 +- fs/smb/client/cifsproto.h | 4 +- fs/smb/client/inode.c | 51 +++++++++++++++++-- fs/smb/client/readdir.c | 6 ++- fs/smb/client/smb1ops.c | 3 +- fs/smb/client/smb2inode.c | 2 +- fs/smb/client/smb2ops.c | 101 ++++++++++++++++++++------------------ 8 files changed, 116 insertions(+), 61 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 111daa4ff2610..7558167f603c3 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -191,7 +191,13 @@ struct cifs_open_info_data { bool reparse_point; bool symlink; }; - __u32 reparse_tag; + struct { + __u32 tag; + union { + struct reparse_data_buffer *buf; + struct reparse_posix_data *posix; + }; + } reparse; char *symlink_target; union { struct smb2_file_all_info fi; diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 2a90134331a48..83ccc51a54d03 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1509,7 +1509,7 @@ struct reparse_posix_data { __le16 ReparseDataLength; __u16 Reserved; __le64 InodeType; /* LNK, FIFO, CHR etc. */ - char PathBuffer[]; + __u8 DataBuffer[]; } __attribute__((packed)); struct cifs_quota_data { diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 8a739c10d6345..c00f844205590 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -210,7 +210,7 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, const struct cifs_fid *fid); bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - u32 tag); + struct cifs_open_info_data *data); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -667,7 +667,7 @@ char *extract_hostname(const char *unc); char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, - bool unicode, char **target_path); + bool unicode, struct cifs_open_info_data *data); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index dd482de3dc3f3..47f49be69ced2 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -721,10 +721,51 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } +static inline dev_t nfs_mkdev(struct reparse_posix_data *buf) +{ + u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer); + + return MKDEV(v >> 32, v & 0xffffffff); +} + bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, - u32 tag) + struct cifs_open_info_data *data) { + struct reparse_posix_data *buf = data->reparse.posix; + u32 tag = data->reparse.tag; + + if (tag == IO_REPARSE_TAG_NFS && buf) { + switch (le64_to_cpu(buf->InodeType)) { + case NFS_SPECFILE_CHR: + fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = nfs_mkdev(buf); + break; + case NFS_SPECFILE_BLK: + fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = nfs_mkdev(buf); + break; + case NFS_SPECFILE_FIFO: + fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_FIFO; + break; + case NFS_SPECFILE_SOCK: + fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_SOCK; + break; + case NFS_SPECFILE_LNK: + fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + default: + WARN_ON_ONCE(1); + return false; + } + return true; + } + switch (tag) { case IO_REPARSE_TAG_LX_SYMLINK: fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; @@ -790,7 +831,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); if (cifs_open_data_reparse(data) && - cifs_reparse_point_to_fattr(cifs_sb, fattr, data->reparse_tag)) + cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) goto out_reparse; if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { @@ -855,7 +896,7 @@ cifs_get_file_info(struct file *filp) data.adjust_tz = false; if (data.symlink_target) { data.symlink = true; - data.reparse_tag = IO_REPARSE_TAG_SYMLINK; + data.reparse.tag = IO_REPARSE_TAG_SYMLINK; } cifs_open_info_to_fattr(&fattr, &data, inode->i_sb); break; @@ -1024,7 +1065,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct kvec rsp_iov, *iov = NULL; int rsp_buftype = CIFS_NO_BUFFER; - u32 tag = data->reparse_tag; + u32 tag = data->reparse.tag; int rc = 0; if (!tag && server->ops->query_reparse_point) { @@ -1036,7 +1077,7 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, } rc = -EOPNOTSUPP; - switch ((data->reparse_tag = tag)) { + switch ((data->reparse.tag = tag)) { case 0: /* SMB1 symlink */ if (server->ops->query_symlink) { rc = server->ops->query_symlink(xid, tcon, diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 47fc22de8d20c..d30ea2005eb36 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -153,6 +153,10 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { + struct cifs_open_info_data data = { + .reparse = { .tag = fattr->cf_cifstag, }, + }; + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; @@ -165,7 +169,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * reasonably map some of them to directories vs. files vs. symlinks */ if ((fattr->cf_cifsattrs & ATTR_REPARSE) && - cifs_reparse_point_to_fattr(cifs_sb, fattr, fattr->cf_cifstag)) + cifs_reparse_point_to_fattr(cifs_sb, fattr, &data)) goto out_reparse; if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 0dd599004e042..64e25233e85de 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1004,8 +1004,7 @@ static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, unicode, - &data->symlink_target); + return parse_reparse_point(buf, plen, cifs_sb, unicode, data); } static bool diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 0b89f7008ac0f..c94940af5d4b8 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -555,7 +555,7 @@ static int parse_create_response(struct cifs_open_info_data *data, break; } data->reparse_point = reparse_point; - data->reparse_tag = tag; + data->reparse.tag = tag; return rc; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2955eaa51d4da..f01a929a7a3a5 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2866,89 +2866,95 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, return rc; } -static int -parse_reparse_posix(struct reparse_posix_data *symlink_buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ +static int parse_reparse_posix(struct reparse_posix_data *buf, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) { unsigned int len; - - /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ - len = le16_to_cpu(symlink_buf->ReparseDataLength); - - if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) { - cifs_dbg(VFS, "%lld not a supported symlink type\n", - le64_to_cpu(symlink_buf->InodeType)); + u64 type; + + switch ((type = le64_to_cpu(buf->InodeType))) { + case NFS_SPECFILE_LNK: + len = le16_to_cpu(buf->ReparseDataLength); + data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer, + len, true, + cifs_sb->local_nls); + if (!data->symlink_target) + return -ENOMEM; + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", + __func__, data->symlink_target); + break; + case NFS_SPECFILE_CHR: + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + break; + default: + cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n", + __func__, type); return -EOPNOTSUPP; } - - *target_path = cifs_strndup_from_utf16( - symlink_buf->PathBuffer, - len, true, cifs_sb->local_nls); - if (!(*target_path)) - return -ENOMEM; - - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); - return 0; } static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, - u32 plen, bool unicode, char **target_path, - struct cifs_sb_info *cifs_sb) + u32 plen, bool unicode, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) { - unsigned int sub_len; - unsigned int sub_offset; + unsigned int len; + unsigned int offs; /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ - sub_offset = le16_to_cpu(sym->SubstituteNameOffset); - sub_len = le16_to_cpu(sym->SubstituteNameLength); - if (sub_offset + 20 > plen || - sub_offset + sub_len + 20 > plen) { + offs = le16_to_cpu(sym->SubstituteNameOffset); + len = le16_to_cpu(sym->SubstituteNameLength); + if (offs + 20 > plen || offs + len + 20 > plen) { cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); return -EIO; } - *target_path = cifs_strndup_from_utf16(sym->PathBuffer + sub_offset, - sub_len, unicode, - cifs_sb->local_nls); - if (!(*target_path)) + data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs, + len, unicode, + cifs_sb->local_nls); + if (!data->symlink_target) return -ENOMEM; - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target); return 0; } int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, - bool unicode, char **target_path) + bool unicode, struct cifs_open_info_data *data) { if (plen < sizeof(*buf)) { - cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n", - plen); + cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n", + __func__, plen); return -EIO; } if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) { - cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n", - plen); + cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n", + __func__, plen); return -EIO; } + data->reparse.buf = buf; + /* See MS-FSCC 2.1.2 */ switch (le32_to_cpu(buf->ReparseTag)) { case IO_REPARSE_TAG_NFS: - return parse_reparse_posix( - (struct reparse_posix_data *)buf, - plen, target_path, cifs_sb); + return parse_reparse_posix((struct reparse_posix_data *)buf, + cifs_sb, data); case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, unicode, target_path, cifs_sb); + plen, unicode, cifs_sb, data); case IO_REPARSE_TAG_LX_SYMLINK: case IO_REPARSE_TAG_AF_UNIX: case IO_REPARSE_TAG_LX_FIFO: @@ -2956,8 +2962,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_BLK: return 0; default: - cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n", - le32_to_cpu(buf->ReparseTag)); + cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", + __func__, le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } @@ -2972,8 +2978,7 @@ static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, buf = (struct reparse_data_buffer *)((u8 *)io + le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, - true, &data->symlink_target); + return parse_reparse_point(buf, plen, cifs_sb, true, data); } static int smb2_query_reparse_point(const unsigned int xid, From b0348e459c836abdb0f4b967e006d15c77cf1c87 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 21 Nov 2023 20:12:55 -0300 Subject: [PATCH 544/560] smb: client: introduce cifs_sfu_make_node() Remove duplicate code and add new helper for creating special files in SFU (Services for UNIX) format that can be shared by SMB1+ code. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifsproto.h | 3 ++ fs/smb/client/smb1ops.c | 80 ++++------------------------------- fs/smb/client/smb2ops.c | 89 ++++++++++++++++++--------------------- 3 files changed, 52 insertions(+), 120 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c00f844205590..46feaa0880bdf 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -668,6 +668,9 @@ char *extract_sharename(const char *unc); int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data); +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 64e25233e85de..a9eaba8083b0d 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -1041,15 +1041,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; - int rc = -EPERM; - struct cifs_open_info_data buf = {}; - struct cifs_io_parms io_parms; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; + int rc; if (tcon->unix_ext) { /* @@ -1083,74 +1075,18 @@ cifs_make_node(unsigned int xid, struct inode *inode, d_instantiate(dentry, newinode); return rc; } - /* - * SMB1 SFU emulation: should work with all servers, but only - * support block and char device (no socket & fifo) + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return rc; - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - return rc; - - cifs_dbg(FYI, "sfu compat create special file\n"); - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); - if (rc) - return rc; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - - pdev = (struct win_dev *)&buf.fi; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = &buf.fi; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(dentry); - - /* FIXME: add code here to set EAs */ - - cifs_free_open_info(&buf); - return rc; + return -EPERM; + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); } - - struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index f01a929a7a3a5..82ab62fd00404 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5068,41 +5068,24 @@ smb2_next_header(char *buf) return le32_to_cpu(hdr->NextCommand); } -static int -smb2_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - int rc = -EPERM; struct cifs_open_info_data buf = {}; - struct cifs_io_parms io_parms = {0}; - __u32 oplock = 0; - struct cifs_fid fid; + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + struct cifs_io_parms io_parms = {}; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_fid fid; unsigned int bytes_written; struct win_dev *pdev; struct kvec iov[2]; - - /* - * Check if mounted with mount parm 'sfu' mount parm. - * SFU emulation should work with all servers, but only - * supports block and char device (no socket & fifo), - * and was used by default in earlier versions of Windows - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return rc; - - /* - * TODO: Add ability to create instead via reparse point. Windows (e.g. - * their current NFS server) uses this approach to expose special files - * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions - */ + __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; + int rc; if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) - return rc; - - cifs_dbg(FYI, "sfu compat create special file\n"); + return -EPERM; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -5115,11 +5098,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, .fid = &fid, }; - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, &buf); if (rc) return rc; @@ -5127,42 +5106,56 @@ smb2_make_node(unsigned int xid, struct inode *inode, * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = &buf.fi; - iov[1].iov_len = sizeof(struct win_dev); + io_parms.length = sizeof(*pdev); + iov[1].iov_base = pdev; + iov[1].iov_len = sizeof(*pdev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); pdev->major = cpu_to_le64(MAJOR(dev)); pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); pdev->major = cpu_to_le64(MAJOR(dev)); pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); } else if (S_ISFIFO(mode)) { memcpy(pdev->type, "LnxFIFO", 8); - pdev->major = 0; - pdev->minor = 0; - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(dentry); + rc = server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + server->ops->close(xid, tcon, &fid); + d_drop(dentry); /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); return rc; } +static int smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + return -EPERM; + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g. + * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); +} + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, From c0a8574204054effad6ac83cc75c02576e2985fe Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 19 Nov 2023 14:32:34 +0900 Subject: [PATCH 545/560] arm64: add dependency between vmlinuz.efi and Image A common issue in Makefile is a race in parallel building. You need to be careful to prevent multiple threads from writing to the same file simultaneously. Commit 3939f3345050 ("ARM: 8418/1: add boot image dependencies to not generate invalid images") addressed such a bad scenario. A similar symptom occurs with the following command: $ make -j$(nproc) ARCH=arm64 Image vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/arm64/boot/Image OBJCOPY arch/arm64/boot/Image AS arch/arm64/boot/zboot-header.o PAD arch/arm64/boot/vmlinux.bin GZIP arch/arm64/boot/vmlinuz OBJCOPY arch/arm64/boot/vmlinuz.o LD arch/arm64/boot/vmlinuz.efi.elf OBJCOPY arch/arm64/boot/vmlinuz.efi The log "OBJCOPY arch/arm64/boot/Image" is displayed twice. It indicates that two threads simultaneously enter arch/arm64/boot/ and write to arch/arm64/boot/Image. It occasionally leads to a build failure: $ make -j$(nproc) ARCH=arm64 Image vmlinuz.efi [ snip ] SORTTAB vmlinux OBJCOPY arch/arm64/boot/Image PAD arch/arm64/boot/vmlinux.bin truncate: Invalid number: 'arch/arm64/boot/vmlinux.bin' make[2]: *** [drivers/firmware/efi/libstub/Makefile.zboot:13: arch/arm64/boot/vmlinux.bin] Error 1 make[2]: *** Deleting file 'arch/arm64/boot/vmlinux.bin' make[1]: *** [arch/arm64/Makefile:163: vmlinuz.efi] Error 2 make[1]: *** Waiting for unfinished jobs.... make: *** [Makefile:234: __sub-make] Error 2 vmlinuz.efi depends on Image, but such a dependency is not specified in arch/arm64/Makefile. Signed-off-by: Masahiro Yamada Acked-by: Ard Biesheuvel Reviewed-by: SImon Glass Link: https://lore.kernel.org/r/20231119053234.2367621-1-masahiroy@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4bd85cc0d32bf..9a2d3723cd0fa 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -158,7 +158,7 @@ endif all: $(notdir $(KBUILD_IMAGE)) - +vmlinuz.efi: Image Image vmlinuz.efi: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ From bca4104b00fec60be330cd32818dd5c70db3d469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Nov 2023 12:41:26 +0100 Subject: [PATCH 546/560] lockdep: Fix block chain corruption Kent reported an occasional KASAN splat in lockdep. Mark then noted: > I suspect the dodgy access is to chain_block_buckets[-1], which hits the last 4 > bytes of the redzone and gets (incorrectly/misleadingly) attributed to > nr_large_chain_blocks. That would mean @size == 0, at which point size_to_bucket() returns -1 and the above happens. alloc_chain_hlocks() has 'size - req', for the first with the precondition 'size >= rq', which allows the 0. This code is trying to split a block, del_chain_block() takes what we need, and add_chain_block() puts back the remainder, except in the above case the remainder is 0 sized and things go sideways. Fixes: 810507fe6fd5 ("locking/lockdep: Reuse freed chain_hlocks entries") Reported-by: Kent Overstreet Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kent Overstreet Link: https://lkml.kernel.org/r/20231121114126.GH8262@noisy.programming.kicks-ass.net --- kernel/locking/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e85b5ad3e2069..151bd3de59363 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3497,7 +3497,8 @@ static int alloc_chain_hlocks(int req) size = chain_block_size(curr); if (likely(size >= req)) { del_chain_block(0, size, chain_block_next(curr)); - add_chain_block(curr + req, size - req); + if (size > req) + add_chain_block(curr + req, size - req); return curr; } } From 0167236e7d66c5e1e85d902a6abc2529b7544539 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 26 Oct 2023 01:25:07 +0100 Subject: [PATCH 547/560] afs: Return ENOENT if no cell DNS record can be found Make AFS return error ENOENT if no cell SRV or AFSDB DNS record (or cellservdb config file record) can be found rather than returning EDESTADDRREQ. Also add cell name lookup info to the cursor dump. Fixes: d5c32c89b208 ("afs: Fix cell DNS lookup") Reported-by: Markus Suvanto Link: https://bugzilla.kernel.org/show_bug.cgi?id=216637 Signed-off-by: David Howells Reviewed-by: Marc Dionne cc: linux-afs@lists.infradead.org --- fs/afs/vl_rotate.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index 488e58490b16e..eb415ce563600 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -58,6 +58,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) } /* Status load is ordered after lookup counter load */ + if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { + pr_warn("No record of cell %s\n", cell->name); + vc->error = -ENOENT; + return false; + } + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { vc->error = -EDESTADDRREQ; return false; @@ -285,6 +291,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) */ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) { + struct afs_cell *cell = vc->cell; static int count; int i; @@ -294,6 +301,9 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); + pr_notice("CELL: %s err=%d\n", cell->name, cell->error); + pr_notice("DNS: src=%u st=%u lc=%x\n", + cell->dns_source, cell->dns_status, cell->dns_lookup_count); pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); From b590eb41be766c5a63acc7e8896a042f7a4e8293 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Nov 2023 22:03:28 +0000 Subject: [PATCH 548/560] afs: Fix file locking on R/O volumes to operate in local mode AFS doesn't really do locking on R/O volumes as fileservers don't maintain state with each other and thus a lock on a R/O volume file on one fileserver will not be be visible to someone looking at the same file on another fileserver. Further, the server may return an error if you try it. Fix this by doing what other AFS clients do and handle filelocking on R/O volume files entirely within the client and don't touch the server. Fixes: 6c6c1d63c243 ("afs: Provide mount-time configurable byte-range file locking emulation") Signed-off-by: David Howells Reviewed-by: Marc Dionne cc: linux-afs@lists.infradead.org --- fs/afs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/afs/super.c b/fs/afs/super.c index 95d713074dc81..e95fb4cb4fcd2 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -407,6 +407,8 @@ static int afs_validate_fc(struct fs_context *fc) return PTR_ERR(volume); ctx->volume = volume; + if (volume->type != AFSVL_RWVOL) + ctx->flock_mode = afs_flock_mode_local; } return 0; From 68516f60c1d8b0a71e516d630f66b99cb50e0150 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2023 16:24:00 +0000 Subject: [PATCH 549/560] afs: Mark a superblock for an R/O or Backup volume as SB_RDONLY Mark a superblock that is for for an R/O or Backup volume as SB_RDONLY when mounting it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- fs/afs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/afs/super.c b/fs/afs/super.c index e95fb4cb4fcd2..a01a0fb2cdbb5 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -407,8 +407,10 @@ static int afs_validate_fc(struct fs_context *fc) return PTR_ERR(volume); ctx->volume = volume; - if (volume->type != AFSVL_RWVOL) + if (volume->type != AFSVL_RWVOL) { ctx->flock_mode = afs_flock_mode_local; + fc->sb_flags |= SB_RDONLY; + } } return 0; From e11d4cccd094a7cd4696c8c42e672c76c092dad5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 20 Nov 2023 15:37:50 +0100 Subject: [PATCH 550/560] parisc: Mark ex_table entries 32-bit aligned in assembly.h Add an align statement to tell the linker that all ex_table entries and as such the whole ex_table section should be 32-bit aligned in vmlinux and modules. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/assembly.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h index 75677b526b2bb..74d17d7e759da 100644 --- a/arch/parisc/include/asm/assembly.h +++ b/arch/parisc/include/asm/assembly.h @@ -574,6 +574,7 @@ */ #define ASM_EXCEPTIONTABLE_ENTRY(fault_addr, except_addr) \ .section __ex_table,"aw" ! \ + .align 4 ! \ .word (fault_addr - .), (except_addr - .) ! \ .previous From a80aeb86542a50aa8521729ea4cc731ee7174f03 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 20 Nov 2023 15:39:03 +0100 Subject: [PATCH 551/560] parisc: Mark ex_table entries 32-bit aligned in uaccess.h Add an align statement to tell the linker that all ex_table entries and as such the whole ex_table section should be 32-bit aligned in vmlinux and modules. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/uaccess.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 2bf660eabe421..4165079898d9e 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -41,6 +41,7 @@ struct exception_table_entry { #define ASM_EXCEPTIONTABLE_ENTRY( fault_addr, except_addr )\ ".section __ex_table,\"aw\"\n" \ + ".align 4\n" \ ".word (" #fault_addr " - .), (" #except_addr " - .)\n\t" \ ".previous\n" From 33f806da2df68606f77d7b892cd1298ba3d463e8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 20 Nov 2023 23:10:20 +0100 Subject: [PATCH 552/560] parisc: Mark altinstructions read-only and 32-bit aligned Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/alternative.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/parisc/include/asm/alternative.h b/arch/parisc/include/asm/alternative.h index 1ed45fd085d3b..1eb488f25b838 100644 --- a/arch/parisc/include/asm/alternative.h +++ b/arch/parisc/include/asm/alternative.h @@ -34,7 +34,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end, /* Alternative SMP implementation. */ #define ALTERNATIVE(cond, replacement) "!0:" \ - ".section .altinstructions, \"aw\" !" \ + ".section .altinstructions, \"a\" !" \ + ".align 4 !" \ ".word (0b-4-.) !" \ ".hword 1, " __stringify(cond) " !" \ ".word " __stringify(replacement) " !" \ @@ -44,7 +45,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end, /* to replace one single instructions by a new instruction */ #define ALTERNATIVE(from, to, cond, replacement)\ - .section .altinstructions, "aw" ! \ + .section .altinstructions, "a" ! \ + .align 4 ! \ .word (from - .) ! \ .hword (to - from)/4, cond ! \ .word replacement ! \ @@ -52,7 +54,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end, /* to replace multiple instructions by new code */ #define ALTERNATIVE_CODE(from, num_instructions, cond, new_instr_ptr)\ - .section .altinstructions, "aw" ! \ + .section .altinstructions, "a" ! \ + .align 4 ! \ .word (from - .) ! \ .hword -num_instructions, cond ! \ .word (new_instr_ptr - .) ! \ From 07eecff8ae78df7f28800484d31337e1f9bfca3a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 20 Nov 2023 23:14:39 +0100 Subject: [PATCH 553/560] parisc: Mark jump_table naturally aligned The jump_table stores two 32-bit words and one 32- (on 32-bit kernel) or one 64-bit word (on 64-bit kernel). Ensure that the last word is always 64-bit aligned on a 64-bit kernel by aligning the whole structure on sizeof(long). Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/jump_label.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h index af2a598bc0f81..94428798b6aa6 100644 --- a/arch/parisc/include/asm/jump_label.h +++ b/arch/parisc/include/asm/jump_label.h @@ -15,10 +15,12 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool bran asm_volatile_goto("1:\n\t" "nop\n\t" ".pushsection __jump_table, \"aw\"\n\t" + ".align %1\n\t" ".word 1b - ., %l[l_yes] - .\n\t" __stringify(ASM_ULONG_INSN) " %c0 - .\n\t" ".popsection\n\t" - : : "i" (&((char *)key)[branch]) : : l_yes); + : : "i" (&((char *)key)[branch]), "i" (sizeof(long)) + : : l_yes); return false; l_yes: @@ -30,10 +32,12 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, bool asm_volatile_goto("1:\n\t" "b,n %l[l_yes]\n\t" ".pushsection __jump_table, \"aw\"\n\t" + ".align %1\n\t" ".word 1b - ., %l[l_yes] - .\n\t" __stringify(ASM_ULONG_INSN) " %c0 - .\n\t" ".popsection\n\t" - : : "i" (&((char *)key)[branch]) : : l_yes); + : : "i" (&((char *)key)[branch]), "i" (sizeof(long)) + : : l_yes); return false; l_yes: From b28fc0d8739c03e7b6c44914a9d00d4c6dddc0ea Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 25 Nov 2023 09:11:56 +0100 Subject: [PATCH 554/560] parisc: Mark lock_aligned variables 16-byte aligned on SMP On parisc we need 16-byte alignment for variables which are used for locking. Mark the __lock_aligned attribute acordingly so that the .data..lock_aligned section will get that alignment in the generated object files. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/ldcw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index ee9e071859b2f..47ebc4c91eaff 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -55,7 +55,7 @@ }) #ifdef CONFIG_SMP -# define __lock_aligned __section(".data..lock_aligned") +# define __lock_aligned __section(".data..lock_aligned") __aligned(16) #endif #endif /* __PARISC_LDCW_H */ From c9fcb2b65c2849e8ff3be23fd8828312fb68dc19 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 25 Nov 2023 09:16:02 +0100 Subject: [PATCH 555/560] parisc: Ensure 32-bit alignment on parisc unwind section Make sure the .PARISC.unwind section will be 32-bit aligned. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 58694d1989c23..548051b0b4aff 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -130,6 +130,7 @@ SECTIONS RO_DATA(8) /* unwind info */ + . = ALIGN(4); .PARISC.unwind : { __start___unwind = .; *(.PARISC.unwind) From fe76a1349f235969381832c83d703bc911021eb6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 20 Nov 2023 23:30:49 +0100 Subject: [PATCH 556/560] parisc: Use natural CPU alignment for bug_table Make sure that the __bug_table section gets 32- or 64-bit aligned, depending if a 32- or 64-bit kernel is being built. Mark it non-writeable and use .blockz instead of the .org assembler directive to pad the struct. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # v6.0+ --- arch/parisc/include/asm/bug.h | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 4b6d60b941247..b9cad0bb4461b 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -28,13 +28,15 @@ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ - "\t.pushsection __bug_table,\"aw\"\n" \ + "\t.pushsection __bug_table,\"a\"\n" \ + "\t.align %4\n" \ "2:\t" ASM_WORD_INSN "1b, %c0\n" \ - "\t.short %c1, %c2\n" \ - "\t.org 2b+%c3\n" \ + "\t.short %1, %2\n" \ + "\t.blockz %3-2*%4-2*2\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (0), "i" (sizeof(struct bug_entry)) ); \ + "i" (0), "i" (sizeof(struct bug_entry)), \ + "i" (sizeof(long)) ); \ unreachable(); \ } while(0) @@ -51,27 +53,31 @@ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ - "\t.pushsection __bug_table,\"aw\"\n" \ + "\t.pushsection __bug_table,\"a\"\n" \ + "\t.align %4\n" \ "2:\t" ASM_WORD_INSN "1b, %c0\n" \ - "\t.short %c1, %c2\n" \ - "\t.org 2b+%c3\n" \ + "\t.short %1, %2\n" \ + "\t.blockz %3-2*%4-2*2\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ - "i" (sizeof(struct bug_entry)) ); \ + "i" (sizeof(struct bug_entry)), \ + "i" (sizeof(long)) ); \ } while(0) #else #define __WARN_FLAGS(flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ - "\t.pushsection __bug_table,\"aw\"\n" \ + "\t.pushsection __bug_table,\"a\"\n" \ + "\t.align %2\n" \ "2:\t" ASM_WORD_INSN "1b\n" \ - "\t.short %c0\n" \ - "\t.org 2b+%c1\n" \ + "\t.short %0\n" \ + "\t.blockz %1-%2-2\n" \ "\t.popsection" \ : : "i" (BUGFLAG_WARNING|(flags)), \ - "i" (sizeof(struct bug_entry)) ); \ + "i" (sizeof(struct bug_entry)), \ + "i" (sizeof(long)) ); \ } while(0) #endif From e5f3e299a2b1e9c3ece24a38adfc089aef307e8a Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 23 Nov 2023 20:28:27 +0100 Subject: [PATCH 557/560] parisc: Drop the HP-UX ENOSYM and EREMOTERELEASE error codes Those return codes are only defined for the parisc architecture and are leftovers from when we wanted to be HP-UX compatible. They are not returned by any Linux kernel syscall but do trigger problems with the glibc strerrorname_np() and strerror() functions as reported in glibc issue #31080. There is no need to keep them, so simply remove them. Signed-off-by: Helge Deller Reported-by: Bruno Haible Closes: https://sourceware.org/bugzilla/show_bug.cgi?id=31080 Cc: stable@vger.kernel.org --- arch/parisc/include/uapi/asm/errno.h | 2 -- lib/errname.c | 6 ------ tools/arch/parisc/include/uapi/asm/errno.h | 2 -- 3 files changed, 10 deletions(-) diff --git a/arch/parisc/include/uapi/asm/errno.h b/arch/parisc/include/uapi/asm/errno.h index 87245c584784e..8d94739d75c67 100644 --- a/arch/parisc/include/uapi/asm/errno.h +++ b/arch/parisc/include/uapi/asm/errno.h @@ -75,7 +75,6 @@ /* We now return you to your regularly scheduled HPUX. */ -#define ENOSYM 215 /* symbol does not exist in executable */ #define ENOTSOCK 216 /* Socket operation on non-socket */ #define EDESTADDRREQ 217 /* Destination address required */ #define EMSGSIZE 218 /* Message too long */ @@ -101,7 +100,6 @@ #define ETIMEDOUT 238 /* Connection timed out */ #define ECONNREFUSED 239 /* Connection refused */ #define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ -#define EREMOTERELEASE 240 /* Remote peer released connection */ #define EHOSTDOWN 241 /* Host is down */ #define EHOSTUNREACH 242 /* No route to host */ diff --git a/lib/errname.c b/lib/errname.c index dd1b998552cd9..4f9112b38f3ad 100644 --- a/lib/errname.c +++ b/lib/errname.c @@ -111,9 +111,6 @@ static const char *names_0[] = { E(ENOSPC), E(ENOSR), E(ENOSTR), -#ifdef ENOSYM - E(ENOSYM), -#endif E(ENOSYS), E(ENOTBLK), E(ENOTCONN), @@ -144,9 +141,6 @@ static const char *names_0[] = { #endif E(EREMOTE), E(EREMOTEIO), -#ifdef EREMOTERELEASE - E(EREMOTERELEASE), -#endif E(ERESTART), E(ERFKILL), E(EROFS), diff --git a/tools/arch/parisc/include/uapi/asm/errno.h b/tools/arch/parisc/include/uapi/asm/errno.h index 87245c584784e..8d94739d75c67 100644 --- a/tools/arch/parisc/include/uapi/asm/errno.h +++ b/tools/arch/parisc/include/uapi/asm/errno.h @@ -75,7 +75,6 @@ /* We now return you to your regularly scheduled HPUX. */ -#define ENOSYM 215 /* symbol does not exist in executable */ #define ENOTSOCK 216 /* Socket operation on non-socket */ #define EDESTADDRREQ 217 /* Destination address required */ #define EMSGSIZE 218 /* Message too long */ @@ -101,7 +100,6 @@ #define ETIMEDOUT 238 /* Connection timed out */ #define ECONNREFUSED 239 /* Connection refused */ #define EREFUSED ECONNREFUSED /* for HP's NFS apparently */ -#define EREMOTERELEASE 240 /* Remote peer released connection */ #define EHOSTDOWN 241 /* Host is down */ #define EHOSTUNREACH 242 /* No route to host */ From 43266838515d30dc0c45d5c7e6e7edacee6cce92 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 23 Nov 2023 21:57:19 +0100 Subject: [PATCH 558/560] parisc: Reduce size of the bug_table on 64-bit kernel by half Enable GENERIC_BUG_RELATIVE_POINTERS which will store 32-bit relative offsets to the bug address and the source file name instead of 64-bit absolute addresses. This effectively reduces the size of the bug_table[] array by half on 64-bit kernels. Signed-off-by: Helge Deller --- arch/parisc/Kconfig | 7 +++++-- arch/parisc/include/asm/bug.h | 34 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index a7c9c0e69e5ab..d14ccc948a29b 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -115,9 +115,12 @@ config ARCH_HAS_ILOG2_U64 default n config GENERIC_BUG - bool - default y + def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if 64BIT + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT bool diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index b9cad0bb4461b..1641ff9a8b83e 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -17,26 +17,27 @@ #define PARISC_BUG_BREAK_ASM "break 0x1f, 0x1fff" #define PARISC_BUG_BREAK_INSN 0x03ffe01f /* PARISC_BUG_BREAK_ASM */ -#if defined(CONFIG_64BIT) -#define ASM_WORD_INSN ".dword\t" +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS +# define __BUG_REL(val) ".word " __stringify(val) " - ." #else -#define ASM_WORD_INSN ".word\t" +# define __BUG_REL(val) ".word " __stringify(val) #endif + #ifdef CONFIG_DEBUG_BUGVERBOSE #define BUG() \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ "\t.pushsection __bug_table,\"a\"\n" \ - "\t.align %4\n" \ - "2:\t" ASM_WORD_INSN "1b, %c0\n" \ + "\t.align 4\n" \ + "2:\t" __BUG_REL(1b) "\n" \ + "\t" __BUG_REL(%c0) "\n" \ "\t.short %1, %2\n" \ - "\t.blockz %3-2*%4-2*2\n" \ + "\t.blockz %3-2*4-2*2\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (0), "i" (sizeof(struct bug_entry)), \ - "i" (sizeof(long)) ); \ + "i" (0), "i" (sizeof(struct bug_entry)) ); \ unreachable(); \ } while(0) @@ -54,15 +55,15 @@ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ "\t.pushsection __bug_table,\"a\"\n" \ - "\t.align %4\n" \ - "2:\t" ASM_WORD_INSN "1b, %c0\n" \ + "\t.align 4\n" \ + "2:\t" __BUG_REL(1b) "\n" \ + "\t" __BUG_REL(%c0) "\n" \ "\t.short %1, %2\n" \ - "\t.blockz %3-2*%4-2*2\n" \ + "\t.blockz %3-2*4-2*2\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ - "i" (sizeof(struct bug_entry)), \ - "i" (sizeof(long)) ); \ + "i" (sizeof(struct bug_entry)) ); \ } while(0) #else #define __WARN_FLAGS(flags) \ @@ -71,13 +72,12 @@ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ "\t.pushsection __bug_table,\"a\"\n" \ "\t.align %2\n" \ - "2:\t" ASM_WORD_INSN "1b\n" \ + "2:\t" __BUG_REL(1b) "\n" \ "\t.short %0\n" \ - "\t.blockz %1-%2-2\n" \ + "\t.blockz %1-4-2\n" \ "\t.popsection" \ : : "i" (BUGFLAG_WARNING|(flags)), \ - "i" (sizeof(struct bug_entry)), \ - "i" (sizeof(long)) ); \ + "i" (sizeof(struct bug_entry)) ); \ } while(0) #endif From 2cc14f52aeb78ce3f29677c2de1f06c0e91471ab Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Nov 2023 19:59:33 -0800 Subject: [PATCH 559/560] Linux 6.7-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 724c79bebe727..99db546fbb452 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From 826a5d8c9df9605fb4fdefa45432f95580241a1f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Oct 2023 21:42:57 +0300 Subject: [PATCH 560/560] device property: Implement device_is_big_endian() Some users want to use the struct device pointer to see if the device is big endian in terms of Open Firmware specifications, i.e. if it has a "big-endian" property, or if the kernel was compiled for BE *and* the device has a "native-endian" property. Provide inline helper for the users. Signed-off-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20231025184259.250588-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a86..55c2692ffa8ca 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -80,12 +80,38 @@ int fwnode_property_match_string(const struct fwnode_handle *fwnode, bool fwnode_device_is_available(const struct fwnode_handle *fwnode); +static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode) +{ + if (fwnode_property_present(fwnode, "big-endian")) + return true; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && + fwnode_property_present(fwnode, "native-endian")) + return true; + return false; +} + static inline bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat) { return fwnode_property_match_string(fwnode, "compatible", compat) >= 0; } +/** + * device_is_big_endian - check if a device has BE registers + * @dev: Pointer to the struct device + * + * Returns: true if the device has a "big-endian" property, or if the kernel + * was compiled for BE *and* the device has a "native-endian" property. + * Returns false otherwise. + * + * Callers would nominally use ioread32be/iowrite32be if + * device_is_big_endian() == true, or readl/writel otherwise. + */ +static inline bool device_is_big_endian(const struct device *dev) +{ + return fwnode_device_is_big_endian(dev_fwnode(dev)); +} + /** * device_is_compatible - match 'compatible' property of the device with a given string * @dev: Pointer to the struct device