diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index 7352ab89a55ae..c12837e61bda5 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -135,6 +135,10 @@ Thread-related topology information in the kernel: The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo "core_id." + - topology_logical_core_id(); + + The logical core ID to which a thread belongs. + System topology examples diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index b0bc68da6a116..33205dd410e47 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = cpuhw->usedss; raw.frag.data = cpuhw->stop; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 1e99514fb7ae3..5f60248cb4687 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. */ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, ®s, sde_regs)) + if (perf_event_exclude(event, ®s, sde_regs)) goto out; if (perf_event_overflow(event, &data, ®s)) { overflow = 1; diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index fa73254542661..10725f5a6f0fd 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index 7f462bef1fc07..a8f0bad99cf04 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump, if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.frag.size = rawsize; raw.frag.data = cpump->save; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b4a1a2576510e..30d6ceb4c8ad4 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e91970b01d624..e7a8b8758e088 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -31,6 +31,8 @@ static u32 ibs_caps; #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT +/* attr.config2 */ +#define IBS_SW_FILTER_MASK 1 /* * IBS states: @@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event) if (has_branch_stack(event)) return -EOPNOTSUPP; + /* handle exclude_{user,kernel} in the IRQ handler */ + if (event->attr.exclude_host || event->attr.exclude_guest || + event->attr.exclude_idle) + return -EINVAL; + + if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && + (event->attr.exclude_kernel || event->attr.exclude_user || + event->attr.exclude_hv)) + return -EINVAL; + ret = validate_group(event); if (ret) return ret; @@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = { NULL, }; -static struct attribute_group empty_format_group = { - .name = "format", - .attrs = attrs_empty, -}; - static struct attribute_group empty_caps_group = { .name = "caps", .attrs = attrs_empty, }; -static const struct attribute_group *empty_attr_groups[] = { - &empty_format_group, - &empty_caps_group, - NULL, -}; - PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); @@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } -static struct attribute *rand_en_attrs[] = { +static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, + &format_attr_swfilt.attr, NULL, }; @@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; -static struct attribute_group group_rand_en = { +static struct attribute_group group_fetch_formats = { .name = "format", - .attrs = rand_en_attrs, + .attrs = fetch_attrs, }; static struct attribute_group group_fetch_l3missonly = { @@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = { }; static const struct attribute_group *fetch_attr_groups[] = { - &group_rand_en, + &group_fetch_formats, &empty_caps_group, NULL, }; @@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0; } +static struct attribute *op_attrs[] = { + &format_attr_swfilt.attr, + NULL, +}; + static struct attribute *cnt_ctl_attrs[] = { &format_attr_cnt_ctl.attr, NULL, @@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = { NULL, }; +static struct attribute_group group_op_formats = { + .name = "format", + .attrs = op_attrs, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static const struct attribute_group *op_attr_groups[] = { + &group_op_formats, + &empty_caps_group, + NULL, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, @@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_CONFIG_MASK, @@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, @@ -1111,6 +1128,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && + perf_exclude_event(event, ®s)) { + throttle = perf_event_account_interrupt(event); + goto out; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ .frag = { @@ -1118,7 +1141,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) .data = ibs_data.data, }, }; - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); } if (perf_ibs == &perf_ibs_op) @@ -1129,8 +1152,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(&data, event, iregs); + perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); out: @@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_ZEN4) perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; - perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_groups = op_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c75c482d4c52f..8f218ac0d445c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) - perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 66ecfc8840762..7601196d1d18e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2826,6 +2826,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event) return; idx = INTEL_PMC_IDX_FIXED_SLOTS; + + if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR) + bits |= INTEL_FIXED_3_METRICS_CLEAR; } intel_set_masks(event, idx); @@ -4081,7 +4084,12 @@ static int intel_pmu_hw_config(struct perf_event *event) * is used in a metrics group, it too cannot support sampling. */ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { - if (event->attr.config1 || event->attr.config2) + /* The metrics_clear can only be set for the slots event */ + if (event->attr.config1 && + (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) + return -EINVAL; + + if (event->attr.config2) return -EINVAL; /* @@ -4690,6 +4698,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" ); PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ +PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + static ssize_t umask2_show(struct device *dev, struct device_attribute *attr, char *page) @@ -4709,6 +4719,7 @@ static struct device_attribute format_attr_umask2 = static struct attribute *format_evtsel_ext_attrs[] = { &format_attr_umask2.attr, &format_attr_eq.attr, + &format_attr_metrics_clear.attr, NULL }; @@ -4733,6 +4744,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) if (i == 1) return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; + /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + if (i == 2) { + union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap); + + return intel_cap.rdpmc_metrics_clear ? attr->mode : 0; + } + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6ba6549f26fac..ba74e11983280 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); /* * We use the interrupt regs as a base because the PEBS record does not @@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 3) setup_pebs_time(event, data, pebs->tsc); - if (has_branch_stack(event)) - perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); + perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL); } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, } #define PEBS_LATENCY_MASK 0xffff -#define PEBS_CACHE_LATENCY_OFFSET 32 -#define PEBS_RETIRE_LATENCY_OFFSET 32 /* * With adaptive PEBS the layout depends on what fields are configured. @@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type; - u64 format_size; + u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; @@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs->xmm_regs = NULL; sample_type = event->attr.sample_type; - format_size = basic->format_size; + format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; @@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) - perf_sample_save_callchain(data, event, iregs); + perf_sample_save_callchain(data, event, iregs); *regs = *iregs; /* The ip in basic is EventingIP */ @@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; + data->weight.var3_w = basic->retire_latency; else data->weight.var3_w = 0; } @@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * But PERF_SAMPLE_TRANSACTION needs gprs->ax. * Save the pointer here but process later. */ - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { meminfo = next_record; next_record = meminfo + 1; } - if (format_size & PEBS_DATACFG_GP) { + if (format_group & PEBS_DATACFG_GP) { gprs = next_record; next_record = gprs + 1; @@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, adaptive_pebs_save_regs(regs, gprs); } - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 weight = meminfo->latency; + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { - data->weight.var2_w = weight & PEBS_LATENCY_MASK; - weight >>= PEBS_CACHE_LATENCY_OFFSET; - } + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) + data->weight.var2_w = meminfo->instr_latency; /* * Although meminfo::latency is defined as a u64, @@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * in practice on Ice Lake and earlier platforms. */ if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = weight ?: + data->weight.full = latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } else { - data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + data->weight.var1_dw = (u32)latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } @@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - if (format_size & PEBS_DATACFG_XMMS) { + if (format_group & PEBS_DATACFG_XMMS) { struct pebs_xmm *xmm = next_record; next_record = xmm + 1; perf_regs->xmm_regs = xmm->xmm; } - if (format_size & PEBS_DATACFG_LBRS) { + if (format_group & PEBS_DATACFG_LBRS) { struct lbr_entry *lbr = next_record; - int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; next_record = next_record + num_lbr * sizeof(struct lbr_entry); @@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - WARN_ONCE(next_record != __pebs + (format_size >> 48), - "PEBS record size %llu, expected %llu, config %llx\n", - format_size >> 48, + WARN_ONCE(next_record != __pebs + basic->format_size, + "PEBS record size %u, expected %llu, config %llx\n", + basic->format_size, (u64)(next_record - __pebs), - basic->format_size); + format_group); } static inline void * @@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } +typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *, + struct perf_sample_data *, struct pt_regs *); + +static struct pt_regs dummy_iregs; + static __always_inline void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, + struct pt_regs *regs, struct perf_sample_data *data, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) + void *at, + setup_fn setup_sample) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct x86_perf_regs perf_regs; - struct pt_regs *regs = &perf_regs.regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - static struct pt_regs dummy_iregs; - - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); - } else if (!intel_pmu_save_and_restart(event)) - return; - - if (!iregs) - iregs = &dummy_iregs; + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); +} - while (count > 1) { - setup_sample(event, iregs, at, data, regs); - perf_event_output(event, data, regs); - at += cpuc->pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } +static __always_inline void +__intel_pmu_pebs_last_event(struct perf_event *event, + struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, + int count, + setup_fn setup_sample) +{ + struct hw_perf_event *hwc = &event->hw; setup_sample(event, iregs, at, data, regs); if (iregs == &dummy_iregs) { @@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event, if (perf_event_overflow(event, data, regs)) x86_pmu_stop(event, 0); } + + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } else + intel_pmu_save_and_restart(event); +} + +static __always_inline void +__intel_pmu_pebs_events(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + int cnt = count; + + if (!iregs) + iregs = &dummy_iregs; + + while (cnt > 1) { + __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample); + at += cpuc->pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + cnt--; + } + + __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) @@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ return; } - __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n, + setup_pebs_fixed_sample_data); } static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) @@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } @@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + struct pebs_basic *basic; struct perf_event *event; void *base, *at, *top; int bit; @@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d return; } - for (at = base; at < top; at += cpuc->pebs_record_size) { + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top; at += basic->format_size) { u64 pebs_status; - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; - pebs_status &= mask; + basic = at; + if (basic->format_size != cpuc->pebs_record_size) + continue; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) - counts[bit]++; + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], + setup_pebs_adaptive_sample_data); + } + last[bit] = at; + } } for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (counts[bit] == 0) + if (!counts[bit]) continue; event = cpuc->events[bit]; - if (WARN_ON_ONCE(!event)) - continue; - - if (WARN_ON_ONCE(!event->attr.precise_ip)) - continue; - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_pebs_adaptive_sample_data); } } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index e7aba7349231d..60b3078b7502f 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -745,7 +745,7 @@ static int uncore_pmu_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ @@ -992,7 +992,7 @@ static void uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) +static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; size_t size; @@ -1005,7 +1005,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = setid ? i : -1; pmus[i].pmu_idx = i; pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); @@ -1055,12 +1054,12 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } static int __init -uncore_types_init(struct intel_uncore_type **types, bool setid) +uncore_types_init(struct intel_uncore_type **types) { int ret; for (; *types; types++) { - ret = uncore_type_init(*types, setid); + ret = uncore_type_init(*types); if (ret) return ret; } @@ -1160,11 +1159,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, if (!box) return -ENOMEM; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - atomic_inc(&box->refcnt); box->dieid = die; box->pci_dev = pdev; @@ -1410,7 +1404,7 @@ static int __init uncore_pci_init(void) goto err; } - ret = uncore_types_init(uncore_pci_uncores, false); + ret = uncore_types_init(uncore_pci_uncores); if (ret) goto errtype; @@ -1678,7 +1672,7 @@ static int __init uncore_cpu_init(void) { int ret; - ret = uncore_types_init(uncore_msr_uncores, true); + ret = uncore_types_init(uncore_msr_uncores); if (ret) goto err; @@ -1697,7 +1691,7 @@ static int __init uncore_mmio_init(void) struct intel_uncore_type **types = uncore_mmio_uncores; int ret; - ret = uncore_types_init(types, true); + ret = uncore_types_init(types); if (ret) goto err; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79ff32e13dcc8..3dcb88c0ecfa3 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct intel_uncore_pmu { struct pmu pmu; char name[UNCORE_PMU_NAME_LEN]; int pmu_idx; - int func_id; bool registered; atomic_t activeboxes; cpumask_t cpu_mask; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3934e1e4e3b16..edb7fd50efe02 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -910,7 +910,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ca98744343b89..60973c209c0e6 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6684,17 +6684,8 @@ void spr_uncore_mmio_init(void) /* GNR uncore support */ #define UNCORE_GNR_NUM_UNCORE_TYPES 23 -#define UNCORE_GNR_TYPE_15 15 -#define UNCORE_GNR_B2UPI 18 -#define UNCORE_GNR_TYPE_21 21 -#define UNCORE_GNR_TYPE_22 22 int gnr_uncore_units_ignore[] = { - UNCORE_SPR_UPI, - UNCORE_GNR_TYPE_15, - UNCORE_GNR_B2UPI, - UNCORE_GNR_TYPE_21, - UNCORE_GNR_TYPE_22, UNCORE_IGNORE_END }; @@ -6703,6 +6694,31 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct intel_uncore_type gnr_uncore_pciex8 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex8", +}; + +static struct intel_uncore_type gnr_uncore_pciex16 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex16", +}; + +static struct intel_uncore_type gnr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + +static struct intel_uncore_type gnr_uncore_b2upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "b2upi", +}; + +static struct intel_uncore_type gnr_uncore_b2hot = { + .name = "b2hot", + .attr_update = uncore_alias_groups, +}; + static struct intel_uncore_type gnr_uncore_b2cmi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "b2cmi", @@ -6727,21 +6743,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { &gnr_uncore_ubox, &spr_uncore_imc, NULL, + &gnr_uncore_upi, NULL, NULL, NULL, + &spr_uncore_cxlcm, + &spr_uncore_cxldp, NULL, - NULL, - NULL, - NULL, - NULL, + &gnr_uncore_b2hot, &gnr_uncore_b2cmi, &gnr_uncore_b2cxl, - NULL, + &gnr_uncore_b2upi, NULL, &gnr_uncore_mdf_sbo, - NULL, - NULL, + &gnr_uncore_pciex16, + &gnr_uncore_pciex8, }; static struct freerunning_counters gnr_iio_freerunning[] = { diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 82c6f45ce9755..31c2771545a6c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -624,6 +624,7 @@ union perf_capabilities { u64 pebs_output_pt_available:1; u64 pebs_timing_info:1; u64 anythread_deprecated:1; + u64 rdpmc_metrics_clear:1; }; u64 capabilities; }; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index a8defc813c369..d3bb3865c1b1f 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -39,6 +39,10 @@ * event: rapl_energy_psys * perf code: 0x5 * + * core counter: consumption of a single physical core + * event: rapl_energy_core (power_core PMU) + * perf code: 0x1 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -70,18 +74,22 @@ MODULE_LICENSE("GPL"); /* * RAPL energy status counters */ -enum perf_rapl_events { +enum perf_rapl_pkg_events { PERF_RAPL_PP0 = 0, /* all cores */ PERF_RAPL_PKG, /* entire package */ PERF_RAPL_RAM, /* DRAM */ PERF_RAPL_PP1, /* gpu */ PERF_RAPL_PSYS, /* psys */ - PERF_RAPL_MAX, - NR_RAPL_DOMAINS = PERF_RAPL_MAX, + PERF_RAPL_PKG_EVENTS_MAX, + NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, }; -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { +#define PERF_RAPL_CORE 0 /* single core */ +#define PERF_RAPL_CORE_EVENTS_MAX 1 +#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX + +static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { "pp0-core", "package", "dram", @@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "psys", }; +static const char *const rapl_core_domain_name __initconst = "core"; + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ * considered as either pkg-scope or die-scope, and we are considering * them as die-scope. */ -#define rapl_pmu_is_pkg_scope() \ +#define rapl_pkg_pmu_is_pkg_scope() \ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) @@ -129,7 +139,8 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; unsigned int nr_rapl_pmu; - struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); + unsigned int cntr_mask; + struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); }; enum rapl_unit_quirk { @@ -139,44 +150,43 @@ enum rapl_unit_quirk { }; struct rapl_model { - struct perf_msr *rapl_msrs; - unsigned long events; + struct perf_msr *rapl_pkg_msrs; + struct perf_msr *rapl_core_msrs; + unsigned long pkg_events; + unsigned long core_events; unsigned int msr_power_unit; enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; -static struct rapl_pmus *rapl_pmus; -static unsigned int rapl_cntr_mask; +static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; +static int rapl_core_hw_unit __read_mostly; +static struct rapl_pmus *rapl_pmus_pkg; +static struct rapl_pmus *rapl_pmus_core; static u64 rapl_timer_ms; -static struct perf_msr *rapl_msrs; +static struct rapl_model *rapl_model; /* - * Helper functions to get the correct topology macros according to the + * Helper function to get the correct topology id according to the * RAPL PMU scope. */ -static inline unsigned int get_rapl_pmu_idx(int cpu) -{ - return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : - topology_logical_die_id(cpu); -} - -static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) -{ - return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : - topology_die_cpumask(cpu); -} - -static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) { - unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); - /* - * The unsigned check also catches the '-1' return value for non - * existent mappings in the topology map. + * Returns unsigned int, which converts the '-1' return value + * (for non-existent mappings in topology map) to UINT_MAX, so + * the error check in the caller is simplified. */ - return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; + switch (scope) { + case PERF_PMU_SCOPE_PKG: + return topology_logical_package_id(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_logical_die_id(cpu); + case PERF_PMU_SCOPE_CORE: + return topology_logical_core_id(cpu); + default: + return -EINVAL; + } } static inline u64 rapl_read_counter(struct perf_event *event) @@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -static inline u64 rapl_scale(u64 v, int cfg) +static inline u64 rapl_scale(u64 v, struct perf_event *event) { - if (cfg > NR_RAPL_DOMAINS) { - pr_warn("Invalid domain %d, failed to scale data\n", cfg); - return v; - } + int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; + + if (event->pmu->scope == PERF_PMU_SCOPE_CORE) + hw_unit = rapl_core_hw_unit; + /* * scale delta to smallest unit (1/2^32) * users must then scale back: count * 1/(1e9*2^32) to get Joules * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - rapl_hw_unit[cfg - 1]); + return v << (32 - hw_unit); } static u64 rapl_event_update(struct perf_event *event) @@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - sdelta = rapl_scale(delta, event->hw.config); + sdelta = rapl_scale(delta, event); local64_add(sdelta, &event->count); @@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; - if (!pmu->n_active) + if (!rapl_pmu->n_active) return HRTIMER_NORESTART; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); - list_for_each_entry(event, &pmu->active_list, active_entry) + list_for_each_entry(event, &rapl_pmu->active_list, active_entry) rapl_event_update(event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); - hrtimer_forward_now(hrtimer, pmu->timer_interval); + hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); return HRTIMER_RESTART; } -static void rapl_hrtimer_init(struct rapl_pmu *pmu) +static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) { - struct hrtimer *hr = &pmu->hrtimer; + struct hrtimer *hr = &rapl_pmu->hrtimer; hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hr->function = rapl_hrtimer_handle; } -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, +static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, struct perf_event *event) { if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) @@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, event->hw.state = 0; - list_add_tail(&event->active_entry, &pmu->active_list); + list_add_tail(&event->active_entry, &rapl_pmu->active_list); local64_set(&event->hw.prev_count, rapl_read_counter(event)); - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); + rapl_pmu->n_active++; + if (rapl_pmu->n_active == 1) + rapl_start_hrtimer(rapl_pmu); } static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + __rapl_pmu_event_start(rapl_pmu, event); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - hrtimer_cancel(&pmu->hrtimer); + WARN_ON_ONCE(rapl_pmu->n_active <= 0); + rapl_pmu->n_active--; + if (rapl_pmu->n_active == 0) + hrtimer_cancel(&rapl_pmu->hrtimer); list_del(&event->active_entry); @@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); } static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = event->pmu_private; + struct rapl_pmu *rapl_pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; - raw_spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&rapl_pmu->lock, flags); hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); + __rapl_pmu_event_start(rapl_pmu, event); - raw_spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); return 0; } @@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, ret = 0; - struct rapl_pmu *pmu; + int bit, rapl_pmus_scope, ret = 0; + struct rapl_pmu *rapl_pmu; + unsigned int rapl_pmu_idx; + struct rapl_pmus *rapl_pmus; - /* only look at RAPL events */ - if (event->attr.type != rapl_pmus->pmu.type) - return -ENOENT; + /* unsupported modes and filters */ + if (event->attr.sample_period) /* no sampling */ + return -EINVAL; /* check only supported bits are set */ if (event->attr.config & ~RAPL_EVENT_MASK) @@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); + if (!rapl_pmus) + return -EINVAL; + rapl_pmus_scope = rapl_pmus->pmu.scope; + + if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { + /* only look at RAPL package events */ + if (event->attr.type != rapl_pmus_pkg->pmu.type) + return -ENOENT; + + cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + + bit = cfg - 1; + event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; + } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { + /* only look at RAPL core events */ + if (event->attr.type != rapl_pmus_core->pmu.type) + return -ENOENT; + + cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); + if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) + return -EINVAL; + + bit = cfg - 1; + event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; + } else return -EINVAL; - - cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); - bit = cfg - 1; /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) + if (!(rapl_pmus->cntr_mask & (1 << bit))) return -EINVAL; - /* unsupported modes and filters */ - if (event->attr.sample_period) /* no sampling */ + rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); + if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) return -EINVAL; - /* must be done before validate_group */ - pmu = cpu_to_rapl_pmu(event->cpu); - if (!pmu) + rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; + if (!rapl_pmu) return -EINVAL; - event->pmu_private = pmu; - event->hw.event_base = rapl_msrs[bit].msr; + + event->pmu_private = rapl_pmu; event->hw.config = cfg; event->hw.idx = bit; @@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); +RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR @@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); /* * There are no default events, but we need to create @@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = { NULL, }; +static const struct attribute_group *rapl_core_attr_groups[] = { + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), EVENT_PTR(rapl_cores_unit), @@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { .attrs = rapl_events_psys, }; +static struct attribute *rapl_events_core[] = { + EVENT_PTR(rapl_core), + EVENT_PTR(rapl_core_unit), + EVENT_PTR(rapl_core_scale), + NULL, +}; + +static struct attribute_group rapl_events_core_group = { + .name = "events", + .attrs = rapl_events_core, +}; + static bool test_msr(int idx, void *data) { return test_bit(idx, (unsigned long *) data); @@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { }; /* - * Force to PERF_RAPL_MAX size due to: - * - perf_msr_probe(PERF_RAPL_MAX) + * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: + * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) * - want to use same event codes across both architectures */ -static struct perf_msr amd_rapl_msrs[] = { +static struct perf_msr amd_rapl_pkg_msrs[] = { [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, @@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = { [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, }; -static int rapl_check_hw_unit(struct rapl_model *rm) +static struct perf_msr amd_rapl_core_msrs[] = { + [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, + test_msr, false, RAPL_MSR_MASK }, +}; + +static int rapl_check_hw_unit(void) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) + rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - switch (rm->unit_quirk) { + rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + switch (rapl_model->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See @@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm) * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ case RAPL_UNIT_QUIRK_INTEL_HSW: - rapl_hw_unit[PERF_RAPL_RAM] = 16; + rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; break; /* SPR uses a fixed energy unit for Psys domain. */ case RAPL_UNIT_QUIRK_INTEL_SPR: - rapl_hw_unit[PERF_RAPL_PSYS] = 0; + rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; break; default: break; } - /* * Calculate the timer rate: * Use reference of 200W for scaling the timeout to avoid counter @@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm) * if hw unit is 32, then we use 2 ms 1/200/2 */ rapl_timer_ms = 2; - if (rapl_hw_unit[0] < 32) { + if (rapl_pkg_hw_unit[0] < 32) { rapl_timer_ms = (1000 / (2 * 100)); - rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); } return 0; } @@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm) static void __init rapl_advertise(void) { int i; + int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); + + if (rapl_pmus_core) + num_counters += hweight32(rapl_pmus_core->cntr_mask); pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), rapl_timer_ms); + num_counters, rapl_timer_ms); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { + for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { + if (rapl_pmus_pkg->cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); + rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); } } + + if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_core_domain_name, rapl_core_hw_unit); } -static void cleanup_rapl_pmus(void) +static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) { int i; for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) - kfree(rapl_pmus->pmus[i]); + kfree(rapl_pmus->rapl_pmu[i]); kfree(rapl_pmus); } @@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = { NULL, }; -static int __init init_rapl_pmu(void) +static const struct attribute_group *rapl_core_attr_update[] = { + &rapl_events_core_group, + NULL, +}; + +static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) { - struct rapl_pmu *pmu; + struct rapl_pmu *rapl_pmu; int idx; for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { - pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); - if (!pmu) + rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); + if (!rapl_pmu) goto free; - raw_spin_lock_init(&pmu->lock); - INIT_LIST_HEAD(&pmu->active_list); - pmu->pmu = &rapl_pmus->pmu; - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - rapl_hrtimer_init(pmu); + raw_spin_lock_init(&rapl_pmu->lock); + INIT_LIST_HEAD(&rapl_pmu->active_list); + rapl_pmu->pmu = &rapl_pmus->pmu; + rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); + rapl_hrtimer_init(rapl_pmu); - rapl_pmus->pmus[idx] = pmu; + rapl_pmus->rapl_pmu[idx] = rapl_pmu; } return 0; free: for (; idx > 0; idx--) - kfree(rapl_pmus->pmus[idx - 1]); + kfree(rapl_pmus->rapl_pmu[idx - 1]); return -ENOMEM; } -static int __init init_rapl_pmus(void) +static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, + const struct attribute_group **rapl_attr_groups, + const struct attribute_group **rapl_attr_update) { int nr_rapl_pmu = topology_max_packages(); - int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; + struct rapl_pmus *rapl_pmus; - if (!rapl_pmu_is_pkg_scope()) { - nr_rapl_pmu *= topology_max_dies_per_package(); - rapl_pmu_scope = PERF_PMU_SCOPE_DIE; - } + /* + * rapl_pmu_scope must be either PKG, DIE or CORE + */ + if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) + nr_rapl_pmu *= topology_max_dies_per_package(); + else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) + nr_rapl_pmu *= topology_num_cores_per_package(); + else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) + return -EINVAL; - rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); + rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; + *rapl_pmus_ptr = rapl_pmus; + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; rapl_pmus->pmu.attr_groups = rapl_attr_groups; rapl_pmus->pmu.attr_update = rapl_attr_update; @@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.module = THIS_MODULE; rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; - return init_rapl_pmu(); + return init_rapl_pmu(rapl_pmus); } static struct rapl_model model_snb = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { - .events = BIT(PERF_RAPL_PKG) | + .pkg_events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_msrs, + .rapl_pkg_msrs = intel_rapl_msrs, }; static struct rapl_model model_spr = { - .events = BIT(PERF_RAPL_PP0) | + .pkg_events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PSYS), .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, .msr_power_unit = MSR_RAPL_POWER_UNIT, - .rapl_msrs = intel_rapl_spr_msrs, + .rapl_pkg_msrs = intel_rapl_spr_msrs, }; static struct rapl_model model_amd_hygon = { - .events = BIT(PERF_RAPL_PKG), + .pkg_events = BIT(PERF_RAPL_PKG), + .core_events = BIT(PERF_RAPL_CORE), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, - .rapl_msrs = amd_rapl_msrs, + .rapl_pkg_msrs = amd_rapl_pkg_msrs, + .rapl_core_msrs = amd_rapl_core_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; - struct rapl_model *rm; + int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; int ret; + if (rapl_pkg_pmu_is_pkg_scope()) + rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; + id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; - rm = (struct rapl_model *) id->driver_data; - - rapl_msrs = rm->rapl_msrs; + rapl_model = (struct rapl_model *) id->driver_data; - rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, - false, (void *) &rm->events); - - ret = rapl_check_hw_unit(rm); + ret = rapl_check_hw_unit(); if (ret) return ret; - ret = init_rapl_pmus(); + ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, + rapl_attr_update); if (ret) return ret; - ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); + rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, + PERF_RAPL_PKG_EVENTS_MAX, false, + (void *) &rapl_model->pkg_events); + + ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); if (ret) goto out; + if (rapl_model->core_events) { + ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, + rapl_core_attr_groups, + rapl_core_attr_update); + if (ret) { + pr_warn("power-core PMU initialization failed (%d)\n", ret); + goto core_init_failed; + } + + rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, + PERF_RAPL_CORE_EVENTS_MAX, false, + (void *) &rapl_model->core_events); + + ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); + if (ret) { + pr_warn("power-core PMU registration failed (%d)\n", ret); + cleanup_rapl_pmus(rapl_pmus_core); + } + } + +core_init_failed: rapl_advertise(); return 0; out: pr_warn("Initialization failed (%d), disabled\n", ret); - cleanup_rapl_pmus(); + cleanup_rapl_pmus(rapl_pmus_pkg); return ret; } module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { - perf_pmu_unregister(&rapl_pmus->pmu); - cleanup_rapl_pmus(); + if (rapl_pmus_core) { + perf_pmu_unregister(&rapl_pmus_core->pmu); + cleanup_rapl_pmus(rapl_pmus_core); + } + perf_pmu_unregister(&rapl_pmus_pkg->pmu); + cleanup_rapl_pmus(rapl_pmus_pkg); } module_exit(intel_rapl_exit); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index d95f902acc521..1ac79f3616456 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -41,6 +41,7 @@ #define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) @@ -372,6 +373,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code) #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_NUM 8 +#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 +#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) + static inline bool is_metric_idx(int idx) { return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; @@ -422,7 +426,9 @@ static inline bool is_topdown_idx(int idx) */ struct pebs_basic { - u64 format_size; + u64 format_group:32, + retire_latency:16, + format_size:16; u64 ip; u64 applicable_counters; u64 tsc; @@ -431,7 +437,17 @@ struct pebs_basic { struct pebs_meminfo { u64 address; u64 aux; - u64 latency; + union { + /* pre Alder Lake */ + u64 mem_latency; + /* Alder Lake and later */ + struct { + u64 instr_latency:16; + u64 pad2:16; + u64 cache_latency:16; + u64 pad3:16; + }; + }; u64 tsx_tuning; }; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 20e6009381ed6..c0cd10182e90c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -98,6 +98,7 @@ struct cpuinfo_topology { // Logical ID mappings u32 logical_pkg_id; u32 logical_die_id; + u32 logical_core_id; // AMD Node ID and Nodes per Package info u32 amd_node_id; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index fd41103ad342c..3973cb9bb2e69 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 10719aba62768..cacfd3f6abeff 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 8277c64f88dbf..b5a5e14114699 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) if (!early) { c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); } /* Package relative core ID */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf3551..f02925447e590 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee2856620..825c04b564034 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -727,7 +727,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -921,6 +921,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -929,7 +932,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba7..45a21faa3ff62 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,68 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) +{ + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); +} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) { - mmap_assert_write_locked(mm); /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); } -#else -static inline void vma_end_write_all(struct mm_struct *mm) {} -#endif + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +140,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +150,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cb99ec8c9e96f..8333f132f4a96 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -1287,12 +1292,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, + struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) + return; + do { sum += frag->size; if (perf_raw_frag_last(frag)) @@ -1309,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1316,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); @@ -1669,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1705,11 +1728,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; @@ -1879,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index eb20dcaa51b59..d1a2346cf0f82 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e0a4c20822458..b1df7d792fa16 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -124,6 +125,10 @@ struct uprobe_task { unsigned int depth; struct return_instance *return_instances; + struct return_instance *ri_pool; + struct timer_list ri_timer; + seqcount_t ri_seqcount; + union { struct { struct arch_uprobe_task autask; @@ -137,7 +142,6 @@ struct uprobe_task { }; struct uprobe *active_uprobe; - struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; @@ -154,12 +158,18 @@ struct return_instance { unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ - int consumers_cnt; + int cons_cnt; /* total number of session consumers */ struct return_instance *next; /* keep as stack */ struct rcu_head rcu; - struct return_consumer consumers[] __counted_by(consumers_cnt); + /* singular pre-allocated return_consumer instance for common case */ + struct return_consumer consumer; + /* + * extra return_consumer instances for rare cases of multiple session consumers, + * contains (cons_cnt - 1) elements + */ + struct return_consumer *extra_consumers; } ____cacheline_aligned; enum rp_check { diff --git a/kernel/events/core.c b/kernel/events/core.c index 065f9188b44a0..bcb09e011e9e1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6277,41 +6277,6 @@ void perf_event_update_userpage(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) -{ - struct perf_event *event = vmf->vma->vm_file->private_data; - struct perf_buffer *rb; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { @@ -6551,13 +6516,87 @@ static void perf_mmap_close(struct vm_area_struct *vma) ring_buffer_put(rb); /* could be last */ } +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) +{ + /* The first page is the user control page, others are read-only. */ + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, + .pfn_mkwrite = perf_mmap_pfn_mkwrite, }; +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) +{ + unsigned long nr_pages = vma_pages(vma); + int err = 0; + unsigned long pagenum; + + /* + * We map this as a VM_PFNMAP VMA. + * + * This is not ideal as this is designed broadly for mappings of PFNs + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which + * !pfn_valid(pfn). + * + * We are mapping kernel-allocated memory (memory we manage ourselves) + * which would more ideally be mapped using vm_insert_page() or a + * similar mechanism, that is as a VM_MIXEDMAP mapping. + * + * However this won't work here, because: + * + * 1. It uses vma->vm_page_prot, but this field has not been completely + * setup at the point of the f_op->mmp() hook, so we are unable to + * indicate that this should be mapped CoW in order that the + * mkwrite() hook can be invoked to make the first page R/W and the + * rest R/O as desired. + * + * 2. Anything other than a VM_PFNMAP of valid PFNs will result in + * vm_normal_page() returning a struct page * pointer, which means + * vm_ops->page_mkwrite() will be invoked rather than + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping + * to work around retry logic in the fault handler, however this + * field is no longer allowed to be used within struct page. + * + * 3. Having a struct page * made available in the fault logic also + * means that the page gets put on the rmap and becomes + * inappropriately accessible and subject to map and ref counting. + * + * Ideally we would have a mechanism that could explicitly express our + * desires, but this is not currently the case, so we instead use + * VM_PFNMAP. + * + * We manage the lifetime of these mappings with internal refcounts (see + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of + * this mapping is maintained correctly. + */ + for (pagenum = 0; pagenum < nr_pages; pagenum++) { + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); + + if (page == NULL) { + err = -EINVAL; + break; + } + + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); + if (err) + break; + } + +#ifdef CONFIG_MMU + /* Clear any partial mappings on error. */ + if (err) + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); +#endif + + return err; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; @@ -6682,6 +6721,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto again; } + /* We need the rb to map pages. */ + rb = event->rb; goto unlock; } @@ -6776,6 +6817,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; + if (!ret) + ret = map_range(rb, vma); + if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); @@ -10039,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -10425,9 +10468,9 @@ static struct pmu perf_tracepoint = { }; static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -10439,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) @@ -10450,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event, if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -10476,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; @@ -10485,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record, /* Cannot deliver synchronous signal to other task. */ if (event->attr.sigtrap) return; - if (perf_tp_event_match(event, data, regs)) + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); + } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); @@ -10499,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record, struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } @@ -10525,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) { - perf_swevent_event(event, count, &data, regs); - + if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and @@ -10543,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); + perf_swevent_event(event, count, &data, regs); } } @@ -10560,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; raw_spin_lock(&ctx->lock); - perf_tp_event_target_task(count, record, regs, &data, ctx); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4f46f688d0d49..180509132d4b6 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx) struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); - page->mapping = NULL; __free_page(page); } @@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr) { struct page *page = virt_to_page(addr); - page->mapping = NULL; __free_page(page); } @@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - static void rb_free_work(struct work_struct *work) { struct perf_buffer *rb; - void *base; - int i, nr; rb = container_of(work, struct perf_buffer, work); - nr = data_page_nr(rb); - - base = rb->user_page; - /* The '<=' counts in the user page. */ - for (i = 0; i <= nr; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - vfree(base); + vfree(rb->user_page); kfree(rb); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d71ef85420c5..e421a5f2ec7d0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1888,9 +1888,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) +static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { - struct return_instance *next = ri->next; + ri->cons_cnt = 0; + ri->next = utask->ri_pool; + utask->ri_pool = ri; +} + +static struct return_instance *ri_pool_pop(struct uprobe_task *utask) +{ + struct return_instance *ri = utask->ri_pool; + + if (likely(ri)) + utask->ri_pool = ri->next; + + return ri; +} + +static void ri_free(struct return_instance *ri) +{ + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); +} + +static void free_ret_instance(struct uprobe_task *utask, + struct return_instance *ri, bool cleanup_hprobe) +{ + unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; @@ -1899,8 +1923,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo hprobe_finalize(&ri->hprobe, hstate); } - kfree_rcu(ri, rcu); - return next; + /* + * At this point return_instance is unlinked from utask's + * return_instances list and this has become visible to ri_timer(). + * If seqcount now indicates that ri_timer's return instance + * processing loop isn't active, we can return ri into the pool of + * to-be-reused return instances for future uretprobes. If ri_timer() + * happens to be running right now, though, we fallback to safety and + * just perform RCU-delated freeing of ri. + */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } } /* @@ -1910,7 +1948,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri; + struct return_instance *ri, *ri_next; if (!utask) return; @@ -1921,8 +1959,19 @@ void uprobe_free_utask(struct task_struct *t) timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; - while (ri) - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + while (ri) { + ri_next = ri->next; + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } + + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); + ri = ri_next; + } kfree(utask); } @@ -1942,8 +1991,12 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); + write_seqcount_begin(&utask->ri_seqcount); + for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); + + write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) @@ -1955,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); return utask; } @@ -1974,32 +2028,40 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static size_t ri_size(int consumers_cnt) +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; - return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; -} - -#define DEF_CNT 4 - -static struct return_instance *alloc_return_instance(void) -{ - struct return_instance *ri; + ri = ri_pool_pop(utask); + if (ri) + return ri; - ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; - ri->consumers_cnt = DEF_CNT; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { - size_t size = ri_size(old->consumers_cnt); + struct return_instance *ri; + + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; - return kmemdup(old, size, GFP_KERNEL); + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } + + return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) @@ -2108,14 +2170,17 @@ unsigned long uprobe_get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { - struct return_instance *ri = utask->return_instances; + struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; + + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; } - rcu_assign_pointer(utask->return_instances, ri); } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, @@ -2180,7 +2245,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, return; free: - kfree(ri); + ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ @@ -2294,6 +2359,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { @@ -2301,10 +2407,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { - if (valid_vma(vma, false)) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); @@ -2324,25 +2434,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } -static struct return_instance* -push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { + struct return_consumer *ric; + if (unlikely(ri == ZERO_SIZE_PTR)) return ri; - if (unlikely(idx >= ri->consumers_cnt)) { - struct return_instance *old_ri = ri; - - ri->consumers_cnt += DEF_CNT; - ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); - if (!ri) { - kfree(old_ri); + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + ri_free(ri); return ZERO_SIZE_PTR; } + ri->extra_consumers = ric; } - ri->consumers[idx].id = id; - ri->consumers[idx].cookie = cookie; + ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; return ri; } @@ -2350,14 +2462,17 @@ static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; - int idx = *iter; + int idx; - for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } + return NULL; } @@ -2371,9 +2486,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; - int push_idx = 0; + struct uprobe_task *utask = current->utask; - current->utask->auprobe = &uprobe->arch; + utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; @@ -2393,21 +2508,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) continue; if (!ri) - ri = alloc_return_instance(); + ri = alloc_return_instance(utask); if (session) - ri = push_consumer(ri, push_idx++, uc->id, cookie); + ri = push_consumer(ri, uc->id, cookie); } - current->utask->auprobe = NULL; + utask->auprobe = NULL; - if (!ZERO_OR_NULL_PTR(ri)) { - /* - * The push_idx value has the final number of return consumers, - * and ri->consumers_cnt has number of allocated consumers. - */ - ri->consumers_cnt = push_idx; + if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); - } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2461,7 +2570,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *next; + struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; @@ -2481,8 +2590,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ - next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -2494,7 +2603,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - rcu_assign_pointer(utask->return_instances, ri->next); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) @@ -2502,9 +2613,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ - ri = free_ret_instance(ri, false /* !cleanup_hprobe */); - utask->depth--; - } while (ri != next); + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); } while (!valid); return; diff --git a/kernel/fork.c b/kernel/fork.c index 9b301180fd416..ded49f18cd95c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } @@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b8db5aee9d38..9f2f65767639d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -619,7 +619,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -644,6 +645,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -687,9 +690,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -748,9 +750,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); diff --git a/mm/gup.c b/mm/gup.c index 3b75e631f3691..87a9c2026e4db 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3360,8 +3360,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end, return 0; if (gup_flags & FOLL_PIN) { - seq = raw_read_seqcount(¤t->mm->write_protect_seq); - if (seq & 1) + if (!raw_seqcount_try_begin(¤t->mm->write_protect_seq, seq)) return 0; } diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c8093792745..6af3ad675930b 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,7 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 8fab5e13c7c3b..9bcf1736bf182 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, * begun. Linking to the tree will have caused this to be incremented, * which means we will get a false positive otherwise. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index e76ff579e1fdc..1d9fc97b8e80f 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -241,7 +241,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; struct vma_lock *vm_lock; #endif @@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; }