diff options
-rw-r--r-- | Documentation/virt/kvm/api.rst | 78 | ||||
-rw-r--r-- | arch/x86/events/intel/core.c | 1 | ||||
-rw-r--r-- | arch/x86/events/intel/ds.c | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 15 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 29 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 286 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.h | 13 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 23 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 230 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 12 | ||||
-rw-r--r-- | include/uapi/linux/kvm.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 381 |
16 files changed, 896 insertions, 198 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9807b05a1b57..83e3acc9e321 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5005,6 +5005,15 @@ using this ioctl. :Parameters: struct kvm_pmu_event_filter (in) :Returns: 0 on success, -1 on error +Errors: + + ====== ============================================================ + EFAULT args[0] cannot be accessed + EINVAL args[0] contains invalid data in the filter or filter events + E2BIG nevents is too large + EBUSY not enough memory to allocate the filter + ====== ============================================================ + :: struct kvm_pmu_event_filter { @@ -5016,14 +5025,69 @@ using this ioctl. __u64 events[0]; }; -This ioctl restricts the set of PMU events that the guest can program. -The argument holds a list of events which will be allowed or denied. -The eventsel+umask of each event the guest attempts to program is compared -against the events field to determine whether the guest should have access. -The events field only controls general purpose counters; fixed purpose -counters are controlled by the fixed_counter_bitmap. +This ioctl restricts the set of PMU events the guest can program by limiting +which event select and unit mask combinations are permitted. + +The argument holds a list of filter events which will be allowed or denied. + +Filter events only control general purpose counters; fixed purpose counters +are controlled by the fixed_counter_bitmap. + +Valid values for 'flags':: + +``0`` + +To use this mode, clear the 'flags' field. + +In this mode each event will contain an event select + unit mask. + +When the guest attempts to program the PMU the guest's event select + +unit mask is compared against the filter events to determine whether the +guest should have access. + +``KVM_PMU_EVENT_FLAG_MASKED_EVENTS`` +:Capability: KVM_CAP_PMU_EVENT_MASKED_EVENTS + +In this mode each filter event will contain an event select, mask, match, and +exclude value. To encode a masked event use:: + + KVM_PMU_ENCODE_MASKED_ENTRY() + +An encoded event will follow this layout:: + + Bits Description + ---- ----------- + 7:0 event select (low bits) + 15:8 umask match + 31:16 unused + 35:32 event select (high bits) + 36:54 unused + 55 exclude bit + 63:56 umask mask + +When the guest attempts to program the PMU, these steps are followed in +determining if the guest should have access: + + 1. Match the event select from the guest against the filter events. + 2. If a match is found, match the guest's unit mask to the mask and match + values of the included filter events. + I.e. (unit mask & mask) == match && !exclude. + 3. If a match is found, match the guest's unit mask to the mask and match + values of the excluded filter events. + I.e. (unit mask & mask) == match && exclude. + 4. + a. If an included match is found and an excluded match is not found, filter + the event. + b. For everything else, do not filter the event. + 5. + a. If the event is filtered and it's an allow list, allow the guest to + program the event. + b. If the event is filtered and it's a deny list, do not allow the guest to + program the event. -No flags are defined yet, the field must be zero. +When setting a new pmu event filter, -EINVAL will be returned if any of the +unused fields are set or if any of the high bits (35:32) in the event +select are set when called on Intel. Valid values for 'action':: diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dfd2c124cdf8..aa53d042b943 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6348,6 +6348,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; x86_pmu.extra_regs = intel_spr_extra_regs; x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_ept = 1; x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 88e58b6ee73c..d8a404b91b7e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2303,8 +2303,10 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; - case 4: case 5: + x86_pmu.pebs_ept = 1; + fallthrough; + case 4: x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b38a4cb2e29..37983871ed61 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -514,6 +514,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; @@ -1151,6 +1152,18 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[]; +}; + enum kvm_apicv_inhibit { /********************************************************************/ @@ -1368,7 +1381,7 @@ struct kvm_arch { /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; - struct kvm_pmu_event_filter __rcu *pmu_event_filter; + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index bde47f3a8c9d..7f467fe05d42 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -526,6 +526,35 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) + +/* + * Masked event layout. + * Bits Description + * ---- ----------- + * 7:0 event select (low bits) + * 15:8 umask match + * 31:16 unused + * 35:32 event select (high bits) + * 36:54 unused + * 55 exclude bit + * 63:56 umask mask + */ + +#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ + (((mask) & 0xFFULL) << 56) | \ + (((match) & 0xFFULL) << 8) | \ + ((__u64)(!!(exclude)) << 55)) + +#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ + (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f610b07ddcf4..b28fd020066f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1448,8 +1448,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; @@ -1570,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } @@ -1626,7 +1624,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } @@ -1691,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = APIC_BUS_FREQUENCY; break; default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 58e5a456273a..612e6c70ce2e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,9 +29,18 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); -static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { +/* Precise Distribution of Instructions Retired (PDIR) */ +static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + /* Instruction-Accurate PDIR (PDIR++) */ + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + {} +}; + +/* Precise Distribution (PDist) */ +static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), {} }; @@ -156,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event, kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc) +{ + /* + * For some model specific pebs counters with special capabilities + * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise + * level to the maximum value (currently 3, backwards compatible) + * so that the perf subsystem would assign specific hardware counter + * with that capability for vPMC. + */ + if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) || + (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu))) + return 3; + + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + */ + return 1; +} + static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) @@ -187,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, } if (pebs) { /* - * The non-zero precision level of guest event makes the ordinary - * guest event becomes a guest PEBS event and triggers the host - * PEBS PMI handler to determine whether the PEBS overflow PMI - * comes from the host counters or the guest. - * * For most PEBS hardware events, the difference in the software * precision levels of guest and host PEBS events will not affect * the accuracy of the PEBS profiling result, because the "event IP" * in the PEBS record is calibrated on the guest side. - * - * On Icelake everything is fine. Other hardware (GLC+, TNT+) that - * could possibly care here is unsupported and needs changes. */ - attr.precise_ip = 1; - if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) - attr.precise_ip = 3; + attr.precise_ip = pmc_get_pebs_precise_level(pmc); } event = perf_event_create_kernel_counter(&attr, -1, current, @@ -255,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } -static int cmp_u64(const void *pa, const void *pb) +static int filter_cmp(const void *pa, const void *pb, u64 mask) { - u64 a = *(u64 *)pa; - u64 b = *(u64 *)pb; + u64 a = *(u64 *)pa & mask; + u64 b = *(u64 *)pb & mask; return (a > b) - (a < b); } + +static int filter_sort_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT | + KVM_PMU_MASKED_ENTRY_EXCLUDE)); +} + +/* + * For the event filter, searching is done on the 'includes' list and + * 'excludes' list separately rather than on the 'events' list (which + * has both). As a result the exclude bit can be ignored. + */ +static int filter_event_cmp(const void *pa, const void *pb) +{ + return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT)); +} + +static int find_filter_index(u64 *events, u64 nevents, u64 key) +{ + u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]), + filter_event_cmp); + + if (!fe) + return -1; + + return fe - events; +} + +static bool is_filter_entry_match(u64 filter_event, u64 umask) +{ + u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8); + u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH; + + BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >> + (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) != + ARCH_PERFMON_EVENTSEL_UMASK); + + return (umask & mask) == match; +} + +static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel) +{ + u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT; + u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK; + int i, index; + + index = find_filter_index(events, nevents, event_select); + if (index < 0) + return false; + + /* + * Entries are sorted by the event select. Walk the list in both + * directions to process all entries with the targeted event select. + */ + for (i = index; i < nevents; i++) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + for (i = index - 1; i >= 0; i--) { + if (filter_event_cmp(&events[i], &event_select)) + break; + + if (is_filter_entry_match(events[i], umask)) + return true; + } + + return false; +} + +static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, + u64 eventsel) +{ + if (filter_contains_match(f->includes, f->nr_includes, eventsel) && + !filter_contains_match(f->excludes, f->nr_excludes, eventsel)) + return f->action == KVM_PMU_EVENT_ALLOW; + + return f->action == KVM_PMU_EVENT_DENY; +} + +static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, + int idx) +{ + int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) + return false; + + return true; +} + static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - struct kvm_pmu_event_filter *filter; + struct kvm_x86_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - bool allow_event = true; - __u64 key; - int idx; if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (!filter) - goto out; + return true; - if (pmc_is_gp(pmc)) { - key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; - if (bsearch(&key, filter->events, filter->nevents, - sizeof(__u64), cmp_u64)) - allow_event = filter->action == KVM_PMU_EVENT_ALLOW; - else - allow_event = filter->action == KVM_PMU_EVENT_DENY; - } else { - idx = pmc->idx - INTEL_PMC_IDX_FIXED; - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - allow_event = false; - } + if (pmc_is_gp(pmc)) + return is_gp_event_allowed(filter, pmc->eventsel); -out: - return allow_event; + return is_fixed_event_allowed(filter, pmc->idx); } static void reprogram_counter(struct kvm_pmc *pmc) @@ -593,43 +694,128 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); +static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter) +{ + u64 mask = kvm_pmu_ops.EVENTSEL_EVENT | + KVM_PMU_MASKED_ENTRY_UMASK_MASK | + KVM_PMU_MASKED_ENTRY_UMASK_MATCH | + KVM_PMU_MASKED_ENTRY_EXCLUDE; + int i; + + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & ~mask) + return false; + } + + return true; +} + +static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter) +{ + int i, j; + + for (i = 0, j = 0; i < filter->nevents; i++) { + /* + * Skip events that are impossible to match against a guest + * event. When filtering, only the event select + unit mask + * of the guest event is used. To maintain backwards + * compatibility, impossible filters can't be rejected :-( + */ + if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT | + ARCH_PERFMON_EVENTSEL_UMASK)) + continue; + /* + * Convert userspace events to a common in-kernel event so + * only one code path is needed to support both events. For + * the in-kernel events use masked events because they are + * flexible enough to handle both cases. To convert to masked + * events all that's needed is to add an "all ones" umask_mask, + * (unmasked filter events don't support EXCLUDE). + */ + filter->events[j++] = filter->events[i] | + (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT); + } + + filter->nevents = j; +} + +static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter) +{ + int i; + + if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS)) + convert_to_masked_filter(filter); + else if (!is_masked_filter_valid(filter)) + return -EINVAL; + + /* + * Sort entries by event select and includes vs. excludes so that all + * entries for a given event select can be processed efficiently during + * filtering. The EXCLUDE flag uses a more significant bit than the + * event select, and so the sorted list is also effectively split into + * includes and excludes sub-lists. + */ + sort(&filter->events, filter->nevents, sizeof(filter->events[0]), + filter_sort_cmp, NULL); + + i = filter->nevents; + /* Find the first EXCLUDE event (only supported for masked events). */ + if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { + for (i = 0; i < filter->nevents; i++) { + if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) + break; + } + } + + filter->nr_includes = i; + filter->nr_excludes = filter->nevents - filter->nr_includes; + filter->includes = filter->events; + filter->excludes = filter->events + filter->nr_includes; + + return 0; +} + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { - struct kvm_pmu_event_filter tmp, *filter; + struct kvm_pmu_event_filter __user *user_filter = argp; + struct kvm_x86_pmu_event_filter *filter; + struct kvm_pmu_event_filter tmp; struct kvm_vcpu *vcpu; unsigned long i; size_t size; int r; - if (copy_from_user(&tmp, argp, sizeof(tmp))) + if (copy_from_user(&tmp, user_filter, sizeof(tmp))) return -EFAULT; if (tmp.action != KVM_PMU_EVENT_ALLOW && tmp.action != KVM_PMU_EVENT_DENY) return -EINVAL; - if (tmp.flags != 0) + if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK) return -EINVAL; if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS) return -E2BIG; size = struct_size(filter, events, tmp.nevents); - filter = kmalloc(size, GFP_KERNEL_ACCOUNT); + filter = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!filter) return -ENOMEM; + filter->action = tmp.action; + filter->nevents = tmp.nevents; + filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap; + filter->flags = tmp.flags; + r = -EFAULT; - if (copy_from_user(filter, argp, size)) + if (copy_from_user(filter->events, user_filter->events, + sizeof(filter->events[0]) * filter->nevents)) goto cleanup; - /* Ensure nevents can't be changed between the user copies. */ - *filter = tmp; - - /* - * Sort the in-kernel list so that we can search it with bsearch. - */ - sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + r = prepare_filter_lists(filter); + if (r) + goto cleanup; mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cdb91009701d..79988dafb15b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,12 +18,6 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 -struct kvm_event_hw_type_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; -}; - struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); @@ -40,6 +34,9 @@ struct kvm_pmu_ops { void (*reset)(struct kvm_vcpu *vcpu); void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + + const u64 EVENTSEL_EVENT; + const int MAX_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -161,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -static inline void kvm_init_pmu_capability(void) +static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; @@ -180,6 +177,8 @@ static inline void kvm_init_pmu_capability(void) } kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, + pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 1ff068f23841..cc77a0681800 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -231,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .reset = amd_pmu_reset, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d13cf53e7390..dd21e8b1a259 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3015,8 +3015,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_DEBUGCTLMSR: if (!lbrv) { - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3045,7 +3044,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index efce9ad70e4e..e8a3be0b9df9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -22,16 +22,19 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) -static struct kvm_event_hw_type_mapping intel_arch_events[] = { - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +static struct { + u8 eventsel; + u8 unit_mask; +} const intel_arch_events[] = { + [0] = { 0x3c, 0x00 }, + [1] = { 0xc0, 0x00 }, + [2] = { 0x3c, 0x01 }, + [3] = { 0x2e, 0x4f }, + [4] = { 0x2e, 0x41 }, + [5] = { 0xc4, 0x00 }, + [6] = { 0xc5, 0x00 }, /* The above index must match CPUID 0x0A.EBX bit vector */ - [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, + [7] = { 0x00, 0x03 }, }; /* mapping between fixed pmc index and intel_arch_events array */ @@ -811,4 +814,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, + .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bbf60bda877e..2690d018da11 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2206,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 810cbe3b3ba6..199a9ff0cd4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); * may depend on host virtualization features rather than host cpu features. */ -static const u32 msrs_to_save_all[] = { +static const u32 msrs_to_save_base[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1436,6 +1436,10 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, @@ -1460,11 +1464,10 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { @@ -3559,9 +3562,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +static bool kvm_is_msr_to_save(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + return false; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; @@ -3607,15 +3621,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; @@ -3795,16 +3807,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3832,9 +3841,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -3882,20 +3889,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); + /* * Userspace is allowed to write '0' to MSRs that KVM reports * as to-be-saved, even if an MSRs isn't fully supported. */ - return !msr_info->host_initiated || data; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr_info); + if (msr_info->host_initiated && !data && + kvm_is_msr_to_save(msr)) + break; + return KVM_MSR_RET_INVALID; } return 0; @@ -3985,20 +3990,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; - case MSR_IA32_PEBS_ENABLE: - case MSR_IA32_DS_AREA: - case MSR_PEBS_DATA_CFG: - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (!msr_info->host_initiated) - return 1; - msr_info->data = 0; - break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -4254,6 +4245,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ + if (msr_info->host_initiated && + kvm_is_msr_to_save(msr_info->index)) { + msr_info->data = 0; + break; + } + return KVM_MSR_RET_INVALID; } return 0; @@ -4401,6 +4403,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: @@ -6993,83 +6996,98 @@ out: return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_msr_to_save(u32 msr_index) { u32 dummy[2]; + + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} + +static void kvm_init_msr_list(void) +{ unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) - continue; - break; - case MSR_IA32_XFD: - case MSR_IA32_XFD_ERR: - if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { @@ -9377,7 +9395,7 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); - kvm_init_pmu_capability(); + kvm_init_pmu_capability(ops->pmu_ops); r = ops->hardware_setup(); if (r != 0) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9de72586f406..f3554bf05201 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -331,6 +331,18 @@ extern bool report_ignored_msrs; extern bool eager_page_split; +static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data); +} + +static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr); +} + static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 55155e262646..76156e372f9c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1175,6 +1175,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 +#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index 2de98fce7edd..253e4304bbe3 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -198,14 +198,15 @@ static struct kvm_pmu_event_filter *alloc_pmu_event_filter(uint32_t nevents) static struct kvm_pmu_event_filter * -create_pmu_event_filter(const uint64_t event_list[], - int nevents, uint32_t action) +create_pmu_event_filter(const uint64_t event_list[], int nevents, + uint32_t action, uint32_t flags) { struct kvm_pmu_event_filter *f; int i; f = alloc_pmu_event_filter(nevents); f->action = action; + f->flags = flags; for (i = 0; i < nevents; i++) f->events[i] = event_list[i]; @@ -216,7 +217,7 @@ static struct kvm_pmu_event_filter *event_filter(uint32_t action) { return create_pmu_event_filter(event_list, ARRAY_SIZE(event_list), - action); + action, 0); } /* @@ -263,7 +264,7 @@ static void test_amd_deny_list(struct kvm_vcpu *vcpu) struct kvm_pmu_event_filter *f; uint64_t count; - f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY); + f = create_pmu_event_filter(&event, 1, KVM_PMU_EVENT_DENY, 0); count = test_with_filter(vcpu, f); free(f); @@ -403,13 +404,372 @@ static bool use_amd_pmu(void) is_zen3(family, model)); } +/* + * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and + * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/ + * supported on Intel Xeon processors: + * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake. + */ +#define MEM_INST_RETIRED 0xD0 +#define MEM_INST_RETIRED_LOAD EVENT(MEM_INST_RETIRED, 0x81) +#define MEM_INST_RETIRED_STORE EVENT(MEM_INST_RETIRED, 0x82) +#define MEM_INST_RETIRED_LOAD_STORE EVENT(MEM_INST_RETIRED, 0x83) + +static bool supports_event_mem_inst_retired(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + if (x86_family(eax) == 0x6) { + switch (x86_model(eax)) { + /* Sapphire Rapids */ + case 0x8F: + /* Ice Lake */ + case 0x6A: + /* Skylake */ + /* Cascade Lake */ + case 0x55: + return true; + } + } + + return false; +} + +/* + * "LS Dispatch", from Processor Programming Reference + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, + * Preliminary Processor Programming Reference (PPR) for AMD Family + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision + * B1 Processors Volume 1 of 2. + */ +#define LS_DISPATCH 0x29 +#define LS_DISPATCH_LOAD EVENT(LS_DISPATCH, BIT(0)) +#define LS_DISPATCH_STORE EVENT(LS_DISPATCH, BIT(1)) +#define LS_DISPATCH_LOAD_STORE EVENT(LS_DISPATCH, BIT(2)) + +#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false) +#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \ + KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true) + +struct perf_counter { + union { + uint64_t raw; + struct { + uint64_t loads:22; + uint64_t stores:22; + uint64_t loads_stores:20; + }; + }; +}; + +static uint64_t masked_events_guest_test(uint32_t msr_base) +{ + uint64_t ld0, ld1, st0, st1, ls0, ls1; + struct perf_counter c; + int val; + + /* + * The acutal value of the counters don't determine the outcome of + * the test. Only that they are zero or non-zero. + */ + ld0 = rdmsr(msr_base + 0); + st0 = rdmsr(msr_base + 1); + ls0 = rdmsr(msr_base + 2); + + __asm__ __volatile__("movl $0, %[v];" + "movl %[v], %%eax;" + "incl %[v];" + : [v]"+m"(val) :: "eax"); + + ld1 = rdmsr(msr_base + 0); + st1 = rdmsr(msr_base + 1); + ls1 = rdmsr(msr_base + 2); + + c.loads = ld1 - ld0; + c.stores = st1 - st0; + c.loads_stores = ls1 - ls0; + + return c.raw; +} + +static void intel_masked_events_guest_code(void) +{ + uint64_t r; + + for (;;) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD); + wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE); + wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE); + + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7); + + r = masked_events_guest_test(MSR_IA32_PMC0); + + GUEST_SYNC(r); + } +} + +static void amd_masked_events_guest_code(void) +{ + uint64_t r; + + for (;;) { + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL1, 0); + wrmsr(MSR_K7_EVNTSEL2, 0); + + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD); + wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE); + wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE); + + r = masked_events_guest_test(MSR_K7_PERFCTR0); + + GUEST_SYNC(r); + } +} + +static struct perf_counter run_masked_events_test(struct kvm_vcpu *vcpu, + const uint64_t masked_events[], + const int nmasked_events) +{ + struct kvm_pmu_event_filter *f; + struct perf_counter r; + + f = create_pmu_event_filter(masked_events, nmasked_events, + KVM_PMU_EVENT_ALLOW, + KVM_PMU_EVENT_FLAG_MASKED_EVENTS); + r.raw = test_with_filter(vcpu, f); + free(f); + + return r; +} + +/* Matches KVM_PMU_EVENT_FILTER_MAX_EVENTS in pmu.c */ +#define MAX_FILTER_EVENTS 300 +#define MAX_TEST_EVENTS 10 + +#define ALLOW_LOADS BIT(0) +#define ALLOW_STORES BIT(1) +#define ALLOW_LOADS_STORES BIT(2) + +struct masked_events_test { + uint64_t intel_events[MAX_TEST_EVENTS]; + uint64_t intel_event_end; + uint64_t amd_events[MAX_TEST_EVENTS]; + uint64_t amd_event_end; + const char *msg; + uint32_t flags; +}; + +/* + * These are the test cases for the masked events tests. + * + * For each test, the guest enables 3 PMU counters (loads, stores, + * loads + stores). The filter is then set in KVM with the masked events + * provided. The test then verifies that the counters agree with which + * ones should be counting and which ones should be filtered. + */ +const struct masked_events_test test_cases[] = { + { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow loads.", + .flags = ALLOW_LOADS, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow stores.", + .flags = ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)), + }, + .msg = "Only allow loads + stores.", + .flags = ALLOW_LOADS_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0), + }, + .msg = "Only allow loads and stores.", + .flags = ALLOW_LOADS | ALLOW_STORES, + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)), + }, + .msg = "Only allow loads and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)), + }, + .msg = "Only allow stores and loads + stores.", + .flags = ALLOW_STORES | ALLOW_LOADS_STORES + }, { + .intel_events = { + INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0), + }, + .amd_events = { + INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0), + }, + .msg = "Only allow loads, stores, and loads + stores.", + .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES + }, +}; + +static int append_test_events(const struct masked_events_test *test, + uint64_t *events, int nevents) +{ + const uint64_t *evts; + int i; + + evts = use_intel_pmu() ? test->intel_events : test->amd_events; + for (i = 0; i < MAX_TEST_EVENTS; i++) { + if (evts[i] == 0) + break; + + events[nevents + i] = evts[i]; + } + + return nevents + i; +} + +static bool bool_eq(bool a, bool b) +{ + return a == b; +} + +static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events, + int nevents) +{ + int ntests = ARRAY_SIZE(test_cases); + struct perf_counter c; + int i, n; + + for (i = 0; i < ntests; i++) { + const struct masked_events_test *test = &test_cases[i]; + + /* Do any test case events overflow MAX_TEST_EVENTS? */ + assert(test->intel_event_end == 0); + assert(test->amd_event_end == 0); + + n = append_test_events(test, events, nevents); + + c = run_masked_events_test(vcpu, events, n); + TEST_ASSERT(bool_eq(c.loads, test->flags & ALLOW_LOADS) && + bool_eq(c.stores, test->flags & ALLOW_STORES) && + bool_eq(c.loads_stores, + test->flags & ALLOW_LOADS_STORES), + "%s loads: %u, stores: %u, loads + stores: %u", + test->msg, c.loads, c.stores, c.loads_stores); + } +} + +static void add_dummy_events(uint64_t *events, int nevents) +{ + int i; + + for (i = 0; i < nevents; i++) { + int event_select = i % 0xFF; + bool exclude = ((i % 4) == 0); + + if (event_select == MEM_INST_RETIRED || + event_select == LS_DISPATCH) + event_select++; + + events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0, + 0, exclude); + } +} + +static void test_masked_events(struct kvm_vcpu *vcpu) +{ + int nevents = MAX_FILTER_EVENTS - MAX_TEST_EVENTS; + uint64_t events[MAX_FILTER_EVENTS]; + + /* Run the test cases against a sparse PMU event filter. */ + run_masked_events_tests(vcpu, events, 0); + + /* Run the test cases against a dense PMU event filter. */ + add_dummy_events(events, MAX_FILTER_EVENTS); + run_masked_events_tests(vcpu, events, nevents); +} + +static int run_filter_test(struct kvm_vcpu *vcpu, const uint64_t *events, + int nevents, uint32_t flags) +{ + struct kvm_pmu_event_filter *f; + int r; + + f = create_pmu_event_filter(events, nevents, KVM_PMU_EVENT_ALLOW, flags); + r = __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f); + free(f); + + return r; +} + +static void test_filter_ioctl(struct kvm_vcpu *vcpu) +{ + uint64_t e = ~0ul; + int r; + + /* + * Unfortunately having invalid bits set in event data is expected to + * pass when flags == 0 (bits other than eventsel+umask). + */ + r = run_filter_test(vcpu, &e, 1, 0); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); + + r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS); + TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail"); + + e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf); + r = run_filter_test(vcpu, &e, 1, KVM_PMU_EVENT_FLAG_MASKED_EVENTS); + TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing"); +} + int main(int argc, char *argv[]) { void (*guest_code)(void); - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *vcpu2 = NULL; struct kvm_vm *vm; TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS)); TEST_REQUIRE(use_intel_pmu() || use_amd_pmu()); guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code; @@ -430,6 +790,17 @@ int main(int argc, char *argv[]) test_not_member_deny_list(vcpu); test_not_member_allow_list(vcpu); + if (use_intel_pmu() && + supports_event_mem_inst_retired() && + kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3) + vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code); + else if (use_amd_pmu()) + vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code); + + if (vcpu2) + test_masked_events(vcpu2); + test_filter_ioctl(vcpu); + kvm_vm_free(vm); test_pmu_config_disable(guest_code); |