From 1cc33161a83d20b5462b1e93f95d3ce6388079ee Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 20 Aug 2018 10:12:47 +0530 Subject: uprobes: Support SDT markers having reference count (semaphore) Userspace Statically Defined Tracepoints[1] are dtrace style markers inside userspace applications. Applications like PostgreSQL, MySQL, Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc have these markers embedded in them. These markers are added by developer at important places in the code. Each marker source expands to a single nop instruction in the compiled code but there may be additional overhead for computing the marker arguments which expands to couple of instructions. In case the overhead is more, execution of it can be omitted by runtime if() condition when no one is tracing on the marker: if (reference_counter > 0) { Execute marker instructions; } Default value of reference counter is 0. Tracer has to increment the reference counter before tracing on a marker and decrement it when done with the tracing. Implement the reference counter logic in core uprobe. User will be able to use it from trace_uprobe as well as from kernel module. New trace_uprobe definition with reference counter will now be: :[(ref_ctr_offset)] where ref_ctr_offset is an optional field. For kernel module, new variant of uprobe_register() has been introduced: uprobe_register_refctr(inode, offset, ref_ctr_offset, consumer) No new variant for uprobe_unregister() because it's assumed to have only one reference counter for one uprobe. [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation Note: 'reference counter' is called as 'semaphore' in original Dtrace (or Systemtap, bcc and even in ELF) documentation and code. But the term 'semaphore' is misleading in this context. This is just a counter used to hold number of tracers tracing on a marker. This is not really used for any synchronization. So we are calling it a 'reference counter' in kernel / perf code. Link: http://lkml.kernel.org/r/20180820044250.11659-2-ravi.bangoria@linux.ibm.com Reviewed-by: Masami Hiramatsu [Only trace_uprobe.c] Reviewed-by: Oleg Nesterov Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/events/uprobes.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 252 insertions(+), 7 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3207a4d26849..934feb39f6be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,7 @@ struct uprobe { struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ loff_t offset; + loff_t ref_ctr_offset; unsigned long flags; /* @@ -88,6 +89,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct delayed_uprobe { + struct list_head list; + struct uprobe *uprobe; + struct mm_struct *mm; +}; + +static DEFINE_MUTEX(delayed_uprobe_lock); +static LIST_HEAD(delayed_uprobe_list); + /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -282,6 +292,166 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t return 1; } +static struct delayed_uprobe * +delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + list_for_each_entry(du, &delayed_uprobe_list, list) + if (du->uprobe == uprobe && du->mm == mm) + return du; + return NULL; +} + +static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct delayed_uprobe *du; + + if (delayed_uprobe_check(uprobe, mm)) + return 0; + + du = kzalloc(sizeof(*du), GFP_KERNEL); + if (!du) + return -ENOMEM; + + du->uprobe = uprobe; + du->mm = mm; + list_add(&du->list, &delayed_uprobe_list); + return 0; +} + +static void delayed_uprobe_delete(struct delayed_uprobe *du) +{ + if (WARN_ON(!du)) + return; + list_del(&du->list); + kfree(du); +} + +static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + + if (!uprobe && !mm) + return; + + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (uprobe && du->uprobe != uprobe) + continue; + if (mm && du->mm != mm) + continue; + + delayed_uprobe_delete(du); + } +} + +static bool valid_ref_ctr_vma(struct uprobe *uprobe, + struct vm_area_struct *vma) +{ + unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); + + return uprobe->ref_ctr_offset && + vma->vm_file && + file_inode(vma->vm_file) == uprobe->inode && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + vma->vm_start <= vaddr && + vma->vm_end > vaddr; +} + +static struct vm_area_struct * +find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *tmp; + + for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + if (valid_ref_ctr_vma(uprobe, tmp)) + return tmp; + + return NULL; +} + +static int +__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) +{ + void *kaddr; + struct page *page; + struct vm_area_struct *vma; + int ret; + short *ptr; + + if (!vaddr || !d) + return -EINVAL; + + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + FOLL_WRITE, &page, &vma, NULL); + if (unlikely(ret <= 0)) { + /* + * We are asking for 1 page. If get_user_pages_remote() fails, + * it may return 0, in that case we have to return error. + */ + return ret == 0 ? -EBUSY : ret; + } + + kaddr = kmap_atomic(page); + ptr = kaddr + (vaddr & ~PAGE_MASK); + + if (unlikely(*ptr + d < 0)) { + pr_warn("ref_ctr going negative. vaddr: 0x%lx, " + "curr val: %d, delta: %d\n", vaddr, *ptr, d); + ret = -EINVAL; + goto out; + } + + *ptr += d; + ret = 0; +out: + kunmap_atomic(kaddr); + put_page(page); + return ret; +} + +static void update_ref_ctr_warn(struct uprobe *uprobe, + struct mm_struct *mm, short d) +{ + pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, + (unsigned long long) uprobe->offset, + (unsigned long long) uprobe->ref_ctr_offset, mm); +} + +static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, + short d) +{ + struct vm_area_struct *rc_vma; + unsigned long rc_vaddr; + int ret = 0; + + rc_vma = find_ref_ctr_vma(uprobe, mm); + + if (rc_vma) { + rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); + ret = __update_ref_ctr(mm, rc_vaddr, d); + if (ret) + update_ref_ctr_warn(uprobe, mm, d); + + if (d > 0) + return ret; + } + + mutex_lock(&delayed_uprobe_lock); + if (d > 0) + ret = delayed_uprobe_add(uprobe, mm); + else + delayed_uprobe_remove(uprobe, mm); + mutex_unlock(&delayed_uprobe_lock); + + return ret; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -302,9 +472,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { + struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; - int ret; + int ret, is_register, ref_ctr_updated = 0; + + is_register = is_swbp_insn(&opcode); + uprobe = container_of(auprobe, struct uprobe, arch); retry: /* Read the page with vaddr into memory */ @@ -317,6 +491,15 @@ retry: if (ret <= 0) goto put_old; + /* We are going to replace instruction, update ref_ctr. */ + if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); + if (ret) + goto put_old; + + ref_ctr_updated = 1; + } + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -337,6 +520,11 @@ put_old: if (unlikely(ret == -EAGAIN)) goto retry; + + /* Revert back reference counter if instruction update failed. */ + if (ret && is_register && ref_ctr_updated) + update_ref_ctr(uprobe, mm, -1); + return ret; } @@ -378,8 +566,15 @@ static struct uprobe *get_uprobe(struct uprobe *uprobe) static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) + if (atomic_dec_and_test(&uprobe->ref)) { + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + delayed_uprobe_remove(uprobe, NULL); kfree(uprobe); + } } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -484,7 +679,8 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; @@ -494,6 +690,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = inode; uprobe->offset = offset; + uprobe->ref_ctr_offset = ref_ctr_offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); @@ -895,7 +1092,7 @@ EXPORT_SYMBOL_GPL(uprobe_unregister); * else return 0 (success) */ static int __uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) + loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -912,7 +1109,7 @@ static int __uprobe_register(struct inode *inode, loff_t offset, return -EINVAL; retry: - uprobe = alloc_uprobe(inode, offset); + uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; /* @@ -938,10 +1135,17 @@ static int __uprobe_register(struct inode *inode, loff_t offset, int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { - return __uprobe_register(inode, offset, uc); + return __uprobe_register(inode, offset, 0, uc); } EXPORT_SYMBOL_GPL(uprobe_register); +int uprobe_register_refctr(struct inode *inode, loff_t offset, + loff_t ref_ctr_offset, struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, ref_ctr_offset, uc); +} +EXPORT_SYMBOL_GPL(uprobe_register_refctr); + /* * uprobe_apply - unregister an already registered probe. * @inode: the file in which the probe has to be removed. @@ -1060,6 +1264,35 @@ static void build_probe_list(struct inode *inode, spin_unlock(&uprobes_treelock); } +/* @vma contains reference counter, not the probed instruction. */ +static int delayed_ref_ctr_inc(struct vm_area_struct *vma) +{ + struct list_head *pos, *q; + struct delayed_uprobe *du; + unsigned long vaddr; + int ret = 0, err = 0; + + mutex_lock(&delayed_uprobe_lock); + list_for_each_safe(pos, q, &delayed_uprobe_list) { + du = list_entry(pos, struct delayed_uprobe, list); + + if (du->mm != vma->vm_mm || + !valid_ref_ctr_vma(du->uprobe, vma)) + continue; + + vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); + ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); + if (ret) { + update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); + if (!err) + err = ret; + } + delayed_uprobe_delete(du); + } + mutex_unlock(&delayed_uprobe_lock); + return err; +} + /* * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * @@ -1072,7 +1305,15 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (no_uprobe_events() || !valid_vma(vma, true)) + if (no_uprobe_events()) + return 0; + + if (vma->vm_file && + (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && + test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + delayed_ref_ctr_inc(vma); + + if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); @@ -1246,6 +1487,10 @@ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(NULL, mm); + mutex_unlock(&delayed_uprobe_lock); + if (!area) return; -- cgit v1.2.3 From 22bad38286d9a652d7061a02f9743bb2ebb84e59 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 20 Aug 2018 10:12:48 +0530 Subject: uprobes/sdt: Prevent multiple reference counter for same uprobe We assume to have only one reference counter for one uprobe. Don't allow user to register multiple uprobes having same inode+offset but different reference counter. Link: http://lkml.kernel.org/r/20180820044250.11659-3-ravi.bangoria@linux.ibm.com Acked-by: Srikar Dronamraju Reviewed-by: Oleg Nesterov Reviewed-by: Song Liu Tested-by: Song Liu Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/events/uprobes.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/events') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 934feb39f6be..96fb51f3994f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -679,6 +679,16 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } +static void +ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) +{ + pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", + uprobe->inode->i_ino, (unsigned long long) uprobe->offset, + (unsigned long long) cur_uprobe->ref_ctr_offset, + (unsigned long long) uprobe->ref_ctr_offset); +} + static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { @@ -698,6 +708,12 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { + if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { + ref_ctr_mismatch_warn(cur_uprobe, uprobe); + put_uprobe(cur_uprobe); + kfree(uprobe); + return ERR_PTR(-EINVAL); + } kfree(uprobe); uprobe = cur_uprobe; } @@ -1112,6 +1128,9 @@ static int __uprobe_register(struct inode *inode, loff_t offset, uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (!uprobe) return -ENOMEM; + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); + /* * We can race with uprobe_unregister()->delete_uprobe(). * Check uprobe_is_active() and retry if it is false. -- cgit v1.2.3 From a6ca88b241d5e929e6e60b12ad8cd288f0ffa256 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 1 Oct 2018 22:36:36 -0700 Subject: trace_uprobe: support reference counter in fd-based uprobe This patch enables uprobes with reference counter in fd-based uprobe. Highest 32 bits of perf_event_attr.config is used to stored offset of the reference count (semaphore). Format information in /sys/bus/event_source/devices/uprobe/format/ is updated to reflect this new feature. Link: http://lkml.kernel.org/r/20181002053636.1896903-1-songliubraving@fb.com Cc: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-and-tested-by: Ravi Bangoria Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/events/core.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) (limited to 'kernel/events') diff --git a/kernel/events/core.c b/kernel/events/core.c index c80549bf82c6..65b30773af3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8368,30 +8368,39 @@ static struct pmu perf_tracepoint = { * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe + * + * The following values specify a reference counter (or semaphore in the + * terminology of tools like dtrace, systemtap, etc.) Userspace Statically + * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. + * + * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset + * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left */ enum perf_probe_config { PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */ + PERF_UPROBE_REF_CTR_OFFSET_BITS = 32, + PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS, }; PMU_FORMAT_ATTR(retprobe, "config:0"); +#endif -static struct attribute *probe_attrs[] = { +#ifdef CONFIG_KPROBE_EVENTS +static struct attribute *kprobe_attrs[] = { &format_attr_retprobe.attr, NULL, }; -static struct attribute_group probe_format_group = { +static struct attribute_group kprobe_format_group = { .name = "format", - .attrs = probe_attrs, + .attrs = kprobe_attrs, }; -static const struct attribute_group *probe_attr_groups[] = { - &probe_format_group, +static const struct attribute_group *kprobe_attr_groups[] = { + &kprobe_format_group, NULL, }; -#endif -#ifdef CONFIG_KPROBE_EVENTS static int perf_kprobe_event_init(struct perf_event *event); static struct pmu perf_kprobe = { .task_ctx_nr = perf_sw_context, @@ -8401,7 +8410,7 @@ static struct pmu perf_kprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = kprobe_attr_groups, }; static int perf_kprobe_event_init(struct perf_event *event) @@ -8433,6 +8442,24 @@ static int perf_kprobe_event_init(struct perf_event *event) #endif /* CONFIG_KPROBE_EVENTS */ #ifdef CONFIG_UPROBE_EVENTS +PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63"); + +static struct attribute *uprobe_attrs[] = { + &format_attr_retprobe.attr, + &format_attr_ref_ctr_offset.attr, + NULL, +}; + +static struct attribute_group uprobe_format_group = { + .name = "format", + .attrs = uprobe_attrs, +}; + +static const struct attribute_group *uprobe_attr_groups[] = { + &uprobe_format_group, + NULL, +}; + static int perf_uprobe_event_init(struct perf_event *event); static struct pmu perf_uprobe = { .task_ctx_nr = perf_sw_context, @@ -8442,12 +8469,13 @@ static struct pmu perf_uprobe = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - .attr_groups = probe_attr_groups, + .attr_groups = uprobe_attr_groups, }; static int perf_uprobe_event_init(struct perf_event *event) { int err; + unsigned long ref_ctr_offset; bool is_retprobe; if (event->attr.type != perf_uprobe.type) @@ -8463,7 +8491,8 @@ static int perf_uprobe_event_init(struct perf_event *event) return -EOPNOTSUPP; is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE; - err = perf_uprobe_init(event, is_retprobe); + ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe); if (err) return err; -- cgit v1.2.3