aboutsummaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/blktrace.c4
-rw-r--r--kernel/trace/bpf_trace.c399
-rw-r--r--kernel/trace/ftrace.c25
-rw-r--r--kernel/trace/trace.c25
-rw-r--r--kernel/trace/trace_kprobe.c70
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_preemptirq.c39
8 files changed, 427 insertions, 150 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0c82ac2c5688..a4020c0b4508 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,11 +10,6 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
-config HAVE_FTRACE_NMI_ENTER
- bool
- help
- See Documentation/trace/ftrace-design.rst
-
config HAVE_FUNCTION_TRACER
bool
help
@@ -72,11 +67,6 @@ config RING_BUFFER
select TRACE_CLOCK
select IRQ_WORK
-config FTRACE_NMI_ENTER
- bool
- depends on HAVE_FTRACE_NMI_ENTER
- default y
-
config EVENT_TRACING
select CONTEXT_SWITCH_TRACER
select GLOB
@@ -158,6 +148,7 @@ config FUNCTION_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
select TASKS_RCU if PREEMPTION
+ select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
by using a compiler feature to insert a small, 5-byte No-Operation
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ca39dc3230cb..ea47f2084087 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -170,10 +170,10 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg,
if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
blkcg = NULL;
#ifdef CONFIG_BLK_CGROUP
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n,
blkcg ? cgroup_id(blkcg->css.cgroup) : 1);
#else
- trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0);
+ trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, 0);
#endif
local_irq_restore(flags);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a010edc37ee0..e729c9e587a0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -136,18 +136,24 @@ static const struct bpf_func_proto bpf_override_return_proto = {
};
#endif
-BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
- int ret = probe_user_read(dst, unsafe_ptr, size);
+ int ret;
+ ret = probe_user_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_proto = {
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -156,18 +162,25 @@ static const struct bpf_func_proto bpf_probe_read_user_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_str_common(void *dst, u32 size,
+ const void __user *unsafe_ptr)
{
- int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
+ int ret;
+ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
-static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
+}
+
+const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -177,28 +190,28 @@ static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
};
static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
- ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
- probe_kernel_read_strict(dst, unsafe_ptr, size);
+ goto fail;
+ ret = probe_kernel_read(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.func = bpf_probe_read_kernel,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -207,53 +220,40 @@ static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
- const void *, unsafe_ptr)
-{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
-}
-
-static const struct bpf_func_proto bpf_probe_read_compat_proto = {
- .func = bpf_probe_read_compat,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_ANYTHING,
-};
-
static __always_inline int
-bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
+ goto fail;
+
/*
- * The strncpy_from_unsafe_*() call will likely not fill the entire
- * buffer, but that's okay in this circumstance as we're probing
+ * The strncpy_from_kernel_nofault() call will likely not fill the
+ * entire buffer, but that's okay in this circumstance as we're probing
* arbitrary memory anyway similar to bpf_probe_read_*() and might
* as well probe the stack. Thus, memory is explicitly cleared
* only in error case, so that improper users ignoring return
* code altogether don't copy garbage; otherwise length of string
* is returned that can be used for bpf_perf_event_output() et al.
*/
- ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
- strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
-static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.func = bpf_probe_read_kernel_str,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -262,10 +262,34 @@ static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+ .func = bpf_probe_read_compat,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_str_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
@@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
+#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
@@ -315,12 +340,40 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
return &bpf_probe_write_user_proto;
}
+static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+ fallthrough;
+#endif
+ case 'k':
+ strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ break;
+ case 'u':
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+}
+
/*
* Only limited trace_printk() conversion specifiers allowed:
* %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
@@ -403,24 +456,8 @@ fmt_str:
break;
}
- buf[0] = 0;
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- strncpy_from_unsafe(buf, unsafe_ptr,
- sizeof(buf));
- break;
-#endif
- case 'k':
- strncpy_from_unsafe_strict(buf, unsafe_ptr,
- sizeof(buf));
- break;
- case 'u':
- strncpy_from_unsafe_user(buf,
- (__force void __user *)unsafe_ptr,
- sizeof(buf));
- break;
- }
+ bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
+ sizeof(buf));
goto fmt_next;
}
@@ -487,6 +524,214 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+#define MAX_SEQ_PRINTF_VARARGS 12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY 6
+#define MAX_SEQ_PRINTF_STR_LEN 128
+
+struct bpf_seq_printf_buf {
+ char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+ const void *, data, u32, data_len)
+{
+ int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+ int i, buf_used, copy_size, num_args;
+ u64 params[MAX_SEQ_PRINTF_VARARGS];
+ struct bpf_seq_printf_buf *bufs;
+ const u64 *args = data;
+
+ buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+ if (WARN_ON_ONCE(buf_used > 1)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+ /*
+ * bpf_check()->check_func_arg()->check_stack_boundary()
+ * guarantees that fmt points to bpf program stack,
+ * fmt_size bytes of it were initialized and fmt_size > 0
+ */
+ if (fmt[--fmt_size] != 0)
+ goto out;
+
+ if (data_len & 7)
+ goto out;
+
+ for (i = 0; i < fmt_size; i++) {
+ if (fmt[i] == '%') {
+ if (fmt[i + 1] == '%')
+ i++;
+ else if (!data || !data_len)
+ goto out;
+ }
+ }
+
+ num_args = data_len / 8;
+
+ /* check format string for allowed specifiers */
+ for (i = 0; i < fmt_size; i++) {
+ /* only printable ascii for now. */
+ if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fmt[i] != '%')
+ continue;
+
+ if (fmt[i + 1] == '%') {
+ i++;
+ continue;
+ }
+
+ if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ if (fmt_cnt >= num_args) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+ i++;
+
+ /* skip optional "[0 +-][num]" width formatting field */
+ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' ||
+ fmt[i] == ' ')
+ i++;
+ if (fmt[i] >= '1' && fmt[i] <= '9') {
+ i++;
+ while (fmt[i] >= '0' && fmt[i] <= '9')
+ i++;
+ }
+
+ if (fmt[i] == 's') {
+ void *unsafe_ptr;
+
+ /* try our best to copy */
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+ unsafe_ptr = (void *)(long)args[fmt_cnt];
+ err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
+ unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
+ if (err < 0)
+ bufs->buf[memcpy_cnt][0] = '\0';
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'p') {
+ if (fmt[i + 1] == 0 ||
+ fmt[i + 1] == 'K' ||
+ fmt[i + 1] == 'x') {
+ /* just kernel pointers */
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ continue;
+ }
+
+ /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */
+ if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') {
+ err = -EINVAL;
+ goto out;
+ }
+ if (fmt[i + 2] != '4' && fmt[i + 2] != '6') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+ err = -E2BIG;
+ goto out;
+ }
+
+
+ copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+ err = probe_kernel_read(bufs->buf[memcpy_cnt],
+ (void *) (long) args[fmt_cnt],
+ copy_size);
+ if (err < 0)
+ memset(bufs->buf[memcpy_cnt], 0, copy_size);
+ params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+ i += 2;
+ fmt_cnt++;
+ memcpy_cnt++;
+ continue;
+ }
+
+ if (fmt[i] == 'l') {
+ i++;
+ if (fmt[i] == 'l')
+ i++;
+ }
+
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x') {
+ err = -EINVAL;
+ goto out;
+ }
+
+ params[fmt_cnt] = args[fmt_cnt];
+ fmt_cnt++;
+ }
+
+ /* At most MAX_SEQ_PRINTF_VARARGS parameters are supported, so just
+ * pass all of them to seq_printf().
+ */
+ seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+ params[4], params[5], params[6], params[7], params[8],
+ params[9], params[10], params[11]);
+
+ err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+ this_cpu_dec(bpf_seq_printf_buf_used);
+ return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+ .func = bpf_seq_printf,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+ return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+ .func = bpf_seq_write,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_seq_write_btf_ids,
+};
+
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
u64 *value, u64 *enabled, u64 *running)
@@ -698,7 +943,7 @@ BPF_CALL_0(bpf_get_current_task)
return (long) current;
}
-static const struct bpf_func_proto bpf_get_current_task_proto = {
+const struct bpf_func_proto bpf_get_current_task_proto = {
.func = bpf_get_current_task,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -827,6 +1072,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_peek_elem_proto;
case BPF_FUNC_ktime_get_ns:
return &bpf_ktime_get_ns_proto;
+ case BPF_FUNC_ktime_get_boot_ns:
+ return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
case BPF_FUNC_get_current_pid_tgid:
@@ -877,6 +1124,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_ringbuf_output:
+ return &bpf_ringbuf_output_proto;
+ case BPF_FUNC_ringbuf_reserve:
+ return &bpf_ringbuf_reserve_proto;
+ case BPF_FUNC_ringbuf_submit:
+ return &bpf_ringbuf_submit_proto;
+ case BPF_FUNC_ringbuf_discard:
+ return &bpf_ringbuf_discard_proto;
+ case BPF_FUNC_ringbuf_query:
+ return &bpf_ringbuf_query_proto;
default:
return NULL;
}
@@ -1246,7 +1503,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
-static const struct bpf_func_proto *
+const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
@@ -1256,6 +1513,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_xdp_output:
return &bpf_xdp_output_proto;
#endif
+ case BPF_FUNC_seq_printf:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_printf_proto :
+ NULL;
+ case BPF_FUNC_seq_write:
+ return prog->expected_attach_type == BPF_TRACE_ITER ?
+ &bpf_seq_write_proto :
+ NULL;
default:
return raw_tp_prog_func_proto(func_id, prog);
}
@@ -1500,7 +1765,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
u32 *ids, prog_cnt, ids_len;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!perfmon_capable())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cd39cbf3631a..c163c3531faf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -160,17 +160,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
op->saved_func(ip, parent_ip, op, regs);
}
-static void ftrace_sync(struct work_struct *work)
-{
- /*
- * This function is just a stub to implement a hard force
- * of synchronize_rcu(). This requires synchronizing
- * tasks even in userspace and idle.
- *
- * Yes, function tracing is rude.
- */
-}
-
static void ftrace_sync_ipi(void *data)
{
/* Probably not needed, but do it anyway */
@@ -256,7 +245,7 @@ static void update_ftrace_function(void)
* Make sure all CPUs see this. Yes this is slow, but static
* tracing is slow and nasty to have enabled.
*/
- schedule_on_each_cpu(ftrace_sync);
+ synchronize_rcu_tasks_rude();
/* Now all cpus are using the list ops. */
function_trace_op = set_function_trace_op;
/* Make sure the function_trace_op is visible on all CPUs */
@@ -2032,11 +2021,11 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
switch (failed) {
case -EFAULT:
pr_info("ftrace faulted on modifying ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
case -EINVAL:
pr_info("ftrace failed to modify ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
if (ftrace_expected) {
@@ -2046,11 +2035,11 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
break;
case -EPERM:
pr_info("ftrace faulted on writing ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
default:
pr_info("ftrace faulted on unknown error ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
}
print_bug_type();
if (rec) {
@@ -2932,7 +2921,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- schedule_on_each_cpu(ftrace_sync);
+ synchronize_rcu_tasks_rude();
/*
* When the kernel is preemptive, tasks can be preempted
@@ -5888,7 +5877,7 @@ ftrace_graph_release(struct inode *inode, struct file *file)
* infrastructure to do the synchronization, thus we must do it
* ourselves.
*/
- schedule_on_each_cpu(ftrace_sync);
+ synchronize_rcu_tasks_rude();
free_ftrace_hash(old_hash);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 760fd102dbe2..ec44b0e2a19c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2665,7 +2665,7 @@ static void output_printk(struct trace_event_buffer *fbuffer)
}
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int save_tracepoint_printk;
@@ -6308,13 +6308,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
__free_page(spd->pages[idx]);
}
-static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
- .release = generic_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
@@ -6376,7 +6369,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.nr_pages_max = PIPE_DEF_BUFFERS,
- .ops = &tracing_pipe_buf_ops,
+ .ops = &default_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
ssize_t ret;
@@ -7585,9 +7578,7 @@ static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -8530,18 +8521,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot = false;
#endif
- /*
- * Because of some magic with the way alloc_percpu() works on
- * x86_64, we need to synchronize the pgd of all the tables,
- * otherwise the trace events that happen in x86_64 page fault
- * handlers can't cope with accessing the chance that a
- * alloc_percpu()'d memory might be touched in the page fault trace
- * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
- * calls in tracing, because something might get triggered within a
- * page fault trace event!
- */
- vmalloc_sync_mappings();
-
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35989383ae11..ea8d0b094f1b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1202,11 +1202,25 @@ static const struct file_operations kprobe_profile_ops = {
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
do {
ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
len++;
@@ -1215,22 +1229,14 @@ fetch_store_strlen(unsigned long addr)
return (ret < 0) ? ret : len;
}
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE);
-}
-
/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
+ const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
@@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
__dest = get_loc_data(dest, base);
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen);
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1252,23 +1254,31 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
}
/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
- ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen);
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1276,12 +1286,6 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
}
static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
-{
- return probe_kernel_read(dest, src, size);
-}
-
-static nokprobe_inline int
probe_mem_read_user(void *dest, void *src, size_t size)
{
const void __user *uaddr = (__force const void __user *)src;
@@ -1289,6 +1293,16 @@ probe_mem_read_user(void *dest, void *src, size_t size)
return probe_user_read(dest, uaddr, size);
}
+static nokprobe_inline int
+probe_mem_read(void *dest, void *src, size_t size)
+{
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return probe_kernel_read(dest, src, size);
+}
+
/* Note that we don't verify it, since the code does not come from user space */
static int
process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9a121e147102..73976de7f8cc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (mm) {
const struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, ip);
if (vma) {
file = vma->vm_file;
@@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 4d8e99fdbbbe..fb0691b8a88d 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -19,6 +19,24 @@
/* Per-cpu variable to prevent redundant calls when IRQs already off */
static DEFINE_PER_CPU(int, tracing_irq_cpu);
+/*
+ * Like trace_hardirqs_on() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_on_prepare(void)
+{
+ if (this_cpu_read(tracing_irq_cpu)) {
+ if (!in_nmi())
+ trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
+ this_cpu_write(tracing_irq_cpu, 0);
+ }
+}
+EXPORT_SYMBOL(trace_hardirqs_on_prepare);
+NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);
+
void trace_hardirqs_on(void)
{
if (this_cpu_read(tracing_irq_cpu)) {
@@ -28,11 +46,31 @@ void trace_hardirqs_on(void)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);
+/*
+ * Like trace_hardirqs_off() but without the lockdep invocation. This is
+ * used in the low level entry code where the ordering vs. RCU is important
+ * and lockdep uses a staged approach which splits the lockdep hardirq
+ * tracking into a RCU on and a RCU off section.
+ */
+void trace_hardirqs_off_prepare(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu)) {
+ this_cpu_write(tracing_irq_cpu, 1);
+ tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
+ if (!in_nmi())
+ trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1);
+ }
+
+}
+EXPORT_SYMBOL(trace_hardirqs_off_prepare);
+NOKPROBE_SYMBOL(trace_hardirqs_off_prepare);
+
void trace_hardirqs_off(void)
{
if (!this_cpu_read(tracing_irq_cpu)) {
@@ -56,6 +94,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
this_cpu_write(tracing_irq_cpu, 0);
}
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);