From daaf24c634ab951cad3dcef28492001ef9c931d0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 11 Jan 2018 17:39:09 +0100 Subject: [PATCH 01/33] bpf: simplify xdp_convert_ctx_access for xdp_rxq_info As pointed out by Daniel Borkmann, using bpf_target_off() is not necessary for xdp_rxq_info when extracting queue_index and ifindex, as these members are u32 like BPF_W. Also fix trivial spelling mistake introduced in same commit. Fixes: 02dd3291b2f0 ("bpf: finally expose xdp_rxq_info to XDP bpf-programs") Reported-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 405317f9c064..395d261948de 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -899,7 +899,7 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; - /* Below access go though struct xdp_rxq_info */ + /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ }; diff --git a/net/core/filter.c b/net/core/filter.c index d4b190e63b79..db2ee8c7e1bd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4310,16 +4310,15 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct net_device, - ifindex, 4, target_size)); + offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - bpf_target_off(struct xdp_rxq_info, - queue_index, 4, target_size)); + offsetof(struct xdp_rxq_info, + queue_index)); break; } From b4da3340eae2c3932144be3e81ccfd4e424d87b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:04 +0900 Subject: [PATCH 02/33] tracing/kprobe: bpf: Check error injectable event is on function entry Check whether error injectable event is on function entry or not. Currently it checks the event is ftrace-based kprobes or not, but that is wrong. It should check if the event is on the entry of target function. Since error injection will override a function to just return with modified return value, that operation must be done before the target function starts making stackframe. As a side effect, bpf error injection is no need to depend on function-tracer. It can work with sw-breakpoint based kprobe events too. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/kprobes.h | 4 +--- arch/x86/kernel/kprobes/core.c | 14 ++++++++++++++ arch/x86/kernel/kprobes/ftrace.c | 14 -------------- kernel/trace/Kconfig | 2 -- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_kprobe.c | 9 ++++++--- kernel/trace/trace_probe.h | 12 ++++++------ 7 files changed, 31 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 36abb23a7a35..367d99cff426 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,9 +67,7 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); -#ifdef CONFIG_KPROBES_ON_FTRACE -extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); -#endif +extern void arch_kprobe_override_function(struct pt_regs *regs); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index bd36f3c33cd0..b02a377d5905 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1183,3 +1183,17 @@ int arch_trampoline_kprobe(struct kprobe *p) { return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_kprobe_override_function); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 1ea748d682fd..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ae3a2d519e50..6400e1bf97c5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,9 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f6d2327ecb59..1966ad3bf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -85,7 +85,7 @@ BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); + arch_kprobe_override_function(regs); return 0; } @@ -800,11 +800,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event, int ret = -EEXIST; /* - * Kprobe override only works for ftrace based kprobes, and only if they - * are on the opt-in list. + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. */ if (prog->kprobe_override && - (!trace_kprobe_ftrace(event->tp_event) || + (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 91f4b57dab82..3c8deb977a8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -88,13 +88,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) +bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); + + return kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 0 : tk->rp.kp.offset); } -int trace_kprobe_error_injectable(struct trace_event_call *call) +bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; unsigned long addr; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5e54d748c84c..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,8 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); -int trace_kprobe_error_injectable(struct trace_event_call *call); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset) return NULL; } -static inline int trace_kprobe_ftrace(struct trace_event_call *call) +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - return 0; + return false; } -static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) { - return 0; + return false; } #endif /* CONFIG_KPROBE_EVENTS */ From 66665ad2f1023d3ffb0c12eea9e0a6d0b613ecb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:33 +0900 Subject: [PATCH 03/33] tracing/kprobe: bpf: Compare instruction pointer with original one Compare instruction pointer with original one on the stack instead using per-cpu bpf_kprobe_override flag. This patch also consolidates reset_current_kprobe() and preempt_enable_no_resched() blocks. Those can be done in one place. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 1 - kernel/trace/trace_kprobe.c | 21 +++++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1966ad3bf3e0..24ed6363e00f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { - __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); arch_kprobe_override_function(regs); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3c8deb977a8b..b8c90441bc87 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,8 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -1205,6 +1203,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int rctx; if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); int ret; ret = trace_call_bpf(call, regs); @@ -1212,12 +1211,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) /* * We need to check and see if we modified the pc of the * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. + * don't do the single stepping. + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); + if (orig_ip != instruction_pointer(regs)) { reset_current_kprobe(); + preempt_enable_no_resched(); return 1; } if (!ret) @@ -1325,15 +1325,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) { + if (tk->tp.flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); - /* - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. - */ - if (ret) - preempt_enable_no_resched(); - } #endif return ret; } From 540adea3809f61115d2a1ea4ed6e627613452ba1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:55:03 +0900 Subject: [PATCH 04/33] error-injection: Separate error-injection from kprobe Since error-injection framework is not limited to be used by kprobes, nor bpf. Other kernel subsystems can use it freely for checking safeness of error-injection, e.g. livepatch, ftrace etc. So this separate error-injection framework from kprobes. Some differences has been made: - "kprobe" word is removed from any APIs/structures. - BPF_ALLOW_ERROR_INJECTION() is renamed to ALLOW_ERROR_INJECTION() since it is not limited for BPF too. - CONFIG_FUNCTION_ERROR_INJECTION is the config item of this feature. It is automatically enabled if the arch supports error injection feature for kprobe or ftrace etc. Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/error-injection.h | 13 ++ arch/x86/kernel/kprobes/core.c | 14 -- arch/x86/lib/Makefile | 1 + arch/x86/lib/error-inject.c | 19 +++ fs/btrfs/disk-io.c | 4 +- fs/btrfs/free-space-cache.c | 4 +- include/asm-generic/error-injection.h | 20 +++ include/asm-generic/vmlinux.lds.h | 14 +- include/linux/bpf.h | 11 -- include/linux/error-injection.h | 21 +++ include/linux/kprobes.h | 1 - include/linux/module.h | 6 +- kernel/kprobes.c | 163 ------------------- kernel/module.c | 8 +- kernel/trace/Kconfig | 2 +- kernel/trace/bpf_trace.c | 4 +- kernel/trace/trace_kprobe.c | 3 +- lib/Kconfig.debug | 4 + lib/Makefile | 1 + lib/error-inject.c | 213 +++++++++++++++++++++++++ 22 files changed, 317 insertions(+), 213 deletions(-) create mode 100644 arch/x86/include/asm/error-injection.h create mode 100644 arch/x86/lib/error-inject.c create mode 100644 include/asm-generic/error-injection.h create mode 100644 include/linux/error-injection.h create mode 100644 lib/error-inject.c diff --git a/arch/Kconfig b/arch/Kconfig index d3f4aaf9cb7a..97376accfb14 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,7 +196,7 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE +config HAVE_FUNCTION_ERROR_INJECTION bool config HAVE_NMI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 45dc6233f2b9..366b19cb79b7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,7 +154,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h new file mode 100644 index 000000000000..47b7a1296245 --- /dev/null +++ b/arch/x86/include/asm/error-injection.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include +#include +#include +#include + +asmlinkage void just_return_func(void); +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b02a377d5905..bd36f3c33cd0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1183,17 +1183,3 @@ int arch_trampoline_kprobe(struct kprobe *p) { return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_kprobe_override_function); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 7b181b61170e..171377b83be1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c new file mode 100644 index 000000000000..7b881d03d0dd --- /dev/null +++ b/arch/x86/lib/error-inject.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +asmlinkage void just_return_func(void); + +asm( + ".type just_return_func, @function\n" + "just_return_func:\n" + " ret\n" + ".size just_return_func, .-just_return_func\n" +); + +void override_function_with_return(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&just_return_func; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5da18ebc9222..9798e21ebe9d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ctree.h" #include "disk-io.h" @@ -3124,7 +3124,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } -BPF_ALLOW_ERROR_INJECTION(open_ctree); +ALLOW_ERROR_INJECTION(open_ctree); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fb1382893bfc..ef847699031a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -333,7 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return 0; } -BPF_ALLOW_ERROR_INJECTION(io_ctl_init); +ALLOW_ERROR_INJECTION(io_ctl_init); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h new file mode 100644 index 000000000000..08352c9d9f97 --- /dev/null +++ b/include/asm-generic/error-injection.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_ERROR_INJECTION_H +#define _ASM_GENERIC_ERROR_INJECTION_H + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +/* + * Whitelist ganerating macro. Specify functions which can be + * error-injectable using this macro. + */ +#define ALLOW_ERROR_INJECTION(fname) \ +static unsigned long __used \ + __attribute__((__section__("_error_injection_whitelist"))) \ + _eil_addr_##fname = (unsigned long)fname; +#else +#define ALLOW_ERROR_INJECTION(fname) +#endif +#endif + +#endif /* _ASM_GENERIC_ERROR_INJECTION_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a2e8582d094a..f2068cca5206 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -136,13 +136,13 @@ #define KPROBE_BLACKLIST() #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define ERROR_INJECT_LIST() . = ALIGN(8); \ - VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .; \ - KEEP(*(_kprobe_error_inject_list)) \ - VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +#define ERROR_INJECT_WHITELIST() . = ALIGN(8); \ + VMLINUX_SYMBOL(__start_error_injection_whitelist) = .;\ + KEEP(*(_error_injection_whitelist)) \ + VMLINUX_SYMBOL(__stop_error_injection_whitelist) = .; #else -#define ERROR_INJECT_LIST() +#define ERROR_INJECT_WHITELIST() #endif #ifdef CONFIG_EVENT_TRACING @@ -573,7 +573,7 @@ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ - ERROR_INJECT_LIST() \ + ERROR_INJECT_WHITELIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 44f26f6df8fc..3496977203a3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -613,15 +613,4 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define BPF_ALLOW_ERROR_INJECTION(fname) \ -static unsigned long __used \ - __attribute__((__section__("_kprobe_error_inject_list"))) \ - _eil_addr_##fname = (unsigned long)fname; -#else -#define BPF_ALLOW_ERROR_INJECTION(fname) -#endif -#endif - #endif /* _LINUX_BPF_H */ diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h new file mode 100644 index 000000000000..130a67c50dac --- /dev/null +++ b/include/linux/error-injection.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ERROR_INJECTION_H +#define _LINUX_ERROR_INJECTION_H + +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + +#include + +extern bool within_error_injection_list(unsigned long addr); + +#else /* !CONFIG_FUNCTION_ERROR_INJECTION */ + +#include +static inline bool within_error_injection_list(unsigned long addr) +{ + return false; +} + +#endif + +#endif /* _LINUX_ERROR_INJECTION_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 963fd364f3d6..9440a2fc8893 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -271,7 +271,6 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); -extern bool within_kprobe_error_injection_list(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/include/linux/module.h b/include/linux/module.h index 548fa09fa806..792e51d83bda 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -476,9 +476,9 @@ struct module { unsigned int num_ctors; #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - unsigned int num_kprobe_ei_funcs; - unsigned long *kprobe_ei_funcs; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + unsigned int num_ei_funcs; + unsigned long *ei_funcs; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b4aab48ad258..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* List of symbols that can be overriden for error injection. */ -static LIST_HEAD(kprobe_error_injection_list); -static DEFINE_MUTEX(kprobe_ei_mutex); -struct kprobe_ei_entry { - struct list_head list; - unsigned long start_addr; - unsigned long end_addr; - void *priv; -}; - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } -bool within_kprobe_error_injection_list(unsigned long addr) -{ - struct kprobe_ei_entry *ent; - - list_for_each_entry(ent, &kprobe_error_injection_list, list) { - if (addr >= ent->start_addr && addr < ent->end_addr) - return true; - } - return false; -} - /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -/* Markers of the _kprobe_error_inject_list section */ -extern unsigned long __start_kprobe_error_inject_list[]; -extern unsigned long __stop_kprobe_error_inject_list[]; - -/* - * Lookup and populate the kprobe_error_injection_list. - * - * For safety reasons we only allow certain functions to be overriden with - * bpf_error_injection, so we need to populate the list of the symbols that have - * been marked as safe for overriding. - */ -static void populate_kprobe_error_injection_list(unsigned long *start, - unsigned long *end, - void *priv) -{ - unsigned long *iter; - struct kprobe_ei_entry *ent; - unsigned long entry, offset = 0, size = 0; - - mutex_lock(&kprobe_ei_mutex); - for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find error inject entry at %p\n", - (void *)entry); - continue; - } - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - break; - ent->start_addr = entry; - ent->end_addr = entry + size; - ent->priv = priv; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_error_injection_list); - } - mutex_unlock(&kprobe_ei_mutex); -} - -static void __init populate_kernel_kprobe_ei_list(void) -{ - populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, - __stop_kprobe_error_inject_list, - NULL); -} - -static void module_load_kprobe_ei_list(struct module *mod) -{ - if (!mod->num_kprobe_ei_funcs) - return; - populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, - mod->kprobe_ei_funcs + - mod->num_kprobe_ei_funcs, mod); -} - -static void module_unload_kprobe_ei_list(struct module *mod) -{ - struct kprobe_ei_entry *ent, *n; - if (!mod->num_kprobe_ei_funcs) - return; - - mutex_lock(&kprobe_ei_mutex); - list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { - if (ent->priv == mod) { - list_del_init(&ent->list); - kfree(ent); - } - } - mutex_unlock(&kprobe_ei_mutex); -} -#else -static inline void __init populate_kernel_kprobe_ei_list(void) {} -static inline void module_load_kprobe_ei_list(struct module *m) {} -static inline void module_unload_kprobe_ei_list(struct module *m) {} -#endif - /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) - module_load_kprobe_ei_list(mod); - else if (val == MODULE_STATE_GOING) - module_unload_kprobe_ei_list(mod); - if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2346,8 +2240,6 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } - populate_kernel_kprobe_ei_list(); - if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -/* - * kprobes/error_injection_list -- shows which functions can be overriden for - * error injection. - * */ -static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&kprobe_ei_mutex); - return seq_list_start(&kprobe_error_injection_list, *pos); -} - -static void kprobe_ei_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&kprobe_ei_mutex); -} - -static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &kprobe_error_injection_list, pos); -} - -static int kprobe_ei_seq_show(struct seq_file *m, void *v) -{ - char buffer[KSYM_SYMBOL_LEN]; - struct kprobe_ei_entry *ent = - list_entry(v, struct kprobe_ei_entry, list); - - sprint_symbol(buffer, ent->start_addr); - seq_printf(m, "%s\n", buffer); - return 0; -} - -static const struct seq_operations kprobe_ei_seq_ops = { - .start = kprobe_ei_seq_start, - .next = kprobe_ei_seq_next, - .stop = kprobe_ei_seq_stop, - .show = kprobe_ei_seq_show, -}; - -static int kprobe_ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_ei_seq_ops); -} - -static const struct file_operations debugfs_kprobe_ei_ops = { - .open = kprobe_ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("error_injection_list", 0444, dir, NULL, - &debugfs_kprobe_ei_ops); - if (!file) - goto error; - return 0; error: diff --git a/kernel/module.c b/kernel/module.c index bd695bfdc5c4..601494d4b7ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", - sizeof(*mod->kprobe_ei_funcs), - &mod->num_kprobe_ei_funcs); +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6400e1bf97c5..7114c885a78a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,7 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on HAVE_KPROBE_OVERRIDE + depends on FUNCTION_ERROR_INJECTION default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 24ed6363e00f..f274468cbc45 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "trace_probe.h" #include "trace.h" @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); - arch_kprobe_override_function(regs); + override_function_with_return(regs); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8c90441bc87..1fad24acd444 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -107,7 +108,7 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call) } else { addr = (unsigned long)tk->rp.kp.addr; } - return within_kprobe_error_injection_list(addr); + return within_error_injection_list(addr); } static int register_kprobe_event(struct trace_kprobe *tk); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9d5b78aad4c5..2a33efdd1fea 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1500,6 +1500,10 @@ config FAULT_INJECTION Provide fault-injection framework. For more details, see Documentation/fault-injection/. +config FUNCTION_ERROR_INJECTION + def_bool y + depends on HAVE_FUNCTION_ERROR_INJECTION && KPROBES + config FAILSLAB bool "Fault-injection capability for kmalloc" depends on FAULT_INJECTION diff --git a/lib/Makefile b/lib/Makefile index a6c8529dd9b2..75ec13778cd8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -149,6 +149,7 @@ obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ of-reconfig-notifier-error-inject.o +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/lib/error-inject.c b/lib/error-inject.c new file mode 100644 index 000000000000..bccadcf3c981 --- /dev/null +++ b/lib/error-inject.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +// error-inject.c: Function-level error injection table +#include +#include +#include +#include +#include +#include +#include +#include + +/* Whitelist of symbols that can be overridden for error injection. */ +static LIST_HEAD(error_injection_list); +static DEFINE_MUTEX(ei_mutex); +struct ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + +bool within_error_injection_list(unsigned long addr) +{ + struct ei_entry *ent; + bool ret = false; + + mutex_lock(&ei_mutex); + list_for_each_entry(ent, &error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) { + ret = true; + break; + } + } + mutex_unlock(&ei_mutex); + return ret; +} + +/* + * Lookup and populate the error_injection_list. + * + * For safety reasons we only allow certain functions to be overridden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. + */ +static void populate_error_injection_list(unsigned long *start, + unsigned long *end, void *priv) +{ + unsigned long *iter; + struct ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &error_injection_list); + } + mutex_unlock(&ei_mutex); +} + +/* Markers of the _error_inject_whitelist section */ +extern unsigned long __start_error_injection_whitelist[]; +extern unsigned long __stop_error_injection_whitelist[]; + +static void __init populate_kernel_ei_list(void) +{ + populate_error_injection_list(__start_error_injection_whitelist, + __stop_error_injection_whitelist, + NULL); +} + +#ifdef CONFIG_MODULES +static void module_load_ei_list(struct module *mod) +{ + if (!mod->num_ei_funcs) + return; + + populate_error_injection_list(mod->ei_funcs, + mod->ei_funcs + mod->num_ei_funcs, mod); +} + +static void module_unload_ei_list(struct module *mod) +{ + struct ei_entry *ent, *n; + + if (!mod->num_ei_funcs) + return; + + mutex_lock(&ei_mutex); + list_for_each_entry_safe(ent, n, &error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&ei_mutex); +} + +/* Module notifier call back, checking error injection table on the module */ +static int ei_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_COMING) + module_load_ei_list(mod); + else if (val == MODULE_STATE_GOING) + module_unload_ei_list(mod); + + return NOTIFY_DONE; +} + +static struct notifier_block ei_module_nb = { + .notifier_call = ei_module_callback, + .priority = 0 +}; + +static __init int module_ei_init(void) +{ + return register_module_notifier(&ei_module_nb); +} +#else /* !CONFIG_MODULES */ +#define module_ei_init() (0) +#endif + +/* + * error_injection/whitelist -- shows which functions can be overridden for + * error injection. + */ +static void *ei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ei_mutex); + return seq_list_start(&error_injection_list, *pos); +} + +static void ei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&ei_mutex); +} + +static void *ei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &error_injection_list, pos); +} + +static int ei_seq_show(struct seq_file *m, void *v) +{ + struct ei_entry *ent = list_entry(v, struct ei_entry, list); + + seq_printf(m, "%pf\n", (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations ei_seq_ops = { + .start = ei_seq_start, + .next = ei_seq_next, + .stop = ei_seq_stop, + .show = ei_seq_show, +}; + +static int ei_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &ei_seq_ops); +} + +static const struct file_operations debugfs_ei_ops = { + .open = ei_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ei_debugfs_init(void) +{ + struct dentry *dir, *file; + + dir = debugfs_create_dir("error_injection", NULL); + if (!dir) + return -ENOMEM; + + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + return 0; +} + +static int __init init_error_injection(void) +{ + populate_kernel_ei_list(); + + if (!module_ei_init()) + ei_debugfs_init(); + + return 0; +} +late_initcall(init_error_injection); From 663faf9f7beeaca4ad0176bb96c776eed9dad0c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:55:33 +0900 Subject: [PATCH 05/33] error-injection: Add injectable error types Add injectable error types for each error-injectable function. One motivation of error injection test is to find software flaws, mistakes or mis-handlings of expectable errors. If we find such flaws by the test, that is a program bug, so we need to fix it. But if the tester miss input the error (e.g. just return success code without processing anything), it causes unexpected behavior even if the caller is correctly programmed to handle any errors. That is not what we want to test by error injection. To clarify what type of errors the caller must expect for each injectable function, this introduces injectable error types: - EI_ETYPE_NULL : means the function will return NULL if it fails. No ERR_PTR, just a NULL. - EI_ETYPE_ERRNO : means the function will return -ERRNO if it fails. - EI_ETYPE_ERRNO_NULL : means the function will return -ERRNO (ERR_PTR) or NULL. ALLOW_ERROR_INJECTION() macro is expanded to get one of NULL, ERRNO, ERRNO_NULL to record the error type for each function. e.g. ALLOW_ERROR_INJECTION(open_ctree, ERRNO) This error types are shown in debugfs as below. ==== / # cat /sys/kernel/debug/error_injection/list open_ctree [btrfs] ERRNO io_ctl_init [btrfs] ERRNO ==== Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/free-space-cache.c | 2 +- include/asm-generic/error-injection.h | 23 +++++++++++--- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/error-injection.h | 6 ++++ include/linux/module.h | 3 +- lib/error-inject.c | 43 ++++++++++++++++++++++----- 7 files changed, 66 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9798e21ebe9d..83e2349e1362 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3124,7 +3124,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } -ALLOW_ERROR_INJECTION(open_ctree); +ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef847699031a..586bb06472bb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -333,7 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return 0; } -ALLOW_ERROR_INJECTION(io_ctl_init); +ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h index 08352c9d9f97..296c65442f00 100644 --- a/include/asm-generic/error-injection.h +++ b/include/asm-generic/error-injection.h @@ -3,17 +3,32 @@ #define _ASM_GENERIC_ERROR_INJECTION_H #if defined(__KERNEL__) && !defined(__ASSEMBLY__) +enum { + EI_ETYPE_NONE, /* Dummy value for undefined case */ + EI_ETYPE_NULL, /* Return NULL if failure */ + EI_ETYPE_ERRNO, /* Return -ERRNO if failure */ + EI_ETYPE_ERRNO_NULL, /* Return -ERRNO or NULL if failure */ +}; + +struct error_injection_entry { + unsigned long addr; + int etype; +}; + #ifdef CONFIG_FUNCTION_ERROR_INJECTION /* * Whitelist ganerating macro. Specify functions which can be * error-injectable using this macro. */ -#define ALLOW_ERROR_INJECTION(fname) \ -static unsigned long __used \ +#define ALLOW_ERROR_INJECTION(fname, _etype) \ +static struct error_injection_entry __used \ __attribute__((__section__("_error_injection_whitelist"))) \ - _eil_addr_##fname = (unsigned long)fname; + _eil_addr_##fname = { \ + .addr = (unsigned long)fname, \ + .etype = EI_ETYPE_##_etype, \ + }; #else -#define ALLOW_ERROR_INJECTION(fname) +#define ALLOW_ERROR_INJECTION(fname, _etype) #endif #endif diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f2068cca5206..ebe544e048cd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -137,7 +137,7 @@ #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION -#define ERROR_INJECT_WHITELIST() . = ALIGN(8); \ +#define ERROR_INJECT_WHITELIST() STRUCT_ALIGN(); \ VMLINUX_SYMBOL(__start_error_injection_whitelist) = .;\ KEEP(*(_error_injection_whitelist)) \ VMLINUX_SYMBOL(__stop_error_injection_whitelist) = .; diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h index 130a67c50dac..280c61ecbf20 100644 --- a/include/linux/error-injection.h +++ b/include/linux/error-injection.h @@ -7,6 +7,7 @@ #include extern bool within_error_injection_list(unsigned long addr); +extern int get_injectable_error_type(unsigned long addr); #else /* !CONFIG_FUNCTION_ERROR_INJECTION */ @@ -16,6 +17,11 @@ static inline bool within_error_injection_list(unsigned long addr) return false; } +static inline int get_injectable_error_type(unsigned long addr) +{ + return EI_ETYPE_NONE; +} + #endif #endif /* _LINUX_ERROR_INJECTION_H */ diff --git a/include/linux/module.h b/include/linux/module.h index 792e51d83bda..9642d3116718 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -477,8 +478,8 @@ struct module { #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION + struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; - unsigned long *ei_funcs; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT diff --git a/lib/error-inject.c b/lib/error-inject.c index bccadcf3c981..c0d4600f4896 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -16,6 +16,7 @@ struct ei_entry { struct list_head list; unsigned long start_addr; unsigned long end_addr; + int etype; void *priv; }; @@ -35,6 +36,17 @@ bool within_error_injection_list(unsigned long addr) return ret; } +int get_injectable_error_type(unsigned long addr) +{ + struct ei_entry *ent; + + list_for_each_entry(ent, &error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return ent->etype; + } + return EI_ETYPE_NONE; +} + /* * Lookup and populate the error_injection_list. * @@ -42,16 +54,17 @@ bool within_error_injection_list(unsigned long addr) * bpf_error_injection, so we need to populate the list of the symbols that have * been marked as safe for overriding. */ -static void populate_error_injection_list(unsigned long *start, - unsigned long *end, void *priv) +static void populate_error_injection_list(struct error_injection_entry *start, + struct error_injection_entry *end, + void *priv) { - unsigned long *iter; + struct error_injection_entry *iter; struct ei_entry *ent; unsigned long entry, offset = 0, size = 0; mutex_lock(&ei_mutex); for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); + entry = arch_deref_entry_point((void *)iter->addr); if (!kernel_text_address(entry) || !kallsyms_lookup_size_offset(entry, &size, &offset)) { @@ -65,6 +78,7 @@ static void populate_error_injection_list(unsigned long *start, break; ent->start_addr = entry; ent->end_addr = entry + size; + ent->etype = iter->etype; ent->priv = priv; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, &error_injection_list); @@ -73,8 +87,8 @@ static void populate_error_injection_list(unsigned long *start, } /* Markers of the _error_inject_whitelist section */ -extern unsigned long __start_error_injection_whitelist[]; -extern unsigned long __stop_error_injection_whitelist[]; +extern struct error_injection_entry __start_error_injection_whitelist[]; +extern struct error_injection_entry __stop_error_injection_whitelist[]; static void __init populate_kernel_ei_list(void) { @@ -157,11 +171,26 @@ static void *ei_seq_next(struct seq_file *m, void *v, loff_t *pos) return seq_list_next(v, &error_injection_list, pos); } +static const char *error_type_string(int etype) +{ + switch (etype) { + case EI_ETYPE_NULL: + return "NULL"; + case EI_ETYPE_ERRNO: + return "ERRNO"; + case EI_ETYPE_ERRNO_NULL: + return "ERRNO_NULL"; + default: + return "(unknown)"; + } +} + static int ei_seq_show(struct seq_file *m, void *v) { struct ei_entry *ent = list_entry(v, struct ei_entry, list); - seq_printf(m, "%pf\n", (void *)ent->start_addr); + seq_printf(m, "%pf\t%s\n", (void *)ent->start_addr, + error_type_string(ent->etype)); return 0; } From 4b1a29a7f5425d32640b34b8a755f34e02f64d0f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:56:03 +0900 Subject: [PATCH 06/33] error-injection: Support fault injection framework Support in-kernel fault-injection framework via debugfs. This allows you to inject a conditional error to specified function using debugfs interfaces. Here is the result of test script described in Documentation/fault-injection/fault-injection.txt =========== # ./test_fail_function.sh 1+0 records in 1+0 records out 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0227404 s, 46.1 MB/s btrfs-progs v4.4 See http://btrfs.wiki.kernel.org for more information. Label: (null) UUID: bfa96010-12e9-4360-aed0-42eec7af5798 Node size: 16384 Sector size: 4096 Filesystem size: 1001.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 58.00MiB System: DUP 12.00MiB SSD detected: no Incompat features: extref, skinny-metadata Number of devices: 1 Devices: ID SIZE PATH 1 1001.00MiB /dev/loop2 mount: mount /dev/loop2 on /opt/tmpmnt failed: Cannot allocate memory SUCCESS! =========== Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- .../fault-injection/fault-injection.txt | 68 ++++ kernel/Makefile | 1 + kernel/fail_function.c | 349 ++++++++++++++++++ lib/Kconfig.debug | 10 + 4 files changed, 428 insertions(+) create mode 100644 kernel/fail_function.c diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 918972babcd8..f4a32463ca48 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -30,6 +30,12 @@ o fail_mmc_request injects MMC data errors on devices permitted by setting debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request +o fail_function + + injects error return on specific functions, which are marked by + ALLOW_ERROR_INJECTION() macro, by setting debugfs entries + under /sys/kernel/debug/fail_function. No boot option supported. + Configure fault-injection capabilities behavior ----------------------------------------------- @@ -123,6 +129,29 @@ configuration of fault-injection capabilities. default is 'N', setting it to 'Y' will disable failure injections when dealing with private (address space) futexes. +- /sys/kernel/debug/fail_function/inject: + + Format: { 'function-name' | '!function-name' | '' } + specifies the target function of error injection by name. + If the function name leads '!' prefix, given function is + removed from injection list. If nothing specified ('') + injection list is cleared. + +- /sys/kernel/debug/fail_function/injectable: + + (read only) shows error injectable functions and what type of + error values can be specified. The error type will be one of + below; + - NULL: retval must be 0. + - ERRNO: retval must be -1 to -MAX_ERRNO (-4096). + - ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096). + +- /sys/kernel/debug/fail_function//retval: + + specifies the "error" return value to inject to the given + function for given function. This will be created when + user specifies new injection entry. + o Boot option In order to inject faults while debugfs is not available (early boot time), @@ -268,6 +297,45 @@ trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT echo "Injecting errors into the module $module... (interrupt to stop)" sleep 1000000 +------------------------------------------------------------------------------ + +o Inject open_ctree error while btrfs mount + +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir -p tmpmnt + +FAILTYPE=fail_function +FAILFUNC=open_ctree +echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject +echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval +echo N > /sys/kernel/debug/$FAILTYPE/task-filter +echo 100 > /sys/kernel/debug/$FAILTYPE/probability +echo 0 > /sys/kernel/debug/$FAILTYPE/interval +echo -1 > /sys/kernel/debug/$FAILTYPE/times +echo 0 > /sys/kernel/debug/$FAILTYPE/space +echo 1 > /sys/kernel/debug/$FAILTYPE/verbose + +mount -t btrfs $DEVICE tmpmnt +if [ $? -ne 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" + umount tmpmnt +fi + +echo > /sys/kernel/debug/$FAILTYPE/inject + +rmdir tmpmnt +losetup -d $DEVICE +rm testfile.img + + Tool to run command with failslab or fail_page_alloc ---------------------------------------------------- In order to make it easier to accomplish the tasks mentioned above, we can use diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..21b0122cb39c --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +struct fei_attr { + struct list_head list; + struct kprobe kp; + unsigned long retval; +}; +static DEFINE_MUTEX(fei_lock); +static LIST_HEAD(fei_attr_list); +static DECLARE_FAULT_ATTR(fei_fault_attr); +static struct dentry *fei_debugfs_dir; + +static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) +{ + switch (get_injectable_error_type(addr)) { + case EI_ETYPE_NULL: + if (retv != 0) + return 0; + break; + case EI_ETYPE_ERRNO: + if (retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + case EI_ETYPE_ERRNO_NULL: + if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + } + + return retv; +} + +static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) +{ + struct fei_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (attr) { + attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); + if (!attr->kp.symbol_name) { + kfree(attr); + return NULL; + } + attr->kp.pre_handler = fei_kprobe_handler; + attr->retval = adjust_error_retval(addr, 0); + INIT_LIST_HEAD(&attr->list); + } + return attr; +} + +static void fei_attr_free(struct fei_attr *attr) +{ + if (attr) { + kfree(attr->kp.symbol_name); + kfree(attr); + } +} + +static struct fei_attr *fei_attr_lookup(const char *sym) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (!strcmp(attr->kp.symbol_name, sym)) + return attr; + } + + return NULL; +} + +static bool fei_attr_is_valid(struct fei_attr *_attr) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (attr == _attr) + return true; + } + + return false; +} + +static int fei_retval_set(void *data, u64 val) +{ + struct fei_attr *attr = data; + unsigned long retv = (unsigned long)val; + int err = 0; + + mutex_lock(&fei_lock); + /* + * Since this operation can be done after retval file is removed, + * It is safer to check the attr is still valid before accessing + * its member. + */ + if (!fei_attr_is_valid(attr)) { + err = -ENOENT; + goto out; + } + + if (attr->kp.addr) { + if (adjust_error_retval((unsigned long)attr->kp.addr, + val) != retv) + err = -EINVAL; + } + if (!err) + attr->retval = val; +out: + mutex_unlock(&fei_lock); + + return err; +} + +static int fei_retval_get(void *data, u64 *val) +{ + struct fei_attr *attr = data; + int err = 0; + + mutex_lock(&fei_lock); + /* Here we also validate @attr to ensure it still exists. */ + if (!fei_attr_is_valid(attr)) + err = -ENOENT; + else + *val = attr->retval; + mutex_unlock(&fei_lock); + + return err; +} +DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, + "%llx\n"); + +static int fei_debugfs_add_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); + if (!dir) + return -ENOMEM; + + if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +static void fei_debugfs_remove_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); + if (dir) + debugfs_remove_recursive(dir); +} + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + struct fei_attr *attr = container_of(kp, struct fei_attr, kp); + + if (should_fail(&fei_fault_attr, 1)) { + regs_set_return_value(regs, attr->retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&fei_lock); + return seq_list_start(&fei_attr_list, *pos); +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&fei_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &fei_attr_list, pos); +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + struct fei_attr *attr = list_entry(v, struct fei_attr, list); + + seq_printf(m, "%pf\n", attr->kp.addr); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fei_seq_ops); +} + +static void fei_attr_remove(struct fei_attr *attr) +{ + fei_debugfs_remove_attr(attr); + unregister_kprobe(&attr->kp); + list_del(&attr->list); + fei_attr_free(attr); +} + +static void fei_attr_remove_all(void) +{ + struct fei_attr *attr, *n; + + list_for_each_entry_safe(attr, n, &fei_attr_list, list) { + fei_attr_remove(attr); + } +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct fei_attr *attr; + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + mutex_lock(&fei_lock); + + /* Writing just spaces will remove all injection points */ + if (sym[0] == '\0') { + fei_attr_remove_all(); + ret = count; + goto out; + } + /* Writing !function will remove one injection point */ + if (sym[0] == '!') { + attr = fei_attr_lookup(sym + 1); + if (!attr) { + ret = -ENOENT; + goto out; + } + fei_attr_remove(attr); + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = -EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + if (fei_attr_lookup(sym)) { + ret = -EBUSY; + goto out; + } + attr = fei_attr_new(sym, addr); + if (!attr) { + ret = -ENOMEM; + goto out; + } + + ret = register_kprobe(&attr->kp); + if (!ret) + ret = fei_debugfs_add_attr(attr); + if (ret < 0) + fei_attr_remove(attr); + else { + list_add_tail(&attr->list, &fei_attr_list); + ret = count; + } +out: + kfree(buf); + mutex_unlock(&fei_lock); + return ret; +} + +static const struct file_operations fei_ops = { + .open = fei_open, + .read = seq_read, + .write = fei_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init fei_debugfs_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_function", NULL, + &fei_fault_attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + /* injectable attribute is just a symlink of error_inject/list */ + if (!debugfs_create_symlink("injectable", dir, + "../error_injection/list")) + goto error; + + if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) + goto error; + + fei_debugfs_dir = dir; + + return 0; +error: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +late_initcall(fei_debugfs_init); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a33efdd1fea..890d4766cef3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1551,6 +1551,16 @@ config FAIL_FUTEX help Provide fault-injection capability for futexes. +config FAIL_FUNCTION + bool "Fault-injection capability for functions" + depends on FAULT_INJECTION_DEBUG_FS && FUNCTION_ERROR_INJECTION + help + Provide function-based fault-injection capability. + This will allow you to override a specific function with a return + with given return value. As a result, function caller will see + an error value and have to handle it. This is useful to test the + error handling in various subsystems. + config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS From 1110f3a9bcf394c06b81a98206aee9b6860653c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:03 -0800 Subject: [PATCH 07/33] bpf: add map_alloc_check callback .map_alloc callbacks contain a number of checks validating user- -provided map attributes against constraints of a particular map type. For offloaded maps we will need to check map attributes without actually allocating any memory on the host. Add a new callback for validating attributes before any memory is allocated. This callback can be selectively implemented by map types for sharing code with offloads, or simply to separate the logical steps of validation and allocation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3496977203a3..263598619f90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -25,6 +25,7 @@ struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ + int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2bac0dc8baba..c0ac03a04880 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -96,16 +96,25 @@ static int check_uarg_tail_zero(void __user *uaddr, static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return ERR_PTR(err); + } + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } From daffc5a2e6f4bf4f99b00e183117920e321b6763 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:04 -0800 Subject: [PATCH 08/33] bpf: hashtab: move attribute validation before allocation Number of attribute checks are currently performed after hashtab is already allocated. Move them to be able to split them out to the check function later on. Checks have to now be performed on the attr union directly instead of the members of bpf_map, since bpf_map will be allocated later. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 47 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3905d4bc5b80..b80f42adf068 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -269,6 +269,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return ERR_PTR(-EINVAL); + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return ERR_PTR(-E2BIG); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -281,14 +303,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.map_flags = attr->map_flags; htab->map.numa_node = numa_node; - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; - if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same @@ -304,22 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) @@ -327,6 +325,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); + err = -E2BIG; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) From 9328e0d1bc09e96bd7dc85374f5c2a1e0e04e539 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:05 -0800 Subject: [PATCH 09/33] bpf: hashtab: move checks out of alloc function Use the new callback to perform allocation checks for hash maps. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 123 +++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 50 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b80f42adf068..7fd6519444d3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -227,6 +227,70 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ +static int htab_map_alloc_check(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); + + BUILD_BUG_ON(offsetof(struct htab_elem, htab) != + offsetof(struct htab_elem, hash_node.pprev)); + BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != + offsetof(struct htab_elem, hash_node.pprev)); + + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return -EPERM; + + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) + /* reserved bits should not be used */ + return -EINVAL; + + if (!lru && percpu_lru) + return -EINVAL; + + if (lru && !prealloc) + return -ENOTSUPP; + + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return -EINVAL; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return -EINVAL; + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return -E2BIG; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return -E2BIG; + + return 0; +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -245,52 +309,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; - BUILD_BUG_ON(offsetof(struct htab_elem, htab) != - offsetof(struct htab_elem, hash_node.pprev)); - BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != - offsetof(struct htab_elem, hash_node.pprev)); - - if (lru && !capable(CAP_SYS_ADMIN)) - /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. - */ - return ERR_PTR(-EPERM); - - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); - - if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); - - if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); - - if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); - - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - if (attr->max_entries == 0 || attr->key_size == 0 || - attr->value_size == 0) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -1142,6 +1160,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1152,6 +1171,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1235,6 +1255,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1244,6 +1265,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1252,11 +1274,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) @@ -1327,7 +1349,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1371,6 +1393,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, From bd475643d74e8ed78bfd36d941053b0e45974e8e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:06 -0800 Subject: [PATCH 10/33] bpf: add helper for copying attrs to struct bpf_map All map types reimplement the field-by-field copy of union bpf_attr members into struct bpf_map. Add a helper to perform this operation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 8 +------- kernel/bpf/hashtab.c | 9 +-------- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/sockmap.c | 8 +------- kernel/bpf/stackmap.c | 6 +----- kernel/bpf/syscall.c | 10 ++++++++++ 8 files changed, 17 insertions(+), 40 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 263598619f90..c60ddfb34d41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -378,6 +378,7 @@ void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..192151ec9d12 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ebdef54bf7df..565f9ece9115 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7fd6519444d3..b76828f23b49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -304,7 +304,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); - int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -313,13 +312,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..584e02227671 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 079968680bc3..0314d1783d77 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6c63c2222ea8..b0ecf43f5894 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0ac03a04880..a3f726bb42ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -143,6 +143,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); From 0a9c1991f285f829fd786fa2a9c824c2a3f87bc6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:07 -0800 Subject: [PATCH 11/33] bpf: rename bpf_dev_offload -> bpf_prog_offload With map offload coming, we need to call program offload structure something less ambiguous. Pure rename, no functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 +- include/linux/bpf.h | 4 ++-- kernel/bpf/offload.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 320b2250d29a..6590228d3755 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -237,7 +237,7 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, int err; if (prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; if (!offload) return -EINVAL; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c60ddfb34d41..9fff1ace1d8e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -200,7 +200,7 @@ struct bpf_prog_offload_ops { int insn_idx, int prev_insn_idx); }; -struct bpf_dev_offload { +struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; @@ -230,7 +230,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; union { struct work_struct work; struct rcu_head rcu; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 040d4e0edf3f..001ddfde7874 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,7 +32,7 @@ static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -72,7 +72,7 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct net_device *netdev; ASSERT_RTNL(); @@ -110,7 +110,7 @@ exit_unlock: int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); @@ -124,7 +124,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; data.offload.prog = prog; @@ -242,7 +242,7 @@ static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; + struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); From 5bc2d55c6a69ef9efd11740359974b08ea11f1d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:08 -0800 Subject: [PATCH 12/33] bpf: offload: factor out netdev checking at allocation time Add a helper to check if netdev could be found and whether it has .ndo_bpf callback. There is no need to check the callback every time it's invoked, ndos can't reasonably be swapped for a set without .ndp_bpf while program is loaded. bpf_dev_offload_check() will also be used by map offload. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 001ddfde7874..cdd1e19a668b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -30,9 +30,19 @@ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -49,12 +59,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); - if (!offload->netdev) - goto err_free; + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); dev_put(offload->netdev); @@ -63,10 +76,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return 0; err_unlock: up_write(&bpf_devs_lock); - dev_put(offload->netdev); -err_free: +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); kfree(offload); - return -EINVAL; + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -80,8 +94,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, if (!offload) return -ENODEV; netdev = offload->netdev; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; data->command = cmd; From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:09 -0800 Subject: [PATCH 13/33] bpf: offload: add map offload infrastructure BPF map offload follow similar path to program offload. At creation time users may specify ifindex of the device on which they want to create the map. Map will be validated by the kernel's .map_alloc_check callback and device driver will be called for the actual allocation. Map will have an empty set of operations associated with it (save for alloc and free callbacks). The real device callbacks are kept in map->offload->dev_ops because they have slightly different signatures. Map operations are called in process context so the driver may communicate with HW freely, msleep(), wait() etc. Map alloc and free callbacks are muxed via existing .ndo_bpf, and are always called with rtnl lock held. Maps and programs are guaranteed to be destroyed before .ndo_uninit (i.e. before unregister_netdev() returns). Map callbacks are invoked with bpf_devs_lock *read* locked, drivers must take care of exclusive locking if necessary. All offload-specific branches are marked with unlikely() (through bpf_map_is_dev_bound()), given that branch penalty will be negligible compared to IO anyway, and we don't want to penalize SW path unnecessarily. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 59 +++++++++++ include/linux/netdevice.h | 6 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/offload.c | 188 +++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 44 ++++++-- kernel/bpf/verifier.c | 7 ++ tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 293 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9fff1ace1d8e..5c2c104dc2c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,6 +74,33 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offloaded_map; + +struct bpf_map_dev_ops { + int (*map_get_next_key)(struct bpf_offloaded_map *map, + void *key, void *next_key); + int (*map_lookup_elem)(struct bpf_offloaded_map *map, + void *key, void *value); + int (*map_update_elem)(struct bpf_offloaded_map *map, + void *key, void *value, u64 flags); + int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); +}; + +struct bpf_offloaded_map { + struct bpf_map map; + struct net_device *netdev; + const struct bpf_map_dev_ops *dev_ops; + void *dev_priv; + struct list_head offloads; +}; + +static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) +{ + return container_of(map, struct bpf_offloaded_map, map); +} + +extern const struct bpf_map_ops bpf_map_offload_ops; + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags); +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); +int bpf_map_offload_get_next_key(struct bpf_map *map, + void *key, void *next_key); + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); @@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return aux->offload_requested; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return unlikely(map->ops == &bpf_map_offload_ops); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); +void bpf_map_offload_map_free(struct bpf_map *map); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return false; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return false; +} + +static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void bpf_map_offload_map_free(struct bpf_map *map) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7b348e8498..0b3ab42d50fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -804,6 +804,8 @@ enum bpf_netdev_command { BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, + BPF_OFFLOAD_MAP_ALLOC, + BPF_OFFLOAD_MAP_FREE, }; struct bpf_prog_offload_ops; @@ -834,6 +836,10 @@ struct netdev_bpf { struct { struct bpf_prog *prog; } offload; + /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ + struct { + struct bpf_offloaded_map *offmap; + }; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395d261948de..7c2259e8bc54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cdd1e19a668b..453785fa1881 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -24,11 +24,13 @@ #include #include -/* Protects bpf_prog_offload_devs and offload members of all progs. +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); static int bpf_dev_offload_check(struct net_device *netdev) { @@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + return false; + if (!bpf_prog_is_dev_bound(prog->aux)) + return true; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); @@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier, break; down_write(&bpf_devs_lock); - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); up_write(&bpf_devs_lock); break; default: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3f726bb42ea..c691b9e972e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; @@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (err) return ERR_PTR(err); } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b61caa94cb..ceabb394d2dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4e8c60acfa32..69f96af4a569 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ From 4da98eea7903670d7b1d9ac83bbc6502a60bf0fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:10 -0800 Subject: [PATCH 14/33] nfp: bpf: add map data structure To be able to split code into reasonable chunks we need to add the map data structures already. Later patches will add code piece by piece. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 7 ++++++- drivers/net/ethernet/netronome/nfp/bpf/main.h | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index e8cfe300c8c4..c9fd7d417d1a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -313,6 +313,8 @@ static int nfp_bpf_init(struct nfp_app *app) bpf->app = app; app->priv = bpf; + INIT_LIST_HEAD(&bpf->map_list); + err = nfp_bpf_parse_capabilities(app); if (err) goto err_free_bpf; @@ -326,7 +328,10 @@ err_free_bpf: static void nfp_bpf_clean(struct nfp_app *app) { - kfree(app->priv); + struct nfp_app_bpf *bpf = app->priv; + + WARN_ON(!list_empty(&bpf->map_list)); + kfree(bpf); } const struct nfp_app_type app_bpf = { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 66381afee2a9..23763b22f8fc 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -93,6 +93,8 @@ enum pkt_vec { * struct nfp_app_bpf - bpf app priv structure * @app: backpointer to the app * + * @map_list: list of offloaded maps + * * @adjust_head: adjust head capability * @flags: extra flags for adjust head * @off_min: minimal packet offset within buffer required @@ -103,6 +105,8 @@ enum pkt_vec { struct nfp_app_bpf { struct nfp_app *app; + struct list_head map_list; + struct nfp_bpf_cap_adjust_head { u32 flags; int off_min; @@ -112,6 +116,20 @@ struct nfp_app_bpf { } adjust_head; }; +/** + * struct nfp_bpf_map - private per-map data attached to BPF maps for offload + * @offmap: pointer to the offloaded BPF map + * @bpf: back pointer to bpf app private structure + * @tid: table id identifying map on datapath + * @l: link on the nfp_app_bpf->map_list list + */ +struct nfp_bpf_map { + struct bpf_offloaded_map *offmap; + struct nfp_app_bpf *bpf; + u32 tid; + struct list_head l; +}; + struct nfp_prog; struct nfp_insn_meta; typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); From d48ae231c5e13d98e3664443c6342c2011f5df2b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:11 -0800 Subject: [PATCH 15/33] nfp: bpf: add basic control channel communication For map support we will need to send and receive control messages. Add basic support for sending a message to FW, and waiting for a reply. Control messages are tagged with a 16 bit ID. Add a simple ID allocator and make sure we don't allow too many messages in flight, to avoid request <> reply mismatches. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/Makefile | 1 + drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 238 ++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/fw.h | 22 ++ drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 + drivers/net/ethernet/netronome/nfp/bpf/main.h | 23 ++ drivers/net/ethernet/netronome/nfp/nfp_app.h | 9 + drivers/net/ethernet/netronome/nfp/nfp_net.h | 12 + .../ethernet/netronome/nfp/nfp_net_common.c | 7 + 8 files changed, 317 insertions(+) create mode 100644 drivers/net/ethernet/netronome/nfp/bpf/cmsg.c diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index 6e5ef984398b..064f00e23a19 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -44,6 +44,7 @@ endif ifeq ($(CONFIG_BPF_SYSCALL),y) nfp-objs += \ + bpf/cmsg.o \ bpf/main.o \ bpf/offload.o \ bpf/verifier.o \ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c new file mode 100644 index 000000000000..46753ee9f7c5 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "../nfp_app.h" +#include "../nfp_net.h" +#include "fw.h" +#include "main.h" + +#define cmsg_warn(bpf, msg...) nn_dp_warn(&(bpf)->app->ctrl->dp, msg) + +#define NFP_BPF_TAG_ALLOC_SPAN (U16_MAX / 4) + +static bool nfp_bpf_all_tags_busy(struct nfp_app_bpf *bpf) +{ + u16 used_tags; + + used_tags = bpf->tag_alloc_next - bpf->tag_alloc_last; + + return used_tags > NFP_BPF_TAG_ALLOC_SPAN; +} + +static int nfp_bpf_alloc_tag(struct nfp_app_bpf *bpf) +{ + /* All FW communication for BPF is request-reply. To make sure we + * don't reuse the message ID too early after timeout - limit the + * number of requests in flight. + */ + if (nfp_bpf_all_tags_busy(bpf)) { + cmsg_warn(bpf, "all FW request contexts busy!\n"); + return -EAGAIN; + } + + WARN_ON(__test_and_set_bit(bpf->tag_alloc_next, bpf->tag_allocator)); + return bpf->tag_alloc_next++; +} + +static void nfp_bpf_free_tag(struct nfp_app_bpf *bpf, u16 tag) +{ + WARN_ON(!__test_and_clear_bit(tag, bpf->tag_allocator)); + + while (!test_bit(bpf->tag_alloc_last, bpf->tag_allocator) && + bpf->tag_alloc_last != bpf->tag_alloc_next) + bpf->tag_alloc_last++; +} + +static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb) +{ + struct cmsg_hdr *hdr; + + hdr = (struct cmsg_hdr *)skb->data; + + return be16_to_cpu(hdr->tag); +} + +static struct sk_buff *__nfp_bpf_reply(struct nfp_app_bpf *bpf, u16 tag) +{ + unsigned int msg_tag; + struct sk_buff *skb; + + skb_queue_walk(&bpf->cmsg_replies, skb) { + msg_tag = nfp_bpf_cmsg_get_tag(skb); + if (msg_tag == tag) { + nfp_bpf_free_tag(bpf, tag); + __skb_unlink(skb, &bpf->cmsg_replies); + return skb; + } + } + + return NULL; +} + +static struct sk_buff *nfp_bpf_reply(struct nfp_app_bpf *bpf, u16 tag) +{ + struct sk_buff *skb; + + nfp_ctrl_lock(bpf->app->ctrl); + skb = __nfp_bpf_reply(bpf, tag); + nfp_ctrl_unlock(bpf->app->ctrl); + + return skb; +} + +static struct sk_buff *nfp_bpf_reply_drop_tag(struct nfp_app_bpf *bpf, u16 tag) +{ + struct sk_buff *skb; + + nfp_ctrl_lock(bpf->app->ctrl); + skb = __nfp_bpf_reply(bpf, tag); + if (!skb) + nfp_bpf_free_tag(bpf, tag); + nfp_ctrl_unlock(bpf->app->ctrl); + + return skb; +} + +static struct sk_buff * +nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type, + int tag) +{ + struct sk_buff *skb; + int err; + + err = wait_event_interruptible_timeout(bpf->cmsg_wq, + skb = nfp_bpf_reply(bpf, tag), + msecs_to_jiffies(5000)); + /* We didn't get a response - try last time and atomically drop + * the tag even if no response is matched. + */ + if (!skb) + skb = nfp_bpf_reply_drop_tag(bpf, tag); + if (err < 0) { + cmsg_warn(bpf, "%s waiting for response to 0x%02x: %d\n", + err == ERESTARTSYS ? "interrupted" : "error", + type, err); + return ERR_PTR(err); + } + if (!skb) { + cmsg_warn(bpf, "timeout waiting for response to 0x%02x\n", + type); + return ERR_PTR(-ETIMEDOUT); + } + + return skb; +} + +struct sk_buff * +nfp_bpf_cmsg_communicate(struct nfp_app_bpf *bpf, struct sk_buff *skb, + enum nfp_bpf_cmsg_type type, unsigned int reply_size) +{ + struct cmsg_hdr *hdr; + int tag; + + nfp_ctrl_lock(bpf->app->ctrl); + tag = nfp_bpf_alloc_tag(bpf); + if (tag < 0) { + nfp_ctrl_unlock(bpf->app->ctrl); + dev_kfree_skb_any(skb); + return ERR_PTR(tag); + } + + hdr = (void *)skb->data; + hdr->ver = CMSG_MAP_ABI_VERSION; + hdr->type = type; + hdr->tag = cpu_to_be16(tag); + + __nfp_app_ctrl_tx(bpf->app, skb); + + nfp_ctrl_unlock(bpf->app->ctrl); + + skb = nfp_bpf_cmsg_wait_reply(bpf, type, tag); + if (IS_ERR(skb)) + return skb; + + hdr = (struct cmsg_hdr *)skb->data; + /* 0 reply_size means caller will do the validation */ + if (reply_size && skb->len != reply_size) { + cmsg_warn(bpf, "cmsg drop - wrong size %d != %d!\n", + skb->len, reply_size); + goto err_free; + } + if (hdr->type != __CMSG_REPLY(type)) { + cmsg_warn(bpf, "cmsg drop - wrong type 0x%02x != 0x%02lx!\n", + hdr->type, __CMSG_REPLY(type)); + goto err_free; + } + + return skb; +err_free: + dev_kfree_skb_any(skb); + return ERR_PTR(-EIO); +} + +void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb) +{ + struct nfp_app_bpf *bpf = app->priv; + unsigned int tag; + + if (unlikely(skb->len < sizeof(struct cmsg_reply_map_simple))) { + cmsg_warn(bpf, "cmsg drop - too short %d!\n", skb->len); + goto err_free; + } + + nfp_ctrl_lock(bpf->app->ctrl); + + tag = nfp_bpf_cmsg_get_tag(skb); + if (unlikely(!test_bit(tag, bpf->tag_allocator))) { + cmsg_warn(bpf, "cmsg drop - no one is waiting for tag %u!\n", + tag); + goto err_unlock; + } + + __skb_queue_tail(&bpf->cmsg_replies, skb); + wake_up_interruptible_all(&bpf->cmsg_wq); + + nfp_ctrl_unlock(bpf->app->ctrl); + + return; +err_unlock: + nfp_ctrl_unlock(bpf->app->ctrl); +err_free: + dev_kfree_skb_any(skb); +} diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 7206aa1522db..107676b34760 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -51,4 +51,26 @@ struct nfp_bpf_cap_tlv_adjust_head { #define NFP_BPF_ADJUST_HEAD_NO_META BIT(0) +/* + * Types defined for map related control messages + */ +#define CMSG_MAP_ABI_VERSION 1 + +enum nfp_bpf_cmsg_type { + __CMSG_TYPE_MAP_MAX, +}; + +#define CMSG_TYPE_MAP_REPLY_BIT 7 +#define __CMSG_REPLY(req) (BIT(CMSG_TYPE_MAP_REPLY_BIT) | (req)) + +struct cmsg_hdr { + u8 type; + u8 ver; + __be16 tag; +}; + +struct cmsg_reply_map_simple { + struct cmsg_hdr hdr; + __be32 rc; +}; #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index c9fd7d417d1a..a14368c6449f 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -313,6 +313,8 @@ static int nfp_bpf_init(struct nfp_app *app) bpf->app = app; app->priv = bpf; + skb_queue_head_init(&bpf->cmsg_replies); + init_waitqueue_head(&bpf->cmsg_wq); INIT_LIST_HEAD(&bpf->map_list); err = nfp_bpf_parse_capabilities(app); @@ -330,6 +332,7 @@ static void nfp_bpf_clean(struct nfp_app *app) { struct nfp_app_bpf *bpf = app->priv; + WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); kfree(bpf); } @@ -348,6 +351,8 @@ const struct nfp_app_type app_bpf = { .vnic_alloc = nfp_bpf_vnic_alloc, .vnic_free = nfp_bpf_vnic_free, + .ctrl_msg_rx = nfp_bpf_ctrl_msg_rx, + .setup_tc = nfp_bpf_setup_tc, .tc_busy = nfp_bpf_tc_busy, .bpf = nfp_ndo_bpf, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 23763b22f8fc..e00a7de2a9c2 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -37,10 +37,14 @@ #include #include #include +#include #include +#include #include +#include #include "../nfp_asm.h" +#include "fw.h" /* For relocation logic use up-most byte of branch instruction as scratch * area. Remember to clear this before sending instructions to HW! @@ -93,6 +97,13 @@ enum pkt_vec { * struct nfp_app_bpf - bpf app priv structure * @app: backpointer to the app * + * @tag_allocator: bitmap of control message tags in use + * @tag_alloc_next: next tag bit to allocate + * @tag_alloc_last: next tag bit to be freed + * + * @cmsg_replies: received cmsg replies waiting to be consumed + * @cmsg_wq: work queue for waiting for cmsg replies + * * @map_list: list of offloaded maps * * @adjust_head: adjust head capability @@ -105,6 +116,13 @@ enum pkt_vec { struct nfp_app_bpf { struct nfp_app *app; + DECLARE_BITMAP(tag_allocator, U16_MAX + 1); + u16 tag_alloc_next; + u16 tag_alloc_last; + + struct sk_buff_head cmsg_replies; + struct wait_queue_head cmsg_wq; + struct list_head map_list; struct nfp_bpf_cap_adjust_head { @@ -284,4 +302,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, unsigned int insn_idx, unsigned int n_insns); void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv); + +struct sk_buff * +nfp_bpf_cmsg_communicate(struct nfp_app_bpf *bpf, struct sk_buff *skb, + enum nfp_bpf_cmsg_type type, unsigned int reply_size); +void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 32ff46a00f70..6a6eb02b516e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -165,6 +165,7 @@ struct nfp_app { void *priv; }; +bool __nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb); bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb); static inline int nfp_app_init(struct nfp_app *app) @@ -326,6 +327,14 @@ static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn, return app->type->xdp_offload(app, nn, prog); } +static inline bool __nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb) +{ + trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0, + skb->data, skb->len); + + return __nfp_ctrl_tx(app->ctrl, skb); +} + static inline bool nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb) { trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 0e564cfabe7e..6f6e3d6fd935 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -839,6 +839,18 @@ static inline const char *nfp_net_name(struct nfp_net *nn) return nn->dp.netdev ? nn->dp.netdev->name : "ctrl"; } +static inline void nfp_ctrl_lock(struct nfp_net *nn) + __acquires(&nn->r_vecs[0].lock) +{ + spin_lock_bh(&nn->r_vecs[0].lock); +} + +static inline void nfp_ctrl_unlock(struct nfp_net *nn) + __releases(&nn->r_vecs[0].lock) +{ + spin_unlock_bh(&nn->r_vecs[0].lock); +} + /* Globals */ extern const char nfp_driver_version[]; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 07e0587dc14e..2b5cad3069a7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1920,6 +1920,13 @@ err_free: return false; } +bool __nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb) +{ + struct nfp_net_r_vector *r_vec = &nn->r_vecs[0]; + + return nfp_ctrl_tx_one(nn, r_vec, skb, false); +} + bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb) { struct nfp_net_r_vector *r_vec = &nn->r_vecs[0]; From ff3d43f7568c82b335d7df2d40a31447c3fce10c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:12 -0800 Subject: [PATCH 16/33] nfp: bpf: implement helpers for FW map ops Implement calls for FW map communication. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 210 +++++++++++++++++- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 65 ++++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 17 +- 3 files changed, 288 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 46753ee9f7c5..71e6586acc36 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include #include @@ -79,6 +80,28 @@ static void nfp_bpf_free_tag(struct nfp_app_bpf *bpf, u16 tag) bpf->tag_alloc_last++; } +static struct sk_buff * +nfp_bpf_cmsg_alloc(struct nfp_app_bpf *bpf, unsigned int size) +{ + struct sk_buff *skb; + + skb = nfp_app_ctrl_msg_alloc(bpf->app, size, GFP_KERNEL); + skb_put(skb, size); + + return skb; +} + +static struct sk_buff * +nfp_bpf_cmsg_map_req_alloc(struct nfp_app_bpf *bpf, unsigned int n) +{ + unsigned int size; + + size = sizeof(struct cmsg_req_map_op); + size += sizeof(struct cmsg_key_value_pair) * n; + + return nfp_bpf_cmsg_alloc(bpf, size); +} + static unsigned int nfp_bpf_cmsg_get_tag(struct sk_buff *skb) { struct cmsg_hdr *hdr; @@ -159,7 +182,7 @@ nfp_bpf_cmsg_wait_reply(struct nfp_app_bpf *bpf, enum nfp_bpf_cmsg_type type, return skb; } -struct sk_buff * +static struct sk_buff * nfp_bpf_cmsg_communicate(struct nfp_app_bpf *bpf, struct sk_buff *skb, enum nfp_bpf_cmsg_type type, unsigned int reply_size) { @@ -206,6 +229,191 @@ err_free: return ERR_PTR(-EIO); } +static int +nfp_bpf_ctrl_rc_to_errno(struct nfp_app_bpf *bpf, + struct cmsg_reply_map_simple *reply) +{ + static const int res_table[] = { + [CMSG_RC_SUCCESS] = 0, + [CMSG_RC_ERR_MAP_FD] = -EBADFD, + [CMSG_RC_ERR_MAP_NOENT] = -ENOENT, + [CMSG_RC_ERR_MAP_ERR] = -EINVAL, + [CMSG_RC_ERR_MAP_PARSE] = -EIO, + [CMSG_RC_ERR_MAP_EXIST] = -EEXIST, + [CMSG_RC_ERR_MAP_NOMEM] = -ENOMEM, + [CMSG_RC_ERR_MAP_E2BIG] = -E2BIG, + }; + u32 rc; + + rc = be32_to_cpu(reply->rc); + if (rc >= ARRAY_SIZE(res_table)) { + cmsg_warn(bpf, "FW responded with invalid status: %u\n", rc); + return -EIO; + } + + return res_table[rc]; +} + +long long int +nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map) +{ + struct cmsg_reply_map_alloc_tbl *reply; + struct cmsg_req_map_alloc_tbl *req; + struct sk_buff *skb; + u32 tid; + int err; + + skb = nfp_bpf_cmsg_alloc(bpf, sizeof(*req)); + if (!skb) + return -ENOMEM; + + req = (void *)skb->data; + req->key_size = cpu_to_be32(map->key_size); + req->value_size = cpu_to_be32(map->value_size); + req->max_entries = cpu_to_be32(map->max_entries); + req->map_type = cpu_to_be32(map->map_type); + req->map_flags = 0; + + skb = nfp_bpf_cmsg_communicate(bpf, skb, CMSG_TYPE_MAP_ALLOC, + sizeof(*reply)); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + reply = (void *)skb->data; + err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr); + if (err) + goto err_free; + + tid = be32_to_cpu(reply->tid); + dev_consume_skb_any(skb); + + return tid; +err_free: + dev_kfree_skb_any(skb); + return err; +} + +void nfp_bpf_ctrl_free_map(struct nfp_app_bpf *bpf, struct nfp_bpf_map *nfp_map) +{ + struct cmsg_reply_map_free_tbl *reply; + struct cmsg_req_map_free_tbl *req; + struct sk_buff *skb; + int err; + + skb = nfp_bpf_cmsg_alloc(bpf, sizeof(*req)); + if (!skb) { + cmsg_warn(bpf, "leaking map - failed to allocate msg\n"); + return; + } + + req = (void *)skb->data; + req->tid = cpu_to_be32(nfp_map->tid); + + skb = nfp_bpf_cmsg_communicate(bpf, skb, CMSG_TYPE_MAP_FREE, + sizeof(*reply)); + if (IS_ERR(skb)) { + cmsg_warn(bpf, "leaking map - I/O error\n"); + return; + } + + reply = (void *)skb->data; + err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr); + if (err) + cmsg_warn(bpf, "leaking map - FW responded with: %d\n", err); + + dev_consume_skb_any(skb); +} + +static int +nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, + enum nfp_bpf_cmsg_type op, + u8 *key, u8 *value, u64 flags, u8 *out_key, u8 *out_value) +{ + struct nfp_bpf_map *nfp_map = offmap->dev_priv; + struct nfp_app_bpf *bpf = nfp_map->bpf; + struct bpf_map *map = &offmap->map; + struct cmsg_reply_map_op *reply; + struct cmsg_req_map_op *req; + struct sk_buff *skb; + int err; + + /* FW messages have no space for more than 32 bits of flags */ + if (flags >> 32) + return -EOPNOTSUPP; + + skb = nfp_bpf_cmsg_map_req_alloc(bpf, 1); + if (!skb) + return -ENOMEM; + + req = (void *)skb->data; + req->tid = cpu_to_be32(nfp_map->tid); + req->count = cpu_to_be32(1); + req->flags = cpu_to_be32(flags); + + /* Copy inputs */ + if (key) + memcpy(&req->elem[0].key, key, map->key_size); + if (value) + memcpy(&req->elem[0].value, value, map->value_size); + + skb = nfp_bpf_cmsg_communicate(bpf, skb, op, + sizeof(*reply) + sizeof(*reply->elem)); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + reply = (void *)skb->data; + err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr); + if (err) + goto err_free; + + /* Copy outputs */ + if (out_key) + memcpy(out_key, &reply->elem[0].key, map->key_size); + if (out_value) + memcpy(out_value, &reply->elem[0].value, map->value_size); + + dev_consume_skb_any(skb); + + return 0; +err_free: + dev_kfree_skb_any(skb); + return err; +} + +int nfp_bpf_ctrl_update_entry(struct bpf_offloaded_map *offmap, + void *key, void *value, u64 flags) +{ + return nfp_bpf_ctrl_entry_op(offmap, CMSG_TYPE_MAP_UPDATE, + key, value, flags, NULL, NULL); +} + +int nfp_bpf_ctrl_del_entry(struct bpf_offloaded_map *offmap, void *key) +{ + return nfp_bpf_ctrl_entry_op(offmap, CMSG_TYPE_MAP_DELETE, + key, NULL, 0, NULL, NULL); +} + +int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap, + void *key, void *value) +{ + return nfp_bpf_ctrl_entry_op(offmap, CMSG_TYPE_MAP_LOOKUP, + key, NULL, 0, NULL, value); +} + +int nfp_bpf_ctrl_getfirst_entry(struct bpf_offloaded_map *offmap, + void *next_key) +{ + return nfp_bpf_ctrl_entry_op(offmap, CMSG_TYPE_MAP_GETFIRST, + NULL, NULL, 0, next_key, NULL); +} + +int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, + void *key, void *next_key) +{ + return nfp_bpf_ctrl_entry_op(offmap, CMSG_TYPE_MAP_GETNEXT, + key, NULL, 0, next_key, NULL); +} + void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb) { struct nfp_app_bpf *bpf = app->priv; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index 107676b34760..e0ff68fc9562 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -57,12 +57,33 @@ struct nfp_bpf_cap_tlv_adjust_head { #define CMSG_MAP_ABI_VERSION 1 enum nfp_bpf_cmsg_type { + CMSG_TYPE_MAP_ALLOC = 1, + CMSG_TYPE_MAP_FREE = 2, + CMSG_TYPE_MAP_LOOKUP = 3, + CMSG_TYPE_MAP_UPDATE = 4, + CMSG_TYPE_MAP_DELETE = 5, + CMSG_TYPE_MAP_GETNEXT = 6, + CMSG_TYPE_MAP_GETFIRST = 7, __CMSG_TYPE_MAP_MAX, }; #define CMSG_TYPE_MAP_REPLY_BIT 7 #define __CMSG_REPLY(req) (BIT(CMSG_TYPE_MAP_REPLY_BIT) | (req)) +#define CMSG_MAP_KEY_LW 16 +#define CMSG_MAP_VALUE_LW 16 + +enum nfp_bpf_cmsg_status { + CMSG_RC_SUCCESS = 0, + CMSG_RC_ERR_MAP_FD = 1, + CMSG_RC_ERR_MAP_NOENT = 2, + CMSG_RC_ERR_MAP_ERR = 3, + CMSG_RC_ERR_MAP_PARSE = 4, + CMSG_RC_ERR_MAP_EXIST = 5, + CMSG_RC_ERR_MAP_NOMEM = 6, + CMSG_RC_ERR_MAP_E2BIG = 7, +}; + struct cmsg_hdr { u8 type; u8 ver; @@ -73,4 +94,48 @@ struct cmsg_reply_map_simple { struct cmsg_hdr hdr; __be32 rc; }; + +struct cmsg_req_map_alloc_tbl { + struct cmsg_hdr hdr; + __be32 key_size; /* in bytes */ + __be32 value_size; /* in bytes */ + __be32 max_entries; + __be32 map_type; + __be32 map_flags; /* reserved */ +}; + +struct cmsg_reply_map_alloc_tbl { + struct cmsg_reply_map_simple reply_hdr; + __be32 tid; +}; + +struct cmsg_req_map_free_tbl { + struct cmsg_hdr hdr; + __be32 tid; +}; + +struct cmsg_reply_map_free_tbl { + struct cmsg_reply_map_simple reply_hdr; + __be32 count; +}; + +struct cmsg_key_value_pair { + __be32 key[CMSG_MAP_KEY_LW]; + __be32 value[CMSG_MAP_VALUE_LW]; +}; + +struct cmsg_req_map_op { + struct cmsg_hdr hdr; + __be32 tid; + __be32 count; + __be32 flags; + struct cmsg_key_value_pair elem[0]; +}; + +struct cmsg_reply_map_op { + struct cmsg_reply_map_simple reply_hdr; + __be32 count; + __be32 resv; + struct cmsg_key_value_pair elem[0]; +}; #endif diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index e00a7de2a9c2..047f253fc581 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -303,8 +303,19 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv); -struct sk_buff * -nfp_bpf_cmsg_communicate(struct nfp_app_bpf *bpf, struct sk_buff *skb, - enum nfp_bpf_cmsg_type type, unsigned int reply_size); +long long int +nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map); +void +nfp_bpf_ctrl_free_map(struct nfp_app_bpf *bpf, struct nfp_bpf_map *nfp_map); +int nfp_bpf_ctrl_getfirst_entry(struct bpf_offloaded_map *offmap, + void *next_key); +int nfp_bpf_ctrl_update_entry(struct bpf_offloaded_map *offmap, + void *key, void *value, u64 flags); +int nfp_bpf_ctrl_del_entry(struct bpf_offloaded_map *offmap, void *key); +int nfp_bpf_ctrl_lookup_entry(struct bpf_offloaded_map *offmap, + void *key, void *value); +int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap, + void *key, void *next_key); + void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb); #endif From 9d080d5da959ac4b64954f47b5ffd35a752d268e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:13 -0800 Subject: [PATCH 17/33] nfp: bpf: parse function call and map capabilities Parse helper function and supported map FW TLV capabilities. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/fw.h | 16 +++++++ drivers/net/ethernet/netronome/nfp/bpf/main.c | 47 +++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 24 ++++++++++ 3 files changed, 87 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h index e0ff68fc9562..cfcc7bcb2c67 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h @@ -38,7 +38,14 @@ #include enum bpf_cap_tlv_type { + NFP_BPF_CAP_TYPE_FUNC = 1, NFP_BPF_CAP_TYPE_ADJUST_HEAD = 2, + NFP_BPF_CAP_TYPE_MAPS = 3, +}; + +struct nfp_bpf_cap_tlv_func { + __le32 func_id; + __le32 func_addr; }; struct nfp_bpf_cap_tlv_adjust_head { @@ -51,6 +58,15 @@ struct nfp_bpf_cap_tlv_adjust_head { #define NFP_BPF_ADJUST_HEAD_NO_META BIT(0) +struct nfp_bpf_cap_tlv_maps { + __le32 types; + __le32 max_maps; + __le32 max_elems; + __le32 max_key_sz; + __le32 max_val_sz; + __le32 max_elem_sz; +}; + /* * Types defined for map related control messages */ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index a14368c6449f..7d5cc59feb7e 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -251,6 +251,45 @@ nfp_bpf_parse_cap_adjust_head(struct nfp_app_bpf *bpf, void __iomem *value, return 0; } +static int +nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length) +{ + struct nfp_bpf_cap_tlv_func __iomem *cap = value; + + if (length < sizeof(*cap)) { + nfp_err(bpf->app->cpp, "truncated function TLV: %d\n", length); + return -EINVAL; + } + + switch (readl(&cap->func_id)) { + case BPF_FUNC_map_lookup_elem: + bpf->helpers.map_lookup = readl(&cap->func_addr); + break; + } + + return 0; +} + +static int +nfp_bpf_parse_cap_maps(struct nfp_app_bpf *bpf, void __iomem *value, u32 length) +{ + struct nfp_bpf_cap_tlv_maps __iomem *cap = value; + + if (length < sizeof(*cap)) { + nfp_err(bpf->app->cpp, "truncated maps TLV: %d\n", length); + return -EINVAL; + } + + bpf->maps.types = readl(&cap->types); + bpf->maps.max_maps = readl(&cap->max_maps); + bpf->maps.max_elems = readl(&cap->max_elems); + bpf->maps.max_key_sz = readl(&cap->max_key_sz); + bpf->maps.max_val_sz = readl(&cap->max_val_sz); + bpf->maps.max_elem_sz = readl(&cap->max_elem_sz); + + return 0; +} + static int nfp_bpf_parse_capabilities(struct nfp_app *app) { struct nfp_cpp *cpp = app->pf->cpp; @@ -276,11 +315,19 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app) goto err_release_free; switch (type) { + case NFP_BPF_CAP_TYPE_FUNC: + if (nfp_bpf_parse_cap_func(app->priv, value, length)) + goto err_release_free; + break; case NFP_BPF_CAP_TYPE_ADJUST_HEAD: if (nfp_bpf_parse_cap_adjust_head(app->priv, value, length)) goto err_release_free; break; + case NFP_BPF_CAP_TYPE_MAPS: + if (nfp_bpf_parse_cap_maps(app->priv, value, length)) + goto err_release_free; + break; default: nfp_dbg(cpp, "unknown BPF capability: %d\n", type); break; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 047f253fc581..d381ae8629a2 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -112,6 +112,17 @@ enum pkt_vec { * @off_max: maximum packet offset within buffer required * @guaranteed_sub: amount of negative adjustment guaranteed possible * @guaranteed_add: amount of positive adjustment guaranteed possible + * + * @maps: map capability + * @types: supported map types + * @max_maps: max number of maps supported + * @max_elems: max number of entries in each map + * @max_key_sz: max size of map key + * @max_val_sz: max size of map value + * @max_elem_sz: max size of map entry (key + value) + * + * @helpers: helper addressess for various calls + * @map_lookup: map lookup helper address */ struct nfp_app_bpf { struct nfp_app *app; @@ -132,6 +143,19 @@ struct nfp_app_bpf { int guaranteed_sub; int guaranteed_add; } adjust_head; + + struct { + u32 types; + u32 max_maps; + u32 max_elems; + u32 max_key_sz; + u32 max_val_sz; + u32 max_elem_sz; + } maps; + + struct { + u32 map_lookup; + } helpers; }; /** From ce4ebfd859c33ea098bfa2e1b4623128046f59c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:14 -0800 Subject: [PATCH 18/33] nfp: bpf: add helpers for updating immediate instructions Immediate loads are used to load the return address of a helper. We need to be able to update those loads for relocations. Immediate loads can be slightly more complex and spread over two instructions in general, but here we only care about simple loads of small (< 65k) constants, so complex cases are not handled. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_asm.c | 58 ++++++++++++++++++++ drivers/net/ethernet/netronome/nfp/nfp_asm.h | 4 ++ 2 files changed, 62 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c index 9ee3a3f60cc7..3f6952b66a49 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -50,6 +50,11 @@ const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = { [CMD_TGT_READ_SWAP_LE] = { 0x03, 0x40 }, }; +static bool unreg_is_imm(u16 reg) +{ + return (reg & UR_REG_IMM) == UR_REG_IMM; +} + u16 br_get_offset(u64 instr) { u16 addr_lo, addr_hi; @@ -80,6 +85,59 @@ void br_add_offset(u64 *instr, u16 offset) br_set_offset(instr, addr + offset); } +static bool immed_can_modify(u64 instr) +{ + if (FIELD_GET(OP_IMMED_INV, instr) || + FIELD_GET(OP_IMMED_SHIFT, instr) || + FIELD_GET(OP_IMMED_WIDTH, instr) != IMMED_WIDTH_ALL) { + pr_err("Can't decode/encode immed!\n"); + return false; + } + return true; +} + +u16 immed_get_value(u64 instr) +{ + u16 reg; + + if (!immed_can_modify(instr)) + return 0; + + reg = FIELD_GET(OP_IMMED_A_SRC, instr); + if (!unreg_is_imm(reg)) + reg = FIELD_GET(OP_IMMED_B_SRC, instr); + + return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr); +} + +void immed_set_value(u64 *instr, u16 immed) +{ + if (!immed_can_modify(*instr)) + return; + + if (unreg_is_imm(FIELD_GET(OP_IMMED_A_SRC, *instr))) { + *instr &= ~FIELD_PREP(OP_IMMED_A_SRC, 0xff); + *instr |= FIELD_PREP(OP_IMMED_A_SRC, immed & 0xff); + } else { + *instr &= ~FIELD_PREP(OP_IMMED_B_SRC, 0xff); + *instr |= FIELD_PREP(OP_IMMED_B_SRC, immed & 0xff); + } + + *instr &= ~OP_IMMED_IMM; + *instr |= FIELD_PREP(OP_IMMED_IMM, immed >> 8); +} + +void immed_add_value(u64 *instr, u16 offset) +{ + u16 val; + + if (!immed_can_modify(*instr)) + return; + + val = immed_get_value(*instr); + immed_set_value(instr, val + offset); +} + static u16 nfp_swreg_to_unreg(swreg reg, bool is_dst) { bool lm_id, lm_dec = false; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index 20e51cb60e69..5f9291db98e0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -138,6 +138,10 @@ enum immed_shift { IMMED_SHIFT_2B = 2, }; +u16 immed_get_value(u64 instr); +void immed_set_value(u64 *instr, u16 immed); +void immed_add_value(u64 *instr, u16 offset); + #define OP_SHF_BASE 0x08000000000ULL #define OP_SHF_A_SRC 0x000000000ffULL #define OP_SHF_SC 0x00000000300ULL From 77a3d3113ba2aa5919af2335c05bf9505f4241db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:15 -0800 Subject: [PATCH 19/33] nfp: bpf: add verification and codegen for map lookups Verify our current constraints on the location of the key are met and generate the code for calling map lookup on the datapath. New relocation types have to be added - for helpers and return addresses. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 86 +++++++++++++++++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 15 +++- .../net/ethernet/netronome/nfp/bpf/verifier.c | 39 +++++++++ 3 files changed, 138 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 47c5224f8d6f..77a5f35d7809 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -483,6 +483,21 @@ static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm) } } +static void +wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm, + enum nfp_relo_type relo) +{ + if (imm > 0xffff) { + pr_err("relocation of a large immediate!\n"); + nfp_prog->error = -EFAULT; + return; + } + emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B); + + nfp_prog->prog[nfp_prog->prog_len - 1] |= + FIELD_PREP(OP_RELO_TYPE, relo); +} + /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted) * If the @imm is small enough encode it directly in operand and return * otherwise load @imm to a spare register and return its encoding. @@ -1279,6 +1294,56 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +static int +map_lookup_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + struct bpf_offloaded_map *offmap; + struct nfp_bpf_map *nfp_map; + bool load_lm_ptr; + u32 ret_tgt; + s64 lm_off; + swreg tid; + + offmap = (struct bpf_offloaded_map *)meta->arg1.map_ptr; + nfp_map = offmap->dev_priv; + + /* We only have to reload LM0 if the key is not at start of stack */ + lm_off = nfp_prog->stack_depth; + lm_off += meta->arg2.var_off.value + meta->arg2.off; + load_lm_ptr = meta->arg2_var_off || lm_off; + + /* Set LM0 to start of key */ + if (load_lm_ptr) + emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0); + + /* Load map ID into a register, it should actually fit as an immediate + * but in case it doesn't deal with it here, not in the delay slots. + */ + tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog)); + + emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + BPF_FUNC_map_lookup_elem, + 2, RELO_BR_HELPER); + ret_tgt = nfp_prog_current_offset(nfp_prog) + 2; + + /* Load map ID into A0 */ + wrp_mov(nfp_prog, reg_a(0), tid); + + /* Load the return address into B0 */ + wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL); + + if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt)) + return -EINVAL; + + /* Reset the LM0 pointer */ + if (!load_lm_ptr) + return 0; + + emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0); + wrp_nops(nfp_prog, 3); + + return 0; +} + /* --- Callbacks --- */ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -2058,6 +2123,8 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) switch (meta->insn.imm) { case BPF_FUNC_xdp_adjust_head: return adjust_head(nfp_prog, meta); + case BPF_FUNC_map_lookup_elem: + return map_lookup_stack(nfp_prog, meta); default: WARN_ONCE(1, "verifier allowed unsupported function\n"); return -EOPNOTSUPP; @@ -2794,6 +2861,7 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv) for (i = 0; i < nfp_prog->prog_len; i++) { enum nfp_relo_type special; + u32 val; special = FIELD_GET(OP_RELO_TYPE, prog[i]); switch (special) { @@ -2813,6 +2881,24 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv) case RELO_BR_NEXT_PKT: br_set_offset(&prog[i], bv->tgt_done); break; + case RELO_BR_HELPER: + val = br_get_offset(prog[i]); + val -= BR_OFF_RELO; + switch (val) { + case BPF_FUNC_map_lookup_elem: + val = nfp_prog->bpf->helpers.map_lookup; + break; + default: + pr_err("relocation of unknown helper %d\n", + val); + err = -EINVAL; + goto err_free_prog; + } + br_set_offset(&prog[i], val); + break; + case RELO_IMMED_REL: + immed_add_value(&prog[i], bv->start_off); + break; } prog[i] &= ~OP_RELO_TYPE; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index d381ae8629a2..59197535c465 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -60,6 +60,9 @@ enum nfp_relo_type { RELO_BR_GO_ABORT, /* external jumps to fixed addresses */ RELO_BR_NEXT_PKT, + RELO_BR_HELPER, + /* immediate relocation against load address */ + RELO_IMMED_REL, }; /* To make absolute relocated branches (branches other than RELO_BR_REL) @@ -191,9 +194,12 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); * @ptr: pointer type for memory operations * @ldst_gather_len: memcpy length gathered from load/store sequence * @paired_st: the paired store insn at the head of the sequence - * @arg2: arg2 for call instructions * @ptr_not_const: pointer is not always constant * @jmp_dst: destination info for jump instructions + * @func_id: function id for call instructions + * @arg1: arg1 for call instructions + * @arg2: arg2 for call instructions + * @arg2_var_off: arg2 changes stack offset on different paths * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number * @flags: eBPF instruction extra optimization flags @@ -211,7 +217,12 @@ struct nfp_insn_meta { bool ptr_not_const; }; struct nfp_insn_meta *jmp_dst; - struct bpf_reg_state arg2; + struct { + u32 func_id; + struct bpf_reg_state arg1; + struct bpf_reg_state arg2; + bool arg2_var_off; + }; }; unsigned int off; unsigned short n; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 7890d95d4018..f867577cc315 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -110,9 +110,11 @@ static int nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, struct nfp_insn_meta *meta) { + const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1; const struct bpf_reg_state *reg2 = cur_regs(env) + BPF_REG_2; struct nfp_app_bpf *bpf = nfp_prog->bpf; u32 func_id = meta->insn.imm; + s64 off, old_off; switch (func_id) { case BPF_FUNC_xdp_adjust_head: @@ -127,11 +129,48 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, nfp_record_adjust_head(bpf, nfp_prog, meta, reg2); break; + + case BPF_FUNC_map_lookup_elem: + if (!bpf->helpers.map_lookup) { + pr_info("map_lookup: not supported by FW\n"); + return -EOPNOTSUPP; + } + if (reg2->type != PTR_TO_STACK) { + pr_info("map_lookup: unsupported key ptr type %d\n", + reg2->type); + return -EOPNOTSUPP; + } + if (!tnum_is_const(reg2->var_off)) { + pr_info("map_lookup: variable key pointer\n"); + return -EOPNOTSUPP; + } + + off = reg2->var_off.value + reg2->off; + if (-off % 4) { + pr_info("map_lookup: unaligned stack pointer %lld\n", + -off); + return -EOPNOTSUPP; + } + + /* Rest of the checks is only if we re-parse the same insn */ + if (!meta->func_id) + break; + + old_off = meta->arg2.var_off.value + meta->arg2.off; + meta->arg2_var_off |= off != old_off; + + if (meta->arg1.map_ptr != reg1->map_ptr) { + pr_info("map_lookup: called for different map\n"); + return -EOPNOTSUPP; + } + break; default: pr_vlog(env, "unsupported function id: %d\n", func_id); return -EOPNOTSUPP; } + meta->func_id = func_id; + meta->arg1 = *reg1; meta->arg2 = *reg2; return 0; From 3dd43c3319cb0ba17cec9a989124176b409da326 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:16 -0800 Subject: [PATCH 20/33] nfp: bpf: add support for reading map memory Map memory needs to use 40 bit addressing. Add handling of such accesses. Since 40 bit addresses are formed by using both 32 bit operands we need to pre-calculate the actual address instead of adding in the offset inside the instruction, like we did in 32 bit mode. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 77 ++++++++++++++++--- .../net/ethernet/netronome/nfp/bpf/verifier.c | 8 ++ 2 files changed, 76 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 77a5f35d7809..cdc949fabe98 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -553,27 +553,51 @@ wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len, emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true); } +static void +addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, + swreg *rega, swreg *regb) +{ + if (offset == reg_imm(0)) { + *rega = reg_a(src_gpr); + *regb = reg_b(src_gpr + 1); + return; + } + + emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset); + emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C, + reg_imm(0)); + *rega = imm_a(nfp_prog); + *regb = imm_b(nfp_prog); +} + /* NFP has Command Push Pull bus which supports bluk memory operations. */ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { bool descending_seq = meta->ldst_gather_len < 0; s16 len = abs(meta->ldst_gather_len); swreg src_base, off; + bool src_40bit_addr; unsigned int i; u8 xfer_num; off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE; src_base = reg_a(meta->insn.src_reg * 2); xfer_num = round_up(len, 4) / 4; + if (src_40bit_addr) + addr40_offset(nfp_prog, meta->insn.src_reg, off, &src_base, + &off); + /* Setup PREV_ALU fields to override memory read length. */ if (len > 32) wrp_immed(nfp_prog, reg_none(), CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1)); /* Memory read from source addr into transfer-in registers. */ - emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, - off, xfer_num - 1, true, len > 32); + emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, + src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0, + src_base, off, xfer_num - 1, true, len > 32); /* Move from transfer-in to transfer-out. */ for (i = 0; i < xfer_num; i++) @@ -711,20 +735,20 @@ data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size) } static int -data_ld_host_order(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, - u8 dst_gpr, int size) +data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, + swreg lreg, swreg rreg, int size, enum cmd_mode mode) { unsigned int i; u8 mask, sz; - /* We load the value from the address indicated in @offset and then + /* We load the value from the address indicated in rreg + lreg and then * mask out the data we don't need. Note: this is little endian! */ sz = max(size, 4); mask = size < 4 ? GENMASK(size - 1, 0) : 0; - emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, - reg_a(src_gpr), offset, sz / 4 - 1, true); + emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0, + lreg, rreg, sz / 4 - 1, true); i = 0; if (mask) @@ -740,6 +764,26 @@ data_ld_host_order(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, return 0; } +static int +data_ld_host_order_addr32(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, + u8 dst_gpr, u8 size) +{ + return data_ld_host_order(nfp_prog, dst_gpr, reg_a(src_gpr), offset, + size, CMD_MODE_32b); +} + +static int +data_ld_host_order_addr40(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, + u8 dst_gpr, u8 size) +{ + swreg rega, regb; + + addr40_offset(nfp_prog, src_gpr, offset, ®a, ®b); + + return data_ld_host_order(nfp_prog, dst_gpr, rega, regb, + size, CMD_MODE_40b_BA); +} + static int construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size) { @@ -1778,8 +1822,20 @@ mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); - return data_ld_host_order(nfp_prog, meta->insn.src_reg * 2, tmp_reg, - meta->insn.dst_reg * 2, size); + return data_ld_host_order_addr32(nfp_prog, meta->insn.src_reg * 2, + tmp_reg, meta->insn.dst_reg * 2, size); +} + +static int +mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + swreg tmp_reg; + + tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + + return data_ld_host_order_addr40(nfp_prog, meta->insn.src_reg * 2, + tmp_reg, meta->insn.dst_reg * 2, size); } static int @@ -1803,6 +1859,9 @@ mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, return mem_ldx_stack(nfp_prog, meta, size, meta->ptr.off + meta->ptr.var_off.value); + if (meta->ptr.type == PTR_TO_MAP_VALUE) + return mem_ldx_emem(nfp_prog, meta, size); + return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index f867577cc315..741438896cc7 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -249,6 +249,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, if (reg->type != PTR_TO_CTX && reg->type != PTR_TO_STACK && + reg->type != PTR_TO_MAP_VALUE && reg->type != PTR_TO_PACKET) { pr_vlog(env, "unsupported ptr type: %d\n", reg->type); return -EINVAL; @@ -260,6 +261,13 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, return err; } + if (reg->type == PTR_TO_MAP_VALUE) { + if (is_mbpf_store(meta)) { + pr_info("map writes not supported\n"); + return -EOPNOTSUPP; + } + } + if (meta->ptr.type != NOT_INIT && meta->ptr.type != reg->type) { pr_vlog(env, "ptr type changed for instruction %d -> %d\n", meta->ptr.type, reg->type); From 1bba4c413a328bfd216d59a212bec371e032391b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:17 -0800 Subject: [PATCH 21/33] nfp: bpf: implement bpf map offload Plug in to the stack's map offload callbacks for BPF map offload. Get next call needs some special handling on the FW side, since we can't send a NULL pointer to the FW there is a get first entry FW command. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 1 + drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 + .../net/ethernet/netronome/nfp/bpf/offload.c | 104 ++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 7d5cc59feb7e..8823c8360047 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -381,6 +381,7 @@ static void nfp_bpf_clean(struct nfp_app *app) WARN_ON(!skb_queue_empty(&bpf->cmsg_replies)); WARN_ON(!list_empty(&bpf->map_list)); + WARN_ON(bpf->maps_in_use || bpf->map_elems_in_use); kfree(bpf); } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 59197535c465..b80e75a8ecda 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -108,6 +108,8 @@ enum pkt_vec { * @cmsg_wq: work queue for waiting for cmsg replies * * @map_list: list of offloaded maps + * @maps_in_use: number of currently offloaded maps + * @map_elems_in_use: number of elements allocated to offloaded maps * * @adjust_head: adjust head capability * @flags: extra flags for adjust head @@ -138,6 +140,8 @@ struct nfp_app_bpf { struct wait_queue_head cmsg_wq; struct list_head map_list; + unsigned int maps_in_use; + unsigned int map_elems_in_use; struct nfp_bpf_cap_adjust_head { u32 flags; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 6590228d3755..e2859b2e9c6a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -36,6 +36,9 @@ * Netronome network device driver: TC offload functions for PF and VF */ +#define pr_fmt(fmt) "NFP net bpf: " fmt + +#include #include #include #include @@ -153,6 +156,103 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog) return 0; } +static int +nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap, + void *key, void *next_key) +{ + if (!key) + return nfp_bpf_ctrl_getfirst_entry(offmap, next_key); + return nfp_bpf_ctrl_getnext_entry(offmap, key, next_key); +} + +static int +nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) +{ + return nfp_bpf_ctrl_del_entry(offmap, key); +} + +static const struct bpf_map_dev_ops nfp_bpf_map_ops = { + .map_get_next_key = nfp_bpf_map_get_next_key, + .map_lookup_elem = nfp_bpf_ctrl_lookup_entry, + .map_update_elem = nfp_bpf_ctrl_update_entry, + .map_delete_elem = nfp_bpf_map_delete_elem, +}; + +static int +nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) +{ + struct nfp_bpf_map *nfp_map; + long long int res; + + if (!bpf->maps.types) + return -EOPNOTSUPP; + + if (offmap->map.map_flags || + offmap->map.numa_node != NUMA_NO_NODE) { + pr_info("map flags are not supported\n"); + return -EINVAL; + } + + if (!(bpf->maps.types & 1 << offmap->map.map_type)) { + pr_info("map type not supported\n"); + return -EOPNOTSUPP; + } + if (bpf->maps.max_maps == bpf->maps_in_use) { + pr_info("too many maps for a device\n"); + return -ENOMEM; + } + if (bpf->maps.max_elems - bpf->map_elems_in_use < + offmap->map.max_entries) { + pr_info("map with too many elements: %u, left: %u\n", + offmap->map.max_entries, + bpf->maps.max_elems - bpf->map_elems_in_use); + return -ENOMEM; + } + if (offmap->map.key_size > bpf->maps.max_key_sz || + offmap->map.value_size > bpf->maps.max_val_sz || + round_up(offmap->map.key_size, 8) + + round_up(offmap->map.value_size, 8) > bpf->maps.max_elem_sz) { + pr_info("elements don't fit in device constraints\n"); + return -ENOMEM; + } + + nfp_map = kzalloc(sizeof(*nfp_map), GFP_USER); + if (!nfp_map) + return -ENOMEM; + + offmap->dev_priv = nfp_map; + nfp_map->offmap = offmap; + nfp_map->bpf = bpf; + + res = nfp_bpf_ctrl_alloc_map(bpf, &offmap->map); + if (res < 0) { + kfree(nfp_map); + return res; + } + + nfp_map->tid = res; + offmap->dev_ops = &nfp_bpf_map_ops; + bpf->maps_in_use++; + bpf->map_elems_in_use += offmap->map.max_entries; + list_add_tail(&nfp_map->l, &bpf->map_list); + + return 0; +} + +static int +nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap) +{ + struct nfp_bpf_map *nfp_map = offmap->dev_priv; + + nfp_bpf_ctrl_free_map(bpf, nfp_map); + list_del_init(&nfp_map->l); + bpf->map_elems_in_use -= offmap->map.max_entries; + bpf->maps_in_use--; + kfree(nfp_map); + + return 0; +} + int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) { switch (bpf->command) { @@ -162,6 +262,10 @@ int nfp_ndo_bpf(struct nfp_app *app, struct nfp_net *nn, struct netdev_bpf *bpf) return nfp_bpf_translate(nn, bpf->offload.prog); case BPF_OFFLOAD_DESTROY: return nfp_bpf_destroy(nn, bpf->offload.prog); + case BPF_OFFLOAD_MAP_ALLOC: + return nfp_bpf_map_alloc(app->priv, bpf->offmap); + case BPF_OFFLOAD_MAP_FREE: + return nfp_bpf_map_free(app->priv, bpf->offmap); default: return -EINVAL; } From 45e5e1212af4633aa76db387ccaac8b41c8a7b6c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 15 Jan 2018 19:16:15 +0000 Subject: [PATCH 22/33] bpftool: recognize BPF_PROG_TYPE_CGROUP_DEVICE programs Bpftool doesn't recognize BPF_PROG_TYPE_CGROUP_DEVICE programs, so the prog show command prints the numeric type value: $ bpftool prog show 1: type 15 name bpf_prog1 tag ac9f93dbfd6d9b74 loaded_at Jan 15/07:58 uid 0 xlated 96B jited 105B memlock 4096B This patch defines the corresponding textual representation: $ bpftool prog show 1: cgroup_device name bpf_prog1 tag ac9f93dbfd6d9b74 loaded_at Jan 15/07:58 uid 0 xlated 96B jited 105B memlock 4096B Signed-off-by: Roman Gushchin Cc: Jakub Kicinski Cc: Quentin Monnet Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/prog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index c6a28be4665c..099e21cf1b5c 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -66,6 +66,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", }; static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) From 0fe875c5c7bc67072b629f13cf1ededcb4da4fb2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Jan 2018 11:27:05 +0000 Subject: [PATCH 23/33] bpf: cpumap: make some functions static Fixes the following sparse warnings: kernel/bpf/cpumap.c:146:6: warning: symbol '__cpu_map_queue_destructor' was not declared. Should it be static? kernel/bpf/cpumap.c:225:16: warning: symbol 'cpu_map_build_skb' was not declared. Should it be static? kernel/bpf/cpumap.c:340:26: warning: symbol '__cpu_map_entry_alloc' was not declared. Should it be static? kernel/bpf/cpumap.c:398:6: warning: symbol '__cpu_map_entry_free' was not declared. Should it be static? kernel/bpf/cpumap.c:441:6: warning: symbol '__cpu_map_entry_replace' was not declared. Should it be static? kernel/bpf/cpumap.c:454:5: warning: symbol 'cpu_map_delete_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:467:5: warning: symbol 'cpu_map_update_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:505:6: warning: symbol 'cpu_map_free' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 192151ec9d12..fbfdada6caee 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -137,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue @@ -216,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -331,7 +331,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -389,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -432,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. */ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -445,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -458,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -496,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; From 4c38f74c9186e2bc32789ddab3a95ed384c695d7 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 16 Jan 2018 14:15:30 +0000 Subject: [PATCH 24/33] samples/bpf: Fix trailing semicolon The trailing semicolon is an empty statement that does no operation. Removing it since it doesn't do anything. Signed-off-by: Luis de Bethencourt Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_monitor_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 2fe2f761a0d0..c969141bfa8b 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -104,7 +104,7 @@ struct xdp_exception_ctx { SEC("tracepoint/xdp/xdp_exception") int trace_xdp_exception(struct xdp_exception_ctx *ctx) { - u64 *cnt;; + u64 *cnt; u32 key; key = ctx->act; From 0a2d28ff516c48a21c1e3be1ff7fba3e94248baa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:45 -0800 Subject: [PATCH 25/33] bpf: offload: make bpf_offload_dev_match() reject host+host case Daniel suggests it would be more logical for bpf_offload_dev_match() to return false is either the program or the map are not offloaded, rather than treating the both not offloaded case as a "matching CPU/host device". This makes no functional difference today, since verifier only calls bpf_offload_dev_match() when one of the objects is offloaded. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 453785fa1881..a88cebf368bf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -395,10 +395,8 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) return false; - if (!bpf_prog_is_dev_bound(prog->aux)) - return true; down_read(&bpf_devs_lock); offload = prog->aux->offload; From 48b325632641da27a241912d66b38a1221ab425f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:46 -0800 Subject: [PATCH 26/33] bpf: annotate bpf_insn_print_t with __printf Functions of type bpf_insn_print_t take printf-like format string, mark the type accordingly. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e0857d016f89..266fe8ee542b 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, - const char *, ...); +typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); typedef const char *(*bpf_insn_print_imm_t)(void *private_data, From 39b72ccdb278f53735af1a378e67e6110e3210ad Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 16 Jan 2018 15:51:47 -0800 Subject: [PATCH 27/33] tools: bpftool: add -DPACKAGE when including bfd.h bfd.h is requiring including of config.h except when PACKAGE or PACKAGE_VERSION are defined. /* PR 14072: Ensure that config.h is included first. */ #if !defined PACKAGE && !defined PACKAGE_VERSION #error config.h must be included before this header #endif This check has been introduced since May-2012. It doesn't show up in bfd.h on some Linux distribution, probably because distributions have remove it when building the package. However, sometimes the user might just build libfd from source code then link bpftool against it. For this case, bfd.h will be original that we need to define PACKAGE or PACKAGE_VERSION. Acked-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Makefile | 2 +- tools/build/feature/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 2237bc43f71c..26901ec87361 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -39,7 +39,7 @@ CC = gcc CFLAGS += -O2 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ +CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' LIBS = -lelf -lbfd -lopcodes $(LIBBPF) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 17f2c73fff8b..bc715f6ac320 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -190,7 +190,7 @@ $(OUTPUT)test-libbfd.bin: $(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl $(OUTPUT)test-disassembler-four-args.bin: - $(BUILD) -lbfd -lopcodes + $(BUILD) -DPACKAGE='"perf"' -lbfd -lopcodes $(OUTPUT)test-liberty.bin: $(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' $(LDFLAGS) -lbfd -ldl -liberty From d77be68955475fc2321e73fe006240248f2f8fef Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 16 Jan 2018 15:51:48 -0800 Subject: [PATCH 28/33] libbpf: fix string comparison for guessing eBPF program type libbpf is able to deduce the type of a program from the name of the ELF section in which it is located. However, the comparison is made on the first n characters, n being determined with sizeof() applied to the reference string (e.g. "xdp"). When such section names are supposed to receive a suffix separated with a slash (e.g. "kprobe/"), using sizeof() takes the final NUL character of the reference string into account, which implies that both strings must be equal. Instead, the desired behaviour would consist in taking the length of the string, *without* accounting for the ending NUL character, and to make sure the reference string is a prefix to the ELF section name. Subtract 1 to the total size of the string for obtaining the length for the comparison. Fixes: 583c90097f72 ("libbpf: add ability to guess program type based on section name") Signed-off-by: Quentin Monnet Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e9c4b7cabcf2..30c776375118 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1803,7 +1803,7 @@ BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); -#define BPF_PROG_SEC(string, type) { string, sizeof(string), type } +#define BPF_PROG_SEC(string, type) { string, sizeof(string) - 1, type } static const struct { const char *sec; size_t len; From 7dfa4d87cfc48f3d4171f4a1b886bbbe4faf5c07 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:49 -0800 Subject: [PATCH 29/33] nfp: bpf: print map lookup problems into verifier log Use the verifier log to output error messages if map lookup can't be offloaded. Signed-off-by: Jakub Kicinski Acked-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 741438896cc7..81dab462456c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -132,22 +132,24 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, case BPF_FUNC_map_lookup_elem: if (!bpf->helpers.map_lookup) { - pr_info("map_lookup: not supported by FW\n"); + pr_vlog(env, "map_lookup: not supported by FW\n"); return -EOPNOTSUPP; } if (reg2->type != PTR_TO_STACK) { - pr_info("map_lookup: unsupported key ptr type %d\n", + pr_vlog(env, + "map_lookup: unsupported key ptr type %d\n", reg2->type); return -EOPNOTSUPP; } if (!tnum_is_const(reg2->var_off)) { - pr_info("map_lookup: variable key pointer\n"); + pr_vlog(env, "map_lookup: variable key pointer\n"); return -EOPNOTSUPP; } off = reg2->var_off.value + reg2->off; if (-off % 4) { - pr_info("map_lookup: unaligned stack pointer %lld\n", + pr_vlog(env, + "map_lookup: unaligned stack pointer %lld\n", -off); return -EOPNOTSUPP; } @@ -160,7 +162,7 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env, meta->arg2_var_off |= off != old_off; if (meta->arg1.map_ptr != reg1->map_ptr) { - pr_info("map_lookup: called for different map\n"); + pr_vlog(env, "map_lookup: called for different map\n"); return -EOPNOTSUPP; } break; @@ -263,7 +265,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, if (reg->type == PTR_TO_MAP_VALUE) { if (is_mbpf_store(meta)) { - pr_info("map writes not supported\n"); + pr_vlog(env, "map writes not supported\n"); return -EOPNOTSUPP; } } From 74801e50d5b89329e6c02b8bd924a41234f76316 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 16 Jan 2018 15:51:50 -0800 Subject: [PATCH 30/33] nfp: bpf: reject program on instructions unknown to the JIT compiler If an eBPF instruction is unknown to the driver JIT compiler, we can reject the program at verification time. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Reviewed-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 5 +++++ drivers/net/ethernet/netronome/nfp/bpf/main.h | 1 + drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index cdc949fabe98..56451edf01c2 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -2907,6 +2907,11 @@ void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt) } } +bool nfp_bpf_supported_opcode(u8 code) +{ + return !!instr_cb[code]; +} + void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv) { unsigned int i; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index b80e75a8ecda..c476bca15ba4 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -324,6 +324,7 @@ struct nfp_bpf_vnic { void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog, unsigned int cnt); int nfp_bpf_jit(struct nfp_prog *prog); +bool nfp_bpf_supported_opcode(u8 code); extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 81dab462456c..479f602887e9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -290,6 +290,12 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) meta = nfp_bpf_goto_meta(nfp_prog, meta, insn_idx, env->prog->len); nfp_prog->verifier_meta = meta; + if (!nfp_bpf_supported_opcode(meta->insn.code)) { + pr_vlog(env, "instruction %#02x not supported\n", + meta->insn.code); + return -EINVAL; + } + if (meta->insn.src_reg >= MAX_BPF_REG || meta->insn.dst_reg >= MAX_BPF_REG) { pr_vlog(env, "program uses extended registers - jit hardening?\n"); From 7d386c624980e476612490df52eaa86c8e066edc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 00:20:30 +0100 Subject: [PATCH 31/33] libbpf: install the header file libbpf.h It seems like an oversight not to install the header file for libbpf, given the libbpf.so + libbpf.a files are installed. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 8ed43ae9db9b..54370654c708 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -192,7 +192,8 @@ install_lib: all_cmd install_headers: $(call QUIET_INSTALL, headers) \ - $(call do_install,bpf.h,$(prefix)/include/bpf,644) + $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ + $(call do_install,libbpf.h,$(prefix)/include/bpf,644); install: install_lib From 63c859101ec32cbc8fa5b708c7f17de63b15e56e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 00:20:35 +0100 Subject: [PATCH 32/33] libbpf: cleanup Makefile, remove unused elements The plugin_dir_SQ variable is not used, remove it. The function update_dir is also unused, remove it. The variable $VERSION_FILES is empty, remove it. These all originates from the introduction of the Makefile, and is likely a copy paste from tools/lib/traceevent/Makefile. Fixes: 1b76c13e4b36 ("bpf tools: Introduce 'bpf' library and add bpf feature check") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 54370654c708..8e15e48cb8f8 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -93,7 +93,6 @@ export prefix libdir src obj # Shell quotes libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -plugin_dir_SQ = $(subst ','\'',$(plugin_dir)) LIB_FILE = libbpf.a libbpf.so @@ -150,7 +149,7 @@ CMD_TARGETS = $(LIB_FILE) TARGETS = $(CMD_TARGETS) -all: fixdep $(VERSION_FILES) all_cmd +all: fixdep all_cmd all_cmd: $(CMD_TARGETS) @@ -169,16 +168,6 @@ $(OUTPUT)libbpf.so: $(BPF_IN) $(OUTPUT)libbpf.a: $(BPF_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ -define update_dir - (echo $1 > $@.tmp; \ - if [ -r $@ ] && cmp -s $@ $@.tmp; then \ - rm -f $@.tmp; \ - else \ - echo ' UPDATE $@'; \ - mv -f $@.tmp $@; \ - fi); -endef - define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ @@ -204,7 +193,7 @@ config-clean: $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null clean: - $(call QUIET_CLEAN, libbpf) $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d .*.cmd \ + $(call QUIET_CLEAN, libbpf) $(RM) *.o *~ $(TARGETS) *.a *.so .*.d .*.cmd \ $(RM) LIBBPF-CFLAGS $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf From 7110d80d53f472956420cd05a6297f49b558b674 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 00:20:40 +0100 Subject: [PATCH 33/33] libbpf: Makefile set specified permission mode The third parameter to do_install was not used by $(INSTALL) command. Fix this by only setting the -m option when the third parameter is supplied. The use of a third parameter was introduced in commit eb54e522a000 ("bpf: install libbpf headers on 'make install'"). Without this change, the header files are install as executables files (755). Fixes: eb54e522a000 ("bpf: install libbpf headers on 'make install'") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 8e15e48cb8f8..83714ca1f22b 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -172,7 +172,7 @@ define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ fi; \ - $(INSTALL) $1 '$(DESTDIR_SQ)$2' + $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2' endef install_lib: all_cmd