2019-05-27 08:55:01 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2017-04-25 19:25:54 +05:30
|
|
|
/*
|
|
|
|
* Split from ftrace_64.S
|
|
|
|
*/
|
|
|
|
|
2023-06-19 15:17:25 +05:30
|
|
|
#include <linux/export.h>
|
2017-04-25 19:25:54 +05:30
|
|
|
#include <linux/magic.h>
|
|
|
|
#include <asm/ppc_asm.h>
|
|
|
|
#include <asm/asm-offsets.h>
|
|
|
|
#include <asm/ftrace.h>
|
|
|
|
#include <asm/ppc-opcode.h>
|
|
|
|
#include <asm/thread_info.h>
|
|
|
|
#include <asm/bug.h>
|
|
|
|
#include <asm/ptrace.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
*
|
2018-04-19 12:34:09 +05:30
|
|
|
* ftrace_caller()/ftrace_regs_caller() is the function that replaces _mcount()
|
|
|
|
* when ftrace is active.
|
2017-04-25 19:25:54 +05:30
|
|
|
*
|
|
|
|
* We arrive here after a function A calls function B, and we are the trace
|
|
|
|
* function for B. When we enter r1 points to A's stack frame, B has not yet
|
|
|
|
* had a chance to allocate one yet.
|
|
|
|
*
|
|
|
|
* Additionally r2 may point either to the TOC for A, or B, depending on
|
|
|
|
* whether B did a TOC setup sequence before calling us.
|
|
|
|
*
|
|
|
|
* On entry the LR points back to the _mcount() call site, and r0 holds the
|
|
|
|
* saved LR as it was on entry to B, ie. the original return address at the
|
|
|
|
* call site in A.
|
|
|
|
*
|
|
|
|
* Our job is to save the register state into a struct pt_regs (on the stack)
|
|
|
|
* and then arrange for the ftrace function to be called.
|
|
|
|
*/
|
2022-02-17 13:01:57 +01:00
|
|
|
.macro ftrace_regs_entry allregs
|
2023-06-21 10:43:49 +05:30
|
|
|
/* Create a minimal stack frame for representing B */
|
|
|
|
PPC_STLU r1, -STACK_FRAME_MIN_SIZE(r1)
|
|
|
|
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Create our stack frame + pt_regs */
|
2021-12-20 16:38:40 +00:00
|
|
|
PPC_STLU r1,-SWITCH_FRAME_SIZE(r1)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2024-10-30 12:38:48 +05:30
|
|
|
.if \allregs == 1
|
|
|
|
SAVE_GPRS(11, 12, r1)
|
|
|
|
.endif
|
|
|
|
|
|
|
|
/* Get the _mcount() call site out of LR */
|
|
|
|
mflr r11
|
|
|
|
|
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
/* Load the ftrace_op */
|
|
|
|
PPC_LL r12, -(MCOUNT_INSN_SIZE*2 + SZL)(r11)
|
|
|
|
|
|
|
|
/* Load direct_call from the ftrace_op */
|
|
|
|
PPC_LL r12, FTRACE_OPS_DIRECT_CALL(r12)
|
|
|
|
PPC_LCMPI r12, 0
|
|
|
|
.if \allregs == 1
|
|
|
|
bne .Lftrace_direct_call_regs
|
|
|
|
.else
|
|
|
|
bne .Lftrace_direct_call
|
|
|
|
.endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Save the previous LR in pt_regs->link */
|
|
|
|
PPC_STL r0, _LINK(r1)
|
|
|
|
/* Also save it in A's stack frame */
|
|
|
|
PPC_STL r0, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE+LRSAVE(r1)
|
|
|
|
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Save all gprs to pt_regs */
|
2017-06-01 16:18:16 +05:30
|
|
|
SAVE_GPR(0, r1)
|
2022-02-17 13:01:57 +01:00
|
|
|
SAVE_GPRS(3, 10, r1)
|
2018-04-19 12:34:00 +05:30
|
|
|
|
2022-02-17 13:01:56 +01:00
|
|
|
#ifdef CONFIG_PPC64
|
2018-04-19 12:34:00 +05:30
|
|
|
/* Ok to continue? */
|
|
|
|
lbz r3, PACA_FTRACE_ENABLED(r13)
|
|
|
|
cmpdi r3, 0
|
|
|
|
beq ftrace_no_trace
|
2022-02-17 13:01:56 +01:00
|
|
|
#endif
|
2018-04-19 12:34:00 +05:30
|
|
|
|
2022-02-17 13:01:57 +01:00
|
|
|
.if \allregs == 1
|
|
|
|
SAVE_GPR(2, r1)
|
2024-10-30 12:38:48 +05:30
|
|
|
SAVE_GPRS(13, 31, r1)
|
2022-02-17 13:01:57 +01:00
|
|
|
.else
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE)
|
2022-02-17 13:01:57 +01:00
|
|
|
SAVE_GPR(14, r1)
|
|
|
|
#endif
|
|
|
|
.endif
|
2017-06-01 16:18:16 +05:30
|
|
|
|
|
|
|
/* Save previous stack pointer (r1) */
|
2023-11-30 12:29:47 +05:30
|
|
|
addi r8, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
2021-12-20 16:38:40 +00:00
|
|
|
PPC_STL r8, GPR1(r1)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2022-02-17 13:01:57 +01:00
|
|
|
.if \allregs == 1
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Load special regs for save below */
|
2024-10-30 12:38:48 +05:30
|
|
|
mfcr r7
|
2017-04-25 19:25:54 +05:30
|
|
|
mfmsr r8
|
|
|
|
mfctr r9
|
|
|
|
mfxer r10
|
2022-02-17 13:01:57 +01:00
|
|
|
.else
|
|
|
|
/* Clear MSR to flag as ftrace_caller versus frace_regs_caller */
|
|
|
|
li r8, 0
|
|
|
|
.endif
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2021-12-20 16:38:40 +00:00
|
|
|
#ifdef CONFIG_PPC64
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Save callee's TOC in the ABI compliant location */
|
2022-02-17 13:01:59 +01:00
|
|
|
std r2, STK_GOT(r1)
|
2022-09-26 13:40:56 +10:00
|
|
|
LOAD_PACA_TOC() /* get kernel TOC in r2 */
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
|
2024-10-30 12:38:48 +05:30
|
|
|
/* r11 points to the instruction following the call to ftrace */
|
|
|
|
PPC_LL r5, -(MCOUNT_INSN_SIZE*2 + SZL)(r11)
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
PPC_LL r12, FTRACE_OPS_FUNC(r5)
|
|
|
|
mtctr r12
|
|
|
|
#else /* !CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS */
|
|
|
|
#ifdef CONFIG_PPC64
|
2022-09-26 13:40:54 +10:00
|
|
|
LOAD_REG_ADDR(r3, function_trace_op)
|
2017-04-25 19:25:54 +05:30
|
|
|
ld r5,0(r3)
|
2021-12-20 16:38:40 +00:00
|
|
|
#else
|
|
|
|
lis r3,function_trace_op@ha
|
|
|
|
lwz r5,function_trace_op@l(r3)
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
#endif
|
2021-12-20 16:38:40 +00:00
|
|
|
#endif
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Save special regs */
|
2021-12-20 16:38:40 +00:00
|
|
|
PPC_STL r8, _MSR(r1)
|
2022-02-17 13:01:57 +01:00
|
|
|
.if \allregs == 1
|
2024-10-30 12:38:48 +05:30
|
|
|
PPC_STL r7, _CCR(r1)
|
2021-12-20 16:38:40 +00:00
|
|
|
PPC_STL r9, _CTR(r1)
|
|
|
|
PPC_STL r10, _XER(r1)
|
2022-02-17 13:01:57 +01:00
|
|
|
.endif
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2024-10-30 12:38:48 +05:30
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
/* Clear orig_gpr3 to later detect ftrace_direct call */
|
|
|
|
li r7, 0
|
|
|
|
PPC_STL r7, ORIG_GPR3(r1)
|
|
|
|
#endif
|
|
|
|
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
|
|
|
/* Save our real return address in nvr for return */
|
|
|
|
.if \allregs == 0
|
|
|
|
SAVE_GPR(15, r1)
|
|
|
|
.endif
|
2024-10-30 12:38:48 +05:30
|
|
|
mr r15, r11
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/*
|
2024-10-30 12:38:48 +05:30
|
|
|
* We want the ftrace location in the function, but our lr (in r11)
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
* points at the 'mtlr r0' instruction in the out of line stub. To
|
|
|
|
* recover the ftrace location, we read the branch instruction in the
|
|
|
|
* stub, and adjust our lr by the branch offset.
|
|
|
|
*
|
|
|
|
* See ftrace_init_ool_stub() for the profile sequence.
|
|
|
|
*/
|
2024-10-30 12:38:48 +05:30
|
|
|
lwz r8, MCOUNT_INSN_SIZE(r11)
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
slwi r8, r8, 6
|
|
|
|
srawi r8, r8, 6
|
2024-10-30 12:38:48 +05:30
|
|
|
add r3, r11, r8
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/*
|
|
|
|
* Override our nip to point past the branch in the original function.
|
|
|
|
* This allows reliable stack trace and the ftrace stack tracer to work as-is.
|
|
|
|
*/
|
2024-10-30 12:38:48 +05:30
|
|
|
addi r11, r3, MCOUNT_INSN_SIZE
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#else
|
|
|
|
/* Calculate ip from nip-4 into r3 for call below */
|
2024-10-30 12:38:48 +05:30
|
|
|
subi r3, r11, MCOUNT_INSN_SIZE
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Save NIP as pt_regs->nip */
|
2024-10-30 12:38:48 +05:30
|
|
|
PPC_STL r11, _NIP(r1)
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/* Also save it in B's stackframe header for proper unwind */
|
2024-10-30 12:38:48 +05:30
|
|
|
PPC_STL r11, LRSAVE+SWITCH_FRAME_SIZE(r1)
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE)
|
2024-10-30 12:38:48 +05:30
|
|
|
mr r14, r11 /* remember old NIP */
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Put the original return address in r4 as parent_ip */
|
|
|
|
mr r4, r0
|
|
|
|
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Load &pt_regs in r6 for call below */
|
2022-11-27 22:49:32 +10:00
|
|
|
addi r6, r1, STACK_INT_FRAME_REGS
|
2022-02-17 13:01:57 +01:00
|
|
|
.endm
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2022-02-17 13:01:57 +01:00
|
|
|
.macro ftrace_regs_exit allregs
|
2024-10-30 12:38:48 +05:30
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
/* Check orig_gpr3 to detect ftrace_direct call */
|
|
|
|
PPC_LL r3, ORIG_GPR3(r1)
|
|
|
|
PPC_LCMPI cr1, r3, 0
|
|
|
|
mtctr r3
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Restore possibly modified LR */
|
|
|
|
PPC_LL r0, _LINK(r1)
|
|
|
|
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#ifndef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
2018-06-20 01:07:32 +09:00
|
|
|
/* Load ctr with the possibly modified NIP */
|
2021-12-20 16:38:40 +00:00
|
|
|
PPC_LL r3, _NIP(r1)
|
|
|
|
#ifdef CONFIG_LIVEPATCH_64
|
2018-06-20 01:07:32 +09:00
|
|
|
cmpd r14, r3 /* has NIP been altered? */
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#endif
|
2024-10-30 12:38:48 +05:30
|
|
|
|
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
beq cr1,2f
|
|
|
|
mtlr r3
|
|
|
|
b 3f
|
|
|
|
#endif
|
|
|
|
2: mtctr r3
|
|
|
|
mtlr r0
|
|
|
|
3:
|
|
|
|
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#else /* !CONFIG_PPC_FTRACE_OUT_OF_LINE */
|
|
|
|
/* Load LR with the possibly modified NIP */
|
|
|
|
PPC_LL r3, _NIP(r1)
|
|
|
|
cmpd r14, r3 /* has NIP been altered? */
|
|
|
|
bne- 1f
|
|
|
|
|
|
|
|
mr r3, r15
|
2025-04-17 00:42:27 +05:30
|
|
|
1: mtlr r3
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
.if \allregs == 0
|
|
|
|
REST_GPR(15, r1)
|
|
|
|
.endif
|
2017-04-25 19:25:54 +05:30
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Restore gprs */
|
2022-02-17 13:01:57 +01:00
|
|
|
.if \allregs == 1
|
2021-10-22 16:13:22 +10:00
|
|
|
REST_GPRS(2, 31, r1)
|
2022-02-17 13:01:57 +01:00
|
|
|
.else
|
|
|
|
REST_GPRS(3, 10, r1)
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#if defined(CONFIG_LIVEPATCH_64) || defined(CONFIG_PPC_FTRACE_OUT_OF_LINE)
|
2022-02-17 13:01:57 +01:00
|
|
|
REST_GPR(14, r1)
|
|
|
|
#endif
|
|
|
|
.endif
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2021-12-20 16:38:40 +00:00
|
|
|
#ifdef CONFIG_PPC64
|
2017-04-25 19:25:54 +05:30
|
|
|
/* Restore callee's TOC */
|
2022-02-17 13:01:59 +01:00
|
|
|
ld r2, STK_GOT(r1)
|
2021-12-20 16:38:40 +00:00
|
|
|
#endif
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Pop our stack frame */
|
2023-06-21 10:43:49 +05:30
|
|
|
addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
2017-04-25 19:25:54 +05:30
|
|
|
|
2021-12-20 16:38:40 +00:00
|
|
|
#ifdef CONFIG_LIVEPATCH_64
|
2018-06-20 01:07:32 +09:00
|
|
|
/* Based on the cmpd above, if the NIP was altered handle livepatch */
|
2017-04-25 19:25:54 +05:30
|
|
|
bne- livepatch_handler
|
|
|
|
#endif
|
2024-10-30 12:38:48 +05:30
|
|
|
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/* jump after _mcount site */
|
|
|
|
#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
2024-10-30 12:38:48 +05:30
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
bnectr cr1
|
|
|
|
#endif
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/*
|
|
|
|
* Return with blr to keep the link stack balanced. The function profiling sequence
|
|
|
|
* uses 'mtlr r0' to restore LR.
|
|
|
|
*/
|
|
|
|
blr
|
|
|
|
#else
|
|
|
|
bctr
|
|
|
|
#endif
|
2022-02-17 13:01:57 +01:00
|
|
|
.endm
|
2017-04-25 19:25:54 +05:30
|
|
|
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
.macro ftrace_regs_func allregs
|
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
|
|
|
|
bctrl
|
|
|
|
#else
|
|
|
|
.if \allregs == 1
|
2022-02-17 13:01:57 +01:00
|
|
|
.globl ftrace_regs_call
|
|
|
|
ftrace_regs_call:
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
.else
|
|
|
|
.globl ftrace_call
|
|
|
|
ftrace_call:
|
|
|
|
.endif
|
|
|
|
/* ftrace_call(r3, r4, r5, r6) */
|
2022-02-17 13:01:57 +01:00
|
|
|
bl ftrace_stub
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
#endif
|
|
|
|
.endm
|
|
|
|
|
|
|
|
_GLOBAL(ftrace_regs_caller)
|
|
|
|
ftrace_regs_entry 1
|
|
|
|
ftrace_regs_func 1
|
2022-02-17 13:01:57 +01:00
|
|
|
ftrace_regs_exit 1
|
2018-04-19 12:34:00 +05:30
|
|
|
|
2018-04-19 12:34:09 +05:30
|
|
|
_GLOBAL(ftrace_caller)
|
2022-02-17 13:01:57 +01:00
|
|
|
ftrace_regs_entry 0
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
ftrace_regs_func 0
|
2022-02-17 13:01:57 +01:00
|
|
|
ftrace_regs_exit 0
|
2018-04-19 12:34:09 +05:30
|
|
|
|
2022-02-17 13:01:57 +01:00
|
|
|
_GLOBAL(ftrace_stub)
|
|
|
|
blr
|
2018-04-19 12:34:09 +05:30
|
|
|
|
2021-12-20 16:38:40 +00:00
|
|
|
#ifdef CONFIG_PPC64
|
2022-02-17 13:01:57 +01:00
|
|
|
ftrace_no_trace:
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
|
|
|
REST_GPR(3, r1)
|
|
|
|
addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
|
|
|
blr
|
|
|
|
#else
|
2022-02-17 13:01:57 +01:00
|
|
|
mflr r3
|
|
|
|
mtctr r3
|
|
|
|
REST_GPR(3, r1)
|
2023-11-30 12:29:47 +05:30
|
|
|
addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
2022-02-17 13:01:57 +01:00
|
|
|
mtlr r0
|
|
|
|
bctr
|
2021-12-20 16:38:25 +00:00
|
|
|
#endif
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#endif
|
2022-02-15 19:31:25 +01:00
|
|
|
|
2024-10-30 12:38:48 +05:30
|
|
|
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
|
|
|
|
.Lftrace_direct_call_regs:
|
|
|
|
mtctr r12
|
|
|
|
REST_GPRS(11, 12, r1)
|
|
|
|
addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
|
|
|
bctr
|
|
|
|
.Lftrace_direct_call:
|
|
|
|
mtctr r12
|
|
|
|
addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
|
|
|
|
bctr
|
|
|
|
SYM_FUNC_START(ftrace_stub_direct_tramp)
|
|
|
|
blr
|
|
|
|
SYM_FUNC_END(ftrace_stub_direct_tramp)
|
|
|
|
#endif
|
|
|
|
|
2021-12-20 16:38:40 +00:00
|
|
|
#ifdef CONFIG_LIVEPATCH_64
|
2017-04-25 19:25:54 +05:30
|
|
|
/*
|
|
|
|
* This function runs in the mcount context, between two functions. As
|
|
|
|
* such it can only clobber registers which are volatile and used in
|
|
|
|
* function linkage.
|
|
|
|
*
|
|
|
|
* We get here when a function A, calls another function B, but B has
|
|
|
|
* been live patched with a new function C.
|
|
|
|
*
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
* On entry, we have no stack frame and can not allocate one.
|
|
|
|
*
|
|
|
|
* With PPC_FTRACE_OUT_OF_LINE=n, on entry:
|
2017-04-25 19:25:54 +05:30
|
|
|
* - LR points back to the original caller (in A)
|
|
|
|
* - CTR holds the new NIP in C
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
* - r0, r11 & r12 are free
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
*
|
|
|
|
* With PPC_FTRACE_OUT_OF_LINE=y, on entry:
|
|
|
|
* - r0 points back to the original caller (in A)
|
|
|
|
* - LR holds the new NIP in C
|
|
|
|
* - r11 & r12 are free
|
2017-04-25 19:25:54 +05:30
|
|
|
*/
|
|
|
|
livepatch_handler:
|
2019-01-12 09:55:50 +00:00
|
|
|
ld r12, PACA_THREAD_INFO(r13)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Allocate 3 x 8 bytes */
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
ld r11, TI_livepatch_sp(r12)
|
|
|
|
addi r11, r11, 24
|
|
|
|
std r11, TI_livepatch_sp(r12)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Store stack end marker */
|
|
|
|
lis r12, STACK_END_MAGIC@h
|
|
|
|
ori r12, r12, STACK_END_MAGIC@l
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
std r12, -8(r11)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
/* Save toc & real LR on livepatch stack */
|
|
|
|
std r2, -24(r11)
|
|
|
|
#ifndef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
|
|
|
mflr r12
|
|
|
|
std r12, -16(r11)
|
2017-04-25 19:25:54 +05:30
|
|
|
mfctr r12
|
powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the
beginning of each function:
mflr r0
bl ftrace_caller
The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.
Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
mflr r0
bl ftrace_caller
mtlr r0
Further more, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.
Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.
On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
For modules, space for ftrace stubs is reserved from the generic module
stub space.
This is restricted to and enabled by default only on 64-bit powerpc,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc could choose to opt into this based on
further tests and benchmarks.
As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
nop
mfocrf r11,8
...
When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
addis r2,r12,467
addi r2,r2,-16028
b ftrace_ool_stub_text_end+0x11b28
mfocrf r11,8
...
The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
mflr r0
bl ftrace_caller
mtlr r0
b kernel_clone+0xc
...
This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-13-hbathini@linux.ibm.com
2024-10-30 12:38:45 +05:30
|
|
|
#else
|
|
|
|
std r0, -16(r11)
|
|
|
|
mflr r12
|
|
|
|
/* Put ctr in r12 for global entry and branch there */
|
|
|
|
mtctr r12
|
|
|
|
#endif
|
2017-04-25 19:25:54 +05:30
|
|
|
bctrl
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now we are returning from the patched function to the original
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
* caller A. We are free to use r11, r12 and we can use r2 until we
|
2017-04-25 19:25:54 +05:30
|
|
|
* restore it.
|
|
|
|
*/
|
|
|
|
|
2019-01-12 09:55:50 +00:00
|
|
|
ld r12, PACA_THREAD_INFO(r13)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
ld r11, TI_livepatch_sp(r12)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Check stack marker hasn't been trashed */
|
|
|
|
lis r2, STACK_END_MAGIC@h
|
|
|
|
ori r2, r2, STACK_END_MAGIC@l
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
ld r12, -8(r11)
|
2017-04-25 19:25:54 +05:30
|
|
|
1: tdne r12, r2
|
|
|
|
EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
|
|
|
|
|
|
|
|
/* Restore LR & toc from livepatch stack */
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
ld r12, -16(r11)
|
2017-04-25 19:25:54 +05:30
|
|
|
mtlr r12
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
ld r2, -24(r11)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Pop livepatch stack frame */
|
2019-01-12 09:55:50 +00:00
|
|
|
ld r12, PACA_THREAD_INFO(r13)
|
powerpc/livepatch: Fix livepatch stack access
While running stress test with livepatch module loaded, kernel bug was
triggered.
cpu 0x5: Vector: 400 (Instruction Access) at [c0000000eb9d3b60]
5:mon> t
[c0000000eb9d3de0] c0000000eb9d3e30 (unreliable)
[c0000000eb9d3e30] c000000000008ab4 hardware_interrupt_common+0x114/0x120
--- Exception: 501 (Hardware Interrupt) at c000000000053040 livepatch_handler+0x4c/0x74
[c0000000eb9d4120] 0000000057ac6e9d (unreliable)
[d0000000089d9f78] 2e0965747962382e
SP (965747962342e09) is in userspace
When an interrupt occurs during the livepatch_handler execution, it's
possible for the livepatch_stack and/or thread_info to be corrupted.
eg:
Task A Interrupt Handler
========= =================
livepatch_handler:
mr r0, r1
ld r1, TI_livepatch_sp(r12)
hardware_interrupt_common:
do_IRQ+0x8:
mflr r0 <- saved stack pointer is overwritten
bl _mcount
...
std r27,-40(r1) <- overwrite of thread_info()
lis r2, STACK_END_MAGIC@h
ori r2, r2, STACK_END_MAGIC@l
ld r12, -8(r1)
Fix the corruption by using r11 register for livepatch stack
manipulation, instead of shuffling task stack and livepatch stack into
r1 register. Using r11 register also avoids disabling/enabling irq's
while setting up the livepatch stack.
Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-09-20 15:49:51 +05:30
|
|
|
subi r11, r11, 24
|
|
|
|
std r11, TI_livepatch_sp(r12)
|
2017-04-25 19:25:54 +05:30
|
|
|
|
|
|
|
/* Return to original caller of live patched function */
|
|
|
|
blr
|
|
|
|
#endif /* CONFIG_LIVEPATCH */
|
2023-06-19 15:17:25 +05:30
|
|
|
|
2023-06-19 15:17:34 +05:30
|
|
|
#ifndef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY
|
2023-06-19 15:17:25 +05:30
|
|
|
_GLOBAL(mcount)
|
|
|
|
_GLOBAL(_mcount)
|
|
|
|
EXPORT_SYMBOL(_mcount)
|
|
|
|
mflr r12
|
|
|
|
mtctr r12
|
|
|
|
mtlr r0
|
|
|
|
bctr
|
2023-06-19 15:17:34 +05:30
|
|
|
#endif
|
2023-06-19 15:17:25 +05:30
|
|
|
|
|
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
|
|
|
_GLOBAL(return_to_handler)
|
|
|
|
/* need to save return values */
|
|
|
|
#ifdef CONFIG_PPC64
|
|
|
|
std r4, -32(r1)
|
|
|
|
std r3, -24(r1)
|
|
|
|
/* save TOC */
|
|
|
|
std r2, -16(r1)
|
|
|
|
std r31, -8(r1)
|
|
|
|
mr r31, r1
|
|
|
|
stdu r1, -112(r1)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We might be called from a module.
|
|
|
|
* Switch to our TOC to run inside the core kernel.
|
|
|
|
*/
|
|
|
|
LOAD_PACA_TOC()
|
|
|
|
#else
|
|
|
|
stwu r1, -16(r1)
|
|
|
|
stw r3, 8(r1)
|
|
|
|
stw r4, 12(r1)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
bl ftrace_return_to_handler
|
|
|
|
nop
|
|
|
|
|
|
|
|
/* return value has real return address */
|
|
|
|
mtlr r3
|
|
|
|
|
|
|
|
#ifdef CONFIG_PPC64
|
|
|
|
ld r1, 0(r1)
|
|
|
|
ld r4, -32(r1)
|
|
|
|
ld r3, -24(r1)
|
|
|
|
ld r2, -16(r1)
|
|
|
|
ld r31, -8(r1)
|
|
|
|
#else
|
|
|
|
lwz r3, 8(r1)
|
|
|
|
lwz r4, 12(r1)
|
|
|
|
addi r1, r1, 16
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Jump back to real return address */
|
|
|
|
blr
|
|
|
|
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
|
|
|
|
|
2024-10-30 12:38:46 +05:30
|
|
|
#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
|
|
|
|
SYM_DATA(ftrace_ool_stub_text_count, .long CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE)
|
|
|
|
|
powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.
This works by patching-in a pointer to an associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. ftrace trampoline can then
read the associated ftrace_ops for a call site by loading from an offset
from the LR, and branch directly to the associated function.
The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.
A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.
To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().
For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that ftrace_ops address can be updated atomically.
Signed-off-by: Naveen N Rao <naveen@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://patch.msgid.link/20241030070850.1361304-15-hbathini@linux.ibm.com
2024-10-30 12:38:47 +05:30
|
|
|
SYM_START(ftrace_ool_stub_text, SYM_L_GLOBAL, .balign SZL)
|
2024-10-30 12:38:46 +05:30
|
|
|
.space CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE * FTRACE_OOL_STUB_SIZE
|
|
|
|
SYM_CODE_END(ftrace_ool_stub_text)
|
|
|
|
#endif
|
|
|
|
|
2023-06-19 15:17:25 +05:30
|
|
|
.pushsection ".tramp.ftrace.text","aw",@progbits;
|
|
|
|
.globl ftrace_tramp_text
|
|
|
|
ftrace_tramp_text:
|
|
|
|
.space 32
|
|
|
|
.popsection
|
|
|
|
|
|
|
|
.pushsection ".tramp.ftrace.init","aw",@progbits;
|
|
|
|
.globl ftrace_tramp_init
|
|
|
|
ftrace_tramp_init:
|
|
|
|
.space 32
|
|
|
|
.popsection
|