2019-05-19 13:08:55 +01:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2016-09-19 17:04:18 -04:00
|
|
|
#include <linux/extable.h>
|
2016-12-24 11:46:01 -08:00
|
|
|
#include <linux/uaccess.h>
|
2017-02-08 18:51:35 +01:00
|
|
|
#include <linux/sched/debug.h>
|
2021-11-10 11:01:09 +01:00
|
|
|
#include <linux/bitfield.h>
|
2017-11-24 09:42:21 +01:00
|
|
|
#include <xen/xen.h>
|
2017-02-08 18:51:35 +01:00
|
|
|
|
2021-10-15 03:16:41 +02:00
|
|
|
#include <asm/fpu/api.h>
|
2023-12-05 02:50:18 -08:00
|
|
|
#include <asm/fred.h>
|
2021-04-27 06:16:34 -05:00
|
|
|
#include <asm/sev.h>
|
2016-04-02 07:01:33 -07:00
|
|
|
#include <asm/traps.h>
|
2016-07-05 00:31:27 +02:00
|
|
|
#include <asm/kdebug.h>
|
2021-11-10 11:01:09 +01:00
|
|
|
#include <asm/insn-eval.h>
|
2021-11-10 11:01:20 +01:00
|
|
|
#include <asm/sgx.h>
|
2021-11-10 11:01:09 +01:00
|
|
|
|
|
|
|
/*
 * Map a decoded register number to the matching slot in struct pt_regs.
 *
 * If the register has no pt_regs slot (negative offset from
 * pt_regs_offset()), warn once and hand back a pointer to a static
 * scratch word so callers may dereference the result unconditionally.
 */
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
	static unsigned long __dummy;
	int offset = pt_regs_offset(regs, nr);

	if (WARN_ON_ONCE(offset < 0))
		return &__dummy;

	return (unsigned long *)((unsigned long)regs + offset);
}
|
2008-01-30 13:31:41 +01:00
|
|
|
|
2012-04-20 17:12:48 -07:00
|
|
|
static inline unsigned long
|
|
|
|
ex_fixup_addr(const struct exception_table_entry *x)
|
|
|
|
{
|
|
|
|
return (unsigned long)&x->fixup + x->fixup;
|
|
|
|
}
|
2008-01-30 13:31:41 +01:00
|
|
|
|
2021-11-10 11:01:09 +01:00
|
|
|
static bool ex_handler_default(const struct exception_table_entry *e,
|
2021-09-08 15:29:18 +02:00
|
|
|
struct pt_regs *regs)
|
2008-01-30 13:31:41 +01:00
|
|
|
{
|
2021-11-10 11:01:09 +01:00
|
|
|
if (e->data & EX_FLAG_CLEAR_AX)
|
|
|
|
regs->ax = 0;
|
|
|
|
if (e->data & EX_FLAG_CLEAR_DX)
|
|
|
|
regs->dx = 0;
|
|
|
|
|
|
|
|
regs->ip = ex_fixup_addr(e);
|
2016-02-17 10:20:12 -08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2022-08-14 14:16:13 -07:00
|
|
|
/*
|
|
|
|
* This is the *very* rare case where we do a "load_unaligned_zeropad()"
|
|
|
|
* and it's a page crosser into a non-existent page.
|
|
|
|
*
|
|
|
|
* This happens when we optimistically load a pathname a word-at-a-time
|
|
|
|
* and the name is less than the full word and the next page is not
|
|
|
|
* mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
|
|
|
|
*
|
|
|
|
* NOTE! The faulting address is always a 'mov mem,reg' type instruction
|
|
|
|
* of size 'long', and the exception fixup must always point to right
|
|
|
|
* after the instruction.
|
|
|
|
*/
|
|
|
|
static bool ex_handler_zeropad(const struct exception_table_entry *e,
|
|
|
|
struct pt_regs *regs,
|
|
|
|
unsigned long fault_addr)
|
|
|
|
{
|
|
|
|
struct insn insn;
|
|
|
|
const unsigned long mask = sizeof(long) - 1;
|
|
|
|
unsigned long offset, addr, next_ip, len;
|
|
|
|
unsigned long *reg;
|
|
|
|
|
|
|
|
next_ip = ex_fixup_addr(e);
|
|
|
|
len = next_ip - regs->ip;
|
|
|
|
if (len > MAX_INSN_SIZE)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN))
|
|
|
|
return false;
|
|
|
|
if (insn.length != len)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (insn.opcode.bytes[0] != 0x8b)
|
|
|
|
return false;
|
|
|
|
if (insn.opnd_bytes != sizeof(long))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
addr = (unsigned long) insn_get_addr_ref(&insn, regs);
|
|
|
|
if (addr == ~0ul)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
offset = addr & mask;
|
|
|
|
addr = addr & ~mask;
|
|
|
|
if (fault_addr != addr + sizeof(long))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
reg = insn_get_modrm_reg_ptr(&insn, regs);
|
|
|
|
if (!reg)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
*reg = *(unsigned long *)addr >> (offset * 8);
|
|
|
|
return ex_handler_default(e, regs);
|
|
|
|
}
|
|
|
|
|
2021-09-08 15:29:18 +02:00
|
|
|
static bool ex_handler_fault(const struct exception_table_entry *fixup,
|
|
|
|
struct pt_regs *regs, int trapnr)
|
2016-02-17 10:20:12 -08:00
|
|
|
{
|
|
|
|
regs->ax = trapnr;
|
2021-09-08 15:29:18 +02:00
|
|
|
return ex_handler_default(fixup, regs);
|
2016-02-17 10:20:12 -08:00
|
|
|
}
|
|
|
|
|
2021-11-10 11:01:20 +01:00
|
|
|
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
|
|
|
|
struct pt_regs *regs, int trapnr)
|
|
|
|
{
|
|
|
|
regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
|
|
|
|
return ex_handler_default(fixup, regs);
|
|
|
|
}
|
|
|
|
|
x86/fpu: Reinitialize FPU registers if restoring FPU state fails
Userspace can change the FPU state of a task using the ptrace() or
rt_sigreturn() system calls. Because reserved bits in the FPU state can
cause the XRSTOR instruction to fail, the kernel has to carefully
validate that no reserved bits or other invalid values are being set.
Unfortunately, there have been bugs in this validation code. For
example, we were not checking that the 'xcomp_bv' field in the
xstate_header was 0. As-is, such bugs are exploitable to read the FPU
registers of other processes on the system. To do so, an attacker can
create a task, assign to it an invalid FPU state, then spin in a loop
and monitor the values of the FPU registers. Because the task's FPU
registers are not being restored, sometimes the FPU registers will have
the values from another process.
This is likely to continue to be a problem in the future because the
validation done by the CPU instructions like XRSTOR is not immediately
visible to kernel developers. Nor will invalid FPU states ever be
encountered during ordinary use --- they will only be seen during
fuzzing or exploits. There can even be reserved bits outside the
xstate_header which are easy to forget about. For example, the MXCSR
register contains reserved bits, which were not validated by the
KVM_SET_XSAVE ioctl until commit a575813bfe4b ("KVM: x86: Fix load
damaged SSEx MXCSR register").
Therefore, mitigate this class of vulnerability by restoring the FPU
registers from init_fpstate if restoring from the task's state fails.
We actually used to do this, but it was (perhaps unwisely) removed by
commit 9ccc27a5d297 ("x86/fpu: Remove error return values from
copy_kernel_to_*regs() functions"). This new patch is also a bit
different. First, it only clears the registers, not also the bad
in-memory state; this is simpler and makes it easier to make the
mitigation cover all callers of __copy_kernel_to_fpregs(). Second, it
does the register clearing in an exception handler so that no extra
instructions are added to context switches. In fact, we *remove*
instructions, since previously we were always zeroing the register
containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com
Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-09-23 15:00:09 +02:00
|
|
|
/*
|
|
|
|
* Handler for when we fail to restore a task's FPU state. We should never get
|
|
|
|
* here because the FPU state of a task using the FPU (task->thread.fpu.state)
|
|
|
|
* should always be valid. However, past bugs have allowed userspace to set
|
|
|
|
* reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
|
|
|
|
* These caused XRSTOR to fail when switching to the task, leaking the FPU
|
|
|
|
* registers of the task previously executing on the CPU. Mitigate this class
|
|
|
|
* of vulnerability by restoring from the initial state (essentially, zeroing
|
|
|
|
* out all the FPU registers) if we can't restore from the task's FPU state.
|
|
|
|
*/
|
2021-09-08 15:29:18 +02:00
|
|
|
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
				 struct pt_regs *regs)
{
	/*
	 * Redirect to the fixup address first: the WARN below prints
	 * instruction_pointer(regs), i.e. the already-updated IP.
	 */
	regs->ip = ex_fixup_addr(fixup);

	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
		  (void *)instruction_pointer(regs));

	/* Fall back to a known-good (initial) FPU register state. */
	fpu_reset_from_exception_fixup();
	return true;
}
|
|
|
|
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
/*
|
|
|
|
* On x86-64, we end up being imprecise with 'access_ok()', and allow
|
|
|
|
* non-canonical user addresses to make the range comparisons simpler,
|
|
|
|
* and to not have to worry about LAM being enabled.
|
|
|
|
*
|
|
|
|
* In fact, we allow up to one page of "slop" at the sign boundary,
|
|
|
|
* which means that we can do access_ok() by just checking the sign
|
|
|
|
* of the pointer for the common case of having a small access size.
|
|
|
|
*/
|
|
|
|
static bool gp_fault_address_ok(unsigned long fault_address)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
/* Is it in the "user space" part of the non-canonical space? */
|
2023-05-03 10:13:41 -07:00
|
|
|
if (valid_user_address(fault_address))
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* .. or just above it? */
|
|
|
|
fault_address -= PAGE_SIZE;
|
2023-05-03 10:13:41 -07:00
|
|
|
if (valid_user_address(fault_address))
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
return true;
|
|
|
|
#endif
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-09-08 15:29:18 +02:00
|
|
|
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
struct pt_regs *regs, int trapnr,
|
|
|
|
unsigned long fault_address)
|
2018-08-28 22:14:18 +02:00
|
|
|
{
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
|
|
|
|
"General protection fault in user access. Non-canonical address?");
|
2021-09-08 15:29:18 +02:00
|
|
|
return ex_handler_default(fixup, regs);
|
2018-08-28 22:14:18 +02:00
|
|
|
}
|
|
|
|
|
2021-11-10 11:01:10 +01:00
|
|
|
static bool ex_handler_msr(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
	/* Unchecked (non-*_safe) WRMSR fault: complain loudly, once. */
	if (__ONCE_LITE_IF(!safe && wrmsr)) {
		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, (unsigned int)regs->dx,
			(unsigned int)regs->ax, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	/* Unchecked (non-*_safe) RDMSR fault: complain loudly, once. */
	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (!wrmsr) {
		/* Pretend that the read succeeded and returned 0. */
		regs->ax = 0;
		regs->dx = 0;
	}

	/* The *_safe variants report failure via the error-code register. */
	if (safe)
		*pt_regs_nr(regs, reg) = -EIO;

	return ex_handler_default(fixup, regs);
}
|
|
|
|
|
2021-09-08 15:29:18 +02:00
|
|
|
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
				struct pt_regs *regs)
{
	/*
	 * On CPUs with the NULL-segment bug, load a real selector first
	 * so that the subsequent null load fully clears the segment
	 * state rather than leaving stale cached attributes behind.
	 */
	if (static_cpu_has(X86_BUG_NULL_SEG))
		asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
	asm volatile ("mov %0, %%fs" : : "rm" (0));
	return ex_handler_default(fixup, regs);
}
|
|
|
|
|
2021-11-10 11:01:09 +01:00
|
|
|
static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
|
|
|
|
struct pt_regs *regs, int reg, int imm)
|
|
|
|
{
|
|
|
|
*pt_regs_nr(regs, reg) = (long)imm;
|
|
|
|
return ex_handler_default(fixup, regs);
|
|
|
|
}
|
|
|
|
|
2021-11-10 11:01:22 +01:00
|
|
|
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
struct pt_regs *regs, int trapnr,
|
|
|
|
unsigned long fault_address,
|
|
|
|
int reg, int imm)
|
2021-11-10 11:01:22 +01:00
|
|
|
{
|
|
|
|
regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
|
x86-64: make access_ok() independent of LAM
The linear address masking (LAM) code made access_ok() more complicated,
in that it now needs to untag the address in order to verify the access
range. See commit 74c228d20a51 ("x86/uaccess: Provide untagged_addr()
and remove tags before address check").
We were able to avoid that overhead in the get_user/put_user code paths
by simply using the sign bit for the address check, and depending on the
GP fault if the address was non-canonical, which made it all independent
of LAM.
And we can do the same thing for access_ok(): simply check that the user
pointer range has the high bit clear. No need to bother with any
address bit masking.
In fact, we can go a bit further, and just check the starting address
for known small accesses ranges: any accesses that overflow will still
be in the non-canonical area and will still GP fault.
To still make syzkaller catch any potentially unchecked user addresses,
we'll continue to warn about GP faults that are caused by accesses in
the non-canonical range. But we'll limit that to purely "high bit set
and past the one-page 'slop' area".
We could probably just do that "check only starting address" for any
arbitrary range size: realistically all kernel accesses to user space
will be done starting at the low address. But let's leave that kind of
optimization for later. As it is, this already allows us to generate
simpler code and not worry about any tag bits in the address.
The one thing to look out for is the GUP address check: instead of
actually copying data in the virtual address range (and thus bad
addresses being caught by the GP fault), GUP will look up the page
tables manually. As a result, the page table limits need to be checked,
and that was previously implicitly done by the access_ok().
With the relaxed access_ok() check, we need to just do an explicit check
for TASK_SIZE_MAX in the GUP code instead. The GUP code already needs
to do the tag bit unmasking anyway, so there this is all very
straightforward, and there are no LAM issues.
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2023-04-28 12:55:10 -07:00
|
|
|
return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
|
2021-11-10 11:01:22 +01:00
|
|
|
}
|
|
|
|
|
2023-12-05 02:50:18 -08:00
|
|
|
#ifdef CONFIG_X86_FRED
|
|
|
|
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
|
|
|
|
struct pt_regs *regs, unsigned long error_code)
|
|
|
|
{
|
|
|
|
struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
|
|
|
|
unsigned short ss = uregs->ss;
|
|
|
|
unsigned short cs = uregs->cs;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move the NMI bit from the invalid stack frame, which caused ERETU
|
|
|
|
* to fault, to the fault handler's stack frame, thus to unblock NMI
|
|
|
|
* with the fault handler's ERETS instruction ASAP if NMI is blocked.
|
|
|
|
*/
|
|
|
|
regs->fred_ss.nmi = uregs->fred_ss.nmi;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sync event information to uregs, i.e., the ERETU return frame, but
|
|
|
|
* is it safe to write to the ERETU return frame which is just above
|
|
|
|
* current event stack frame?
|
|
|
|
*
|
|
|
|
* The RSP used by FRED to push a stack frame is not the value in %rsp,
|
|
|
|
* it is calculated from %rsp with the following 2 steps:
|
|
|
|
* 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0) // Reserve N*64 bytes
|
|
|
|
* 2) RSP = RSP & ~0x3f // Align to a 64-byte cache line
|
|
|
|
* when an event delivery doesn't trigger a stack level change.
|
|
|
|
*
|
|
|
|
* Here is an example with N*64 (N=1) bytes reserved:
|
|
|
|
*
|
|
|
|
* 64-byte cache line ==> ______________
|
|
|
|
* |___Reserved___|
|
|
|
|
* |__Event_data__|
|
|
|
|
* |_____SS_______|
|
|
|
|
* |_____RSP______|
|
|
|
|
* |_____FLAGS____|
|
|
|
|
* |_____CS_______|
|
|
|
|
* |_____IP_______|
|
|
|
|
* 64-byte cache line ==> |__Error_code__| <== ERETU return frame
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* |______________|
|
|
|
|
* 64-byte cache line ==> |______________| <== RSP after step 1) and 2)
|
|
|
|
* |___Reserved___|
|
|
|
|
* |__Event_data__|
|
|
|
|
* |_____SS_______|
|
|
|
|
* |_____RSP______|
|
|
|
|
* |_____FLAGS____|
|
|
|
|
* |_____CS_______|
|
|
|
|
* |_____IP_______|
|
|
|
|
* 64-byte cache line ==> |__Error_code__| <== ERETS return frame
|
|
|
|
*
|
|
|
|
* Thus a new FRED stack frame will always be pushed below a previous
|
|
|
|
* FRED stack frame ((N*64) bytes may be reserved between), and it is
|
|
|
|
* safe to write to a previous FRED stack frame as they never overlap.
|
|
|
|
*/
|
|
|
|
fred_info(uregs)->edata = fred_event_data(regs);
|
|
|
|
uregs->ssx = regs->ssx;
|
|
|
|
uregs->fred_ss.ss = ss;
|
|
|
|
/* The NMI bit was moved away above */
|
|
|
|
uregs->fred_ss.nmi = 0;
|
|
|
|
uregs->csx = regs->csx;
|
|
|
|
uregs->fred_cs.sl = 0;
|
|
|
|
uregs->fred_cs.wfe = 0;
|
|
|
|
uregs->cs = cs;
|
|
|
|
uregs->orig_ax = error_code;
|
|
|
|
|
|
|
|
return ex_handler_default(fixup, regs);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2021-09-08 15:29:18 +02:00
|
|
|
int ex_get_fixup_type(unsigned long ip)
|
2016-02-17 10:20:12 -08:00
|
|
|
{
|
2021-09-08 15:29:18 +02:00
|
|
|
const struct exception_table_entry *e = search_exception_tables(ip);
|
2016-02-17 10:20:12 -08:00
|
|
|
|
2021-11-10 11:01:09 +01:00
|
|
|
return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
|
2016-02-17 10:20:12 -08:00
|
|
|
}
|
|
|
|
|
2018-08-28 22:14:19 +02:00
|
|
|
/*
 * Try to recover from an exception raised at a whitelisted kernel
 * instruction.
 *
 * @regs:	fault-time register state; fixed up in place on success
 * @trapnr:	vector number of the exception
 * @error_code:	hardware error code of the exception
 * @fault_addr:	faulting address (where relevant for the handler type)
 *
 * Searches the exception tables for regs->ip and dispatches to the
 * handler selected by the entry's type field; the reg/imm fields encode
 * per-type operands.  Returns 0 when no entry covers regs->ip (the
 * fault is not recoverable here); otherwise returns the handler's
 * result.  An entry with an unrecognized type is a fatal kernel bug.
 */
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
		    unsigned long fault_addr)
{
	const struct exception_table_entry *e;
	int type, reg, imm;

#ifdef CONFIG_PNPBIOS
	/*
	 * Faults inside 16-bit PNP BIOS code cannot be fixed up normally;
	 * restore the stack/IP saved before the BIOS call and resume there.
	 */
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	e = search_exception_tables(regs->ip);
	if (!e)
		return 0;

	/* Decode the packed handler type and its reg/imm operands */
	type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
	reg = FIELD_GET(EX_DATA_REG_MASK, e->data);
	imm = FIELD_GET(EX_DATA_IMM_MASK, e->data);

	switch (type) {
	case EX_TYPE_DEFAULT:
	case EX_TYPE_DEFAULT_MCE_SAFE:
		return ex_handler_default(e, regs);
	case EX_TYPE_FAULT:
	case EX_TYPE_FAULT_MCE_SAFE:
		return ex_handler_fault(e, regs, trapnr);
	case EX_TYPE_UACCESS:
		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
	case EX_TYPE_CLEAR_FS:
		return ex_handler_clear_fs(e, regs);
	case EX_TYPE_FPU_RESTORE:
		return ex_handler_fprestore(e, regs);
	case EX_TYPE_BPF:
		return ex_handler_bpf(e, regs);
	case EX_TYPE_WRMSR:
		return ex_handler_msr(e, regs, true, false, reg);
	case EX_TYPE_RDMSR:
		return ex_handler_msr(e, regs, false, false, reg);
	case EX_TYPE_WRMSR_SAFE:
		return ex_handler_msr(e, regs, true, true, reg);
	case EX_TYPE_RDMSR_SAFE:
		return ex_handler_msr(e, regs, false, true, reg);
	/*
	 * MSR faults inside the machine-check handler: any fall-through
	 * out of these cases hits the BUG() below as a backstop.
	 */
	case EX_TYPE_WRMSR_IN_MCE:
		ex_handler_msr_mce(regs, true);
		break;
	case EX_TYPE_RDMSR_IN_MCE:
		ex_handler_msr_mce(regs, false);
		break;
	/* Discard one stack word, then handle like IMM_REG */
	case EX_TYPE_POP_REG:
		regs->sp += sizeof(long);
		fallthrough;
	case EX_TYPE_IMM_REG:
		return ex_handler_imm_reg(e, regs, reg, imm);
	case EX_TYPE_FAULT_SGX:
		return ex_handler_sgx(e, regs, trapnr);
	case EX_TYPE_UCOPY_LEN:
		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
	case EX_TYPE_ZEROPAD:
		return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
	case EX_TYPE_ERETU:
		return ex_handler_eretu(e, regs, error_code);
#endif
	}
	/* An exception table entry with an unknown type is a kernel bug */
	BUG();
}
|
2012-04-19 15:24:20 -07:00
|
|
|
|
2016-04-02 07:01:34 -07:00
|
|
|
extern unsigned int early_recursion_flag;
|
|
|
|
|
2012-04-19 15:24:20 -07:00
|
|
|
/* Restricted version used during very early boot */
/*
 * Handle an exception taken before the normal trap handlers are set up.
 *
 * @regs:	fault-time register state
 * @trapnr:	vector number of the exception
 *
 * Ignores early NMIs, attempts the regular exception-table fixup and
 * WARN handling, and otherwise prints a panic message and halts.  Does
 * not return to the caller on the failure paths (halt loop).
 */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	/* Ignore early NMIs. */
	if (trapnr == X86_TRAP_NMI)
		return;

	/*
	 * If we keep faulting while handling earlier faults, stop trying
	 * to report anything (the reporting itself may be what faults).
	 */
	if (early_recursion_flag > 2)
		goto halt_loop;

	/*
	 * Old CPUs leave the high bits of CS on the stack
	 * undefined. I'm not sure which CPUs do this, but at least
	 * the 486 DX works this way.
	 * Xen pv domains are not using the default __KERNEL_CS.
	 */
	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
		goto fail;

	/*
	 * The full exception fixup machinery is available as soon as
	 * the early IDT is loaded.  This means that it is the
	 * responsibility of extable users to either function correctly
	 * when handlers are invoked early or to simply avoid causing
	 * exceptions before they're ready to handle them.
	 *
	 * This is better than filtering which handlers can be used,
	 * because refusing to call a handler here is guaranteed to
	 * result in a hard-to-debug panic.
	 *
	 * Keep in mind that not all vectors actually get here.  Early
	 * page faults, for example, are special.
	 */
	if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
		return;

	/* #UD may be a WARN/BUG emitted via ud2; let report_bug() decide */
	if (trapnr == X86_TRAP_UD) {
		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
			/* Skip the ud2. */
			regs->ip += LEN_UD2;
			return;
		}

		/*
		 * If this was a BUG and report_bug returns or if this
		 * was just a normal #UD, we want to continue onward and
		 * crash.
		 */
	}

fail:
	early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
		     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
		     regs->orig_ax, read_cr2());

	show_regs(regs);

halt_loop:
	/* Nothing more we can do; park the CPU forever */
	while (true)
		halt();
}
|