
By inducing delays in the right places, Jann Horn created a reproducer
for a hard-to-hit UAF issue that became possible after VMAs were allowed
to be recycled by adding SLAB_TYPESAFE_BY_RCU to their cache.

The race description below is borrowed from Jann's discovery report:
lock_vma_under_rcu() looks up a VMA locklessly with mas_walk() under
rcu_read_lock(). At that point, the VMA may be concurrently freed, and it
can be recycled by another process. vma_start_read() then increments the
vma->vm_refcnt (if it is in an acceptable range), and if this succeeds,
vma_start_read() can return a recycled VMA.

In this scenario where the VMA has been recycled, lock_vma_under_rcu()
will then detect the mismatching ->vm_mm pointer and drop the VMA through
vma_end_read(), which calls vma_refcount_put(). vma_refcount_put() drops
the refcount and then calls rcuwait_wake_up() using a copy of vma->vm_mm.
This is wrong: It implicitly assumes that the caller is keeping the VMA's
mm alive, but in this scenario the caller has no relation to the VMA's mm,
so the rcuwait_wake_up() can cause UAF.
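
To make the failure concrete, the drop path looks roughly like this
(a simplified paraphrase for illustration, not the verbatim kernel
helper; the real vma_refcount_put() also tracks the old count and only
issues the wakeup when a writer holds the remaining references):

  static inline void vma_refcount_put(struct vm_area_struct *vma)
  {
          /* Copy vm_mm: the vma itself may be freed once vm_refcnt drops. */
          struct mm_struct *mm = vma->vm_mm;

          if (!refcount_dec_and_test(&vma->vm_refcnt)) {
                  /*
                   * Wake a writer that may be waiting for readers to drain.
                   * Nothing guarantees "mm" is still alive here: if the vma
                   * was recycled into another process, this caller holds no
                   * reference on that mm, so the wakeup can hit freed memory.
                   */
                  rcuwait_wake_up(&mm->vma_writer_wait);
          }
  }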
The diagram depicting the race:
T1         T2         T3
==         ==         ==
lock_vma_under_rcu
  mas_walk
          <VMA gets removed from mm>
                      mmap
                      <the same VMA is reallocated>
  vma_start_read
    __refcount_inc_not_zero_limited_acquire
                      munmap
                        __vma_enter_locked
                          refcount_add_not_zero
  vma_end_read
    vma_refcount_put
      __refcount_dec_and_test
                      rcuwait_wait_event
                        <finish operation>
      rcuwait_wake_up [UAF]
Note that rcuwait_wait_event() in T3 does not block because the refcount
was already dropped by T1. At that point T3 can exit and free the mm,
causing the UAF in T1.

To avoid this, move the vma->vm_mm verification into vma_start_read() and
grab vma->vm_mm to stabilize it before the vma_refcount_put() operation.
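
A minimal sketch of that idea (illustrative only, not the literal
patch; "grab" is assumed here to mean pinning the foreign mm with the
existing mmgrab()/mmdrop() helpers around the put):

  /* In vma_start_read(), after the temporary vm_refcnt reference is taken. */
  if (unlikely(vma->vm_mm != mm)) {
          struct mm_struct *other_mm = vma->vm_mm;

          /*
           * The vma was recycled into a different mm. Pin that mm so the
           * rcuwait_wake_up() issued by vma_refcount_put() cannot touch a
           * freed mm_struct, then back out of the lock attempt.
           */
          mmgrab(other_mm);
          vma_refcount_put(vma);
          mmdrop(other_mm);
          return NULL;
  }

Consistent with this, lock_vma_under_rcu() in the file below no longer
rechecks vma->vm_mm after vma_start_read() succeeds; it only validates
the address range.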
[surenb@google.com: v3]
Link: https://lkml.kernel.org/r/20250729145709.2731370-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250728175355.2282375-1-surenb@google.com
Fixes: 3104138517 ("mm: make vma cache SLAB_TYPESAFE_BY_RCU")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/all/CAG48ez0-deFbVH=E3jbkWx=X3uVbd8nWeo6kbJPQ0KoUD+m2tA@mail.gmail.com/
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until refcnt is indicating that vma
	 * is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has been already
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	vma_end_read(vma);
fallback:
	rcu_read_unlock();
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */