// SPDX-License-Identifier: GPL-2.0

#include <linux/pagewalk.h>
#include <linux/debugfs.h>
#include <linux/ptdump.h>
#include <linux/kasan.h>
#include "internal.h"

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
 * This is an optimization for KASAN=y case. Since all kasan page tables
 * eventually point to the kasan_early_shadow_page we could call note_page()
 * right away without walking through lower level page tables. This saves
 * us dozens of seconds (minutes for 5-level config) while checking for
 * W+X mapping or reading kernel_page_tables debugfs file.
 */
static inline int note_kasan_page_table(struct mm_walk *walk,
					unsigned long addr)
{
	struct ptdump_state *st = walk->private;

	st->note_page_pte(st, addr, kasan_early_shadow_pte[0]);

	walk->action = ACTION_CONTINUE;

	return 0;
}
#endif
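/*
 * One entry handler per page table level. Each handler reads the entry once
 * (READ_ONCE() or ptep_get_lockless()), feeds it to the optional
 * effective_prot_*() hook, and for leaf entries (or the shared KASAN shadow
 * tables caught above) reports it via note_page_*() and tells the walker not
 * to descend any further.
 */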
static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	pgd_t val = READ_ONCE(*pgd);

#if CONFIG_PGTABLE_LEVELS > 4 && \
		(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
	if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
		return note_kasan_page_table(walk, addr);
#endif

	if (st->effective_prot_pgd)
		st->effective_prot_pgd(st, val);

	if (pgd_leaf(val)) {
		st->note_page_pgd(st, addr, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}
static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	p4d_t val = READ_ONCE(*p4d);

#if CONFIG_PGTABLE_LEVELS > 3 && \
		(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
	if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
		return note_kasan_page_table(walk, addr);
#endif

	if (st->effective_prot_p4d)
		st->effective_prot_p4d(st, val);

	if (p4d_leaf(val)) {
		st->note_page_p4d(st, addr, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}
static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	pud_t val = READ_ONCE(*pud);

#if CONFIG_PGTABLE_LEVELS > 2 && \
		(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
	if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
		return note_kasan_page_table(walk, addr);
#endif

	if (st->effective_prot_pud)
		st->effective_prot_pud(st, val);

	if (pud_leaf(val)) {
		st->note_page_pud(st, addr, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}
static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	pmd_t val = READ_ONCE(*pmd);

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
	if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
		return note_kasan_page_table(walk, addr);
#endif

	if (st->effective_prot_pmd)
		st->effective_prot_pmd(st, val);

	if (pmd_leaf(val)) {
		st->note_page_pmd(st, addr, val);
		walk->action = ACTION_CONTINUE;
	}

	return 0;
}
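/*
 * PTEs are the lowest level: read the entry locklessly and report it
 * unconditionally, as there is nothing further to descend into.
 */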
static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	pte_t val = ptep_get_lockless(pte);

	if (st->effective_prot_pte)
		st->effective_prot_pte(st, val);

	st->note_page_pte(st, addr, val);

	return 0;
}
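/*
 * Called for gaps in the page tables: depth identifies the level at which the
 * hole was found (0 = PGD, 1 = P4D, 2 = PUD, 3 = PMD, 4 = PTE), and a zero
 * entry of that level is reported to the corresponding note_page_*() hook.
 */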
static int ptdump_hole(unsigned long addr, unsigned long next,
		       int depth, struct mm_walk *walk)
{
	struct ptdump_state *st = walk->private;
	pte_t pte_zero = {0};
	pmd_t pmd_zero = {0};
	pud_t pud_zero = {0};
	p4d_t p4d_zero = {0};
	pgd_t pgd_zero = {0};

	switch (depth) {
	case 4:
		st->note_page_pte(st, addr, pte_zero);
		break;
	case 3:
		st->note_page_pmd(st, addr, pmd_zero);
		break;
	case 2:
		st->note_page_pud(st, addr, pud_zero);
		break;
	case 1:
		st->note_page_p4d(st, addr, p4d_zero);
		break;
	case 0:
		st->note_page_pgd(st, addr, pgd_zero);
		break;
	default:
		break;
	}

	return 0;
}
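/* Callbacks wired into the generic page table walker. */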
static const struct mm_walk_ops ptdump_ops = {
	.pgd_entry	= ptdump_pgd_entry,
	.p4d_entry	= ptdump_p4d_entry,
	.pud_entry	= ptdump_pud_entry,
	.pmd_entry	= ptdump_pmd_entry,
	.pte_entry	= ptdump_pte_entry,
	.pte_hole	= ptdump_hole,
};
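/*
 * Walk each range in st->range of the given mm and report every entry and
 * hole through the ptdump_state callbacks. Memory hotplug and the mmap lock
 * are held across the walk so the page tables cannot change underneath us.
 */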
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
	const struct ptdump_range *range = st->range;

	get_online_mems();
	mmap_write_lock(mm);
	while (range->start != range->end) {
		walk_page_range_debug(mm, range->start, range->end,
				      &ptdump_ops, pgd, st);
		range++;
	}
	mmap_write_unlock(mm);
	put_online_mems();

	/* Flush out the last page */
	st->note_page_flush(st);
}
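/*
 * Illustration only, not an in-tree user: a hypothetical architecture dumper
 * built on ptdump_walk_pgd(). The ptdump_state fields are the ones this file
 * dereferences; the example_* identifiers are made up. Every note_page_*()
 * hook and note_page_flush() must be provided, since the walk above calls
 * them without NULL checks, while the effective_prot_*() hooks are optional.
 * The range table is terminated by an entry whose start equals its end.
 *
 *	static void example_note_pte(struct ptdump_state *st,
 *				     unsigned long addr, pte_t pte)
 *	{
 *		if (pte_val(pte))
 *			pr_info("pte for %lx\n", addr);
 *	}
 *
 *	(similar stubs for example_note_{pmd,pud,p4d,pgd} and example_flush)
 *
 *	static const struct ptdump_range example_ranges[] = {
 *		{PAGE_OFFSET, ~0UL},
 *		{0, 0}
 *	};
 *
 *	static struct ptdump_state example_st = {
 *		.note_page_pte		= example_note_pte,
 *		.note_page_pmd		= example_note_pmd,
 *		.note_page_pud		= example_note_pud,
 *		.note_page_p4d		= example_note_p4d,
 *		.note_page_pgd		= example_note_pgd,
 *		.note_page_flush	= example_flush,
 *		.range			= example_ranges,
 *	};
 *
 *	ptdump_walk_pgd(&example_st, &init_mm, NULL);
 */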
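/*
 * debugfs interface: reading check_wx_pages re-runs the architecture's
 * W^X check (ptdump_check_wx()) and prints SUCCESS or FAILED.
 */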
static int check_wx_show(struct seq_file *m, void *v)
{
	if (ptdump_check_wx())
		seq_puts(m, "SUCCESS\n");
	else
		seq_puts(m, "FAILED\n");

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(check_wx);
static int ptdump_debugfs_init(void)
{
	debugfs_create_file("check_wx_pages", 0400, NULL, NULL, &check_wx_fops);

	return 0;
}
device_initcall(ptdump_debugfs_init);