linux/arch/arm64/kernel/elfcore.c

140 lines
2.9 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0-only
#include <linux/coredump.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/cpufeature.h>
#include <asm/mte.h>
#define for_each_mte_vma(cprm, i, m) \
if (system_supports_mte()) \
for (i = 0, m = cprm->vma_meta; \
i < cprm->vma_count; \
i++, m = cprm->vma_meta + i) \
if (m->flags & VM_MTE)
static unsigned long mte_vma_tag_dump_size(struct core_vma_metadata *m)
{
return (m->dump_size >> PAGE_SHIFT) * MTE_PAGE_TAG_STORAGE;
}
/* Derived from dump_user_range(); start/end must be page-aligned */
static int mte_dump_tag_range(struct coredump_params *cprm,
unsigned long start, unsigned long len)
{
int ret = 1;
unsigned long addr;
void *tags = NULL;
fs: avoid mmap sem relocks when coredumping with many missing pages Dumping processes with large allocated and mostly not-faulted areas is very slow. Borrowing a test case from Tavian Barnes: int main(void) { char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); printf("%p %m\n", mem); if (mem != MAP_FAILED) { mem[0] = 1; } abort(); } That's 1TB of almost completely not-populated area. On my test box it takes 13-14 seconds to dump. The profile shows: - 99.89% 0.00% a.out entry_SYSCALL_64_after_hwframe do_syscall_64 syscall_exit_to_user_mode arch_do_signal_or_restart - get_signal - 99.89% do_coredump - 99.88% elf_core_dump - dump_user_range - 98.12% get_dump_page - 64.19% __get_user_pages - 40.92% gup_vma_lookup - find_vma - mt_find 4.21% __rcu_read_lock 1.33% __rcu_read_unlock - 3.14% check_vma_flags 0.68% vma_is_secretmem 0.61% __cond_resched 0.60% vma_pgtable_walk_end 0.59% vma_pgtable_walk_begin 0.58% no_page_table - 15.13% down_read_killable 0.69% __cond_resched 13.84% up_read 0.58% __cond_resched Almost 29% of the time is spent relocking the mmap semaphore between calls to get_dump_page() which find nothing. Whacking that results in times of 10 seconds (down from 13-14). While here make the thing killable. The real problem is the page-sized iteration and the real fix would patch it up instead. It is left as an exercise for the mm-familiar reader. Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-19 11:32:05 +01:00
int locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
fs: avoid mmap sem relocks when coredumping with many missing pages Dumping processes with large allocated and mostly not-faulted areas is very slow. Borrowing a test case from Tavian Barnes: int main(void) { char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); printf("%p %m\n", mem); if (mem != MAP_FAILED) { mem[0] = 1; } abort(); } That's 1TB of almost completely not-populated area. On my test box it takes 13-14 seconds to dump. The profile shows: - 99.89% 0.00% a.out entry_SYSCALL_64_after_hwframe do_syscall_64 syscall_exit_to_user_mode arch_do_signal_or_restart - get_signal - 99.89% do_coredump - 99.88% elf_core_dump - dump_user_range - 98.12% get_dump_page - 64.19% __get_user_pages - 40.92% gup_vma_lookup - find_vma - mt_find 4.21% __rcu_read_lock 1.33% __rcu_read_unlock - 3.14% check_vma_flags 0.68% vma_is_secretmem 0.61% __cond_resched 0.60% vma_pgtable_walk_end 0.59% vma_pgtable_walk_begin 0.58% no_page_table - 15.13% down_read_killable 0.69% __cond_resched 13.84% up_read 0.58% __cond_resched Almost 29% of the time is spent relocking the mmap semaphore between calls to get_dump_page() which find nothing. Whacking that results in times of 10 seconds (down from 13-14). While here make the thing killable. The real problem is the page-sized iteration and the real fix would patch it up instead. It is left as an exercise for the mm-familiar reader. Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-19 11:32:05 +01:00
struct page *page = get_dump_page(addr, &locked);
/*
* get_dump_page() returns NULL when encountering an empty
* page table entry that would otherwise have been filled with
* the zero page. Skip the equivalent tag dump which would
* have been all zeros.
*/
if (!page) {
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;
}
/*
* Pages mapped in user space as !pte_access_permitted() (e.g.
* PROT_EXEC only) may not have the PG_mte_tagged flag set.
*/
arm64: mte: Fix/clarify the PG_mte_tagged semantics Currently the PG_mte_tagged page flag mostly means the page contains valid tags and it should be set after the tags have been cleared or restored. However, in mte_sync_tags() it is set before setting the tags to avoid, in theory, a race with concurrent mprotect(PROT_MTE) for shared pages. However, a concurrent mprotect(PROT_MTE) with a copy on write in another thread can cause the new page to have stale tags. Similarly, tag reading via ptrace() can read stale tags if the PG_mte_tagged flag is set before actually clearing/restoring the tags. Fix the PG_mte_tagged semantics so that it is only set after the tags have been cleared or restored. This is safe for swap restoring into a MAP_SHARED or CoW page since the core code takes the page lock. Add two functions to test and set the PG_mte_tagged flag with acquire and release semantics. The downside is that concurrent mprotect(PROT_MTE) on a MAP_SHARED page may cause tag loss. This is already the case for KVM guests if a VMM changes the page protection while the guest triggers a user_mem_abort(). Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> [pcc@google.com: fix build with CONFIG_ARM64_MTE disabled] Signed-off-by: Peter Collingbourne <pcc@google.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Steven Price <steven.price@arm.com> Cc: Will Deacon <will@kernel.org> Cc: Marc Zyngier <maz@kernel.org> Cc: Peter Collingbourne <pcc@google.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20221104011041.290951-3-pcc@google.com
2022-11-03 18:10:35 -07:00
if (!page_mte_tagged(page)) {
put_page(page);
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;
}
if (!tags) {
tags = mte_allocate_tag_storage();
if (!tags) {
put_page(page);
ret = 0;
break;
}
}
mte_save_page_tags(page_address(page), tags);
put_page(page);
if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) {
ret = 0;
break;
}
}
if (tags)
mte_free_tag_storage(tags);
return ret;
}
Elf_Half elf_core_extra_phdrs(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
int vma_count = 0;
for_each_mte_vma(cprm, i, m)
vma_count++;
return vma_count;
}
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
int i;
struct core_vma_metadata *m;
for_each_mte_vma(cprm, i, m) {
struct elf_phdr phdr;
phdr.p_type = PT_AARCH64_MEMTAG_MTE;
phdr.p_offset = offset;
phdr.p_vaddr = m->start;
phdr.p_paddr = 0;
phdr.p_filesz = mte_vma_tag_dump_size(m);
phdr.p_memsz = m->end - m->start;
offset += phdr.p_filesz;
phdr.p_flags = 0;
phdr.p_align = 0;
if (!dump_emit(cprm, &phdr, sizeof(phdr)))
return 0;
}
return 1;
}
size_t elf_core_extra_data_size(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
size_t data_size = 0;
for_each_mte_vma(cprm, i, m)
data_size += mte_vma_tag_dump_size(m);
return data_size;
}
int elf_core_write_extra_data(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
for_each_mte_vma(cprm, i, m) {
if (!mte_dump_tag_range(cprm, m->start, m->dump_size))
return 0;
}
return 1;
}