
We can hit the following BUG_ON during memory unplug:

  kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:342!
  Oops: Exception in kernel mode, sig: 5 [#1]
  LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
  NIP [c000000000093308] pmd_fragment_free+0x48/0xc0
  LR [c00000000147bfec] remove_pagetable+0x578/0x60c
  Call Trace:
    0xc000008050000000 (unreliable)
    remove_pagetable+0x384/0x60c
    radix__remove_section_mapping+0x18/0x2c
    remove_section_mapping+0x1c/0x3c
    arch_remove_memory+0x11c/0x180
    try_remove_memory+0x120/0x1b0
    __remove_memory+0x20/0x40
    dlpar_remove_lmb+0xc0/0x114
    dlpar_memory+0x8b0/0xb20
    handle_dlpar_errorlog+0xc0/0x190
    pseries_hp_work_fn+0x2c/0x60
    process_one_work+0x30c/0x810
    worker_thread+0x98/0x540
    kthread+0x1c4/0x1d0
    ret_from_kernel_thread+0x5c/0x74

This occurs when an unplug is attempted for memory that was mapped using memblock pages as part of the early kernel page table setup. The PMD or PTE fragment count was never initialized for those PMD or PTE pages.

This can be fixed by allocating memory at PAGE_SIZE granularity during early page table allocation. This ensures a given page is not shared with another memblock allocation, so the page-table pages can be freed correctly when they are removed.

Since we now do PAGE_SIZE allocations for both the PUD table and the PMD table (PTE table allocation is already PAGE_SIZE), we end up allocating more memory for the same amount of system RAM. Here is a comparison of how much more is needed for a 64T and a 2G system after this patch:

1. 64T system
-------------
64T RAM would need 64G for vmemmap with struct page size being 64B.
128 PUD tables for 64T memory (1G mappings)
1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K

2. 2G system
------------
2G RAM would need 2M for vmemmap with struct page size being 64B.
1 PUD table for 2G memory (1G mapping)
1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200709131925.922266-2-aneesh.kumar@linux.ibm.com
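The shape of the fix can be sketched as below. This is an illustrative sketch only, not the actual patch: the helper name early_pud_alloc() is hypothetical, while memblock_alloc(), PageReserved() and free_reserved_page() are real kernel interfaces. Allocating each early table as a full page means its backing page is never shared with another memblock allocation, so the __pud_free() path in the header below can recognise it via PageReserved() and return it with free_reserved_page() instead of handing it to the slab cache.

/*
 * Illustrative sketch (hypothetical helper, not the actual patch):
 * allocate early page-table memory at PAGE_SIZE granularity so the
 * backing page is exclusively owned by this table and can later be
 * freed page-by-page on memory unplug.
 */
#include <linux/memblock.h>

static pud_t *early_pud_alloc(void)
{
	/* One full page per table instead of the 4K PUD_TABLE_SIZE. */
	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}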
181 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H
#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H
/*
 */

#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>

struct vmemmap_backing {
	struct vmemmap_backing *list;
	unsigned long phys;
	unsigned long virt_addr;
};
extern struct vmemmap_backing *vmemmap_list;

extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long);
extern void pmd_fragment_free(unsigned long *);
extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
extern void __tlb_remove_table(void *_table);
void pte_frag_destroy(void *pte_frag);

static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
{
#ifdef CONFIG_PPC_64K_PAGES
	return (pgd_t *)__get_free_page(pgtable_gfp_flags(mm, PGALLOC_GFP));
#else
	struct page *page;
	page = alloc_pages(pgtable_gfp_flags(mm, PGALLOC_GFP | __GFP_RETRY_MAYFAIL),
			   4);
	if (!page)
		return NULL;
	return (pgd_t *) page_address(page);
#endif
}

static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_PPC_64K_PAGES
	free_page((unsigned long)pgd);
#else
	free_pages((unsigned long)pgd, 4);
#endif
}

static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;

	if (radix_enabled())
		return radix__pgd_alloc(mm);

	pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
			       pgtable_gfp_flags(mm, GFP_KERNEL));
	if (unlikely(!pgd))
		return pgd;

	/*
	 * Don't scan the PGD for pointers, it contains references to PUDs but
	 * those references are not full pointers and so can't be recognised by
	 * kmemleak.
	 */
	kmemleak_no_scan(pgd);

	/*
	 * With hugetlb, we don't clear the second half of the page table.
	 * If we share the same slab cache with the pmd or pud level table,
	 * we need to make sure we zero out the full table on alloc.
	 * With 4K we don't store slot in the second half. Hence we don't
	 * need to do this for 4k.
	 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_PPC_64K_PAGES) && \
	(H_PGD_INDEX_SIZE == H_PUD_CACHE_INDEX)
	memset(pgd, 0, PGD_TABLE_SIZE);
#endif
	return pgd;
}

static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	if (radix_enabled())
		return radix__pgd_free(mm, pgd);
	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
}

static inline void p4d_populate(struct mm_struct *mm, p4d_t *pgd, pud_t *pud)
{
	*pgd = __p4d(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
}

static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;

	pud = kmem_cache_alloc(PGT_CACHE(PUD_CACHE_INDEX),
			       pgtable_gfp_flags(mm, GFP_KERNEL));
	/*
	 * Tell kmemleak to ignore the PUD, that means don't scan it for
	 * pointers and don't consider it a leak. PUDs are typically only
	 * referred to by their PGD, but kmemleak is not able to recognise those
	 * as pointers, leading to false leak reports.
	 */
	kmemleak_ignore(pud);

	return pud;
}

static inline void __pud_free(pud_t *pud)
{
	struct page *page = virt_to_page(pud);

	/*
	 * Early pud pages allocated via memblock allocator
	 * can't be directly freed to slab
	 */
	if (PageReserved(page))
		free_reserved_page(page);
	else
		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
}

static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
	return __pud_free(pud);
}

static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
	*pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS);
}

static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
				  unsigned long address)
{
	pgtable_free_tlb(tlb, pud, PUD_INDEX);
}

static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
	return pmd_fragment_alloc(mm, addr);
}

static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
	pmd_fragment_free((unsigned long *)pmd);
}

static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
				  unsigned long address)
{
	return pgtable_free_tlb(tlb, pmd, PMD_INDEX);
}

static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
				       pte_t *pte)
{
	*pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS);
}

static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
				pgtable_t pte_page)
{
	*pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
}

static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
				  unsigned long address)
{
	pgtable_free_tlb(tlb, table, PTE_INDEX);
}

extern atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
static inline void update_page_count(int psize, long count)
{
	if (IS_ENABLED(CONFIG_PROC_FS))
		atomic_long_add(count, &direct_pages_count[psize]);
}

#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */