linux/arch/riscv/include/asm/cacheflush.h

108 lines
2.7 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2015 Regents of the University of California
*/
#ifndef _ASM_RISCV_CACHEFLUSH_H
#define _ASM_RISCV_CACHEFLUSH_H
#include <linux/mm.h>
static inline void local_flush_icache_all(void)
{
asm volatile ("fence.i" ::: "memory");
}
static inline void local_flush_icache_range(unsigned long start,
unsigned long end)
{
local_flush_icache_all();
}
#define PG_dcache_clean PG_arch_1
static inline void flush_dcache_folio(struct folio *folio)
{
if (test_bit(PG_dcache_clean, &folio->flags))
clear_bit(PG_dcache_clean, &folio->flags);
}
#define flush_dcache_folio flush_dcache_folio
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
static inline void flush_dcache_page(struct page *page)
{
flush_dcache_folio(page_folio(page));
}
#define flush_icache_user_page(vma, pg, addr, len) \
do { \
if (vma->vm_flags & VM_EXEC) \
flush_icache_mm(vma->vm_mm, 0); \
} while (0)
#ifdef CONFIG_64BIT
riscv: Stop emitting preventive sfence.vma for new vmalloc mappings In 6.5, we removed the vmalloc fault path because that can't work (see [1] [2]). Then in order to make sure that new page table entries were seen by the page table walker, we had to preventively emit a sfence.vma on all harts [3] but this solution is very costly since it relies on IPI. And even there, we could end up in a loop of vmalloc faults if a vmalloc allocation is done in the IPI path (for example if it is traced, see [4]), which could result in a kernel stack overflow. Those preventive sfence.vma needed to be emitted because: - if the uarch caches invalid entries, the new mapping may not be observed by the page table walker and an invalidation may be needed. - if the uarch does not cache invalid entries, a reordered access could "miss" the new mapping and traps: in that case, we would actually only need to retry the access, no sfence.vma is required. So this patch removes those preventive sfence.vma and actually handles the possible (and unlikely) exceptions. And since the kernel stacks mappings lie in the vmalloc area, this handling must be done very early when the trap is taken, at the very beginning of handle_exception: this also rules out the vmalloc allocations in the fault path. Link: https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bjorn@kernel.org/ [1] Link: https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dylan@andestech.com [2] Link: https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexghiti@rivosinc.com/ [3] Link: https://lore.kernel.org/lkml/20200508144043.13893-1-joro@8bytes.org/ [4] Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com> Reviewed-by: Yunhui Cui <cuiyunhui@bytedance.com> Link: https://lore.kernel.org/r/20240717060125.139416-4-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2024-07-17 08:01:24 +02:00
extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
extern char _end[];
#define flush_cache_vmap flush_cache_vmap
static inline void flush_cache_vmap(unsigned long start, unsigned long end)
{
if (is_vmalloc_or_module_addr((void *)start)) {
int i;
/*
* We don't care if concurrently a cpu resets this value since
* the only place this can happen is in handle_exception() where
* an sfence.vma is emitted.
*/
for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
new_vmalloc[i] = -1ULL;
}
}
#define flush_cache_vmap_early(start, end) local_flush_tlb_kernel_range(start, end)
#endif
#ifndef CONFIG_SMP
#define flush_icache_all() local_flush_icache_all()
#define flush_icache_mm(mm, local) flush_icache_all()
#else /* CONFIG_SMP */
void flush_icache_all(void);
void flush_icache_mm(struct mm_struct *mm, bool local);
#endif /* CONFIG_SMP */
/*
* RISC-V doesn't have an instruction to flush parts of the instruction cache,
* so instead we just flush the whole thing.
*/
#define flush_icache_range flush_icache_range
static inline void flush_icache_range(unsigned long start, unsigned long end)
{
flush_icache_all();
}
extern unsigned int riscv_cbom_block_size;
extern unsigned int riscv_cboz_block_size;
extern unsigned int riscv_cbop_block_size;
void riscv_init_cbo_blocksizes(void);
#ifdef CONFIG_RISCV_DMA_NONCOHERENT
void riscv_noncoherent_supported(void);
riscv: allow kmalloc() caches aligned to the smallest value Currently, riscv defines ARCH_DMA_MINALIGN as L1_CACHE_BYTES, I.E 64Bytes, if CONFIG_RISCV_DMA_NONCOHERENT=y. To support unified kernel Image, usually we have to enable CONFIG_RISCV_DMA_NONCOHERENT, thus it brings some bad effects to coherent platforms: Firstly, it wastes memory, kmalloc-96, kmalloc-32, kmalloc-16 and kmalloc-8 slab caches don't exist any more, they are replaced with either kmalloc-128 or kmalloc-64. Secondly, larger than necessary kmalloc aligned allocations results in unnecessary cache/TLB pressure. This issue also exists on arm64 platforms. From last year, Catalin tried to solve this issue by decoupling ARCH_KMALLOC_MINALIGN from ARCH_DMA_MINALIGN, limiting kmalloc() minimum alignment to dma_get_cache_alignment() and replacing ARCH_KMALLOC_MINALIGN usage in various drivers with ARCH_DMA_MINALIGN etc.[1] One fact we can make use of for riscv: if the CPU doesn't support ZICBOM or T-HEAD CMO, we know the platform is coherent. Based on Catalin's work and above fact, we can easily solve the kmalloc align issue for riscv: we can override dma_get_cache_alignment(), then let it return ARCH_DMA_MINALIGN at the beginning and return 1 once we know the underlying HW neither supports ZICBOM nor supports T-HEAD CMO. So what about if the CPU supports ZICBOM or T-HEAD CMO, but all the devices are dma coherent? Well, we use ARCH_DMA_MINALIGN as the kmalloc minimum alignment, nothing changed in this case. This case can be improved in the future. After this patch, a simple test of booting to a small buildroot rootfs on qemu shows: kmalloc-96 5041 5041 96 ... kmalloc-64 9606 9606 64 ... kmalloc-32 5128 5128 32 ... kmalloc-16 7682 7682 16 ... kmalloc-8 10246 10246 8 ... So we save about 1268KB memory. The saving will be much larger in normal OS env on real HW platforms. Link: https://lore.kernel.org/linux-arm-kernel/20230524171904.3967031-1-catalin.marinas@arm.com/ [1] Signed-off-by: Jisheng Zhang <jszhang@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20230718152214.2907-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2023-07-18 23:22:13 +08:00
void __init riscv_set_dma_cache_alignment(void);
#else
static inline void riscv_noncoherent_supported(void) {}
riscv: allow kmalloc() caches aligned to the smallest value Currently, riscv defines ARCH_DMA_MINALIGN as L1_CACHE_BYTES, I.E 64Bytes, if CONFIG_RISCV_DMA_NONCOHERENT=y. To support unified kernel Image, usually we have to enable CONFIG_RISCV_DMA_NONCOHERENT, thus it brings some bad effects to coherent platforms: Firstly, it wastes memory, kmalloc-96, kmalloc-32, kmalloc-16 and kmalloc-8 slab caches don't exist any more, they are replaced with either kmalloc-128 or kmalloc-64. Secondly, larger than necessary kmalloc aligned allocations results in unnecessary cache/TLB pressure. This issue also exists on arm64 platforms. From last year, Catalin tried to solve this issue by decoupling ARCH_KMALLOC_MINALIGN from ARCH_DMA_MINALIGN, limiting kmalloc() minimum alignment to dma_get_cache_alignment() and replacing ARCH_KMALLOC_MINALIGN usage in various drivers with ARCH_DMA_MINALIGN etc.[1] One fact we can make use of for riscv: if the CPU doesn't support ZICBOM or T-HEAD CMO, we know the platform is coherent. Based on Catalin's work and above fact, we can easily solve the kmalloc align issue for riscv: we can override dma_get_cache_alignment(), then let it return ARCH_DMA_MINALIGN at the beginning and return 1 once we know the underlying HW neither supports ZICBOM nor supports T-HEAD CMO. So what about if the CPU supports ZICBOM or T-HEAD CMO, but all the devices are dma coherent? Well, we use ARCH_DMA_MINALIGN as the kmalloc minimum alignment, nothing changed in this case. This case can be improved in the future. After this patch, a simple test of booting to a small buildroot rootfs on qemu shows: kmalloc-96 5041 5041 96 ... kmalloc-64 9606 9606 64 ... kmalloc-32 5128 5128 32 ... kmalloc-16 7682 7682 16 ... kmalloc-8 10246 10246 8 ... So we save about 1268KB memory. The saving will be much larger in normal OS env on real HW platforms. Link: https://lore.kernel.org/linux-arm-kernel/20230524171904.3967031-1-catalin.marinas@arm.com/ [1] Signed-off-by: Jisheng Zhang <jszhang@kernel.org> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> Link: https://lore.kernel.org/r/20230718152214.2907-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
2023-07-18 23:22:13 +08:00
static inline void riscv_set_dma_cache_alignment(void) {}
#endif
/*
* Bits in sys_riscv_flush_icache()'s flags argument.
*/
#define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL
#define SYS_RISCV_FLUSH_ICACHE_ALL (SYS_RISCV_FLUSH_ICACHE_LOCAL)
#include <asm-generic/cacheflush.h>
#endif /* _ASM_RISCV_CACHEFLUSH_H */