Merge tag 'mm-stable-2025-06-01-14-06' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

 - "zram: support algorithm-specific parameters" from Sergey Senozhatsky
   adds infrastructure for passing algorithm-specific parameters into
   zram. A single parameter `winbits' is implemented at this time.

 - "memcg: nmi-safe kmem charging" from Shakeel Butt makes memcg
   charging nmi-safe, which is required by BPF programs that can run in
   NMI context.

 - "Some random fixes and cleanup to shmem" from Kemeng Shi implements
   small fixes and cleanups in the shmem code.

 - "Skip mm selftests instead when kernel features are not present" from
   Zi Yan fixes some issues in the MM selftest code.

 - "mm/damon: build-enable essential DAMON components by default" from
   SeongJae Park reworks DAMON Kconfig to make it easier to enable
   CONFIG_DAMON.

 - "sched/numa: add statistics of numa balance task migration" from Libo
   Chen adds more info into sysfs and procfs files to improve visibility
   into the NUMA balancer's task migration activity.

 - "selftests/mm: cow and gup_longterm cleanups" from Mark Brown
   provides various updates to some of the MM selftests to make them
   play better with the overall containing framework.
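
For the zram series above, a minimal usage sketch. This assumes a
/sys/block/zram0 device with the deflate backend available; the
"algo=... deflate.winbits=..." token syntax follows the
algorithm_params_store() parsing shown in the zram hunks below, and the
winbits value here is only illustrative, not part of the patch:

	/* Hedged sketch: write the new deflate.winbits parameter through
	 * zram's algorithm_params sysfs attribute. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/block/zram0/algorithm_params", "w");

		if (!f) {
			perror("algorithm_params");
			return 1;
		}
		/* Negative winbits selects a raw deflate stream, matching
		 * the new DEFLATE_DEF_WINBITS default of -11. */
		fprintf(f, "algo=deflate deflate.winbits=-12\n");
		return fclose(f) ? 1 : 0;
	}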

* tag 'mm-stable-2025-06-01-14-06' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (43 commits)
  mm/khugepaged: clean up refcount check using folio_expected_ref_count()
  selftests/mm: fix test result reporting in gup_longterm
  selftests/mm: report unique test names for each cow test
  selftests/mm: add helper for logging test start and results
  selftests/mm: use standard ksft_finished() in cow and gup_longterm
  selftests/damon/_damon_sysfs: skip testcases if CONFIG_DAMON_SYSFS is disabled
  sched/numa: add statistics of numa balance task
  sched/numa: fix task swap by skipping kernel threads
  tools/testing: check correct variable in open_procmap()
  tools/testing/vma: add missing function stub
  mm/gup: update comment explaining why gup_fast() disables IRQs
  selftests/mm: two fixes for the pfnmap test
  mm/khugepaged: fix race with folio split/free using temporary reference
  mm: add CONFIG_PAGE_BLOCK_ORDER to select page block order
  mmu_notifiers: remove leftover stub macros
  selftests/mm: deduplicate test names in madv_populate
  kcov: rust: add flags for KCOV with Rust
  mm: rust: make CONFIG_MMU ifdefs more narrow
  mmu_gather: move tlb flush for VM_PFNMAP/VM_MIXEDMAP vmas into free_pgtables()
  mm/damon/Kconfig: enable CONFIG_DAMON by default
  ...
Committed by: Linus Torvalds, 2025-06-02 16:00:26 -07:00
Commit: fd1f847350
59 changed files with 909 additions and 408 deletions


@ -1732,6 +1732,12 @@ The following nested keys are defined.
numa_hint_faults (npn)
Number of NUMA hinting faults.
numa_task_migrated (npn)
Number of task migration by NUMA balancing.
numa_task_swapped (npn)
Number of task swap by NUMA balancing.
pgdemote_kswapd
Number of pages demoted by kswapd.
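
The two new keys above also surface system-wide in /proc/vmstat and per
task in /proc/<pid>/sched, per the vmstat and sched/debug hunks further
down. A minimal, hedged sketch of reading the system-wide counters; the
/proc/vmstat path is standard procfs and nothing here is part of the
patch itself:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			/* Print only the NUMA balancing task counters
			 * added by this series. */
			if (!strncmp(line, "numa_task_migrated", 18) ||
			    !strncmp(line, "numa_task_swapped", 17))
				fputs(line, stdout);
		}
		fclose(f);
		return 0;
	}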


@ -227,9 +227,9 @@ void __flush_dcache_folio(struct address_space *mapping, struct folio *folio)
}
/*
* If this is a page cache page, and we have an aliasing VIPT cache,
* If this is a page cache folio, and we have an aliasing VIPT cache,
* we only need to do one flush - which would be at the relevant
* userspace colour, which is congruent with page->index.
* userspace colour, which is congruent with folio->index.
*/
if (mapping && cache_is_vipt_aliasing())
flush_pfn_alias(folio_pfn(folio), folio_pos(folio));


@ -105,7 +105,8 @@ static struct list_head ptable_list[3] = {
#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index)
#define PD_PTDESC(ptable) (list_entry(ptable, struct ptdesc, pt_list))
#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PTDESC(dp)->pt_index)
static const int ptable_shift[3] = {
7+2, /* PGD */


@ -8,7 +8,7 @@
#include "backend_deflate.h"
/* Use the same value as crypto API */
#define DEFLATE_DEF_WINBITS 11
#define DEFLATE_DEF_WINBITS (-11)
#define DEFLATE_DEF_MEMLEVEL MAX_MEM_LEVEL
struct deflate_ctx {
@ -22,8 +22,10 @@ static void deflate_release_params(struct zcomp_params *params)
static int deflate_setup_params(struct zcomp_params *params)
{
if (params->level == ZCOMP_PARAM_NO_LEVEL)
if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = Z_DEFAULT_COMPRESSION;
if (params->deflate.winbits == ZCOMP_PARAM_NOT_SET)
params->deflate.winbits = DEFLATE_DEF_WINBITS;
return 0;
}
@ -57,13 +59,13 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx)
return -ENOMEM;
ctx->context = zctx;
sz = zlib_deflate_workspacesize(-DEFLATE_DEF_WINBITS, MAX_MEM_LEVEL);
sz = zlib_deflate_workspacesize(params->deflate.winbits, MAX_MEM_LEVEL);
zctx->cctx.workspace = vzalloc(sz);
if (!zctx->cctx.workspace)
goto error;
ret = zlib_deflateInit2(&zctx->cctx, params->level, Z_DEFLATED,
-DEFLATE_DEF_WINBITS, DEFLATE_DEF_MEMLEVEL,
params->deflate.winbits, DEFLATE_DEF_MEMLEVEL,
Z_DEFAULT_STRATEGY);
if (ret != Z_OK)
goto error;
@ -73,7 +75,7 @@ static int deflate_create(struct zcomp_params *params, struct zcomp_ctx *ctx)
if (!zctx->dctx.workspace)
goto error;
ret = zlib_inflateInit2(&zctx->dctx, -DEFLATE_DEF_WINBITS);
ret = zlib_inflateInit2(&zctx->dctx, params->deflate.winbits);
if (ret != Z_OK)
goto error;


@ -18,7 +18,7 @@ static void lz4_release_params(struct zcomp_params *params)
static int lz4_setup_params(struct zcomp_params *params)
{
if (params->level == ZCOMP_PARAM_NO_LEVEL)
if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = LZ4_ACCELERATION_DEFAULT;
return 0;


@ -18,7 +18,7 @@ static void lz4hc_release_params(struct zcomp_params *params)
static int lz4hc_setup_params(struct zcomp_params *params)
{
if (params->level == ZCOMP_PARAM_NO_LEVEL)
if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = LZ4HC_DEFAULT_CLEVEL;
return 0;


@ -58,7 +58,7 @@ static int zstd_setup_params(struct zcomp_params *params)
return -ENOMEM;
params->drv_data = zp;
if (params->level == ZCOMP_PARAM_NO_LEVEL)
if (params->level == ZCOMP_PARAM_NOT_SET)
params->level = zstd_default_clevel();
zp->cprm = zstd_get_params(params->level, PAGE_SIZE);


@ -5,7 +5,11 @@
#include <linux/mutex.h>
#define ZCOMP_PARAM_NO_LEVEL INT_MIN
#define ZCOMP_PARAM_NOT_SET INT_MIN
struct deflate_params {
s32 winbits;
};
/*
* Immutable driver (backend) parameters. The driver may attach private
@ -17,6 +21,9 @@ struct zcomp_params {
void *dict;
size_t dict_sz;
s32 level;
union {
struct deflate_params deflate;
};
void *drv_data;
};


@ -1276,13 +1276,15 @@ static void comp_params_reset(struct zram *zram, u32 prio)
struct zcomp_params *params = &zram->params[prio];
vfree(params->dict);
params->level = ZCOMP_PARAM_NO_LEVEL;
params->level = ZCOMP_PARAM_NOT_SET;
params->deflate.winbits = ZCOMP_PARAM_NOT_SET;
params->dict_sz = 0;
params->dict = NULL;
}
static int comp_params_store(struct zram *zram, u32 prio, s32 level,
const char *dict_path)
const char *dict_path,
struct deflate_params *deflate_params)
{
ssize_t sz = 0;
@ -1300,6 +1302,7 @@ static int comp_params_store(struct zram *zram, u32 prio, s32 level,
zram->params[prio].dict_sz = sz;
zram->params[prio].level = level;
zram->params[prio].deflate.winbits = deflate_params->winbits;
return 0;
}
@ -1308,11 +1311,14 @@ static ssize_t algorithm_params_store(struct device *dev,
const char *buf,
size_t len)
{
s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NO_LEVEL;
s32 prio = ZRAM_PRIMARY_COMP, level = ZCOMP_PARAM_NOT_SET;
char *args, *param, *val, *algo = NULL, *dict_path = NULL;
struct deflate_params deflate_params;
struct zram *zram = dev_to_zram(dev);
int ret;
deflate_params.winbits = ZCOMP_PARAM_NOT_SET;
args = skip_spaces(buf);
while (*args) {
args = next_arg(args, &param, &val);
@ -1343,6 +1349,13 @@ static ssize_t algorithm_params_store(struct device *dev,
dict_path = val;
continue;
}
if (!strcmp(param, "deflate.winbits")) {
ret = kstrtoint(val, 10, &deflate_params.winbits);
if (ret)
return ret;
continue;
}
}
/* Lookup priority by algorithm name */
@ -1364,7 +1377,7 @@ static ssize_t algorithm_params_store(struct device *dev,
if (prio < ZRAM_PRIMARY_COMP || prio >= ZRAM_MAX_COMPS)
return -EINVAL;
ret = comp_params_store(zram, prio, level, dict_path);
ret = comp_params_store(zram, prio, level, dict_path, &deflate_params);
return ret ? ret : len;
}


@ -913,7 +913,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct page *page, **pages = NULL;
struct page **pages = NULL;
struct folio *folio;
size_t written = 0;
u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
u32 frame_size = 1u << frame_bits;
@ -923,7 +924,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
u64 frame_vbo;
pgoff_t index;
bool frame_uptodate;
struct folio *folio;
if (frame_size < PAGE_SIZE) {
/*
@ -977,8 +977,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
pages_per_frame);
if (err) {
for (ip = 0; ip < pages_per_frame; ip++) {
page = pages[ip];
folio = page_folio(page);
folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@ -989,10 +988,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ip = off >> PAGE_SHIFT;
off = offset_in_page(valid);
for (; ip < pages_per_frame; ip++, off = 0) {
page = pages[ip];
folio = page_folio(page);
zero_user_segment(page, off, PAGE_SIZE);
flush_dcache_page(page);
folio = page_folio(pages[ip]);
folio_zero_segment(folio, off, PAGE_SIZE);
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
@ -1001,8 +999,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
page = pages[ip];
folio = page_folio(page);
folio = page_folio(pages[ip]);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);
@ -1046,8 +1043,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (err) {
for (ip = 0; ip < pages_per_frame;
ip++) {
page = pages[ip];
folio = page_folio(page);
folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@ -1065,10 +1061,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
size_t cp, tail = PAGE_SIZE - off;
page = pages[ip];
cp = copy_page_from_iter_atomic(page, off,
folio = page_folio(pages[ip]);
cp = copy_folio_from_iter_atomic(folio, off,
min(tail, bytes), from);
flush_dcache_page(page);
flush_dcache_folio(folio);
copied += cp;
bytes -= cp;
@ -1088,9 +1084,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
page = pages[ip];
ClearPageDirty(page);
folio = page_folio(page);
folio = page_folio(pages[ip]);
folio_clear_dirty(folio);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);


@ -58,6 +58,11 @@
* Defaults to flushing at tlb_end_vma() to reset the range; helps when
* there's large holes between the VMAs.
*
* - tlb_free_vmas()
*
* tlb_free_vmas() marks the start of unlinking of one or more vmas
* and freeing page-tables.
*
* - tlb_remove_table()
*
* tlb_remove_table() is the basic primitive to free page-table directories
@ -464,7 +469,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
*/
tlb->vma_huge = is_vm_hugetlb_page(vma);
tlb->vma_exec = !!(vma->vm_flags & VM_EXEC);
tlb->vma_pfn = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
/*
* Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma
* in the tracked range, see tlb_free_vmas().
*/
tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP));
}
static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@ -547,23 +557,39 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *
}
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
return;
/*
* Do a TLB flush and reset the range at VMA boundaries; this avoids
* the ranges growing with the unused space between consecutive VMAs,
* but also the mmu_gather::vma_* flags from tlb_start_vma() rely on
* this.
*/
tlb_flush_mmu_tlbonly(tlb);
}
static inline void tlb_free_vmas(struct mmu_gather *tlb)
{
if (tlb->fullmm)
return;
/*
* VM_PFNMAP is more fragile because the core mm will not track the
* page mapcount -- there might not be page-frames for these PFNs after
* all. Force flush TLBs for such ranges to avoid munmap() vs
* unmap_mapping_range() races.
* page mapcount -- there might not be page-frames for these PFNs
* after all.
*
* Specifically() there is a race between munmap() and
* unmap_mapping_range(), where munmap() will unlink the VMA, such
* that unmap_mapping_range() will no longer observe the VMA and
* no-op, without observing the TLBI, returning prematurely.
*
* So if we're about to unlink such a VMA, and we have pending
* TLBI for such a vma, flush things now.
*/
if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) {
/*
* Do a TLB flush and reset the range at VMA boundaries; this avoids
* the ranges growing with the unused space between consecutive VMAs.
*/
if (tlb->vma_pfn)
tlb_flush_mmu_tlbonly(tlb);
}
}
/*


@ -113,6 +113,12 @@ struct mem_cgroup_per_node {
CACHELINE_PADDING(_pad2_);
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* slab stats for nmi context */
atomic_t slab_reclaimable;
atomic_t slab_unreclaimable;
#endif
};
struct mem_cgroup_threshold {
@ -236,6 +242,10 @@ struct mem_cgroup {
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* MEMCG_KMEM for nmi context */
atomic_t kmem_stat;
#endif
/*
* Hint of reclaim pressure for socket memroy management. Note
* that this indicator should NOT be used in legacy cgroup mode


@ -1276,9 +1276,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
* the page's disk buffers. PG_private must be set to tell the VM to call
* into the filesystem to release these pages.
*
* A page may belong to an inode's memory mapping. In this case, page->mapping
* is the pointer to the inode, and page->index is the file offset of the page,
* in units of PAGE_SIZE.
* A folio may belong to an inode's memory mapping. In this case,
* folio->mapping points to the inode, and folio->index is the file
* offset of the folio, in units of PAGE_SIZE.
*
* If pagecache pages are not associated with an inode, they are said to be
* anonymous pages. These may become associated with the swapcache, and in that


@ -108,7 +108,7 @@ struct page {
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
union {
pgoff_t index; /* Our offset within mapping. */
pgoff_t __folio_index; /* Our offset within mapping. */
unsigned long share; /* share count for fsdax */
};
/**
@ -489,7 +489,7 @@ FOLIO_MATCH(flags, flags);
FOLIO_MATCH(lru, lru);
FOLIO_MATCH(mapping, mapping);
FOLIO_MATCH(compound_head, lru);
FOLIO_MATCH(index, index);
FOLIO_MATCH(__folio_index, index);
FOLIO_MATCH(private, private);
FOLIO_MATCH(_mapcount, _mapcount);
FOLIO_MATCH(_refcount, _refcount);
@ -590,7 +590,7 @@ TABLE_MATCH(flags, __page_flags);
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
TABLE_MATCH(index, pt_index);
TABLE_MATCH(__folio_index, pt_index);
TABLE_MATCH(rcu_head, pt_rcu_head);
TABLE_MATCH(page_type, __page_type);
TABLE_MATCH(_refcount, __page_refcount);


@ -654,9 +654,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
static inline void mmu_notifier_synchronize(void)
{


@ -37,6 +37,22 @@
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
/* Defines the order for the number of pages that have a migrate type. */
#ifndef CONFIG_PAGE_BLOCK_ORDER
#define PAGE_BLOCK_ORDER MAX_PAGE_ORDER
#else
#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER
#endif /* CONFIG_PAGE_BLOCK_ORDER */
/*
* The MAX_PAGE_ORDER, which defines the max order of pages to be allocated
* by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER,
* which defines the order for the number of pages that can have a migrate type
*/
#if (PAGE_BLOCK_ORDER > MAX_PAGE_ORDER)
#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_ORDER
#endif
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
* costly to service. That is between allocation orders which should


@ -41,18 +41,18 @@ extern unsigned int pageblock_order;
* Huge pages are a constant size, but don't exceed the maximum allocation
* granularity.
*/
#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
#define pageblock_order MIN_T(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
#define pageblock_order MIN_T(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER)
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
#define pageblock_order MAX_PAGE_ORDER
/* If huge pages are not used, group by PAGE_BLOCK_ORDER */
#define pageblock_order PAGE_BLOCK_ORDER
#endif /* CONFIG_HUGETLB_PAGE */


@ -548,6 +548,10 @@ struct sched_statistics {
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
#ifdef CONFIG_NUMA_BALANCING
u64 numa_task_migrated;
u64 numa_task_swapped;
#endif
u64 nr_wakeups;
u64 nr_wakeups_sync;


@ -182,8 +182,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
return ret;
}
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
@ -193,6 +191,8 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i);
size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
size_t bytes, struct iov_iter *i);
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
@ -210,12 +210,6 @@ static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
return copy_page_from_iter(&folio->page, offset, bytes, i);
}
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
size_t offset, size_t bytes, struct iov_iter *i)
{
return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
}
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);


@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
NUMA_TASK_MIGRATE,
NUMA_TASK_SWAP,
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,


@ -992,6 +992,20 @@ config MEMCG
help
Provides control over the memory footprint of tasks in a cgroup.
config MEMCG_NMI_UNSAFE
bool
depends on MEMCG
depends on HAVE_NMI
depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !ARCH_HAVE_NMI_SAFE_CMPXCHG
default y
config MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
bool
depends on MEMCG
depends on HAVE_NMI
depends on !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && ARCH_HAVE_NMI_SAFE_CMPXCHG
default y
config MEMCG_V1
bool "Legacy cgroup v1 memory controller"
depends on MEMCG


@ -531,7 +531,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* For shared mappings (when @fshared), the key is:
*
* ( inode->i_sequence, page->index, offset_within_page )
* ( inode->i_sequence, page offset within mapping, offset_within_page )
*
* [ also see get_inode_sequence_number() ]
*


@ -3362,6 +3362,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
__schedstat_inc(p->stats.numa_task_swapped);
count_vm_numa_event(NUMA_TASK_SWAP);
count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@ -7930,8 +7934,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
__schedstat_inc(p->stats.numa_task_migrated);
count_vm_numa_event(NUMA_TASK_MIGRATE);
count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}


@ -1210,6 +1210,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
#ifdef CONFIG_NUMA_BALANCING
P_SCHEDSTAT(numa_task_migrated);
P_SCHEDSTAT(numa_task_swapped);
#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);


@ -2273,7 +2273,8 @@ static bool task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
!cur->mm))
cur = NULL;
/*


@ -457,38 +457,35 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_zero);
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
size_t bytes, struct iov_iter *i)
{
size_t n, copied = 0;
bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) ||
PageHighMem(page);
if (!page_copy_sane(page, offset, bytes))
if (!page_copy_sane(&folio->page, offset, bytes))
return 0;
if (WARN_ON_ONCE(!i->data_source))
return 0;
do {
char *p;
char *to = kmap_local_folio(folio, offset);
n = bytes - copied;
if (uses_kmap) {
page += offset / PAGE_SIZE;
offset %= PAGE_SIZE;
n = min_t(size_t, n, PAGE_SIZE - offset);
}
if (folio_test_partial_kmap(folio) &&
n > PAGE_SIZE - offset_in_page(offset))
n = PAGE_SIZE - offset_in_page(offset);
p = kmap_atomic(page) + offset;
n = __copy_from_iter(p, n, i);
kunmap_atomic(p);
pagefault_disable();
n = __copy_from_iter(to, n, i);
pagefault_enable();
kunmap_local(to);
copied += n;
offset += n;
} while (uses_kmap && copied != bytes && n > 0);
} while (copied != bytes && n > 0);
return copied;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);
EXPORT_SYMBOL(copy_folio_from_iter_atomic);
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{


@ -993,6 +993,40 @@ config CMA_AREAS
If unsure, leave the default value "8" in UMA and "20" in NUMA.
#
# Select this config option from the architecture Kconfig, if available, to set
# the max page order for physically contiguous allocations.
#
config ARCH_FORCE_MAX_ORDER
int
#
# When ARCH_FORCE_MAX_ORDER is not defined,
# the default page block order is MAX_PAGE_ORDER (10) as per
# include/linux/mmzone.h.
#
config PAGE_BLOCK_ORDER
int "Page Block Order"
range 1 10 if ARCH_FORCE_MAX_ORDER = 0
default 10 if ARCH_FORCE_MAX_ORDER = 0
range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
default ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
help
The page block order refers to the power of two number of pages that
are physically contiguous and can have a migrate type associated to
them. The maximum size of the page block order is limited by
ARCH_FORCE_MAX_ORDER.
This config allows overriding the default page block order when the
page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
or MAX_PAGE_ORDER.
Reducing pageblock order can negatively impact THP generation
success rate. If your workloads uses THP heavily, please use this
option with caution.
Don't change if unsure.
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS


@ -4,6 +4,7 @@ menu "Data Access Monitoring"
config DAMON
bool "DAMON: Data Access Monitoring Framework"
default y
help
This builds a framework that allows kernel subsystems to monitor
access frequency of each memory region. The information can be useful
@ -28,6 +29,7 @@ config DAMON_VADDR
bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
default DAMON
help
This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
@ -36,6 +38,7 @@ config DAMON_PADDR
bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
default DAMON
help
This builds the default data access monitoring operations for DAMON
that works for the physical address space.
@ -55,6 +58,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_SYSFS
bool "DAMON sysfs interface"
depends on DAMON && SYSFS
default DAMON
help
This builds the sysfs interface for DAMON. The user space can use
the interface for arbitrary data access monitoring.


@ -1093,9 +1093,17 @@ static int damon_commit_targets(
if (err)
return err;
} else {
struct damos *s;
if (damon_target_has_pid(dst))
put_pid(dst_target->pid);
damon_destroy_target(dst_target);
damon_for_each_scheme(s, dst) {
if (s->quota.charge_target_from == dst_target) {
s->quota.charge_target_from = NULL;
s->quota.charge_addr_from = 0;
}
}
}
}


@ -142,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping,
xas_init_marks(&xas);
folio->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
/* Leave folio->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
@ -949,7 +949,7 @@ unlock:
return 0;
error:
folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
/* Leave folio->index set: truncation relies upon it */
folio_put_refs(folio, nr);
return xas_error(&xas);
}


@ -3299,7 +3299,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
* include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock() here as we also want to block IPIs
* that come from THPs splitting.
* that come from callers of tlb_remove_table_sync_one().
*/
local_irq_save(flags);
gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);


@ -3741,7 +3741,7 @@ static void __init report_hugepages(void)
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
buf, h->free_huge_pages);
buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
buf, nrinvalid, nrinvalid > 1 ? "s" : "");


@ -548,19 +548,6 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
static bool is_refcount_suitable(struct folio *folio)
{
int expected_refcount = folio_mapcount(folio);
if (!folio_test_anon(folio) || folio_test_swapcache(folio))
expected_refcount += folio_nr_pages(folio);
if (folio_test_private(folio))
expected_refcount++;
return folio_ref_count(folio) == expected_refcount;
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
@ -652,7 +639,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
if (!is_refcount_suitable(folio)) {
if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@ -1402,7 +1389,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
if (!is_refcount_suitable(folio)) {
if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@ -2293,6 +2280,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
if (!folio_try_get(folio)) {
xas_reset(&xas);
continue;
}
if (unlikely(folio != xas_reload(&xas))) {
folio_put(folio);
xas_reset(&xas);
continue;
}
if (folio_order(folio) == HPAGE_PMD_ORDER &&
folio->index == start) {
/* Maybe PMD-mapped */
@ -2303,23 +2301,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* it's safe to skip LRU and refcount checks before
* returning.
*/
folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
folio_put(folio);
break;
}
if (!is_refcount_suitable(folio)) {
if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
folio_put(folio);
break;
}
@ -2331,6 +2333,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
*/
present += folio_nr_pages(folio);
folio_put(folio);
if (need_resched()) {
xas_pause(&xas);


@ -474,6 +474,8 @@ static const unsigned int memcg_vm_event_stat[] = {
NUMA_PAGE_MIGRATE,
NUMA_PTE_UPDATES,
NUMA_HINT_FAULTS,
NUMA_TASK_MIGRATE,
NUMA_TASK_SWAP,
#endif
};
@ -531,7 +533,7 @@ struct memcg_vmstats {
unsigned long events_pending[NR_MEMCG_EVENTS];
/* Stats updates since the last flush */
atomic64_t stats_updates;
atomic_t stats_updates;
};
/*
@ -557,7 +559,7 @@ static u64 flush_last_time;
static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
return atomic64_read(&vmstats->stats_updates) >
return atomic_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
@ -571,7 +573,9 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
if (!val)
return;
css_rstat_updated(&memcg->css, cpu);
/* TODO: add to cgroup update tree once it is nmi-safe. */
if (!in_nmi())
css_rstat_updated(&memcg->css, cpu);
statc_pcpu = memcg->vmstats_percpu;
for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
statc = this_cpu_ptr(statc_pcpu);
@ -589,7 +593,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
continue;
stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0);
atomic64_add(stats_updates, &statc->vmstats->stats_updates);
atomic_add(stats_updates, &statc->vmstats->stats_updates);
}
}
@ -597,7 +601,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
{
bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates),
trace_memcg_flush_stats(memcg, atomic_read(&memcg->vmstats->stats_updates),
force, needs_flush);
if (!force && !needs_flush)
@ -2513,17 +2517,47 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct lruvec *lruvec;
if (likely(!in_nmi())) {
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_memcg_lruvec_state(lruvec, idx, nr);
} else {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
/* TODO: add to cgroup update tree once it is nmi-safe. */
if (idx == NR_SLAB_RECLAIMABLE_B)
atomic_add(nr, &pn->slab_reclaimable);
else
atomic_add(nr, &pn->slab_unreclaimable);
}
}
#else
static inline void account_slab_nmi_safe(struct mem_cgroup *memcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_memcg_lruvec_state(lruvec, idx, nr);
}
#endif
static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_memcg_lruvec_state(lruvec, idx, nr);
account_slab_nmi_safe(memcg, pgdat, idx, nr);
rcu_read_unlock();
}
@ -2648,6 +2682,9 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
return NULL;
if (in_task()) {
memcg = current->active_memcg;
if (unlikely(memcg))
@ -2710,6 +2747,23 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
{
if (likely(!in_nmi())) {
mod_memcg_state(memcg, MEMCG_KMEM, val);
} else {
/* TODO: add to cgroup update tree once it is nmi-safe. */
atomic_add(val, &memcg->kmem_stat);
}
}
#else
static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
{
mod_memcg_state(memcg, MEMCG_KMEM, val);
}
#endif
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@ -2722,7 +2776,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@ -2750,7 +2804,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@ -3961,6 +4015,53 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
int cpu)
{
int nid;
if (atomic_read(&memcg->kmem_stat)) {
int kmem = atomic_xchg(&memcg->kmem_stat, 0);
int index = memcg_stats_index(MEMCG_KMEM);
memcg->vmstats->state[index] += kmem;
if (parent)
parent->vmstats->state_pending[index] += kmem;
}
for_each_node_state(nid, N_MEMORY) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
struct lruvec_stats *lstats = pn->lruvec_stats;
struct lruvec_stats *plstats = NULL;
if (parent)
plstats = parent->nodeinfo[nid]->lruvec_stats;
if (atomic_read(&pn->slab_reclaimable)) {
int slab = atomic_xchg(&pn->slab_reclaimable, 0);
int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
lstats->state[index] += slab;
if (plstats)
plstats->state_pending[index] += slab;
}
if (atomic_read(&pn->slab_unreclaimable)) {
int slab = atomic_xchg(&pn->slab_unreclaimable, 0);
int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
lstats->state[index] += slab;
if (plstats)
plstats->state_pending[index] += slab;
}
}
}
#else
static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
int cpu)
{}
#endif
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@ -3969,6 +4070,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
flush_nmi_stats(memcg, parent, cpu);
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
@ -4018,8 +4121,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
}
WRITE_ONCE(statc->stats_updates, 0);
/* We are in a per-cpu loop here, only do the atomic write once */
if (atomic64_read(&memcg->vmstats->stats_updates))
atomic64_set(&memcg->vmstats->stats_updates, 0);
if (atomic_read(&memcg->vmstats->stats_updates))
atomic_set(&memcg->vmstats->stats_updates, 0);
}
static void mem_cgroup_fork(struct task_struct *task)


@ -358,6 +358,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
tlb_free_vmas(tlb);
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@ -4668,8 +4670,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
* page->index of !PageKSM() pages would be nonlinear inside the
* anon VMA -- PageKSM() is lost on actual swapout.
* folio->index of non-ksm folios would be nonlinear inside the
* anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {


@ -1509,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
unsigned int order = MAX_PAGE_ORDER;
unsigned int order = PAGE_BLOCK_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)


@ -424,6 +424,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
tlb->vma_pfn = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);


@ -2565,11 +2565,11 @@ struct folio *writeback_iter(struct address_space *mapping,
if (!folio) {
/*
* To avoid deadlocks between range_cyclic writeback and callers
* that hold pages in PageWriteback to aggregate I/O until
* that hold folios in writeback to aggregate I/O until
* the writeback iteration finishes, we do not loop back to the
* start of the file. Doing so causes a page lock/page
* start of the file. Doing so causes a folio lock/folio
* writeback access order inversion - we should only ever lock
* multiple pages in ascending page->index order, and looping
* multiple folios in ascending folio->index order, and looping
* back to the start of the file violates that rule and causes
* deadlocks.
*/


@ -1446,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
if (!xa_is_value(folio))
continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@ -1505,6 +1503,7 @@ int shmem_unuse(unsigned int type)
return 0;
mutex_lock(&shmem_swaplist_mutex);
start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@ -1523,13 +1522,15 @@ int shmem_unuse(unsigned int type)
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
next = list_next_entry(info, swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
if (list_empty(&info->swaplist))
goto start_over;
next = list_next_entry(info, swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
}
mutex_unlock(&shmem_swaplist_mutex);
@ -1643,8 +1644,8 @@ try_split:
BUG_ON(folio_mapped(folio));
return swap_writeout(folio, wbc);
}
list_del_init(&info->swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
if (nr_pages > 1)
goto try_split;
@ -2331,6 +2332,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
folio_put(folio);
folio = NULL;
error = split_order;
goto failed;
}
@ -5805,12 +5808,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
if (shmem_acct_size(flags, size))
return ERR_PTR(-ENOMEM);
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
S_IFREG | S_IRWXUGO, 0, flags);
if (IS_ERR(inode)) {


@ -425,7 +425,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
/* We rely upon deletion not changing folio->index */
if (xa_is_value(folio))
continue;


@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
"numa_task_migrated",
"numa_task_swapped",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",


@ -54,8 +54,8 @@ struct zpdesc {
ZPDESC_MATCH(flags, flags);
ZPDESC_MATCH(lru, lru);
ZPDESC_MATCH(mapping, movable_ops);
ZPDESC_MATCH(index, next);
ZPDESC_MATCH(index, handle);
ZPDESC_MATCH(__folio_index, next);
ZPDESC_MATCH(__folio_index, handle);
ZPDESC_MATCH(private, zspage);
ZPDESC_MATCH(page_type, first_obj_offset);
ZPDESC_MATCH(_refcount, _refcount);


@ -492,6 +492,7 @@ $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs \
ifneq ($(or $(CONFIG_X86_64),$(CONFIG_X86_32)),)
$(obj)/core.o: scripts/target.json
endif
KCOV_INSTRUMENT_core.o := n
$(obj)/compiler_builtins.o: private skip_gendwarfksyms = 1
$(obj)/compiler_builtins.o: private rustc_objcopy = -w -W '__*'


@ -10,7 +10,6 @@
//! control what happens when userspace reads or writes to that region of memory.
//!
//! C header: [`include/linux/mm.h`](srctree/include/linux/mm.h)
#![cfg(CONFIG_MMU)]
use crate::{
bindings,
@ -21,6 +20,10 @@ use core::{ops::Deref, ptr::NonNull};
pub mod virt;
use virt::VmaRef;
#[cfg(CONFIG_MMU)]
pub use mmput_async::MmWithUserAsync;
mod mmput_async;
/// A wrapper for the kernel's `struct mm_struct`.
///
/// This represents the address space of a userspace process, so each process has one `Mm`
@ -111,50 +114,6 @@ impl Deref for MmWithUser {
}
}
/// A wrapper for the kernel's `struct mm_struct`.
///
/// This type is identical to `MmWithUser` except that it uses `mmput_async` when dropping a
/// refcount. This means that the destructor of `ARef<MmWithUserAsync>` is safe to call in atomic
/// context.
///
/// # Invariants
///
/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero.
#[repr(transparent)]
pub struct MmWithUserAsync {
mm: MmWithUser,
}
// SAFETY: It is safe to call `mmput_async` on another thread than where `mmget` was called.
unsafe impl Send for MmWithUserAsync {}
// SAFETY: All methods on `MmWithUserAsync` can be called in parallel from several threads.
unsafe impl Sync for MmWithUserAsync {}
// SAFETY: By the type invariants, this type is always refcounted.
unsafe impl AlwaysRefCounted for MmWithUserAsync {
#[inline]
fn inc_ref(&self) {
// SAFETY: The pointer is valid since self is a reference.
unsafe { bindings::mmget(self.as_raw()) };
}
#[inline]
unsafe fn dec_ref(obj: NonNull<Self>) {
// SAFETY: The caller is giving up their refcount.
unsafe { bindings::mmput_async(obj.cast().as_ptr()) };
}
}
// Make all `MmWithUser` methods available on `MmWithUserAsync`.
impl Deref for MmWithUserAsync {
type Target = MmWithUser;
#[inline]
fn deref(&self) -> &MmWithUser {
&self.mm
}
}
// These methods are safe to call even if `mm_users` is zero.
impl Mm {
/// Returns a raw pointer to the inner `mm_struct`.
@ -206,13 +165,6 @@ impl MmWithUser {
unsafe { &*ptr.cast() }
}
/// Use `mmput_async` when dropping this refcount.
#[inline]
pub fn into_mmput_async(me: ARef<MmWithUser>) -> ARef<MmWithUserAsync> {
// SAFETY: The layouts and invariants are compatible.
unsafe { ARef::from_raw(ARef::into_raw(me).cast()) }
}
/// Attempt to access a vma using the vma read lock.
///
/// This is an optimistic trylock operation, so it may fail if there is contention. In that


@ -0,0 +1,68 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2024 Google LLC.
//! Version of `MmWithUser` using `mmput_async`.
//!
//! This is a separate file from `mm.rs` due to the dependency on `CONFIG_MMU=y`.
#![cfg(CONFIG_MMU)]
use crate::{
bindings,
mm::MmWithUser,
types::{ARef, AlwaysRefCounted},
};
use core::{ops::Deref, ptr::NonNull};
/// A wrapper for the kernel's `struct mm_struct`.
///
/// This type is identical to `MmWithUser` except that it uses `mmput_async` when dropping a
/// refcount. This means that the destructor of `ARef<MmWithUserAsync>` is safe to call in atomic
/// context.
///
/// # Invariants
///
/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero.
#[repr(transparent)]
pub struct MmWithUserAsync {
mm: MmWithUser,
}
// SAFETY: It is safe to call `mmput_async` on another thread than where `mmget` was called.
unsafe impl Send for MmWithUserAsync {}
// SAFETY: All methods on `MmWithUserAsync` can be called in parallel from several threads.
unsafe impl Sync for MmWithUserAsync {}
// SAFETY: By the type invariants, this type is always refcounted.
unsafe impl AlwaysRefCounted for MmWithUserAsync {
#[inline]
fn inc_ref(&self) {
// SAFETY: The pointer is valid since self is a reference.
unsafe { bindings::mmget(self.as_raw()) };
}
#[inline]
unsafe fn dec_ref(obj: NonNull<Self>) {
// SAFETY: The caller is giving up their refcount.
unsafe { bindings::mmput_async(obj.cast().as_ptr()) };
}
}
// Make all `MmWithUser` methods available on `MmWithUserAsync`.
impl Deref for MmWithUserAsync {
type Target = MmWithUser;
#[inline]
fn deref(&self) -> &MmWithUser {
&self.mm
}
}
impl MmWithUser {
/// Use `mmput_async` when dropping this refcount.
#[inline]
pub fn into_mmput_async(me: ARef<MmWithUser>) -> ARef<MmWithUserAsync> {
// SAFETY: The layouts and invariants are compatible.
unsafe { ARef::from_raw(ARef::into_raw(me).cast()) }
}
}


@ -2,4 +2,10 @@
kcov-flags-y += -fsanitize-coverage=trace-pc
kcov-flags-$(CONFIG_KCOV_ENABLE_COMPARISONS) += -fsanitize-coverage=trace-cmp
kcov-rflags-y += -Cpasses=sancov-module
kcov-rflags-y += -Cllvm-args=-sanitizer-coverage-level=3
kcov-rflags-y += -Cllvm-args=-sanitizer-coverage-trace-pc
kcov-rflags-$(CONFIG_KCOV_ENABLE_COMPARISONS) += -Cllvm-args=-sanitizer-coverage-trace-compares
export CFLAGS_KCOV := $(kcov-flags-y)
export RUSTFLAGS_KCOV := $(kcov-rflags-y)


@ -169,6 +169,9 @@ ifeq ($(CONFIG_KCOV),y)
_c_flags += $(if $(patsubst n%,, \
$(KCOV_INSTRUMENT_$(target-stem).o)$(KCOV_INSTRUMENT)$(if $(is-kernel-object),$(CONFIG_KCOV_INSTRUMENT_ALL))), \
$(CFLAGS_KCOV))
_rust_flags += $(if $(patsubst n%,, \
$(KCOV_INSTRUMENT_$(target-stem).o)$(KCOV_INSTRUMENT)$(if $(is-kernel-object),$(CONFIG_KCOV_INSTRUMENT_ALL))), \
$(RUSTFLAGS_KCOV))
endif
#


@ -15,6 +15,10 @@ if sysfs_root is None:
print('Seems sysfs not mounted?')
exit(ksft_skip)
if not os.path.exists(sysfs_root):
print('Seems DAMON disabled?')
exit(ksft_skip)
def write_file(path, string):
"Returns error string if failed, or None otherwise"
string = '%s' % string


@ -112,9 +112,12 @@ struct comm_pipes {
static int setup_comm_pipes(struct comm_pipes *comm_pipes)
{
if (pipe(comm_pipes->child_ready) < 0)
if (pipe(comm_pipes->child_ready) < 0) {
ksft_perror("pipe()");
return -errno;
}
if (pipe(comm_pipes->parent_ready) < 0) {
ksft_perror("pipe()");
close(comm_pipes->child_ready[0]);
close(comm_pipes->child_ready[1]);
return -errno;
@ -207,13 +210,14 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
ksft_test_result_fail("pipe() failed\n");
log_test_result(KSFT_FAIL);
return;
}
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
exit(fn(mem, size, &comm_pipes));
@ -228,9 +232,18 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
* write-faults by directly mapping pages writable.
*/
ret = mprotect(mem, size, PROT_READ);
ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
}
ret = mprotect(mem, size, PROT_READ|PROT_WRITE);
if (ret) {
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@ -248,16 +261,16 @@ static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
ret = -EINVAL;
if (!ret) {
ksft_test_result_pass("No leak from parent into child\n");
log_test_result(KSFT_PASS);
} else if (xfail) {
/*
* With hugetlb, some vmsplice() tests are currently expected to
* fail because (a) harder to fix and (b) nobody really cares.
* Flag them as expected failure for now.
*/
ksft_test_result_xfail("Leak from parent into child\n");
log_test_result(KSFT_XFAIL);
} else {
ksft_test_result_fail("Leak from parent into child\n");
log_test_result(KSFT_FAIL);
}
close_comm_pipes:
close_comm_pipes(&comm_pipes);
@ -306,26 +319,29 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
ksft_test_result_fail("pipe() failed\n");
log_test_result(KSFT_FAIL);
goto free;
}
if (pipe(fds) < 0) {
ksft_test_result_fail("pipe() failed\n");
ksft_perror("pipe() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
if (before_fork) {
transferred = vmsplice(fds[1], &iov, 1, 0);
if (transferred <= 0) {
ksft_test_result_fail("vmsplice() failed\n");
ksft_print_msg("vmsplice() failed\n");
log_test_result(KSFT_FAIL);
goto close_pipe;
}
}
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed\n");
log_test_result(KSFT_FAIL);
goto close_pipe;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@ -339,7 +355,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
if (!before_fork) {
transferred = vmsplice(fds[1], &iov, 1, 0);
if (transferred <= 0) {
ksft_test_result_fail("vmsplice() failed\n");
ksft_perror("vmsplice() failed");
log_test_result(KSFT_FAIL);
wait(&ret);
goto close_pipe;
}
@ -348,7 +365,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
;
if (munmap(mem, size) < 0) {
ksft_test_result_fail("munmap() failed\n");
ksft_perror("munmap() failed");
log_test_result(KSFT_FAIL);
goto close_pipe;
}
write(comm_pipes.parent_ready[1], "0", 1);
@ -356,7 +374,8 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
/* Wait until the child is done writing. */
wait(&ret);
if (!WIFEXITED(ret)) {
ksft_test_result_fail("wait() failed\n");
ksft_perror("wait() failed");
log_test_result(KSFT_FAIL);
goto close_pipe;
}
@ -364,22 +383,23 @@ static void do_test_vmsplice_in_parent(char *mem, size_t size,
for (total = 0; total < transferred; total += cur) {
cur = read(fds[0], new + total, transferred - total);
if (cur < 0) {
ksft_test_result_fail("read() failed\n");
ksft_perror("read() failed");
log_test_result(KSFT_FAIL);
goto close_pipe;
}
}
if (!memcmp(old, new, transferred)) {
ksft_test_result_pass("No leak from child into parent\n");
log_test_result(KSFT_PASS);
} else if (xfail) {
/*
* With hugetlb, some vmsplice() tests are currently expected to
* fail because (a) harder to fix and (b) nobody really cares.
* Flag them as expected failure for now.
*/
ksft_test_result_xfail("Leak from child into parent\n");
log_test_result(KSFT_XFAIL);
} else {
ksft_test_result_fail("Leak from child into parent\n");
log_test_result(KSFT_FAIL);
}
close_pipe:
close(fds[0]);
@ -416,13 +436,14 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
ksft_test_result_fail("pipe() failed\n");
log_test_result(KSFT_FAIL);
return;
}
file = tmpfile();
if (!file) {
ksft_test_result_fail("tmpfile() failed\n");
ksft_perror("tmpfile() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
fd = fileno(file);
@ -430,14 +451,16 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
tmp = malloc(size);
if (!tmp) {
ksft_test_result_fail("malloc() failed\n");
ksft_print_msg("malloc() failed\n");
log_test_result(KSFT_FAIL);
goto close_file;
}
/* Skip on errors, as we might just lack kernel support. */
ret = io_uring_queue_init(1, &ring, 0);
if (ret < 0) {
ksft_test_result_skip("io_uring_queue_init() failed\n");
ksft_print_msg("io_uring_queue_init() failed\n");
log_test_result(KSFT_SKIP);
goto free_tmp;
}
@ -452,7 +475,8 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
iov.iov_len = size;
ret = io_uring_register_buffers(&ring, &iov, 1);
if (ret) {
ksft_test_result_skip("io_uring_register_buffers() failed\n");
ksft_print_msg("io_uring_register_buffers() failed\n");
log_test_result(KSFT_SKIP);
goto queue_exit;
}
@ -463,7 +487,8 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
*/
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed");
log_test_result(KSFT_FAIL);
goto unregister_buffers;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@ -483,10 +508,17 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
* if the page is mapped R/O vs. R/W).
*/
ret = mprotect(mem, size, PROT_READ);
clear_softdirty();
ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto unregister_buffers;
}
clear_softdirty();
ret = mprotect(mem, size, PROT_READ | PROT_WRITE);
if (ret) {
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto unregister_buffers;
}
}
@ -498,25 +530,29 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
memset(mem, 0xff, size);
sqe = io_uring_get_sqe(&ring);
if (!sqe) {
ksft_test_result_fail("io_uring_get_sqe() failed\n");
ksft_print_msg("io_uring_get_sqe() failed\n");
log_test_result(KSFT_FAIL);
goto quit_child;
}
io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
ret = io_uring_submit(&ring);
if (ret < 0) {
ksft_test_result_fail("io_uring_submit() failed\n");
ksft_print_msg("io_uring_submit() failed\n");
log_test_result(KSFT_FAIL);
goto quit_child;
}
ret = io_uring_wait_cqe(&ring, &cqe);
if (ret < 0) {
ksft_test_result_fail("io_uring_wait_cqe() failed\n");
ksft_print_msg("io_uring_wait_cqe() failed\n");
log_test_result(KSFT_FAIL);
goto quit_child;
}
if (cqe->res != size) {
ksft_test_result_fail("write_fixed failed\n");
ksft_print_msg("write_fixed failed\n");
log_test_result(KSFT_FAIL);
goto quit_child;
}
io_uring_cqe_seen(&ring, cqe);
@ -526,15 +562,18 @@ static void do_test_iouring(char *mem, size_t size, bool use_fork)
while (total < size) {
cur = pread(fd, tmp + total, size - total, total);
if (cur < 0) {
ksft_test_result_fail("pread() failed\n");
ksft_print_msg("pread() failed\n");
log_test_result(KSFT_FAIL);
goto quit_child;
}
total += cur;
}
/* Finally, check if we read what we expected. */
ksft_test_result(!memcmp(mem, tmp, size),
"Longterm R/W pin is reliable\n");
if (!memcmp(mem, tmp, size))
log_test_result(KSFT_PASS);
else
log_test_result(KSFT_FAIL);
quit_child:
if (use_fork) {
@ -582,19 +621,21 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
int ret;
if (gup_fd < 0) {
ksft_test_result_skip("gup_test not available\n");
ksft_print_msg("gup_test not available\n");
log_test_result(KSFT_SKIP);
return;
}
tmp = malloc(size);
if (!tmp) {
ksft_test_result_fail("malloc() failed\n");
ksft_print_msg("malloc() failed\n");
log_test_result(KSFT_FAIL);
return;
}
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
ksft_test_result_fail("pipe() failed\n");
log_test_result(KSFT_FAIL);
goto free_tmp;
}
@ -609,7 +650,8 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
*/
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
write(comm_pipes.child_ready[1], "0", 1);
@ -646,7 +688,8 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
clear_softdirty();
ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@ -661,9 +704,11 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
if (ret) {
if (errno == EINVAL)
ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
ret = KSFT_SKIP;
else
ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
ret = KSFT_FAIL;
ksft_perror("PIN_LONGTERM_TEST_START failed");
log_test_result(ret);
goto wait;
}
@ -676,22 +721,26 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
*/
tmp_val = (__u64)(uintptr_t)tmp;
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
if (ret)
ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
else
ksft_test_result(!memcmp(mem, tmp, size),
"Longterm R/O pin is reliable\n");
if (ret) {
ksft_perror("PIN_LONGTERM_TEST_READ failed");
log_test_result(KSFT_FAIL);
} else {
if (!memcmp(mem, tmp, size))
log_test_result(KSFT_PASS);
else
log_test_result(KSFT_FAIL);
}
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
if (ret)
ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
ksft_perror("PIN_LONGTERM_TEST_STOP failed");
wait:
switch (test) {
case RO_PIN_TEST_SHARED:
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
if (!WIFEXITED(ret))
ksft_print_msg("[INFO] wait() failed\n");
ksft_perror("wait() failed");
break;
default:
break;
@ -746,14 +795,16 @@ static void do_run_with_base_page(test_fn fn, bool swapout)
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
return;
}
ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
/* Ignore if not around on a kernel. */
if (ret && errno != EINVAL) {
ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
ksft_perror("MADV_NOHUGEPAGE failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -763,7 +814,8 @@ static void do_run_with_base_page(test_fn fn, bool swapout)
if (swapout) {
madvise(mem, pagesize, MADV_PAGEOUT);
if (!pagemap_is_swapped(pagemap_fd, mem)) {
ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
log_test_result(KSFT_SKIP);
goto munmap;
}
}
@ -775,13 +827,13 @@ munmap:
static void run_with_base_page(test_fn fn, const char *desc)
{
ksft_print_msg("[RUN] %s ... with base page\n", desc);
log_test_start("%s ... with base page", desc);
do_run_with_base_page(fn, false);
}
static void run_with_base_page_swap(test_fn fn, const char *desc)
{
ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
log_test_start("%s ... with swapped out base page", desc);
do_run_with_base_page(fn, true);
}
@ -807,7 +859,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
return;
}
@ -816,7 +869,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
ret = madvise(mem, thpsize, MADV_HUGEPAGE);
if (ret) {
ksft_test_result_fail("MADV_HUGEPAGE failed\n");
ksft_perror("MADV_HUGEPAGE failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -826,7 +880,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
mem[0] = 1;
if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) {
ksft_test_result_skip("Did not get a THP populated\n");
ksft_print_msg("Did not get a THP populated\n");
log_test_result(KSFT_SKIP);
goto munmap;
}
memset(mem, 1, thpsize);
@ -846,12 +901,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = mprotect(mem + pagesize, pagesize, PROT_READ);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
break;
@ -863,7 +920,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
if (ret) {
ksft_test_result_fail("MADV_DONTNEED failed\n");
ksft_perror("MADV_DONTNEED failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
size = pagesize;
@ -877,13 +935,15 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mremap_mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
if (tmp != mremap_mem) {
ksft_test_result_fail("mremap() failed\n");
ksft_perror("mremap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
size = mremap_size;
@ -896,12 +956,14 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
*/
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
if (ret) {
ksft_test_result_fail("MADV_DONTFORK failed\n");
ksft_perror("MADV_DONTFORK failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed");
log_test_result(KSFT_FAIL);
goto munmap;
} else if (!ret) {
exit(0);
@ -910,7 +972,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
/* Allow for sharing all pages again. */
ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
if (ret) {
ksft_test_result_fail("MADV_DOFORK failed\n");
ksft_perror("MADV_DOFORK failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
break;
@ -924,7 +987,8 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize)
case THP_RUN_SINGLE_PTE_SWAPOUT:
madvise(mem, size, MADV_PAGEOUT);
if (!range_is_swapped(mem, size)) {
ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
ksft_print_msg("MADV_PAGEOUT did not work, is swap enabled?\n");
log_test_result(KSFT_SKIP);
goto munmap;
}
break;
@ -941,56 +1005,56 @@ munmap:
static void run_with_thp(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with THP (%zu kB)\n",
log_test_start("%s ... with THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PMD, size);
}
static void run_with_thp_swap(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with swapped-out THP (%zu kB)\n",
log_test_start("%s ... with swapped-out THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT, size);
}
static void run_with_pte_mapped_thp(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with PTE-mapped THP (%zu kB)\n",
log_test_start("%s ... with PTE-mapped THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PTE, size);
}
static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP (%zu kB)\n",
log_test_start("%s ... with swapped-out, PTE-mapped THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT, size);
}
static void run_with_single_pte_of_thp(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with single PTE of THP (%zu kB)\n",
log_test_start("%s ... with single PTE of THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_SINGLE_PTE, size);
}
static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP (%zu kB)\n",
log_test_start("%s ... with single PTE of swapped-out THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT, size);
}
static void run_with_partial_mremap_thp(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP (%zu kB)\n",
log_test_start("%s ... with partially mremap()'ed THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP, size);
}
static void run_with_partial_shared_thp(test_fn fn, const char *desc, size_t size)
{
ksft_print_msg("[RUN] %s ... with partially shared THP (%zu kB)\n",
log_test_start("%s ... with partially shared THP (%zu kB)",
desc, size / 1024);
do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED, size);
}
@ -1000,14 +1064,15 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
char *mem, *dummy;
ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
log_test_start("%s ... with hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
if (mem == MAP_FAILED) {
ksft_test_result_skip("need more free huge pages\n");
ksft_perror("need more free huge pages");
log_test_result(KSFT_SKIP);
return;
}
@ -1020,7 +1085,8 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
*/
dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
if (dummy == MAP_FAILED) {
ksft_test_result_skip("need more free huge pages\n");
ksft_perror("need more free huge pages");
log_test_result(KSFT_SKIP);
goto munmap;
}
munmap(dummy, hugetlbsize);
@ -1226,7 +1292,7 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
ret = setup_comm_pipes(&comm_pipes);
if (ret) {
ksft_test_result_fail("pipe() failed\n");
log_test_result(KSFT_FAIL);
return;
}
@ -1236,12 +1302,14 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
*/
ret = mprotect(mem + pagesize, pagesize, PROT_READ);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
if (ret) {
ksft_test_result_fail("mprotect() failed\n");
ksft_perror("mprotect() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
@ -1250,8 +1318,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Collapse before actually COW-sharing the page. */
ret = madvise(mem, size, MADV_COLLAPSE);
if (ret) {
ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
strerror(errno));
ksft_perror("MADV_COLLAPSE failed");
log_test_result(KSFT_SKIP);
goto close_comm_pipes;
}
break;
@ -1262,7 +1330,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Don't COW-share the upper part of the THP. */
ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
if (ret) {
ksft_test_result_fail("MADV_DONTFORK failed\n");
ksft_perror("MADV_DONTFORK failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@ -1270,7 +1339,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Don't COW-share the lower part of the THP. */
ret = madvise(mem, size / 2, MADV_DONTFORK);
if (ret) {
ksft_test_result_fail("MADV_DONTFORK failed\n");
ksft_perror("MADV_DONTFORK failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
}
break;
@ -1280,7 +1350,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
ret = fork();
if (ret < 0) {
ksft_test_result_fail("fork() failed\n");
ksft_perror("fork() failed");
log_test_result(KSFT_FAIL);
goto close_comm_pipes;
} else if (!ret) {
switch (test) {
@ -1314,7 +1385,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
*/
ret = madvise(mem, size, MADV_DOFORK);
if (ret) {
ksft_test_result_fail("MADV_DOFORK failed\n");
ksft_perror("MADV_DOFORK failed");
log_test_result(KSFT_FAIL);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@ -1324,8 +1396,8 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
/* Collapse before anyone modified the COW-shared page. */
ret = madvise(mem, size, MADV_COLLAPSE);
if (ret) {
ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
strerror(errno));
ksft_perror("MADV_COLLAPSE failed");
log_test_result(KSFT_SKIP);
write(comm_pipes.parent_ready[1], "0", 1);
wait(&ret);
goto close_comm_pipes;
@ -1345,7 +1417,10 @@ static void do_test_anon_thp_collapse(char *mem, size_t size,
else
ret = -EINVAL;
ksft_test_result(!ret, "No leak from parent into child\n");
if (!ret)
log_test_result(KSFT_PASS);
else
log_test_result(KSFT_FAIL);
close_comm_pipes:
close_comm_pipes(&comm_pipes);
}
@ -1430,7 +1505,7 @@ static void run_anon_thp_test_cases(void)
for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
struct test_case const *test_case = &anon_thp_test_cases[i];
ksft_print_msg("[RUN] %s\n", test_case->desc);
log_test_start("%s", test_case->desc);
do_run_with_thp(test_case->fn, THP_RUN_PMD, pmdsize);
}
}
@ -1453,8 +1528,10 @@ static void test_cow(char *mem, const char *smem, size_t size)
memset(mem, 0xff, size);
/* See if we still read the old values via the other mapping. */
ksft_test_result(!memcmp(smem, old, size),
"Other mapping not modified\n");
if (!memcmp(smem, old, size))
log_test_result(KSFT_PASS);
else
log_test_result(KSFT_FAIL);
free(old);
}
@ -1472,18 +1549,20 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
{
char *mem, *smem, tmp;
ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
log_test_start("%s ... with shared zeropage", desc);
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANON, -1, 0);
if (mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
return;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
if (smem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1504,10 +1583,11 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
size_t mmap_size;
int ret;
ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
log_test_start("%s ... with huge zeropage", desc);
if (!has_huge_zeropage) {
ksft_test_result_skip("Huge zeropage not enabled\n");
ksft_print_msg("Huge zeropage not enabled\n");
log_test_result(KSFT_SKIP);
return;
}
@ -1516,13 +1596,15 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
return;
}
mmap_smem = mmap(NULL, mmap_size, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mmap_smem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1531,9 +1613,15 @@ static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
smem = (char *)(((uintptr_t)mmap_smem + pmdsize) & ~(pmdsize - 1));
ret = madvise(mem, pmdsize, MADV_HUGEPAGE);
if (ret != 0) {
ksft_perror("madvise()");
log_test_result(KSFT_FAIL);
goto munmap;
}
ret |= madvise(smem, pmdsize, MADV_HUGEPAGE);
if (ret) {
ksft_test_result_fail("MADV_HUGEPAGE failed\n");
if (ret != 0) {
ksft_perror("madvise()");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1562,29 +1650,33 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc)
char *mem, *smem, tmp;
int fd;
ksft_print_msg("[RUN] %s ... with memfd\n", desc);
log_test_start("%s ... with memfd", desc);
fd = memfd_create("test", 0);
if (fd < 0) {
ksft_test_result_fail("memfd_create() failed\n");
ksft_perror("memfd_create() failed");
log_test_result(KSFT_FAIL);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, pagesize)) {
ksft_test_result_fail("fallocate() failed\n");
ksft_perror("fallocate() failed");
log_test_result(KSFT_FAIL);
goto close;
}
/* Create a private mapping of the memfd. */
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto close;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1607,35 +1699,40 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
FILE *file;
int fd;
ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
log_test_start("%s ... with tmpfile", desc);
file = tmpfile();
if (!file) {
ksft_test_result_fail("tmpfile() failed\n");
ksft_perror("tmpfile() failed");
log_test_result(KSFT_FAIL);
return;
}
fd = fileno(file);
if (fd < 0) {
ksft_test_result_skip("fileno() failed\n");
ksft_perror("fileno() failed");
log_test_result(KSFT_SKIP);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, pagesize)) {
ksft_test_result_fail("fallocate() failed\n");
ksft_perror("fallocate() failed");
log_test_result(KSFT_FAIL);
goto close;
}
/* Create a private mapping of the memfd. */
mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto close;
}
smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1659,20 +1756,22 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
char *mem, *smem, tmp;
int fd;
ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
fd = memfd_create("test", flags);
if (fd < 0) {
ksft_test_result_skip("memfd_create() failed\n");
ksft_perror("memfd_create() failed");
log_test_result(KSFT_SKIP);
return;
}
/* File consists of a single page filled with zeroes. */
if (fallocate(fd, 0, 0, hugetlbsize)) {
ksft_test_result_skip("need more free huge pages\n");
ksft_perror("need more free huge pages");
log_test_result(KSFT_SKIP);
goto close;
}
@ -1680,12 +1779,14 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
0);
if (mem == MAP_FAILED) {
ksft_test_result_skip("need more free huge pages\n");
ksft_perror("need more free huge pages");
log_test_result(KSFT_SKIP);
goto close;
}
smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
if (smem == MAP_FAILED) {
ksft_test_result_fail("mmap() failed\n");
ksft_perror("mmap() failed");
log_test_result(KSFT_FAIL);
goto munmap;
}
@ -1771,7 +1872,6 @@ static int tests_per_non_anon_test_case(void)
int main(int argc, char **argv)
{
int err;
struct thp_settings default_settings;
ksft_print_header();
@ -1811,9 +1911,5 @@ int main(int argc, char **argv)
thp_restore_settings();
}
err = ksft_get_fail_cnt();
if (err)
ksft_exit_fail_msg("%d out of %d tests failed\n",
err, ksft_test_num());
ksft_exit_pass();
ksft_finished();
}


@ -1453,8 +1453,21 @@ TEST_F(guard_regions, uffd)
/* Set up uffd. */
uffd = userfaultfd(0);
if (uffd == -1 && errno == EPERM)
ksft_exit_skip("No userfaultfd permissions, try running as root.\n");
if (uffd == -1) {
switch (errno) {
case EPERM:
SKIP(return, "No userfaultfd permissions, try running as root.");
break;
case ENOSYS:
SKIP(return, "userfaultfd is not supported/not enabled.");
break;
default:
ksft_exit_fail_msg("userfaultfd failed with %s\n",
strerror(errno));
break;
}
}
ASSERT_NE(uffd, -1);
ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0);


@ -93,33 +93,48 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
__fsword_t fs_type = get_fs_type(fd);
bool should_work;
char *mem;
int result = KSFT_PASS;
int ret;
if (fd < 0) {
result = KSFT_FAIL;
goto report;
}
if (ftruncate(fd, size)) {
if (errno == ENOENT) {
skip_test_dodgy_fs("ftruncate()");
} else {
ksft_test_result_fail("ftruncate() failed (%s)\n", strerror(errno));
ksft_print_msg("ftruncate() failed (%s)\n",
strerror(errno));
result = KSFT_FAIL;
goto report;
}
return;
}
if (fallocate(fd, 0, 0, size)) {
if (size == pagesize)
ksft_test_result_fail("fallocate() failed (%s)\n", strerror(errno));
else
ksft_test_result_skip("need more free huge pages\n");
return;
if (size == pagesize) {
ksft_print_msg("fallocate() failed (%s)\n", strerror(errno));
result = KSFT_FAIL;
} else {
ksft_print_msg("need more free huge pages\n");
result = KSFT_SKIP;
}
goto report;
}
mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
shared ? MAP_SHARED : MAP_PRIVATE, fd, 0);
if (mem == MAP_FAILED) {
if (size == pagesize || shared)
ksft_test_result_fail("mmap() failed (%s)\n", strerror(errno));
else
ksft_test_result_skip("need more free huge pages\n");
return;
if (size == pagesize || shared) {
ksft_print_msg("mmap() failed (%s)\n", strerror(errno));
result = KSFT_FAIL;
} else {
ksft_print_msg("need more free huge pages\n");
result = KSFT_SKIP;
}
goto report;
}
/* Fault in the page such that GUP-fast can pin it directly. */
@ -134,7 +149,8 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
*/
ret = mprotect(mem, size, PROT_READ);
if (ret) {
ksft_test_result_fail("mprotect() failed (%s)\n", strerror(errno));
ksft_print_msg("mprotect() failed (%s)\n", strerror(errno));
result = KSFT_FAIL;
goto munmap;
}
/* FALLTHROUGH */
@ -147,12 +163,14 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
type == TEST_TYPE_RW_FAST;
if (gup_fd < 0) {
ksft_test_result_skip("gup_test not available\n");
ksft_print_msg("gup_test not available\n");
result = KSFT_SKIP;
break;
}
if (rw && shared && fs_is_unknown(fs_type)) {
ksft_test_result_skip("Unknown filesystem\n");
ksft_print_msg("Unknown filesystem\n");
result = KSFT_SKIP;
return;
}
/*
@ -169,14 +187,19 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
args.flags |= rw ? PIN_LONGTERM_TEST_FLAG_USE_WRITE : 0;
ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
if (ret && errno == EINVAL) {
ksft_test_result_skip("PIN_LONGTERM_TEST_START failed (EINVAL)n");
ksft_print_msg("PIN_LONGTERM_TEST_START failed (EINVAL)n");
result = KSFT_SKIP;
break;
} else if (ret && errno == EFAULT) {
ksft_test_result(!should_work, "Should have failed\n");
if (should_work)
result = KSFT_FAIL;
else
result = KSFT_PASS;
break;
} else if (ret) {
ksft_test_result_fail("PIN_LONGTERM_TEST_START failed (%s)\n",
strerror(errno));
ksft_print_msg("PIN_LONGTERM_TEST_START failed (%s)\n",
strerror(errno));
result = KSFT_FAIL;
break;
}
@ -189,7 +212,10 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
* some previously unsupported filesystems, we might want to
* perform some additional tests for possible data corruptions.
*/
ksft_test_result(should_work, "Should have worked\n");
if (should_work)
result = KSFT_PASS;
else
result = KSFT_FAIL;
break;
}
#ifdef LOCAL_CONFIG_HAVE_LIBURING
@ -199,8 +225,9 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* io_uring always pins pages writable. */
if (shared && fs_is_unknown(fs_type)) {
ksft_test_result_skip("Unknown filesystem\n");
return;
ksft_print_msg("Unknown filesystem\n");
result = KSFT_SKIP;
goto report;
}
should_work = !shared ||
fs_supports_writable_longterm_pinning(fs_type);
@ -208,8 +235,9 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* Skip on errors, as we might just lack kernel support. */
ret = io_uring_queue_init(1, &ring, 0);
if (ret < 0) {
ksft_test_result_skip("io_uring_queue_init() failed (%s)\n",
strerror(-ret));
ksft_print_msg("io_uring_queue_init() failed (%s)\n",
strerror(-ret));
result = KSFT_SKIP;
break;
}
/*
@ -222,17 +250,28 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
/* Only new kernels return EFAULT. */
if (ret && (errno == ENOSPC || errno == EOPNOTSUPP ||
errno == EFAULT)) {
ksft_test_result(!should_work, "Should have failed (%s)\n",
strerror(errno));
if (should_work) {
ksft_print_msg("Should have failed (%s)\n",
strerror(errno));
result = KSFT_FAIL;
} else {
result = KSFT_PASS;
}
} else if (ret) {
/*
* We might just lack support or have insufficient
* MEMLOCK limits.
*/
ksft_test_result_skip("io_uring_register_buffers() failed (%s)\n",
strerror(-ret));
ksft_print_msg("io_uring_register_buffers() failed (%s)\n",
strerror(-ret));
result = KSFT_SKIP;
} else {
ksft_test_result(should_work, "Should have worked\n");
if (should_work) {
result = KSFT_PASS;
} else {
ksft_print_msg("Should have worked\n");
result = KSFT_FAIL;
}
io_uring_unregister_buffers(&ring);
}
@ -246,6 +285,8 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared)
munmap:
munmap(mem, size);
report:
log_test_result(result);
}
typedef void (*test_fn)(int fd, size_t size);
@ -254,13 +295,11 @@ static void run_with_memfd(test_fn fn, const char *desc)
{
int fd;
ksft_print_msg("[RUN] %s ... with memfd\n", desc);
log_test_start("%s ... with memfd", desc);
fd = memfd_create("test", 0);
if (fd < 0) {
ksft_test_result_fail("memfd_create() failed (%s)\n", strerror(errno));
return;
}
if (fd < 0)
ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
fn(fd, pagesize);
close(fd);
@ -271,23 +310,23 @@ static void run_with_tmpfile(test_fn fn, const char *desc)
FILE *file;
int fd;
ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
log_test_start("%s ... with tmpfile", desc);
file = tmpfile();
if (!file) {
ksft_test_result_fail("tmpfile() failed (%s)\n", strerror(errno));
return;
}
fd = fileno(file);
if (fd < 0) {
ksft_test_result_fail("fileno() failed (%s)\n", strerror(errno));
goto close;
ksft_print_msg("tmpfile() failed (%s)\n", strerror(errno));
fd = -1;
} else {
fd = fileno(file);
if (fd < 0) {
ksft_print_msg("fileno() failed (%s)\n", strerror(errno));
}
}
fn(fd, pagesize);
close:
fclose(file);
if (file)
fclose(file);
}
static void run_with_local_tmpfile(test_fn fn, const char *desc)
@ -295,22 +334,22 @@ static void run_with_local_tmpfile(test_fn fn, const char *desc)
char filename[] = __FILE__"_tmpfile_XXXXXX";
int fd;
ksft_print_msg("[RUN] %s ... with local tmpfile\n", desc);
log_test_start("%s ... with local tmpfile", desc);
fd = mkstemp(filename);
if (fd < 0) {
ksft_test_result_fail("mkstemp() failed (%s)\n", strerror(errno));
return;
}
if (fd < 0)
ksft_print_msg("mkstemp() failed (%s)\n", strerror(errno));
if (unlink(filename)) {
ksft_test_result_fail("unlink() failed (%s)\n", strerror(errno));
goto close;
ksft_print_msg("unlink() failed (%s)\n", strerror(errno));
close(fd);
fd = -1;
}
fn(fd, pagesize);
close:
close(fd);
if (fd >= 0)
close(fd);
}
static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
@ -319,15 +358,14 @@ static void run_with_memfd_hugetlb(test_fn fn, const char *desc,
int flags = MFD_HUGETLB;
int fd;
ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
log_test_start("%s ... with memfd hugetlb (%zu kB)", desc,
hugetlbsize / 1024);
flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
fd = memfd_create("test", flags);
if (fd < 0) {
ksft_test_result_skip("memfd_create() failed (%s)\n", strerror(errno));
return;
ksft_print_msg("memfd_create() failed (%s)\n", strerror(errno));
}
fn(fd, hugetlbsize);
@ -455,7 +493,7 @@ static int tests_per_test_case(void)
int main(int argc, char **argv)
{
int i, err;
int i;
pagesize = getpagesize();
nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
@ -469,9 +507,5 @@ int main(int argc, char **argv)
for (i = 0; i < ARRAY_SIZE(test_cases); i++)
run_test_case(&test_cases[i]);
err = ksft_get_fail_cnt();
if (err)
ksft_exit_fail_msg("%d out of %d tests failed\n",
err, ksft_test_num());
ksft_exit_pass();
ksft_finished();
}


@ -172,12 +172,12 @@ static void test_populate_read(void)
if (addr == MAP_FAILED)
ksft_exit_fail_msg("mmap failed\n");
ksft_test_result(range_is_not_populated(addr, SIZE),
"range initially not populated\n");
"read range initially not populated\n");
ret = madvise(addr, SIZE, MADV_POPULATE_READ);
ksft_test_result(!ret, "MADV_POPULATE_READ\n");
ksft_test_result(range_is_populated(addr, SIZE),
"range is populated\n");
"read range is populated\n");
munmap(addr, SIZE);
}
@ -194,12 +194,12 @@ static void test_populate_write(void)
if (addr == MAP_FAILED)
ksft_exit_fail_msg("mmap failed\n");
ksft_test_result(range_is_not_populated(addr, SIZE),
"range initially not populated\n");
"write range initially not populated\n");
ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
ksft_test_result(range_is_populated(addr, SIZE),
"range is populated\n");
"write range is populated\n");
munmap(addr, SIZE);
}
@ -247,19 +247,19 @@ static void test_softdirty(void)
/* Clear any softdirty bits. */
clear_softdirty();
ksft_test_result(range_is_not_softdirty(addr, SIZE),
"range is not softdirty\n");
"cleared range is not softdirty\n");
/* Populating READ should set softdirty. */
ret = madvise(addr, SIZE, MADV_POPULATE_READ);
ksft_test_result(!ret, "MADV_POPULATE_READ\n");
ksft_test_result(!ret, "softdirty MADV_POPULATE_READ\n");
ksft_test_result(range_is_not_softdirty(addr, SIZE),
"range is not softdirty\n");
"range is not softdirty after MADV_POPULATE_READ\n");
/* Populating WRITE should set softdirty. */
ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
ksft_test_result(!ret, "softdirty MADV_POPULATE_WRITE\n");
ksft_test_result(range_is_softdirty(addr, SIZE),
"range is softdirty\n");
"range is softdirty after MADV_POPULATE_WRITE \n");
munmap(addr, SIZE);
}


@ -196,7 +196,7 @@ static void test_mlock_lock(void)
ksft_exit_fail_msg("munlock(): %s\n", strerror(errno));
}
ksft_test_result(!unlock_lock_check(map), "%s: Locked\n", __func__);
ksft_test_result(!unlock_lock_check(map), "%s: Unlocked\n", __func__);
munmap(map, 2 * page_size);
}


@ -12,6 +12,8 @@
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <ctype.h>
#include <fcntl.h>
#include <signal.h>
#include <setjmp.h>
@ -43,14 +45,62 @@ static int test_read_access(char *addr, size_t size, size_t pagesize)
/* Force a read that the compiler cannot optimize out. */
*((volatile char *)(addr + offs));
}
if (signal(SIGSEGV, signal_handler) == SIG_ERR)
if (signal(SIGSEGV, SIG_DFL) == SIG_ERR)
return -EINVAL;
return ret;
}
static int find_ram_target(off_t *phys_addr,
unsigned long long pagesize)
{
unsigned long long start, end;
char line[80], *end_ptr;
FILE *file;
/* Search /proc/iomem for the first suitable "System RAM" range. */
file = fopen("/proc/iomem", "r");
if (!file)
return -errno;
while (fgets(line, sizeof(line), file)) {
/* Ignore any child nodes. */
if (!isalnum(line[0]))
continue;
if (!strstr(line, "System RAM\n"))
continue;
start = strtoull(line, &end_ptr, 16);
/* Skip over the "-" */
end_ptr++;
/* Make end "exclusive". */
end = strtoull(end_ptr, NULL, 16) + 1;
/* Actual addresses are not exported */
if (!start && !end)
break;
/* We need full pages. */
start = (start + pagesize - 1) & ~(pagesize - 1);
end &= ~(pagesize - 1);
if (start != (off_t)start)
break;
/* We need two pages. */
if (end > start + 2 * pagesize) {
fclose(file);
*phys_addr = start;
return 0;
}
}
return -ENOENT;
}
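For orientation, here is a hand-written /proc/iomem excerpt of the shape this parser expects (addresses invented for illustration; without CAP_SYS_ADMIN the real file reads as all zeroes, which the `!start && !end` check catches). Only top-level "System RAM" lines are considered; indented child entries fail the isalnum() check and are skipped:

/*
 * Hypothetical /proc/iomem excerpt (addresses invented for illustration):
 *
 *   00001000-0009efff : System RAM        <- top level, considered
 *   00100000-3fffffff : System RAM        <- top level, considered
 *     01000000-01ffffff : Kernel code     <- indented child, skipped
 *
 * strtoull() reads the hex start, end_ptr++ steps over the '-', and the
 * second strtoull() plus one turns the inclusive end into an exclusive end.
 */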
FIXTURE(pfnmap)
{
off_t phys_addr;
size_t pagesize;
int dev_mem_fd;
char *addr1;
@ -63,14 +113,17 @@ FIXTURE_SETUP(pfnmap)
{
self->pagesize = getpagesize();
/* We'll require two physical pages throughout our tests ... */
if (find_ram_target(&self->phys_addr, self->pagesize))
SKIP(return, "Cannot find ram target in '/proc/iomem'\n");
self->dev_mem_fd = open("/dev/mem", O_RDONLY);
if (self->dev_mem_fd < 0)
SKIP(return, "Cannot open '/dev/mem'\n");
/* We'll require the first two pages throughout our tests ... */
self->size1 = self->pagesize * 2;
self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED,
self->dev_mem_fd, 0);
self->dev_mem_fd, self->phys_addr);
if (self->addr1 == MAP_FAILED)
SKIP(return, "Cannot mmap '/dev/mem'\n");
@ -129,7 +182,7 @@ TEST_F(pfnmap, munmap_split)
*/
self->size2 = self->pagesize;
self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED,
self->dev_mem_fd, 0);
self->dev_mem_fd, self->phys_addr);
ASSERT_NE(self->addr2, MAP_FAILED);
}


@ -127,7 +127,7 @@ void test_mmap(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
"%s mmap %lu\n", __func__, size);
"%s mmap %lu %x\n", __func__, size, flags);
if (munmap(map, size * NUM_PAGES))
ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno));
@ -165,7 +165,7 @@ void test_shmget(unsigned long size, unsigned flags)
show(size);
ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES,
"%s: mmap %lu\n", __func__, size);
"%s: mmap %lu %x\n", __func__, size, flags);
if (shmdt(map))
ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno));
}


@ -7,23 +7,20 @@
# real test to check that the kernel is configured to support at least 5
# pagetable levels.
# 1 means the test failed
exitcode=1
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
fail()
skip()
{
echo "$1"
exit $exitcode
exit $ksft_skip
}
check_supported_x86_64()
{
local config="/proc/config.gz"
[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
[[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
[[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
# gzip -dcfq automatically handles both compressed and plaintext input.
# See man 1 gzip under '-f'.
@ -33,11 +30,9 @@ check_supported_x86_64()
else {print 1}; exit}' /proc/cpuinfo 2>/dev/null)
if [[ "${pg_table_levels}" -lt 5 ]]; then
echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
exit $ksft_skip
skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
elif [[ "${cpu_supports_pl5}" -ne 0 ]]; then
echo "$0: CPU does not have the necessary la57 flag to support page table level 5"
exit $ksft_skip
skip "$0: CPU does not have the necessary la57 flag to support page table level 5"
fi
}
@ -45,24 +40,21 @@ check_supported_ppc64()
{
local config="/proc/config.gz"
[[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
[[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
[[ -f "${config}" ]] || skip "Cannot find kernel config in /proc or /boot"
local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
if [[ "${pg_table_levels}" -lt 5 ]]; then
echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
exit $ksft_skip
skip "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
fi
local mmu_support=$(grep -m1 "mmu" /proc/cpuinfo | awk '{print $3}')
if [[ "$mmu_support" != "radix" ]]; then
echo "$0: System does not use Radix MMU, required for 5-level paging"
exit $ksft_skip
skip "$0: System does not use Radix MMU, required for 5-level paging"
fi
local hugepages_total=$(awk '/HugePages_Total/ {print $2}' /proc/meminfo)
if [[ "${hugepages_total}" -eq 0 ]]; then
echo "$0: HugePages are not enabled, required for some tests"
exit $ksft_skip
skip "$0: HugePages are not enabled, required for some tests"
fi
}


@ -439,7 +439,7 @@ int open_procmap(pid_t pid, struct procmap_fd *procmap_out)
sprintf(path, "/proc/%d/maps", pid);
procmap_out->query.size = sizeof(procmap_out->query);
procmap_out->fd = open(path, O_RDONLY);
if (procmap_out < 0)
if (procmap_out->fd < 0)
ret = -errno;
return ret;


@ -3,6 +3,7 @@
#include <stdbool.h>
#include <sys/mman.h>
#include <err.h>
#include <stdarg.h>
#include <strings.h> /* ffsl() */
#include <unistd.h> /* _SC_PAGESIZE */
#include "../kselftest.h"
@ -95,6 +96,25 @@ static inline int open_self_procmap(struct procmap_fd *procmap_out)
return open_procmap(pid, procmap_out);
}
/* These helpers need to be inline to match the kselftest.h idiom. */
static char test_name[1024];
static inline void log_test_start(const char *name, ...)
{
va_list args;
va_start(args, name);
vsnprintf(test_name, sizeof(test_name), name, args);
ksft_print_msg("[RUN] %s\n", test_name);
va_end(args);
}
static inline void log_test_result(int result)
{
ksft_test_result_report(result, "%s\n", test_name);
}
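For context, a minimal sketch (an assumption for illustration, not code taken from the patch) of how a converted selftest is expected to drive these helpers: log_test_start() records the formatted test name once, and log_test_result() later reports that same name with the final KSFT status. example_anon_mapping_test() and its mapping are invented for this sketch:

/* Illustrative only: not part of the patch series. */
static void example_anon_mapping_test(void)
{
	size_t pgsz = getpagesize();
	char *mem;

	log_test_start("%s ... with anonymous mapping", __func__);

	mem = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED) {
		ksft_perror("mmap() failed");
		log_test_result(KSFT_FAIL);
		return;
	}

	/* ... exercise the mapping here ... */
	log_test_result(KSFT_PASS);
	munmap(mem, pgsz);
}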
/*
* On ppc64 this will only work with radix 2M hugepage size
*/


@ -1461,4 +1461,9 @@ static inline int __call_mmap_prepare(struct file *file,
return file->f_op->mmap_prepare(desc);
}
static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
{
(void)vma;
}
#endif /* __MM_VMA_INTERNAL_H */