Merge tag 'mm-hotfixes-stable-2025-03-08-16-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "33 hotfixes. 24 are cc:stable and the remainder address post-6.13
  issues or aren't considered necessary for -stable kernels.

  26 are for MM and 7 are for non-MM.

   - "mm: memory_failure: unmap poisoned folio during migrate properly"
     from Ma Wupeng fixes a couple of two-year-old bugs involving the
     migration of hwpoisoned folios.

   - "selftests/damon: three fixes for false results" from SeongJae Park
     fixes three one-year-old bugs in the DAMON selftest code.

  The remainder are singletons and doubletons. Please see the individual
  changelogs for details"

* tag 'mm-hotfixes-stable-2025-03-08-16-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (33 commits)
  mm/page_alloc: fix uninitialized variable
  rapidio: add check for rio_add_net() in rio_scan_alloc_net()
  rapidio: fix an API misuse when rio_add_net() fails
  MAINTAINERS: .mailmap: update Sumit Garg's email address
  Revert "mm/page_alloc.c: don't show protection in zone's ->lowmem_reserve[] for empty zone"
  mm: fix finish_fault() handling for large folios
  mm: don't skip arch_sync_kernel_mappings() in error paths
  mm: shmem: remove unnecessary warning in shmem_writepage()
  userfaultfd: fix PTE unmapping stack-allocated PTE copies
  userfaultfd: do not block on locking a large folio with raised refcount
  mm: zswap: use ATOMIC_LONG_INIT to initialize zswap_stored_pages
  mm: shmem: fix potential data corruption during shmem swapin
  mm: fix kernel BUG when userfaultfd_move encounters swapcache
  selftests/damon/damon_nr_regions: sort collected regions before checking with min/max boundaries
  selftests/damon/damon_nr_regions: set ops update for merge results check to 100ms
  selftests/damon/damos_quota: make real expectation of quota exceeds
  include/linux/log2.h: mark is_power_of_2() with __always_inline
  NFS: fix nfs_release_folio() to not deadlock via kcompactd writeback
  mm, swap: avoid BUG_ON in relocate_cluster()
  mm: swap: use correct step in loop to wait all clusters in wait_for_allocation()
  ...
Linus Torvalds 2025-03-08 14:34:06 -10:00
commit 1110ce6a1e
39 changed files with 349 additions and 123 deletions


@ -691,6 +691,7 @@ Subbaraman Narayanamurthy <quic_subbaram@quicinc.com> <subbaram@codeaurora.org>
Subhash Jadavani <subhashj@codeaurora.org>
Sudarshan Rajagopalan <quic_sudaraja@quicinc.com> <sudaraja@codeaurora.org>
Sudeep Holla <sudeep.holla@arm.com> Sudeep KarkadaNagesha <sudeep.karkadanagesha@arm.com>
Sumit Garg <sumit.garg@kernel.org> <sumit.garg@linaro.org>
Sumit Semwal <sumit.semwal@ti.com>
Surabhi Vishnoi <quic_svishnoi@quicinc.com> <svishnoi@codeaurora.org>
Sven Eckelmann <sven@narfation.org> <seckelmann@datto.com>


@ -12875,7 +12875,7 @@ F: include/keys/trusted_dcp.h
F: security/keys/trusted-keys/trusted_dcp.c
KEYS-TRUSTED-TEE
M: Sumit Garg <sumit.garg@linaro.org>
M: Sumit Garg <sumit.garg@kernel.org>
L: linux-integrity@vger.kernel.org
L: keyrings@vger.kernel.org
S: Supported
@ -17675,7 +17675,7 @@ F: Documentation/ABI/testing/sysfs-bus-optee-devices
F: drivers/tee/optee/
OP-TEE RANDOM NUMBER GENERATOR (RNG) DRIVER
M: Sumit Garg <sumit.garg@linaro.org>
M: Sumit Garg <sumit.garg@kernel.org>
L: op-tee@lists.trustedfirmware.org
S: Maintained
F: drivers/char/hw_random/optee-rng.c
@ -23288,7 +23288,7 @@ F: include/media/i2c/tw9910.h
TEE SUBSYSTEM
M: Jens Wiklander <jens.wiklander@linaro.org>
R: Sumit Garg <sumit.garg@linaro.org>
R: Sumit Garg <sumit.garg@kernel.org>
L: op-tee@lists.trustedfirmware.org
S: Maintained
F: Documentation/ABI/testing/sysfs-class-tee


@ -62,7 +62,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address,
}
static int adjust_pte(struct vm_area_struct *vma, unsigned long address,
unsigned long pfn, struct vm_fault *vmf)
unsigned long pfn, bool need_lock)
{
spinlock_t *ptl;
pgd_t *pgd;
@ -99,12 +99,11 @@ again:
if (!pte)
return 0;
/*
* If we are using split PTE locks, then we need to take the page
* lock here. Otherwise we are using shared mm->page_table_lock
* which is already locked, thus cannot take it.
*/
if (ptl != vmf->ptl) {
if (need_lock) {
/*
* Use nested version here to indicate that we are already
* holding one similar spinlock.
*/
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
pte_unmap_unlock(pte, ptl);
@ -114,7 +113,7 @@ again:
ret = do_adjust_pte(vma, address, pfn, pte);
if (ptl != vmf->ptl)
if (need_lock)
spin_unlock(ptl);
pte_unmap(pte);
@ -123,9 +122,10 @@ again:
static void
make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep, unsigned long pfn,
struct vm_fault *vmf)
unsigned long addr, pte_t *ptep, unsigned long pfn)
{
const unsigned long pmd_start_addr = ALIGN_DOWN(addr, PMD_SIZE);
const unsigned long pmd_end_addr = pmd_start_addr + PMD_SIZE;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *mpnt;
unsigned long offset;
@ -141,6 +141,14 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
*/
flush_dcache_mmap_lock(mapping);
vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
/*
* If we are using split PTE locks, then we need to take the pte
* lock. Otherwise we are using shared mm->page_table_lock which
* is already locked, thus cannot take it.
*/
bool need_lock = IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS);
unsigned long mpnt_addr;
/*
* If this VMA is not in our MM, we can ignore it.
* Note that we intentionally mask out the VMA
@ -151,7 +159,12 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
if (!(mpnt->vm_flags & VM_MAYSHARE))
continue;
offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf);
mpnt_addr = mpnt->vm_start + offset;
/* Avoid deadlocks by not grabbing the same PTE lock again. */
if (mpnt_addr >= pmd_start_addr && mpnt_addr < pmd_end_addr)
need_lock = false;
aliases += adjust_pte(mpnt, mpnt_addr, pfn, need_lock);
}
flush_dcache_mmap_unlock(mapping);
if (aliases)
@ -194,7 +207,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
__flush_dcache_folio(mapping, folio);
if (mapping) {
if (cache_is_vivt())
make_coherent(mapping, vma, addr, ptep, pfn, vmf);
make_coherent(mapping, vma, addr, ptep, pfn);
else if (vma->vm_flags & VM_EXEC)
__flush_icache_all();
}
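
A standalone illustration of the guard added above: if an aliased mapping's
address falls into the same PMD-sized region as the faulting address, its PTE
lock is the one already held, so taking it again would self-deadlock. The
sketch below models only the range test, with an assumed 2 MiB PMD and
made-up addresses; it is not the kernel code.

#include <stdbool.h>
#include <stdio.h>

#define PMD_SIZE (2UL << 20)                   /* assumed 2 MiB PMD */
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))    /* a must be a power of 2 */

/* Same test as the new pmd_start_addr/pmd_end_addr check in make_coherent(). */
static bool alias_shares_pte_lock(unsigned long addr, unsigned long mpnt_addr)
{
    unsigned long pmd_start_addr = ALIGN_DOWN(addr, PMD_SIZE);

    return mpnt_addr >= pmd_start_addr && mpnt_addr < pmd_start_addr + PMD_SIZE;
}

int main(void)
{
    unsigned long addr = 0x401000UL;           /* faulting address */

    /* alias inside the same 2 MiB region: must not re-take the lock */
    printf("same PMD:  %d\n", alias_shares_pte_lock(addr, 0x402000UL));
    /* alias in a different region: safe to lock */
    printf("other PMD: %d\n", alias_shares_pte_lock(addr, 0x800000UL));
    return 0;
}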


@ -44,8 +44,10 @@ static inline pgd_t * pgd_alloc(struct mm_struct *mm)
pgd_t *new_pgd;
new_pgd = __pgd_alloc(mm, 0);
memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
if (likely(new_pgd != NULL)) {
memcpy(new_pgd, swapper_pg_dir, PAGE_SIZE);
memset(new_pgd, 0, (PAGE_OFFSET >> PGDIR_SHIFT));
}
return new_pgd;
}


@ -1742,7 +1742,8 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv,
err = rio_add_net(net);
if (err) {
rmcd_debug(RDEV, "failed to register net, err=%d", err);
kfree(net);
put_device(&net->dev);
mport->net = NULL;
goto cleanup;
}
}


@ -871,7 +871,10 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport,
dev_set_name(&net->dev, "rnet_%d", net->id);
net->dev.parent = &mport->dev;
net->dev.release = rio_scan_release_dev;
rio_add_net(net);
if (rio_add_net(net)) {
put_device(&net->dev);
net = NULL;
}
}
return net;
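
Both rapidio fixes apply the same ownership rule: once the net's embedded
struct device has been handed to the driver core, error paths must drop it
with put_device() so the release callback frees it exactly once, rather than
calling kfree() directly. The toy program below models that refcount rule in
plain C; all names (toy_device, toy_register, ...) are hypothetical stand-ins,
not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct toy_device {
    int refcount;
    void (*release)(struct toy_device *dev);
};

static void toy_get(struct toy_device *dev) { dev->refcount++; }

static void toy_put(struct toy_device *dev)
{
    if (--dev->refcount == 0)
        dev->release(dev);              /* the single, well-defined free point */
}

static void toy_release(struct toy_device *dev)
{
    printf("release callback frees the device\n");
    free(dev);
}

/* Registration takes its own reference and may still fail afterwards. */
static int toy_register(struct toy_device *dev, int simulate_failure)
{
    toy_get(dev);
    if (simulate_failure) {
        toy_put(dev);                   /* registration drops its ref on error */
        return -1;
    }
    return 0;
}

int main(void)
{
    struct toy_device *dev = calloc(1, sizeof(*dev));

    if (!dev)
        return 1;
    dev->refcount = 1;                  /* caller's initial reference */
    dev->release = toy_release;

    if (toy_register(dev, /*simulate_failure=*/1)) {
        toy_put(dev);                   /* correct: put, never free() directly */
        return 1;
    }
    toy_put(dev);
    return 0;
}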


@ -29,6 +29,7 @@
#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/swap.h>
#include <linux/compaction.h>
#include <linux/uaccess.h>
#include <linux/filelock.h>
@ -457,7 +458,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
/* If the private flag is set, then the folio is not freeable */
if (folio_test_private(folio)) {
if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
current_is_kswapd())
current_is_kswapd() || current_is_kcompactd())
return false;
if (nfs_wb_folio(folio->mapping->host, folio) < 0)
return false;


@ -80,6 +80,11 @@ static inline unsigned long compact_gap(unsigned int order)
return 2UL << order;
}
static inline int current_is_kcompactd(void)
{
return current->flags & PF_KCOMPACTD;
}
#ifdef CONFIG_COMPACTION
extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order);


@ -682,6 +682,7 @@ struct huge_bootmem_page {
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
void wait_for_freed_hugetlb_folios(void);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, bool cow_from_owner);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@ -1068,6 +1069,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
return 0;
}
static inline void wait_for_freed_hugetlb_folios(void)
{
}
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr,
bool cow_from_owner)


@ -41,7 +41,7 @@ int __ilog2_u64(u64 n)
* *not* considered a power of two.
* Return: true if @n is a power of 2, otherwise false.
*/
static inline __attribute__((const))
static __always_inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
return (n != 0 && ((n & (n - 1)) == 0));
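
The helper relies on the classic bit trick: n & (n - 1) clears the lowest set
bit, so the result is zero exactly when a single bit was set. Forcing
__always_inline keeps the test inlinable (and foldable to a constant for
constant arguments) even when compiler heuristics would otherwise emit an
out-of-line call. A self-contained demo of the predicate itself:

#include <stdbool.h>
#include <stdio.h>

static inline bool is_power_of_2(unsigned long n)
{
    return n != 0 && (n & (n - 1)) == 0;  /* one bit set <=> n & (n-1) == 0 */
}

int main(void)
{
    unsigned long samples[] = { 0, 1, 2, 3, 4, 6, 8, 1024, 1025 };

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("%5lu -> %s\n", samples[i],
               is_power_of_2(samples[i]) ? "power of 2" : "not");
    return 0;
}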


@ -1701,7 +1701,7 @@ extern struct pid *cad_pid;
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF__HOLE__00010000 0x00010000
#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */


@ -2103,7 +2103,7 @@ config FAIL_SKB_REALLOC
reallocated, catching possible invalid pointers to the skb.
For more information, check
Documentation/dev-tools/fault-injection/fault-injection.rst
Documentation/fault-injection/fault-injection.rst
config FAULT_INJECTION_CONFIGFS
bool "Configfs interface for fault-injection capabilities"


@ -3181,6 +3181,7 @@ static int kcompactd(void *p)
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@ -3237,6 +3238,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
current->flags &= ~PF_KCOMPACTD;
return 0;
}
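
The PF_KCOMPACTD change follows the established per-task flag pattern: the
kthread tags current->flags on entry and clears the bit before exiting, so any
code (here, nfs_release_folio()) can cheaply ask "am I running in kcompactd?"
and avoid deadlocking on its own writeback. A minimal userspace model using a
thread-local flags word; TF_KCOMPACTD and task_flags are illustrative
stand-ins, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

#define TF_KCOMPACTD 0x1
static __thread unsigned int task_flags;   /* stand-in for current->flags */

static bool current_is_kcompactd(void)
{
    return task_flags & TF_KCOMPACTD;
}

static void kcompactd_main(void)
{
    task_flags |= TF_KCOMPACTD;            /* set on thread entry */
    printf("inside kcompactd:  %d\n", current_is_kcompactd());
    task_flags &= ~TF_KCOMPACTD;           /* clear before returning */
}

int main(void)
{
    kcompactd_main();
    printf("outside kcompactd: %d\n", current_is_kcompactd());
    return 0;
}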


@ -2943,6 +2943,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
void wait_for_freed_hugetlb_folios(void)
{
if (llist_empty(&hpage_freelist))
return;
flush_work(&free_hpage_work);
}
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv


@ -1115,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
return -EBUSY;
}
#endif


@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)


@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
struct address_space *mapping;
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
if (folio_test_swapcache(folio)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu &= ~TTU_HWPOISON;
}
/*
* Propagate the dirty bit from PTEs to struct page first, because we
* need this to decide if we should kill or just drop the page.
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
mapping = folio_mapping(folio);
if (!must_kill && !folio_test_dirty(folio) && mapping &&
mapping_can_writeback(mapping)) {
if (folio_mkclean(folio)) {
folio_set_dirty(folio);
} else {
ttu &= ~TTU_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
return;
return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
if (folio_test_swapcache(folio)) {
pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu &= ~TTU_HWPOISON;
}
/*
* Propagate the dirty bit from PTEs to struct page first, because we
* need this to decide if we should kill or just drop the page.
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
mapping = folio_mapping(folio);
if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
mapping_can_writeback(mapping)) {
if (folio_mkclean(folio)) {
folio_set_dirty(folio);
} else {
ttu &= ~TTU_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
unmap_poisoned_folio(folio, ttu);
unmap_success = !folio_mapped(folio);
unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));


@ -3051,8 +3051,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
if (WARN_ON_ONCE(pgd_leaf(*pgd)))
return -EINVAL;
if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
err = -EINVAL;
break;
}
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@ -5183,7 +5185,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
unsigned long addr = vmf->address;
unsigned long addr;
bool needs_fallback = false;
fallback:
addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@ -5222,7 +5228,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@ -5258,9 +5265,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
ret = VM_FAULT_NOPAGE;
goto unlock;
needs_fallback = true;
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
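
The control-flow shape of the finish_fault() fix, reduced to a skeleton: when
the batch of nr_pages PTEs can no longer be mapped because the range is no
longer empty, the function now retries the whole setup with a single page
instead of returning early. A hypothetical stand-alone sketch, not the kernel
function:

#include <stdbool.h>
#include <stdio.h>

static bool pte_range_free(int nr)   /* pretend only a 1-page range is free */
{
    return nr == 1;
}

static int map_fault(void)
{
    bool needs_fallback = false;
    int nr_pages;

fallback:
    nr_pages = needs_fallback ? 1 : 8;   /* retry degrades to a single page */

    if (!pte_range_free(nr_pages)) {
        needs_fallback = true;
        goto fallback;                   /* redo setup instead of bailing out */
    }
    printf("mapped %d page(s)\n", nr_pages);
    return 0;
}

int main(void) { return map_fault(); }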


@ -1822,27 +1822,25 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
/*
* HWPoison pages have elevated reference counts so the migration would
* fail on them. It also doesn't make any sense to migrate them in the
* first place. Still try to unmap such a page in case it is still mapped
* (keep the unmap as the catch all safety net).
*/
if (folio_test_hwpoison(folio) ||
(folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
if (folio_mapped(folio))
unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
continue;
}
if (!folio_try_get(folio))
continue;
if (unlikely(page_folio(page) != folio))
goto put_folio;
if (folio_test_hwpoison(folio) ||
(folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
if (folio_mapped(folio)) {
folio_lock(folio);
unmap_poisoned_folio(folio, pfn, false);
folio_unlock(folio);
}
goto put_folio;
}
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
pr_warn("failed to isolate pfn %lx\n",


@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@ -5849,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
if (clear || empty)
if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;


@ -607,6 +607,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
struct zone *zone;
int ret;
/*
* Due to the deferred freeing of hugetlb folios, the hugepage folios may
* not be released to the buddy system immediately. This can cause PageBuddy()
* to fail in __test_page_isolated_in_pageblock(). To ensure that the
* hugetlb folios are properly released back to the buddy system, we
* invoke the wait_for_freed_hugetlb_folios() function to wait for the
* release to complete.
*/
wait_for_freed_hugetlb_folios();
/*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.


@ -1548,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@ -2253,7 +2253,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio *folio = NULL;
bool skip_swapcache = false;
swp_entry_t swap;
int error, nr_pages;
int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@ -2272,10 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
int order = xa_get_order(&mapping->i_pages, index);
bool fallback_order0 = false;
int split_order;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@ -2339,6 +2338,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
} else if (order != folio_order(folio)) {
/*
* Swap readahead may swap in order 0 folios into swapcache
* asynchronously, while the shmem mapping can still store
* large swap entries. In such cases, we should split the
* large swap entry to prevent possible data corruption.
*/
split_order = shmem_split_large_entry(inode, index, swap, gfp);
if (split_order < 0) {
error = split_order;
goto failed;
}
/*
* If the large swap entry has already been split, it is
* necessary to recalculate the new swap entry based on
* the old order alignment.
*/
if (split_order > 0) {
pgoff_t offset = index - round_down(index, 1 << split_order);
swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
}
}
alloced:
@ -2346,7 +2368,8 @@ alloced:
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
!shmem_confirm_swap(mapping, index, swap) ||
xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
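
The entry recalculation above is easiest to see with numbers. Suppose the
faulting index is 35 and the split entry had order 4 (covering 16 pages): the
index's offset within the old alignment is 35 - round_down(35, 16) = 3, so
the swap offset stored in the large entry must be advanced by 3. Made-up
values, plain arithmetic only:

#include <stdio.h>

int main(void)
{
    unsigned long index = 35;          /* page index being swapped in */
    int split_order = 4;               /* old large entry spanned 16 pages */
    unsigned long base = index & ~((1UL << split_order) - 1);  /* 32 */
    unsigned long delta = index - base;                        /* 3 */
    unsigned long old_swp_offset = 1000;  /* offset in the old large entry */

    printf("new swap offset = %lu + %lu = %lu\n",
           old_swp_offset, delta, old_swp_offset + delta);
    return 0;
}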


@ -653,7 +653,8 @@ static void relocate_cluster(struct swap_info_struct *si,
return;
if (!ci->count) {
free_cluster(si, ci);
if (ci->flags != CLUSTER_FLAG_FREE)
free_cluster(si, ci);
} else if (ci->count != SWAPFILE_CLUSTER) {
if (ci->flags != CLUSTER_FLAG_FRAG)
move_cluster(si, ci, &si->frag_clusters[ci->order],
@ -858,6 +859,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
offset++;
}
/* in case no swap cache is reclaimed */
if (ci->flags == CLUSTER_FLAG_NONE)
relocate_cluster(si, ci);
unlock_cluster(ci);
if (to_scan <= 0)
break;
@ -2641,7 +2646,6 @@ static void wait_for_allocation(struct swap_info_struct *si)
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
unlock_cluster(ci);
offset += SWAPFILE_CLUSTER;
}
}
@ -3542,6 +3546,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
if (WARN_ON_ONCE(!si)) {
pr_err("%s%08lx\n", Bad_file, entry.val);
return -EINVAL;
}
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
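
The wait_for_allocation() change removes a double step: `offset` was advanced
both by the for-statement and by an extra increment in the loop body, so only
every other cluster was locked and waited on. The demo below reproduces the
skipped visits with a small range:

#include <stdio.h>

#define SWAPFILE_CLUSTER 256

int main(void)
{
    unsigned long end = 4 * SWAPFILE_CLUSTER;

    printf("buggy (double step):\n");
    for (unsigned long off = 0; off < end; off += SWAPFILE_CLUSTER) {
        printf("  visit cluster at %lu\n", off);
        off += SWAPFILE_CLUSTER;       /* extra step: clusters get skipped */
    }

    printf("fixed (single step):\n");
    for (unsigned long off = 0; off < end; off += SWAPFILE_CLUSTER)
        printf("  visit cluster at %lu\n", off);
    return 0;
}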


@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@ -1076,16 +1077,14 @@ out:
return err;
}
static int move_swap_pte(struct mm_struct *mm,
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl)
spinlock_t *dst_ptl, spinlock_t *src_ptl,
struct folio *src_folio)
{
if (!pte_swp_exclusive(orig_src_pte))
return -EBUSY;
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@ -1094,6 +1093,16 @@ static int move_swap_pte(struct mm_struct *mm,
return -EAGAIN;
}
/*
* The src_folio resides in the swapcache, requiring an update to its
* index and mapping to align with the dst_vma, where a swap-in may
* occur and hit the swapcache after moving the PTE.
*/
if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@ -1141,6 +1150,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
@ -1240,6 +1250,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
bool locked;
/*
* Pin the page while holding the lock to be sure the
@ -1259,14 +1270,28 @@ retry:
goto out;
}
locked = folio_trylock(folio);
/*
* We avoid waiting for folio lock with a raised
* refcount for large folios because extra refcounts
* will result in split_folio() failing later and
* retrying. If multiple tasks are trying to move a
* large folio we can end up livelocking.
*/
if (!locked && folio_test_large(folio)) {
spin_unlock(src_ptl);
err = -EAGAIN;
goto out;
}
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
if (!folio_trylock(src_folio)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
if (!locked) {
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@ -1282,8 +1307,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@ -1308,8 +1333,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@ -1322,11 +1347,13 @@ retry:
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
struct folio *folio = NULL;
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
pte_unmap(&orig_src_pte);
pte_unmap(&orig_dst_pte);
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@ -1335,9 +1362,53 @@ retry:
goto out;
}
err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl);
if (!pte_swp_exclusive(orig_src_pte)) {
err = -EBUSY;
goto out;
}
si = get_swap_device(entry);
if (unlikely(!si)) {
err = -EAGAIN;
goto out;
}
/*
* Verify the existence of the swapcache. If present, the folio's
* index and mapping must be updated even when the PTE is a swap
* entry. The anon_vma lock is not taken during this process since
* the folio has already been unmapped, and the swap entry is
* exclusive, preventing rmap walks.
*
* For large folios, return -EBUSY immediately, as split_folio()
* also returns -EBUSY when attempting to split unmapped large
* folios in the swapcache. This issue needs to be resolved
* separately to allow proper handling.
*/
if (!src_folio)
folio = filemap_get_folio(swap_address_space(entry),
swap_cache_index(entry));
if (!IS_ERR_OR_NULL(folio)) {
if (folio_test_large(folio)) {
err = -EBUSY;
folio_put(folio);
goto out;
}
src_folio = folio;
src_folio_pte = orig_src_pte;
if (!folio_trylock(src_folio)) {
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
put_swap_device(si);
si = NULL;
/* now we can block and wait */
folio_lock(src_folio);
goto retry;
}
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio);
}
out:
@ -1354,6 +1425,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
if (si)
put_swap_device(si);
return err;
}
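
Several branches above share one locking shape: try to take the folio lock
while the PTE spinlock is held; if that fails, back out of everything that
cannot be held across a sleep, block on the lock, then retry and revalidate
from scratch (or, for large folios, give up with -EAGAIN/-EBUSY to avoid
livelock). A two-mutex model of that dance, purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pte_lock = PTHREAD_MUTEX_INITIALIZER;   /* "spinlock" */
static pthread_mutex_t folio_lock = PTHREAD_MUTEX_INITIALIZER; /* sleepable  */

static int move_one(void)
{
retry:
    pthread_mutex_lock(&pte_lock);
    if (pthread_mutex_trylock(&folio_lock)) {
        /* Cannot block while holding pte_lock: drop it, wait, redo. */
        pthread_mutex_unlock(&pte_lock);
        pthread_mutex_lock(&folio_lock);     /* now we can block and wait */
        pthread_mutex_unlock(&folio_lock);
        goto retry;                          /* revalidate from scratch */
    }
    printf("both locks held: move the PTE\n");
    pthread_mutex_unlock(&folio_lock);
    pthread_mutex_unlock(&pte_lock);
    return 0;
}

int main(void) { return move_one(); }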


@ -1509,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
unsigned long start = vmg->start;
unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
/* Split any preceding portion of the VMA. */
if (vma->vm_start < vmg->start) {
int err = split_vma(vmg->vmi, vma, vmg->start, 1);
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
if (vma->vm_end > vmg->end) {
int err = split_vma(vmg->vmi, vma, vmg->end, 0);
if (vma->vm_end > end) {
int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);


@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return 0;
return err;
}
/*
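
The vmalloc fix swaps an early `return err` for `break` so that the
arch_sync_kernel_mappings() call still covers whatever portion of the range
was modified before the failure. The generic shape of that error-path rule,
with stand-in names:

#include <stdio.h>

static int map_one(int i) { return i == 2 ? -1 : 0; }  /* fails on i == 2 */
static void sync_mappings(void) { printf("sync partially updated range\n"); }

static int apply_range(void)
{
    int err = 0;

    for (int i = 0; i < 4; i++) {
        err = map_one(i);
        if (err)
            break;          /* not `return err`: the sync below must run */
    }
    sync_mappings();        /* runs on success and on failure alike */
    return err;
}

int main(void) { return apply_range() ? 1 : 0; }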


@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for


@ -65,6 +65,7 @@ def test_nr_regions(real_nr_regions, min_nr_regions, max_nr_regions):
test_name = 'nr_regions test with %d/%d/%d real/min/max nr_regions' % (
real_nr_regions, min_nr_regions, max_nr_regions)
collected_nr_regions.sort()
if (collected_nr_regions[0] < min_nr_regions or
collected_nr_regions[-1] > max_nr_regions):
print('fail %s' % test_name)
@ -109,6 +110,7 @@ def main():
attrs = kdamonds.kdamonds[0].contexts[0].monitoring_attrs
attrs.min_nr_regions = 3
attrs.max_nr_regions = 7
attrs.update_us = 100000
err = kdamonds.kdamonds[0].commit()
if err is not None:
proc.terminate()


@ -51,16 +51,19 @@ def main():
nr_quota_exceeds = scheme.stats.qt_exceeds
wss_collected.sort()
nr_expected_quota_exceeds = 0
for wss in wss_collected:
if wss > sz_quota:
print('quota is not kept: %s > %s' % (wss, sz_quota))
print('collected samples are as below')
print('\n'.join(['%d' % wss for wss in wss_collected]))
exit(1)
if wss == sz_quota:
nr_expected_quota_exceeds += 1
if nr_quota_exceeds < len(wss_collected):
print('quota is not always exceeded: %d > %d' %
(len(wss_collected), nr_quota_exceeds))
if nr_quota_exceeds < nr_expected_quota_exceeds:
print('quota is exceeded less than expected: %d < %d' %
(nr_quota_exceeds, nr_expected_quota_exceeds))
exit(1)
if __name__ == '__main__':


@ -63,6 +63,9 @@ def main():
if last_effective_bytes != 0 else -1.0))
if last_effective_bytes == goal.effective_bytes:
# effective quota was already minimum that cannot be more reduced
if expect_increase is False and last_effective_bytes == 1:
continue
print('effective bytes not changed: %d' % goal.effective_bytes)
exit(1)


@ -15,7 +15,7 @@
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <asm-generic/unistd.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h> /* Definition of O_* constants */


@ -11,7 +11,7 @@
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <asm-generic/unistd.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
@ -369,6 +369,7 @@ unmap:
munmap(map, size);
}
#ifdef __NR_userfaultfd
static void test_unmerge_uffd_wp(void)
{
struct uffdio_writeprotect uffd_writeprotect;
@ -429,6 +430,7 @@ close_uffd:
unmap:
munmap(map, size);
}
#endif
/* Verify that KSM can be enabled / queried with prctl. */
static void test_prctl(void)
@ -684,7 +686,9 @@ int main(int argc, char **argv)
exit(test_child_ksm());
}
#ifdef __NR_userfaultfd
tests++;
#endif
ksft_print_header();
ksft_set_plan(tests);
@ -696,7 +700,9 @@ int main(int argc, char **argv)
test_unmerge();
test_unmerge_zero_pages();
test_unmerge_discarded();
#ifdef __NR_userfaultfd
test_unmerge_uffd_wp();
#endif
test_prot_none();


@ -17,7 +17,7 @@
#include <stdlib.h>
#include <string.h>
#include <asm-generic/unistd.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
@ -28,6 +28,8 @@
#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
#ifdef __NR_memfd_secret
#define PATTERN 0x55
static const int prot = PROT_READ | PROT_WRITE;
@ -332,3 +334,13 @@ int main(int argc, char *argv[])
ksft_finished();
}
#else /* __NR_memfd_secret */
int main(int argc, char *argv[])
{
printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
return KSFT_SKIP;
}
#endif /* __NR_memfd_secret */


@ -9,7 +9,7 @@
*/
#include <fcntl.h>
#include <signal.h>
#include <asm-generic/unistd.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
@ -265,6 +265,7 @@ munmap:
munmap(mmap_mem, mmap_size);
}
#ifdef __NR_userfaultfd
static void test_uffdio_copy(void)
{
struct uffdio_register uffdio_register;
@ -322,6 +323,7 @@ munmap:
munmap(dst, pagesize);
free(src);
}
#endif /* __NR_userfaultfd */
int main(void)
{
@ -334,7 +336,9 @@ int main(void)
thpsize / 1024);
tests += 3;
}
#ifdef __NR_userfaultfd
tests += 1;
#endif /* __NR_userfaultfd */
ksft_print_header();
ksft_set_plan(tests);
@ -364,7 +368,9 @@ int main(void)
if (thpsize)
test_pte_mapped_thp();
/* Placing a fresh page via userfaultfd may set the PTE dirty. */
#ifdef __NR_userfaultfd
test_uffdio_copy();
#endif /* __NR_userfaultfd */
err = ksft_get_fail_cnt();
if (err)


@ -3,7 +3,6 @@
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <asm-generic/unistd.h>
static int mlock2_(void *start, size_t len, int flags)
{


@ -42,7 +42,7 @@
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <asm-generic/unistd.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <setjmp.h>


@ -673,7 +673,11 @@ int uffd_open_dev(unsigned int flags)
int uffd_open_sys(unsigned int flags)
{
#ifdef __NR_userfaultfd
return syscall(__NR_userfaultfd, flags);
#else
return -1;
#endif
}
int uffd_open(unsigned int flags)
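
All of the selftest changes in this batch follow one convention: code that
issues a raw syscall is compiled only when the architecture's headers define
the corresponding __NR_* number, with a KSFT_SKIP fallback (or a -1 return, as
above) otherwise, instead of pulling in <asm-generic/unistd.h>. A minimal
self-contained illustration of the guard:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define KSFT_SKIP 4   /* kselftest exit code for a skipped test */

#ifdef __NR_userfaultfd
int main(void)
{
    long fd = syscall(__NR_userfaultfd, 0);

    printf("userfaultfd syscall returned %ld\n", fd);
    return 0;
}
#else
int main(void)
{
    printf("skip: missing __NR_userfaultfd\n");
    return KSFT_SKIP;
}
#endif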


@ -33,10 +33,11 @@
* pthread_mutex_lock will also verify the atomicity of the memory
* transfer (UFFDIO_COPY).
*/
#include <asm-generic/unistd.h>
#include "uffd-common.h"
uint64_t features;
#ifdef __NR_userfaultfd
#define BOUNCE_RANDOM (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
@ -471,3 +472,15 @@ int main(int argc, char **argv)
nr_pages, nr_pages_per_cpu);
return userfaultfd_stress();
}
#else /* __NR_userfaultfd */
#warning "missing __NR_userfaultfd definition"
int main(void)
{
printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
return KSFT_SKIP;
}
#endif /* __NR_userfaultfd */


@ -5,11 +5,12 @@
* Copyright (C) 2015-2023 Red Hat, Inc.
*/
#include <asm-generic/unistd.h>
#include "uffd-common.h"
#include "../../../../mm/gup_test.h"
#ifdef __NR_userfaultfd
/* The unit test doesn't need a large or random size, make it 32MB for now */
#define UFFD_TEST_MEM_SIZE (32UL << 20)
@ -1558,3 +1559,14 @@ int main(int argc, char *argv[])
return ksft_get_fail_cnt() ? KSFT_FAIL : KSFT_PASS;
}
#else /* __NR_userfaultfd */
#warning "missing __NR_userfaultfd definition"
int main(void)
{
printf("Skipping %s (missing __NR_userfaultfd)\n", __file__);
return KSFT_SKIP;
}
#endif /* __NR_userfaultfd */