mirror of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
commit 6168d0da2b
This patch moves the per-node lru_lock into the lruvec, so there is one lru_lock per memcg per node. On a large machine, memcgs no longer have to contend on the per-node pgdat->lru_lock; each can proceed using its own lock.

Since memcg charging was moved to before LRU insertion, page isolation can serialize a page's memcg, so the per-memcg lruvec lock is stable and can replace the per-node lru lock.

In isolate_migratepages_block(), compact_unlock_should_abort() and lock_page_lruvec_irqsave() are open coded to work with compact_control. A debug check is also added to the locking path, which may give some clues if something gets out of hand.

Daniel Jordan's testing shows a 62% improvement on a modified readtwice case on his 2P * 10 core * 2 HT Broadwell box.
https://lore.kernel.org/lkml/20200915165807.kpp7uhiw7l3loofu@ca-dmjordan1.us.oracle.com/

Hugh Dickins helped polish the patch, thanks!

[alex.shi@linux.alibaba.com: fix comment typo]
  Link: https://lkml.kernel.org/r/5b085715-292a-4b43-50b3-d73dc90d1de5@linux.alibaba.com
[alex.shi@linux.alibaba.com: use page_memcg()]
  Link: https://lkml.kernel.org/r/5a4c2b72-7ee8-2478-fc0e-85eb83aafec4@linux.alibaba.com
Link: https://lkml.kernel.org/r/1604566549-62481-18-git-send-email-alex.shi@linux.alibaba.com
Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rong Chen <rong.a.chen@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Alexander Duyck <alexander.duyck@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mika Penttilä <mika.penttila@nextfour.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
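The commit message names lock_page_lruvec_irqsave() and a locking debug check without showing them. As a rough sketch only (not the exact patch code; the helper names follow the message, the exact signatures and rcu usage are assumptions), the per-memcg locking helper could look like this:

/*
 * Hedged sketch: take the per-memcg, per-node lruvec->lru_lock for a
 * page instead of the old per-node pgdat->lru_lock.
 */
struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
{
	struct lruvec *lruvec;
	struct pglist_data *pgdat = page_pgdat(page);

	/*
	 * Resolve the page's memcg+node lruvec under RCU; the binding is
	 * stable because the page is charged before LRU insertion and is
	 * isolated before its lruvec is touched.
	 */
	rcu_read_lock();
	lruvec = mem_cgroup_page_lruvec(page, pgdat);
	spin_lock_irqsave(&lruvec->lru_lock, *flags);
	rcu_read_unlock();

	/*
	 * Debug hook mentioned in the commit: complain if the page's memcg
	 * does not match the lruvec it is being locked under.
	 */
	lruvec_memcg_debug(lruvec, page);

	return lruvec;
}

In isolate_migratepages_block() the same steps are open coded so the periodic compact_unlock_should_abort() checks can work against compact_control, as the commit message describes.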
mm/mmzone.c (102 lines, 2.2 KiB, C)
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/mm/mmzone.c
 *
 * management codes for pgdats, zones and page flags
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/mmzone.h>

struct pglist_data *first_online_pgdat(void)
{
	return NODE_DATA(first_online_node);
}

struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
	int nid = next_online_node(pgdat->node_id);

	if (nid == MAX_NUMNODES)
		return NULL;
	return NODE_DATA(nid);
}

/*
 * next_zone - helper magic for for_each_zone()
 */
struct zone *next_zone(struct zone *zone)
{
	pg_data_t *pgdat = zone->zone_pgdat;

	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
		zone++;
	else {
		pgdat = next_online_pgdat(pgdat);
		if (pgdat)
			zone = pgdat->node_zones;
		else
			zone = NULL;
	}
	return zone;
}

static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
#ifdef CONFIG_NUMA
	return node_isset(zonelist_node_idx(zref), *nodes);
#else
	return 1;
#endif /* CONFIG_NUMA */
}

/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *__next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes)
{
	/*
	 * Find the next suitable zone to use for the allocation.
	 * Only filter based on nodemask if it's set
	 */
	if (unlikely(nodes == NULL))
		while (zonelist_zone_idx(z) > highest_zoneidx)
			z++;
	else
		while (zonelist_zone_idx(z) > highest_zoneidx ||
				(z->zone && !zref_in_nodemask(z, nodes)))
			z++;

	return z;
}

void lruvec_init(struct lruvec *lruvec)
{
	enum lru_list lru;

	memset(lruvec, 0, sizeof(struct lruvec));
	spin_lock_init(&lruvec->lru_lock);

	for_each_lru(lru)
		INIT_LIST_HEAD(&lruvec->lists[lru]);
}

#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
int page_cpupid_xchg_last(struct page *page, int cpupid)
{
	unsigned long old_flags, flags;
	int last_cpupid;

	do {
		old_flags = flags = page->flags;
		last_cpupid = page_cpupid_last(page);

		flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
		flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
	} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));

	return last_cpupid;
}
#endif