	mm/gup: retire follow_hugetlb_page()
Now __get_user_pages() should be well prepared to handle THP completely, as well as hugetlb GUP requests, even without hugetlb's special path. Time to retire follow_hugetlb_page().

Tweak misc comments to reflect the removal of follow_hugetlb_page().

Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Parent: 57edfcfd34
Commit: 4849807114

4 changed files with 1 addition and 256 deletions
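To illustrate the commit message's point that ordinary GUP callers need no hugetlb-specific handling once follow_hugetlb_page() is gone, here is a minimal, hedged sketch of a kernel-side caller pinning memory that may be hugetlb-backed. The helper name demo_pin_user_range() is hypothetical; pin_user_pages_fast() and unpin_user_pages() are the long-standing GUP helpers assumed to be available.

/*
 * Sketch only (not part of this commit): hugetlb-backed memory is pinned
 * through the same interface as any other memory; any hugetlb handling
 * happens inside follow_page_mask()/hugetlb_follow_page_mask().
 */
#include <linux/mm.h>
#include <linux/slab.h>

static int demo_pin_user_range(unsigned long uaddr, int nr_pages)	/* hypothetical helper */
{
	struct page **pages;
	int pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* No is_vm_hugetlb_page() check needed on the caller side. */
	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned > 0) {
		/* ... use the pinned pages ... */
		unpin_user_pages(pages, pinned);
	}

	kfree(pages);
	return pinned < 0 ? pinned : 0;
}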
fs/userfaultfd.c (2 lines changed):

@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 *
 	 * We also don't do userfault handling during
 	 * coredumping. hugetlbfs has the special
-	 * follow_hugetlb_page() to skip missing pages in the
+	 * hugetlb_follow_page_mask() to skip missing pages in the
 	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
 	 * the no_page_table() helper in follow_page_mask(), but the
 	 * shmem_vm_ops->fault method is invoked even during
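The comment above leans on FOLL_DUMP semantics: during a coredump a missing page must not be faulted in, it is simply skipped so the dump stays sparse. Below is a hedged sketch of that pattern around the in-kernel get_dump_page() helper (its one-argument form, as of the kernel this commit targets); the surrounding loop and the demo_dump_range() name are illustrative assumptions, not code from this commit.

/* Hypothetical coredump-style walk: holes are skipped, present pages copied. */
#include <linux/mm.h>

static void demo_dump_range(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);	/* uses FOLL_DUMP internally */

		if (!page) {
			/* Hole: emit nothing, the dump file stays sparse. */
			continue;
		}
		/* ... copy the page contents into the dump ... */
		put_page(page);
	}
}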
include/linux/hugetlb.h (12 lines changed):

@@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 				      unsigned long address, unsigned int flags,
 				      unsigned int *page_mask);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
-			 struct page **, unsigned long *, unsigned long *,
-			 long, unsigned int, int *);
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *,
 			  zap_flags_t);
@@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
 	BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
 }
 
-static inline long follow_hugetlb_page(struct mm_struct *mm,
-			struct vm_area_struct *vma, struct page **pages,
-			unsigned long *position, unsigned long *nr_pages,
-			long i, unsigned int flags, int *nonblocking)
-{
-	BUG();
-	return 0;
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
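The hunk above also contrasts the two styles of !CONFIG_HUGETLB_PAGE stub: the retired follow_hugetlb_page() stub trapped at runtime with BUG(), while hugetlb_follow_page_mask() uses BUILD_BUG() so that any reachable call fails at build time. A small hedged sketch of that pattern; CONFIG_DEMO_FEATURE and demo_op() are made-up names for illustration.

#include <linux/build_bug.h>

/*
 * When the feature is compiled out, the inline stub contains BUILD_BUG(),
 * so the build only succeeds if every call site is eliminated as dead
 * code (e.g. guarded by a compile-time false condition).
 */
#ifdef CONFIG_DEMO_FEATURE
int demo_op(unsigned long addr);
#else
static inline int demo_op(unsigned long addr)
{
	BUILD_BUG();	/* must never be reachable when the feature is off */
	return 0;
}
#endif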
mm/gup.c (19 lines changed):
@@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
 	 * special hugetlb page table walking code.  This eliminates the
 	 * need to check for hugetlb entries in the general walking code.
-	 *
-	 * hugetlb_follow_page_mask is only for follow_page() handling here.
-	 * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
 	 */
 	if (is_vm_hugetlb_page(vma))
 		return hugetlb_follow_page_mask(vma, address, flags,
@@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm,
 			ret = check_vma_flags(vma, gup_flags);
 			if (ret)
 				goto out;
-
-			if (is_vm_hugetlb_page(vma)) {
-				i = follow_hugetlb_page(mm, vma, pages,
-							&start, &nr_pages, i,
-							gup_flags, locked);
-				if (!*locked) {
-					/*
-					 * We've got a VM_FAULT_RETRY
-					 * and we've lost mmap_lock.
-					 * We must stop here.
-					 */
-					BUG_ON(gup_flags & FOLL_NOWAIT);
-					goto out;
-				}
-				continue;
-			}
 		}
 retry:
 		/*
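The removed branch above encodes the *locked contract of GUP: when a fault returns VM_FAULT_RETRY, mmap_lock has been dropped, *locked is cleared, and the walk must stop. A hedged caller-side sketch of that contract follows; the demo_gup_remote() name is hypothetical, and the get_user_pages_remote() signature shown is the one used around the kernel release this commit targets, so treat it as an assumption.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Hypothetical caller showing the *locked protocol (sketch, not from this commit). */
static long demo_gup_remote(struct mm_struct *mm, unsigned long start,
			    unsigned long nr_pages, struct page **pages)
{
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, start, nr_pages, FOLL_WRITE,
				    pages, &locked);
	/*
	 * If a fault had to retry, GUP dropped mmap_lock and cleared
	 * 'locked'; only unlock when we still own the lock.
	 */
	if (locked)
		mmap_read_unlock(mm);
	return ret;
}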
mm/hugetlb.c (224 lines changed):
@@ -5721,7 +5721,6 @@ out_release_old:
 
 /*
  * Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
  */
 static bool hugetlbfs_pagecache_present(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -6422,37 +6421,6 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
-			    int refs, struct page **pages)
-{
-	int nr;
-
-	for (nr = 0; nr < refs; nr++) {
-		if (likely(pages))
-			pages[nr] = nth_page(page, nr);
-	}
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
-					       unsigned int flags, pte_t *pte,
-					       bool *unshare)
-{
-	pte_t pteval = huge_ptep_get(pte);
-
-	*unshare = false;
-	if (is_swap_pte(pteval))
-		return true;
-	if (huge_pte_write(pteval))
-		return false;
-	if (flags & FOLL_WRITE)
-		return true;
-	if (gup_must_unshare(vma, flags, pte_page(pteval))) {
-		*unshare = true;
-		return true;
-	}
-	return false;
-}
-
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 				      unsigned long address, unsigned int flags,
 				      unsigned int *page_mask)
@@ -6524,198 +6492,6 @@ out_unlock:
 	return page;
 }
 
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page **pages, unsigned long *position,
-			 unsigned long *nr_pages, long i, unsigned int flags,
-			 int *locked)
-{
-	unsigned long pfn_offset;
-	unsigned long vaddr = *position;
-	unsigned long remainder = *nr_pages;
-	struct hstate *h = hstate_vma(vma);
-	int err = -EFAULT, refs;
-
-	while (vaddr < vma->vm_end && remainder) {
-		pte_t *pte;
-		spinlock_t *ptl = NULL;
-		bool unshare = false;
-		int absent;
-		struct page *page;
-
-		/*
-		 * If we have a pending SIGKILL, don't keep faulting pages and
-		 * potentially allocating memory.
-		 */
-		if (fatal_signal_pending(current)) {
-			remainder = 0;
-			break;
-		}
-
-		hugetlb_vma_lock_read(vma);
-		/*
-		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage.  We have to make sure we get the
-		 * first, for the page indexing below to work.
-		 *
-		 * Note that page table lock is not held when pte is null.
-		 */
-		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
-				   huge_page_size(h));
-		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
-		absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
-		/*
-		 * When coredumping, it suits get_dump_page if we just return
-		 * an error where there's an empty slot with no huge pagecache
-		 * to back it.  This way, we avoid allocating a hugepage, and
-		 * the sparse dumpfile avoids allocating disk blocks, but its
-		 * huge holes still show up with zeroes where they need to be.
-		 */
-		if (absent && (flags & FOLL_DUMP) &&
-		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-			if (pte)
-				spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-			remainder = 0;
-			break;
-		}
-
-		/*
-		 * We need call hugetlb_fault for both hugepages under migration
-		 * (in which case hugetlb_fault waits for the migration,) and
-		 * hwpoisoned hugepages (in which case we need to prevent the
-		 * caller from accessing to them.) In order to do this, we use
-		 * here is_swap_pte instead of is_hugetlb_entry_migration and
-		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
-		 * both cases, and because we can't follow correct pages
-		 * directly from any kind of swap entries.
-		 */
-		if (absent ||
-		    __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
-			vm_fault_t ret;
-			unsigned int fault_flags = 0;
-
-			if (pte)
-				spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-
-			if (flags & FOLL_WRITE)
-				fault_flags |= FAULT_FLAG_WRITE;
-			else if (unshare)
-				fault_flags |= FAULT_FLAG_UNSHARE;
-			if (locked) {
-				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-					FAULT_FLAG_KILLABLE;
-				if (flags & FOLL_INTERRUPTIBLE)
-					fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
-			}
-			if (flags & FOLL_NOWAIT)
-				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-					FAULT_FLAG_RETRY_NOWAIT;
-			if (flags & FOLL_TRIED) {
-				/*
-				 * Note: FAULT_FLAG_ALLOW_RETRY and
-				 * FAULT_FLAG_TRIED can co-exist
-				 */
-				fault_flags |= FAULT_FLAG_TRIED;
-			}
-			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
-			if (ret & VM_FAULT_ERROR) {
-				err = vm_fault_to_errno(ret, flags);
-				remainder = 0;
-				break;
-			}
-			if (ret & VM_FAULT_RETRY) {
-				if (locked &&
-				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-					*locked = 0;
-				*nr_pages = 0;
-				/*
-				 * VM_FAULT_RETRY must not return an
-				 * error, it will return zero
-				 * instead.
-				 *
-				 * No need to update "position" as the
-				 * caller will not check it after
-				 * *nr_pages is set to 0.
-				 */
-				return i;
-			}
-			continue;
-		}
-
-		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-		page = pte_page(huge_ptep_get(pte));
-
-		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-			       !PageAnonExclusive(page), page);
-
-		/*
-		 * If subpage information not requested, update counters
-		 * and skip the same_page loop below.
-		 */
-		if (!pages && !pfn_offset &&
-		    (vaddr + huge_page_size(h) < vma->vm_end) &&
-		    (remainder >= pages_per_huge_page(h))) {
-			vaddr += huge_page_size(h);
-			remainder -= pages_per_huge_page(h);
-			i += pages_per_huge_page(h);
-			spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-			continue;
-		}
-
-		/* vaddr may not be aligned to PAGE_SIZE */
-		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
-		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
-		if (pages)
-			record_subpages(nth_page(page, pfn_offset),
-					vma, refs,
-					likely(pages) ? pages + i : NULL);
-
-		if (pages) {
-			/*
-			 * try_grab_folio() should always succeed here,
-			 * because: a) we hold the ptl lock, and b) we've just
-			 * checked that the huge page is present in the page
-			 * tables. If the huge page is present, then the tail
-			 * pages must also be present. The ptl prevents the
-			 * head page and tail pages from being rearranged in
-			 * any way. As this is hugetlb, the pages will never
-			 * be p2pdma or not longterm pinable. So this page
-			 * must be available at this point, unless the page
-			 * refcount overflowed:
-			 */
-			if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
-							 flags))) {
-				spin_unlock(ptl);
-				hugetlb_vma_unlock_read(vma);
-				remainder = 0;
-				err = -ENOMEM;
-				break;
-			}
-		}
-
-		vaddr += (refs << PAGE_SHIFT);
-		remainder -= refs;
-		i += refs;
-
-		spin_unlock(ptl);
-		hugetlb_vma_unlock_read(vma);
-	}
-	*nr_pages = remainder;
-	/*
-	 * setting position is actually required only if remainder is
-	 * not zero but it's faster not to add a "if (remainder)"
-	 * branch.
-	 */
-	*position = vaddr;
-
-	return i ? i : err;
-}
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end,
 		pgprot_t newprot, unsigned long cp_flags)
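One reason the removed function was so large is that it re-implemented, just for hugetlb, the translation from GUP's FOLL_* request flags to the FAULT_FLAG_* values handed to the fault handler, logic the generic GUP fault path also performs. A hedged restatement of that mapping as it appears in the deleted code; the demo_gup_fault_flags() name is invented, 'has_locked' stands for the removed code's "locked != NULL" test, and the sketch assumes it sits in a mm/ source file where these FOLL_* and FAULT_FLAG_* definitions are visible, as they were in mm/hugetlb.c.

/* Sketch of the FOLL_* -> FAULT_FLAG_* mapping from the removed code. */
static inline unsigned int demo_gup_fault_flags(unsigned int flags,
						bool unshare, bool has_locked)
{
	unsigned int fault_flags = 0;

	if (flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	else if (unshare)
		fault_flags |= FAULT_FLAG_UNSHARE;
	if (has_locked) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		if (flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (flags & FOLL_TRIED)
		fault_flags |= FAULT_FLAG_TRIED;	/* can co-exist with ALLOW_RETRY */

	return fault_flags;
}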