x86: rewrite '__copy_user_nocache' function
I didn't really want to do this, but as part of all the other changes to
the user copy loops, I've been looking at this horror.
I tried to clean it up multiple times, but every time I just found more
problems, and the way it's written, it's just too hard to fix them.
For example, the code is written to do quad-word alignment, and will use
regular byte accesses to get to that point. That's fairly simple, but
it means that any initial 8-byte alignment will be done with cached
copies.
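To make that concrete, the alignment prologue boils down to a tiny piece of arithmetic; a rough C rendering (a sketch only, with a made-up function name, not the kernel code) looks like this:

/*
 * Sketch only: how many cached, byte-at-a-time copies the old code
 * performs before the destination is 8-byte aligned.
 */
#include <stdint.h>

unsigned int bytes_to_align8(uintptr_t dst)
{
        unsigned int misalign = dst & 7;        /* movl %edi,%ecx ; andl $7,%ecx */

        return misalign ? 8 - misalign : 0;     /* subl $8,%ecx ; negl %ecx */
}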
However, the code then is very careful to do any 4-byte _tail_ accesses
using an uncached 4-byte write, and that was claimed to be relevant in
commit a82eee7424 ("x86/uaccess/64: Handle the caching of 4-byte
nocache copies properly in __copy_user_nocache()").
So if you do a 4-byte copy using that function, it carefully uses a
4-byte 'movnti' for the destination. But if you were to do a 12-byte
copy that is 4-byte aligned, it would _not_ do a 4-byte 'movnti'
followed by an 8-byte 'movnti' to keep it all uncached.
Instead, it would align the destination to 8 bytes using a
byte-at-a-time loop, and then do an 8-byte 'movnti' for the final 8
bytes.
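For illustration, a rough user-space C model of the partitioning you'd want (and which the rewrite aims for): a sketch only, with an invented function name and a printf trace standing in for the real stores, showing the 12-byte, 4-byte-aligned case.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch only: "nt" marks stores the assembly would do with movnti,
 * "cached" marks ordinary stores; only the partitioning is modeled. */
static void model_copy_nocache(uintptr_t dst, size_t len)
{
        /* head: get the destination 8-byte aligned */
        if ((dst & 1) && len >= 1) { puts("cached 1-byte store"); dst += 1; len -= 1; }
        if ((dst & 2) && len >= 2) { puts("cached 2-byte store"); dst += 2; len -= 2; }
        if ((dst & 4) && len >= 4) { puts("nt 4-byte store");     dst += 4; len -= 4; }
        /* body: 8-byte non-temporal stores (the real code also has a
         * 64-byte unrolled version of this loop) */
        while (len >= 8)           { puts("nt 8-byte store");     dst += 8; len -= 8; }
        /* tail: one more uncached 4-byte store if possible, then
         * cached 2- and 1-byte stores */
        if (len >= 4)              { puts("nt 4-byte store");     dst += 4; len -= 4; }
        if (len >= 2)              { puts("cached 2-byte store"); dst += 2; len -= 2; }
        if (len >= 1)              { puts("cached 1-byte store"); }
}

int main(void)
{
        /* 12 bytes to a 4-byte-aligned destination: this model does one
         * nt 4-byte store plus one nt 8-byte store; the old code instead
         * did four cached byte stores and then the 8-byte nt store. */
        model_copy_nocache(0x1004, 12);
        return 0;
}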
The main caller that cares is __copy_user_flushcache(), which knows
about this insanity, and has odd cases for it all. But I just can't
deal with looking at this kind of "it does one case right, and another
related case entirely wrong".
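For illustration, a sketch of what such a caller has to do. The helper names are placeholders and this is not the actual __copy_user_flushcache() code, just the idea of writing back whatever may have gone through the cache:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void writeback_cache_line(uintptr_t addr)        /* stand-in for clwb/clflushopt */
{
        (void)addr;
}

static size_t nocache_copy_stub(void *dst, const void *src, size_t n)
{
        memcpy(dst, src, n);    /* stand-in for __copy_user_nocache */
        return 0;               /* bytes NOT copied */
}

size_t flushcache_copy_sketch(void *dst, const void *src, size_t n)
{
        size_t rem = nocache_copy_stub(dst, src, n);
        uintptr_t d = (uintptr_t)dst;

        /* An unaligned head or a sub-8-byte tail may have been copied
         * with ordinary cached stores, so write those lines back. */
        if (n && (d & 7))
                writeback_cache_line(d & ~(uintptr_t)63);
        if (n && ((d + n) & 7))
                writeback_cache_line((d + n - 1) & ~(uintptr_t)63);
        return rem;
}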
And the code really wasn't fixable without hard drugs, which I try to
avoid.
So instead, rewrite it in a form that hopefully not only gets this
right, but is a bit more maintainable. Knock wood.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent e1f2750edc
commit 034ff37d34
3 changed files with 243 additions and 214 deletions
arch/x86/lib/Makefile
@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
 endif
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o
+        lib-y += copy_user_64.o copy_user_uncached_64.o
         lib-y += cmpxchg16b_emu.o
 endif
arch/x86/lib/copy_user_64.S
@@ -7,15 +7,8 @@
  */

 #include <linux/linkage.h>
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative.h>
 #include <asm/asm.h>
-#include <asm/smap.h>
 #include <asm/export.h>
-#include <asm/trapnr.h>

 /*
  * rep_movs_alternative - memory copy with exception handling.
@@ -119,209 +112,3 @@ SYM_FUNC_START(rep_movs_alternative)
         _ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
 SYM_FUNC_END(rep_movs_alternative)
 EXPORT_SYMBOL(rep_movs_alternative)
-
-/*
- * The uncached copy needs to align the destination for
- * movnti and friends.
- */
-.macro ALIGN_DESTINATION
-        /* check for bad alignment of destination */
-        movl %edi,%ecx
-        andl $7,%ecx
-        jz 102f                         /* already aligned */
-        subl $8,%ecx
-        negl %ecx
-        subl %ecx,%edx
-100:    movb (%rsi),%al
-101:    movb %al,(%rdi)
-        incq %rsi
-        incq %rdi
-        decl %ecx
-        jnz 100b
-102:
-
-        _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
-        _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination out of cache for more performance.
- *
- * Note: Cached memory copy is used when destination or size is not
- * naturally aligned. That is:
- *  - Require 8-byte alignment when size is 8 bytes or larger.
- *  - Require 4-byte alignment when size is 4 bytes.
- */
-SYM_FUNC_START(__copy_user_nocache)
-        /* If size is less than 8 bytes, go to 4-byte copy */
-        cmpl $8,%edx
-        jb .L_4b_nocache_copy_entry
-
-        /* If destination is not 8-byte aligned, "cache" copy to align it */
-        ALIGN_DESTINATION
-
-        /* Set 4x8-byte copy count and remainder */
-        movl %edx,%ecx
-        andl $63,%edx
-        shrl $6,%ecx
-        jz .L_8b_nocache_copy_entry     /* jump if count is 0 */
-
-        /* Perform 4x8-byte nocache loop-copy */
-.L_4x8b_nocache_copy_loop:
-1:      movq (%rsi),%r8
-2:      movq 1*8(%rsi),%r9
-3:      movq 2*8(%rsi),%r10
-4:      movq 3*8(%rsi),%r11
-5:      movnti %r8,(%rdi)
-6:      movnti %r9,1*8(%rdi)
-7:      movnti %r10,2*8(%rdi)
-8:      movnti %r11,3*8(%rdi)
-9:      movq 4*8(%rsi),%r8
-10:     movq 5*8(%rsi),%r9
-11:     movq 6*8(%rsi),%r10
-12:     movq 7*8(%rsi),%r11
-13:     movnti %r8,4*8(%rdi)
-14:     movnti %r9,5*8(%rdi)
-15:     movnti %r10,6*8(%rdi)
-16:     movnti %r11,7*8(%rdi)
-        leaq 64(%rsi),%rsi
-        leaq 64(%rdi),%rdi
-        decl %ecx
-        jnz .L_4x8b_nocache_copy_loop
-
-        /* Set 8-byte copy count and remainder */
-.L_8b_nocache_copy_entry:
-        movl %edx,%ecx
-        andl $7,%edx
-        shrl $3,%ecx
-        jz .L_4b_nocache_copy_entry     /* jump if count is 0 */
-
-        /* Perform 8-byte nocache loop-copy */
-.L_8b_nocache_copy_loop:
-20:     movq (%rsi),%r8
-21:     movnti %r8,(%rdi)
-        leaq 8(%rsi),%rsi
-        leaq 8(%rdi),%rdi
-        decl %ecx
-        jnz .L_8b_nocache_copy_loop
-
-        /* If no byte left, we're done */
-.L_4b_nocache_copy_entry:
-        andl %edx,%edx
-        jz .L_finish_copy
-
-        /* If destination is not 4-byte aligned, go to byte copy: */
-        movl %edi,%ecx
-        andl $3,%ecx
-        jnz .L_1b_cache_copy_entry
-
-        /* Set 4-byte copy count (1 or 0) and remainder */
-        movl %edx,%ecx
-        andl $3,%edx
-        shrl $2,%ecx
-        jz .L_1b_cache_copy_entry       /* jump if count is 0 */
-
-        /* Perform 4-byte nocache copy: */
-30:     movl (%rsi),%r8d
-31:     movnti %r8d,(%rdi)
-        leaq 4(%rsi),%rsi
-        leaq 4(%rdi),%rdi
-
-        /* If no bytes left, we're done: */
-        andl %edx,%edx
-        jz .L_finish_copy
-
-        /* Perform byte "cache" loop-copy for the remainder */
-.L_1b_cache_copy_entry:
-        movl %edx,%ecx
-.L_1b_cache_copy_loop:
-40:     movb (%rsi),%al
-41:     movb %al,(%rdi)
-        incq %rsi
-        incq %rdi
-        decl %ecx
-        jnz .L_1b_cache_copy_loop
-
-        /* Finished copying; fence the prior stores */
-.L_finish_copy:
-        xorl %eax,%eax
-        sfence
-        RET
-
-.L_fixup_4x8b_copy:
-        shll $6,%ecx
-        addl %ecx,%edx
-        jmp .L_fixup_handle_tail
-.L_fixup_8b_copy:
-        lea (%rdx,%rcx,8),%rdx
-        jmp .L_fixup_handle_tail
-.L_fixup_4b_copy:
-        lea (%rdx,%rcx,4),%rdx
-        jmp .L_fixup_handle_tail
-.L_fixup_1b_copy:
-        movl %ecx,%edx
-.L_fixup_handle_tail:
-        sfence
-        jmp .Lcopy_user_handle_tail
-
-        _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
-        _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
-        _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
-        _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
-        _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
-        _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
-        _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
-
-/*
- * Try to copy last bytes.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-.Lcopy_user_handle_tail:
-        cmp $X86_TRAP_MC,%eax
-        je 3f
-
-        movl %edx,%ecx
-1:      rep movsb
-2:      mov %ecx,%eax
-        RET
-
-3:
-        movl %edx,%eax
-        RET
-
-        _ASM_EXTABLE_CPY(1b, 2b)
-
-.Lcopy_user_handle_align:
-        addl %ecx,%edx
-        jmp .Lcopy_user_handle_tail
-
-SYM_FUNC_END(__copy_user_nocache)
-EXPORT_SYMBOL(__copy_user_nocache)
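A side note on the .L_fixup_* labels in the hunk above: when a fault hits one of the loops, %ecx still holds the number of chunks not yet copied and %edx the leftover byte count, so each fixup rebuilds "bytes remaining" before jumping to the byte-wise tail handler. Roughly, in C (a sketch, not from the commit):

/*
 * Sketch of the removed fixup paths:
 *   .L_fixup_4x8b_copy: shll $6,%ecx ; addl %ecx,%edx   (chunk_size 64)
 *   .L_fixup_8b_copy:   lea (%rdx,%rcx,8),%rdx          (chunk_size 8)
 *   .L_fixup_4b_copy:   lea (%rdx,%rcx,4),%rdx          (chunk_size 4)
 *   .L_fixup_1b_copy:   movl %ecx,%edx                  (chunk_size 1, remainder 0)
 */
unsigned int fixup_remaining(unsigned int chunks_left, unsigned int chunk_size,
                             unsigned int remainder)
{
        return remainder + chunks_left * chunk_size;
}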
arch/x86/lib/copy_user_uncached_64.S (new file, 242 lines)
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ *
+ * This copies from user space into kernel space, but the kernel
+ * space accesses can take a machine check exception, so they too
+ * need exception handling.
+ *
+ * Note: only 32-bit and 64-bit stores have non-temporal versions,
+ * and we only use aligned versions. Any unaligned parts at the
+ * start or end of the copy will be done using normal cached stores.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * edx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__copy_user_nocache)
+        /* If destination is not 8-byte aligned, we'll have to align it */
+        testb $7,%dil
+        jne .Lalign
+
+.Lis_aligned:
+        cmp $64,%edx
+        jb .Lquadwords
+
+        .p2align 4,0x90
+.Lunrolled:
+10:     movq (%rsi),%r8
+11:     movq 8(%rsi),%r9
+12:     movq 16(%rsi),%r10
+13:     movq 24(%rsi),%r11
+20:     movnti %r8,(%rdi)
+21:     movnti %r9,8(%rdi)
+22:     movnti %r10,16(%rdi)
+23:     movnti %r11,24(%rdi)
+30:     movq 32(%rsi),%r8
+31:     movq 40(%rsi),%r9
+32:     movq 48(%rsi),%r10
+33:     movq 56(%rsi),%r11
+40:     movnti %r8,32(%rdi)
+41:     movnti %r9,40(%rdi)
+42:     movnti %r10,48(%rdi)
+43:     movnti %r11,56(%rdi)
+
+        addq $64,%rsi
+        addq $64,%rdi
+        sub $64,%edx
+        cmp $64,%edx
+        jae .Lunrolled
+
+/*
+ * First set of user mode loads have been done
+ * without any stores, so if they fail, we can
+ * just try the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(10b, .Lquadwords)
+_ASM_EXTABLE_UA(11b, .Lquadwords)
+_ASM_EXTABLE_UA(12b, .Lquadwords)
+_ASM_EXTABLE_UA(13b, .Lquadwords)
+
+/*
+ * The second set of user mode loads have been
+ * done with 32 bytes stored to the destination,
+ * so we need to take that into account before
+ * falling back to the unrolled loop.
+ */
+_ASM_EXTABLE_UA(30b, .Lfixup32)
+_ASM_EXTABLE_UA(31b, .Lfixup32)
+_ASM_EXTABLE_UA(32b, .Lfixup32)
+_ASM_EXTABLE_UA(33b, .Lfixup32)
+
+/*
+ * An exception on a write means that we're
+ * done, but we need to update the count
+ * depending on where in the unrolled loop
+ * we were.
+ */
+_ASM_EXTABLE_UA(20b, .Ldone0)
+_ASM_EXTABLE_UA(21b, .Ldone8)
+_ASM_EXTABLE_UA(22b, .Ldone16)
+_ASM_EXTABLE_UA(23b, .Ldone24)
+_ASM_EXTABLE_UA(40b, .Ldone32)
+_ASM_EXTABLE_UA(41b, .Ldone40)
+_ASM_EXTABLE_UA(42b, .Ldone48)
+_ASM_EXTABLE_UA(43b, .Ldone56)
+
+.Lquadwords:
+        cmp $8,%edx
+        jb .Llong
+50:     movq (%rsi),%rax
+51:     movnti %rax,(%rdi)
+        addq $8,%rsi
+        addq $8,%rdi
+        sub $8,%edx
+        jmp .Lquadwords
+
+/*
+ * If we fail on the last full quadword, we will
+ * not try to do any byte-wise cached accesses.
+ * We will try to do one more 4-byte uncached
+ * one, though.
+ */
+_ASM_EXTABLE_UA(50b, .Llast4)
+_ASM_EXTABLE_UA(51b, .Ldone0)
+
+.Llong:
+        test $4,%dl
+        je .Lword
+60:     movl (%rsi),%eax
+61:     movnti %eax,(%rdi)
+        addq $4,%rsi
+        addq $4,%rdi
+        sub $4,%edx
+.Lword:
+        sfence
+        test $2,%dl
+        je .Lbyte
+70:     movw (%rsi),%ax
+71:     movw %ax,(%rdi)
+        addq $2,%rsi
+        addq $2,%rdi
+        sub $2,%edx
+.Lbyte:
+        test $1,%dl
+        je .Ldone
+80:     movb (%rsi),%al
+81:     movb %al,(%rdi)
+        dec %edx
+.Ldone:
+        mov %edx,%eax
+        RET
+
+/*
+ * If we fail on the last four bytes, we won't
+ * bother with any fixups. It's dead, Jim. Note
+ * that there's no need for 'sfence' for any
+ * of this, since the exception will have been
+ * serializing.
+ */
+_ASM_EXTABLE_UA(60b, .Ldone)
+_ASM_EXTABLE_UA(61b, .Ldone)
+_ASM_EXTABLE_UA(70b, .Ldone)
+_ASM_EXTABLE_UA(71b, .Ldone)
+_ASM_EXTABLE_UA(80b, .Ldone)
+_ASM_EXTABLE_UA(81b, .Ldone)
+
+/*
+ * This is the "head needs aligning" case when
+ * the destination isn't 8-byte aligned. The
+ * 4-byte case can be done uncached, but any
+ * smaller alignment is done with regular stores.
+ */
+.Lalign:
+        test $1,%dil
+        je .Lalign_word
+        test %edx,%edx
+        je .Ldone
+90:     movb (%rsi),%al
+91:     movb %al,(%rdi)
+        inc %rsi
+        inc %rdi
+        dec %edx
+.Lalign_word:
+        test $2,%dil
+        je .Lalign_long
+        cmp $2,%edx
+        jb .Lbyte
+92:     movw (%rsi),%ax
+93:     movw %ax,(%rdi)
+        addq $2,%rsi
+        addq $2,%rdi
+        sub $2,%edx
+.Lalign_long:
+        test $4,%dil
+        je .Lis_aligned
+        cmp $4,%edx
+        jb .Lword
+94:     movl (%rsi),%eax
+95:     movnti %eax,(%rdi)
+        addq $4,%rsi
+        addq $4,%rdi
+        sub $4,%edx
+        jmp .Lis_aligned
+
+/*
+ * If we fail on the initial alignment accesses,
+ * we're all done. Again, no point in trying to
+ * do byte-by-byte probing if the 4-byte load
+ * fails - we're not doing any uncached accesses
+ * any more.
+ */
+_ASM_EXTABLE_UA(90b, .Ldone)
+_ASM_EXTABLE_UA(91b, .Ldone)
+_ASM_EXTABLE_UA(92b, .Ldone)
+_ASM_EXTABLE_UA(93b, .Ldone)
+_ASM_EXTABLE_UA(94b, .Ldone)
+_ASM_EXTABLE_UA(95b, .Ldone)
+
+/*
+ * Exception table fixups for faults in the middle
+ */
+.Ldone56: sub $8,%edx
+.Ldone48: sub $8,%edx
+.Ldone40: sub $8,%edx
+.Ldone32: sub $8,%edx
+.Ldone24: sub $8,%edx
+.Ldone16: sub $8,%edx
+.Ldone8: sub $8,%edx
+.Ldone0:
+        mov %edx,%eax
+        RET
+
+.Lfixup32:
+        addq $32,%rsi
+        addq $32,%rdi
+        sub $32,%edx
+        jmp .Lquadwords
+
+.Llast4:
+52:     movl (%rsi),%eax
+53:     movnti %eax,(%rdi)
+        sfence
+        sub $4,%edx
+        mov %edx,%eax
+        RET
+_ASM_EXTABLE_UA(52b, .Ldone0)
+_ASM_EXTABLE_UA(53b, .Ldone0)
+
+SYM_FUNC_END(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
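One detail of the new file worth spelling out: the .Ldone56 ... .Ldone8 labels deliberately fall through, so a store that faults at byte offset N within the 64-byte unrolled block executes N/8 of the 'sub $8,%edx' instructions - exactly the bytes already written - and .Lfixup32 steps past the first half-block before retrying the plain quadword loop. In rough C terms (a sketch, not from the commit):

/*
 * Sketch: 'count' is the bytes still to copy when the 64-byte
 * unrolled block started, 'fault_offset' is which movnti store
 * faulted (0, 8, ..., 56).  The fall-through chain of 'sub $8,%edx'
 * at .Ldone56...Ldone8 leaves exactly this value in %edx.
 */
unsigned int unrolled_store_fault_remaining(unsigned int count, unsigned int fault_offset)
{
        return count - fault_offset;    /* bytes not yet written */
}

/*
 * .Lfixup32: a load in the second half faulted after the first 32
 * bytes were already stored, so step past them and retry the rest
 * with the simple 8-byte loop (.Lquadwords).
 */
unsigned int second_half_load_fault_remaining(unsigned int count)
{
        return count - 32;
}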