linux/tools/include/nolibc/arch-x86.h

388 lines
18 KiB
C
Raw Permalink Normal View History

/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
/*
* x86 specific definitions for NOLIBC (both 32- and 64-bit)
* Copyright (C) 2017-2025 Willy Tarreau <w@1wt.eu>
*/
#ifndef _NOLIBC_ARCH_X86_H
#define _NOLIBC_ARCH_X86_H
#include "compiler.h"
#include "crt.h"
#if !defined(__x86_64__)
/* Syscalls for i386 :
* - mostly similar to x86_64
* - registers are 32-bit
* - syscall number is passed in eax
* - arguments are in ebx, ecx, edx, esi, edi, ebp respectively
* - all registers are preserved (except eax of course)
* - the system call is performed by calling int $0x80
* - syscall return comes in eax
* - the arguments are cast to long and assigned into the target registers
* which are then simply passed as registers to the asm code, so that we
* don't have to experience issues with register constraints.
* - the syscall number is always specified last in order to allow to force
* some registers before (gcc refuses a %-register at the last position).
*
* Also, i386 supports the old_select syscall if newselect is not available
*/
#define __ARCH_WANT_SYS_OLD_SELECT
#define my_syscall0(num) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall1(num, arg1) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
register long _arg1 __asm__ ("ebx") = (long)(arg1); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "r"(_arg1), \
"0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall2(num, arg1, arg2) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
register long _arg1 __asm__ ("ebx") = (long)(arg1); \
register long _arg2 __asm__ ("ecx") = (long)(arg2); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "r"(_arg1), "r"(_arg2), \
"0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall3(num, arg1, arg2, arg3) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
register long _arg1 __asm__ ("ebx") = (long)(arg1); \
register long _arg2 __asm__ ("ecx") = (long)(arg2); \
register long _arg3 __asm__ ("edx") = (long)(arg3); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), \
"0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall4(num, arg1, arg2, arg3, arg4) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
register long _arg1 __asm__ ("ebx") = (long)(arg1); \
register long _arg2 __asm__ ("ecx") = (long)(arg2); \
register long _arg3 __asm__ ("edx") = (long)(arg3); \
register long _arg4 __asm__ ("esi") = (long)(arg4); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \
"0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \
({ \
long _ret; \
register long _num __asm__ ("eax") = (num); \
register long _arg1 __asm__ ("ebx") = (long)(arg1); \
register long _arg2 __asm__ ("ecx") = (long)(arg2); \
register long _arg3 __asm__ ("edx") = (long)(arg3); \
register long _arg4 __asm__ ("esi") = (long)(arg4); \
register long _arg5 __asm__ ("edi") = (long)(arg5); \
\
__asm__ volatile ( \
"int $0x80\n" \
: "=a" (_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
"0"(_num) \
: "memory", "cc" \
); \
_ret; \
})
#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
long _eax = (long)(num); \
long _arg6 = (long)(arg6); /* Always in memory */ \
__asm__ volatile ( \
"pushl %[_arg6]\n\t" \
"pushl %%ebp\n\t" \
"movl 4(%%esp),%%ebp\n\t" \
"int $0x80\n\t" \
"popl %%ebp\n\t" \
"addl $4,%%esp\n\t" \
: "+a"(_eax) /* %eax */ \
: "b"(arg1), /* %ebx */ \
"c"(arg2), /* %ecx */ \
"d"(arg3), /* %edx */ \
"S"(arg4), /* %esi */ \
"D"(arg5), /* %edi */ \
[_arg6]"m"(_arg6) /* memory */ \
: "memory", "cc" \
); \
_eax; \
})
/* startup code */
/*
* i386 System V ABI mandates:
* 1) last pushed argument must be 16-byte aligned.
* 2) The deepest stack frame should be set to zero
*
*/
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
__asm__ volatile (
"xor %ebp, %ebp\n" /* zero the stack frame */
"mov %esp, %eax\n" /* save stack pointer to %eax, as arg1 of _start_c */
"sub $12, %esp\n" /* sub 12 to keep it aligned after the push %eax */
"push %eax\n" /* push arg1 on stack to support plain stack modes too */
"call _start_c\n" /* transfer to c runtime */
"hlt\n" /* ensure it does not return */
);
__nolibc_entrypoint_epilogue();
}
#else /* !defined(__x86_64__) */
/* Syscalls for x86_64 :
* - registers are 64-bit
* - syscall number is passed in rax
* - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively
* - the system call is performed by calling the syscall instruction
* - syscall return comes in rax
* - rcx and r11 are clobbered, others are preserved.
* - the arguments are cast to long and assigned into the target registers
* which are then simply passed as registers to the asm code, so that we
* don't have to experience issues with register constraints.
* - the syscall number is always specified last in order to allow to force
* some registers before (gcc refuses a %-register at the last position).
* - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1
* Calling Conventions.
*
* Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/home
*
*/
#define my_syscall0(num) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall1(num, arg1) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), \
"0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall2(num, arg1, arg2) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
register long _arg2 __asm__ ("rsi") = (long)(arg2); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), "r"(_arg2), \
"0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall3(num, arg1, arg2, arg3) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
register long _arg2 __asm__ ("rsi") = (long)(arg2); \
register long _arg3 __asm__ ("rdx") = (long)(arg3); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), \
"0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall4(num, arg1, arg2, arg3, arg4) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
register long _arg2 __asm__ ("rsi") = (long)(arg2); \
register long _arg3 __asm__ ("rdx") = (long)(arg3); \
register long _arg4 __asm__ ("r10") = (long)(arg4); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \
"0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
register long _arg2 __asm__ ("rsi") = (long)(arg2); \
register long _arg3 __asm__ ("rdx") = (long)(arg3); \
register long _arg4 __asm__ ("r10") = (long)(arg4); \
register long _arg5 __asm__ ("r8") = (long)(arg5); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
"0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
long _ret; \
register long _num __asm__ ("rax") = (num); \
register long _arg1 __asm__ ("rdi") = (long)(arg1); \
register long _arg2 __asm__ ("rsi") = (long)(arg2); \
register long _arg3 __asm__ ("rdx") = (long)(arg3); \
register long _arg4 __asm__ ("r10") = (long)(arg4); \
register long _arg5 __asm__ ("r8") = (long)(arg5); \
register long _arg6 __asm__ ("r9") = (long)(arg6); \
\
__asm__ volatile ( \
"syscall\n" \
: "=a"(_ret) \
: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
"r"(_arg6), "0"(_num) \
: "rcx", "r11", "memory", "cc" \
); \
_ret; \
})
/* startup code */
/*
* x86-64 System V ABI mandates:
* 1) %rsp must be 16-byte aligned right before the function call.
* 2) The deepest stack frame should be zero (the %rbp).
*
*/
void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
tools/nolibc: make compiler and assembler agree on the section around _start The out-of-block asm() statement carrying _start does not allow the compiler to know what section the assembly code is being emitted to, and there's no easy way to push/pop the current section and restore it. It sometimes causes issues depending on the include files ordering and compiler optimizations. For example if a variable is declared immediately before the asm() block and another one after, the compiler assumes that the current section is still .bss and doesn't re-emit it, making the second variable appear inside the .text section instead. Forcing .bss at the end of the _start block doesn't work either because at certain optimizations the compiler may reorder blocks and will make some real code appear just after this block. A significant number of solutions were attempted, but many of them were still sensitive to section reordering. In the end, the best way to make sure the compiler and assembler agree on the current section is to place this code inside a function. Here the function is directly called _start and configured not to emit a frame-pointer, hence to have no prologue. If some future architectures would still emit some prologue, another working approach consists in naming the function differently and placing the _start label inside the asm statement. But the current solution is simpler. It was tested with nolibc-test at -O,-O0,-O2,-O3,-Os for arm,arm64,i386, mips,riscv,s390 and x86_64. Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2023-01-10 08:24:13 +01:00
{
__asm__ volatile (
"xor %ebp, %ebp\n" /* zero the stack frame */
"mov %rsp, %rdi\n" /* save stack pointer to %rdi, as arg1 of _start_c */
"call _start_c\n" /* transfer to c runtime */
"hlt\n" /* ensure it does not return */
tools/nolibc: make compiler and assembler agree on the section around _start The out-of-block asm() statement carrying _start does not allow the compiler to know what section the assembly code is being emitted to, and there's no easy way to push/pop the current section and restore it. It sometimes causes issues depending on the include files ordering and compiler optimizations. For example if a variable is declared immediately before the asm() block and another one after, the compiler assumes that the current section is still .bss and doesn't re-emit it, making the second variable appear inside the .text section instead. Forcing .bss at the end of the _start block doesn't work either because at certain optimizations the compiler may reorder blocks and will make some real code appear just after this block. A significant number of solutions were attempted, but many of them were still sensitive to section reordering. In the end, the best way to make sure the compiler and assembler agree on the current section is to place this code inside a function. Here the function is directly called _start and configured not to emit a frame-pointer, hence to have no prologue. If some future architectures would still emit some prologue, another working approach consists in naming the function differently and placing the _start label inside the asm statement. But the current solution is simpler. It was tested with nolibc-test at -O,-O0,-O2,-O3,-Os for arm,arm64,i386, mips,riscv,s390 and x86_64. Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2023-01-10 08:24:13 +01:00
);
__nolibc_entrypoint_epilogue();
tools/nolibc: make compiler and assembler agree on the section around _start The out-of-block asm() statement carrying _start does not allow the compiler to know what section the assembly code is being emitted to, and there's no easy way to push/pop the current section and restore it. It sometimes causes issues depending on the include files ordering and compiler optimizations. For example if a variable is declared immediately before the asm() block and another one after, the compiler assumes that the current section is still .bss and doesn't re-emit it, making the second variable appear inside the .text section instead. Forcing .bss at the end of the _start block doesn't work either because at certain optimizations the compiler may reorder blocks and will make some real code appear just after this block. A significant number of solutions were attempted, but many of them were still sensitive to section reordering. In the end, the best way to make sure the compiler and assembler agree on the current section is to place this code inside a function. Here the function is directly called _start and configured not to emit a frame-pointer, hence to have no prologue. If some future architectures would still emit some prologue, another working approach consists in naming the function differently and placing the _start label inside the asm statement. But the current solution is simpler. It was tested with nolibc-test at -O,-O0,-O2,-O3,-Os for arm,arm64,i386, mips,riscv,s390 and x86_64. Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
2023-01-10 08:24:13 +01:00
}
tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()` Simplify memcpy() and memmove() on the x86-64 arch. The x86-64 arch has a 'rep movsb' instruction, which can perform memcpy() using only a single instruction, given: %rdi = destination %rsi = source %rcx = length Additionally, it can also handle the overlapping case by setting DF=1 (backward copy), which can be used as the memmove() implementation. Before this patch: ``` 00000000000010ab <memmove>: 10ab: 48 89 f8 mov %rdi,%rax 10ae: 31 c9 xor %ecx,%ecx 10b0: 48 39 f7 cmp %rsi,%rdi 10b3: 48 83 d1 ff adc $0xffffffffffffffff,%rcx 10b7: 48 85 d2 test %rdx,%rdx 10ba: 74 25 je 10e1 <memmove+0x36> 10bc: 48 83 c9 01 or $0x1,%rcx 10c0: 48 39 f0 cmp %rsi,%rax 10c3: 48 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%rdi 10ca: 48 0f 43 fa cmovae %rdx,%rdi 10ce: 48 01 cf add %rcx,%rdi 10d1: 44 8a 04 3e mov (%rsi,%rdi,1),%r8b 10d5: 44 88 04 38 mov %r8b,(%rax,%rdi,1) 10d9: 48 01 cf add %rcx,%rdi 10dc: 48 ff ca dec %rdx 10df: 75 f0 jne 10d1 <memmove+0x26> 10e1: c3 ret 00000000000010e2 <memcpy>: 10e2: 48 89 f8 mov %rdi,%rax 10e5: 48 85 d2 test %rdx,%rdx 10e8: 74 12 je 10fc <memcpy+0x1a> 10ea: 31 c9 xor %ecx,%ecx 10ec: 40 8a 3c 0e mov (%rsi,%rcx,1),%dil 10f0: 40 88 3c 08 mov %dil,(%rax,%rcx,1) 10f4: 48 ff c1 inc %rcx 10f7: 48 39 ca cmp %rcx,%rdx 10fa: 75 f0 jne 10ec <memcpy+0xa> 10fc: c3 ret ``` After this patch: ``` // memmove is an alias for memcpy 000000000040133b <memcpy>: 40133b: 48 89 d1 mov %rdx,%rcx 40133e: 48 89 f8 mov %rdi,%rax 401341: 48 89 fa mov %rdi,%rdx 401344: 48 29 f2 sub %rsi,%rdx 401347: 48 39 ca cmp %rcx,%rdx 40134a: 72 03 jb 40134f <memcpy+0x14> 40134c: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40134e: c3 ret 40134f: 48 8d 7c 0f ff lea -0x1(%rdi,%rcx,1),%rdi 401354: 48 8d 74 0e ff lea -0x1(%rsi,%rcx,1),%rsi 401359: fd std 40135a: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40135c: fc cld 40135d: c3 ret ``` v3: - Make memmove as an alias for memcpy (Willy). - Make the forward copy the likely case (Alviro). v2: - Fix the broken memmove implementation (David). Link: https://lore.kernel.org/lkml/20230902062237.GA23141@1wt.eu Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com Suggested-by: David Laight <David.Laight@aculab.com> Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org> Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
2023-09-02 20:35:02 +07:00
#define NOLIBC_ARCH_HAS_MEMMOVE
void *memmove(void *dst, const void *src, size_t len);
#define NOLIBC_ARCH_HAS_MEMCPY
void *memcpy(void *dst, const void *src, size_t len);
#define NOLIBC_ARCH_HAS_MEMSET
void *memset(void *dst, int c, size_t len);
tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()` Simplify memcpy() and memmove() on the x86-64 arch. The x86-64 arch has a 'rep movsb' instruction, which can perform memcpy() using only a single instruction, given: %rdi = destination %rsi = source %rcx = length Additionally, it can also handle the overlapping case by setting DF=1 (backward copy), which can be used as the memmove() implementation. Before this patch: ``` 00000000000010ab <memmove>: 10ab: 48 89 f8 mov %rdi,%rax 10ae: 31 c9 xor %ecx,%ecx 10b0: 48 39 f7 cmp %rsi,%rdi 10b3: 48 83 d1 ff adc $0xffffffffffffffff,%rcx 10b7: 48 85 d2 test %rdx,%rdx 10ba: 74 25 je 10e1 <memmove+0x36> 10bc: 48 83 c9 01 or $0x1,%rcx 10c0: 48 39 f0 cmp %rsi,%rax 10c3: 48 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%rdi 10ca: 48 0f 43 fa cmovae %rdx,%rdi 10ce: 48 01 cf add %rcx,%rdi 10d1: 44 8a 04 3e mov (%rsi,%rdi,1),%r8b 10d5: 44 88 04 38 mov %r8b,(%rax,%rdi,1) 10d9: 48 01 cf add %rcx,%rdi 10dc: 48 ff ca dec %rdx 10df: 75 f0 jne 10d1 <memmove+0x26> 10e1: c3 ret 00000000000010e2 <memcpy>: 10e2: 48 89 f8 mov %rdi,%rax 10e5: 48 85 d2 test %rdx,%rdx 10e8: 74 12 je 10fc <memcpy+0x1a> 10ea: 31 c9 xor %ecx,%ecx 10ec: 40 8a 3c 0e mov (%rsi,%rcx,1),%dil 10f0: 40 88 3c 08 mov %dil,(%rax,%rcx,1) 10f4: 48 ff c1 inc %rcx 10f7: 48 39 ca cmp %rcx,%rdx 10fa: 75 f0 jne 10ec <memcpy+0xa> 10fc: c3 ret ``` After this patch: ``` // memmove is an alias for memcpy 000000000040133b <memcpy>: 40133b: 48 89 d1 mov %rdx,%rcx 40133e: 48 89 f8 mov %rdi,%rax 401341: 48 89 fa mov %rdi,%rdx 401344: 48 29 f2 sub %rsi,%rdx 401347: 48 39 ca cmp %rcx,%rdx 40134a: 72 03 jb 40134f <memcpy+0x14> 40134c: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40134e: c3 ret 40134f: 48 8d 7c 0f ff lea -0x1(%rdi,%rcx,1),%rdi 401354: 48 8d 74 0e ff lea -0x1(%rsi,%rcx,1),%rsi 401359: fd std 40135a: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40135c: fc cld 40135d: c3 ret ``` v3: - Make memmove as an alias for memcpy (Willy). - Make the forward copy the likely case (Alviro). v2: - Fix the broken memmove implementation (David). Link: https://lore.kernel.org/lkml/20230902062237.GA23141@1wt.eu Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com Suggested-by: David Laight <David.Laight@aculab.com> Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org> Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
2023-09-02 20:35:02 +07:00
__asm__ (
".section .text.nolibc_memmove_memcpy\n"
".weak memmove\n"
".weak memcpy\n"
"memmove:\n"
"memcpy:\n"
"movq %rdx, %rcx\n\t"
"movq %rdi, %rax\n\t"
"movq %rdi, %rdx\n\t"
"subq %rsi, %rdx\n\t"
"cmpq %rcx, %rdx\n\t"
"jb 1f\n\t"
tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()` Simplify memcpy() and memmove() on the x86-64 arch. The x86-64 arch has a 'rep movsb' instruction, which can perform memcpy() using only a single instruction, given: %rdi = destination %rsi = source %rcx = length Additionally, it can also handle the overlapping case by setting DF=1 (backward copy), which can be used as the memmove() implementation. Before this patch: ``` 00000000000010ab <memmove>: 10ab: 48 89 f8 mov %rdi,%rax 10ae: 31 c9 xor %ecx,%ecx 10b0: 48 39 f7 cmp %rsi,%rdi 10b3: 48 83 d1 ff adc $0xffffffffffffffff,%rcx 10b7: 48 85 d2 test %rdx,%rdx 10ba: 74 25 je 10e1 <memmove+0x36> 10bc: 48 83 c9 01 or $0x1,%rcx 10c0: 48 39 f0 cmp %rsi,%rax 10c3: 48 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%rdi 10ca: 48 0f 43 fa cmovae %rdx,%rdi 10ce: 48 01 cf add %rcx,%rdi 10d1: 44 8a 04 3e mov (%rsi,%rdi,1),%r8b 10d5: 44 88 04 38 mov %r8b,(%rax,%rdi,1) 10d9: 48 01 cf add %rcx,%rdi 10dc: 48 ff ca dec %rdx 10df: 75 f0 jne 10d1 <memmove+0x26> 10e1: c3 ret 00000000000010e2 <memcpy>: 10e2: 48 89 f8 mov %rdi,%rax 10e5: 48 85 d2 test %rdx,%rdx 10e8: 74 12 je 10fc <memcpy+0x1a> 10ea: 31 c9 xor %ecx,%ecx 10ec: 40 8a 3c 0e mov (%rsi,%rcx,1),%dil 10f0: 40 88 3c 08 mov %dil,(%rax,%rcx,1) 10f4: 48 ff c1 inc %rcx 10f7: 48 39 ca cmp %rcx,%rdx 10fa: 75 f0 jne 10ec <memcpy+0xa> 10fc: c3 ret ``` After this patch: ``` // memmove is an alias for memcpy 000000000040133b <memcpy>: 40133b: 48 89 d1 mov %rdx,%rcx 40133e: 48 89 f8 mov %rdi,%rax 401341: 48 89 fa mov %rdi,%rdx 401344: 48 29 f2 sub %rsi,%rdx 401347: 48 39 ca cmp %rcx,%rdx 40134a: 72 03 jb 40134f <memcpy+0x14> 40134c: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40134e: c3 ret 40134f: 48 8d 7c 0f ff lea -0x1(%rdi,%rcx,1),%rdi 401354: 48 8d 74 0e ff lea -0x1(%rsi,%rcx,1),%rsi 401359: fd std 40135a: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40135c: fc cld 40135d: c3 ret ``` v3: - Make memmove as an alias for memcpy (Willy). - Make the forward copy the likely case (Alviro). v2: - Fix the broken memmove implementation (David). Link: https://lore.kernel.org/lkml/20230902062237.GA23141@1wt.eu Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com Suggested-by: David Laight <David.Laight@aculab.com> Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org> Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
2023-09-02 20:35:02 +07:00
"rep movsb\n\t"
"retq\n"
"1:" /* backward copy */
tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()` Simplify memcpy() and memmove() on the x86-64 arch. The x86-64 arch has a 'rep movsb' instruction, which can perform memcpy() using only a single instruction, given: %rdi = destination %rsi = source %rcx = length Additionally, it can also handle the overlapping case by setting DF=1 (backward copy), which can be used as the memmove() implementation. Before this patch: ``` 00000000000010ab <memmove>: 10ab: 48 89 f8 mov %rdi,%rax 10ae: 31 c9 xor %ecx,%ecx 10b0: 48 39 f7 cmp %rsi,%rdi 10b3: 48 83 d1 ff adc $0xffffffffffffffff,%rcx 10b7: 48 85 d2 test %rdx,%rdx 10ba: 74 25 je 10e1 <memmove+0x36> 10bc: 48 83 c9 01 or $0x1,%rcx 10c0: 48 39 f0 cmp %rsi,%rax 10c3: 48 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%rdi 10ca: 48 0f 43 fa cmovae %rdx,%rdi 10ce: 48 01 cf add %rcx,%rdi 10d1: 44 8a 04 3e mov (%rsi,%rdi,1),%r8b 10d5: 44 88 04 38 mov %r8b,(%rax,%rdi,1) 10d9: 48 01 cf add %rcx,%rdi 10dc: 48 ff ca dec %rdx 10df: 75 f0 jne 10d1 <memmove+0x26> 10e1: c3 ret 00000000000010e2 <memcpy>: 10e2: 48 89 f8 mov %rdi,%rax 10e5: 48 85 d2 test %rdx,%rdx 10e8: 74 12 je 10fc <memcpy+0x1a> 10ea: 31 c9 xor %ecx,%ecx 10ec: 40 8a 3c 0e mov (%rsi,%rcx,1),%dil 10f0: 40 88 3c 08 mov %dil,(%rax,%rcx,1) 10f4: 48 ff c1 inc %rcx 10f7: 48 39 ca cmp %rcx,%rdx 10fa: 75 f0 jne 10ec <memcpy+0xa> 10fc: c3 ret ``` After this patch: ``` // memmove is an alias for memcpy 000000000040133b <memcpy>: 40133b: 48 89 d1 mov %rdx,%rcx 40133e: 48 89 f8 mov %rdi,%rax 401341: 48 89 fa mov %rdi,%rdx 401344: 48 29 f2 sub %rsi,%rdx 401347: 48 39 ca cmp %rcx,%rdx 40134a: 72 03 jb 40134f <memcpy+0x14> 40134c: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40134e: c3 ret 40134f: 48 8d 7c 0f ff lea -0x1(%rdi,%rcx,1),%rdi 401354: 48 8d 74 0e ff lea -0x1(%rsi,%rcx,1),%rsi 401359: fd std 40135a: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40135c: fc cld 40135d: c3 ret ``` v3: - Make memmove as an alias for memcpy (Willy). - Make the forward copy the likely case (Alviro). v2: - Fix the broken memmove implementation (David). Link: https://lore.kernel.org/lkml/20230902062237.GA23141@1wt.eu Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com Suggested-by: David Laight <David.Laight@aculab.com> Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org> Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
2023-09-02 20:35:02 +07:00
"leaq -1(%rdi, %rcx, 1), %rdi\n\t"
"leaq -1(%rsi, %rcx, 1), %rsi\n\t"
"std\n\t"
"rep movsb\n\t"
"cld\n\t"
"retq\n"
".section .text.nolibc_memset\n"
".weak memset\n"
"memset:\n"
"xchgl %eax, %esi\n\t"
"movq %rdx, %rcx\n\t"
"pushq %rdi\n\t"
"rep stosb\n\t"
"popq %rax\n\t"
"retq\n"
tools/nolibc: x86-64: Use `rep movsb` for `memcpy()` and `memmove()` Simplify memcpy() and memmove() on the x86-64 arch. The x86-64 arch has a 'rep movsb' instruction, which can perform memcpy() using only a single instruction, given: %rdi = destination %rsi = source %rcx = length Additionally, it can also handle the overlapping case by setting DF=1 (backward copy), which can be used as the memmove() implementation. Before this patch: ``` 00000000000010ab <memmove>: 10ab: 48 89 f8 mov %rdi,%rax 10ae: 31 c9 xor %ecx,%ecx 10b0: 48 39 f7 cmp %rsi,%rdi 10b3: 48 83 d1 ff adc $0xffffffffffffffff,%rcx 10b7: 48 85 d2 test %rdx,%rdx 10ba: 74 25 je 10e1 <memmove+0x36> 10bc: 48 83 c9 01 or $0x1,%rcx 10c0: 48 39 f0 cmp %rsi,%rax 10c3: 48 c7 c7 ff ff ff ff mov $0xffffffffffffffff,%rdi 10ca: 48 0f 43 fa cmovae %rdx,%rdi 10ce: 48 01 cf add %rcx,%rdi 10d1: 44 8a 04 3e mov (%rsi,%rdi,1),%r8b 10d5: 44 88 04 38 mov %r8b,(%rax,%rdi,1) 10d9: 48 01 cf add %rcx,%rdi 10dc: 48 ff ca dec %rdx 10df: 75 f0 jne 10d1 <memmove+0x26> 10e1: c3 ret 00000000000010e2 <memcpy>: 10e2: 48 89 f8 mov %rdi,%rax 10e5: 48 85 d2 test %rdx,%rdx 10e8: 74 12 je 10fc <memcpy+0x1a> 10ea: 31 c9 xor %ecx,%ecx 10ec: 40 8a 3c 0e mov (%rsi,%rcx,1),%dil 10f0: 40 88 3c 08 mov %dil,(%rax,%rcx,1) 10f4: 48 ff c1 inc %rcx 10f7: 48 39 ca cmp %rcx,%rdx 10fa: 75 f0 jne 10ec <memcpy+0xa> 10fc: c3 ret ``` After this patch: ``` // memmove is an alias for memcpy 000000000040133b <memcpy>: 40133b: 48 89 d1 mov %rdx,%rcx 40133e: 48 89 f8 mov %rdi,%rax 401341: 48 89 fa mov %rdi,%rdx 401344: 48 29 f2 sub %rsi,%rdx 401347: 48 39 ca cmp %rcx,%rdx 40134a: 72 03 jb 40134f <memcpy+0x14> 40134c: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40134e: c3 ret 40134f: 48 8d 7c 0f ff lea -0x1(%rdi,%rcx,1),%rdi 401354: 48 8d 74 0e ff lea -0x1(%rsi,%rcx,1),%rsi 401359: fd std 40135a: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 40135c: fc cld 40135d: c3 ret ``` v3: - Make memmove as an alias for memcpy (Willy). - Make the forward copy the likely case (Alviro). v2: - Fix the broken memmove implementation (David). Link: https://lore.kernel.org/lkml/20230902062237.GA23141@1wt.eu Link: https://lore.kernel.org/lkml/5a821292d96a4dbc84c96ccdc6b5b666@AcuMS.aculab.com Suggested-by: David Laight <David.Laight@aculab.com> Signed-off-by: Ammar Faizi <ammarfaizi2@gnuweeb.org> Signed-off-by: Willy Tarreau <w@1wt.eu> Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
2023-09-02 20:35:02 +07:00
);
#endif /* !defined(__x86_64__) */
#endif /* _NOLIBC_ARCH_X86_H */