/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
/*
 * x86 specific definitions for NOLIBC (both 32- and 64-bit)
 * Copyright (C) 2017-2025 Willy Tarreau <w@1wt.eu>
 */

#ifndef _NOLIBC_ARCH_X86_H
#define _NOLIBC_ARCH_X86_H

#include "compiler.h"
#include "crt.h"

#if !defined(__x86_64__)

/* Syscalls for i386 :
 * - mostly similar to x86_64
 * - registers are 32-bit
 * - syscall number is passed in eax
 * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively
 * - all registers are preserved (except eax of course)
 * - the system call is performed by calling int $0x80
 * - syscall return comes in eax
 * - the arguments are cast to long and assigned into the target registers
 *   which are then simply passed as registers to the asm code, so that we
 *   don't run into issues with register constraints.
 * - the syscall number is always specified last in order to allow forcing
 *   some registers before it (gcc refuses a %-register at the last position).
 *
 * Also, i386 supports the old_select syscall if newselect is not available
 */
#define __ARCH_WANT_SYS_OLD_SELECT
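
/*
 * Illustrative sketch (not part of the original header): the my_syscallN()
 * macros below are meant to be consumed by thin wrappers defined elsewhere,
 * once the __NR_* numbers from <asm/unistd.h> are in scope, roughly like:
 *
 *	static pid_t sys_getpid(void)
 *	{
 *		return my_syscall0(__NR_getpid);
 *	}
 *
 * On i386 this expands to loading __NR_getpid into %eax, executing
 * "int $0x80", and returning %eax (a negative errno value on failure).
 */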

#define my_syscall0(num) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall1(num, arg1) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	register long _arg1 __asm__ ("ebx") = (long)(arg1); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "r"(_arg1), \
		  "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall2(num, arg1, arg2) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	register long _arg1 __asm__ ("ebx") = (long)(arg1); \
	register long _arg2 __asm__ ("ecx") = (long)(arg2); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "r"(_arg1), "r"(_arg2), \
		  "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall3(num, arg1, arg2, arg3) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	register long _arg1 __asm__ ("ebx") = (long)(arg1); \
	register long _arg2 __asm__ ("ecx") = (long)(arg2); \
	register long _arg3 __asm__ ("edx") = (long)(arg3); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), \
		  "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall4(num, arg1, arg2, arg3, arg4) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	register long _arg1 __asm__ ("ebx") = (long)(arg1); \
	register long _arg2 __asm__ ("ecx") = (long)(arg2); \
	register long _arg3 __asm__ ("edx") = (long)(arg3); \
	register long _arg4 __asm__ ("esi") = (long)(arg4); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \
		  "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \
({ \
	long _ret; \
	register long _num __asm__ ("eax") = (num); \
	register long _arg1 __asm__ ("ebx") = (long)(arg1); \
	register long _arg2 __asm__ ("ecx") = (long)(arg2); \
	register long _arg3 __asm__ ("edx") = (long)(arg3); \
	register long _arg4 __asm__ ("esi") = (long)(arg4); \
	register long _arg5 __asm__ ("edi") = (long)(arg5); \
	\
	__asm__ volatile ( \
		"int $0x80\n" \
		: "=a" (_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
		  "0"(_num) \
		: "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
	long _eax = (long)(num); \
	long _arg6 = (long)(arg6); /* Always in memory */ \
	__asm__ volatile ( \
		"pushl %[_arg6]\n\t" \
		"pushl %%ebp\n\t" \
		"movl 4(%%esp),%%ebp\n\t" \
		"int $0x80\n\t" \
		"popl %%ebp\n\t" \
		"addl $4,%%esp\n\t" \
		: "+a"(_eax)        /* %eax */ \
		: "b"(arg1),        /* %ebx */ \
		  "c"(arg2),        /* %ecx */ \
		  "d"(arg3),        /* %edx */ \
		  "S"(arg4),        /* %esi */ \
		  "D"(arg5),        /* %edi */ \
		  [_arg6]"m"(_arg6) /* memory */ \
		: "memory", "cc" \
	); \
	_eax; \
})

/* startup code */
/*
 * i386 System V ABI mandates:
 * 1) last pushed argument must be 16-byte aligned.
 * 2) The deepest stack frame should be set to zero (the %ebp).
 */

void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
	__asm__ volatile (
		"xor %ebp, %ebp\n"  /* zero the stack frame */
		"mov %esp, %eax\n"  /* save stack pointer to %eax, as arg1 of _start_c */
		"sub $12, %esp\n"   /* sub 12 to keep it aligned after the push %eax */
		"push %eax\n"       /* push arg1 on stack to support plain stack modes too */
		"call _start_c\n"   /* transfer to c runtime */
		"hlt\n"             /* ensure it does not return */
	);
	__nolibc_entrypoint_epilogue();
}
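
/*
 * Reader's sketch (assumption, not used by the code above): the pointer saved
 * in %eax is the original %esp, which the kernel points at a block laid out
 * roughly as:
 *
 *	(%eax)  -> argc
 *	+4      -> argv[0] ... argv[argc-1], NULL
 *	...     -> envp[0] ..., NULL
 *	...     -> ELF auxiliary vector
 *
 * so a single register is enough for _start_c() to recover argc, argv and
 * the environment.
 */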
#else /* !defined(__x86_64__) */

/* Syscalls for x86_64 :
 * - registers are 64-bit
 * - syscall number is passed in rax
 * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively
 * - the system call is performed by calling the syscall instruction
 * - syscall return comes in rax
 * - rcx and r11 are clobbered, others are preserved.
 * - the arguments are cast to long and assigned into the target registers
 *   which are then simply passed as registers to the asm code, so that we
 *   don't run into issues with register constraints.
 * - the syscall number is always specified last in order to allow forcing
 *   some registers before it (gcc refuses a %-register at the last position).
 * - see also x86-64 ABI section A.2 AMD64 Linux Kernel Conventions, A.2.1
 *   Calling Conventions.
 *
 * Link x86-64 ABI: https://gitlab.com/x86-psABIs/x86-64-ABI/-/wikis/home
 */
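
/*
 * Illustrative sketch (not part of the original header): a hypothetical
 * three-argument wrapper built on the macros below, assuming __NR_write is
 * in scope where the wrapper is defined:
 *
 *	static ssize_t sys_write(int fd, const void *buf, size_t count)
 *	{
 *		return my_syscall3(__NR_write, fd, buf, count);
 *	}
 *
 * The macro loads the number into %rax, the arguments into %rdi/%rsi/%rdx,
 * executes "syscall" and returns %rax (negative errno on failure); %rcx and
 * %r11 are listed as clobbers because the instruction overwrites them.
 */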

#define my_syscall0(num) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall1(num, arg1) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), \
		  "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall2(num, arg1, arg2) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	register long _arg2 __asm__ ("rsi") = (long)(arg2); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), "r"(_arg2), \
		  "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall3(num, arg1, arg2, arg3) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	register long _arg2 __asm__ ("rsi") = (long)(arg2); \
	register long _arg3 __asm__ ("rdx") = (long)(arg3); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), \
		  "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall4(num, arg1, arg2, arg3, arg4) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	register long _arg2 __asm__ ("rsi") = (long)(arg2); \
	register long _arg3 __asm__ ("rdx") = (long)(arg3); \
	register long _arg4 __asm__ ("r10") = (long)(arg4); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \
		  "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	register long _arg2 __asm__ ("rsi") = (long)(arg2); \
	register long _arg3 __asm__ ("rdx") = (long)(arg3); \
	register long _arg4 __asm__ ("r10") = (long)(arg4); \
	register long _arg5 __asm__ ("r8") = (long)(arg5); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
		  "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \
({ \
	long _ret; \
	register long _num __asm__ ("rax") = (num); \
	register long _arg1 __asm__ ("rdi") = (long)(arg1); \
	register long _arg2 __asm__ ("rsi") = (long)(arg2); \
	register long _arg3 __asm__ ("rdx") = (long)(arg3); \
	register long _arg4 __asm__ ("r10") = (long)(arg4); \
	register long _arg5 __asm__ ("r8") = (long)(arg5); \
	register long _arg6 __asm__ ("r9") = (long)(arg6); \
	\
	__asm__ volatile ( \
		"syscall\n" \
		: "=a"(_ret) \
		: "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
		  "r"(_arg6), "0"(_num) \
		: "rcx", "r11", "memory", "cc" \
	); \
	_ret; \
})

/* startup code */
/*
 * x86-64 System V ABI mandates:
 * 1) %rsp must be 16-byte aligned right before the function call.
 * 2) The deepest stack frame should be zero (the %rbp).
 */

void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void)
{
	__asm__ volatile (
		"xor %ebp, %ebp\n"  /* zero the stack frame */
		"mov %rsp, %rdi\n"  /* save stack pointer to %rdi, as arg1 of _start_c */
		"call _start_c\n"   /* transfer to c runtime */
		"hlt\n"             /* ensure it does not return */
	);
	__nolibc_entrypoint_epilogue();
}
#define NOLIBC_ARCH_HAS_MEMMOVE
void *memmove(void *dst, const void *src, size_t len);
#define NOLIBC_ARCH_HAS_MEMCPY
void *memcpy(void *dst, const void *src, size_t len);
#define NOLIBC_ARCH_HAS_MEMSET
void *memset(void *dst, int c, size_t len);
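
/*
 * Note (assumption about code outside this file): the generic helpers in
 * string.h are expected to guard their portable fallbacks with these
 * NOLIBC_ARCH_HAS_* macros, roughly:
 *
 *	#ifndef NOLIBC_ARCH_HAS_MEMCPY
 *	void *memcpy(void *dst, const void *src, size_t len) { ... }
 *	#endif
 *
 * so the definitions above make the C versions step aside in favour of the
 * asm implementations below.
 */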

__asm__ (
".section .text.nolibc_memmove_memcpy\n"
".weak memmove\n"
".weak memcpy\n"
"memmove:\n"
"memcpy:\n"
	"movq %rdx, %rcx\n\t"
	"movq %rdi, %rax\n\t"
	"movq %rdi, %rdx\n\t"
	"subq %rsi, %rdx\n\t"
	"cmpq %rcx, %rdx\n\t"
	"jb 1f\n\t"
	"rep movsb\n\t"
	"retq\n"
"1:" /* backward copy */
	"leaq -1(%rdi, %rcx, 1), %rdi\n\t"
	"leaq -1(%rsi, %rcx, 1), %rsi\n\t"
	"std\n\t"
	"rep movsb\n\t"
	"cld\n\t"
	"retq\n"

".section .text.nolibc_memset\n"
".weak memset\n"
"memset:\n"
	"xchgl %eax, %esi\n\t"
	"movq %rdx, %rcx\n\t"
	"pushq %rdi\n\t"
	"rep stosb\n\t"
	"popq %rax\n\t"
	"retq\n"
);
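
/*
 * Reader's sketch (not compiled): the asm block above behaves roughly like
 * the following C, using a forward "rep movsb" when the regions cannot
 * overlap destructively, and a backward copy (std / rep movsb / cld)
 * otherwise; memset() is simply "rep stosb" with the fill byte in %al:
 *
 *	void *memmove(void *dst, const void *src, size_t len)
 *	{
 *		char *d = dst;
 *		const char *s = src;
 *
 *		if ((uintptr_t)dst - (uintptr_t)src >= len) {
 *			while (len--)           // forward copy
 *				*d++ = *s++;
 *		} else {
 *			d += len; s += len;     // backward copy
 *			while (len--)
 *				*--d = *--s;
 *		}
 *		return dst;
 *	}
 */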
#endif /* !defined(__x86_64__) */
#endif /* _NOLIBC_ARCH_X86_H */