mirror of
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-08-05 16:54:27 +00:00

This switches x86-64 over to using 'tzcount' instead of the integer multiply trick to turn the bytemask information into actual byte counts. We even had a comment saying that a fast bit count instruction is better than a multiply, but x86 bit counting has traditionally been "questionably fast", and so avoiding it was the right thing back in the days. Now, on any half-way modern core, using bit counting is cheaper and smaller than the large constant multiply, so let's just switch over. Note that as part of switching over to counting bits, we also do it at a different point. We used to create the byte count from the final byte mask, but once you use the 'tzcount' instruction (aka 'bsf' on older CPU's), you can actually count the leading zeroes using a value we have available earlier. In fact, we can just use the very first mask of bits that tells us whether we have any zero bytes at all. The zero bytes in the word will have the high bit set, so just doing 'tzcount' on that value and dividing by 8 will give the number of bytes that precede the first NUL character, which is exactly what we want. Note also that the input value to the tzcount is by definition not zero, since that is the condition that we already used to check the whole "do we have any zero bytes at all". So we don't need to worry about the legacy instruction behavior of pre-lzcount days when 'bsf' didn't have a result for zero input. The 32-bit code continues to use the simple bit op trick that is faster even on newer cores, but particularly on the older 32-bit-only ones. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
84 lines
2 KiB
C
84 lines
2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_WORD_AT_A_TIME_H
|
|
#define _ASM_WORD_AT_A_TIME_H
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/wordpart.h>
|
|
|
|
/*
 * Constants for word-at-a-time string scanning: one_bits has 0x01 in
 * every byte of the word, high_bits has 0x80 in every byte.  They are
 * combined in has_zero() below to flag zero bytes in a whole word at
 * once.
 */
struct word_at_a_time {
	const unsigned long one_bits, high_bits;
};

#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
|
/* Return nonzero if it has a zero */
|
|
static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
|
|
{
|
|
unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
|
|
*bits = mask;
|
|
return mask;
|
|
}
|
|
|
|
/*
 * On x86 the flag word produced by has_zero() is already in the form
 * the zero-mask helpers below want, so no extra preparation is needed;
 * 'a' and the constants are unused here.
 */
static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
{
	return bits;
}
|
|
|
|
#ifdef CONFIG_64BIT

/*
 * Keep the initial has_zero() value for both bitmask and size calc:
 * on 64-bit, find_zero() counts bits directly in that raw flag word,
 * so no transformation is needed here.
 */
#define create_zero_mask(bits) (bits)
|
/*
 * Turn the has_zero() flag word into a byte mask: 0xff in every byte
 * lane that precedes the first zero byte, 0x00 elsewhere.
 */
static inline unsigned long zero_bytemask(unsigned long bits)
{
	/*
	 * (bits - 1) & ~bits keeps only the bits strictly below the
	 * lowest set flag bit; shifting down by 7 pulls each lane's
	 * high bit smear into a full 0xff per preceding byte.
	 */
	unsigned long below = (bits - 1) & ~bits;

	return below >> 7;
}
|
|
|
|
#define find_zero(bits) (__ffs(bits) >> 3)
|
|
|
|
#else
|
|
|
|
/* Create the final mask for both bytemask and size */
static inline unsigned long create_zero_mask(unsigned long bits)
{
	/*
	 * Keep only the bits strictly below the lowest zero-byte flag,
	 * then shift by 7 so each preceding byte lane becomes 0xff.
	 */
	unsigned long low = (bits - 1) & ~bits;

	return low >> 7;
}
|
|
|
|
/*
 * The mask we created is directly usable as a bytemask: create_zero_mask()
 * already produced the 0xff-per-preceding-byte form on 32-bit.
 */
#define zero_bytemask(mask) (mask)
|
/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
static inline unsigned long find_zero(unsigned long mask)
{
	/*
	 * Adding 0x0ff0001 to the bytemask and shifting down by 23 maps
	 * the four possible 32-bit mask shapes to a small index:
	 * (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 )
	 */
	long idx = (mask + 0x0ff0001) >> 23;

	/* Fix the 1 for 00 case: a zero mask must yield byte index 0 */
	return idx & mask;
}
|
|
|
|
#endif
|
|
|
|
/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 *
 * The fixup is handled by the exception-table entry: if the load at
 * label 1 faults, the EX_TYPE_ZEROPAD handler resumes at label 2 with
 * the unmapped portion of %[ret] zero-filled (handler semantics are
 * defined elsewhere in the kernel's extable code — not visible here).
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
	unsigned long ret;

	/* "m" constraint: a plain word-sized load that may fault mid-word */
	asm volatile(
		"1:	mov %[mem], %[ret]\n"
		"2:\n"
		_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD)
		: [ret] "=r" (ret)
		: [mem] "m" (*(unsigned long *)addr));

	return ret;
}
|
|
|
|
#endif /* _ASM_WORD_AT_A_TIME_H */
|