linux/arch/powerpc/kernel/misc_32.S
Christophe Leroy 2a17a5bebc powerpc/32: Replace mulhdu() by mul_u64_u64_shr()
Using mul_u64_u64_shr() provides the same calculation as the mulhdu()
assembly function, but allows the compiler to inline it.

The home-made assembly function had special handling for the case
where one of the arguments is not a fully populated u64, but the time
functions use it to multiply the timebase by a calculated scale that
is constructed to have its most significant bit set, so that shortcut
does not help them.

On mpc8xx, sched_clock() runs 3% faster; on mpc83xx, 2% faster.
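
The equivalence boils down to taking the high 64 bits of the 128-bit
product, roughly (a sketch; mul_u64_u64_shr() is the generic helper
from include/linux/math64.h):

	static inline u64 mulhdu(u64 x, u64 y)
	{
		return mul_u64_u64_shr(x, y, 64);
	}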

As you can see below, sched_clock() is not much bigger than before:

	c000cf68 <sched_clock>:
	c000cf68:	7d 2d 42 a6 	mftbu   r9
	c000cf6c:	7d 0c 42 a6 	mftb    r8
	c000cf70:	7d 4d 42 a6 	mftbu   r10
	c000cf74:	7c 09 50 40 	cmplw   r9,r10
	c000cf78:	40 82 ff f0 	bne     c000cf68 <sched_clock>
	c000cf7c:	3d 40 c1 37 	lis     r10,-16073
	c000cf80:	38 8a b3 30 	addi    r4,r10,-19664
	c000cf84:	80 ea b3 30 	lwz     r7,-19664(r10)
	c000cf88:	80 64 00 14 	lwz     r3,20(r4)
	c000cf8c:	39 40 00 00 	li      r10,0
	c000cf90:	80 a4 00 04 	lwz     r5,4(r4)
	c000cf94:	80 c4 00 10 	lwz     r6,16(r4)
	c000cf98:	7c 63 40 10 	subfc   r3,r3,r8
	c000cf9c:	80 84 00 08 	lwz     r4,8(r4)
	c000cfa0:	7d 06 49 10 	subfe   r8,r6,r9
	c000cfa4:	7c c7 19 d6 	mullw   r6,r7,r3
	c000cfa8:	7d 25 18 16 	mulhwu  r9,r5,r3
	c000cfac:	7c 08 29 d6 	mullw   r0,r8,r5
	c000cfb0:	7c 67 18 16 	mulhwu  r3,r7,r3
	c000cfb4:	7d 29 30 14 	addc    r9,r9,r6
	c000cfb8:	7c a8 28 16 	mulhwu  r5,r8,r5
	c000cfbc:	7c ca 51 14 	adde    r6,r10,r10
	c000cfc0:	7d 67 41 d6 	mullw   r11,r7,r8
	c000cfc4:	7d 29 00 14 	addc    r9,r9,r0
	c000cfc8:	7c c6 01 94 	addze   r6,r6
	c000cfcc:	7c 63 28 14 	addc    r3,r3,r5
	c000cfd0:	7d 4a 51 14 	adde    r10,r10,r10
	c000cfd4:	7c e7 40 16 	mulhwu  r7,r7,r8
	c000cfd8:	7c 63 58 14 	addc    r3,r3,r11
	c000cfdc:	7d 4a 01 94 	addze   r10,r10
	c000cfe0:	7c 63 30 14 	addc    r3,r3,r6
	c000cfe4:	7d 4a 39 14 	adde    r10,r10,r7
	c000cfe8:	35 24 ff e0 	addic.  r9,r4,-32
	c000cfec:	41 80 00 10 	blt     c000cffc <sched_clock+0x94>
	c000cff0:	7c 63 48 30 	slw     r3,r3,r9
	c000cff4:	38 80 00 00 	li      r4,0
	c000cff8:	4e 80 00 20 	blr
	c000cffc:	21 04 00 1f 	subfic  r8,r4,31
	c000d000:	54 69 f8 7e 	srwi    r9,r3,1
	c000d004:	7d 4a 20 30 	slw     r10,r10,r4
	c000d008:	7d 29 44 30 	srw     r9,r9,r8
	c000d00c:	7c 64 20 30 	slw     r4,r3,r4
	c000d010:	7d 23 53 78 	or      r3,r9,r10
	c000d014:	4e 80 00 20 	blr

Before this change:

	c000d0bc <sched_clock>:
	c000d0bc:	94 21 ff f0 	stwu    r1,-16(r1)
	c000d0c0:	7c 08 02 a6 	mflr    r0
	c000d0c4:	90 01 00 14 	stw     r0,20(r1)
	c000d0c8:	93 e1 00 0c 	stw     r31,12(r1)
	c000d0cc:	7d 2d 42 a6 	mftbu   r9
	c000d0d0:	7d 0c 42 a6 	mftb    r8
	c000d0d4:	7d 4d 42 a6 	mftbu   r10
	c000d0d8:	7c 09 50 40 	cmplw   r9,r10
	c000d0dc:	40 82 ff f0 	bne     c000d0cc <sched_clock+0x10>
	c000d0e0:	3f e0 c1 37 	lis     r31,-16073
	c000d0e4:	3b ff b3 30 	addi    r31,r31,-19664
	c000d0e8:	80 9f 00 14 	lwz     r4,20(r31)
	c000d0ec:	80 7f 00 10 	lwz     r3,16(r31)
	c000d0f0:	7c 84 40 10 	subfc   r4,r4,r8
	c000d0f4:	80 bf 00 00 	lwz     r5,0(r31)
	c000d0f8:	80 df 00 04 	lwz     r6,4(r31)
	c000d0fc:	7c 63 49 10 	subfe   r3,r3,r9
	c000d100:	48 00 37 85 	bl      c0010884 <mulhdu>
	c000d104:	81 3f 00 08 	lwz     r9,8(r31)
	c000d108:	35 49 ff e0 	addic.  r10,r9,-32
	c000d10c:	41 80 00 20 	blt     c000d12c <sched_clock+0x70>
	c000d110:	80 01 00 14 	lwz     r0,20(r1)
	c000d114:	7c 83 50 30 	slw     r3,r4,r10
	c000d118:	83 e1 00 0c 	lwz     r31,12(r1)
	c000d11c:	38 80 00 00 	li      r4,0
	c000d120:	7c 08 03 a6 	mtlr    r0
	c000d124:	38 21 00 10 	addi    r1,r1,16
	c000d128:	4e 80 00 20 	blr
	c000d12c:	80 01 00 14 	lwz     r0,20(r1)
	c000d130:	54 8a f8 7e 	srwi    r10,r4,1
	c000d134:	21 09 00 1f 	subfic  r8,r9,31
	c000d138:	83 e1 00 0c 	lwz     r31,12(r1)
	c000d13c:	7c 63 48 30 	slw     r3,r3,r9
	c000d140:	7d 4a 44 30 	srw     r10,r10,r8
	c000d144:	7c 84 48 30 	slw     r4,r4,r9
	c000d148:	7d 43 1b 78 	or      r3,r10,r3
	c000d14c:	7c 08 03 a6 	mtlr    r0
	c000d150:	38 21 00 10 	addi    r1,r1,16
	c000d154:	4e 80 00 20 	blr

	c0010884 <mulhdu>:
	c0010884:	2c 06 00 00 	cmpwi   r6,0
	c0010888:	2c 83 00 00 	cmpwi   cr1,r3,0
	c001088c:	7c 8a 23 78 	mr      r10,r4
	c0010890:	7c 84 28 16 	mulhwu  r4,r4,r5
	c0010894:	41 82 00 14 	beq     c00108a8 <mulhdu+0x24>
	c0010898:	7c 0a 30 16 	mulhwu  r0,r10,r6
	c001089c:	7c ea 29 d6 	mullw   r7,r10,r5
	c00108a0:	7c e0 38 14 	addc    r7,r0,r7
	c00108a4:	7c 84 01 94 	addze   r4,r4
	c00108a8:	4d 86 00 20 	beqlr   cr1
	c00108ac:	7d 23 29 d6 	mullw   r9,r3,r5
	c00108b0:	7d 43 28 16 	mulhwu  r10,r3,r5
	c00108b4:	41 82 00 18 	beq     c00108cc <mulhdu+0x48>
	c00108b8:	7c 03 31 d6 	mullw   r0,r3,r6
	c00108bc:	7d 03 30 16 	mulhwu  r8,r3,r6
	c00108c0:	7c e0 38 14 	addc    r7,r0,r7
	c00108c4:	7c 84 41 14 	adde    r4,r4,r8
	c00108c8:	7d 4a 01 94 	addze   r10,r10
	c00108cc:	7c 84 48 14 	addc    r4,r4,r9
	c00108d0:	7c 6a 01 94 	addze   r3,r10
	c00108d4:	4e 80 00 20 	blr

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/f29e473c193c87bdbd36b209dfdee99d2f0c60dc.1733566130.git.christophe.leroy@csgroup.eu
2024-12-10 08:15:30 +05:30

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains miscellaneous low-level functions.
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
 * and Paul Mackerras.
 *
 */
#include <linux/export.h>
#include <linux/sys.h>
#include <asm/unistd.h>
#include <asm/errno.h>
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/cache.h>
#include <asm/cputable.h>
#include <asm/mmu.h>
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
#include <asm/processor.h>
#include <asm/bug.h>
#include <asm/ptrace.h>
#include <asm/feature-fixups.h>
	.text

/*
 * reloc_got2 runs through the .got2 section adding an offset
 * to each entry.
 */
_GLOBAL(reloc_got2)
	mflr	r11
	lis	r7,__got2_start@ha
	addi	r7,r7,__got2_start@l
	lis	r8,__got2_end@ha
	addi	r8,r8,__got2_end@l
	subf	r8,r7,r8
	srwi.	r8,r8,2
	beqlr
	mtctr	r8
	bcl	20,31,$+4
1:	mflr	r0
	lis	r4,1b@ha
	addi	r4,r4,1b@l
	subf	r0,r4,r0
	add	r7,r0,r7
2:	lwz	r0,0(r7)
	add	r0,r0,r3
	stw	r0,0(r7)
	addi	r7,r7,4
	bdnz	2b
	mtlr	r11
	blr
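
/*
 * For reference, a C sketch of the loop above (the helper name is
 * illustrative, not kernel code; the assembly additionally computes
 * its own run-time address via bcl 20,31,$+4 / mflr so that it works
 * before the relocation it is applying has taken effect):
 *
 *	void reloc_got2_sketch(unsigned long offset)
 *	{
 *		extern unsigned int __got2_start[], __got2_end[];
 *		unsigned int *p;
 *
 *		for (p = __got2_start; p < __got2_end; p++)
 *			*p += offset;
 *	}
 */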

/*
 * call_setup_cpu - call the setup_cpu function for this cpu
 * r3 = data offset, r24 = cpu number
 *
 * Setup function is called with:
 *   r3 = data offset
 *   r4 = ptr to CPU spec (relocated)
 */
_GLOBAL(call_setup_cpu)
	addis	r4,r3,cur_cpu_spec@ha
	addi	r4,r4,cur_cpu_spec@l
	lwz	r4,0(r4)
	add	r4,r4,r3
	lwz	r5,CPU_SPEC_SETUP(r4)
	cmpwi	0,r5,0
	add	r5,r5,r3
	beqlr
	mtctr	r5
	bctr
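
/*
 * Roughly, in C (a sketch; cur_cpu_spec and CPU_SPEC_SETUP are real,
 * the wrapper and typedef below are illustrative):
 *
 *	typedef void (*cpu_setup_t)(long offset, struct cpu_spec *spec);
 *
 *	void call_setup_cpu_sketch(long offset)
 *	{
 *		struct cpu_spec *spec;
 *
 *		spec = (void *)((long)cur_cpu_spec + offset);
 *		if (!spec->cpu_setup)
 *			return;
 *		((cpu_setup_t)((long)spec->cpu_setup + offset))(offset, spec);
 *	}
 *
 * Note the setup function pointer itself is relocated by the data
 * offset before the bctr.
 */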

#if defined(CONFIG_CPU_FREQ_PMAC) && defined(CONFIG_PPC_BOOK3S_32)

/* This gets called by via-pmu.c to switch the PLL selection
 * on 750fx CPU. This function should really be moved to some
 * other place (as should most of the cpufreq code in via-pmu).
 */
_GLOBAL(low_choose_750fx_pll)
	/* Clear MSR:EE */
	mfmsr	r7
	rlwinm	r0,r7,0,17,15
	mtmsr	r0

	/* If switching to PLL1, disable HID0:BTIC */
	cmplwi	cr0,r3,0
	beq	1f
	mfspr	r5,SPRN_HID0
	rlwinm	r5,r5,0,27,25
	sync
	mtspr	SPRN_HID0,r5
	isync
	sync

1:
	/* Calc new HID1 value */
	mfspr	r4,SPRN_HID1
	rlwinm	r5,r3,16,15,15	/* Build a HID1:PS bit from parameter */
	rlwinm	r4,r4,0,16,14	/* Clear out HID1:PS from value read */
	or	r4,r4,r5	/* Could I have used rlwimi here? */
	mtspr	SPRN_HID1,r4

#ifdef CONFIG_SMP
	/* Store new HID1 image */
	lwz	r6,TASK_CPU(r2)
	slwi	r6,r6,2
#else
	li	r6, 0
#endif
	addis	r6,r6,nap_save_hid1@ha
	stw	r4,nap_save_hid1@l(r6)

	/* If switching to PLL0, enable HID0:BTIC */
	cmplwi	cr0,r3,0
	bne	1f
	mfspr	r5,SPRN_HID0
	ori	r5,r5,HID0_BTIC
	sync
	mtspr	SPRN_HID0,r5
	isync
	sync

1:
	/* Return */
	mtmsr	r7
	blr
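
/*
 * In C terms the HID1 update above is, as a sketch (HID1:PS taken to
 * be 0x00010000 per the rlwinm masks; pll is the r3 parameter):
 *
 *	hid1 = (mfspr(SPRN_HID1) & ~0x00010000) | (pll ? 0x00010000 : 0);
 *	mtspr(SPRN_HID1, hid1);
 */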

_GLOBAL(low_choose_7447a_dfs)
	/* Clear MSR:EE */
	mfmsr	r7
	rlwinm	r0,r7,0,17,15
	mtmsr	r0

	/* Calc new HID1 value */
	mfspr	r4,SPRN_HID1
	insrwi	r4,r3,1,9	/* insert parameter into bit 9 */
	sync
	mtspr	SPRN_HID1,r4
	sync
	isync

	/* Return */
	mtmsr	r7
	blr
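
/*
 * A sketch of the insrwi above: it places the low bit of r3 at HID1
 * bit 9 (IBM numbering, i.e. mask 0x00400000):
 *
 *	hid1 = (hid1 & ~0x00400000) | ((dfs & 1) << 22);
 */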

#endif /* CONFIG_CPU_FREQ_PMAC && CONFIG_PPC_BOOK3S_32 */

/*
 * Copy a whole page. We use the dcbz instruction on the destination
 * to reduce memory traffic (it eliminates the unnecessary reads of
 * the destination into cache). This requires that the destination
 * is cacheable.
 */
#define COPY_16_BYTES		\
	lwz	r6,4(r4);	\
	lwz	r7,8(r4);	\
	lwz	r8,12(r4);	\
	lwzu	r9,16(r4);	\
	stw	r6,4(r3);	\
	stw	r7,8(r3);	\
	stw	r8,12(r3);	\
	stwu	r9,16(r3)

_GLOBAL(copy_page)
	rlwinm	r5, r3, 0, L1_CACHE_BYTES - 1
	addi	r3,r3,-4
0:	twnei	r5, 0	/* WARN if r3 is not cache aligned */
	EMIT_WARN_ENTRY	0b,__FILE__,__LINE__, BUGFLAG_WARNING
	addi	r4,r4,-4
	li	r5,4

#if MAX_COPY_PREFETCH > 1
	li	r0,MAX_COPY_PREFETCH
	li	r11,4
	mtctr	r0
11:	dcbt	r11,r4
	addi	r11,r11,L1_CACHE_BYTES
	bdnz	11b
#else /* MAX_COPY_PREFETCH == 1 */
	dcbt	r5,r4
	li	r11,L1_CACHE_BYTES+4
#endif /* MAX_COPY_PREFETCH */
	li	r0,PAGE_SIZE/L1_CACHE_BYTES - MAX_COPY_PREFETCH
	crclr	4*cr0+eq
2:
	mtctr	r0
1:
	dcbt	r11,r4
	dcbz	r5,r3
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	1b
	beqlr
	crnot	4*cr0+eq,4*cr0+eq
	li	r0,MAX_COPY_PREFETCH
	li	r11,4
	b	2b
EXPORT_SYMBOL(copy_page)
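
/*
 * Loop structure in C terms (a sketch; dcbt and dcbz have no portable
 * C equivalent and are shown as pseudo-calls):
 *
 *	for (line = 0; line < PAGE_SIZE / L1_CACHE_BYTES; line++) {
 *		dcbt(src, line + MAX_COPY_PREFETCH);	// prefetch ahead
 *		dcbz(dst, line);	// allocate line without reading it
 *		copy L1_CACHE_BYTES bytes, 16 per COPY_16_BYTES;
 *	}
 *
 * The loop is split into two passes so the last MAX_COPY_PREFETCH
 * lines are copied after all useful prefetches have been issued; the
 * cr0.eq flag distinguishes the passes for the final beqlr.
 */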

/*
 * Extended precision shifts.
 *
 * Updated to be valid for shift counts from 0 to 63 inclusive.
 * -- Gabriel
 *
 * R3/R4 has 64 bit value
 * R5 has shift count
 * result in R3/R4
 *
 * ashrdi3: arithmetic right shift (sign propagation)
 * lshrdi3: logical right shift
 * ashldi3: left shift
 */
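/*
 * Semantics in C, taking the logical right shift as the example
 * (a sketch; count is assumed to be in 0..63):
 *
 *	u64 lshrdi3_sketch(u64 x, unsigned int count)
 *	{
 *		return x >> count;
 *	}
 *
 * The asm builds this from 32-bit halves without branches: srw/slw
 * yield zero for shift amounts of 32..63, so for count >= 32 the
 * direct term vanishes and the (count - 32) cross term supplies the
 * bits, while for count < 32 it is the cross term that is zeroed.
 */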
_GLOBAL(__ashrdi3)
	subfic	r6,r5,32
	srw	r4,r4,r5	# LSW = count > 31 ? 0 : LSW >> count
	addi	r7,r5,32	# could be xori, or addi with -32
	slw	r6,r3,r6	# t1 = count > 31 ? 0 : MSW << (32-count)
	rlwinm	r8,r7,0,32	# t3 = (count < 32) ? 32 : 0
	sraw	r7,r3,r7	# t2 = MSW >> (count-32)
	or	r4,r4,r6	# LSW |= t1
	slw	r7,r7,r8	# t2 = (count < 32) ? 0 : t2
	sraw	r3,r3,r5	# MSW = MSW >> count
	or	r4,r4,r7	# LSW |= t2
	blr
EXPORT_SYMBOL(__ashrdi3)

_GLOBAL(__ashldi3)
	subfic	r6,r5,32
	slw	r3,r3,r5	# MSW = count > 31 ? 0 : MSW << count
	addi	r7,r5,32	# could be xori, or addi with -32
	srw	r6,r4,r6	# t1 = count > 31 ? 0 : LSW >> (32-count)
	slw	r7,r4,r7	# t2 = count < 32 ? 0 : LSW << (count-32)
	or	r3,r3,r6	# MSW |= t1
	slw	r4,r4,r5	# LSW = LSW << count
	or	r3,r3,r7	# MSW |= t2
	blr
EXPORT_SYMBOL(__ashldi3)

_GLOBAL(__lshrdi3)
	subfic	r6,r5,32
	srw	r4,r4,r5	# LSW = count > 31 ? 0 : LSW >> count
	addi	r7,r5,32	# could be xori, or addi with -32
	slw	r6,r3,r6	# t1 = count > 31 ? 0 : MSW << (32-count)
	srw	r7,r3,r7	# t2 = count < 32 ? 0 : MSW >> (count-32)
	or	r4,r4,r6	# LSW |= t1
	srw	r3,r3,r5	# MSW = MSW >> count
	or	r4,r4,r7	# LSW |= t2
	blr
EXPORT_SYMBOL(__lshrdi3)

/*
 * 64-bit comparison: __cmpdi2(s64 a, s64 b)
 * Returns 0 if a < b, 1 if a == b, 2 if a > b.
 */
_GLOBAL(__cmpdi2)
	cmpw	r3,r5
	li	r3,1
	bne	1f
	cmplw	r4,r6
	beqlr
1:	li	r3,0
	bltlr
	li	r3,2
	blr
EXPORT_SYMBOL(__cmpdi2)

/*
 * 64-bit comparison: __ucmpdi2(u64 a, u64 b)
 * Returns 0 if a < b, 1 if a == b, 2 if a > b.
 */
_GLOBAL(__ucmpdi2)
	cmplw	r3,r5
	li	r3,1
	bne	1f
	cmplw	r4,r6
	beqlr
1:	li	r3,0
	bltlr
	li	r3,2
	blr
EXPORT_SYMBOL(__ucmpdi2)
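
/*
 * C equivalent (a sketch; __cmpdi2 is identical with s64 operands):
 *
 *	int ucmpdi2_sketch(u64 a, u64 b)
 *	{
 *		return (a < b) ? 0 : (a == b) ? 1 : 2;
 *	}
 */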

_GLOBAL(__bswapdi2)
	rotlwi	r9,r4,8
	rotlwi	r10,r3,8
	rlwimi	r9,r4,24,0,7
	rlwimi	r10,r3,24,0,7
	rlwimi	r9,r4,24,16,23
	rlwimi	r10,r3,24,16,23
	mr	r3,r9
	mr	r4,r10
	blr
EXPORT_SYMBOL(__bswapdi2)
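
/*
 * Each 32-bit half is byte-reversed by the classic rotlwi/rlwimi
 * triple and the halves are exchanged via r9/r10.  C sketch:
 *
 *	u64 bswapdi2_sketch(u64 x)
 *	{
 *		return ((u64)__builtin_bswap32((u32)x) << 32) |
 *		       __builtin_bswap32((u32)(x >> 32));
 *	}
 */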

#ifdef CONFIG_SMP
_GLOBAL(start_secondary_resume)
	/* Reset stack */
	rlwinm	r1, r1, 0, 0, 31 - THREAD_SHIFT
	addi	r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE
	li	r3,0
	stw	r3,0(r1)	/* Zero the stack frame pointer */
	bl	start_secondary
	b	.
#endif /* CONFIG_SMP */
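
/*
 * Sketch of the stack reset in start_secondary_resume: r1 is masked
 * down to the base of its THREAD_SIZE-aligned stack, repointed at the
 * top minus one minimal frame, the back-chain word is zeroed, and
 * start_secondary() is called, never to return.
 */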