mirror of
				git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
				synced 2025-09-18 22:14:16 +00:00 
			
		
		
		
	alpha: fix alignment problem in csum_ipv6_magic()
Hopefully this fixes http://bugzilla.kernel.org/show_bug.cgi?id=8635 The struct in6_addr passed to csum_ipv6_magic() is 4 byte aligned, so we can't use the regular 64-bit loads. Since the cost of handling of 4 byte and 1 byte aligned 64-bit data is roughly the same, this code can cope with any src/dst [mis]alignment. Signed-off-by: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Richard Henderson <rth@twiddle.net> Cc: Dustin Marquess <jailbird@alcatraz.fdf.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									653d4876b7
								
							
						
					
					
						commit
						58ed2f9c75
					
				
					 2 changed files with 71 additions and 24 deletions
				
			
		|  | @ -7,6 +7,9 @@ | ||||||
|  *                                __u32 len, |  *                                __u32 len, | ||||||
|  *                                unsigned short proto, |  *                                unsigned short proto, | ||||||
|  *                                unsigned int csum);
 |  *                                unsigned int csum);
 | ||||||
|  |  * | ||||||
|  |  * Misalignment handling (which costs 16 instructions / 8 cycles) | ||||||
|  |  * added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
 | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| 	.globl csum_ipv6_magic
 | 	.globl csum_ipv6_magic
 | ||||||
|  | @ -16,37 +19,57 @@ | ||||||
| csum_ipv6_magic: | csum_ipv6_magic: | ||||||
| 	.prologue 0
 | 	.prologue 0
 | ||||||
| 
 | 
 | ||||||
| 	ldq	$0,0($16)	# e0    : load src & dst addr words | 	ldq_u	$0,0($16)	# e0    : load src & dst addr words | ||||||
| 	zapnot	$20,15,$20	# .. e1 : zero extend incoming csum | 	zapnot	$20,15,$20	# .. e1 : zero extend incoming csum | ||||||
| 	extqh	$18,1,$4	# e0    : byte swap len & proto while we wait | 	extqh	$18,1,$4	# e0    : byte swap len & proto while we wait | ||||||
| 	ldq	$1,8($16)	# .. e1 : | 	ldq_u	$21,7($16)	# .. e1 : handle misalignment | ||||||
| 
 | 
 | ||||||
| 	extbl	$18,1,$5	# e0	: | 	extbl	$18,1,$5	# e0	: | ||||||
| 	ldq	$2,0($17)	# .. e1 : | 	ldq_u	$1,8($16)	# .. e1 : | ||||||
| 	extbl	$18,2,$6	# e0 	: | 	extbl	$18,2,$6	# e0 	: | ||||||
| 	ldq	$3,8($17)	# .. e1 : | 	ldq_u	$22,15($16)	# .. e1 : | ||||||
| 
 | 
 | ||||||
| 	extbl	$18,3,$18	# e0	: | 	extbl	$18,3,$18	# e0	: | ||||||
|  | 	ldq_u	$2,0($17)	# .. e1 : | ||||||
| 	sra	$4,32,$4	# e0	: | 	sra	$4,32,$4	# e0	: | ||||||
|  | 	ldq_u	$23,7($17)	# .. e1 : | ||||||
|  | 
 | ||||||
|  | 	extql	$0,$16,$0	# e0	: | ||||||
|  | 	ldq_u	$3,8($17)	# .. e1 : | ||||||
|  | 	extqh	$21,$16,$21	# e0	: | ||||||
|  | 	ldq_u	$24,15($17)	# .. e1 : | ||||||
|  | 
 | ||||||
| 	sll	$5,16,$5	# e0	: | 	sll	$5,16,$5	# e0	: | ||||||
|  | 	or	$0,$21,$0	# .. e1 : 1st src word complete | ||||||
|  | 	extql	$1,$16,$1	# e0	: | ||||||
| 	addq	$20,$0,$20	# .. e1 : begin summing the words | 	addq	$20,$0,$20	# .. e1 : begin summing the words | ||||||
| 
 | 
 | ||||||
| 	sll	$6,8,$6		# e0	: | 	extqh	$22,$16,$22	# e0	: | ||||||
| 	cmpult	$20,$0,$0	# .. e1 : | 	cmpult	$20,$0,$0	# .. e1 : | ||||||
| 	extwh	$19,7,$7	# e0    : | 	sll	$6,8,$6		# e0	: | ||||||
|  | 	or	$1,$22,$1	# .. e1 : 2nd src word complete | ||||||
|  | 
 | ||||||
|  | 	extql	$2,$17,$2	# e0	: | ||||||
| 	or	$4,$18,$18	# .. e1 : | 	or	$4,$18,$18	# .. e1 : | ||||||
| 
 | 	extqh	$23,$17,$23	# e0	: | ||||||
| 	extbl	$19,1,$19	# e0    : |  | ||||||
| 	or	$5,$6,$5	# .. e1 : | 	or	$5,$6,$5	# .. e1 : | ||||||
| 	or	$18,$5,$18	# e0    : len complete |  | ||||||
| 	or	$19,$7,$19	# .. e1 : |  | ||||||
| 
 | 
 | ||||||
| 	sll	$19,48,$19	# e0    : | 	extql	$3,$17,$3	# e0	: | ||||||
|  | 	or	$2,$23,$2	# .. e1 : 1st dst word complete | ||||||
|  | 	extqh	$24,$17,$24	# e0	: | ||||||
|  | 	or	$18,$5,$18	# .. e1 : len complete | ||||||
|  | 
 | ||||||
|  | 	extwh	$19,7,$7	# e0    : | ||||||
|  | 	or	$3,$24,$3	# .. e1 : 2nd dst word complete | ||||||
|  | 	extbl	$19,1,$19	# e0    : | ||||||
| 	addq	$20,$1,$20	# .. e1 : | 	addq	$20,$1,$20	# .. e1 : | ||||||
| 	sra	$19,32,$19	# e0    : proto complete |  | ||||||
| 	cmpult	$20,$1,$1	# .. e1 : |  | ||||||
| 
 | 
 | ||||||
| 	nop			# e0    : | 	or	$19,$7,$19	# e0    : | ||||||
|  | 	cmpult	$20,$1,$1	# .. e1 : | ||||||
|  | 	sll	$19,48,$19	# e0    : | ||||||
|  | 	nop			# .. e0 : | ||||||
|  | 
 | ||||||
|  | 	sra	$19,32,$19	# e0    : proto complete | ||||||
| 	addq	$20,$2,$20	# .. e1 : | 	addq	$20,$2,$20	# .. e1 : | ||||||
| 	cmpult	$20,$2,$2	# e0    : | 	cmpult	$20,$2,$2	# e0    : | ||||||
| 	addq	$20,$3,$20	# .. e1 : | 	addq	$20,$3,$20	# .. e1 : | ||||||
|  | @ -84,7 +107,7 @@ csum_ipv6_magic: | ||||||
| 	extwl	$0,2,$1		# e0    : fold 17-bit value | 	extwl	$0,2,$1		# e0    : fold 17-bit value | ||||||
| 	zapnot	$0,3,$0		# .. e1 : | 	zapnot	$0,3,$0		# .. e1 : | ||||||
| 	addq	$0,$1,$0	# e0    : | 	addq	$0,$1,$0	# e0    : | ||||||
| 	not	$0,$0		# e1    : and complement. | 	not	$0,$0		# .. e1 : and complement. | ||||||
| 
 | 
 | ||||||
| 	zapnot	$0,3,$0		# e0    : | 	zapnot	$0,3,$0		# e0    : | ||||||
| 	ret			# .. e1 : | 	ret			# .. e1 : | ||||||
|  |  | ||||||
|  | @ -46,6 +46,10 @@ | ||||||
|  * add the 3 low ushorts together, generating a uint |  * add the 3 low ushorts together, generating a uint | ||||||
|  * a final add of the 2 lower ushorts |  * a final add of the 2 lower ushorts | ||||||
|  * truncating the result. |  * truncating the result. | ||||||
|  |  * | ||||||
|  |  * Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
 | ||||||
|  |  * The cost is 16 instructions (~8 cycles), including two extra loads which | ||||||
|  |  * may cause additional delay in rare cases (load-load replay traps). | ||||||
|  */ |  */ | ||||||
| 
 | 
 | ||||||
| 	.globl csum_ipv6_magic
 | 	.globl csum_ipv6_magic
 | ||||||
|  | @ -55,25 +59,45 @@ | ||||||
| csum_ipv6_magic: | csum_ipv6_magic: | ||||||
| 	.prologue 0
 | 	.prologue 0
 | ||||||
| 
 | 
 | ||||||
| 	ldq	$0,0($16)	# L : Latency: 3 | 	ldq_u	$0,0($16)	# L : Latency: 3 | ||||||
| 	inslh	$18,7,$4	# U : 0000000000AABBCC | 	inslh	$18,7,$4	# U : 0000000000AABBCC | ||||||
| 	ldq	$1,8($16)	# L : Latency: 3 | 	ldq_u	$1,8($16)	# L : Latency: 3 | ||||||
| 	sll	$19,8,$7	# U : U L U L : 0x00000000 00aabb00 | 	sll	$19,8,$7	# U : U L U L : 0x00000000 00aabb00 | ||||||
| 
 | 
 | ||||||
|  | 	and	$16,7,$6	# E : src misalignment | ||||||
|  | 	ldq_u	$5,15($16)	# L : Latency: 3 | ||||||
| 	zapnot	$20,15,$20	# U : zero extend incoming csum | 	zapnot	$20,15,$20	# U : zero extend incoming csum | ||||||
| 	ldq	$2,0($17)	# L : Latency: 3 | 	ldq_u	$2,0($17)	# L : U L U L : Latency: 3 | ||||||
| 	sll	$19,24,$19	# U : U L L U : 0x000000aa bb000000 | 
 | ||||||
|  | 	extql	$0,$6,$0	# U : | ||||||
|  | 	extqh	$1,$6,$22	# U : | ||||||
|  | 	ldq_u	$3,8($17)	# L : Latency: 3 | ||||||
|  | 	sll	$19,24,$19	# U : U U L U : 0x000000aa bb000000 | ||||||
|  | 
 | ||||||
|  | 	cmoveq	$6,$31,$22	# E : src aligned? | ||||||
|  | 	ldq_u	$23,15($17)	# L : Latency: 3 | ||||||
| 	inswl	$18,3,$18	# U : 000000CCDD000000 | 	inswl	$18,3,$18	# U : 000000CCDD000000 | ||||||
|  | 	addl	$19,$7,$19	# E : U L U L : <sign bits>bbaabb00 | ||||||
| 
 | 
 | ||||||
| 	ldq	$3,8($17)	# L : Latency: 3 | 	or	$0,$22,$0	# E : 1st src word complete | ||||||
| 	bis	$18,$4,$18	# E : 000000CCDDAABBCC | 	extql	$1,$6,$1	# U : | ||||||
| 	addl	$19,$7,$19	# E : <sign bits>bbaabb00 | 	or	$18,$4,$18	# E : 000000CCDDAABBCC | ||||||
| 	nop			# E : U L U L | 	extqh	$5,$6,$5	# U : L U L U | ||||||
| 
 | 
 | ||||||
|  | 	and	$17,7,$6	# E : dst misalignment | ||||||
|  | 	extql	$2,$6,$2	# U : | ||||||
|  | 	or	$1,$5,$1	# E : 2nd src word complete | ||||||
|  | 	extqh	$3,$6,$22	# U : L U L U : | ||||||
|  | 
 | ||||||
|  | 	cmoveq	$6,$31,$22	# E : dst aligned? | ||||||
|  | 	extql	$3,$6,$3	# U : | ||||||
| 	addq	$20,$0,$20	# E : begin summing the words | 	addq	$20,$0,$20	# E : begin summing the words | ||||||
|  | 	extqh	$23,$6,$23	# U : L U L U : | ||||||
|  | 
 | ||||||
| 	srl	$18,16,$4	# U : 0000000000CCDDAA | 	srl	$18,16,$4	# U : 0000000000CCDDAA | ||||||
|  | 	or	$2,$22,$2	# E : 1st dst word complete | ||||||
| 	zap	$19,0x3,$19	# U : <sign bits>bbaa0000 | 	zap	$19,0x3,$19	# U : <sign bits>bbaa0000 | ||||||
| 	nop			# E : L U U L | 	or	$3,$23,$3	# E : U L U L : 2nd dst word complete | ||||||
| 
 | 
 | ||||||
| 	cmpult	$20,$0,$0	# E : | 	cmpult	$20,$0,$0	# E : | ||||||
| 	addq	$20,$1,$20	# E : | 	addq	$20,$1,$20	# E : | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Ivan Kokshaysky
						Ivan Kokshaysky