patch-2.4.22 linux-2.4.22/arch/sh64/lib/page_copy.S

Next file: linux-2.4.22/arch/sh64/lib/panic.c
Previous file: linux-2.4.22/arch/sh64/lib/page_clear.S
Back to the patch index
Back to the overall index

diff -urN linux-2.4.21/arch/sh64/lib/page_copy.S linux-2.4.22/arch/sh64/lib/page_copy.S
@@ -0,0 +1,77 @@
+/* Written by Richard P. Curnow, SuperH (UK) Ltd.
+
+   Tight version of mempy for the case of just copying a page.
+   Prefetch strategy empirically optimised against RTL simulations
+   of SH5-101 cut2 eval chip with Cayman board DDR memory.
+
+   Parameters:
+   r2 : source effective address (start of page)
+   r3 : destination effective address (start of page)
+
+   Always copies 4096 bytes.
+  
+   Points to review.
+   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
+     It seems like the prefetch needs to be at at least 4 lines ahead to get
+     the data into the cache in time, and the allocos contend with outstanding
+     prefetches for the same cache set, so it's better to have the numbers
+     different.
+   */
+
+	.section .text..SHmedia32,"ax"
+	.little
+
+	.balign 8
+	.global sh64_page_copy
+sh64_page_copy:
+
+	/* Copy 4096 bytes worth of data from r2 to r3.
+	   Do prefetches 4 lines ahead.
+	   Do alloco 2 lines ahead */
+
+	pta 1f, tr1
+	pta 2f, tr2
+	pta 3f, tr3
+	ptabs r18, tr0
+
+	ld.q r2, 0x00, r63
+	ld.q r2, 0x20, r63
+	ld.q r2, 0x40, r63
+	ld.q r2, 0x60, r63
+	alloco r3, 0x00
+	alloco r3, 0x20
+
+	movi 3968, r6
+	add  r3, r6, r6
+	addi r6, 64, r7
+	addi r7, 64, r8
+	sub r2, r3, r60
+	addi r60, 8, r61
+	addi r61, 8, r62
+	addi r62, 8, r23
+	addi r60, 0x80, r22
+
+/* Minimal code size.  The extra branches inside the loop don't cost much
+   because they overlap with the time spent waiting for prefetches to
+   complete. */
+1:
+	bge/u r3, r6, tr2  ! skip prefetch for last 4 lines
+	ldx.q r3, r22, r63 ! prefetch 4 lines hence
+2:
+	bge/u r3, r7, tr3  ! skip alloco for last 2 lines
+	alloco r3, 0x40    ! alloc destination line 2 lines ahead
+3:
+	ldx.q r3, r60, r36
+	ldx.q r3, r61, r37
+	ldx.q r3, r62, r38
+	ldx.q r3, r23, r39
+	st.q  r3,   0, r36
+	st.q  r3,   8, r37
+	st.q  r3,  16, r38
+	st.q  r3,  24, r39
+	addi r3, 32, r3
+	bgt/l r8, r3, tr1
+
+	blink tr0, r63	   ! return
+
+

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)