123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- /*
- Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
- This file is subject to the terms and conditions of the GNU General Public
- License. See the file "COPYING" in the main directory of this archive
- for more details.
- Tight version of memcpy for the case of just copying a page.
- Prefetch strategy empirically optimised against RTL simulations
- of SH5-101 cut2 eval chip with Cayman board DDR memory.
- Parameters:
- r2 : destination effective address (start of page)
- r3 : source effective address (start of page)
- Always copies 4096 bytes.
- Points to review.
- * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
- It seems like the prefetch needs to be at least 4 lines ahead to get
- the data into the cache in time, and the allocos contend with outstanding
- prefetches for the same cache set, so it's better to have the numbers
- different.
- (Note: in the code below both prefetch sequences are enclosed in "#if 0"
- blocks tagged TAKum03020, so only the alloco look-ahead is assembled.)
- */
- .section .text..SHmedia32,"ax"
- .little
- .balign 8
- .global copy_page
- copy_page:
- /* Copy 4096 bytes worth of data from r3 (source) to r2 (destination),
- one 32-byte line (four quadwords) per loop iteration.
- Do prefetches 4 lines ahead.
- Do alloco 2 lines ahead
- Note that both prefetch sequences below are inside "#if 0" blocks
- tagged TAKum03020, so as written only the alloco look-ahead is
- assembled. A line is 32 bytes here: "2 lines ahead" is offset 0x40
- and "4 lines hence" is offset 0x80.
- Register use in this routine:
- r2 : running destination pointer; it has advanced by 4096 on return
- r3 : source page address, read but never written
- r6 : initial r2 + 3968; compared against only in the disabled prefetch
- test, but also the base from which r7 is derived
- r7 : initial r2 + 4032, limit for the alloco test
- r8 : initial r2 + 4096, end of the destination page
- r60 : r3 - r2, so that r2 + r60 addresses the matching source data
- r61, r62, r23 : r60 + 8, r60 + 16, r60 + 24
- r22 : r60 + 0x80, index used only by the disabled prefetch
- r36-r39 : the four quadwords being copied
- tr0 : return target taken from r18; tr1, tr2, tr3 : labels 1, 2, 3
- Everything listed except r3 is overwritten. */
- pta 1f, tr1 /* tr1 -> label 1, top of the copy loop */
- pta 2f, tr2 /* tr2 -> label 2, branched to only by the disabled prefetch test */
- pta 3f, tr3 /* tr3 -> label 3, the load/store body (skips the alloco) */
- ptabs r18, tr0 /* tr0 = return target from r18, used by the final blink */
- #if 0
- /* TAKum03020
- Disabled. These would load the first four 32-byte source lines
- (offsets 0x00, 0x20, 0x40, 0x60) into r63, the same load-to-r63 form
- that the loop below labels as a prefetch. */
- ld.q r3, 0x00, r63
- ld.q r3, 0x20, r63
- ld.q r3, 0x40, r63
- ld.q r3, 0x60, r63
- #endif
- alloco r2, 0x00 /* the loop allocates from offset 0x40 onward only, so destination lines 0 and 1 are allocated here */
- synco ! TAKum03020
- alloco r2, 0x20
- synco ! TAKum03020
- movi 3968, r6 /* 3968 = 4096 - 4 * 32 */
- add r2, r6, r6 /* r6 = dst + 3968 */
- addi r6, 64, r7 /* r7 = dst + 4032, start of the last 2 lines */
- addi r7, 64, r8 /* r8 = dst + 4096, end of the page */
- sub r3, r2, r60 /* r60 = src - dst; the loads below index r2 with it */
- addi r60, 8, r61 /* r61 = r60 + 8 */
- addi r61, 8, r62 /* r62 = r60 + 16 */
- addi r62, 8, r23 /* r23 = r60 + 24 */
- addi r60, 0x80, r22 /* r22 = r60 + 128, only read by the disabled prefetch */
- /* Minimal code size. The extra branches inside the loop don't cost much
- because they overlap with the time spent waiting for prefetches to
- complete.
- Each pass through label 3 copies one 32-byte line and advances r2 by
- 32, so 128 passes cover the 4096-byte page.
- NOTE(review): the /u and /l branch suffixes look like unlikely/likely
- hints rather than part of the comparison; confirm against the SHmedia
- ISA manual. */
- 1:
- #if 0
- /* TAKum03020 */
- bge/u r2, r6, tr2 ! skip prefetch for last 4 lines
- ldx.q r2, r22, r63 ! prefetch 4 lines hence
- #endif
- 2:
- bge/u r2, r7, tr3 ! skip alloco for last 2 lines
- alloco r2, 0x40 ! alloc destination line 2 lines ahead
- synco ! TAKum03020
- 3:
- ldx.q r2, r60, r36 /* load the four source quadwords of this line: r2 + r60 is the source address matching r2 */
- ldx.q r2, r61, r37
- ldx.q r2, r62, r38
- ldx.q r2, r23, r39
- st.q r2, 0, r36 /* store them at destination offsets 0, 8, 16, 24 */
- st.q r2, 8, r37
- st.q r2, 16, r38
- st.q r2, 24, r39
- addi r2, 32, r2 /* advance the destination pointer by one line */
- bgt/l r8, r2, tr1 /* loop while r8 > r2, i.e. until the end of the page */
- blink tr0, r63 ! return
|