123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- /*
- * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
- #include <linux/linkage.h>
- #include <asm/cache.h>
- /*
- * The memset implementation below is optimized to use prefetchw and prealloc
- * instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
- * If you want to implement optimized memset for other possible L1 data cache
- * line lengths (32B and 128B) you should rewrite code carefully checking
- * we don't call any prefetchw/prealloc instruction for L1 cache lines which
- * don't belongs to memset area.
- */
- #if L1_CACHE_SHIFT == 6
- .macro PREALLOC_INSTR reg, off
- prealloc [\reg, \off]
- .endm
- .macro PREFETCHW_INSTR reg, off
- prefetchw [\reg, \off]
- .endm
- #else
- .macro PREALLOC_INSTR
- .endm
- .macro PREFETCHW_INSTR
- .endm
- #endif
- ENTRY_CFI(memset)
- PREFETCHW_INSTR r0, 0 ; Prefetch the first write location
- mov.f 0, r2
- ;;; if size is zero
- jz.d [blink]
- mov r3, r0 ; don't clobber ret val
- ;;; if length < 8
- brls.d.nt r2, 8, .Lsmallchunk
- mov.f lp_count,r2
- and.f r4, r0, 0x03
- rsub lp_count, r4, 4
- lpnz @.Laligndestination
- ;; LOOP BEGIN
- stb.ab r1, [r3,1]
- sub r2, r2, 1
- .Laligndestination:
- ;;; Destination is aligned
- and r1, r1, 0xFF
- asl r4, r1, 8
- or r4, r4, r1
- asl r5, r4, 16
- or r5, r5, r4
- mov r4, r5
- sub3 lp_count, r2, 8
- cmp r2, 64
- bmsk.hi r2, r2, 5
- mov.ls lp_count, 0
- add3.hi r2, r2, 8
- ;;; Convert len to Dwords, unfold x8
- lsr.f lp_count, lp_count, 6
- lpnz @.Lset64bytes
- ;; LOOP START
- PREALLOC_INSTR r3, 64 ; alloc next line w/o fetching
- #ifdef CONFIG_ARC_HAS_LL64
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- #else
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- #endif
- .Lset64bytes:
- lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
- lpnz .Lset32bytes
- ;; LOOP START
- #ifdef CONFIG_ARC_HAS_LL64
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- std.ab r4, [r3, 8]
- #else
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- st.ab r4, [r3, 4]
- #endif
- .Lset32bytes:
- and.f lp_count, r2, 0x1F ;Last remaining 31 bytes
- .Lsmallchunk:
- lpnz .Lcopy3bytes
- ;; LOOP START
- stb.ab r1, [r3, 1]
- .Lcopy3bytes:
- j [blink]
- END_CFI(memset)
- ENTRY_CFI(memzero)
- ; adjust bzero args to memset args
- mov r2, r1
- b.d memset ;tail call so need to tinker with blink
- mov r1, 0
- END_CFI(memzero)
|