/* memset-archs.S — ARC-optimized memset/memzero (presumably ARCv2 HS cores — confirm against build) */
  1. /*
  2. * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License version 2 as
  6. * published by the Free Software Foundation.
  7. */
  8. #include <linux/linkage.h>
  9. #include <asm/cache.h>
  10. /*
  11. * The memset implementation below is optimized to use prefetchw and prealloc
  12. * instruction in case of CPU with 64B L1 data cache line (L1_CACHE_SHIFT == 6)
  13. * If you want to implement optimized memset for other possible L1 data cache
  14. * line lengths (32B and 128B) you should rewrite code carefully checking
  15. * we don't call any prefetchw/prealloc instruction for L1 cache lines which
  16. * don't belongs to memset area.
  17. */
  18. #if L1_CACHE_SHIFT == 6
  19. .macro PREALLOC_INSTR reg, off
  20. prealloc [\reg, \off]
  21. .endm
  22. .macro PREFETCHW_INSTR reg, off
  23. prefetchw [\reg, \off]
  24. .endm
  25. #else
  26. .macro PREALLOC_INSTR
  27. .endm
  28. .macro PREFETCHW_INSTR
  29. .endm
  30. #endif
;
; void *memset(void *s /* r0 */, int c /* r1 */, size_t n /* r2 */)
;
; Returns the original destination pointer in r0; r3 is used as the
; working write pointer so r0 survives untouched.  Strategy: byte-store
; up to 4-byte alignment, replicate the fill byte into a 32-bit pattern
; (held in both r4 and r5 so `std` can emit 64 bits per instruction),
; then blast 64-byte cache lines, then 32-byte chunks, then a byte tail.
; Uses ARC zero-overhead loops (lp_count + lpnz) and delay-slot branches
; (.d suffix: the following instruction executes before the branch takes).
;
ENTRY_CFI(memset)
	PREFETCHW_INSTR	r0, 0		; Prefetch the first write location
	mov.f	0, r2			; set Z flag from the length
;;; if size is zero
	jz.d	[blink]			; n == 0: return straight away...
	mov	r3, r0			; (delay slot) ...r3 = dst; don't clobber ret val
;;; if length < 8
	brls.d.nt	r2, 8, .Lsmallchunk	; n <= 8 (unsigned): byte loop only
	mov.f	lp_count, r2		; (delay slot) byte count for .Lsmallchunk
	and.f	r4, r0, 0x03		; r4 = dst & 3; Z set when already aligned
	rsub	lp_count, r4, 4		; lp_count = 4 - (dst & 3) bytes to align
	lpnz	@.Laligndestination	; zero-overhead loop; skipped when Z set
	;; LOOP BEGIN
	stb.ab	r1, [r3,1]		; store byte, post-increment r3
	sub	r2, r2, 1		; one less byte remaining
.Laligndestination:
;;; Destination is aligned
	;; Replicate the low byte of c across 32 bits, into BOTH r4 and r5
	;; (std.ab r4 below stores the r4:r5 register pair = 64 bits).
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1		; r4 = pattern in low 16 bits
	asl	r5, r4, 16
	or	r5, r5, r4		; r5 = pattern across 32 bits
	mov	r4, r5			; r4 = same 32-bit pattern
	;; Split remaining length into 64B chunks + remainder.
	;; NOTE(review): sub3/add3 shift their last operand left by 3,
	;; so "8" below means 64 — confirm against the ARCv2 ISA manual.
	sub3	lp_count, r2, 8		; lp_count = n - 64
	cmp	r2, 64
	bmsk.hi	r2, r2, 5		; n > 64:  r2 = n & 63 ...
	mov.ls	lp_count, 0		; n <= 64: no 64-byte iterations
	add3.hi	r2, r2, 8		; ... + 64 left for the smaller loops
;;; Convert len to Dwords, unfold x8
	lsr.f	lp_count, lp_count, 6	; number of 64-byte iterations (Z if none)
	lpnz	@.Lset64bytes
	;; LOOP START — one full 64B cache line per iteration
	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching
#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]		; 8 x 8-byte stores (r4:r5 pair)
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]		; 16 x 4-byte stores
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset64bytes:
	lsr.f	lp_count, r2, 5 ;Last remaining max 124 bytes
	lpnz	.Lset32bytes
	;; LOOP START — one 32B chunk per iteration
#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]		; 4 x 8-byte stores
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]		; 8 x 4-byte stores
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset32bytes:
	and.f	lp_count, r2, 0x1F ;Last remaining 31 bytes
.Lsmallchunk:
	lpnz	.Lcopy3bytes		; byte-at-a-time tail (skipped if Z set)
	;; LOOP START
	stb.ab	r1, [r3, 1]
.Lcopy3bytes:
	j	[blink]			; return r0 (never touched after entry)
END_CFI(memset)
  119. ENTRY_CFI(memzero)
  120. ; adjust bzero args to memset args
  121. mov r2, r1
  122. b.d memset ;tail call so need to tinker with blink
  123. mov r1, 0
  124. END_CFI(memzero)