/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *		if (n != 0) {
 *			char *sp = sp1;
 *			do {
 *				*sp++ = (char)c;
 *			} while (--n != 0);
 *		}
 *		return (sp1);
 *	}
 *
 * The algorithm is as follows:
 *
 * For short counts of 7 or fewer bytes, the bytes are stored individually.
 *
 * For counts of fewer than 32 bytes, align the address on a 4-byte
 * boundary, then store as many 4-byte chunks as possible, followed by the
 * trailing bytes.
 *
 * For counts of 32 bytes or more, align the address on an 8-byte boundary.
 * if (count >= 64) {
 *	store 8-byte chunks to align the address on a 64-byte boundary
 *	if (value to be set is zero && count >= MIN_ZERO) {
 *		Using BIS stores, set the first long word of each
 *		64-byte cache line to zero, which will also clear the
 *		other seven long words of the cache line.
 *	}
 *	else if (count >= MIN_LOOP) {
 *		Using BIS stores, set the first long word of each of
 *		ST_CHUNK cache lines (64 bytes each) before the main
 *		loop is entered.
 *		In the main loop, continue pre-setting the first long
 *		word of each cache line ST_CHUNK lines in advance while
 *		setting the other seven long words (56 bytes) of each
 *		cache line, until fewer than ST_CHUNK*64 bytes remain.
 *		Then set the remaining seven long words of each cache
 *		line that has already had its first long word set.
 *	}
 *	store remaining data in 64-byte chunks until fewer than
 *	64 bytes remain.
 * }
 * Store as many 8-byte chunks as possible, followed by the trailing bytes.
 * (A C sketch of this dispatch appears after these comment blocks.)
 *
 * BIS = Block Init Store
 * Doing the advance store of the first element of the cache line
 * initiates the displacement of a cache line while only using a single
 * instruction in the pipeline. That avoids various pipeline delays,
 * such as filling the miss buffer. The performance effect is
 * similar to prefetching for normal stores.
 * The special case for zero fills runs faster and uses fewer instruction
 * cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore. The
 * benefit of the BIS store must be balanced against the cost of the
 * membar operation.
 */
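/*
 * A minimal C sketch of the size dispatch described above. It is
 * illustrative only: the helper name m7_memset_sketch is hypothetical,
 * and the BIS cache-line paths are modeled as plain 8-byte stores since
 * Block Init Store has no C equivalent.
 *
 *	void *m7_memset_sketch(void *sp1, int c, size_t n)
 *	{
 *		unsigned char *sp = sp1;
 *		unsigned long x = (unsigned char)c;
 *
 *		if (n <= 7) {			// short counts: store bytes
 *			while (n--)
 *				*sp++ = (unsigned char)c;
 *			return sp1;
 *		}
 *		x |= x << 8;			// replicate c to all 8 bytes
 *		x |= x << 16;
 *		x |= x << 32;
 *		if (n < 32) {			// 4-byte chunk path
 *			for (; (unsigned long)sp & 3; n--)
 *				*sp++ = (unsigned char)c;
 *			for (; n >= 4; n -= 4, sp += 4)
 *				*(unsigned int *)sp = (unsigned int)x;
 *		} else {			// 8-byte (and 64-byte) path
 *			for (; (unsigned long)sp & 7; n--)
 *				*sp++ = (unsigned char)c;
 *			for (; n >= 8; n -= 8, sp += 8)
 *				*(unsigned long *)sp = x;
 *		}
 *		while (n--)			// trailing bytes
 *			*sp++ = (unsigned char)c;
 *		return sp1;
 *	}
 */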
/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */
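/*
 * Concretely, in the non-zero BIS path below, each 64-byte line is
 * written as follows: its first long word is pre-set well in advance
 * with ASI_STBIMRU_P (.wr_loop_start), the next six long words are also
 * stored with ASI_STBIMRU_P, and the final long word uses ASI_STBI_P
 * (.wr_loop_rest), so the line is only marked LRU once it is complete.
 * The whole sequence is closed out by the membar #StoreStore at .bsi_done.
 */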
#include <asm/asi.h>
#include <asm/page.h>

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLKINIT_MRU_P

#define ST_CHUNK	24	/* multiple of 4 due to loop unrolling */
#define MIN_LOOP	16320
#define MIN_ZERO	512
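/*
 * MIN_LOOP and MIN_ZERO are the break-even counts (in bytes) for the
 * pattern and zero-fill BIS paths respectively: below these, the membar
 * #StoreStore that must follow a run of BIS stores is assumed to cost
 * more than the BIS stores save.
 */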
	.section	".text"
	.align	32
/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later).
 */
	.globl	M7clear_page
	.globl	M7clear_user_page
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */
	.size	M7clear_page, .-M7clear_page
	.size	M7clear_user_page, .-M7clear_user_page
/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later).
 */
	.globl	M7bzero
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */
	.size	M7bzero, .-M7bzero
	.global	M7memset
	.type	M7memset, #function
	.register	%g3, #scratch
M7memset:
	mov	%o0, %o5		! copy sp1 before using it
	cmp	%o2, 7			! if small counts, just write bytes
	bleu,pn	%xcc, .wrchar
	 and	%o1, 0xff, %o1		! o1 is (char)c

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		! now o1 has 2 bytes of c
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%xcc, .wdalign
	 or	%o1, %o3, %o1		! now o1 has 4 bytes of c

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		! now o1 has 8 bytes of c
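	! The shift/or pairs above are the usual byte-replication idiom;
	! in C (sketch): x = c & 0xff; x |= x<<8; x |= x<<16; x |= x<<32,
	! leaving eight copies of (char)c in %o1.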
.dbalign:
	andcc	%o5, 7, %o3		! is sp1 aligned on an 8-byte boundary?
	bz,pt	%xcc, .blkalign		! already long word aligned
	 sub	%o3, 8, %o3		! -(bytes till long word aligned)

	add	%o2, %o3, %o2		! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb	%o1, [%o5]		! there is at least 1 byte to set
	inccc	%o3			! byte store loop
	bl,pt	%xcc, 1b
	 inc	%o5

	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp	%o2, 64			! check if there are 64 bytes to set
	blu,pn	%xcc, .wrshort
	 mov	%o2, %o3

	andcc	%o5, 63, %o3		! is sp1 block aligned?
	bz,pt	%xcc, .blkwr		! now block aligned
	 sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
	add	%o2, %o3, %o2		! o2 is the remainder

	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%xcc, 1b
	 add	%o5, 8, %o5

	! Now sp1 is block aligned
.blkwr:
	andn	%o2, 63, %o4		! calculate size of blocks in bytes
	brz,pn	%o1, .wrzero		! special case if c == 0
	 and	%o2, 63, %o3		! %o3 = bytes left after blk stores.

	set	MIN_LOOP, %g1
	cmp	%o4, %g1		! check there are enough bytes to set
	blu,pn	%xcc, .short_set	! to justify cost of membar
					! must be > pre-cleared lines
	 nop

	! initial cache-clearing stores
	! get store pipeline moving
	rd	%asi, %g3		! save %asi to be restored later
	wr	%g0, ASI_STBIMRU_P, %asi

	! Primary memset loop for large memsets
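	! .wr_loop_start pre-sets the first long word of four cache lines
	! per iteration (ST_CHUNK lines in all); .wr_loop_rest then fills
	! in the remaining seven long words of each of those lines, ending
	! each line with an ASI_STBI_P store.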
.wr_loop:
	sub	%o5, 8, %o5		! adjust %o5 for ASI store alignment
	mov	ST_CHUNK, %g1
.wr_loop_start:
	stxa	%o1, [%o5+8]%asi	! pre-set first word of line 0
	subcc	%g1, 4, %g1		! four lines per iteration
	stxa	%o1, [%o5+8+64]%asi	! ... of line 1
	add	%o5, 256, %o5
	stxa	%o1, [%o5+8-128]%asi	! ... of line 2
	bgu	%xcc, .wr_loop_start
	 stxa	%o1, [%o5+8-64]%asi	! ... of line 3

	sub	%o5, ST_CHUNK*64, %o5	! reset %o5
	mov	ST_CHUNK, %g1

.wr_loop_rest:
	stxa	%o1, [%o5+8+8]%asi	! fill words 1-6 of each line
	sub	%o4, 64, %o4
	stxa	%o1, [%o5+16+8]%asi
	subcc	%g1, 1, %g1
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu	%xcc, .wr_loop_rest
	 stxa	%o1, [%o5]ASI_STBI_P	! word 7 last, marking the line LRU

	! If more than ST_CHUNK*64 bytes remain to set, continue
	! setting the first long word of each cache line in advance
	! to keep the store pipeline moving.
	cmp	%o4, ST_CHUNK*64
	bge,pt	%xcc, .wr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .asi_done
	 add	%o5, 8, %o5		! restore %o5 offset
.wr_loop_small:
	stxa	%o1, [%o5+8]%asi
	stxa	%o1, [%o5+8+8]%asi
	stxa	%o1, [%o5+16+8]%asi
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	subcc	%o4, 64, %o4
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu,pt	%xcc, .wr_loop_small
	 stxa	%o1, [%o5]ASI_STBI_P

	ba	.asi_done
	 add	%o5, 8, %o5		! restore %o5 offset
	! Special case loop for zero fill memsets
	! For each 64 byte cache line, a single STBI store to the first
	! element clears the whole line
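	! The loop below clears four cache lines (256 bytes) per iteration,
	! using %g3 as a rotating line offset: %g3 starts at 64, becomes
	! -128 once %o5 has advanced by 256, then -64, so the four STBI
	! stores hit lines 0 through 3 of each 256-byte group.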
.wrzero:
	cmp	%o4, MIN_ZERO		! check if enough bytes to set
					! to pay the cost of the membar
	blu	%xcc, .short_set
	 nop
	sub	%o4, 256, %o4

.wrzero_loop:
	mov	64, %g3
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 256, %o4
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%g3, 64, %g3
	bge,pt	%xcc, .wrzero_loop
	 stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o4, 256, %o4

	brz,pn	%o4, .bsi_done
	 nop

.wrzero_small:
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 64, %o4
	bgu,pt	%xcc, .wrzero_small
	 add	%o5, 64, %o5

	ba,a	.bsi_done
.asi_done:
	wr	%g3, 0x0, %asi		! restore saved %asi
.bsi_done:
	membar	#StoreStore		! required by use of Block Store Init
.short_set:
	cmp	%o4, 64			! check if 64 bytes to set
	blu	%xcc, 5f
	 nop
4:					! set final blocks of 64 bytes
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%xcc, 4b
	 stx	%o1, [%o5-8]
5:
	! Set the remaining long words
.wrshort:
	subcc	%o3, 8, %o3		! Can we store any long words?
	blu,pn	%xcc, .wrchars
	 and	%o2, 7, %o2		! calc bytes left after long words
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		! store the long words
	bgeu,pt	%xcc, 6b
	 add	%o5, 8, %o5

.wrchars:				! check for extra chars
	brnz	%o2, .wrfin
	 nop
	retl
	 nop
.wdalign:
	andcc	%o5, 3, %o3		! is sp1 aligned on a word boundary?
	bz,pn	%xcc, .wrword
	 andn	%o2, 3, %o3		! create word sized count in %o3

	dec	%o2			! decrement count
	stb	%o1, [%o5]		! store a byte
	b	.wdalign
	 inc	%o5			! next byte

.wrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		! 4-byte writing loop
	bnz,pt	%xcc, .wrword
	 add	%o5, 4, %o5

	and	%o2, 3, %o2		! leftover count, if any
.wrchar:
	! Set the remaining bytes, if any
	brz	%o2, .exit
	 nop
.wrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%xcc, .wrfin
	 inc	%o5

.exit:
	retl				! %o0 was preserved
	 nop

	.size	M7memset, .-M7memset