/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * The algorithm is as follows:
 *
 * For lengths of 7 or fewer bytes, store individual bytes.
 *
 * For lengths of fewer than 32 bytes, align the address on a 4-byte
 * boundary, store as many 4-byte chunks as possible, then store the
 * trailing bytes.
 *
 * For lengths of 32 bytes or more, align the address on an 8-byte boundary.
 * if (count >= 64) {
 *	store 8-byte chunks to align the address on a 64-byte boundary
 *	if (value to be set is zero && count >= MIN_ZERO) {
 *		Using BIS stores, set the first long word of each
 *		64-byte cache line to zero, which also clears the
 *		other seven long words of the cache line.
 *	}
 *	else if (count >= MIN_LOOP) {
 *		Using BIS stores, set the first long word of each of
 *		ST_CHUNK cache lines (64 bytes each) before the main
 *		loop is entered.
 *		In the main loop, continue pre-setting the first long
 *		word of each cache line ST_CHUNK lines in advance while
 *		setting the other seven long words (56 bytes) of each
 *		cache line until fewer than ST_CHUNK*64 bytes remain.
 *		Then set the remaining seven long words of each cache
 *		line that has already had its first long word set.
 *	}
 *	store remaining data in 64-byte chunks until fewer than
 *	64 bytes remain.
 * }
 * Store as many 8-byte chunks as possible, then store the trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore. The
 * benefit of the BIS store must be balanced against the cost of the
 * membar operation.
 */
/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */
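/*
 * For reference, a hedged C model of the two-phase pattern described
 * above. stxa_mru() and stxa_bi() are illustrative stand-ins for the
 * "stxa ... %asi" (ASI_STBIMRU_P) and "stxa ... ASI_STBI_P" stores; in
 * this portable sketch they degrade to plain 8-byte stores and do not
 * capture the cache-line-initializing side effect. The assembly also
 * handles the final partial chunk separately (.wr_loop_small, full-line
 * stores with no priming pass); this sketch folds it into one pattern.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void stxa_mru(void *p, uint64_t v) { memcpy(p, &v, 8); }
 *	static void stxa_bi(void *p, uint64_t v)  { memcpy(p, &v, 8); }
 *
 *	// dst is 64-byte aligned and n is a multiple of 64
 *	static void bis_fill(unsigned char *dst, size_t n, uint64_t v)
 *	{
 *		enum { CHUNK = 24 };		// ST_CHUNK
 *		size_t lines, i, k;
 *
 *		while (n >= 64) {
 *			lines = n / 64;
 *			if (lines > CHUNK)
 *				lines = CHUNK;
 *			// phase 1: pre-set the first long word of each
 *			// line, leaving the lines most recently used
 *			for (i = 0; i < lines; i++)
 *				stxa_mru(dst + i * 64, v);
 *			// phase 2: fill the other seven long words; the
 *			// last store of each line marks it LRU
 *			for (i = 0; i < lines; i++) {
 *				for (k = 8; k < 56; k += 8)
 *					stxa_mru(dst + i * 64 + k, v);
 *				stxa_bi(dst + i * 64 + 56, v);
 *			}
 *			dst += lines * 64;
 *			n -= lines * 64;
 *		}
 *		// the real code then issues membar #StoreStore
 *	}
 */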
#include <asm/asi.h>
#include <asm/page.h>

#define	ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define	ASI_STBIMRU_P	ASI_ST_BLKINIT_MRU_P

#define	ST_CHUNK	24	/* multiple of 4 due to loop unrolling */
#define	MIN_LOOP	16320
#define	MIN_ZERO	512

	.section	".text"
	.align		32
/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 */
	.globl	M7clear_page
	.globl	M7clear_user_page
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */
	.size	M7clear_page,.-M7clear_page
	.size	M7clear_user_page,.-M7clear_user_page
/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 */
	.globl	M7bzero
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */
	.size	M7bzero,.-M7bzero
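/*
 * In C terms the two wrappers above are simply (PAGE_SIZE as defined in
 * <asm/page.h>):
 *
 *	void *M7memset(void *, int, size_t);
 *
 *	void M7clear_page(void *dest) { M7memset(dest, 0, PAGE_SIZE); }
 *	void *M7bzero(void *dest, size_t n) { return M7memset(dest, 0, n); }
 */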
	.global	M7memset
	.type	M7memset, #function
	.register	%g3, #scratch
M7memset:
	mov	%o0, %o5		! copy sp1 before using it
	cmp	%o2, 7			! if small counts, just write bytes
	bleu,pn	%xcc, .wrchar
	and	%o1, 0xff, %o1		! o1 is (char)c
	sll	%o1, 8, %o3
	or	%o1, %o3, %o1		! now o1 has 2 bytes of c
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%xcc, .wdalign
	or	%o1, %o3, %o1		! now o1 has 4 bytes of c
	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1		! now o1 has 8 bytes of c
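/*
 * The shift/or sequence above widens the low byte of c into 2-, 4-, and
 * finally 8-byte copies of itself; equivalently in C (rep8 is an
 * illustrative name, not part of this file):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t rep8(int c)
 *	{
 *		uint64_t v = c & 0xff;	// 1 byte of c
 *		v |= v << 8;		// 2 bytes of c
 *		v |= v << 16;		// 4 bytes of c
 *		v |= v << 32;		// 8 bytes of c
 *		return v;
 *	}
 */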
.dbalign:
	andcc	%o5, 7, %o3		! is sp1 aligned on an 8-byte bound?
	bz,pt	%xcc, .blkalign		! already long word aligned
	sub	%o3, 8, %o3		! -(bytes till long word aligned)
	add	%o2, %o3, %o2		! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb	%o1, [%o5]		! there is at least 1 byte to set
	inccc	%o3			! byte storing loop
	bl,pt	%xcc, 1b
	inc	%o5
	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp	%o2, 64			! check if there are 64 bytes to set
	blu,pn	%xcc, .wrshort
	mov	%o2, %o3
	andcc	%o5, 63, %o3		! is sp1 block aligned?
	bz,pt	%xcc, .blkwr		! already block aligned
	sub	%o3, 64, %o3		! o3 is -(bytes till block aligned)
	add	%o2, %o3, %o2		! o2 is the remainder
	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%xcc, 1b
	add	%o5, 8, %o5
	! Now sp1 is block aligned
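/*
 * Both alignment loops above use the same negative-counter idiom. A
 * hedged C sketch of the 64-byte step (align64 is an illustrative name;
 * sp is already 8-byte aligned, so sp & 63 is a multiple of 8, and v is
 * the replicated 8-byte pattern):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static unsigned char *align64(unsigned char *sp, size_t *n, uint64_t v)
 *	{
 *		long pad = (long)((uintptr_t)sp & 63);
 *		if (!pad)
 *			return sp;		// already block aligned
 *		pad -= 64;			// -(bytes till aligned)
 *		*n += pad;			// count left once aligned
 *		do {
 *			memcpy(sp, &v, 8);	// one stx
 *			sp += 8;
 *		} while ((pad += 8) < 0);
 *		return sp;
 *	}
 */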
.blkwr:
	andn	%o2, 63, %o4		! calculate size of blocks in bytes
	brz,pn	%o1, .wrzero		! special case if c == 0
	and	%o2, 63, %o3		! %o3 = bytes left after blk stores.
	set	MIN_LOOP, %g1
	cmp	%o4, %g1		! check there are enough bytes to set
	blu,pn	%xcc, .short_set	! to justify cost of membar
					! must be > pre-cleared lines
	nop
	! initial cache-clearing stores
	! get store pipeline moving
	rd	%asi, %g3		! save %asi to be restored later
	wr	%g0, ASI_STBIMRU_P, %asi
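/*
 * Taken together, the checks at .blkwr (and at .wrzero below) pick one
 * of three strategies for the block-sized portion. A hedged summary in
 * C, with blk = n & ~63 and illustrative function names that merely
 * mirror the labels:
 *
 *	if (c == 0 && blk >= MIN_ZERO)
 *		wrzero(sp, blk);	// one BIS store per cache line
 *	else if (c != 0 && blk >= MIN_LOOP)
 *		wr_loop(sp, blk, v);	// BIS fill with advance stores
 *	else
 *		short_set(sp, blk, v);	// plain 8-byte store loop
 */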
	! Primary memset loop for large memsets
.wr_loop:
	sub	%o5, 8, %o5		! adjust %o5 for ASI store alignment
	mov	ST_CHUNK, %g1
.wr_loop_start:
	stxa	%o1, [%o5+8]%asi
	subcc	%g1, 4, %g1
	stxa	%o1, [%o5+8+64]%asi
	add	%o5, 256, %o5
	stxa	%o1, [%o5+8-128]%asi
	bgu	%xcc, .wr_loop_start
	stxa	%o1, [%o5+8-64]%asi

	sub	%o5, ST_CHUNK*64, %o5	! reset %o5
	mov	ST_CHUNK, %g1

.wr_loop_rest:
	stxa	%o1, [%o5+8+8]%asi
	sub	%o4, 64, %o4
	stxa	%o1, [%o5+16+8]%asi
	subcc	%g1, 1, %g1
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu	%xcc, .wr_loop_rest
	stxa	%o1, [%o5]ASI_STBI_P

	! If more than ST_CHUNK*64 bytes remain to set, continue
	! setting the first long word of each cache line in advance
	! to keep the store pipeline moving.
	cmp	%o4, ST_CHUNK*64
	bge,pt	%xcc, .wr_loop_start
	mov	ST_CHUNK, %g1

	brz,a,pn %o4, .asi_done
	add	%o5, 8, %o5		! restore %o5 offset
.wr_loop_small:
	stxa	%o1, [%o5+8]%asi
	stxa	%o1, [%o5+8+8]%asi
	stxa	%o1, [%o5+16+8]%asi
	stxa	%o1, [%o5+24+8]%asi
	stxa	%o1, [%o5+32+8]%asi
	subcc	%o4, 64, %o4
	stxa	%o1, [%o5+40+8]%asi
	add	%o5, 64, %o5
	stxa	%o1, [%o5-8]%asi
	bgu,pt	%xcc, .wr_loop_small
	stxa	%o1, [%o5]ASI_STBI_P

	ba	.asi_done
	add	%o5, 8, %o5		! restore %o5 offset
	! Special case loop for zero fill memsets
	! For each 64 byte cache line, single STBI to first element
	! clears line
.wrzero:
	cmp	%o4, MIN_ZERO		! check if enough bytes to set
					! to pay %asi + membar cost
	blu	%xcc, .short_set
	nop
	sub	%o4, 256, %o4

.wrzero_loop:
	mov	64, %g3
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 256, %o4
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%g3, 64, %g3
	bge,pt	%xcc, .wrzero_loop
	stxa	%o1, [%o5+%g3]ASI_STBI_P
	add	%o4, 256, %o4

	brz,pn	%o4, .bsi_done
	nop

.wrzero_small:
	stxa	%o1, [%o5]ASI_STBI_P
	subcc	%o4, 64, %o4
	bgu,pt	%xcc, .wrzero_small
	add	%o5, 64, %o5

	ba,a	.bsi_done
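/*
 * A hedged C model of the zero path above: one block-init store per
 * 64-byte line, four lines per unrolled pass. stxa_bi() is the same
 * illustrative stand-in as in the earlier sketch; it only stores 8
 * bytes, whereas the real ASI_STBI_P store also clears the other seven
 * long words of the line:
 *
 *	static void wrzero(unsigned char *dst, size_t n)  // n % 64 == 0
 *	{
 *		while (n >= 256) {		// 4 lines per pass
 *			stxa_bi(dst, 0);
 *			stxa_bi(dst + 64, 0);
 *			stxa_bi(dst + 128, 0);
 *			stxa_bi(dst + 192, 0);
 *			dst += 256;
 *			n -= 256;
 *		}
 *		for (; n != 0; n -= 64, dst += 64)	// leftover lines
 *			stxa_bi(dst, 0);
 *		// the real code then issues membar #StoreStore
 *	}
 */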
.asi_done:
	wr	%g3, 0x0, %asi		! restore saved %asi
.bsi_done:
	membar	#StoreStore		! required by use of Block Store Init
.short_set:
	cmp	%o4, 64			! check if 64 bytes to set
	blu	%xcc, 5f
	nop
4:					! set final blocks of 64 bytes
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%xcc, 4b
	stx	%o1, [%o5-8]
5:
	! Set the remaining long words
.wrshort:
	subcc	%o3, 8, %o3		! Can we store any long words?
	blu,pn	%xcc, .wrchars
	and	%o2, 7, %o2		! calc bytes left after long words
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		! store the long words
	bgeu,pt	%xcc, 6b
	add	%o5, 8, %o5

.wrchars:				! check for extra chars
	brnz	%o2, .wrfin
	nop
	retl
	nop
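/*
 * A hedged C model of the tail handling above (tail_set is an
 * illustrative name; blk and rem correspond to %o4 and %o3/%o2, and v
 * is the replicated 8-byte pattern):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void tail_set(unsigned char *sp, size_t blk, size_t rem,
 *			     uint64_t v)
 *	{
 *		size_t i;
 *		for (; blk >= 64; blk -= 64, sp += 64)	// label 4:
 *			for (i = 0; i < 64; i += 8)	// eight stx
 *				memcpy(sp + i, &v, 8);
 *		for (; rem >= 8; rem -= 8, sp += 8)	// label 6:
 *			memcpy(sp, &v, 8);
 *		while (rem--)				// .wrfin
 *			*sp++ = (unsigned char)v;	// low byte is c
 *	}
 */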
	! Handle lengths of fewer than 32 bytes
.wdalign:
	andcc	%o5, 3, %o3		! is sp1 aligned on a word boundary?
	bz,pn	%xcc, .wrword
	andn	%o2, 3, %o3		! create word sized count in %o3

	dec	%o2			! decrement count
	stb	%o1, [%o5]		! store a byte
	b	.wdalign
	inc	%o5			! next byte

.wrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		! 4-byte writing loop
	bnz,pt	%xcc, .wrword
	add	%o5, 4, %o5

	and	%o2, 3, %o2		! leftover count, if any
.wrchar:
	! Set the remaining bytes, if any
	brz	%o2, .exit
	nop
.wrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%xcc, .wrfin
	inc	%o5

.exit:
	retl				! %o0 was preserved
	nop
	.size	M7memset,.-M7memset