123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544 |
/*
 * memcpy -- entry point, fast-path dispatch and set-up for the general copy.
 *
 * NOTE(review): this copy of the file is damaged. Every instruction that
 * takes an immediate is cut off in the middle of its operand list, and the
 * definitions of the symbolic operand names (ptr_in, ptr_out, len, ...)
 * are not visible here -- presumably preprocessor macros earlier in the
 * original file. The comments describe only what the surviving text shows;
 * anything that depends on a lost operand is marked as an assumption.
 */
- .section .text
- .p2align 4
- .global memcpy
- .type memcpy, @function
- memcpy:
- {
/* Two tests of len against constants (lost), an OR of both pointers for a
   later alignment test, and a source == destination check. */
- p2 = cmp.eq(len,
- align888 = or(ptr_in, ptr_out);
- p0 = cmp.gtu(len,
- p1 = cmp.eq(ptr_in, ptr_out);
- }
- {
/* p1 = (len test) OR (ptr_in == ptr_out). len is folded into align888 so a
   single bit test can cover both pointers and the length. */
- p1 = or(p2, p1);
- p3 = cmp.gtu(len,
- align888 = or(align888, len);
/* shift amount lost; the name suggests a count of 8-byte units -- confirm */
- len8 = lsr(len,
- }
- {
/* Prefetch at the source pointer, test selected bits of align888 (mask
   lost), and return at once when p1 is set. */
- dcfetch(ptr_in);
- p2 = bitsclr(align888,
- if(p1) jumpr r31;
- }
- {
/* When the bit test passed and p3 is false, adjust len (amount lost) and
   take the simple dword copy loop. */
- p2 = and(p2,!p3);
- if (p2.new) len = add(len,
- if (p2.new) jump:NT .Ldwordaligned;
- }
- {
/* When p0 is false use the byte-at-a-time loop; otherwise begin building
   mask and prolog (both source operands lost). */
- if(!p0) jump .Lbytes23orless;
- mask.l =
-
- prolog = sub(
- }
- {
-
/* Frame size lost. */
- allocframe(
- mask.h =
/* ptr_in_p_128 is the pointer handed to dcfetch below; its initial offset
   from ptr_in is lost. */
- ptr_in_p_128 = add(ptr_in,
/* back = number of leading zero bits in len */
- back = cl0(len);
- }
- {
/* Store to the stack frame (offset and stored register lost). Presumably
   saves one of the register pairs that .Lmemcpy_return reloads -- confirm. */
- memd(sp+
/* Low half of r31 is loaded with a value that is lost here; presumably the
   address of .Lmemcpy_return so later "jumpr r31" goes through the restore
   code -- confirm against the original. */
- r31.l =
/* Limit prolog with mask shifted right by the leading-zero count of len. */
- prolog &= lsr(mask, back);
- offset = and(ptr_in,
- }
- {
- memd(sp+
/* Distance from source to destination; masked further down. */
- dalign = sub(ptr_out, ptr_in);
- r31.h =
- }
- {
-
/* over = address one past the last source byte (masked further down) */
- over = add(len, ptr_in);
- back = add(len, offset);
- memd(sp+
- }
- {
/* Both instructions read the value prolog had before this packet. */
- noprolog = bitsclr(prolog,
- prolog = and(prolog,
/* dcfetch uses the pointer value from before this packet; the add then
   advances it (step lost). The same pairing repeats in the packets below. */
- dcfetch(ptr_in_p_128);
- ptr_in_p_128 = add(ptr_in_p_128,
- }
- {
/* kernel = bytes left after the prolog (converted by a shift below) */
- kernel = sub(len, prolog);
- shift = asl(prolog,
- star3 = and(prolog,
/* mask lost; presumably rounds ptr_in down to an aligned address -- confirm */
- ptr_in = and(ptr_in,
- }
- {
- prolog = lsr(prolog,
- epilog = and(kernel,
/* Uses prolog as it was before the lsr in this same packet. */
- ptr_out_p_32 = add(ptr_out, prolog);
- over = and(over,
- }
- {
/* p3 gates the conditional load of dataF8 at .Lskip64. */
- p3 = cmp.gtu(back,
- kernel = lsr(kernel,
- dcfetch(ptr_in_p_128);
- ptr_in_p_128 = add(ptr_in_p_128,
- }
- {
/* Adjust prolog (amount lost) unless it equals the lost comparison value. */
- p1 = cmp.eq(prolog,
- if(!p1.new) prolog = add(prolog,
- dcfetch(ptr_in_p_128);
- ptr_in_p_128 = add(ptr_in_p_128,
- }
- {
- nokernel = cmp.eq(kernel,
- dcfetch(ptr_in_p_128);
- ptr_in_p_128 = add(ptr_in_p_128,
- shiftb = and(shift,
- }
- {
- dcfetch(ptr_in_p_128);
- ptr_in_p_128 = add(ptr_in_p_128,
/* Skip the two dczeroa packets below when nokernel is set. */
- if(nokernel) jump .Lskip64;
- p2 = cmp.eq(kernel,
- }
- {
/* dczeroa at ptr_out_p_32, then advance that pointer unless p2 is set
   (step lost). */
- dczeroa(ptr_out_p_32);
-
- if(!p2) ptr_out_p_32 = add(ptr_out_p_32,
- }
- {
- dalign = and(dalign,
- dczeroa(ptr_out_p_32);
- }
/*
 * .Lskip64 -- load the first source doubleword(s) and, unless noprolog is
 * set, emit the leading byte / halfword / word stores selected by bits of
 * "shift".
 * NOTE(review): immediates, post-increment amounts and the stored operands
 * are missing from this copy; bit positions and sizes cannot be confirmed.
 */
- .Lskip64:
- {
/* First 8-byte load from the source; the second load happens only if p3. */
- data70 = memd(ptr_in++
- if(p3) dataF8 = memd(ptr_in+
- if(noprolog) jump .Lnoprolog32;
- align = offset;
- }
- {
/* Combine the two loaded doublewords according to align. */
- ldata0 = valignb(dataF8, data70, align);
- ifbyte = tstbit(shift,
- offset = add(offset, star3);
- }
- {
/* Conditional byte store, then shift ldata0 right by shiftb. The lsr reads
   the shiftb value from before this packet; the new shiftb computed here is
   used by the next packet. */
- if(ifbyte) memb(ptr_out++
- ldata0 = lsr(ldata0, shiftb);
- shiftb = and(shift,
- ifhword = tstbit(shift,
- }
- {
/* Conditional halfword store and a second right shift of ldata0. */
- if(ifhword) memh(ptr_out++
- ldata0 = lsr(ldata0, shiftb);
- ifword = tstbit(shift,
- p2 = cmp.gtu(offset,
- }
- {
/* Conditional word store. If offset exceeded the lost threshold, slide
   dataF8 into data70 and load the next source doubleword. */
- if(ifword) memw(ptr_out++
- if(p2) data70 = dataF8;
- if(p2) dataF8 = memd(ptr_in++
- align = offset;
- }
/*
 * .Lnoprolog32 / .Ldword_loop_prolog -- hardware loop 0 that runs "prolog"
 * iterations, one valignb-combined doubleword per iteration.
 * NOTE(review): the stored operand, post-increments and comparison
 * constants are missing from this copy.
 */
- .Lnoprolog32:
- {
/* sp1loop0 also initialises p3, which guards the store in the loop.
   Presumably p3 is false on the first pass so the store lags the valignb by
   one iteration -- confirm against the Hexagon manual. */
- p3 = sp1loop0(.Ldword_loop_prolog, prolog)
- rest = sub(len, star3);
- p0 = cmp.gt(over,
- }
/* Adjust rest (amount lost) when over exceeds the lost threshold. */
- if(p0) rest = add(rest,
- .Ldword_loop_prolog:
- {
- if(p3) memd(ptr_out++
- ldata0 = valignb(dataF8, data70, align);
/* p0 decides whether the next packet may load another source doubleword. */
- p0 = cmp.gt(rest,
- }
- {
/* Slide the window: old high doubleword becomes the low one. */
- data70 = dataF8;
- if(p0) dataF8 = memd(ptr_in++
- rest = add(rest,
- }:endloop0
/*
 * .Lkernel / .Loword_loop_25to31 -- main block loop. It is taken when
 * dalign does not exceed a threshold (constant lost); otherwise control
 * goes to .Lodd_alignment.
 * Each iteration issues a dcfetch, a dczeroa, four 8-byte loads and up to
 * four 8-byte stores.
 * NOTE(review): stored operands, post-increments and constants are missing
 * from this copy.
 */
- .Lkernel:
- {
-
/* If kernel exceeds a lost constant, adjust both kernel and epilog by lost
   amounts (via p3.new, within this packet). */
- p3 = cmp.gtu(kernel,
-
- if(p3.new) kernel = add(kernel,
-
- if(p3.new) epilog = add(epilog,
- }
- {
/* Go straight to the epilog when the nokernel test (constant lost) holds. */
- nokernel = cmp.eq(kernel,
- if(nokernel.new) jump:NT .Lepilog;
/* inc is the addend for the vaddw in the loop; both halves are lost. */
- inc = combine(
- p3 = cmp.gtu(dalign,
- }
- {
- if(p3) jump .Lodd_alignment;
- }
- {
- loop0(.Loword_loop_25to31, kernel);
- kernel1 = cmp.gtu(kernel,
/* rest keeps the initial kernel value for the p3 test inside the loop. */
- rest = kernel;
- }
- .falign
- .Loword_loop_25to31:
- {
- dcfetch(ptr_in_p_128);
- if(kernel1) ptr_out_p_32 = add(ptr_out_p_32,
- }
- {
- dczeroa(ptr_out_p_32);
/* True while kernel still holds the value copied into rest before the loop;
   presumably only on the first iteration -- confirm. */
- p3 = cmp.eq(kernel, rest);
- }
- {
-
/* One vector add on ptr_in_p_128kernel; presumably a register pair covering
   both ptr_in_p_128 and kernel -- confirm. */
- ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
-
/* Store skipped while p3 is set. */
- if(!p3) memd(ptr_out++
/* Results alternate between ldata1 and ldata0 through the four packets. */
- ldata1 = valignb(dataF8, data70, align);
- data70 = memd(ptr_in++
- }
- {
/* Loads alternate between data70 and dataF8, and the valignb operand order
   swaps to match. */
- memd(ptr_out++
- ldata0 = valignb(data70, dataF8, align);
- dataF8 = memd(ptr_in++
- }
- {
- memd(ptr_out++
- ldata1 = valignb(dataF8, data70, align);
- data70 = memd(ptr_in++
- }
- {
- memd(ptr_out++
- ldata0 = valignb(data70, dataF8, align);
- dataF8 = memd(ptr_in++
/* Recompute kernel1, which guards the ptr_out_p_32 advance at the loop top. */
- kernel1 = cmp.gtu(kernel,
- }:endloop0
- {
/* One more store after the loop, then continue with the epilog. */
- memd(ptr_out++
- jump .Lepilog;
- }
/*
 * .Lodd_alignment / .Loword_loop_00to24 -- alternative block loop, entered
 * from .Lkernel when dalign exceeds the lost threshold.
 * Unlike .Loword_loop_25to31, every store is unconditional, every valignb
 * writes ldata0, and no extra store follows the loop.
 * NOTE(review): stored operands, post-increments and constants are missing
 * from this copy.
 */
- .Lodd_alignment:
- {
- loop0(.Loword_loop_00to24, kernel);
- kernel1 = cmp.gtu(kernel,
- rest = add(kernel,
- }
- .falign
- .Loword_loop_00to24:
- {
- dcfetch(ptr_in_p_128);
/* One vector add on ptr_in_p_128kernel; presumably a register pair covering
   both ptr_in_p_128 and kernel -- confirm. */
- ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
- if(kernel1) ptr_out_p_32 = add(ptr_out_p_32,
- }
- {
- dczeroa(ptr_out_p_32);
- }
- {
/* Four store / valignb / load packets. Loads alternate between data70 and
   dataF8, and the valignb operand order swaps to match. */
- memd(ptr_out++
- ldata0 = valignb(dataF8, data70, align);
- data70 = memd(ptr_in++
- }
- {
- memd(ptr_out++
- ldata0 = valignb(data70, dataF8, align);
- dataF8 = memd(ptr_in++
- }
- {
- memd(ptr_out++
- ldata0 = valignb(dataF8, data70, align);
- data70 = memd(ptr_in++
- }
- {
- memd(ptr_out++
- ldata0 = valignb(data70, dataF8, align);
- dataF8 = memd(ptr_in++
/* Recompute kernel1, which guards the ptr_out_p_32 advance at the loop top. */
- kernel1 = cmp.gtu(kernel,
- }:endloop0
/*
 * .Lepilog / .Ldword_loop_epilog -- tail handling after the block loops.
 * Returns early when noepilog is set; otherwise runs a hardware loop of
 * epilogdws iterations, or jumps straight to .Lepilog60 when p3 (test of
 * epilogdws, constant lost) is set.
 * NOTE(review): stored operands, post-increments and constants are missing
 * from this copy.
 */
- .Lepilog:
- {
- noepilog = cmp.eq(epilog,
/* Shift amount lost; the name suggests a count of doublewords -- confirm. */
- epilogdws = lsr(epilog,
/* kernel is reused as a scratch value from here on. */
- kernel = and(epilog,
- }
- {
/* With noepilog set: subtract len from ptr_out and return through r31.
   Presumably this rewinds ptr_out to the original destination -- confirm. */
- if(noepilog) jumpr r31;
- if(noepilog) ptr_out = sub(ptr_out, len);
- p3 = cmp.eq(epilogdws,
- shift2 = asl(epilog,
- }
- {
- shiftb = and(shift2,
- ifword = tstbit(epilog,
/* p3 set: skip the doubleword loop; otherwise adjust epilog (amount lost). */
- if(p3) jump .Lepilog60;
- if(!p3) epilog = add(epilog,
- }
- {
- loop0(.Ldword_loop_epilog, epilogdws);
-
/* Replace kernel with a lost constant when it equals the lost test value. */
- p3 = cmp.eq(kernel,
- if(p3.new) kernel=
- p1 = cmp.gt(over,
- }
-
/* Override kernel again (value lost) when over exceeds the lost threshold. */
- if(p1) kernel=
- .Ldword_loop_epilog:
- {
- memd(ptr_out++
- ldata0 = valignb(dataF8, data70, align);
/* p3 decides whether the next packet may load another source doubleword. */
- p3 = cmp.gt(epilog, kernel);
- }
- {
/* Slide the window: old high doubleword becomes the low one. */
- data70 = dataF8;
- if(p3) dataF8 = memd(ptr_in++
- epilog = add(epilog,
- }:endloop0
/*
 * .Lepilog60 -- final conditional word, halfword and byte stores, each
 * selected by a bit of epilog (bit numbers lost), then return via r31.
 * NOTE(review): stored operands, post-increments and constants are missing
 * from this copy.
 */
- .Lepilog60:
- {
/* Conditional word store, then shift ldata0 right by shiftb. The lsr reads
   the shiftb value from before this packet; the new shiftb computed here is
   used by the next packet. */
- if(ifword) memw(ptr_out++
- ldata0 = lsr(ldata0, shiftb);
- ifhword = tstbit(epilog,
- shiftb = and(shift2,
- }
- {
/* Conditional halfword store and a second right shift of ldata0. */
- if(ifhword) memh(ptr_out++
- ldata0 = lsr(ldata0, shiftb);
- ifbyte = tstbit(epilog,
/* Adjust len (amount lost) when the byte bit is set. */
- if(ifbyte.new) len = add(len,
- }
- {
- if(ifbyte) memb(ptr_out) = data0;
/* Subtract len from ptr_out before returning. Presumably this rewinds
   ptr_out to the original destination -- confirm. */
- ptr_out = sub(ptr_out, len);
- jumpr r31;
- }
/*
 * .Lbytes23orless -- byte-at-a-time copy, reached from the entry code when
 * p0 is false. The label suggests lengths of 23 bytes or fewer, but the
 * threshold constant is not visible in this copy.
 * Hardware loop 0 runs len iterations. Each pass loads one byte into data0,
 * and the last loaded byte is written after the loop.
 * NOTE(review): post-increment amounts, the stored operand and the len
 * adjustment are missing from this copy.
 */
- .Lbytes23orless:
- {
/* sp1loop0 also initialises p3, which guards the store in the loop.
   Presumably p3 is false on the first pass so the store lags the load by
   one iteration -- confirm against the Hexagon manual. */
- p3 = sp1loop0(.Lbyte_copy, len);
- len = add(len,
- }
- .Lbyte_copy:
- {
- data0 = memb(ptr_in++
- if(p3) memb(ptr_out++
- }:endloop0
- {
/* Write the byte loaded by the final iteration. */
- memb(ptr_out) = data0;
/* Subtract the adjusted len from ptr_out before returning. Presumably this
   rewinds ptr_out to the original destination -- confirm. */
- ptr_out = sub(ptr_out, len);
- jumpr r31;
- }
/*
 * .Ldwordaligned -- simple 8-byte copy loop, reached from the entry code
 * when the align888 bit test passed and p3 was false.
 * Hardware loop 0 runs len8 iterations. Each pass loads one doubleword into
 * ldata0, and the last loaded doubleword is written after the loop.
 * NOTE(review): post-increment amounts and the stored operand are missing
 * from this copy.
 */
- .Ldwordaligned:
- {
/* sp1loop0 also initialises p3, which guards the store in the loop.
   Presumably p3 is false on the first pass so the store lags the load by
   one iteration -- confirm against the Hexagon manual. */
- p3 = sp1loop0(.Ldword_copy, len8);
- }
- .Ldword_copy:
- {
- if(p3) memd(ptr_out++
- ldata0 = memd(ptr_in++
- }:endloop0
- {
/* Write the doubleword loaded by the final iteration. */
- memd(ptr_out) = ldata0;
/* len was adjusted (amount lost) before the jump here. Presumably the
   subtraction rewinds ptr_out to the original destination -- confirm. */
- ptr_out = sub(ptr_out, len);
- jumpr r31;
- }
/*
 * .Lmemcpy_return -- reload r21:20, r25:24 and r17:16 from the stack frame
 * (offsets lost), tear the frame down and return.
 * NOTE(review): no direct branch to this label is visible in the surviving
 * text. Presumably it is reached through r31, which the set-up code loads
 * via r31.l / r31.h -- confirm against the original file.
 */
- .Lmemcpy_return:
- r21:20 = memd(sp+
- {
- r25:24 = memd(sp+
- r17:16 = memd(sp+
- }
/* deallocframe restores the caller's frame pointer and return address, so
   the jumpr below goes back to memcpy's caller. */
- deallocframe;
- jumpr r31
|