12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228 |
- //
- // d_draw16.s
- // x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
- // subdivision.
- //
- #include "qasm.h"
- #include "d_ifacea.h"
- #if id386
- //----------------------------------------------------------------------
- // 8-bpp horizontal span drawing code for polygons, with no transparency and
- // 16-pixel subdivision.
- //
- // Assumes there is at least one span in pspans, and that every span
- // contains at least one pixel
- //----------------------------------------------------------------------
- .data
- .text
- // out-of-line, rarely-needed clamping code: each stub loads a clamp value into the register being range-checked in D_DrawSpans16 and jumps back to the matching LClampReentryN label
- LClampHigh0: // start-of-span s (in %esi) is above bbextents
- movl C(bbextents),%esi
- jmp LClampReentry0
- LClampHighOrLow0: // reached through an unsigned ja, so a negative s lands here as well; flags still hold the s vs bbextents compare
- jg LClampHigh0 // signed greater: s really is too high
- xorl %esi,%esi // otherwise s was negative: clamp to 0
- jmp LClampReentry0
- LClampHigh1: // start-of-span t (in %edx) is above bbextentt
- movl C(bbextentt),%edx
- jmp LClampReentry1
- LClampHighOrLow1: // same scheme as LClampHighOrLow0, for t
- jg LClampHigh1
- xorl %edx,%edx // t was negative: clamp to 0
- jmp LClampReentry1
- LClampLow2: // end-of-segment s (in %ebp) is below 4096, full 16-pixel segment
- movl $4096,%ebp
- jmp LClampReentry2
- LClampHigh2: // end-of-segment s (in %ebp) is above bbextents
- movl C(bbextents),%ebp
- jmp LClampReentry2
- LClampLow3: // end-of-segment t (in %ecx) is below 4096, full 16-pixel segment
- movl $4096,%ecx
- jmp LClampReentry3
- LClampHigh3: // end-of-segment t (in %ecx) is above bbextentt
- movl C(bbextentt),%ecx
- jmp LClampReentry3
- LClampLow4: // end-of-span s (in %eax) is below 4096, last segment
- movl $4096,%eax
- jmp LClampReentry4
- LClampHigh4: // end-of-span s (in %eax) is above bbextents
- movl C(bbextents),%eax
- jmp LClampReentry4
- LClampLow5: // end-of-span t (in %ebx) is below 4096, last segment
- movl $4096,%ebx
- jmp LClampReentry5
- LClampHigh5: // end-of-span t (in %ebx) is above bbextentt
- movl C(bbextentt),%ebx
- jmp LClampReentry5
- #define pspans 4+16
- .align 4
- .globl C(D_DrawSpans16)
- C(D_DrawSpans16): // draws every span in the list whose head pointer is the stack argument at pspans(%esp)
- pushl %ebp // preserve caller's stack frame
- pushl %edi
- pushl %esi // preserve register variables
- pushl %ebx // four pushes (16 bytes) plus the return address (4 bytes) give the 4+16 in pspans
- //
- // set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
- // and span list pointers
- //
- // TODO: any overlap from rearranging?
- flds C(d_sdivzstepu)
- fmuls fp_16 // fp_16 is defined elsewhere; presumably 16.0 -- confirm
- movl C(cacheblock),%edx
- flds C(d_tdivzstepu)
- fmuls fp_16
- movl pspans(%esp),%ebx // point to the first span descriptor
- flds C(d_zistepu)
- fmuls fp_16
- movl %edx,pbase // pbase = cacheblock
- fstps zi16stepu // results pop in reverse push order: 1/z step first
- fstps tdivz16stepu // then the t/z step
- fstps sdivz16stepu // then the s/z step; FP stack is empty again
- LSpanLoop: // top of the per-span loop; %ebx = current span descriptor
- //
- // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
- // initial s and t values
- //
- // FIXME: pipeline FILD?
- fildl espan_t_v(%ebx) // dv
- fildl espan_t_u(%ebx) // du | dv
- fld %st(1) // dv | du | dv
- fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
- fld %st(1) // du | dv*d_sdivzstepv | du | dv
- fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
- fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
- fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
- // dv*d_sdivzstepv | du | dv
- fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
- // dv*d_sdivzstepv | du | dv
- faddp %st(0),%st(2) // du*d_tdivzstepu |
- // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
- fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
- // du*d_tdivzstepu | du | dv
- fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
- // du*d_tdivzstepu | du | dv
- fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
- // du*d_sdivzstepu + dv*d_sdivzstepv |
- // du*d_tdivzstepu | du | dv
- fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
- // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
- fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
- // du*d_sdivzstepu; stays in %st(2) at end
- fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
- // s/z
- fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
- // du*d_tdivzstepu | du | s/z
- fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
- // du*d_tdivzstepu | du | s/z
- faddp %st(0),%st(2) // dv*d_zistepv |
- // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
- fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
- // dv*d_zistepv | s/z
- fmuls C(d_zistepu) // du*d_zistepu |
- // dv*d_tdivzstepv + du*d_tdivzstepu |
- // dv*d_zistepv | s/z
- fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
- // du*d_zistepu | dv*d_zistepv | s/z
- fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
- // du*d_tdivzstepu; stays in %st(1) at end
- fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
- faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
- flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
- fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
- fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
- // du*d_zistepu; stays in %st(0) at end
- // 1/z | fp_64k | t/z | s/z
- //
- // calculate and clamp s & t
- //
- fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
- //
- // point %edi to the first pixel in the span
- //
- movl C(d_viewbuffer),%ecx
- movl espan_t_v(%ebx),%eax
- movl %ebx,pspantemp // preserve spans pointer
- movl C(tadjust),%edx // %edx and %esi get s and t added in at LFDIVInFlight1
- movl C(sadjust),%esi
- movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
- addl %ecx,%edi
- movl espan_t_u(%ebx),%ecx
- addl %ecx,%edi // pdest = &pdestspan[scans->u];
- movl espan_t_count(%ebx),%ecx
- //
- // now start the FDIV for the end of the span
- //
- cmpl $16,%ecx
- ja LSetupNotLast1 // more than 16 pixels: the first segment is a full one
- decl %ecx // %ecx = count - 1 on the paths that reach LFDIVInFlight1 from here
- jz LCleanup1 // if only one pixel, no need to start an FDIV
- movl %ecx,spancountminus1
- // finish up the s and t calcs
- fxch %st(1) // z*64k | 1/z | t/z | s/z
- fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
- fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
- fxch %st(1) // z*64k | s | 1/z | t/z | s/z
- fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
- fxch %st(1) // s | t | 1/z | t/z | s/z
- fistpl s // 1/z | t | t/z | s/z
- fistpl t // 1/z | t/z | s/z
- fildl spancountminus1
- flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
- flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
- fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
- fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
- fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
- fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
- fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
- // C(d_tdivzstepu)*scm1
- fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
- // C(d_tdivzstepu)*scm1
- faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
- fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
- faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
- faddp %st(0),%st(3) // 1/z | t/z | s/z, all advanced to the last pixel
- flds fp_64k
- fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
- // overlap
- jmp LFDIVInFlight1
- LCleanup1: // single-pixel span: compute s and t only, no FDIV is started
- // finish up the s and t calcs
- fxch %st(1) // z*64k | 1/z | t/z | s/z
- fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
- fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
- fxch %st(1) // z*64k | s | 1/z | t/z | s/z
- fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
- fxch %st(1) // s | t | 1/z | t/z | s/z
- fistpl s // 1/z | t | t/z | s/z
- fistpl t // 1/z | t/z | s/z
- jmp LFDIVInFlight1
- .align 4
- LSetupNotLast1: // span longer than 16 pixels: aim the FDIV 16 pixels ahead
- // finish up the s and t calcs
- fxch %st(1) // z*64k | 1/z | t/z | s/z
- fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
- fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
- fxch %st(1) // z*64k | s | 1/z | t/z | s/z
- fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
- fxch %st(1) // s | t | 1/z | t/z | s/z
- fistpl s // 1/z | t | t/z | s/z
- fistpl t // 1/z | t/z | s/z
- fadds zi16stepu // advance 1/z by the 16-pixel step
- fxch %st(2) // s/z | t/z | 1/z
- fadds sdivz16stepu // advance s/z by the 16-pixel step
- fxch %st(2) // 1/z | t/z | s/z
- flds tdivz16stepu
- faddp %st(0),%st(2) // advance t/z by the 16-pixel step
- flds fp_64k
- fdiv %st(1),%st(0) // z = 1/1/z
- // this is what we've gone to all this trouble to
- // overlap
- LFDIVInFlight1: // clamp the span's starting s/t and locate the first texel; %esi = sadjust, %edx = tadjust, %ecx = segment count on entry
- addl s,%esi
- addl t,%edx
- movl C(bbextents),%ebx
- movl C(bbextentt),%ebp
- cmpl %ebx,%esi
- ja LClampHighOrLow0 // unsigned compare: catches both s > bbextents and negative s
- LClampReentry0:
- movl %esi,s
- movl pbase,%ebx
- shll $16,%esi // keep only the low 16 bits of s, left-justified
- cmpl %ebp,%edx
- movl %esi,sfracf // movl leaves the flags from the cmpl above intact for the ja below
- ja LClampHighOrLow1
- LClampReentry1:
- movl %edx,t
- movl s,%esi // sfrac = scans->sfrac;
- shll $16,%edx // keep only the low 16 bits of t, left-justified
- movl t,%eax // tfrac = scans->tfrac;
- sarl $16,%esi
- movl %edx,tfracf
- //
- // calculate the texture starting address
- //
- sarl $16,%eax
- movl C(cachewidth),%edx
- imull %edx,%eax // (tfrac >> 16) * cachewidth
- addl %ebx,%esi
- addl %eax,%esi // psource = pbase + (sfrac >> 16) +
- // ((tfrac >> 16) * cachewidth);
- //
- // determine whether this is the last segment of the span or not
- //
- cmpl $16,%ecx
- jna LLastSegment // %ecx is count-1 (at most 15) for short spans, the full count otherwise
- //
- // not the last segment; do full 16-wide segment
- //
- LNotLastSegment: // draws one full 16-pixel segment; loops back here while more than 16 pixels remain
- //
- // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
- // get there
- //
- // pick up after the FDIV that was left in flight previously
- fld %st(0) // duplicate it
- fmul %st(4),%st(0) // s = s/z * z
- fxch %st(1)
- fmul %st(3),%st(0) // t = t/z * z
- fxch %st(1)
- fistpl snext
- fistpl tnext
- movl snext,%eax
- movl tnext,%edx
- movb (%esi),%bl // get first source texel
- subl $16,%ecx // count off this segment's pixels
- movl C(sadjust),%ebp
- movl %ecx,counttemp // remember count of remaining pixels
- movl C(tadjust),%ecx
- movb %bl,(%edi) // store first dest pixel
- addl %eax,%ebp // %ebp = s at the end of this segment
- addl %edx,%ecx // %ecx = t at the end of this segment
- movl C(bbextents),%eax
- movl C(bbextentt),%edx
- cmpl $4096,%ebp
- jl LClampLow2
- cmpl %eax,%ebp
- ja LClampHigh2
- LClampReentry2:
- cmpl $4096,%ecx
- jl LClampLow3
- cmpl %edx,%ecx
- ja LClampHigh3
- LClampReentry3:
- movl %ebp,snext
- movl %ecx,tnext
- subl s,%ebp // %ebp = s delta across the segment
- subl t,%ecx // %ecx = t delta across the segment
- 
- //
- // set up advancetable
- //
- movl %ecx,%eax
- movl %ebp,%edx
- sarl $20,%eax // whole-texel t step per pixel: >> 16, and >> 4 more for the 16 pixels
- jz LZero // whole t step is 0: skip the multiply by cachewidth
- sarl $20,%edx // whole-texel s step per pixel (same 16+4 bit shift)
- movl C(cachewidth),%ebx
- imull %ebx,%eax
- jmp LSetUp1
- LZero:
- sarl $20,%edx // whole-texel s step per pixel (same 16+4 bit shift)
- movl C(cachewidth),%ebx
- LSetUp1:
- addl %edx,%eax // add in sstep
- // (tstep >> 16) * cachewidth + (sstep >> 16);
- movl tfracf,%edx
- movl %eax,advancetable+4 // advance base in t
- addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth +
- // (sstep >> 16);
- shll $12,%ebp // left-justify sstep fractional part
- movl sfracf,%ebx
- shll $12,%ecx // left-justify tstep fractional part
- movl %eax,advancetable // advance extra in t
- movl %ecx,tstep
- addl %ecx,%edx // advance tfrac fractional part by tstep frac
- sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none)
- addl %ebp,%ebx // advance sfrac fractional part by sstep frac
- adcl advancetable+4(,%ecx,4),%esi // point to next source texel
- addl tstep,%edx // the same step sequence repeats below, interleaved with the texel load and pixel store
- sbbl %ecx,%ecx
- movb (%esi),%al
- addl %ebp,%ebx
- movb %al,1(%edi)
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- movb (%esi),%al // from here on each texel is stored one step after it is loaded
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,2(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,3(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,4(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,5(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,6(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,7(%edi)
- addl %ebp,%ebx
- movb (%esi),%al // texel for pixel 8 stays in %al across the FP setup below
- adcl advancetable+4(,%ecx,4),%esi
- //
- // start FDIV for end of next segment in flight, so it can overlap
- //
- movl counttemp,%ecx
- cmpl $16,%ecx // more than one segment after this?
- ja LSetupNotLast2 // yes
- decl %ecx // %ecx = remaining count - 1
- jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV
- movl %ecx,spancountminus1
- fildl spancountminus1
- flds C(d_zistepu) // C(d_zistepu) | spancountminus1
- fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1
- flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
- fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
- fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
- faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1
- fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1
- fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
- fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
- faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
- flds fp_64k // 64k | C(d_sdivzstepu)*scm1
- fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k
- faddp %st(0),%st(4) // 64k
- fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
- // overlap
- jmp LFDIVInFlight2
- .align 4
- LSetupNotLast2: // another full segment follows: advance 1/z, s/z, t/z by the 16-pixel steps
- fadds zi16stepu
- fxch %st(2)
- fadds sdivz16stepu
- fxch %st(2)
- flds tdivz16stepu
- faddp %st(0),%st(2)
- flds fp_64k
- fdiv %st(1),%st(0) // z = 1/1/z
- // this is what we've gone to all this trouble to
- // overlap
- LFDIVInFlight2:
- movl %ecx,counttemp // the remaining count, or remaining count - 1 when the next segment is the last
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,8(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,9(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,10(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,11(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,12(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,13(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,14(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl $16,%edi // pdest now points at the next segment; pixel 15 is stored below at -1(%edi)
- movl %edx,tfracf
- movl snext,%edx
- movl %ebx,sfracf
- movl tnext,%ebx
- movl %edx,s // this segment's end s/t become the next segment's start
- movl %ebx,t
- movl counttemp,%ecx // retrieve count
- //
- // determine whether the next segment is the last one or not
- //
- cmpl $16,%ecx // are there multiple segments remaining?
- movb %al,-1(%edi) // movb leaves the flags from the cmpl intact for the ja
- ja LNotLastSegment // yes
- //
- // last segment of scan
- //
- LLastSegment: // %ecx = number of steps left in the span (pixels remaining minus one)
- //
- // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
- // get there. The number of pixels left is variable, and we want to land on the
- // last pixel, not step one past it, so we can't run into arithmetic problems
- //
- testl %ecx,%ecx
- jz LNoSteps // just draw the last pixel and we're done
- // pick up after the FDIV that was left in flight previously
- fld %st(0) // duplicate it
- fmul %st(4),%st(0) // s = s/z * z
- fxch %st(1)
- fmul %st(3),%st(0) // t = t/z * z
- fxch %st(1)
- fistpl snext
- fistpl tnext
- movb (%esi),%al // load first texel in segment
- movl C(tadjust),%ebx
- movb %al,(%edi) // store first pixel in segment
- movl C(sadjust),%eax
- addl snext,%eax // %eax = s at the last pixel
- addl tnext,%ebx // %ebx = t at the last pixel
- movl C(bbextents),%ebp
- movl C(bbextentt),%edx
- cmpl $4096,%eax
- jl LClampLow4
- cmpl %ebp,%eax
- ja LClampHigh4
- LClampReentry4:
- movl %eax,snext
- cmpl $4096,%ebx
- jl LClampLow5
- cmpl %edx,%ebx
- ja LClampHigh5
- LClampReentry5:
- cmpl $1,%ecx // don't bother
- je LOnlyOneStep // if two pixels in segment, there's only one step,
- // of the segment length
- subl s,%eax
- subl t,%ebx
- addl %eax,%eax // convert to 15.17 format so multiply by 1.31
- addl %ebx,%ebx // reciprocal yields 16.48
- imull reciprocal_table_16-8(,%ecx,4) // sstep = (snext - s) /
- // (spancount-1)
- movl %edx,%ebp // one-operand imull leaves the high half of the product in %edx; keep it as sstep
- movl %ebx,%eax
- imull reciprocal_table_16-8(,%ecx,4) // tstep = (tnext - t) /
- // (spancount-1)
- LSetEntryvec: // here %ebp = sstep, %edx = tstep, %ecx = step count
- //
- // set up advancetable
- //
- movl entryvec_table_16(,%ecx,4),%ebx // table is defined elsewhere; presumably holds the EntryN_16 addresses -- confirm
- movl %edx,%eax
- movl %ebx,jumptemp // entry point for the indirect jmp at the end of this block
- movl %ebp,%ecx
- sarl $16,%edx // tstep >>= 16;
- movl C(cachewidth),%ebx
- sarl $16,%ecx // sstep >>= 16;
- imull %ebx,%edx
- addl %ecx,%edx // add in sstep
- // (tstep >> 16) * cachewidth + (sstep >> 16);
- movl tfracf,%ecx
- movl %edx,advancetable+4 // advance base in t
- addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth +
- // (sstep >> 16);
- shll $16,%ebp // left-justify sstep fractional part
- movl sfracf,%ebx
- shll $16,%eax // left-justify tstep fractional part
- movl %edx,advancetable // advance extra in t
- movl %eax,tstep // tstep fraction is kept in both %eax and memory
- movl %ecx,%edx
- addl %eax,%edx // advance tfrac by the tstep fraction
- sbbl %ecx,%ecx // carry -> -1 (0 if none), selects the advancetable entry
- addl %ebp,%ebx // advance sfrac by the sstep fraction
- adcl advancetable+4(,%ecx,4),%esi // point to the second texel of the segment
- jmp *jumptemp // jump to the number-of-pixels handler
- //----------------------------------------
- LNoSteps: // one pixel left and nothing to step
- movb (%esi),%al // load first texel in segment
- subl $15,%edi // adjust for hardwired offset
- jmp LEndSpan // LEndSpan stores %al at 15(%edi)
- LOnlyOneStep: // two pixels: the step is the whole s/t delta, so no reciprocal multiply
- subl s,%eax
- subl t,%ebx
- movl %eax,%ebp
- movl %ebx,%edx
- jmp LSetEntryvec
- //----------------------------------------
- .globl Entry2_16, Entry3_16, Entry4_16, Entry5_16
- .globl Entry6_16, Entry7_16, Entry8_16, Entry9_16
- .globl Entry10_16, Entry11_16, Entry12_16, Entry13_16
- .globl Entry14_16, Entry15_16, Entry16_16
- Entry2_16: // EntryN_16 handlers are reached through jmp *jumptemp; each biases %edi so the fixed store offsets in the LEntryN_16 chain finish at 15(%edi)
- subl $14,%edi // adjust for hardwired offsets
- movb (%esi),%al
- jmp LEntry2_16
- //----------------------------------------
- Entry3_16:
- subl $13,%edi // adjust for hardwired offsets
- addl %eax,%edx // %eax still holds the tstep fraction here
- movb (%esi),%al // overwrites %al, so later steps use tstep from memory
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- jmp LEntry3_16
- //----------------------------------------
- Entry4_16:
- subl $12,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx // carry from this add is consumed by the sbbl at the jump target
- jmp LEntry4_16
- //----------------------------------------
- Entry5_16:
- subl $11,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry5_16
- //----------------------------------------
- Entry6_16:
- subl $10,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry6_16
- //----------------------------------------
- Entry7_16:
- subl $9,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry7_16
- //----------------------------------------
- Entry8_16:
- subl $8,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry8_16
- //----------------------------------------
- Entry9_16:
- subl $7,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry9_16
- //----------------------------------------
- Entry10_16:
- subl $6,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry10_16
- //----------------------------------------
- Entry11_16:
- subl $5,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry11_16
- //----------------------------------------
- Entry12_16:
- subl $4,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry12_16
- //----------------------------------------
- Entry13_16:
- subl $3,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry13_16
- //----------------------------------------
- Entry14_16:
- subl $2,%edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry14_16
- //----------------------------------------
- Entry15_16:
- decl %edi // adjust for hardwired offsets
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- jmp LEntry15_16
- //----------------------------------------
- Entry16_16: // 16 pixels: no %edi adjustment needed, falls straight into the chain
- addl %eax,%edx
- movb (%esi),%al
- sbbl %ecx,%ecx
- addl %ebp,%ebx
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- sbbl %ecx,%ecx
- movb %al,1(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry15_16: // each LEntryN_16 is entered with the carry of the preceding addl tstep still in the flags
- sbbl %ecx,%ecx
- movb %al,2(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry14_16:
- sbbl %ecx,%ecx
- movb %al,3(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry13_16:
- sbbl %ecx,%ecx
- movb %al,4(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry12_16:
- sbbl %ecx,%ecx
- movb %al,5(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry11_16:
- sbbl %ecx,%ecx
- movb %al,6(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry10_16:
- sbbl %ecx,%ecx
- movb %al,7(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry9_16:
- sbbl %ecx,%ecx
- movb %al,8(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry8_16:
- sbbl %ecx,%ecx
- movb %al,9(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry7_16:
- sbbl %ecx,%ecx
- movb %al,10(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry6_16:
- sbbl %ecx,%ecx
- movb %al,11(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry5_16:
- sbbl %ecx,%ecx
- movb %al,12(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi
- addl tstep,%edx
- LEntry4_16:
- sbbl %ecx,%ecx
- movb %al,13(%edi)
- addl %ebp,%ebx
- movb (%esi),%al
- adcl advancetable+4(,%ecx,4),%esi // last texel advance; nothing is stepped after this
- LEntry3_16:
- movb %al,14(%edi)
- movb (%esi),%al // final texel; stored at 15(%edi) in the span-end code that follows
- LEntry2_16:
- LEndSpan: // %al = last texel of the span, %edi biased so that 15(%edi) is the last pixel
- //
- // clear s/z, t/z, 1/z from FP stack
- //
- fstp %st(0)
- fstp %st(0)
- fstp %st(0)
- movl pspantemp,%ebx // restore spans pointer
- movl espan_t_pnext(%ebx),%ebx // point to next span
- testl %ebx,%ebx // any more spans?
- movb %al,15(%edi) // store the span's last pixel; movb leaves the testl flags intact
- jnz LSpanLoop // more spans
- popl %ebx // restore register variables
- popl %esi
- popl %edi
- popl %ebp // restore the caller's stack frame
- ret
- //----------------------------------------------------------------------
- // 8-bpp horizontal span z drawing code for polygons, with no transparency.
- //
- // Assumes there is at least one span in pzspans, and that every span
- // contains at least one pixel
- //----------------------------------------------------------------------
- .text
- // z-clamp on a non-negative gradient span: entered from LFSpanLoop when the span's starting 1/z compares above float_point5
- LClamp:
- movl $0x40000000,%edx // izi = fixed clamp value
- xorl %ebx,%ebx // izistep = 0, so every pixel of the span gets the same z
- fstp %st(0) // discard the unconverted 1/z still on the FP stack
- jmp LZDraw
- // z-clamp on a negative gradient span: same as LClamp, but re-enters the LFNegSpanLoop drawing code
- LClampNeg:
- movl $0x40000000,%edx // izi = fixed clamp value
- xorl %ebx,%ebx // izistep = 0, so every pixel of the span gets the same z
- fstp %st(0) // discard the unconverted 1/z still on the FP stack
- jmp LZDrawNeg
- // D_DrawZSpans: writes 16-bit z-buffer values for every span in the list
- // whose head pointer is the single stack argument.
- //
- // For each span the starting value is
- //   izi = (d_ziorigin + v*d_zistepv + u*d_zistepu) * Float2ToThe31nd
- // converted to an integer; each pixel stores the high 16 bits of izi and then
- // izi advances by izistep. Spans whose starting 1/z compares above
- // float_point5 are drawn flat at 0x40000000 by the out-of-line LClamp /
- // LClampNeg stubs, which zero %ebx for the duration of that span.
- //
- // Register roles inside the span loops:
- //   %ebx = izistep, %ecx = pixel count, %edx = izi, %edi = pdest,
- //   %esi = span pointer (saved on the stack while used as scratch)
- //
- // pzspans = return address (4 bytes) + the four registers pushed on entry (16)
- #define pzspans 4+16
- .globl C(D_DrawZSpans)
- C(D_DrawZSpans):
- pushl %ebp // preserve caller's stack frame
- pushl %edi
- pushl %esi // preserve register variables
- pushl %ebx
- flds C(d_zistepu)
- movl C(d_zistepu),%eax // raw bit pattern of the float, for the integer test below
- movl pzspans(%esp),%esi
- testl %eax,%eax
- // NOTE(review): jz is taken only when d_zistepu is +0.0 (all bits clear), so
- // negative gradients run through LFSpanLoop with a negative izistep; confirm
- // that this is the intended split between the two loops
- jz LFNegSpan
- fmuls Float2ToThe31nd
- fistpl izistep // note: we are relying on FP exceptions being turned
- // off here to avoid range problems
- movl izistep,%ebx // reloaded at the end of each span
- LFSpanLoop:
- // set up the initial 1/z value
- fildl espan_t_v(%esi)
- fildl espan_t_u(%esi)
- movl espan_t_v(%esi),%ecx
- movl C(d_pzbuffer),%edi
- fmuls C(d_zistepu) // u*d_zistepu | v
- fxch %st(1)
- fmuls C(d_zistepv) // v*d_zistepv | u*d_zistepu
- fxch %st(1)
- fadds C(d_ziorigin)
- imull C(d_zrowbytes),%ecx // v * d_zrowbytes
- faddp %st(0),%st(1) // 1/z at the start of the span
- // clamp if z is nearer than 2 (1/z > 0.5)
- fcoms float_point5
- addl %ecx,%edi
- movl espan_t_u(%esi),%edx
- addl %edx,%edx // word count
- movl espan_t_count(%esi),%ecx
- addl %edx,%edi // pdest = &pdestspan[scans->u];
- pushl %esi // preserve spans pointer
- fnstsw %ax
- testb $0x45,%ah // C3, C2 and C0 all clear means 1/z > 0.5
- jz LClamp
- fmuls Float2ToThe31nd
- fistpl izi // note: we are relying on FP exceptions being turned
- // off here to avoid problems when the span is closer
- // than 1/(2**31)
- movl izi,%edx
- // at this point:
- // %ebx = izistep
- // %ecx = count
- // %edx = izi
- // %edi = pdest
- LZDraw:
- // do a single pixel up front, if necessary to dword align the destination
- testl $2,%edi
- jz LFMiddle
- movl %edx,%eax
- addl %ebx,%edx
- shrl $16,%eax
- decl %ecx
- movw %ax,(%edi)
- addl $2,%edi
- // do middle a pair of aligned dwords at a time
- LFMiddle:
- pushl %ecx // keep the count; its low bit is needed for the final odd pixel
- shrl $1,%ecx // count / 2
- jz LFLast // no aligned dwords to do
- shrl $1,%ecx // (count / 2) / 2
- jnc LFMiddleLoop // even number of aligned dwords to do
- // odd number of dwords: do one dword (two pixels) before the paired loop
- movl %edx,%eax
- addl %ebx,%edx
- shrl $16,%eax // first pixel's z in the low word
- movl %edx,%esi
- addl %ebx,%edx
- andl $0xFFFF0000,%esi // second pixel's z in the high word
- orl %esi,%eax
- movl %eax,(%edi)
- addl $4,%edi
- andl %ecx,%ecx
- jz LFLast
- LFMiddleLoop:
- movl %edx,%eax
- addl %ebx,%edx
- shrl $16,%eax
- movl %edx,%esi
- addl %ebx,%edx
- andl $0xFFFF0000,%esi
- orl %esi,%eax
- movl %edx,%ebp
- movl %eax,(%edi)
- addl %ebx,%edx
- shrl $16,%ebp
- movl %edx,%esi
- addl %ebx,%edx
- andl $0xFFFF0000,%esi
- orl %esi,%ebp
- movl %ebp,4(%edi) // FIXME: eliminate register contention
- addl $8,%edi
- decl %ecx
- jnz LFMiddleLoop
- LFLast:
- popl %ecx // retrieve count
- popl %esi // retrieve span pointer
- // do the last, unaligned pixel, if there is one
- andl $1,%ecx // is there an odd pixel left to do?
- jz LFSpanDone // no
- shrl $16,%edx
- movw %dx,(%edi) // do the final pixel's z
- LFSpanDone:
- movl izistep,%ebx // LClamp zeroes %ebx for a clamped span; restore the
- // real step so later spans are not drawn flat
- movl espan_t_pnext(%esi),%esi
- testl %esi,%esi
- jnz LFSpanLoop
- jmp LFDone
- LFNegSpan:
- fmuls FloatMinus2ToThe31nd
- fistpl izistep // note: we are relying on FP exceptions being turned
- // off here to avoid range problems
- movl izistep,%ebx // reloaded at the end of each span; this loop
- // subtracts it from izi instead of adding
- LFNegSpanLoop:
- // set up the initial 1/z value
- fildl espan_t_v(%esi)
- fildl espan_t_u(%esi)
- movl espan_t_v(%esi),%ecx
- movl C(d_pzbuffer),%edi
- fmuls C(d_zistepu) // u*d_zistepu | v
- fxch %st(1)
- fmuls C(d_zistepv) // v*d_zistepv | u*d_zistepu
- fxch %st(1)
- fadds C(d_ziorigin)
- imull C(d_zrowbytes),%ecx // v * d_zrowbytes
- faddp %st(0),%st(1) // 1/z at the start of the span
- // clamp if z is nearer than 2 (1/z > 0.5)
- fcoms float_point5
- addl %ecx,%edi
- movl espan_t_u(%esi),%edx
- addl %edx,%edx // word count
- movl espan_t_count(%esi),%ecx
- addl %edx,%edi // pdest = &pdestspan[scans->u];
- pushl %esi // preserve spans pointer
- fnstsw %ax
- testb $0x45,%ah // C3, C2 and C0 all clear means 1/z > 0.5
- jz LClampNeg
- fmuls Float2ToThe31nd
- fistpl izi // note: we are relying on FP exceptions being turned
- // off here to avoid problems when the span is closer
- // than 1/(2**31)
- movl izi,%edx
- // at this point:
- // %ebx = izistep
- // %ecx = count
- // %edx = izi
- // %edi = pdest
- LZDrawNeg:
- // do a single pixel up front, if necessary to dword align the destination
- testl $2,%edi
- jz LFNegMiddle
- movl %edx,%eax
- subl %ebx,%edx
- shrl $16,%eax
- decl %ecx
- movw %ax,(%edi)
- addl $2,%edi
- // do middle a pair of aligned dwords at a time
- LFNegMiddle:
- pushl %ecx // keep the count; its low bit is needed for the final odd pixel
- shrl $1,%ecx // count / 2
- jz LFNegLast // no aligned dwords to do
- shrl $1,%ecx // (count / 2) / 2
- jnc LFNegMiddleLoop // even number of aligned dwords to do
- // odd number of dwords: do one dword (two pixels) before the paired loop
- movl %edx,%eax
- subl %ebx,%edx
- shrl $16,%eax // first pixel's z in the low word
- movl %edx,%esi
- subl %ebx,%edx
- andl $0xFFFF0000,%esi // second pixel's z in the high word
- orl %esi,%eax
- movl %eax,(%edi)
- addl $4,%edi
- andl %ecx,%ecx
- jz LFNegLast
- LFNegMiddleLoop:
- movl %edx,%eax
- subl %ebx,%edx
- shrl $16,%eax
- movl %edx,%esi
- subl %ebx,%edx
- andl $0xFFFF0000,%esi
- orl %esi,%eax
- movl %edx,%ebp
- movl %eax,(%edi)
- subl %ebx,%edx
- shrl $16,%ebp
- movl %edx,%esi
- subl %ebx,%edx
- andl $0xFFFF0000,%esi
- orl %esi,%ebp
- movl %ebp,4(%edi) // FIXME: eliminate register contention
- addl $8,%edi
- decl %ecx
- jnz LFNegMiddleLoop
- LFNegLast:
- popl %ecx // retrieve count
- popl %esi // retrieve span pointer
- // do the last, unaligned pixel, if there is one
- andl $1,%ecx // is there an odd pixel left to do?
- jz LFNegSpanDone // no
- shrl $16,%edx
- movw %dx,(%edi) // do the final pixel's z
- LFNegSpanDone:
- movl izistep,%ebx // LClampNeg zeroes %ebx for a clamped span; restore
- // the real step so later spans are not drawn flat
- movl espan_t_pnext(%esi),%esi
- testl %esi,%esi
- jnz LFNegSpanLoop
- LFDone:
- popl %ebx // restore register variables
- popl %esi
- popl %edi
- popl %ebp // restore the caller's stack frame
- ret
- #endif // id386
|