// r_draw16.s

// (extraction artifact removed: concatenated line-number gutter)
  1. //
  2. // d_draw16.s
  3. // x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
  4. // subdivision.
  5. //
  6. #include "qasm.h"
  7. #include "d_ifacea.h"
  8. #if id386
  9. //----------------------------------------------------------------------
  10. // 8-bpp horizontal span drawing code for polygons, with no transparency and
  11. // 16-pixel subdivision.
  12. //
  13. // Assumes there is at least one span in pspans, and that every span
  14. // contains at least one pixel
  15. //----------------------------------------------------------------------
  16. .data
  17. .text
  18. // out-of-line, rarely-needed clamping code
// Each stub clamps one interpolated s or t value and rejoins the main path
// at the matching LClampReentryN label.  The 0/1 pair clamps either to the
// texture extent (bbextents / bbextentt) or to zero, depending on the sign
// flags set by the caller's compare; the 2..5 variants clamp high to the
// extent and low to 4096.
// NOTE(review): low clamp of 4096 (one texel in 16.16? margin above edge?)
// matches the other span drawers in this codebase — confirm against d_draw.s.
  19. LClampHigh0:
  20. movl C(bbextents),%esi
  21. jmp LClampReentry0
  22. LClampHighOrLow0:
  23. jg LClampHigh0
  24. xorl %esi,%esi
  25. jmp LClampReentry0
  26. LClampHigh1:
  27. movl C(bbextentt),%edx
  28. jmp LClampReentry1
  29. LClampHighOrLow1:
  30. jg LClampHigh1
  31. xorl %edx,%edx
  32. jmp LClampReentry1
  33. LClampLow2:
  34. movl $4096,%ebp
  35. jmp LClampReentry2
  36. LClampHigh2:
  37. movl C(bbextents),%ebp
  38. jmp LClampReentry2
  39. LClampLow3:
  40. movl $4096,%ecx
  41. jmp LClampReentry3
  42. LClampHigh3:
  43. movl C(bbextentt),%ecx
  44. jmp LClampReentry3
  45. LClampLow4:
  46. movl $4096,%eax
  47. jmp LClampReentry4
  48. LClampHigh4:
  49. movl C(bbextents),%eax
  50. jmp LClampReentry4
  51. LClampLow5:
  52. movl $4096,%ebx
  53. jmp LClampReentry5
  54. LClampHigh5:
  55. movl C(bbextentt),%ebx
  56. jmp LClampReentry5
  57. #define pspans 4+16
//
// void D_DrawSpans16 (espan_t *pspans)
// cdecl entry point.  The argument is at 4(%esp) plus the 16 bytes of the
// four register pushes below — hence the "4+16" offset macro above.
// Draws every span in the list, recomputing perspective every 16 pixels.
//
  58. .align 4
  59. .globl C(D_DrawSpans16)
  60. C(D_DrawSpans16):
  61. pushl %ebp // preserve caller's stack frame
  62. pushl %edi
  63. pushl %esi // preserve register variables
  64. pushl %ebx
  65. //
  66. // set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
  67. // and span list pointers
  68. //
  69. // TODO: any overlap from rearranging?
// Integer moves are interleaved between the FP ops so they execute while the
// FPU multiplies are in flight.
  70. flds C(d_sdivzstepu)
  71. fmuls fp_16
  72. movl C(cacheblock),%edx
  73. flds C(d_tdivzstepu)
  74. fmuls fp_16
  75. movl pspans(%esp),%ebx // point to the first span descriptor
  76. flds C(d_zistepu)
  77. fmuls fp_16
  78. movl %edx,pbase // pbase = cacheblock
  79. fstps zi16stepu
  80. fstps tdivz16stepu
  81. fstps sdivz16stepu
  82. LSpanLoop:
// Top of the per-span loop.  %ebx = current espan_t; computes s/z, t/z and
// 1/z at the span's (u,v), starts the first 1/z FDIV, and points %edi at the
// first destination pixel.
  83. //
  84. // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
  85. // initial s and t values
  86. //
  87. // FIXME: pipeline FILD?
  88. fildl espan_t_v(%ebx)
  89. fildl espan_t_u(%ebx)
  90. fld %st(1) // dv | du | dv
  91. fmuls C(d_sdivzstepv) // dv*d_sdivzstepv | du | dv
  92. fld %st(1) // du | dv*d_sdivzstepv | du | dv
  93. fmuls C(d_sdivzstepu) // du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
  94. fld %st(2) // du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
  95. fmuls C(d_tdivzstepu) // du*d_tdivzstepu | du*d_sdivzstepu |
  96. // dv*d_sdivzstepv | du | dv
  97. fxch %st(1) // du*d_sdivzstepu | du*d_tdivzstepu |
  98. // dv*d_sdivzstepv | du | dv
  99. faddp %st(0),%st(2) // du*d_tdivzstepu |
  100. // du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
  101. fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
  102. // du*d_tdivzstepu | du | dv
  103. fld %st(3) // dv | du*d_sdivzstepu + dv*d_sdivzstepv |
  104. // du*d_tdivzstepu | du | dv
  105. fmuls C(d_tdivzstepv) // dv*d_tdivzstepv |
  106. // du*d_sdivzstepu + dv*d_sdivzstepv |
  107. // du*d_tdivzstepu | du | dv
  108. fxch %st(1) // du*d_sdivzstepu + dv*d_sdivzstepv |
  109. // dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
  110. fadds C(d_sdivzorigin) // sdivz = d_sdivzorigin + dv*d_sdivzstepv +
  111. // du*d_sdivzstepu; stays in %st(2) at end
  112. fxch %st(4) // dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
  113. // s/z
  114. fmuls C(d_zistepv) // dv*d_zistepv | dv*d_tdivzstepv |
  115. // du*d_tdivzstepu | du | s/z
  116. fxch %st(1) // dv*d_tdivzstepv | dv*d_zistepv |
  117. // du*d_tdivzstepu | du | s/z
  118. faddp %st(0),%st(2) // dv*d_zistepv |
  119. // dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
  120. fxch %st(2) // du | dv*d_tdivzstepv + du*d_tdivzstepu |
  121. // dv*d_zistepv | s/z
  122. fmuls C(d_zistepu) // du*d_zistepu |
  123. // dv*d_tdivzstepv + du*d_tdivzstepu |
  124. // dv*d_zistepv | s/z
  125. fxch %st(1) // dv*d_tdivzstepv + du*d_tdivzstepu |
  126. // du*d_zistepu | dv*d_zistepv | s/z
  127. fadds C(d_tdivzorigin) // tdivz = d_tdivzorigin + dv*d_tdivzstepv +
  128. // du*d_tdivzstepu; stays in %st(1) at end
  129. fxch %st(2) // dv*d_zistepv | du*d_zistepu | t/z | s/z
  130. faddp %st(0),%st(1) // dv*d_zistepv + du*d_zistepu | t/z | s/z
  131. flds fp_64k // fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
  132. fxch %st(1) // dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
  133. fadds C(d_ziorigin) // zi = d_ziorigin + dv*d_zistepv +
  134. // du*d_zistepu; stays in %st(0) at end
  135. // 1/z | fp_64k | t/z | s/z
  136. //
  137. // calculate and clamp s & t
  138. //
// The FDIV below runs in the FPU while the integer code sets up the
// destination pointer — classic Pentium FDIV overlap.
  139. fdivr %st(0),%st(1) // 1/z | z*64k | t/z | s/z
  140. //
  141. // point %edi to the first pixel in the span
  142. //
  143. movl C(d_viewbuffer),%ecx
  144. movl espan_t_v(%ebx),%eax
  145. movl %ebx,pspantemp // preserve spans pointer
  146. movl C(tadjust),%edx
  147. movl C(sadjust),%esi
  148. movl C(d_scantable)(,%eax,4),%edi // v * screenwidth
  149. addl %ecx,%edi
  150. movl espan_t_u(%ebx),%ecx
  151. addl %ecx,%edi // pdest = &pdestspan[scans->u];
  152. movl espan_t_count(%ebx),%ecx
  153. //
  154. // now start the FDIV for the end of the span
  155. //
  156. cmpl $16,%ecx
  157. ja LSetupNotLast1
  158. decl %ecx
  159. jz LCleanup1 // if only one pixel, no need to start an FDIV
  160. movl %ecx,spancountminus1
  161. // finish up the s and t calcs
  162. fxch %st(1) // z*64k | 1/z | t/z | s/z
  163. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  164. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  165. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  166. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  167. fxch %st(1) // s | t | 1/z | t/z | s/z
  168. fistpl s // 1/z | t | t/z | s/z
  169. fistpl t // 1/z | t/z | s/z
// Span is 2..16 pixels: advance s/z, t/z, 1/z by (spancount-1) steps so the
// FDIV computes z at the LAST pixel, not one past it.
  170. fildl spancountminus1
  171. flds C(d_tdivzstepu) // C(d_tdivzstepu) | spancountminus1
  172. flds C(d_zistepu) // C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
  173. fmul %st(2),%st(0) // C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
  174. fxch %st(1) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
  175. fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
  176. fxch %st(2) // scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
  177. fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
  178. // C(d_tdivzstepu)*scm1
  179. fxch %st(1) // C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
  180. // C(d_tdivzstepu)*scm1
  181. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
  182. fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
  183. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
  184. faddp %st(0),%st(3)
  185. flds fp_64k
  186. fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
  187. // overlap
  188. jmp LFDIVInFlight1
  189. LCleanup1:
// One-pixel span: just finish s and t; no end-of-span FDIV is needed.
  190. // finish up the s and t calcs
  191. fxch %st(1) // z*64k | 1/z | t/z | s/z
  192. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  193. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  194. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  195. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  196. fxch %st(1) // s | t | 1/z | t/z | s/z
  197. fistpl s // 1/z | t | t/z | s/z
  198. fistpl t // 1/z | t/z | s/z
  199. jmp LFDIVInFlight1
  200. .align 4
  201. LSetupNotLast1:
// Span longer than 16 pixels: finish s and t for the segment start, then
// advance s/z, t/z, 1/z by a full 16-pixel step (the *16stepu values saved
// at entry) and start the FDIV for the end of this 16-pixel segment.
  202. // finish up the s and t calcs
  203. fxch %st(1) // z*64k | 1/z | t/z | s/z
  204. fld %st(0) // z*64k | z*64k | 1/z | t/z | s/z
  205. fmul %st(4),%st(0) // s | z*64k | 1/z | t/z | s/z
  206. fxch %st(1) // z*64k | s | 1/z | t/z | s/z
  207. fmul %st(3),%st(0) // t | s | 1/z | t/z | s/z
  208. fxch %st(1) // s | t | 1/z | t/z | s/z
  209. fistpl s // 1/z | t | t/z | s/z
  210. fistpl t // 1/z | t/z | s/z
  211. fadds zi16stepu
  212. fxch %st(2)
  213. fadds sdivz16stepu
  214. fxch %st(2)
  215. flds tdivz16stepu
  216. faddp %st(0),%st(2)
  217. flds fp_64k
  218. fdiv %st(1),%st(0) // z = 1/1/z
  219. // this is what we've gone to all this trouble to
  220. // overlap
  221. LFDIVInFlight1:
// While the FDIV grinds away: apply sadjust/tadjust, clamp s (%esi) and
// t (%edx) to the texture extents, and compute the starting texel address.
  222. addl s,%esi
  223. addl t,%edx
  224. movl C(bbextents),%ebx
  225. movl C(bbextentt),%ebp
  226. cmpl %ebx,%esi
  227. ja LClampHighOrLow0
  228. LClampReentry0:
  229. movl %esi,s
  230. movl pbase,%ebx
  231. shll $16,%esi
  232. cmpl %ebp,%edx
  233. movl %esi,sfracf
  234. ja LClampHighOrLow1
  235. LClampReentry1:
  236. movl %edx,t
  237. movl s,%esi // sfrac = scans->sfrac;
  238. shll $16,%edx
  239. movl t,%eax // tfrac = scans->tfrac;
  240. sarl $16,%esi
  241. movl %edx,tfracf
  242. //
  243. // calculate the texture starting address
  244. //
  245. sarl $16,%eax
  246. movl C(cachewidth),%edx
  247. imull %edx,%eax // (tfrac >> 16) * cachewidth
  248. addl %ebx,%esi
  249. addl %eax,%esi // psource = pbase + (sfrac >> 16) +
  250. // ((tfrac >> 16) * cachewidth);
  251. //
  252. // determine whether last span or not
  253. //
  254. cmpl $16,%ecx
  255. jna LLastSegment
  256. //
  257. // not the last segment; do full 16-wide segment
  258. //
  259. LNotLastSegment:
  260. //
  261. // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
  262. // get there
  263. //
  264. // pick up after the FDIV that was left in flight previously
  265. fld %st(0) // duplicate it
  266. fmul %st(4),%st(0) // s = s/z * z
  267. fxch %st(1)
  268. fmul %st(3),%st(0) // t = t/z * z
  269. fxch %st(1)
  270. fistpl snext
  271. fistpl tnext
  272. movl snext,%eax
  273. movl tnext,%edx
  274. movb (%esi),%bl // get first source texel
  275. subl $16,%ecx // count off this segments' pixels
  276. movl C(sadjust),%ebp
  277. movl %ecx,counttemp // remember count of remaining pixels
  278. movl C(tadjust),%ecx
  279. movb %bl,(%edi) // store first dest pixel
  280. addl %eax,%ebp
  281. addl %edx,%ecx
  282. movl C(bbextents),%eax
  283. movl C(bbextentt),%edx
  284. cmpl $4096,%ebp
  285. jl LClampLow2
  286. cmpl %eax,%ebp
  287. ja LClampHigh2
  288. LClampReentry2:
  289. cmpl $4096,%ecx
  290. jl LClampLow3
  291. cmpl %edx,%ecx
  292. ja LClampHigh3
  293. LClampReentry3:
  294. movl %ebp,snext
  295. movl %ecx,tnext
  296. subl s,%ebp
  297. subl t,%ecx
  298. //
  299. // set up advancetable
  300. //
  301. movl %ecx,%eax
  302. movl %ebp,%edx
  303. sarl $20,%eax // tstep >>= 16;
  304. jz LZero
  305. sarl $20,%edx // sstep >>= 16;
  306. movl C(cachewidth),%ebx
  307. imull %ebx,%eax
  308. jmp LSetUp1
  309. LZero:
  310. sarl $20,%edx // sstep >>= 16;
  311. movl C(cachewidth),%ebx
  312. LSetUp1:
  313. addl %edx,%eax // add in sstep
  314. // (tstep >> 16) * cachewidth + (sstep >> 16);
  315. movl tfracf,%edx
  316. movl %eax,advancetable+4 // advance base in t
  317. addl %ebx,%eax // ((tstep >> 16) + 1) * cachewidth +
  318. // (sstep >> 16);
  319. shll $12,%ebp // left-justify sstep fractional part
  320. movl sfracf,%ebx
  321. shll $12,%ecx // left-justify tstep fractional part
  322. movl %eax,advancetable // advance extra in t
  323. movl %ecx,tstep
  324. addl %ecx,%edx // advance tfrac fractional part by tstep frac
  325. sbbl %ecx,%ecx // turn tstep carry into -1 (0 if none)
  326. addl %ebp,%ebx // advance sfrac fractional part by sstep frac
  327. adcl advancetable+4(,%ecx,4),%esi // point to next source texel
  328. addl tstep,%edx
  329. sbbl %ecx,%ecx
  330. movb (%esi),%al
  331. addl %ebp,%ebx
  332. movb %al,1(%edi)
  333. adcl advancetable+4(,%ecx,4),%esi
  334. addl tstep,%edx
  335. sbbl %ecx,%ecx
  336. addl %ebp,%ebx
  337. movb (%esi),%al
  338. adcl advancetable+4(,%ecx,4),%esi
  339. addl tstep,%edx
  340. sbbl %ecx,%ecx
  341. movb %al,2(%edi)
  342. addl %ebp,%ebx
  343. movb (%esi),%al
  344. adcl advancetable+4(,%ecx,4),%esi
  345. addl tstep,%edx
  346. sbbl %ecx,%ecx
  347. movb %al,3(%edi)
  348. addl %ebp,%ebx
  349. movb (%esi),%al
  350. adcl advancetable+4(,%ecx,4),%esi
  351. addl tstep,%edx
  352. sbbl %ecx,%ecx
  353. movb %al,4(%edi)
  354. addl %ebp,%ebx
  355. movb (%esi),%al
  356. adcl advancetable+4(,%ecx,4),%esi
  357. addl tstep,%edx
  358. sbbl %ecx,%ecx
  359. movb %al,5(%edi)
  360. addl %ebp,%ebx
  361. movb (%esi),%al
  362. adcl advancetable+4(,%ecx,4),%esi
  363. addl tstep,%edx
  364. sbbl %ecx,%ecx
  365. movb %al,6(%edi)
  366. addl %ebp,%ebx
  367. movb (%esi),%al
  368. adcl advancetable+4(,%ecx,4),%esi
  369. addl tstep,%edx
  370. sbbl %ecx,%ecx
  371. movb %al,7(%edi)
  372. addl %ebp,%ebx
  373. movb (%esi),%al
  374. adcl advancetable+4(,%ecx,4),%esi
  375. //
  376. // start FDIV for end of next segment in flight, so it can overlap
  377. //
  378. movl counttemp,%ecx
  379. cmpl $16,%ecx // more than one segment after this?
  380. ja LSetupNotLast2 // yes
  381. decl %ecx
  382. jz LFDIVInFlight2 // if only one pixel, no need to start an FDIV
  383. movl %ecx,spancountminus1
  384. fildl spancountminus1
  385. flds C(d_zistepu) // C(d_zistepu) | spancountminus1
  386. fmul %st(1),%st(0) // C(d_zistepu)*scm1 | scm1
  387. flds C(d_tdivzstepu) // C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
  388. fmul %st(2),%st(0) // C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
  389. fxch %st(1) // C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
  390. faddp %st(0),%st(3) // C(d_tdivzstepu)*scm1 | scm1
  391. fxch %st(1) // scm1 | C(d_tdivzstepu)*scm1
  392. fmuls C(d_sdivzstepu) // C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
  393. fxch %st(1) // C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
  394. faddp %st(0),%st(3) // C(d_sdivzstepu)*scm1
  395. flds fp_64k // 64k | C(d_sdivzstepu)*scm1
  396. fxch %st(1) // C(d_sdivzstepu)*scm1 | 64k
  397. faddp %st(0),%st(4) // 64k
  398. fdiv %st(1),%st(0) // this is what we've gone to all this trouble to
  399. // overlap
  400. jmp LFDIVInFlight2
  401. .align 4
  402. LSetupNotLast2:
  403. fadds zi16stepu
  404. fxch %st(2)
  405. fadds sdivz16stepu
  406. fxch %st(2)
  407. flds tdivz16stepu
  408. faddp %st(0),%st(2)
  409. flds fp_64k
  410. fdiv %st(1),%st(0) // z = 1/1/z
  411. // this is what we've gone to all this trouble to
  412. // overlap
  413. LFDIVInFlight2:
// Pixels 8..15 of the current segment, drawn while the next segment's FDIV
// is in flight.  Same carry-chain stepping as the first half.
  414. movl %ecx,counttemp
  415. addl tstep,%edx
  416. sbbl %ecx,%ecx
  417. movb %al,8(%edi)
  418. addl %ebp,%ebx
  419. movb (%esi),%al
  420. adcl advancetable+4(,%ecx,4),%esi
  421. addl tstep,%edx
  422. sbbl %ecx,%ecx
  423. movb %al,9(%edi)
  424. addl %ebp,%ebx
  425. movb (%esi),%al
  426. adcl advancetable+4(,%ecx,4),%esi
  427. addl tstep,%edx
  428. sbbl %ecx,%ecx
  429. movb %al,10(%edi)
  430. addl %ebp,%ebx
  431. movb (%esi),%al
  432. adcl advancetable+4(,%ecx,4),%esi
  433. addl tstep,%edx
  434. sbbl %ecx,%ecx
  435. movb %al,11(%edi)
  436. addl %ebp,%ebx
  437. movb (%esi),%al
  438. adcl advancetable+4(,%ecx,4),%esi
  439. addl tstep,%edx
  440. sbbl %ecx,%ecx
  441. movb %al,12(%edi)
  442. addl %ebp,%ebx
  443. movb (%esi),%al
  444. adcl advancetable+4(,%ecx,4),%esi
  445. addl tstep,%edx
  446. sbbl %ecx,%ecx
  447. movb %al,13(%edi)
  448. addl %ebp,%ebx
  449. movb (%esi),%al
  450. adcl advancetable+4(,%ecx,4),%esi
  451. addl tstep,%edx
  452. sbbl %ecx,%ecx
  453. movb %al,14(%edi)
  454. addl %ebp,%ebx
  455. movb (%esi),%al
  456. adcl advancetable+4(,%ecx,4),%esi
// Roll s/t state forward for the next segment; pixel 15 is stored at
// -1(%edi) below, after %edi has been advanced by 16.
  457. addl $16,%edi
  458. movl %edx,tfracf
  459. movl snext,%edx
  460. movl %ebx,sfracf
  461. movl tnext,%ebx
  462. movl %edx,s
  463. movl %ebx,t
  464. movl counttemp,%ecx // retrieve count
  465. //
  466. // determine whether last span or not
  467. //
  468. cmpl $16,%ecx // are there multiple segments remaining?
  469. movb %al,-1(%edi)
  470. ja LNotLastSegment // yes
  471. //
  472. // last segment of scan
  473. //
  474. LLastSegment:
// Final (1..16 pixel) segment.  s/t steps are computed with a reciprocal
// multiply (reciprocal_table_16) instead of a divide, then control jumps
// through entryvec_table_16 into the unrolled chain at the right offset for
// the remaining pixel count.
  475. //
  476. // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
  477. // get there. The number of pixels left is variable, and we want to land on the
  478. // last pixel, not step one past it, so we can't run into arithmetic problems
  479. //
  480. testl %ecx,%ecx
  481. jz LNoSteps // just draw the last pixel and we're done
  482. // pick up after the FDIV that was left in flight previously
  483. fld %st(0) // duplicate it
  484. fmul %st(4),%st(0) // s = s/z * z
  485. fxch %st(1)
  486. fmul %st(3),%st(0) // t = t/z * z
  487. fxch %st(1)
  488. fistpl snext
  489. fistpl tnext
  490. movb (%esi),%al // load first texel in segment
  491. movl C(tadjust),%ebx
  492. movb %al,(%edi) // store first pixel in segment
  493. movl C(sadjust),%eax
  494. addl snext,%eax
  495. addl tnext,%ebx
  496. movl C(bbextents),%ebp
  497. movl C(bbextentt),%edx
  498. cmpl $4096,%eax
  499. jl LClampLow4
  500. cmpl %ebp,%eax
  501. ja LClampHigh4
  502. LClampReentry4:
  503. movl %eax,snext
  504. cmpl $4096,%ebx
  505. jl LClampLow5
  506. cmpl %edx,%ebx
  507. ja LClampHigh5
  508. LClampReentry5:
  509. cmpl $1,%ecx // don't bother
  510. je LOnlyOneStep // if two pixels in segment, there's only one step,
  511. // of the segment length
  512. subl s,%eax
  513. subl t,%ebx
  514. addl %eax,%eax // convert to 15.17 format so multiply by 1.31
  515. addl %ebx,%ebx // reciprocal yields 16.48
  516. imull reciprocal_table_16-8(,%ecx,4) // sstep = (snext - s) /
  517. // (spancount-1)
  518. movl %edx,%ebp
  519. movl %ebx,%eax
  520. imull reciprocal_table_16-8(,%ecx,4) // tstep = (tnext - t) /
  521. // (spancount-1)
  522. LSetEntryvec:
  523. //
  524. // set up advancetable
  525. //
  526. movl entryvec_table_16(,%ecx,4),%ebx
  527. movl %edx,%eax
  528. movl %ebx,jumptemp // entry point into code for RET later
  529. movl %ebp,%ecx
  530. sarl $16,%edx // tstep >>= 16;
  531. movl C(cachewidth),%ebx
  532. sarl $16,%ecx // sstep >>= 16;
  533. imull %ebx,%edx
  534. addl %ecx,%edx // add in sstep
  535. // (tstep >> 16) * cachewidth + (sstep >> 16);
  536. movl tfracf,%ecx
  537. movl %edx,advancetable+4 // advance base in t
  538. addl %ebx,%edx // ((tstep >> 16) + 1) * cachewidth +
  539. // (sstep >> 16);
  540. shll $16,%ebp // left-justify sstep fractional part
  541. movl sfracf,%ebx
  542. shll $16,%eax // left-justify tstep fractional part
  543. movl %edx,advancetable // advance extra in t
  544. movl %eax,tstep
// Take the first texel step, then dispatch into the EntryN_16 handler that
// draws exactly the remaining number of pixels.
  545. movl %ecx,%edx
  546. addl %eax,%edx
  547. sbbl %ecx,%ecx
  548. addl %ebp,%ebx
  549. adcl advancetable+4(,%ecx,4),%esi
  550. jmp *jumptemp // jump to the number-of-pixels handler
  551. //----------------------------------------
  552. LNoSteps:
// Exactly one pixel left in the span; the shared LEndSpan store writes to
// 15(%edi), so back %edi up to compensate.
  553. movb (%esi),%al // load first texel in segment
  554. subl $15,%edi // adjust for hardwired offset
  555. jmp LEndSpan
  556. LOnlyOneStep:
// Two pixels left: a single step of the full remaining delta; skip the
// reciprocal multiply.
  557. subl s,%eax
  558. subl t,%ebx
  559. movl %eax,%ebp
  560. movl %ebx,%edx
  561. jmp LSetEntryvec
  562. //----------------------------------------
  563. .globl Entry2_16, Entry3_16, Entry4_16, Entry5_16
  564. .globl Entry6_16, Entry7_16, Entry8_16, Entry9_16
  565. .globl Entry10_16, Entry11_16, Entry12_16, Entry13_16
  566. .globl Entry14_16, Entry15_16, Entry16_16
  567. Entry2_16:
  568. subl $14,%edi // adjust for hardwired offsets
  569. movb (%esi),%al
  570. jmp LEntry2_16
  571. //----------------------------------------
  572. Entry3_16:
  573. subl $13,%edi // adjust for hardwired offsets
  574. addl %eax,%edx
  575. movb (%esi),%al
  576. sbbl %ecx,%ecx
  577. addl %ebp,%ebx
  578. adcl advancetable+4(,%ecx,4),%esi
  579. jmp LEntry3_16
  580. //----------------------------------------
  581. Entry4_16:
  582. subl $12,%edi // adjust for hardwired offsets
  583. addl %eax,%edx
  584. movb (%esi),%al
  585. sbbl %ecx,%ecx
  586. addl %ebp,%ebx
  587. adcl advancetable+4(,%ecx,4),%esi
  588. addl tstep,%edx
  589. jmp LEntry4_16
  590. //----------------------------------------
  591. Entry5_16:
  592. subl $11,%edi // adjust for hardwired offsets
  593. addl %eax,%edx
  594. movb (%esi),%al
  595. sbbl %ecx,%ecx
  596. addl %ebp,%ebx
  597. adcl advancetable+4(,%ecx,4),%esi
  598. addl tstep,%edx
  599. jmp LEntry5_16
  600. //----------------------------------------
  601. Entry6_16:
  602. subl $10,%edi // adjust for hardwired offsets
  603. addl %eax,%edx
  604. movb (%esi),%al
  605. sbbl %ecx,%ecx
  606. addl %ebp,%ebx
  607. adcl advancetable+4(,%ecx,4),%esi
  608. addl tstep,%edx
  609. jmp LEntry6_16
  610. //----------------------------------------
  611. Entry7_16:
  612. subl $9,%edi // adjust for hardwired offsets
  613. addl %eax,%edx
  614. movb (%esi),%al
  615. sbbl %ecx,%ecx
  616. addl %ebp,%ebx
  617. adcl advancetable+4(,%ecx,4),%esi
  618. addl tstep,%edx
  619. jmp LEntry7_16
  620. //----------------------------------------
  621. Entry8_16:
  622. subl $8,%edi // adjust for hardwired offsets
  623. addl %eax,%edx
  624. movb (%esi),%al
  625. sbbl %ecx,%ecx
  626. addl %ebp,%ebx
  627. adcl advancetable+4(,%ecx,4),%esi
  628. addl tstep,%edx
  629. jmp LEntry8_16
  630. //----------------------------------------
  631. Entry9_16:
  632. subl $7,%edi // adjust for hardwired offsets
  633. addl %eax,%edx
  634. movb (%esi),%al
  635. sbbl %ecx,%ecx
  636. addl %ebp,%ebx
  637. adcl advancetable+4(,%ecx,4),%esi
  638. addl tstep,%edx
  639. jmp LEntry9_16
  640. //----------------------------------------
  641. Entry10_16:
  642. subl $6,%edi // adjust for hardwired offsets
  643. addl %eax,%edx
  644. movb (%esi),%al
  645. sbbl %ecx,%ecx
  646. addl %ebp,%ebx
  647. adcl advancetable+4(,%ecx,4),%esi
  648. addl tstep,%edx
  649. jmp LEntry10_16
  650. //----------------------------------------
  651. Entry11_16:
  652. subl $5,%edi // adjust for hardwired offsets
  653. addl %eax,%edx
  654. movb (%esi),%al
  655. sbbl %ecx,%ecx
  656. addl %ebp,%ebx
  657. adcl advancetable+4(,%ecx,4),%esi
  658. addl tstep,%edx
  659. jmp LEntry11_16
  660. //----------------------------------------
  661. Entry12_16:
  662. subl $4,%edi // adjust for hardwired offsets
  663. addl %eax,%edx
  664. movb (%esi),%al
  665. sbbl %ecx,%ecx
  666. addl %ebp,%ebx
  667. adcl advancetable+4(,%ecx,4),%esi
  668. addl tstep,%edx
  669. jmp LEntry12_16
  670. //----------------------------------------
  671. Entry13_16:
  672. subl $3,%edi // adjust for hardwired offsets
  673. addl %eax,%edx
  674. movb (%esi),%al
  675. sbbl %ecx,%ecx
  676. addl %ebp,%ebx
  677. adcl advancetable+4(,%ecx,4),%esi
  678. addl tstep,%edx
  679. jmp LEntry13_16
  680. //----------------------------------------
  681. Entry14_16:
  682. subl $2,%edi // adjust for hardwired offsets
  683. addl %eax,%edx
  684. movb (%esi),%al
  685. sbbl %ecx,%ecx
  686. addl %ebp,%ebx
  687. adcl advancetable+4(,%ecx,4),%esi
  688. addl tstep,%edx
  689. jmp LEntry14_16
  690. //----------------------------------------
  691. Entry15_16:
  692. decl %edi // adjust for hardwired offsets
  693. addl %eax,%edx
  694. movb (%esi),%al
  695. sbbl %ecx,%ecx
  696. addl %ebp,%ebx
  697. adcl advancetable+4(,%ecx,4),%esi
  698. addl tstep,%edx
  699. jmp LEntry15_16
  700. //----------------------------------------
  701. Entry16_16:
  702. addl %eax,%edx
  703. movb (%esi),%al
  704. sbbl %ecx,%ecx
  705. addl %ebp,%ebx
  706. adcl advancetable+4(,%ecx,4),%esi
  707. addl tstep,%edx
  708. sbbl %ecx,%ecx
  709. movb %al,1(%edi)
  710. addl %ebp,%ebx
  711. movb (%esi),%al
  712. adcl advancetable+4(,%ecx,4),%esi
  713. addl tstep,%edx
  714. LEntry15_16:
  715. sbbl %ecx,%ecx
  716. movb %al,2(%edi)
  717. addl %ebp,%ebx
  718. movb (%esi),%al
  719. adcl advancetable+4(,%ecx,4),%esi
  720. addl tstep,%edx
  721. LEntry14_16:
  722. sbbl %ecx,%ecx
  723. movb %al,3(%edi)
  724. addl %ebp,%ebx
  725. movb (%esi),%al
  726. adcl advancetable+4(,%ecx,4),%esi
  727. addl tstep,%edx
  728. LEntry13_16:
  729. sbbl %ecx,%ecx
  730. movb %al,4(%edi)
  731. addl %ebp,%ebx
  732. movb (%esi),%al
  733. adcl advancetable+4(,%ecx,4),%esi
  734. addl tstep,%edx
  735. LEntry12_16:
  736. sbbl %ecx,%ecx
  737. movb %al,5(%edi)
  738. addl %ebp,%ebx
  739. movb (%esi),%al
  740. adcl advancetable+4(,%ecx,4),%esi
  741. addl tstep,%edx
  742. LEntry11_16:
  743. sbbl %ecx,%ecx
  744. movb %al,6(%edi)
  745. addl %ebp,%ebx
  746. movb (%esi),%al
  747. adcl advancetable+4(,%ecx,4),%esi
  748. addl tstep,%edx
  749. LEntry10_16:
  750. sbbl %ecx,%ecx
  751. movb %al,7(%edi)
  752. addl %ebp,%ebx
  753. movb (%esi),%al
  754. adcl advancetable+4(,%ecx,4),%esi
  755. addl tstep,%edx
  756. LEntry9_16:
  757. sbbl %ecx,%ecx
  758. movb %al,8(%edi)
  759. addl %ebp,%ebx
  760. movb (%esi),%al
  761. adcl advancetable+4(,%ecx,4),%esi
  762. addl tstep,%edx
  763. LEntry8_16:
  764. sbbl %ecx,%ecx
  765. movb %al,9(%edi)
  766. addl %ebp,%ebx
  767. movb (%esi),%al
  768. adcl advancetable+4(,%ecx,4),%esi
  769. addl tstep,%edx
  770. LEntry7_16:
  771. sbbl %ecx,%ecx
  772. movb %al,10(%edi)
  773. addl %ebp,%ebx
  774. movb (%esi),%al
  775. adcl advancetable+4(,%ecx,4),%esi
  776. addl tstep,%edx
  777. LEntry6_16:
  778. sbbl %ecx,%ecx
  779. movb %al,11(%edi)
  780. addl %ebp,%ebx
  781. movb (%esi),%al
  782. adcl advancetable+4(,%ecx,4),%esi
  783. addl tstep,%edx
  784. LEntry5_16:
  785. sbbl %ecx,%ecx
  786. movb %al,12(%edi)
  787. addl %ebp,%ebx
  788. movb (%esi),%al
  789. adcl advancetable+4(,%ecx,4),%esi
  790. addl tstep,%edx
  791. LEntry4_16:
  792. sbbl %ecx,%ecx
  793. movb %al,13(%edi)
  794. addl %ebp,%ebx
  795. movb (%esi),%al
  796. adcl advancetable+4(,%ecx,4),%esi
  797. LEntry3_16:
  798. movb %al,14(%edi)
  799. movb (%esi),%al
  800. LEntry2_16:
  801. LEndSpan:
  802. //
  803. // clear s/z, t/z, 1/z from FP stack
  804. //
  805. fstp %st(0)
  806. fstp %st(0)
  807. fstp %st(0)
  808. movl pspantemp,%ebx // restore spans pointer
  809. movl espan_t_pnext(%ebx),%ebx // point to next span
  810. testl %ebx,%ebx // any more spans?
  811. movb %al,15(%edi)
  812. jnz LSpanLoop // more spans
  813. popl %ebx // restore register variables
  814. popl %esi
  815. popl %edi
  816. popl %ebp // restore the caller's stack frame
  817. ret
  818. //----------------------------------------------------------------------
  819. // 8-bpp horizontal span z drawing codefor polygons, with no transparency.
  820. //
  821. // Assumes there is at least one span in pzspans, and that every span
  822. // contains at least one pixel
  823. //----------------------------------------------------------------------
  824. .text
  825. // z-clamp on a non-negative gradient span
  826. LClamp:
  827. movl $0x40000000,%edx
  828. xorl %ebx,%ebx
  829. fstp %st(0)
  830. jmp LZDraw
  831. // z-clamp on a negative gradient span
  832. LClampNeg:
  833. movl $0x40000000,%edx
  834. xorl %ebx,%ebx
  835. fstp %st(0)
  836. jmp LZDrawNeg
  837. #define pzspans 4+16
  838. .globl C(D_DrawZSpans)
  839. C(D_DrawZSpans):
  840. pushl %ebp // preserve caller's stack frame
  841. pushl %edi
  842. pushl %esi // preserve register variables
  843. pushl %ebx
  844. flds C(d_zistepu)
  845. movl C(d_zistepu),%eax
  846. movl pzspans(%esp),%esi
  847. testl %eax,%eax
  848. jz LFNegSpan
  849. fmuls Float2ToThe31nd
  850. fistpl izistep // note: we are relying on FP exceptions being turned
  851. // off here to avoid range problems
  852. movl izistep,%ebx // remains loaded for all spans
  853. LFSpanLoop:
  854. // set up the initial 1/z value
  855. fildl espan_t_v(%esi)
  856. fildl espan_t_u(%esi)
  857. movl espan_t_v(%esi),%ecx
  858. movl C(d_pzbuffer),%edi
  859. fmuls C(d_zistepu)
  860. fxch %st(1)
  861. fmuls C(d_zistepv)
  862. fxch %st(1)
  863. fadds C(d_ziorigin)
  864. imull C(d_zrowbytes),%ecx
  865. faddp %st(0),%st(1)
  866. // clamp if z is nearer than 2 (1/z > 0.5)
  867. fcoms float_point5
  868. addl %ecx,%edi
  869. movl espan_t_u(%esi),%edx
  870. addl %edx,%edx // word count
  871. movl espan_t_count(%esi),%ecx
  872. addl %edx,%edi // pdest = &pdestspan[scans->u];
  873. pushl %esi // preserve spans pointer
  874. fnstsw %ax
  875. testb $0x45,%ah
  876. jz LClamp
  877. fmuls Float2ToThe31nd
  878. fistpl izi // note: we are relying on FP exceptions being turned
  879. // off here to avoid problems when the span is closer
  880. // than 1/(2**31)
  881. movl izi,%edx
  882. // at this point:
  883. // %ebx = izistep
  884. // %ecx = count
  885. // %edx = izi
  886. // %edi = pdest
  887. LZDraw:
  888. // do a single pixel up front, if necessary to dword align the destination
  889. testl $2,%edi
  890. jz LFMiddle
  891. movl %edx,%eax
  892. addl %ebx,%edx
  893. shrl $16,%eax
  894. decl %ecx
  895. movw %ax,(%edi)
  896. addl $2,%edi
  897. // do middle a pair of aligned dwords at a time
  898. LFMiddle:
  899. pushl %ecx
  900. shrl $1,%ecx // count / 2
  901. jz LFLast // no aligned dwords to do
  902. shrl $1,%ecx // (count / 2) / 2
  903. jnc LFMiddleLoop // even number of aligned dwords to do
  904. movl %edx,%eax
  905. addl %ebx,%edx
  906. shrl $16,%eax
  907. movl %edx,%esi
  908. addl %ebx,%edx
  909. andl $0xFFFF0000,%esi
  910. orl %esi,%eax
  911. movl %eax,(%edi)
  912. addl $4,%edi
  913. andl %ecx,%ecx
  914. jz LFLast
  915. LFMiddleLoop:
  916. movl %edx,%eax
  917. addl %ebx,%edx
  918. shrl $16,%eax
  919. movl %edx,%esi
  920. addl %ebx,%edx
  921. andl $0xFFFF0000,%esi
  922. orl %esi,%eax
  923. movl %edx,%ebp
  924. movl %eax,(%edi)
  925. addl %ebx,%edx
  926. shrl $16,%ebp
  927. movl %edx,%esi
  928. addl %ebx,%edx
  929. andl $0xFFFF0000,%esi
  930. orl %esi,%ebp
  931. movl %ebp,4(%edi) // FIXME: eliminate register contention
  932. addl $8,%edi
  933. decl %ecx
  934. jnz LFMiddleLoop
  935. LFLast:
  936. popl %ecx // retrieve count
  937. popl %esi // retrieve span pointer
  938. // do the last, unaligned pixel, if there is one
  939. andl $1,%ecx // is there an odd pixel left to do?
  940. jz LFSpanDone // no
  941. shrl $16,%edx
  942. movw %dx,(%edi) // do the final pixel's z
  943. LFSpanDone:
  944. movl espan_t_pnext(%esi),%esi
  945. testl %esi,%esi
  946. jnz LFSpanLoop
  947. jmp LFDone
  948. LFNegSpan:
  949. fmuls FloatMinus2ToThe31nd
  950. fistpl izistep // note: we are relying on FP exceptions being turned
  951. // off here to avoid range problems
  952. movl izistep,%ebx // remains loaded for all spans
  953. LFNegSpanLoop:
  954. // set up the initial 1/z value
  955. fildl espan_t_v(%esi)
  956. fildl espan_t_u(%esi)
  957. movl espan_t_v(%esi),%ecx
  958. movl C(d_pzbuffer),%edi
  959. fmuls C(d_zistepu)
  960. fxch %st(1)
  961. fmuls C(d_zistepv)
  962. fxch %st(1)
  963. fadds C(d_ziorigin)
  964. imull C(d_zrowbytes),%ecx
  965. faddp %st(0),%st(1)
  966. // clamp if z is nearer than 2 (1/z > 0.5)
  967. fcoms float_point5
  968. addl %ecx,%edi
  969. movl espan_t_u(%esi),%edx
  970. addl %edx,%edx // word count
  971. movl espan_t_count(%esi),%ecx
  972. addl %edx,%edi // pdest = &pdestspan[scans->u];
  973. pushl %esi // preserve spans pointer
  974. fnstsw %ax
  975. testb $0x45,%ah
  976. jz LClampNeg
  977. fmuls Float2ToThe31nd
  978. fistpl izi // note: we are relying on FP exceptions being turned
  979. // off here to avoid problems when the span is closer
  980. // than 1/(2**31)
  981. movl izi,%edx
  982. // at this point:
  983. // %ebx = izistep
  984. // %ecx = count
  985. // %edx = izi
  986. // %edi = pdest
  987. LZDrawNeg:
  988. // do a single pixel up front, if necessary to dword align the destination
  989. testl $2,%edi
  990. jz LFNegMiddle
  991. movl %edx,%eax
  992. subl %ebx,%edx
  993. shrl $16,%eax
  994. decl %ecx
  995. movw %ax,(%edi)
  996. addl $2,%edi
  997. // do middle a pair of aligned dwords at a time
  998. LFNegMiddle:
  999. pushl %ecx
  1000. shrl $1,%ecx // count / 2
  1001. jz LFNegLast // no aligned dwords to do
  1002. shrl $1,%ecx // (count / 2) / 2
  1003. jnc LFNegMiddleLoop // even number of aligned dwords to do
  1004. movl %edx,%eax
  1005. subl %ebx,%edx
  1006. shrl $16,%eax
  1007. movl %edx,%esi
  1008. subl %ebx,%edx
  1009. andl $0xFFFF0000,%esi
  1010. orl %esi,%eax
  1011. movl %eax,(%edi)
  1012. addl $4,%edi
  1013. andl %ecx,%ecx
  1014. jz LFNegLast
  1015. LFNegMiddleLoop:
  1016. movl %edx,%eax
  1017. subl %ebx,%edx
  1018. shrl $16,%eax
  1019. movl %edx,%esi
  1020. subl %ebx,%edx
  1021. andl $0xFFFF0000,%esi
  1022. orl %esi,%eax
  1023. movl %edx,%ebp
  1024. movl %eax,(%edi)
  1025. subl %ebx,%edx
  1026. shrl $16,%ebp
  1027. movl %edx,%esi
  1028. subl %ebx,%edx
  1029. andl $0xFFFF0000,%esi
  1030. orl %esi,%ebp
  1031. movl %ebp,4(%edi) // FIXME: eliminate register contention
  1032. addl $8,%edi
  1033. decl %ecx
  1034. jnz LFNegMiddleLoop
  1035. LFNegLast:
  1036. popl %ecx // retrieve count
  1037. popl %esi // retrieve span pointer
  1038. // do the last, unaligned pixel, if there is one
  1039. andl $1,%ecx // is there an odd pixel left to do?
  1040. jz LFNegSpanDone // no
  1041. shrl $16,%edx
  1042. movw %dx,(%edi) // do the final pixel's z
  1043. LFNegSpanDone:
  1044. movl espan_t_pnext(%esi),%esi
  1045. testl %esi,%esi
  1046. jnz LFNegSpanLoop
  1047. LFDone:
  1048. popl %ebx // restore register variables
  1049. popl %esi
  1050. popl %edi
  1051. popl %ebp // restore the caller's stack frame
  1052. ret
  1053. #endif // id386