/*
Copyright (C) 1996-1997 Id Software, Inc.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
//
// d_draw16.s
// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
// subdivision.
//

#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"

#if id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency and
// 16-pixel subdivision.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
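
// In outline, the routine below is the hand-scheduled equivalent of a C
// loop along these lines (a sketch only, modeled on the C span drawer
// D_DrawSpans8 in d_scan.c; clamp/min are shorthand, not real helpers):
//
//	do
//	{
//		pdest = (unsigned char *)d_viewbuffer +
//				d_scantable[pspan->v] + pspan->u;
//		count = pspan->count;
//
//		// perspective-correct s/z, t/z, 1/z at the span start
//		du = (float)pspan->u;
//		dv = (float)pspan->v;
//		sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
//		tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
//		zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
//		z = (float)0x10000 / zi;	// scaled so s and t come out 16.16
//
//		s = clamp((int)(sdivz * z) + sadjust, 0, bbextents);
//		t = clamp((int)(tdivz * z) + tadjust, 0, bbextentt);
//
//		do
//		{
//			spancount = min(count, 16);
//			count -= spancount;
//			// compute s and t at the far end of the segment from the
//			// advanced s/z, t/z, 1/z, then step linearly across the
//			// 16 (or fewer) pixels in between
//			...
//		} while (count > 0);
//	} while ((pspan = pspan->pnext) != NULL);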
	.data

	.text

// out-of-line, rarely-needed clamping code

LClampHigh0:
	movl	C(bbextents),%esi
	jmp	LClampReentry0
LClampHighOrLow0:
	jg	LClampHigh0
	xorl	%esi,%esi
	jmp	LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx
	jmp	LClampReentry1
LClampHighOrLow1:
	jg	LClampHigh1
	xorl	%edx,%edx
	jmp	LClampReentry1

LClampLow2:
	movl	$4096,%ebp
	jmp	LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp	LClampReentry2

LClampLow3:
	movl	$4096,%ecx
	jmp	LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp	LClampReentry3

LClampLow4:
	movl	$4096,%eax
	jmp	LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp	LClampReentry4

LClampLow5:
	movl	$4096,%ebx
	jmp	LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp	LClampReentry5


#define pspans	4+16
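// pspans is the stack offset of the argument: 4 bytes of return address
// plus the 16 bytes of registers pushed on entry below.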
	.align 4
.globl C(D_DrawSpans16)
C(D_DrawSpans16):
	pushl	%ebp			// preserve caller's stack frame
	pushl	%edi
	pushl	%esi			// preserve register variables
	pushl	%ebx

//
// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_16
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_16
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_16
	movl	%edx,pbase		// pbase = cacheblock
	fstps	zi16stepu
	fstps	tdivz16stepu
	fstps	sdivz16stepu
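
// The stores above set up, in effect (a C sketch; fp_16 is the constant
// 16.0 defined alongside the other FP constants):
//
//	sdivz16stepu = d_sdivzstepu * 16;
//	tdivz16stepu = d_tdivzstepu * 16;
//	zi16stepu    = d_zistepu * 16;
//
// so advancing s/z, t/z, and 1/z to the next 16-pixel segment boundary
// costs a single fadds per term instead of a full re-evaluation.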

LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld	%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)		// dv*d_sdivzstepv | du | dv
	fld	%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)		// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld	%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)		// du*d_tdivzstepu | du*d_sdivzstepu |
					//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
					//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)		// du*d_tdivzstepu |
					//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fld	%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)		// dv*d_tdivzstepv |
					//  du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
					//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
					//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
					//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
					//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv | dv*d_zistepv |
					//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)		// dv*d_zistepv |
					//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
					//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
					//  dv*d_tdivzstepv + du*d_tdivzstepu |
					//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
					//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
					//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)		// dv*d_zistepv + du*d_zistepu | t/z | s/z
	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
					//  du*d_zistepu; stays in %st(0) at end
					// 1/z | fp_64k | t/z | s/z

//
// calculate and clamp s & t
//
	fdivr	%st(0),%st(1)		// 1/z | z*64k | t/z | s/z
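
// From here on s and t are handled as 16.16 fixed point. Dividing the
// constant 64k (65536.0) by 1/z leaves z*65536 on the stack, so the later
// multiplies (s/z)*(z*64k) and (t/z)*(z*64k) produce s and t already
// scaled into 16.16, ready for fistpl. In C terms (a sketch):
//
//	z = (float)0x10000 / zi;
//	s = (int)(sdivz * z) + sadjust;	// 16.16; clamped to [0, bbextents]
//	t = (int)(tdivz * z) + tadjust;	// 16.16; clamped to [0, bbextentt]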

//
// point %edi to the first pixel in the span
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp		// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi

	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi		// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span
//
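// The FDIV started in each of the three cases below computes z at the far
// end of the segment. FDIV is by far the slowest operation here (tens of
// cycles on the Pentium), but it executes in the FPU concurrently with
// integer instructions, so it is issued as early as possible and its
// result is not consumed until a full segment's worth of integer pixel
// work has been done; that is the overlap the comments below refer to.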
	cmpl	$16,%ecx
	ja	LSetupNotLast1

	decl	%ecx
	jz	LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z

	fildl	spancountminus1

	flds	C(d_tdivzstepu)		// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)		// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)		// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)		// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
					//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
					//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)		// this is what we've gone to all this trouble to
					//  overlap
	jmp	LFDIVInFlight1
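
// When the first segment is also the last and is shorter than 16 pixels,
// the code above scales the u-direction steps by spancount-1 instead of 16
// so the end-of-segment s/z, t/z, and 1/z land exactly on the last pixel.
// Roughly (a C sketch):
//
//	zi    += (spancount - 1) * d_zistepu;
//	sdivz += (spancount - 1) * d_sdivzstepu;
//	tdivz += (spancount - 1) * d_tdivzstepu;
//	z = (float)0x10000 / zi;	// the overlapped FDIV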

LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z
	jmp	LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z

	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)		// z = 1/1/z
					// this is what we've gone to all this trouble to
					//  overlap

LFDIVInFlight1:

	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja	LClampHighOrLow0
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi
	cmpl	%ebp,%edx
	movl	%esi,sfracf
	ja	LClampHighOrLow1
LClampReentry1:
	movl	%edx,t
	movl	s,%esi			// sfrac = scans->sfrac;
	shll	$16,%edx
	movl	t,%eax			// tfrac = scans->tfrac;
	sarl	$16,%esi
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax
	movl	C(cachewidth),%edx
	imull	%edx,%eax		// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi		// psource = pbase + (sfrac >> 16) +
					//  ((tfrac >> 16) * cachewidth);

//
// determine whether last span or not
//
	cmpl	$16,%ecx
	jna	LLastSegment

//
// not the last segment; do full 16-wide segment
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld	%st(0)			// duplicate it
	fmul	%st(4),%st(0)		// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)		// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl		// get first source texel
	subl	$16,%ecx		// count off this segment's pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp		// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)		// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$4096,%ebp
	jl	LClampLow2
	cmpl	%eax,%ebp
	ja	LClampHigh2
LClampReentry2:

	cmpl	$4096,%ecx
	jl	LClampLow3
	cmpl	%edx,%ecx
	ja	LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp
	subl	t,%ecx

//
// set up advancetable
//
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$20,%eax		// tstep >>= 16 (the shift is 20 because the
					//  /16 for the 16-pixel segment is folded in)
	jz	LZero
	sarl	$20,%edx		// sstep >>= 16, likewise folding in the /16
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp	LSetUp1

LZero:
	sarl	$20,%edx		// sstep >>= 16, likewise folding in the /16
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax		// add in sstep
					//  (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax		// ((tstep >> 16) + 1) * cachewidth +
					//  (sstep >> 16);
	shll	$12,%ebp		// left-justify the per-pixel sstep fraction
	movl	sfracf,%ebx
	shll	$12,%ecx		// left-justify the per-pixel tstep fraction
	movl	%eax,advancetable	// advance extra in t
	movl	%ecx,tstep

	addl	%ecx,%edx		// advance tfrac fractional part by tstep frac
	sbbl	%ecx,%ecx		// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx		// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel
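
// Each pixel from here on repeats that four-instruction pattern, the
// fixed-point equivalent of (a C sketch, with 32-bit wrapping adds):
//
//	tfracf += tstep;	/* tcarry = carry out of the add */
//	sfracf += sstep;	/* scarry = carry out of the add */
//	psource += (tcarry ? advancetable[0] : advancetable[1]) + scarry;
//
// advancetable[1] holds (tstep>>16)*cachewidth + (sstep>>16); entry [0]
// adds one extra cachewidth for when the t fraction carried. The sbbl
// materializes the t carry as -1/0 for the scaled index, and the adcl
// folds the s carry into the pointer advance for free.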

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

//
// start FDIV for end of next segment in flight, so it can overlap
//
	movl	counttemp,%ecx
	cmpl	$16,%ecx		// more than one segment after this?
	ja	LSetupNotLast2		// yes

	decl	%ecx
	jz	LFDIVInFlight2		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)		// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)		// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)		// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)		// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)		// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)		// 64k
	fdiv	%st(1),%st(0)		// this is what we've gone to all this trouble to
					//  overlap
	jmp	LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)		// z = 1/1/z
					// this is what we've gone to all this trouble to
					//  overlap

LFDIVInFlight2:
	movl	%ecx,counttemp

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,14(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	$16,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last span or not
//
	cmpl	$16,%ecx		// are there multiple segments remaining?
	movb	%al,-1(%edi)
	ja	LNotLastSegment		// yes

//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
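// On entry here %ecx holds the number of pixels in the final segment minus
// one, i.e. the number of s,t steps still to take. The plan: compute exact
// per-pixel steps that land on the last pixel, then jump into the unrolled
// store code below at the entry matching the pixel count.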
	testl	%ecx,%ecx
	jz	LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously
	fld	%st(0)			// duplicate it
	fmul	%st(4),%st(0)		// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)		// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$4096,%eax
	jl	LClampLow4
	cmpl	%ebp,%eax
	ja	LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$4096,%ebx
	jl	LClampLow5
	cmpl	%edx,%ebx
	ja	LClampHigh5

LClampReentry5:

	cmpl	$1,%ecx			// don't bother dividing
	je	LOnlyOneStep		// if two pixels in segment, there's only one
					//  step, of the segment length

	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
						//  (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
						//  (spancount-1)
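
// The imulls above divide without a slow DIV instruction: each table
// entry appears to hold 1/n as a 1.31 fixed-point value, indexed so that
// %ecx = n = spancount-1 selects the right one. Multiplying the doubled
// 16.16 delta (now 15.17) by the 1.31 reciprocal gives a 64-bit 16.48
// product, and its high dword, which imull leaves in %edx, is the 16.16
// per-pixel step. Roughly (a C sketch; recip[n] ~= 2^31/n):
//
//	sstep = (int)(((int64_t)(snext - s) * 2) * recip[n] >> 32);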

LSetEntryvec:

//
// set up advancetable
//
	movl	entryvec_table_16(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// entry point into the unrolled code, used
					//  by the indirect jump below
	movl	%ebp,%ecx
	sarl	$16,%edx		// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx		// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx		// add in sstep
					//  (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx		// ((tstep >> 16) + 1) * cachewidth +
					//  (sstep >> 16);
	shll	$16,%ebp		// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax		// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t
	movl	%eax,tstep

	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp	*jumptemp		// jump to the number-of-pixels handler
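
// This indirect jump acts like a switch on the pixel count into the middle
// of an unrolled loop (the same idea as Duff's device):
// entryvec_table_16[spancount-1] holds the address of EntryN_16 for an
// N-pixel tail, so exactly spancount-1 more pixels get stepped and stored
// after the first pixel written above.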

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$15,%edi		// adjust for hardwired offset
	jmp	LEndSpan

LOnlyOneStep:
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp	LSetEntryvec

//----------------------------------------

.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
.globl	Entry14_16, Entry15_16, Entry16_16
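
// All fifteen entry points funnel into the shared unrolled code below,
// whose stores use hardwired offsets ending with the final pixel at
// 15(%edi). Each EntryN_16 therefore biases %edi down by 16-N so the
// fixed offsets land on the N pixels actually remaining in the segment.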

Entry2_16:
	subl	$14,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp	LEntry2_16

//----------------------------------------

Entry3_16:
	subl	$13,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp	LEntry3_16

//----------------------------------------

Entry4_16:
	subl	$12,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry4_16

//----------------------------------------

Entry5_16:
	subl	$11,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry5_16

//----------------------------------------

Entry6_16:
	subl	$10,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry6_16

//----------------------------------------

Entry7_16:
	subl	$9,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry7_16

//----------------------------------------

Entry8_16:
	subl	$8,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry8_16

//----------------------------------------

Entry9_16:
	subl	$7,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry9_16

//----------------------------------------

Entry10_16:
	subl	$6,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry10_16

//----------------------------------------

Entry11_16:
	subl	$5,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry11_16

//----------------------------------------

Entry12_16:
	subl	$4,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry12_16

//----------------------------------------

Entry13_16:
	subl	$3,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry13_16

//----------------------------------------

Entry14_16:
	subl	$2,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry14_16

//----------------------------------------

Entry15_16:
	decl	%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry15_16

//----------------------------------------

Entry16_16:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry15_16:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry14_16:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry13_16:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry12_16:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry11_16:
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry10_16:
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry9_16:
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry8_16:
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry7_16:
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry6_16:
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry5_16:
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry4_16:
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LEntry3_16:
	movb	%al,14(%edi)
	movb	(%esi),%al
LEntry2_16:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)

	movl	pspantemp,%ebx		// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx		// any more spans?
	movb	%al,15(%edi)
	jnz	LSpanLoop		// more spans

	popl	%ebx			// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp			// restore the caller's stack frame
	ret

#endif	// id386