crc_i386.asm 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. ;===========================================================================
  2. ; Copyright (c) 1990-2007 Info-ZIP. All rights reserved.
  3. ;
  4. ; See the accompanying file LICENSE, version 2000-Apr-09 or later
  5. ; (the contents of which are also included in zip.h) for terms of use.
  6. ; If, for some reason, all these files are missing, the Info-ZIP license
  7. ; also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html
  8. ;===========================================================================
  9. ; crc_i386.asm, optimized CRC calculation function for Zip and UnZip,
  10. ; created by Paul Kienitz and Christian Spieler. Last revised 07 Jan 2007.
  11. ;
  12. ; Revised 06-Oct-96, Scott Field (sfield@microsoft.com)
  13. ; fixed to assemble with masm by not using .model directive which makes
  14. ; assumptions about segment alignment. Also,
  15. ; avoid using loop, and j[e]cxz where possible. Use mov + inc, rather
  16. ; than lodsb, and other misc. changes resulting in the following performance
  17. ; increases:
  18. ;
  19. ;   unrolled loops          NO_UNROLLED_LOOPS
  20. ;   *8    >8    <8          *8    >8    <8
  21. ;
  22. ;   +54%  +42%  +35%        +82%  +52%  +25%
  23. ;
  24. ; first item in each table is input buffer length, even multiple of 8
  25. ; second item in each table is input buffer length, > 8
  26. ; third item in each table is input buffer length, < 8
  27. ;
  28. ; Revised 02-Apr-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
  29. ; Incorporated Rodney Brown's 32-bit-reads optimization as found in the
  30. ; UNIX AS source crc_i386.S. This new code can be disabled by defining
  31. ; the macro symbol NO_32_BIT_LOADS.
  32. ;
  33. ; Revised 12-Oct-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au)
  34. ; Incorporated Rodney Brown's additional tweaks for 32-bit-optimized CPUs
  35. ; (like the Pentium Pro, Pentium II, and probably some Pentium clones).
  36. ; This optimization is controlled by the macro symbol __686 and is disabled
  37. ; by default. (This default is based on the assumption that most users
  38. ; do not yet work on a Pentium Pro or Pentium II machine ...)
  39. ;
  40. ; Revised 25-Mar-98, Cosmin Truta (cosmint@cs.ubbcluj.ro)
  41. ; Working without .model directive caused tasm32 version 5.0 to produce
  42. ; bad object code. The optimized alignments can be optionally disabled
  43. ; by defining NO_ALIGN, which allows the use of .model flat. There is no need
  44. ; to define this macro if using other versions of tasm.
  45. ;
  46. ; Revised 16-Jan-2005, Cosmin Truta (cosmint@cs.ubbcluj.ro)
  47. ; Enabled the 686 build by default, because there are hardly any pre-686 CPUs
  48. ; in serious use nowadays. (See the 12-Oct-97 note above.)
  49. ;
  50. ; Revised 03-Jan-2006, Chr. Spieler
  51. ; Enlarged unrolling loops to "do 16 bytes per turn"; optimized access to
  52. ; data buffer in loop body (adjust pointer only once in loop body and use
  53. ; offsets to access each item); added additional support for the "unfolded
  54. ; tables" optimization variant (enabled by IZ_CRCOPTIM_UNFOLDTBL).
  55. ;
  56. ; Revised 07-Jan-2007, Chr. Spieler
  57. ; Recognize additional conditional flag CRC_TABLE_ONLY that prevents
  58. ; compilation of the crc32() function.
  59. ;
  60. ; FLAT memory model assumed.
  61. ;
  62. ; Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS.
  63. ; This results in shorter code at the expense of reduced performance.
  64. ;
  65. ;==============================================================================
  66. ;
  67. ; Do NOT assemble this source if external crc32 routine from zlib gets used,
  68. ; or only the precomputed CRC_32_Table is needed.
  69. ;
  IFNDEF USE_ZLIB                         ; skip everything when zlib supplies crc32()
  IFNDEF CRC_TABLE_ONLY                   ; ... or when only the precomputed table is wanted
  ; Both conditionals above stay open until the end of the file.

          .386p                           ; enable the 80386 instruction set
          NAME    crc_i386

  IFDEF NO_ALIGN
          ; The build asked for no explicit alignment, so .model may be used.
          .MODEL  FLAT
  ENDIF

  ; Select the 686-tuned code paths unless the build defines PRE_686
  ; (or has already defined __686 itself).
  IFNDEF PRE_686
  IFNDEF __686
  __686   EQU     1
  ENDIF
  ENDIF

          ; C side: ZCONST ulg near *get_crc_table(void);
          EXTRN   _get_crc_table:NEAR
  84. ;
  IFNDEF NO_STD_STACKFRAME
  ; Default build: conventional ebp-based stack frame (it also yields the
  ; smaller code). After STD_ENTRY the first argument sits at [ebp+8]:
  ; 4 bytes of saved ebp plus 4 bytes of return address.
  STD_ENTRY MACRO
          push    ebp
          mov     ebp, esp
  ENDM

  Arg1    EQU     [ebp+08H]
  Arg2    EQU     [ebp+0CH]
  Arg3    EQU     [ebp+10H]

  STD_LEAVE MACRO
          pop     ebp
  ENDM

  ELSE ; NO_STD_STACKFRAME
  ; Frameless build: entry/leave expand to nothing and the arguments are
  ; addressed relative to esp. The 18H bias is the return address (4 bytes)
  ; plus the five registers _crc32 pushes before it reads any argument.
  STD_ENTRY MACRO
  ENDM

  Arg1    EQU     [esp+18H]
  Arg2    EQU     [esp+1CH]
  Arg3    EQU     [esp+20H]

  STD_LEAVE MACRO
  ENDM

  ENDIF ; ?NO_STD_STACKFRAME
  ; Do_CRC -- one table-driven CRC-32 step on the low byte of the running value:
  ;       c = (c >> 8) ^ table[c & 0xFF]
  ;
  ; Register roles shared by the CRC loop-body macros:
  ;   eax   running crc value "c"                      (modified)
  ;   esi   pointer to the next input byte or dword    (advanced by the wrapper macros)
  ;   edi   base address of the crc_table array        (read only)
  ;   ebx   scratch: index into crc_table              (modified)
  ;
  ; Without __686 (386, 486, Pentium tuning) only bl is written, so the upper
  ; three bytes of ebx must already be zero. With __686 (Pentium Pro,
  ; Pentium II and compatible CPUs) the index is zero-extended with movzx.
  Do_CRC  MACRO
  IFNDEF __686
          mov     bl, al                  ; tmp = c & 0xFF
  ELSE
          movzx   ebx, al                 ; tmp = c & 0xFF
  ENDIF
          shr     eax, 8                  ; c >>= 8
          xor     eax, [edi+ebx*4]        ; c ^= table[tmp]
  ENDM
  ; Do_CRC_byte -- consume one input byte and advance the buffer pointer:
  ;       c = table[(c ^ *buf++) & 0xFF] ^ (c >> 8)
  ; Modifies eax, ebx and esi; reads edi (see Do_CRC).
  Do_CRC_byte MACRO
          xor     al, byte ptr [esi]      ; fold the next byte into the low byte of c
          inc     esi                     ; buf++
          Do_CRC
  ENDM
  ; Do_CRC_byteof disp -- consume the input byte at buf[disp] without moving
  ; esi; the caller adjusts the pointer once after a run of these.
  ; Modifies eax and ebx; reads esi and edi (see Do_CRC).
  Do_CRC_byteof MACRO disp
          xor     al, byte ptr [esi+disp] ; c ^= *(buf + disp)
          Do_CRC
  ENDM
  IFNDEF NO_32_BIT_LOADS
  ; ---------------------------------------------------------------------------
  ; Dword-at-a-time helpers, assembled unless NO_32_BIT_LOADS is defined.
  ;
  ;   UpdCRC_dword          run the four bytes already xor-ed into eax through
  ;                         the table
  ;   UpdCRC_dword_sh n     same, and advance esi by n dwords on the way
  ;   Do_CRC_dword          xor in one dword from [esi], update, esi += 4
  ;   Do_CRC_4dword         xor in and update four dwords, esi += 16
  ;
  ; SavLen names the place where _crc32 parks the remaining byte count while
  ; ecx is used as a loop counter.
  ; ---------------------------------------------------------------------------
  IFDEF IZ_CRCOPTIM_UNFOLDTBL
  ; Unfolded-table variant: one lookup per byte in four consecutive 256-entry
  ; dword tables (byte offsets 0, 1024, 2048, 3072 from edi), combined in edx.
  ; edx is therefore busy, so the saved length lives in the Arg3 stack slot.
  SavLen  EQU     Arg3

  UpdCRC_dword MACRO
          movzx   ebx, al                 ; tmp = c & 0xFF
          mov     edx, [edi+ebx*4+3072]   ;  table[256*3+tmp]
          movzx   ebx, ah                 ; tmp = (c>>8) & 0xFF
          shr     eax, 16
          xor     edx, [edi+ebx*4+2048]   ;  ^ table[256*2+tmp]
          movzx   ebx, al                 ; tmp = (c>>16) & 0xFF
          shr     eax, 8                  ; tmp = (c>>24)
          xor     edx, [edi+ebx*4+1024]   ;  ^ table[256*1+tmp]
          mov     eax, [edi+eax*4]        ;  ^ table[256*0+tmp]
          xor     eax, edx
  ENDM

  UpdCRC_dword_sh MACRO dwPtrIncr
          movzx   ebx, al                 ; tmp = c & 0xFF
          mov     edx, [edi+ebx*4+3072]   ;  table[256*3+tmp]
          movzx   ebx, ah                 ; tmp = (c>>8) & 0xFF
          xor     edx, [edi+ebx*4+2048]   ;  ^ table[256*2+tmp]
          shr     eax, 16
          movzx   ebx, al                 ; tmp = (c>>16) & 0xFF
          add     esi, 4*dwPtrIncr        ; ((ulg *)buf) += dwPtrIncr
          shr     eax, 8                  ; tmp = (c>>24)
          xor     edx, [edi+ebx*4+1024]   ;  ^ table[256*1+tmp]
          mov     eax, [edi+eax*4]        ;  ^ table[256*0+tmp]
          xor     eax, edx
  ENDM

  ELSE ; IZ_CRCOPTIM_UNFOLDTBL
  ; Single-table variant: four byte-wise Do_CRC steps per dword. edx is not
  ; used by the CRC macros here, so it can hold the saved length.
  SavLen  EQU     edx

  UpdCRC_dword MACRO
          Do_CRC
          Do_CRC
          Do_CRC
          Do_CRC
  ENDM

  UpdCRC_dword_sh MACRO dwPtrIncr
          Do_CRC
          Do_CRC
          add     esi, 4*dwPtrIncr        ; ((ulg *)buf) += dwPtrIncr
          Do_CRC
          Do_CRC
  ENDM

  ENDIF ; ?IZ_CRCOPTIM_UNFOLDTBL

  Do_CRC_dword MACRO
          xor     eax, dword ptr [esi]    ; c ^= *(ulg *)buf
          UpdCRC_dword_sh 1               ; ... ((ulg *)buf)++
  ENDM

  Do_CRC_4dword MACRO
          xor     eax, dword ptr [esi]    ; c ^= *(ulg *)buf
          UpdCRC_dword
          xor     eax, dword ptr [esi+4]  ; c ^= *((ulg *)buf+1)
          UpdCRC_dword
          xor     eax, dword ptr [esi+8]  ; c ^= *((ulg *)buf+2)
          UpdCRC_dword
          xor     eax, dword ptr [esi+12] ; c ^= *((ulg *)buf+3)
          UpdCRC_dword_sh 4               ; ... ((ulg *)buf)+=4
  ENDM

  ELSE ; NO_32_BIT_LOADS
  ; Byte-only build. _crc32's unrolled 16-byte loop still saves the length
  ; through SavLen, so the symbol must exist here as well; otherwise a build
  ; with NO_32_BIT_LOADS but without NO_UNROLLED_LOOPS references an undefined
  ; symbol. None of the byte-wise CRC macros touch edx, so it is free to use.
  SavLen  EQU     edx

  ENDIF ; ?NO_32_BIT_LOADS
  IFNDEF NO_ALIGN
  _TEXT   SEGMENT USE32 PARA PUBLIC 'CODE'
  ELSE
  _TEXT   SEGMENT USE32
  ENDIF
          ASSUME  CS: _TEXT

          PUBLIC  _crc32

  ;---------------------------------------------------------------------------
  ; ulg crc32(ulg crc, ZCONST uch *buf, extent len)
  ;
  ; In:    Arg1 = crc   running CRC value
  ;        Arg2 = buf   input buffer; a null pointer makes the routine return 0
  ;        Arg3 = len   number of bytes to process
  ;        (all three are read from the stack; the final ret pops nothing)
  ; Out:   eax  = updated CRC, i.e. ~update(~crc, buf, len); 0 if buf is null
  ; Saves: edi, esi, ebx, edx, ecx (pushed on entry, popped on exit), plus ebp
  ;        when the standard stack frame is in use.
  ; Calls: _get_crc_table, whose result in eax becomes the table base in edi.
  ;
  ; Build switches: NO_UNROLLED_LOOPS keeps only the byte loop at the end;
  ; NO_32_BIT_LOADS drops the dword alignment prelude and the 4-byte loop and
  ; makes the 16-byte loop work byte by byte; NO_ALIGN drops the loop-head
  ; alignment.
  ;---------------------------------------------------------------------------
  _crc32  PROC    NEAR
          STD_ENTRY
          push    edi
          push    esi
          push    ebx
          push    edx
          push    ecx

          mov     esi, Arg2               ; esi = buf
          sub     eax, eax                ; preset the result for the null-buffer case
          test    esi, esi
          jz      crc_return              ; if (!buf) return 0;

          call    _get_crc_table
          mov     edi, eax                ; edi = table base
          mov     eax, Arg1               ; eax = crc
  IFNDEF __686
          sub     ebx, ebx                ; keep the upper bytes of ebx zero for Do_CRC
  ENDIF
          mov     ecx, Arg3               ; ecx = len
          not     eax                     ; c = ~crc
          test    ecx, ecx                ; ZF tells the code below whether len == 0

  IFNDEF NO_UNROLLED_LOOPS
          jz      crc_finish              ; nothing to do for len == 0

  IFNDEF NO_32_BIT_LOADS
          ; Eat single bytes until buf sits on a dword boundary (or len runs out).
  crc_align_loop:
          test    esi, 3
          jz      SHORT crc_aligned
          Do_CRC_byte
          dec     ecx
          jnz     crc_align_loop
  crc_aligned:
  ENDIF ; !NO_32_BIT_LOADS

          mov     SavLen, ecx             ; remember the remaining length
          shr     ecx, 4                  ; ecx = number of whole 16-byte groups
          jz      crc_after16
  IFNDEF NO_ALIGN
          ; put the loop head at the start of a 486 internal cache line
          ALIGN   16
  ENDIF
  crc_loop16:
  IFNDEF NO_32_BIT_LOADS
          Do_CRC_4dword                   ; 16 bytes as four dwords; advances esi
  ELSE ; NO_32_BIT_LOADS
          Do_CRC_byteof 0
          Do_CRC_byteof 1
          Do_CRC_byteof 2
          Do_CRC_byteof 3
          Do_CRC_byteof 4
          Do_CRC_byteof 5
          Do_CRC_byteof 6
          Do_CRC_byteof 7
          Do_CRC_byteof 8
          Do_CRC_byteof 9
          Do_CRC_byteof 10
          Do_CRC_byteof 11
          Do_CRC_byteof 12
          Do_CRC_byteof 13
          Do_CRC_byteof 14
          Do_CRC_byteof 15
          add     esi, 16                 ; buf += 16
  ENDIF ; ?NO_32_BIT_LOADS
          dec     ecx
          jnz     crc_loop16

  crc_after16:
          mov     ecx, SavLen
          and     ecx, 00000000FH         ; ecx = bytes left after the 16-byte groups

  IFNDEF NO_32_BIT_LOADS
          shr     ecx, 2                  ; ecx = whole dwords among those bytes
          jz      SHORT crc_after4
  crc_loop4:
          Do_CRC_dword
          dec     ecx
          jnz     crc_loop4
  crc_after4:
          mov     ecx, SavLen
          and     ecx, 000000003H         ; ecx = trailing bytes (len % 4)
  ENDIF ; !NO_32_BIT_LOADS
  ENDIF ; !NO_UNROLLED_LOOPS

          ; ZF comes from the last "and" above, or from "test ecx,ecx" when
          ; NO_UNROLLED_LOOPS is defined.
          jz      SHORT crc_finish        ; if (len) {
  IFNDEF NO_ALIGN
          ; put the loop head at the start of a 486 internal cache line
          ALIGN   16
  ENDIF
  crc_tail_loop:                          ;   do {
          Do_CRC_byte                     ;     c = CRC32(c, *buf++, crctab);
          dec     ecx                     ;   } while (--len);
          jnz     crc_tail_loop
  crc_finish:                             ; }
          not     eax                     ; return ~c;
  crc_return:
          pop     ecx
          pop     edx
          pop     ebx
          pop     esi
          pop     edi
          STD_LEAVE
          ret
  _crc32  ENDP

  _TEXT   ENDS
  305. ;
  ENDIF   ; !CRC_TABLE_ONLY  (opened near the top of the file)
  ENDIF   ; !USE_ZLIB        (opened near the top of the file)

          END