crc_i386.S 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. /*
  2. Copyright (c) 1990-2007 Info-ZIP. All rights reserved.
  3. See the accompanying file LICENSE, version 2000-Apr-09 or later
  4. (the contents of which are also included in zip.h) for terms of use.
  5. If, for some reason, all these files are missing, the Info-ZIP license
  6. also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html
  7. */
  8. /*
  9. * crc_i386.S, optimized CRC calculation function for Zip and UnZip,
  10. * created by Paul Kienitz and Christian Spieler. Last revised 07 Jan 2007.
  11. *
  12. * GRR 961110: incorporated Scott Field optimizations from win32/crc_i386.asm
  13. * => overall 6% speedup in "unzip -tq" on 9MB zipfile (486-66)
  14. *
  15. * SPC 970402: revised for Rodney Brown's optimizations (32-bit-wide
  16. * aligned reads for most of the data from buffer), can be
  17. * disabled by defining the macro NO_32_BIT_LOADS
  18. *
  19. * SPC 971012: added Rodney Brown's additional tweaks for 32-bit-optimized
  20. * CPUs (like the Pentium Pro, Pentium II, and probably some
  21. * Pentium clones). This optimization is controlled by the
  22. * preprocessor switch "__686" and is disabled by default.
  23. * (This default is based on the assumption that most users
  24. * do not yet work on a Pentium Pro or Pentium II machine ...)
  25. *
  26. * COS 050116: Enabled the 686 build by default, because there are hardly any
  27. * pre-686 CPUs in serious use nowadays. (See SPC 970402 above.)
  28. *
  29. * SPC 060103: Updated code to incorporate newer optimizations found in zlib.
  30. *
  31. * SPC 070107: Added conditional switch to deactivate crc32() compilation.
  32. *
  33. * FLAT memory model assumed. Calling interface:
  34. * - args are pushed onto the stack from right to left,
  35. * - return value is given in the EAX register,
  36. * - all other registers (with exception of EFLAGS) are preserved. (With
  37. * GNU C 2.7.x, %edx and %ecx are `scratch' registers, but preserving
  38. * them nevertheless adds only 4 single byte instructions.)
  39. *
  40. * This source generates the function
  41. * ulg crc32(ulg crc, ZCONST uch *buf, extent len).
  42. *
  43. * Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS.
  44. * This results in shorter code at the expense of reduced performance.
  45. */
  46. /* This file is NOT used in conjunction with zlib, or when only creation of
  47. * the basic CRC_32_Table (for other purpose) is requested.
  48. */
  49. #if !defined(USE_ZLIB) && !defined(CRC_TABLE_ONLY)
  /* Symbol decoration: the code below refers to the C-visible symbols with a
   * leading underscore.  When the toolchain does not prepend '_' to external
   * symbols, map those names to the undecorated ones.  This is selected
   * explicitly by preprocessing with -DNO_UNDERLINE, and automatically
   * whenever __ELF__ is defined.
   */
  #if defined(__ELF__) || defined(NO_UNDERLINE)
  # define _get_crc_table get_crc_table
  # define _crc32 crc32
  #endif
  /* Loop heads are aligned on a 16-byte boundary, padded with NOPs (0x90).
   *
   * A plain ".align 4" is ambiguous under gas: on a.out i386 targets the
   * operand is a power-of-two exponent (16 bytes), but on ELF i386 targets
   * it is a byte count (4 bytes only).  ".p2align" always takes the
   * exponent, so ".p2align 4" means 16 bytes everywhere gas is used.
   *
   * For assemblers without .p2align (e.g. SVR4 "as", where .align takes a
   * number of bytes), define ALIGNMENT on the command line instead.
   */
  #ifndef ALIGNMENT
  # define ALIGNMENT .p2align 4,0x90
  #endif
  64. #if defined(i386) || defined(_i386) || defined(_I386) || defined(__i386)
  65. /* This version is for 386 Unix, OS/2, MSDOS in 32 bit mode (gcc & gas).
  66. * Warning: it uses the AT&T syntax: mov source,dest
  67. * This file is only optional. If you want to use the C version,
  68. * remove -DASM_CRC from CFLAGS in Makefile and set OBJA to an empty string.
  69. */
  70. .file "crc_i386.S"
  /* CPU tuning default: unless the build explicitly requests the pre-686
   * code paths (PRE_686), or has already made the choice (__686), select
   * the variant optimized for Pentium Pro and compatible CPUs.
   */
  #ifndef PRE_686
  # ifndef __686
  #  define __686
  # endif
  #endif
  /* Stack frame selection.
   *
   * The default is the standard %ebp stack frame, because it results in
   * smaller code.  Defining NO_STD_STACKFRAME selects the frameless variant
   * and takes precedence over USE_STD_STACKFRAME.
   *
   * (Earlier revisions undefined the unrelated name USE_STACKFRAME here and
   * re-enabled the standard frame whenever NO_STD_STACKFRAME was given on
   * its own, so the frameless variant could never be selected.)
   */
  #ifdef NO_STD_STACKFRAME
  # undef USE_STD_STACKFRAME
  #else
  # ifndef USE_STD_STACKFRAME
  #  define USE_STD_STACKFRAME
  # endif
  #endif

  #ifdef USE_STD_STACKFRAME
  /* Standard frame: after _STD_ENTRY, 0(%ebp) holds the saved %ebp and
   * 4(%ebp) the return address, so the arguments start at 8(%ebp).
   */
  # define _STD_ENTRY pushl %ebp ; movl %esp,%ebp
  # define arg1 8(%ebp)
  # define arg2 12(%ebp)
  # define arg3 16(%ebp)
  # define _STD_LEAVE popl %ebp
  #else /* !USE_STD_STACKFRAME */
  /* Frameless: arguments are addressed relative to %esp.  The offsets
   * account for the return address plus the five registers that _crc32
   * pushes on entry (4 + 5*4 = 24), so they are only valid while exactly
   * those five registers are on the stack.
   */
  # define _STD_ENTRY
  # define arg1 24(%esp)
  # define arg2 28(%esp)
  # define arg3 32(%esp)
  # define _STD_LEAVE
  #endif /* ?USE_STD_STACKFRAME */
  98. /*
  99. * These two (three) macros make up the loop body of the CRC32 cruncher.
  100. * registers modified:
  101. * eax : crc value "c"
  102. * esi : pointer to next data byte (or lword) "buf++"
  103. * registers read:
  104. * edi : pointer to base of crc_table array
  105. * scratch registers:
  106. * ebx : index into crc_table array
  107. * (requires upper three bytes = 0 when __686 is undefined)
  108. */
  /* Do_CRC: one table-driven step on the low byte of the running CRC,
   *     c = (c >> 8) ^ table[c & 0xFF]
   * with c in %eax, the table base in %edi and %ebx as the scratch index.
   *
   * Only the way the index is formed depends on the CPU tuning:
   *   __686 defined : movzbl writes all of %ebx.
   *   otherwise     : only %bl is written, so the upper three bytes of
   *                   %ebx must already be zero.
   * The expansion ends with a ';' so that invocations can be chained.
   */
  #ifdef __686 /* optimize for Pentium Pro and compatible CPUs */
  # define CRC_IDX_FROM_AL movzbl %al, %ebx ;
  #else /* !__686 : optimize for 386, 486, Pentium */
  # define CRC_IDX_FROM_AL movb %al, %bl ;
  #endif /* ?__686 */
  #define Do_CRC \
      CRC_IDX_FROM_AL                       /* tmp = c & 0xFF    */\
      shrl    $8, %eax ;                    /* c = (c >> 8)      */\
      xorl    (%edi, %ebx, 4), %eax ;       /* c ^= table[tmp]   */
  /* Do_CRC_byte: fold the byte at *buf into the CRC and step buf forward,
   *     c = (c >> 8) ^ table[(c ^ *buf++) & 0xFF]
   * %esi is the buffer pointer; the table step itself is done by Do_CRC.
   */
  #define Do_CRC_byte \
      xorb    (%esi), %al ;                 /* c ^= *buf         */\
      incl    %esi ;                        /* buf++             */\
      Do_CRC
  /* Do_CRC_byteof(ofs): fold the byte at buf[ofs] into the CRC,
   *     c = (c >> 8) ^ table[(c ^ buf[ofs]) & 0xFF]
   * WITHOUT moving the buffer pointer.
   *
   * %esi must not be incremented here: the 16x unrolled byte loop invokes
   * this macro with ofs = 0..15 and then adds 16 to %esi itself.  An extra
   * "incl %esi" per invocation would make that loop read buf[0], buf[2],
   * ..., buf[30] and advance by 32 bytes for every 16 counted, giving a
   * wrong CRC and reading past the end of the buffer.
   */
  #define Do_CRC_byteof(ofs) \
      xorb    ofs(%esi), %al ;              /* c ^= buf[ofs]     */\
      Do_CRC
  128. #ifndef NO_32_BIT_LOADS
  /* UpdCRC_lword / UpdCRC_lword_sh(n): consume the four data bytes that
   * have already been XORed into %eax.  The _sh form additionally advances
   * buf (%esi) by n lwords in the middle of the sequence.
   * SavLen names the place where the remaining length is parked while %ecx
   * serves as the loop counter.
   */
  # ifdef IZ_CRCOPTIM_UNFOLDTBL
  /* Unfolded-table variant: one lookup per byte of %eax, taken at byte
   * offsets 3072, 2048, 1024 and 0 from the table base in %edi, combined
   * in %edx.  Because %edx is needed for the calculation, the length is
   * saved in the arg3 stack slot instead.
   */
  #  define SavLen arg3
  #  define UpdCRC_lw_lo \
        movzbl  %al, %ebx ; \
        movl    3072(%edi,%ebx,4), %edx ; \
        movzbl  %ah, %ebx ; \
        shrl    $16, %eax ; \
        xorl    2048(%edi,%ebx,4), %edx ; \
        movzbl  %al, %ebx ;
  #  define UpdCRC_lw_hi \
        shrl    $8, %eax ; \
        xorl    1024(%edi,%ebx,4), %edx ; \
        movl    (%edi,%eax,4), %eax ; \
        xorl    %edx, %eax ;
  #  define UpdCRC_lword \
        UpdCRC_lw_lo \
        UpdCRC_lw_hi
  #  define UpdCRC_lword_sh(dwPtrIncr) \
        UpdCRC_lw_lo \
        addl    $4*(dwPtrIncr), %esi ;      /* ((ulg *)buf) += dwPtrIncr */\
        UpdCRC_lw_hi
  # else /* !IZ_CRCOPTIM_UNFOLDTBL */
  /* Single-table variant: four consecutive Do_CRC steps.  %edx is not
   * needed anywhere else, so it holds the saved length.
   */
  #  define SavLen %edx
  #  define Do_CRC_x2 \
        Do_CRC \
        Do_CRC
  #  define UpdCRC_lword \
        Do_CRC_x2 \
        Do_CRC_x2
  #  define UpdCRC_lword_sh(dwPtrIncr) \
        Do_CRC_x2 \
        addl    $4*(dwPtrIncr), %esi ;      /* ((ulg *)buf) += dwPtrIncr */\
        Do_CRC_x2
  # endif /* ?IZ_CRCOPTIM_UNFOLDTBL */
  /* Do_CRC_lword: XOR the 32-bit word at buf into the CRC, run the four
   * table steps for it, and advance buf by one lword.
   */
  #define Do_CRC_lword \
      xorl    (%esi), %eax ;                /* c ^= *(ulg *)buf      */\
      UpdCRC_lword_sh(1)                    /* ... ((ulg *)buf)++    */
  /* Do_CRC_4lword: process 16 bytes as four 32-bit words.  The first three
   * words are read at fixed offsets from buf; buf (%esi) is advanced by
   * four lwords while the last word is being processed.
   */
  #define Do_CRC_4lword \
      xorl    (%esi), %eax ;                /* c ^= *(ulg *)buf          */\
      UpdCRC_lword \
      xorl    4(%esi), %eax ;               /* c ^= *((ulg *)buf + 1)    */\
      UpdCRC_lword \
      xorl    8(%esi), %eax ;               /* c ^= *((ulg *)buf + 2)    */\
      UpdCRC_lword \
      xorl    12(%esi), %eax ;              /* c ^= *((ulg *)buf + 3)    */\
      UpdCRC_lword_sh(4)                    /* ... ((ulg *)buf) += 4     */
  182. #endif /* !NO_32_BIT_LOADS */
  /*---------------------------------------------------------------------
   * ulg crc32(ulg crc, uch *buf, extent len)
   *
   * ABI:   32-bit, flat memory model, arguments on the stack (pushed
   *        right to left), result in %eax.
   * In:    arg1 = crc, arg2 = buf, arg3 = len
   * Out:   %eax = ~c after len bytes were folded into c = ~crc;
   *        %eax = 0 when buf is NULL.
   * Saved: %edi, %esi, %ebx, %edx, %ecx are pushed on entry and restored
   *        before returning; EFLAGS is not preserved.
   *
   * Register roles after setup:
   *   %eax  running CRC value "c"
   *   %esi  pointer to the next data byte / lword
   *   %edi  table base returned by get_crc_table()
   *   %ecx  loop counter
   *   %ebx  table index scratch (kept zero-extended when __686 is undefined)
   *   %edx  SavLen, or lookup accumulator with IZ_CRCOPTIM_UNFOLDTBL (then
   *         SavLen is the arg3 stack slot, which gets overwritten)
   *---------------------------------------------------------------------*/
          .text
          .globl  _crc32
  #ifdef __ELF__
          .type   _crc32, @function
  #endif
  _crc32:                                 /* ulg crc32(ulg crc, uch *buf, extent len) */
          _STD_ENTRY
          pushl   %edi
          pushl   %esi
          pushl   %ebx
          pushl   %edx
          pushl   %ecx

          movl    arg2, %esi              /* 2nd arg: uch *buf            */
          xorl    %eax, %eax              /* > if (!buf)                  */
          testl   %esi, %esi              /* >   return 0;                */
          jz      .L_fine                 /* > else {                     */

          /* The table pointer comes back in %eax, so the crc argument is
           * loaded only after the call.
           * NOTE(review): %esp is not 16-byte aligned at this call; confirm
           * this is acceptable for the ABI get_crc_table() is built for.
           */
          call    _get_crc_table
          movl    %eax, %edi
          movl    arg1, %eax              /* 1st arg: ulg crc             */
  #ifndef __686
          xorl    %ebx, %ebx              /* ebx=0; bl usable as dword    */
  #endif
          movl    arg3, %ecx              /* 3rd arg: extent len          */
          notl    %eax                    /* >   c = ~crc;                */

          testl   %ecx, %ecx              /* ZF consumed by the next jz   */
  #ifndef NO_UNROLLED_LOOPS
          jz      .L_bail
  # ifndef NO_32_BIT_LOADS
          /* Here len > 0.  Process single bytes until buf is lword aligned
           * or the data runs out.
           */
  .L_align_loop:
          testl   $3, %esi                /* Align buf on lword boundary  */
          jz      .L_aligned_now
          Do_CRC_byte
          decl    %ecx
          jnz     .L_align_loop
  .L_aligned_now:
  # endif /* !NO_32_BIT_LOADS */
          movl    %ecx, SavLen            /* save current value of len    */
          shrl    $4, %ecx                /* ecx = len / 16               */
          jz      .L_No_Sixteens
          /* align loop head at start of 486 internal cache line !! */
          ALIGNMENT
  .L_Next_Sixteen:
  # ifndef NO_32_BIT_LOADS
          Do_CRC_4lword
  # else /* NO_32_BIT_LOADS */
          /* Sixteen bytes at fixed offsets, then a single pointer update.
           * NOTE(review): this is only correct if Do_CRC_byteof leaves %esi
           * unchanged.
           */
          Do_CRC_byteof(0)
          Do_CRC_byteof(1)
          Do_CRC_byteof(2)
          Do_CRC_byteof(3)
          Do_CRC_byteof(4)
          Do_CRC_byteof(5)
          Do_CRC_byteof(6)
          Do_CRC_byteof(7)
          Do_CRC_byteof(8)
          Do_CRC_byteof(9)
          Do_CRC_byteof(10)
          Do_CRC_byteof(11)
          Do_CRC_byteof(12)
          Do_CRC_byteof(13)
          Do_CRC_byteof(14)
          Do_CRC_byteof(15)
          addl    $16, %esi               /* buf += 16                    */
  # endif /* ?NO_32_BIT_LOADS */
          decl    %ecx
          jnz     .L_Next_Sixteen

  .L_No_Sixteens:
          movl    SavLen, %ecx
          andl    $15, %ecx               /* ecx = len % 16               */
  # ifndef NO_32_BIT_LOADS
          shrl    $2, %ecx                /* ecx = (len % 16) / 4         */
          jz      .L_No_Fours
  .L_Next_Four:
          Do_CRC_lword
          decl    %ecx
          jnz     .L_Next_Four
  .L_No_Fours:
          movl    SavLen, %ecx
          andl    $3, %ecx                /* ecx = len % 4                */
  # endif /* !NO_32_BIT_LOADS */
  #endif /* !NO_UNROLLED_LOOPS */
          /* ZF comes from the last testl/andl on %ecx in every configuration. */
          jz      .L_bail                 /* >   if (len)                 */
          /* align loop head at start of 486 internal cache line !! */
          ALIGNMENT
  .L_loupe:                               /* >     do {                   */
          Do_CRC_byte                     /*         c = CRC32(c,*buf++,crctab); */
          decl    %ecx                    /* >     } while (--len);       */
          jnz     .L_loupe

  .L_bail:                                /* > }                          */
          notl    %eax                    /* > return ~c;                 */
  .L_fine:
          popl    %ecx
          popl    %edx
          popl    %ebx
          popl    %esi
          popl    %edi
          _STD_LEAVE
          ret
  #ifdef __ELF__
          .size   _crc32, .-_crc32
  #endif

  #if defined(__ELF__) && defined(__linux__)
          /* Mark the object as not needing an executable stack; without
           * this note the GNU linker assumes it does.
           */
          .section .note.GNU-stack,"",@progbits
  #endif
  278. #else
  279. error: this asm version is for 386 only
  280. #endif /* i386 || _i386 || _I386 || __i386 */
  281. #endif /* !USE_ZLIB && !CRC_TABLE_ONLY */