amd64-match.S 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. /*
  2. * match.S -- optimized version of longest_match()
  3. * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998
  4. *
  5. * This is free software; you can redistribute it and/or modify it
  6. * under the terms of the BSD License. Use by owners of Che Guevarra
  7. * parafernalia is prohibited, where possible, and highly discouraged
  8. * elsewhere.
  9. */
  10. #ifndef NO_UNDERLINE
  11. # define match_init _match_init
  12. # define longest_match _longest_match
  13. #endif
  14. #define scanend ebx
  15. #define scanendw bx
  16. #define chainlenwmask edx /* high word: current chain len low word: s->wmask */
  17. #define curmatch rsi
  18. #define curmatchd esi
  19. #define windowbestlen r8
  20. #define scanalign r9
  21. #define scanalignd r9d
  22. #define window r10
  23. #define bestlen r11
  24. #define bestlend r11d
  25. #define scanstart r12d
  26. #define scanstartw r12w
  27. #define scan r13
  28. #define nicematch r14d
  29. #define limit r15
  30. #define limitd r15d
  31. #define prev rcx
  32. /*
  33. * The 258 is a "magic number, not a parameter -- changing it
  34. * breaks the hell loose
  35. */
  36. #define MAX_MATCH (258)
  37. #define MIN_MATCH (3)
  38. #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1)
  39. #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7)
  40. /* stack frame offsets */
  41. #define LocalVarsSize (112)
  42. #define _chainlenwmask ( 8-LocalVarsSize)(%rsp)
  43. #define _windowbestlen (16-LocalVarsSize)(%rsp)
  44. #define save_r14 (24-LocalVarsSize)(%rsp)
  45. #define save_rsi (32-LocalVarsSize)(%rsp)
  46. #define save_rbx (40-LocalVarsSize)(%rsp)
  47. #define save_r12 (56-LocalVarsSize)(%rsp)
  48. #define save_r13 (64-LocalVarsSize)(%rsp)
  49. #define save_r15 (80-LocalVarsSize)(%rsp)
  50. .globl match_init, longest_match
  51. /*
  52. * On AMD64 the first argument of a function (in our case -- the pointer to
  53. * deflate_state structure) is passed in %rdi, hence our offsets below are
  54. * all off of that.
  55. */
  56. /* you can check the structure offset by running
  57. #include <stdlib.h>
  58. #include <stdio.h>
  59. #include "deflate.h"
  60. void print_depl()
  61. {
  62. deflate_state ds;
  63. deflate_state *s=&ds;
  64. printf("size pointer=%u\n",(int)sizeof(void*));
  65. printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s)));
  66. printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s)));
  67. printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s)));
  68. printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s)));
  69. printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s)));
  70. printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s)));
  71. printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s)));
  72. printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s)));
  73. printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s)));
  74. printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s)));
  75. printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
  76. printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s)));
  77. printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s)));
  78. }
  79. */
  80. /*
  81. to compile for XCode 3.2 on MacOSX x86_64
  82. - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S"
  83. */
  84. #ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE
  85. #define dsWSize ( 68)(%rdi)
  86. #define dsWMask ( 76)(%rdi)
  87. #define dsWindow ( 80)(%rdi)
  88. #define dsPrev ( 96)(%rdi)
  89. #define dsMatchLen (144)(%rdi)
  90. #define dsPrevMatch (148)(%rdi)
  91. #define dsStrStart (156)(%rdi)
  92. #define dsMatchStart (160)(%rdi)
  93. #define dsLookahead (164)(%rdi)
  94. #define dsPrevLen (168)(%rdi)
  95. #define dsMaxChainLen (172)(%rdi)
  96. #define dsGoodMatch (188)(%rdi)
  97. #define dsNiceMatch (192)(%rdi)
  98. #else
  99. #ifndef STRUCT_OFFSET
  100. # define STRUCT_OFFSET (0)
  101. #endif
  102. #define dsWSize ( 56 + STRUCT_OFFSET)(%rdi)
  103. #define dsWMask ( 64 + STRUCT_OFFSET)(%rdi)
  104. #define dsWindow ( 72 + STRUCT_OFFSET)(%rdi)
  105. #define dsPrev ( 88 + STRUCT_OFFSET)(%rdi)
  106. #define dsMatchLen (136 + STRUCT_OFFSET)(%rdi)
  107. #define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi)
  108. #define dsStrStart (148 + STRUCT_OFFSET)(%rdi)
  109. #define dsMatchStart (152 + STRUCT_OFFSET)(%rdi)
  110. #define dsLookahead (156 + STRUCT_OFFSET)(%rdi)
  111. #define dsPrevLen (160 + STRUCT_OFFSET)(%rdi)
  112. #define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi)
  113. #define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi)
  114. #define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi)
  115. #endif
  116. .text
  117. /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */
  118. longest_match:
  119. /*
  120. * Retrieve the function arguments. %curmatch will hold cur_match
  121. * throughout the entire function (passed via rsi on amd64).
  122. * rdi will hold the pointer to the deflate_state (first arg on amd64)
  123. */
  124. mov %rsi, save_rsi
  125. mov %rbx, save_rbx
  126. mov %r12, save_r12
  127. mov %r13, save_r13
  128. mov %r14, save_r14
  129. mov %r15, save_r15
  130. /* uInt wmask = s->w_mask; */
  131. /* unsigned chain_length = s->max_chain_length; */
  132. /* if (s->prev_length >= s->good_match) { */
  133. /* chain_length >>= 2; */
  134. /* } */
  135. movl dsPrevLen, %eax
  136. movl dsGoodMatch, %ebx
  137. cmpl %ebx, %eax
  138. movl dsWMask, %eax
  139. movl dsMaxChainLen, %chainlenwmask
  140. jl LastMatchGood
  141. shrl $2, %chainlenwmask
  142. LastMatchGood:
  143. /* chainlen is decremented once beforehand so that the function can */
  144. /* use the sign flag instead of the zero flag for the exit test. */
  145. /* It is then shifted into the high word, to make room for the wmask */
  146. /* value, which it will always accompany. */
  147. decl %chainlenwmask
  148. shll $16, %chainlenwmask
  149. orl %eax, %chainlenwmask
  150. /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */
  151. movl dsNiceMatch, %eax
  152. movl dsLookahead, %ebx
  153. cmpl %eax, %ebx
  154. jl LookaheadLess
  155. movl %eax, %ebx
  156. LookaheadLess: movl %ebx, %nicematch
  157. /* register Bytef *scan = s->window + s->strstart; */
  158. mov dsWindow, %window
  159. movl dsStrStart, %limitd
  160. lea (%limit, %window), %scan
  161. /* Determine how many bytes the scan ptr is off from being */
  162. /* dword-aligned. */
  163. mov %scan, %scanalign
  164. negl %scanalignd
  165. andl $3, %scanalignd
  166. /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
  167. /* s->strstart - (IPos)MAX_DIST(s) : NIL; */
  168. movl dsWSize, %eax
  169. subl $MIN_LOOKAHEAD, %eax
  170. xorl %ecx, %ecx
  171. subl %eax, %limitd
  172. cmovng %ecx, %limitd
  173. /* int best_len = s->prev_length; */
  174. movl dsPrevLen, %bestlend
  175. /* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */
  176. lea (%window, %bestlen), %windowbestlen
  177. mov %windowbestlen, _windowbestlen
  178. /* register ush scan_start = *(ushf*)scan; */
  179. /* register ush scan_end = *(ushf*)(scan+best_len-1); */
  180. /* Posf *prev = s->prev; */
  181. movzwl (%scan), %scanstart
  182. movzwl -1(%scan, %bestlen), %scanend
  183. mov dsPrev, %prev
  184. /* Jump into the main loop. */
  185. movl %chainlenwmask, _chainlenwmask
  186. jmp LoopEntry
  187. .balign 16
  188. /* do {
  189. * match = s->window + cur_match;
  190. * if (*(ushf*)(match+best_len-1) != scan_end ||
  191. * *(ushf*)match != scan_start) continue;
  192. * [...]
  193. * } while ((cur_match = prev[cur_match & wmask]) > limit
  194. * && --chain_length != 0);
  195. *
  196. * Here is the inner loop of the function. The function will spend the
  197. * majority of its time in this loop, and majority of that time will
  198. * be spent in the first ten instructions.
  199. */
  200. LookupLoop:
  201. andl %chainlenwmask, %curmatchd
  202. movzwl (%prev, %curmatch, 2), %curmatchd
  203. cmpl %limitd, %curmatchd
  204. jbe LeaveNow
  205. subl $0x00010000, %chainlenwmask
  206. js LeaveNow
  207. LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw
  208. jne LookupLoop
  209. cmpw %scanstartw, (%window, %curmatch)
  210. jne LookupLoop
  211. /* Store the current value of chainlen. */
  212. movl %chainlenwmask, _chainlenwmask
  213. /* %scan is the string under scrutiny, and %prev to the string we */
  214. /* are hoping to match it up with. In actuality, %esi and %edi are */
  215. /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
  216. /* initialized to -(MAX_MATCH_8 - scanalign). */
  217. mov $(-MAX_MATCH_8), %rdx
  218. lea (%curmatch, %window), %windowbestlen
  219. lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen
  220. lea MAX_MATCH_8(%scan, %scanalign), %prev
  221. /* the prefetching below makes very little difference... */
  222. prefetcht1 (%windowbestlen, %rdx)
  223. prefetcht1 (%prev, %rdx)
  224. /*
  225. * Test the strings for equality, 8 bytes at a time. At the end,
  226. * adjust %rdx so that it is offset to the exact byte that mismatched.
  227. *
  228. * It should be confessed that this loop usually does not represent
  229. * much of the total running time. Replacing it with a more
  230. * straightforward "rep cmpsb" would not drastically degrade
  231. * performance -- unrolling it, for example, makes no difference.
  232. */
  233. #undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */
  234. LoopCmps:
  235. #ifdef USE_SSE
  236. /* Preload the SSE registers */
  237. movdqu (%windowbestlen, %rdx), %xmm1
  238. movdqu (%prev, %rdx), %xmm2
  239. pcmpeqb %xmm2, %xmm1
  240. movdqu 16(%windowbestlen, %rdx), %xmm3
  241. movdqu 16(%prev, %rdx), %xmm4
  242. pcmpeqb %xmm4, %xmm3
  243. movdqu 32(%windowbestlen, %rdx), %xmm5
  244. movdqu 32(%prev, %rdx), %xmm6
  245. pcmpeqb %xmm6, %xmm5
  246. movdqu 48(%windowbestlen, %rdx), %xmm7
  247. movdqu 48(%prev, %rdx), %xmm8
  248. pcmpeqb %xmm8, %xmm7
  249. /* Check the comparisions' results */
  250. pmovmskb %xmm1, %rax
  251. notw %ax
  252. bsfw %ax, %ax
  253. jnz LeaveLoopCmps
  254. /* this is the only iteration of the loop with a possibility of having
  255. incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40
  256. and (0x40*4)+8=0x108 */
  257. add $8, %rdx
  258. jz LenMaximum
  259. add $8, %rdx
  260. pmovmskb %xmm3, %rax
  261. notw %ax
  262. bsfw %ax, %ax
  263. jnz LeaveLoopCmps
  264. add $16, %rdx
  265. pmovmskb %xmm5, %rax
  266. notw %ax
  267. bsfw %ax, %ax
  268. jnz LeaveLoopCmps
  269. add $16, %rdx
  270. pmovmskb %xmm7, %rax
  271. notw %ax
  272. bsfw %ax, %ax
  273. jnz LeaveLoopCmps
  274. add $16, %rdx
  275. jmp LoopCmps
  276. LeaveLoopCmps: add %rax, %rdx
  277. #else
  278. mov (%windowbestlen, %rdx), %rax
  279. xor (%prev, %rdx), %rax
  280. jnz LeaveLoopCmps
  281. mov 8(%windowbestlen, %rdx), %rax
  282. xor 8(%prev, %rdx), %rax
  283. jnz LeaveLoopCmps8
  284. mov 16(%windowbestlen, %rdx), %rax
  285. xor 16(%prev, %rdx), %rax
  286. jnz LeaveLoopCmps16
  287. add $24, %rdx
  288. jnz LoopCmps
  289. jmp LenMaximum
  290. # if 0
  291. /*
  292. * This three-liner is tantalizingly simple, but bsf is a slow instruction,
  293. * and the complicated alternative down below is quite a bit faster. Sad...
  294. */
  295. LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */
  296. shrl $3, %eax /* divide by 8 to get the byte */
  297. add %rax, %rdx
  298. # else
  299. LeaveLoopCmps16:
  300. add $8, %rdx
  301. LeaveLoopCmps8:
  302. add $8, %rdx
  303. LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */
  304. jnz Check16
  305. add $4, %rdx
  306. shr $32, %rax
  307. Check16: testw $0xFFFF, %ax
  308. jnz LenLower
  309. add $2, %rdx
  310. shrl $16, %eax
  311. LenLower: subb $1, %al
  312. adc $0, %rdx
  313. # endif
  314. #endif
  315. /* Calculate the length of the match. If it is longer than MAX_MATCH, */
  316. /* then automatically accept it as the best possible match and leave. */
  317. lea (%prev, %rdx), %rax
  318. sub %scan, %rax
  319. cmpl $MAX_MATCH, %eax
  320. jge LenMaximum
  321. /* If the length of the match is not longer than the best match we */
  322. /* have so far, then forget it and return to the lookup loop. */
  323. cmpl %bestlend, %eax
  324. jg LongerMatch
  325. mov _windowbestlen, %windowbestlen
  326. mov dsPrev, %prev
  327. movl _chainlenwmask, %edx
  328. jmp LookupLoop
  329. /* s->match_start = cur_match; */
  330. /* best_len = len; */
  331. /* if (len >= nice_match) break; */
  332. /* scan_end = *(ushf*)(scan+best_len-1); */
  333. LongerMatch:
  334. movl %eax, %bestlend
  335. movl %curmatchd, dsMatchStart
  336. cmpl %nicematch, %eax
  337. jge LeaveNow
  338. lea (%window, %bestlen), %windowbestlen
  339. mov %windowbestlen, _windowbestlen
  340. movzwl -1(%scan, %rax), %scanend
  341. mov dsPrev, %prev
  342. movl _chainlenwmask, %chainlenwmask
  343. jmp LookupLoop
  344. /* Accept the current string, with the maximum possible length. */
  345. LenMaximum:
  346. movl $MAX_MATCH, %bestlend
  347. movl %curmatchd, dsMatchStart
  348. /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
  349. /* return s->lookahead; */
  350. LeaveNow:
  351. movl dsLookahead, %eax
  352. cmpl %eax, %bestlend
  353. cmovngl %bestlend, %eax
  354. LookaheadRet:
  355. /* Restore the registers and return from whence we came. */
  356. mov save_rsi, %rsi
  357. mov save_rbx, %rbx
  358. mov save_r12, %r12
  359. mov save_r13, %r13
  360. mov save_r14, %r14
  361. mov save_r15, %r15
  362. ret
  363. match_init: ret