|
- ;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding
- ; *
- ; * inffas32.asm is derived from inffas86.c, with translation of assembly code
- ; *
- ; * Copyright (C) 1995-2003 Mark Adler
- ; * For conditions of distribution and use, see copyright notice in zlib.h
- ; *
- ; * Copyright (C) 2003 Chris Anderson <christop@charm.net>
- ; * Please use the copyright conditions above.
- ; *
- ; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
- ; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
- ; * the moment. I have successfully compiled and tested this code with gcc2.96,
- ; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
- ; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
- ; * enabled. I will attempt to merge the MMX code into this version. Newer
- ; * versions of this and inffast.S can be found at
- ; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
- ; *
- ; * 2005 : modification by Gilles Vollant
- ; */
- ; For Visual C++ 4.x and higher and ML 6.x and higher
- ; ml.exe is in directory \MASM611C of Win95 DDK
- ; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
- ; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
- ;
- ;
- ; compile with command line option
- ; ml /coff /Zi /c /Flinffas32.lst inffas32.asm
- ; if you define NO_GZIP (see inflate.h), compile with
- ; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm
- ; zlib1222sup is 0 for zlib 1.2.2.1 and lower
- ; zlib1222sup is 8 for zlib 1.2.2.2 and later (with the addition of dmax and head
- ; in inflate_state in inflate.h)
- zlib1222sup equ 8 ; extra bytes added to every inflate_state field offset defined further down
- ; Mode numbers stored into state->mode when the decoder leaves the loop.
- ; The short numbering (3 / 17) applies only when NO_GUNZIP is defined and
- ; GUNZIP is not; every other combination selects 11 / 26.
- ; NOTE(review): these values must match the inflate_mode enum of the zlib
- ; being linked -- confirm against inflate.h.
- IFDEF NO_GUNZIP
- IFNDEF GUNZIP
- INFLATE_MODE_TYPE equ 3
- INFLATE_MODE_BAD equ 17
- ELSE
- INFLATE_MODE_TYPE equ 11
- INFLATE_MODE_BAD equ 26
- ENDIF
- ELSE
- INFLATE_MODE_TYPE equ 11
- INFLATE_MODE_BAD equ 26
- ENDIF
- ; 75 "inffast.S"
- ;FILE "inffast.S"
- ;;;GLOBAL _inflate_fast
- ;;;SECTION .text
- .586p ; accept Pentium-class instructions (cpuid is used by the CPU probe)
- .mmx ; accept MMX mnemonics (movd/movq/psrlq/psllq/pand/por/emms are used below)
- name inflate_fast_x86
- .MODEL FLAT ; 32-bit flat memory model
- _DATA segment ; NOTE(review): no "_DATA ends" appears before _TEXT is opened -- confirm the assembler accepts this nesting
- inflate_fast_use_mmx: ; decode-path selector: 1 = CPU not probed yet, 2 = use the MMX loop, 3 = use the integer loop
- dd 1
- _TEXT segment
- ; NUL-terminated strings kept in the code segment, each on a 4-byte boundary.
- ; The three labelled ones are the error messages whose addresses the decoder
- ; stores through the stream pointer when it rejects the input.
- ALIGN 4
- db 'Fast decoding Code from Chris Anderson', 0 ; unlabelled credit string
- ALIGN 4
- invalid_literal_length_code_msg:
- db 'invalid literal/length code', 0
- ALIGN 4
- invalid_distance_code_msg:
- db 'invalid distance code', 0
- ALIGN 4
- invalid_distance_too_far_msg:
- db 'invalid distance too far back', 0
- ALIGN 4
- ; inflate_fast_mask[n] = (1 << n) - 1 for n = 0..32, i.e. a mask keeping the
- ; low n bits.  The MMX decode loop indexes it with the extra-bit count.
- inflate_fast_mask:
- dd 000000000h ; n = 0
- dd 000000001h ; n = 1
- dd 000000003h ; n = 2
- dd 000000007h ; n = 3
- dd 00000000Fh ; n = 4
- dd 00000001Fh ; n = 5
- dd 00000003Fh ; n = 6
- dd 00000007Fh ; n = 7
- dd 0000000FFh ; n = 8
- dd 0000001FFh ; n = 9
- dd 0000003FFh ; n = 10
- dd 0000007FFh ; n = 11
- dd 000000FFFh ; n = 12
- dd 000001FFFh ; n = 13
- dd 000003FFFh ; n = 14
- dd 000007FFFh ; n = 15
- dd 00000FFFFh ; n = 16
- dd 00001FFFFh ; n = 17
- dd 00003FFFFh ; n = 18
- dd 00007FFFFh ; n = 19
- dd 0000FFFFFh ; n = 20
- dd 0001FFFFFh ; n = 21
- dd 0003FFFFFh ; n = 22
- dd 0007FFFFFh ; n = 23
- dd 000FFFFFFh ; n = 24
- dd 001FFFFFFh ; n = 25
- dd 003FFFFFFh ; n = 26
- dd 007FFFFFFh ; n = 27
- dd 00FFFFFFFh ; n = 28
- dd 01FFFFFFFh ; n = 29
- dd 03FFFFFFFh ; n = 30
- dd 07FFFFFFFh ; n = 31
- dd 0FFFFFFFFh ; n = 32
- ; Byte offsets of the inflate_state fields this routine touches.  Every field
- ; after mode is shifted by zlib1222sup.
- ; NOTE(review): offsets must match the inflate_state layout of the zlib being
- ; linked -- confirm against inflate.h.
- mode_state equ 0 ; state->mode
- wsize_state equ (zlib1222sup+32) ; state->wsize
- write_state equ (zlib1222sup+40) ; state->write
- window_state equ (zlib1222sup+44) ; state->window
- hold_state equ (zlib1222sup+48) ; state->hold
- bits_state equ (zlib1222sup+52) ; state->bits
- lencode_state equ (zlib1222sup+68) ; state->lencode
- distcode_state equ (zlib1222sup+72) ; state->distcode
- lenbits_state equ (zlib1222sup+76) ; state->lenbits
- distbits_state equ (zlib1222sup+80) ; state->distbits
- ;;SECTION .text
- ; 205 "inffast.S"
- ;GLOBAL inflate_fast_use_mmx
- ;SECTION .data
- ; GLOBAL inflate_fast_use_mmx:object
- ;.size inflate_fast_use_mmx, 4
- ; 226 "inffast.S"
- ;SECTION .text
- ALIGN 4
- ;-----------------------------------------------------------------------
- ; _inflate_fast -- fast inflate decoding loop for 32-bit x86.
- ;
- ; Two dword stack arguments; returns with a plain "ret", so the caller
- ; removes them.  edi, esi, ebp, ebx and EFLAGS are saved and restored;
- ; eax, ecx, edx (and the MMX registers on the MMX path) are clobbered.
- ;
- ; Frame after the prologue (4 pushes + pushfd + 64 bytes of locals):
- ;   [esp+92] arg 2: "start"
- ;   [esp+88] arg 1: strm (stream pointer)
- ;   [esp+60] out     initial output pointer
- ;   [esp+56] window  state->window
- ;   [esp+52] wsize   state->wsize
- ;   [esp+48] write   state->write
- ;   [esp+44] in      input pointer (also used to park esi during copies)
- ;   [esp+40] beg     out - (start - avail_out)
- ;   [esp+28] buf     12-byte staging buffer for short input
- ;   [esp+24] len     current match length (integer loop only)
- ;   [esp+20] last    in + avail_in - 11: decode while in < last
- ;   [esp+16] end     out + avail_out - 257: decode while out < end
- ;   [esp+12] dcode   state->distcode
- ;   [esp+8]  lcode   state->lencode
- ;   [esp+4]  dmask   (1 << distbits) - 1
- ;   [esp+0]  lmask   (1 << lenbits) - 1
- ;
- ; Stream fields are accessed by raw offset: +0 next_in, +4 avail_in,
- ; +12 next_out, +16 avail_out, +24 msg, +28 state.
- ; NOTE(review): the field names are assumed from zlib's z_stream; confirm
- ; the offsets against the zlib.h being built.
- ;
- ; Registers in the integer loop: esi = in, edi = out, ebp = hold (bit
- ; buffer), bl = number of valid bits in hold.
- ; Registers in the MMX loop: esi = in, edi = out, mm0 = hold, ebp = bits,
- ; mm1 = shift still owed to mm0, ebx = lcode (or the distance while a match
- ; is copied), edx = len, mm3/mm4 = lmask, mm2/mm5 = dmask.
- ;
- ; Code table entries are dwords: al = op, ah = code bit count, upper 16 bits
- ; = value (literal byte, length base, distance base or sub-table index).
- ;-----------------------------------------------------------------------
- _inflate_fast proc near
- .FPO (16, 4, 0, 0, 1, 0)
- push edi
- push esi
- push ebp
- push ebx
- pushfd ; keep the caller's flags: cld below changes DF
- sub esp,64 ; room for the locals listed above
- cld ; lods/stos/movs must walk upwards
- mov esi, [esp+88] ; esi = strm
- mov edi, [esi+28] ; edi = strm->state
- mov edx, [esi+4] ; edx = avail_in
- mov eax, [esi+0] ; eax = next_in
- add edx,eax
- sub edx,11 ; edx = next_in + avail_in - 11
- mov [esp+44],eax ; in
- mov [esp+20],edx ; last
- mov ebp, [esp+92] ; ebp = start
- mov ecx, [esi+16] ; ecx = avail_out
- mov ebx, [esi+12] ; ebx = next_out
- sub ebp,ecx
- neg ebp
- add ebp,ebx ; ebp = next_out - (start - avail_out)
- sub ecx,257
- add ecx,ebx ; ecx = next_out + avail_out - 257
- mov [esp+60],ebx ; out
- mov [esp+40],ebp ; beg
- mov [esp+16],ecx ; end
- ; Cache the decoding tables and build the root-index masks.
- mov eax, [edi+lencode_state]
- mov ecx, [edi+distcode_state]
- mov [esp+8],eax ; lcode
- mov [esp+12],ecx ; dcode
- mov eax,1
- mov ecx, [edi+lenbits_state]
- shl eax,cl
- dec eax
- mov [esp+0],eax ; lmask = (1 << lenbits) - 1
- mov eax,1
- mov ecx, [edi+distbits_state]
- shl eax,cl
- dec eax
- mov [esp+4],eax ; dmask = (1 << distbits) - 1
- mov eax, [edi+wsize_state]
- mov ecx, [edi+write_state]
- mov edx, [edi+window_state]
- mov [esp+52],eax ; wsize
- mov [esp+48],ecx ; write
- mov [esp+56],edx ; window
- mov ebp, [edi+hold_state] ; ebp = bit buffer
- mov ebx, [edi+bits_state] ; ebx = bits held in it
- ; If last <= in (at most 11 input bytes), decode from a zero-padded 12-byte
- ; copy on the stack so the word/dword fetches never read past the input.
- mov esi, [esp+44]
- mov ecx, [esp+20]
- cmp ecx,esi
- ja L_align_long
- add ecx,11
- sub ecx,esi ; ecx = input bytes available
- mov eax,12
- sub eax,ecx ; eax = padding bytes needed
- lea edi, [esp+28]
- rep movsb ; copy the input into buf
- mov ecx,eax
- xor eax,eax
- rep stosb ; zero the rest of buf
- lea esi, [esp+28] ; read from buf
- mov [esp+20],esi ; last = buf: also marks "buffer in use" for the exit code
- jmp L_is_aligned
- ; Otherwise feed single bytes into hold until esi is 4-byte aligned.
- L_align_long:
- test esi,3
- jz L_is_aligned
- xor eax,eax
- mov al, [esi]
- inc esi
- mov ecx,ebx
- add ebx,8
- shl eax,cl
- or ebp,eax
- jmp L_align_long
- L_is_aligned:
- mov edi, [esp+60] ; edi = out (edi was scratch until here)
- ; Select the decode loop.  The selector is probed once: 2 = MMX loop,
- ; 3 = integer loop, anything lower = not probed yet.
- L_check_mmx:
- cmp dword ptr [inflate_fast_use_mmx],2
- je L_init_mmx
- ja L_do_loop
- push eax
- push ebx
- push ecx
- push edx
- pushfd
- mov eax, [esp]
- xor dword ptr [esp],0200000h ; try to flip EFLAGS.ID (bit 21)
- popfd
- pushfd
- pop edx
- xor edx,eax
- jz L_dont_use_mmx ; bit did not change: no cpuid
- xor eax,eax
- cpuid ; leaf 0: vendor string in ebx:edx:ecx
- cmp ebx,0756e6547h ; "Genu"
- jne L_dont_use_mmx
- cmp ecx,06c65746eh ; "ntel"
- jne L_dont_use_mmx
- cmp edx,049656e69h ; "ineI"
- jne L_dont_use_mmx
- mov eax,1
- cpuid ; leaf 1: signature in eax, features in edx
- shr eax,8
- and eax,15
- cmp eax,6 ; family 6 only
- jne L_dont_use_mmx
- test edx,0800000h ; feature bit 23 = MMX
- jnz L_use_mmx
- jmp L_dont_use_mmx
- L_use_mmx:
- mov dword ptr [inflate_fast_use_mmx],2
- jmp L_check_mmx_pop
- L_dont_use_mmx:
- mov dword ptr [inflate_fast_use_mmx],3
- L_check_mmx_pop:
- pop edx
- pop ecx
- pop ebx
- pop eax
- jmp L_check_mmx ; dispatch again with the selector now set
- ;----------------------------- integer loop -----------------------------
- ALIGN 4
- L_do_loop:
- cmp bl,15
- ja L_get_length_code
- xor eax,eax ; 15 bits or fewer: append 16 more
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- L_get_length_code:
- mov edx, [esp+0]
- mov ecx, [esp+8]
- and edx,ebp
- mov eax, [ecx+edx*4] ; eax = lcode[hold & lmask]
- L_dolen:
- mov cl,ah
- sub bl,ah
- shr ebp,cl ; drop the code's bits
- test al,al
- jnz L_test_for_length_base
- shr eax,16 ; op == 0: literal, value is the byte
- stosb
- L_while_test:
- cmp [esp+16],edi
- jbe L_break_loop ; out reached end
- cmp [esp+20],esi
- ja L_do_loop
- jmp L_break_loop ; in reached last
- L_test_for_length_base:
- mov edx,eax
- shr edx,16 ; edx = value field
- mov cl,al
- test al,16
- jz L_test_for_second_level_length
- and cl,15 ; cl = number of extra length bits
- jz L_save_len
- cmp bl,cl
- jae L_add_bits_to_len
- mov ch,cl ; not enough bits: refill, keeping cl in ch
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- mov cl,ch
- L_add_bits_to_len:
- mov eax,1
- shl eax,cl
- dec eax
- sub bl,cl
- and eax,ebp
- shr ebp,cl
- add edx,eax ; len = base + extra bits
- L_save_len:
- mov [esp+24],edx
- L_decode_distance:
- cmp bl,15
- ja L_get_distance_code
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- L_get_distance_code:
- mov edx, [esp+4]
- mov ecx, [esp+12]
- and edx,ebp
- mov eax, [ecx+edx*4] ; eax = dcode[hold & dmask]
- L_dodist:
- mov edx,eax
- shr edx,16 ; edx = value field
- mov cl,ah
- sub bl,ah
- shr ebp,cl ; drop the code's bits
- mov cl,al
- test al,16
- jz L_test_for_second_level_dist
- and cl,15 ; cl = number of extra distance bits
- jz L_check_dist_one
- cmp bl,cl
- jae L_add_bits_to_dist
- mov ch,cl
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- mov cl,ch
- L_add_bits_to_dist:
- mov eax,1
- shl eax,cl
- dec eax
- sub bl,cl
- and eax,ebp
- shr ebp,cl
- add edx,eax ; dist = base + extra bits
- L_check_window:
- mov [esp+44],esi ; park the input pointer, esi becomes the copy source
- mov eax,edi
- sub eax, [esp+40] ; eax = out - beg
- cmp eax,edx
- jb L_clip_window ; dist reaches back before beg
- mov ecx, [esp+24]
- mov esi,edi
- sub esi,edx ; esi = out - dist
- sub ecx,3
- mov al, [esi] ; first three bytes by hand, the rest with rep movsb
- mov [edi],al
- mov al, [esi+1]
- mov dl, [esi+2]
- add esi,3
- mov [edi+1],al
- mov [edi+2],dl
- add edi,3
- rep movsb
- mov esi, [esp+44]
- jmp L_while_test
- ALIGN 4
- L_check_dist_one:
- cmp edx,1
- jne L_check_window
- cmp [esp+40],edi
- je L_check_window ; out == beg: no previous output byte
- dec edi ; dist == 1: repeat the previous output byte len times
- mov ecx, [esp+24]
- mov al, [edi]
- sub ecx,3
- mov [edi+1],al
- mov [edi+2],al
- mov [edi+3],al
- add edi,4
- rep stosb
- jmp L_while_test
- ALIGN 4
- L_test_for_second_level_length:
- test al,64
- jnz L_test_for_end_of_block
- mov eax,1 ; sub-table: index = value + (hold & ((1 << op) - 1))
- shl eax,cl
- dec eax
- and eax,ebp
- add eax,edx
- mov edx, [esp+8]
- mov eax, [edx+eax*4]
- jmp L_dolen
- ALIGN 4
- L_test_for_second_level_dist:
- test al,64
- jnz L_invalid_distance_code
- mov eax,1
- shl eax,cl
- dec eax
- and eax,ebp
- add eax,edx
- mov edx, [esp+12]
- mov eax, [edx+eax*4]
- jmp L_dodist
- ALIGN 4
- L_clip_window:
- ; Part of the match lies in the sliding window.  On entry eax = out - beg
- ; and edx = dist.
- mov ecx,eax
- mov eax, [esp+52]
- neg ecx
- mov esi, [esp+56] ; esi = window
- cmp eax,edx
- jb L_invalid_distance_too_far ; dist > wsize
- add ecx,edx ; ecx = dist - (out - beg) = bytes to take from the window
- cmp dword ptr [esp+48],0
- jne L_wrap_around_window
- sub eax,ecx
- add esi,eax ; write == 0: source = window + wsize - ecx
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1 ; whole match comes from the window
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx ; remainder comes from the output at out - dist
- jmp L_do_copy1
- L_wrap_around_window:
- mov eax, [esp+48]
- cmp ecx,eax
- jbe L_contiguous_in_window
- add esi, [esp+52] ; source starts near the window end and wraps
- add esi,eax
- sub esi,ecx ; esi = window + wsize + write - ecx
- sub ecx,eax ; ecx = bytes before the wrap point
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1
- sub eax,ecx
- rep movsb
- mov esi, [esp+56] ; continue from the window start
- mov ecx, [esp+48]
- cmp eax,ecx
- jbe L_do_copy1
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
- jmp L_do_copy1
- L_contiguous_in_window:
- add esi,eax
- sub esi,ecx ; esi = window + write - ecx
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
- L_do_copy1:
- mov ecx,eax ; copy whatever is left of the match
- rep movsb
- mov esi, [esp+44]
- jmp L_while_test
- ;------------------------------- MMX loop -------------------------------
- ALIGN 4
- L_init_mmx:
- emms
- movd mm0,ebp ; mm0 = hold
- mov ebp,ebx ; ebp = bits
- movd mm4,dword ptr [esp+0]
- movq mm3,mm4 ; mm3 keeps lmask, mm4 is the working copy
- movd mm5,dword ptr [esp+4]
- movq mm2,mm5 ; mm2 keeps dmask, mm5 is the working copy
- pxor mm1,mm1 ; no shift owed yet
- mov ebx, [esp+8] ; ebx = lcode
- jmp L_do_loop_mmx
- ALIGN 4
- L_do_loop_mmx:
- psrlq mm0,mm1 ; apply the shift owed from the previous code
- cmp ebp,32
- ja L_get_length_code_mmx
- movd mm6,ebp ; 32 bits or fewer: append 32 more
- movd mm7,dword ptr [esi]
- add esi,4
- psllq mm7,mm6
- add ebp,32
- por mm0,mm7
- L_get_length_code_mmx:
- pand mm4,mm0
- movd eax,mm4
- movq mm4,mm3
- mov eax, [ebx+eax*4] ; eax = lcode[hold & lmask]
- L_dolen_mmx:
- movzx ecx,ah
- movd mm1,ecx ; owe the code's bit count
- sub ebp,ecx
- test al,al
- jnz L_test_for_length_base_mmx
- shr eax,16 ; op == 0: literal
- stosb
- L_while_test_mmx:
- cmp [esp+16],edi
- jbe L_break_loop
- cmp [esp+20],esi
- ja L_do_loop_mmx
- jmp L_break_loop
- L_test_for_length_base_mmx:
- mov edx,eax
- shr edx,16 ; edx = length base
- test al,16
- jz L_test_for_second_level_length_mmx
- and eax,15 ; eax = number of extra length bits
- jz L_decode_distance_mmx
- psrlq mm0,mm1
- movd mm1,eax ; now owe the extra bits instead
- movd ecx,mm0
- sub ebp,eax
- and ecx, [inflate_fast_mask+eax*4]
- add edx,ecx ; edx = len
- L_decode_distance_mmx:
- psrlq mm0,mm1
- cmp ebp,32
- ja L_get_dist_code_mmx
- movd mm6,ebp
- movd mm7,dword ptr [esi]
- add esi,4
- psllq mm7,mm6
- add ebp,32
- por mm0,mm7
- L_get_dist_code_mmx:
- mov ebx, [esp+12]
- pand mm5,mm0
- movd eax,mm5
- movq mm5,mm2
- mov eax, [ebx+eax*4] ; eax = dcode[hold & dmask]
- L_dodist_mmx:
- movzx ecx,ah
- mov ebx,eax
- shr ebx,16 ; ebx = distance base
- sub ebp,ecx
- movd mm1,ecx
- test al,16
- jz L_test_for_second_level_dist_mmx
- and eax,15 ; eax = number of extra distance bits
- jz L_check_dist_one_mmx
- L_add_bits_to_dist_mmx:
- psrlq mm0,mm1
- movd mm1,eax
- movd ecx,mm0
- sub ebp,eax
- and ecx, [inflate_fast_mask+eax*4]
- add ebx,ecx ; ebx = dist
- L_check_window_mmx:
- mov [esp+44],esi ; park the input pointer
- mov eax,edi
- sub eax, [esp+40] ; eax = out - beg
- cmp eax,ebx
- jb L_clip_window_mmx
- mov ecx,edx
- mov esi,edi
- sub esi,ebx ; esi = out - dist
- sub ecx,3
- mov al, [esi]
- mov [edi],al
- mov al, [esi+1]
- mov dl, [esi+2]
- add esi,3
- mov [edi+1],al
- mov [edi+2],dl
- add edi,3
- rep movsb
- mov esi, [esp+44]
- mov ebx, [esp+8] ; ebx = lcode again
- jmp L_while_test_mmx
- ALIGN 4
- L_check_dist_one_mmx:
- cmp ebx,1
- jne L_check_window_mmx
- cmp [esp+40],edi
- je L_check_window_mmx
- dec edi ; dist == 1: repeat the previous output byte
- mov ecx,edx
- mov al, [edi]
- sub ecx,3
- mov [edi+1],al
- mov [edi+2],al
- mov [edi+3],al
- add edi,4
- rep stosb
- mov ebx, [esp+8]
- jmp L_while_test_mmx
- ALIGN 4
- L_test_for_second_level_length_mmx:
- test al,64
- jnz L_test_for_end_of_block
- and eax,15
- psrlq mm0,mm1
- movd ecx,mm0
- and ecx, [inflate_fast_mask+eax*4]
- add ecx,edx
- mov eax, [ebx+ecx*4]
- jmp L_dolen_mmx
- ALIGN 4
- L_test_for_second_level_dist_mmx:
- test al,64
- jnz L_invalid_distance_code
- and eax,15
- psrlq mm0,mm1
- movd ecx,mm0
- and ecx, [inflate_fast_mask+eax*4]
- mov eax, [esp+12]
- add ecx,ebx
- mov eax, [eax+ecx*4]
- jmp L_dodist_mmx
- ALIGN 4
- L_clip_window_mmx:
- ; Same window handling as L_clip_window, with dist in ebx and len in edx.
- mov ecx,eax
- mov eax, [esp+52]
- neg ecx
- mov esi, [esp+56]
- cmp eax,ebx
- jb L_invalid_distance_too_far
- add ecx,ebx ; ecx = bytes to take from the window
- cmp dword ptr [esp+48],0
- jne L_wrap_around_window_mmx
- sub eax,ecx
- add esi,eax
- cmp edx,ecx
- jbe L_do_copy1_mmx
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- jmp L_do_copy1_mmx
- L_wrap_around_window_mmx:
- mov eax, [esp+48]
- cmp ecx,eax
- jbe L_contiguous_in_window_mmx
- add esi, [esp+52]
- add esi,eax
- sub esi,ecx
- sub ecx,eax
- cmp edx,ecx
- jbe L_do_copy1_mmx
- sub edx,ecx
- rep movsb
- mov esi, [esp+56]
- mov ecx, [esp+48]
- cmp edx,ecx
- jbe L_do_copy1_mmx
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- jmp L_do_copy1_mmx
- L_contiguous_in_window_mmx:
- add esi,eax
- sub esi,ecx
- cmp edx,ecx
- jbe L_do_copy1_mmx
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- L_do_copy1_mmx:
- mov ecx,edx
- rep movsb
- mov esi, [esp+44]
- mov ebx, [esp+8]
- jmp L_while_test_mmx
- ;------------------------- loop exits (both paths) ----------------------
- ; Each exit sets ecx = message pointer (0 = none) and edx = new mode.
- L_invalid_distance_code:
- mov ecx, invalid_distance_code_msg
- mov edx,INFLATE_MODE_BAD
- jmp L_update_stream_state
- L_test_for_end_of_block:
- test al,32
- jz L_invalid_literal_length_code
- xor ecx,ecx ; end of block: no message
- mov edx,INFLATE_MODE_TYPE
- jmp L_update_stream_state
- L_invalid_literal_length_code:
- mov ecx, invalid_literal_length_code_msg
- mov edx,INFLATE_MODE_BAD
- jmp L_update_stream_state
- L_invalid_distance_too_far:
- mov esi, [esp+44] ; esi was the copy source: restore the input pointer
- mov ecx, invalid_distance_too_far_msg
- mov edx,INFLATE_MODE_BAD
- L_update_stream_state:
- mov eax, [esp+88]
- test ecx,ecx
- jz L_skip_msg
- mov [eax+24],ecx ; strm->msg
- L_skip_msg:
- mov eax, [eax+28]
- mov [eax+mode_state],edx ; state->mode
- jmp L_break_loop
- ALIGN 4
- L_break_loop:
- cmp dword ptr [inflate_fast_use_mmx],2
- jne L_update_next_in
- mov ebx,ebp ; MMX path keeps the bit count in ebp
- L_update_next_in:
- mov eax, [esp+88] ; eax = strm
- mov ecx,ebx
- mov edx, [eax+28] ; edx = state
- shr ecx,3
- sub esi,ecx ; hand whole unused bytes back to the input
- shl ecx,3
- sub ebx,ecx ; keep only the bits of the partial byte
- mov [eax+12],edi ; next_out
- mov [edx+bits_state],ebx
- mov ecx,ebx
- lea ebx, [esp+28]
- cmp [esp+20],ebx
- jne L_buf_not_used
- sub esi,ebx ; staging buffer was used: map esi back onto the real input
- mov ebx, [eax+0]
- mov [esp+20],ebx
- add esi,ebx
- mov ebx, [eax+4]
- sub ebx,11
- add [esp+20],ebx ; last = next_in + avail_in - 11
- L_buf_not_used:
- mov [eax+0],esi ; next_in
- mov ebx,1
- shl ebx,cl
- dec ebx ; ebx = (1 << bits) - 1
- cmp dword ptr [inflate_fast_use_mmx],2
- jne L_update_hold
- psrlq mm0,mm1 ; apply the shift still owed
- movd ebp,mm0
- emms
- L_update_hold:
- and ebp,ebx
- mov [edx+hold_state],ebp
- ; avail_in = last - in + 11, computed without going through a negative value.
- mov ebx, [esp+20]
- cmp ebx,esi
- jbe L_last_is_smaller
- sub ebx,esi
- add ebx,11
- mov [eax+4],ebx
- jmp L_fixup_out
- L_last_is_smaller:
- sub esi,ebx
- neg esi
- add esi,11
- mov [eax+4],esi
- L_fixup_out:
- ; avail_out = end - out + 257, same treatment.
- mov ebx, [esp+16]
- cmp ebx,edi
- jbe L_end_is_smaller
- sub ebx,edi
- add ebx,257
- mov [eax+16],ebx
- jmp L_done
- L_end_is_smaller:
- sub edi,ebx
- neg edi
- add edi,257
- mov [eax+16],edi
- L_done:
- add esp,64
- popfd
- pop ebx
- pop ebp
- pop esi
- pop edi
- ret
- _inflate_fast endp
- _TEXT ends ; close the code segment opened ahead of the message strings
- end ; end of the assembly module
|