match686.asm 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. ; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86
  2. ; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
  3. ; File written by Gilles Vollant, by converting match686.S from Brian Raiter
  4. ; for MASM. This is as assembly version of longest_match
  5. ; from Jean-loup Gailly in deflate.c
  6. ;
  7. ; http://www.zlib.net
  8. ; http://www.winimage.com/zLibDll
  9. ; http://www.muppetlabs.com/~breadbox/software/assembly.html
  10. ;
  11. ; For Visual C++ 4.x and higher and ML 6.x and higher
  12. ; ml.exe is distributed in
  13. ; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64
  14. ;
  15. ; this file contain two implementation of longest_match
  16. ;
  17. ; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro
  18. ; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom)
  19. ;
  20. ; for using an assembly version of longest_match, you need define ASMV in project
  21. ;
  22. ; compile the asm file running
  23. ; ml /coff /Zi /c /Flmatch686.lst match686.asm
  24. ; and do not include match686.obj in your project
  25. ;
  26. ; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for
  27. ; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor
  28. ; with autoselect (with cpu detection code)
  29. ; if you want support the old pentium optimization, you can still use these version
  30. ;
  31. ; this file is not optimized for old pentium, but it compatible with all x86 32 bits
  32. ; processor (starting 80386)
  33. ;
  34. ;
  35. ; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
  36. ;uInt longest_match(s, cur_match)
  37. ; deflate_state *s;
  38. ; IPos cur_match; /* current match */
  39. NbStack equ 76
  40. cur_match equ dword ptr[esp+NbStack-0]
  41. str_s equ dword ptr[esp+NbStack-4]
  42. ; 5 dword on top (ret,ebp,esi,edi,ebx)
  43. adrret equ dword ptr[esp+NbStack-8]
  44. pushebp equ dword ptr[esp+NbStack-12]
  45. pushedi equ dword ptr[esp+NbStack-16]
  46. pushesi equ dword ptr[esp+NbStack-20]
  47. pushebx equ dword ptr[esp+NbStack-24]
  48. chain_length equ dword ptr [esp+NbStack-28]
  49. limit equ dword ptr [esp+NbStack-32]
  50. best_len equ dword ptr [esp+NbStack-36]
  51. window equ dword ptr [esp+NbStack-40]
  52. prev equ dword ptr [esp+NbStack-44]
  53. scan_start equ word ptr [esp+NbStack-48]
  54. wmask equ dword ptr [esp+NbStack-52]
  55. match_start_ptr equ dword ptr [esp+NbStack-56]
  56. nice_match equ dword ptr [esp+NbStack-60]
  57. scan equ dword ptr [esp+NbStack-64]
  58. windowlen equ dword ptr [esp+NbStack-68]
  59. match_start equ dword ptr [esp+NbStack-72]
  60. strend equ dword ptr [esp+NbStack-76]
  61. NbStackAdd equ (NbStack-24)
  62. .386p
  63. name gvmatch
  64. .MODEL FLAT
  65. ; all the +zlib1222add offsets are due to the addition of fields
  66. ; in zlib in the deflate_state structure since the asm code was first written
  67. ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
  68. ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
  69. ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
  70. zlib1222add equ 8
  71. ; Note : these value are good with a 8 bytes boundary pack structure
  72. dep_chain_length equ 74h+zlib1222add
  73. dep_window equ 30h+zlib1222add
  74. dep_strstart equ 64h+zlib1222add
  75. dep_prev_length equ 70h+zlib1222add
  76. dep_nice_match equ 88h+zlib1222add
  77. dep_w_size equ 24h+zlib1222add
  78. dep_prev equ 38h+zlib1222add
  79. dep_w_mask equ 2ch+zlib1222add
  80. dep_good_match equ 84h+zlib1222add
  81. dep_match_start equ 68h+zlib1222add
  82. dep_lookahead equ 6ch+zlib1222add
  83. _TEXT segment
  84. IFDEF NOUNDERLINE
  85. public longest_match
  86. public match_init
  87. ELSE
  88. public _longest_match
  89. public _match_init
  90. ENDIF
  91. MAX_MATCH equ 258
  92. MIN_MATCH equ 3
  93. MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1)
  94. MAX_MATCH equ 258
  95. MIN_MATCH equ 3
  96. MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1)
  97. MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h)
  98. ;;; stack frame offsets
  99. chainlenwmask equ esp + 0 ; high word: current chain len
  100. ; low word: s->wmask
  101. window equ esp + 4 ; local copy of s->window
  102. windowbestlen equ esp + 8 ; s->window + bestlen
  103. scanstart equ esp + 16 ; first two bytes of string
  104. scanend equ esp + 12 ; last two bytes of string
  105. scanalign equ esp + 20 ; dword-misalignment of string
  106. nicematch equ esp + 24 ; a good enough match size
  107. bestlen equ esp + 28 ; size of best match so far
  108. scan equ esp + 32 ; ptr to string wanting match
  109. LocalVarsSize equ 36
  110. ; saved ebx byte esp + 36
  111. ; saved edi byte esp + 40
  112. ; saved esi byte esp + 44
  113. ; saved ebp byte esp + 48
  114. ; return address byte esp + 52
  115. deflatestate equ esp + 56 ; the function arguments
  116. curmatch equ esp + 60
  117. ;;; Offsets for fields in the deflate_state structure. These numbers
  118. ;;; are calculated from the definition of deflate_state, with the
  119. ;;; assumption that the compiler will dword-align the fields. (Thus,
  120. ;;; changing the definition of deflate_state could easily cause this
  121. ;;; program to crash horribly, without so much as a warning at
  122. ;;; compile time. Sigh.)
  123. dsWSize equ 36+zlib1222add
  124. dsWMask equ 44+zlib1222add
  125. dsWindow equ 48+zlib1222add
  126. dsPrev equ 56+zlib1222add
  127. dsMatchLen equ 88+zlib1222add
  128. dsPrevMatch equ 92+zlib1222add
  129. dsStrStart equ 100+zlib1222add
  130. dsMatchStart equ 104+zlib1222add
  131. dsLookahead equ 108+zlib1222add
  132. dsPrevLen equ 112+zlib1222add
  133. dsMaxChainLen equ 116+zlib1222add
  134. dsGoodMatch equ 132+zlib1222add
  135. dsNiceMatch equ 136+zlib1222add
  136. ;;; match686.asm -- Pentium-Pro-optimized version of longest_match()
  137. ;;; Written for zlib 1.1.2
  138. ;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
  139. ;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
  140. ;;;
  141. ;;
  142. ;; This software is provided 'as-is', without any express or implied
  143. ;; warranty. In no event will the authors be held liable for any damages
  144. ;; arising from the use of this software.
  145. ;;
  146. ;; Permission is granted to anyone to use this software for any purpose,
  147. ;; including commercial applications, and to alter it and redistribute it
  148. ;; freely, subject to the following restrictions:
  149. ;;
  150. ;; 1. The origin of this software must not be misrepresented; you must not
  151. ;; claim that you wrote the original software. If you use this software
  152. ;; in a product, an acknowledgment in the product documentation would be
  153. ;; appreciated but is not required.
  154. ;; 2. Altered source versions must be plainly marked as such, and must not be
  155. ;; misrepresented as being the original software
  156. ;; 3. This notice may not be removed or altered from any source distribution.
  157. ;;
  158. ;GLOBAL _longest_match, _match_init
  159. ;SECTION .text
  160. ;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
  161. ;_longest_match:
  162. IFDEF NOUNDERLINE
  163. longest_match proc near
  164. ELSE
  165. _longest_match proc near
  166. ENDIF
  167. ;;; Save registers that the compiler may be using, and adjust esp to
  168. ;;; make room for our stack frame.
  169. push ebp
  170. push edi
  171. push esi
  172. push ebx
  173. sub esp, LocalVarsSize
  174. ;;; Retrieve the function arguments. ecx will hold cur_match
  175. ;;; throughout the entire function. edx will hold the pointer to the
  176. ;;; deflate_state structure during the function's setup (before
  177. ;;; entering the main loop.
  178. mov edx, [deflatestate]
  179. mov ecx, [curmatch]
  180. ;;; uInt wmask = s->w_mask;
  181. ;;; unsigned chain_length = s->max_chain_length;
  182. ;;; if (s->prev_length >= s->good_match) {
  183. ;;; chain_length >>= 2;
  184. ;;; }
  185. mov eax, [edx + dsPrevLen]
  186. mov ebx, [edx + dsGoodMatch]
  187. cmp eax, ebx
  188. mov eax, [edx + dsWMask]
  189. mov ebx, [edx + dsMaxChainLen]
  190. jl LastMatchGood
  191. shr ebx, 2
  192. LastMatchGood:
  193. ;;; chainlen is decremented once beforehand so that the function can
  194. ;;; use the sign flag instead of the zero flag for the exit test.
  195. ;;; It is then shifted into the high word, to make room for the wmask
  196. ;;; value, which it will always accompany.
  197. dec ebx
  198. shl ebx, 16
  199. or ebx, eax
  200. mov [chainlenwmask], ebx
  201. ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
  202. mov eax, [edx + dsNiceMatch]
  203. mov ebx, [edx + dsLookahead]
  204. cmp ebx, eax
  205. jl LookaheadLess
  206. mov ebx, eax
  207. LookaheadLess: mov [nicematch], ebx
  208. ;;; register Bytef *scan = s->window + s->strstart;
  209. mov esi, [edx + dsWindow]
  210. mov [window], esi
  211. mov ebp, [edx + dsStrStart]
  212. lea edi, [esi + ebp]
  213. mov [scan], edi
  214. ;;; Determine how many bytes the scan ptr is off from being
  215. ;;; dword-aligned.
  216. mov eax, edi
  217. neg eax
  218. and eax, 3
  219. mov [scanalign], eax
  220. ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
  221. ;;; s->strstart - (IPos)MAX_DIST(s) : NIL;
  222. mov eax, [edx + dsWSize]
  223. sub eax, MIN_LOOKAHEAD
  224. sub ebp, eax
  225. jg LimitPositive
  226. xor ebp, ebp
  227. LimitPositive:
  228. ;;; int best_len = s->prev_length;
  229. mov eax, [edx + dsPrevLen]
  230. mov [bestlen], eax
  231. ;;; Store the sum of s->window + best_len in esi locally, and in esi.
  232. add esi, eax
  233. mov [windowbestlen], esi
  234. ;;; register ush scan_start = *(ushf*)scan;
  235. ;;; register ush scan_end = *(ushf*)(scan+best_len-1);
  236. ;;; Posf *prev = s->prev;
  237. movzx ebx, word ptr [edi]
  238. mov [scanstart], ebx
  239. movzx ebx, word ptr [edi + eax - 1]
  240. mov [scanend], ebx
  241. mov edi, [edx + dsPrev]
  242. ;;; Jump into the main loop.
  243. mov edx, [chainlenwmask]
  244. jmp short LoopEntry
  245. align 4
  246. ;;; do {
  247. ;;; match = s->window + cur_match;
  248. ;;; if (*(ushf*)(match+best_len-1) != scan_end ||
  249. ;;; *(ushf*)match != scan_start) continue;
  250. ;;; [...]
  251. ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
  252. ;;; && --chain_length != 0);
  253. ;;;
  254. ;;; Here is the inner loop of the function. The function will spend the
  255. ;;; majority of its time in this loop, and majority of that time will
  256. ;;; be spent in the first ten instructions.
  257. ;;;
  258. ;;; Within this loop:
  259. ;;; ebx = scanend
  260. ;;; ecx = curmatch
  261. ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
  262. ;;; esi = windowbestlen - i.e., (window + bestlen)
  263. ;;; edi = prev
  264. ;;; ebp = limit
  265. LookupLoop:
  266. and ecx, edx
  267. movzx ecx, word ptr [edi + ecx*2]
  268. cmp ecx, ebp
  269. jbe LeaveNow
  270. sub edx, 00010000h
  271. js LeaveNow
  272. LoopEntry: movzx eax, word ptr [esi + ecx - 1]
  273. cmp eax, ebx
  274. jnz LookupLoop
  275. mov eax, [window]
  276. movzx eax, word ptr [eax + ecx]
  277. cmp eax, [scanstart]
  278. jnz LookupLoop
  279. ;;; Store the current value of chainlen.
  280. mov [chainlenwmask], edx
  281. ;;; Point edi to the string under scrutiny, and esi to the string we
  282. ;;; are hoping to match it up with. In actuality, esi and edi are
  283. ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
  284. ;;; initialized to -(MAX_MATCH_8 - scanalign).
  285. mov esi, [window]
  286. mov edi, [scan]
  287. add esi, ecx
  288. mov eax, [scanalign]
  289. mov edx, 0fffffef8h; -(MAX_MATCH_8)
  290. lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
  291. lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
  292. ;;; Test the strings for equality, 8 bytes at a time. At the end,
  293. ;;; adjust edx so that it is offset to the exact byte that mismatched.
  294. ;;;
  295. ;;; We already know at this point that the first three bytes of the
  296. ;;; strings match each other, and they can be safely passed over before
  297. ;;; starting the compare loop. So what this code does is skip over 0-3
  298. ;;; bytes, as much as necessary in order to dword-align the edi
  299. ;;; pointer. (esi will still be misaligned three times out of four.)
  300. ;;;
  301. ;;; It should be confessed that this loop usually does not represent
  302. ;;; much of the total running time. Replacing it with a more
  303. ;;; straightforward "rep cmpsb" would not drastically degrade
  304. ;;; performance.
  305. LoopCmps:
  306. mov eax, [esi + edx]
  307. xor eax, [edi + edx]
  308. jnz LeaveLoopCmps
  309. mov eax, [esi + edx + 4]
  310. xor eax, [edi + edx + 4]
  311. jnz LeaveLoopCmps4
  312. add edx, 8
  313. jnz LoopCmps
  314. jmp short LenMaximum
  315. LeaveLoopCmps4: add edx, 4
  316. LeaveLoopCmps: test eax, 0000FFFFh
  317. jnz LenLower
  318. add edx, 2
  319. shr eax, 16
  320. LenLower: sub al, 1
  321. adc edx, 0
  322. ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
  323. ;;; then automatically accept it as the best possible match and leave.
  324. lea eax, [edi + edx]
  325. mov edi, [scan]
  326. sub eax, edi
  327. cmp eax, MAX_MATCH
  328. jge LenMaximum
  329. ;;; If the length of the match is not longer than the best match we
  330. ;;; have so far, then forget it and return to the lookup loop.
  331. mov edx, [deflatestate]
  332. mov ebx, [bestlen]
  333. cmp eax, ebx
  334. jg LongerMatch
  335. mov esi, [windowbestlen]
  336. mov edi, [edx + dsPrev]
  337. mov ebx, [scanend]
  338. mov edx, [chainlenwmask]
  339. jmp LookupLoop
  340. ;;; s->match_start = cur_match;
  341. ;;; best_len = len;
  342. ;;; if (len >= nice_match) break;
  343. ;;; scan_end = *(ushf*)(scan+best_len-1);
  344. LongerMatch: mov ebx, [nicematch]
  345. mov [bestlen], eax
  346. mov [edx + dsMatchStart], ecx
  347. cmp eax, ebx
  348. jge LeaveNow
  349. mov esi, [window]
  350. add esi, eax
  351. mov [windowbestlen], esi
  352. movzx ebx, word ptr [edi + eax - 1]
  353. mov edi, [edx + dsPrev]
  354. mov [scanend], ebx
  355. mov edx, [chainlenwmask]
  356. jmp LookupLoop
  357. ;;; Accept the current string, with the maximum possible length.
  358. LenMaximum: mov edx, [deflatestate]
  359. mov dword ptr [bestlen], MAX_MATCH
  360. mov [edx + dsMatchStart], ecx
  361. ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
  362. ;;; return s->lookahead;
  363. LeaveNow:
  364. mov edx, [deflatestate]
  365. mov ebx, [bestlen]
  366. mov eax, [edx + dsLookahead]
  367. cmp ebx, eax
  368. jg LookaheadRet
  369. mov eax, ebx
  370. LookaheadRet:
  371. ;;; Restore the stack and return from whence we came.
  372. add esp, LocalVarsSize
  373. pop ebx
  374. pop esi
  375. pop edi
  376. pop ebp
  377. ret
  378. ; please don't remove this string !
  379. ; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary!
  380. db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
  381. IFDEF NOUNDERLINE
  382. longest_match endp
  383. ELSE
  384. _longest_match endp
  385. ENDIF
  386. IFDEF NOUNDERLINE
  387. match_init proc near
  388. ret
  389. match_init endp
  390. ELSE
  391. _match_init proc near
  392. ret
  393. _match_init endp
  394. ENDIF
  395. _TEXT ends
  396. end