You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

17780 lines
447 KiB

  1. // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. // +build !appengine,!noasm,gc,!noasm
  4. #include "textflag.h"
  5. // func _dummy_()
     // _dummy_ is declared with a frame size of 0 and its only instruction is RET.
     // Its body also carries a preprocessor block: when GOAMD64_v4 is defined and
     // GOAMD64_v3 is not, GOAMD64_v3 gets defined. Later code in this file tests
     // GOAMD64_v3 with #ifdef to choose TZCNTQ over BSFQ, so this makes a v4 build
     // take the same branch as a v3 build.
     // NOTE(review): presumably the toolchain defines only the macro for the
     // selected GOAMD64 level, which is why v4 has to imply v3 here - confirm.
     // NOTE(review): the function looks like a placeholder that exists only to
     // host the macro block in generated output - verify against gen.go.
  6. TEXT ·_dummy_(SB), $0
  7. #ifdef GOAMD64_v4
  8. #ifndef GOAMD64_v3
  9. #define GOAMD64_v3
  10. #endif
  11. #endif
  12. RET
  13. // func encodeBlockAsm(dst []byte, src []byte) int
  14. // Requires: BMI, SSE2
  15. TEXT ·encodeBlockAsm(SB), $65560-56
  16. MOVQ dst_base+0(FP), AX
  17. MOVQ $0x00000200, CX
  18. LEAQ 24(SP), DX
  19. PXOR X0, X0
  20. zero_loop_encodeBlockAsm:
  21. MOVOU X0, (DX)
  22. MOVOU X0, 16(DX)
  23. MOVOU X0, 32(DX)
  24. MOVOU X0, 48(DX)
  25. MOVOU X0, 64(DX)
  26. MOVOU X0, 80(DX)
  27. MOVOU X0, 96(DX)
  28. MOVOU X0, 112(DX)
  29. ADDQ $0x80, DX
  30. DECQ CX
  31. JNZ zero_loop_encodeBlockAsm
  32. MOVL $0x00000000, 12(SP)
  33. MOVQ src_len+32(FP), CX
  34. LEAQ -9(CX), DX
  35. LEAQ -8(CX), SI
  36. MOVL SI, 8(SP)
  37. SHRQ $0x05, CX
  38. SUBL CX, DX
  39. LEAQ (AX)(DX*1), DX
  40. MOVQ DX, (SP)
  41. MOVL $0x00000001, CX
  42. MOVL CX, 16(SP)
  43. MOVQ src_base+24(FP), DX
  44. search_loop_encodeBlockAsm:
  45. MOVL CX, SI
  46. SUBL 12(SP), SI
  47. SHRL $0x06, SI
  48. LEAL 4(CX)(SI*1), SI
  49. CMPL SI, 8(SP)
  50. JGE emit_remainder_encodeBlockAsm
  51. MOVQ (DX)(CX*1), DI
  52. MOVL SI, 20(SP)
  53. MOVQ $0x0000cf1bbcdcbf9b, R9
  54. MOVQ DI, R10
  55. MOVQ DI, R11
  56. SHRQ $0x08, R11
  57. SHLQ $0x10, R10
  58. IMULQ R9, R10
  59. SHRQ $0x32, R10
  60. SHLQ $0x10, R11
  61. IMULQ R9, R11
  62. SHRQ $0x32, R11
  63. MOVL 24(SP)(R10*4), SI
  64. MOVL 24(SP)(R11*4), R8
  65. MOVL CX, 24(SP)(R10*4)
  66. LEAL 1(CX), R10
  67. MOVL R10, 24(SP)(R11*4)
  68. MOVQ DI, R10
  69. SHRQ $0x10, R10
  70. SHLQ $0x10, R10
  71. IMULQ R9, R10
  72. SHRQ $0x32, R10
  73. MOVL CX, R9
  74. SUBL 16(SP), R9
  75. MOVL 1(DX)(R9*1), R11
  76. MOVQ DI, R9
  77. SHRQ $0x08, R9
  78. CMPL R9, R11
  79. JNE no_repeat_found_encodeBlockAsm
  80. LEAL 1(CX), DI
  81. MOVL 12(SP), R8
  82. MOVL DI, SI
  83. SUBL 16(SP), SI
  84. JZ repeat_extend_back_end_encodeBlockAsm
  85. repeat_extend_back_loop_encodeBlockAsm:
  86. CMPL DI, R8
  87. JLE repeat_extend_back_end_encodeBlockAsm
  88. MOVB -1(DX)(SI*1), BL
  89. MOVB -1(DX)(DI*1), R9
  90. CMPB BL, R9
  91. JNE repeat_extend_back_end_encodeBlockAsm
  92. LEAL -1(DI), DI
  93. DECL SI
  94. JNZ repeat_extend_back_loop_encodeBlockAsm
  95. repeat_extend_back_end_encodeBlockAsm:
  96. MOVL 12(SP), SI
  97. CMPL SI, DI
  98. JEQ emit_literal_done_repeat_emit_encodeBlockAsm
  99. MOVL DI, R9
  100. MOVL DI, 12(SP)
  101. LEAQ (DX)(SI*1), R10
  102. SUBL SI, R9
  103. LEAL -1(R9), SI
  104. CMPL SI, $0x3c
  105. JLT one_byte_repeat_emit_encodeBlockAsm
  106. CMPL SI, $0x00000100
  107. JLT two_bytes_repeat_emit_encodeBlockAsm
  108. CMPL SI, $0x00010000
  109. JLT three_bytes_repeat_emit_encodeBlockAsm
  110. CMPL SI, $0x01000000
  111. JLT four_bytes_repeat_emit_encodeBlockAsm
  112. MOVB $0xfc, (AX)
  113. MOVL SI, 1(AX)
  114. ADDQ $0x05, AX
  115. JMP memmove_long_repeat_emit_encodeBlockAsm
  116. four_bytes_repeat_emit_encodeBlockAsm:
  117. MOVL SI, R11
  118. SHRL $0x10, R11
  119. MOVB $0xf8, (AX)
  120. MOVW SI, 1(AX)
  121. MOVB R11, 3(AX)
  122. ADDQ $0x04, AX
  123. JMP memmove_long_repeat_emit_encodeBlockAsm
  124. three_bytes_repeat_emit_encodeBlockAsm:
  125. MOVB $0xf4, (AX)
  126. MOVW SI, 1(AX)
  127. ADDQ $0x03, AX
  128. JMP memmove_long_repeat_emit_encodeBlockAsm
  129. two_bytes_repeat_emit_encodeBlockAsm:
  130. MOVB $0xf0, (AX)
  131. MOVB SI, 1(AX)
  132. ADDQ $0x02, AX
  133. CMPL SI, $0x40
  134. JL memmove_repeat_emit_encodeBlockAsm
  135. JMP memmove_long_repeat_emit_encodeBlockAsm
  136. one_byte_repeat_emit_encodeBlockAsm:
  137. SHLB $0x02, SI
  138. MOVB SI, (AX)
  139. ADDQ $0x01, AX
  140. memmove_repeat_emit_encodeBlockAsm:
  141. LEAQ (AX)(R9*1), SI
  142. // genMemMoveShort
  143. CMPQ R9, $0x08
  144. JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
  145. CMPQ R9, $0x10
  146. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
  147. CMPQ R9, $0x20
  148. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
  149. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
  150. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
  151. MOVQ (R10), R11
  152. MOVQ R11, (AX)
  153. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  154. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
  155. MOVQ (R10), R11
  156. MOVQ -8(R10)(R9*1), R10
  157. MOVQ R11, (AX)
  158. MOVQ R10, -8(AX)(R9*1)
  159. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  160. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
  161. MOVOU (R10), X0
  162. MOVOU -16(R10)(R9*1), X1
  163. MOVOU X0, (AX)
  164. MOVOU X1, -16(AX)(R9*1)
  165. JMP memmove_end_copy_repeat_emit_encodeBlockAsm
  166. emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
  167. MOVOU (R10), X0
  168. MOVOU 16(R10), X1
  169. MOVOU -32(R10)(R9*1), X2
  170. MOVOU -16(R10)(R9*1), X3
  171. MOVOU X0, (AX)
  172. MOVOU X1, 16(AX)
  173. MOVOU X2, -32(AX)(R9*1)
  174. MOVOU X3, -16(AX)(R9*1)
  175. memmove_end_copy_repeat_emit_encodeBlockAsm:
  176. MOVQ SI, AX
  177. JMP emit_literal_done_repeat_emit_encodeBlockAsm
  178. memmove_long_repeat_emit_encodeBlockAsm:
  179. LEAQ (AX)(R9*1), SI
  180. // genMemMoveLong
  181. MOVOU (R10), X0
  182. MOVOU 16(R10), X1
  183. MOVOU -32(R10)(R9*1), X2
  184. MOVOU -16(R10)(R9*1), X3
  185. MOVQ R9, R12
  186. SHRQ $0x05, R12
  187. MOVQ AX, R11
  188. ANDL $0x0000001f, R11
  189. MOVQ $0x00000040, R13
  190. SUBQ R11, R13
  191. DECQ R12
  192. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
  193. LEAQ -32(R10)(R13*1), R11
  194. LEAQ -32(AX)(R13*1), R14
  195. emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
  196. MOVOU (R11), X4
  197. MOVOU 16(R11), X5
  198. MOVOA X4, (R14)
  199. MOVOA X5, 16(R14)
  200. ADDQ $0x20, R14
  201. ADDQ $0x20, R11
  202. ADDQ $0x20, R13
  203. DECQ R12
  204. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
  205. emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
  206. MOVOU -32(R10)(R13*1), X4
  207. MOVOU -16(R10)(R13*1), X5
  208. MOVOA X4, -32(AX)(R13*1)
  209. MOVOA X5, -16(AX)(R13*1)
  210. ADDQ $0x20, R13
  211. CMPQ R9, R13
  212. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
  213. MOVOU X0, (AX)
  214. MOVOU X1, 16(AX)
  215. MOVOU X2, -32(AX)(R9*1)
  216. MOVOU X3, -16(AX)(R9*1)
  217. MOVQ SI, AX
  218. emit_literal_done_repeat_emit_encodeBlockAsm:
  219. ADDL $0x05, CX
  220. MOVL CX, SI
  221. SUBL 16(SP), SI
  222. MOVQ src_len+32(FP), R9
  223. SUBL CX, R9
  224. LEAQ (DX)(CX*1), R10
  225. LEAQ (DX)(SI*1), SI
  226. // matchLen
  227. XORL R12, R12
  228. CMPL R9, $0x08
  229. JL matchlen_match4_repeat_extend_encodeBlockAsm
  230. matchlen_loopback_repeat_extend_encodeBlockAsm:
  231. MOVQ (R10)(R12*1), R11
  232. XORQ (SI)(R12*1), R11
  233. TESTQ R11, R11
  234. JZ matchlen_loop_repeat_extend_encodeBlockAsm
  235. #ifdef GOAMD64_v3
  236. TZCNTQ R11, R11
  237. #else
  238. BSFQ R11, R11
  239. #endif
  240. SARQ $0x03, R11
  241. LEAL (R12)(R11*1), R12
  242. JMP repeat_extend_forward_end_encodeBlockAsm
  243. matchlen_loop_repeat_extend_encodeBlockAsm:
  244. LEAL -8(R9), R9
  245. LEAL 8(R12), R12
  246. CMPL R9, $0x08
  247. JGE matchlen_loopback_repeat_extend_encodeBlockAsm
  248. JZ repeat_extend_forward_end_encodeBlockAsm
  249. matchlen_match4_repeat_extend_encodeBlockAsm:
  250. CMPL R9, $0x04
  251. JL matchlen_match2_repeat_extend_encodeBlockAsm
  252. MOVL (R10)(R12*1), R11
  253. CMPL (SI)(R12*1), R11
  254. JNE matchlen_match2_repeat_extend_encodeBlockAsm
  255. SUBL $0x04, R9
  256. LEAL 4(R12), R12
  257. matchlen_match2_repeat_extend_encodeBlockAsm:
  258. CMPL R9, $0x02
  259. JL matchlen_match1_repeat_extend_encodeBlockAsm
  260. MOVW (R10)(R12*1), R11
  261. CMPW (SI)(R12*1), R11
  262. JNE matchlen_match1_repeat_extend_encodeBlockAsm
  263. SUBL $0x02, R9
  264. LEAL 2(R12), R12
  265. matchlen_match1_repeat_extend_encodeBlockAsm:
  266. CMPL R9, $0x01
  267. JL repeat_extend_forward_end_encodeBlockAsm
  268. MOVB (R10)(R12*1), R11
  269. CMPB (SI)(R12*1), R11
  270. JNE repeat_extend_forward_end_encodeBlockAsm
  271. LEAL 1(R12), R12
  272. repeat_extend_forward_end_encodeBlockAsm:
  273. ADDL R12, CX
  274. MOVL CX, SI
  275. SUBL DI, SI
  276. MOVL 16(SP), DI
  277. TESTL R8, R8
  278. JZ repeat_as_copy_encodeBlockAsm
  279. // emitRepeat
  280. emit_repeat_again_match_repeat_encodeBlockAsm:
  281. MOVL SI, R8
  282. LEAL -4(SI), SI
  283. CMPL R8, $0x08
  284. JLE repeat_two_match_repeat_encodeBlockAsm
  285. CMPL R8, $0x0c
  286. JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
  287. CMPL DI, $0x00000800
  288. JLT repeat_two_offset_match_repeat_encodeBlockAsm
  289. cant_repeat_two_offset_match_repeat_encodeBlockAsm:
  290. CMPL SI, $0x00000104
  291. JLT repeat_three_match_repeat_encodeBlockAsm
  292. CMPL SI, $0x00010100
  293. JLT repeat_four_match_repeat_encodeBlockAsm
  294. CMPL SI, $0x0100ffff
  295. JLT repeat_five_match_repeat_encodeBlockAsm
  296. LEAL -16842747(SI), SI
  297. MOVW $0x001d, (AX)
  298. MOVW $0xfffb, 2(AX)
  299. MOVB $0xff, 4(AX)
  300. ADDQ $0x05, AX
  301. JMP emit_repeat_again_match_repeat_encodeBlockAsm
  302. repeat_five_match_repeat_encodeBlockAsm:
  303. LEAL -65536(SI), SI
  304. MOVL SI, DI
  305. MOVW $0x001d, (AX)
  306. MOVW SI, 2(AX)
  307. SARL $0x10, DI
  308. MOVB DI, 4(AX)
  309. ADDQ $0x05, AX
  310. JMP repeat_end_emit_encodeBlockAsm
  311. repeat_four_match_repeat_encodeBlockAsm:
  312. LEAL -256(SI), SI
  313. MOVW $0x0019, (AX)
  314. MOVW SI, 2(AX)
  315. ADDQ $0x04, AX
  316. JMP repeat_end_emit_encodeBlockAsm
  317. repeat_three_match_repeat_encodeBlockAsm:
  318. LEAL -4(SI), SI
  319. MOVW $0x0015, (AX)
  320. MOVB SI, 2(AX)
  321. ADDQ $0x03, AX
  322. JMP repeat_end_emit_encodeBlockAsm
  323. repeat_two_match_repeat_encodeBlockAsm:
  324. SHLL $0x02, SI
  325. ORL $0x01, SI
  326. MOVW SI, (AX)
  327. ADDQ $0x02, AX
  328. JMP repeat_end_emit_encodeBlockAsm
  329. repeat_two_offset_match_repeat_encodeBlockAsm:
  330. XORQ R8, R8
  331. LEAL 1(R8)(SI*4), SI
  332. MOVB DI, 1(AX)
  333. SARL $0x08, DI
  334. SHLL $0x05, DI
  335. ORL DI, SI
  336. MOVB SI, (AX)
  337. ADDQ $0x02, AX
  338. JMP repeat_end_emit_encodeBlockAsm
  339. repeat_as_copy_encodeBlockAsm:
  340. // emitCopy
  341. CMPL DI, $0x00010000
  342. JL two_byte_offset_repeat_as_copy_encodeBlockAsm
  343. four_bytes_loop_back_repeat_as_copy_encodeBlockAsm:
  344. CMPL SI, $0x40
  345. JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
  346. MOVB $0xff, (AX)
  347. MOVL DI, 1(AX)
  348. LEAL -64(SI), SI
  349. ADDQ $0x05, AX
  350. CMPL SI, $0x04
  351. JL four_bytes_remain_repeat_as_copy_encodeBlockAsm
  352. // emitRepeat
  353. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
  354. MOVL SI, R8
  355. LEAL -4(SI), SI
  356. CMPL R8, $0x08
  357. JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
  358. CMPL R8, $0x0c
  359. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
  360. CMPL DI, $0x00000800
  361. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
  362. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
  363. CMPL SI, $0x00000104
  364. JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
  365. CMPL SI, $0x00010100
  366. JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
  367. CMPL SI, $0x0100ffff
  368. JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
  369. LEAL -16842747(SI), SI
  370. MOVW $0x001d, (AX)
  371. MOVW $0xfffb, 2(AX)
  372. MOVB $0xff, 4(AX)
  373. ADDQ $0x05, AX
  374. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
  375. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
  376. LEAL -65536(SI), SI
  377. MOVL SI, DI
  378. MOVW $0x001d, (AX)
  379. MOVW SI, 2(AX)
  380. SARL $0x10, DI
  381. MOVB DI, 4(AX)
  382. ADDQ $0x05, AX
  383. JMP repeat_end_emit_encodeBlockAsm
  384. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
  385. LEAL -256(SI), SI
  386. MOVW $0x0019, (AX)
  387. MOVW SI, 2(AX)
  388. ADDQ $0x04, AX
  389. JMP repeat_end_emit_encodeBlockAsm
  390. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
  391. LEAL -4(SI), SI
  392. MOVW $0x0015, (AX)
  393. MOVB SI, 2(AX)
  394. ADDQ $0x03, AX
  395. JMP repeat_end_emit_encodeBlockAsm
  396. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
  397. SHLL $0x02, SI
  398. ORL $0x01, SI
  399. MOVW SI, (AX)
  400. ADDQ $0x02, AX
  401. JMP repeat_end_emit_encodeBlockAsm
  402. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
  403. XORQ R8, R8
  404. LEAL 1(R8)(SI*4), SI
  405. MOVB DI, 1(AX)
  406. SARL $0x08, DI
  407. SHLL $0x05, DI
  408. ORL DI, SI
  409. MOVB SI, (AX)
  410. ADDQ $0x02, AX
  411. JMP repeat_end_emit_encodeBlockAsm
  412. JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm
  413. four_bytes_remain_repeat_as_copy_encodeBlockAsm:
  414. TESTL SI, SI
  415. JZ repeat_end_emit_encodeBlockAsm
  416. MOVB $0x03, BL
  417. LEAL -4(BX)(SI*4), SI
  418. MOVB SI, (AX)
  419. MOVL DI, 1(AX)
  420. ADDQ $0x05, AX
  421. JMP repeat_end_emit_encodeBlockAsm
  422. two_byte_offset_repeat_as_copy_encodeBlockAsm:
  423. CMPL SI, $0x40
  424. JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
  425. CMPL DI, $0x00000800
  426. JAE long_offset_short_repeat_as_copy_encodeBlockAsm
  427. MOVL $0x00000001, R8
  428. LEAL 16(R8), R8
  429. MOVB DI, 1(AX)
  430. MOVL DI, R9
  431. SHRL $0x08, R9
  432. SHLL $0x05, R9
  433. ORL R9, R8
  434. MOVB R8, (AX)
  435. ADDQ $0x02, AX
  436. SUBL $0x08, SI
  437. // emitRepeat
  438. LEAL -4(SI), SI
  439. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  440. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  441. MOVL SI, R8
  442. LEAL -4(SI), SI
  443. CMPL R8, $0x08
  444. JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  445. CMPL R8, $0x0c
  446. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  447. CMPL DI, $0x00000800
  448. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  449. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  450. CMPL SI, $0x00000104
  451. JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  452. CMPL SI, $0x00010100
  453. JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  454. CMPL SI, $0x0100ffff
  455. JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  456. LEAL -16842747(SI), SI
  457. MOVW $0x001d, (AX)
  458. MOVW $0xfffb, 2(AX)
  459. MOVB $0xff, 4(AX)
  460. ADDQ $0x05, AX
  461. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
  462. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  463. LEAL -65536(SI), SI
  464. MOVL SI, DI
  465. MOVW $0x001d, (AX)
  466. MOVW SI, 2(AX)
  467. SARL $0x10, DI
  468. MOVB DI, 4(AX)
  469. ADDQ $0x05, AX
  470. JMP repeat_end_emit_encodeBlockAsm
  471. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  472. LEAL -256(SI), SI
  473. MOVW $0x0019, (AX)
  474. MOVW SI, 2(AX)
  475. ADDQ $0x04, AX
  476. JMP repeat_end_emit_encodeBlockAsm
  477. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  478. LEAL -4(SI), SI
  479. MOVW $0x0015, (AX)
  480. MOVB SI, 2(AX)
  481. ADDQ $0x03, AX
  482. JMP repeat_end_emit_encodeBlockAsm
  483. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  484. SHLL $0x02, SI
  485. ORL $0x01, SI
  486. MOVW SI, (AX)
  487. ADDQ $0x02, AX
  488. JMP repeat_end_emit_encodeBlockAsm
  489. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
  490. XORQ R8, R8
  491. LEAL 1(R8)(SI*4), SI
  492. MOVB DI, 1(AX)
  493. SARL $0x08, DI
  494. SHLL $0x05, DI
  495. ORL DI, SI
  496. MOVB SI, (AX)
  497. ADDQ $0x02, AX
  498. JMP repeat_end_emit_encodeBlockAsm
  499. long_offset_short_repeat_as_copy_encodeBlockAsm:
  500. MOVB $0xee, (AX)
  501. MOVW DI, 1(AX)
  502. LEAL -60(SI), SI
  503. ADDQ $0x03, AX
  504. // emitRepeat
  505. emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  506. MOVL SI, R8
  507. LEAL -4(SI), SI
  508. CMPL R8, $0x08
  509. JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
  510. CMPL R8, $0x0c
  511. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
  512. CMPL DI, $0x00000800
  513. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
  514. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  515. CMPL SI, $0x00000104
  516. JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
  517. CMPL SI, $0x00010100
  518. JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
  519. CMPL SI, $0x0100ffff
  520. JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
  521. LEAL -16842747(SI), SI
  522. MOVW $0x001d, (AX)
  523. MOVW $0xfffb, 2(AX)
  524. MOVB $0xff, 4(AX)
  525. ADDQ $0x05, AX
  526. JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
  527. repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  528. LEAL -65536(SI), SI
  529. MOVL SI, DI
  530. MOVW $0x001d, (AX)
  531. MOVW SI, 2(AX)
  532. SARL $0x10, DI
  533. MOVB DI, 4(AX)
  534. ADDQ $0x05, AX
  535. JMP repeat_end_emit_encodeBlockAsm
  536. repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  537. LEAL -256(SI), SI
  538. MOVW $0x0019, (AX)
  539. MOVW SI, 2(AX)
  540. ADDQ $0x04, AX
  541. JMP repeat_end_emit_encodeBlockAsm
  542. repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  543. LEAL -4(SI), SI
  544. MOVW $0x0015, (AX)
  545. MOVB SI, 2(AX)
  546. ADDQ $0x03, AX
  547. JMP repeat_end_emit_encodeBlockAsm
  548. repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  549. SHLL $0x02, SI
  550. ORL $0x01, SI
  551. MOVW SI, (AX)
  552. ADDQ $0x02, AX
  553. JMP repeat_end_emit_encodeBlockAsm
  554. repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
  555. XORQ R8, R8
  556. LEAL 1(R8)(SI*4), SI
  557. MOVB DI, 1(AX)
  558. SARL $0x08, DI
  559. SHLL $0x05, DI
  560. ORL DI, SI
  561. MOVB SI, (AX)
  562. ADDQ $0x02, AX
  563. JMP repeat_end_emit_encodeBlockAsm
  564. JMP two_byte_offset_repeat_as_copy_encodeBlockAsm
  565. two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
  566. CMPL SI, $0x0c
  567. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
  568. CMPL DI, $0x00000800
  569. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
  570. MOVB $0x01, BL
  571. LEAL -16(BX)(SI*4), SI
  572. MOVB DI, 1(AX)
  573. SHRL $0x08, DI
  574. SHLL $0x05, DI
  575. ORL DI, SI
  576. MOVB SI, (AX)
  577. ADDQ $0x02, AX
  578. JMP repeat_end_emit_encodeBlockAsm
  579. emit_copy_three_repeat_as_copy_encodeBlockAsm:
  580. MOVB $0x02, BL
  581. LEAL -4(BX)(SI*4), SI
  582. MOVB SI, (AX)
  583. MOVW DI, 1(AX)
  584. ADDQ $0x03, AX
  585. repeat_end_emit_encodeBlockAsm:
  586. MOVL CX, 12(SP)
  587. JMP search_loop_encodeBlockAsm
  588. no_repeat_found_encodeBlockAsm:
  589. CMPL (DX)(SI*1), DI
  590. JEQ candidate_match_encodeBlockAsm
  591. SHRQ $0x08, DI
  592. MOVL 24(SP)(R10*4), SI
  593. LEAL 2(CX), R9
  594. CMPL (DX)(R8*1), DI
  595. JEQ candidate2_match_encodeBlockAsm
  596. MOVL R9, 24(SP)(R10*4)
  597. SHRQ $0x08, DI
  598. CMPL (DX)(SI*1), DI
  599. JEQ candidate3_match_encodeBlockAsm
  600. MOVL 20(SP), CX
  601. JMP search_loop_encodeBlockAsm
  602. candidate3_match_encodeBlockAsm:
  603. ADDL $0x02, CX
  604. JMP candidate_match_encodeBlockAsm
  605. candidate2_match_encodeBlockAsm:
  606. MOVL R9, 24(SP)(R10*4)
  607. INCL CX
  608. MOVL R8, SI
  609. candidate_match_encodeBlockAsm:
  610. MOVL 12(SP), DI
  611. TESTL SI, SI
  612. JZ match_extend_back_end_encodeBlockAsm
  613. match_extend_back_loop_encodeBlockAsm:
  614. CMPL CX, DI
  615. JLE match_extend_back_end_encodeBlockAsm
  616. MOVB -1(DX)(SI*1), BL
  617. MOVB -1(DX)(CX*1), R8
  618. CMPB BL, R8
  619. JNE match_extend_back_end_encodeBlockAsm
  620. LEAL -1(CX), CX
  621. DECL SI
  622. JZ match_extend_back_end_encodeBlockAsm
  623. JMP match_extend_back_loop_encodeBlockAsm
  624. match_extend_back_end_encodeBlockAsm:
  625. MOVL CX, DI
  626. SUBL 12(SP), DI
  627. LEAQ 5(AX)(DI*1), DI
  628. CMPQ DI, (SP)
  629. JL match_dst_size_check_encodeBlockAsm
  630. MOVQ $0x00000000, ret+48(FP)
  631. RET
  632. match_dst_size_check_encodeBlockAsm:
  633. MOVL CX, DI
  634. MOVL 12(SP), R8
  635. CMPL R8, DI
  636. JEQ emit_literal_done_match_emit_encodeBlockAsm
  637. MOVL DI, R9
  638. MOVL DI, 12(SP)
  639. LEAQ (DX)(R8*1), DI
  640. SUBL R8, R9
  641. LEAL -1(R9), R8
  642. CMPL R8, $0x3c
  643. JLT one_byte_match_emit_encodeBlockAsm
  644. CMPL R8, $0x00000100
  645. JLT two_bytes_match_emit_encodeBlockAsm
  646. CMPL R8, $0x00010000
  647. JLT three_bytes_match_emit_encodeBlockAsm
  648. CMPL R8, $0x01000000
  649. JLT four_bytes_match_emit_encodeBlockAsm
  650. MOVB $0xfc, (AX)
  651. MOVL R8, 1(AX)
  652. ADDQ $0x05, AX
  653. JMP memmove_long_match_emit_encodeBlockAsm
  654. four_bytes_match_emit_encodeBlockAsm:
  655. MOVL R8, R10
  656. SHRL $0x10, R10
  657. MOVB $0xf8, (AX)
  658. MOVW R8, 1(AX)
  659. MOVB R10, 3(AX)
  660. ADDQ $0x04, AX
  661. JMP memmove_long_match_emit_encodeBlockAsm
  662. three_bytes_match_emit_encodeBlockAsm:
  663. MOVB $0xf4, (AX)
  664. MOVW R8, 1(AX)
  665. ADDQ $0x03, AX
  666. JMP memmove_long_match_emit_encodeBlockAsm
  667. two_bytes_match_emit_encodeBlockAsm:
  668. MOVB $0xf0, (AX)
  669. MOVB R8, 1(AX)
  670. ADDQ $0x02, AX
  671. CMPL R8, $0x40
  672. JL memmove_match_emit_encodeBlockAsm
  673. JMP memmove_long_match_emit_encodeBlockAsm
  674. one_byte_match_emit_encodeBlockAsm:
  675. SHLB $0x02, R8
  676. MOVB R8, (AX)
  677. ADDQ $0x01, AX
  678. memmove_match_emit_encodeBlockAsm:
  679. LEAQ (AX)(R9*1), R8
  680. // genMemMoveShort
  681. CMPQ R9, $0x08
  682. JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
  683. CMPQ R9, $0x10
  684. JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
  685. CMPQ R9, $0x20
  686. JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
  687. JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
  688. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
  689. MOVQ (DI), R10
  690. MOVQ R10, (AX)
  691. JMP memmove_end_copy_match_emit_encodeBlockAsm
  692. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
  693. MOVQ (DI), R10
  694. MOVQ -8(DI)(R9*1), DI
  695. MOVQ R10, (AX)
  696. MOVQ DI, -8(AX)(R9*1)
  697. JMP memmove_end_copy_match_emit_encodeBlockAsm
  698. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
  699. MOVOU (DI), X0
  700. MOVOU -16(DI)(R9*1), X1
  701. MOVOU X0, (AX)
  702. MOVOU X1, -16(AX)(R9*1)
  703. JMP memmove_end_copy_match_emit_encodeBlockAsm
  704. emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
  705. MOVOU (DI), X0
  706. MOVOU 16(DI), X1
  707. MOVOU -32(DI)(R9*1), X2
  708. MOVOU -16(DI)(R9*1), X3
  709. MOVOU X0, (AX)
  710. MOVOU X1, 16(AX)
  711. MOVOU X2, -32(AX)(R9*1)
  712. MOVOU X3, -16(AX)(R9*1)
  713. memmove_end_copy_match_emit_encodeBlockAsm:
  714. MOVQ R8, AX
  715. JMP emit_literal_done_match_emit_encodeBlockAsm
  716. memmove_long_match_emit_encodeBlockAsm:
  717. LEAQ (AX)(R9*1), R8
  718. // genMemMoveLong
  719. MOVOU (DI), X0
  720. MOVOU 16(DI), X1
  721. MOVOU -32(DI)(R9*1), X2
  722. MOVOU -16(DI)(R9*1), X3
  723. MOVQ R9, R11
  724. SHRQ $0x05, R11
  725. MOVQ AX, R10
  726. ANDL $0x0000001f, R10
  727. MOVQ $0x00000040, R12
  728. SUBQ R10, R12
  729. DECQ R11
  730. JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
  731. LEAQ -32(DI)(R12*1), R10
  732. LEAQ -32(AX)(R12*1), R13
  733. emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
  734. MOVOU (R10), X4
  735. MOVOU 16(R10), X5
  736. MOVOA X4, (R13)
  737. MOVOA X5, 16(R13)
  738. ADDQ $0x20, R13
  739. ADDQ $0x20, R10
  740. ADDQ $0x20, R12
  741. DECQ R11
  742. JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
  743. emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
  744. MOVOU -32(DI)(R12*1), X4
  745. MOVOU -16(DI)(R12*1), X5
  746. MOVOA X4, -32(AX)(R12*1)
  747. MOVOA X5, -16(AX)(R12*1)
  748. ADDQ $0x20, R12
  749. CMPQ R9, R12
  750. JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
  751. MOVOU X0, (AX)
  752. MOVOU X1, 16(AX)
  753. MOVOU X2, -32(AX)(R9*1)
  754. MOVOU X3, -16(AX)(R9*1)
  755. MOVQ R8, AX
  756. emit_literal_done_match_emit_encodeBlockAsm:
  757. match_nolit_loop_encodeBlockAsm:
  758. MOVL CX, DI
  759. SUBL SI, DI
  760. MOVL DI, 16(SP)
  761. ADDL $0x04, CX
  762. ADDL $0x04, SI
  763. MOVQ src_len+32(FP), DI
  764. SUBL CX, DI
  765. LEAQ (DX)(CX*1), R8
  766. LEAQ (DX)(SI*1), SI
  767. // matchLen
  768. XORL R10, R10
  769. CMPL DI, $0x08
  770. JL matchlen_match4_match_nolit_encodeBlockAsm
  771. matchlen_loopback_match_nolit_encodeBlockAsm:
  772. MOVQ (R8)(R10*1), R9
  773. XORQ (SI)(R10*1), R9
  774. TESTQ R9, R9
  775. JZ matchlen_loop_match_nolit_encodeBlockAsm
  776. #ifdef GOAMD64_v3
  777. TZCNTQ R9, R9
  778. #else
  779. BSFQ R9, R9
  780. #endif
  781. SARQ $0x03, R9
  782. LEAL (R10)(R9*1), R10
  783. JMP match_nolit_end_encodeBlockAsm
  784. matchlen_loop_match_nolit_encodeBlockAsm:
  785. LEAL -8(DI), DI
  786. LEAL 8(R10), R10
  787. CMPL DI, $0x08
  788. JGE matchlen_loopback_match_nolit_encodeBlockAsm
  789. JZ match_nolit_end_encodeBlockAsm
  790. matchlen_match4_match_nolit_encodeBlockAsm:
  791. CMPL DI, $0x04
  792. JL matchlen_match2_match_nolit_encodeBlockAsm
  793. MOVL (R8)(R10*1), R9
  794. CMPL (SI)(R10*1), R9
  795. JNE matchlen_match2_match_nolit_encodeBlockAsm
  796. SUBL $0x04, DI
  797. LEAL 4(R10), R10
  798. matchlen_match2_match_nolit_encodeBlockAsm:
  799. CMPL DI, $0x02
  800. JL matchlen_match1_match_nolit_encodeBlockAsm
  801. MOVW (R8)(R10*1), R9
  802. CMPW (SI)(R10*1), R9
  803. JNE matchlen_match1_match_nolit_encodeBlockAsm
  804. SUBL $0x02, DI
  805. LEAL 2(R10), R10
  806. matchlen_match1_match_nolit_encodeBlockAsm:
  807. CMPL DI, $0x01
  808. JL match_nolit_end_encodeBlockAsm
  809. MOVB (R8)(R10*1), R9
  810. CMPB (SI)(R10*1), R9
  811. JNE match_nolit_end_encodeBlockAsm
  812. LEAL 1(R10), R10
  813. match_nolit_end_encodeBlockAsm:
  814. ADDL R10, CX
  815. MOVL 16(SP), SI
  816. ADDL $0x04, R10
  817. MOVL CX, 12(SP)
  818. // emitCopy
  819. CMPL SI, $0x00010000
  820. JL two_byte_offset_match_nolit_encodeBlockAsm
  821. four_bytes_loop_back_match_nolit_encodeBlockAsm:
  822. CMPL R10, $0x40
  823. JLE four_bytes_remain_match_nolit_encodeBlockAsm
  824. MOVB $0xff, (AX)
  825. MOVL SI, 1(AX)
  826. LEAL -64(R10), R10
  827. ADDQ $0x05, AX
  828. CMPL R10, $0x04
  829. JL four_bytes_remain_match_nolit_encodeBlockAsm
  830. // emitRepeat
  831. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
  832. MOVL R10, DI
  833. LEAL -4(R10), R10
  834. CMPL DI, $0x08
  835. JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
  836. CMPL DI, $0x0c
  837. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
  838. CMPL SI, $0x00000800
  839. JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
  840. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
  841. CMPL R10, $0x00000104
  842. JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
  843. CMPL R10, $0x00010100
  844. JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
  845. CMPL R10, $0x0100ffff
  846. JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
  847. LEAL -16842747(R10), R10
  848. MOVW $0x001d, (AX)
  849. MOVW $0xfffb, 2(AX)
  850. MOVB $0xff, 4(AX)
  851. ADDQ $0x05, AX
  852. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
  853. repeat_five_match_nolit_encodeBlockAsm_emit_copy:
  854. LEAL -65536(R10), R10
  855. MOVL R10, SI
  856. MOVW $0x001d, (AX)
  857. MOVW R10, 2(AX)
  858. SARL $0x10, SI
  859. MOVB SI, 4(AX)
  860. ADDQ $0x05, AX
  861. JMP match_nolit_emitcopy_end_encodeBlockAsm
  862. repeat_four_match_nolit_encodeBlockAsm_emit_copy:
  863. LEAL -256(R10), R10
  864. MOVW $0x0019, (AX)
  865. MOVW R10, 2(AX)
  866. ADDQ $0x04, AX
  867. JMP match_nolit_emitcopy_end_encodeBlockAsm
  868. repeat_three_match_nolit_encodeBlockAsm_emit_copy:
  869. LEAL -4(R10), R10
  870. MOVW $0x0015, (AX)
  871. MOVB R10, 2(AX)
  872. ADDQ $0x03, AX
  873. JMP match_nolit_emitcopy_end_encodeBlockAsm
  874. repeat_two_match_nolit_encodeBlockAsm_emit_copy:
  875. SHLL $0x02, R10
  876. ORL $0x01, R10
  877. MOVW R10, (AX)
  878. ADDQ $0x02, AX
  879. JMP match_nolit_emitcopy_end_encodeBlockAsm
  880. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
  881. XORQ DI, DI
  882. LEAL 1(DI)(R10*4), R10
  883. MOVB SI, 1(AX)
  884. SARL $0x08, SI
  885. SHLL $0x05, SI
  886. ORL SI, R10
  887. MOVB R10, (AX)
  888. ADDQ $0x02, AX
  889. JMP match_nolit_emitcopy_end_encodeBlockAsm
  890. JMP four_bytes_loop_back_match_nolit_encodeBlockAsm
  891. four_bytes_remain_match_nolit_encodeBlockAsm:
  892. TESTL R10, R10
  893. JZ match_nolit_emitcopy_end_encodeBlockAsm
  894. MOVB $0x03, BL
  895. LEAL -4(BX)(R10*4), R10
  896. MOVB R10, (AX)
  897. MOVL SI, 1(AX)
  898. ADDQ $0x05, AX
  899. JMP match_nolit_emitcopy_end_encodeBlockAsm
  900. two_byte_offset_match_nolit_encodeBlockAsm:
  901. CMPL R10, $0x40
  902. JLE two_byte_offset_short_match_nolit_encodeBlockAsm
  903. CMPL SI, $0x00000800
  904. JAE long_offset_short_match_nolit_encodeBlockAsm
  905. MOVL $0x00000001, DI
  906. LEAL 16(DI), DI
  907. MOVB SI, 1(AX)
  908. MOVL SI, R8
  909. SHRL $0x08, R8
  910. SHLL $0x05, R8
  911. ORL R8, DI
  912. MOVB DI, (AX)
  913. ADDQ $0x02, AX
  914. SUBL $0x08, R10
  915. // emitRepeat
  916. LEAL -4(R10), R10
  917. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  918. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  919. MOVL R10, DI
  920. LEAL -4(R10), R10
  921. CMPL DI, $0x08
  922. JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
  923. CMPL DI, $0x0c
  924. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  925. CMPL SI, $0x00000800
  926. JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
  927. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  928. CMPL R10, $0x00000104
  929. JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
  930. CMPL R10, $0x00010100
  931. JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
  932. CMPL R10, $0x0100ffff
  933. JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
  934. LEAL -16842747(R10), R10
  935. MOVW $0x001d, (AX)
  936. MOVW $0xfffb, 2(AX)
  937. MOVB $0xff, 4(AX)
  938. ADDQ $0x05, AX
  939. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
  940. repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  941. LEAL -65536(R10), R10
  942. MOVL R10, SI
  943. MOVW $0x001d, (AX)
  944. MOVW R10, 2(AX)
  945. SARL $0x10, SI
  946. MOVB SI, 4(AX)
  947. ADDQ $0x05, AX
  948. JMP match_nolit_emitcopy_end_encodeBlockAsm
  949. repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  950. LEAL -256(R10), R10
  951. MOVW $0x0019, (AX)
  952. MOVW R10, 2(AX)
  953. ADDQ $0x04, AX
  954. JMP match_nolit_emitcopy_end_encodeBlockAsm
  955. repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  956. LEAL -4(R10), R10
  957. MOVW $0x0015, (AX)
  958. MOVB R10, 2(AX)
  959. ADDQ $0x03, AX
  960. JMP match_nolit_emitcopy_end_encodeBlockAsm
  961. repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  962. SHLL $0x02, R10
  963. ORL $0x01, R10
  964. MOVW R10, (AX)
  965. ADDQ $0x02, AX
  966. JMP match_nolit_emitcopy_end_encodeBlockAsm
  967. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
  968. XORQ DI, DI
  969. LEAL 1(DI)(R10*4), R10
  970. MOVB SI, 1(AX)
  971. SARL $0x08, SI
  972. SHLL $0x05, SI
  973. ORL SI, R10
  974. MOVB R10, (AX)
  975. ADDQ $0x02, AX
  976. JMP match_nolit_emitcopy_end_encodeBlockAsm
  977. long_offset_short_match_nolit_encodeBlockAsm:
  978. MOVB $0xee, (AX)
  979. MOVW SI, 1(AX)
  980. LEAL -60(R10), R10
  981. ADDQ $0x03, AX
  982. // emitRepeat
  983. emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
  984. MOVL R10, DI
  985. LEAL -4(R10), R10
  986. CMPL DI, $0x08
  987. JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
  988. CMPL DI, $0x0c
  989. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
  990. CMPL SI, $0x00000800
  991. JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
  992. cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
  993. CMPL R10, $0x00000104
  994. JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
  995. CMPL R10, $0x00010100
  996. JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
  997. CMPL R10, $0x0100ffff
  998. JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
  999. LEAL -16842747(R10), R10
  1000. MOVW $0x001d, (AX)
  1001. MOVW $0xfffb, 2(AX)
  1002. MOVB $0xff, 4(AX)
  1003. ADDQ $0x05, AX
  1004. JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
  1005. repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
  1006. LEAL -65536(R10), R10
  1007. MOVL R10, SI
  1008. MOVW $0x001d, (AX)
  1009. MOVW R10, 2(AX)
  1010. SARL $0x10, SI
  1011. MOVB SI, 4(AX)
  1012. ADDQ $0x05, AX
  1013. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1014. repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
  1015. LEAL -256(R10), R10
  1016. MOVW $0x0019, (AX)
  1017. MOVW R10, 2(AX)
  1018. ADDQ $0x04, AX
  1019. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1020. repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
  1021. LEAL -4(R10), R10
  1022. MOVW $0x0015, (AX)
  1023. MOVB R10, 2(AX)
  1024. ADDQ $0x03, AX
  1025. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1026. repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
  1027. SHLL $0x02, R10
  1028. ORL $0x01, R10
  1029. MOVW R10, (AX)
  1030. ADDQ $0x02, AX
  1031. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1032. repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
  1033. XORQ DI, DI
  1034. LEAL 1(DI)(R10*4), R10
  1035. MOVB SI, 1(AX)
  1036. SARL $0x08, SI
  1037. SHLL $0x05, SI
  1038. ORL SI, R10
  1039. MOVB R10, (AX)
  1040. ADDQ $0x02, AX
  1041. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1042. JMP two_byte_offset_match_nolit_encodeBlockAsm
  1043. two_byte_offset_short_match_nolit_encodeBlockAsm:
  1044. CMPL R10, $0x0c
  1045. JGE emit_copy_three_match_nolit_encodeBlockAsm
  1046. CMPL SI, $0x00000800
  1047. JGE emit_copy_three_match_nolit_encodeBlockAsm
  1048. MOVB $0x01, BL
  1049. LEAL -16(BX)(R10*4), R10
  1050. MOVB SI, 1(AX)
  1051. SHRL $0x08, SI
  1052. SHLL $0x05, SI
  1053. ORL SI, R10
  1054. MOVB R10, (AX)
  1055. ADDQ $0x02, AX
  1056. JMP match_nolit_emitcopy_end_encodeBlockAsm
  1057. emit_copy_three_match_nolit_encodeBlockAsm:
  1058. MOVB $0x02, BL
  1059. LEAL -4(BX)(R10*4), R10
  1060. MOVB R10, (AX)
  1061. MOVW SI, 1(AX)
  1062. ADDQ $0x03, AX
  1063. match_nolit_emitcopy_end_encodeBlockAsm:
  1064. CMPL CX, 8(SP)
  1065. JGE emit_remainder_encodeBlockAsm
  1066. MOVQ -2(DX)(CX*1), DI
  1067. CMPQ AX, (SP)
  1068. JL match_nolit_dst_ok_encodeBlockAsm
  1069. MOVQ $0x00000000, ret+48(FP)
  1070. RET
  1071. match_nolit_dst_ok_encodeBlockAsm:
  1072. MOVQ $0x0000cf1bbcdcbf9b, R9
  1073. MOVQ DI, R8
  1074. SHRQ $0x10, DI
  1075. MOVQ DI, SI
  1076. SHLQ $0x10, R8
  1077. IMULQ R9, R8
  1078. SHRQ $0x32, R8
  1079. SHLQ $0x10, SI
  1080. IMULQ R9, SI
  1081. SHRQ $0x32, SI
  1082. LEAL -2(CX), R9
  1083. LEAQ 24(SP)(SI*4), R10
  1084. MOVL (R10), SI
  1085. MOVL R9, 24(SP)(R8*4)
  1086. MOVL CX, (R10)
  1087. CMPL (DX)(SI*1), DI
  1088. JEQ match_nolit_loop_encodeBlockAsm
  1089. INCL CX
  1090. JMP search_loop_encodeBlockAsm
  1091. emit_remainder_encodeBlockAsm:
  1092. MOVQ src_len+32(FP), CX
  1093. SUBL 12(SP), CX
  1094. LEAQ 5(AX)(CX*1), CX
  1095. CMPQ CX, (SP)
  1096. JL emit_remainder_ok_encodeBlockAsm
  1097. MOVQ $0x00000000, ret+48(FP)
  1098. RET
  1099. emit_remainder_ok_encodeBlockAsm:
  1100. MOVQ src_len+32(FP), CX
  1101. MOVL 12(SP), BX
  1102. CMPL BX, CX
  1103. JEQ emit_literal_done_emit_remainder_encodeBlockAsm
  1104. MOVL CX, SI
  1105. MOVL CX, 12(SP)
  1106. LEAQ (DX)(BX*1), CX
  1107. SUBL BX, SI
  1108. LEAL -1(SI), DX
  1109. CMPL DX, $0x3c
  1110. JLT one_byte_emit_remainder_encodeBlockAsm
  1111. CMPL DX, $0x00000100
  1112. JLT two_bytes_emit_remainder_encodeBlockAsm
  1113. CMPL DX, $0x00010000
  1114. JLT three_bytes_emit_remainder_encodeBlockAsm
  1115. CMPL DX, $0x01000000
  1116. JLT four_bytes_emit_remainder_encodeBlockAsm
  1117. MOVB $0xfc, (AX)
  1118. MOVL DX, 1(AX)
  1119. ADDQ $0x05, AX
  1120. JMP memmove_long_emit_remainder_encodeBlockAsm
  1121. four_bytes_emit_remainder_encodeBlockAsm:
  1122. MOVL DX, BX
  1123. SHRL $0x10, BX
  1124. MOVB $0xf8, (AX)
  1125. MOVW DX, 1(AX)
  1126. MOVB BL, 3(AX)
  1127. ADDQ $0x04, AX
  1128. JMP memmove_long_emit_remainder_encodeBlockAsm
  1129. three_bytes_emit_remainder_encodeBlockAsm:
  1130. MOVB $0xf4, (AX)
  1131. MOVW DX, 1(AX)
  1132. ADDQ $0x03, AX
  1133. JMP memmove_long_emit_remainder_encodeBlockAsm
  1134. two_bytes_emit_remainder_encodeBlockAsm:
  1135. MOVB $0xf0, (AX)
  1136. MOVB DL, 1(AX)
  1137. ADDQ $0x02, AX
  1138. CMPL DX, $0x40
  1139. JL memmove_emit_remainder_encodeBlockAsm
  1140. JMP memmove_long_emit_remainder_encodeBlockAsm
  1141. one_byte_emit_remainder_encodeBlockAsm:
  1142. SHLB $0x02, DL
  1143. MOVB DL, (AX)
  1144. ADDQ $0x01, AX
  1145. memmove_emit_remainder_encodeBlockAsm:
  1146. LEAQ (AX)(SI*1), DX
  1147. MOVL SI, BX
  1148. // genMemMoveShort
  1149. CMPQ BX, $0x03
  1150. JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
  1151. JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
  1152. CMPQ BX, $0x08
  1153. JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
  1154. CMPQ BX, $0x10
  1155. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
  1156. CMPQ BX, $0x20
  1157. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
  1158. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
  1159. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
  1160. MOVB (CX), SI
  1161. MOVB -1(CX)(BX*1), CL
  1162. MOVB SI, (AX)
  1163. MOVB CL, -1(AX)(BX*1)
  1164. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1165. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
  1166. MOVW (CX), SI
  1167. MOVB 2(CX), CL
  1168. MOVW SI, (AX)
  1169. MOVB CL, 2(AX)
  1170. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1171. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
  1172. MOVL (CX), SI
  1173. MOVL -4(CX)(BX*1), CX
  1174. MOVL SI, (AX)
  1175. MOVL CX, -4(AX)(BX*1)
  1176. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1177. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
  1178. MOVQ (CX), SI
  1179. MOVQ -8(CX)(BX*1), CX
  1180. MOVQ SI, (AX)
  1181. MOVQ CX, -8(AX)(BX*1)
  1182. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1183. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
  1184. MOVOU (CX), X0
  1185. MOVOU -16(CX)(BX*1), X1
  1186. MOVOU X0, (AX)
  1187. MOVOU X1, -16(AX)(BX*1)
  1188. JMP memmove_end_copy_emit_remainder_encodeBlockAsm
  1189. emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
  1190. MOVOU (CX), X0
  1191. MOVOU 16(CX), X1
  1192. MOVOU -32(CX)(BX*1), X2
  1193. MOVOU -16(CX)(BX*1), X3
  1194. MOVOU X0, (AX)
  1195. MOVOU X1, 16(AX)
  1196. MOVOU X2, -32(AX)(BX*1)
  1197. MOVOU X3, -16(AX)(BX*1)
  1198. memmove_end_copy_emit_remainder_encodeBlockAsm:
  1199. MOVQ DX, AX
  1200. JMP emit_literal_done_emit_remainder_encodeBlockAsm
  1201. memmove_long_emit_remainder_encodeBlockAsm:
  1202. LEAQ (AX)(SI*1), DX
  1203. MOVL SI, BX
  1204. // genMemMoveLong
  1205. MOVOU (CX), X0
  1206. MOVOU 16(CX), X1
  1207. MOVOU -32(CX)(BX*1), X2
  1208. MOVOU -16(CX)(BX*1), X3
  1209. MOVQ BX, DI
  1210. SHRQ $0x05, DI
  1211. MOVQ AX, SI
  1212. ANDL $0x0000001f, SI
  1213. MOVQ $0x00000040, R8
  1214. SUBQ SI, R8
  1215. DECQ DI
  1216. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
  1217. LEAQ -32(CX)(R8*1), SI
  1218. LEAQ -32(AX)(R8*1), R9
  1219. emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
  1220. MOVOU (SI), X4
  1221. MOVOU 16(SI), X5
  1222. MOVOA X4, (R9)
  1223. MOVOA X5, 16(R9)
  1224. ADDQ $0x20, R9
  1225. ADDQ $0x20, SI
  1226. ADDQ $0x20, R8
  1227. DECQ DI
  1228. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
  1229. emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
  1230. MOVOU -32(CX)(R8*1), X4
  1231. MOVOU -16(CX)(R8*1), X5
  1232. MOVOA X4, -32(AX)(R8*1)
  1233. MOVOA X5, -16(AX)(R8*1)
  1234. ADDQ $0x20, R8
  1235. CMPQ BX, R8
  1236. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
  1237. MOVOU X0, (AX)
  1238. MOVOU X1, 16(AX)
  1239. MOVOU X2, -32(AX)(BX*1)
  1240. MOVOU X3, -16(AX)(BX*1)
  1241. MOVQ DX, AX
  1242. emit_literal_done_emit_remainder_encodeBlockAsm:
  1243. MOVQ dst_base+0(FP), CX
  1244. SUBQ CX, AX
  1245. MOVQ AX, ret+48(FP)
  1246. RET
  1247. // func encodeBlockAsm4MB(dst []byte, src []byte) int
  1248. // Requires: BMI, SSE2
  1249. TEXT ·encodeBlockAsm4MB(SB), $65560-56
  1250. MOVQ dst_base+0(FP), AX
  1251. MOVQ $0x00000200, CX
  1252. LEAQ 24(SP), DX
  1253. PXOR X0, X0
  1254. zero_loop_encodeBlockAsm4MB:
  1255. MOVOU X0, (DX)
  1256. MOVOU X0, 16(DX)
  1257. MOVOU X0, 32(DX)
  1258. MOVOU X0, 48(DX)
  1259. MOVOU X0, 64(DX)
  1260. MOVOU X0, 80(DX)
  1261. MOVOU X0, 96(DX)
  1262. MOVOU X0, 112(DX)
  1263. ADDQ $0x80, DX
  1264. DECQ CX
  1265. JNZ zero_loop_encodeBlockAsm4MB
  1266. MOVL $0x00000000, 12(SP)
  1267. MOVQ src_len+32(FP), CX
  1268. LEAQ -9(CX), DX
  1269. LEAQ -8(CX), SI
  1270. MOVL SI, 8(SP)
  1271. SHRQ $0x05, CX
  1272. SUBL CX, DX
  1273. LEAQ (AX)(DX*1), DX
  1274. MOVQ DX, (SP)
  1275. MOVL $0x00000001, CX
  1276. MOVL CX, 16(SP)
  1277. MOVQ src_base+24(FP), DX
  1278. search_loop_encodeBlockAsm4MB:
  1279. MOVL CX, SI
  1280. SUBL 12(SP), SI
  1281. SHRL $0x06, SI
  1282. LEAL 4(CX)(SI*1), SI
  1283. CMPL SI, 8(SP)
  1284. JGE emit_remainder_encodeBlockAsm4MB
  1285. MOVQ (DX)(CX*1), DI
  1286. MOVL SI, 20(SP)
  1287. MOVQ $0x0000cf1bbcdcbf9b, R9
  1288. MOVQ DI, R10
  1289. MOVQ DI, R11
  1290. SHRQ $0x08, R11
  1291. SHLQ $0x10, R10
  1292. IMULQ R9, R10
  1293. SHRQ $0x32, R10
  1294. SHLQ $0x10, R11
  1295. IMULQ R9, R11
  1296. SHRQ $0x32, R11
  1297. MOVL 24(SP)(R10*4), SI
  1298. MOVL 24(SP)(R11*4), R8
  1299. MOVL CX, 24(SP)(R10*4)
  1300. LEAL 1(CX), R10
  1301. MOVL R10, 24(SP)(R11*4)
  1302. MOVQ DI, R10
  1303. SHRQ $0x10, R10
  1304. SHLQ $0x10, R10
  1305. IMULQ R9, R10
  1306. SHRQ $0x32, R10
  1307. MOVL CX, R9
  1308. SUBL 16(SP), R9
  1309. MOVL 1(DX)(R9*1), R11
  1310. MOVQ DI, R9
  1311. SHRQ $0x08, R9
  1312. CMPL R9, R11
  1313. JNE no_repeat_found_encodeBlockAsm4MB
  1314. LEAL 1(CX), DI
  1315. MOVL 12(SP), R8
  1316. MOVL DI, SI
  1317. SUBL 16(SP), SI
  1318. JZ repeat_extend_back_end_encodeBlockAsm4MB
  1319. repeat_extend_back_loop_encodeBlockAsm4MB:
  1320. CMPL DI, R8
  1321. JLE repeat_extend_back_end_encodeBlockAsm4MB
  1322. MOVB -1(DX)(SI*1), BL
  1323. MOVB -1(DX)(DI*1), R9
  1324. CMPB BL, R9
  1325. JNE repeat_extend_back_end_encodeBlockAsm4MB
  1326. LEAL -1(DI), DI
  1327. DECL SI
  1328. JNZ repeat_extend_back_loop_encodeBlockAsm4MB
  1329. repeat_extend_back_end_encodeBlockAsm4MB:
  1330. MOVL 12(SP), SI
  1331. CMPL SI, DI
  1332. JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
  1333. MOVL DI, R9
  1334. MOVL DI, 12(SP)
  1335. LEAQ (DX)(SI*1), R10
  1336. SUBL SI, R9
  1337. LEAL -1(R9), SI
  1338. CMPL SI, $0x3c
  1339. JLT one_byte_repeat_emit_encodeBlockAsm4MB
  1340. CMPL SI, $0x00000100
  1341. JLT two_bytes_repeat_emit_encodeBlockAsm4MB
  1342. CMPL SI, $0x00010000
  1343. JLT three_bytes_repeat_emit_encodeBlockAsm4MB
  1344. MOVL SI, R11
  1345. SHRL $0x10, R11
  1346. MOVB $0xf8, (AX)
  1347. MOVW SI, 1(AX)
  1348. MOVB R11, 3(AX)
  1349. ADDQ $0x04, AX
  1350. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1351. three_bytes_repeat_emit_encodeBlockAsm4MB:
  1352. MOVB $0xf4, (AX)
  1353. MOVW SI, 1(AX)
  1354. ADDQ $0x03, AX
  1355. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1356. two_bytes_repeat_emit_encodeBlockAsm4MB:
  1357. MOVB $0xf0, (AX)
  1358. MOVB SI, 1(AX)
  1359. ADDQ $0x02, AX
  1360. CMPL SI, $0x40
  1361. JL memmove_repeat_emit_encodeBlockAsm4MB
  1362. JMP memmove_long_repeat_emit_encodeBlockAsm4MB
  1363. one_byte_repeat_emit_encodeBlockAsm4MB:
  1364. SHLB $0x02, SI
  1365. MOVB SI, (AX)
  1366. ADDQ $0x01, AX
  1367. memmove_repeat_emit_encodeBlockAsm4MB:
  1368. LEAQ (AX)(R9*1), SI
  1369. // genMemMoveShort
  1370. CMPQ R9, $0x08
  1371. JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
  1372. CMPQ R9, $0x10
  1373. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
  1374. CMPQ R9, $0x20
  1375. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
  1376. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
  1377. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
  1378. MOVQ (R10), R11
  1379. MOVQ R11, (AX)
  1380. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1381. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
  1382. MOVQ (R10), R11
  1383. MOVQ -8(R10)(R9*1), R10
  1384. MOVQ R11, (AX)
  1385. MOVQ R10, -8(AX)(R9*1)
  1386. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1387. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
  1388. MOVOU (R10), X0
  1389. MOVOU -16(R10)(R9*1), X1
  1390. MOVOU X0, (AX)
  1391. MOVOU X1, -16(AX)(R9*1)
  1392. JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
  1393. emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
  1394. MOVOU (R10), X0
  1395. MOVOU 16(R10), X1
  1396. MOVOU -32(R10)(R9*1), X2
  1397. MOVOU -16(R10)(R9*1), X3
  1398. MOVOU X0, (AX)
  1399. MOVOU X1, 16(AX)
  1400. MOVOU X2, -32(AX)(R9*1)
  1401. MOVOU X3, -16(AX)(R9*1)
  1402. memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
  1403. MOVQ SI, AX
  1404. JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
  1405. memmove_long_repeat_emit_encodeBlockAsm4MB:
  1406. LEAQ (AX)(R9*1), SI
  1407. // genMemMoveLong
  1408. MOVOU (R10), X0
  1409. MOVOU 16(R10), X1
  1410. MOVOU -32(R10)(R9*1), X2
  1411. MOVOU -16(R10)(R9*1), X3
  1412. MOVQ R9, R12
  1413. SHRQ $0x05, R12
  1414. MOVQ AX, R11
  1415. ANDL $0x0000001f, R11
  1416. MOVQ $0x00000040, R13
  1417. SUBQ R11, R13
  1418. DECQ R12
  1419. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1420. LEAQ -32(R10)(R13*1), R11
  1421. LEAQ -32(AX)(R13*1), R14
  1422. emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
  1423. MOVOU (R11), X4
  1424. MOVOU 16(R11), X5
  1425. MOVOA X4, (R14)
  1426. MOVOA X5, 16(R14)
  1427. ADDQ $0x20, R14
  1428. ADDQ $0x20, R11
  1429. ADDQ $0x20, R13
  1430. DECQ R12
  1431. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
  1432. emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  1433. MOVOU -32(R10)(R13*1), X4
  1434. MOVOU -16(R10)(R13*1), X5
  1435. MOVOA X4, -32(AX)(R13*1)
  1436. MOVOA X5, -16(AX)(R13*1)
  1437. ADDQ $0x20, R13
  1438. CMPQ R9, R13
  1439. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1440. MOVOU X0, (AX)
  1441. MOVOU X1, 16(AX)
  1442. MOVOU X2, -32(AX)(R9*1)
  1443. MOVOU X3, -16(AX)(R9*1)
  1444. MOVQ SI, AX
  1445. emit_literal_done_repeat_emit_encodeBlockAsm4MB:
  1446. ADDL $0x05, CX
  1447. MOVL CX, SI
  1448. SUBL 16(SP), SI
  1449. MOVQ src_len+32(FP), R9
  1450. SUBL CX, R9
  1451. LEAQ (DX)(CX*1), R10
  1452. LEAQ (DX)(SI*1), SI
  1453. // matchLen
  1454. XORL R12, R12
  1455. CMPL R9, $0x08
  1456. JL matchlen_match4_repeat_extend_encodeBlockAsm4MB
  1457. matchlen_loopback_repeat_extend_encodeBlockAsm4MB:
  1458. MOVQ (R10)(R12*1), R11
  1459. XORQ (SI)(R12*1), R11
  1460. TESTQ R11, R11
  1461. JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB
  1462. #ifdef GOAMD64_v3
  1463. TZCNTQ R11, R11
  1464. #else
  1465. BSFQ R11, R11
  1466. #endif
  1467. SARQ $0x03, R11
  1468. LEAL (R12)(R11*1), R12
  1469. JMP repeat_extend_forward_end_encodeBlockAsm4MB
  1470. matchlen_loop_repeat_extend_encodeBlockAsm4MB:
  1471. LEAL -8(R9), R9
  1472. LEAL 8(R12), R12
  1473. CMPL R9, $0x08
  1474. JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB
  1475. JZ repeat_extend_forward_end_encodeBlockAsm4MB
  1476. matchlen_match4_repeat_extend_encodeBlockAsm4MB:
  1477. CMPL R9, $0x04
  1478. JL matchlen_match2_repeat_extend_encodeBlockAsm4MB
  1479. MOVL (R10)(R12*1), R11
  1480. CMPL (SI)(R12*1), R11
  1481. JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
  1482. SUBL $0x04, R9
  1483. LEAL 4(R12), R12
  1484. matchlen_match2_repeat_extend_encodeBlockAsm4MB:
  1485. CMPL R9, $0x02
  1486. JL matchlen_match1_repeat_extend_encodeBlockAsm4MB
  1487. MOVW (R10)(R12*1), R11
  1488. CMPW (SI)(R12*1), R11
  1489. JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
  1490. SUBL $0x02, R9
  1491. LEAL 2(R12), R12
  1492. matchlen_match1_repeat_extend_encodeBlockAsm4MB:
  1493. CMPL R9, $0x01
  1494. JL repeat_extend_forward_end_encodeBlockAsm4MB
  1495. MOVB (R10)(R12*1), R11
  1496. CMPB (SI)(R12*1), R11
  1497. JNE repeat_extend_forward_end_encodeBlockAsm4MB
  1498. LEAL 1(R12), R12
  1499. repeat_extend_forward_end_encodeBlockAsm4MB:
  1500. ADDL R12, CX
  1501. MOVL CX, SI
  1502. SUBL DI, SI
  1503. MOVL 16(SP), DI
  1504. TESTL R8, R8
  1505. JZ repeat_as_copy_encodeBlockAsm4MB
  1506. // emitRepeat
  1507. MOVL SI, R8
  1508. LEAL -4(SI), SI
  1509. CMPL R8, $0x08
  1510. JLE repeat_two_match_repeat_encodeBlockAsm4MB
  1511. CMPL R8, $0x0c
  1512. JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
  1513. CMPL DI, $0x00000800
  1514. JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB
  1515. cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
  1516. CMPL SI, $0x00000104
  1517. JLT repeat_three_match_repeat_encodeBlockAsm4MB
  1518. CMPL SI, $0x00010100
  1519. JLT repeat_four_match_repeat_encodeBlockAsm4MB
  1520. LEAL -65536(SI), SI
  1521. MOVL SI, DI
  1522. MOVW $0x001d, (AX)
  1523. MOVW SI, 2(AX)
  1524. SARL $0x10, DI
  1525. MOVB DI, 4(AX)
  1526. ADDQ $0x05, AX
  1527. JMP repeat_end_emit_encodeBlockAsm4MB
  1528. repeat_four_match_repeat_encodeBlockAsm4MB:
  1529. LEAL -256(SI), SI
  1530. MOVW $0x0019, (AX)
  1531. MOVW SI, 2(AX)
  1532. ADDQ $0x04, AX
  1533. JMP repeat_end_emit_encodeBlockAsm4MB
  1534. repeat_three_match_repeat_encodeBlockAsm4MB:
  1535. LEAL -4(SI), SI
  1536. MOVW $0x0015, (AX)
  1537. MOVB SI, 2(AX)
  1538. ADDQ $0x03, AX
  1539. JMP repeat_end_emit_encodeBlockAsm4MB
  1540. repeat_two_match_repeat_encodeBlockAsm4MB:
  1541. SHLL $0x02, SI
  1542. ORL $0x01, SI
  1543. MOVW SI, (AX)
  1544. ADDQ $0x02, AX
  1545. JMP repeat_end_emit_encodeBlockAsm4MB
  1546. repeat_two_offset_match_repeat_encodeBlockAsm4MB:
  1547. XORQ R8, R8
  1548. LEAL 1(R8)(SI*4), SI
  1549. MOVB DI, 1(AX)
  1550. SARL $0x08, DI
  1551. SHLL $0x05, DI
  1552. ORL DI, SI
  1553. MOVB SI, (AX)
  1554. ADDQ $0x02, AX
  1555. JMP repeat_end_emit_encodeBlockAsm4MB
  1556. repeat_as_copy_encodeBlockAsm4MB:
  1557. // emitCopy
  1558. CMPL DI, $0x00010000
  1559. JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
  1560. four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB:
  1561. CMPL SI, $0x40
  1562. JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
  1563. MOVB $0xff, (AX)
  1564. MOVL DI, 1(AX)
  1565. LEAL -64(SI), SI
  1566. ADDQ $0x05, AX
  1567. CMPL SI, $0x04
  1568. JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
  1569. // emitRepeat
  1570. MOVL SI, R8
  1571. LEAL -4(SI), SI
  1572. CMPL R8, $0x08
  1573. JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1574. CMPL R8, $0x0c
  1575. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1576. CMPL DI, $0x00000800
  1577. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1578. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1579. CMPL SI, $0x00000104
  1580. JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1581. CMPL SI, $0x00010100
  1582. JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
  1583. LEAL -65536(SI), SI
  1584. MOVL SI, DI
  1585. MOVW $0x001d, (AX)
  1586. MOVW SI, 2(AX)
  1587. SARL $0x10, DI
  1588. MOVB DI, 4(AX)
  1589. ADDQ $0x05, AX
  1590. JMP repeat_end_emit_encodeBlockAsm4MB
  1591. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1592. LEAL -256(SI), SI
  1593. MOVW $0x0019, (AX)
  1594. MOVW SI, 2(AX)
  1595. ADDQ $0x04, AX
  1596. JMP repeat_end_emit_encodeBlockAsm4MB
  1597. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1598. LEAL -4(SI), SI
  1599. MOVW $0x0015, (AX)
  1600. MOVB SI, 2(AX)
  1601. ADDQ $0x03, AX
  1602. JMP repeat_end_emit_encodeBlockAsm4MB
  1603. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1604. SHLL $0x02, SI
  1605. ORL $0x01, SI
  1606. MOVW SI, (AX)
  1607. ADDQ $0x02, AX
  1608. JMP repeat_end_emit_encodeBlockAsm4MB
  1609. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
  1610. XORQ R8, R8
  1611. LEAL 1(R8)(SI*4), SI
  1612. MOVB DI, 1(AX)
  1613. SARL $0x08, DI
  1614. SHLL $0x05, DI
  1615. ORL DI, SI
  1616. MOVB SI, (AX)
  1617. ADDQ $0x02, AX
  1618. JMP repeat_end_emit_encodeBlockAsm4MB
  1619. JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm4MB
  1620. four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
  1621. TESTL SI, SI
  1622. JZ repeat_end_emit_encodeBlockAsm4MB
  1623. MOVB $0x03, BL
  1624. LEAL -4(BX)(SI*4), SI
  1625. MOVB SI, (AX)
  1626. MOVL DI, 1(AX)
  1627. ADDQ $0x05, AX
  1628. JMP repeat_end_emit_encodeBlockAsm4MB
  1629. two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
  1630. CMPL SI, $0x40
  1631. JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
  1632. CMPL DI, $0x00000800
  1633. JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
  1634. MOVL $0x00000001, R8
  1635. LEAL 16(R8), R8
  1636. MOVB DI, 1(AX)
  1637. SHRL $0x08, DI
  1638. SHLL $0x05, DI
  1639. ORL DI, R8
  1640. MOVB R8, (AX)
  1641. ADDQ $0x02, AX
  1642. SUBL $0x08, SI
  1643. // emitRepeat
  1644. LEAL -4(SI), SI
  1645. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1646. MOVL SI, R8
  1647. LEAL -4(SI), SI
  1648. CMPL R8, $0x08
  1649. JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1650. CMPL R8, $0x0c
  1651. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1652. CMPL DI, $0x00000800
  1653. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1654. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1655. CMPL SI, $0x00000104
  1656. JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1657. CMPL SI, $0x00010100
  1658. JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
  1659. LEAL -65536(SI), SI
  1660. MOVL SI, DI
  1661. MOVW $0x001d, (AX)
  1662. MOVW SI, 2(AX)
  1663. SARL $0x10, DI
  1664. MOVB DI, 4(AX)
  1665. ADDQ $0x05, AX
  1666. JMP repeat_end_emit_encodeBlockAsm4MB
  1667. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1668. LEAL -256(SI), SI
  1669. MOVW $0x0019, (AX)
  1670. MOVW SI, 2(AX)
  1671. ADDQ $0x04, AX
  1672. JMP repeat_end_emit_encodeBlockAsm4MB
  1673. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1674. LEAL -4(SI), SI
  1675. MOVW $0x0015, (AX)
  1676. MOVB SI, 2(AX)
  1677. ADDQ $0x03, AX
  1678. JMP repeat_end_emit_encodeBlockAsm4MB
  1679. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1680. SHLL $0x02, SI
  1681. ORL $0x01, SI
  1682. MOVW SI, (AX)
  1683. ADDQ $0x02, AX
  1684. JMP repeat_end_emit_encodeBlockAsm4MB
  1685. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
  1686. XORQ R8, R8
  1687. LEAL 1(R8)(SI*4), SI
  1688. MOVB DI, 1(AX)
  1689. SARL $0x08, DI
  1690. SHLL $0x05, DI
  1691. ORL DI, SI
  1692. MOVB SI, (AX)
  1693. ADDQ $0x02, AX
  1694. JMP repeat_end_emit_encodeBlockAsm4MB
  1695. long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
  1696. MOVB $0xee, (AX)
  1697. MOVW DI, 1(AX)
  1698. LEAL -60(SI), SI
  1699. ADDQ $0x03, AX
  1700. // emitRepeat
  1701. MOVL SI, R8
  1702. LEAL -4(SI), SI
  1703. CMPL R8, $0x08
  1704. JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1705. CMPL R8, $0x0c
  1706. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1707. CMPL DI, $0x00000800
  1708. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1709. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1710. CMPL SI, $0x00000104
  1711. JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1712. CMPL SI, $0x00010100
  1713. JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
  1714. LEAL -65536(SI), SI
  1715. MOVL SI, DI
  1716. MOVW $0x001d, (AX)
  1717. MOVW SI, 2(AX)
  1718. SARL $0x10, DI
  1719. MOVB DI, 4(AX)
  1720. ADDQ $0x05, AX
  1721. JMP repeat_end_emit_encodeBlockAsm4MB
  1722. repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1723. LEAL -256(SI), SI
  1724. MOVW $0x0019, (AX)
  1725. MOVW SI, 2(AX)
  1726. ADDQ $0x04, AX
  1727. JMP repeat_end_emit_encodeBlockAsm4MB
  1728. repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1729. LEAL -4(SI), SI
  1730. MOVW $0x0015, (AX)
  1731. MOVB SI, 2(AX)
  1732. ADDQ $0x03, AX
  1733. JMP repeat_end_emit_encodeBlockAsm4MB
  1734. repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1735. SHLL $0x02, SI
  1736. ORL $0x01, SI
  1737. MOVW SI, (AX)
  1738. ADDQ $0x02, AX
  1739. JMP repeat_end_emit_encodeBlockAsm4MB
  1740. repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
  1741. XORQ R8, R8
  1742. LEAL 1(R8)(SI*4), SI
  1743. MOVB DI, 1(AX)
  1744. SARL $0x08, DI
  1745. SHLL $0x05, DI
  1746. ORL DI, SI
  1747. MOVB SI, (AX)
  1748. ADDQ $0x02, AX
  1749. JMP repeat_end_emit_encodeBlockAsm4MB
  1750. JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
  1751. two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
  1752. CMPL SI, $0x0c
  1753. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
  1754. CMPL DI, $0x00000800
  1755. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
  1756. MOVB $0x01, BL
  1757. LEAL -16(BX)(SI*4), SI
  1758. MOVB DI, 1(AX)
  1759. SHRL $0x08, DI
  1760. SHLL $0x05, DI
  1761. ORL DI, SI
  1762. MOVB SI, (AX)
  1763. ADDQ $0x02, AX
  1764. JMP repeat_end_emit_encodeBlockAsm4MB
  1765. emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
  1766. MOVB $0x02, BL
  1767. LEAL -4(BX)(SI*4), SI
  1768. MOVB SI, (AX)
  1769. MOVW DI, 1(AX)
  1770. ADDQ $0x03, AX
  1771. repeat_end_emit_encodeBlockAsm4MB:
  1772. MOVL CX, 12(SP)
  1773. JMP search_loop_encodeBlockAsm4MB
  1774. no_repeat_found_encodeBlockAsm4MB:
  1775. CMPL (DX)(SI*1), DI
  1776. JEQ candidate_match_encodeBlockAsm4MB
  1777. SHRQ $0x08, DI
  1778. MOVL 24(SP)(R10*4), SI
  1779. LEAL 2(CX), R9
  1780. CMPL (DX)(R8*1), DI
  1781. JEQ candidate2_match_encodeBlockAsm4MB
  1782. MOVL R9, 24(SP)(R10*4)
  1783. SHRQ $0x08, DI
  1784. CMPL (DX)(SI*1), DI
  1785. JEQ candidate3_match_encodeBlockAsm4MB
  1786. MOVL 20(SP), CX
  1787. JMP search_loop_encodeBlockAsm4MB
  1788. candidate3_match_encodeBlockAsm4MB:
  1789. ADDL $0x02, CX
  1790. JMP candidate_match_encodeBlockAsm4MB
  1791. candidate2_match_encodeBlockAsm4MB:
  1792. MOVL R9, 24(SP)(R10*4)
  1793. INCL CX
  1794. MOVL R8, SI
  1795. candidate_match_encodeBlockAsm4MB:
  1796. MOVL 12(SP), DI
  1797. TESTL SI, SI
  1798. JZ match_extend_back_end_encodeBlockAsm4MB
  1799. match_extend_back_loop_encodeBlockAsm4MB:
  1800. CMPL CX, DI
  1801. JLE match_extend_back_end_encodeBlockAsm4MB
  1802. MOVB -1(DX)(SI*1), BL
  1803. MOVB -1(DX)(CX*1), R8
  1804. CMPB BL, R8
  1805. JNE match_extend_back_end_encodeBlockAsm4MB
  1806. LEAL -1(CX), CX
  1807. DECL SI
  1808. JZ match_extend_back_end_encodeBlockAsm4MB
  1809. JMP match_extend_back_loop_encodeBlockAsm4MB
  1810. match_extend_back_end_encodeBlockAsm4MB:
  1811. MOVL CX, DI
  1812. SUBL 12(SP), DI
  1813. LEAQ 4(AX)(DI*1), DI
  1814. CMPQ DI, (SP)
  1815. JL match_dst_size_check_encodeBlockAsm4MB
  1816. MOVQ $0x00000000, ret+48(FP)
  1817. RET
  1818. match_dst_size_check_encodeBlockAsm4MB:
  1819. MOVL CX, DI
  1820. MOVL 12(SP), R8
  1821. CMPL R8, DI
  1822. JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
  1823. MOVL DI, R9
  1824. MOVL DI, 12(SP)
  1825. LEAQ (DX)(R8*1), DI
  1826. SUBL R8, R9
  1827. LEAL -1(R9), R8
  1828. CMPL R8, $0x3c
  1829. JLT one_byte_match_emit_encodeBlockAsm4MB
  1830. CMPL R8, $0x00000100
  1831. JLT two_bytes_match_emit_encodeBlockAsm4MB
  1832. CMPL R8, $0x00010000
  1833. JLT three_bytes_match_emit_encodeBlockAsm4MB
  1834. MOVL R8, R10
  1835. SHRL $0x10, R10
  1836. MOVB $0xf8, (AX)
  1837. MOVW R8, 1(AX)
  1838. MOVB R10, 3(AX)
  1839. ADDQ $0x04, AX
  1840. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1841. three_bytes_match_emit_encodeBlockAsm4MB:
  1842. MOVB $0xf4, (AX)
  1843. MOVW R8, 1(AX)
  1844. ADDQ $0x03, AX
  1845. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1846. two_bytes_match_emit_encodeBlockAsm4MB:
  1847. MOVB $0xf0, (AX)
  1848. MOVB R8, 1(AX)
  1849. ADDQ $0x02, AX
  1850. CMPL R8, $0x40
  1851. JL memmove_match_emit_encodeBlockAsm4MB
  1852. JMP memmove_long_match_emit_encodeBlockAsm4MB
  1853. one_byte_match_emit_encodeBlockAsm4MB:
  1854. SHLB $0x02, R8
  1855. MOVB R8, (AX)
  1856. ADDQ $0x01, AX
  1857. memmove_match_emit_encodeBlockAsm4MB:
  1858. LEAQ (AX)(R9*1), R8
  1859. // genMemMoveShort
  1860. CMPQ R9, $0x08
  1861. JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
  1862. CMPQ R9, $0x10
  1863. JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
  1864. CMPQ R9, $0x20
  1865. JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
  1866. JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
  1867. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
  1868. MOVQ (DI), R10
  1869. MOVQ R10, (AX)
  1870. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1871. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
  1872. MOVQ (DI), R10
  1873. MOVQ -8(DI)(R9*1), DI
  1874. MOVQ R10, (AX)
  1875. MOVQ DI, -8(AX)(R9*1)
  1876. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1877. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
  1878. MOVOU (DI), X0
  1879. MOVOU -16(DI)(R9*1), X1
  1880. MOVOU X0, (AX)
  1881. MOVOU X1, -16(AX)(R9*1)
  1882. JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
  1883. emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
  1884. MOVOU (DI), X0
  1885. MOVOU 16(DI), X1
  1886. MOVOU -32(DI)(R9*1), X2
  1887. MOVOU -16(DI)(R9*1), X3
  1888. MOVOU X0, (AX)
  1889. MOVOU X1, 16(AX)
  1890. MOVOU X2, -32(AX)(R9*1)
  1891. MOVOU X3, -16(AX)(R9*1)
  1892. memmove_end_copy_match_emit_encodeBlockAsm4MB:
  1893. MOVQ R8, AX
  1894. JMP emit_literal_done_match_emit_encodeBlockAsm4MB
  1895. memmove_long_match_emit_encodeBlockAsm4MB:
  1896. LEAQ (AX)(R9*1), R8
  1897. // genMemMoveLong
  1898. MOVOU (DI), X0
  1899. MOVOU 16(DI), X1
  1900. MOVOU -32(DI)(R9*1), X2
  1901. MOVOU -16(DI)(R9*1), X3
  1902. MOVQ R9, R11
  1903. SHRQ $0x05, R11
  1904. MOVQ AX, R10
  1905. ANDL $0x0000001f, R10
  1906. MOVQ $0x00000040, R12
  1907. SUBQ R10, R12
  1908. DECQ R11
  1909. JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1910. LEAQ -32(DI)(R12*1), R10
  1911. LEAQ -32(AX)(R12*1), R13
  1912. emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
  1913. MOVOU (R10), X4
  1914. MOVOU 16(R10), X5
  1915. MOVOA X4, (R13)
  1916. MOVOA X5, 16(R13)
  1917. ADDQ $0x20, R13
  1918. ADDQ $0x20, R10
  1919. ADDQ $0x20, R12
  1920. DECQ R11
  1921. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
  1922. emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  1923. MOVOU -32(DI)(R12*1), X4
  1924. MOVOU -16(DI)(R12*1), X5
  1925. MOVOA X4, -32(AX)(R12*1)
  1926. MOVOA X5, -16(AX)(R12*1)
  1927. ADDQ $0x20, R12
  1928. CMPQ R9, R12
  1929. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
  1930. MOVOU X0, (AX)
  1931. MOVOU X1, 16(AX)
  1932. MOVOU X2, -32(AX)(R9*1)
  1933. MOVOU X3, -16(AX)(R9*1)
  1934. MOVQ R8, AX
  1935. emit_literal_done_match_emit_encodeBlockAsm4MB:
  1936. match_nolit_loop_encodeBlockAsm4MB:
  1937. MOVL CX, DI
  1938. SUBL SI, DI
  1939. MOVL DI, 16(SP)
  1940. ADDL $0x04, CX
  1941. ADDL $0x04, SI
  1942. MOVQ src_len+32(FP), DI
  1943. SUBL CX, DI
  1944. LEAQ (DX)(CX*1), R8
  1945. LEAQ (DX)(SI*1), SI
  1946. // matchLen
  1947. XORL R10, R10
  1948. CMPL DI, $0x08
  1949. JL matchlen_match4_match_nolit_encodeBlockAsm4MB
  1950. matchlen_loopback_match_nolit_encodeBlockAsm4MB:
  1951. MOVQ (R8)(R10*1), R9
  1952. XORQ (SI)(R10*1), R9
  1953. TESTQ R9, R9
  1954. JZ matchlen_loop_match_nolit_encodeBlockAsm4MB
  1955. #ifdef GOAMD64_v3
  1956. TZCNTQ R9, R9
  1957. #else
  1958. BSFQ R9, R9
  1959. #endif
  1960. SARQ $0x03, R9
  1961. LEAL (R10)(R9*1), R10
  1962. JMP match_nolit_end_encodeBlockAsm4MB
  1963. matchlen_loop_match_nolit_encodeBlockAsm4MB:
  1964. LEAL -8(DI), DI
  1965. LEAL 8(R10), R10
  1966. CMPL DI, $0x08
  1967. JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB
  1968. JZ match_nolit_end_encodeBlockAsm4MB
  1969. matchlen_match4_match_nolit_encodeBlockAsm4MB:
  1970. CMPL DI, $0x04
  1971. JL matchlen_match2_match_nolit_encodeBlockAsm4MB
  1972. MOVL (R8)(R10*1), R9
  1973. CMPL (SI)(R10*1), R9
  1974. JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
  1975. SUBL $0x04, DI
  1976. LEAL 4(R10), R10
  1977. matchlen_match2_match_nolit_encodeBlockAsm4MB:
  1978. CMPL DI, $0x02
  1979. JL matchlen_match1_match_nolit_encodeBlockAsm4MB
  1980. MOVW (R8)(R10*1), R9
  1981. CMPW (SI)(R10*1), R9
  1982. JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
  1983. SUBL $0x02, DI
  1984. LEAL 2(R10), R10
  1985. matchlen_match1_match_nolit_encodeBlockAsm4MB:
  1986. CMPL DI, $0x01
  1987. JL match_nolit_end_encodeBlockAsm4MB
  1988. MOVB (R8)(R10*1), R9
  1989. CMPB (SI)(R10*1), R9
  1990. JNE match_nolit_end_encodeBlockAsm4MB
  1991. LEAL 1(R10), R10
  1992. match_nolit_end_encodeBlockAsm4MB:
  1993. ADDL R10, CX
  1994. MOVL 16(SP), SI
  1995. ADDL $0x04, R10
  1996. MOVL CX, 12(SP)
  1997. // emitCopy
  1998. CMPL SI, $0x00010000
  1999. JL two_byte_offset_match_nolit_encodeBlockAsm4MB
  2000. four_bytes_loop_back_match_nolit_encodeBlockAsm4MB:
  2001. CMPL R10, $0x40
  2002. JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB
  2003. MOVB $0xff, (AX)
  2004. MOVL SI, 1(AX)
  2005. LEAL -64(R10), R10
  2006. ADDQ $0x05, AX
  2007. CMPL R10, $0x04
  2008. JL four_bytes_remain_match_nolit_encodeBlockAsm4MB
  2009. // emitRepeat
  2010. MOVL R10, DI
  2011. LEAL -4(R10), R10
  2012. CMPL DI, $0x08
  2013. JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
  2014. CMPL DI, $0x0c
  2015. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
  2016. CMPL SI, $0x00000800
  2017. JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
  2018. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
  2019. CMPL R10, $0x00000104
  2020. JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
  2021. CMPL R10, $0x00010100
  2022. JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
  2023. LEAL -65536(R10), R10
  2024. MOVL R10, SI
  2025. MOVW $0x001d, (AX)
  2026. MOVW R10, 2(AX)
  2027. SARL $0x10, SI
  2028. MOVB SI, 4(AX)
  2029. ADDQ $0x05, AX
  2030. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2031. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
  2032. LEAL -256(R10), R10
  2033. MOVW $0x0019, (AX)
  2034. MOVW R10, 2(AX)
  2035. ADDQ $0x04, AX
  2036. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2037. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
  2038. LEAL -4(R10), R10
  2039. MOVW $0x0015, (AX)
  2040. MOVB R10, 2(AX)
  2041. ADDQ $0x03, AX
  2042. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2043. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
  2044. SHLL $0x02, R10
  2045. ORL $0x01, R10
  2046. MOVW R10, (AX)
  2047. ADDQ $0x02, AX
  2048. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2049. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
  2050. XORQ DI, DI
  2051. LEAL 1(DI)(R10*4), R10
  2052. MOVB SI, 1(AX)
  2053. SARL $0x08, SI
  2054. SHLL $0x05, SI
  2055. ORL SI, R10
  2056. MOVB R10, (AX)
  2057. ADDQ $0x02, AX
  2058. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2059. JMP four_bytes_loop_back_match_nolit_encodeBlockAsm4MB
  2060. four_bytes_remain_match_nolit_encodeBlockAsm4MB:
  2061. TESTL R10, R10
  2062. JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
  2063. MOVB $0x03, BL
  2064. LEAL -4(BX)(R10*4), R10
  2065. MOVB R10, (AX)
  2066. MOVL SI, 1(AX)
  2067. ADDQ $0x05, AX
  2068. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2069. two_byte_offset_match_nolit_encodeBlockAsm4MB:
  2070. CMPL R10, $0x40
  2071. JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
  2072. CMPL SI, $0x00000800
  2073. JAE long_offset_short_match_nolit_encodeBlockAsm4MB
  2074. MOVL $0x00000001, DI
  2075. LEAL 16(DI), DI
  2076. MOVB SI, 1(AX)
  2077. SHRL $0x08, SI
  2078. SHLL $0x05, SI
  2079. ORL SI, DI
  2080. MOVB DI, (AX)
  2081. ADDQ $0x02, AX
  2082. SUBL $0x08, R10
  2083. // emitRepeat
  2084. LEAL -4(R10), R10
  2085. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2086. MOVL R10, DI
  2087. LEAL -4(R10), R10
  2088. CMPL DI, $0x08
  2089. JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2090. CMPL DI, $0x0c
  2091. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2092. CMPL SI, $0x00000800
  2093. JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2094. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2095. CMPL R10, $0x00000104
  2096. JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2097. CMPL R10, $0x00010100
  2098. JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
  2099. LEAL -65536(R10), R10
  2100. MOVL R10, SI
  2101. MOVW $0x001d, (AX)
  2102. MOVW R10, 2(AX)
  2103. SARL $0x10, SI
  2104. MOVB SI, 4(AX)
  2105. ADDQ $0x05, AX
  2106. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2107. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2108. LEAL -256(R10), R10
  2109. MOVW $0x0019, (AX)
  2110. MOVW R10, 2(AX)
  2111. ADDQ $0x04, AX
  2112. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2113. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2114. LEAL -4(R10), R10
  2115. MOVW $0x0015, (AX)
  2116. MOVB R10, 2(AX)
  2117. ADDQ $0x03, AX
  2118. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2119. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2120. SHLL $0x02, R10
  2121. ORL $0x01, R10
  2122. MOVW R10, (AX)
  2123. ADDQ $0x02, AX
  2124. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2125. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
  2126. XORQ DI, DI
  2127. LEAL 1(DI)(R10*4), R10
  2128. MOVB SI, 1(AX)
  2129. SARL $0x08, SI
  2130. SHLL $0x05, SI
  2131. ORL SI, R10
  2132. MOVB R10, (AX)
  2133. ADDQ $0x02, AX
  2134. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2135. long_offset_short_match_nolit_encodeBlockAsm4MB:
  2136. MOVB $0xee, (AX)
  2137. MOVW SI, 1(AX)
  2138. LEAL -60(R10), R10
  2139. ADDQ $0x03, AX
  2140. // emitRepeat
  2141. MOVL R10, DI
  2142. LEAL -4(R10), R10
  2143. CMPL DI, $0x08
  2144. JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2145. CMPL DI, $0x0c
  2146. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2147. CMPL SI, $0x00000800
  2148. JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2149. cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2150. CMPL R10, $0x00000104
  2151. JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2152. CMPL R10, $0x00010100
  2153. JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
  2154. LEAL -65536(R10), R10
  2155. MOVL R10, SI
  2156. MOVW $0x001d, (AX)
  2157. MOVW R10, 2(AX)
  2158. SARL $0x10, SI
  2159. MOVB SI, 4(AX)
  2160. ADDQ $0x05, AX
  2161. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2162. repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2163. LEAL -256(R10), R10
  2164. MOVW $0x0019, (AX)
  2165. MOVW R10, 2(AX)
  2166. ADDQ $0x04, AX
  2167. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2168. repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2169. LEAL -4(R10), R10
  2170. MOVW $0x0015, (AX)
  2171. MOVB R10, 2(AX)
  2172. ADDQ $0x03, AX
  2173. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2174. repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2175. SHLL $0x02, R10
  2176. ORL $0x01, R10
  2177. MOVW R10, (AX)
  2178. ADDQ $0x02, AX
  2179. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2180. repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
  2181. XORQ DI, DI
  2182. LEAL 1(DI)(R10*4), R10
  2183. MOVB SI, 1(AX)
  2184. SARL $0x08, SI
  2185. SHLL $0x05, SI
  2186. ORL SI, R10
  2187. MOVB R10, (AX)
  2188. ADDQ $0x02, AX
  2189. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2190. JMP two_byte_offset_match_nolit_encodeBlockAsm4MB
  2191. two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
  2192. CMPL R10, $0x0c
  2193. JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
  2194. CMPL SI, $0x00000800
  2195. JGE emit_copy_three_match_nolit_encodeBlockAsm4MB
  2196. MOVB $0x01, BL
  2197. LEAL -16(BX)(R10*4), R10
  2198. MOVB SI, 1(AX)
  2199. SHRL $0x08, SI
  2200. SHLL $0x05, SI
  2201. ORL SI, R10
  2202. MOVB R10, (AX)
  2203. ADDQ $0x02, AX
  2204. JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
  2205. emit_copy_three_match_nolit_encodeBlockAsm4MB:
  2206. MOVB $0x02, BL
  2207. LEAL -4(BX)(R10*4), R10
  2208. MOVB R10, (AX)
  2209. MOVW SI, 1(AX)
  2210. ADDQ $0x03, AX
  2211. match_nolit_emitcopy_end_encodeBlockAsm4MB:
  2212. CMPL CX, 8(SP)
  2213. JGE emit_remainder_encodeBlockAsm4MB
  2214. MOVQ -2(DX)(CX*1), DI
  2215. CMPQ AX, (SP)
  2216. JL match_nolit_dst_ok_encodeBlockAsm4MB
  2217. MOVQ $0x00000000, ret+48(FP)
  2218. RET
  2219. match_nolit_dst_ok_encodeBlockAsm4MB:
  2220. MOVQ $0x0000cf1bbcdcbf9b, R9
  2221. MOVQ DI, R8
  2222. SHRQ $0x10, DI
  2223. MOVQ DI, SI
  2224. SHLQ $0x10, R8
  2225. IMULQ R9, R8
  2226. SHRQ $0x32, R8
  2227. SHLQ $0x10, SI
  2228. IMULQ R9, SI
  2229. SHRQ $0x32, SI
  2230. LEAL -2(CX), R9
  2231. LEAQ 24(SP)(SI*4), R10
  2232. MOVL (R10), SI
  2233. MOVL R9, 24(SP)(R8*4)
  2234. MOVL CX, (R10)
  2235. CMPL (DX)(SI*1), DI
  2236. JEQ match_nolit_loop_encodeBlockAsm4MB
  2237. INCL CX
  2238. JMP search_loop_encodeBlockAsm4MB
  2239. emit_remainder_encodeBlockAsm4MB:
  2240. MOVQ src_len+32(FP), CX
  2241. SUBL 12(SP), CX
  2242. LEAQ 4(AX)(CX*1), CX
  2243. CMPQ CX, (SP)
  2244. JL emit_remainder_ok_encodeBlockAsm4MB
  2245. MOVQ $0x00000000, ret+48(FP)
  2246. RET
  2247. emit_remainder_ok_encodeBlockAsm4MB:
  2248. MOVQ src_len+32(FP), CX
  2249. MOVL 12(SP), BX
  2250. CMPL BX, CX
  2251. JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
  2252. MOVL CX, SI
  2253. MOVL CX, 12(SP)
  2254. LEAQ (DX)(BX*1), CX
  2255. SUBL BX, SI
  2256. LEAL -1(SI), DX
  2257. CMPL DX, $0x3c
  2258. JLT one_byte_emit_remainder_encodeBlockAsm4MB
  2259. CMPL DX, $0x00000100
  2260. JLT two_bytes_emit_remainder_encodeBlockAsm4MB
  2261. CMPL DX, $0x00010000
  2262. JLT three_bytes_emit_remainder_encodeBlockAsm4MB
  2263. MOVL DX, BX
  2264. SHRL $0x10, BX
  2265. MOVB $0xf8, (AX)
  2266. MOVW DX, 1(AX)
  2267. MOVB BL, 3(AX)
  2268. ADDQ $0x04, AX
  2269. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2270. three_bytes_emit_remainder_encodeBlockAsm4MB:
  2271. MOVB $0xf4, (AX)
  2272. MOVW DX, 1(AX)
  2273. ADDQ $0x03, AX
  2274. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2275. two_bytes_emit_remainder_encodeBlockAsm4MB:
  2276. MOVB $0xf0, (AX)
  2277. MOVB DL, 1(AX)
  2278. ADDQ $0x02, AX
  2279. CMPL DX, $0x40
  2280. JL memmove_emit_remainder_encodeBlockAsm4MB
  2281. JMP memmove_long_emit_remainder_encodeBlockAsm4MB
  2282. one_byte_emit_remainder_encodeBlockAsm4MB:
  2283. SHLB $0x02, DL
  2284. MOVB DL, (AX)
  2285. ADDQ $0x01, AX
  2286. memmove_emit_remainder_encodeBlockAsm4MB:
  2287. LEAQ (AX)(SI*1), DX
  2288. MOVL SI, BX
  2289. // genMemMoveShort
  2290. CMPQ BX, $0x03
  2291. JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
  2292. JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
  2293. CMPQ BX, $0x08
  2294. JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
  2295. CMPQ BX, $0x10
  2296. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
  2297. CMPQ BX, $0x20
  2298. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
  2299. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
  2300. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
  2301. MOVB (CX), SI
  2302. MOVB -1(CX)(BX*1), CL
  2303. MOVB SI, (AX)
  2304. MOVB CL, -1(AX)(BX*1)
  2305. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2306. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
  2307. MOVW (CX), SI
  2308. MOVB 2(CX), CL
  2309. MOVW SI, (AX)
  2310. MOVB CL, 2(AX)
  2311. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2312. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
  2313. MOVL (CX), SI
  2314. MOVL -4(CX)(BX*1), CX
  2315. MOVL SI, (AX)
  2316. MOVL CX, -4(AX)(BX*1)
  2317. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2318. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
  2319. MOVQ (CX), SI
  2320. MOVQ -8(CX)(BX*1), CX
  2321. MOVQ SI, (AX)
  2322. MOVQ CX, -8(AX)(BX*1)
  2323. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2324. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
  2325. MOVOU (CX), X0
  2326. MOVOU -16(CX)(BX*1), X1
  2327. MOVOU X0, (AX)
  2328. MOVOU X1, -16(AX)(BX*1)
  2329. JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
  2330. emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
  2331. MOVOU (CX), X0
  2332. MOVOU 16(CX), X1
  2333. MOVOU -32(CX)(BX*1), X2
  2334. MOVOU -16(CX)(BX*1), X3
  2335. MOVOU X0, (AX)
  2336. MOVOU X1, 16(AX)
  2337. MOVOU X2, -32(AX)(BX*1)
  2338. MOVOU X3, -16(AX)(BX*1)
  2339. memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
  2340. MOVQ DX, AX
  2341. JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
  2342. memmove_long_emit_remainder_encodeBlockAsm4MB:
  2343. LEAQ (AX)(SI*1), DX
  2344. MOVL SI, BX
  2345. // genMemMoveLong
  2346. MOVOU (CX), X0
  2347. MOVOU 16(CX), X1
  2348. MOVOU -32(CX)(BX*1), X2
  2349. MOVOU -16(CX)(BX*1), X3
  2350. MOVQ BX, DI
  2351. SHRQ $0x05, DI
  2352. MOVQ AX, SI
  2353. ANDL $0x0000001f, SI
  2354. MOVQ $0x00000040, R8
  2355. SUBQ SI, R8
  2356. DECQ DI
  2357. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
  2358. LEAQ -32(CX)(R8*1), SI
  2359. LEAQ -32(AX)(R8*1), R9
  2360. emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
  2361. MOVOU (SI), X4
  2362. MOVOU 16(SI), X5
  2363. MOVOA X4, (R9)
  2364. MOVOA X5, 16(R9)
  2365. ADDQ $0x20, R9
  2366. ADDQ $0x20, SI
  2367. ADDQ $0x20, R8
  2368. DECQ DI
  2369. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
  2370. emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
  2371. MOVOU -32(CX)(R8*1), X4
  2372. MOVOU -16(CX)(R8*1), X5
  2373. MOVOA X4, -32(AX)(R8*1)
  2374. MOVOA X5, -16(AX)(R8*1)
  2375. ADDQ $0x20, R8
  2376. CMPQ BX, R8
  2377. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
  2378. MOVOU X0, (AX)
  2379. MOVOU X1, 16(AX)
  2380. MOVOU X2, -32(AX)(BX*1)
  2381. MOVOU X3, -16(AX)(BX*1)
  2382. MOVQ DX, AX
  2383. emit_literal_done_emit_remainder_encodeBlockAsm4MB:
  2384. MOVQ dst_base+0(FP), CX
  2385. SUBQ CX, AX
  2386. MOVQ AX, ret+48(FP)
  2387. RET
  2388. // func encodeBlockAsm12B(dst []byte, src []byte) int
  2389. // Requires: BMI, SSE2
  2390. TEXT ·encodeBlockAsm12B(SB), $16408-56
  2391. MOVQ dst_base+0(FP), AX
  2392. MOVQ $0x00000080, CX
  2393. LEAQ 24(SP), DX
  2394. PXOR X0, X0
  2395. zero_loop_encodeBlockAsm12B:
  2396. MOVOU X0, (DX)
  2397. MOVOU X0, 16(DX)
  2398. MOVOU X0, 32(DX)
  2399. MOVOU X0, 48(DX)
  2400. MOVOU X0, 64(DX)
  2401. MOVOU X0, 80(DX)
  2402. MOVOU X0, 96(DX)
  2403. MOVOU X0, 112(DX)
  2404. ADDQ $0x80, DX
  2405. DECQ CX
  2406. JNZ zero_loop_encodeBlockAsm12B
  2407. MOVL $0x00000000, 12(SP)
  2408. MOVQ src_len+32(FP), CX
  2409. LEAQ -9(CX), DX
  2410. LEAQ -8(CX), SI
  2411. MOVL SI, 8(SP)
  2412. SHRQ $0x05, CX
  2413. SUBL CX, DX
  2414. LEAQ (AX)(DX*1), DX
  2415. MOVQ DX, (SP)
  2416. MOVL $0x00000001, CX
  2417. MOVL CX, 16(SP)
  2418. MOVQ src_base+24(FP), DX
  2419. search_loop_encodeBlockAsm12B:
  2420. MOVL CX, SI
  2421. SUBL 12(SP), SI
  2422. SHRL $0x05, SI
  2423. LEAL 4(CX)(SI*1), SI
  2424. CMPL SI, 8(SP)
  2425. JGE emit_remainder_encodeBlockAsm12B
  2426. MOVQ (DX)(CX*1), DI
  2427. MOVL SI, 20(SP)
  2428. MOVQ $0x000000cf1bbcdcbb, R9
  2429. MOVQ DI, R10
  2430. MOVQ DI, R11
  2431. SHRQ $0x08, R11
  2432. SHLQ $0x18, R10
  2433. IMULQ R9, R10
  2434. SHRQ $0x34, R10
  2435. SHLQ $0x18, R11
  2436. IMULQ R9, R11
  2437. SHRQ $0x34, R11
  2438. MOVL 24(SP)(R10*4), SI
  2439. MOVL 24(SP)(R11*4), R8
  2440. MOVL CX, 24(SP)(R10*4)
  2441. LEAL 1(CX), R10
  2442. MOVL R10, 24(SP)(R11*4)
  2443. MOVQ DI, R10
  2444. SHRQ $0x10, R10
  2445. SHLQ $0x18, R10
  2446. IMULQ R9, R10
  2447. SHRQ $0x34, R10
  2448. MOVL CX, R9
  2449. SUBL 16(SP), R9
  2450. MOVL 1(DX)(R9*1), R11
  2451. MOVQ DI, R9
  2452. SHRQ $0x08, R9
  2453. CMPL R9, R11
  2454. JNE no_repeat_found_encodeBlockAsm12B
  2455. LEAL 1(CX), DI
  2456. MOVL 12(SP), R8
  2457. MOVL DI, SI
  2458. SUBL 16(SP), SI
  2459. JZ repeat_extend_back_end_encodeBlockAsm12B
  2460. repeat_extend_back_loop_encodeBlockAsm12B:
  2461. CMPL DI, R8
  2462. JLE repeat_extend_back_end_encodeBlockAsm12B
  2463. MOVB -1(DX)(SI*1), BL
  2464. MOVB -1(DX)(DI*1), R9
  2465. CMPB BL, R9
  2466. JNE repeat_extend_back_end_encodeBlockAsm12B
  2467. LEAL -1(DI), DI
  2468. DECL SI
  2469. JNZ repeat_extend_back_loop_encodeBlockAsm12B
  2470. repeat_extend_back_end_encodeBlockAsm12B:
  2471. MOVL 12(SP), SI
  2472. CMPL SI, DI
  2473. JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
  2474. MOVL DI, R9
  2475. MOVL DI, 12(SP)
  2476. LEAQ (DX)(SI*1), R10
  2477. SUBL SI, R9
  2478. LEAL -1(R9), SI
  2479. CMPL SI, $0x3c
  2480. JLT one_byte_repeat_emit_encodeBlockAsm12B
  2481. CMPL SI, $0x00000100
  2482. JLT two_bytes_repeat_emit_encodeBlockAsm12B
  2483. MOVB $0xf4, (AX)
  2484. MOVW SI, 1(AX)
  2485. ADDQ $0x03, AX
  2486. JMP memmove_long_repeat_emit_encodeBlockAsm12B
  2487. two_bytes_repeat_emit_encodeBlockAsm12B:
  2488. MOVB $0xf0, (AX)
  2489. MOVB SI, 1(AX)
  2490. ADDQ $0x02, AX
  2491. CMPL SI, $0x40
  2492. JL memmove_repeat_emit_encodeBlockAsm12B
  2493. JMP memmove_long_repeat_emit_encodeBlockAsm12B
  2494. one_byte_repeat_emit_encodeBlockAsm12B:
  2495. SHLB $0x02, SI
  2496. MOVB SI, (AX)
  2497. ADDQ $0x01, AX
  2498. memmove_repeat_emit_encodeBlockAsm12B:
  2499. LEAQ (AX)(R9*1), SI
  2500. // genMemMoveShort
  2501. CMPQ R9, $0x08
  2502. JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
  2503. CMPQ R9, $0x10
  2504. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
  2505. CMPQ R9, $0x20
  2506. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
  2507. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
  2508. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
  2509. MOVQ (R10), R11
  2510. MOVQ R11, (AX)
  2511. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2512. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
  2513. MOVQ (R10), R11
  2514. MOVQ -8(R10)(R9*1), R10
  2515. MOVQ R11, (AX)
  2516. MOVQ R10, -8(AX)(R9*1)
  2517. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2518. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
  2519. MOVOU (R10), X0
  2520. MOVOU -16(R10)(R9*1), X1
  2521. MOVOU X0, (AX)
  2522. MOVOU X1, -16(AX)(R9*1)
  2523. JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
  2524. emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
  2525. MOVOU (R10), X0
  2526. MOVOU 16(R10), X1
  2527. MOVOU -32(R10)(R9*1), X2
  2528. MOVOU -16(R10)(R9*1), X3
  2529. MOVOU X0, (AX)
  2530. MOVOU X1, 16(AX)
  2531. MOVOU X2, -32(AX)(R9*1)
  2532. MOVOU X3, -16(AX)(R9*1)
  2533. memmove_end_copy_repeat_emit_encodeBlockAsm12B:
  2534. MOVQ SI, AX
  2535. JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
  2536. memmove_long_repeat_emit_encodeBlockAsm12B:
  2537. LEAQ (AX)(R9*1), SI
  2538. // genMemMoveLong
  2539. MOVOU (R10), X0
  2540. MOVOU 16(R10), X1
  2541. MOVOU -32(R10)(R9*1), X2
  2542. MOVOU -16(R10)(R9*1), X3
  2543. MOVQ R9, R12
  2544. SHRQ $0x05, R12
  2545. MOVQ AX, R11
  2546. ANDL $0x0000001f, R11
  2547. MOVQ $0x00000040, R13
  2548. SUBQ R11, R13
  2549. DECQ R12
  2550. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2551. LEAQ -32(R10)(R13*1), R11
  2552. LEAQ -32(AX)(R13*1), R14
  2553. emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
  2554. MOVOU (R11), X4
  2555. MOVOU 16(R11), X5
  2556. MOVOA X4, (R14)
  2557. MOVOA X5, 16(R14)
  2558. ADDQ $0x20, R14
  2559. ADDQ $0x20, R11
  2560. ADDQ $0x20, R13
  2561. DECQ R12
  2562. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
  2563. emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
  2564. MOVOU -32(R10)(R13*1), X4
  2565. MOVOU -16(R10)(R13*1), X5
  2566. MOVOA X4, -32(AX)(R13*1)
  2567. MOVOA X5, -16(AX)(R13*1)
  2568. ADDQ $0x20, R13
  2569. CMPQ R9, R13
  2570. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2571. MOVOU X0, (AX)
  2572. MOVOU X1, 16(AX)
  2573. MOVOU X2, -32(AX)(R9*1)
  2574. MOVOU X3, -16(AX)(R9*1)
  2575. MOVQ SI, AX
  2576. emit_literal_done_repeat_emit_encodeBlockAsm12B:
  2577. ADDL $0x05, CX
  2578. MOVL CX, SI
  2579. SUBL 16(SP), SI
  2580. MOVQ src_len+32(FP), R9
  2581. SUBL CX, R9
  2582. LEAQ (DX)(CX*1), R10
  2583. LEAQ (DX)(SI*1), SI
  2584. // matchLen
  2585. XORL R12, R12
  2586. CMPL R9, $0x08
  2587. JL matchlen_match4_repeat_extend_encodeBlockAsm12B
  2588. matchlen_loopback_repeat_extend_encodeBlockAsm12B:
  2589. MOVQ (R10)(R12*1), R11
  2590. XORQ (SI)(R12*1), R11
  2591. TESTQ R11, R11
  2592. JZ matchlen_loop_repeat_extend_encodeBlockAsm12B
  2593. #ifdef GOAMD64_v3
  2594. TZCNTQ R11, R11
  2595. #else
  2596. BSFQ R11, R11
  2597. #endif
  2598. SARQ $0x03, R11
  2599. LEAL (R12)(R11*1), R12
  2600. JMP repeat_extend_forward_end_encodeBlockAsm12B
  2601. matchlen_loop_repeat_extend_encodeBlockAsm12B:
  2602. LEAL -8(R9), R9
  2603. LEAL 8(R12), R12
  2604. CMPL R9, $0x08
  2605. JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B
  2606. JZ repeat_extend_forward_end_encodeBlockAsm12B
  2607. matchlen_match4_repeat_extend_encodeBlockAsm12B:
  2608. CMPL R9, $0x04
  2609. JL matchlen_match2_repeat_extend_encodeBlockAsm12B
  2610. MOVL (R10)(R12*1), R11
  2611. CMPL (SI)(R12*1), R11
  2612. JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
  2613. SUBL $0x04, R9
  2614. LEAL 4(R12), R12
  2615. matchlen_match2_repeat_extend_encodeBlockAsm12B:
  2616. CMPL R9, $0x02
  2617. JL matchlen_match1_repeat_extend_encodeBlockAsm12B
  2618. MOVW (R10)(R12*1), R11
  2619. CMPW (SI)(R12*1), R11
  2620. JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
  2621. SUBL $0x02, R9
  2622. LEAL 2(R12), R12
  2623. matchlen_match1_repeat_extend_encodeBlockAsm12B:
  2624. CMPL R9, $0x01
  2625. JL repeat_extend_forward_end_encodeBlockAsm12B
  2626. MOVB (R10)(R12*1), R11
  2627. CMPB (SI)(R12*1), R11
  2628. JNE repeat_extend_forward_end_encodeBlockAsm12B
  2629. LEAL 1(R12), R12
  2630. repeat_extend_forward_end_encodeBlockAsm12B:
  2631. ADDL R12, CX
  2632. MOVL CX, SI
  2633. SUBL DI, SI
  2634. MOVL 16(SP), DI
  2635. TESTL R8, R8
  2636. JZ repeat_as_copy_encodeBlockAsm12B
  2637. // emitRepeat
  2638. MOVL SI, R8
  2639. LEAL -4(SI), SI
  2640. CMPL R8, $0x08
  2641. JLE repeat_two_match_repeat_encodeBlockAsm12B
  2642. CMPL R8, $0x0c
  2643. JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
  2644. CMPL DI, $0x00000800
  2645. JLT repeat_two_offset_match_repeat_encodeBlockAsm12B
  2646. cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
  2647. CMPL SI, $0x00000104
  2648. JLT repeat_three_match_repeat_encodeBlockAsm12B
  2649. LEAL -256(SI), SI
  2650. MOVW $0x0019, (AX)
  2651. MOVW SI, 2(AX)
  2652. ADDQ $0x04, AX
  2653. JMP repeat_end_emit_encodeBlockAsm12B
  2654. repeat_three_match_repeat_encodeBlockAsm12B:
  2655. LEAL -4(SI), SI
  2656. MOVW $0x0015, (AX)
  2657. MOVB SI, 2(AX)
  2658. ADDQ $0x03, AX
  2659. JMP repeat_end_emit_encodeBlockAsm12B
  2660. repeat_two_match_repeat_encodeBlockAsm12B:
  2661. SHLL $0x02, SI
  2662. ORL $0x01, SI
  2663. MOVW SI, (AX)
  2664. ADDQ $0x02, AX
  2665. JMP repeat_end_emit_encodeBlockAsm12B
  2666. repeat_two_offset_match_repeat_encodeBlockAsm12B:
  2667. XORQ R8, R8
  2668. LEAL 1(R8)(SI*4), SI
  2669. MOVB DI, 1(AX)
  2670. SARL $0x08, DI
  2671. SHLL $0x05, DI
  2672. ORL DI, SI
  2673. MOVB SI, (AX)
  2674. ADDQ $0x02, AX
  2675. JMP repeat_end_emit_encodeBlockAsm12B
  2676. repeat_as_copy_encodeBlockAsm12B:
  2677. // emitCopy
  2678. two_byte_offset_repeat_as_copy_encodeBlockAsm12B:
  2679. CMPL SI, $0x40
  2680. JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
  2681. CMPL DI, $0x00000800
  2682. JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
  2683. MOVL $0x00000001, R8
  2684. LEAL 16(R8), R8
  2685. MOVB DI, 1(AX)
  2686. SHRL $0x08, DI
  2687. SHLL $0x05, DI
  2688. ORL DI, R8
  2689. MOVB R8, (AX)
  2690. ADDQ $0x02, AX
  2691. SUBL $0x08, SI
  2692. // emitRepeat
  2693. LEAL -4(SI), SI
  2694. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2695. MOVL SI, R8
  2696. LEAL -4(SI), SI
  2697. CMPL R8, $0x08
  2698. JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2699. CMPL R8, $0x0c
  2700. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2701. CMPL DI, $0x00000800
  2702. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2703. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2704. CMPL SI, $0x00000104
  2705. JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
  2706. LEAL -256(SI), SI
  2707. MOVW $0x0019, (AX)
  2708. MOVW SI, 2(AX)
  2709. ADDQ $0x04, AX
  2710. JMP repeat_end_emit_encodeBlockAsm12B
  2711. repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2712. LEAL -4(SI), SI
  2713. MOVW $0x0015, (AX)
  2714. MOVB SI, 2(AX)
  2715. ADDQ $0x03, AX
  2716. JMP repeat_end_emit_encodeBlockAsm12B
  2717. repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2718. SHLL $0x02, SI
  2719. ORL $0x01, SI
  2720. MOVW SI, (AX)
  2721. ADDQ $0x02, AX
  2722. JMP repeat_end_emit_encodeBlockAsm12B
  2723. repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
  2724. XORQ R8, R8
  2725. LEAL 1(R8)(SI*4), SI
  2726. MOVB DI, 1(AX)
  2727. SARL $0x08, DI
  2728. SHLL $0x05, DI
  2729. ORL DI, SI
  2730. MOVB SI, (AX)
  2731. ADDQ $0x02, AX
  2732. JMP repeat_end_emit_encodeBlockAsm12B
  2733. long_offset_short_repeat_as_copy_encodeBlockAsm12B:
  2734. MOVB $0xee, (AX)
  2735. MOVW DI, 1(AX)
  2736. LEAL -60(SI), SI
  2737. ADDQ $0x03, AX
  2738. // emitRepeat
  2739. MOVL SI, R8
  2740. LEAL -4(SI), SI
  2741. CMPL R8, $0x08
  2742. JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2743. CMPL R8, $0x0c
  2744. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2745. CMPL DI, $0x00000800
  2746. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2747. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2748. CMPL SI, $0x00000104
  2749. JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
  2750. LEAL -256(SI), SI
  2751. MOVW $0x0019, (AX)
  2752. MOVW SI, 2(AX)
  2753. ADDQ $0x04, AX
  2754. JMP repeat_end_emit_encodeBlockAsm12B
  2755. repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2756. LEAL -4(SI), SI
  2757. MOVW $0x0015, (AX)
  2758. MOVB SI, 2(AX)
  2759. ADDQ $0x03, AX
  2760. JMP repeat_end_emit_encodeBlockAsm12B
  2761. repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2762. SHLL $0x02, SI
  2763. ORL $0x01, SI
  2764. MOVW SI, (AX)
  2765. ADDQ $0x02, AX
  2766. JMP repeat_end_emit_encodeBlockAsm12B
  2767. repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
  2768. XORQ R8, R8
  2769. LEAL 1(R8)(SI*4), SI
  2770. MOVB DI, 1(AX)
  2771. SARL $0x08, DI
  2772. SHLL $0x05, DI
  2773. ORL DI, SI
  2774. MOVB SI, (AX)
  2775. ADDQ $0x02, AX
  2776. JMP repeat_end_emit_encodeBlockAsm12B
  2777. JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B
  2778. two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
  2779. CMPL SI, $0x0c
  2780. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
  2781. CMPL DI, $0x00000800
  2782. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
  2783. MOVB $0x01, BL
  2784. LEAL -16(BX)(SI*4), SI
  2785. MOVB DI, 1(AX)
  2786. SHRL $0x08, DI
  2787. SHLL $0x05, DI
  2788. ORL DI, SI
  2789. MOVB SI, (AX)
  2790. ADDQ $0x02, AX
  2791. JMP repeat_end_emit_encodeBlockAsm12B
  2792. emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
  2793. MOVB $0x02, BL
  2794. LEAL -4(BX)(SI*4), SI
  2795. MOVB SI, (AX)
  2796. MOVW DI, 1(AX)
  2797. ADDQ $0x03, AX
  2798. repeat_end_emit_encodeBlockAsm12B:
  2799. MOVL CX, 12(SP)
  2800. JMP search_loop_encodeBlockAsm12B
  2801. no_repeat_found_encodeBlockAsm12B:
  2802. CMPL (DX)(SI*1), DI
  2803. JEQ candidate_match_encodeBlockAsm12B
  2804. SHRQ $0x08, DI
  2805. MOVL 24(SP)(R10*4), SI
  2806. LEAL 2(CX), R9
  2807. CMPL (DX)(R8*1), DI
  2808. JEQ candidate2_match_encodeBlockAsm12B
  2809. MOVL R9, 24(SP)(R10*4)
  2810. SHRQ $0x08, DI
  2811. CMPL (DX)(SI*1), DI
  2812. JEQ candidate3_match_encodeBlockAsm12B
  2813. MOVL 20(SP), CX
  2814. JMP search_loop_encodeBlockAsm12B
  2815. candidate3_match_encodeBlockAsm12B:
  2816. ADDL $0x02, CX
  2817. JMP candidate_match_encodeBlockAsm12B
  2818. candidate2_match_encodeBlockAsm12B:
  2819. MOVL R9, 24(SP)(R10*4)
  2820. INCL CX
  2821. MOVL R8, SI
  2822. candidate_match_encodeBlockAsm12B:
  2823. MOVL 12(SP), DI
  2824. TESTL SI, SI
  2825. JZ match_extend_back_end_encodeBlockAsm12B
  2826. match_extend_back_loop_encodeBlockAsm12B:
  2827. CMPL CX, DI
  2828. JLE match_extend_back_end_encodeBlockAsm12B
  2829. MOVB -1(DX)(SI*1), BL
  2830. MOVB -1(DX)(CX*1), R8
  2831. CMPB BL, R8
  2832. JNE match_extend_back_end_encodeBlockAsm12B
  2833. LEAL -1(CX), CX
  2834. DECL SI
  2835. JZ match_extend_back_end_encodeBlockAsm12B
  2836. JMP match_extend_back_loop_encodeBlockAsm12B
  2837. match_extend_back_end_encodeBlockAsm12B:
  2838. MOVL CX, DI
  2839. SUBL 12(SP), DI
  2840. LEAQ 3(AX)(DI*1), DI
  2841. CMPQ DI, (SP)
  2842. JL match_dst_size_check_encodeBlockAsm12B
  2843. MOVQ $0x00000000, ret+48(FP)
  2844. RET
  2845. match_dst_size_check_encodeBlockAsm12B:
  2846. MOVL CX, DI
  2847. MOVL 12(SP), R8
  2848. CMPL R8, DI
  2849. JEQ emit_literal_done_match_emit_encodeBlockAsm12B
  2850. MOVL DI, R9
  2851. MOVL DI, 12(SP)
  2852. LEAQ (DX)(R8*1), DI
  2853. SUBL R8, R9
  2854. LEAL -1(R9), R8
  2855. CMPL R8, $0x3c
  2856. JLT one_byte_match_emit_encodeBlockAsm12B
  2857. CMPL R8, $0x00000100
  2858. JLT two_bytes_match_emit_encodeBlockAsm12B
  2859. MOVB $0xf4, (AX)
  2860. MOVW R8, 1(AX)
  2861. ADDQ $0x03, AX
  2862. JMP memmove_long_match_emit_encodeBlockAsm12B
  2863. two_bytes_match_emit_encodeBlockAsm12B:
  2864. MOVB $0xf0, (AX)
  2865. MOVB R8, 1(AX)
  2866. ADDQ $0x02, AX
  2867. CMPL R8, $0x40
  2868. JL memmove_match_emit_encodeBlockAsm12B
  2869. JMP memmove_long_match_emit_encodeBlockAsm12B
  2870. one_byte_match_emit_encodeBlockAsm12B:
  2871. SHLB $0x02, R8
  2872. MOVB R8, (AX)
  2873. ADDQ $0x01, AX
  2874. memmove_match_emit_encodeBlockAsm12B:
  2875. LEAQ (AX)(R9*1), R8
  2876. // genMemMoveShort
  2877. CMPQ R9, $0x08
  2878. JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
  2879. CMPQ R9, $0x10
  2880. JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
  2881. CMPQ R9, $0x20
  2882. JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
  2883. JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
  2884. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
  2885. MOVQ (DI), R10
  2886. MOVQ R10, (AX)
  2887. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2888. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
  2889. MOVQ (DI), R10
  2890. MOVQ -8(DI)(R9*1), DI
  2891. MOVQ R10, (AX)
  2892. MOVQ DI, -8(AX)(R9*1)
  2893. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2894. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
  2895. MOVOU (DI), X0
  2896. MOVOU -16(DI)(R9*1), X1
  2897. MOVOU X0, (AX)
  2898. MOVOU X1, -16(AX)(R9*1)
  2899. JMP memmove_end_copy_match_emit_encodeBlockAsm12B
  2900. emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
  2901. MOVOU (DI), X0
  2902. MOVOU 16(DI), X1
  2903. MOVOU -32(DI)(R9*1), X2
  2904. MOVOU -16(DI)(R9*1), X3
  2905. MOVOU X0, (AX)
  2906. MOVOU X1, 16(AX)
  2907. MOVOU X2, -32(AX)(R9*1)
  2908. MOVOU X3, -16(AX)(R9*1)
  2909. memmove_end_copy_match_emit_encodeBlockAsm12B:
  2910. MOVQ R8, AX
  2911. JMP emit_literal_done_match_emit_encodeBlockAsm12B
  2912. memmove_long_match_emit_encodeBlockAsm12B:
  2913. LEAQ (AX)(R9*1), R8
  2914. // genMemMoveLong
  2915. MOVOU (DI), X0
  2916. MOVOU 16(DI), X1
  2917. MOVOU -32(DI)(R9*1), X2
  2918. MOVOU -16(DI)(R9*1), X3
  2919. MOVQ R9, R11
  2920. SHRQ $0x05, R11
  2921. MOVQ AX, R10
  2922. ANDL $0x0000001f, R10
  2923. MOVQ $0x00000040, R12
  2924. SUBQ R10, R12
  2925. DECQ R11
  2926. JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2927. LEAQ -32(DI)(R12*1), R10
  2928. LEAQ -32(AX)(R12*1), R13
  2929. emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
  2930. MOVOU (R10), X4
  2931. MOVOU 16(R10), X5
  2932. MOVOA X4, (R13)
  2933. MOVOA X5, 16(R13)
  2934. ADDQ $0x20, R13
  2935. ADDQ $0x20, R10
  2936. ADDQ $0x20, R12
  2937. DECQ R11
  2938. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
  2939. emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
  2940. MOVOU -32(DI)(R12*1), X4
  2941. MOVOU -16(DI)(R12*1), X5
  2942. MOVOA X4, -32(AX)(R12*1)
  2943. MOVOA X5, -16(AX)(R12*1)
  2944. ADDQ $0x20, R12
  2945. CMPQ R9, R12
  2946. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
  2947. MOVOU X0, (AX)
  2948. MOVOU X1, 16(AX)
  2949. MOVOU X2, -32(AX)(R9*1)
  2950. MOVOU X3, -16(AX)(R9*1)
  2951. MOVQ R8, AX
  2952. emit_literal_done_match_emit_encodeBlockAsm12B:
  2953. match_nolit_loop_encodeBlockAsm12B:
  2954. MOVL CX, DI
  2955. SUBL SI, DI
  2956. MOVL DI, 16(SP)
  2957. ADDL $0x04, CX
  2958. ADDL $0x04, SI
  2959. MOVQ src_len+32(FP), DI
  2960. SUBL CX, DI
  2961. LEAQ (DX)(CX*1), R8
  2962. LEAQ (DX)(SI*1), SI
  2963. // matchLen
  2964. XORL R10, R10
  2965. CMPL DI, $0x08
  2966. JL matchlen_match4_match_nolit_encodeBlockAsm12B
  2967. matchlen_loopback_match_nolit_encodeBlockAsm12B:
  2968. MOVQ (R8)(R10*1), R9
  2969. XORQ (SI)(R10*1), R9
  2970. TESTQ R9, R9
  2971. JZ matchlen_loop_match_nolit_encodeBlockAsm12B
  2972. #ifdef GOAMD64_v3
  2973. TZCNTQ R9, R9
  2974. #else
  2975. BSFQ R9, R9
  2976. #endif
  2977. SARQ $0x03, R9
  2978. LEAL (R10)(R9*1), R10
  2979. JMP match_nolit_end_encodeBlockAsm12B
  2980. matchlen_loop_match_nolit_encodeBlockAsm12B:
  2981. LEAL -8(DI), DI
  2982. LEAL 8(R10), R10
  2983. CMPL DI, $0x08
  2984. JGE matchlen_loopback_match_nolit_encodeBlockAsm12B
  2985. JZ match_nolit_end_encodeBlockAsm12B
  2986. matchlen_match4_match_nolit_encodeBlockAsm12B:
  2987. CMPL DI, $0x04
  2988. JL matchlen_match2_match_nolit_encodeBlockAsm12B
  2989. MOVL (R8)(R10*1), R9
  2990. CMPL (SI)(R10*1), R9
  2991. JNE matchlen_match2_match_nolit_encodeBlockAsm12B
  2992. SUBL $0x04, DI
  2993. LEAL 4(R10), R10
  2994. matchlen_match2_match_nolit_encodeBlockAsm12B:
  2995. CMPL DI, $0x02
  2996. JL matchlen_match1_match_nolit_encodeBlockAsm12B
  2997. MOVW (R8)(R10*1), R9
  2998. CMPW (SI)(R10*1), R9
  2999. JNE matchlen_match1_match_nolit_encodeBlockAsm12B
  3000. SUBL $0x02, DI
  3001. LEAL 2(R10), R10
  3002. matchlen_match1_match_nolit_encodeBlockAsm12B:
  3003. CMPL DI, $0x01
  3004. JL match_nolit_end_encodeBlockAsm12B
  3005. MOVB (R8)(R10*1), R9
  3006. CMPB (SI)(R10*1), R9
  3007. JNE match_nolit_end_encodeBlockAsm12B
  3008. LEAL 1(R10), R10
  3009. match_nolit_end_encodeBlockAsm12B:
  3010. ADDL R10, CX
  3011. MOVL 16(SP), SI
  3012. ADDL $0x04, R10
  3013. MOVL CX, 12(SP)
  3014. // emitCopy
  3015. two_byte_offset_match_nolit_encodeBlockAsm12B:
  3016. CMPL R10, $0x40
  3017. JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B
  3018. CMPL SI, $0x00000800
  3019. JAE long_offset_short_match_nolit_encodeBlockAsm12B
  3020. MOVL $0x00000001, DI
  3021. LEAL 16(DI), DI
  3022. MOVB SI, 1(AX)
  3023. SHRL $0x08, SI
  3024. SHLL $0x05, SI
  3025. ORL SI, DI
  3026. MOVB DI, (AX)
  3027. ADDQ $0x02, AX
  3028. SUBL $0x08, R10
  3029. // emitRepeat
  3030. LEAL -4(R10), R10
  3031. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3032. MOVL R10, DI
  3033. LEAL -4(R10), R10
  3034. CMPL DI, $0x08
  3035. JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3036. CMPL DI, $0x0c
  3037. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3038. CMPL SI, $0x00000800
  3039. JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3040. cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3041. CMPL R10, $0x00000104
  3042. JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
  3043. LEAL -256(R10), R10
  3044. MOVW $0x0019, (AX)
  3045. MOVW R10, 2(AX)
  3046. ADDQ $0x04, AX
  3047. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3048. repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3049. LEAL -4(R10), R10
  3050. MOVW $0x0015, (AX)
  3051. MOVB R10, 2(AX)
  3052. ADDQ $0x03, AX
  3053. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3054. repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3055. SHLL $0x02, R10
  3056. ORL $0x01, R10
  3057. MOVW R10, (AX)
  3058. ADDQ $0x02, AX
  3059. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3060. repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
  3061. XORQ DI, DI
  3062. LEAL 1(DI)(R10*4), R10
  3063. MOVB SI, 1(AX)
  3064. SARL $0x08, SI
  3065. SHLL $0x05, SI
  3066. ORL SI, R10
  3067. MOVB R10, (AX)
  3068. ADDQ $0x02, AX
  3069. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3070. long_offset_short_match_nolit_encodeBlockAsm12B:
  3071. MOVB $0xee, (AX)
  3072. MOVW SI, 1(AX)
  3073. LEAL -60(R10), R10
  3074. ADDQ $0x03, AX
  3075. // emitRepeat
  3076. MOVL R10, DI
  3077. LEAL -4(R10), R10
  3078. CMPL DI, $0x08
  3079. JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
  3080. CMPL DI, $0x0c
  3081. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
  3082. CMPL SI, $0x00000800
  3083. JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
  3084. cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3085. CMPL R10, $0x00000104
  3086. JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
  3087. LEAL -256(R10), R10
  3088. MOVW $0x0019, (AX)
  3089. MOVW R10, 2(AX)
  3090. ADDQ $0x04, AX
  3091. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3092. repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3093. LEAL -4(R10), R10
  3094. MOVW $0x0015, (AX)
  3095. MOVB R10, 2(AX)
  3096. ADDQ $0x03, AX
  3097. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3098. repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3099. SHLL $0x02, R10
  3100. ORL $0x01, R10
  3101. MOVW R10, (AX)
  3102. ADDQ $0x02, AX
  3103. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3104. repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
  3105. XORQ DI, DI
  3106. LEAL 1(DI)(R10*4), R10
  3107. MOVB SI, 1(AX)
  3108. SARL $0x08, SI
  3109. SHLL $0x05, SI
  3110. ORL SI, R10
  3111. MOVB R10, (AX)
  3112. ADDQ $0x02, AX
  3113. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3114. JMP two_byte_offset_match_nolit_encodeBlockAsm12B
  3115. two_byte_offset_short_match_nolit_encodeBlockAsm12B:
  3116. CMPL R10, $0x0c
  3117. JGE emit_copy_three_match_nolit_encodeBlockAsm12B
  3118. CMPL SI, $0x00000800
  3119. JGE emit_copy_three_match_nolit_encodeBlockAsm12B
  3120. MOVB $0x01, BL
  3121. LEAL -16(BX)(R10*4), R10
  3122. MOVB SI, 1(AX)
  3123. SHRL $0x08, SI
  3124. SHLL $0x05, SI
  3125. ORL SI, R10
  3126. MOVB R10, (AX)
  3127. ADDQ $0x02, AX
  3128. JMP match_nolit_emitcopy_end_encodeBlockAsm12B
  3129. emit_copy_three_match_nolit_encodeBlockAsm12B:
  3130. MOVB $0x02, BL
  3131. LEAL -4(BX)(R10*4), R10
  3132. MOVB R10, (AX)
  3133. MOVW SI, 1(AX)
  3134. ADDQ $0x03, AX
  3135. match_nolit_emitcopy_end_encodeBlockAsm12B:
  3136. CMPL CX, 8(SP)
  3137. JGE emit_remainder_encodeBlockAsm12B
  3138. MOVQ -2(DX)(CX*1), DI
  3139. CMPQ AX, (SP)
  3140. JL match_nolit_dst_ok_encodeBlockAsm12B
  3141. MOVQ $0x00000000, ret+48(FP)
  3142. RET
  3143. match_nolit_dst_ok_encodeBlockAsm12B:
  3144. MOVQ $0x000000cf1bbcdcbb, R9
  3145. MOVQ DI, R8
  3146. SHRQ $0x10, DI
  3147. MOVQ DI, SI
  3148. SHLQ $0x18, R8
  3149. IMULQ R9, R8
  3150. SHRQ $0x34, R8
  3151. SHLQ $0x18, SI
  3152. IMULQ R9, SI
  3153. SHRQ $0x34, SI
  3154. LEAL -2(CX), R9
  3155. LEAQ 24(SP)(SI*4), R10
  3156. MOVL (R10), SI
  3157. MOVL R9, 24(SP)(R8*4)
  3158. MOVL CX, (R10)
  3159. CMPL (DX)(SI*1), DI
  3160. JEQ match_nolit_loop_encodeBlockAsm12B
  3161. INCL CX
  3162. JMP search_loop_encodeBlockAsm12B
  3163. emit_remainder_encodeBlockAsm12B:
  3164. MOVQ src_len+32(FP), CX
  3165. SUBL 12(SP), CX
  3166. LEAQ 3(AX)(CX*1), CX
  3167. CMPQ CX, (SP)
  3168. JL emit_remainder_ok_encodeBlockAsm12B
  3169. MOVQ $0x00000000, ret+48(FP)
  3170. RET
  3171. emit_remainder_ok_encodeBlockAsm12B:
  3172. MOVQ src_len+32(FP), CX
  3173. MOVL 12(SP), BX
  3174. CMPL BX, CX
  3175. JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
  3176. MOVL CX, SI
  3177. MOVL CX, 12(SP)
  3178. LEAQ (DX)(BX*1), CX
  3179. SUBL BX, SI
  3180. LEAL -1(SI), DX
  3181. CMPL DX, $0x3c
  3182. JLT one_byte_emit_remainder_encodeBlockAsm12B
  3183. CMPL DX, $0x00000100
  3184. JLT two_bytes_emit_remainder_encodeBlockAsm12B
  3185. MOVB $0xf4, (AX)
  3186. MOVW DX, 1(AX)
  3187. ADDQ $0x03, AX
  3188. JMP memmove_long_emit_remainder_encodeBlockAsm12B
  3189. two_bytes_emit_remainder_encodeBlockAsm12B:
  3190. MOVB $0xf0, (AX)
  3191. MOVB DL, 1(AX)
  3192. ADDQ $0x02, AX
  3193. CMPL DX, $0x40
  3194. JL memmove_emit_remainder_encodeBlockAsm12B
  3195. JMP memmove_long_emit_remainder_encodeBlockAsm12B
  3196. one_byte_emit_remainder_encodeBlockAsm12B:
  3197. SHLB $0x02, DL
  3198. MOVB DL, (AX)
  3199. ADDQ $0x01, AX
  3200. memmove_emit_remainder_encodeBlockAsm12B:
  3201. LEAQ (AX)(SI*1), DX
  3202. MOVL SI, BX
  3203. // genMemMoveShort
  3204. CMPQ BX, $0x03
  3205. JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
  3206. JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
  3207. CMPQ BX, $0x08
  3208. JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
  3209. CMPQ BX, $0x10
  3210. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
  3211. CMPQ BX, $0x20
  3212. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
  3213. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
  3214. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
  3215. MOVB (CX), SI
  3216. MOVB -1(CX)(BX*1), CL
  3217. MOVB SI, (AX)
  3218. MOVB CL, -1(AX)(BX*1)
  3219. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3220. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
  3221. MOVW (CX), SI
  3222. MOVB 2(CX), CL
  3223. MOVW SI, (AX)
  3224. MOVB CL, 2(AX)
  3225. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3226. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
  3227. MOVL (CX), SI
  3228. MOVL -4(CX)(BX*1), CX
  3229. MOVL SI, (AX)
  3230. MOVL CX, -4(AX)(BX*1)
  3231. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3232. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
  3233. MOVQ (CX), SI
  3234. MOVQ -8(CX)(BX*1), CX
  3235. MOVQ SI, (AX)
  3236. MOVQ CX, -8(AX)(BX*1)
  3237. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3238. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
  3239. MOVOU (CX), X0
  3240. MOVOU -16(CX)(BX*1), X1
  3241. MOVOU X0, (AX)
  3242. MOVOU X1, -16(AX)(BX*1)
  3243. JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
  3244. emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
  3245. MOVOU (CX), X0
  3246. MOVOU 16(CX), X1
  3247. MOVOU -32(CX)(BX*1), X2
  3248. MOVOU -16(CX)(BX*1), X3
  3249. MOVOU X0, (AX)
  3250. MOVOU X1, 16(AX)
  3251. MOVOU X2, -32(AX)(BX*1)
  3252. MOVOU X3, -16(AX)(BX*1)
  3253. memmove_end_copy_emit_remainder_encodeBlockAsm12B:
  3254. MOVQ DX, AX
  3255. JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
  3256. memmove_long_emit_remainder_encodeBlockAsm12B:
  3257. LEAQ (AX)(SI*1), DX
  3258. MOVL SI, BX
  3259. // genMemMoveLong
  3260. MOVOU (CX), X0
  3261. MOVOU 16(CX), X1
  3262. MOVOU -32(CX)(BX*1), X2
  3263. MOVOU -16(CX)(BX*1), X3
  3264. MOVQ BX, DI
  3265. SHRQ $0x05, DI
  3266. MOVQ AX, SI
  3267. ANDL $0x0000001f, SI
  3268. MOVQ $0x00000040, R8
  3269. SUBQ SI, R8
  3270. DECQ DI
  3271. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
  3272. LEAQ -32(CX)(R8*1), SI
  3273. LEAQ -32(AX)(R8*1), R9
  3274. emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
  3275. MOVOU (SI), X4
  3276. MOVOU 16(SI), X5
  3277. MOVOA X4, (R9)
  3278. MOVOA X5, 16(R9)
  3279. ADDQ $0x20, R9
  3280. ADDQ $0x20, SI
  3281. ADDQ $0x20, R8
  3282. DECQ DI
  3283. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
  3284. emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
  3285. MOVOU -32(CX)(R8*1), X4
  3286. MOVOU -16(CX)(R8*1), X5
  3287. MOVOA X4, -32(AX)(R8*1)
  3288. MOVOA X5, -16(AX)(R8*1)
  3289. ADDQ $0x20, R8
  3290. CMPQ BX, R8
  3291. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
  3292. MOVOU X0, (AX)
  3293. MOVOU X1, 16(AX)
  3294. MOVOU X2, -32(AX)(BX*1)
  3295. MOVOU X3, -16(AX)(BX*1)
  3296. MOVQ DX, AX
  3297. emit_literal_done_emit_remainder_encodeBlockAsm12B:
  3298. MOVQ dst_base+0(FP), CX
  3299. SUBQ CX, AX
  3300. MOVQ AX, ret+48(FP)
  3301. RET
  3302. // func encodeBlockAsm10B(dst []byte, src []byte) int
  3303. // Requires: BMI, SSE2
  3304. TEXT ·encodeBlockAsm10B(SB), $4120-56
  3305. MOVQ dst_base+0(FP), AX
  3306. MOVQ $0x00000020, CX
  3307. LEAQ 24(SP), DX
  3308. PXOR X0, X0
  3309. zero_loop_encodeBlockAsm10B:
  3310. MOVOU X0, (DX)
  3311. MOVOU X0, 16(DX)
  3312. MOVOU X0, 32(DX)
  3313. MOVOU X0, 48(DX)
  3314. MOVOU X0, 64(DX)
  3315. MOVOU X0, 80(DX)
  3316. MOVOU X0, 96(DX)
  3317. MOVOU X0, 112(DX)
  3318. ADDQ $0x80, DX
  3319. DECQ CX
  3320. JNZ zero_loop_encodeBlockAsm10B
  3321. MOVL $0x00000000, 12(SP)
  3322. MOVQ src_len+32(FP), CX
  3323. LEAQ -9(CX), DX
  3324. LEAQ -8(CX), SI
  3325. MOVL SI, 8(SP)
  3326. SHRQ $0x05, CX
  3327. SUBL CX, DX
  3328. LEAQ (AX)(DX*1), DX
  3329. MOVQ DX, (SP)
  3330. MOVL $0x00000001, CX
  3331. MOVL CX, 16(SP)
  3332. MOVQ src_base+24(FP), DX
  3333. search_loop_encodeBlockAsm10B:
  3334. MOVL CX, SI
  3335. SUBL 12(SP), SI
  3336. SHRL $0x05, SI
  3337. LEAL 4(CX)(SI*1), SI
  3338. CMPL SI, 8(SP)
  3339. JGE emit_remainder_encodeBlockAsm10B
  3340. MOVQ (DX)(CX*1), DI
  3341. MOVL SI, 20(SP)
  3342. MOVQ $0x9e3779b1, R9
  3343. MOVQ DI, R10
  3344. MOVQ DI, R11
  3345. SHRQ $0x08, R11
  3346. SHLQ $0x20, R10
  3347. IMULQ R9, R10
  3348. SHRQ $0x36, R10
  3349. SHLQ $0x20, R11
  3350. IMULQ R9, R11
  3351. SHRQ $0x36, R11
  3352. MOVL 24(SP)(R10*4), SI
  3353. MOVL 24(SP)(R11*4), R8
  3354. MOVL CX, 24(SP)(R10*4)
  3355. LEAL 1(CX), R10
  3356. MOVL R10, 24(SP)(R11*4)
  3357. MOVQ DI, R10
  3358. SHRQ $0x10, R10
  3359. SHLQ $0x20, R10
  3360. IMULQ R9, R10
  3361. SHRQ $0x36, R10
  3362. MOVL CX, R9
  3363. SUBL 16(SP), R9
  3364. MOVL 1(DX)(R9*1), R11
  3365. MOVQ DI, R9
  3366. SHRQ $0x08, R9
  3367. CMPL R9, R11
  3368. JNE no_repeat_found_encodeBlockAsm10B
  3369. LEAL 1(CX), DI
  3370. MOVL 12(SP), R8
  3371. MOVL DI, SI
  3372. SUBL 16(SP), SI
  3373. JZ repeat_extend_back_end_encodeBlockAsm10B
  3374. repeat_extend_back_loop_encodeBlockAsm10B:
  3375. CMPL DI, R8
  3376. JLE repeat_extend_back_end_encodeBlockAsm10B
  3377. MOVB -1(DX)(SI*1), BL
  3378. MOVB -1(DX)(DI*1), R9
  3379. CMPB BL, R9
  3380. JNE repeat_extend_back_end_encodeBlockAsm10B
  3381. LEAL -1(DI), DI
  3382. DECL SI
  3383. JNZ repeat_extend_back_loop_encodeBlockAsm10B
  3384. repeat_extend_back_end_encodeBlockAsm10B:
  3385. MOVL 12(SP), SI
  3386. CMPL SI, DI
  3387. JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
  3388. MOVL DI, R9
  3389. MOVL DI, 12(SP)
  3390. LEAQ (DX)(SI*1), R10
  3391. SUBL SI, R9
  3392. LEAL -1(R9), SI
  3393. CMPL SI, $0x3c
  3394. JLT one_byte_repeat_emit_encodeBlockAsm10B
  3395. CMPL SI, $0x00000100
  3396. JLT two_bytes_repeat_emit_encodeBlockAsm10B
  3397. MOVB $0xf4, (AX)
  3398. MOVW SI, 1(AX)
  3399. ADDQ $0x03, AX
  3400. JMP memmove_long_repeat_emit_encodeBlockAsm10B
  3401. two_bytes_repeat_emit_encodeBlockAsm10B:
  3402. MOVB $0xf0, (AX)
  3403. MOVB SI, 1(AX)
  3404. ADDQ $0x02, AX
  3405. CMPL SI, $0x40
  3406. JL memmove_repeat_emit_encodeBlockAsm10B
  3407. JMP memmove_long_repeat_emit_encodeBlockAsm10B
  3408. one_byte_repeat_emit_encodeBlockAsm10B:
  3409. SHLB $0x02, SI
  3410. MOVB SI, (AX)
  3411. ADDQ $0x01, AX
  3412. memmove_repeat_emit_encodeBlockAsm10B:
  3413. LEAQ (AX)(R9*1), SI
  3414. // genMemMoveShort
  3415. CMPQ R9, $0x08
  3416. JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
  3417. CMPQ R9, $0x10
  3418. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
  3419. CMPQ R9, $0x20
  3420. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
  3421. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
  3422. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
  3423. MOVQ (R10), R11
  3424. MOVQ R11, (AX)
  3425. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3426. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
  3427. MOVQ (R10), R11
  3428. MOVQ -8(R10)(R9*1), R10
  3429. MOVQ R11, (AX)
  3430. MOVQ R10, -8(AX)(R9*1)
  3431. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3432. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
  3433. MOVOU (R10), X0
  3434. MOVOU -16(R10)(R9*1), X1
  3435. MOVOU X0, (AX)
  3436. MOVOU X1, -16(AX)(R9*1)
  3437. JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
  3438. emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
  3439. MOVOU (R10), X0
  3440. MOVOU 16(R10), X1
  3441. MOVOU -32(R10)(R9*1), X2
  3442. MOVOU -16(R10)(R9*1), X3
  3443. MOVOU X0, (AX)
  3444. MOVOU X1, 16(AX)
  3445. MOVOU X2, -32(AX)(R9*1)
  3446. MOVOU X3, -16(AX)(R9*1)
  3447. memmove_end_copy_repeat_emit_encodeBlockAsm10B:
  3448. MOVQ SI, AX
  3449. JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
  3450. memmove_long_repeat_emit_encodeBlockAsm10B:
  3451. LEAQ (AX)(R9*1), SI
  3452. // genMemMoveLong
  3453. MOVOU (R10), X0
  3454. MOVOU 16(R10), X1
  3455. MOVOU -32(R10)(R9*1), X2
  3456. MOVOU -16(R10)(R9*1), X3
  3457. MOVQ R9, R12
  3458. SHRQ $0x05, R12
  3459. MOVQ AX, R11
  3460. ANDL $0x0000001f, R11
  3461. MOVQ $0x00000040, R13
  3462. SUBQ R11, R13
  3463. DECQ R12
  3464. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3465. LEAQ -32(R10)(R13*1), R11
  3466. LEAQ -32(AX)(R13*1), R14
  3467. emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
  3468. MOVOU (R11), X4
  3469. MOVOU 16(R11), X5
  3470. MOVOA X4, (R14)
  3471. MOVOA X5, 16(R14)
  3472. ADDQ $0x20, R14
  3473. ADDQ $0x20, R11
  3474. ADDQ $0x20, R13
  3475. DECQ R12
  3476. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
  3477. emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
  3478. MOVOU -32(R10)(R13*1), X4
  3479. MOVOU -16(R10)(R13*1), X5
  3480. MOVOA X4, -32(AX)(R13*1)
  3481. MOVOA X5, -16(AX)(R13*1)
  3482. ADDQ $0x20, R13
  3483. CMPQ R9, R13
  3484. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3485. MOVOU X0, (AX)
  3486. MOVOU X1, 16(AX)
  3487. MOVOU X2, -32(AX)(R9*1)
  3488. MOVOU X3, -16(AX)(R9*1)
  3489. MOVQ SI, AX
  3490. emit_literal_done_repeat_emit_encodeBlockAsm10B:
  3491. ADDL $0x05, CX
  3492. MOVL CX, SI
  3493. SUBL 16(SP), SI
  3494. MOVQ src_len+32(FP), R9
  3495. SUBL CX, R9
  3496. LEAQ (DX)(CX*1), R10
  3497. LEAQ (DX)(SI*1), SI
  3498. // matchLen
  3499. XORL R12, R12
  3500. CMPL R9, $0x08
  3501. JL matchlen_match4_repeat_extend_encodeBlockAsm10B
  3502. matchlen_loopback_repeat_extend_encodeBlockAsm10B:
  3503. MOVQ (R10)(R12*1), R11
  3504. XORQ (SI)(R12*1), R11
  3505. TESTQ R11, R11
  3506. JZ matchlen_loop_repeat_extend_encodeBlockAsm10B
  3507. #ifdef GOAMD64_v3
  3508. TZCNTQ R11, R11
  3509. #else
  3510. BSFQ R11, R11
  3511. #endif
  3512. SARQ $0x03, R11
  3513. LEAL (R12)(R11*1), R12
  3514. JMP repeat_extend_forward_end_encodeBlockAsm10B
  3515. matchlen_loop_repeat_extend_encodeBlockAsm10B:
  3516. LEAL -8(R9), R9
  3517. LEAL 8(R12), R12
  3518. CMPL R9, $0x08
  3519. JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B
  3520. JZ repeat_extend_forward_end_encodeBlockAsm10B
  3521. matchlen_match4_repeat_extend_encodeBlockAsm10B:
  3522. CMPL R9, $0x04
  3523. JL matchlen_match2_repeat_extend_encodeBlockAsm10B
  3524. MOVL (R10)(R12*1), R11
  3525. CMPL (SI)(R12*1), R11
  3526. JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
  3527. SUBL $0x04, R9
  3528. LEAL 4(R12), R12
  3529. matchlen_match2_repeat_extend_encodeBlockAsm10B:
  3530. CMPL R9, $0x02
  3531. JL matchlen_match1_repeat_extend_encodeBlockAsm10B
  3532. MOVW (R10)(R12*1), R11
  3533. CMPW (SI)(R12*1), R11
  3534. JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
  3535. SUBL $0x02, R9
  3536. LEAL 2(R12), R12
  3537. matchlen_match1_repeat_extend_encodeBlockAsm10B:
  3538. CMPL R9, $0x01
  3539. JL repeat_extend_forward_end_encodeBlockAsm10B
  3540. MOVB (R10)(R12*1), R11
  3541. CMPB (SI)(R12*1), R11
  3542. JNE repeat_extend_forward_end_encodeBlockAsm10B
  3543. LEAL 1(R12), R12
  3544. repeat_extend_forward_end_encodeBlockAsm10B:
  3545. ADDL R12, CX
  3546. MOVL CX, SI
  3547. SUBL DI, SI
  3548. MOVL 16(SP), DI
  3549. TESTL R8, R8
  3550. JZ repeat_as_copy_encodeBlockAsm10B
  3551. // emitRepeat
  3552. MOVL SI, R8
  3553. LEAL -4(SI), SI
  3554. CMPL R8, $0x08
  3555. JLE repeat_two_match_repeat_encodeBlockAsm10B
  3556. CMPL R8, $0x0c
  3557. JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
  3558. CMPL DI, $0x00000800
  3559. JLT repeat_two_offset_match_repeat_encodeBlockAsm10B
  3560. cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
  3561. CMPL SI, $0x00000104
  3562. JLT repeat_three_match_repeat_encodeBlockAsm10B
  3563. LEAL -256(SI), SI
  3564. MOVW $0x0019, (AX)
  3565. MOVW SI, 2(AX)
  3566. ADDQ $0x04, AX
  3567. JMP repeat_end_emit_encodeBlockAsm10B
  3568. repeat_three_match_repeat_encodeBlockAsm10B:
  3569. LEAL -4(SI), SI
  3570. MOVW $0x0015, (AX)
  3571. MOVB SI, 2(AX)
  3572. ADDQ $0x03, AX
  3573. JMP repeat_end_emit_encodeBlockAsm10B
  3574. repeat_two_match_repeat_encodeBlockAsm10B:
  3575. SHLL $0x02, SI
  3576. ORL $0x01, SI
  3577. MOVW SI, (AX)
  3578. ADDQ $0x02, AX
  3579. JMP repeat_end_emit_encodeBlockAsm10B
  3580. repeat_two_offset_match_repeat_encodeBlockAsm10B:
  3581. XORQ R8, R8
  3582. LEAL 1(R8)(SI*4), SI
  3583. MOVB DI, 1(AX)
  3584. SARL $0x08, DI
  3585. SHLL $0x05, DI
  3586. ORL DI, SI
  3587. MOVB SI, (AX)
  3588. ADDQ $0x02, AX
  3589. JMP repeat_end_emit_encodeBlockAsm10B
  3590. repeat_as_copy_encodeBlockAsm10B:
  3591. // emitCopy
  3592. two_byte_offset_repeat_as_copy_encodeBlockAsm10B:
  3593. CMPL SI, $0x40
  3594. JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
  3595. CMPL DI, $0x00000800
  3596. JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
  3597. MOVL $0x00000001, R8
  3598. LEAL 16(R8), R8
  3599. MOVB DI, 1(AX)
  3600. SHRL $0x08, DI
  3601. SHLL $0x05, DI
  3602. ORL DI, R8
  3603. MOVB R8, (AX)
  3604. ADDQ $0x02, AX
  3605. SUBL $0x08, SI
  3606. // emitRepeat
  3607. LEAL -4(SI), SI
  3608. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3609. MOVL SI, R8
  3610. LEAL -4(SI), SI
  3611. CMPL R8, $0x08
  3612. JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3613. CMPL R8, $0x0c
  3614. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3615. CMPL DI, $0x00000800
  3616. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3617. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3618. CMPL SI, $0x00000104
  3619. JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
  3620. LEAL -256(SI), SI
  3621. MOVW $0x0019, (AX)
  3622. MOVW SI, 2(AX)
  3623. ADDQ $0x04, AX
  3624. JMP repeat_end_emit_encodeBlockAsm10B
  3625. repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3626. LEAL -4(SI), SI
  3627. MOVW $0x0015, (AX)
  3628. MOVB SI, 2(AX)
  3629. ADDQ $0x03, AX
  3630. JMP repeat_end_emit_encodeBlockAsm10B
  3631. repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3632. SHLL $0x02, SI
  3633. ORL $0x01, SI
  3634. MOVW SI, (AX)
  3635. ADDQ $0x02, AX
  3636. JMP repeat_end_emit_encodeBlockAsm10B
  3637. repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
  3638. XORQ R8, R8
  3639. LEAL 1(R8)(SI*4), SI
  3640. MOVB DI, 1(AX)
  3641. SARL $0x08, DI
  3642. SHLL $0x05, DI
  3643. ORL DI, SI
  3644. MOVB SI, (AX)
  3645. ADDQ $0x02, AX
  3646. JMP repeat_end_emit_encodeBlockAsm10B
  3647. long_offset_short_repeat_as_copy_encodeBlockAsm10B:
  3648. MOVB $0xee, (AX)
  3649. MOVW DI, 1(AX)
  3650. LEAL -60(SI), SI
  3651. ADDQ $0x03, AX
  3652. // emitRepeat
  3653. MOVL SI, R8
  3654. LEAL -4(SI), SI
  3655. CMPL R8, $0x08
  3656. JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3657. CMPL R8, $0x0c
  3658. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3659. CMPL DI, $0x00000800
  3660. JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3661. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3662. CMPL SI, $0x00000104
  3663. JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
  3664. LEAL -256(SI), SI
  3665. MOVW $0x0019, (AX)
  3666. MOVW SI, 2(AX)
  3667. ADDQ $0x04, AX
  3668. JMP repeat_end_emit_encodeBlockAsm10B
  3669. repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3670. LEAL -4(SI), SI
  3671. MOVW $0x0015, (AX)
  3672. MOVB SI, 2(AX)
  3673. ADDQ $0x03, AX
  3674. JMP repeat_end_emit_encodeBlockAsm10B
  3675. repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3676. SHLL $0x02, SI
  3677. ORL $0x01, SI
  3678. MOVW SI, (AX)
  3679. ADDQ $0x02, AX
  3680. JMP repeat_end_emit_encodeBlockAsm10B
  3681. repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
  3682. XORQ R8, R8
  3683. LEAL 1(R8)(SI*4), SI
  3684. MOVB DI, 1(AX)
  3685. SARL $0x08, DI
  3686. SHLL $0x05, DI
  3687. ORL DI, SI
  3688. MOVB SI, (AX)
  3689. ADDQ $0x02, AX
  3690. JMP repeat_end_emit_encodeBlockAsm10B
  3691. JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B
  3692. two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
  3693. CMPL SI, $0x0c
  3694. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
  3695. CMPL DI, $0x00000800
  3696. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
  3697. MOVB $0x01, BL
  3698. LEAL -16(BX)(SI*4), SI
  3699. MOVB DI, 1(AX)
  3700. SHRL $0x08, DI
  3701. SHLL $0x05, DI
  3702. ORL DI, SI
  3703. MOVB SI, (AX)
  3704. ADDQ $0x02, AX
  3705. JMP repeat_end_emit_encodeBlockAsm10B
  3706. emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
  3707. MOVB $0x02, BL
  3708. LEAL -4(BX)(SI*4), SI
  3709. MOVB SI, (AX)
  3710. MOVW DI, 1(AX)
  3711. ADDQ $0x03, AX
  3712. repeat_end_emit_encodeBlockAsm10B:
  3713. MOVL CX, 12(SP)
  3714. JMP search_loop_encodeBlockAsm10B
  3715. no_repeat_found_encodeBlockAsm10B:
  3716. CMPL (DX)(SI*1), DI
  3717. JEQ candidate_match_encodeBlockAsm10B
  3718. SHRQ $0x08, DI
  3719. MOVL 24(SP)(R10*4), SI
  3720. LEAL 2(CX), R9
  3721. CMPL (DX)(R8*1), DI
  3722. JEQ candidate2_match_encodeBlockAsm10B
  3723. MOVL R9, 24(SP)(R10*4)
  3724. SHRQ $0x08, DI
  3725. CMPL (DX)(SI*1), DI
  3726. JEQ candidate3_match_encodeBlockAsm10B
  3727. MOVL 20(SP), CX
  3728. JMP search_loop_encodeBlockAsm10B
  3729. candidate3_match_encodeBlockAsm10B:
  3730. ADDL $0x02, CX
  3731. JMP candidate_match_encodeBlockAsm10B
  3732. candidate2_match_encodeBlockAsm10B:
  3733. MOVL R9, 24(SP)(R10*4)
  3734. INCL CX
  3735. MOVL R8, SI
  3736. candidate_match_encodeBlockAsm10B:
  3737. MOVL 12(SP), DI
  3738. TESTL SI, SI
  3739. JZ match_extend_back_end_encodeBlockAsm10B
  3740. match_extend_back_loop_encodeBlockAsm10B:
  3741. CMPL CX, DI
  3742. JLE match_extend_back_end_encodeBlockAsm10B
  3743. MOVB -1(DX)(SI*1), BL
  3744. MOVB -1(DX)(CX*1), R8
  3745. CMPB BL, R8
  3746. JNE match_extend_back_end_encodeBlockAsm10B
  3747. LEAL -1(CX), CX
  3748. DECL SI
  3749. JZ match_extend_back_end_encodeBlockAsm10B
  3750. JMP match_extend_back_loop_encodeBlockAsm10B
  3751. match_extend_back_end_encodeBlockAsm10B:
  3752. MOVL CX, DI
  3753. SUBL 12(SP), DI
  3754. LEAQ 3(AX)(DI*1), DI
  3755. CMPQ DI, (SP)
  3756. JL match_dst_size_check_encodeBlockAsm10B
  3757. MOVQ $0x00000000, ret+48(FP)
  3758. RET
  3759. match_dst_size_check_encodeBlockAsm10B:
  3760. MOVL CX, DI
  3761. MOVL 12(SP), R8
  3762. CMPL R8, DI
  3763. JEQ emit_literal_done_match_emit_encodeBlockAsm10B
  3764. MOVL DI, R9
  3765. MOVL DI, 12(SP)
  3766. LEAQ (DX)(R8*1), DI
  3767. SUBL R8, R9
  3768. LEAL -1(R9), R8
  3769. CMPL R8, $0x3c
  3770. JLT one_byte_match_emit_encodeBlockAsm10B
  3771. CMPL R8, $0x00000100
  3772. JLT two_bytes_match_emit_encodeBlockAsm10B
  3773. MOVB $0xf4, (AX)
  3774. MOVW R8, 1(AX)
  3775. ADDQ $0x03, AX
  3776. JMP memmove_long_match_emit_encodeBlockAsm10B
  3777. two_bytes_match_emit_encodeBlockAsm10B:
  3778. MOVB $0xf0, (AX)
  3779. MOVB R8, 1(AX)
  3780. ADDQ $0x02, AX
  3781. CMPL R8, $0x40
  3782. JL memmove_match_emit_encodeBlockAsm10B
  3783. JMP memmove_long_match_emit_encodeBlockAsm10B
  3784. one_byte_match_emit_encodeBlockAsm10B:
  3785. SHLB $0x02, R8
  3786. MOVB R8, (AX)
  3787. ADDQ $0x01, AX
  3788. memmove_match_emit_encodeBlockAsm10B:
  3789. LEAQ (AX)(R9*1), R8
  3790. // genMemMoveShort
  3791. CMPQ R9, $0x08
  3792. JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
  3793. CMPQ R9, $0x10
  3794. JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
  3795. CMPQ R9, $0x20
  3796. JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
  3797. JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
  3798. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
  3799. MOVQ (DI), R10
  3800. MOVQ R10, (AX)
  3801. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3802. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
  3803. MOVQ (DI), R10
  3804. MOVQ -8(DI)(R9*1), DI
  3805. MOVQ R10, (AX)
  3806. MOVQ DI, -8(AX)(R9*1)
  3807. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3808. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
  3809. MOVOU (DI), X0
  3810. MOVOU -16(DI)(R9*1), X1
  3811. MOVOU X0, (AX)
  3812. MOVOU X1, -16(AX)(R9*1)
  3813. JMP memmove_end_copy_match_emit_encodeBlockAsm10B
  3814. emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
  3815. MOVOU (DI), X0
  3816. MOVOU 16(DI), X1
  3817. MOVOU -32(DI)(R9*1), X2
  3818. MOVOU -16(DI)(R9*1), X3
  3819. MOVOU X0, (AX)
  3820. MOVOU X1, 16(AX)
  3821. MOVOU X2, -32(AX)(R9*1)
  3822. MOVOU X3, -16(AX)(R9*1)
  3823. memmove_end_copy_match_emit_encodeBlockAsm10B:
  3824. MOVQ R8, AX
  3825. JMP emit_literal_done_match_emit_encodeBlockAsm10B
  3826. memmove_long_match_emit_encodeBlockAsm10B:
  3827. LEAQ (AX)(R9*1), R8
  3828. // genMemMoveLong
  3829. MOVOU (DI), X0
  3830. MOVOU 16(DI), X1
  3831. MOVOU -32(DI)(R9*1), X2
  3832. MOVOU -16(DI)(R9*1), X3
  3833. MOVQ R9, R11
  3834. SHRQ $0x05, R11
  3835. MOVQ AX, R10
  3836. ANDL $0x0000001f, R10
  3837. MOVQ $0x00000040, R12
  3838. SUBQ R10, R12
  3839. DECQ R11
  3840. JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3841. LEAQ -32(DI)(R12*1), R10
  3842. LEAQ -32(AX)(R12*1), R13
  3843. emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
  3844. MOVOU (R10), X4
  3845. MOVOU 16(R10), X5
  3846. MOVOA X4, (R13)
  3847. MOVOA X5, 16(R13)
  3848. ADDQ $0x20, R13
  3849. ADDQ $0x20, R10
  3850. ADDQ $0x20, R12
  3851. DECQ R11
  3852. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
  3853. emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
  3854. MOVOU -32(DI)(R12*1), X4
  3855. MOVOU -16(DI)(R12*1), X5
  3856. MOVOA X4, -32(AX)(R12*1)
  3857. MOVOA X5, -16(AX)(R12*1)
  3858. ADDQ $0x20, R12
  3859. CMPQ R9, R12
  3860. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
  3861. MOVOU X0, (AX)
  3862. MOVOU X1, 16(AX)
  3863. MOVOU X2, -32(AX)(R9*1)
  3864. MOVOU X3, -16(AX)(R9*1)
  3865. MOVQ R8, AX
  3866. emit_literal_done_match_emit_encodeBlockAsm10B:
  3867. match_nolit_loop_encodeBlockAsm10B:
  3868. MOVL CX, DI
  3869. SUBL SI, DI
  3870. MOVL DI, 16(SP)
  3871. ADDL $0x04, CX
  3872. ADDL $0x04, SI
  3873. MOVQ src_len+32(FP), DI
  3874. SUBL CX, DI
  3875. LEAQ (DX)(CX*1), R8
  3876. LEAQ (DX)(SI*1), SI
  3877. // matchLen
  3878. XORL R10, R10
  3879. CMPL DI, $0x08
  3880. JL matchlen_match4_match_nolit_encodeBlockAsm10B
  3881. matchlen_loopback_match_nolit_encodeBlockAsm10B:
  3882. MOVQ (R8)(R10*1), R9
  3883. XORQ (SI)(R10*1), R9
  3884. TESTQ R9, R9
  3885. JZ matchlen_loop_match_nolit_encodeBlockAsm10B
  3886. #ifdef GOAMD64_v3
  3887. TZCNTQ R9, R9
  3888. #else
  3889. BSFQ R9, R9
  3890. #endif
  3891. SARQ $0x03, R9
  3892. LEAL (R10)(R9*1), R10
  3893. JMP match_nolit_end_encodeBlockAsm10B
  3894. matchlen_loop_match_nolit_encodeBlockAsm10B:
  3895. LEAL -8(DI), DI
  3896. LEAL 8(R10), R10
  3897. CMPL DI, $0x08
  3898. JGE matchlen_loopback_match_nolit_encodeBlockAsm10B
  3899. JZ match_nolit_end_encodeBlockAsm10B
  3900. matchlen_match4_match_nolit_encodeBlockAsm10B:
  3901. CMPL DI, $0x04
  3902. JL matchlen_match2_match_nolit_encodeBlockAsm10B
  3903. MOVL (R8)(R10*1), R9
  3904. CMPL (SI)(R10*1), R9
  3905. JNE matchlen_match2_match_nolit_encodeBlockAsm10B
  3906. SUBL $0x04, DI
  3907. LEAL 4(R10), R10
  3908. matchlen_match2_match_nolit_encodeBlockAsm10B:
  3909. CMPL DI, $0x02
  3910. JL matchlen_match1_match_nolit_encodeBlockAsm10B
  3911. MOVW (R8)(R10*1), R9
  3912. CMPW (SI)(R10*1), R9
  3913. JNE matchlen_match1_match_nolit_encodeBlockAsm10B
  3914. SUBL $0x02, DI
  3915. LEAL 2(R10), R10
  3916. matchlen_match1_match_nolit_encodeBlockAsm10B:
  3917. CMPL DI, $0x01
  3918. JL match_nolit_end_encodeBlockAsm10B
  3919. MOVB (R8)(R10*1), R9
  3920. CMPB (SI)(R10*1), R9
  3921. JNE match_nolit_end_encodeBlockAsm10B
  3922. LEAL 1(R10), R10
  3923. match_nolit_end_encodeBlockAsm10B:
  3924. ADDL R10, CX
  3925. MOVL 16(SP), SI
  3926. ADDL $0x04, R10
  3927. MOVL CX, 12(SP)
  3928. // emitCopy
  3929. two_byte_offset_match_nolit_encodeBlockAsm10B:
  3930. CMPL R10, $0x40
  3931. JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B
  3932. CMPL SI, $0x00000800
  3933. JAE long_offset_short_match_nolit_encodeBlockAsm10B
  3934. MOVL $0x00000001, DI
  3935. LEAL 16(DI), DI
  3936. MOVB SI, 1(AX)
  3937. SHRL $0x08, SI
  3938. SHLL $0x05, SI
  3939. ORL SI, DI
  3940. MOVB DI, (AX)
  3941. ADDQ $0x02, AX
  3942. SUBL $0x08, R10
  3943. // emitRepeat
  3944. LEAL -4(R10), R10
  3945. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  3946. MOVL R10, DI
  3947. LEAL -4(R10), R10
  3948. CMPL DI, $0x08
  3949. JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  3950. CMPL DI, $0x0c
  3951. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  3952. CMPL SI, $0x00000800
  3953. JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  3954. cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  3955. CMPL R10, $0x00000104
  3956. JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
  3957. LEAL -256(R10), R10
  3958. MOVW $0x0019, (AX)
  3959. MOVW R10, 2(AX)
  3960. ADDQ $0x04, AX
  3961. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  3962. repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  3963. LEAL -4(R10), R10
  3964. MOVW $0x0015, (AX)
  3965. MOVB R10, 2(AX)
  3966. ADDQ $0x03, AX
  3967. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  3968. repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  3969. SHLL $0x02, R10
  3970. ORL $0x01, R10
  3971. MOVW R10, (AX)
  3972. ADDQ $0x02, AX
  3973. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  3974. repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
  3975. XORQ DI, DI
  3976. LEAL 1(DI)(R10*4), R10
  3977. MOVB SI, 1(AX)
  3978. SARL $0x08, SI
  3979. SHLL $0x05, SI
  3980. ORL SI, R10
  3981. MOVB R10, (AX)
  3982. ADDQ $0x02, AX
  3983. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  3984. long_offset_short_match_nolit_encodeBlockAsm10B:
  3985. MOVB $0xee, (AX)
  3986. MOVW SI, 1(AX)
  3987. LEAL -60(R10), R10
  3988. ADDQ $0x03, AX
  3989. // emitRepeat
  3990. MOVL R10, DI
  3991. LEAL -4(R10), R10
  3992. CMPL DI, $0x08
  3993. JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
  3994. CMPL DI, $0x0c
  3995. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
  3996. CMPL SI, $0x00000800
  3997. JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
  3998. cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
  3999. CMPL R10, $0x00000104
  4000. JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
  4001. LEAL -256(R10), R10
  4002. MOVW $0x0019, (AX)
  4003. MOVW R10, 2(AX)
  4004. ADDQ $0x04, AX
  4005. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4006. repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4007. LEAL -4(R10), R10
  4008. MOVW $0x0015, (AX)
  4009. MOVB R10, 2(AX)
  4010. ADDQ $0x03, AX
  4011. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4012. repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4013. SHLL $0x02, R10
  4014. ORL $0x01, R10
  4015. MOVW R10, (AX)
  4016. ADDQ $0x02, AX
  4017. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4018. repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
  4019. XORQ DI, DI
  4020. LEAL 1(DI)(R10*4), R10
  4021. MOVB SI, 1(AX)
  4022. SARL $0x08, SI
  4023. SHLL $0x05, SI
  4024. ORL SI, R10
  4025. MOVB R10, (AX)
  4026. ADDQ $0x02, AX
  4027. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4028. JMP two_byte_offset_match_nolit_encodeBlockAsm10B
  4029. two_byte_offset_short_match_nolit_encodeBlockAsm10B:
  4030. CMPL R10, $0x0c
  4031. JGE emit_copy_three_match_nolit_encodeBlockAsm10B
  4032. CMPL SI, $0x00000800
  4033. JGE emit_copy_three_match_nolit_encodeBlockAsm10B
  4034. MOVB $0x01, BL
  4035. LEAL -16(BX)(R10*4), R10
  4036. MOVB SI, 1(AX)
  4037. SHRL $0x08, SI
  4038. SHLL $0x05, SI
  4039. ORL SI, R10
  4040. MOVB R10, (AX)
  4041. ADDQ $0x02, AX
  4042. JMP match_nolit_emitcopy_end_encodeBlockAsm10B
  4043. emit_copy_three_match_nolit_encodeBlockAsm10B:
  4044. MOVB $0x02, BL
  4045. LEAL -4(BX)(R10*4), R10
  4046. MOVB R10, (AX)
  4047. MOVW SI, 1(AX)
  4048. ADDQ $0x03, AX
  4049. match_nolit_emitcopy_end_encodeBlockAsm10B:
  4050. CMPL CX, 8(SP)
  4051. JGE emit_remainder_encodeBlockAsm10B
  4052. MOVQ -2(DX)(CX*1), DI
  4053. CMPQ AX, (SP)
  4054. JL match_nolit_dst_ok_encodeBlockAsm10B
  4055. MOVQ $0x00000000, ret+48(FP)
  4056. RET
  4057. match_nolit_dst_ok_encodeBlockAsm10B:
  4058. MOVQ $0x9e3779b1, R9
  4059. MOVQ DI, R8
  4060. SHRQ $0x10, DI
  4061. MOVQ DI, SI
  4062. SHLQ $0x20, R8
  4063. IMULQ R9, R8
  4064. SHRQ $0x36, R8
  4065. SHLQ $0x20, SI
  4066. IMULQ R9, SI
  4067. SHRQ $0x36, SI
  4068. LEAL -2(CX), R9
  4069. LEAQ 24(SP)(SI*4), R10
  4070. MOVL (R10), SI
  4071. MOVL R9, 24(SP)(R8*4)
  4072. MOVL CX, (R10)
  4073. CMPL (DX)(SI*1), DI
  4074. JEQ match_nolit_loop_encodeBlockAsm10B
  4075. INCL CX
  4076. JMP search_loop_encodeBlockAsm10B
  4077. emit_remainder_encodeBlockAsm10B:
  4078. MOVQ src_len+32(FP), CX
  4079. SUBL 12(SP), CX
  4080. LEAQ 3(AX)(CX*1), CX
  4081. CMPQ CX, (SP)
  4082. JL emit_remainder_ok_encodeBlockAsm10B
  4083. MOVQ $0x00000000, ret+48(FP)
  4084. RET
  4085. emit_remainder_ok_encodeBlockAsm10B:
  4086. MOVQ src_len+32(FP), CX
  4087. MOVL 12(SP), BX
  4088. CMPL BX, CX
  4089. JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
  4090. MOVL CX, SI
  4091. MOVL CX, 12(SP)
  4092. LEAQ (DX)(BX*1), CX
  4093. SUBL BX, SI
  4094. LEAL -1(SI), DX
  4095. CMPL DX, $0x3c
  4096. JLT one_byte_emit_remainder_encodeBlockAsm10B
  4097. CMPL DX, $0x00000100
  4098. JLT two_bytes_emit_remainder_encodeBlockAsm10B
  4099. MOVB $0xf4, (AX)
  4100. MOVW DX, 1(AX)
  4101. ADDQ $0x03, AX
  4102. JMP memmove_long_emit_remainder_encodeBlockAsm10B
  4103. two_bytes_emit_remainder_encodeBlockAsm10B:
  4104. MOVB $0xf0, (AX)
  4105. MOVB DL, 1(AX)
  4106. ADDQ $0x02, AX
  4107. CMPL DX, $0x40
  4108. JL memmove_emit_remainder_encodeBlockAsm10B
  4109. JMP memmove_long_emit_remainder_encodeBlockAsm10B
  4110. one_byte_emit_remainder_encodeBlockAsm10B:
  4111. SHLB $0x02, DL
  4112. MOVB DL, (AX)
  4113. ADDQ $0x01, AX
  4114. memmove_emit_remainder_encodeBlockAsm10B:
  4115. LEAQ (AX)(SI*1), DX
  4116. MOVL SI, BX
  4117. // genMemMoveShort
  4118. CMPQ BX, $0x03
  4119. JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
  4120. JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
  4121. CMPQ BX, $0x08
  4122. JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
  4123. CMPQ BX, $0x10
  4124. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
  4125. CMPQ BX, $0x20
  4126. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
  4127. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
  4128. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
  4129. MOVB (CX), SI
  4130. MOVB -1(CX)(BX*1), CL
  4131. MOVB SI, (AX)
  4132. MOVB CL, -1(AX)(BX*1)
  4133. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4134. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
  4135. MOVW (CX), SI
  4136. MOVB 2(CX), CL
  4137. MOVW SI, (AX)
  4138. MOVB CL, 2(AX)
  4139. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4140. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
  4141. MOVL (CX), SI
  4142. MOVL -4(CX)(BX*1), CX
  4143. MOVL SI, (AX)
  4144. MOVL CX, -4(AX)(BX*1)
  4145. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4146. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
  4147. MOVQ (CX), SI
  4148. MOVQ -8(CX)(BX*1), CX
  4149. MOVQ SI, (AX)
  4150. MOVQ CX, -8(AX)(BX*1)
  4151. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4152. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
  4153. MOVOU (CX), X0
  4154. MOVOU -16(CX)(BX*1), X1
  4155. MOVOU X0, (AX)
  4156. MOVOU X1, -16(AX)(BX*1)
  4157. JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
  4158. emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
  4159. MOVOU (CX), X0
  4160. MOVOU 16(CX), X1
  4161. MOVOU -32(CX)(BX*1), X2
  4162. MOVOU -16(CX)(BX*1), X3
  4163. MOVOU X0, (AX)
  4164. MOVOU X1, 16(AX)
  4165. MOVOU X2, -32(AX)(BX*1)
  4166. MOVOU X3, -16(AX)(BX*1)
  4167. memmove_end_copy_emit_remainder_encodeBlockAsm10B:
  4168. MOVQ DX, AX
  4169. JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
  4170. memmove_long_emit_remainder_encodeBlockAsm10B:
  4171. LEAQ (AX)(SI*1), DX
  4172. MOVL SI, BX
  4173. // genMemMoveLong
  4174. MOVOU (CX), X0
  4175. MOVOU 16(CX), X1
  4176. MOVOU -32(CX)(BX*1), X2
  4177. MOVOU -16(CX)(BX*1), X3
  4178. MOVQ BX, DI
  4179. SHRQ $0x05, DI
  4180. MOVQ AX, SI
  4181. ANDL $0x0000001f, SI
  4182. MOVQ $0x00000040, R8
  4183. SUBQ SI, R8
  4184. DECQ DI
  4185. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
  4186. LEAQ -32(CX)(R8*1), SI
  4187. LEAQ -32(AX)(R8*1), R9
  4188. emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
  4189. MOVOU (SI), X4
  4190. MOVOU 16(SI), X5
  4191. MOVOA X4, (R9)
  4192. MOVOA X5, 16(R9)
  4193. ADDQ $0x20, R9
  4194. ADDQ $0x20, SI
  4195. ADDQ $0x20, R8
  4196. DECQ DI
  4197. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
  4198. emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
  4199. MOVOU -32(CX)(R8*1), X4
  4200. MOVOU -16(CX)(R8*1), X5
  4201. MOVOA X4, -32(AX)(R8*1)
  4202. MOVOA X5, -16(AX)(R8*1)
  4203. ADDQ $0x20, R8
  4204. CMPQ BX, R8
  4205. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
  4206. MOVOU X0, (AX)
  4207. MOVOU X1, 16(AX)
  4208. MOVOU X2, -32(AX)(BX*1)
  4209. MOVOU X3, -16(AX)(BX*1)
  4210. MOVQ DX, AX
  4211. emit_literal_done_emit_remainder_encodeBlockAsm10B:
  4212. MOVQ dst_base+0(FP), CX
  4213. SUBQ CX, AX
  4214. MOVQ AX, ret+48(FP)
  4215. RET
  4216. // func encodeBlockAsm8B(dst []byte, src []byte) int
  4217. // Requires: BMI, SSE2
  4218. TEXT ·encodeBlockAsm8B(SB), $1048-56
  4219. MOVQ dst_base+0(FP), AX
  4220. MOVQ $0x00000008, CX
  4221. LEAQ 24(SP), DX
  4222. PXOR X0, X0
  4223. zero_loop_encodeBlockAsm8B:
  4224. MOVOU X0, (DX)
  4225. MOVOU X0, 16(DX)
  4226. MOVOU X0, 32(DX)
  4227. MOVOU X0, 48(DX)
  4228. MOVOU X0, 64(DX)
  4229. MOVOU X0, 80(DX)
  4230. MOVOU X0, 96(DX)
  4231. MOVOU X0, 112(DX)
  4232. ADDQ $0x80, DX
  4233. DECQ CX
  4234. JNZ zero_loop_encodeBlockAsm8B
  4235. MOVL $0x00000000, 12(SP)
  4236. MOVQ src_len+32(FP), CX
  4237. LEAQ -9(CX), DX
  4238. LEAQ -8(CX), SI
  4239. MOVL SI, 8(SP)
  4240. SHRQ $0x05, CX
  4241. SUBL CX, DX
  4242. LEAQ (AX)(DX*1), DX
  4243. MOVQ DX, (SP)
  4244. MOVL $0x00000001, CX
  4245. MOVL CX, 16(SP)
  4246. MOVQ src_base+24(FP), DX
  4247. search_loop_encodeBlockAsm8B:
  4248. MOVL CX, SI
  4249. SUBL 12(SP), SI
  4250. SHRL $0x04, SI
  4251. LEAL 4(CX)(SI*1), SI
  4252. CMPL SI, 8(SP)
  4253. JGE emit_remainder_encodeBlockAsm8B
  4254. MOVQ (DX)(CX*1), DI
  4255. MOVL SI, 20(SP)
  4256. MOVQ $0x9e3779b1, R9
  4257. MOVQ DI, R10
  4258. MOVQ DI, R11
  4259. SHRQ $0x08, R11
  4260. SHLQ $0x20, R10
  4261. IMULQ R9, R10
  4262. SHRQ $0x38, R10
  4263. SHLQ $0x20, R11
  4264. IMULQ R9, R11
  4265. SHRQ $0x38, R11
  4266. MOVL 24(SP)(R10*4), SI
  4267. MOVL 24(SP)(R11*4), R8
  4268. MOVL CX, 24(SP)(R10*4)
  4269. LEAL 1(CX), R10
  4270. MOVL R10, 24(SP)(R11*4)
  4271. MOVQ DI, R10
  4272. SHRQ $0x10, R10
  4273. SHLQ $0x20, R10
  4274. IMULQ R9, R10
  4275. SHRQ $0x38, R10
  4276. MOVL CX, R9
  4277. SUBL 16(SP), R9
  4278. MOVL 1(DX)(R9*1), R11
  4279. MOVQ DI, R9
  4280. SHRQ $0x08, R9
  4281. CMPL R9, R11
  4282. JNE no_repeat_found_encodeBlockAsm8B
  4283. LEAL 1(CX), DI
  4284. MOVL 12(SP), R8
  4285. MOVL DI, SI
  4286. SUBL 16(SP), SI
  4287. JZ repeat_extend_back_end_encodeBlockAsm8B
  4288. repeat_extend_back_loop_encodeBlockAsm8B:
  4289. CMPL DI, R8
  4290. JLE repeat_extend_back_end_encodeBlockAsm8B
  4291. MOVB -1(DX)(SI*1), BL
  4292. MOVB -1(DX)(DI*1), R9
  4293. CMPB BL, R9
  4294. JNE repeat_extend_back_end_encodeBlockAsm8B
  4295. LEAL -1(DI), DI
  4296. DECL SI
  4297. JNZ repeat_extend_back_loop_encodeBlockAsm8B
  4298. repeat_extend_back_end_encodeBlockAsm8B:
  4299. MOVL 12(SP), SI
  4300. CMPL SI, DI
  4301. JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
  4302. MOVL DI, R9
  4303. MOVL DI, 12(SP)
  4304. LEAQ (DX)(SI*1), R10
  4305. SUBL SI, R9
  4306. LEAL -1(R9), SI
  4307. CMPL SI, $0x3c
  4308. JLT one_byte_repeat_emit_encodeBlockAsm8B
  4309. CMPL SI, $0x00000100
  4310. JLT two_bytes_repeat_emit_encodeBlockAsm8B
  4311. MOVB $0xf4, (AX)
  4312. MOVW SI, 1(AX)
  4313. ADDQ $0x03, AX
  4314. JMP memmove_long_repeat_emit_encodeBlockAsm8B
  4315. two_bytes_repeat_emit_encodeBlockAsm8B:
  4316. MOVB $0xf0, (AX)
  4317. MOVB SI, 1(AX)
  4318. ADDQ $0x02, AX
  4319. CMPL SI, $0x40
  4320. JL memmove_repeat_emit_encodeBlockAsm8B
  4321. JMP memmove_long_repeat_emit_encodeBlockAsm8B
  4322. one_byte_repeat_emit_encodeBlockAsm8B:
  4323. SHLB $0x02, SI
  4324. MOVB SI, (AX)
  4325. ADDQ $0x01, AX
  4326. memmove_repeat_emit_encodeBlockAsm8B:
  4327. LEAQ (AX)(R9*1), SI
  4328. // genMemMoveShort
  4329. CMPQ R9, $0x08
  4330. JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
  4331. CMPQ R9, $0x10
  4332. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
  4333. CMPQ R9, $0x20
  4334. JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
  4335. JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
  4336. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
  4337. MOVQ (R10), R11
  4338. MOVQ R11, (AX)
  4339. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4340. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
  4341. MOVQ (R10), R11
  4342. MOVQ -8(R10)(R9*1), R10
  4343. MOVQ R11, (AX)
  4344. MOVQ R10, -8(AX)(R9*1)
  4345. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4346. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
  4347. MOVOU (R10), X0
  4348. MOVOU -16(R10)(R9*1), X1
  4349. MOVOU X0, (AX)
  4350. MOVOU X1, -16(AX)(R9*1)
  4351. JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
  4352. emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
  4353. MOVOU (R10), X0
  4354. MOVOU 16(R10), X1
  4355. MOVOU -32(R10)(R9*1), X2
  4356. MOVOU -16(R10)(R9*1), X3
  4357. MOVOU X0, (AX)
  4358. MOVOU X1, 16(AX)
  4359. MOVOU X2, -32(AX)(R9*1)
  4360. MOVOU X3, -16(AX)(R9*1)
  4361. memmove_end_copy_repeat_emit_encodeBlockAsm8B:
  4362. MOVQ SI, AX
  4363. JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
  4364. memmove_long_repeat_emit_encodeBlockAsm8B:
  4365. LEAQ (AX)(R9*1), SI
  4366. // genMemMoveLong
  4367. MOVOU (R10), X0
  4368. MOVOU 16(R10), X1
  4369. MOVOU -32(R10)(R9*1), X2
  4370. MOVOU -16(R10)(R9*1), X3
  4371. MOVQ R9, R12
  4372. SHRQ $0x05, R12
  4373. MOVQ AX, R11
  4374. ANDL $0x0000001f, R11
  4375. MOVQ $0x00000040, R13
  4376. SUBQ R11, R13
  4377. DECQ R12
  4378. JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4379. LEAQ -32(R10)(R13*1), R11
  4380. LEAQ -32(AX)(R13*1), R14
  4381. emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
  4382. MOVOU (R11), X4
  4383. MOVOU 16(R11), X5
  4384. MOVOA X4, (R14)
  4385. MOVOA X5, 16(R14)
  4386. ADDQ $0x20, R14
  4387. ADDQ $0x20, R11
  4388. ADDQ $0x20, R13
  4389. DECQ R12
  4390. JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
  4391. emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
  4392. MOVOU -32(R10)(R13*1), X4
  4393. MOVOU -16(R10)(R13*1), X5
  4394. MOVOA X4, -32(AX)(R13*1)
  4395. MOVOA X5, -16(AX)(R13*1)
  4396. ADDQ $0x20, R13
  4397. CMPQ R9, R13
  4398. JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4399. MOVOU X0, (AX)
  4400. MOVOU X1, 16(AX)
  4401. MOVOU X2, -32(AX)(R9*1)
  4402. MOVOU X3, -16(AX)(R9*1)
  4403. MOVQ SI, AX
  4404. emit_literal_done_repeat_emit_encodeBlockAsm8B:
  4405. ADDL $0x05, CX
  4406. MOVL CX, SI
  4407. SUBL 16(SP), SI
  4408. MOVQ src_len+32(FP), R9
  4409. SUBL CX, R9
  4410. LEAQ (DX)(CX*1), R10
  4411. LEAQ (DX)(SI*1), SI
  4412. // matchLen
  4413. XORL R12, R12
  4414. CMPL R9, $0x08
  4415. JL matchlen_match4_repeat_extend_encodeBlockAsm8B
  4416. matchlen_loopback_repeat_extend_encodeBlockAsm8B:
  4417. MOVQ (R10)(R12*1), R11
  4418. XORQ (SI)(R12*1), R11
  4419. TESTQ R11, R11
  4420. JZ matchlen_loop_repeat_extend_encodeBlockAsm8B
  4421. #ifdef GOAMD64_v3
  4422. TZCNTQ R11, R11
  4423. #else
  4424. BSFQ R11, R11
  4425. #endif
  4426. SARQ $0x03, R11
  4427. LEAL (R12)(R11*1), R12
  4428. JMP repeat_extend_forward_end_encodeBlockAsm8B
  4429. matchlen_loop_repeat_extend_encodeBlockAsm8B:
  4430. LEAL -8(R9), R9
  4431. LEAL 8(R12), R12
  4432. CMPL R9, $0x08
  4433. JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
  4434. JZ repeat_extend_forward_end_encodeBlockAsm8B
  4435. matchlen_match4_repeat_extend_encodeBlockAsm8B:
  4436. CMPL R9, $0x04
  4437. JL matchlen_match2_repeat_extend_encodeBlockAsm8B
  4438. MOVL (R10)(R12*1), R11
  4439. CMPL (SI)(R12*1), R11
  4440. JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
  4441. SUBL $0x04, R9
  4442. LEAL 4(R12), R12
  4443. matchlen_match2_repeat_extend_encodeBlockAsm8B:
  4444. CMPL R9, $0x02
  4445. JL matchlen_match1_repeat_extend_encodeBlockAsm8B
  4446. MOVW (R10)(R12*1), R11
  4447. CMPW (SI)(R12*1), R11
  4448. JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
  4449. SUBL $0x02, R9
  4450. LEAL 2(R12), R12
  4451. matchlen_match1_repeat_extend_encodeBlockAsm8B:
  4452. CMPL R9, $0x01
  4453. JL repeat_extend_forward_end_encodeBlockAsm8B
  4454. MOVB (R10)(R12*1), R11
  4455. CMPB (SI)(R12*1), R11
  4456. JNE repeat_extend_forward_end_encodeBlockAsm8B
  4457. LEAL 1(R12), R12
  4458. repeat_extend_forward_end_encodeBlockAsm8B:
  4459. ADDL R12, CX
  4460. MOVL CX, SI
  4461. SUBL DI, SI
  4462. MOVL 16(SP), DI
  4463. TESTL R8, R8
  4464. JZ repeat_as_copy_encodeBlockAsm8B
  4465. // emitRepeat
  4466. MOVL SI, DI
  4467. LEAL -4(SI), SI
  4468. CMPL DI, $0x08
  4469. JLE repeat_two_match_repeat_encodeBlockAsm8B
  4470. CMPL DI, $0x0c
  4471. JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
  4472. cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
  4473. CMPL SI, $0x00000104
  4474. JLT repeat_three_match_repeat_encodeBlockAsm8B
  4475. LEAL -256(SI), SI
  4476. MOVW $0x0019, (AX)
  4477. MOVW SI, 2(AX)
  4478. ADDQ $0x04, AX
  4479. JMP repeat_end_emit_encodeBlockAsm8B
  4480. repeat_three_match_repeat_encodeBlockAsm8B:
  4481. LEAL -4(SI), SI
  4482. MOVW $0x0015, (AX)
  4483. MOVB SI, 2(AX)
  4484. ADDQ $0x03, AX
  4485. JMP repeat_end_emit_encodeBlockAsm8B
  4486. repeat_two_match_repeat_encodeBlockAsm8B:
  4487. SHLL $0x02, SI
  4488. ORL $0x01, SI
  4489. MOVW SI, (AX)
  4490. ADDQ $0x02, AX
  4491. JMP repeat_end_emit_encodeBlockAsm8B
  4492. XORQ R8, R8
  4493. LEAL 1(R8)(SI*4), SI
  4494. MOVB DI, 1(AX)
  4495. SARL $0x08, DI
  4496. SHLL $0x05, DI
  4497. ORL DI, SI
  4498. MOVB SI, (AX)
  4499. ADDQ $0x02, AX
  4500. JMP repeat_end_emit_encodeBlockAsm8B
  4501. repeat_as_copy_encodeBlockAsm8B:
  4502. // emitCopy
  4503. two_byte_offset_repeat_as_copy_encodeBlockAsm8B:
  4504. CMPL SI, $0x40
  4505. JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
  4506. CMPL DI, $0x00000800
  4507. JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
  4508. MOVL $0x00000001, R8
  4509. LEAL 16(R8), R8
  4510. MOVB DI, 1(AX)
  4511. SHRL $0x08, DI
  4512. SHLL $0x05, DI
  4513. ORL DI, R8
  4514. MOVB R8, (AX)
  4515. ADDQ $0x02, AX
  4516. SUBL $0x08, SI
  4517. // emitRepeat
  4518. LEAL -4(SI), SI
  4519. JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4520. MOVL SI, DI
  4521. LEAL -4(SI), SI
  4522. CMPL DI, $0x08
  4523. JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4524. CMPL DI, $0x0c
  4525. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4526. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4527. CMPL SI, $0x00000104
  4528. JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
  4529. LEAL -256(SI), SI
  4530. MOVW $0x0019, (AX)
  4531. MOVW SI, 2(AX)
  4532. ADDQ $0x04, AX
  4533. JMP repeat_end_emit_encodeBlockAsm8B
  4534. repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4535. LEAL -4(SI), SI
  4536. MOVW $0x0015, (AX)
  4537. MOVB SI, 2(AX)
  4538. ADDQ $0x03, AX
  4539. JMP repeat_end_emit_encodeBlockAsm8B
  4540. repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
  4541. SHLL $0x02, SI
  4542. ORL $0x01, SI
  4543. MOVW SI, (AX)
  4544. ADDQ $0x02, AX
  4545. JMP repeat_end_emit_encodeBlockAsm8B
  4546. XORQ R8, R8
  4547. LEAL 1(R8)(SI*4), SI
  4548. MOVB DI, 1(AX)
  4549. SARL $0x08, DI
  4550. SHLL $0x05, DI
  4551. ORL DI, SI
  4552. MOVB SI, (AX)
  4553. ADDQ $0x02, AX
  4554. JMP repeat_end_emit_encodeBlockAsm8B
  4555. long_offset_short_repeat_as_copy_encodeBlockAsm8B:
  4556. MOVB $0xee, (AX)
  4557. MOVW DI, 1(AX)
  4558. LEAL -60(SI), SI
  4559. ADDQ $0x03, AX
  4560. // emitRepeat
  4561. MOVL SI, DI
  4562. LEAL -4(SI), SI
  4563. CMPL DI, $0x08
  4564. JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4565. CMPL DI, $0x0c
  4566. JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4567. cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4568. CMPL SI, $0x00000104
  4569. JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
  4570. LEAL -256(SI), SI
  4571. MOVW $0x0019, (AX)
  4572. MOVW SI, 2(AX)
  4573. ADDQ $0x04, AX
  4574. JMP repeat_end_emit_encodeBlockAsm8B
  4575. repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4576. LEAL -4(SI), SI
  4577. MOVW $0x0015, (AX)
  4578. MOVB SI, 2(AX)
  4579. ADDQ $0x03, AX
  4580. JMP repeat_end_emit_encodeBlockAsm8B
  4581. repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
  4582. SHLL $0x02, SI
  4583. ORL $0x01, SI
  4584. MOVW SI, (AX)
  4585. ADDQ $0x02, AX
  4586. JMP repeat_end_emit_encodeBlockAsm8B
  4587. XORQ R8, R8
  4588. LEAL 1(R8)(SI*4), SI
  4589. MOVB DI, 1(AX)
  4590. SARL $0x08, DI
  4591. SHLL $0x05, DI
  4592. ORL DI, SI
  4593. MOVB SI, (AX)
  4594. ADDQ $0x02, AX
  4595. JMP repeat_end_emit_encodeBlockAsm8B
  4596. JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B
  4597. two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
  4598. CMPL SI, $0x0c
  4599. JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
  4600. MOVB $0x01, BL
  4601. LEAL -16(BX)(SI*4), SI
  4602. MOVB DI, 1(AX)
  4603. SHRL $0x08, DI
  4604. SHLL $0x05, DI
  4605. ORL DI, SI
  4606. MOVB SI, (AX)
  4607. ADDQ $0x02, AX
  4608. JMP repeat_end_emit_encodeBlockAsm8B
  4609. emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
  4610. MOVB $0x02, BL
  4611. LEAL -4(BX)(SI*4), SI
  4612. MOVB SI, (AX)
  4613. MOVW DI, 1(AX)
  4614. ADDQ $0x03, AX
  4615. repeat_end_emit_encodeBlockAsm8B:
  4616. MOVL CX, 12(SP)
  4617. JMP search_loop_encodeBlockAsm8B
  4618. no_repeat_found_encodeBlockAsm8B:
  4619. CMPL (DX)(SI*1), DI
  4620. JEQ candidate_match_encodeBlockAsm8B
  4621. SHRQ $0x08, DI
  4622. MOVL 24(SP)(R10*4), SI
  4623. LEAL 2(CX), R9
  4624. CMPL (DX)(R8*1), DI
  4625. JEQ candidate2_match_encodeBlockAsm8B
  4626. MOVL R9, 24(SP)(R10*4)
  4627. SHRQ $0x08, DI
  4628. CMPL (DX)(SI*1), DI
  4629. JEQ candidate3_match_encodeBlockAsm8B
  4630. MOVL 20(SP), CX
  4631. JMP search_loop_encodeBlockAsm8B
  4632. candidate3_match_encodeBlockAsm8B:
  4633. ADDL $0x02, CX
  4634. JMP candidate_match_encodeBlockAsm8B
  4635. candidate2_match_encodeBlockAsm8B:
  4636. MOVL R9, 24(SP)(R10*4)
  4637. INCL CX
  4638. MOVL R8, SI
  // The candidate at index SI matches the bytes at index CX. Extend the match
  // backwards while the preceding bytes agree, stopping when CX reaches the
  // index in 12(SP) or the candidate reaches index 0.
  4639. candidate_match_encodeBlockAsm8B:
  4640. MOVL 12(SP), DI // DI = lower bound for CX
  4641. TESTL SI, SI
  4642. JZ match_extend_back_end_encodeBlockAsm8B // candidate is already at index 0
  4643. match_extend_back_loop_encodeBlockAsm8B:
  4644. CMPL CX, DI
  4645. JLE match_extend_back_end_encodeBlockAsm8B
  4646. MOVB -1(DX)(SI*1), BL
  4647. MOVB -1(DX)(CX*1), R8
  4648. CMPB BL, R8
  4649. JNE match_extend_back_end_encodeBlockAsm8B
  4650. LEAL -1(CX), CX
  4651. DECL SI
  4652. JZ match_extend_back_end_encodeBlockAsm8B // ZF from DECL: candidate reached index 0
  4653. JMP match_extend_back_loop_encodeBlockAsm8B
  // Destination space check: (CX - 12(SP)) pending bytes plus 3 must stay below
  // the limit pointer held in (SP) (stored before this excerpt); otherwise the
  // function returns 0.
  4654. match_extend_back_end_encodeBlockAsm8B:
  4655. MOVL CX, DI
  4656. SUBL 12(SP), DI
  4657. LEAQ 3(AX)(DI*1), DI
  4658. CMPQ DI, (SP)
  4659. JL match_dst_size_check_encodeBlockAsm8B
  4660. MOVQ $0x00000000, ret+48(FP)
  4661. RET
  // Emit the pending literal bytes src[12(SP) .. CX) to the output at AX:
  // a 1-3 byte header carrying (count-1), followed by the bytes themselves.
  4662. match_dst_size_check_encodeBlockAsm8B:
  4663. MOVL CX, DI
  4664. MOVL 12(SP), R8
  4665. CMPL R8, DI
  4666. JEQ emit_literal_done_match_emit_encodeBlockAsm8B // nothing pending
  4667. MOVL DI, R9
  4668. MOVL DI, 12(SP) // bytes before CX are accounted for from here on
  4669. LEAQ (DX)(R8*1), DI // DI = pointer to the first pending byte
  4670. SUBL R8, R9 // R9 = count
  4671. LEAL -1(R9), R8 // R8 = count-1, the value stored in the header
  4672. CMPL R8, $0x3c
  4673. JLT one_byte_match_emit_encodeBlockAsm8B
  4674. CMPL R8, $0x00000100
  4675. JLT two_bytes_match_emit_encodeBlockAsm8B
  // 3-byte header: 0xf4, then count-1 as a 16-bit value.
  4676. MOVB $0xf4, (AX)
  4677. MOVW R8, 1(AX)
  4678. ADDQ $0x03, AX
  4679. JMP memmove_long_match_emit_encodeBlockAsm8B
  // 2-byte header: 0xf0, then count-1 as one byte.
  4680. two_bytes_match_emit_encodeBlockAsm8B:
  4681. MOVB $0xf0, (AX)
  4682. MOVB R8, 1(AX)
  4683. ADDQ $0x02, AX
  4684. CMPL R8, $0x40
  4685. JL memmove_match_emit_encodeBlockAsm8B // count <= 64: short copy
  4686. JMP memmove_long_match_emit_encodeBlockAsm8B
  // 1-byte header: (count-1) << 2.
  4687. one_byte_match_emit_encodeBlockAsm8B:
  4688. SHLB $0x02, R8
  4689. MOVB R8, (AX)
  4690. ADDQ $0x01, AX
  // Short copy (count <= 64), dispatched on size. Each case uses overlapping
  // head/tail moves, so no byte loop is needed.
  4691. memmove_match_emit_encodeBlockAsm8B:
  4692. LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literals
  4693. // genMemMoveShort
  4694. CMPQ R9, $0x08
  4695. JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
  4696. CMPQ R9, $0x10
  4697. JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
  4698. CMPQ R9, $0x20
  4699. JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
  4700. JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
  // count <= 8: one 8-byte move regardless of count, so up to 7 extra bytes are
  // read and written (presumably covered by slack the earlier limit checks
  // guarantee — confirm); AX is still advanced by exactly count.
  4701. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
  4702. MOVQ (DI), R10
  4703. MOVQ R10, (AX)
  4704. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4705. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
  4706. MOVQ (DI), R10
  4707. MOVQ -8(DI)(R9*1), DI
  4708. MOVQ R10, (AX)
  4709. MOVQ DI, -8(AX)(R9*1)
  4710. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4711. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
  4712. MOVOU (DI), X0
  4713. MOVOU -16(DI)(R9*1), X1
  4714. MOVOU X0, (AX)
  4715. MOVOU X1, -16(AX)(R9*1)
  4716. JMP memmove_end_copy_match_emit_encodeBlockAsm8B
  4717. emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
  4718. MOVOU (DI), X0
  4719. MOVOU 16(DI), X1
  4720. MOVOU -32(DI)(R9*1), X2
  4721. MOVOU -16(DI)(R9*1), X3
  4722. MOVOU X0, (AX)
  4723. MOVOU X1, 16(AX)
  4724. MOVOU X2, -32(AX)(R9*1)
  4725. MOVOU X3, -16(AX)(R9*1)
  4726. memmove_end_copy_match_emit_encodeBlockAsm8B:
  4727. MOVQ R8, AX
  4728. JMP emit_literal_done_match_emit_encodeBlockAsm8B
  // Long copy. Both jumps into this label require count-1 >= 64, so R9 >= 65.
  4729. memmove_long_match_emit_encodeBlockAsm8B:
  4730. LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literals
  4731. // genMemMoveLong
  // The first and last 32 source bytes are loaded now and stored last with
  // unaligned moves; the loop in between uses aligned stores.
  4732. MOVOU (DI), X0
  4733. MOVOU 16(DI), X1
  4734. MOVOU -32(DI)(R9*1), X2
  4735. MOVOU -16(DI)(R9*1), X3
  4736. MOVQ R9, R11
  4737. SHRQ $0x05, R11 // R11 = count / 32
  4738. MOVQ AX, R10
  4739. ANDL $0x0000001f, R10 // R10 = AX mod 32
  4740. MOVQ $0x00000040, R12
  4741. SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX+R12-32 is 32-byte aligned
  4742. DECQ R11
  // NOTE(review): DECQ leaves CF unchanged, so JA sees CF from the SUBQ above
  // (which cannot borrow) and ZF from DECQ. With R9 >= 65 the jump looks always
  // taken, which would leave the big_loop_back body unused here — confirm.
  4743. JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  4744. LEAQ -32(DI)(R12*1), R10
  4745. LEAQ -32(AX)(R12*1), R13
  4746. emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
  4747. MOVOU (R10), X4
  4748. MOVOU 16(R10), X5
  4749. MOVOA X4, (R13)
  4750. MOVOA X5, 16(R13)
  4751. ADDQ $0x20, R13
  4752. ADDQ $0x20, R10
  4753. ADDQ $0x20, R12
  4754. DECQ R11
  4755. JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
  // Copy 32 bytes per iteration at offset R12-32 while count >= R12.
  4756. emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
  4757. MOVOU -32(DI)(R12*1), X4
  4758. MOVOU -16(DI)(R12*1), X5
  4759. MOVOA X4, -32(AX)(R12*1)
  4760. MOVOA X5, -16(AX)(R12*1)
  4761. ADDQ $0x20, R12
  4762. CMPQ R9, R12
  4763. JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
  // Store the saved head and tail; they overlap whatever the loop wrote.
  4764. MOVOU X0, (AX)
  4765. MOVOU X1, 16(AX)
  4766. MOVOU X2, -32(AX)(R9*1)
  4767. MOVOU X3, -16(AX)(R9*1)
  4768. MOVQ R8, AX
  // Measure the match between position CX and candidate SI.
  // Both indices are advanced by 4 before comparing (the jumps into this code
  // follow a 32-bit compare of those bytes), so R10 counts only the bytes
  // matched beyond the first four.
  4769. emit_literal_done_match_emit_encodeBlockAsm8B:
  4770. match_nolit_loop_encodeBlockAsm8B:
  4771. MOVL CX, DI
  4772. SUBL SI, DI
  4773. MOVL DI, 16(SP) // 16(SP) = offset (CX - SI), reloaded after matchLen
  4774. ADDL $0x04, CX
  4775. ADDL $0x04, SI
  4776. MOVQ src_len+32(FP), DI
  4777. SUBL CX, DI // DI = bytes remaining from CX to the end of src
  4778. LEAQ (DX)(CX*1), R8 // R8 = pointer to current position
  4779. LEAQ (DX)(SI*1), SI // SI = pointer to candidate (index form no longer needed)
  4780. // matchLen
  4781. XORL R10, R10
  4782. CMPL DI, $0x08
  4783. JL matchlen_match4_match_nolit_encodeBlockAsm8B
  // Compare 8 bytes at a time; XOR is zero when all 8 agree.
  4784. matchlen_loopback_match_nolit_encodeBlockAsm8B:
  4785. MOVQ (R8)(R10*1), R9
  4786. XORQ (SI)(R10*1), R9
  4787. TESTQ R9, R9
  4788. JZ matchlen_loop_match_nolit_encodeBlockAsm8B
  // Mismatch inside this word: index of the lowest set bit / 8 = matching bytes.
  4789. #ifdef GOAMD64_v3
  4790. TZCNTQ R9, R9
  4791. #else
  4792. BSFQ R9, R9
  4793. #endif
  4794. SARQ $0x03, R9
  4795. LEAL (R10)(R9*1), R10
  4796. JMP match_nolit_end_encodeBlockAsm8B
  4797. matchlen_loop_match_nolit_encodeBlockAsm8B:
  4798. LEAL -8(DI), DI
  4799. LEAL 8(R10), R10
  4800. CMPL DI, $0x08
  4801. JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
  // NOTE(review): ZF here comes from the CMPL against 8, and DI == 8 already
  // took the JGE above, so this JZ looks like it can never be taken — confirm.
  4802. JZ match_nolit_end_encodeBlockAsm8B
  // Fewer than 8 bytes remain: try 4, then 2, then 1.
  4803. matchlen_match4_match_nolit_encodeBlockAsm8B:
  4804. CMPL DI, $0x04
  4805. JL matchlen_match2_match_nolit_encodeBlockAsm8B
  4806. MOVL (R8)(R10*1), R9
  4807. CMPL (SI)(R10*1), R9
  4808. JNE matchlen_match2_match_nolit_encodeBlockAsm8B
  4809. SUBL $0x04, DI
  4810. LEAL 4(R10), R10
  4811. matchlen_match2_match_nolit_encodeBlockAsm8B:
  4812. CMPL DI, $0x02
  4813. JL matchlen_match1_match_nolit_encodeBlockAsm8B
  4814. MOVW (R8)(R10*1), R9
  4815. CMPW (SI)(R10*1), R9
  4816. JNE matchlen_match1_match_nolit_encodeBlockAsm8B
  4817. SUBL $0x02, DI
  4818. LEAL 2(R10), R10
  4819. matchlen_match1_match_nolit_encodeBlockAsm8B:
  4820. CMPL DI, $0x01
  4821. JL match_nolit_end_encodeBlockAsm8B
  4822. MOVB (R8)(R10*1), R9
  4823. CMPB (SI)(R10*1), R9
  4824. JNE match_nolit_end_encodeBlockAsm8B
  4825. LEAL 1(R10), R10
  4826. match_nolit_end_encodeBlockAsm8B:
  4827. ADDL R10, CX // CX = first position after the match
  4828. MOVL 16(SP), SI // SI = offset saved above
  4829. ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
  4830. MOVL CX, 12(SP) // everything before CX will be covered by the copy emitted next
  4831. // emitCopy
  // Encode the match as copy/repeat operations at AX.
  // On entry SI = offset (reloaded from 16(SP)) and R10 = match length.
  4832. two_byte_offset_match_nolit_encodeBlockAsm8B:
  4833. CMPL R10, $0x40
  4834. JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B // length <= 64: single op
  4835. CMPL SI, $0x00000800
  4836. JAE long_offset_short_match_nolit_encodeBlockAsm8B // offset >= 2048
  // length > 64 and offset < 2048: 2-byte op with byte 0 = 17 | ((offset>>8)<<5)
  // and byte 1 = offset low byte; 8 is subtracted from the length and the
  // remainder is emitted as a repeat.
  4837. MOVL $0x00000001, DI
  4838. LEAL 16(DI), DI
  4839. MOVB SI, 1(AX)
  4840. SHRL $0x08, SI
  4841. SHLL $0x05, SI
  4842. ORL SI, DI
  4843. MOVB DI, (AX)
  4844. ADDQ $0x02, AX
  4845. SUBL $0x08, R10
  4846. // emitRepeat
  4847. LEAL -4(R10), R10
  4848. JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  // NOTE(review): the next six instructions follow an unconditional JMP and
  // carry no label, so they cannot execute.
  4849. MOVL R10, SI
  4850. LEAL -4(R10), R10
  4851. CMPL SI, $0x08
  4852. JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  4853. CMPL SI, $0x0c
  4854. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  4855. cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  4856. CMPL R10, $0x00000104
  4857. JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
  // 4-byte repeat: bytes 0x19 0x00, then (R10 - 256) as a 16-bit value.
  4858. LEAL -256(R10), R10
  4859. MOVW $0x0019, (AX)
  4860. MOVW R10, 2(AX)
  4861. ADDQ $0x04, AX
  4862. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // 3-byte repeat: bytes 0x15 0x00, then (R10 - 4) as one byte.
  4863. repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  4864. LEAL -4(R10), R10
  4865. MOVW $0x0015, (AX)
  4866. MOVB R10, 2(AX)
  4867. ADDQ $0x03, AX
  4868. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // 2-byte repeat: (R10 << 2) | 1 stored as a 16-bit value.
  // NOTE(review): the only jump to this label visible in this excerpt is in the
  // unreachable run above.
  4869. repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
  4870. SHLL $0x02, R10
  4871. ORL $0x01, R10
  4872. MOVW R10, (AX)
  4873. ADDQ $0x02, AX
  4874. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // NOTE(review): unlabeled instructions after an unconditional JMP; unreachable.
  4875. XORQ DI, DI
  4876. LEAL 1(DI)(R10*4), R10
  4877. MOVB SI, 1(AX)
  4878. SARL $0x08, SI
  4879. SHLL $0x05, SI
  4880. ORL SI, R10
  4881. MOVB R10, (AX)
  4882. ADDQ $0x02, AX
  4883. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // length > 64 and offset >= 2048: 3-byte op 0xee + 16-bit offset, 60 is
  // subtracted from the length and the remainder is emitted as a repeat.
  4884. long_offset_short_match_nolit_encodeBlockAsm8B:
  4885. MOVB $0xee, (AX)
  4886. MOVW SI, 1(AX)
  4887. LEAL -60(R10), R10
  4888. ADDQ $0x03, AX
  4889. // emitRepeat
  4890. MOVL R10, SI // SI = remaining length; R10 becomes remaining length - 4
  4891. LEAL -4(R10), R10
  4892. CMPL SI, $0x08
  4893. JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
  4894. CMPL SI, $0x0c
  // Taken or not, execution continues at the label on the next line.
  4895. JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
  4896. cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
  4897. CMPL R10, $0x00000104
  4898. JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
  // 4-byte repeat: bytes 0x19 0x00, then (R10 - 256) as a 16-bit value.
  4899. LEAL -256(R10), R10
  4900. MOVW $0x0019, (AX)
  4901. MOVW R10, 2(AX)
  4902. ADDQ $0x04, AX
  4903. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // 3-byte repeat: bytes 0x15 0x00, then (R10 - 4) as one byte.
  4904. repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
  4905. LEAL -4(R10), R10
  4906. MOVW $0x0015, (AX)
  4907. MOVB R10, 2(AX)
  4908. ADDQ $0x03, AX
  4909. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // 2-byte repeat: (R10 << 2) | 1 stored as a 16-bit value.
  4910. repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
  4911. SHLL $0x02, R10
  4912. ORL $0x01, R10
  4913. MOVW R10, (AX)
  4914. ADDQ $0x02, AX
  4915. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // NOTE(review): unlabeled instructions after an unconditional JMP; unreachable,
  // including the trailing JMP back to two_byte_offset_match_nolit.
  4916. XORQ DI, DI
  4917. LEAL 1(DI)(R10*4), R10
  4918. MOVB SI, 1(AX)
  4919. SARL $0x08, SI
  4920. SHLL $0x05, SI
  4921. ORL SI, R10
  4922. MOVB R10, (AX)
  4923. ADDQ $0x02, AX
  4924. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  4925. JMP two_byte_offset_match_nolit_encodeBlockAsm8B
  // length <= 64: a single copy op.
  4926. two_byte_offset_short_match_nolit_encodeBlockAsm8B:
  4927. CMPL R10, $0x0c
  4928. JGE emit_copy_three_match_nolit_encodeBlockAsm8B
  // length < 12: 2-byte op, byte 0 = (4*length - 16 + 1) | ((offset>>8)<<5),
  // byte 1 = offset low byte. No offset range check is made on this path
  // (presumably offsets always fit in this variant — confirm).
  4929. MOVB $0x01, BL
  4930. LEAL -16(BX)(R10*4), R10
  4931. MOVB SI, 1(AX)
  4932. SHRL $0x08, SI
  4933. SHLL $0x05, SI
  4934. ORL SI, R10
  4935. MOVB R10, (AX)
  4936. ADDQ $0x02, AX
  4937. JMP match_nolit_emitcopy_end_encodeBlockAsm8B
  // length 12..64: 3-byte op, byte 0 = low byte of (4*length - 4 + 2),
  // bytes 1-2 = offset as 16 bits.
  4938. emit_copy_three_match_nolit_encodeBlockAsm8B:
  4939. MOVB $0x02, BL
  4940. LEAL -4(BX)(R10*4), R10
  4941. MOVB R10, (AX)
  4942. MOVW SI, 1(AX)
  4943. ADDQ $0x03, AX
  // After a copy has been emitted: stop if CX reached the search limit in
  // 8(SP) or the output reached the limit pointer in (SP); otherwise index
  // positions CX-2 and CX in the hash table and test for an immediate match.
  4944. match_nolit_emitcopy_end_encodeBlockAsm8B:
  4945. CMPL CX, 8(SP)
  4946. JGE emit_remainder_encodeBlockAsm8B
  4947. MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes starting at CX-2
  4948. CMPQ AX, (SP)
  4949. JL match_nolit_dst_ok_encodeBlockAsm8B
  4950. MOVQ $0x00000000, ret+48(FP) // output limit reached: return 0
  4951. RET
  // Hash = ((4 low bytes) * 0x9e3779b1) top 8 bits, computed for the bytes at
  // CX-2 (R8) and at CX (SI).
  4952. match_nolit_dst_ok_encodeBlockAsm8B:
  4953. MOVQ $0x9e3779b1, R9
  4954. MOVQ DI, R8
  4955. SHRQ $0x10, DI // DI now starts at the bytes of position CX
  4956. MOVQ DI, SI
  4957. SHLQ $0x20, R8
  4958. IMULQ R9, R8
  4959. SHRQ $0x38, R8
  4960. SHLQ $0x20, SI
  4961. IMULQ R9, SI
  4962. SHRQ $0x38, SI
  4963. LEAL -2(CX), R9
  4964. LEAQ 24(SP)(SI*4), R10 // R10 = address of the table slot for position CX
  4965. MOVL (R10), SI // SI = previous entry in that slot (the candidate)
  4966. MOVL R9, 24(SP)(R8*4) // table[hash(CX-2)] = CX-2
  4967. MOVL CX, (R10) // table[hash(CX)] = CX
  4968. CMPL (DX)(SI*1), DI
  4969. JEQ match_nolit_loop_encodeBlockAsm8B // 4 bytes agree: new match, no literals between
  4970. INCL CX
  4971. JMP search_loop_encodeBlockAsm8B
  // Final stage: emit src[12(SP) .. src_len) as literals and return the number
  // of bytes written (AX - dst_base). Returns 0 instead when the literals plus
  // 3 bytes would reach the output limit pointer in (SP).
  4972. emit_remainder_encodeBlockAsm8B:
  4973. MOVQ src_len+32(FP), CX
  4974. SUBL 12(SP), CX
  4975. LEAQ 3(AX)(CX*1), CX
  4976. CMPQ CX, (SP)
  4977. JL emit_remainder_ok_encodeBlockAsm8B
  4978. MOVQ $0x00000000, ret+48(FP)
  4979. RET
  4980. emit_remainder_ok_encodeBlockAsm8B:
  4981. MOVQ src_len+32(FP), CX
  4982. MOVL 12(SP), BX
  4983. CMPL BX, CX
  4984. JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B // nothing left to emit
  4985. MOVL CX, SI
  4986. MOVL CX, 12(SP)
  4987. LEAQ (DX)(BX*1), CX // CX = pointer to the first remaining byte
  4988. SUBL BX, SI // SI = count
  4989. LEAL -1(SI), DX // DX = count-1; DX is scratch from here on
  4990. CMPL DX, $0x3c
  4991. JLT one_byte_emit_remainder_encodeBlockAsm8B
  4992. CMPL DX, $0x00000100
  4993. JLT two_bytes_emit_remainder_encodeBlockAsm8B
  // 3-byte header: 0xf4, then count-1 as a 16-bit value.
  4994. MOVB $0xf4, (AX)
  4995. MOVW DX, 1(AX)
  4996. ADDQ $0x03, AX
  4997. JMP memmove_long_emit_remainder_encodeBlockAsm8B
  // 2-byte header: 0xf0, then count-1 as one byte.
  4998. two_bytes_emit_remainder_encodeBlockAsm8B:
  4999. MOVB $0xf0, (AX)
  5000. MOVB DL, 1(AX)
  5001. ADDQ $0x02, AX
  5002. CMPL DX, $0x40
  5003. JL memmove_emit_remainder_encodeBlockAsm8B
  5004. JMP memmove_long_emit_remainder_encodeBlockAsm8B
  // 1-byte header: (count-1) << 2.
  5005. one_byte_emit_remainder_encodeBlockAsm8B:
  5006. SHLB $0x02, DL
  5007. MOVB DL, (AX)
  5008. ADDQ $0x01, AX
  // Short copy (count <= 64). Unlike the 8-byte-minimum variant used earlier in
  // this function, every size class here touches exactly count bytes.
  5009. memmove_emit_remainder_encodeBlockAsm8B:
  5010. LEAQ (AX)(SI*1), DX // DX = output pointer after the literals
  5011. MOVL SI, BX // BX = count
  5012. // genMemMoveShort
  5013. CMPQ BX, $0x03
  5014. JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
  5015. JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
  5016. CMPQ BX, $0x08
  5017. JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
  5018. CMPQ BX, $0x10
  5019. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
  5020. CMPQ BX, $0x20
  5021. JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
  5022. JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
  // First and last byte; for count == 1 both moves hit the same byte.
  5023. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
  5024. MOVB (CX), SI
  5025. MOVB -1(CX)(BX*1), CL
  5026. MOVB SI, (AX)
  5027. MOVB CL, -1(AX)(BX*1)
  5028. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5029. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
  5030. MOVW (CX), SI
  5031. MOVB 2(CX), CL
  5032. MOVW SI, (AX)
  5033. MOVB CL, 2(AX)
  5034. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  // Remaining cases: overlapping head and tail moves of the class width.
  5035. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
  5036. MOVL (CX), SI
  5037. MOVL -4(CX)(BX*1), CX
  5038. MOVL SI, (AX)
  5039. MOVL CX, -4(AX)(BX*1)
  5040. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5041. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
  5042. MOVQ (CX), SI
  5043. MOVQ -8(CX)(BX*1), CX
  5044. MOVQ SI, (AX)
  5045. MOVQ CX, -8(AX)(BX*1)
  5046. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5047. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
  5048. MOVOU (CX), X0
  5049. MOVOU -16(CX)(BX*1), X1
  5050. MOVOU X0, (AX)
  5051. MOVOU X1, -16(AX)(BX*1)
  5052. JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
  5053. emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
  5054. MOVOU (CX), X0
  5055. MOVOU 16(CX), X1
  5056. MOVOU -32(CX)(BX*1), X2
  5057. MOVOU -16(CX)(BX*1), X3
  5058. MOVOU X0, (AX)
  5059. MOVOU X1, 16(AX)
  5060. MOVOU X2, -32(AX)(BX*1)
  5061. MOVOU X3, -16(AX)(BX*1)
  5062. memmove_end_copy_emit_remainder_encodeBlockAsm8B:
  5063. MOVQ DX, AX
  5064. JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
  // Long copy. Both jumps into this label require count-1 >= 64.
  5065. memmove_long_emit_remainder_encodeBlockAsm8B:
  5066. LEAQ (AX)(SI*1), DX // DX = output pointer after the literals
  5067. MOVL SI, BX // BX = count
  5068. // genMemMoveLong
  // Head and tail (32 bytes each) are loaded now and stored last, unaligned.
  5069. MOVOU (CX), X0
  5070. MOVOU 16(CX), X1
  5071. MOVOU -32(CX)(BX*1), X2
  5072. MOVOU -16(CX)(BX*1), X3
  5073. MOVQ BX, DI
  5074. SHRQ $0x05, DI // DI = count / 32
  5075. MOVQ AX, SI
  5076. ANDL $0x0000001f, SI // SI = AX mod 32
  5077. MOVQ $0x00000040, R8
  5078. SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is 32-byte aligned
  5079. DECQ DI
  // NOTE(review): DECQ leaves CF unchanged, so JA combines CF from the SUBQ
  // (no borrow possible) with ZF from DECQ; with count >= 65 it looks always
  // taken, leaving the big_loop_back body unused here — confirm.
  5080. JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
  5081. LEAQ -32(CX)(R8*1), SI
  5082. LEAQ -32(AX)(R8*1), R9
  5083. emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
  5084. MOVOU (SI), X4
  5085. MOVOU 16(SI), X5
  5086. MOVOA X4, (R9)
  5087. MOVOA X5, 16(R9)
  5088. ADDQ $0x20, R9
  5089. ADDQ $0x20, SI
  5090. ADDQ $0x20, R8
  5091. DECQ DI
  5092. JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
  // Copy 32 bytes per iteration at offset R8-32 while count >= R8.
  5093. emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
  5094. MOVOU -32(CX)(R8*1), X4
  5095. MOVOU -16(CX)(R8*1), X5
  5096. MOVOA X4, -32(AX)(R8*1)
  5097. MOVOA X5, -16(AX)(R8*1)
  5098. ADDQ $0x20, R8
  5099. CMPQ BX, R8
  5100. JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
  5101. MOVOU X0, (AX)
  5102. MOVOU X1, 16(AX)
  5103. MOVOU X2, -32(AX)(BX*1)
  5104. MOVOU X3, -16(AX)(BX*1)
  5105. MOVQ DX, AX
  // Return value = bytes written = AX - dst_base.
  5106. emit_literal_done_emit_remainder_encodeBlockAsm8B:
  5107. MOVQ dst_base+0(FP), CX
  5108. SUBQ CX, AX
  5109. MOVQ AX, ret+48(FP)
  5110. RET
  5111. // func encodeBetterBlockAsm(dst []byte, src []byte) int
  5112. // Requires: BMI, SSE2
  // Stack frame as used by the code visible in this function:
  //   0(SP)       output limit pointer: dst + (src_len - 6 - src_len/32)
  //   8(SP)       search limit index: src_len - 8
  //   12(SP)      index of the first source byte not yet emitted (starts at 0)
  //   16(SP)      offset of the previous match (starts at 0)
  //   20(SP)      next search position, saved by the search loop
  //   24(SP)      first hash table, 65536 4-byte entries (262144 bytes)
  //   262168(SP)  second hash table, 16384 4-byte entries (65536 bytes)
  // Registers kept across the main loop: AX = output pointer, DX = src base,
  // CX = current source index.
  5113. TEXT ·encodeBetterBlockAsm(SB), $327704-56
  5114. MOVQ dst_base+0(FP), AX
  // Zero both tables: 0xa00 iterations of 128 bytes = 327680 bytes from 24(SP).
  5115. MOVQ $0x00000a00, CX
  5116. LEAQ 24(SP), DX
  5117. PXOR X0, X0
  5118. zero_loop_encodeBetterBlockAsm:
  5119. MOVOU X0, (DX)
  5120. MOVOU X0, 16(DX)
  5121. MOVOU X0, 32(DX)
  5122. MOVOU X0, 48(DX)
  5123. MOVOU X0, 64(DX)
  5124. MOVOU X0, 80(DX)
  5125. MOVOU X0, 96(DX)
  5126. MOVOU X0, 112(DX)
  5127. ADDQ $0x80, DX
  5128. DECQ CX
  5129. JNZ zero_loop_encodeBetterBlockAsm
  5130. MOVL $0x00000000, 12(SP)
  5131. MOVQ src_len+32(FP), CX
  5132. LEAQ -6(CX), DX
  5133. LEAQ -8(CX), SI
  5134. MOVL SI, 8(SP)
  5135. SHRQ $0x05, CX
  5136. SUBL CX, DX // DX = src_len - 6 - src_len/32
  5137. LEAQ (AX)(DX*1), DX
  5138. MOVQ DX, (SP)
  5139. MOVL $0x00000001, CX // searching starts at index 1
  5140. MOVL $0x00000000, 16(SP)
  5141. MOVQ src_base+24(FP), DX
  // Main search loop: hash the bytes at CX into both tables, record CX in both
  // slots, and test the two previous occupants as match candidates.
  5142. search_loop_encodeBetterBlockAsm:
  // Next position = CX + 1 + ((CX - 12(SP)) >> 7), with the skip capped so the
  // step is at most 100.
  5143. MOVL CX, SI
  5144. SUBL 12(SP), SI
  5145. SHRL $0x07, SI
  5146. CMPL SI, $0x63
  5147. JLE check_maxskip_ok_encodeBetterBlockAsm
  5148. LEAL 100(CX), SI
  5149. JMP check_maxskip_cont_encodeBetterBlockAsm
  5150. check_maxskip_ok_encodeBetterBlockAsm:
  5151. LEAL 1(CX)(SI*1), SI
  5152. check_maxskip_cont_encodeBetterBlockAsm:
  5153. CMPL SI, 8(SP)
  5154. JGE emit_remainder_encodeBetterBlockAsm // next position is past the search limit
  5155. MOVQ (DX)(CX*1), DI // DI = 8 source bytes at CX
  5156. MOVL SI, 20(SP) // remember the next position
  5157. MOVQ $0x00cf1bbcdcbfa563, R9
  5158. MOVQ $0x9e3779b1, SI
  5159. MOVQ DI, R10
  5160. MOVQ DI, R11
  // R10 = 16-bit hash of the low 7 bytes (index into the table at 24(SP)).
  5161. SHLQ $0x08, R10
  5162. IMULQ R9, R10
  5163. SHRQ $0x30, R10
  // R11 = 14-bit hash of the low 4 bytes (index into the table at 262168(SP)).
  5164. SHLQ $0x20, R11
  5165. IMULQ SI, R11
  5166. SHRQ $0x32, R11
  5167. MOVL 24(SP)(R10*4), SI // SI = candidate from the 7-byte-hash table
  5168. MOVL 262168(SP)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
  5169. MOVL CX, 24(SP)(R10*4)
  5170. MOVL CX, 262168(SP)(R11*4)
  // Both candidates are verified by comparing 4 bytes only.
  5171. CMPL (DX)(SI*1), DI
  5172. JEQ candidate_match_encodeBetterBlockAsm
  5173. CMPL (DX)(R8*1), DI
  5174. JEQ candidateS_match_encodeBetterBlockAsm
  5175. MOVL 20(SP), CX
  5176. JMP search_loop_encodeBetterBlockAsm
  // The 4-byte-hash candidate matched. Before accepting it, look up position
  // CX+1 in the 7-byte-hash table (also recording CX+1 there) and use that
  // candidate instead if its 4 bytes match.
  5177. candidateS_match_encodeBetterBlockAsm:
  5178. SHRQ $0x08, DI // DI now starts at the bytes of position CX+1
  5179. MOVQ DI, R10
  5180. SHLQ $0x08, R10
  5181. IMULQ R9, R10
  5182. SHRQ $0x30, R10
  5183. MOVL 24(SP)(R10*4), SI
  5184. INCL CX
  5185. MOVL CX, 24(SP)(R10*4)
  5186. CMPL (DX)(SI*1), DI
  5187. JEQ candidate_match_encodeBetterBlockAsm
  // No match at CX+1: step back to CX and use the 4-byte-hash candidate.
  5188. DECL CX
  5189. MOVL R8, SI
  // The candidate at index SI matches the bytes at index CX. Extend the match
  // backwards while the preceding bytes agree, stopping when CX reaches the
  // index in 12(SP) or the candidate reaches index 0.
  5190. candidate_match_encodeBetterBlockAsm:
  5191. MOVL 12(SP), DI // DI = lower bound for CX
  5192. TESTL SI, SI
  5193. JZ match_extend_back_end_encodeBetterBlockAsm // candidate is already at index 0
  5194. match_extend_back_loop_encodeBetterBlockAsm:
  5195. CMPL CX, DI
  5196. JLE match_extend_back_end_encodeBetterBlockAsm
  5197. MOVB -1(DX)(SI*1), BL
  5198. MOVB -1(DX)(CX*1), R8
  5199. CMPB BL, R8
  5200. JNE match_extend_back_end_encodeBetterBlockAsm
  5201. LEAL -1(CX), CX
  5202. DECL SI
  5203. JZ match_extend_back_end_encodeBetterBlockAsm // ZF from DECL: candidate reached index 0
  5204. JMP match_extend_back_loop_encodeBetterBlockAsm
  // Output space check: (CX - 12(SP)) pending bytes plus 5 must stay below the
  // limit pointer in (SP); otherwise the function returns 0.
  5205. match_extend_back_end_encodeBetterBlockAsm:
  5206. MOVL CX, DI
  5207. SUBL 12(SP), DI
  5208. LEAQ 5(AX)(DI*1), DI
  5209. CMPQ DI, (SP)
  5210. JL match_dst_size_check_encodeBetterBlockAsm
  5211. MOVQ $0x00000000, ret+48(FP)
  5212. RET
  // Measure the match, then decide whether to accept it.
  // DI keeps the match start (end of the pending literals); CX and SI advance
  // by 4 past the bytes already compared, so R12 counts only bytes matched
  // beyond the first four.
  5213. match_dst_size_check_encodeBetterBlockAsm:
  5214. MOVL CX, DI
  5215. ADDL $0x04, CX
  5216. ADDL $0x04, SI
  5217. MOVQ src_len+32(FP), R8
  5218. SUBL CX, R8 // R8 = bytes remaining from CX to the end of src
  5219. LEAQ (DX)(CX*1), R9 // R9 = pointer to current position
  5220. LEAQ (DX)(SI*1), R10 // R10 = pointer to candidate
  5221. // matchLen
  5222. XORL R12, R12
  5223. CMPL R8, $0x08
  5224. JL matchlen_match4_match_nolit_encodeBetterBlockAsm
  // Compare 8 bytes at a time; XOR is zero when all 8 agree.
  5225. matchlen_loopback_match_nolit_encodeBetterBlockAsm:
  5226. MOVQ (R9)(R12*1), R11
  5227. XORQ (R10)(R12*1), R11
  5228. TESTQ R11, R11
  5229. JZ matchlen_loop_match_nolit_encodeBetterBlockAsm
  // Mismatch inside this word: index of the lowest set bit / 8 = matching bytes.
  5230. #ifdef GOAMD64_v3
  5231. TZCNTQ R11, R11
  5232. #else
  5233. BSFQ R11, R11
  5234. #endif
  5235. SARQ $0x03, R11
  5236. LEAL (R12)(R11*1), R12
  5237. JMP match_nolit_end_encodeBetterBlockAsm
  5238. matchlen_loop_match_nolit_encodeBetterBlockAsm:
  5239. LEAL -8(R8), R8
  5240. LEAL 8(R12), R12
  5241. CMPL R8, $0x08
  5242. JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm
  // NOTE(review): ZF here comes from the CMPL against 8, and R8 == 8 already
  // took the JGE above, so this JZ looks like it can never be taken — confirm.
  5243. JZ match_nolit_end_encodeBetterBlockAsm
  // Fewer than 8 bytes remain: try 4, then 2, then 1.
  5244. matchlen_match4_match_nolit_encodeBetterBlockAsm:
  5245. CMPL R8, $0x04
  5246. JL matchlen_match2_match_nolit_encodeBetterBlockAsm
  5247. MOVL (R9)(R12*1), R11
  5248. CMPL (R10)(R12*1), R11
  5249. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
  5250. SUBL $0x04, R8
  5251. LEAL 4(R12), R12
  5252. matchlen_match2_match_nolit_encodeBetterBlockAsm:
  5253. CMPL R8, $0x02
  5254. JL matchlen_match1_match_nolit_encodeBetterBlockAsm
  5255. MOVW (R9)(R12*1), R11
  5256. CMPW (R10)(R12*1), R11
  5257. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
  5258. SUBL $0x02, R8
  5259. LEAL 2(R12), R12
  5260. matchlen_match1_match_nolit_encodeBetterBlockAsm:
  5261. CMPL R8, $0x01
  5262. JL match_nolit_end_encodeBetterBlockAsm
  5263. MOVB (R9)(R12*1), R11
  5264. CMPB (R10)(R12*1), R11
  5265. JNE match_nolit_end_encodeBetterBlockAsm
  5266. LEAL 1(R12), R12
  5267. match_nolit_end_encodeBetterBlockAsm:
  5268. MOVL CX, R8
  5269. SUBL SI, R8 // R8 = offset (both indices were advanced by 4, so the difference is unchanged)
  5270. // Check if repeat
  5271. CMPL 16(SP), R8
  5272. JEQ match_is_repeat_encodeBetterBlockAsm // same offset as the previous match
  // Reject the match when it has at most 1 byte beyond the first four and its
  // offset exceeds 65535; resume searching one past the saved next position.
  5273. CMPL R12, $0x01
  5274. JG match_length_ok_encodeBetterBlockAsm
  5275. CMPL R8, $0x0000ffff
  5276. JLE match_length_ok_encodeBetterBlockAsm
  5277. MOVL 20(SP), CX
  5278. INCL CX
  5279. JMP search_loop_encodeBetterBlockAsm
  // Match accepted: remember its offset in 16(SP), then emit the pending
  // literal bytes src[12(SP) .. DI) as a 1-5 byte header carrying (count-1)
  // followed by the bytes themselves.
  5280. match_length_ok_encodeBetterBlockAsm:
  5281. MOVL R8, 16(SP)
  5282. MOVL 12(SP), SI
  5283. CMPL SI, DI
  5284. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm // nothing pending
  5285. MOVL DI, R9
  5286. MOVL DI, 12(SP)
  5287. LEAQ (DX)(SI*1), R10 // R10 = pointer to the first pending byte
  5288. SUBL SI, R9 // R9 = count
  5289. LEAL -1(R9), SI // SI = count-1, the value stored in the header
  5290. CMPL SI, $0x3c
  5291. JLT one_byte_match_emit_encodeBetterBlockAsm
  5292. CMPL SI, $0x00000100
  5293. JLT two_bytes_match_emit_encodeBetterBlockAsm
  5294. CMPL SI, $0x00010000
  5295. JLT three_bytes_match_emit_encodeBetterBlockAsm
  5296. CMPL SI, $0x01000000
  5297. JLT four_bytes_match_emit_encodeBetterBlockAsm
  // 5-byte header: 0xfc, then count-1 as a 32-bit value.
  5298. MOVB $0xfc, (AX)
  5299. MOVL SI, 1(AX)
  5300. ADDQ $0x05, AX
  5301. JMP memmove_long_match_emit_encodeBetterBlockAsm
  // 4-byte header: 0xf8, then the low 24 bits of count-1 (16-bit store + 1 byte).
  5302. four_bytes_match_emit_encodeBetterBlockAsm:
  5303. MOVL SI, R11
  5304. SHRL $0x10, R11
  5305. MOVB $0xf8, (AX)
  5306. MOVW SI, 1(AX)
  5307. MOVB R11, 3(AX)
  5308. ADDQ $0x04, AX
  5309. JMP memmove_long_match_emit_encodeBetterBlockAsm
  // 3-byte header: 0xf4, then count-1 as a 16-bit value.
  5310. three_bytes_match_emit_encodeBetterBlockAsm:
  5311. MOVB $0xf4, (AX)
  5312. MOVW SI, 1(AX)
  5313. ADDQ $0x03, AX
  5314. JMP memmove_long_match_emit_encodeBetterBlockAsm
  // 2-byte header: 0xf0, then count-1 as one byte.
  5315. two_bytes_match_emit_encodeBetterBlockAsm:
  5316. MOVB $0xf0, (AX)
  5317. MOVB SI, 1(AX)
  5318. ADDQ $0x02, AX
  5319. CMPL SI, $0x40
  5320. JL memmove_match_emit_encodeBetterBlockAsm // count <= 64: short copy
  5321. JMP memmove_long_match_emit_encodeBetterBlockAsm
  // 1-byte header: (count-1) << 2.
  5322. one_byte_match_emit_encodeBetterBlockAsm:
  5323. SHLB $0x02, SI
  5324. MOVB SI, (AX)
  5325. ADDQ $0x01, AX
  // Short copy (count <= 64), dispatched on size, using overlapping head/tail
  // moves.
  5326. memmove_match_emit_encodeBetterBlockAsm:
  5327. LEAQ (AX)(R9*1), SI // SI = output pointer after the literals
  5328. // genMemMoveShort
  5329. CMPQ R9, $0x04
  5330. JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
  5331. CMPQ R9, $0x08
  5332. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
  5333. CMPQ R9, $0x10
  5334. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
  5335. CMPQ R9, $0x20
  5336. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
  5337. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
  // count <= 4: one 4-byte move regardless of count, so up to 3 extra bytes are
  // read and written (presumably covered by slack the earlier limit checks
  // guarantee — confirm); AX is still advanced by exactly count.
  5338. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
  5339. MOVL (R10), R11
  5340. MOVL R11, (AX)
  5341. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5342. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
  5343. MOVL (R10), R11
  5344. MOVL -4(R10)(R9*1), R10
  5345. MOVL R11, (AX)
  5346. MOVL R10, -4(AX)(R9*1)
  5347. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5348. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
  5349. MOVQ (R10), R11
  5350. MOVQ -8(R10)(R9*1), R10
  5351. MOVQ R11, (AX)
  5352. MOVQ R10, -8(AX)(R9*1)
  5353. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5354. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
  5355. MOVOU (R10), X0
  5356. MOVOU -16(R10)(R9*1), X1
  5357. MOVOU X0, (AX)
  5358. MOVOU X1, -16(AX)(R9*1)
  5359. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
  5360. emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
  5361. MOVOU (R10), X0
  5362. MOVOU 16(R10), X1
  5363. MOVOU -32(R10)(R9*1), X2
  5364. MOVOU -16(R10)(R9*1), X3
  5365. MOVOU X0, (AX)
  5366. MOVOU X1, 16(AX)
  5367. MOVOU X2, -32(AX)(R9*1)
  5368. MOVOU X3, -16(AX)(R9*1)
  5369. memmove_end_copy_match_emit_encodeBetterBlockAsm:
  5370. MOVQ SI, AX
  5371. JMP emit_literal_done_match_emit_encodeBetterBlockAsm
  // Long copy. Every jump into this label requires count-1 >= 64, so R9 >= 65.
  5372. memmove_long_match_emit_encodeBetterBlockAsm:
  5373. LEAQ (AX)(R9*1), SI // SI = output pointer after the literals
  5374. // genMemMoveLong
  // Head and tail (32 bytes each) are loaded now and stored last, unaligned.
  5375. MOVOU (R10), X0
  5376. MOVOU 16(R10), X1
  5377. MOVOU -32(R10)(R9*1), X2
  5378. MOVOU -16(R10)(R9*1), X3
  5379. MOVQ R9, R13
  5380. SHRQ $0x05, R13 // R13 = count / 32
  5381. MOVQ AX, R11
  5382. ANDL $0x0000001f, R11 // R11 = AX mod 32
  5383. MOVQ $0x00000040, R14
  5384. SUBQ R11, R14 // R14 = 64 - (AX mod 32), so AX+R14-32 is 32-byte aligned
  5385. DECQ R13
  // NOTE(review): DECQ leaves CF unchanged, so JA combines CF from the SUBQ
  // (no borrow possible) with ZF from DECQ; with R9 >= 65 it looks always
  // taken, leaving the big_loop_back body unused here — confirm.
  5386. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5387. LEAQ -32(R10)(R14*1), R11
  5388. LEAQ -32(AX)(R14*1), R15
  5389. emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
  5390. MOVOU (R11), X4
  5391. MOVOU 16(R11), X5
  5392. MOVOA X4, (R15)
  5393. MOVOA X5, 16(R15)
  5394. ADDQ $0x20, R15
  5395. ADDQ $0x20, R11
  5396. ADDQ $0x20, R14
  5397. DECQ R13
  5398. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
  // Copy 32 bytes per iteration at offset R14-32 while count >= R14.
  5399. emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  5400. MOVOU -32(R10)(R14*1), X4
  5401. MOVOU -16(R10)(R14*1), X5
  5402. MOVOA X4, -32(AX)(R14*1)
  5403. MOVOA X5, -16(AX)(R14*1)
  5404. ADDQ $0x20, R14
  5405. CMPQ R9, R14
  5406. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5407. MOVOU X0, (AX)
  5408. MOVOU X1, 16(AX)
  5409. MOVOU X2, -32(AX)(R9*1)
  5410. MOVOU X3, -16(AX)(R9*1)
  5411. MOVQ SI, AX
  // Literals are out; advance past the match and encode it.
  // R12 = bytes matched beyond the first four, R8 = offset.
  5412. emit_literal_done_match_emit_encodeBetterBlockAsm:
  5413. ADDL R12, CX // CX = first position after the match
  5414. ADDL $0x04, R12 // R12 = total match length
  5415. MOVL CX, 12(SP) // everything before CX is covered once the copy is emitted
  5416. // emitCopy
  5417. CMPL R8, $0x00010000
  5418. JL two_byte_offset_match_nolit_encodeBetterBlockAsm // offset < 65536
  // Offset >= 65536: 5-byte ops carrying a 32-bit offset.
  5419. four_bytes_loop_back_match_nolit_encodeBetterBlockAsm:
  5420. CMPL R12, $0x40
  5421. JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm
  // length > 64: emit 0xff + 32-bit offset, subtract 64 from the length, and
  // emit the rest as a repeat when at least 4 remain.
  5422. MOVB $0xff, (AX)
  5423. MOVL R8, 1(AX)
  5424. LEAL -64(R12), R12
  5425. ADDQ $0x05, AX
  5426. CMPL R12, $0x04
  5427. JL four_bytes_remain_match_nolit_encodeBetterBlockAsm
  5428. // emitRepeat
  // SI = remaining length; R12 becomes remaining length - 4, the value the
  // repeat encodings below are built from.
  5429. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
  5430. MOVL R12, SI
  5431. LEAL -4(R12), R12
  5432. CMPL SI, $0x08
  5433. JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
  5434. CMPL SI, $0x0c
  5435. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
  5436. CMPL R8, $0x00000800
  5437. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
  5438. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
  5439. CMPL R12, $0x00000104
  5440. JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
  5441. CMPL R12, $0x00010100
  5442. JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
  5443. CMPL R12, $0x0100ffff
  5444. JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
  // Too long for one 5-byte repeat: emit the fixed bytes 1d 00 fb ff ff,
  // subtract 16842747 (0x0100fffb) from R12, and go around again.
  5445. LEAL -16842747(R12), R12
  5446. MOVW $0x001d, (AX)
  5447. MOVW $0xfffb, 2(AX)
  5448. MOVB $0xff, 4(AX)
  5449. ADDQ $0x05, AX
  5450. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
  // 5-byte repeat: bytes 0x1d 0x00, then (R12 - 65536) as 24 bits. R8 is reused
  // as scratch here, so the offset is no longer available afterwards.
  5451. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
  5452. LEAL -65536(R12), R12
  5453. MOVL R12, R8
  5454. MOVW $0x001d, (AX)
  5455. MOVW R12, 2(AX)
  5456. SARL $0x10, R8
  5457. MOVB R8, 4(AX)
  5458. ADDQ $0x05, AX
  5459. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 4-byte repeat: bytes 0x19 0x00, then (R12 - 256) as a 16-bit value.
  5460. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
  5461. LEAL -256(R12), R12
  5462. MOVW $0x0019, (AX)
  5463. MOVW R12, 2(AX)
  5464. ADDQ $0x04, AX
  5465. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 3-byte repeat: bytes 0x15 0x00, then (R12 - 4) as one byte.
  5466. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
  5467. LEAL -4(R12), R12
  5468. MOVW $0x0015, (AX)
  5469. MOVB R12, 2(AX)
  5470. ADDQ $0x03, AX
  5471. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 2-byte repeat: (R12 << 2) | 1 stored as a 16-bit value.
  5472. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
  5473. SHLL $0x02, R12
  5474. ORL $0x01, R12
  5475. MOVW R12, (AX)
  5476. ADDQ $0x02, AX
  5477. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 2-byte form carrying the offset: byte 0 = (4*R12 + 1) | ((R8>>8)<<5),
  // byte 1 = low byte of R8. Reached only when R8 < 2048.
  // NOTE(review): this path is entered only with R8 >= 65536 (see the JL at the
  // top), so the R8 < 2048 test above looks like it can never succeed — confirm.
  5478. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
  5479. XORQ SI, SI
  5480. LEAL 1(SI)(R12*4), R12
  5481. MOVB R8, 1(AX)
  5482. SARL $0x08, R8
  5483. SHLL $0x05, R8
  5484. ORL R8, R12
  5485. MOVB R12, (AX)
  5486. ADDQ $0x02, AX
  5487. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // NOTE(review): unlabeled JMP directly after an unconditional JMP; unreachable.
  5488. JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm
  // Final (or only) op for a 32-bit offset: byte 0 = low byte of
  // (4*length - 4 + 3), then the 32-bit offset. Nothing is emitted for length 0.
  5489. four_bytes_remain_match_nolit_encodeBetterBlockAsm:
  5490. TESTL R12, R12
  5491. JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
  5492. MOVB $0x03, BL
  5493. LEAL -4(BX)(R12*4), R12
  5494. MOVB R12, (AX)
  5495. MOVL R8, 1(AX)
  5496. ADDQ $0x05, AX
  5497. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // emitCopy for offsets below 65536. R8 = offset, R12 = total match length.
  5498. two_byte_offset_match_nolit_encodeBetterBlockAsm:
  5499. CMPL R12, $0x40
  5500. JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm // length <= 64: single op
  5501. CMPL R8, $0x00000800
  5502. JAE long_offset_short_match_nolit_encodeBetterBlockAsm // offset >= 2048
  // length > 64 and offset < 2048: 2-byte op with byte 0 = 17 | ((offset>>8)<<5)
  // and byte 1 = offset low byte; 8 is subtracted from the length and the
  // remainder is emitted as a repeat. R9 is used for the shift so R8 keeps the
  // offset.
  5503. MOVL $0x00000001, SI
  5504. LEAL 16(SI), SI
  5505. MOVB R8, 1(AX)
  5506. MOVL R8, R9
  5507. SHRL $0x08, R9
  5508. SHLL $0x05, R9
  5509. ORL R9, SI
  5510. MOVB SI, (AX)
  5511. ADDQ $0x02, AX
  5512. SUBL $0x08, R12
  5513. // emitRepeat
  // First pass skips the short-form tests and goes straight to the 3/4/5-byte
  // forms with R12 = remaining length - 4.
  5514. LEAL -4(R12), R12
  5515. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  // Re-entry point used after a maximum-size 5-byte repeat has been emitted.
  5516. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5517. MOVL R12, SI
  5518. LEAL -4(R12), R12
  5519. CMPL SI, $0x08
  5520. JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5521. CMPL SI, $0x0c
  5522. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5523. CMPL R8, $0x00000800
  5524. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5525. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5526. CMPL R12, $0x00000104
  5527. JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5528. CMPL R12, $0x00010100
  5529. JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  5530. CMPL R12, $0x0100ffff
  5531. JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  // Too long for one 5-byte repeat: emit the fixed bytes 1d 00 fb ff ff,
  // subtract 16842747 (0x0100fffb) from R12, and go around again.
  5532. LEAL -16842747(R12), R12
  5533. MOVW $0x001d, (AX)
  5534. MOVW $0xfffb, 2(AX)
  5535. MOVB $0xff, 4(AX)
  5536. ADDQ $0x05, AX
  5537. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
  // 5-byte repeat: bytes 0x1d 0x00, then (R12 - 65536) as 24 bits (R8 is scratch).
  5538. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5539. LEAL -65536(R12), R12
  5540. MOVL R12, R8
  5541. MOVW $0x001d, (AX)
  5542. MOVW R12, 2(AX)
  5543. SARL $0x10, R8
  5544. MOVB R8, 4(AX)
  5545. ADDQ $0x05, AX
  5546. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 4-byte repeat: bytes 0x19 0x00, then (R12 - 256) as a 16-bit value.
  5547. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5548. LEAL -256(R12), R12
  5549. MOVW $0x0019, (AX)
  5550. MOVW R12, 2(AX)
  5551. ADDQ $0x04, AX
  5552. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 3-byte repeat: bytes 0x15 0x00, then (R12 - 4) as one byte.
  5553. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5554. LEAL -4(R12), R12
  5555. MOVW $0x0015, (AX)
  5556. MOVB R12, 2(AX)
  5557. ADDQ $0x03, AX
  5558. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 2-byte repeat: (R12 << 2) | 1 stored as a 16-bit value.
  5559. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5560. SHLL $0x02, R12
  5561. ORL $0x01, R12
  5562. MOVW R12, (AX)
  5563. ADDQ $0x02, AX
  5564. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  // 2-byte form carrying the offset: byte 0 = (4*R12 + 1) | ((R8>>8)<<5),
  // byte 1 = low byte of R8.
  5565. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
  5566. XORQ SI, SI
  5567. LEAL 1(SI)(R12*4), R12
  5568. MOVB R8, 1(AX)
  5569. SARL $0x08, R8
  5570. SHLL $0x05, R8
  5571. ORL R8, R12
  5572. MOVB R12, (AX)
  5573. ADDQ $0x02, AX
  5574. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5575. long_offset_short_match_nolit_encodeBetterBlockAsm:
  5576. MOVB $0xee, (AX)
  5577. MOVW R8, 1(AX)
  5578. LEAL -60(R12), R12
  5579. ADDQ $0x03, AX
  5580. // emitRepeat
  5581. emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5582. MOVL R12, SI
  5583. LEAL -4(R12), R12
  5584. CMPL SI, $0x08
  5585. JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5586. CMPL SI, $0x0c
  5587. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5588. CMPL R8, $0x00000800
  5589. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5590. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5591. CMPL R12, $0x00000104
  5592. JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5593. CMPL R12, $0x00010100
  5594. JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5595. CMPL R12, $0x0100ffff
  5596. JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5597. LEAL -16842747(R12), R12
  5598. MOVW $0x001d, (AX)
  5599. MOVW $0xfffb, 2(AX)
  5600. MOVB $0xff, 4(AX)
  5601. ADDQ $0x05, AX
  5602. JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
  5603. repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5604. LEAL -65536(R12), R12
  5605. MOVL R12, R8
  5606. MOVW $0x001d, (AX)
  5607. MOVW R12, 2(AX)
  5608. SARL $0x10, R8
  5609. MOVB R8, 4(AX)
  5610. ADDQ $0x05, AX
  5611. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5612. repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5613. LEAL -256(R12), R12
  5614. MOVW $0x0019, (AX)
  5615. MOVW R12, 2(AX)
  5616. ADDQ $0x04, AX
  5617. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5618. repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5619. LEAL -4(R12), R12
  5620. MOVW $0x0015, (AX)
  5621. MOVB R12, 2(AX)
  5622. ADDQ $0x03, AX
  5623. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5624. repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5625. SHLL $0x02, R12
  5626. ORL $0x01, R12
  5627. MOVW R12, (AX)
  5628. ADDQ $0x02, AX
  5629. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5630. repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
  5631. XORQ SI, SI
  5632. LEAL 1(SI)(R12*4), R12
  5633. MOVB R8, 1(AX)
  5634. SARL $0x08, R8
  5635. SHLL $0x05, R8
  5636. ORL R8, R12
  5637. MOVB R12, (AX)
  5638. ADDQ $0x02, AX
  5639. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5640. JMP two_byte_offset_match_nolit_encodeBetterBlockAsm
  5641. two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
  5642. CMPL R12, $0x0c
  5643. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
  5644. CMPL R8, $0x00000800
  5645. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm
  5646. MOVB $0x01, BL
  5647. LEAL -16(BX)(R12*4), R12
  5648. MOVB R8, 1(AX)
  5649. SHRL $0x08, R8
  5650. SHLL $0x05, R8
  5651. ORL R8, R12
  5652. MOVB R12, (AX)
  5653. ADDQ $0x02, AX
  5654. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5655. emit_copy_three_match_nolit_encodeBetterBlockAsm:
  5656. MOVB $0x02, BL
  5657. LEAL -4(BX)(R12*4), R12
  5658. MOVB R12, (AX)
  5659. MOVW R8, 1(AX)
  5660. ADDQ $0x03, AX
  5661. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5662. match_is_repeat_encodeBetterBlockAsm:
  5663. MOVL 12(SP), SI
  5664. CMPL SI, DI
  5665. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
  5666. MOVL DI, R9
  5667. MOVL DI, 12(SP)
  5668. LEAQ (DX)(SI*1), R10
  5669. SUBL SI, R9
  5670. LEAL -1(R9), SI
  5671. CMPL SI, $0x3c
  5672. JLT one_byte_match_emit_repeat_encodeBetterBlockAsm
  5673. CMPL SI, $0x00000100
  5674. JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm
  5675. CMPL SI, $0x00010000
  5676. JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm
  5677. CMPL SI, $0x01000000
  5678. JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm
  5679. MOVB $0xfc, (AX)
  5680. MOVL SI, 1(AX)
  5681. ADDQ $0x05, AX
  5682. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5683. four_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5684. MOVL SI, R11
  5685. SHRL $0x10, R11
  5686. MOVB $0xf8, (AX)
  5687. MOVW SI, 1(AX)
  5688. MOVB R11, 3(AX)
  5689. ADDQ $0x04, AX
  5690. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5691. three_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5692. MOVB $0xf4, (AX)
  5693. MOVW SI, 1(AX)
  5694. ADDQ $0x03, AX
  5695. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5696. two_bytes_match_emit_repeat_encodeBetterBlockAsm:
  5697. MOVB $0xf0, (AX)
  5698. MOVB SI, 1(AX)
  5699. ADDQ $0x02, AX
  5700. CMPL SI, $0x40
  5701. JL memmove_match_emit_repeat_encodeBetterBlockAsm
  5702. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
  5703. one_byte_match_emit_repeat_encodeBetterBlockAsm:
  5704. SHLB $0x02, SI
  5705. MOVB SI, (AX)
  5706. ADDQ $0x01, AX
  5707. memmove_match_emit_repeat_encodeBetterBlockAsm:
  5708. LEAQ (AX)(R9*1), SI
  5709. // genMemMoveShort
  5710. CMPQ R9, $0x04
  5711. JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
  5712. CMPQ R9, $0x08
  5713. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
  5714. CMPQ R9, $0x10
  5715. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
  5716. CMPQ R9, $0x20
  5717. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
  5718. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
  5719. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
  5720. MOVL (R10), R11
  5721. MOVL R11, (AX)
  5722. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5723. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
  5724. MOVL (R10), R11
  5725. MOVL -4(R10)(R9*1), R10
  5726. MOVL R11, (AX)
  5727. MOVL R10, -4(AX)(R9*1)
  5728. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5729. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
  5730. MOVQ (R10), R11
  5731. MOVQ -8(R10)(R9*1), R10
  5732. MOVQ R11, (AX)
  5733. MOVQ R10, -8(AX)(R9*1)
  5734. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5735. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
  5736. MOVOU (R10), X0
  5737. MOVOU -16(R10)(R9*1), X1
  5738. MOVOU X0, (AX)
  5739. MOVOU X1, -16(AX)(R9*1)
  5740. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
  5741. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
  5742. MOVOU (R10), X0
  5743. MOVOU 16(R10), X1
  5744. MOVOU -32(R10)(R9*1), X2
  5745. MOVOU -16(R10)(R9*1), X3
  5746. MOVOU X0, (AX)
  5747. MOVOU X1, 16(AX)
  5748. MOVOU X2, -32(AX)(R9*1)
  5749. MOVOU X3, -16(AX)(R9*1)
  5750. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
  5751. MOVQ SI, AX
  5752. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
  5753. memmove_long_match_emit_repeat_encodeBetterBlockAsm:
  5754. LEAQ (AX)(R9*1), SI
  5755. // genMemMoveLong
  5756. MOVOU (R10), X0
  5757. MOVOU 16(R10), X1
  5758. MOVOU -32(R10)(R9*1), X2
  5759. MOVOU -16(R10)(R9*1), X3
  5760. MOVQ R9, R13
  5761. SHRQ $0x05, R13
  5762. MOVQ AX, R11
  5763. ANDL $0x0000001f, R11
  5764. MOVQ $0x00000040, R14
  5765. SUBQ R11, R14
  5766. DECQ R13
  5767. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5768. LEAQ -32(R10)(R14*1), R11
  5769. LEAQ -32(AX)(R14*1), R15
  5770. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
  5771. MOVOU (R11), X4
  5772. MOVOU 16(R11), X5
  5773. MOVOA X4, (R15)
  5774. MOVOA X5, 16(R15)
  5775. ADDQ $0x20, R15
  5776. ADDQ $0x20, R11
  5777. ADDQ $0x20, R14
  5778. DECQ R13
  5779. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
  5780. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  5781. MOVOU -32(R10)(R14*1), X4
  5782. MOVOU -16(R10)(R14*1), X5
  5783. MOVOA X4, -32(AX)(R14*1)
  5784. MOVOA X5, -16(AX)(R14*1)
  5785. ADDQ $0x20, R14
  5786. CMPQ R9, R14
  5787. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
  5788. MOVOU X0, (AX)
  5789. MOVOU X1, 16(AX)
  5790. MOVOU X2, -32(AX)(R9*1)
  5791. MOVOU X3, -16(AX)(R9*1)
  5792. MOVQ SI, AX
  5793. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
  5794. ADDL R12, CX
  5795. ADDL $0x04, R12
  5796. MOVL CX, 12(SP)
  5797. // emitRepeat
  5798. emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
  5799. MOVL R12, SI
  5800. LEAL -4(R12), R12
  5801. CMPL SI, $0x08
  5802. JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
  5803. CMPL SI, $0x0c
  5804. JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
  5805. CMPL R8, $0x00000800
  5806. JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
  5807. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
  5808. CMPL R12, $0x00000104
  5809. JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm
  5810. CMPL R12, $0x00010100
  5811. JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm
  5812. CMPL R12, $0x0100ffff
  5813. JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm
  5814. LEAL -16842747(R12), R12
  5815. MOVW $0x001d, (AX)
  5816. MOVW $0xfffb, 2(AX)
  5817. MOVB $0xff, 4(AX)
  5818. ADDQ $0x05, AX
  5819. JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
  5820. repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
  5821. LEAL -65536(R12), R12
  5822. MOVL R12, R8
  5823. MOVW $0x001d, (AX)
  5824. MOVW R12, 2(AX)
  5825. SARL $0x10, R8
  5826. MOVB R8, 4(AX)
  5827. ADDQ $0x05, AX
  5828. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5829. repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
  5830. LEAL -256(R12), R12
  5831. MOVW $0x0019, (AX)
  5832. MOVW R12, 2(AX)
  5833. ADDQ $0x04, AX
  5834. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5835. repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
  5836. LEAL -4(R12), R12
  5837. MOVW $0x0015, (AX)
  5838. MOVB R12, 2(AX)
  5839. ADDQ $0x03, AX
  5840. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5841. repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
  5842. SHLL $0x02, R12
  5843. ORL $0x01, R12
  5844. MOVW R12, (AX)
  5845. ADDQ $0x02, AX
  5846. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
  5847. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
  5848. XORQ SI, SI
  5849. LEAL 1(SI)(R12*4), R12
  5850. MOVB R8, 1(AX)
  5851. SARL $0x08, R8
  5852. SHLL $0x05, R8
  5853. ORL R8, R12
  5854. MOVB R12, (AX)
  5855. ADDQ $0x02, AX
  5856. match_nolit_emitcopy_end_encodeBetterBlockAsm:
  5857. CMPL CX, 8(SP)
  5858. JGE emit_remainder_encodeBetterBlockAsm
  5859. CMPQ AX, (SP)
  5860. JL match_nolit_dst_ok_encodeBetterBlockAsm
  5861. MOVQ $0x00000000, ret+48(FP)
  5862. RET
  5863. match_nolit_dst_ok_encodeBetterBlockAsm:
  5864. MOVQ $0x00cf1bbcdcbfa563, SI
  5865. MOVQ $0x9e3779b1, R8
  5866. INCL DI
  5867. MOVQ (DX)(DI*1), R9
  5868. MOVQ R9, R10
  5869. MOVQ R9, R11
  5870. MOVQ R9, R12
  5871. SHRQ $0x08, R11
  5872. MOVQ R11, R13
  5873. SHRQ $0x10, R12
  5874. LEAL 1(DI), R14
  5875. LEAL 2(DI), R15
  5876. MOVQ -2(DX)(CX*1), R9
  5877. SHLQ $0x08, R10
  5878. IMULQ SI, R10
  5879. SHRQ $0x30, R10
  5880. SHLQ $0x08, R13
  5881. IMULQ SI, R13
  5882. SHRQ $0x30, R13
  5883. SHLQ $0x20, R11
  5884. IMULQ R8, R11
  5885. SHRQ $0x32, R11
  5886. SHLQ $0x20, R12
  5887. IMULQ R8, R12
  5888. SHRQ $0x32, R12
  5889. MOVL DI, 24(SP)(R10*4)
  5890. MOVL R14, 24(SP)(R13*4)
  5891. MOVL R14, 262168(SP)(R11*4)
  5892. MOVL R15, 262168(SP)(R12*4)
  5893. MOVQ R9, R10
  5894. MOVQ R9, R11
  5895. SHRQ $0x08, R11
  5896. MOVQ R11, R13
  5897. LEAL -2(CX), R9
  5898. LEAL -1(CX), DI
  5899. SHLQ $0x08, R10
  5900. IMULQ SI, R10
  5901. SHRQ $0x30, R10
  5902. SHLQ $0x20, R11
  5903. IMULQ R8, R11
  5904. SHRQ $0x32, R11
  5905. SHLQ $0x08, R13
  5906. IMULQ SI, R13
  5907. SHRQ $0x30, R13
  5908. MOVL R9, 24(SP)(R10*4)
  5909. MOVL DI, 262168(SP)(R11*4)
  5910. MOVL DI, 24(SP)(R13*4)
  5911. JMP search_loop_encodeBetterBlockAsm
  5912. emit_remainder_encodeBetterBlockAsm:
  5913. MOVQ src_len+32(FP), CX
  5914. SUBL 12(SP), CX
  5915. LEAQ 5(AX)(CX*1), CX
  5916. CMPQ CX, (SP)
  5917. JL emit_remainder_ok_encodeBetterBlockAsm
  5918. MOVQ $0x00000000, ret+48(FP)
  5919. RET
  5920. emit_remainder_ok_encodeBetterBlockAsm:
  5921. MOVQ src_len+32(FP), CX
  5922. MOVL 12(SP), BX
  5923. CMPL BX, CX
  5924. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
  5925. MOVL CX, SI
  5926. MOVL CX, 12(SP)
  5927. LEAQ (DX)(BX*1), CX
  5928. SUBL BX, SI
  5929. LEAL -1(SI), DX
  5930. CMPL DX, $0x3c
  5931. JLT one_byte_emit_remainder_encodeBetterBlockAsm
  5932. CMPL DX, $0x00000100
  5933. JLT two_bytes_emit_remainder_encodeBetterBlockAsm
  5934. CMPL DX, $0x00010000
  5935. JLT three_bytes_emit_remainder_encodeBetterBlockAsm
  5936. CMPL DX, $0x01000000
  5937. JLT four_bytes_emit_remainder_encodeBetterBlockAsm
  5938. MOVB $0xfc, (AX)
  5939. MOVL DX, 1(AX)
  5940. ADDQ $0x05, AX
  5941. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  5942. four_bytes_emit_remainder_encodeBetterBlockAsm:
  5943. MOVL DX, BX
  5944. SHRL $0x10, BX
  5945. MOVB $0xf8, (AX)
  5946. MOVW DX, 1(AX)
  5947. MOVB BL, 3(AX)
  5948. ADDQ $0x04, AX
  5949. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  5950. three_bytes_emit_remainder_encodeBetterBlockAsm:
  5951. MOVB $0xf4, (AX)
  5952. MOVW DX, 1(AX)
  5953. ADDQ $0x03, AX
  5954. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  5955. two_bytes_emit_remainder_encodeBetterBlockAsm:
  5956. MOVB $0xf0, (AX)
  5957. MOVB DL, 1(AX)
  5958. ADDQ $0x02, AX
  5959. CMPL DX, $0x40
  5960. JL memmove_emit_remainder_encodeBetterBlockAsm
  5961. JMP memmove_long_emit_remainder_encodeBetterBlockAsm
  5962. one_byte_emit_remainder_encodeBetterBlockAsm:
  5963. SHLB $0x02, DL
  5964. MOVB DL, (AX)
  5965. ADDQ $0x01, AX
  5966. memmove_emit_remainder_encodeBetterBlockAsm:
  5967. LEAQ (AX)(SI*1), DX
  5968. MOVL SI, BX
  5969. // genMemMoveShort
  5970. CMPQ BX, $0x03
  5971. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
  5972. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
  5973. CMPQ BX, $0x08
  5974. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
  5975. CMPQ BX, $0x10
  5976. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
  5977. CMPQ BX, $0x20
  5978. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
  5979. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
  5980. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
  5981. MOVB (CX), SI
  5982. MOVB -1(CX)(BX*1), CL
  5983. MOVB SI, (AX)
  5984. MOVB CL, -1(AX)(BX*1)
  5985. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  5986. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
  5987. MOVW (CX), SI
  5988. MOVB 2(CX), CL
  5989. MOVW SI, (AX)
  5990. MOVB CL, 2(AX)
  5991. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  5992. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
  5993. MOVL (CX), SI
  5994. MOVL -4(CX)(BX*1), CX
  5995. MOVL SI, (AX)
  5996. MOVL CX, -4(AX)(BX*1)
  5997. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  5998. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
  5999. MOVQ (CX), SI
  6000. MOVQ -8(CX)(BX*1), CX
  6001. MOVQ SI, (AX)
  6002. MOVQ CX, -8(AX)(BX*1)
  6003. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6004. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
  6005. MOVOU (CX), X0
  6006. MOVOU -16(CX)(BX*1), X1
  6007. MOVOU X0, (AX)
  6008. MOVOU X1, -16(AX)(BX*1)
  6009. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
  6010. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
  6011. MOVOU (CX), X0
  6012. MOVOU 16(CX), X1
  6013. MOVOU -32(CX)(BX*1), X2
  6014. MOVOU -16(CX)(BX*1), X3
  6015. MOVOU X0, (AX)
  6016. MOVOU X1, 16(AX)
  6017. MOVOU X2, -32(AX)(BX*1)
  6018. MOVOU X3, -16(AX)(BX*1)
  6019. memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
  6020. MOVQ DX, AX
  6021. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
  6022. memmove_long_emit_remainder_encodeBetterBlockAsm:
  6023. LEAQ (AX)(SI*1), DX
  6024. MOVL SI, BX
  6025. // genMemMoveLong
  6026. MOVOU (CX), X0
  6027. MOVOU 16(CX), X1
  6028. MOVOU -32(CX)(BX*1), X2
  6029. MOVOU -16(CX)(BX*1), X3
  6030. MOVQ BX, DI
  6031. SHRQ $0x05, DI
  6032. MOVQ AX, SI
  6033. ANDL $0x0000001f, SI
  6034. MOVQ $0x00000040, R8
  6035. SUBQ SI, R8
  6036. DECQ DI
  6037. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
  6038. LEAQ -32(CX)(R8*1), SI
  6039. LEAQ -32(AX)(R8*1), R9
  6040. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
  6041. MOVOU (SI), X4
  6042. MOVOU 16(SI), X5
  6043. MOVOA X4, (R9)
  6044. MOVOA X5, 16(R9)
  6045. ADDQ $0x20, R9
  6046. ADDQ $0x20, SI
  6047. ADDQ $0x20, R8
  6048. DECQ DI
  6049. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
  6050. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
  6051. MOVOU -32(CX)(R8*1), X4
  6052. MOVOU -16(CX)(R8*1), X5
  6053. MOVOA X4, -32(AX)(R8*1)
  6054. MOVOA X5, -16(AX)(R8*1)
  6055. ADDQ $0x20, R8
  6056. CMPQ BX, R8
  6057. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
  6058. MOVOU X0, (AX)
  6059. MOVOU X1, 16(AX)
  6060. MOVOU X2, -32(AX)(BX*1)
  6061. MOVOU X3, -16(AX)(BX*1)
  6062. MOVQ DX, AX
  6063. emit_literal_done_emit_remainder_encodeBetterBlockAsm:
  6064. MOVQ dst_base+0(FP), CX
  6065. SUBQ CX, AX
  6066. MOVQ AX, ret+48(FP)
  6067. RET
  6068. // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
  6069. // Requires: BMI, SSE2
  6070. TEXT ·encodeBetterBlockAsm4MB(SB), $327704-56
  6071. MOVQ dst_base+0(FP), AX
  6072. MOVQ $0x00000a00, CX
  6073. LEAQ 24(SP), DX
  6074. PXOR X0, X0
  6075. zero_loop_encodeBetterBlockAsm4MB:
  6076. MOVOU X0, (DX)
  6077. MOVOU X0, 16(DX)
  6078. MOVOU X0, 32(DX)
  6079. MOVOU X0, 48(DX)
  6080. MOVOU X0, 64(DX)
  6081. MOVOU X0, 80(DX)
  6082. MOVOU X0, 96(DX)
  6083. MOVOU X0, 112(DX)
  6084. ADDQ $0x80, DX
  6085. DECQ CX
  6086. JNZ zero_loop_encodeBetterBlockAsm4MB
  6087. MOVL $0x00000000, 12(SP)
  6088. MOVQ src_len+32(FP), CX
  6089. LEAQ -6(CX), DX
  6090. LEAQ -8(CX), SI
  6091. MOVL SI, 8(SP)
  6092. SHRQ $0x05, CX
  6093. SUBL CX, DX
  6094. LEAQ (AX)(DX*1), DX
  6095. MOVQ DX, (SP)
  6096. MOVL $0x00000001, CX
  6097. MOVL $0x00000000, 16(SP)
  6098. MOVQ src_base+24(FP), DX
  6099. search_loop_encodeBetterBlockAsm4MB:
  6100. MOVL CX, SI
  6101. SUBL 12(SP), SI
  6102. SHRL $0x07, SI
  6103. CMPL SI, $0x63
  6104. JLE check_maxskip_ok_encodeBetterBlockAsm4MB
  6105. LEAL 100(CX), SI
  6106. JMP check_maxskip_cont_encodeBetterBlockAsm4MB
  6107. check_maxskip_ok_encodeBetterBlockAsm4MB:
  6108. LEAL 1(CX)(SI*1), SI
  6109. check_maxskip_cont_encodeBetterBlockAsm4MB:
  6110. CMPL SI, 8(SP)
  6111. JGE emit_remainder_encodeBetterBlockAsm4MB
  6112. MOVQ (DX)(CX*1), DI
  6113. MOVL SI, 20(SP)
  6114. MOVQ $0x00cf1bbcdcbfa563, R9
  6115. MOVQ $0x9e3779b1, SI
  6116. MOVQ DI, R10
  6117. MOVQ DI, R11
  6118. SHLQ $0x08, R10
  6119. IMULQ R9, R10
  6120. SHRQ $0x30, R10
  6121. SHLQ $0x20, R11
  6122. IMULQ SI, R11
  6123. SHRQ $0x32, R11
  6124. MOVL 24(SP)(R10*4), SI
  6125. MOVL 262168(SP)(R11*4), R8
  6126. MOVL CX, 24(SP)(R10*4)
  6127. MOVL CX, 262168(SP)(R11*4)
  6128. CMPL (DX)(SI*1), DI
  6129. JEQ candidate_match_encodeBetterBlockAsm4MB
  6130. CMPL (DX)(R8*1), DI
  6131. JEQ candidateS_match_encodeBetterBlockAsm4MB
  6132. MOVL 20(SP), CX
  6133. JMP search_loop_encodeBetterBlockAsm4MB
  6134. candidateS_match_encodeBetterBlockAsm4MB:
  6135. SHRQ $0x08, DI
  6136. MOVQ DI, R10
  6137. SHLQ $0x08, R10
  6138. IMULQ R9, R10
  6139. SHRQ $0x30, R10
  6140. MOVL 24(SP)(R10*4), SI
  6141. INCL CX
  6142. MOVL CX, 24(SP)(R10*4)
  6143. CMPL (DX)(SI*1), DI
  6144. JEQ candidate_match_encodeBetterBlockAsm4MB
  6145. DECL CX
  6146. MOVL R8, SI
  6147. candidate_match_encodeBetterBlockAsm4MB:
  6148. MOVL 12(SP), DI
  6149. TESTL SI, SI
  6150. JZ match_extend_back_end_encodeBetterBlockAsm4MB
  6151. match_extend_back_loop_encodeBetterBlockAsm4MB:
  6152. CMPL CX, DI
  6153. JLE match_extend_back_end_encodeBetterBlockAsm4MB
  6154. MOVB -1(DX)(SI*1), BL
  6155. MOVB -1(DX)(CX*1), R8
  6156. CMPB BL, R8
  6157. JNE match_extend_back_end_encodeBetterBlockAsm4MB
  6158. LEAL -1(CX), CX
  6159. DECL SI
  6160. JZ match_extend_back_end_encodeBetterBlockAsm4MB
  6161. JMP match_extend_back_loop_encodeBetterBlockAsm4MB
  6162. match_extend_back_end_encodeBetterBlockAsm4MB:
  6163. MOVL CX, DI
  6164. SUBL 12(SP), DI
  6165. LEAQ 4(AX)(DI*1), DI
  6166. CMPQ DI, (SP)
  6167. JL match_dst_size_check_encodeBetterBlockAsm4MB
  6168. MOVQ $0x00000000, ret+48(FP)
  6169. RET
  6170. match_dst_size_check_encodeBetterBlockAsm4MB:
  6171. MOVL CX, DI
  6172. ADDL $0x04, CX
  6173. ADDL $0x04, SI
  6174. MOVQ src_len+32(FP), R8
  6175. SUBL CX, R8
  6176. LEAQ (DX)(CX*1), R9
  6177. LEAQ (DX)(SI*1), R10
  6178. // matchLen
  6179. XORL R12, R12
  6180. CMPL R8, $0x08
  6181. JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
  6182. matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB:
  6183. MOVQ (R9)(R12*1), R11
  6184. XORQ (R10)(R12*1), R11
  6185. TESTQ R11, R11
  6186. JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB
  6187. #ifdef GOAMD64_v3
  6188. TZCNTQ R11, R11
  6189. #else
  6190. BSFQ R11, R11
  6191. #endif
  6192. SARQ $0x03, R11
  6193. LEAL (R12)(R11*1), R12
  6194. JMP match_nolit_end_encodeBetterBlockAsm4MB
  6195. matchlen_loop_match_nolit_encodeBetterBlockAsm4MB:
  6196. LEAL -8(R8), R8
  6197. LEAL 8(R12), R12
  6198. CMPL R8, $0x08
  6199. JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB
  6200. JZ match_nolit_end_encodeBetterBlockAsm4MB
  6201. matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
  6202. CMPL R8, $0x04
  6203. JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
  6204. MOVL (R9)(R12*1), R11
  6205. CMPL (R10)(R12*1), R11
  6206. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
  6207. SUBL $0x04, R8
  6208. LEAL 4(R12), R12
  6209. matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
  6210. CMPL R8, $0x02
  6211. JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
  6212. MOVW (R9)(R12*1), R11
  6213. CMPW (R10)(R12*1), R11
  6214. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
  6215. SUBL $0x02, R8
  6216. LEAL 2(R12), R12
  6217. matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
  6218. CMPL R8, $0x01
  6219. JL match_nolit_end_encodeBetterBlockAsm4MB
  6220. MOVB (R9)(R12*1), R11
  6221. CMPB (R10)(R12*1), R11
  6222. JNE match_nolit_end_encodeBetterBlockAsm4MB
  6223. LEAL 1(R12), R12
  6224. match_nolit_end_encodeBetterBlockAsm4MB:
  6225. MOVL CX, R8
  6226. SUBL SI, R8
  6227. // Check if repeat
  6228. CMPL 16(SP), R8
  6229. JEQ match_is_repeat_encodeBetterBlockAsm4MB
  6230. CMPL R12, $0x01
  6231. JG match_length_ok_encodeBetterBlockAsm4MB
  6232. CMPL R8, $0x0000ffff
  6233. JLE match_length_ok_encodeBetterBlockAsm4MB
  6234. MOVL 20(SP), CX
  6235. INCL CX
  6236. JMP search_loop_encodeBetterBlockAsm4MB
  6237. match_length_ok_encodeBetterBlockAsm4MB:
  6238. MOVL R8, 16(SP)
  6239. MOVL 12(SP), SI
  6240. CMPL SI, DI
  6241. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
  6242. MOVL DI, R9
  6243. MOVL DI, 12(SP)
  6244. LEAQ (DX)(SI*1), R10
  6245. SUBL SI, R9
  6246. LEAL -1(R9), SI
  6247. CMPL SI, $0x3c
  6248. JLT one_byte_match_emit_encodeBetterBlockAsm4MB
  6249. CMPL SI, $0x00000100
  6250. JLT two_bytes_match_emit_encodeBetterBlockAsm4MB
  6251. CMPL SI, $0x00010000
  6252. JLT three_bytes_match_emit_encodeBetterBlockAsm4MB
  6253. MOVL SI, R11
  6254. SHRL $0x10, R11
  6255. MOVB $0xf8, (AX)
  6256. MOVW SI, 1(AX)
  6257. MOVB R11, 3(AX)
  6258. ADDQ $0x04, AX
  6259. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6260. three_bytes_match_emit_encodeBetterBlockAsm4MB:
  6261. MOVB $0xf4, (AX)
  6262. MOVW SI, 1(AX)
  6263. ADDQ $0x03, AX
  6264. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6265. two_bytes_match_emit_encodeBetterBlockAsm4MB:
  6266. MOVB $0xf0, (AX)
  6267. MOVB SI, 1(AX)
  6268. ADDQ $0x02, AX
  6269. CMPL SI, $0x40
  6270. JL memmove_match_emit_encodeBetterBlockAsm4MB
  6271. JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
  6272. one_byte_match_emit_encodeBetterBlockAsm4MB:
  6273. SHLB $0x02, SI
  6274. MOVB SI, (AX)
  6275. ADDQ $0x01, AX
  6276. memmove_match_emit_encodeBetterBlockAsm4MB:
  6277. LEAQ (AX)(R9*1), SI
  6278. // genMemMoveShort
  6279. CMPQ R9, $0x04
  6280. JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
  6281. CMPQ R9, $0x08
  6282. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
  6283. CMPQ R9, $0x10
  6284. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
  6285. CMPQ R9, $0x20
  6286. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
  6287. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
  6288. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
  6289. MOVL (R10), R11
  6290. MOVL R11, (AX)
  6291. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6292. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
  6293. MOVL (R10), R11
  6294. MOVL -4(R10)(R9*1), R10
  6295. MOVL R11, (AX)
  6296. MOVL R10, -4(AX)(R9*1)
  6297. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6298. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
  6299. MOVQ (R10), R11
  6300. MOVQ -8(R10)(R9*1), R10
  6301. MOVQ R11, (AX)
  6302. MOVQ R10, -8(AX)(R9*1)
  6303. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6304. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
  6305. MOVOU (R10), X0
  6306. MOVOU -16(R10)(R9*1), X1
  6307. MOVOU X0, (AX)
  6308. MOVOU X1, -16(AX)(R9*1)
  6309. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
  6310. emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
  6311. MOVOU (R10), X0
  6312. MOVOU 16(R10), X1
  6313. MOVOU -32(R10)(R9*1), X2
  6314. MOVOU -16(R10)(R9*1), X3
  6315. MOVOU X0, (AX)
  6316. MOVOU X1, 16(AX)
  6317. MOVOU X2, -32(AX)(R9*1)
  6318. MOVOU X3, -16(AX)(R9*1)
  6319. memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
  6320. MOVQ SI, AX
  6321. JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
  6322. memmove_long_match_emit_encodeBetterBlockAsm4MB:
  6323. LEAQ (AX)(R9*1), SI
  6324. // genMemMoveLong
  6325. MOVOU (R10), X0
  6326. MOVOU 16(R10), X1
  6327. MOVOU -32(R10)(R9*1), X2
  6328. MOVOU -16(R10)(R9*1), X3
  6329. MOVQ R9, R13
  6330. SHRQ $0x05, R13
  6331. MOVQ AX, R11
  6332. ANDL $0x0000001f, R11
  6333. MOVQ $0x00000040, R14
  6334. SUBQ R11, R14
  6335. DECQ R13
  6336. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6337. LEAQ -32(R10)(R14*1), R11
  6338. LEAQ -32(AX)(R14*1), R15
  6339. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
  6340. MOVOU (R11), X4
  6341. MOVOU 16(R11), X5
  6342. MOVOA X4, (R15)
  6343. MOVOA X5, 16(R15)
  6344. ADDQ $0x20, R15
  6345. ADDQ $0x20, R11
  6346. ADDQ $0x20, R14
  6347. DECQ R13
  6348. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
  6349. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  6350. MOVOU -32(R10)(R14*1), X4
  6351. MOVOU -16(R10)(R14*1), X5
  6352. MOVOA X4, -32(AX)(R14*1)
  6353. MOVOA X5, -16(AX)(R14*1)
  6354. ADDQ $0x20, R14
  6355. CMPQ R9, R14
  6356. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6357. MOVOU X0, (AX)
  6358. MOVOU X1, 16(AX)
  6359. MOVOU X2, -32(AX)(R9*1)
  6360. MOVOU X3, -16(AX)(R9*1)
  6361. MOVQ SI, AX
  6362. emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
  6363. ADDL R12, CX
  6364. ADDL $0x04, R12
  6365. MOVL CX, 12(SP)
  6366. // emitCopy
  6367. CMPL R8, $0x00010000
  6368. JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
  6369. four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB:
  6370. CMPL R12, $0x40
  6371. JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
  6372. MOVB $0xff, (AX)
  6373. MOVL R8, 1(AX)
  6374. LEAL -64(R12), R12
  6375. ADDQ $0x05, AX
  6376. CMPL R12, $0x04
  6377. JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
  6378. // emitRepeat
  6379. MOVL R12, SI
  6380. LEAL -4(R12), R12
  6381. CMPL SI, $0x08
  6382. JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6383. CMPL SI, $0x0c
  6384. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6385. CMPL R8, $0x00000800
  6386. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6387. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6388. CMPL R12, $0x00000104
  6389. JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6390. CMPL R12, $0x00010100
  6391. JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
  6392. LEAL -65536(R12), R12
  6393. MOVL R12, R8
  6394. MOVW $0x001d, (AX)
  6395. MOVW R12, 2(AX)
  6396. SARL $0x10, R8
  6397. MOVB R8, 4(AX)
  6398. ADDQ $0x05, AX
  6399. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6400. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6401. LEAL -256(R12), R12
  6402. MOVW $0x0019, (AX)
  6403. MOVW R12, 2(AX)
  6404. ADDQ $0x04, AX
  6405. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6406. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6407. LEAL -4(R12), R12
  6408. MOVW $0x0015, (AX)
  6409. MOVB R12, 2(AX)
  6410. ADDQ $0x03, AX
  6411. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6412. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6413. SHLL $0x02, R12
  6414. ORL $0x01, R12
  6415. MOVW R12, (AX)
  6416. ADDQ $0x02, AX
  6417. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6418. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
  6419. XORQ SI, SI
  6420. LEAL 1(SI)(R12*4), R12
  6421. MOVB R8, 1(AX)
  6422. SARL $0x08, R8
  6423. SHLL $0x05, R8
  6424. ORL R8, R12
  6425. MOVB R12, (AX)
  6426. ADDQ $0x02, AX
  6427. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6428. JMP four_bytes_loop_back_match_nolit_encodeBetterBlockAsm4MB
  6429. four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
  6430. TESTL R12, R12
  6431. JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6432. MOVB $0x03, BL
  6433. LEAL -4(BX)(R12*4), R12
  6434. MOVB R12, (AX)
  6435. MOVL R8, 1(AX)
  6436. ADDQ $0x05, AX
  6437. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6438. two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
  6439. CMPL R12, $0x40
  6440. JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
  6441. CMPL R8, $0x00000800
  6442. JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
  6443. MOVL $0x00000001, SI
  6444. LEAL 16(SI), SI
  6445. MOVB R8, 1(AX)
  6446. SHRL $0x08, R8
  6447. SHLL $0x05, R8
  6448. ORL R8, SI
  6449. MOVB SI, (AX)
  6450. ADDQ $0x02, AX
  6451. SUBL $0x08, R12
  6452. // emitRepeat
  6453. LEAL -4(R12), R12
  6454. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6455. MOVL R12, SI
  6456. LEAL -4(R12), R12
  6457. CMPL SI, $0x08
  6458. JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6459. CMPL SI, $0x0c
  6460. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6461. CMPL R8, $0x00000800
  6462. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6463. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6464. CMPL R12, $0x00000104
  6465. JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6466. CMPL R12, $0x00010100
  6467. JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
  6468. LEAL -65536(R12), R12
  6469. MOVL R12, R8
  6470. MOVW $0x001d, (AX)
  6471. MOVW R12, 2(AX)
  6472. SARL $0x10, R8
  6473. MOVB R8, 4(AX)
  6474. ADDQ $0x05, AX
  6475. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6476. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6477. LEAL -256(R12), R12
  6478. MOVW $0x0019, (AX)
  6479. MOVW R12, 2(AX)
  6480. ADDQ $0x04, AX
  6481. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6482. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6483. LEAL -4(R12), R12
  6484. MOVW $0x0015, (AX)
  6485. MOVB R12, 2(AX)
  6486. ADDQ $0x03, AX
  6487. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6488. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6489. SHLL $0x02, R12
  6490. ORL $0x01, R12
  6491. MOVW R12, (AX)
  6492. ADDQ $0x02, AX
  6493. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6494. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
  6495. XORQ SI, SI
  6496. LEAL 1(SI)(R12*4), R12
  6497. MOVB R8, 1(AX)
  6498. SARL $0x08, R8
  6499. SHLL $0x05, R8
  6500. ORL R8, R12
  6501. MOVB R12, (AX)
  6502. ADDQ $0x02, AX
  6503. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6504. long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
  6505. MOVB $0xee, (AX)
  6506. MOVW R8, 1(AX)
  6507. LEAL -60(R12), R12
  6508. ADDQ $0x03, AX
  6509. // emitRepeat
  6510. MOVL R12, SI
  6511. LEAL -4(R12), R12
  6512. CMPL SI, $0x08
  6513. JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6514. CMPL SI, $0x0c
  6515. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6516. CMPL R8, $0x00000800
  6517. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6518. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6519. CMPL R12, $0x00000104
  6520. JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6521. CMPL R12, $0x00010100
  6522. JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
  6523. LEAL -65536(R12), R12
  6524. MOVL R12, R8
  6525. MOVW $0x001d, (AX)
  6526. MOVW R12, 2(AX)
  6527. SARL $0x10, R8
  6528. MOVB R8, 4(AX)
  6529. ADDQ $0x05, AX
  6530. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6531. repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6532. LEAL -256(R12), R12
  6533. MOVW $0x0019, (AX)
  6534. MOVW R12, 2(AX)
  6535. ADDQ $0x04, AX
  6536. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6537. repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6538. LEAL -4(R12), R12
  6539. MOVW $0x0015, (AX)
  6540. MOVB R12, 2(AX)
  6541. ADDQ $0x03, AX
  6542. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6543. repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6544. SHLL $0x02, R12
  6545. ORL $0x01, R12
  6546. MOVW R12, (AX)
  6547. ADDQ $0x02, AX
  6548. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6549. repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
  6550. XORQ SI, SI
  6551. LEAL 1(SI)(R12*4), R12
  6552. MOVB R8, 1(AX)
  6553. SARL $0x08, R8
  6554. SHLL $0x05, R8
  6555. ORL R8, R12
  6556. MOVB R12, (AX)
  6557. ADDQ $0x02, AX
  6558. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6559. JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
  6560. two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
  6561. CMPL R12, $0x0c
  6562. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
  6563. CMPL R8, $0x00000800
  6564. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
  6565. MOVB $0x01, BL
  6566. LEAL -16(BX)(R12*4), R12
  6567. MOVB R8, 1(AX)
  6568. SHRL $0x08, R8
  6569. SHLL $0x05, R8
  6570. ORL R8, R12
  6571. MOVB R12, (AX)
  6572. ADDQ $0x02, AX
  6573. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6574. emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
  6575. MOVB $0x02, BL
  6576. LEAL -4(BX)(R12*4), R12
  6577. MOVB R12, (AX)
  6578. MOVW R8, 1(AX)
  6579. ADDQ $0x03, AX
  6580. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6581. match_is_repeat_encodeBetterBlockAsm4MB:
  6582. MOVL 12(SP), SI
  6583. CMPL SI, DI
  6584. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
  6585. MOVL DI, R9
  6586. MOVL DI, 12(SP)
  6587. LEAQ (DX)(SI*1), R10
  6588. SUBL SI, R9
  6589. LEAL -1(R9), SI
  6590. CMPL SI, $0x3c
  6591. JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
  6592. CMPL SI, $0x00000100
  6593. JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
  6594. CMPL SI, $0x00010000
  6595. JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
  6596. MOVL SI, R11
  6597. SHRL $0x10, R11
  6598. MOVB $0xf8, (AX)
  6599. MOVW SI, 1(AX)
  6600. MOVB R11, 3(AX)
  6601. ADDQ $0x04, AX
  6602. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6603. three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
  6604. MOVB $0xf4, (AX)
  6605. MOVW SI, 1(AX)
  6606. ADDQ $0x03, AX
  6607. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6608. two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
  6609. MOVB $0xf0, (AX)
  6610. MOVB SI, 1(AX)
  6611. ADDQ $0x02, AX
  6612. CMPL SI, $0x40
  6613. JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB
  6614. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
  6615. one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
  6616. SHLB $0x02, SI
  6617. MOVB SI, (AX)
  6618. ADDQ $0x01, AX
  6619. memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
  6620. LEAQ (AX)(R9*1), SI
  6621. // genMemMoveShort
  6622. CMPQ R9, $0x04
  6623. JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
  6624. CMPQ R9, $0x08
  6625. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
  6626. CMPQ R9, $0x10
  6627. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
  6628. CMPQ R9, $0x20
  6629. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
  6630. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
  6631. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
  6632. MOVL (R10), R11
  6633. MOVL R11, (AX)
  6634. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6635. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
  6636. MOVL (R10), R11
  6637. MOVL -4(R10)(R9*1), R10
  6638. MOVL R11, (AX)
  6639. MOVL R10, -4(AX)(R9*1)
  6640. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6641. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
  6642. MOVQ (R10), R11
  6643. MOVQ -8(R10)(R9*1), R10
  6644. MOVQ R11, (AX)
  6645. MOVQ R10, -8(AX)(R9*1)
  6646. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6647. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
  6648. MOVOU (R10), X0
  6649. MOVOU -16(R10)(R9*1), X1
  6650. MOVOU X0, (AX)
  6651. MOVOU X1, -16(AX)(R9*1)
  6652. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
  6653. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
  6654. MOVOU (R10), X0
  6655. MOVOU 16(R10), X1
  6656. MOVOU -32(R10)(R9*1), X2
  6657. MOVOU -16(R10)(R9*1), X3
  6658. MOVOU X0, (AX)
  6659. MOVOU X1, 16(AX)
  6660. MOVOU X2, -32(AX)(R9*1)
  6661. MOVOU X3, -16(AX)(R9*1)
  6662. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
  6663. MOVQ SI, AX
  6664. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
  6665. memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
  6666. LEAQ (AX)(R9*1), SI
  6667. // genMemMoveLong
  6668. MOVOU (R10), X0
  6669. MOVOU 16(R10), X1
  6670. MOVOU -32(R10)(R9*1), X2
  6671. MOVOU -16(R10)(R9*1), X3
  6672. MOVQ R9, R13
  6673. SHRQ $0x05, R13
  6674. MOVQ AX, R11
  6675. ANDL $0x0000001f, R11
  6676. MOVQ $0x00000040, R14
  6677. SUBQ R11, R14
  6678. DECQ R13
  6679. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6680. LEAQ -32(R10)(R14*1), R11
  6681. LEAQ -32(AX)(R14*1), R15
  6682. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
  6683. MOVOU (R11), X4
  6684. MOVOU 16(R11), X5
  6685. MOVOA X4, (R15)
  6686. MOVOA X5, 16(R15)
  6687. ADDQ $0x20, R15
  6688. ADDQ $0x20, R11
  6689. ADDQ $0x20, R14
  6690. DECQ R13
  6691. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
  6692. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  6693. MOVOU -32(R10)(R14*1), X4
  6694. MOVOU -16(R10)(R14*1), X5
  6695. MOVOA X4, -32(AX)(R14*1)
  6696. MOVOA X5, -16(AX)(R14*1)
  6697. ADDQ $0x20, R14
  6698. CMPQ R9, R14
  6699. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6700. MOVOU X0, (AX)
  6701. MOVOU X1, 16(AX)
  6702. MOVOU X2, -32(AX)(R9*1)
  6703. MOVOU X3, -16(AX)(R9*1)
  6704. MOVQ SI, AX
  6705. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
  6706. ADDL R12, CX
  6707. ADDL $0x04, R12
  6708. MOVL CX, 12(SP)
  6709. // emitRepeat
  6710. MOVL R12, SI
  6711. LEAL -4(R12), R12
  6712. CMPL SI, $0x08
  6713. JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
  6714. CMPL SI, $0x0c
  6715. JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
  6716. CMPL R8, $0x00000800
  6717. JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
  6718. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6719. CMPL R12, $0x00000104
  6720. JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
  6721. CMPL R12, $0x00010100
  6722. JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
  6723. LEAL -65536(R12), R12
  6724. MOVL R12, R8
  6725. MOVW $0x001d, (AX)
  6726. MOVW R12, 2(AX)
  6727. SARL $0x10, R8
  6728. MOVB R8, 4(AX)
  6729. ADDQ $0x05, AX
  6730. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6731. repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6732. LEAL -256(R12), R12
  6733. MOVW $0x0019, (AX)
  6734. MOVW R12, 2(AX)
  6735. ADDQ $0x04, AX
  6736. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6737. repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6738. LEAL -4(R12), R12
  6739. MOVW $0x0015, (AX)
  6740. MOVB R12, 2(AX)
  6741. ADDQ $0x03, AX
  6742. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6743. repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6744. SHLL $0x02, R12
  6745. ORL $0x01, R12
  6746. MOVW R12, (AX)
  6747. ADDQ $0x02, AX
  6748. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
  6749. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
  6750. XORQ SI, SI
  6751. LEAL 1(SI)(R12*4), R12
  6752. MOVB R8, 1(AX)
  6753. SARL $0x08, R8
  6754. SHLL $0x05, R8
  6755. ORL R8, R12
  6756. MOVB R12, (AX)
  6757. ADDQ $0x02, AX
  6758. match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
  6759. CMPL CX, 8(SP)
  6760. JGE emit_remainder_encodeBetterBlockAsm4MB
  6761. CMPQ AX, (SP)
  6762. JL match_nolit_dst_ok_encodeBetterBlockAsm4MB
  6763. MOVQ $0x00000000, ret+48(FP)
  6764. RET
  6765. match_nolit_dst_ok_encodeBetterBlockAsm4MB:
  6766. MOVQ $0x00cf1bbcdcbfa563, SI
  6767. MOVQ $0x9e3779b1, R8
  6768. INCL DI
  6769. MOVQ (DX)(DI*1), R9
  6770. MOVQ R9, R10
  6771. MOVQ R9, R11
  6772. MOVQ R9, R12
  6773. SHRQ $0x08, R11
  6774. MOVQ R11, R13
  6775. SHRQ $0x10, R12
  6776. LEAL 1(DI), R14
  6777. LEAL 2(DI), R15
  6778. MOVQ -2(DX)(CX*1), R9
  6779. SHLQ $0x08, R10
  6780. IMULQ SI, R10
  6781. SHRQ $0x30, R10
  6782. SHLQ $0x08, R13
  6783. IMULQ SI, R13
  6784. SHRQ $0x30, R13
  6785. SHLQ $0x20, R11
  6786. IMULQ R8, R11
  6787. SHRQ $0x32, R11
  6788. SHLQ $0x20, R12
  6789. IMULQ R8, R12
  6790. SHRQ $0x32, R12
  6791. MOVL DI, 24(SP)(R10*4)
  6792. MOVL R14, 24(SP)(R13*4)
  6793. MOVL R14, 262168(SP)(R11*4)
  6794. MOVL R15, 262168(SP)(R12*4)
  6795. MOVQ R9, R10
  6796. MOVQ R9, R11
  6797. SHRQ $0x08, R11
  6798. MOVQ R11, R13
  6799. LEAL -2(CX), R9
  6800. LEAL -1(CX), DI
  6801. SHLQ $0x08, R10
  6802. IMULQ SI, R10
  6803. SHRQ $0x30, R10
  6804. SHLQ $0x20, R11
  6805. IMULQ R8, R11
  6806. SHRQ $0x32, R11
  6807. SHLQ $0x08, R13
  6808. IMULQ SI, R13
  6809. SHRQ $0x30, R13
  6810. MOVL R9, 24(SP)(R10*4)
  6811. MOVL DI, 262168(SP)(R11*4)
  6812. MOVL DI, 24(SP)(R13*4)
  6813. JMP search_loop_encodeBetterBlockAsm4MB
  6814. emit_remainder_encodeBetterBlockAsm4MB:
  6815. MOVQ src_len+32(FP), CX
  6816. SUBL 12(SP), CX
  6817. LEAQ 4(AX)(CX*1), CX
  6818. CMPQ CX, (SP)
  6819. JL emit_remainder_ok_encodeBetterBlockAsm4MB
  6820. MOVQ $0x00000000, ret+48(FP)
  6821. RET
  6822. emit_remainder_ok_encodeBetterBlockAsm4MB:
  6823. MOVQ src_len+32(FP), CX
  6824. MOVL 12(SP), BX
  6825. CMPL BX, CX
  6826. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
  6827. MOVL CX, SI
  6828. MOVL CX, 12(SP)
  6829. LEAQ (DX)(BX*1), CX
  6830. SUBL BX, SI
  6831. LEAL -1(SI), DX
  6832. CMPL DX, $0x3c
  6833. JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB
  6834. CMPL DX, $0x00000100
  6835. JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB
  6836. CMPL DX, $0x00010000
  6837. JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB
  6838. MOVL DX, BX
  6839. SHRL $0x10, BX
  6840. MOVB $0xf8, (AX)
  6841. MOVW DX, 1(AX)
  6842. MOVB BL, 3(AX)
  6843. ADDQ $0x04, AX
  6844. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  6845. three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
  6846. MOVB $0xf4, (AX)
  6847. MOVW DX, 1(AX)
  6848. ADDQ $0x03, AX
  6849. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  6850. two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
  6851. MOVB $0xf0, (AX)
  6852. MOVB DL, 1(AX)
  6853. ADDQ $0x02, AX
  6854. CMPL DX, $0x40
  6855. JL memmove_emit_remainder_encodeBetterBlockAsm4MB
  6856. JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
  6857. one_byte_emit_remainder_encodeBetterBlockAsm4MB:
  6858. SHLB $0x02, DL
  6859. MOVB DL, (AX)
  6860. ADDQ $0x01, AX
  6861. memmove_emit_remainder_encodeBetterBlockAsm4MB:
  6862. LEAQ (AX)(SI*1), DX
  6863. MOVL SI, BX
  6864. // genMemMoveShort
  6865. CMPQ BX, $0x03
  6866. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
  6867. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
  6868. CMPQ BX, $0x08
  6869. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
  6870. CMPQ BX, $0x10
  6871. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
  6872. CMPQ BX, $0x20
  6873. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
  6874. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
  6875. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
  6876. MOVB (CX), SI
  6877. MOVB -1(CX)(BX*1), CL
  6878. MOVB SI, (AX)
  6879. MOVB CL, -1(AX)(BX*1)
  6880. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  6881. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
  6882. MOVW (CX), SI
  6883. MOVB 2(CX), CL
  6884. MOVW SI, (AX)
  6885. MOVB CL, 2(AX)
  6886. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  6887. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
  6888. MOVL (CX), SI
  6889. MOVL -4(CX)(BX*1), CX
  6890. MOVL SI, (AX)
  6891. MOVL CX, -4(AX)(BX*1)
  6892. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  6893. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
  6894. MOVQ (CX), SI
  6895. MOVQ -8(CX)(BX*1), CX
  6896. MOVQ SI, (AX)
  6897. MOVQ CX, -8(AX)(BX*1)
  6898. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  6899. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
  6900. MOVOU (CX), X0
  6901. MOVOU -16(CX)(BX*1), X1
  6902. MOVOU X0, (AX)
  6903. MOVOU X1, -16(AX)(BX*1)
  6904. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
  6905. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
  6906. MOVOU (CX), X0
  6907. MOVOU 16(CX), X1
  6908. MOVOU -32(CX)(BX*1), X2
  6909. MOVOU -16(CX)(BX*1), X3
  6910. MOVOU X0, (AX)
  6911. MOVOU X1, 16(AX)
  6912. MOVOU X2, -32(AX)(BX*1)
  6913. MOVOU X3, -16(AX)(BX*1)
  6914. memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
  6915. MOVQ DX, AX
  6916. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
  6917. memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
  6918. LEAQ (AX)(SI*1), DX
  6919. MOVL SI, BX
  6920. // genMemMoveLong
  6921. MOVOU (CX), X0
  6922. MOVOU 16(CX), X1
  6923. MOVOU -32(CX)(BX*1), X2
  6924. MOVOU -16(CX)(BX*1), X3
  6925. MOVQ BX, DI
  6926. SHRQ $0x05, DI
  6927. MOVQ AX, SI
  6928. ANDL $0x0000001f, SI
  6929. MOVQ $0x00000040, R8
  6930. SUBQ SI, R8
  6931. DECQ DI
  6932. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6933. LEAQ -32(CX)(R8*1), SI
  6934. LEAQ -32(AX)(R8*1), R9
  6935. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
  6936. MOVOU (SI), X4
  6937. MOVOU 16(SI), X5
  6938. MOVOA X4, (R9)
  6939. MOVOA X5, 16(R9)
  6940. ADDQ $0x20, R9
  6941. ADDQ $0x20, SI
  6942. ADDQ $0x20, R8
  6943. DECQ DI
  6944. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
  6945. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
  6946. MOVOU -32(CX)(R8*1), X4
  6947. MOVOU -16(CX)(R8*1), X5
  6948. MOVOA X4, -32(AX)(R8*1)
  6949. MOVOA X5, -16(AX)(R8*1)
  6950. ADDQ $0x20, R8
  6951. CMPQ BX, R8
  6952. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
  6953. MOVOU X0, (AX)
  6954. MOVOU X1, 16(AX)
  6955. MOVOU X2, -32(AX)(BX*1)
  6956. MOVOU X3, -16(AX)(BX*1)
  6957. MOVQ DX, AX
  6958. emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
  6959. MOVQ dst_base+0(FP), CX
  6960. SUBQ CX, AX
  6961. MOVQ AX, ret+48(FP)
  6962. RET
  6963. // func encodeBetterBlockAsm12B(dst []byte, src []byte) int
  6964. // Requires: BMI, SSE2
  6965. TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
  6966. MOVQ dst_base+0(FP), AX
  6967. MOVQ $0x00000280, CX
  6968. LEAQ 24(SP), DX
  6969. PXOR X0, X0
  6970. zero_loop_encodeBetterBlockAsm12B:
  6971. MOVOU X0, (DX)
  6972. MOVOU X0, 16(DX)
  6973. MOVOU X0, 32(DX)
  6974. MOVOU X0, 48(DX)
  6975. MOVOU X0, 64(DX)
  6976. MOVOU X0, 80(DX)
  6977. MOVOU X0, 96(DX)
  6978. MOVOU X0, 112(DX)
  6979. ADDQ $0x80, DX
  6980. DECQ CX
  6981. JNZ zero_loop_encodeBetterBlockAsm12B
  6982. MOVL $0x00000000, 12(SP)
  6983. MOVQ src_len+32(FP), CX
  6984. LEAQ -6(CX), DX
  6985. LEAQ -8(CX), SI
  6986. MOVL SI, 8(SP)
  6987. SHRQ $0x05, CX
  6988. SUBL CX, DX
  6989. LEAQ (AX)(DX*1), DX
  6990. MOVQ DX, (SP)
  6991. MOVL $0x00000001, CX
  6992. MOVL $0x00000000, 16(SP)
  6993. MOVQ src_base+24(FP), DX
  6994. search_loop_encodeBetterBlockAsm12B:
  6995. MOVL CX, SI
  6996. SUBL 12(SP), SI
  6997. SHRL $0x06, SI
  6998. LEAL 1(CX)(SI*1), SI
  6999. CMPL SI, 8(SP)
  7000. JGE emit_remainder_encodeBetterBlockAsm12B
  7001. MOVQ (DX)(CX*1), DI
  7002. MOVL SI, 20(SP)
  7003. MOVQ $0x0000cf1bbcdcbf9b, R9
  7004. MOVQ $0x9e3779b1, SI
  7005. MOVQ DI, R10
  7006. MOVQ DI, R11
  7007. SHLQ $0x10, R10
  7008. IMULQ R9, R10
  7009. SHRQ $0x32, R10
  7010. SHLQ $0x20, R11
  7011. IMULQ SI, R11
  7012. SHRQ $0x34, R11
  7013. MOVL 24(SP)(R10*4), SI
  7014. MOVL 65560(SP)(R11*4), R8
  7015. MOVL CX, 24(SP)(R10*4)
  7016. MOVL CX, 65560(SP)(R11*4)
  7017. CMPL (DX)(SI*1), DI
  7018. JEQ candidate_match_encodeBetterBlockAsm12B
  7019. CMPL (DX)(R8*1), DI
  7020. JEQ candidateS_match_encodeBetterBlockAsm12B
  7021. MOVL 20(SP), CX
  7022. JMP search_loop_encodeBetterBlockAsm12B
  7023. candidateS_match_encodeBetterBlockAsm12B:
  7024. SHRQ $0x08, DI
  7025. MOVQ DI, R10
  7026. SHLQ $0x10, R10
  7027. IMULQ R9, R10
  7028. SHRQ $0x32, R10
  7029. MOVL 24(SP)(R10*4), SI
  7030. INCL CX
  7031. MOVL CX, 24(SP)(R10*4)
  7032. CMPL (DX)(SI*1), DI
  7033. JEQ candidate_match_encodeBetterBlockAsm12B
  7034. DECL CX
  7035. MOVL R8, SI
  7036. candidate_match_encodeBetterBlockAsm12B:
  7037. MOVL 12(SP), DI
  7038. TESTL SI, SI
  7039. JZ match_extend_back_end_encodeBetterBlockAsm12B
  7040. match_extend_back_loop_encodeBetterBlockAsm12B:
  7041. CMPL CX, DI
  7042. JLE match_extend_back_end_encodeBetterBlockAsm12B
  7043. MOVB -1(DX)(SI*1), BL
  7044. MOVB -1(DX)(CX*1), R8
  7045. CMPB BL, R8
  7046. JNE match_extend_back_end_encodeBetterBlockAsm12B
  7047. LEAL -1(CX), CX
  7048. DECL SI
  7049. JZ match_extend_back_end_encodeBetterBlockAsm12B
  7050. JMP match_extend_back_loop_encodeBetterBlockAsm12B
  7051. match_extend_back_end_encodeBetterBlockAsm12B:
  7052. MOVL CX, DI
  7053. SUBL 12(SP), DI
  7054. LEAQ 3(AX)(DI*1), DI
  7055. CMPQ DI, (SP)
  7056. JL match_dst_size_check_encodeBetterBlockAsm12B
  7057. MOVQ $0x00000000, ret+48(FP)
  7058. RET
  7059. match_dst_size_check_encodeBetterBlockAsm12B:
  7060. MOVL CX, DI
  7061. ADDL $0x04, CX
  7062. ADDL $0x04, SI
  7063. MOVQ src_len+32(FP), R8
  7064. SUBL CX, R8
  7065. LEAQ (DX)(CX*1), R9
  7066. LEAQ (DX)(SI*1), R10
  7067. // matchLen
  7068. XORL R12, R12
  7069. CMPL R8, $0x08
  7070. JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B
  7071. matchlen_loopback_match_nolit_encodeBetterBlockAsm12B:
  7072. MOVQ (R9)(R12*1), R11
  7073. XORQ (R10)(R12*1), R11
  7074. TESTQ R11, R11
  7075. JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B
  7076. #ifdef GOAMD64_v3
  7077. TZCNTQ R11, R11
  7078. #else
  7079. BSFQ R11, R11
  7080. #endif
  7081. SARQ $0x03, R11
  7082. LEAL (R12)(R11*1), R12
  7083. JMP match_nolit_end_encodeBetterBlockAsm12B
  7084. matchlen_loop_match_nolit_encodeBetterBlockAsm12B:
  7085. LEAL -8(R8), R8
  7086. LEAL 8(R12), R12
  7087. CMPL R8, $0x08
  7088. JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B
  7089. JZ match_nolit_end_encodeBetterBlockAsm12B
  7090. matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
  7091. CMPL R8, $0x04
  7092. JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B
  7093. MOVL (R9)(R12*1), R11
  7094. CMPL (R10)(R12*1), R11
  7095. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
  7096. SUBL $0x04, R8
  7097. LEAL 4(R12), R12
  7098. matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
  7099. CMPL R8, $0x02
  7100. JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B
  7101. MOVW (R9)(R12*1), R11
  7102. CMPW (R10)(R12*1), R11
  7103. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
  7104. SUBL $0x02, R8
  7105. LEAL 2(R12), R12
  7106. matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
  7107. CMPL R8, $0x01
  7108. JL match_nolit_end_encodeBetterBlockAsm12B
  7109. MOVB (R9)(R12*1), R11
  7110. CMPB (R10)(R12*1), R11
  7111. JNE match_nolit_end_encodeBetterBlockAsm12B
  7112. LEAL 1(R12), R12
  7113. match_nolit_end_encodeBetterBlockAsm12B:
  7114. MOVL CX, R8
  7115. SUBL SI, R8
  7116. // Check if repeat
  7117. CMPL 16(SP), R8
  7118. JEQ match_is_repeat_encodeBetterBlockAsm12B
  7119. MOVL R8, 16(SP)
  7120. MOVL 12(SP), SI
  7121. CMPL SI, DI
  7122. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
  7123. MOVL DI, R9
  7124. MOVL DI, 12(SP)
  7125. LEAQ (DX)(SI*1), R10
  7126. SUBL SI, R9
  7127. LEAL -1(R9), SI
  7128. CMPL SI, $0x3c
  7129. JLT one_byte_match_emit_encodeBetterBlockAsm12B
  7130. CMPL SI, $0x00000100
  7131. JLT two_bytes_match_emit_encodeBetterBlockAsm12B
  7132. MOVB $0xf4, (AX)
  7133. MOVW SI, 1(AX)
  7134. ADDQ $0x03, AX
  7135. JMP memmove_long_match_emit_encodeBetterBlockAsm12B
  7136. two_bytes_match_emit_encodeBetterBlockAsm12B:
  7137. MOVB $0xf0, (AX)
  7138. MOVB SI, 1(AX)
  7139. ADDQ $0x02, AX
  7140. CMPL SI, $0x40
  7141. JL memmove_match_emit_encodeBetterBlockAsm12B
  7142. JMP memmove_long_match_emit_encodeBetterBlockAsm12B
  7143. one_byte_match_emit_encodeBetterBlockAsm12B:
  7144. SHLB $0x02, SI
  7145. MOVB SI, (AX)
  7146. ADDQ $0x01, AX
  7147. memmove_match_emit_encodeBetterBlockAsm12B:
  7148. LEAQ (AX)(R9*1), SI
  7149. // genMemMoveShort
  7150. CMPQ R9, $0x04
  7151. JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
  7152. CMPQ R9, $0x08
  7153. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
  7154. CMPQ R9, $0x10
  7155. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
  7156. CMPQ R9, $0x20
  7157. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
  7158. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
  7159. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
  7160. MOVL (R10), R11
  7161. MOVL R11, (AX)
  7162. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7163. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
  7164. MOVL (R10), R11
  7165. MOVL -4(R10)(R9*1), R10
  7166. MOVL R11, (AX)
  7167. MOVL R10, -4(AX)(R9*1)
  7168. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7169. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
  7170. MOVQ (R10), R11
  7171. MOVQ -8(R10)(R9*1), R10
  7172. MOVQ R11, (AX)
  7173. MOVQ R10, -8(AX)(R9*1)
  7174. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7175. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
  7176. MOVOU (R10), X0
  7177. MOVOU -16(R10)(R9*1), X1
  7178. MOVOU X0, (AX)
  7179. MOVOU X1, -16(AX)(R9*1)
  7180. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
  7181. emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
  7182. MOVOU (R10), X0
  7183. MOVOU 16(R10), X1
  7184. MOVOU -32(R10)(R9*1), X2
  7185. MOVOU -16(R10)(R9*1), X3
  7186. MOVOU X0, (AX)
  7187. MOVOU X1, 16(AX)
  7188. MOVOU X2, -32(AX)(R9*1)
  7189. MOVOU X3, -16(AX)(R9*1)
  7190. memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
  7191. MOVQ SI, AX
  7192. JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
  7193. memmove_long_match_emit_encodeBetterBlockAsm12B:
  7194. LEAQ (AX)(R9*1), SI
  7195. // genMemMoveLong
  7196. MOVOU (R10), X0
  7197. MOVOU 16(R10), X1
  7198. MOVOU -32(R10)(R9*1), X2
  7199. MOVOU -16(R10)(R9*1), X3
  7200. MOVQ R9, R13
  7201. SHRQ $0x05, R13
  7202. MOVQ AX, R11
  7203. ANDL $0x0000001f, R11
  7204. MOVQ $0x00000040, R14
  7205. SUBQ R11, R14
  7206. DECQ R13
  7207. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7208. LEAQ -32(R10)(R14*1), R11
  7209. LEAQ -32(AX)(R14*1), R15
  7210. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
  7211. MOVOU (R11), X4
  7212. MOVOU 16(R11), X5
  7213. MOVOA X4, (R15)
  7214. MOVOA X5, 16(R15)
  7215. ADDQ $0x20, R15
  7216. ADDQ $0x20, R11
  7217. ADDQ $0x20, R14
  7218. DECQ R13
  7219. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
  7220. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7221. MOVOU -32(R10)(R14*1), X4
  7222. MOVOU -16(R10)(R14*1), X5
  7223. MOVOA X4, -32(AX)(R14*1)
  7224. MOVOA X5, -16(AX)(R14*1)
  7225. ADDQ $0x20, R14
  7226. CMPQ R9, R14
  7227. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7228. MOVOU X0, (AX)
  7229. MOVOU X1, 16(AX)
  7230. MOVOU X2, -32(AX)(R9*1)
  7231. MOVOU X3, -16(AX)(R9*1)
  7232. MOVQ SI, AX
  7233. emit_literal_done_match_emit_encodeBetterBlockAsm12B:
  7234. ADDL R12, CX
  7235. ADDL $0x04, R12
  7236. MOVL CX, 12(SP)
  7237. // emitCopy
  7238. two_byte_offset_match_nolit_encodeBetterBlockAsm12B:
  7239. CMPL R12, $0x40
  7240. JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
  7241. CMPL R8, $0x00000800
  7242. JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
  7243. MOVL $0x00000001, SI
  7244. LEAL 16(SI), SI
  7245. MOVB R8, 1(AX)
  7246. SHRL $0x08, R8
  7247. SHLL $0x05, R8
  7248. ORL R8, SI
  7249. MOVB SI, (AX)
  7250. ADDQ $0x02, AX
  7251. SUBL $0x08, R12
  7252. // emitRepeat
  7253. LEAL -4(R12), R12
  7254. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7255. MOVL R12, SI
  7256. LEAL -4(R12), R12
  7257. CMPL SI, $0x08
  7258. JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7259. CMPL SI, $0x0c
  7260. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7261. CMPL R8, $0x00000800
  7262. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7263. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7264. CMPL R12, $0x00000104
  7265. JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
  7266. LEAL -256(R12), R12
  7267. MOVW $0x0019, (AX)
  7268. MOVW R12, 2(AX)
  7269. ADDQ $0x04, AX
  7270. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7271. repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7272. LEAL -4(R12), R12
  7273. MOVW $0x0015, (AX)
  7274. MOVB R12, 2(AX)
  7275. ADDQ $0x03, AX
  7276. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7277. repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7278. SHLL $0x02, R12
  7279. ORL $0x01, R12
  7280. MOVW R12, (AX)
  7281. ADDQ $0x02, AX
  7282. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7283. repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
  7284. XORQ SI, SI
  7285. LEAL 1(SI)(R12*4), R12
  7286. MOVB R8, 1(AX)
  7287. SARL $0x08, R8
  7288. SHLL $0x05, R8
  7289. ORL R8, R12
  7290. MOVB R12, (AX)
  7291. ADDQ $0x02, AX
  7292. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7293. long_offset_short_match_nolit_encodeBetterBlockAsm12B:
  7294. MOVB $0xee, (AX)
  7295. MOVW R8, 1(AX)
  7296. LEAL -60(R12), R12
  7297. ADDQ $0x03, AX
  7298. // emitRepeat
  7299. MOVL R12, SI
  7300. LEAL -4(R12), R12
  7301. CMPL SI, $0x08
  7302. JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7303. CMPL SI, $0x0c
  7304. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7305. CMPL R8, $0x00000800
  7306. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7307. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7308. CMPL R12, $0x00000104
  7309. JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
  7310. LEAL -256(R12), R12
  7311. MOVW $0x0019, (AX)
  7312. MOVW R12, 2(AX)
  7313. ADDQ $0x04, AX
  7314. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7315. repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7316. LEAL -4(R12), R12
  7317. MOVW $0x0015, (AX)
  7318. MOVB R12, 2(AX)
  7319. ADDQ $0x03, AX
  7320. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7321. repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7322. SHLL $0x02, R12
  7323. ORL $0x01, R12
  7324. MOVW R12, (AX)
  7325. ADDQ $0x02, AX
  7326. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7327. repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
  7328. XORQ SI, SI
  7329. LEAL 1(SI)(R12*4), R12
  7330. MOVB R8, 1(AX)
  7331. SARL $0x08, R8
  7332. SHLL $0x05, R8
  7333. ORL R8, R12
  7334. MOVB R12, (AX)
  7335. ADDQ $0x02, AX
  7336. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7337. JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B
  7338. two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
  7339. CMPL R12, $0x0c
  7340. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
  7341. CMPL R8, $0x00000800
  7342. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
  7343. MOVB $0x01, BL
  7344. LEAL -16(BX)(R12*4), R12
  7345. MOVB R8, 1(AX)
  7346. SHRL $0x08, R8
  7347. SHLL $0x05, R8
  7348. ORL R8, R12
  7349. MOVB R12, (AX)
  7350. ADDQ $0x02, AX
  7351. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7352. emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
  7353. MOVB $0x02, BL
  7354. LEAL -4(BX)(R12*4), R12
  7355. MOVB R12, (AX)
  7356. MOVW R8, 1(AX)
  7357. ADDQ $0x03, AX
  7358. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7359. match_is_repeat_encodeBetterBlockAsm12B:
  7360. MOVL 12(SP), SI
  7361. CMPL SI, DI
  7362. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
  7363. MOVL DI, R9
  7364. MOVL DI, 12(SP)
  7365. LEAQ (DX)(SI*1), R10
  7366. SUBL SI, R9
  7367. LEAL -1(R9), SI
  7368. CMPL SI, $0x3c
  7369. JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B
  7370. CMPL SI, $0x00000100
  7371. JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
  7372. MOVB $0xf4, (AX)
  7373. MOVW SI, 1(AX)
  7374. ADDQ $0x03, AX
  7375. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
  7376. two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
  7377. MOVB $0xf0, (AX)
  7378. MOVB SI, 1(AX)
  7379. ADDQ $0x02, AX
  7380. CMPL SI, $0x40
  7381. JL memmove_match_emit_repeat_encodeBetterBlockAsm12B
  7382. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
  7383. one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
  7384. SHLB $0x02, SI
  7385. MOVB SI, (AX)
  7386. ADDQ $0x01, AX
  7387. memmove_match_emit_repeat_encodeBetterBlockAsm12B:
  7388. LEAQ (AX)(R9*1), SI
  7389. // genMemMoveShort
  7390. CMPQ R9, $0x04
  7391. JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
  7392. CMPQ R9, $0x08
  7393. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
  7394. CMPQ R9, $0x10
  7395. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
  7396. CMPQ R9, $0x20
  7397. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
  7398. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
  7399. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
  7400. MOVL (R10), R11
  7401. MOVL R11, (AX)
  7402. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7403. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
  7404. MOVL (R10), R11
  7405. MOVL -4(R10)(R9*1), R10
  7406. MOVL R11, (AX)
  7407. MOVL R10, -4(AX)(R9*1)
  7408. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7409. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
  7410. MOVQ (R10), R11
  7411. MOVQ -8(R10)(R9*1), R10
  7412. MOVQ R11, (AX)
  7413. MOVQ R10, -8(AX)(R9*1)
  7414. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7415. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
  7416. MOVOU (R10), X0
  7417. MOVOU -16(R10)(R9*1), X1
  7418. MOVOU X0, (AX)
  7419. MOVOU X1, -16(AX)(R9*1)
  7420. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
  7421. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
  7422. MOVOU (R10), X0
  7423. MOVOU 16(R10), X1
  7424. MOVOU -32(R10)(R9*1), X2
  7425. MOVOU -16(R10)(R9*1), X3
  7426. MOVOU X0, (AX)
  7427. MOVOU X1, 16(AX)
  7428. MOVOU X2, -32(AX)(R9*1)
  7429. MOVOU X3, -16(AX)(R9*1)
  7430. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
  7431. MOVQ SI, AX
  7432. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
  7433. memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
  7434. LEAQ (AX)(R9*1), SI
  7435. // genMemMoveLong
  7436. MOVOU (R10), X0
  7437. MOVOU 16(R10), X1
  7438. MOVOU -32(R10)(R9*1), X2
  7439. MOVOU -16(R10)(R9*1), X3
  7440. MOVQ R9, R13
  7441. SHRQ $0x05, R13
  7442. MOVQ AX, R11
  7443. ANDL $0x0000001f, R11
  7444. MOVQ $0x00000040, R14
  7445. SUBQ R11, R14
  7446. DECQ R13
  7447. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7448. LEAQ -32(R10)(R14*1), R11
  7449. LEAQ -32(AX)(R14*1), R15
  7450. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
  7451. MOVOU (R11), X4
  7452. MOVOU 16(R11), X5
  7453. MOVOA X4, (R15)
  7454. MOVOA X5, 16(R15)
  7455. ADDQ $0x20, R15
  7456. ADDQ $0x20, R11
  7457. ADDQ $0x20, R14
  7458. DECQ R13
  7459. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
  7460. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7461. MOVOU -32(R10)(R14*1), X4
  7462. MOVOU -16(R10)(R14*1), X5
  7463. MOVOA X4, -32(AX)(R14*1)
  7464. MOVOA X5, -16(AX)(R14*1)
  7465. ADDQ $0x20, R14
  7466. CMPQ R9, R14
  7467. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7468. MOVOU X0, (AX)
  7469. MOVOU X1, 16(AX)
  7470. MOVOU X2, -32(AX)(R9*1)
  7471. MOVOU X3, -16(AX)(R9*1)
  7472. MOVQ SI, AX
  7473. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
  7474. ADDL R12, CX
  7475. ADDL $0x04, R12
  7476. MOVL CX, 12(SP)
  7477. // emitRepeat
  7478. MOVL R12, SI
  7479. LEAL -4(R12), R12
  7480. CMPL SI, $0x08
  7481. JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
  7482. CMPL SI, $0x0c
  7483. JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
  7484. CMPL R8, $0x00000800
  7485. JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
  7486. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
  7487. CMPL R12, $0x00000104
  7488. JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
  7489. LEAL -256(R12), R12
  7490. MOVW $0x0019, (AX)
  7491. MOVW R12, 2(AX)
  7492. ADDQ $0x04, AX
  7493. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7494. repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
  7495. LEAL -4(R12), R12
  7496. MOVW $0x0015, (AX)
  7497. MOVB R12, 2(AX)
  7498. ADDQ $0x03, AX
  7499. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7500. repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
  7501. SHLL $0x02, R12
  7502. ORL $0x01, R12
  7503. MOVW R12, (AX)
  7504. ADDQ $0x02, AX
  7505. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
  7506. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
  7507. XORQ SI, SI
  7508. LEAL 1(SI)(R12*4), R12
  7509. MOVB R8, 1(AX)
  7510. SARL $0x08, R8
  7511. SHLL $0x05, R8
  7512. ORL R8, R12
  7513. MOVB R12, (AX)
  7514. ADDQ $0x02, AX
  7515. match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
  7516. CMPL CX, 8(SP)
  7517. JGE emit_remainder_encodeBetterBlockAsm12B
  7518. CMPQ AX, (SP)
  7519. JL match_nolit_dst_ok_encodeBetterBlockAsm12B
  7520. MOVQ $0x00000000, ret+48(FP)
  7521. RET
  7522. match_nolit_dst_ok_encodeBetterBlockAsm12B:
  7523. MOVQ $0x0000cf1bbcdcbf9b, SI
  7524. MOVQ $0x9e3779b1, R8
  7525. INCL DI
  7526. MOVQ (DX)(DI*1), R9
  7527. MOVQ R9, R10
  7528. MOVQ R9, R11
  7529. MOVQ R9, R12
  7530. SHRQ $0x08, R11
  7531. MOVQ R11, R13
  7532. SHRQ $0x10, R12
  7533. LEAL 1(DI), R14
  7534. LEAL 2(DI), R15
  7535. MOVQ -2(DX)(CX*1), R9
  7536. SHLQ $0x10, R10
  7537. IMULQ SI, R10
  7538. SHRQ $0x32, R10
  7539. SHLQ $0x10, R13
  7540. IMULQ SI, R13
  7541. SHRQ $0x32, R13
  7542. SHLQ $0x20, R11
  7543. IMULQ R8, R11
  7544. SHRQ $0x34, R11
  7545. SHLQ $0x20, R12
  7546. IMULQ R8, R12
  7547. SHRQ $0x34, R12
  7548. MOVL DI, 24(SP)(R10*4)
  7549. MOVL R14, 24(SP)(R13*4)
  7550. MOVL R14, 65560(SP)(R11*4)
  7551. MOVL R15, 65560(SP)(R12*4)
  7552. MOVQ R9, R10
  7553. MOVQ R9, R11
  7554. SHRQ $0x08, R11
  7555. MOVQ R11, R13
  7556. LEAL -2(CX), R9
  7557. LEAL -1(CX), DI
  7558. SHLQ $0x10, R10
  7559. IMULQ SI, R10
  7560. SHRQ $0x32, R10
  7561. SHLQ $0x20, R11
  7562. IMULQ R8, R11
  7563. SHRQ $0x34, R11
  7564. SHLQ $0x10, R13
  7565. IMULQ SI, R13
  7566. SHRQ $0x32, R13
  7567. MOVL R9, 24(SP)(R10*4)
  7568. MOVL DI, 65560(SP)(R11*4)
  7569. MOVL DI, 24(SP)(R13*4)
  7570. JMP search_loop_encodeBetterBlockAsm12B
  7571. emit_remainder_encodeBetterBlockAsm12B:
  7572. MOVQ src_len+32(FP), CX
  7573. SUBL 12(SP), CX
  7574. LEAQ 3(AX)(CX*1), CX
  7575. CMPQ CX, (SP)
  7576. JL emit_remainder_ok_encodeBetterBlockAsm12B
  7577. MOVQ $0x00000000, ret+48(FP)
  7578. RET
  7579. emit_remainder_ok_encodeBetterBlockAsm12B:
  7580. MOVQ src_len+32(FP), CX
  7581. MOVL 12(SP), BX
  7582. CMPL BX, CX
  7583. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
  7584. MOVL CX, SI
  7585. MOVL CX, 12(SP)
  7586. LEAQ (DX)(BX*1), CX
  7587. SUBL BX, SI
  7588. LEAL -1(SI), DX
  7589. CMPL DX, $0x3c
  7590. JLT one_byte_emit_remainder_encodeBetterBlockAsm12B
  7591. CMPL DX, $0x00000100
  7592. JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B
  7593. MOVB $0xf4, (AX)
  7594. MOVW DX, 1(AX)
  7595. ADDQ $0x03, AX
  7596. JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
  7597. two_bytes_emit_remainder_encodeBetterBlockAsm12B:
  7598. MOVB $0xf0, (AX)
  7599. MOVB DL, 1(AX)
  7600. ADDQ $0x02, AX
  7601. CMPL DX, $0x40
  7602. JL memmove_emit_remainder_encodeBetterBlockAsm12B
  7603. JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
  7604. one_byte_emit_remainder_encodeBetterBlockAsm12B:
  7605. SHLB $0x02, DL
  7606. MOVB DL, (AX)
  7607. ADDQ $0x01, AX
  7608. memmove_emit_remainder_encodeBetterBlockAsm12B:
  7609. LEAQ (AX)(SI*1), DX
  7610. MOVL SI, BX
  7611. // genMemMoveShort
  7612. CMPQ BX, $0x03
  7613. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
  7614. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
  7615. CMPQ BX, $0x08
  7616. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
  7617. CMPQ BX, $0x10
  7618. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
  7619. CMPQ BX, $0x20
  7620. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
  7621. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
  7622. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
  7623. MOVB (CX), SI
  7624. MOVB -1(CX)(BX*1), CL
  7625. MOVB SI, (AX)
  7626. MOVB CL, -1(AX)(BX*1)
  7627. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7628. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
  7629. MOVW (CX), SI
  7630. MOVB 2(CX), CL
  7631. MOVW SI, (AX)
  7632. MOVB CL, 2(AX)
  7633. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7634. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
  7635. MOVL (CX), SI
  7636. MOVL -4(CX)(BX*1), CX
  7637. MOVL SI, (AX)
  7638. MOVL CX, -4(AX)(BX*1)
  7639. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7640. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
  7641. MOVQ (CX), SI
  7642. MOVQ -8(CX)(BX*1), CX
  7643. MOVQ SI, (AX)
  7644. MOVQ CX, -8(AX)(BX*1)
  7645. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7646. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
  7647. MOVOU (CX), X0
  7648. MOVOU -16(CX)(BX*1), X1
  7649. MOVOU X0, (AX)
  7650. MOVOU X1, -16(AX)(BX*1)
  7651. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
  7652. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
  7653. MOVOU (CX), X0
  7654. MOVOU 16(CX), X1
  7655. MOVOU -32(CX)(BX*1), X2
  7656. MOVOU -16(CX)(BX*1), X3
  7657. MOVOU X0, (AX)
  7658. MOVOU X1, 16(AX)
  7659. MOVOU X2, -32(AX)(BX*1)
  7660. MOVOU X3, -16(AX)(BX*1)
  7661. memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
  7662. MOVQ DX, AX
  7663. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
  7664. memmove_long_emit_remainder_encodeBetterBlockAsm12B:
  7665. LEAQ (AX)(SI*1), DX
  7666. MOVL SI, BX
  7667. // genMemMoveLong
  7668. MOVOU (CX), X0
  7669. MOVOU 16(CX), X1
  7670. MOVOU -32(CX)(BX*1), X2
  7671. MOVOU -16(CX)(BX*1), X3
  7672. MOVQ BX, DI
  7673. SHRQ $0x05, DI
  7674. MOVQ AX, SI
  7675. ANDL $0x0000001f, SI
  7676. MOVQ $0x00000040, R8
  7677. SUBQ SI, R8
  7678. DECQ DI
  7679. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7680. LEAQ -32(CX)(R8*1), SI
  7681. LEAQ -32(AX)(R8*1), R9
  7682. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
  7683. MOVOU (SI), X4
  7684. MOVOU 16(SI), X5
  7685. MOVOA X4, (R9)
  7686. MOVOA X5, 16(R9)
  7687. ADDQ $0x20, R9
  7688. ADDQ $0x20, SI
  7689. ADDQ $0x20, R8
  7690. DECQ DI
  7691. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
  7692. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
  7693. MOVOU -32(CX)(R8*1), X4
  7694. MOVOU -16(CX)(R8*1), X5
  7695. MOVOA X4, -32(AX)(R8*1)
  7696. MOVOA X5, -16(AX)(R8*1)
  7697. ADDQ $0x20, R8
  7698. CMPQ BX, R8
  7699. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
  7700. MOVOU X0, (AX)
  7701. MOVOU X1, 16(AX)
  7702. MOVOU X2, -32(AX)(BX*1)
  7703. MOVOU X3, -16(AX)(BX*1)
  7704. MOVQ DX, AX
  7705. emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
  7706. MOVQ dst_base+0(FP), CX
  7707. SUBQ CX, AX
  7708. MOVQ AX, ret+48(FP)
  7709. RET
  7710. // func encodeBetterBlockAsm10B(dst []byte, src []byte) int
  7711. // Requires: BMI, SSE2
  7712. TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
  7713. MOVQ dst_base+0(FP), AX
  7714. MOVQ $0x000000a0, CX
  7715. LEAQ 24(SP), DX
  7716. PXOR X0, X0
  7717. zero_loop_encodeBetterBlockAsm10B:
  7718. MOVOU X0, (DX)
  7719. MOVOU X0, 16(DX)
  7720. MOVOU X0, 32(DX)
  7721. MOVOU X0, 48(DX)
  7722. MOVOU X0, 64(DX)
  7723. MOVOU X0, 80(DX)
  7724. MOVOU X0, 96(DX)
  7725. MOVOU X0, 112(DX)
  7726. ADDQ $0x80, DX
  7727. DECQ CX
  7728. JNZ zero_loop_encodeBetterBlockAsm10B
  7729. MOVL $0x00000000, 12(SP)
  7730. MOVQ src_len+32(FP), CX
  7731. LEAQ -6(CX), DX
  7732. LEAQ -8(CX), SI
  7733. MOVL SI, 8(SP)
  7734. SHRQ $0x05, CX
  7735. SUBL CX, DX
  7736. LEAQ (AX)(DX*1), DX
  7737. MOVQ DX, (SP)
  7738. MOVL $0x00000001, CX
  7739. MOVL $0x00000000, 16(SP)
  7740. MOVQ src_base+24(FP), DX
  7741. search_loop_encodeBetterBlockAsm10B:
  7742. MOVL CX, SI
  7743. SUBL 12(SP), SI
  7744. SHRL $0x05, SI
  7745. LEAL 1(CX)(SI*1), SI
  7746. CMPL SI, 8(SP)
  7747. JGE emit_remainder_encodeBetterBlockAsm10B
  7748. MOVQ (DX)(CX*1), DI
  7749. MOVL SI, 20(SP)
  7750. MOVQ $0x0000cf1bbcdcbf9b, R9
  7751. MOVQ $0x9e3779b1, SI
  7752. MOVQ DI, R10
  7753. MOVQ DI, R11
  7754. SHLQ $0x10, R10
  7755. IMULQ R9, R10
  7756. SHRQ $0x34, R10
  7757. SHLQ $0x20, R11
  7758. IMULQ SI, R11
  7759. SHRQ $0x36, R11
  7760. MOVL 24(SP)(R10*4), SI
  7761. MOVL 16408(SP)(R11*4), R8
  7762. MOVL CX, 24(SP)(R10*4)
  7763. MOVL CX, 16408(SP)(R11*4)
  7764. CMPL (DX)(SI*1), DI
  7765. JEQ candidate_match_encodeBetterBlockAsm10B
  7766. CMPL (DX)(R8*1), DI
  7767. JEQ candidateS_match_encodeBetterBlockAsm10B
  7768. MOVL 20(SP), CX
  7769. JMP search_loop_encodeBetterBlockAsm10B
  7770. candidateS_match_encodeBetterBlockAsm10B:
  7771. SHRQ $0x08, DI
  7772. MOVQ DI, R10
  7773. SHLQ $0x10, R10
  7774. IMULQ R9, R10
  7775. SHRQ $0x34, R10
  7776. MOVL 24(SP)(R10*4), SI
  7777. INCL CX
  7778. MOVL CX, 24(SP)(R10*4)
  7779. CMPL (DX)(SI*1), DI
  7780. JEQ candidate_match_encodeBetterBlockAsm10B
  7781. DECL CX
  7782. MOVL R8, SI
  7783. candidate_match_encodeBetterBlockAsm10B:
  7784. MOVL 12(SP), DI
  7785. TESTL SI, SI
  7786. JZ match_extend_back_end_encodeBetterBlockAsm10B
  7787. match_extend_back_loop_encodeBetterBlockAsm10B:
  7788. CMPL CX, DI
  7789. JLE match_extend_back_end_encodeBetterBlockAsm10B
  7790. MOVB -1(DX)(SI*1), BL
  7791. MOVB -1(DX)(CX*1), R8
  7792. CMPB BL, R8
  7793. JNE match_extend_back_end_encodeBetterBlockAsm10B
  7794. LEAL -1(CX), CX
  7795. DECL SI
  7796. JZ match_extend_back_end_encodeBetterBlockAsm10B
  7797. JMP match_extend_back_loop_encodeBetterBlockAsm10B
  7798. match_extend_back_end_encodeBetterBlockAsm10B:
  7799. MOVL CX, DI
  7800. SUBL 12(SP), DI
  7801. LEAQ 3(AX)(DI*1), DI
  7802. CMPQ DI, (SP)
  7803. JL match_dst_size_check_encodeBetterBlockAsm10B
  7804. MOVQ $0x00000000, ret+48(FP)
  7805. RET
  7806. match_dst_size_check_encodeBetterBlockAsm10B:
  7807. MOVL CX, DI
  7808. ADDL $0x04, CX
  7809. ADDL $0x04, SI
  7810. MOVQ src_len+32(FP), R8
  7811. SUBL CX, R8
  7812. LEAQ (DX)(CX*1), R9
  7813. LEAQ (DX)(SI*1), R10
  7814. // matchLen
  7815. XORL R12, R12
  7816. CMPL R8, $0x08
  7817. JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B
  7818. matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
  7819. MOVQ (R9)(R12*1), R11
  7820. XORQ (R10)(R12*1), R11
  7821. TESTQ R11, R11
  7822. JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B
  7823. #ifdef GOAMD64_v3
  7824. TZCNTQ R11, R11
  7825. #else
  7826. BSFQ R11, R11
  7827. #endif
  7828. SARQ $0x03, R11
  7829. LEAL (R12)(R11*1), R12
  7830. JMP match_nolit_end_encodeBetterBlockAsm10B
  7831. matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
  7832. LEAL -8(R8), R8
  7833. LEAL 8(R12), R12
  7834. CMPL R8, $0x08
  7835. JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
  7836. JZ match_nolit_end_encodeBetterBlockAsm10B
  7837. matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
  7838. CMPL R8, $0x04
  7839. JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B
  7840. MOVL (R9)(R12*1), R11
  7841. CMPL (R10)(R12*1), R11
  7842. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
  7843. SUBL $0x04, R8
  7844. LEAL 4(R12), R12
  7845. matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
  7846. CMPL R8, $0x02
  7847. JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B
  7848. MOVW (R9)(R12*1), R11
  7849. CMPW (R10)(R12*1), R11
  7850. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
  7851. SUBL $0x02, R8
  7852. LEAL 2(R12), R12
  7853. matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
  7854. CMPL R8, $0x01
  7855. JL match_nolit_end_encodeBetterBlockAsm10B
  7856. MOVB (R9)(R12*1), R11
  7857. CMPB (R10)(R12*1), R11
  7858. JNE match_nolit_end_encodeBetterBlockAsm10B
  7859. LEAL 1(R12), R12
  7860. match_nolit_end_encodeBetterBlockAsm10B:
  7861. MOVL CX, R8
  7862. SUBL SI, R8
  7863. // Check if repeat
  7864. CMPL 16(SP), R8
  7865. JEQ match_is_repeat_encodeBetterBlockAsm10B
  7866. MOVL R8, 16(SP)
  7867. MOVL 12(SP), SI
  7868. CMPL SI, DI
  7869. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
  7870. MOVL DI, R9
  7871. MOVL DI, 12(SP)
  7872. LEAQ (DX)(SI*1), R10
  7873. SUBL SI, R9
  7874. LEAL -1(R9), SI
  7875. CMPL SI, $0x3c
  7876. JLT one_byte_match_emit_encodeBetterBlockAsm10B
  7877. CMPL SI, $0x00000100
  7878. JLT two_bytes_match_emit_encodeBetterBlockAsm10B
  7879. MOVB $0xf4, (AX)
  7880. MOVW SI, 1(AX)
  7881. ADDQ $0x03, AX
  7882. JMP memmove_long_match_emit_encodeBetterBlockAsm10B
  7883. two_bytes_match_emit_encodeBetterBlockAsm10B:
  7884. MOVB $0xf0, (AX)
  7885. MOVB SI, 1(AX)
  7886. ADDQ $0x02, AX
  7887. CMPL SI, $0x40
  7888. JL memmove_match_emit_encodeBetterBlockAsm10B
  7889. JMP memmove_long_match_emit_encodeBetterBlockAsm10B
  7890. one_byte_match_emit_encodeBetterBlockAsm10B:
  7891. SHLB $0x02, SI
  7892. MOVB SI, (AX)
  7893. ADDQ $0x01, AX
  7894. memmove_match_emit_encodeBetterBlockAsm10B:
  7895. LEAQ (AX)(R9*1), SI
  7896. // genMemMoveShort
  7897. CMPQ R9, $0x04
  7898. JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
  7899. CMPQ R9, $0x08
  7900. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
  7901. CMPQ R9, $0x10
  7902. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
  7903. CMPQ R9, $0x20
  7904. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
  7905. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
  7906. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
  7907. MOVL (R10), R11
  7908. MOVL R11, (AX)
  7909. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  7910. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
  7911. MOVL (R10), R11
  7912. MOVL -4(R10)(R9*1), R10
  7913. MOVL R11, (AX)
  7914. MOVL R10, -4(AX)(R9*1)
  7915. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  7916. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
  7917. MOVQ (R10), R11
  7918. MOVQ -8(R10)(R9*1), R10
  7919. MOVQ R11, (AX)
  7920. MOVQ R10, -8(AX)(R9*1)
  7921. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  7922. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
  7923. MOVOU (R10), X0
  7924. MOVOU -16(R10)(R9*1), X1
  7925. MOVOU X0, (AX)
  7926. MOVOU X1, -16(AX)(R9*1)
  7927. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
  7928. emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
  7929. MOVOU (R10), X0
  7930. MOVOU 16(R10), X1
  7931. MOVOU -32(R10)(R9*1), X2
  7932. MOVOU -16(R10)(R9*1), X3
  7933. MOVOU X0, (AX)
  7934. MOVOU X1, 16(AX)
  7935. MOVOU X2, -32(AX)(R9*1)
  7936. MOVOU X3, -16(AX)(R9*1)
  7937. memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
  7938. MOVQ SI, AX
  7939. JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
  7940. memmove_long_match_emit_encodeBetterBlockAsm10B:
  7941. LEAQ (AX)(R9*1), SI
  7942. // genMemMoveLong
  7943. MOVOU (R10), X0
  7944. MOVOU 16(R10), X1
  7945. MOVOU -32(R10)(R9*1), X2
  7946. MOVOU -16(R10)(R9*1), X3
  7947. MOVQ R9, R13
  7948. SHRQ $0x05, R13
  7949. MOVQ AX, R11
  7950. ANDL $0x0000001f, R11
  7951. MOVQ $0x00000040, R14
  7952. SUBQ R11, R14
  7953. DECQ R13
  7954. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  7955. LEAQ -32(R10)(R14*1), R11
  7956. LEAQ -32(AX)(R14*1), R15
  7957. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
  7958. MOVOU (R11), X4
  7959. MOVOU 16(R11), X5
  7960. MOVOA X4, (R15)
  7961. MOVOA X5, 16(R15)
  7962. ADDQ $0x20, R15
  7963. ADDQ $0x20, R11
  7964. ADDQ $0x20, R14
  7965. DECQ R13
  7966. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
  7967. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  7968. MOVOU -32(R10)(R14*1), X4
  7969. MOVOU -16(R10)(R14*1), X5
  7970. MOVOA X4, -32(AX)(R14*1)
  7971. MOVOA X5, -16(AX)(R14*1)
  7972. ADDQ $0x20, R14
  7973. CMPQ R9, R14
  7974. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  7975. MOVOU X0, (AX)
  7976. MOVOU X1, 16(AX)
  7977. MOVOU X2, -32(AX)(R9*1)
  7978. MOVOU X3, -16(AX)(R9*1)
  7979. MOVQ SI, AX
  7980. emit_literal_done_match_emit_encodeBetterBlockAsm10B:
  7981. ADDL R12, CX
  7982. ADDL $0x04, R12
  7983. MOVL CX, 12(SP)
  7984. // emitCopy
  7985. two_byte_offset_match_nolit_encodeBetterBlockAsm10B:
  7986. CMPL R12, $0x40
  7987. JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
  7988. CMPL R8, $0x00000800
  7989. JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
  7990. MOVL $0x00000001, SI
  7991. LEAL 16(SI), SI
  7992. MOVB R8, 1(AX)
  7993. SHRL $0x08, R8
  7994. SHLL $0x05, R8
  7995. ORL R8, SI
  7996. MOVB SI, (AX)
  7997. ADDQ $0x02, AX
  7998. SUBL $0x08, R12
  7999. // emitRepeat
  8000. LEAL -4(R12), R12
  8001. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8002. MOVL R12, SI
  8003. LEAL -4(R12), R12
  8004. CMPL SI, $0x08
  8005. JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8006. CMPL SI, $0x0c
  8007. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8008. CMPL R8, $0x00000800
  8009. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8010. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8011. CMPL R12, $0x00000104
  8012. JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
  8013. LEAL -256(R12), R12
  8014. MOVW $0x0019, (AX)
  8015. MOVW R12, 2(AX)
  8016. ADDQ $0x04, AX
  8017. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8018. repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8019. LEAL -4(R12), R12
  8020. MOVW $0x0015, (AX)
  8021. MOVB R12, 2(AX)
  8022. ADDQ $0x03, AX
  8023. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8024. repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8025. SHLL $0x02, R12
  8026. ORL $0x01, R12
  8027. MOVW R12, (AX)
  8028. ADDQ $0x02, AX
  8029. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8030. repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
  8031. XORQ SI, SI
  8032. LEAL 1(SI)(R12*4), R12
  8033. MOVB R8, 1(AX)
  8034. SARL $0x08, R8
  8035. SHLL $0x05, R8
  8036. ORL R8, R12
  8037. MOVB R12, (AX)
  8038. ADDQ $0x02, AX
  8039. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8040. long_offset_short_match_nolit_encodeBetterBlockAsm10B:
  8041. MOVB $0xee, (AX)
  8042. MOVW R8, 1(AX)
  8043. LEAL -60(R12), R12
  8044. ADDQ $0x03, AX
  8045. // emitRepeat
  8046. MOVL R12, SI
  8047. LEAL -4(R12), R12
  8048. CMPL SI, $0x08
  8049. JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8050. CMPL SI, $0x0c
  8051. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8052. CMPL R8, $0x00000800
  8053. JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8054. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8055. CMPL R12, $0x00000104
  8056. JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
  8057. LEAL -256(R12), R12
  8058. MOVW $0x0019, (AX)
  8059. MOVW R12, 2(AX)
  8060. ADDQ $0x04, AX
  8061. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8062. repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8063. LEAL -4(R12), R12
  8064. MOVW $0x0015, (AX)
  8065. MOVB R12, 2(AX)
  8066. ADDQ $0x03, AX
  8067. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8068. repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8069. SHLL $0x02, R12
  8070. ORL $0x01, R12
  8071. MOVW R12, (AX)
  8072. ADDQ $0x02, AX
  8073. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8074. repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
  8075. XORQ SI, SI
  8076. LEAL 1(SI)(R12*4), R12
  8077. MOVB R8, 1(AX)
  8078. SARL $0x08, R8
  8079. SHLL $0x05, R8
  8080. ORL R8, R12
  8081. MOVB R12, (AX)
  8082. ADDQ $0x02, AX
  8083. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8084. JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B
  8085. two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
  8086. CMPL R12, $0x0c
  8087. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
  8088. CMPL R8, $0x00000800
  8089. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
  8090. MOVB $0x01, BL
  8091. LEAL -16(BX)(R12*4), R12
  8092. MOVB R8, 1(AX)
  8093. SHRL $0x08, R8
  8094. SHLL $0x05, R8
  8095. ORL R8, R12
  8096. MOVB R12, (AX)
  8097. ADDQ $0x02, AX
  8098. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8099. emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
  8100. MOVB $0x02, BL
  8101. LEAL -4(BX)(R12*4), R12
  8102. MOVB R12, (AX)
  8103. MOVW R8, 1(AX)
  8104. ADDQ $0x03, AX
  8105. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8106. match_is_repeat_encodeBetterBlockAsm10B:
  8107. MOVL 12(SP), SI
  8108. CMPL SI, DI
  8109. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
  8110. MOVL DI, R9
  8111. MOVL DI, 12(SP)
  8112. LEAQ (DX)(SI*1), R10
  8113. SUBL SI, R9
  8114. LEAL -1(R9), SI
  8115. CMPL SI, $0x3c
  8116. JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
  8117. CMPL SI, $0x00000100
  8118. JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
  8119. MOVB $0xf4, (AX)
  8120. MOVW SI, 1(AX)
  8121. ADDQ $0x03, AX
  8122. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
  8123. two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
  8124. MOVB $0xf0, (AX)
  8125. MOVB SI, 1(AX)
  8126. ADDQ $0x02, AX
  8127. CMPL SI, $0x40
  8128. JL memmove_match_emit_repeat_encodeBetterBlockAsm10B
  8129. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
  8130. one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
  8131. SHLB $0x02, SI
  8132. MOVB SI, (AX)
  8133. ADDQ $0x01, AX
  8134. memmove_match_emit_repeat_encodeBetterBlockAsm10B:
  8135. LEAQ (AX)(R9*1), SI
  8136. // genMemMoveShort
  8137. CMPQ R9, $0x04
  8138. JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
  8139. CMPQ R9, $0x08
  8140. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
  8141. CMPQ R9, $0x10
  8142. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
  8143. CMPQ R9, $0x20
  8144. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
  8145. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
  8146. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
  8147. MOVL (R10), R11
  8148. MOVL R11, (AX)
  8149. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8150. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
  8151. MOVL (R10), R11
  8152. MOVL -4(R10)(R9*1), R10
  8153. MOVL R11, (AX)
  8154. MOVL R10, -4(AX)(R9*1)
  8155. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8156. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
  8157. MOVQ (R10), R11
  8158. MOVQ -8(R10)(R9*1), R10
  8159. MOVQ R11, (AX)
  8160. MOVQ R10, -8(AX)(R9*1)
  8161. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8162. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
  8163. MOVOU (R10), X0
  8164. MOVOU -16(R10)(R9*1), X1
  8165. MOVOU X0, (AX)
  8166. MOVOU X1, -16(AX)(R9*1)
  8167. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
  8168. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
  8169. MOVOU (R10), X0
  8170. MOVOU 16(R10), X1
  8171. MOVOU -32(R10)(R9*1), X2
  8172. MOVOU -16(R10)(R9*1), X3
  8173. MOVOU X0, (AX)
  8174. MOVOU X1, 16(AX)
  8175. MOVOU X2, -32(AX)(R9*1)
  8176. MOVOU X3, -16(AX)(R9*1)
  8177. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
  8178. MOVQ SI, AX
  8179. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
  8180. memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
  8181. LEAQ (AX)(R9*1), SI
  8182. // genMemMoveLong
  8183. MOVOU (R10), X0
  8184. MOVOU 16(R10), X1
  8185. MOVOU -32(R10)(R9*1), X2
  8186. MOVOU -16(R10)(R9*1), X3
  8187. MOVQ R9, R13
  8188. SHRQ $0x05, R13
  8189. MOVQ AX, R11
  8190. ANDL $0x0000001f, R11
  8191. MOVQ $0x00000040, R14
  8192. SUBQ R11, R14
  8193. DECQ R13
  8194. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8195. LEAQ -32(R10)(R14*1), R11
  8196. LEAQ -32(AX)(R14*1), R15
  8197. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
  8198. MOVOU (R11), X4
  8199. MOVOU 16(R11), X5
  8200. MOVOA X4, (R15)
  8201. MOVOA X5, 16(R15)
  8202. ADDQ $0x20, R15
  8203. ADDQ $0x20, R11
  8204. ADDQ $0x20, R14
  8205. DECQ R13
  8206. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
  8207. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  8208. MOVOU -32(R10)(R14*1), X4
  8209. MOVOU -16(R10)(R14*1), X5
  8210. MOVOA X4, -32(AX)(R14*1)
  8211. MOVOA X5, -16(AX)(R14*1)
  8212. ADDQ $0x20, R14
  8213. CMPQ R9, R14
  8214. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8215. MOVOU X0, (AX)
  8216. MOVOU X1, 16(AX)
  8217. MOVOU X2, -32(AX)(R9*1)
  8218. MOVOU X3, -16(AX)(R9*1)
  8219. MOVQ SI, AX
  8220. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
  8221. ADDL R12, CX
  8222. ADDL $0x04, R12
  8223. MOVL CX, 12(SP)
  8224. // emitRepeat
  8225. MOVL R12, SI
  8226. LEAL -4(R12), R12
  8227. CMPL SI, $0x08
  8228. JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
  8229. CMPL SI, $0x0c
  8230. JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
  8231. CMPL R8, $0x00000800
  8232. JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
  8233. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
  8234. CMPL R12, $0x00000104
  8235. JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
  8236. LEAL -256(R12), R12
  8237. MOVW $0x0019, (AX)
  8238. MOVW R12, 2(AX)
  8239. ADDQ $0x04, AX
  8240. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8241. repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
  8242. LEAL -4(R12), R12
  8243. MOVW $0x0015, (AX)
  8244. MOVB R12, 2(AX)
  8245. ADDQ $0x03, AX
  8246. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8247. repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
  8248. SHLL $0x02, R12
  8249. ORL $0x01, R12
  8250. MOVW R12, (AX)
  8251. ADDQ $0x02, AX
  8252. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
  8253. repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
  8254. XORQ SI, SI
  8255. LEAL 1(SI)(R12*4), R12
  8256. MOVB R8, 1(AX)
  8257. SARL $0x08, R8
  8258. SHLL $0x05, R8
  8259. ORL R8, R12
  8260. MOVB R12, (AX)
  8261. ADDQ $0x02, AX
  8262. match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
  8263. CMPL CX, 8(SP)
  8264. JGE emit_remainder_encodeBetterBlockAsm10B
  8265. CMPQ AX, (SP)
  8266. JL match_nolit_dst_ok_encodeBetterBlockAsm10B
  8267. MOVQ $0x00000000, ret+48(FP)
  8268. RET
  8269. match_nolit_dst_ok_encodeBetterBlockAsm10B:
  8270. MOVQ $0x0000cf1bbcdcbf9b, SI
  8271. MOVQ $0x9e3779b1, R8
  8272. INCL DI
  8273. MOVQ (DX)(DI*1), R9
  8274. MOVQ R9, R10
  8275. MOVQ R9, R11
  8276. MOVQ R9, R12
  8277. SHRQ $0x08, R11
  8278. MOVQ R11, R13
  8279. SHRQ $0x10, R12
  8280. LEAL 1(DI), R14
  8281. LEAL 2(DI), R15
  8282. MOVQ -2(DX)(CX*1), R9
  8283. SHLQ $0x10, R10
  8284. IMULQ SI, R10
  8285. SHRQ $0x34, R10
  8286. SHLQ $0x10, R13
  8287. IMULQ SI, R13
  8288. SHRQ $0x34, R13
  8289. SHLQ $0x20, R11
  8290. IMULQ R8, R11
  8291. SHRQ $0x36, R11
  8292. SHLQ $0x20, R12
  8293. IMULQ R8, R12
  8294. SHRQ $0x36, R12
  8295. MOVL DI, 24(SP)(R10*4)
  8296. MOVL R14, 24(SP)(R13*4)
  8297. MOVL R14, 16408(SP)(R11*4)
  8298. MOVL R15, 16408(SP)(R12*4)
  8299. MOVQ R9, R10
  8300. MOVQ R9, R11
  8301. SHRQ $0x08, R11
  8302. MOVQ R11, R13
  8303. LEAL -2(CX), R9
  8304. LEAL -1(CX), DI
  8305. SHLQ $0x10, R10
  8306. IMULQ SI, R10
  8307. SHRQ $0x34, R10
  8308. SHLQ $0x20, R11
  8309. IMULQ R8, R11
  8310. SHRQ $0x36, R11
  8311. SHLQ $0x10, R13
  8312. IMULQ SI, R13
  8313. SHRQ $0x34, R13
  8314. MOVL R9, 24(SP)(R10*4)
  8315. MOVL DI, 16408(SP)(R11*4)
  8316. MOVL DI, 24(SP)(R13*4)
  8317. JMP search_loop_encodeBetterBlockAsm10B
  8318. emit_remainder_encodeBetterBlockAsm10B:
  8319. MOVQ src_len+32(FP), CX
  8320. SUBL 12(SP), CX
  8321. LEAQ 3(AX)(CX*1), CX
  8322. CMPQ CX, (SP)
  8323. JL emit_remainder_ok_encodeBetterBlockAsm10B
  8324. MOVQ $0x00000000, ret+48(FP)
  8325. RET
  8326. emit_remainder_ok_encodeBetterBlockAsm10B:
  8327. MOVQ src_len+32(FP), CX
  8328. MOVL 12(SP), BX
  8329. CMPL BX, CX
  8330. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
  8331. MOVL CX, SI
  8332. MOVL CX, 12(SP)
  8333. LEAQ (DX)(BX*1), CX
  8334. SUBL BX, SI
  8335. LEAL -1(SI), DX
  8336. CMPL DX, $0x3c
  8337. JLT one_byte_emit_remainder_encodeBetterBlockAsm10B
  8338. CMPL DX, $0x00000100
  8339. JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B
  8340. MOVB $0xf4, (AX)
  8341. MOVW DX, 1(AX)
  8342. ADDQ $0x03, AX
  8343. JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
  8344. two_bytes_emit_remainder_encodeBetterBlockAsm10B:
  8345. MOVB $0xf0, (AX)
  8346. MOVB DL, 1(AX)
  8347. ADDQ $0x02, AX
  8348. CMPL DX, $0x40
  8349. JL memmove_emit_remainder_encodeBetterBlockAsm10B
  8350. JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
  8351. one_byte_emit_remainder_encodeBetterBlockAsm10B:
  8352. SHLB $0x02, DL
  8353. MOVB DL, (AX)
  8354. ADDQ $0x01, AX
  8355. memmove_emit_remainder_encodeBetterBlockAsm10B:
  8356. LEAQ (AX)(SI*1), DX
  8357. MOVL SI, BX
  8358. // genMemMoveShort
  8359. CMPQ BX, $0x03
  8360. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
  8361. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
  8362. CMPQ BX, $0x08
  8363. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
  8364. CMPQ BX, $0x10
  8365. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
  8366. CMPQ BX, $0x20
  8367. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
  8368. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
  8369. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
  8370. MOVB (CX), SI
  8371. MOVB -1(CX)(BX*1), CL
  8372. MOVB SI, (AX)
  8373. MOVB CL, -1(AX)(BX*1)
  8374. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8375. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
  8376. MOVW (CX), SI
  8377. MOVB 2(CX), CL
  8378. MOVW SI, (AX)
  8379. MOVB CL, 2(AX)
  8380. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8381. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
  8382. MOVL (CX), SI
  8383. MOVL -4(CX)(BX*1), CX
  8384. MOVL SI, (AX)
  8385. MOVL CX, -4(AX)(BX*1)
  8386. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8387. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
  8388. MOVQ (CX), SI
  8389. MOVQ -8(CX)(BX*1), CX
  8390. MOVQ SI, (AX)
  8391. MOVQ CX, -8(AX)(BX*1)
  8392. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8393. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
  8394. MOVOU (CX), X0
  8395. MOVOU -16(CX)(BX*1), X1
  8396. MOVOU X0, (AX)
  8397. MOVOU X1, -16(AX)(BX*1)
  8398. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
  8399. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
  8400. MOVOU (CX), X0
  8401. MOVOU 16(CX), X1
  8402. MOVOU -32(CX)(BX*1), X2
  8403. MOVOU -16(CX)(BX*1), X3
  8404. MOVOU X0, (AX)
  8405. MOVOU X1, 16(AX)
  8406. MOVOU X2, -32(AX)(BX*1)
  8407. MOVOU X3, -16(AX)(BX*1)
  8408. memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
  8409. MOVQ DX, AX
  8410. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
  8411. memmove_long_emit_remainder_encodeBetterBlockAsm10B:
  8412. LEAQ (AX)(SI*1), DX
  8413. MOVL SI, BX
  8414. // genMemMoveLong
  8415. MOVOU (CX), X0
  8416. MOVOU 16(CX), X1
  8417. MOVOU -32(CX)(BX*1), X2
  8418. MOVOU -16(CX)(BX*1), X3
  8419. MOVQ BX, DI
  8420. SHRQ $0x05, DI
  8421. MOVQ AX, SI
  8422. ANDL $0x0000001f, SI
  8423. MOVQ $0x00000040, R8
  8424. SUBQ SI, R8
  8425. DECQ DI
  8426. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8427. LEAQ -32(CX)(R8*1), SI
  8428. LEAQ -32(AX)(R8*1), R9
  8429. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
  8430. MOVOU (SI), X4
  8431. MOVOU 16(SI), X5
  8432. MOVOA X4, (R9)
  8433. MOVOA X5, 16(R9)
  8434. ADDQ $0x20, R9
  8435. ADDQ $0x20, SI
  8436. ADDQ $0x20, R8
  8437. DECQ DI
  8438. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
  8439. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
  8440. MOVOU -32(CX)(R8*1), X4
  8441. MOVOU -16(CX)(R8*1), X5
  8442. MOVOA X4, -32(AX)(R8*1)
  8443. MOVOA X5, -16(AX)(R8*1)
  8444. ADDQ $0x20, R8
  8445. CMPQ BX, R8
  8446. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
  8447. MOVOU X0, (AX)
  8448. MOVOU X1, 16(AX)
  8449. MOVOU X2, -32(AX)(BX*1)
  8450. MOVOU X3, -16(AX)(BX*1)
  8451. MOVQ DX, AX
  8452. emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
  8453. MOVQ dst_base+0(FP), CX
  8454. SUBQ CX, AX
  8455. MOVQ AX, ret+48(FP)
  8456. RET
  8457. // func encodeBetterBlockAsm8B(dst []byte, src []byte) int
  8458. // Requires: BMI, SSE2
  8459. TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
  8460. MOVQ dst_base+0(FP), AX
  8461. MOVQ $0x00000028, CX
  8462. LEAQ 24(SP), DX
  8463. PXOR X0, X0
  8464. zero_loop_encodeBetterBlockAsm8B:
  8465. MOVOU X0, (DX)
  8466. MOVOU X0, 16(DX)
  8467. MOVOU X0, 32(DX)
  8468. MOVOU X0, 48(DX)
  8469. MOVOU X0, 64(DX)
  8470. MOVOU X0, 80(DX)
  8471. MOVOU X0, 96(DX)
  8472. MOVOU X0, 112(DX)
  8473. ADDQ $0x80, DX
  8474. DECQ CX
  8475. JNZ zero_loop_encodeBetterBlockAsm8B
  8476. MOVL $0x00000000, 12(SP)
  8477. MOVQ src_len+32(FP), CX
  8478. LEAQ -6(CX), DX
  8479. LEAQ -8(CX), SI
  8480. MOVL SI, 8(SP)
  8481. SHRQ $0x05, CX
  8482. SUBL CX, DX
  8483. LEAQ (AX)(DX*1), DX
  8484. MOVQ DX, (SP)
  8485. MOVL $0x00000001, CX
  8486. MOVL $0x00000000, 16(SP)
  8487. MOVQ src_base+24(FP), DX
  8488. search_loop_encodeBetterBlockAsm8B:
  8489. MOVL CX, SI
  8490. SUBL 12(SP), SI
  8491. SHRL $0x04, SI
  8492. LEAL 1(CX)(SI*1), SI
  8493. CMPL SI, 8(SP)
  8494. JGE emit_remainder_encodeBetterBlockAsm8B
  8495. MOVQ (DX)(CX*1), DI
  8496. MOVL SI, 20(SP)
  8497. MOVQ $0x0000cf1bbcdcbf9b, R9
  8498. MOVQ $0x9e3779b1, SI
  8499. MOVQ DI, R10
  8500. MOVQ DI, R11
  8501. SHLQ $0x10, R10
  8502. IMULQ R9, R10
  8503. SHRQ $0x36, R10
  8504. SHLQ $0x20, R11
  8505. IMULQ SI, R11
  8506. SHRQ $0x38, R11
  8507. MOVL 24(SP)(R10*4), SI
  8508. MOVL 4120(SP)(R11*4), R8
  8509. MOVL CX, 24(SP)(R10*4)
  8510. MOVL CX, 4120(SP)(R11*4)
  8511. CMPL (DX)(SI*1), DI
  8512. JEQ candidate_match_encodeBetterBlockAsm8B
  8513. CMPL (DX)(R8*1), DI
  8514. JEQ candidateS_match_encodeBetterBlockAsm8B
  8515. MOVL 20(SP), CX
  8516. JMP search_loop_encodeBetterBlockAsm8B
  8517. candidateS_match_encodeBetterBlockAsm8B:
  8518. SHRQ $0x08, DI
  8519. MOVQ DI, R10
  8520. SHLQ $0x10, R10
  8521. IMULQ R9, R10
  8522. SHRQ $0x36, R10
  8523. MOVL 24(SP)(R10*4), SI
  8524. INCL CX
  8525. MOVL CX, 24(SP)(R10*4)
  8526. CMPL (DX)(SI*1), DI
  8527. JEQ candidate_match_encodeBetterBlockAsm8B
  8528. DECL CX
  8529. MOVL R8, SI
  8530. candidate_match_encodeBetterBlockAsm8B:
  8531. MOVL 12(SP), DI
  8532. TESTL SI, SI
  8533. JZ match_extend_back_end_encodeBetterBlockAsm8B
  8534. match_extend_back_loop_encodeBetterBlockAsm8B:
  8535. CMPL CX, DI
  8536. JLE match_extend_back_end_encodeBetterBlockAsm8B
  8537. MOVB -1(DX)(SI*1), BL
  8538. MOVB -1(DX)(CX*1), R8
  8539. CMPB BL, R8
  8540. JNE match_extend_back_end_encodeBetterBlockAsm8B
  8541. LEAL -1(CX), CX
  8542. DECL SI
  8543. JZ match_extend_back_end_encodeBetterBlockAsm8B
  8544. JMP match_extend_back_loop_encodeBetterBlockAsm8B
  8545. match_extend_back_end_encodeBetterBlockAsm8B:
  8546. MOVL CX, DI
  8547. SUBL 12(SP), DI
  8548. LEAQ 3(AX)(DI*1), DI
  8549. CMPQ DI, (SP)
  8550. JL match_dst_size_check_encodeBetterBlockAsm8B
  8551. MOVQ $0x00000000, ret+48(FP)
  8552. RET
  8553. match_dst_size_check_encodeBetterBlockAsm8B:
  8554. MOVL CX, DI
  8555. ADDL $0x04, CX
  8556. ADDL $0x04, SI
  8557. MOVQ src_len+32(FP), R8
  8558. SUBL CX, R8
  8559. LEAQ (DX)(CX*1), R9
  8560. LEAQ (DX)(SI*1), R10
  8561. // matchLen
  8562. XORL R12, R12
  8563. CMPL R8, $0x08
  8564. JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B
  8565. matchlen_loopback_match_nolit_encodeBetterBlockAsm8B:
  8566. MOVQ (R9)(R12*1), R11
  8567. XORQ (R10)(R12*1), R11
  8568. TESTQ R11, R11
  8569. JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B
  8570. #ifdef GOAMD64_v3
  8571. TZCNTQ R11, R11
  8572. #else
  8573. BSFQ R11, R11
  8574. #endif
  8575. SARQ $0x03, R11
  8576. LEAL (R12)(R11*1), R12
  8577. JMP match_nolit_end_encodeBetterBlockAsm8B
  8578. matchlen_loop_match_nolit_encodeBetterBlockAsm8B:
  8579. LEAL -8(R8), R8
  8580. LEAL 8(R12), R12
  8581. CMPL R8, $0x08
  8582. JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B
  8583. JZ match_nolit_end_encodeBetterBlockAsm8B
  8584. matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
  8585. CMPL R8, $0x04
  8586. JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B
  8587. MOVL (R9)(R12*1), R11
  8588. CMPL (R10)(R12*1), R11
  8589. JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
  8590. SUBL $0x04, R8
  8591. LEAL 4(R12), R12
  8592. matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
  8593. CMPL R8, $0x02
  8594. JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B
  8595. MOVW (R9)(R12*1), R11
  8596. CMPW (R10)(R12*1), R11
  8597. JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
  8598. SUBL $0x02, R8
  8599. LEAL 2(R12), R12
  8600. matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
  8601. CMPL R8, $0x01
  8602. JL match_nolit_end_encodeBetterBlockAsm8B
  8603. MOVB (R9)(R12*1), R11
  8604. CMPB (R10)(R12*1), R11
  8605. JNE match_nolit_end_encodeBetterBlockAsm8B
  8606. LEAL 1(R12), R12
  8607. match_nolit_end_encodeBetterBlockAsm8B:
  8608. MOVL CX, R8
  8609. SUBL SI, R8
  8610. // Check if repeat
  8611. CMPL 16(SP), R8
  8612. JEQ match_is_repeat_encodeBetterBlockAsm8B
  8613. MOVL R8, 16(SP)
  8614. MOVL 12(SP), SI
  8615. CMPL SI, DI
  8616. JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
  8617. MOVL DI, R9
  8618. MOVL DI, 12(SP)
  8619. LEAQ (DX)(SI*1), R10
  8620. SUBL SI, R9
  8621. LEAL -1(R9), SI
  8622. CMPL SI, $0x3c
  8623. JLT one_byte_match_emit_encodeBetterBlockAsm8B
  8624. CMPL SI, $0x00000100
  8625. JLT two_bytes_match_emit_encodeBetterBlockAsm8B
  8626. MOVB $0xf4, (AX)
  8627. MOVW SI, 1(AX)
  8628. ADDQ $0x03, AX
  8629. JMP memmove_long_match_emit_encodeBetterBlockAsm8B
  8630. two_bytes_match_emit_encodeBetterBlockAsm8B:
  8631. MOVB $0xf0, (AX)
  8632. MOVB SI, 1(AX)
  8633. ADDQ $0x02, AX
  8634. CMPL SI, $0x40
  8635. JL memmove_match_emit_encodeBetterBlockAsm8B
  8636. JMP memmove_long_match_emit_encodeBetterBlockAsm8B
  8637. one_byte_match_emit_encodeBetterBlockAsm8B:
  8638. SHLB $0x02, SI
  8639. MOVB SI, (AX)
  8640. ADDQ $0x01, AX
  8641. memmove_match_emit_encodeBetterBlockAsm8B:
  8642. LEAQ (AX)(R9*1), SI
  8643. // genMemMoveShort
  8644. CMPQ R9, $0x04
  8645. JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
  8646. CMPQ R9, $0x08
  8647. JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
  8648. CMPQ R9, $0x10
  8649. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
  8650. CMPQ R9, $0x20
  8651. JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
  8652. JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
  8653. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
  8654. MOVL (R10), R11
  8655. MOVL R11, (AX)
  8656. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8657. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
  8658. MOVL (R10), R11
  8659. MOVL -4(R10)(R9*1), R10
  8660. MOVL R11, (AX)
  8661. MOVL R10, -4(AX)(R9*1)
  8662. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8663. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
  8664. MOVQ (R10), R11
  8665. MOVQ -8(R10)(R9*1), R10
  8666. MOVQ R11, (AX)
  8667. MOVQ R10, -8(AX)(R9*1)
  8668. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8669. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
  8670. MOVOU (R10), X0
  8671. MOVOU -16(R10)(R9*1), X1
  8672. MOVOU X0, (AX)
  8673. MOVOU X1, -16(AX)(R9*1)
  8674. JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
  8675. emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
  8676. MOVOU (R10), X0
  8677. MOVOU 16(R10), X1
  8678. MOVOU -32(R10)(R9*1), X2
  8679. MOVOU -16(R10)(R9*1), X3
  8680. MOVOU X0, (AX)
  8681. MOVOU X1, 16(AX)
  8682. MOVOU X2, -32(AX)(R9*1)
  8683. MOVOU X3, -16(AX)(R9*1)
  8684. memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
  8685. MOVQ SI, AX
  8686. JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
  8687. memmove_long_match_emit_encodeBetterBlockAsm8B:
  8688. LEAQ (AX)(R9*1), SI
  8689. // genMemMoveLong
  8690. MOVOU (R10), X0
  8691. MOVOU 16(R10), X1
  8692. MOVOU -32(R10)(R9*1), X2
  8693. MOVOU -16(R10)(R9*1), X3
  8694. MOVQ R9, R13
  8695. SHRQ $0x05, R13
  8696. MOVQ AX, R11
  8697. ANDL $0x0000001f, R11
  8698. MOVQ $0x00000040, R14
  8699. SUBQ R11, R14
  8700. DECQ R13
  8701. JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  8702. LEAQ -32(R10)(R14*1), R11
  8703. LEAQ -32(AX)(R14*1), R15
  8704. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
  8705. MOVOU (R11), X4
  8706. MOVOU 16(R11), X5
  8707. MOVOA X4, (R15)
  8708. MOVOA X5, 16(R15)
  8709. ADDQ $0x20, R15
  8710. ADDQ $0x20, R11
  8711. ADDQ $0x20, R14
  8712. DECQ R13
  8713. JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
  8714. emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  8715. MOVOU -32(R10)(R14*1), X4
  8716. MOVOU -16(R10)(R14*1), X5
  8717. MOVOA X4, -32(AX)(R14*1)
  8718. MOVOA X5, -16(AX)(R14*1)
  8719. ADDQ $0x20, R14
  8720. CMPQ R9, R14
  8721. JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  8722. MOVOU X0, (AX)
  8723. MOVOU X1, 16(AX)
  8724. MOVOU X2, -32(AX)(R9*1)
  8725. MOVOU X3, -16(AX)(R9*1)
  8726. MOVQ SI, AX
  8727. emit_literal_done_match_emit_encodeBetterBlockAsm8B:
  8728. ADDL R12, CX
  8729. ADDL $0x04, R12
  8730. MOVL CX, 12(SP)
  8731. // emitCopy
  8732. two_byte_offset_match_nolit_encodeBetterBlockAsm8B:
  8733. CMPL R12, $0x40
  8734. JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
  8735. CMPL R8, $0x00000800
  8736. JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
  8737. MOVL $0x00000001, SI
  8738. LEAL 16(SI), SI
  8739. MOVB R8, 1(AX)
  8740. SHRL $0x08, R8
  8741. SHLL $0x05, R8
  8742. ORL R8, SI
  8743. MOVB SI, (AX)
  8744. ADDQ $0x02, AX
  8745. SUBL $0x08, R12
  8746. // emitRepeat
  8747. LEAL -4(R12), R12
  8748. JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  8749. MOVL R12, SI
  8750. LEAL -4(R12), R12
  8751. CMPL SI, $0x08
  8752. JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  8753. CMPL SI, $0x0c
  8754. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  8755. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  8756. CMPL R12, $0x00000104
  8757. JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
  8758. LEAL -256(R12), R12
  8759. MOVW $0x0019, (AX)
  8760. MOVW R12, 2(AX)
  8761. ADDQ $0x04, AX
  8762. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8763. repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  8764. LEAL -4(R12), R12
  8765. MOVW $0x0015, (AX)
  8766. MOVB R12, 2(AX)
  8767. ADDQ $0x03, AX
  8768. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8769. repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
  8770. SHLL $0x02, R12
  8771. ORL $0x01, R12
  8772. MOVW R12, (AX)
  8773. ADDQ $0x02, AX
  8774. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8775. XORQ SI, SI
  8776. LEAL 1(SI)(R12*4), R12
  8777. MOVB R8, 1(AX)
  8778. SARL $0x08, R8
  8779. SHLL $0x05, R8
  8780. ORL R8, R12
  8781. MOVB R12, (AX)
  8782. ADDQ $0x02, AX
  8783. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8784. long_offset_short_match_nolit_encodeBetterBlockAsm8B:
  8785. MOVB $0xee, (AX)
  8786. MOVW R8, 1(AX)
  8787. LEAL -60(R12), R12
  8788. ADDQ $0x03, AX
  8789. // emitRepeat
  8790. MOVL R12, SI
  8791. LEAL -4(R12), R12
  8792. CMPL SI, $0x08
  8793. JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  8794. CMPL SI, $0x0c
  8795. JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  8796. cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  8797. CMPL R12, $0x00000104
  8798. JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
  8799. LEAL -256(R12), R12
  8800. MOVW $0x0019, (AX)
  8801. MOVW R12, 2(AX)
  8802. ADDQ $0x04, AX
  8803. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8804. repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  8805. LEAL -4(R12), R12
  8806. MOVW $0x0015, (AX)
  8807. MOVB R12, 2(AX)
  8808. ADDQ $0x03, AX
  8809. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8810. repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
  8811. SHLL $0x02, R12
  8812. ORL $0x01, R12
  8813. MOVW R12, (AX)
  8814. ADDQ $0x02, AX
  8815. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8816. XORQ SI, SI
  8817. LEAL 1(SI)(R12*4), R12
  8818. MOVB R8, 1(AX)
  8819. SARL $0x08, R8
  8820. SHLL $0x05, R8
  8821. ORL R8, R12
  8822. MOVB R12, (AX)
  8823. ADDQ $0x02, AX
  8824. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8825. JMP two_byte_offset_match_nolit_encodeBetterBlockAsm8B
  8826. two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
  8827. CMPL R12, $0x0c
  8828. JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
  8829. MOVB $0x01, BL
  8830. LEAL -16(BX)(R12*4), R12
  8831. MOVB R8, 1(AX)
  8832. SHRL $0x08, R8
  8833. SHLL $0x05, R8
  8834. ORL R8, R12
  8835. MOVB R12, (AX)
  8836. ADDQ $0x02, AX
  8837. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8838. emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
  8839. MOVB $0x02, BL
  8840. LEAL -4(BX)(R12*4), R12
  8841. MOVB R12, (AX)
  8842. MOVW R8, 1(AX)
  8843. ADDQ $0x03, AX
  8844. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8845. match_is_repeat_encodeBetterBlockAsm8B:
  8846. MOVL 12(SP), SI
  8847. CMPL SI, DI
  8848. JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
  8849. MOVL DI, R8
  8850. MOVL DI, 12(SP)
  8851. LEAQ (DX)(SI*1), R9
  8852. SUBL SI, R8
  8853. LEAL -1(R8), SI
  8854. CMPL SI, $0x3c
  8855. JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B
  8856. CMPL SI, $0x00000100
  8857. JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
  8858. MOVB $0xf4, (AX)
  8859. MOVW SI, 1(AX)
  8860. ADDQ $0x03, AX
  8861. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
  8862. two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
  8863. MOVB $0xf0, (AX)
  8864. MOVB SI, 1(AX)
  8865. ADDQ $0x02, AX
  8866. CMPL SI, $0x40
  8867. JL memmove_match_emit_repeat_encodeBetterBlockAsm8B
  8868. JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
  8869. one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
  8870. SHLB $0x02, SI
  8871. MOVB SI, (AX)
  8872. ADDQ $0x01, AX
  8873. memmove_match_emit_repeat_encodeBetterBlockAsm8B:
  8874. LEAQ (AX)(R8*1), SI
  8875. // genMemMoveShort
  8876. CMPQ R8, $0x04
  8877. JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
  8878. CMPQ R8, $0x08
  8879. JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
  8880. CMPQ R8, $0x10
  8881. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
  8882. CMPQ R8, $0x20
  8883. JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
  8884. JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
  8885. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
  8886. MOVL (R9), R10
  8887. MOVL R10, (AX)
  8888. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  8889. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
  8890. MOVL (R9), R10
  8891. MOVL -4(R9)(R8*1), R9
  8892. MOVL R10, (AX)
  8893. MOVL R9, -4(AX)(R8*1)
  8894. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  8895. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
  8896. MOVQ (R9), R10
  8897. MOVQ -8(R9)(R8*1), R9
  8898. MOVQ R10, (AX)
  8899. MOVQ R9, -8(AX)(R8*1)
  8900. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  8901. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
  8902. MOVOU (R9), X0
  8903. MOVOU -16(R9)(R8*1), X1
  8904. MOVOU X0, (AX)
  8905. MOVOU X1, -16(AX)(R8*1)
  8906. JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
  8907. emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
  8908. MOVOU (R9), X0
  8909. MOVOU 16(R9), X1
  8910. MOVOU -32(R9)(R8*1), X2
  8911. MOVOU -16(R9)(R8*1), X3
  8912. MOVOU X0, (AX)
  8913. MOVOU X1, 16(AX)
  8914. MOVOU X2, -32(AX)(R8*1)
  8915. MOVOU X3, -16(AX)(R8*1)
  8916. memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
  8917. MOVQ SI, AX
  8918. JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
  8919. memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
  8920. LEAQ (AX)(R8*1), SI
  8921. // genMemMoveLong
  8922. MOVOU (R9), X0
  8923. MOVOU 16(R9), X1
  8924. MOVOU -32(R9)(R8*1), X2
  8925. MOVOU -16(R9)(R8*1), X3
  8926. MOVQ R8, R11
  8927. SHRQ $0x05, R11
  8928. MOVQ AX, R10
  8929. ANDL $0x0000001f, R10
  8930. MOVQ $0x00000040, R13
  8931. SUBQ R10, R13
  8932. DECQ R11
  8933. JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  8934. LEAQ -32(R9)(R13*1), R10
  8935. LEAQ -32(AX)(R13*1), R14
  8936. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
  8937. MOVOU (R10), X4
  8938. MOVOU 16(R10), X5
  8939. MOVOA X4, (R14)
  8940. MOVOA X5, 16(R14)
  8941. ADDQ $0x20, R14
  8942. ADDQ $0x20, R10
  8943. ADDQ $0x20, R13
  8944. DECQ R11
  8945. JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
  8946. emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  8947. MOVOU -32(R9)(R13*1), X4
  8948. MOVOU -16(R9)(R13*1), X5
  8949. MOVOA X4, -32(AX)(R13*1)
  8950. MOVOA X5, -16(AX)(R13*1)
  8951. ADDQ $0x20, R13
  8952. CMPQ R8, R13
  8953. JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  8954. MOVOU X0, (AX)
  8955. MOVOU X1, 16(AX)
  8956. MOVOU X2, -32(AX)(R8*1)
  8957. MOVOU X3, -16(AX)(R8*1)
  8958. MOVQ SI, AX
  8959. emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
  8960. ADDL R12, CX
  8961. ADDL $0x04, R12
  8962. MOVL CX, 12(SP)
  8963. // emitRepeat
  8964. MOVL R12, SI
  8965. LEAL -4(R12), R12
  8966. CMPL SI, $0x08
  8967. JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
  8968. CMPL SI, $0x0c
  8969. JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
  8970. cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
  8971. CMPL R12, $0x00000104
  8972. JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
  8973. LEAL -256(R12), R12
  8974. MOVW $0x0019, (AX)
  8975. MOVW R12, 2(AX)
  8976. ADDQ $0x04, AX
  8977. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8978. repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
  8979. LEAL -4(R12), R12
  8980. MOVW $0x0015, (AX)
  8981. MOVB R12, 2(AX)
  8982. ADDQ $0x03, AX
  8983. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8984. repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
  8985. SHLL $0x02, R12
  8986. ORL $0x01, R12
  8987. MOVW R12, (AX)
  8988. ADDQ $0x02, AX
  8989. JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
  8990. XORQ SI, SI
  8991. LEAL 1(SI)(R12*4), R12
  8992. MOVB R8, 1(AX)
  8993. SARL $0x08, R8
  8994. SHLL $0x05, R8
  8995. ORL R8, R12
  8996. MOVB R12, (AX)
  8997. ADDQ $0x02, AX
  8998. match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
  8999. CMPL CX, 8(SP)
  9000. JGE emit_remainder_encodeBetterBlockAsm8B
  9001. CMPQ AX, (SP)
  9002. JL match_nolit_dst_ok_encodeBetterBlockAsm8B
  9003. MOVQ $0x00000000, ret+48(FP)
  9004. RET
  9005. match_nolit_dst_ok_encodeBetterBlockAsm8B:
  9006. MOVQ $0x0000cf1bbcdcbf9b, SI
  9007. MOVQ $0x9e3779b1, R8
  9008. INCL DI
  9009. MOVQ (DX)(DI*1), R9
  9010. MOVQ R9, R10
  9011. MOVQ R9, R11
  9012. MOVQ R9, R12
  9013. SHRQ $0x08, R11
  9014. MOVQ R11, R13
  9015. SHRQ $0x10, R12
  9016. LEAL 1(DI), R14
  9017. LEAL 2(DI), R15
  9018. MOVQ -2(DX)(CX*1), R9
  9019. SHLQ $0x10, R10
  9020. IMULQ SI, R10
  9021. SHRQ $0x36, R10
  9022. SHLQ $0x10, R13
  9023. IMULQ SI, R13
  9024. SHRQ $0x36, R13
  9025. SHLQ $0x20, R11
  9026. IMULQ R8, R11
  9027. SHRQ $0x38, R11
  9028. SHLQ $0x20, R12
  9029. IMULQ R8, R12
  9030. SHRQ $0x38, R12
  9031. MOVL DI, 24(SP)(R10*4)
  9032. MOVL R14, 24(SP)(R13*4)
  9033. MOVL R14, 4120(SP)(R11*4)
  9034. MOVL R15, 4120(SP)(R12*4)
  9035. MOVQ R9, R10
  9036. MOVQ R9, R11
  9037. SHRQ $0x08, R11
  9038. MOVQ R11, R13
  9039. LEAL -2(CX), R9
  9040. LEAL -1(CX), DI
  9041. SHLQ $0x10, R10
  9042. IMULQ SI, R10
  9043. SHRQ $0x36, R10
  9044. SHLQ $0x20, R11
  9045. IMULQ R8, R11
  9046. SHRQ $0x38, R11
  9047. SHLQ $0x10, R13
  9048. IMULQ SI, R13
  9049. SHRQ $0x36, R13
  9050. MOVL R9, 24(SP)(R10*4)
  9051. MOVL DI, 4120(SP)(R11*4)
  9052. MOVL DI, 24(SP)(R13*4)
  9053. JMP search_loop_encodeBetterBlockAsm8B
  9054. emit_remainder_encodeBetterBlockAsm8B:
  9055. MOVQ src_len+32(FP), CX
  9056. SUBL 12(SP), CX
  9057. LEAQ 3(AX)(CX*1), CX
  9058. CMPQ CX, (SP)
  9059. JL emit_remainder_ok_encodeBetterBlockAsm8B
  9060. MOVQ $0x00000000, ret+48(FP)
  9061. RET
  9062. emit_remainder_ok_encodeBetterBlockAsm8B:
  9063. MOVQ src_len+32(FP), CX
  9064. MOVL 12(SP), BX
  9065. CMPL BX, CX
  9066. JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
  9067. MOVL CX, SI
  9068. MOVL CX, 12(SP)
  9069. LEAQ (DX)(BX*1), CX
  9070. SUBL BX, SI
  9071. LEAL -1(SI), DX
  9072. CMPL DX, $0x3c
  9073. JLT one_byte_emit_remainder_encodeBetterBlockAsm8B
  9074. CMPL DX, $0x00000100
  9075. JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B
  9076. MOVB $0xf4, (AX)
  9077. MOVW DX, 1(AX)
  9078. ADDQ $0x03, AX
  9079. JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
  9080. two_bytes_emit_remainder_encodeBetterBlockAsm8B:
  9081. MOVB $0xf0, (AX)
  9082. MOVB DL, 1(AX)
  9083. ADDQ $0x02, AX
  9084. CMPL DX, $0x40
  9085. JL memmove_emit_remainder_encodeBetterBlockAsm8B
  9086. JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
  9087. one_byte_emit_remainder_encodeBetterBlockAsm8B:
  9088. SHLB $0x02, DL
  9089. MOVB DL, (AX)
  9090. ADDQ $0x01, AX
  9091. memmove_emit_remainder_encodeBetterBlockAsm8B:
  9092. LEAQ (AX)(SI*1), DX
  9093. MOVL SI, BX
  9094. // genMemMoveShort
  9095. CMPQ BX, $0x03
  9096. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
  9097. JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
  9098. CMPQ BX, $0x08
  9099. JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
  9100. CMPQ BX, $0x10
  9101. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
  9102. CMPQ BX, $0x20
  9103. JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
  9104. JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
  9105. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
  9106. MOVB (CX), SI
  9107. MOVB -1(CX)(BX*1), CL
  9108. MOVB SI, (AX)
  9109. MOVB CL, -1(AX)(BX*1)
  9110. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9111. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
  9112. MOVW (CX), SI
  9113. MOVB 2(CX), CL
  9114. MOVW SI, (AX)
  9115. MOVB CL, 2(AX)
  9116. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9117. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
  9118. MOVL (CX), SI
  9119. MOVL -4(CX)(BX*1), CX
  9120. MOVL SI, (AX)
  9121. MOVL CX, -4(AX)(BX*1)
  9122. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9123. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
  9124. MOVQ (CX), SI
  9125. MOVQ -8(CX)(BX*1), CX
  9126. MOVQ SI, (AX)
  9127. MOVQ CX, -8(AX)(BX*1)
  9128. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9129. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
  9130. MOVOU (CX), X0
  9131. MOVOU -16(CX)(BX*1), X1
  9132. MOVOU X0, (AX)
  9133. MOVOU X1, -16(AX)(BX*1)
  9134. JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
  9135. emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
  9136. MOVOU (CX), X0
  9137. MOVOU 16(CX), X1
  9138. MOVOU -32(CX)(BX*1), X2
  9139. MOVOU -16(CX)(BX*1), X3
  9140. MOVOU X0, (AX)
  9141. MOVOU X1, 16(AX)
  9142. MOVOU X2, -32(AX)(BX*1)
  9143. MOVOU X3, -16(AX)(BX*1)
  9144. memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
  9145. MOVQ DX, AX
  9146. JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
  9147. memmove_long_emit_remainder_encodeBetterBlockAsm8B:
  9148. LEAQ (AX)(SI*1), DX
  9149. MOVL SI, BX
  9150. // genMemMoveLong
  9151. MOVOU (CX), X0
  9152. MOVOU 16(CX), X1
  9153. MOVOU -32(CX)(BX*1), X2
  9154. MOVOU -16(CX)(BX*1), X3
  9155. MOVQ BX, DI
  9156. SHRQ $0x05, DI
  9157. MOVQ AX, SI
  9158. ANDL $0x0000001f, SI
  9159. MOVQ $0x00000040, R8
  9160. SUBQ SI, R8
  9161. DECQ DI
  9162. JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9163. LEAQ -32(CX)(R8*1), SI
  9164. LEAQ -32(AX)(R8*1), R9
  9165. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
  9166. MOVOU (SI), X4
  9167. MOVOU 16(SI), X5
  9168. MOVOA X4, (R9)
  9169. MOVOA X5, 16(R9)
  9170. ADDQ $0x20, R9
  9171. ADDQ $0x20, SI
  9172. ADDQ $0x20, R8
  9173. DECQ DI
  9174. JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
  9175. emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
  9176. MOVOU -32(CX)(R8*1), X4
  9177. MOVOU -16(CX)(R8*1), X5
  9178. MOVOA X4, -32(AX)(R8*1)
  9179. MOVOA X5, -16(AX)(R8*1)
  9180. ADDQ $0x20, R8
  9181. CMPQ BX, R8
  9182. JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
  9183. MOVOU X0, (AX)
  9184. MOVOU X1, 16(AX)
  9185. MOVOU X2, -32(AX)(BX*1)
  9186. MOVOU X3, -16(AX)(BX*1)
  9187. MOVQ DX, AX
  9188. emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
  9189. MOVQ dst_base+0(FP), CX
  9190. SUBQ CX, AX
  9191. MOVQ AX, ret+48(FP)
  9192. RET
  9193. // func encodeSnappyBlockAsm(dst []byte, src []byte) int
  9194. // Requires: BMI, SSE2
  //
  // Encodes src into dst and stores in ret the number of bytes written
  // (final write pointer minus dst_base). ret is set to 0 instead whenever one
  // of the output-bound checks against (SP) fails.
  //
  // Stack frame use (as established by the code below):
  //   (SP)    8 bytes  output limit pointer: dst_base + (src_len - 9 - src_len>>5)
  //   8(SP)   4 bytes  src_len - 8; searching stops once a position reaches it
  //   12(SP)  4 bytes  source index where the not-yet-emitted literal bytes begin
  //   16(SP)  4 bytes  offset of the most recent match (starts at 1)
  //   20(SP)  4 bytes  source index to resume searching from when nothing matches
  //   24(SP)  65536 bytes  table of 32-bit source indexes, addressed by a 14-bit
  //                        multiply-and-shift value of 6 source bytes
  //
  // Long-lived registers: AX = output write pointer, CX = current source index,
  // DX = src base pointer (reused as scratch only in the final remainder path).
  //
  // NOTE(review): source indexes are handled as 32-bit values and the search
  // uses 8-byte loads from src; presumably the Go caller guarantees suitable
  // src/dst sizes - nothing in this routine validates them. TODO confirm.
  9195. TEXT ·encodeSnappyBlockAsm(SB), $65560-56
  9196. MOVQ dst_base+0(FP), AX // AX = output write pointer
  9197. MOVQ $0x00000200, CX // 512 iterations x 128 bytes = 65536 bytes cleared
  9198. LEAQ 24(SP), DX // DX = start of the table
  9199. PXOR X0, X0 // X0 = 0
  9200. zero_loop_encodeSnappyBlockAsm: // clear the whole table, 128 bytes per pass
  9201. MOVOU X0, (DX)
  9202. MOVOU X0, 16(DX)
  9203. MOVOU X0, 32(DX)
  9204. MOVOU X0, 48(DX)
  9205. MOVOU X0, 64(DX)
  9206. MOVOU X0, 80(DX)
  9207. MOVOU X0, 96(DX)
  9208. MOVOU X0, 112(DX)
  9209. ADDQ $0x80, DX
  9210. DECQ CX
  9211. JNZ zero_loop_encodeSnappyBlockAsm
  9212. MOVL $0x00000000, 12(SP) // pending literals start at source index 0
  9213. MOVQ src_len+32(FP), CX
  9214. LEAQ -9(CX), DX
  9215. LEAQ -8(CX), SI
  9216. MOVL SI, 8(SP) // search limit = src_len - 8
  9217. SHRQ $0x05, CX
  9218. SUBL CX, DX // DX = src_len - 9 - (src_len >> 5)
  9219. LEAQ (AX)(DX*1), DX
  9220. MOVQ DX, (SP) // output limit pointer used by every bound check below
  9221. MOVL $0x00000001, CX // searching starts at source index 1
  9222. MOVL CX, 16(SP) // initial match offset = 1
  9223. MOVQ src_base+24(FP), DX // DX = src base pointer
  9224. search_loop_encodeSnappyBlockAsm: // CX = position to examine
  9225. MOVL CX, SI
  9226. SUBL 12(SP), SI
  9227. SHRL $0x06, SI
  9228. LEAL 4(CX)(SI*1), SI // next position = CX + 4 + ((CX - pending start) >> 6)
  9229. CMPL SI, 8(SP)
  9230. JGE emit_remainder_encodeSnappyBlockAsm // next position reaches the limit: flush the tail
  9231. MOVQ (DX)(CX*1), DI // DI = 8 source bytes at CX
  9232. MOVL SI, 20(SP) // remember the next position
  9233. MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the table index
  9234. MOVQ DI, R10
  9235. MOVQ DI, R11
  9236. SHRQ $0x08, R11 // R11 = bytes starting at CX+1
  9237. SHLQ $0x10, R10 // drop the two highest bytes so 6 bytes take part
  9238. IMULQ R9, R10
  9239. SHRQ $0x32, R10 // keep the top 14 bits: index for position CX
  9240. SHLQ $0x10, R11
  9241. IMULQ R9, R11
  9242. SHRQ $0x32, R11 // index for position CX+1
  9243. MOVL 24(SP)(R10*4), SI // SI = candidate stored for the bytes at CX
  9244. MOVL 24(SP)(R11*4), R8 // R8 = candidate stored for the bytes at CX+1
  9245. MOVL CX, 24(SP)(R10*4) // record CX in the table
  9246. LEAL 1(CX), R10
  9247. MOVL R10, 24(SP)(R11*4) // record CX+1 in the table
  9248. MOVQ DI, R10
  9249. SHRQ $0x10, R10 // bytes starting at CX+2
  9250. SHLQ $0x10, R10
  9251. IMULQ R9, R10
  9252. SHRQ $0x32, R10 // R10 = index for position CX+2 (used in no_repeat_found)
  9253. MOVL CX, R9
  9254. SUBL 16(SP), R9 // R9 = CX - last offset
  9255. MOVL 1(DX)(R9*1), R11 // 4 bytes at CX + 1 - offset
  9256. MOVQ DI, R9
  9257. SHRQ $0x08, R9 // low 32 bits = 4 bytes at CX+1
  9258. CMPL R9, R11
  9259. JNE no_repeat_found_encodeSnappyBlockAsm
  9260. LEAL 1(CX), DI // match at the previous offset starts at CX+1
  9261. MOVL 12(SP), SI // SI = start of pending literals
  9262. MOVL DI, R8
  9263. SUBL 16(SP), R8 // R8 = position the match copies from
  9264. JZ repeat_extend_back_end_encodeSnappyBlockAsm // copy source already at index 0: cannot go back
  9265. repeat_extend_back_loop_encodeSnappyBlockAsm: // grow the match backwards one byte at a time
  9266. CMPL DI, SI
  9267. JLE repeat_extend_back_end_encodeSnappyBlockAsm // stop at the start of the pending literals
  9268. MOVB -1(DX)(R8*1), BL
  9269. MOVB -1(DX)(DI*1), R9
  9270. CMPB BL, R9
  9271. JNE repeat_extend_back_end_encodeSnappyBlockAsm
  9272. LEAL -1(DI), DI
  9273. DECL R8
  9274. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
  9275. repeat_extend_back_end_encodeSnappyBlockAsm: // DI = final start of the match
  9276. MOVL 12(SP), SI
  9277. CMPL SI, DI
  9278. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm // nothing pending before the match
  9279. MOVL DI, R8
  9280. MOVL DI, 12(SP) // pending literals now start at the match
  9281. LEAQ (DX)(SI*1), R9 // R9 = address of the first literal byte
  9282. SUBL SI, R8 // R8 = literal length
  9283. LEAL -1(R8), SI // SI = length - 1, the value written into the header
  9284. CMPL SI, $0x3c
  9285. JLT one_byte_repeat_emit_encodeSnappyBlockAsm // length-1 < 60: header is one byte
  9286. CMPL SI, $0x00000100
  9287. JLT two_bytes_repeat_emit_encodeSnappyBlockAsm
  9288. CMPL SI, $0x00010000
  9289. JLT three_bytes_repeat_emit_encodeSnappyBlockAsm
  9290. CMPL SI, $0x01000000
  9291. JLT four_bytes_repeat_emit_encodeSnappyBlockAsm
  9292. MOVB $0xfc, (AX) // 0xfc, then length-1 as 4 bytes
  9293. MOVL SI, 1(AX)
  9294. ADDQ $0x05, AX
  9295. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9296. four_bytes_repeat_emit_encodeSnappyBlockAsm:
  9297. MOVL SI, R10
  9298. SHRL $0x10, R10
  9299. MOVB $0xf8, (AX) // 0xf8, then length-1 as 3 bytes (16 low bits + bits 16..23)
  9300. MOVW SI, 1(AX)
  9301. MOVB R10, 3(AX)
  9302. ADDQ $0x04, AX
  9303. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9304. three_bytes_repeat_emit_encodeSnappyBlockAsm:
  9305. MOVB $0xf4, (AX) // 0xf4, then length-1 as 2 bytes
  9306. MOVW SI, 1(AX)
  9307. ADDQ $0x03, AX
  9308. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9309. two_bytes_repeat_emit_encodeSnappyBlockAsm:
  9310. MOVB $0xf0, (AX) // 0xf0, then length-1 as 1 byte
  9311. MOVB SI, 1(AX)
  9312. ADDQ $0x02, AX
  9313. CMPL SI, $0x40
  9314. JL memmove_repeat_emit_encodeSnappyBlockAsm // length <= 64 still fits the short copy
  9315. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
  9316. one_byte_repeat_emit_encodeSnappyBlockAsm:
  9317. SHLB $0x02, SI // header byte = (length-1) << 2
  9318. MOVB SI, (AX)
  9319. ADDQ $0x01, AX
  9320. memmove_repeat_emit_encodeSnappyBlockAsm:
  9321. LEAQ (AX)(R8*1), SI // SI = output pointer after the literal bytes
  9322. // genMemMoveShort: copy R8 (<= 64) bytes from (R9) to (AX) with overlapping moves
  9323. CMPQ R8, $0x08
  9324. JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
  9325. CMPQ R8, $0x10
  9326. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
  9327. CMPQ R8, $0x20
  9328. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
  9329. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
  9330. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
  9331. MOVQ (R9), R10 // always moves 8 bytes, even when the length is smaller;
  9332. MOVQ R10, (AX) // AX is then set from SI, so only R8 bytes are kept
  9333. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9334. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
  9335. MOVQ (R9), R10 // first 8 and last 8 bytes (may overlap)
  9336. MOVQ -8(R9)(R8*1), R9
  9337. MOVQ R10, (AX)
  9338. MOVQ R9, -8(AX)(R8*1)
  9339. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9340. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
  9341. MOVOU (R9), X0 // first 16 and last 16 bytes (may overlap)
  9342. MOVOU -16(R9)(R8*1), X1
  9343. MOVOU X0, (AX)
  9344. MOVOU X1, -16(AX)(R8*1)
  9345. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
  9346. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
  9347. MOVOU (R9), X0 // first 32 and last 32 bytes (may overlap)
  9348. MOVOU 16(R9), X1
  9349. MOVOU -32(R9)(R8*1), X2
  9350. MOVOU -16(R9)(R8*1), X3
  9351. MOVOU X0, (AX)
  9352. MOVOU X1, 16(AX)
  9353. MOVOU X2, -32(AX)(R8*1)
  9354. MOVOU X3, -16(AX)(R8*1)
  9355. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
  9356. MOVQ SI, AX // advance the write pointer past the literal
  9357. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
  9358. memmove_long_repeat_emit_encodeSnappyBlockAsm:
  9359. LEAQ (AX)(R8*1), SI // SI = output pointer after the literal bytes
  9360. // genMemMoveLong: copy R8 bytes from (R9) to (AX) in 32-byte steps
  9361. MOVOU (R9), X0 // keep the first 32 source bytes ...
  9362. MOVOU 16(R9), X1
  9363. MOVOU -32(R9)(R8*1), X2 // ... and the last 32 source bytes for the final stores
  9364. MOVOU -16(R9)(R8*1), X3
  9365. MOVQ R8, R11
  9366. SHRQ $0x05, R11 // R11 = length / 32
  9367. MOVQ AX, R10
  9368. ANDL $0x0000001f, R10 // R10 = AX mod 32
  9369. MOVQ $0x00000040, R12
  9370. SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX + R12 - 32 is 32-byte aligned
  9371. DECQ R11
  9372. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): every jump to this label has length >= 65, so R11-1 != 0 and this looks always taken - confirm
  9373. LEAQ -32(R9)(R12*1), R10
  9374. LEAQ -32(AX)(R12*1), R13
  9375. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
  9376. MOVOU (R10), X4
  9377. MOVOU 16(R10), X5
  9378. MOVOA X4, (R13)
  9379. MOVOA X5, 16(R13)
  9380. ADDQ $0x20, R13
  9381. ADDQ $0x20, R10
  9382. ADDQ $0x20, R12
  9383. DECQ R11
  9384. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
  9385. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  9386. MOVOU -32(R9)(R12*1), X4 // unaligned loads from the source,
  9387. MOVOU -16(R9)(R12*1), X5
  9388. MOVOA X4, -32(AX)(R12*1) // aligned stores to the destination
  9389. MOVOA X5, -16(AX)(R12*1)
  9390. ADDQ $0x20, R12
  9391. CMPQ R8, R12
  9392. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while length >= R12
  9393. MOVOU X0, (AX) // finally store the saved head and tail, covering the unaligned edges
  9394. MOVOU X1, 16(AX)
  9395. MOVOU X2, -32(AX)(R8*1)
  9396. MOVOU X3, -16(AX)(R8*1)
  9397. MOVQ SI, AX // advance the write pointer past the literal
  9398. emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
  9399. ADDL $0x05, CX // CX = (CX+1) + 4: just past the 4 bytes compared above
  9400. MOVL CX, SI
  9401. SUBL 16(SP), SI // SI = CX - offset
  9402. MOVQ src_len+32(FP), R8
  9403. SUBL CX, R8 // R8 = source bytes remaining after CX
  9404. LEAQ (DX)(CX*1), R9 // R9 = &src[CX]
  9405. LEAQ (DX)(SI*1), SI // SI = &src[CX - offset]
  9406. // matchLen: R11 = number of equal bytes at (R9) and (SI), at most R8
  9407. XORL R11, R11
  9408. CMPL R8, $0x08
  9409. JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm
  9410. matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: // compare 8 bytes per pass
  9411. MOVQ (R9)(R11*1), R10
  9412. XORQ (SI)(R11*1), R10 // nonzero bits mark differing bytes
  9413. TESTQ R10, R10
  9414. JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm
  9415. #ifdef GOAMD64_v3
  9416. TZCNTQ R10, R10
  9417. #else
  9418. BSFQ R10, R10
  9419. #endif
  9420. SARQ $0x03, R10 // index of lowest set bit / 8 = count of equal leading bytes
  9421. LEAL (R11)(R10*1), R11
  9422. JMP repeat_extend_forward_end_encodeSnappyBlockAsm
  9423. matchlen_loop_repeat_extend_encodeSnappyBlockAsm: // all 8 bytes were equal
  9424. LEAL -8(R8), R8
  9425. LEAL 8(R11), R11
  9426. CMPL R8, $0x08
  9427. JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm
  9428. JZ repeat_extend_forward_end_encodeSnappyBlockAsm // NOTE(review): ZF here means R8 == 8, which the JGE above already takes; looks unreachable - confirm
  9429. matchlen_match4_repeat_extend_encodeSnappyBlockAsm: // fewer than 8 bytes left: try 4, 2, 1
  9430. CMPL R8, $0x04
  9431. JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm
  9432. MOVL (R9)(R11*1), R10
  9433. CMPL (SI)(R11*1), R10
  9434. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
  9435. SUBL $0x04, R8
  9436. LEAL 4(R11), R11
  9437. matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
  9438. CMPL R8, $0x02
  9439. JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm
  9440. MOVW (R9)(R11*1), R10
  9441. CMPW (SI)(R11*1), R10
  9442. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
  9443. SUBL $0x02, R8
  9444. LEAL 2(R11), R11
  9445. matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
  9446. CMPL R8, $0x01
  9447. JL repeat_extend_forward_end_encodeSnappyBlockAsm
  9448. MOVB (R9)(R11*1), R10
  9449. CMPB (SI)(R11*1), R10
  9450. JNE repeat_extend_forward_end_encodeSnappyBlockAsm
  9451. LEAL 1(R11), R11
  9452. repeat_extend_forward_end_encodeSnappyBlockAsm:
  9453. ADDL R11, CX // CX = end of the match
  9454. MOVL CX, SI
  9455. SUBL DI, SI // SI = match length (DI = match start after backward extension)
  9456. MOVL 16(SP), DI // DI = offset
  9457. // emitCopy: write copy element(s) for length SI at offset DI
  9458. CMPL DI, $0x00010000
  9459. JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm // offset fits in 16 bits
  9460. four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
  9461. CMPL SI, $0x40
  9462. JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
  9463. MOVB $0xff, (AX) // 0xff = ((64-1)<<2)|3, followed by the 32-bit offset
  9464. MOVL DI, 1(AX)
  9465. LEAL -64(SI), SI // 64 bytes of the match are now covered
  9466. ADDQ $0x05, AX
  9467. CMPL SI, $0x04
  9468. JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
  9469. JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
  9470. four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
  9471. TESTL SI, SI
  9472. JZ repeat_end_emit_encodeSnappyBlockAsm
  9473. MOVB $0x03, BL // only the low byte of the LEAL result below is stored
  9474. LEAL -4(BX)(SI*4), SI // low byte = ((length-1)<<2)|3
  9475. MOVB SI, (AX)
  9476. MOVL DI, 1(AX) // 32-bit offset
  9477. ADDQ $0x05, AX
  9478. JMP repeat_end_emit_encodeSnappyBlockAsm
  9479. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
  9480. CMPL SI, $0x40
  9481. JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
  9482. MOVB $0xee, (AX) // 0xee = ((60-1)<<2)|2, followed by the 16-bit offset
  9483. MOVW DI, 1(AX)
  9484. LEAL -60(SI), SI // 60 bytes of the match are now covered
  9485. ADDQ $0x03, AX
  9486. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
  9487. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
  9488. CMPL SI, $0x0c
  9489. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
  9490. CMPL DI, $0x00000800
  9491. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
  9492. MOVB $0x01, BL
  9493. LEAL -16(BX)(SI*4), SI // low byte = ((length-4)<<2)|1
  9494. MOVB DI, 1(AX) // low 8 bits of the offset
  9495. SHRL $0x08, DI
  9496. SHLL $0x05, DI // remaining offset bits go into bits 5 and up of the first byte
  9497. ORL DI, SI
  9498. MOVB SI, (AX)
  9499. ADDQ $0x02, AX
  9500. JMP repeat_end_emit_encodeSnappyBlockAsm
  9501. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
  9502. MOVB $0x02, BL
  9503. LEAL -4(BX)(SI*4), SI // low byte = ((length-1)<<2)|2
  9504. MOVB SI, (AX)
  9505. MOVW DI, 1(AX) // 16-bit offset
  9506. ADDQ $0x03, AX
  9507. repeat_end_emit_encodeSnappyBlockAsm:
  9508. MOVL CX, 12(SP) // everything before CX has been emitted
  9509. JMP search_loop_encodeSnappyBlockAsm
  9510. no_repeat_found_encodeSnappyBlockAsm: // SI, R8 = table candidates for CX, CX+1; DI = 8 bytes at CX
  9511. CMPL (DX)(SI*1), DI // do 4 bytes at the candidate equal 4 bytes at CX?
  9512. JEQ candidate_match_encodeSnappyBlockAsm
  9513. SHRQ $0x08, DI // DI now starts at CX+1
  9514. MOVL 24(SP)(R10*4), SI // SI = candidate stored for the bytes at CX+2
  9515. LEAL 2(CX), R9
  9516. CMPL (DX)(R8*1), DI // second candidate against the bytes at CX+1
  9517. JEQ candidate2_match_encodeSnappyBlockAsm
  9518. MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
  9519. SHRQ $0x08, DI // DI now starts at CX+2
  9520. CMPL (DX)(SI*1), DI // third candidate against the bytes at CX+2
  9521. JEQ candidate3_match_encodeSnappyBlockAsm
  9522. MOVL 20(SP), CX // no match: continue at the saved next position
  9523. JMP search_loop_encodeSnappyBlockAsm
  9524. candidate3_match_encodeSnappyBlockAsm:
  9525. ADDL $0x02, CX // match begins at CX+2
  9526. JMP candidate_match_encodeSnappyBlockAsm
  9527. candidate2_match_encodeSnappyBlockAsm:
  9528. MOVL R9, 24(SP)(R10*4) // record CX+2 in the table
  9529. INCL CX // match begins at CX+1
  9530. MOVL R8, SI
  9531. candidate_match_encodeSnappyBlockAsm: // CX = match position, SI = earlier position it equals
  9532. MOVL 12(SP), DI // DI = start of pending literals
  9533. TESTL SI, SI
  9534. JZ match_extend_back_end_encodeSnappyBlockAsm // candidate at index 0: cannot go back
  9535. match_extend_back_loop_encodeSnappyBlockAsm: // grow the match backwards one byte at a time
  9536. CMPL CX, DI
  9537. JLE match_extend_back_end_encodeSnappyBlockAsm
  9538. MOVB -1(DX)(SI*1), BL
  9539. MOVB -1(DX)(CX*1), R8
  9540. CMPB BL, R8
  9541. JNE match_extend_back_end_encodeSnappyBlockAsm
  9542. LEAL -1(CX), CX
  9543. DECL SI
  9544. JZ match_extend_back_end_encodeSnappyBlockAsm
  9545. JMP match_extend_back_loop_encodeSnappyBlockAsm
  9546. match_extend_back_end_encodeSnappyBlockAsm:
  9547. MOVL CX, DI
  9548. SUBL 12(SP), DI // DI = pending literal length
  9549. LEAQ 5(AX)(DI*1), DI // write pointer + literal length + 5
  9550. CMPQ DI, (SP)
  9551. JL match_dst_size_check_encodeSnappyBlockAsm // below the output limit: continue (signed compare)
  9552. MOVQ $0x00000000, ret+48(FP) // otherwise give up and return 0
  9553. RET
  9554. match_dst_size_check_encodeSnappyBlockAsm:
  9555. MOVL CX, DI
  9556. MOVL 12(SP), R8
  9557. CMPL R8, DI
  9558. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm // nothing pending before the match
  9559. MOVL DI, R9
  9560. MOVL DI, 12(SP) // pending literals now start at the match
  9561. LEAQ (DX)(R8*1), DI // DI = address of the first literal byte
  9562. SUBL R8, R9 // R9 = literal length
  9563. LEAL -1(R9), R8 // R8 = length - 1, the value written into the header
  9564. CMPL R8, $0x3c
  9565. JLT one_byte_match_emit_encodeSnappyBlockAsm // length-1 < 60: header is one byte
  9566. CMPL R8, $0x00000100
  9567. JLT two_bytes_match_emit_encodeSnappyBlockAsm
  9568. CMPL R8, $0x00010000
  9569. JLT three_bytes_match_emit_encodeSnappyBlockAsm
  9570. CMPL R8, $0x01000000
  9571. JLT four_bytes_match_emit_encodeSnappyBlockAsm
  9572. MOVB $0xfc, (AX) // 0xfc, then length-1 as 4 bytes
  9573. MOVL R8, 1(AX)
  9574. ADDQ $0x05, AX
  9575. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9576. four_bytes_match_emit_encodeSnappyBlockAsm:
  9577. MOVL R8, R10
  9578. SHRL $0x10, R10
  9579. MOVB $0xf8, (AX) // 0xf8, then length-1 as 3 bytes
  9580. MOVW R8, 1(AX)
  9581. MOVB R10, 3(AX)
  9582. ADDQ $0x04, AX
  9583. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9584. three_bytes_match_emit_encodeSnappyBlockAsm:
  9585. MOVB $0xf4, (AX) // 0xf4, then length-1 as 2 bytes
  9586. MOVW R8, 1(AX)
  9587. ADDQ $0x03, AX
  9588. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9589. two_bytes_match_emit_encodeSnappyBlockAsm:
  9590. MOVB $0xf0, (AX) // 0xf0, then length-1 as 1 byte
  9591. MOVB R8, 1(AX)
  9592. ADDQ $0x02, AX
  9593. CMPL R8, $0x40
  9594. JL memmove_match_emit_encodeSnappyBlockAsm // length <= 64 still fits the short copy
  9595. JMP memmove_long_match_emit_encodeSnappyBlockAsm
  9596. one_byte_match_emit_encodeSnappyBlockAsm:
  9597. SHLB $0x02, R8 // header byte = (length-1) << 2
  9598. MOVB R8, (AX)
  9599. ADDQ $0x01, AX
  9600. memmove_match_emit_encodeSnappyBlockAsm:
  9601. LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literal bytes
  9602. // genMemMoveShort: copy R9 (<= 64) bytes from (DI) to (AX) with overlapping moves
  9603. CMPQ R9, $0x08
  9604. JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
  9605. CMPQ R9, $0x10
  9606. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
  9607. CMPQ R9, $0x20
  9608. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
  9609. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
  9610. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
  9611. MOVQ (DI), R10 // always moves 8 bytes, even when the length is smaller;
  9612. MOVQ R10, (AX) // AX is then set from R8, so only R9 bytes are kept
  9613. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9614. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
  9615. MOVQ (DI), R10 // first 8 and last 8 bytes (may overlap)
  9616. MOVQ -8(DI)(R9*1), DI
  9617. MOVQ R10, (AX)
  9618. MOVQ DI, -8(AX)(R9*1)
  9619. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9620. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
  9621. MOVOU (DI), X0 // first 16 and last 16 bytes (may overlap)
  9622. MOVOU -16(DI)(R9*1), X1
  9623. MOVOU X0, (AX)
  9624. MOVOU X1, -16(AX)(R9*1)
  9625. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
  9626. emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
  9627. MOVOU (DI), X0 // first 32 and last 32 bytes (may overlap)
  9628. MOVOU 16(DI), X1
  9629. MOVOU -32(DI)(R9*1), X2
  9630. MOVOU -16(DI)(R9*1), X3
  9631. MOVOU X0, (AX)
  9632. MOVOU X1, 16(AX)
  9633. MOVOU X2, -32(AX)(R9*1)
  9634. MOVOU X3, -16(AX)(R9*1)
  9635. memmove_end_copy_match_emit_encodeSnappyBlockAsm:
  9636. MOVQ R8, AX // advance the write pointer past the literal
  9637. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
  9638. memmove_long_match_emit_encodeSnappyBlockAsm:
  9639. LEAQ (AX)(R9*1), R8 // R8 = output pointer after the literal bytes
  9640. // genMemMoveLong: copy R9 bytes from (DI) to (AX) in 32-byte steps
  9641. MOVOU (DI), X0 // keep the first 32 source bytes ...
  9642. MOVOU 16(DI), X1
  9643. MOVOU -32(DI)(R9*1), X2 // ... and the last 32 source bytes for the final stores
  9644. MOVOU -16(DI)(R9*1), X3
  9645. MOVQ R9, R11
  9646. SHRQ $0x05, R11 // R11 = length / 32
  9647. MOVQ AX, R10
  9648. ANDL $0x0000001f, R10 // R10 = AX mod 32
  9649. MOVQ $0x00000040, R12
  9650. SUBQ R10, R12 // R12 = 64 - (AX mod 32), so AX + R12 - 32 is 32-byte aligned
  9651. DECQ R11
  9652. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): every jump to this label has length >= 65, so this looks always taken - confirm
  9653. LEAQ -32(DI)(R12*1), R10
  9654. LEAQ -32(AX)(R12*1), R13
  9655. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
  9656. MOVOU (R10), X4
  9657. MOVOU 16(R10), X5
  9658. MOVOA X4, (R13)
  9659. MOVOA X5, 16(R13)
  9660. ADDQ $0x20, R13
  9661. ADDQ $0x20, R10
  9662. ADDQ $0x20, R12
  9663. DECQ R11
  9664. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
  9665. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  9666. MOVOU -32(DI)(R12*1), X4 // unaligned loads from the source,
  9667. MOVOU -16(DI)(R12*1), X5
  9668. MOVOA X4, -32(AX)(R12*1) // aligned stores to the destination
  9669. MOVOA X5, -16(AX)(R12*1)
  9670. ADDQ $0x20, R12
  9671. CMPQ R9, R12
  9672. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while length >= R12
  9673. MOVOU X0, (AX) // finally store the saved head and tail, covering the unaligned edges
  9674. MOVOU X1, 16(AX)
  9675. MOVOU X2, -32(AX)(R9*1)
  9676. MOVOU X3, -16(AX)(R9*1)
  9677. MOVQ R8, AX // advance the write pointer past the literal
  9678. emit_literal_done_match_emit_encodeSnappyBlockAsm:
  9679. match_nolit_loop_encodeSnappyBlockAsm: // CX = match position, SI = earlier position; no literals pending
  9680. MOVL CX, DI
  9681. SUBL SI, DI
  9682. MOVL DI, 16(SP) // remember offset = CX - SI
  9683. ADDL $0x04, CX // skip the 4 bytes already compared equal
  9684. ADDL $0x04, SI
  9685. MOVQ src_len+32(FP), DI
  9686. SUBL CX, DI // DI = source bytes remaining after CX
  9687. LEAQ (DX)(CX*1), R8 // R8 = &src[CX]
  9688. LEAQ (DX)(SI*1), SI // SI = &src[candidate + 4]
  9689. // matchLen: R10 = number of equal bytes at (R8) and (SI), at most DI
  9690. XORL R10, R10
  9691. CMPL DI, $0x08
  9692. JL matchlen_match4_match_nolit_encodeSnappyBlockAsm
  9693. matchlen_loopback_match_nolit_encodeSnappyBlockAsm: // compare 8 bytes per pass
  9694. MOVQ (R8)(R10*1), R9
  9695. XORQ (SI)(R10*1), R9 // nonzero bits mark differing bytes
  9696. TESTQ R9, R9
  9697. JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm
  9698. #ifdef GOAMD64_v3
  9699. TZCNTQ R9, R9
  9700. #else
  9701. BSFQ R9, R9
  9702. #endif
  9703. SARQ $0x03, R9 // index of lowest set bit / 8 = count of equal leading bytes
  9704. LEAL (R10)(R9*1), R10
  9705. JMP match_nolit_end_encodeSnappyBlockAsm
  9706. matchlen_loop_match_nolit_encodeSnappyBlockAsm: // all 8 bytes were equal
  9707. LEAL -8(DI), DI
  9708. LEAL 8(R10), R10
  9709. CMPL DI, $0x08
  9710. JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm
  9711. JZ match_nolit_end_encodeSnappyBlockAsm // NOTE(review): ZF here means DI == 8, which the JGE above already takes; looks unreachable - confirm
  9712. matchlen_match4_match_nolit_encodeSnappyBlockAsm: // fewer than 8 bytes left: try 4, 2, 1
  9713. CMPL DI, $0x04
  9714. JL matchlen_match2_match_nolit_encodeSnappyBlockAsm
  9715. MOVL (R8)(R10*1), R9
  9716. CMPL (SI)(R10*1), R9
  9717. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
  9718. SUBL $0x04, DI
  9719. LEAL 4(R10), R10
  9720. matchlen_match2_match_nolit_encodeSnappyBlockAsm:
  9721. CMPL DI, $0x02
  9722. JL matchlen_match1_match_nolit_encodeSnappyBlockAsm
  9723. MOVW (R8)(R10*1), R9
  9724. CMPW (SI)(R10*1), R9
  9725. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
  9726. SUBL $0x02, DI
  9727. LEAL 2(R10), R10
  9728. matchlen_match1_match_nolit_encodeSnappyBlockAsm:
  9729. CMPL DI, $0x01
  9730. JL match_nolit_end_encodeSnappyBlockAsm
  9731. MOVB (R8)(R10*1), R9
  9732. CMPB (SI)(R10*1), R9
  9733. JNE match_nolit_end_encodeSnappyBlockAsm
  9734. LEAL 1(R10), R10
  9735. match_nolit_end_encodeSnappyBlockAsm:
  9736. ADDL R10, CX // CX = end of the match
  9737. MOVL 16(SP), SI // SI = offset
  9738. ADDL $0x04, R10 // R10 = total match length including the first 4 bytes
  9739. MOVL CX, 12(SP) // everything before CX will have been emitted
  9740. // emitCopy: write copy element(s) for length R10 at offset SI
  9741. CMPL SI, $0x00010000
  9742. JL two_byte_offset_match_nolit_encodeSnappyBlockAsm // offset fits in 16 bits
  9743. four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
  9744. CMPL R10, $0x40
  9745. JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
  9746. MOVB $0xff, (AX) // 0xff = ((64-1)<<2)|3, followed by the 32-bit offset
  9747. MOVL SI, 1(AX)
  9748. LEAL -64(R10), R10 // 64 bytes of the match are now covered
  9749. ADDQ $0x05, AX
  9750. CMPL R10, $0x04
  9751. JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm
  9752. JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
  9753. four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
  9754. TESTL R10, R10
  9755. JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
  9756. MOVB $0x03, BL // only the low byte of the LEAL result below is stored
  9757. LEAL -4(BX)(R10*4), R10 // low byte = ((length-1)<<2)|3
  9758. MOVB R10, (AX)
  9759. MOVL SI, 1(AX) // 32-bit offset
  9760. ADDQ $0x05, AX
  9761. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
  9762. two_byte_offset_match_nolit_encodeSnappyBlockAsm:
  9763. CMPL R10, $0x40
  9764. JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
  9765. MOVB $0xee, (AX) // 0xee = ((60-1)<<2)|2, followed by the 16-bit offset
  9766. MOVW SI, 1(AX)
  9767. LEAL -60(R10), R10 // 60 bytes of the match are now covered
  9768. ADDQ $0x03, AX
  9769. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
  9770. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
  9771. CMPL R10, $0x0c
  9772. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // length >= 12 needs the 3-byte form
  9773. CMPL SI, $0x00000800
  9774. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm // offset >= 2048 needs the 3-byte form
  9775. MOVB $0x01, BL
  9776. LEAL -16(BX)(R10*4), R10 // low byte = ((length-4)<<2)|1
  9777. MOVB SI, 1(AX) // low 8 bits of the offset
  9778. SHRL $0x08, SI
  9779. SHLL $0x05, SI // remaining offset bits go into bits 5 and up of the first byte
  9780. ORL SI, R10
  9781. MOVB R10, (AX)
  9782. ADDQ $0x02, AX
  9783. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
  9784. emit_copy_three_match_nolit_encodeSnappyBlockAsm:
  9785. MOVB $0x02, BL
  9786. LEAL -4(BX)(R10*4), R10 // low byte = ((length-1)<<2)|2
  9787. MOVB R10, (AX)
  9788. MOVW SI, 1(AX) // 16-bit offset
  9789. ADDQ $0x03, AX
  9790. match_nolit_emitcopy_end_encodeSnappyBlockAsm:
  9791. CMPL CX, 8(SP)
  9792. JGE emit_remainder_encodeSnappyBlockAsm // reached the search limit: flush the tail
  9793. MOVQ -2(DX)(CX*1), DI // DI = 8 source bytes starting at CX-2
  9794. CMPQ AX, (SP)
  9795. JL match_nolit_dst_ok_encodeSnappyBlockAsm // write pointer still below the output limit (signed compare)
  9796. MOVQ $0x00000000, ret+48(FP) // otherwise give up and return 0
  9797. RET
  9798. match_nolit_dst_ok_encodeSnappyBlockAsm:
  9799. MOVQ $0x0000cf1bbcdcbf9b, R9 // same multiplier as in the search loop
  9800. MOVQ DI, R8
  9801. SHRQ $0x10, DI // DI now starts at CX
  9802. MOVQ DI, SI
  9803. SHLQ $0x10, R8
  9804. IMULQ R9, R8
  9805. SHRQ $0x32, R8 // R8 = table index for the bytes at CX-2
  9806. SHLQ $0x10, SI
  9807. IMULQ R9, SI
  9808. SHRQ $0x32, SI // SI = table index for the bytes at CX
  9809. LEAL -2(CX), R9
  9810. LEAQ 24(SP)(SI*4), R10 // R10 = address of the table slot for CX
  9811. MOVL (R10), SI // SI = candidate previously stored there
  9812. MOVL R9, 24(SP)(R8*4) // record CX-2 in the table
  9813. MOVL CX, (R10) // record CX in the table
  9814. CMPL (DX)(SI*1), DI // do 4 bytes at the candidate equal 4 bytes at CX?
  9815. JEQ match_nolit_loop_encodeSnappyBlockAsm // yes: another match with no literals in between
  9816. INCL CX
  9817. JMP search_loop_encodeSnappyBlockAsm
  9818. emit_remainder_encodeSnappyBlockAsm: // emit whatever is still pending as one final literal
  9819. MOVQ src_len+32(FP), CX
  9820. SUBL 12(SP), CX // CX = number of pending bytes
  9821. LEAQ 5(AX)(CX*1), CX // write pointer + pending bytes + 5
  9822. CMPQ CX, (SP)
  9823. JL emit_remainder_ok_encodeSnappyBlockAsm // below the output limit: continue (signed compare)
  9824. MOVQ $0x00000000, ret+48(FP) // otherwise give up and return 0
  9825. RET
  9826. emit_remainder_ok_encodeSnappyBlockAsm:
  9827. MOVQ src_len+32(FP), CX
  9828. MOVL 12(SP), BX
  9829. CMPL BX, CX
  9830. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm // nothing pending
  9831. MOVL CX, SI
  9832. MOVL CX, 12(SP)
  9833. LEAQ (DX)(BX*1), CX // CX = address of the first pending byte
  9834. SUBL BX, SI // SI = literal length
  9835. LEAL -1(SI), DX // DX = length - 1 (the src base pointer is no longer needed)
  9836. CMPL DX, $0x3c
  9837. JLT one_byte_emit_remainder_encodeSnappyBlockAsm // length-1 < 60: header is one byte
  9838. CMPL DX, $0x00000100
  9839. JLT two_bytes_emit_remainder_encodeSnappyBlockAsm
  9840. CMPL DX, $0x00010000
  9841. JLT three_bytes_emit_remainder_encodeSnappyBlockAsm
  9842. CMPL DX, $0x01000000
  9843. JLT four_bytes_emit_remainder_encodeSnappyBlockAsm
  9844. MOVB $0xfc, (AX) // 0xfc, then length-1 as 4 bytes
  9845. MOVL DX, 1(AX)
  9846. ADDQ $0x05, AX
  9847. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  9848. four_bytes_emit_remainder_encodeSnappyBlockAsm:
  9849. MOVL DX, BX
  9850. SHRL $0x10, BX
  9851. MOVB $0xf8, (AX) // 0xf8, then length-1 as 3 bytes
  9852. MOVW DX, 1(AX)
  9853. MOVB BL, 3(AX)
  9854. ADDQ $0x04, AX
  9855. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  9856. three_bytes_emit_remainder_encodeSnappyBlockAsm:
  9857. MOVB $0xf4, (AX) // 0xf4, then length-1 as 2 bytes
  9858. MOVW DX, 1(AX)
  9859. ADDQ $0x03, AX
  9860. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  9861. two_bytes_emit_remainder_encodeSnappyBlockAsm:
  9862. MOVB $0xf0, (AX) // 0xf0, then length-1 as 1 byte
  9863. MOVB DL, 1(AX)
  9864. ADDQ $0x02, AX
  9865. CMPL DX, $0x40
  9866. JL memmove_emit_remainder_encodeSnappyBlockAsm // length <= 64 still fits the short copy
  9867. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
  9868. one_byte_emit_remainder_encodeSnappyBlockAsm:
  9869. SHLB $0x02, DL // header byte = (length-1) << 2
  9870. MOVB DL, (AX)
  9871. ADDQ $0x01, AX
  9872. memmove_emit_remainder_encodeSnappyBlockAsm:
  9873. LEAQ (AX)(SI*1), DX // DX = output pointer after the literal bytes
  9874. MOVL SI, BX // BX = length
  9875. // genMemMoveShort: copy BX (1..64) bytes from (CX) to (AX); unlike the copies above, each case here moves exactly BX bytes
  9876. CMPQ BX, $0x03
  9877. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
  9878. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
  9879. CMPQ BX, $0x08
  9880. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
  9881. CMPQ BX, $0x10
  9882. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
  9883. CMPQ BX, $0x20
  9884. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
  9885. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
  9886. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
  9887. MOVB (CX), SI // first byte and last byte (the same byte when BX == 1)
  9888. MOVB -1(CX)(BX*1), CL
  9889. MOVB SI, (AX)
  9890. MOVB CL, -1(AX)(BX*1)
  9891. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  9892. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
  9893. MOVW (CX), SI // two bytes, then the third
  9894. MOVB 2(CX), CL
  9895. MOVW SI, (AX)
  9896. MOVB CL, 2(AX)
  9897. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  9898. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
  9899. MOVL (CX), SI // first 4 and last 4 bytes (may overlap)
  9900. MOVL -4(CX)(BX*1), CX
  9901. MOVL SI, (AX)
  9902. MOVL CX, -4(AX)(BX*1)
  9903. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  9904. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
  9905. MOVQ (CX), SI // first 8 and last 8 bytes (may overlap)
  9906. MOVQ -8(CX)(BX*1), CX
  9907. MOVQ SI, (AX)
  9908. MOVQ CX, -8(AX)(BX*1)
  9909. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  9910. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
  9911. MOVOU (CX), X0 // first 16 and last 16 bytes (may overlap)
  9912. MOVOU -16(CX)(BX*1), X1
  9913. MOVOU X0, (AX)
  9914. MOVOU X1, -16(AX)(BX*1)
  9915. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
  9916. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
  9917. MOVOU (CX), X0 // first 32 and last 32 bytes (may overlap)
  9918. MOVOU 16(CX), X1
  9919. MOVOU -32(CX)(BX*1), X2
  9920. MOVOU -16(CX)(BX*1), X3
  9921. MOVOU X0, (AX)
  9922. MOVOU X1, 16(AX)
  9923. MOVOU X2, -32(AX)(BX*1)
  9924. MOVOU X3, -16(AX)(BX*1)
  9925. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
  9926. MOVQ DX, AX // advance the write pointer past the literal
  9927. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
  9928. memmove_long_emit_remainder_encodeSnappyBlockAsm:
  9929. LEAQ (AX)(SI*1), DX // DX = output pointer after the literal bytes
  9930. MOVL SI, BX // BX = length
  9931. // genMemMoveLong: copy BX bytes from (CX) to (AX) in 32-byte steps
  9932. MOVOU (CX), X0 // keep the first 32 source bytes ...
  9933. MOVOU 16(CX), X1
  9934. MOVOU -32(CX)(BX*1), X2 // ... and the last 32 source bytes for the final stores
  9935. MOVOU -16(CX)(BX*1), X3
  9936. MOVQ BX, DI
  9937. SHRQ $0x05, DI // DI = length / 32
  9938. MOVQ AX, SI
  9939. ANDL $0x0000001f, SI // SI = AX mod 32
  9940. MOVQ $0x00000040, R8
  9941. SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX + R8 - 32 is 32-byte aligned
  9942. DECQ DI
  9943. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // NOTE(review): every jump to this label has length >= 65, so this looks always taken - confirm
  9944. LEAQ -32(CX)(R8*1), SI
  9945. LEAQ -32(AX)(R8*1), R9
  9946. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
  9947. MOVOU (SI), X4
  9948. MOVOU 16(SI), X5
  9949. MOVOA X4, (R9)
  9950. MOVOA X5, 16(R9)
  9951. ADDQ $0x20, R9
  9952. ADDQ $0x20, SI
  9953. ADDQ $0x20, R8
  9954. DECQ DI
  9955. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
  9956. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
  9957. MOVOU -32(CX)(R8*1), X4 // unaligned loads from the source,
  9958. MOVOU -16(CX)(R8*1), X5
  9959. MOVOA X4, -32(AX)(R8*1) // aligned stores to the destination
  9960. MOVOA X5, -16(AX)(R8*1)
  9961. ADDQ $0x20, R8
  9962. CMPQ BX, R8
  9963. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 // continue while length >= R8
  9964. MOVOU X0, (AX) // finally store the saved head and tail, covering the unaligned edges
  9965. MOVOU X1, 16(AX)
  9966. MOVOU X2, -32(AX)(BX*1)
  9967. MOVOU X3, -16(AX)(BX*1)
  9968. MOVQ DX, AX // advance the write pointer past the literal
  9969. emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
  9970. MOVQ dst_base+0(FP), CX
  9971. SUBQ CX, AX // bytes written = write pointer - dst_base
  9972. MOVQ AX, ret+48(FP)
  9973. RET
  9974. // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
  9975. // Requires: BMI, SSE2
  //
  // Encodes src into dst and stores the number of bytes written in ret+48(FP).
  // Returns 0 instead when the output would reach the dst limit kept in 0(SP).
  // NOTE(review): the tag constants below look like the Snappy literal/copy
  // layout implied by the function name; confirm against the Go reference encoder.
  //
  // Register roles for most of the function:
  //   AX = dst write pointer, DX = src base pointer, CX = current src index s.
  // Stack frame ($65560 bytes of locals):
  //   0(SP)   dst limit = dst_base + (len(src) - 9 - len(src)>>5)
  //   8(SP)   search limit = len(src) - 8
  //   12(SP)  nextEmit: src index where the not-yet-emitted literals start
  //   16(SP)  repeat: offset of the most recent match (starts at 1)
  //   20(SP)  nextS: src index to continue at when no candidate matches
  //   24(SP)  hash table, 65536 bytes = 16384 uint32 src positions
  9976. TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
  9977. MOVQ dst_base+0(FP), AX
  // Zero the hash table: 0x200 iterations of 0x80 bytes = 65536 bytes.
  9978. MOVQ $0x00000200, CX
  9979. LEAQ 24(SP), DX
  9980. PXOR X0, X0
  9981. zero_loop_encodeSnappyBlockAsm64K:
  9982. MOVOU X0, (DX)
  9983. MOVOU X0, 16(DX)
  9984. MOVOU X0, 32(DX)
  9985. MOVOU X0, 48(DX)
  9986. MOVOU X0, 64(DX)
  9987. MOVOU X0, 80(DX)
  9988. MOVOU X0, 96(DX)
  9989. MOVOU X0, 112(DX)
  9990. ADDQ $0x80, DX
  9991. DECQ CX
  9992. JNZ zero_loop_encodeSnappyBlockAsm64K
  // nextEmit = 0; compute the search limit and the dst limit (see frame layout).
  9993. MOVL $0x00000000, 12(SP)
  9994. MOVQ src_len+32(FP), CX
  9995. LEAQ -9(CX), DX
  9996. LEAQ -8(CX), SI
  9997. MOVL SI, 8(SP)
  9998. SHRQ $0x05, CX
  9999. SUBL CX, DX
  10000. LEAQ (AX)(DX*1), DX
  10001. MOVQ DX, (SP)
  // s = 1, repeat = 1, DX = src base.
  10002. MOVL $0x00000001, CX
  10003. MOVL CX, 16(SP)
  10004. MOVQ src_base+24(FP), DX
  10005. search_loop_encodeSnappyBlockAsm64K:
  // nextS = s + 4 + (s - nextEmit)>>6; once it reaches the search limit,
  // stop searching and emit the rest as literals.
  10006. MOVL CX, SI
  10007. SUBL 12(SP), SI
  10008. SHRL $0x06, SI
  10009. LEAL 4(CX)(SI*1), SI
  10010. CMPL SI, 8(SP)
  10011. JGE emit_remainder_encodeSnappyBlockAsm64K
  // DI = 8 bytes at src[s]. Hash two 6-byte windows (s and s+1): the left
  // shift by 16 discards the two highest bytes, the multiply mixes, and the
  // right shift by 50 keeps the top 14 bits as the table index.
  10012. MOVQ (DX)(CX*1), DI
  10013. MOVL SI, 20(SP)
  10014. MOVQ $0x0000cf1bbcdcbf9b, R9
  10015. MOVQ DI, R10
  10016. MOVQ DI, R11
  10017. SHRQ $0x08, R11
  10018. SHLQ $0x10, R10
  10019. IMULQ R9, R10
  10020. SHRQ $0x32, R10
  10021. SHLQ $0x10, R11
  10022. IMULQ R9, R11
  10023. SHRQ $0x32, R11
  // SI = candidate for s, R8 = candidate for s+1; then record s and s+1.
  10024. MOVL 24(SP)(R10*4), SI
  10025. MOVL 24(SP)(R11*4), R8
  10026. MOVL CX, 24(SP)(R10*4)
  10027. LEAL 1(CX), R10
  10028. MOVL R10, 24(SP)(R11*4)
  // R10 = hash of the window starting at s+2 (looked up later if needed).
  10029. MOVQ DI, R10
  10030. SHRQ $0x10, R10
  10031. SHLQ $0x10, R10
  10032. IMULQ R9, R10
  10033. SHRQ $0x32, R10
  // Repeat check: compare the 4 bytes at src[s+1-repeat] with src[s+1..s+4].
  10034. MOVL CX, R9
  10035. SUBL 16(SP), R9
  10036. MOVL 1(DX)(R9*1), R11
  10037. MOVQ DI, R9
  10038. SHRQ $0x08, R9
  10039. CMPL R9, R11
  10040. JNE no_repeat_found_encodeSnappyBlockAsm64K
  // Repeat found at base = s+1 (DI). Extend it backwards while base > nextEmit,
  // the source position (R8 = base - repeat) is non-zero and the preceding
  // bytes are equal.
  10041. LEAL 1(CX), DI
  10042. MOVL 12(SP), SI
  10043. MOVL DI, R8
  10044. SUBL 16(SP), R8
  10045. JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
  10046. repeat_extend_back_loop_encodeSnappyBlockAsm64K:
  10047. CMPL DI, SI
  10048. JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
  10049. MOVB -1(DX)(R8*1), BL
  10050. MOVB -1(DX)(DI*1), R9
  10051. CMPB BL, R9
  10052. JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
  10053. LEAL -1(DI), DI
  10054. DECL R8
  10055. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
  10056. repeat_extend_back_end_encodeSnappyBlockAsm64K:
  // Emit src[nextEmit:base] as a literal (skipped when empty).
  // R8 = literal length, R9 = literal source pointer, SI = length - 1.
  10057. MOVL 12(SP), SI
  10058. CMPL SI, DI
  10059. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
  10060. MOVL DI, R8
  10061. MOVL DI, 12(SP)
  10062. LEAQ (DX)(SI*1), R9
  10063. SUBL SI, R8
  10064. LEAL -1(R8), SI
  // Literal header, chosen by length-1:
  //   < 60  : one byte, (length-1)<<2
  //   < 256 : 0xf0 followed by one byte (length-1)
  //   else  : 0xf4 followed by 16-bit (length-1)
  10065. CMPL SI, $0x3c
  10066. JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
  10067. CMPL SI, $0x00000100
  10068. JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
  10069. MOVB $0xf4, (AX)
  10070. MOVW SI, 1(AX)
  10071. ADDQ $0x03, AX
  10072. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
  10073. two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
  10074. MOVB $0xf0, (AX)
  10075. MOVB SI, 1(AX)
  10076. ADDQ $0x02, AX
  // Lengths up to 64 bytes use the short copy; longer ones use the loop copy.
  10077. CMPL SI, $0x40
  10078. JL memmove_repeat_emit_encodeSnappyBlockAsm64K
  10079. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
  10080. one_byte_repeat_emit_encodeSnappyBlockAsm64K:
  10081. SHLB $0x02, SI
  10082. MOVB SI, (AX)
  10083. ADDQ $0x01, AX
  10084. memmove_repeat_emit_encodeSnappyBlockAsm64K:
  // SI = dst pointer after the literal. The short copy dispatches on length
  // and uses overlapping head/tail loads and stores for each size class.
  10085. LEAQ (AX)(R8*1), SI
  10086. // genMemMoveShort
  10087. CMPQ R8, $0x08
  10088. JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
  10089. CMPQ R8, $0x10
  10090. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
  10091. CMPQ R8, $0x20
  10092. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
  10093. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
  10094. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
  // Always moves a full 8 bytes even when length < 8.
  // NOTE(review): presumably relies on slack after both buffers; verify against callers.
  10095. MOVQ (R9), R10
  10096. MOVQ R10, (AX)
  10097. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10098. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10099. MOVQ (R9), R10
  10100. MOVQ -8(R9)(R8*1), R9
  10101. MOVQ R10, (AX)
  10102. MOVQ R9, -8(AX)(R8*1)
  10103. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10104. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10105. MOVOU (R9), X0
  10106. MOVOU -16(R9)(R8*1), X1
  10107. MOVOU X0, (AX)
  10108. MOVOU X1, -16(AX)(R8*1)
  10109. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
  10110. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10111. MOVOU (R9), X0
  10112. MOVOU 16(R9), X1
  10113. MOVOU -32(R9)(R8*1), X2
  10114. MOVOU -16(R9)(R8*1), X3
  10115. MOVOU X0, (AX)
  10116. MOVOU X1, 16(AX)
  10117. MOVOU X2, -32(AX)(R8*1)
  10118. MOVOU X3, -16(AX)(R8*1)
  10119. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
  10120. MOVQ SI, AX
  10121. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
  10122. memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
  // Long copy: the first and last 32 bytes are loaded into X0-X3 up front and
  // stored unaligned at the end; the middle is copied 32 bytes per iteration.
  // R12 = 64 - (dst & 31), so dst + R12 - 32 is 32-byte aligned, which is what
  // the MOVOA stores require.
  // NOTE(review): DECQ does not modify CF, so the JA/JNA branches below depend
  // on CF left by the preceding SUBQ/ADDQ; kept exactly as generated.
  10123. LEAQ (AX)(R8*1), SI
  10124. // genMemMoveLong
  10125. MOVOU (R9), X0
  10126. MOVOU 16(R9), X1
  10127. MOVOU -32(R9)(R8*1), X2
  10128. MOVOU -16(R9)(R8*1), X3
  10129. MOVQ R8, R11
  10130. SHRQ $0x05, R11
  10131. MOVQ AX, R10
  10132. ANDL $0x0000001f, R10
  10133. MOVQ $0x00000040, R12
  10134. SUBQ R10, R12
  10135. DECQ R11
  10136. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10137. LEAQ -32(R9)(R12*1), R10
  10138. LEAQ -32(AX)(R12*1), R13
  10139. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
  10140. MOVOU (R10), X4
  10141. MOVOU 16(R10), X5
  10142. MOVOA X4, (R13)
  10143. MOVOA X5, 16(R13)
  10144. ADDQ $0x20, R13
  10145. ADDQ $0x20, R10
  10146. ADDQ $0x20, R12
  10147. DECQ R11
  10148. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
  10149. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  // Copy 32 bytes ending at offset R12; repeat while length >= R12.
  10150. MOVOU -32(R9)(R12*1), X4
  10151. MOVOU -16(R9)(R12*1), X5
  10152. MOVOA X4, -32(AX)(R12*1)
  10153. MOVOA X5, -16(AX)(R12*1)
  10154. ADDQ $0x20, R12
  10155. CMPQ R8, R12
  10156. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10157. MOVOU X0, (AX)
  10158. MOVOU X1, 16(AX)
  10159. MOVOU X2, -32(AX)(R8*1)
  10160. MOVOU X3, -16(AX)(R8*1)
  10161. MOVQ SI, AX
  10162. emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
  // s += 5 (past s+1 and the 4 bytes already compared). Extend the repeat
  // forward: R9 = &src[s], SI = &src[s-repeat], R8 = len(src) - s bytes left.
  10163. ADDL $0x05, CX
  10164. MOVL CX, SI
  10165. SUBL 16(SP), SI
  10166. MOVQ src_len+32(FP), R8
  10167. SUBL CX, R8
  10168. LEAQ (DX)(CX*1), R9
  10169. LEAQ (DX)(SI*1), SI
  // matchLen: R11 counts equal bytes. Compares 8 bytes at a time via XOR; on
  // a difference, (trailing zero bit count)>>3 is the index of the first
  // differing byte. Tails of 4, 2 and 1 bytes are handled separately.
  10170. // matchLen
  10171. XORL R11, R11
  10172. CMPL R8, $0x08
  10173. JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
  10174. matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
  10175. MOVQ (R9)(R11*1), R10
  10176. XORQ (SI)(R11*1), R10
  10177. TESTQ R10, R10
  10178. JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K
  10179. #ifdef GOAMD64_v3
  10180. TZCNTQ R10, R10
  10181. #else
  10182. BSFQ R10, R10
  10183. #endif
  10184. SARQ $0x03, R10
  10185. LEAL (R11)(R10*1), R11
  10186. JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10187. matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
  10188. LEAL -8(R8), R8
  10189. LEAL 8(R11), R11
  10190. CMPL R8, $0x08
  10191. JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K
  // The JZ below cannot be taken: equality with 8 is already covered by JGE.
  10192. JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10193. matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
  10194. CMPL R8, $0x04
  10195. JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
  10196. MOVL (R9)(R11*1), R10
  10197. CMPL (SI)(R11*1), R10
  10198. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
  10199. SUBL $0x04, R8
  10200. LEAL 4(R11), R11
  10201. matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
  10202. CMPL R8, $0x02
  10203. JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
  10204. MOVW (R9)(R11*1), R10
  10205. CMPW (SI)(R11*1), R10
  10206. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
  10207. SUBL $0x02, R8
  10208. LEAL 2(R11), R11
  10209. matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
  10210. CMPL R8, $0x01
  10211. JL repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10212. MOVB (R9)(R11*1), R10
  10213. CMPB (SI)(R11*1), R10
  10214. JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
  10215. LEAL 1(R11), R11
  10216. repeat_extend_forward_end_encodeSnappyBlockAsm64K:
  // s += matched; SI = s - base = total match length; DI = repeat offset.
  10217. ADDL R11, CX
  10218. MOVL CX, SI
  10219. SUBL DI, SI
  10220. MOVL 16(SP), DI
  10221. // emitCopy
  10222. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
  // While length > 64: emit a 3-byte op (0xee, 16-bit offset) and subtract 60
  // from the remaining length.
  10223. CMPL SI, $0x40
  10224. JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
  10225. MOVB $0xee, (AX)
  10226. MOVW DI, 1(AX)
  10227. LEAL -60(SI), SI
  10228. ADDQ $0x03, AX
  10229. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
  10230. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
  // length < 12 and offset < 2048: 2-byte op.
  //   byte0 = (length*4 - 16 + 1) | ((offset>>8)<<5), byte1 = low byte of offset.
  // Only BL is written before BX is used in the LEAL; that is sufficient
  // because only the low byte of the result is stored.
  10231. CMPL SI, $0x0c
  10232. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
  10233. CMPL DI, $0x00000800
  10234. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
  10235. MOVB $0x01, BL
  10236. LEAL -16(BX)(SI*4), SI
  10237. MOVB DI, 1(AX)
  10238. SHRL $0x08, DI
  10239. SHLL $0x05, DI
  10240. ORL DI, SI
  10241. MOVB SI, (AX)
  10242. ADDQ $0x02, AX
  10243. JMP repeat_end_emit_encodeSnappyBlockAsm64K
  10244. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
  // Otherwise: 3-byte op, byte0 = length*4 - 4 + 2, followed by 16-bit offset.
  10245. MOVB $0x02, BL
  10246. LEAL -4(BX)(SI*4), SI
  10247. MOVB SI, (AX)
  10248. MOVW DI, 1(AX)
  10249. ADDQ $0x03, AX
  10250. repeat_end_emit_encodeSnappyBlockAsm64K:
  // nextEmit = s; resume searching.
  10251. MOVL CX, 12(SP)
  10252. JMP search_loop_encodeSnappyBlockAsm64K
  10253. no_repeat_found_encodeSnappyBlockAsm64K:
  // Try up to three candidates, comparing 4 bytes each:
  //   src[SI] against src[s..], src[R8] against src[s+1..], then the table
  //   entry for s+2 against src[s+2..]. DI is shifted right by one byte
  //   between attempts so its low 32 bits hold the bytes being compared.
  10254. CMPL (DX)(SI*1), DI
  10255. JEQ candidate_match_encodeSnappyBlockAsm64K
  10256. SHRQ $0x08, DI
  10257. MOVL 24(SP)(R10*4), SI
  10258. LEAL 2(CX), R9
  10259. CMPL (DX)(R8*1), DI
  10260. JEQ candidate2_match_encodeSnappyBlockAsm64K
  10261. MOVL R9, 24(SP)(R10*4)
  10262. SHRQ $0x08, DI
  10263. CMPL (DX)(SI*1), DI
  10264. JEQ candidate3_match_encodeSnappyBlockAsm64K
  // No match: continue at nextS.
  10265. MOVL 20(SP), CX
  10266. JMP search_loop_encodeSnappyBlockAsm64K
  10267. candidate3_match_encodeSnappyBlockAsm64K:
  10268. ADDL $0x02, CX
  10269. JMP candidate_match_encodeSnappyBlockAsm64K
  10270. candidate2_match_encodeSnappyBlockAsm64K:
  // Record s+2 in the table, advance s by one and use R8 as the candidate.
  10271. MOVL R9, 24(SP)(R10*4)
  10272. INCL CX
  10273. MOVL R8, SI
  10274. candidate_match_encodeSnappyBlockAsm64K:
  // Extend the match backwards while s > nextEmit, candidate (SI) != 0 and
  // the preceding bytes are equal.
  10275. MOVL 12(SP), DI
  10276. TESTL SI, SI
  10277. JZ match_extend_back_end_encodeSnappyBlockAsm64K
  10278. match_extend_back_loop_encodeSnappyBlockAsm64K:
  10279. CMPL CX, DI
  10280. JLE match_extend_back_end_encodeSnappyBlockAsm64K
  10281. MOVB -1(DX)(SI*1), BL
  10282. MOVB -1(DX)(CX*1), R8
  10283. CMPB BL, R8
  10284. JNE match_extend_back_end_encodeSnappyBlockAsm64K
  10285. LEAL -1(CX), CX
  10286. DECL SI
  10287. JZ match_extend_back_end_encodeSnappyBlockAsm64K
  10288. JMP match_extend_back_loop_encodeSnappyBlockAsm64K
  10289. match_extend_back_end_encodeSnappyBlockAsm64K:
  // Bail out with 0 unless dst + (s - nextEmit) + 3 is below the dst limit.
  10290. MOVL CX, DI
  10291. SUBL 12(SP), DI
  10292. LEAQ 3(AX)(DI*1), DI
  10293. CMPQ DI, (SP)
  10294. JL match_dst_size_check_encodeSnappyBlockAsm64K
  10295. MOVQ $0x00000000, ret+48(FP)
  10296. RET
  10297. match_dst_size_check_encodeSnappyBlockAsm64K:
  // Emit src[nextEmit:s] as a literal (skipped when empty).
  // R9 = literal length, DI = literal source pointer, R8 = length - 1.
  // Header selection and copy routines mirror the repeat path above.
  10298. MOVL CX, DI
  10299. MOVL 12(SP), R8
  10300. CMPL R8, DI
  10301. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
  10302. MOVL DI, R9
  10303. MOVL DI, 12(SP)
  10304. LEAQ (DX)(R8*1), DI
  10305. SUBL R8, R9
  10306. LEAL -1(R9), R8
  10307. CMPL R8, $0x3c
  10308. JLT one_byte_match_emit_encodeSnappyBlockAsm64K
  10309. CMPL R8, $0x00000100
  10310. JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
  10311. MOVB $0xf4, (AX)
  10312. MOVW R8, 1(AX)
  10313. ADDQ $0x03, AX
  10314. JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
  10315. two_bytes_match_emit_encodeSnappyBlockAsm64K:
  10316. MOVB $0xf0, (AX)
  10317. MOVB R8, 1(AX)
  10318. ADDQ $0x02, AX
  10319. CMPL R8, $0x40
  10320. JL memmove_match_emit_encodeSnappyBlockAsm64K
  10321. JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
  10322. one_byte_match_emit_encodeSnappyBlockAsm64K:
  10323. SHLB $0x02, R8
  10324. MOVB R8, (AX)
  10325. ADDQ $0x01, AX
  10326. memmove_match_emit_encodeSnappyBlockAsm64K:
  // R8 = dst pointer after the literal.
  10327. LEAQ (AX)(R9*1), R8
  10328. // genMemMoveShort
  10329. CMPQ R9, $0x08
  10330. JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
  10331. CMPQ R9, $0x10
  10332. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
  10333. CMPQ R9, $0x20
  10334. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
  10335. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
  10336. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
  10337. MOVQ (DI), R10
  10338. MOVQ R10, (AX)
  10339. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10340. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10341. MOVQ (DI), R10
  10342. MOVQ -8(DI)(R9*1), DI
  10343. MOVQ R10, (AX)
  10344. MOVQ DI, -8(AX)(R9*1)
  10345. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10346. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10347. MOVOU (DI), X0
  10348. MOVOU -16(DI)(R9*1), X1
  10349. MOVOU X0, (AX)
  10350. MOVOU X1, -16(AX)(R9*1)
  10351. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
  10352. emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10353. MOVOU (DI), X0
  10354. MOVOU 16(DI), X1
  10355. MOVOU -32(DI)(R9*1), X2
  10356. MOVOU -16(DI)(R9*1), X3
  10357. MOVOU X0, (AX)
  10358. MOVOU X1, 16(AX)
  10359. MOVOU X2, -32(AX)(R9*1)
  10360. MOVOU X3, -16(AX)(R9*1)
  10361. memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
  10362. MOVQ R8, AX
  10363. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
  10364. memmove_long_match_emit_encodeSnappyBlockAsm64K:
  // Same long-copy scheme as in the repeat path: head/tail saved in X0-X3,
  // 32-byte aligned stores for the middle.
  10365. LEAQ (AX)(R9*1), R8
  10366. // genMemMoveLong
  10367. MOVOU (DI), X0
  10368. MOVOU 16(DI), X1
  10369. MOVOU -32(DI)(R9*1), X2
  10370. MOVOU -16(DI)(R9*1), X3
  10371. MOVQ R9, R11
  10372. SHRQ $0x05, R11
  10373. MOVQ AX, R10
  10374. ANDL $0x0000001f, R10
  10375. MOVQ $0x00000040, R12
  10376. SUBQ R10, R12
  10377. DECQ R11
  10378. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10379. LEAQ -32(DI)(R12*1), R10
  10380. LEAQ -32(AX)(R12*1), R13
  10381. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
  10382. MOVOU (R10), X4
  10383. MOVOU 16(R10), X5
  10384. MOVOA X4, (R13)
  10385. MOVOA X5, 16(R13)
  10386. ADDQ $0x20, R13
  10387. ADDQ $0x20, R10
  10388. ADDQ $0x20, R12
  10389. DECQ R11
  10390. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
  10391. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  10392. MOVOU -32(DI)(R12*1), X4
  10393. MOVOU -16(DI)(R12*1), X5
  10394. MOVOA X4, -32(AX)(R12*1)
  10395. MOVOA X5, -16(AX)(R12*1)
  10396. ADDQ $0x20, R12
  10397. CMPQ R9, R12
  10398. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10399. MOVOU X0, (AX)
  10400. MOVOU X1, 16(AX)
  10401. MOVOU X2, -32(AX)(R9*1)
  10402. MOVOU X3, -16(AX)(R9*1)
  10403. MOVQ R8, AX
  10404. emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
  10405. match_nolit_loop_encodeSnappyBlockAsm64K:
  // repeat = s - candidate. Skip the 4 bytes already known to match, then
  // measure how much further the match extends:
  // R8 = &src[s], SI = &src[candidate], DI = len(src) - s bytes left.
  10406. MOVL CX, DI
  10407. SUBL SI, DI
  10408. MOVL DI, 16(SP)
  10409. ADDL $0x04, CX
  10410. ADDL $0x04, SI
  10411. MOVQ src_len+32(FP), DI
  10412. SUBL CX, DI
  10413. LEAQ (DX)(CX*1), R8
  10414. LEAQ (DX)(SI*1), SI
  // matchLen: R10 counts equal bytes (same scheme as in the repeat path).
  10415. // matchLen
  10416. XORL R10, R10
  10417. CMPL DI, $0x08
  10418. JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
  10419. matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
  10420. MOVQ (R8)(R10*1), R9
  10421. XORQ (SI)(R10*1), R9
  10422. TESTQ R9, R9
  10423. JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K
  10424. #ifdef GOAMD64_v3
  10425. TZCNTQ R9, R9
  10426. #else
  10427. BSFQ R9, R9
  10428. #endif
  10429. SARQ $0x03, R9
  10430. LEAL (R10)(R9*1), R10
  10431. JMP match_nolit_end_encodeSnappyBlockAsm64K
  10432. matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
  10433. LEAL -8(DI), DI
  10434. LEAL 8(R10), R10
  10435. CMPL DI, $0x08
  10436. JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K
  // The JZ below cannot be taken: equality with 8 is already covered by JGE.
  10437. JZ match_nolit_end_encodeSnappyBlockAsm64K
  10438. matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
  10439. CMPL DI, $0x04
  10440. JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
  10441. MOVL (R8)(R10*1), R9
  10442. CMPL (SI)(R10*1), R9
  10443. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
  10444. SUBL $0x04, DI
  10445. LEAL 4(R10), R10
  10446. matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
  10447. CMPL DI, $0x02
  10448. JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
  10449. MOVW (R8)(R10*1), R9
  10450. CMPW (SI)(R10*1), R9
  10451. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
  10452. SUBL $0x02, DI
  10453. LEAL 2(R10), R10
  10454. matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
  10455. CMPL DI, $0x01
  10456. JL match_nolit_end_encodeSnappyBlockAsm64K
  10457. MOVB (R8)(R10*1), R9
  10458. CMPB (SI)(R10*1), R9
  10459. JNE match_nolit_end_encodeSnappyBlockAsm64K
  10460. LEAL 1(R10), R10
  10461. match_nolit_end_encodeSnappyBlockAsm64K:
  // s += matched; SI = repeat offset; R10 = total match length (matched + 4);
  // nextEmit = s.
  10462. ADDL R10, CX
  10463. MOVL 16(SP), SI
  10464. ADDL $0x04, R10
  10465. MOVL CX, 12(SP)
  // Copy emission: same encoding as in the repeat path, with R10 = length
  // and SI = offset.
  10466. // emitCopy
  10467. two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
  10468. CMPL R10, $0x40
  10469. JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
  10470. MOVB $0xee, (AX)
  10471. MOVW SI, 1(AX)
  10472. LEAL -60(R10), R10
  10473. ADDQ $0x03, AX
  10474. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
  10475. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
  10476. CMPL R10, $0x0c
  10477. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
  10478. CMPL SI, $0x00000800
  10479. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
  10480. MOVB $0x01, BL
  10481. LEAL -16(BX)(R10*4), R10
  10482. MOVB SI, 1(AX)
  10483. SHRL $0x08, SI
  10484. SHLL $0x05, SI
  10485. ORL SI, R10
  10486. MOVB R10, (AX)
  10487. ADDQ $0x02, AX
  10488. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
  10489. emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
  10490. MOVB $0x02, BL
  10491. LEAL -4(BX)(R10*4), R10
  10492. MOVB R10, (AX)
  10493. MOVW SI, 1(AX)
  10494. ADDQ $0x03, AX
  10495. match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
  // Stop searching once s reaches the search limit. Otherwise load 8 bytes at
  // src[s-2] and bail out with 0 if dst has reached its limit.
  10496. CMPL CX, 8(SP)
  10497. JGE emit_remainder_encodeSnappyBlockAsm64K
  10498. MOVQ -2(DX)(CX*1), DI
  10499. CMPQ AX, (SP)
  10500. JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
  10501. MOVQ $0x00000000, ret+48(FP)
  10502. RET
  10503. match_nolit_dst_ok_encodeSnappyBlockAsm64K:
  // R8 = hash of the window at s-2, SI = hash of the window at s (DI >>= 16).
  // Read the candidate for s, then store s-2 and s in the table. If the 4
  // bytes at the candidate equal src[s..s+3], emit another match immediately
  // with no literal in between; otherwise s++ and go back to searching.
  10504. MOVQ $0x0000cf1bbcdcbf9b, R9
  10505. MOVQ DI, R8
  10506. SHRQ $0x10, DI
  10507. MOVQ DI, SI
  10508. SHLQ $0x10, R8
  10509. IMULQ R9, R8
  10510. SHRQ $0x32, R8
  10511. SHLQ $0x10, SI
  10512. IMULQ R9, SI
  10513. SHRQ $0x32, SI
  10514. LEAL -2(CX), R9
  10515. LEAQ 24(SP)(SI*4), R10
  10516. MOVL (R10), SI
  10517. MOVL R9, 24(SP)(R8*4)
  10518. MOVL CX, (R10)
  10519. CMPL (DX)(SI*1), DI
  10520. JEQ match_nolit_loop_encodeSnappyBlockAsm64K
  10521. INCL CX
  10522. JMP search_loop_encodeSnappyBlockAsm64K
  10523. emit_remainder_encodeSnappyBlockAsm64K:
  // Bail out with 0 unless dst + (len(src) - nextEmit) + 3 is below the limit.
  10524. MOVQ src_len+32(FP), CX
  10525. SUBL 12(SP), CX
  10526. LEAQ 3(AX)(CX*1), CX
  10527. CMPQ CX, (SP)
  10528. JL emit_remainder_ok_encodeSnappyBlockAsm64K
  10529. MOVQ $0x00000000, ret+48(FP)
  10530. RET
  10531. emit_remainder_ok_encodeSnappyBlockAsm64K:
  // Emit src[nextEmit:len(src)] as the final literal (skipped when empty).
  // From here on CX = literal source pointer, SI = length, DX = length - 1
  // (DX no longer holds the src base).
  10532. MOVQ src_len+32(FP), CX
  10533. MOVL 12(SP), BX
  10534. CMPL BX, CX
  10535. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
  10536. MOVL CX, SI
  10537. MOVL CX, 12(SP)
  10538. LEAQ (DX)(BX*1), CX
  10539. SUBL BX, SI
  10540. LEAL -1(SI), DX
  10541. CMPL DX, $0x3c
  10542. JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K
  10543. CMPL DX, $0x00000100
  10544. JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K
  10545. MOVB $0xf4, (AX)
  10546. MOVW DX, 1(AX)
  10547. ADDQ $0x03, AX
  10548. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
  10549. two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
  10550. MOVB $0xf0, (AX)
  10551. MOVB DL, 1(AX)
  10552. ADDQ $0x02, AX
  10553. CMPL DX, $0x40
  10554. JL memmove_emit_remainder_encodeSnappyBlockAsm64K
  10555. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
  10556. one_byte_emit_remainder_encodeSnappyBlockAsm64K:
  10557. SHLB $0x02, DL
  10558. MOVB DL, (AX)
  10559. ADDQ $0x01, AX
  10560. memmove_emit_remainder_encodeSnappyBlockAsm64K:
  // DX = dst pointer after the literal, BX = length. Unlike the short copies
  // above, this variant has exact size classes for 1-2, 3 and 4-7 bytes, so
  // it never reads or writes outside [0, length).
  10561. LEAQ (AX)(SI*1), DX
  10562. MOVL SI, BX
  10563. // genMemMoveShort
  10564. CMPQ BX, $0x03
  10565. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
  10566. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
  10567. CMPQ BX, $0x08
  10568. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
  10569. CMPQ BX, $0x10
  10570. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
  10571. CMPQ BX, $0x20
  10572. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
  10573. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
  10574. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
  10575. MOVB (CX), SI
  10576. MOVB -1(CX)(BX*1), CL
  10577. MOVB SI, (AX)
  10578. MOVB CL, -1(AX)(BX*1)
  10579. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10580. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
  10581. MOVW (CX), SI
  10582. MOVB 2(CX), CL
  10583. MOVW SI, (AX)
  10584. MOVB CL, 2(AX)
  10585. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10586. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
  10587. MOVL (CX), SI
  10588. MOVL -4(CX)(BX*1), CX
  10589. MOVL SI, (AX)
  10590. MOVL CX, -4(AX)(BX*1)
  10591. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10592. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
  10593. MOVQ (CX), SI
  10594. MOVQ -8(CX)(BX*1), CX
  10595. MOVQ SI, (AX)
  10596. MOVQ CX, -8(AX)(BX*1)
  10597. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10598. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
  10599. MOVOU (CX), X0
  10600. MOVOU -16(CX)(BX*1), X1
  10601. MOVOU X0, (AX)
  10602. MOVOU X1, -16(AX)(BX*1)
  10603. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
  10604. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
  10605. MOVOU (CX), X0
  10606. MOVOU 16(CX), X1
  10607. MOVOU -32(CX)(BX*1), X2
  10608. MOVOU -16(CX)(BX*1), X3
  10609. MOVOU X0, (AX)
  10610. MOVOU X1, 16(AX)
  10611. MOVOU X2, -32(AX)(BX*1)
  10612. MOVOU X3, -16(AX)(BX*1)
  10613. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
  10614. MOVQ DX, AX
  10615. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
  10616. memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
  // Same long-copy scheme as above: head/tail saved in X0-X3, 32-byte
  // aligned stores for the middle, R8 = running offset.
  10617. LEAQ (AX)(SI*1), DX
  10618. MOVL SI, BX
  10619. // genMemMoveLong
  10620. MOVOU (CX), X0
  10621. MOVOU 16(CX), X1
  10622. MOVOU -32(CX)(BX*1), X2
  10623. MOVOU -16(CX)(BX*1), X3
  10624. MOVQ BX, DI
  10625. SHRQ $0x05, DI
  10626. MOVQ AX, SI
  10627. ANDL $0x0000001f, SI
  10628. MOVQ $0x00000040, R8
  10629. SUBQ SI, R8
  10630. DECQ DI
  10631. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10632. LEAQ -32(CX)(R8*1), SI
  10633. LEAQ -32(AX)(R8*1), R9
  10634. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
  10635. MOVOU (SI), X4
  10636. MOVOU 16(SI), X5
  10637. MOVOA X4, (R9)
  10638. MOVOA X5, 16(R9)
  10639. ADDQ $0x20, R9
  10640. ADDQ $0x20, SI
  10641. ADDQ $0x20, R8
  10642. DECQ DI
  10643. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
  10644. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
  10645. MOVOU -32(CX)(R8*1), X4
  10646. MOVOU -16(CX)(R8*1), X5
  10647. MOVOA X4, -32(AX)(R8*1)
  10648. MOVOA X5, -16(AX)(R8*1)
  10649. ADDQ $0x20, R8
  10650. CMPQ BX, R8
  10651. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
  10652. MOVOU X0, (AX)
  10653. MOVOU X1, 16(AX)
  10654. MOVOU X2, -32(AX)(BX*1)
  10655. MOVOU X3, -16(AX)(BX*1)
  10656. MOVQ DX, AX
  10657. emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
  // Return the number of bytes written: dst write pointer - dst_base.
  10658. MOVQ dst_base+0(FP), CX
  10659. SUBQ CX, AX
  10660. MOVQ AX, ret+48(FP)
  10661. RET
  10662. // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
  10663. // Requires: BMI, SSE2
  10664. TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
  10665. MOVQ dst_base+0(FP), AX
  10666. MOVQ $0x00000080, CX
  10667. LEAQ 24(SP), DX
  10668. PXOR X0, X0
  10669. zero_loop_encodeSnappyBlockAsm12B:
  10670. MOVOU X0, (DX)
  10671. MOVOU X0, 16(DX)
  10672. MOVOU X0, 32(DX)
  10673. MOVOU X0, 48(DX)
  10674. MOVOU X0, 64(DX)
  10675. MOVOU X0, 80(DX)
  10676. MOVOU X0, 96(DX)
  10677. MOVOU X0, 112(DX)
  10678. ADDQ $0x80, DX
  10679. DECQ CX
  10680. JNZ zero_loop_encodeSnappyBlockAsm12B
  10681. MOVL $0x00000000, 12(SP)
  10682. MOVQ src_len+32(FP), CX
  10683. LEAQ -9(CX), DX
  10684. LEAQ -8(CX), SI
  10685. MOVL SI, 8(SP)
  10686. SHRQ $0x05, CX
  10687. SUBL CX, DX
  10688. LEAQ (AX)(DX*1), DX
  10689. MOVQ DX, (SP)
  10690. MOVL $0x00000001, CX
  10691. MOVL CX, 16(SP)
  10692. MOVQ src_base+24(FP), DX
  10693. search_loop_encodeSnappyBlockAsm12B:
  10694. MOVL CX, SI
  10695. SUBL 12(SP), SI
  10696. SHRL $0x05, SI
  10697. LEAL 4(CX)(SI*1), SI
  10698. CMPL SI, 8(SP)
  10699. JGE emit_remainder_encodeSnappyBlockAsm12B
  10700. MOVQ (DX)(CX*1), DI
  10701. MOVL SI, 20(SP)
  10702. MOVQ $0x000000cf1bbcdcbb, R9
  10703. MOVQ DI, R10
  10704. MOVQ DI, R11
  10705. SHRQ $0x08, R11
  10706. SHLQ $0x18, R10
  10707. IMULQ R9, R10
  10708. SHRQ $0x34, R10
  10709. SHLQ $0x18, R11
  10710. IMULQ R9, R11
  10711. SHRQ $0x34, R11
  10712. MOVL 24(SP)(R10*4), SI
  10713. MOVL 24(SP)(R11*4), R8
  10714. MOVL CX, 24(SP)(R10*4)
  10715. LEAL 1(CX), R10
  10716. MOVL R10, 24(SP)(R11*4)
  10717. MOVQ DI, R10
  10718. SHRQ $0x10, R10
  10719. SHLQ $0x18, R10
  10720. IMULQ R9, R10
  10721. SHRQ $0x34, R10
  10722. MOVL CX, R9
  10723. SUBL 16(SP), R9
  10724. MOVL 1(DX)(R9*1), R11
  10725. MOVQ DI, R9
  10726. SHRQ $0x08, R9
  10727. CMPL R9, R11
  10728. JNE no_repeat_found_encodeSnappyBlockAsm12B
  10729. LEAL 1(CX), DI
  10730. MOVL 12(SP), SI
  10731. MOVL DI, R8
  10732. SUBL 16(SP), R8
  10733. JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
  10734. repeat_extend_back_loop_encodeSnappyBlockAsm12B:
  10735. CMPL DI, SI
  10736. JLE repeat_extend_back_end_encodeSnappyBlockAsm12B
  10737. MOVB -1(DX)(R8*1), BL
  10738. MOVB -1(DX)(DI*1), R9
  10739. CMPB BL, R9
  10740. JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
  10741. LEAL -1(DI), DI
  10742. DECL R8
  10743. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
  10744. repeat_extend_back_end_encodeSnappyBlockAsm12B:
  10745. MOVL 12(SP), SI
  10746. CMPL SI, DI
  10747. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
  10748. MOVL DI, R8
  10749. MOVL DI, 12(SP)
  10750. LEAQ (DX)(SI*1), R9
  10751. SUBL SI, R8
  10752. LEAL -1(R8), SI
  10753. CMPL SI, $0x3c
  10754. JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B
  10755. CMPL SI, $0x00000100
  10756. JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B
  10757. MOVB $0xf4, (AX)
  10758. MOVW SI, 1(AX)
  10759. ADDQ $0x03, AX
  10760. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
  10761. two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
  10762. MOVB $0xf0, (AX)
  10763. MOVB SI, 1(AX)
  10764. ADDQ $0x02, AX
  10765. CMPL SI, $0x40
  10766. JL memmove_repeat_emit_encodeSnappyBlockAsm12B
  10767. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
  10768. one_byte_repeat_emit_encodeSnappyBlockAsm12B:
  10769. SHLB $0x02, SI
  10770. MOVB SI, (AX)
  10771. ADDQ $0x01, AX
  10772. memmove_repeat_emit_encodeSnappyBlockAsm12B:
  10773. LEAQ (AX)(R8*1), SI
  10774. // genMemMoveShort
  10775. CMPQ R8, $0x08
  10776. JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
  10777. CMPQ R8, $0x10
  10778. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
  10779. CMPQ R8, $0x20
  10780. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
  10781. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
  10782. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
  10783. MOVQ (R9), R10
  10784. MOVQ R10, (AX)
  10785. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  10786. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
  10787. MOVQ (R9), R10
  10788. MOVQ -8(R9)(R8*1), R9
  10789. MOVQ R10, (AX)
  10790. MOVQ R9, -8(AX)(R8*1)
  10791. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  10792. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
  10793. MOVOU (R9), X0
  10794. MOVOU -16(R9)(R8*1), X1
  10795. MOVOU X0, (AX)
  10796. MOVOU X1, -16(AX)(R8*1)
  10797. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
  10798. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
  10799. MOVOU (R9), X0
  10800. MOVOU 16(R9), X1
  10801. MOVOU -32(R9)(R8*1), X2
  10802. MOVOU -16(R9)(R8*1), X3
  10803. MOVOU X0, (AX)
  10804. MOVOU X1, 16(AX)
  10805. MOVOU X2, -32(AX)(R8*1)
  10806. MOVOU X3, -16(AX)(R8*1)
  10807. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
  10808. MOVQ SI, AX
  10809. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
  10810. memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
  10811. LEAQ (AX)(R8*1), SI
  10812. // genMemMoveLong
  10813. MOVOU (R9), X0
  10814. MOVOU 16(R9), X1
  10815. MOVOU -32(R9)(R8*1), X2
  10816. MOVOU -16(R9)(R8*1), X3
  10817. MOVQ R8, R11
  10818. SHRQ $0x05, R11
  10819. MOVQ AX, R10
  10820. ANDL $0x0000001f, R10
  10821. MOVQ $0x00000040, R12
  10822. SUBQ R10, R12
  10823. DECQ R11
  10824. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  10825. LEAQ -32(R9)(R12*1), R10
  10826. LEAQ -32(AX)(R12*1), R13
  10827. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
  10828. MOVOU (R10), X4
  10829. MOVOU 16(R10), X5
  10830. MOVOA X4, (R13)
  10831. MOVOA X5, 16(R13)
  10832. ADDQ $0x20, R13
  10833. ADDQ $0x20, R10
  10834. ADDQ $0x20, R12
  10835. DECQ R11
  10836. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
  10837. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  10838. MOVOU -32(R9)(R12*1), X4
  10839. MOVOU -16(R9)(R12*1), X5
  10840. MOVOA X4, -32(AX)(R12*1)
  10841. MOVOA X5, -16(AX)(R12*1)
  10842. ADDQ $0x20, R12
  10843. CMPQ R8, R12
  10844. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  10845. MOVOU X0, (AX)
  10846. MOVOU X1, 16(AX)
  10847. MOVOU X2, -32(AX)(R8*1)
  10848. MOVOU X3, -16(AX)(R8*1)
  10849. MOVQ SI, AX
  10850. emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
  10851. ADDL $0x05, CX
  10852. MOVL CX, SI
  10853. SUBL 16(SP), SI
  10854. MOVQ src_len+32(FP), R8
  10855. SUBL CX, R8
  10856. LEAQ (DX)(CX*1), R9
  10857. LEAQ (DX)(SI*1), SI
  10858. // matchLen
  10859. XORL R11, R11
  10860. CMPL R8, $0x08
  10861. JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
  10862. matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B:
  10863. MOVQ (R9)(R11*1), R10
  10864. XORQ (SI)(R11*1), R10
  10865. TESTQ R10, R10
  10866. JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B
  10867. #ifdef GOAMD64_v3
  10868. TZCNTQ R10, R10
  10869. #else
  10870. BSFQ R10, R10
  10871. #endif
  10872. SARQ $0x03, R10
  10873. LEAL (R11)(R10*1), R11
  10874. JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
  10875. matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B:
  10876. LEAL -8(R8), R8
  10877. LEAL 8(R11), R11
  10878. CMPL R8, $0x08
  10879. JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B
  10880. JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
  10881. matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
  10882. CMPL R8, $0x04
  10883. JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
  10884. MOVL (R9)(R11*1), R10
  10885. CMPL (SI)(R11*1), R10
  10886. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
  10887. SUBL $0x04, R8
  10888. LEAL 4(R11), R11
  10889. matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
  10890. CMPL R8, $0x02
  10891. JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
  10892. MOVW (R9)(R11*1), R10
  10893. CMPW (SI)(R11*1), R10
  10894. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
  10895. SUBL $0x02, R8
  10896. LEAL 2(R11), R11
  10897. matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
  10898. CMPL R8, $0x01
  10899. JL repeat_extend_forward_end_encodeSnappyBlockAsm12B
  10900. MOVB (R9)(R11*1), R10
  10901. CMPB (SI)(R11*1), R10
  10902. JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
  10903. LEAL 1(R11), R11
  10904. repeat_extend_forward_end_encodeSnappyBlockAsm12B:
  10905. ADDL R11, CX
  10906. MOVL CX, SI
  10907. SUBL DI, SI
  10908. MOVL 16(SP), DI
  10909. // emitCopy
  10910. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
  10911. CMPL SI, $0x40
  10912. JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
  10913. MOVB $0xee, (AX)
  10914. MOVW DI, 1(AX)
  10915. LEAL -60(SI), SI
  10916. ADDQ $0x03, AX
  10917. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
  10918. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
  10919. CMPL SI, $0x0c
  10920. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
  10921. CMPL DI, $0x00000800
  10922. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
  10923. MOVB $0x01, BL
  10924. LEAL -16(BX)(SI*4), SI
  10925. MOVB DI, 1(AX)
  10926. SHRL $0x08, DI
  10927. SHLL $0x05, DI
  10928. ORL DI, SI
  10929. MOVB SI, (AX)
  10930. ADDQ $0x02, AX
  10931. JMP repeat_end_emit_encodeSnappyBlockAsm12B
  10932. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
  10933. MOVB $0x02, BL
  10934. LEAL -4(BX)(SI*4), SI
  10935. MOVB SI, (AX)
  10936. MOVW DI, 1(AX)
  10937. ADDQ $0x03, AX
  10938. repeat_end_emit_encodeSnappyBlockAsm12B:
  10939. MOVL CX, 12(SP)
  10940. JMP search_loop_encodeSnappyBlockAsm12B
  10941. no_repeat_found_encodeSnappyBlockAsm12B:
  10942. CMPL (DX)(SI*1), DI
  10943. JEQ candidate_match_encodeSnappyBlockAsm12B
  10944. SHRQ $0x08, DI
  10945. MOVL 24(SP)(R10*4), SI
  10946. LEAL 2(CX), R9
  10947. CMPL (DX)(R8*1), DI
  10948. JEQ candidate2_match_encodeSnappyBlockAsm12B
  10949. MOVL R9, 24(SP)(R10*4)
  10950. SHRQ $0x08, DI
  10951. CMPL (DX)(SI*1), DI
  10952. JEQ candidate3_match_encodeSnappyBlockAsm12B
  10953. MOVL 20(SP), CX
  10954. JMP search_loop_encodeSnappyBlockAsm12B
  10955. candidate3_match_encodeSnappyBlockAsm12B:
  10956. ADDL $0x02, CX
  10957. JMP candidate_match_encodeSnappyBlockAsm12B
  10958. candidate2_match_encodeSnappyBlockAsm12B:
  10959. MOVL R9, 24(SP)(R10*4)
  10960. INCL CX
  10961. MOVL R8, SI
  10962. candidate_match_encodeSnappyBlockAsm12B:
  10963. MOVL 12(SP), DI
  10964. TESTL SI, SI
  10965. JZ match_extend_back_end_encodeSnappyBlockAsm12B
  10966. match_extend_back_loop_encodeSnappyBlockAsm12B:
  10967. CMPL CX, DI
  10968. JLE match_extend_back_end_encodeSnappyBlockAsm12B
  10969. MOVB -1(DX)(SI*1), BL
  10970. MOVB -1(DX)(CX*1), R8
  10971. CMPB BL, R8
  10972. JNE match_extend_back_end_encodeSnappyBlockAsm12B
  10973. LEAL -1(CX), CX
  10974. DECL SI
  10975. JZ match_extend_back_end_encodeSnappyBlockAsm12B
  10976. JMP match_extend_back_loop_encodeSnappyBlockAsm12B
  10977. match_extend_back_end_encodeSnappyBlockAsm12B:
  10978. MOVL CX, DI
  10979. SUBL 12(SP), DI
  10980. LEAQ 3(AX)(DI*1), DI
  10981. CMPQ DI, (SP)
  10982. JL match_dst_size_check_encodeSnappyBlockAsm12B
  10983. MOVQ $0x00000000, ret+48(FP)
  10984. RET
  10985. match_dst_size_check_encodeSnappyBlockAsm12B:
  10986. MOVL CX, DI
  10987. MOVL 12(SP), R8
  10988. CMPL R8, DI
  10989. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
  10990. MOVL DI, R9
  10991. MOVL DI, 12(SP)
  10992. LEAQ (DX)(R8*1), DI
  10993. SUBL R8, R9
  10994. LEAL -1(R9), R8
  10995. CMPL R8, $0x3c
  10996. JLT one_byte_match_emit_encodeSnappyBlockAsm12B
  10997. CMPL R8, $0x00000100
  10998. JLT two_bytes_match_emit_encodeSnappyBlockAsm12B
  10999. MOVB $0xf4, (AX)
  11000. MOVW R8, 1(AX)
  11001. ADDQ $0x03, AX
  11002. JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
  11003. two_bytes_match_emit_encodeSnappyBlockAsm12B:
  11004. MOVB $0xf0, (AX)
  11005. MOVB R8, 1(AX)
  11006. ADDQ $0x02, AX
  11007. CMPL R8, $0x40
  11008. JL memmove_match_emit_encodeSnappyBlockAsm12B
  11009. JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
  11010. one_byte_match_emit_encodeSnappyBlockAsm12B:
  11011. SHLB $0x02, R8
  11012. MOVB R8, (AX)
  11013. ADDQ $0x01, AX
  11014. memmove_match_emit_encodeSnappyBlockAsm12B:
  11015. LEAQ (AX)(R9*1), R8
  11016. // genMemMoveShort
  11017. CMPQ R9, $0x08
  11018. JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
  11019. CMPQ R9, $0x10
  11020. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
  11021. CMPQ R9, $0x20
  11022. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
  11023. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
  11024. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
  11025. MOVQ (DI), R10
  11026. MOVQ R10, (AX)
  11027. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11028. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
  11029. MOVQ (DI), R10
  11030. MOVQ -8(DI)(R9*1), DI
  11031. MOVQ R10, (AX)
  11032. MOVQ DI, -8(AX)(R9*1)
  11033. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11034. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
  11035. MOVOU (DI), X0
  11036. MOVOU -16(DI)(R9*1), X1
  11037. MOVOU X0, (AX)
  11038. MOVOU X1, -16(AX)(R9*1)
  11039. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
  11040. emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
  11041. MOVOU (DI), X0
  11042. MOVOU 16(DI), X1
  11043. MOVOU -32(DI)(R9*1), X2
  11044. MOVOU -16(DI)(R9*1), X3
  11045. MOVOU X0, (AX)
  11046. MOVOU X1, 16(AX)
  11047. MOVOU X2, -32(AX)(R9*1)
  11048. MOVOU X3, -16(AX)(R9*1)
  11049. memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
  11050. MOVQ R8, AX
  11051. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
  11052. memmove_long_match_emit_encodeSnappyBlockAsm12B:
  11053. LEAQ (AX)(R9*1), R8
  11054. // genMemMoveLong
  11055. MOVOU (DI), X0
  11056. MOVOU 16(DI), X1
  11057. MOVOU -32(DI)(R9*1), X2
  11058. MOVOU -16(DI)(R9*1), X3
  11059. MOVQ R9, R11
  11060. SHRQ $0x05, R11
  11061. MOVQ AX, R10
  11062. ANDL $0x0000001f, R10
  11063. MOVQ $0x00000040, R12
  11064. SUBQ R10, R12
  11065. DECQ R11
  11066. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11067. LEAQ -32(DI)(R12*1), R10
  11068. LEAQ -32(AX)(R12*1), R13
  11069. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
  11070. MOVOU (R10), X4
  11071. MOVOU 16(R10), X5
  11072. MOVOA X4, (R13)
  11073. MOVOA X5, 16(R13)
  11074. ADDQ $0x20, R13
  11075. ADDQ $0x20, R10
  11076. ADDQ $0x20, R12
  11077. DECQ R11
  11078. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
  11079. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  11080. MOVOU -32(DI)(R12*1), X4
  11081. MOVOU -16(DI)(R12*1), X5
  11082. MOVOA X4, -32(AX)(R12*1)
  11083. MOVOA X5, -16(AX)(R12*1)
  11084. ADDQ $0x20, R12
  11085. CMPQ R9, R12
  11086. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11087. MOVOU X0, (AX)
  11088. MOVOU X1, 16(AX)
  11089. MOVOU X2, -32(AX)(R9*1)
  11090. MOVOU X3, -16(AX)(R9*1)
  11091. MOVQ R8, AX
  11092. emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
  11093. match_nolit_loop_encodeSnappyBlockAsm12B:
  11094. MOVL CX, DI
  11095. SUBL SI, DI
  11096. MOVL DI, 16(SP)
  11097. ADDL $0x04, CX
  11098. ADDL $0x04, SI
  11099. MOVQ src_len+32(FP), DI
  11100. SUBL CX, DI
  11101. LEAQ (DX)(CX*1), R8
  11102. LEAQ (DX)(SI*1), SI
  11103. // matchLen
  11104. XORL R10, R10
  11105. CMPL DI, $0x08
  11106. JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
  11107. matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B:
  11108. MOVQ (R8)(R10*1), R9
  11109. XORQ (SI)(R10*1), R9
  11110. TESTQ R9, R9
  11111. JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B
  11112. #ifdef GOAMD64_v3
  11113. TZCNTQ R9, R9
  11114. #else
  11115. BSFQ R9, R9
  11116. #endif
  11117. SARQ $0x03, R9
  11118. LEAL (R10)(R9*1), R10
  11119. JMP match_nolit_end_encodeSnappyBlockAsm12B
  11120. matchlen_loop_match_nolit_encodeSnappyBlockAsm12B:
  11121. LEAL -8(DI), DI
  11122. LEAL 8(R10), R10
  11123. CMPL DI, $0x08
  11124. JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B
  11125. JZ match_nolit_end_encodeSnappyBlockAsm12B
  11126. matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
  11127. CMPL DI, $0x04
  11128. JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
  11129. MOVL (R8)(R10*1), R9
  11130. CMPL (SI)(R10*1), R9
  11131. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
  11132. SUBL $0x04, DI
  11133. LEAL 4(R10), R10
  11134. matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
  11135. CMPL DI, $0x02
  11136. JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
  11137. MOVW (R8)(R10*1), R9
  11138. CMPW (SI)(R10*1), R9
  11139. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
  11140. SUBL $0x02, DI
  11141. LEAL 2(R10), R10
  11142. matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
  11143. CMPL DI, $0x01
  11144. JL match_nolit_end_encodeSnappyBlockAsm12B
  11145. MOVB (R8)(R10*1), R9
  11146. CMPB (SI)(R10*1), R9
  11147. JNE match_nolit_end_encodeSnappyBlockAsm12B
  11148. LEAL 1(R10), R10
  11149. match_nolit_end_encodeSnappyBlockAsm12B:
  11150. ADDL R10, CX
  11151. MOVL 16(SP), SI
  11152. ADDL $0x04, R10
  11153. MOVL CX, 12(SP)
  11154. // emitCopy
  11155. two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
  11156. CMPL R10, $0x40
  11157. JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
  11158. MOVB $0xee, (AX)
  11159. MOVW SI, 1(AX)
  11160. LEAL -60(R10), R10
  11161. ADDQ $0x03, AX
  11162. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
  11163. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
  11164. CMPL R10, $0x0c
  11165. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
  11166. CMPL SI, $0x00000800
  11167. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
  11168. MOVB $0x01, BL
  11169. LEAL -16(BX)(R10*4), R10
  11170. MOVB SI, 1(AX)
  11171. SHRL $0x08, SI
  11172. SHLL $0x05, SI
  11173. ORL SI, R10
  11174. MOVB R10, (AX)
  11175. ADDQ $0x02, AX
  11176. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
  11177. emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
  11178. MOVB $0x02, BL
  11179. LEAL -4(BX)(R10*4), R10
  11180. MOVB R10, (AX)
  11181. MOVW SI, 1(AX)
  11182. ADDQ $0x03, AX
  11183. match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
  11184. CMPL CX, 8(SP)
  11185. JGE emit_remainder_encodeSnappyBlockAsm12B
  11186. MOVQ -2(DX)(CX*1), DI
  11187. CMPQ AX, (SP)
  11188. JL match_nolit_dst_ok_encodeSnappyBlockAsm12B
  11189. MOVQ $0x00000000, ret+48(FP)
  11190. RET
  11191. match_nolit_dst_ok_encodeSnappyBlockAsm12B:
  11192. MOVQ $0x000000cf1bbcdcbb, R9
  11193. MOVQ DI, R8
  11194. SHRQ $0x10, DI
  11195. MOVQ DI, SI
  11196. SHLQ $0x18, R8
  11197. IMULQ R9, R8
  11198. SHRQ $0x34, R8
  11199. SHLQ $0x18, SI
  11200. IMULQ R9, SI
  11201. SHRQ $0x34, SI
  11202. LEAL -2(CX), R9
  11203. LEAQ 24(SP)(SI*4), R10
  11204. MOVL (R10), SI
  11205. MOVL R9, 24(SP)(R8*4)
  11206. MOVL CX, (R10)
  11207. CMPL (DX)(SI*1), DI
  11208. JEQ match_nolit_loop_encodeSnappyBlockAsm12B
  11209. INCL CX
  11210. JMP search_loop_encodeSnappyBlockAsm12B
  11211. emit_remainder_encodeSnappyBlockAsm12B:
  11212. MOVQ src_len+32(FP), CX
  11213. SUBL 12(SP), CX
  11214. LEAQ 3(AX)(CX*1), CX
  11215. CMPQ CX, (SP)
  11216. JL emit_remainder_ok_encodeSnappyBlockAsm12B
  11217. MOVQ $0x00000000, ret+48(FP)
  11218. RET
  11219. emit_remainder_ok_encodeSnappyBlockAsm12B:
  11220. MOVQ src_len+32(FP), CX
  11221. MOVL 12(SP), BX
  11222. CMPL BX, CX
  11223. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
  11224. MOVL CX, SI
  11225. MOVL CX, 12(SP)
  11226. LEAQ (DX)(BX*1), CX
  11227. SUBL BX, SI
  11228. LEAL -1(SI), DX
  11229. CMPL DX, $0x3c
  11230. JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B
  11231. CMPL DX, $0x00000100
  11232. JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B
  11233. MOVB $0xf4, (AX)
  11234. MOVW DX, 1(AX)
  11235. ADDQ $0x03, AX
  11236. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
  11237. two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
  11238. MOVB $0xf0, (AX)
  11239. MOVB DL, 1(AX)
  11240. ADDQ $0x02, AX
  11241. CMPL DX, $0x40
  11242. JL memmove_emit_remainder_encodeSnappyBlockAsm12B
  11243. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
  11244. one_byte_emit_remainder_encodeSnappyBlockAsm12B:
  11245. SHLB $0x02, DL
  11246. MOVB DL, (AX)
  11247. ADDQ $0x01, AX
  11248. memmove_emit_remainder_encodeSnappyBlockAsm12B:
  11249. LEAQ (AX)(SI*1), DX
  11250. MOVL SI, BX
  11251. // genMemMoveShort
  11252. CMPQ BX, $0x03
  11253. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
  11254. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
  11255. CMPQ BX, $0x08
  11256. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
  11257. CMPQ BX, $0x10
  11258. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
  11259. CMPQ BX, $0x20
  11260. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
  11261. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
  11262. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
  11263. MOVB (CX), SI
  11264. MOVB -1(CX)(BX*1), CL
  11265. MOVB SI, (AX)
  11266. MOVB CL, -1(AX)(BX*1)
  11267. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11268. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
  11269. MOVW (CX), SI
  11270. MOVB 2(CX), CL
  11271. MOVW SI, (AX)
  11272. MOVB CL, 2(AX)
  11273. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11274. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
  11275. MOVL (CX), SI
  11276. MOVL -4(CX)(BX*1), CX
  11277. MOVL SI, (AX)
  11278. MOVL CX, -4(AX)(BX*1)
  11279. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11280. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
  11281. MOVQ (CX), SI
  11282. MOVQ -8(CX)(BX*1), CX
  11283. MOVQ SI, (AX)
  11284. MOVQ CX, -8(AX)(BX*1)
  11285. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11286. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
  11287. MOVOU (CX), X0
  11288. MOVOU -16(CX)(BX*1), X1
  11289. MOVOU X0, (AX)
  11290. MOVOU X1, -16(AX)(BX*1)
  11291. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
  11292. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
  11293. MOVOU (CX), X0
  11294. MOVOU 16(CX), X1
  11295. MOVOU -32(CX)(BX*1), X2
  11296. MOVOU -16(CX)(BX*1), X3
  11297. MOVOU X0, (AX)
  11298. MOVOU X1, 16(AX)
  11299. MOVOU X2, -32(AX)(BX*1)
  11300. MOVOU X3, -16(AX)(BX*1)
  11301. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
  11302. MOVQ DX, AX
  11303. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
  11304. memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
  11305. LEAQ (AX)(SI*1), DX
  11306. MOVL SI, BX
  11307. // genMemMoveLong
  11308. MOVOU (CX), X0
  11309. MOVOU 16(CX), X1
  11310. MOVOU -32(CX)(BX*1), X2
  11311. MOVOU -16(CX)(BX*1), X3
  11312. MOVQ BX, DI
  11313. SHRQ $0x05, DI
  11314. MOVQ AX, SI
  11315. ANDL $0x0000001f, SI
  11316. MOVQ $0x00000040, R8
  11317. SUBQ SI, R8
  11318. DECQ DI
  11319. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11320. LEAQ -32(CX)(R8*1), SI
  11321. LEAQ -32(AX)(R8*1), R9
  11322. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
  11323. MOVOU (SI), X4
  11324. MOVOU 16(SI), X5
  11325. MOVOA X4, (R9)
  11326. MOVOA X5, 16(R9)
  11327. ADDQ $0x20, R9
  11328. ADDQ $0x20, SI
  11329. ADDQ $0x20, R8
  11330. DECQ DI
  11331. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
  11332. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
  11333. MOVOU -32(CX)(R8*1), X4
  11334. MOVOU -16(CX)(R8*1), X5
  11335. MOVOA X4, -32(AX)(R8*1)
  11336. MOVOA X5, -16(AX)(R8*1)
  11337. ADDQ $0x20, R8
  11338. CMPQ BX, R8
  11339. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
  11340. MOVOU X0, (AX)
  11341. MOVOU X1, 16(AX)
  11342. MOVOU X2, -32(AX)(BX*1)
  11343. MOVOU X3, -16(AX)(BX*1)
  11344. MOVQ DX, AX
  11345. emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
  11346. MOVQ dst_base+0(FP), CX
  11347. SUBQ CX, AX
  11348. MOVQ AX, ret+48(FP)
  11349. RET
  11350. // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
  11351. // Requires: BMI, SSE2
  11352. TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
  11353. MOVQ dst_base+0(FP), AX
  11354. MOVQ $0x00000020, CX
  11355. LEAQ 24(SP), DX
  11356. PXOR X0, X0
  11357. zero_loop_encodeSnappyBlockAsm10B:
  11358. MOVOU X0, (DX)
  11359. MOVOU X0, 16(DX)
  11360. MOVOU X0, 32(DX)
  11361. MOVOU X0, 48(DX)
  11362. MOVOU X0, 64(DX)
  11363. MOVOU X0, 80(DX)
  11364. MOVOU X0, 96(DX)
  11365. MOVOU X0, 112(DX)
  11366. ADDQ $0x80, DX
  11367. DECQ CX
  11368. JNZ zero_loop_encodeSnappyBlockAsm10B
  11369. MOVL $0x00000000, 12(SP)
  11370. MOVQ src_len+32(FP), CX
  11371. LEAQ -9(CX), DX
  11372. LEAQ -8(CX), SI
  11373. MOVL SI, 8(SP)
  11374. SHRQ $0x05, CX
  11375. SUBL CX, DX
  11376. LEAQ (AX)(DX*1), DX
  11377. MOVQ DX, (SP)
  11378. MOVL $0x00000001, CX
  11379. MOVL CX, 16(SP)
  11380. MOVQ src_base+24(FP), DX
  11381. search_loop_encodeSnappyBlockAsm10B:
  11382. MOVL CX, SI
  11383. SUBL 12(SP), SI
  11384. SHRL $0x05, SI
  11385. LEAL 4(CX)(SI*1), SI
  11386. CMPL SI, 8(SP)
  11387. JGE emit_remainder_encodeSnappyBlockAsm10B
  11388. MOVQ (DX)(CX*1), DI
  11389. MOVL SI, 20(SP)
  11390. MOVQ $0x9e3779b1, R9
  11391. MOVQ DI, R10
  11392. MOVQ DI, R11
  11393. SHRQ $0x08, R11
  11394. SHLQ $0x20, R10
  11395. IMULQ R9, R10
  11396. SHRQ $0x36, R10
  11397. SHLQ $0x20, R11
  11398. IMULQ R9, R11
  11399. SHRQ $0x36, R11
  11400. MOVL 24(SP)(R10*4), SI
  11401. MOVL 24(SP)(R11*4), R8
  11402. MOVL CX, 24(SP)(R10*4)
  11403. LEAL 1(CX), R10
  11404. MOVL R10, 24(SP)(R11*4)
  11405. MOVQ DI, R10
  11406. SHRQ $0x10, R10
  11407. SHLQ $0x20, R10
  11408. IMULQ R9, R10
  11409. SHRQ $0x36, R10
  11410. MOVL CX, R9
  11411. SUBL 16(SP), R9
  11412. MOVL 1(DX)(R9*1), R11
  11413. MOVQ DI, R9
  11414. SHRQ $0x08, R9
  11415. CMPL R9, R11
  11416. JNE no_repeat_found_encodeSnappyBlockAsm10B
  11417. LEAL 1(CX), DI
  11418. MOVL 12(SP), SI
  11419. MOVL DI, R8
  11420. SUBL 16(SP), R8
  11421. JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
  11422. repeat_extend_back_loop_encodeSnappyBlockAsm10B:
  11423. CMPL DI, SI
  11424. JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
  11425. MOVB -1(DX)(R8*1), BL
  11426. MOVB -1(DX)(DI*1), R9
  11427. CMPB BL, R9
  11428. JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
  11429. LEAL -1(DI), DI
  11430. DECL R8
  11431. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
  11432. repeat_extend_back_end_encodeSnappyBlockAsm10B:
  11433. MOVL 12(SP), SI
  11434. CMPL SI, DI
  11435. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
  11436. MOVL DI, R8
  11437. MOVL DI, 12(SP)
  11438. LEAQ (DX)(SI*1), R9
  11439. SUBL SI, R8
  11440. LEAL -1(R8), SI
  11441. CMPL SI, $0x3c
  11442. JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
  11443. CMPL SI, $0x00000100
  11444. JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
  11445. MOVB $0xf4, (AX)
  11446. MOVW SI, 1(AX)
  11447. ADDQ $0x03, AX
  11448. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
  11449. two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
  11450. MOVB $0xf0, (AX)
  11451. MOVB SI, 1(AX)
  11452. ADDQ $0x02, AX
  11453. CMPL SI, $0x40
  11454. JL memmove_repeat_emit_encodeSnappyBlockAsm10B
  11455. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
  11456. one_byte_repeat_emit_encodeSnappyBlockAsm10B:
  11457. SHLB $0x02, SI
  11458. MOVB SI, (AX)
  11459. ADDQ $0x01, AX
  11460. memmove_repeat_emit_encodeSnappyBlockAsm10B:
  11461. LEAQ (AX)(R8*1), SI
  11462. // genMemMoveShort
  11463. CMPQ R8, $0x08
  11464. JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
  11465. CMPQ R8, $0x10
  11466. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
  11467. CMPQ R8, $0x20
  11468. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
  11469. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
  11470. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
  11471. MOVQ (R9), R10
  11472. MOVQ R10, (AX)
  11473. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11474. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
  11475. MOVQ (R9), R10
  11476. MOVQ -8(R9)(R8*1), R9
  11477. MOVQ R10, (AX)
  11478. MOVQ R9, -8(AX)(R8*1)
  11479. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11480. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
  11481. MOVOU (R9), X0
  11482. MOVOU -16(R9)(R8*1), X1
  11483. MOVOU X0, (AX)
  11484. MOVOU X1, -16(AX)(R8*1)
  11485. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
  11486. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
  11487. MOVOU (R9), X0
  11488. MOVOU 16(R9), X1
  11489. MOVOU -32(R9)(R8*1), X2
  11490. MOVOU -16(R9)(R8*1), X3
  11491. MOVOU X0, (AX)
  11492. MOVOU X1, 16(AX)
  11493. MOVOU X2, -32(AX)(R8*1)
  11494. MOVOU X3, -16(AX)(R8*1)
  11495. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
  11496. MOVQ SI, AX
  11497. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
  11498. memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
  11499. LEAQ (AX)(R8*1), SI
  11500. // genMemMoveLong
  11501. MOVOU (R9), X0
  11502. MOVOU 16(R9), X1
  11503. MOVOU -32(R9)(R8*1), X2
  11504. MOVOU -16(R9)(R8*1), X3
  11505. MOVQ R8, R11
  11506. SHRQ $0x05, R11
  11507. MOVQ AX, R10
  11508. ANDL $0x0000001f, R10
  11509. MOVQ $0x00000040, R12
  11510. SUBQ R10, R12
  11511. DECQ R11
  11512. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  11513. LEAQ -32(R9)(R12*1), R10
  11514. LEAQ -32(AX)(R12*1), R13
  11515. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
  11516. MOVOU (R10), X4
  11517. MOVOU 16(R10), X5
  11518. MOVOA X4, (R13)
  11519. MOVOA X5, 16(R13)
  11520. ADDQ $0x20, R13
  11521. ADDQ $0x20, R10
  11522. ADDQ $0x20, R12
  11523. DECQ R11
  11524. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
  11525. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  11526. MOVOU -32(R9)(R12*1), X4
  11527. MOVOU -16(R9)(R12*1), X5
  11528. MOVOA X4, -32(AX)(R12*1)
  11529. MOVOA X5, -16(AX)(R12*1)
  11530. ADDQ $0x20, R12
  11531. CMPQ R8, R12
  11532. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  11533. MOVOU X0, (AX)
  11534. MOVOU X1, 16(AX)
  11535. MOVOU X2, -32(AX)(R8*1)
  11536. MOVOU X3, -16(AX)(R8*1)
  11537. MOVQ SI, AX
  11538. emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
  11539. ADDL $0x05, CX
  11540. MOVL CX, SI
  11541. SUBL 16(SP), SI
  11542. MOVQ src_len+32(FP), R8
  11543. SUBL CX, R8
  11544. LEAQ (DX)(CX*1), R9
  11545. LEAQ (DX)(SI*1), SI
  11546. // matchLen
  11547. XORL R11, R11
  11548. CMPL R8, $0x08
  11549. JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
  11550. matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
  11551. MOVQ (R9)(R11*1), R10
  11552. XORQ (SI)(R11*1), R10
  11553. TESTQ R10, R10
  11554. JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B
  11555. #ifdef GOAMD64_v3
  11556. TZCNTQ R10, R10
  11557. #else
  11558. BSFQ R10, R10
  11559. #endif
  11560. SARQ $0x03, R10
  11561. LEAL (R11)(R10*1), R11
  11562. JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
  11563. matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
  11564. LEAL -8(R8), R8
  11565. LEAL 8(R11), R11
  11566. CMPL R8, $0x08
  11567. JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
  11568. JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
  11569. matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
  11570. CMPL R8, $0x04
  11571. JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
  11572. MOVL (R9)(R11*1), R10
  11573. CMPL (SI)(R11*1), R10
  11574. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
  11575. SUBL $0x04, R8
  11576. LEAL 4(R11), R11
  11577. matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
  11578. CMPL R8, $0x02
  11579. JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
  11580. MOVW (R9)(R11*1), R10
  11581. CMPW (SI)(R11*1), R10
  11582. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
  11583. SUBL $0x02, R8
  11584. LEAL 2(R11), R11
  11585. matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
  11586. CMPL R8, $0x01
  11587. JL repeat_extend_forward_end_encodeSnappyBlockAsm10B
  11588. MOVB (R9)(R11*1), R10
  11589. CMPB (SI)(R11*1), R10
  11590. JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
  11591. LEAL 1(R11), R11
  11592. repeat_extend_forward_end_encodeSnappyBlockAsm10B:
  11593. ADDL R11, CX
  11594. MOVL CX, SI
  11595. SUBL DI, SI
  11596. MOVL 16(SP), DI
  11597. // emitCopy
  11598. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
  11599. CMPL SI, $0x40
  11600. JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
  11601. MOVB $0xee, (AX)
  11602. MOVW DI, 1(AX)
  11603. LEAL -60(SI), SI
  11604. ADDQ $0x03, AX
  11605. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
  11606. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
  11607. CMPL SI, $0x0c
  11608. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
  11609. CMPL DI, $0x00000800
  11610. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
  11611. MOVB $0x01, BL
  11612. LEAL -16(BX)(SI*4), SI
  11613. MOVB DI, 1(AX)
  11614. SHRL $0x08, DI
  11615. SHLL $0x05, DI
  11616. ORL DI, SI
  11617. MOVB SI, (AX)
  11618. ADDQ $0x02, AX
  11619. JMP repeat_end_emit_encodeSnappyBlockAsm10B
  11620. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
  11621. MOVB $0x02, BL
  11622. LEAL -4(BX)(SI*4), SI
  11623. MOVB SI, (AX)
  11624. MOVW DI, 1(AX)
  11625. ADDQ $0x03, AX
  11626. repeat_end_emit_encodeSnappyBlockAsm10B:
  11627. MOVL CX, 12(SP)
  11628. JMP search_loop_encodeSnappyBlockAsm10B
  11629. no_repeat_found_encodeSnappyBlockAsm10B:
  11630. CMPL (DX)(SI*1), DI
  11631. JEQ candidate_match_encodeSnappyBlockAsm10B
  11632. SHRQ $0x08, DI
  11633. MOVL 24(SP)(R10*4), SI
  11634. LEAL 2(CX), R9
  11635. CMPL (DX)(R8*1), DI
  11636. JEQ candidate2_match_encodeSnappyBlockAsm10B
  11637. MOVL R9, 24(SP)(R10*4)
  11638. SHRQ $0x08, DI
  11639. CMPL (DX)(SI*1), DI
  11640. JEQ candidate3_match_encodeSnappyBlockAsm10B
  11641. MOVL 20(SP), CX
  11642. JMP search_loop_encodeSnappyBlockAsm10B
  11643. candidate3_match_encodeSnappyBlockAsm10B:
  11644. ADDL $0x02, CX
  11645. JMP candidate_match_encodeSnappyBlockAsm10B
  11646. candidate2_match_encodeSnappyBlockAsm10B:
  11647. MOVL R9, 24(SP)(R10*4)
  11648. INCL CX
  11649. MOVL R8, SI
  11650. candidate_match_encodeSnappyBlockAsm10B:
  11651. MOVL 12(SP), DI
  11652. TESTL SI, SI
  11653. JZ match_extend_back_end_encodeSnappyBlockAsm10B
  11654. match_extend_back_loop_encodeSnappyBlockAsm10B:
  11655. CMPL CX, DI
  11656. JLE match_extend_back_end_encodeSnappyBlockAsm10B
  11657. MOVB -1(DX)(SI*1), BL
  11658. MOVB -1(DX)(CX*1), R8
  11659. CMPB BL, R8
  11660. JNE match_extend_back_end_encodeSnappyBlockAsm10B
  11661. LEAL -1(CX), CX
  11662. DECL SI
  11663. JZ match_extend_back_end_encodeSnappyBlockAsm10B
  11664. JMP match_extend_back_loop_encodeSnappyBlockAsm10B
  11665. match_extend_back_end_encodeSnappyBlockAsm10B:
  11666. MOVL CX, DI
  11667. SUBL 12(SP), DI
  11668. LEAQ 3(AX)(DI*1), DI
  11669. CMPQ DI, (SP)
  11670. JL match_dst_size_check_encodeSnappyBlockAsm10B
  11671. MOVQ $0x00000000, ret+48(FP)
  11672. RET
  11673. match_dst_size_check_encodeSnappyBlockAsm10B:
  11674. MOVL CX, DI
  11675. MOVL 12(SP), R8
  11676. CMPL R8, DI
  11677. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
  11678. MOVL DI, R9
  11679. MOVL DI, 12(SP)
  11680. LEAQ (DX)(R8*1), DI
  11681. SUBL R8, R9
  11682. LEAL -1(R9), R8
  11683. CMPL R8, $0x3c
  11684. JLT one_byte_match_emit_encodeSnappyBlockAsm10B
  11685. CMPL R8, $0x00000100
  11686. JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
  11687. MOVB $0xf4, (AX)
  11688. MOVW R8, 1(AX)
  11689. ADDQ $0x03, AX
  11690. JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
  11691. two_bytes_match_emit_encodeSnappyBlockAsm10B:
  11692. MOVB $0xf0, (AX)
  11693. MOVB R8, 1(AX)
  11694. ADDQ $0x02, AX
  11695. CMPL R8, $0x40
  11696. JL memmove_match_emit_encodeSnappyBlockAsm10B
  11697. JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
  11698. one_byte_match_emit_encodeSnappyBlockAsm10B:
  11699. SHLB $0x02, R8
  11700. MOVB R8, (AX)
  11701. ADDQ $0x01, AX
  11702. memmove_match_emit_encodeSnappyBlockAsm10B:
  11703. LEAQ (AX)(R9*1), R8
  11704. // genMemMoveShort
  11705. CMPQ R9, $0x08
  11706. JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
  11707. CMPQ R9, $0x10
  11708. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
  11709. CMPQ R9, $0x20
  11710. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
  11711. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
  11712. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
  11713. MOVQ (DI), R10
  11714. MOVQ R10, (AX)
  11715. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  11716. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
  11717. MOVQ (DI), R10
  11718. MOVQ -8(DI)(R9*1), DI
  11719. MOVQ R10, (AX)
  11720. MOVQ DI, -8(AX)(R9*1)
  11721. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  11722. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
  11723. MOVOU (DI), X0
  11724. MOVOU -16(DI)(R9*1), X1
  11725. MOVOU X0, (AX)
  11726. MOVOU X1, -16(AX)(R9*1)
  11727. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
  11728. emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
  11729. MOVOU (DI), X0
  11730. MOVOU 16(DI), X1
  11731. MOVOU -32(DI)(R9*1), X2
  11732. MOVOU -16(DI)(R9*1), X3
  11733. MOVOU X0, (AX)
  11734. MOVOU X1, 16(AX)
  11735. MOVOU X2, -32(AX)(R9*1)
  11736. MOVOU X3, -16(AX)(R9*1)
  11737. memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
  11738. MOVQ R8, AX
  11739. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
  11740. memmove_long_match_emit_encodeSnappyBlockAsm10B:
  11741. LEAQ (AX)(R9*1), R8
  11742. // genMemMoveLong
  11743. MOVOU (DI), X0
  11744. MOVOU 16(DI), X1
  11745. MOVOU -32(DI)(R9*1), X2
  11746. MOVOU -16(DI)(R9*1), X3
  11747. MOVQ R9, R11
  11748. SHRQ $0x05, R11
  11749. MOVQ AX, R10
  11750. ANDL $0x0000001f, R10
  11751. MOVQ $0x00000040, R12
  11752. SUBQ R10, R12
  11753. DECQ R11
  11754. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  11755. LEAQ -32(DI)(R12*1), R10
  11756. LEAQ -32(AX)(R12*1), R13
  11757. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
  11758. MOVOU (R10), X4
  11759. MOVOU 16(R10), X5
  11760. MOVOA X4, (R13)
  11761. MOVOA X5, 16(R13)
  11762. ADDQ $0x20, R13
  11763. ADDQ $0x20, R10
  11764. ADDQ $0x20, R12
  11765. DECQ R11
  11766. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
  11767. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  11768. MOVOU -32(DI)(R12*1), X4
  11769. MOVOU -16(DI)(R12*1), X5
  11770. MOVOA X4, -32(AX)(R12*1)
  11771. MOVOA X5, -16(AX)(R12*1)
  11772. ADDQ $0x20, R12
  11773. CMPQ R9, R12
  11774. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  11775. MOVOU X0, (AX)
  11776. MOVOU X1, 16(AX)
  11777. MOVOU X2, -32(AX)(R9*1)
  11778. MOVOU X3, -16(AX)(R9*1)
  11779. MOVQ R8, AX
  11780. emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
  11781. match_nolit_loop_encodeSnappyBlockAsm10B:
  11782. MOVL CX, DI
  11783. SUBL SI, DI
  11784. MOVL DI, 16(SP)
  11785. ADDL $0x04, CX
  11786. ADDL $0x04, SI
  11787. MOVQ src_len+32(FP), DI
  11788. SUBL CX, DI
  11789. LEAQ (DX)(CX*1), R8
  11790. LEAQ (DX)(SI*1), SI
  11791. // matchLen
  11792. XORL R10, R10
  11793. CMPL DI, $0x08
  11794. JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
  11795. matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
  11796. MOVQ (R8)(R10*1), R9
  11797. XORQ (SI)(R10*1), R9
  11798. TESTQ R9, R9
  11799. JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B
  11800. #ifdef GOAMD64_v3
  11801. TZCNTQ R9, R9
  11802. #else
  11803. BSFQ R9, R9
  11804. #endif
  11805. SARQ $0x03, R9
  11806. LEAL (R10)(R9*1), R10
  11807. JMP match_nolit_end_encodeSnappyBlockAsm10B
  11808. matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
  11809. LEAL -8(DI), DI
  11810. LEAL 8(R10), R10
  11811. CMPL DI, $0x08
  11812. JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
  11813. JZ match_nolit_end_encodeSnappyBlockAsm10B
  11814. matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
  11815. CMPL DI, $0x04
  11816. JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
  11817. MOVL (R8)(R10*1), R9
  11818. CMPL (SI)(R10*1), R9
  11819. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
  11820. SUBL $0x04, DI
  11821. LEAL 4(R10), R10
  11822. matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
  11823. CMPL DI, $0x02
  11824. JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
  11825. MOVW (R8)(R10*1), R9
  11826. CMPW (SI)(R10*1), R9
  11827. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
  11828. SUBL $0x02, DI
  11829. LEAL 2(R10), R10
  11830. matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
  11831. CMPL DI, $0x01
  11832. JL match_nolit_end_encodeSnappyBlockAsm10B
  11833. MOVB (R8)(R10*1), R9
  11834. CMPB (SI)(R10*1), R9
  11835. JNE match_nolit_end_encodeSnappyBlockAsm10B
  11836. LEAL 1(R10), R10
  11837. match_nolit_end_encodeSnappyBlockAsm10B:
  11838. ADDL R10, CX
  11839. MOVL 16(SP), SI
  11840. ADDL $0x04, R10
  11841. MOVL CX, 12(SP)
  11842. // emitCopy
  11843. two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
  11844. CMPL R10, $0x40
  11845. JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
  11846. MOVB $0xee, (AX)
  11847. MOVW SI, 1(AX)
  11848. LEAL -60(R10), R10
  11849. ADDQ $0x03, AX
  11850. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
  11851. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
  11852. CMPL R10, $0x0c
  11853. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
  11854. CMPL SI, $0x00000800
  11855. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
  11856. MOVB $0x01, BL
  11857. LEAL -16(BX)(R10*4), R10
  11858. MOVB SI, 1(AX)
  11859. SHRL $0x08, SI
  11860. SHLL $0x05, SI
  11861. ORL SI, R10
  11862. MOVB R10, (AX)
  11863. ADDQ $0x02, AX
  11864. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
  11865. emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
  11866. MOVB $0x02, BL
  11867. LEAL -4(BX)(R10*4), R10
  11868. MOVB R10, (AX)
  11869. MOVW SI, 1(AX)
  11870. ADDQ $0x03, AX
  11871. match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
  11872. CMPL CX, 8(SP)
  11873. JGE emit_remainder_encodeSnappyBlockAsm10B
  11874. MOVQ -2(DX)(CX*1), DI
  11875. CMPQ AX, (SP)
  11876. JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
  11877. MOVQ $0x00000000, ret+48(FP)
  11878. RET
  11879. match_nolit_dst_ok_encodeSnappyBlockAsm10B:
  11880. MOVQ $0x9e3779b1, R9
  11881. MOVQ DI, R8
  11882. SHRQ $0x10, DI
  11883. MOVQ DI, SI
  11884. SHLQ $0x20, R8
  11885. IMULQ R9, R8
  11886. SHRQ $0x36, R8
  11887. SHLQ $0x20, SI
  11888. IMULQ R9, SI
  11889. SHRQ $0x36, SI
  11890. LEAL -2(CX), R9
  11891. LEAQ 24(SP)(SI*4), R10
  11892. MOVL (R10), SI
  11893. MOVL R9, 24(SP)(R8*4)
  11894. MOVL CX, (R10)
  11895. CMPL (DX)(SI*1), DI
  11896. JEQ match_nolit_loop_encodeSnappyBlockAsm10B
  11897. INCL CX
  11898. JMP search_loop_encodeSnappyBlockAsm10B
  11899. emit_remainder_encodeSnappyBlockAsm10B:
  11900. MOVQ src_len+32(FP), CX
  11901. SUBL 12(SP), CX
  11902. LEAQ 3(AX)(CX*1), CX
  11903. CMPQ CX, (SP)
  11904. JL emit_remainder_ok_encodeSnappyBlockAsm10B
  11905. MOVQ $0x00000000, ret+48(FP)
  11906. RET
  11907. emit_remainder_ok_encodeSnappyBlockAsm10B:
  11908. MOVQ src_len+32(FP), CX
  11909. MOVL 12(SP), BX
  11910. CMPL BX, CX
  11911. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
  11912. MOVL CX, SI
  11913. MOVL CX, 12(SP)
  11914. LEAQ (DX)(BX*1), CX
  11915. SUBL BX, SI
  11916. LEAL -1(SI), DX
  11917. CMPL DX, $0x3c
  11918. JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B
  11919. CMPL DX, $0x00000100
  11920. JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B
  11921. MOVB $0xf4, (AX)
  11922. MOVW DX, 1(AX)
  11923. ADDQ $0x03, AX
  11924. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
  11925. two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
  11926. MOVB $0xf0, (AX)
  11927. MOVB DL, 1(AX)
  11928. ADDQ $0x02, AX
  11929. CMPL DX, $0x40
  11930. JL memmove_emit_remainder_encodeSnappyBlockAsm10B
  11931. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
  11932. one_byte_emit_remainder_encodeSnappyBlockAsm10B:
  11933. SHLB $0x02, DL
  11934. MOVB DL, (AX)
  11935. ADDQ $0x01, AX
  11936. memmove_emit_remainder_encodeSnappyBlockAsm10B:
  11937. LEAQ (AX)(SI*1), DX
  11938. MOVL SI, BX
  11939. // genMemMoveShort
  11940. CMPQ BX, $0x03
  11941. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
  11942. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
  11943. CMPQ BX, $0x08
  11944. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
  11945. CMPQ BX, $0x10
  11946. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
  11947. CMPQ BX, $0x20
  11948. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
  11949. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
  11950. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
  11951. MOVB (CX), SI
  11952. MOVB -1(CX)(BX*1), CL
  11953. MOVB SI, (AX)
  11954. MOVB CL, -1(AX)(BX*1)
  11955. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  11956. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
  11957. MOVW (CX), SI
  11958. MOVB 2(CX), CL
  11959. MOVW SI, (AX)
  11960. MOVB CL, 2(AX)
  11961. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  11962. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
  11963. MOVL (CX), SI
  11964. MOVL -4(CX)(BX*1), CX
  11965. MOVL SI, (AX)
  11966. MOVL CX, -4(AX)(BX*1)
  11967. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  11968. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
  11969. MOVQ (CX), SI
  11970. MOVQ -8(CX)(BX*1), CX
  11971. MOVQ SI, (AX)
  11972. MOVQ CX, -8(AX)(BX*1)
  11973. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  11974. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
  11975. MOVOU (CX), X0
  11976. MOVOU -16(CX)(BX*1), X1
  11977. MOVOU X0, (AX)
  11978. MOVOU X1, -16(AX)(BX*1)
  11979. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
  11980. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
  11981. MOVOU (CX), X0
  11982. MOVOU 16(CX), X1
  11983. MOVOU -32(CX)(BX*1), X2
  11984. MOVOU -16(CX)(BX*1), X3
  11985. MOVOU X0, (AX)
  11986. MOVOU X1, 16(AX)
  11987. MOVOU X2, -32(AX)(BX*1)
  11988. MOVOU X3, -16(AX)(BX*1)
  11989. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
  11990. MOVQ DX, AX
  11991. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
  11992. memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
  11993. LEAQ (AX)(SI*1), DX
  11994. MOVL SI, BX
  11995. // genMemMoveLong
  11996. MOVOU (CX), X0
  11997. MOVOU 16(CX), X1
  11998. MOVOU -32(CX)(BX*1), X2
  11999. MOVOU -16(CX)(BX*1), X3
  12000. MOVQ BX, DI
  12001. SHRQ $0x05, DI
  12002. MOVQ AX, SI
  12003. ANDL $0x0000001f, SI
  12004. MOVQ $0x00000040, R8
  12005. SUBQ SI, R8
  12006. DECQ DI
  12007. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  12008. LEAQ -32(CX)(R8*1), SI
  12009. LEAQ -32(AX)(R8*1), R9
  12010. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
  12011. MOVOU (SI), X4
  12012. MOVOU 16(SI), X5
  12013. MOVOA X4, (R9)
  12014. MOVOA X5, 16(R9)
  12015. ADDQ $0x20, R9
  12016. ADDQ $0x20, SI
  12017. ADDQ $0x20, R8
  12018. DECQ DI
  12019. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
  12020. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
  12021. MOVOU -32(CX)(R8*1), X4
  12022. MOVOU -16(CX)(R8*1), X5
  12023. MOVOA X4, -32(AX)(R8*1)
  12024. MOVOA X5, -16(AX)(R8*1)
  12025. ADDQ $0x20, R8
  12026. CMPQ BX, R8
  12027. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
  12028. MOVOU X0, (AX)
  12029. MOVOU X1, 16(AX)
  12030. MOVOU X2, -32(AX)(BX*1)
  12031. MOVOU X3, -16(AX)(BX*1)
  12032. MOVQ DX, AX
  12033. emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
  12034. MOVQ dst_base+0(FP), CX
  12035. SUBQ CX, AX
  12036. MOVQ AX, ret+48(FP)
  12037. RET
  12038. // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
  12039. // Requires: BMI, SSE2
  12040. TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
  12041. MOVQ dst_base+0(FP), AX
  12042. MOVQ $0x00000008, CX
  12043. LEAQ 24(SP), DX
  12044. PXOR X0, X0
  12045. zero_loop_encodeSnappyBlockAsm8B:
  12046. MOVOU X0, (DX)
  12047. MOVOU X0, 16(DX)
  12048. MOVOU X0, 32(DX)
  12049. MOVOU X0, 48(DX)
  12050. MOVOU X0, 64(DX)
  12051. MOVOU X0, 80(DX)
  12052. MOVOU X0, 96(DX)
  12053. MOVOU X0, 112(DX)
  12054. ADDQ $0x80, DX
  12055. DECQ CX
  12056. JNZ zero_loop_encodeSnappyBlockAsm8B
  12057. MOVL $0x00000000, 12(SP)
  12058. MOVQ src_len+32(FP), CX
  12059. LEAQ -9(CX), DX
  12060. LEAQ -8(CX), SI
  12061. MOVL SI, 8(SP)
  12062. SHRQ $0x05, CX
  12063. SUBL CX, DX
  12064. LEAQ (AX)(DX*1), DX
  12065. MOVQ DX, (SP)
  12066. MOVL $0x00000001, CX
  12067. MOVL CX, 16(SP)
  12068. MOVQ src_base+24(FP), DX
  12069. search_loop_encodeSnappyBlockAsm8B:
  12070. MOVL CX, SI
  12071. SUBL 12(SP), SI
  12072. SHRL $0x04, SI
  12073. LEAL 4(CX)(SI*1), SI
  12074. CMPL SI, 8(SP)
  12075. JGE emit_remainder_encodeSnappyBlockAsm8B
  12076. MOVQ (DX)(CX*1), DI
  12077. MOVL SI, 20(SP)
  12078. MOVQ $0x9e3779b1, R9
  12079. MOVQ DI, R10
  12080. MOVQ DI, R11
  12081. SHRQ $0x08, R11
  12082. SHLQ $0x20, R10
  12083. IMULQ R9, R10
  12084. SHRQ $0x38, R10
  12085. SHLQ $0x20, R11
  12086. IMULQ R9, R11
  12087. SHRQ $0x38, R11
  12088. MOVL 24(SP)(R10*4), SI
  12089. MOVL 24(SP)(R11*4), R8
  12090. MOVL CX, 24(SP)(R10*4)
  12091. LEAL 1(CX), R10
  12092. MOVL R10, 24(SP)(R11*4)
  12093. MOVQ DI, R10
  12094. SHRQ $0x10, R10
  12095. SHLQ $0x20, R10
  12096. IMULQ R9, R10
  12097. SHRQ $0x38, R10
  12098. MOVL CX, R9
  12099. SUBL 16(SP), R9
  12100. MOVL 1(DX)(R9*1), R11
  12101. MOVQ DI, R9
  12102. SHRQ $0x08, R9
  12103. CMPL R9, R11
  12104. JNE no_repeat_found_encodeSnappyBlockAsm8B
  12105. LEAL 1(CX), DI
  12106. MOVL 12(SP), SI
  12107. MOVL DI, R8
  12108. SUBL 16(SP), R8
  12109. JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
  12110. repeat_extend_back_loop_encodeSnappyBlockAsm8B:
  12111. CMPL DI, SI
  12112. JLE repeat_extend_back_end_encodeSnappyBlockAsm8B
  12113. MOVB -1(DX)(R8*1), BL
  12114. MOVB -1(DX)(DI*1), R9
  12115. CMPB BL, R9
  12116. JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
  12117. LEAL -1(DI), DI
  12118. DECL R8
  12119. JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
  12120. repeat_extend_back_end_encodeSnappyBlockAsm8B:
  12121. MOVL 12(SP), SI
  12122. CMPL SI, DI
  12123. JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
  12124. MOVL DI, R8
  12125. MOVL DI, 12(SP)
  12126. LEAQ (DX)(SI*1), R9
  12127. SUBL SI, R8
  12128. LEAL -1(R8), SI
  12129. CMPL SI, $0x3c
  12130. JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B
  12131. CMPL SI, $0x00000100
  12132. JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B
  12133. MOVB $0xf4, (AX)
  12134. MOVW SI, 1(AX)
  12135. ADDQ $0x03, AX
  12136. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
  12137. two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
  12138. MOVB $0xf0, (AX)
  12139. MOVB SI, 1(AX)
  12140. ADDQ $0x02, AX
  12141. CMPL SI, $0x40
  12142. JL memmove_repeat_emit_encodeSnappyBlockAsm8B
  12143. JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
  12144. one_byte_repeat_emit_encodeSnappyBlockAsm8B:
  12145. SHLB $0x02, SI
  12146. MOVB SI, (AX)
  12147. ADDQ $0x01, AX
  12148. memmove_repeat_emit_encodeSnappyBlockAsm8B:
  12149. LEAQ (AX)(R8*1), SI
  12150. // genMemMoveShort
  12151. CMPQ R8, $0x08
  12152. JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
  12153. CMPQ R8, $0x10
  12154. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
  12155. CMPQ R8, $0x20
  12156. JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
  12157. JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
  12158. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
  12159. MOVQ (R9), R10
  12160. MOVQ R10, (AX)
  12161. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12162. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
  12163. MOVQ (R9), R10
  12164. MOVQ -8(R9)(R8*1), R9
  12165. MOVQ R10, (AX)
  12166. MOVQ R9, -8(AX)(R8*1)
  12167. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12168. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
  12169. MOVOU (R9), X0
  12170. MOVOU -16(R9)(R8*1), X1
  12171. MOVOU X0, (AX)
  12172. MOVOU X1, -16(AX)(R8*1)
  12173. JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
  12174. emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
  12175. MOVOU (R9), X0
  12176. MOVOU 16(R9), X1
  12177. MOVOU -32(R9)(R8*1), X2
  12178. MOVOU -16(R9)(R8*1), X3
  12179. MOVOU X0, (AX)
  12180. MOVOU X1, 16(AX)
  12181. MOVOU X2, -32(AX)(R8*1)
  12182. MOVOU X3, -16(AX)(R8*1)
  12183. memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
  12184. MOVQ SI, AX
  12185. JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
  12186. memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
  12187. LEAQ (AX)(R8*1), SI
  12188. // genMemMoveLong
  12189. MOVOU (R9), X0
  12190. MOVOU 16(R9), X1
  12191. MOVOU -32(R9)(R8*1), X2
  12192. MOVOU -16(R9)(R8*1), X3
  12193. MOVQ R8, R11
  12194. SHRQ $0x05, R11
  12195. MOVQ AX, R10
  12196. ANDL $0x0000001f, R10
  12197. MOVQ $0x00000040, R12
  12198. SUBQ R10, R12
  12199. DECQ R11
  12200. JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12201. LEAQ -32(R9)(R12*1), R10
  12202. LEAQ -32(AX)(R12*1), R13
  12203. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
  12204. MOVOU (R10), X4
  12205. MOVOU 16(R10), X5
  12206. MOVOA X4, (R13)
  12207. MOVOA X5, 16(R13)
  12208. ADDQ $0x20, R13
  12209. ADDQ $0x20, R10
  12210. ADDQ $0x20, R12
  12211. DECQ R11
  12212. JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
  12213. emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  12214. MOVOU -32(R9)(R12*1), X4
  12215. MOVOU -16(R9)(R12*1), X5
  12216. MOVOA X4, -32(AX)(R12*1)
  12217. MOVOA X5, -16(AX)(R12*1)
  12218. ADDQ $0x20, R12
  12219. CMPQ R8, R12
  12220. JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12221. MOVOU X0, (AX)
  12222. MOVOU X1, 16(AX)
  12223. MOVOU X2, -32(AX)(R8*1)
  12224. MOVOU X3, -16(AX)(R8*1)
  12225. MOVQ SI, AX
  12226. emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
  12227. ADDL $0x05, CX
  12228. MOVL CX, SI
  12229. SUBL 16(SP), SI
  12230. MOVQ src_len+32(FP), R8
  12231. SUBL CX, R8
  12232. LEAQ (DX)(CX*1), R9
  12233. LEAQ (DX)(SI*1), SI
  12234. // matchLen
  12235. XORL R11, R11
  12236. CMPL R8, $0x08
  12237. JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
  12238. matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B:
  12239. MOVQ (R9)(R11*1), R10
  12240. XORQ (SI)(R11*1), R10
  12241. TESTQ R10, R10
  12242. JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B
  12243. #ifdef GOAMD64_v3
  12244. TZCNTQ R10, R10
  12245. #else
  12246. BSFQ R10, R10
  12247. #endif
  12248. SARQ $0x03, R10
  12249. LEAL (R11)(R10*1), R11
  12250. JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12251. matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B:
  12252. LEAL -8(R8), R8
  12253. LEAL 8(R11), R11
  12254. CMPL R8, $0x08
  12255. JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B
  12256. JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12257. matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
  12258. CMPL R8, $0x04
  12259. JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
  12260. MOVL (R9)(R11*1), R10
  12261. CMPL (SI)(R11*1), R10
  12262. JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
  12263. SUBL $0x04, R8
  12264. LEAL 4(R11), R11
  12265. matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
  12266. CMPL R8, $0x02
  12267. JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
  12268. MOVW (R9)(R11*1), R10
  12269. CMPW (SI)(R11*1), R10
  12270. JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
  12271. SUBL $0x02, R8
  12272. LEAL 2(R11), R11
  12273. matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
  12274. CMPL R8, $0x01
  12275. JL repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12276. MOVB (R9)(R11*1), R10
  12277. CMPB (SI)(R11*1), R10
  12278. JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
  12279. LEAL 1(R11), R11
  12280. repeat_extend_forward_end_encodeSnappyBlockAsm8B:
  12281. ADDL R11, CX
  12282. MOVL CX, SI
  12283. SUBL DI, SI
  12284. MOVL 16(SP), DI
  12285. // emitCopy
  12286. two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
  12287. CMPL SI, $0x40
  12288. JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
  12289. MOVB $0xee, (AX)
  12290. MOVW DI, 1(AX)
  12291. LEAL -60(SI), SI
  12292. ADDQ $0x03, AX
  12293. JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
  12294. two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
  12295. CMPL SI, $0x0c
  12296. JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
  12297. MOVB $0x01, BL
  12298. LEAL -16(BX)(SI*4), SI
  12299. MOVB DI, 1(AX)
  12300. SHRL $0x08, DI
  12301. SHLL $0x05, DI
  12302. ORL DI, SI
  12303. MOVB SI, (AX)
  12304. ADDQ $0x02, AX
  12305. JMP repeat_end_emit_encodeSnappyBlockAsm8B
  12306. emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
  12307. MOVB $0x02, BL
  12308. LEAL -4(BX)(SI*4), SI
  12309. MOVB SI, (AX)
  12310. MOVW DI, 1(AX)
  12311. ADDQ $0x03, AX
  12312. repeat_end_emit_encodeSnappyBlockAsm8B:
  12313. MOVL CX, 12(SP)
  12314. JMP search_loop_encodeSnappyBlockAsm8B
  12315. no_repeat_found_encodeSnappyBlockAsm8B:
  12316. CMPL (DX)(SI*1), DI
  12317. JEQ candidate_match_encodeSnappyBlockAsm8B
  12318. SHRQ $0x08, DI
  12319. MOVL 24(SP)(R10*4), SI
  12320. LEAL 2(CX), R9
  12321. CMPL (DX)(R8*1), DI
  12322. JEQ candidate2_match_encodeSnappyBlockAsm8B
  12323. MOVL R9, 24(SP)(R10*4)
  12324. SHRQ $0x08, DI
  12325. CMPL (DX)(SI*1), DI
  12326. JEQ candidate3_match_encodeSnappyBlockAsm8B
  12327. MOVL 20(SP), CX
  12328. JMP search_loop_encodeSnappyBlockAsm8B
  12329. candidate3_match_encodeSnappyBlockAsm8B:
  12330. ADDL $0x02, CX
  12331. JMP candidate_match_encodeSnappyBlockAsm8B
  12332. candidate2_match_encodeSnappyBlockAsm8B:
  12333. MOVL R9, 24(SP)(R10*4)
  12334. INCL CX
  12335. MOVL R8, SI
  12336. candidate_match_encodeSnappyBlockAsm8B:
  12337. MOVL 12(SP), DI
  12338. TESTL SI, SI
  12339. JZ match_extend_back_end_encodeSnappyBlockAsm8B
  12340. match_extend_back_loop_encodeSnappyBlockAsm8B:
  12341. CMPL CX, DI
  12342. JLE match_extend_back_end_encodeSnappyBlockAsm8B
  12343. MOVB -1(DX)(SI*1), BL
  12344. MOVB -1(DX)(CX*1), R8
  12345. CMPB BL, R8
  12346. JNE match_extend_back_end_encodeSnappyBlockAsm8B
  12347. LEAL -1(CX), CX
  12348. DECL SI
  12349. JZ match_extend_back_end_encodeSnappyBlockAsm8B
  12350. JMP match_extend_back_loop_encodeSnappyBlockAsm8B
  12351. match_extend_back_end_encodeSnappyBlockAsm8B:
  12352. MOVL CX, DI
  12353. SUBL 12(SP), DI
  12354. LEAQ 3(AX)(DI*1), DI
  12355. CMPQ DI, (SP)
  12356. JL match_dst_size_check_encodeSnappyBlockAsm8B
  12357. MOVQ $0x00000000, ret+48(FP)
  12358. RET
  12359. match_dst_size_check_encodeSnappyBlockAsm8B:
  12360. MOVL CX, DI
  12361. MOVL 12(SP), R8
  12362. CMPL R8, DI
  12363. JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
  12364. MOVL DI, R9
  12365. MOVL DI, 12(SP)
  12366. LEAQ (DX)(R8*1), DI
  12367. SUBL R8, R9
  12368. LEAL -1(R9), R8
  12369. CMPL R8, $0x3c
  12370. JLT one_byte_match_emit_encodeSnappyBlockAsm8B
  12371. CMPL R8, $0x00000100
  12372. JLT two_bytes_match_emit_encodeSnappyBlockAsm8B
  12373. MOVB $0xf4, (AX)
  12374. MOVW R8, 1(AX)
  12375. ADDQ $0x03, AX
  12376. JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
  12377. two_bytes_match_emit_encodeSnappyBlockAsm8B:
  12378. MOVB $0xf0, (AX)
  12379. MOVB R8, 1(AX)
  12380. ADDQ $0x02, AX
  12381. CMPL R8, $0x40
  12382. JL memmove_match_emit_encodeSnappyBlockAsm8B
  12383. JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
  12384. one_byte_match_emit_encodeSnappyBlockAsm8B:
  12385. SHLB $0x02, R8
  12386. MOVB R8, (AX)
  12387. ADDQ $0x01, AX
  12388. memmove_match_emit_encodeSnappyBlockAsm8B:
  12389. LEAQ (AX)(R9*1), R8
  12390. // genMemMoveShort
  12391. CMPQ R9, $0x08
  12392. JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
  12393. CMPQ R9, $0x10
  12394. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
  12395. CMPQ R9, $0x20
  12396. JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
  12397. JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
  12398. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
  12399. MOVQ (DI), R10
  12400. MOVQ R10, (AX)
  12401. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12402. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
  12403. MOVQ (DI), R10
  12404. MOVQ -8(DI)(R9*1), DI
  12405. MOVQ R10, (AX)
  12406. MOVQ DI, -8(AX)(R9*1)
  12407. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12408. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
  12409. MOVOU (DI), X0
  12410. MOVOU -16(DI)(R9*1), X1
  12411. MOVOU X0, (AX)
  12412. MOVOU X1, -16(AX)(R9*1)
  12413. JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
  12414. emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
  12415. MOVOU (DI), X0
  12416. MOVOU 16(DI), X1
  12417. MOVOU -32(DI)(R9*1), X2
  12418. MOVOU -16(DI)(R9*1), X3
  12419. MOVOU X0, (AX)
  12420. MOVOU X1, 16(AX)
  12421. MOVOU X2, -32(AX)(R9*1)
  12422. MOVOU X3, -16(AX)(R9*1)
  12423. memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
  12424. MOVQ R8, AX
  12425. JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
  12426. memmove_long_match_emit_encodeSnappyBlockAsm8B:
  12427. LEAQ (AX)(R9*1), R8
  12428. // genMemMoveLong
  12429. MOVOU (DI), X0
  12430. MOVOU 16(DI), X1
  12431. MOVOU -32(DI)(R9*1), X2
  12432. MOVOU -16(DI)(R9*1), X3
  12433. MOVQ R9, R11
  12434. SHRQ $0x05, R11
  12435. MOVQ AX, R10
  12436. ANDL $0x0000001f, R10
  12437. MOVQ $0x00000040, R12
  12438. SUBQ R10, R12
  12439. DECQ R11
  12440. JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12441. LEAQ -32(DI)(R12*1), R10
  12442. LEAQ -32(AX)(R12*1), R13
  12443. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
  12444. MOVOU (R10), X4
  12445. MOVOU 16(R10), X5
  12446. MOVOA X4, (R13)
  12447. MOVOA X5, 16(R13)
  12448. ADDQ $0x20, R13
  12449. ADDQ $0x20, R10
  12450. ADDQ $0x20, R12
  12451. DECQ R11
  12452. JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
  12453. emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  12454. MOVOU -32(DI)(R12*1), X4
  12455. MOVOU -16(DI)(R12*1), X5
  12456. MOVOA X4, -32(AX)(R12*1)
  12457. MOVOA X5, -16(AX)(R12*1)
  12458. ADDQ $0x20, R12
  12459. CMPQ R9, R12
  12460. JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12461. MOVOU X0, (AX)
  12462. MOVOU X1, 16(AX)
  12463. MOVOU X2, -32(AX)(R9*1)
  12464. MOVOU X3, -16(AX)(R9*1)
  12465. MOVQ R8, AX
  12466. emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
  12467. match_nolit_loop_encodeSnappyBlockAsm8B:
  12468. MOVL CX, DI
  12469. SUBL SI, DI
  12470. MOVL DI, 16(SP)
  12471. ADDL $0x04, CX
  12472. ADDL $0x04, SI
  12473. MOVQ src_len+32(FP), DI
  12474. SUBL CX, DI
  12475. LEAQ (DX)(CX*1), R8
  12476. LEAQ (DX)(SI*1), SI
  12477. // matchLen
  12478. XORL R10, R10
  12479. CMPL DI, $0x08
  12480. JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
  12481. matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B:
  12482. MOVQ (R8)(R10*1), R9
  12483. XORQ (SI)(R10*1), R9
  12484. TESTQ R9, R9
  12485. JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B
  12486. #ifdef GOAMD64_v3
  12487. TZCNTQ R9, R9
  12488. #else
  12489. BSFQ R9, R9
  12490. #endif
  12491. SARQ $0x03, R9
  12492. LEAL (R10)(R9*1), R10
  12493. JMP match_nolit_end_encodeSnappyBlockAsm8B
  12494. matchlen_loop_match_nolit_encodeSnappyBlockAsm8B:
  12495. LEAL -8(DI), DI
  12496. LEAL 8(R10), R10
  12497. CMPL DI, $0x08
  12498. JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B
  12499. JZ match_nolit_end_encodeSnappyBlockAsm8B
  12500. matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
  12501. CMPL DI, $0x04
  12502. JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
  12503. MOVL (R8)(R10*1), R9
  12504. CMPL (SI)(R10*1), R9
  12505. JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
  12506. SUBL $0x04, DI
  12507. LEAL 4(R10), R10
  12508. matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
  12509. CMPL DI, $0x02
  12510. JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
  12511. MOVW (R8)(R10*1), R9
  12512. CMPW (SI)(R10*1), R9
  12513. JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
  12514. SUBL $0x02, DI
  12515. LEAL 2(R10), R10
  12516. matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
  12517. CMPL DI, $0x01
  12518. JL match_nolit_end_encodeSnappyBlockAsm8B
  12519. MOVB (R8)(R10*1), R9
  12520. CMPB (SI)(R10*1), R9
  12521. JNE match_nolit_end_encodeSnappyBlockAsm8B
  12522. LEAL 1(R10), R10
  12523. match_nolit_end_encodeSnappyBlockAsm8B:
  12524. ADDL R10, CX
  12525. MOVL 16(SP), SI
  12526. ADDL $0x04, R10
  12527. MOVL CX, 12(SP)
  12528. // emitCopy
  12529. two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
  12530. CMPL R10, $0x40
  12531. JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
  12532. MOVB $0xee, (AX)
  12533. MOVW SI, 1(AX)
  12534. LEAL -60(R10), R10
  12535. ADDQ $0x03, AX
  12536. JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
  12537. two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
  12538. CMPL R10, $0x0c
  12539. JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
  12540. MOVB $0x01, BL
  12541. LEAL -16(BX)(R10*4), R10
  12542. MOVB SI, 1(AX)
  12543. SHRL $0x08, SI
  12544. SHLL $0x05, SI
  12545. ORL SI, R10
  12546. MOVB R10, (AX)
  12547. ADDQ $0x02, AX
  12548. JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
  12549. emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
  12550. MOVB $0x02, BL
  12551. LEAL -4(BX)(R10*4), R10
  12552. MOVB R10, (AX)
  12553. MOVW SI, 1(AX)
  12554. ADDQ $0x03, AX
  12555. match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
  12556. CMPL CX, 8(SP)
  12557. JGE emit_remainder_encodeSnappyBlockAsm8B
  12558. MOVQ -2(DX)(CX*1), DI
  12559. CMPQ AX, (SP)
  12560. JL match_nolit_dst_ok_encodeSnappyBlockAsm8B
  12561. MOVQ $0x00000000, ret+48(FP)
  12562. RET
  12563. match_nolit_dst_ok_encodeSnappyBlockAsm8B:
  12564. MOVQ $0x9e3779b1, R9
  12565. MOVQ DI, R8
  12566. SHRQ $0x10, DI
  12567. MOVQ DI, SI
  12568. SHLQ $0x20, R8
  12569. IMULQ R9, R8
  12570. SHRQ $0x38, R8
  12571. SHLQ $0x20, SI
  12572. IMULQ R9, SI
  12573. SHRQ $0x38, SI
  12574. LEAL -2(CX), R9
  12575. LEAQ 24(SP)(SI*4), R10
  12576. MOVL (R10), SI
  12577. MOVL R9, 24(SP)(R8*4)
  12578. MOVL CX, (R10)
  12579. CMPL (DX)(SI*1), DI
  12580. JEQ match_nolit_loop_encodeSnappyBlockAsm8B
  12581. INCL CX
  12582. JMP search_loop_encodeSnappyBlockAsm8B
  12583. emit_remainder_encodeSnappyBlockAsm8B:
  12584. MOVQ src_len+32(FP), CX
  12585. SUBL 12(SP), CX
  12586. LEAQ 3(AX)(CX*1), CX
  12587. CMPQ CX, (SP)
  12588. JL emit_remainder_ok_encodeSnappyBlockAsm8B
  12589. MOVQ $0x00000000, ret+48(FP)
  12590. RET
  12591. emit_remainder_ok_encodeSnappyBlockAsm8B:
  12592. MOVQ src_len+32(FP), CX
  12593. MOVL 12(SP), BX
  12594. CMPL BX, CX
  12595. JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
  12596. MOVL CX, SI
  12597. MOVL CX, 12(SP)
  12598. LEAQ (DX)(BX*1), CX
  12599. SUBL BX, SI
  12600. LEAL -1(SI), DX
  12601. CMPL DX, $0x3c
  12602. JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B
  12603. CMPL DX, $0x00000100
  12604. JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B
  12605. MOVB $0xf4, (AX)
  12606. MOVW DX, 1(AX)
  12607. ADDQ $0x03, AX
  12608. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
  12609. two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
  12610. MOVB $0xf0, (AX)
  12611. MOVB DL, 1(AX)
  12612. ADDQ $0x02, AX
  12613. CMPL DX, $0x40
  12614. JL memmove_emit_remainder_encodeSnappyBlockAsm8B
  12615. JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
  12616. one_byte_emit_remainder_encodeSnappyBlockAsm8B:
  12617. SHLB $0x02, DL
  12618. MOVB DL, (AX)
  12619. ADDQ $0x01, AX
  12620. memmove_emit_remainder_encodeSnappyBlockAsm8B:
  12621. LEAQ (AX)(SI*1), DX
  12622. MOVL SI, BX
  12623. // genMemMoveShort
  12624. CMPQ BX, $0x03
  12625. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
  12626. JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
  12627. CMPQ BX, $0x08
  12628. JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
  12629. CMPQ BX, $0x10
  12630. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
  12631. CMPQ BX, $0x20
  12632. JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
  12633. JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
  12634. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
  12635. MOVB (CX), SI
  12636. MOVB -1(CX)(BX*1), CL
  12637. MOVB SI, (AX)
  12638. MOVB CL, -1(AX)(BX*1)
  12639. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  12640. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
  12641. MOVW (CX), SI
  12642. MOVB 2(CX), CL
  12643. MOVW SI, (AX)
  12644. MOVB CL, 2(AX)
  12645. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  12646. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
  12647. MOVL (CX), SI
  12648. MOVL -4(CX)(BX*1), CX
  12649. MOVL SI, (AX)
  12650. MOVL CX, -4(AX)(BX*1)
  12651. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  12652. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
  12653. MOVQ (CX), SI
  12654. MOVQ -8(CX)(BX*1), CX
  12655. MOVQ SI, (AX)
  12656. MOVQ CX, -8(AX)(BX*1)
  12657. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  12658. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
  12659. MOVOU (CX), X0
  12660. MOVOU -16(CX)(BX*1), X1
  12661. MOVOU X0, (AX)
  12662. MOVOU X1, -16(AX)(BX*1)
  12663. JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
  12664. emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
  12665. MOVOU (CX), X0
  12666. MOVOU 16(CX), X1
  12667. MOVOU -32(CX)(BX*1), X2
  12668. MOVOU -16(CX)(BX*1), X3
  12669. MOVOU X0, (AX)
  12670. MOVOU X1, 16(AX)
  12671. MOVOU X2, -32(AX)(BX*1)
  12672. MOVOU X3, -16(AX)(BX*1)
  12673. memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
  12674. MOVQ DX, AX
  12675. JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
  12676. memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
  12677. LEAQ (AX)(SI*1), DX
  12678. MOVL SI, BX
  12679. // genMemMoveLong
  12680. MOVOU (CX), X0
  12681. MOVOU 16(CX), X1
  12682. MOVOU -32(CX)(BX*1), X2
  12683. MOVOU -16(CX)(BX*1), X3
  12684. MOVQ BX, DI
  12685. SHRQ $0x05, DI
  12686. MOVQ AX, SI
  12687. ANDL $0x0000001f, SI
  12688. MOVQ $0x00000040, R8
  12689. SUBQ SI, R8
  12690. DECQ DI
  12691. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12692. LEAQ -32(CX)(R8*1), SI
  12693. LEAQ -32(AX)(R8*1), R9
  12694. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
  12695. MOVOU (SI), X4
  12696. MOVOU 16(SI), X5
  12697. MOVOA X4, (R9)
  12698. MOVOA X5, 16(R9)
  12699. ADDQ $0x20, R9
  12700. ADDQ $0x20, SI
  12701. ADDQ $0x20, R8
  12702. DECQ DI
  12703. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
  12704. emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
  12705. MOVOU -32(CX)(R8*1), X4
  12706. MOVOU -16(CX)(R8*1), X5
  12707. MOVOA X4, -32(AX)(R8*1)
  12708. MOVOA X5, -16(AX)(R8*1)
  12709. ADDQ $0x20, R8
  12710. CMPQ BX, R8
  12711. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
  12712. MOVOU X0, (AX)
  12713. MOVOU X1, 16(AX)
  12714. MOVOU X2, -32(AX)(BX*1)
  12715. MOVOU X3, -16(AX)(BX*1)
  12716. MOVQ DX, AX
  12717. emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
  12718. MOVQ dst_base+0(FP), CX
  12719. SUBQ CX, AX
  12720. MOVQ AX, ret+48(FP)
  12721. RET
  12722. // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
  12723. // Requires: BMI, SSE2
         //
         // Encodes src into dst as a sequence of literal and copy elements and
         // stores the number of bytes written in ret. ret is set to 0 whenever
         // the output would reach the limit pointer kept at (SP).
         // The tag bytes written (low two bits 00 for literals, 01/10/11 for
         // copies carrying 1/2/4 offset bytes) are those of the Snappy block
         // format, as the function name suggests.
         //
         // Frame layout (327704 bytes of locals):
         //   0(SP)       8 bytes  dst limit = dst_base + uint32(src_len - 9 - src_len>>5)
         //   8(SP)       4 bytes  src_len - 8; searching stops once the next probe reaches it
         //   12(SP)      4 bytes  nextEmit: src index of the first byte not yet emitted
         //   16(SP)      4 bytes  offset of the last accepted match (written, never read here)
         //   20(SP)      4 bytes  nextS: the next probe position
         //   24(SP)      65536 x 4-byte entries, indexed by a 16-bit hash of 7 bytes ("long" table)
         //   262168(SP)  16384 x 4-byte entries, indexed by a 14-bit hash of 4 bytes ("short" table)
         //
         // Register roles in the main loop: AX = dst write cursor, DX = src base,
         // CX = current src index s. CX and DX are repurposed in emit_remainder.
         //
         // NOTE(review): dst_len is never read; the limit is derived from src_len
         // only, so the caller presumably guarantees dst is large enough — confirm.
  12724. TEXT ·encodeSnappyBetterBlockAsm(SB), $327704-56
  12725. MOVQ dst_base+0(FP), AX
         // Zero 0xa00 * 128 = 327680 bytes starting at 24(SP), i.e. both hash tables.
  12726. MOVQ $0x00000a00, CX
  12727. LEAQ 24(SP), DX
  12728. PXOR X0, X0
  12729. zero_loop_encodeSnappyBetterBlockAsm:
  12730. MOVOU X0, (DX)
  12731. MOVOU X0, 16(DX)
  12732. MOVOU X0, 32(DX)
  12733. MOVOU X0, 48(DX)
  12734. MOVOU X0, 64(DX)
  12735. MOVOU X0, 80(DX)
  12736. MOVOU X0, 96(DX)
  12737. MOVOU X0, 112(DX)
  12738. ADDQ $0x80, DX
  12739. DECQ CX
  12740. JNZ zero_loop_encodeSnappyBetterBlockAsm
         // nextEmit = 0: nothing has been emitted yet.
  12741. MOVL $0x00000000, 12(SP)
  12742. MOVQ src_len+32(FP), CX
  12743. LEAQ -9(CX), DX
  12744. LEAQ -8(CX), SI
         // 8(SP) = src_len - 8.
  12745. MOVL SI, 8(SP)
         // (SP) = dst_base + (src_len - 9 - (src_len >> 5)); the subtraction is 32-bit.
  12746. SHRQ $0x05, CX
  12747. SUBL CX, DX
  12748. LEAQ (AX)(DX*1), DX
  12749. MOVQ DX, (SP)
         // Start searching at s = 1; 16(SP) (last match offset) starts at 0.
  12750. MOVL $0x00000001, CX
  12751. MOVL $0x00000000, 16(SP)
  12752. MOVQ src_base+24(FP), DX
  12753. search_loop_encodeSnappyBetterBlockAsm:
         // Probe step: SI = (s - nextEmit) >> 7. If SI <= 99 then nextS = s + 1 + SI,
         // otherwise the step is capped and nextS = s + 100.
  12754. MOVL CX, SI
  12755. SUBL 12(SP), SI
  12756. SHRL $0x07, SI
  12757. CMPL SI, $0x63
  12758. JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
  12759. LEAL 100(CX), SI
  12760. JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
  12761. check_maxskip_ok_encodeSnappyBetterBlockAsm:
  12762. LEAL 1(CX)(SI*1), SI
  12763. check_maxskip_cont_encodeSnappyBetterBlockAsm:
         // Stop searching once nextS >= src_len - 8 and flush the tail as a literal.
  12764. CMPL SI, 8(SP)
  12765. JGE emit_remainder_encodeSnappyBetterBlockAsm
         // DI = 8 bytes at src[s]; remember nextS in 20(SP).
  12766. MOVQ (DX)(CX*1), DI
  12767. MOVL SI, 20(SP)
         // R10 = ((v << 8) * 0x00cf1bbcdcbfa563) >> 48: 16-bit hash of the low 7 bytes.
         // R11 = ((v << 32) * 0x9e3779b1) >> 50:        14-bit hash of the low 4 bytes.
  12768. MOVQ $0x00cf1bbcdcbfa563, R9
  12769. MOVQ $0x9e3779b1, SI
  12770. MOVQ DI, R10
  12771. MOVQ DI, R11
  12772. SHLQ $0x08, R10
  12773. IMULQ R9, R10
  12774. SHRQ $0x30, R10
  12775. SHLQ $0x20, R11
  12776. IMULQ SI, R11
  12777. SHRQ $0x32, R11
         // SI = long-table candidate, R8 = short-table candidate; then record s in both tables.
  12778. MOVL 24(SP)(R10*4), SI
  12779. MOVL 262168(SP)(R11*4), R8
  12780. MOVL CX, 24(SP)(R10*4)
  12781. MOVL CX, 262168(SP)(R11*4)
         // A candidate is accepted when its first 4 bytes equal the low 4 bytes of DI.
         // The long-table candidate is tried first.
  12782. CMPL (DX)(SI*1), DI
  12783. JEQ candidate_match_encodeSnappyBetterBlockAsm
  12784. CMPL (DX)(R8*1), DI
  12785. JEQ candidateS_match_encodeSnappyBetterBlockAsm
         // No match: advance to nextS and probe again.
  12786. MOVL 20(SP), CX
  12787. JMP search_loop_encodeSnappyBetterBlockAsm
  12788. candidateS_match_encodeSnappyBetterBlockAsm:
         // The short candidate matched at s. Before using it, look up the long table
         // for position s+1 (DI >> 8 is the data at s+1) and record s+1 there.
         // If that long candidate matches, continue with s+1 and the long candidate;
         // otherwise restore s and fall back to the short candidate in R8.
  12789. SHRQ $0x08, DI
  12790. MOVQ DI, R10
  12791. SHLQ $0x08, R10
  12792. IMULQ R9, R10
  12793. SHRQ $0x30, R10
  12794. MOVL 24(SP)(R10*4), SI
  12795. INCL CX
  12796. MOVL CX, 24(SP)(R10*4)
  12797. CMPL (DX)(SI*1), DI
  12798. JEQ candidate_match_encodeSnappyBetterBlockAsm
  12799. DECL CX
  12800. MOVL R8, SI
  12801. candidate_match_encodeSnappyBetterBlockAsm:
         // Extend the match backwards: while s > nextEmit, candidate > 0 and the
         // preceding bytes are equal, step both s (CX) and the candidate (SI) back by one.
  12802. MOVL 12(SP), DI
  12803. TESTL SI, SI
  12804. JZ match_extend_back_end_encodeSnappyBetterBlockAsm
  12805. match_extend_back_loop_encodeSnappyBetterBlockAsm:
  12806. CMPL CX, DI
  12807. JLE match_extend_back_end_encodeSnappyBetterBlockAsm
  12808. MOVB -1(DX)(SI*1), BL
  12809. MOVB -1(DX)(CX*1), R8
  12810. CMPB BL, R8
  12811. JNE match_extend_back_end_encodeSnappyBetterBlockAsm
  12812. LEAL -1(CX), CX
  12813. DECL SI
  12814. JZ match_extend_back_end_encodeSnappyBetterBlockAsm
  12815. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
  12816. match_extend_back_end_encodeSnappyBetterBlockAsm:
         // Return 0 unless AX + (s - nextEmit) + 5 is still below the dst limit at (SP).
  12817. MOVL CX, DI
  12818. SUBL 12(SP), DI
  12819. LEAQ 5(AX)(DI*1), DI
  12820. CMPQ DI, (SP)
  12821. JL match_dst_size_check_encodeSnappyBetterBlockAsm
  12822. MOVQ $0x00000000, ret+48(FP)
  12823. RET
  12824. match_dst_size_check_encodeSnappyBetterBlockAsm:
         // DI = match start. Skip the 4 bytes already known to match on both sides.
         // R8 = bytes of src remaining after s, R9 = &src[s], R10 = &src[candidate].
  12825. MOVL CX, DI
  12826. ADDL $0x04, CX
  12827. ADDL $0x04, SI
  12828. MOVQ src_len+32(FP), R8
  12829. SUBL CX, R8
  12830. LEAQ (DX)(CX*1), R9
  12831. LEAQ (DX)(SI*1), R10
         // R12 accumulates the number of further matching bytes (beyond the first 4).
  12832. // matchLen
  12833. XORL R12, R12
  12834. CMPL R8, $0x08
  12835. JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
  12836. matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
         // Compare 8 bytes at a time. On a difference, the index of the lowest set
         // bit of the XOR divided by 8 is the number of equal leading bytes.
  12837. MOVQ (R9)(R12*1), R11
  12838. XORQ (R10)(R12*1), R11
  12839. TESTQ R11, R11
  12840. JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm
  12841. #ifdef GOAMD64_v3
  12842. TZCNTQ R11, R11
  12843. #else
  12844. BSFQ R11, R11
  12845. #endif
  12846. SARQ $0x03, R11
  12847. LEAL (R12)(R11*1), R12
  12848. JMP match_nolit_end_encodeSnappyBetterBlockAsm
  12849. matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
  12850. LEAL -8(R8), R8
  12851. LEAL 8(R12), R12
  12852. CMPL R8, $0x08
  12853. JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
         // NOTE(review): the flags here still come from the CMPL against 8, and the
         // equal case was already taken by the JGE above, so this JZ appears never
         // to be taken; execution falls through to the 4/2/1-byte tail checks.
  12854. JZ match_nolit_end_encodeSnappyBetterBlockAsm
  12855. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
         // Fewer than 8 bytes remain: try a 4-byte, then a 2-byte, then a 1-byte compare.
  12856. CMPL R8, $0x04
  12857. JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
  12858. MOVL (R9)(R12*1), R11
  12859. CMPL (R10)(R12*1), R11
  12860. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
  12861. SUBL $0x04, R8
  12862. LEAL 4(R12), R12
  12863. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
  12864. CMPL R8, $0x02
  12865. JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
  12866. MOVW (R9)(R12*1), R11
  12867. CMPW (R10)(R12*1), R11
  12868. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
  12869. SUBL $0x02, R8
  12870. LEAL 2(R12), R12
  12871. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
  12872. CMPL R8, $0x01
  12873. JL match_nolit_end_encodeSnappyBetterBlockAsm
  12874. MOVB (R9)(R12*1), R11
  12875. CMPB (R10)(R12*1), R11
  12876. JNE match_nolit_end_encodeSnappyBetterBlockAsm
  12877. LEAL 1(R12), R12
  12878. match_nolit_end_encodeSnappyBetterBlockAsm:
         // R8 = match offset (s - candidate).
  12879. MOVL CX, R8
  12880. SUBL SI, R8
  12881. // Check if repeat
         // No comparison against the stored offset at 16(SP) is made here. The code
         // below only rejects a match that has at most 1 byte beyond the first 4
         // and an offset above 0xffff; in that case searching resumes at nextS + 1
         // with nextEmit left unchanged.
  12882. CMPL R12, $0x01
  12883. JG match_length_ok_encodeSnappyBetterBlockAsm
  12884. CMPL R8, $0x0000ffff
  12885. JLE match_length_ok_encodeSnappyBetterBlockAsm
  12886. MOVL 20(SP), CX
  12887. INCL CX
  12888. JMP search_loop_encodeSnappyBetterBlockAsm
  12889. match_length_ok_encodeSnappyBetterBlockAsm:
         // Record the offset, then emit src[nextEmit:matchStart] as a literal if non-empty.
         // R9 = literal length, R10 = literal source pointer, SI = length - 1.
  12890. MOVL R8, 16(SP)
  12891. MOVL 12(SP), SI
  12892. CMPL SI, DI
  12893. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
  12894. MOVL DI, R9
  12895. MOVL DI, 12(SP)
  12896. LEAQ (DX)(SI*1), R10
  12897. SUBL SI, R9
  12898. LEAL -1(R9), SI
         // Literal header, selected by (length - 1):
         //   < 60      one tag byte (length-1) << 2
         //   < 1<<8    0xf0 + 1 length byte
         //   < 1<<16   0xf4 + 2 length bytes
         //   < 1<<24   0xf8 + 3 length bytes
         //   otherwise 0xfc + 4 length bytes
  12899. CMPL SI, $0x3c
  12900. JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
  12901. CMPL SI, $0x00000100
  12902. JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
  12903. CMPL SI, $0x00010000
  12904. JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
  12905. CMPL SI, $0x01000000
  12906. JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
  12907. MOVB $0xfc, (AX)
  12908. MOVL SI, 1(AX)
  12909. ADDQ $0x05, AX
  12910. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  12911. four_bytes_match_emit_encodeSnappyBetterBlockAsm:
  12912. MOVL SI, R11
  12913. SHRL $0x10, R11
  12914. MOVB $0xf8, (AX)
  12915. MOVW SI, 1(AX)
  12916. MOVB R11, 3(AX)
  12917. ADDQ $0x04, AX
  12918. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  12919. three_bytes_match_emit_encodeSnappyBetterBlockAsm:
  12920. MOVB $0xf4, (AX)
  12921. MOVW SI, 1(AX)
  12922. ADDQ $0x03, AX
  12923. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  12924. two_bytes_match_emit_encodeSnappyBetterBlockAsm:
  12925. MOVB $0xf0, (AX)
  12926. MOVB SI, 1(AX)
  12927. ADDQ $0x02, AX
         // Lengths up to 64 use the short copy; longer ones use the long copy.
  12928. CMPL SI, $0x40
  12929. JL memmove_match_emit_encodeSnappyBetterBlockAsm
  12930. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
  12931. one_byte_match_emit_encodeSnappyBetterBlockAsm:
  12932. SHLB $0x02, SI
  12933. MOVB SI, (AX)
  12934. ADDQ $0x01, AX
  12935. memmove_match_emit_encodeSnappyBetterBlockAsm:
         // SI = dst cursor after the literal body (AX + length).
  12936. LEAQ (AX)(R9*1), SI
  12937. // genMemMoveShort
         // Length <= 8 always moves a full 8 bytes; the cursor still advances by
         // the real length only. Larger sizes use overlapping head/tail moves.
         // NOTE(review): the fixed 8-byte move reads and writes past the literal
         // when it is shorter than 8; presumably both buffers have slack — confirm.
  12938. CMPQ R9, $0x08
  12939. JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
  12940. CMPQ R9, $0x10
  12941. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
  12942. CMPQ R9, $0x20
  12943. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
  12944. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
  12945. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
  12946. MOVQ (R10), R11
  12947. MOVQ R11, (AX)
  12948. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  12949. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
  12950. MOVQ (R10), R11
  12951. MOVQ -8(R10)(R9*1), R10
  12952. MOVQ R11, (AX)
  12953. MOVQ R10, -8(AX)(R9*1)
  12954. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  12955. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
  12956. MOVOU (R10), X0
  12957. MOVOU -16(R10)(R9*1), X1
  12958. MOVOU X0, (AX)
  12959. MOVOU X1, -16(AX)(R9*1)
  12960. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
  12961. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
  12962. MOVOU (R10), X0
  12963. MOVOU 16(R10), X1
  12964. MOVOU -32(R10)(R9*1), X2
  12965. MOVOU -16(R10)(R9*1), X3
  12966. MOVOU X0, (AX)
  12967. MOVOU X1, 16(AX)
  12968. MOVOU X2, -32(AX)(R9*1)
  12969. MOVOU X3, -16(AX)(R9*1)
  12970. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
  12971. MOVQ SI, AX
  12972. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
  12973. memmove_long_match_emit_encodeSnappyBetterBlockAsm:
         // Every branch that reaches here has length - 1 >= 64, i.e. R9 >= 65.
  12974. LEAQ (AX)(R9*1), SI
  12975. // genMemMoveLong
         // X0..X3 hold the first and last 32 source bytes; they are stored last with
         // unaligned moves and cover the edges around the aligned middle section.
  12976. MOVOU (R10), X0
  12977. MOVOU 16(R10), X1
  12978. MOVOU -32(R10)(R9*1), X2
  12979. MOVOU -16(R10)(R9*1), X3
         // R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned; this is what
         // allows the aligned MOVOA stores in the loops below.
  12980. MOVQ R9, R13
  12981. SHRQ $0x05, R13
  12982. MOVQ AX, R11
  12983. ANDL $0x0000001f, R11
  12984. MOVQ $0x00000040, R14
  12985. SUBQ R11, R14
         // NOTE(review): DECQ leaves CF as set by the SUBQ above (no borrow) and, with
         // R9 >= 65, yields a non-zero result, so this JA appears always taken and
         // the big_loop_back block looks unreachable from here — confirm.
  12986. DECQ R13
  12987. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  12988. LEAQ -32(R10)(R14*1), R11
  12989. LEAQ -32(AX)(R14*1), R15
  12990. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
  12991. MOVOU (R11), X4
  12992. MOVOU 16(R11), X5
  12993. MOVOA X4, (R15)
  12994. MOVOA X5, 16(R15)
  12995. ADDQ $0x20, R15
  12996. ADDQ $0x20, R11
  12997. ADDQ $0x20, R14
  12998. DECQ R13
  12999. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
  13000. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
         // Copy 32 bytes per iteration at offset R14 - 32 while R9 >= R14.
  13001. MOVOU -32(R10)(R14*1), X4
  13002. MOVOU -16(R10)(R14*1), X5
  13003. MOVOA X4, -32(AX)(R14*1)
  13004. MOVOA X5, -16(AX)(R14*1)
  13005. ADDQ $0x20, R14
  13006. CMPQ R9, R14
  13007. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13008. MOVOU X0, (AX)
  13009. MOVOU X1, 16(AX)
  13010. MOVOU X2, -32(AX)(R9*1)
  13011. MOVOU X3, -16(AX)(R9*1)
  13012. MOVQ SI, AX
  13013. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
         // CX = end of match, R12 = total match length (extension + 4),
         // nextEmit = end of match.
  13014. ADDL R12, CX
  13015. ADDL $0x04, R12
  13016. MOVL CX, 12(SP)
  13017. // emitCopy
         // Offsets of 65536 or more need the 5-byte form (tag + 4 offset bytes).
  13018. CMPL R8, $0x00010000
  13019. JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
  13020. four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
         // While more than 64 bytes remain, emit tag 0xff ((64-1)<<2 | 3) + offset
         // and subtract 64. If fewer than 4 bytes are then left, go finish them.
  13021. CMPL R12, $0x40
  13022. JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
  13023. MOVB $0xff, (AX)
  13024. MOVL R8, 1(AX)
  13025. LEAL -64(R12), R12
  13026. ADDQ $0x05, AX
  13027. CMPL R12, $0x04
  13028. JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
  13029. JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
  13030. four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
         // Final 4-byte-offset copy: tag = (length-1)<<2 | 3. Only the low byte of the
         // LEAL result is stored, so the upper bits of BX are irrelevant.
  13031. TESTL R12, R12
  13032. JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13033. MOVB $0x03, BL
  13034. LEAL -4(BX)(R12*4), R12
  13035. MOVB R12, (AX)
  13036. MOVL R8, 1(AX)
  13037. ADDQ $0x05, AX
  13038. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13039. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
         // While more than 64 bytes remain, emit tag 0xee ((60-1)<<2 | 2) + 2 offset
         // bytes and subtract 60.
  13040. CMPL R12, $0x40
  13041. JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
  13042. MOVB $0xee, (AX)
  13043. MOVW R8, 1(AX)
  13044. LEAL -60(R12), R12
  13045. ADDQ $0x03, AX
  13046. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
  13047. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
         // The 2-byte form is used only for length < 12 and offset < 2048:
         // tag = (offset>>8)<<5 | (length-4)<<2 | 1, followed by the low offset byte.
  13048. CMPL R12, $0x0c
  13049. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
  13050. CMPL R8, $0x00000800
  13051. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
  13052. MOVB $0x01, BL
  13053. LEAL -16(BX)(R12*4), R12
  13054. MOVB R8, 1(AX)
  13055. SHRL $0x08, R8
  13056. SHLL $0x05, R8
  13057. ORL R8, R12
  13058. MOVB R12, (AX)
  13059. ADDQ $0x02, AX
  13060. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
  13061. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
         // 3-byte form: tag = (length-1)<<2 | 2, followed by a 16-bit offset.
  13062. MOVB $0x02, BL
  13063. LEAL -4(BX)(R12*4), R12
  13064. MOVB R12, (AX)
  13065. MOVW R8, 1(AX)
  13066. ADDQ $0x03, AX
  13067. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
         // Finish with the remainder if s reached src_len - 8; return 0 if the
         // dst cursor reached the limit.
  13068. CMPL CX, 8(SP)
  13069. JGE emit_remainder_encodeSnappyBetterBlockAsm
  13070. CMPQ AX, (SP)
  13071. JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
  13072. MOVQ $0x00000000, ret+48(FP)
  13073. RET
  13074. match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
         // Index extra positions around the match before resuming the search.
         // DI still holds the match start and CX the match end:
         //   long table:  start+1, start+2, end-2, end-1
         //   short table: start+2, start+3, end-1
  13075. MOVQ $0x00cf1bbcdcbfa563, SI
  13076. MOVQ $0x9e3779b1, R8
  13077. INCL DI
  13078. MOVQ (DX)(DI*1), R9
  13079. MOVQ R9, R10
  13080. MOVQ R9, R11
  13081. MOVQ R9, R12
  13082. SHRQ $0x08, R11
  13083. MOVQ R11, R13
  13084. SHRQ $0x10, R12
  13085. LEAL 1(DI), R14
  13086. LEAL 2(DI), R15
  13087. MOVQ -2(DX)(CX*1), R9
  13088. SHLQ $0x08, R10
  13089. IMULQ SI, R10
  13090. SHRQ $0x30, R10
  13091. SHLQ $0x08, R13
  13092. IMULQ SI, R13
  13093. SHRQ $0x30, R13
  13094. SHLQ $0x20, R11
  13095. IMULQ R8, R11
  13096. SHRQ $0x32, R11
  13097. SHLQ $0x20, R12
  13098. IMULQ R8, R12
  13099. SHRQ $0x32, R12
  13100. MOVL DI, 24(SP)(R10*4)
  13101. MOVL R14, 24(SP)(R13*4)
  13102. MOVL R14, 262168(SP)(R11*4)
  13103. MOVL R15, 262168(SP)(R12*4)
  13104. MOVQ R9, R10
  13105. MOVQ R9, R11
  13106. SHRQ $0x08, R11
  13107. MOVQ R11, R13
  13108. LEAL -2(CX), R9
  13109. LEAL -1(CX), DI
  13110. SHLQ $0x08, R10
  13111. IMULQ SI, R10
  13112. SHRQ $0x30, R10
  13113. SHLQ $0x20, R11
  13114. IMULQ R8, R11
  13115. SHRQ $0x32, R11
  13116. SHLQ $0x08, R13
  13117. IMULQ SI, R13
  13118. SHRQ $0x30, R13
  13119. MOVL R9, 24(SP)(R10*4)
  13120. MOVL DI, 262168(SP)(R11*4)
  13121. MOVL DI, 24(SP)(R13*4)
  13122. JMP search_loop_encodeSnappyBetterBlockAsm
  13123. emit_remainder_encodeSnappyBetterBlockAsm:
         // Return 0 unless AX + (src_len - nextEmit) + 5 is below the dst limit at (SP).
  13124. MOVQ src_len+32(FP), CX
  13125. SUBL 12(SP), CX
  13126. LEAQ 5(AX)(CX*1), CX
  13127. CMPQ CX, (SP)
  13128. JL emit_remainder_ok_encodeSnappyBetterBlockAsm
  13129. MOVQ $0x00000000, ret+48(FP)
  13130. RET
  13131. emit_remainder_ok_encodeSnappyBetterBlockAsm:
         // Emit src[nextEmit:src_len] as one literal, if non-empty. From here on
         // CX = literal source pointer, SI = length, DX = length - 1 (the src base
         // in DX is no longer needed).
  13132. MOVQ src_len+32(FP), CX
  13133. MOVL 12(SP), BX
  13134. CMPL BX, CX
  13135. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
  13136. MOVL CX, SI
  13137. MOVL CX, 12(SP)
  13138. LEAQ (DX)(BX*1), CX
  13139. SUBL BX, SI
  13140. LEAL -1(SI), DX
         // Same literal header selection as in the match path, keyed on length - 1.
  13141. CMPL DX, $0x3c
  13142. JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm
  13143. CMPL DX, $0x00000100
  13144. JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13145. CMPL DX, $0x00010000
  13146. JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13147. CMPL DX, $0x01000000
  13148. JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
  13149. MOVB $0xfc, (AX)
  13150. MOVL DX, 1(AX)
  13151. ADDQ $0x05, AX
  13152. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13153. four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13154. MOVL DX, BX
  13155. SHRL $0x10, BX
  13156. MOVB $0xf8, (AX)
  13157. MOVW DX, 1(AX)
  13158. MOVB BL, 3(AX)
  13159. ADDQ $0x04, AX
  13160. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13161. three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13162. MOVB $0xf4, (AX)
  13163. MOVW DX, 1(AX)
  13164. ADDQ $0x03, AX
  13165. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13166. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
  13167. MOVB $0xf0, (AX)
  13168. MOVB DL, 1(AX)
  13169. ADDQ $0x02, AX
  13170. CMPL DX, $0x40
  13171. JL memmove_emit_remainder_encodeSnappyBetterBlockAsm
  13172. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
  13173. one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
  13174. SHLB $0x02, DL
  13175. MOVB DL, (AX)
  13176. ADDQ $0x01, AX
  13177. memmove_emit_remainder_encodeSnappyBetterBlockAsm:
         // DX = dst cursor after the literal body, BX = length.
  13178. LEAQ (AX)(SI*1), DX
  13179. MOVL SI, BX
  13180. // genMemMoveShort
         // Unlike the match path, every size class here moves exactly BX bytes
         // (overlapping head/tail moves), with no fixed 8-byte over-copy.
  13181. CMPQ BX, $0x03
  13182. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
  13183. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
  13184. CMPQ BX, $0x08
  13185. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
  13186. CMPQ BX, $0x10
  13187. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
  13188. CMPQ BX, $0x20
  13189. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
  13190. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
  13191. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
  13192. MOVB (CX), SI
  13193. MOVB -1(CX)(BX*1), CL
  13194. MOVB SI, (AX)
  13195. MOVB CL, -1(AX)(BX*1)
  13196. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13197. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
  13198. MOVW (CX), SI
  13199. MOVB 2(CX), CL
  13200. MOVW SI, (AX)
  13201. MOVB CL, 2(AX)
  13202. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13203. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
  13204. MOVL (CX), SI
  13205. MOVL -4(CX)(BX*1), CX
  13206. MOVL SI, (AX)
  13207. MOVL CX, -4(AX)(BX*1)
  13208. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13209. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
  13210. MOVQ (CX), SI
  13211. MOVQ -8(CX)(BX*1), CX
  13212. MOVQ SI, (AX)
  13213. MOVQ CX, -8(AX)(BX*1)
  13214. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13215. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
  13216. MOVOU (CX), X0
  13217. MOVOU -16(CX)(BX*1), X1
  13218. MOVOU X0, (AX)
  13219. MOVOU X1, -16(AX)(BX*1)
  13220. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
  13221. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
  13222. MOVOU (CX), X0
  13223. MOVOU 16(CX), X1
  13224. MOVOU -32(CX)(BX*1), X2
  13225. MOVOU -16(CX)(BX*1), X3
  13226. MOVOU X0, (AX)
  13227. MOVOU X1, 16(AX)
  13228. MOVOU X2, -32(AX)(BX*1)
  13229. MOVOU X3, -16(AX)(BX*1)
  13230. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
  13231. MOVQ DX, AX
  13232. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
  13233. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
         // Every branch that reaches here has length - 1 >= 64, i.e. BX >= 65.
  13234. LEAQ (AX)(SI*1), DX
  13235. MOVL SI, BX
  13236. // genMemMoveLong
         // Same scheme as the match-path long copy: first/last 32 bytes are kept in
         // X0..X3, the middle is copied in 32-byte steps with aligned stores
         // (AX + R8 - 32 is 32-byte aligned), then the edges are stored unaligned.
  13237. MOVOU (CX), X0
  13238. MOVOU 16(CX), X1
  13239. MOVOU -32(CX)(BX*1), X2
  13240. MOVOU -16(CX)(BX*1), X3
  13241. MOVQ BX, DI
  13242. SHRQ $0x05, DI
  13243. MOVQ AX, SI
  13244. ANDL $0x0000001f, SI
  13245. MOVQ $0x00000040, R8
  13246. SUBQ SI, R8
         // NOTE(review): as in the match path, with BX >= 65 this JA appears always
         // taken, which would make big_loop_back unreachable from here — confirm.
  13247. DECQ DI
  13248. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13249. LEAQ -32(CX)(R8*1), SI
  13250. LEAQ -32(AX)(R8*1), R9
  13251. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
  13252. MOVOU (SI), X4
  13253. MOVOU 16(SI), X5
  13254. MOVOA X4, (R9)
  13255. MOVOA X5, 16(R9)
  13256. ADDQ $0x20, R9
  13257. ADDQ $0x20, SI
  13258. ADDQ $0x20, R8
  13259. DECQ DI
  13260. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
  13261. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
  13262. MOVOU -32(CX)(R8*1), X4
  13263. MOVOU -16(CX)(R8*1), X5
  13264. MOVOA X4, -32(AX)(R8*1)
  13265. MOVOA X5, -16(AX)(R8*1)
  13266. ADDQ $0x20, R8
  13267. CMPQ BX, R8
  13268. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
  13269. MOVOU X0, (AX)
  13270. MOVOU X1, 16(AX)
  13271. MOVOU X2, -32(AX)(BX*1)
  13272. MOVOU X3, -16(AX)(BX*1)
  13273. MOVQ DX, AX
  13274. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
         // ret = AX - dst_base: the number of bytes written to dst.
  13275. MOVQ dst_base+0(FP), CX
  13276. SUBQ CX, AX
  13277. MOVQ AX, ret+48(FP)
  13278. RET
  13279. // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
  13280. // Requires: BMI, SSE2
  13281. TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
  13282. MOVQ dst_base+0(FP), AX
  13283. MOVQ $0x00000a00, CX
  13284. LEAQ 24(SP), DX
  13285. PXOR X0, X0
  13286. zero_loop_encodeSnappyBetterBlockAsm64K:
  13287. MOVOU X0, (DX)
  13288. MOVOU X0, 16(DX)
  13289. MOVOU X0, 32(DX)
  13290. MOVOU X0, 48(DX)
  13291. MOVOU X0, 64(DX)
  13292. MOVOU X0, 80(DX)
  13293. MOVOU X0, 96(DX)
  13294. MOVOU X0, 112(DX)
  13295. ADDQ $0x80, DX
  13296. DECQ CX
  13297. JNZ zero_loop_encodeSnappyBetterBlockAsm64K
  13298. MOVL $0x00000000, 12(SP)
  13299. MOVQ src_len+32(FP), CX
  13300. LEAQ -9(CX), DX
  13301. LEAQ -8(CX), SI
  13302. MOVL SI, 8(SP)
  13303. SHRQ $0x05, CX
  13304. SUBL CX, DX
  13305. LEAQ (AX)(DX*1), DX
  13306. MOVQ DX, (SP)
  13307. MOVL $0x00000001, CX
  13308. MOVL $0x00000000, 16(SP)
  13309. MOVQ src_base+24(FP), DX
  13310. search_loop_encodeSnappyBetterBlockAsm64K:
  13311. MOVL CX, SI
  13312. SUBL 12(SP), SI
  13313. SHRL $0x07, SI
  13314. LEAL 1(CX)(SI*1), SI
  13315. CMPL SI, 8(SP)
  13316. JGE emit_remainder_encodeSnappyBetterBlockAsm64K
  13317. MOVQ (DX)(CX*1), DI
  13318. MOVL SI, 20(SP)
  13319. MOVQ $0x00cf1bbcdcbfa563, R9
  13320. MOVQ $0x9e3779b1, SI
  13321. MOVQ DI, R10
  13322. MOVQ DI, R11
  13323. SHLQ $0x08, R10
  13324. IMULQ R9, R10
  13325. SHRQ $0x30, R10
  13326. SHLQ $0x20, R11
  13327. IMULQ SI, R11
  13328. SHRQ $0x32, R11
  13329. MOVL 24(SP)(R10*4), SI
  13330. MOVL 262168(SP)(R11*4), R8
  13331. MOVL CX, 24(SP)(R10*4)
  13332. MOVL CX, 262168(SP)(R11*4)
  13333. CMPL (DX)(SI*1), DI
  13334. JEQ candidate_match_encodeSnappyBetterBlockAsm64K
  13335. CMPL (DX)(R8*1), DI
  13336. JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
  13337. MOVL 20(SP), CX
  13338. JMP search_loop_encodeSnappyBetterBlockAsm64K
  13339. candidateS_match_encodeSnappyBetterBlockAsm64K:
  13340. SHRQ $0x08, DI
  13341. MOVQ DI, R10
  13342. SHLQ $0x08, R10
  13343. IMULQ R9, R10
  13344. SHRQ $0x30, R10
  13345. MOVL 24(SP)(R10*4), SI
  13346. INCL CX
  13347. MOVL CX, 24(SP)(R10*4)
  13348. CMPL (DX)(SI*1), DI
  13349. JEQ candidate_match_encodeSnappyBetterBlockAsm64K
  13350. DECL CX
  13351. MOVL R8, SI
  13352. candidate_match_encodeSnappyBetterBlockAsm64K:
  13353. MOVL 12(SP), DI
  13354. TESTL SI, SI
  13355. JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13356. match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
  13357. CMPL CX, DI
  13358. JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13359. MOVB -1(DX)(SI*1), BL
  13360. MOVB -1(DX)(CX*1), R8
  13361. CMPB BL, R8
  13362. JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13363. LEAL -1(CX), CX
  13364. DECL SI
  13365. JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
  13366. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
  13367. match_extend_back_end_encodeSnappyBetterBlockAsm64K:
  13368. MOVL CX, DI
  13369. SUBL 12(SP), DI
  13370. LEAQ 3(AX)(DI*1), DI
  13371. CMPQ DI, (SP)
  13372. JL match_dst_size_check_encodeSnappyBetterBlockAsm64K
  13373. MOVQ $0x00000000, ret+48(FP)
  13374. RET
  13375. match_dst_size_check_encodeSnappyBetterBlockAsm64K:
  13376. MOVL CX, DI
  13377. ADDL $0x04, CX
  13378. ADDL $0x04, SI
  13379. MOVQ src_len+32(FP), R8
  13380. SUBL CX, R8
  13381. LEAQ (DX)(CX*1), R9
  13382. LEAQ (DX)(SI*1), R10
  13383. // matchLen
  13384. XORL R12, R12
  13385. CMPL R8, $0x08
  13386. JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
  13387. matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K:
  13388. MOVQ (R9)(R12*1), R11
  13389. XORQ (R10)(R12*1), R11
  13390. TESTQ R11, R11
  13391. JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K
  13392. #ifdef GOAMD64_v3
  13393. TZCNTQ R11, R11
  13394. #else
  13395. BSFQ R11, R11
  13396. #endif
  13397. SARQ $0x03, R11
  13398. LEAL (R12)(R11*1), R12
  13399. JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
  13400. matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K:
  13401. LEAL -8(R8), R8
  13402. LEAL 8(R12), R12
  13403. CMPL R8, $0x08
  13404. JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K
  13405. JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
  13406. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
  13407. CMPL R8, $0x04
  13408. JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
  13409. MOVL (R9)(R12*1), R11
  13410. CMPL (R10)(R12*1), R11
  13411. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
  13412. SUBL $0x04, R8
  13413. LEAL 4(R12), R12
  13414. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
  13415. CMPL R8, $0x02
  13416. JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
  13417. MOVW (R9)(R12*1), R11
  13418. CMPW (R10)(R12*1), R11
  13419. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
  13420. SUBL $0x02, R8
  13421. LEAL 2(R12), R12
  13422. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
  13423. CMPL R8, $0x01
  13424. JL match_nolit_end_encodeSnappyBetterBlockAsm64K
  13425. MOVB (R9)(R12*1), R11
  13426. CMPB (R10)(R12*1), R11
  13427. JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
  13428. LEAL 1(R12), R12
  13429. match_nolit_end_encodeSnappyBetterBlockAsm64K:
  13430. MOVL CX, R8
  13431. SUBL SI, R8
  13432. // Check if repeat
  13433. MOVL R8, 16(SP)
  13434. MOVL 12(SP), SI
  13435. CMPL SI, DI
  13436. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
  13437. MOVL DI, R9
  13438. MOVL DI, 12(SP)
  13439. LEAQ (DX)(SI*1), R10
  13440. SUBL SI, R9
  13441. LEAL -1(R9), SI
  13442. CMPL SI, $0x3c
  13443. JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K
  13444. CMPL SI, $0x00000100
  13445. JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
  13446. MOVB $0xf4, (AX)
  13447. MOVW SI, 1(AX)
  13448. ADDQ $0x03, AX
  13449. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
  13450. two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
  13451. MOVB $0xf0, (AX)
  13452. MOVB SI, 1(AX)
  13453. ADDQ $0x02, AX
  13454. CMPL SI, $0x40
  13455. JL memmove_match_emit_encodeSnappyBetterBlockAsm64K
  13456. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
  13457. one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
  13458. SHLB $0x02, SI
  13459. MOVB SI, (AX)
  13460. ADDQ $0x01, AX
  13461. memmove_match_emit_encodeSnappyBetterBlockAsm64K:
  13462. LEAQ (AX)(R9*1), SI
  13463. // genMemMoveShort
  13464. CMPQ R9, $0x08
  13465. JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
  13466. CMPQ R9, $0x10
  13467. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
  13468. CMPQ R9, $0x20
  13469. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
  13470. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
  13471. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
  13472. MOVQ (R10), R11
  13473. MOVQ R11, (AX)
  13474. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  13475. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
  13476. MOVQ (R10), R11
  13477. MOVQ -8(R10)(R9*1), R10
  13478. MOVQ R11, (AX)
  13479. MOVQ R10, -8(AX)(R9*1)
  13480. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  13481. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
  13482. MOVOU (R10), X0
  13483. MOVOU -16(R10)(R9*1), X1
  13484. MOVOU X0, (AX)
  13485. MOVOU X1, -16(AX)(R9*1)
  13486. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
  13487. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
  13488. MOVOU (R10), X0
  13489. MOVOU 16(R10), X1
  13490. MOVOU -32(R10)(R9*1), X2
  13491. MOVOU -16(R10)(R9*1), X3
  13492. MOVOU X0, (AX)
  13493. MOVOU X1, 16(AX)
  13494. MOVOU X2, -32(AX)(R9*1)
  13495. MOVOU X3, -16(AX)(R9*1)
  13496. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
  13497. MOVQ SI, AX
  13498. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
  13499. memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
  13500. LEAQ (AX)(R9*1), SI
  13501. // genMemMoveLong
  13502. MOVOU (R10), X0
  13503. MOVOU 16(R10), X1
  13504. MOVOU -32(R10)(R9*1), X2
  13505. MOVOU -16(R10)(R9*1), X3
  13506. MOVQ R9, R13
  13507. SHRQ $0x05, R13
  13508. MOVQ AX, R11
  13509. ANDL $0x0000001f, R11
  13510. MOVQ $0x00000040, R14
  13511. SUBQ R11, R14
  13512. DECQ R13
  13513. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  13514. LEAQ -32(R10)(R14*1), R11
  13515. LEAQ -32(AX)(R14*1), R15
  13516. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
  13517. MOVOU (R11), X4
  13518. MOVOU 16(R11), X5
  13519. MOVOA X4, (R15)
  13520. MOVOA X5, 16(R15)
  13521. ADDQ $0x20, R15
  13522. ADDQ $0x20, R11
  13523. ADDQ $0x20, R14
  13524. DECQ R13
  13525. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
  13526. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
  13527. MOVOU -32(R10)(R14*1), X4
  13528. MOVOU -16(R10)(R14*1), X5
  13529. MOVOA X4, -32(AX)(R14*1)
  13530. MOVOA X5, -16(AX)(R14*1)
  13531. ADDQ $0x20, R14
  13532. CMPQ R9, R14
  13533. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  13534. MOVOU X0, (AX)
  13535. MOVOU X1, 16(AX)
  13536. MOVOU X2, -32(AX)(R9*1)
  13537. MOVOU X3, -16(AX)(R9*1)
  13538. MOVQ SI, AX
  13539. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
  13540. ADDL R12, CX
  13541. ADDL $0x04, R12
  13542. MOVL CX, 12(SP)
  13543. // emitCopy
  13544. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
  13545. CMPL R12, $0x40
  13546. JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
  13547. MOVB $0xee, (AX)
  13548. MOVW R8, 1(AX)
  13549. LEAL -60(R12), R12
  13550. ADDQ $0x03, AX
  13551. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
  13552. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
  13553. CMPL R12, $0x0c
  13554. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
  13555. CMPL R8, $0x00000800
  13556. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
  13557. MOVB $0x01, BL
  13558. LEAL -16(BX)(R12*4), R12
  13559. MOVB R8, 1(AX)
  13560. SHRL $0x08, R8
  13561. SHLL $0x05, R8
  13562. ORL R8, R12
  13563. MOVB R12, (AX)
  13564. ADDQ $0x02, AX
  13565. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
  13566. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
  13567. MOVB $0x02, BL
  13568. LEAL -4(BX)(R12*4), R12
  13569. MOVB R12, (AX)
  13570. MOVW R8, 1(AX)
  13571. ADDQ $0x03, AX
  13572. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
  13573. CMPL CX, 8(SP)
  13574. JGE emit_remainder_encodeSnappyBetterBlockAsm64K
  13575. CMPQ AX, (SP)
  13576. JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
  13577. MOVQ $0x00000000, ret+48(FP)
  13578. RET
  13579. match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
  13580. MOVQ $0x00cf1bbcdcbfa563, SI
  13581. MOVQ $0x9e3779b1, R8
  13582. INCL DI
  13583. MOVQ (DX)(DI*1), R9
  13584. MOVQ R9, R10
  13585. MOVQ R9, R11
  13586. MOVQ R9, R12
  13587. SHRQ $0x08, R11
  13588. MOVQ R11, R13
  13589. SHRQ $0x10, R12
  13590. LEAL 1(DI), R14
  13591. LEAL 2(DI), R15
  13592. MOVQ -2(DX)(CX*1), R9
  13593. SHLQ $0x08, R10
  13594. IMULQ SI, R10
  13595. SHRQ $0x30, R10
  13596. SHLQ $0x08, R13
  13597. IMULQ SI, R13
  13598. SHRQ $0x30, R13
  13599. SHLQ $0x20, R11
  13600. IMULQ R8, R11
  13601. SHRQ $0x32, R11
  13602. SHLQ $0x20, R12
  13603. IMULQ R8, R12
  13604. SHRQ $0x32, R12
  13605. MOVL DI, 24(SP)(R10*4)
  13606. MOVL R14, 24(SP)(R13*4)
  13607. MOVL R14, 262168(SP)(R11*4)
  13608. MOVL R15, 262168(SP)(R12*4)
  13609. MOVQ R9, R10
  13610. MOVQ R9, R11
  13611. SHRQ $0x08, R11
  13612. MOVQ R11, R13
  13613. LEAL -2(CX), R9
  13614. LEAL -1(CX), DI
  13615. SHLQ $0x08, R10
  13616. IMULQ SI, R10
  13617. SHRQ $0x30, R10
  13618. SHLQ $0x20, R11
  13619. IMULQ R8, R11
  13620. SHRQ $0x32, R11
  13621. SHLQ $0x08, R13
  13622. IMULQ SI, R13
  13623. SHRQ $0x30, R13
  13624. MOVL R9, 24(SP)(R10*4)
  13625. MOVL DI, 262168(SP)(R11*4)
  13626. MOVL DI, 24(SP)(R13*4)
  13627. JMP search_loop_encodeSnappyBetterBlockAsm64K
  13628. emit_remainder_encodeSnappyBetterBlockAsm64K:
  13629. MOVQ src_len+32(FP), CX
  13630. SUBL 12(SP), CX
  13631. LEAQ 3(AX)(CX*1), CX
  13632. CMPQ CX, (SP)
  13633. JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K
  13634. MOVQ $0x00000000, ret+48(FP)
  13635. RET
  13636. emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
  13637. MOVQ src_len+32(FP), CX
  13638. MOVL 12(SP), BX
  13639. CMPL BX, CX
  13640. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
  13641. MOVL CX, SI
  13642. MOVL CX, 12(SP)
  13643. LEAQ (DX)(BX*1), CX
  13644. SUBL BX, SI
  13645. LEAL -1(SI), DX
  13646. CMPL DX, $0x3c
  13647. JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
  13648. CMPL DX, $0x00000100
  13649. JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
  13650. MOVB $0xf4, (AX)
  13651. MOVW DX, 1(AX)
  13652. ADDQ $0x03, AX
  13653. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
  13654. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13655. MOVB $0xf0, (AX)
  13656. MOVB DL, 1(AX)
  13657. ADDQ $0x02, AX
  13658. CMPL DX, $0x40
  13659. JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
  13660. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
  13661. one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13662. SHLB $0x02, DL
  13663. MOVB DL, (AX)
  13664. ADDQ $0x01, AX
  13665. memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13666. LEAQ (AX)(SI*1), DX
  13667. MOVL SI, BX
  13668. // genMemMoveShort
  13669. CMPQ BX, $0x03
  13670. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
  13671. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
  13672. CMPQ BX, $0x08
  13673. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
  13674. CMPQ BX, $0x10
  13675. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
  13676. CMPQ BX, $0x20
  13677. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
  13678. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
  13679. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
  13680. MOVB (CX), SI
  13681. MOVB -1(CX)(BX*1), CL
  13682. MOVB SI, (AX)
  13683. MOVB CL, -1(AX)(BX*1)
  13684. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  13685. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
  13686. MOVW (CX), SI
  13687. MOVB 2(CX), CL
  13688. MOVW SI, (AX)
  13689. MOVB CL, 2(AX)
  13690. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  13691. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
  13692. MOVL (CX), SI
  13693. MOVL -4(CX)(BX*1), CX
  13694. MOVL SI, (AX)
  13695. MOVL CX, -4(AX)(BX*1)
  13696. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  13697. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
  13698. MOVQ (CX), SI
  13699. MOVQ -8(CX)(BX*1), CX
  13700. MOVQ SI, (AX)
  13701. MOVQ CX, -8(AX)(BX*1)
  13702. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  13703. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
  13704. MOVOU (CX), X0
  13705. MOVOU -16(CX)(BX*1), X1
  13706. MOVOU X0, (AX)
  13707. MOVOU X1, -16(AX)(BX*1)
  13708. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
  13709. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
  13710. MOVOU (CX), X0
  13711. MOVOU 16(CX), X1
  13712. MOVOU -32(CX)(BX*1), X2
  13713. MOVOU -16(CX)(BX*1), X3
  13714. MOVOU X0, (AX)
  13715. MOVOU X1, 16(AX)
  13716. MOVOU X2, -32(AX)(BX*1)
  13717. MOVOU X3, -16(AX)(BX*1)
  13718. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13719. MOVQ DX, AX
  13720. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
  13721. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13722. LEAQ (AX)(SI*1), DX
  13723. MOVL SI, BX
  13724. // genMemMoveLong
  13725. MOVOU (CX), X0
  13726. MOVOU 16(CX), X1
  13727. MOVOU -32(CX)(BX*1), X2
  13728. MOVOU -16(CX)(BX*1), X3
  13729. MOVQ BX, DI
  13730. SHRQ $0x05, DI
  13731. MOVQ AX, SI
  13732. ANDL $0x0000001f, SI
  13733. MOVQ $0x00000040, R8
  13734. SUBQ SI, R8
  13735. DECQ DI
  13736. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  13737. LEAQ -32(CX)(R8*1), SI
  13738. LEAQ -32(AX)(R8*1), R9
  13739. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
  13740. MOVOU (SI), X4
  13741. MOVOU 16(SI), X5
  13742. MOVOA X4, (R9)
  13743. MOVOA X5, 16(R9)
  13744. ADDQ $0x20, R9
  13745. ADDQ $0x20, SI
  13746. ADDQ $0x20, R8
  13747. DECQ DI
  13748. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
  13749. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
  13750. MOVOU -32(CX)(R8*1), X4
  13751. MOVOU -16(CX)(R8*1), X5
  13752. MOVOA X4, -32(AX)(R8*1)
  13753. MOVOA X5, -16(AX)(R8*1)
  13754. ADDQ $0x20, R8
  13755. CMPQ BX, R8
  13756. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
  13757. MOVOU X0, (AX)
  13758. MOVOU X1, 16(AX)
  13759. MOVOU X2, -32(AX)(BX*1)
  13760. MOVOU X3, -16(AX)(BX*1)
  13761. MOVQ DX, AX
  13762. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
  13763. MOVQ dst_base+0(FP), CX
  13764. SUBQ CX, AX
  13765. MOVQ AX, ret+48(FP)
  13766. RET
  13767. // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
  13768. // Requires: BMI, SSE2
         //
         // Encodes src into dst as a sequence of literal runs and copy
         // (back-reference) operations and returns the number of bytes written.
         // Returns 0 when the output pointer would reach the limit computed at entry.
         // NOTE(review): the tag bytes look like the Snappy block format, as the
         // name suggests. Confirm against the generator.
         //
         // Stack frame (81944 bytes):
         //   0(SP)      8 bytes  output limit pointer, reaching it aborts with return 0
         //   8(SP)      4 bytes  src_len - 8, upper bound for the search position
         //   12(SP)     4 bytes  src offset of the first byte not yet emitted
         //   16(SP)     4 bytes  offset of the latest match (written, never read in this function)
         //   20(SP)     4 bytes  next search position, used when the current one has no match
         //   24(SP)     16384 x 4 bytes, "long" table indexed by a 14-bit hash of 6 input bytes
         //   65560(SP)  4096 x 4 bytes, "short" table indexed by a 12-bit hash of 4 input bytes
         //
         // Long-lived registers: AX = dst write pointer, CX = current src offset,
         // DX = src base pointer (reused as scratch once emit_remainder is reached).
  13769. TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
  13770. MOVQ dst_base+0(FP), AX
         // Zero both tables: 0x280 (640) iterations of 128 bytes = 81920 bytes from 24(SP).
  13771. MOVQ $0x00000280, CX
  13772. LEAQ 24(SP), DX
  13773. PXOR X0, X0
  13774. zero_loop_encodeSnappyBetterBlockAsm12B:
  13775. MOVOU X0, (DX)
  13776. MOVOU X0, 16(DX)
  13777. MOVOU X0, 32(DX)
  13778. MOVOU X0, 48(DX)
  13779. MOVOU X0, 64(DX)
  13780. MOVOU X0, 80(DX)
  13781. MOVOU X0, 96(DX)
  13782. MOVOU X0, 112(DX)
  13783. ADDQ $0x80, DX
  13784. DECQ CX
  13785. JNZ zero_loop_encodeSnappyBetterBlockAsm12B
         // Nothing emitted yet: pending literals start at src offset 0.
  13786. MOVL $0x00000000, 12(SP)
         // 8(SP) = src_len - 8 (search bound).
         // (SP)  = dst_base + (src_len - 9 - src_len/32) (output limit).
  13787. MOVQ src_len+32(FP), CX
  13788. LEAQ -9(CX), DX
  13789. LEAQ -8(CX), SI
  13790. MOVL SI, 8(SP)
  13791. SHRQ $0x05, CX
  13792. SUBL CX, DX
  13793. LEAQ (AX)(DX*1), DX
  13794. MOVQ DX, (SP)
         // Start searching at src offset 1, with the stored match offset cleared.
  13795. MOVL $0x00000001, CX
  13796. MOVL $0x00000000, 16(SP)
  13797. MOVQ src_base+24(FP), DX
  13798. search_loop_encodeSnappyBetterBlockAsm12B:
         // next = CX + 1 + ((CX - 12(SP)) >> 6): the stride grows with the distance
         // from the last emitted byte. Leave the loop once next >= src_len - 8.
  13799. MOVL CX, SI
  13800. SUBL 12(SP), SI
  13801. SHRL $0x06, SI
  13802. LEAL 1(CX)(SI*1), SI
  13803. CMPL SI, 8(SP)
  13804. JGE emit_remainder_encodeSnappyBetterBlockAsm12B
         // DI = 8 bytes at src[CX]. Remember next in 20(SP).
  13805. MOVQ (DX)(CX*1), DI
  13806. MOVL SI, 20(SP)
         // R10 = 14-bit hash of the low 6 bytes (shift out the top 2 bytes, multiply, keep the top 14 bits).
         // R11 = 12-bit hash of the low 4 bytes (shift out the top 4 bytes, multiply, keep the top 12 bits).
  13807. MOVQ $0x0000cf1bbcdcbf9b, R9
  13808. MOVQ $0x9e3779b1, SI
  13809. MOVQ DI, R10
  13810. MOVQ DI, R11
  13811. SHLQ $0x10, R10
  13812. IMULQ R9, R10
  13813. SHRQ $0x32, R10
  13814. SHLQ $0x20, R11
  13815. IMULQ SI, R11
  13816. SHRQ $0x34, R11
         // Fetch the previous entries (SI from the long table, R8 from the short table)
         // and record the current position in both slots.
  13817. MOVL 24(SP)(R10*4), SI
  13818. MOVL 65560(SP)(R11*4), R8
  13819. MOVL CX, 24(SP)(R10*4)
  13820. MOVL CX, 65560(SP)(R11*4)
         // Accept a candidate when its first 4 bytes equal the low 4 bytes of DI.
         // The long-table candidate is tried first.
  13821. CMPL (DX)(SI*1), DI
  13822. JEQ candidate_match_encodeSnappyBetterBlockAsm12B
  13823. CMPL (DX)(R8*1), DI
  13824. JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
         // No match: continue at the precomputed next position.
  13825. MOVL 20(SP), CX
  13826. JMP search_loop_encodeSnappyBetterBlockAsm12B
  13827. candidateS_match_encodeSnappyBetterBlockAsm12B:
         // Only the short-table candidate matched. Before using it, probe the long
         // table for position CX+1: DI >> 8 holds the bytes starting at CX+1.
         // The slot is updated with CX+1 whether or not the probe succeeds.
  13828. SHRQ $0x08, DI
  13829. MOVQ DI, R10
  13830. SHLQ $0x10, R10
  13831. IMULQ R9, R10
  13832. SHRQ $0x32, R10
  13833. MOVL 24(SP)(R10*4), SI
  13834. INCL CX
  13835. MOVL CX, 24(SP)(R10*4)
  13836. CMPL (DX)(SI*1), DI
  13837. JEQ candidate_match_encodeSnappyBetterBlockAsm12B
         // Probe failed: go back to CX and use the short-table candidate.
  13838. DECL CX
  13839. MOVL R8, SI
  13840. candidate_match_encodeSnappyBetterBlockAsm12B:
         // Extend the match backwards while the preceding bytes agree. Stop when CX
         // reaches the pending-literal start (12(SP)) or the candidate offset reaches 0.
  13841. MOVL 12(SP), DI
  13842. TESTL SI, SI
  13843. JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
  13844. match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
  13845. CMPL CX, DI
  13846. JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B
  13847. MOVB -1(DX)(SI*1), BL
  13848. MOVB -1(DX)(CX*1), R8
  13849. CMPB BL, R8
  13850. JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
  13851. LEAL -1(CX), CX
  13852. DECL SI
  13853. JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
  13854. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
  13855. match_extend_back_end_encodeSnappyBetterBlockAsm12B:
         // Output space check: return 0 unless AX + 3 + (CX - 12(SP)) is below the limit in (SP).
  13856. MOVL CX, DI
  13857. SUBL 12(SP), DI
  13858. LEAQ 3(AX)(DI*1), DI
  13859. CMPQ DI, (SP)
  13860. JL match_dst_size_check_encodeSnappyBetterBlockAsm12B
  13861. MOVQ $0x00000000, ret+48(FP)
  13862. RET
  13863. match_dst_size_check_encodeSnappyBetterBlockAsm12B:
         // DI = start of the match. Step CX and SI past the 4 bytes that compared equal.
         // R8 = bytes left in src after them, R9 = &src[CX], R10 = &src[SI].
  13864. MOVL CX, DI
  13865. ADDL $0x04, CX
  13866. ADDL $0x04, SI
  13867. MOVQ src_len+32(FP), R8
  13868. SUBL CX, R8
  13869. LEAQ (DX)(CX*1), R9
  13870. LEAQ (DX)(SI*1), R10
  13871. // matchLen
         // R12 counts further matching bytes. While at least 8 bytes remain, compare
         // 8 at a time via XOR.
  13872. XORL R12, R12
  13873. CMPL R8, $0x08
  13874. JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
  13875. matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B:
  13876. MOVQ (R9)(R12*1), R11
  13877. XORQ (R10)(R12*1), R11
  13878. TESTQ R11, R11
  13879. JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B
         // A difference was found: the index of the lowest set bit divided by 8 is
         // the number of equal leading bytes. TZCNTQ is used only when GOAMD64_v3 is defined.
  13880. #ifdef GOAMD64_v3
  13881. TZCNTQ R11, R11
  13882. #else
  13883. BSFQ R11, R11
  13884. #endif
  13885. SARQ $0x03, R11
  13886. LEAL (R12)(R11*1), R12
  13887. JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
  13888. matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B:
         // All 8 bytes equal: consume them. LEAL is used so flags stay untouched.
  13889. LEAL -8(R8), R8
  13890. LEAL 8(R12), R12
  13891. CMPL R8, $0x08
  13892. JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B
         // NOTE(review): this JZ tests the flags of the CMPL above. When the JGE is
         // not taken R8 is below 8, so ZF is clear and this branch cannot be taken.
         // With R8 equal to 0 the tail checks below fall through to the same end label.
  13893. JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
  13894. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
         // Tail: try 4, then 2, then 1 byte, each only if that many bytes remain.
  13895. CMPL R8, $0x04
  13896. JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
  13897. MOVL (R9)(R12*1), R11
  13898. CMPL (R10)(R12*1), R11
  13899. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
  13900. SUBL $0x04, R8
  13901. LEAL 4(R12), R12
  13902. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
  13903. CMPL R8, $0x02
  13904. JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
  13905. MOVW (R9)(R12*1), R11
  13906. CMPW (R10)(R12*1), R11
  13907. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
  13908. SUBL $0x02, R8
  13909. LEAL 2(R12), R12
  13910. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
  13911. CMPL R8, $0x01
  13912. JL match_nolit_end_encodeSnappyBetterBlockAsm12B
  13913. MOVB (R9)(R12*1), R11
  13914. CMPB (R10)(R12*1), R11
  13915. JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
  13916. LEAL 1(R12), R12
  13917. match_nolit_end_encodeSnappyBetterBlockAsm12B:
         // R8 = CX - SI, the distance back to the candidate (the copy offset).
  13918. MOVL CX, R8
  13919. SUBL SI, R8
  13920. // Check if repeat
         // The offset is stored in 16(SP). No comparison with a previous offset
         // is made in this function.
  13921. MOVL R8, 16(SP)
         // Emit pending literals src[12(SP):DI] unless there are none.
  13922. MOVL 12(SP), SI
  13923. CMPL SI, DI
  13924. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
         // R9 = literal length, R10 = literal source, SI = length - 1 for the tag.
  13925. MOVL DI, R9
  13926. MOVL DI, 12(SP)
  13927. LEAQ (DX)(SI*1), R10
  13928. SUBL SI, R9
  13929. LEAL -1(R9), SI
         // Tag size depends on length-1: below 60 it fits in one byte, below 256
         // it takes tag 0xf0 plus one byte, otherwise tag 0xf4 plus two bytes.
  13930. CMPL SI, $0x3c
  13931. JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B
  13932. CMPL SI, $0x00000100
  13933. JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
  13934. MOVB $0xf4, (AX)
  13935. MOVW SI, 1(AX)
  13936. ADDQ $0x03, AX
  13937. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
  13938. two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
  13939. MOVB $0xf0, (AX)
  13940. MOVB SI, 1(AX)
  13941. ADDQ $0x02, AX
         // Lengths up to 64 use the short copy, longer ones the long copy.
  13942. CMPL SI, $0x40
  13943. JL memmove_match_emit_encodeSnappyBetterBlockAsm12B
  13944. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
  13945. one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
         // Single tag byte: (length - 1) << 2.
  13946. SHLB $0x02, SI
  13947. MOVB SI, (AX)
  13948. ADDQ $0x01, AX
  13949. memmove_match_emit_encodeSnappyBetterBlockAsm12B:
         // SI = dst pointer after the literals. AX is set to it once the copy is done.
  13950. LEAQ (AX)(R9*1), SI
  13951. // genMemMoveShort
         // Copy R9 (at most 64) bytes from R10 to AX, dispatching on size.
         // Ranges above 8 bytes use overlapping head and tail moves.
  13952. CMPQ R9, $0x08
  13953. JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
  13954. CMPQ R9, $0x10
  13955. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
  13956. CMPQ R9, $0x20
  13957. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
  13958. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
  13959. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
         // Lengths of 8 or less always move a full 8 bytes. Only R9 of them count,
         // because AX advances by R9.
         // NOTE(review): presumably the limits set at entry guarantee 8 readable
         // and writable bytes here. Confirm against the generator.
  13960. MOVQ (R10), R11
  13961. MOVQ R11, (AX)
  13962. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
  13963. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
  13964. MOVQ (R10), R11
  13965. MOVQ -8(R10)(R9*1), R10
  13966. MOVQ R11, (AX)
  13967. MOVQ R10, -8(AX)(R9*1)
  13968. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
  13969. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
  13970. MOVOU (R10), X0
  13971. MOVOU -16(R10)(R9*1), X1
  13972. MOVOU X0, (AX)
  13973. MOVOU X1, -16(AX)(R9*1)
  13974. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
  13975. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
  13976. MOVOU (R10), X0
  13977. MOVOU 16(R10), X1
  13978. MOVOU -32(R10)(R9*1), X2
  13979. MOVOU -16(R10)(R9*1), X3
  13980. MOVOU X0, (AX)
  13981. MOVOU X1, 16(AX)
  13982. MOVOU X2, -32(AX)(R9*1)
  13983. MOVOU X3, -16(AX)(R9*1)
  13984. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
  13985. MOVQ SI, AX
  13986. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
  13987. memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
  13988. LEAQ (AX)(R9*1), SI
  13989. // genMemMoveLong
         // Keep the first and last 32 source bytes in X0-X3. They are stored last
         // and cover the unaligned head and tail of the destination.
  13990. MOVOU (R10), X0
  13991. MOVOU 16(R10), X1
  13992. MOVOU -32(R10)(R9*1), X2
  13993. MOVOU -16(R10)(R9*1), X3
         // R13 = length / 32. R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte
         // aligned and the MOVOA stores below hit aligned addresses.
  13994. MOVQ R9, R13
  13995. SHRQ $0x05, R13
  13996. MOVQ AX, R11
  13997. ANDL $0x0000001f, R11
  13998. MOVQ $0x00000040, R14
  13999. SUBQ R11, R14
         // DECQ leaves CF as set by the SUBQ above (clear, the subtrahend is at
         // most 31), so JA is taken whenever R13 - 1 is non-zero. Both jumps to this
         // label happen with length - 1 >= 0x40, so R13 >= 2 and the branch is taken.
  14000. DECQ R13
  14001. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
  14002. LEAQ -32(R10)(R14*1), R11
  14003. LEAQ -32(AX)(R14*1), R15
  14004. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
  14005. MOVOU (R11), X4
  14006. MOVOU 16(R11), X5
  14007. MOVOA X4, (R15)
  14008. MOVOA X5, 16(R15)
  14009. ADDQ $0x20, R15
  14010. ADDQ $0x20, R11
  14011. ADDQ $0x20, R14
  14012. DECQ R13
  14013. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
  14014. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
         // Copy 32 bytes per iteration at offset R14 - 32 while R9 >= R14 (unsigned).
  14015. MOVOU -32(R10)(R14*1), X4
  14016. MOVOU -16(R10)(R14*1), X5
  14017. MOVOA X4, -32(AX)(R14*1)
  14018. MOVOA X5, -16(AX)(R14*1)
  14019. ADDQ $0x20, R14
  14020. CMPQ R9, R14
  14021. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
         // Write the saved head and tail, then advance AX past the literals.
  14022. MOVOU X0, (AX)
  14023. MOVOU X1, 16(AX)
  14024. MOVOU X2, -32(AX)(R9*1)
  14025. MOVOU X3, -16(AX)(R9*1)
  14026. MOVQ SI, AX
  14027. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
         // CX = end of the match, R12 = full match length including the 4 bytes
         // skipped earlier. The match end becomes the new pending-literal start.
  14028. ADDL R12, CX
  14029. ADDL $0x04, R12
  14030. MOVL CX, 12(SP)
  14031. // emitCopy
         // While more than 64 bytes remain, emit a 3-byte copy (tag 0xee plus the
         // 16-bit offset) and reduce the remaining length by 60.
  14032. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
  14033. CMPL R12, $0x40
  14034. JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
  14035. MOVB $0xee, (AX)
  14036. MOVW R8, 1(AX)
  14037. LEAL -60(R12), R12
  14038. ADDQ $0x03, AX
  14039. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
  14040. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
         // The 2-byte form is used only when length < 12 and offset < 2048.
  14041. CMPL R12, $0x0c
  14042. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
  14043. CMPL R8, $0x00000800
  14044. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
         // Tag = 1 + 4*(length - 4), with offset bits 8 and up placed from bit 5.
         // The second byte is the low 8 bits of the offset. Only BL is set here,
         // which is enough because only the low byte of R12 is stored.
  14045. MOVB $0x01, BL
  14046. LEAL -16(BX)(R12*4), R12
  14047. MOVB R8, 1(AX)
  14048. SHRL $0x08, R8
  14049. SHLL $0x05, R8
  14050. ORL R8, R12
  14051. MOVB R12, (AX)
  14052. ADDQ $0x02, AX
  14053. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
  14054. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
         // 3-byte form: tag = 2 + 4*(length - 1), followed by the 16-bit offset.
  14055. MOVB $0x02, BL
  14056. LEAL -4(BX)(R12*4), R12
  14057. MOVB R12, (AX)
  14058. MOVW R8, 1(AX)
  14059. ADDQ $0x03, AX
  14060. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
         // Stop searching once CX >= src_len - 8. Return 0 if AX reached the output limit.
  14061. CMPL CX, 8(SP)
  14062. JGE emit_remainder_encodeSnappyBetterBlockAsm12B
  14063. CMPQ AX, (SP)
  14064. JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
  14065. MOVQ $0x00000000, ret+48(FP)
  14066. RET
  14067. match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
         // Index positions near both ends of the match before resuming the search.
         // With s = match start (DI before the INCL):
         //   long table:  s+1, s+2, CX-2, CX-1
         //   short table: s+2, s+3, CX-1
  14068. MOVQ $0x0000cf1bbcdcbf9b, SI
  14069. MOVQ $0x9e3779b1, R8
  14070. INCL DI
  14071. MOVQ (DX)(DI*1), R9
  14072. MOVQ R9, R10
  14073. MOVQ R9, R11
  14074. MOVQ R9, R12
  14075. SHRQ $0x08, R11
  14076. MOVQ R11, R13
  14077. SHRQ $0x10, R12
  14078. LEAL 1(DI), R14
  14079. LEAL 2(DI), R15
  14080. MOVQ -2(DX)(CX*1), R9
  14081. SHLQ $0x10, R10
  14082. IMULQ SI, R10
  14083. SHRQ $0x32, R10
  14084. SHLQ $0x10, R13
  14085. IMULQ SI, R13
  14086. SHRQ $0x32, R13
  14087. SHLQ $0x20, R11
  14088. IMULQ R8, R11
  14089. SHRQ $0x34, R11
  14090. SHLQ $0x20, R12
  14091. IMULQ R8, R12
  14092. SHRQ $0x34, R12
  14093. MOVL DI, 24(SP)(R10*4)
  14094. MOVL R14, 24(SP)(R13*4)
  14095. MOVL R14, 65560(SP)(R11*4)
  14096. MOVL R15, 65560(SP)(R12*4)
         // Same again for the 8 bytes loaded from src[CX-2] (now in R9).
  14097. MOVQ R9, R10
  14098. MOVQ R9, R11
  14099. SHRQ $0x08, R11
  14100. MOVQ R11, R13
  14101. LEAL -2(CX), R9
  14102. LEAL -1(CX), DI
  14103. SHLQ $0x10, R10
  14104. IMULQ SI, R10
  14105. SHRQ $0x32, R10
  14106. SHLQ $0x20, R11
  14107. IMULQ R8, R11
  14108. SHRQ $0x34, R11
  14109. SHLQ $0x10, R13
  14110. IMULQ SI, R13
  14111. SHRQ $0x32, R13
  14112. MOVL R9, 24(SP)(R10*4)
  14113. MOVL DI, 65560(SP)(R11*4)
  14114. MOVL DI, 24(SP)(R13*4)
  14115. JMP search_loop_encodeSnappyBetterBlockAsm12B
  14116. emit_remainder_encodeSnappyBetterBlockAsm12B:
         // Everything from 12(SP) to the end of src is emitted as literals.
         // Return 0 unless AX + 3 + remaining is below the output limit.
  14117. MOVQ src_len+32(FP), CX
  14118. SUBL 12(SP), CX
  14119. LEAQ 3(AX)(CX*1), CX
  14120. CMPQ CX, (SP)
  14121. JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B
  14122. MOVQ $0x00000000, ret+48(FP)
  14123. RET
  14124. emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
         // Skip the emit when nothing is pending. Otherwise CX = literal source,
         // SI = length, DX = length - 1. The src base in DX is overwritten here.
  14125. MOVQ src_len+32(FP), CX
  14126. MOVL 12(SP), BX
  14127. CMPL BX, CX
  14128. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
  14129. MOVL CX, SI
  14130. MOVL CX, 12(SP)
  14131. LEAQ (DX)(BX*1), CX
  14132. SUBL BX, SI
  14133. LEAL -1(SI), DX
         // Same literal tag selection as in the match path, keyed on length - 1.
  14134. CMPL DX, $0x3c
  14135. JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
  14136. CMPL DX, $0x00000100
  14137. JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
  14138. MOVB $0xf4, (AX)
  14139. MOVW DX, 1(AX)
  14140. ADDQ $0x03, AX
  14141. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
  14142. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14143. MOVB $0xf0, (AX)
  14144. MOVB DL, 1(AX)
  14145. ADDQ $0x02, AX
  14146. CMPL DX, $0x40
  14147. JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
  14148. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
  14149. one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14150. SHLB $0x02, DL
  14151. MOVB DL, (AX)
  14152. ADDQ $0x01, AX
  14153. memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
         // DX = dst pointer after the literals, BX = length.
  14154. LEAQ (AX)(SI*1), DX
  14155. MOVL SI, BX
  14156. // genMemMoveShort
         // Unlike the match path, every size class is an exact-length copy with
         // separate cases for 1-2, 3 and 4-7 bytes. No case writes past the literal length.
  14157. CMPQ BX, $0x03
  14158. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
  14159. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
  14160. CMPQ BX, $0x08
  14161. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
  14162. CMPQ BX, $0x10
  14163. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
  14164. CMPQ BX, $0x20
  14165. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
  14166. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
  14167. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
         // First and last byte. For length 1 both moves touch the same byte.
  14168. MOVB (CX), SI
  14169. MOVB -1(CX)(BX*1), CL
  14170. MOVB SI, (AX)
  14171. MOVB CL, -1(AX)(BX*1)
  14172. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
  14173. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
  14174. MOVW (CX), SI
  14175. MOVB 2(CX), CL
  14176. MOVW SI, (AX)
  14177. MOVB CL, 2(AX)
  14178. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
  14179. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
  14180. MOVL (CX), SI
  14181. MOVL -4(CX)(BX*1), CX
  14182. MOVL SI, (AX)
  14183. MOVL CX, -4(AX)(BX*1)
  14184. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
  14185. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
  14186. MOVQ (CX), SI
  14187. MOVQ -8(CX)(BX*1), CX
  14188. MOVQ SI, (AX)
  14189. MOVQ CX, -8(AX)(BX*1)
  14190. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
  14191. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
  14192. MOVOU (CX), X0
  14193. MOVOU -16(CX)(BX*1), X1
  14194. MOVOU X0, (AX)
  14195. MOVOU X1, -16(AX)(BX*1)
  14196. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
  14197. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
  14198. MOVOU (CX), X0
  14199. MOVOU 16(CX), X1
  14200. MOVOU -32(CX)(BX*1), X2
  14201. MOVOU -16(CX)(BX*1), X3
  14202. MOVOU X0, (AX)
  14203. MOVOU X1, 16(AX)
  14204. MOVOU X2, -32(AX)(BX*1)
  14205. MOVOU X3, -16(AX)(BX*1)
  14206. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14207. MOVQ DX, AX
  14208. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
  14209. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
  14210. LEAQ (AX)(SI*1), DX
  14211. MOVL SI, BX
  14212. // genMemMoveLong
         // Same scheme as the long copy in the match path: save the head and tail in
         // X0-X3, copy 32-byte chunks with aligned stores, then write head and tail.
         // Here BX = length, DI = chunk count, R8 = running offset.
  14213. MOVOU (CX), X0
  14214. MOVOU 16(CX), X1
  14215. MOVOU -32(CX)(BX*1), X2
  14216. MOVOU -16(CX)(BX*1), X3
  14217. MOVQ BX, DI
  14218. SHRQ $0x05, DI
  14219. MOVQ AX, SI
  14220. ANDL $0x0000001f, SI
  14221. MOVQ $0x00000040, R8
  14222. SUBQ SI, R8
  14223. DECQ DI
  14224. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
  14225. LEAQ -32(CX)(R8*1), SI
  14226. LEAQ -32(AX)(R8*1), R9
  14227. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
  14228. MOVOU (SI), X4
  14229. MOVOU 16(SI), X5
  14230. MOVOA X4, (R9)
  14231. MOVOA X5, 16(R9)
  14232. ADDQ $0x20, R9
  14233. ADDQ $0x20, SI
  14234. ADDQ $0x20, R8
  14235. DECQ DI
  14236. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
  14237. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
  14238. MOVOU -32(CX)(R8*1), X4
  14239. MOVOU -16(CX)(R8*1), X5
  14240. MOVOA X4, -32(AX)(R8*1)
  14241. MOVOA X5, -16(AX)(R8*1)
  14242. ADDQ $0x20, R8
  14243. CMPQ BX, R8
  14244. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
  14245. MOVOU X0, (AX)
  14246. MOVOU X1, 16(AX)
  14247. MOVOU X2, -32(AX)(BX*1)
  14248. MOVOU X3, -16(AX)(BX*1)
  14249. MOVQ DX, AX
  14250. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
         // Return the number of bytes written: AX - dst_base.
  14251. MOVQ dst_base+0(FP), CX
  14252. SUBQ CX, AX
  14253. MOVQ AX, ret+48(FP)
  14254. RET
  14255. // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
  14256. // Requires: BMI, SSE2
  //
  // Encodes src into dst and returns the number of bytes written to dst.
  // Returns 0 when the output would reach the destination limit computed at
  // entry (dst_base + src_len - 9 - src_len/32).
  //
  // Stack frame layout (offsets from SP), as used by the code below:
  //   0(SP)      dst limit pointer (8 bytes)
  //   8(SP)      src_len - 8; the search stops once the next position reaches it
  //   12(SP)     src offset of the first literal byte that has not been emitted
  //   16(SP)     offset of the most recent match (written, never read in this function)
  //   20(SP)     next search position
  //   24(SP)     "long" table: 4096 x 4-byte src offsets, indexed by a 12-bit hash of 6 bytes
  //   16408(SP)  "short" table: 1024 x 4-byte src offsets, indexed by a 10-bit hash of 4 bytes
  //
  // Main-loop register roles: AX = dst write pointer, CX = current src offset,
  // DX = src base pointer. DX is reused as scratch once emit_remainder starts.
  14257. TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
  14258. MOVQ dst_base+0(FP), AX // AX = dst write pointer
  14259. MOVQ $0x000000a0, CX // 160 iterations x 128 bytes = 20480 bytes, i.e. both hash tables
  14260. LEAQ 24(SP), DX // DX = start of the hash tables
  14261. PXOR X0, X0 // X0 = 16 zero bytes
  14262. zero_loop_encodeSnappyBetterBlockAsm10B: // clear the tables 128 bytes per iteration
  14263. MOVOU X0, (DX)
  14264. MOVOU X0, 16(DX)
  14265. MOVOU X0, 32(DX)
  14266. MOVOU X0, 48(DX)
  14267. MOVOU X0, 64(DX)
  14268. MOVOU X0, 80(DX)
  14269. MOVOU X0, 96(DX)
  14270. MOVOU X0, 112(DX)
  14271. ADDQ $0x80, DX
  14272. DECQ CX
  14273. JNZ zero_loop_encodeSnappyBetterBlockAsm10B
  14274. MOVL $0x00000000, 12(SP) // nothing emitted yet: pending literals start at offset 0
  14275. MOVQ src_len+32(FP), CX
  14276. LEAQ -9(CX), DX
  14277. LEAQ -8(CX), SI
  14278. MOVL SI, 8(SP) // search limit = src_len - 8
  14279. SHRQ $0x05, CX
  14280. SUBL CX, DX // DX = src_len - 9 - src_len/32
  14281. LEAQ (AX)(DX*1), DX
  14282. MOVQ DX, (SP) // dst limit pointer = dst_base + DX
  14283. MOVL $0x00000001, CX // searching starts at src offset 1
  14284. MOVL $0x00000000, 16(SP) // last match offset = 0
  14285. MOVQ src_base+24(FP), DX // DX = src base pointer
  14286. search_loop_encodeSnappyBetterBlockAsm10B: // probe both tables at the current position CX
  14287. MOVL CX, SI
  14288. SUBL 12(SP), SI
  14289. SHRL $0x05, SI // extra step of 1 per 32 bytes scanned since the last emit
  14290. LEAL 1(CX)(SI*1), SI // SI = next search position
  14291. CMPL SI, 8(SP)
  14292. JGE emit_remainder_encodeSnappyBetterBlockAsm10B // next position reaches the search limit: flush the tail
  14293. MOVQ (DX)(CX*1), DI // DI = 8 source bytes at the current position
  14294. MOVL SI, 20(SP) // remember the next search position
  14295. MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
  14296. MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
  14297. MOVQ DI, R10
  14298. MOVQ DI, R11
  14299. SHLQ $0x10, R10 // drop the top 2 bytes so only the low 6 bytes are hashed
  14300. IMULQ R9, R10
  14301. SHRQ $0x34, R10 // top 12 bits = long-table index
  14302. SHLQ $0x20, R11 // keep only the low 4 bytes
  14303. IMULQ SI, R11
  14304. SHRQ $0x36, R11 // top 10 bits = short-table index
  14305. MOVL 24(SP)(R10*4), SI // SI = candidate from the long table
  14306. MOVL 16408(SP)(R11*4), R8 // R8 = candidate from the short table
  14307. MOVL CX, 24(SP)(R10*4) // record the current position in both tables
  14308. MOVL CX, 16408(SP)(R11*4)
  14309. CMPL (DX)(SI*1), DI // do 4 bytes at the long candidate match the current 4 bytes?
  14310. JEQ candidate_match_encodeSnappyBetterBlockAsm10B
  14311. CMPL (DX)(R8*1), DI // otherwise try the short candidate
  14312. JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
  14313. MOVL 20(SP), CX // no match: advance to the next search position
  14314. JMP search_loop_encodeSnappyBetterBlockAsm10B
  14315. candidateS_match_encodeSnappyBetterBlockAsm10B: // short candidate matched; first try the long table one byte ahead
  14316. SHRQ $0x08, DI // DI = source bytes starting at CX+1
  14317. MOVQ DI, R10
  14318. SHLQ $0x10, R10
  14319. IMULQ R9, R10
  14320. SHRQ $0x34, R10 // long-table index for position CX+1
  14321. MOVL 24(SP)(R10*4), SI
  14322. INCL CX
  14323. MOVL CX, 24(SP)(R10*4)
  14324. CMPL (DX)(SI*1), DI
  14325. JEQ candidate_match_encodeSnappyBetterBlockAsm10B // long match at CX+1: use it
  14326. DECL CX // otherwise go back to the original position
  14327. MOVL R8, SI // and use the short candidate
  14328. candidate_match_encodeSnappyBetterBlockAsm10B: // here CX = match position, SI = candidate position
  14329. MOVL 12(SP), DI // DI = start of the pending literals
  14330. TESTL SI, SI
  14331. JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B // candidate at offset 0 cannot be extended backwards
  14332. match_extend_back_loop_encodeSnappyBetterBlockAsm10B: // extend the match backwards while the preceding bytes agree
  14333. CMPL CX, DI
  14334. JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B // do not back up into already-emitted input
  14335. MOVB -1(DX)(SI*1), BL
  14336. MOVB -1(DX)(CX*1), R8
  14337. CMPB BL, R8
  14338. JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
  14339. LEAL -1(CX), CX
  14340. DECL SI
  14341. JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B // candidate reached offset 0
  14342. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
  14343. match_extend_back_end_encodeSnappyBetterBlockAsm10B: // check that the pending literals (+3 header bytes) stay below the dst limit
  14344. MOVL CX, DI
  14345. SUBL 12(SP), DI // DI = number of pending literal bytes
  14346. LEAQ 3(AX)(DI*1), DI
  14347. CMPQ DI, (SP)
  14348. JL match_dst_size_check_encodeSnappyBetterBlockAsm10B
  14349. MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
  14350. RET
  14351. match_dst_size_check_encodeSnappyBetterBlockAsm10B:
  14352. MOVL CX, DI // DI = match start (end of the pending literals)
  14353. ADDL $0x04, CX // skip the 4 bytes already known to match
  14354. ADDL $0x04, SI
  14355. MOVQ src_len+32(FP), R8
  14356. SUBL CX, R8 // R8 = source bytes remaining after CX
  14357. LEAQ (DX)(CX*1), R9 // R9 = pointer into the current data
  14358. LEAQ (DX)(SI*1), R10 // R10 = pointer into the candidate data
  14359. // matchLen: R12 = number of equal bytes at R9 and R10, at most R8
  14360. XORL R12, R12
  14361. CMPL R8, $0x08
  14362. JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
  14363. matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: // compare 8 bytes at a time
  14364. MOVQ (R9)(R12*1), R11
  14365. XORQ (R10)(R12*1), R11 // non-zero bits mark differing bytes
  14366. TESTQ R11, R11
  14367. JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B
  14368. #ifdef GOAMD64_v3
  14369. TZCNTQ R11, R11
  14370. #else
  14371. BSFQ R11, R11
  14372. #endif
  14373. SARQ $0x03, R11 // index of lowest differing bit / 8 = count of equal leading bytes
  14374. LEAL (R12)(R11*1), R12
  14375. JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
  14376. matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: // all 8 bytes equal: advance (LEAL leaves flags untouched)
  14377. LEAL -8(R8), R8
  14378. LEAL 8(R12), R12
  14379. CMPL R8, $0x08
  14380. JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B
  14381. JZ match_nolit_end_encodeSnappyBetterBlockAsm10B // NOTE(review): flags come from CMPL R8,$8, so ZF means R8 == 8, which the JGE above already took; this branch looks unreachable
  14382. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: // fewer than 8 bytes remain: try 4, then 2, then 1
  14383. CMPL R8, $0x04
  14384. JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
  14385. MOVL (R9)(R12*1), R11
  14386. CMPL (R10)(R12*1), R11
  14387. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
  14388. SUBL $0x04, R8
  14389. LEAL 4(R12), R12
  14390. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
  14391. CMPL R8, $0x02
  14392. JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
  14393. MOVW (R9)(R12*1), R11
  14394. CMPW (R10)(R12*1), R11
  14395. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
  14396. SUBL $0x02, R8
  14397. LEAL 2(R12), R12
  14398. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
  14399. CMPL R8, $0x01
  14400. JL match_nolit_end_encodeSnappyBetterBlockAsm10B
  14401. MOVB (R9)(R12*1), R11
  14402. CMPB (R10)(R12*1), R11
  14403. JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
  14404. LEAL 1(R12), R12
  14405. match_nolit_end_encodeSnappyBetterBlockAsm10B: // R12 = match length beyond the first 4 bytes
  14406. MOVL CX, R8
  14407. SUBL SI, R8 // R8 = match offset (current position - candidate position)
  14408. // Check if repeat (the offset is only stored here; nothing in this function reads it back)
  14409. MOVL R8, 16(SP)
  14410. MOVL 12(SP), SI // SI = start of the pending literals
  14411. CMPL SI, DI
  14412. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B // no pending literals before the match
  14413. MOVL DI, R9
  14414. MOVL DI, 12(SP)
  14415. LEAQ (DX)(SI*1), R10 // R10 = pointer to the pending literals
  14416. SUBL SI, R9 // R9 = literal length
  14417. LEAL -1(R9), SI // SI = length - 1, the value stored in the literal header
  14418. CMPL SI, $0x3c
  14419. JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B // length-1 < 60: fits in the tag byte itself
  14420. CMPL SI, $0x00000100
  14421. JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B // length-1 < 256: tag plus one length byte
  14422. MOVB $0xf4, (AX) // tag 0xf4 (61<<2) followed by a 16-bit length-1
  14423. MOVW SI, 1(AX)
  14424. ADDQ $0x03, AX
  14425. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
  14426. two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
  14427. MOVB $0xf0, (AX) // tag 0xf0 (60<<2) followed by an 8-bit length-1
  14428. MOVB SI, 1(AX)
  14429. ADDQ $0x02, AX
  14430. CMPL SI, $0x40
  14431. JL memmove_match_emit_encodeSnappyBetterBlockAsm10B // at most 64 bytes: short copy
  14432. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
  14433. one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
  14434. SHLB $0x02, SI // tag byte = (length-1) << 2
  14435. MOVB SI, (AX)
  14436. ADDQ $0x01, AX
  14437. memmove_match_emit_encodeSnappyBetterBlockAsm10B:
  14438. LEAQ (AX)(R9*1), SI // SI = dst pointer after the literals
  14439. // genMemMoveShort: copy R9 (at most 64) bytes from R10 to AX using overlapping head/tail moves
  14440. CMPQ R9, $0x08
  14441. JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
  14442. CMPQ R9, $0x10
  14443. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
  14444. CMPQ R9, $0x20
  14445. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
  14446. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
  14447. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: // always moves a full 8 bytes, even when R9 < 8
  14448. MOVQ (R10), R11
  14449. MOVQ R11, (AX)
  14450. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  14451. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
  14452. MOVQ (R10), R11
  14453. MOVQ -8(R10)(R9*1), R10
  14454. MOVQ R11, (AX)
  14455. MOVQ R10, -8(AX)(R9*1)
  14456. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  14457. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
  14458. MOVOU (R10), X0
  14459. MOVOU -16(R10)(R9*1), X1
  14460. MOVOU X0, (AX)
  14461. MOVOU X1, -16(AX)(R9*1)
  14462. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
  14463. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
  14464. MOVOU (R10), X0
  14465. MOVOU 16(R10), X1
  14466. MOVOU -32(R10)(R9*1), X2
  14467. MOVOU -16(R10)(R9*1), X3
  14468. MOVOU X0, (AX)
  14469. MOVOU X1, 16(AX)
  14470. MOVOU X2, -32(AX)(R9*1)
  14471. MOVOU X3, -16(AX)(R9*1)
  14472. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
  14473. MOVQ SI, AX // advance the dst pointer past the literals
  14474. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
  14475. memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
  14476. LEAQ (AX)(R9*1), SI // SI = dst pointer after the literals
  14477. // genMemMoveLong: copy R9 bytes from R10 to AX; the first and last 32 bytes are loaded up front and stored last
  14478. MOVOU (R10), X0
  14479. MOVOU 16(R10), X1
  14480. MOVOU -32(R10)(R9*1), X2
  14481. MOVOU -16(R10)(R9*1), X3
  14482. MOVQ R9, R13
  14483. SHRQ $0x05, R13 // R13 = length / 32
  14484. MOVQ AX, R11
  14485. ANDL $0x0000001f, R11
  14486. MOVQ $0x00000040, R14
  14487. SUBQ R11, R14 // R14 = 64 - (AX & 31), so AX + R14 - 32 is 32-byte aligned for the MOVOA stores
  14488. DECQ R13
  14489. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 // DECQ leaves CF as cleared by the SUBQ, so this branches whenever length/32 - 1 != 0
  14490. LEAQ -32(R10)(R14*1), R11
  14491. LEAQ -32(AX)(R14*1), R15
  14492. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
  14493. MOVOU (R11), X4
  14494. MOVOU 16(R11), X5
  14495. MOVOA X4, (R15)
  14496. MOVOA X5, 16(R15)
  14497. ADDQ $0x20, R15
  14498. ADDQ $0x20, R11
  14499. ADDQ $0x20, R14
  14500. DECQ R13
  14501. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
  14502. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per iteration, aligned stores
  14503. MOVOU -32(R10)(R14*1), X4
  14504. MOVOU -16(R10)(R14*1), X5
  14505. MOVOA X4, -32(AX)(R14*1)
  14506. MOVOA X5, -16(AX)(R14*1)
  14507. ADDQ $0x20, R14
  14508. CMPQ R9, R14
  14509. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  14510. MOVOU X0, (AX) // finally store the saved head and tail (unaligned)
  14511. MOVOU X1, 16(AX)
  14512. MOVOU X2, -32(AX)(R9*1)
  14513. MOVOU X3, -16(AX)(R9*1)
  14514. MOVQ SI, AX
  14515. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
  14516. ADDL R12, CX // CX = src offset just past the match
  14517. ADDL $0x04, R12 // R12 = full match length, including the first 4 bytes
  14518. MOVL CX, 12(SP) // pending literals now start after the match
  14519. // emitCopy: encode a copy of R12 bytes at offset R8
  14520. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: // while more than 64 bytes remain, emit 60-byte copies
  14521. CMPL R12, $0x40
  14522. JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
  14523. MOVB $0xee, (AX) // tag 0xee = (60-1)<<2 | 2, followed by the 16-bit offset
  14524. MOVW R8, 1(AX)
  14525. LEAL -60(R12), R12
  14526. ADDQ $0x03, AX
  14527. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
  14528. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: // remaining length <= 64
  14529. CMPL R12, $0x0c
  14530. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B // length >= 12 needs the 3-byte form
  14531. CMPL R8, $0x00000800
  14532. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B // offset >= 2048 needs the 3-byte form
  14533. MOVB $0x01, BL
  14534. LEAL -16(BX)(R12*4), R12 // low byte = 1 + (length-4)<<2; only the low byte is stored, so stale upper bits of BX are harmless
  14535. MOVB R8, 1(AX) // low 8 bits of the offset
  14536. SHRL $0x08, R8
  14537. SHLL $0x05, R8
  14538. ORL R8, R12 // offset bits 8 and up go into the tag from bit 5
  14539. MOVB R12, (AX)
  14540. ADDQ $0x02, AX
  14541. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
  14542. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
  14543. MOVB $0x02, BL
  14544. LEAL -4(BX)(R12*4), R12 // low byte = 2 + (length-1)<<2
  14545. MOVB R12, (AX)
  14546. MOVW R8, 1(AX) // 16-bit offset
  14547. ADDQ $0x03, AX
  14548. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
  14549. CMPL CX, 8(SP)
  14550. JGE emit_remainder_encodeSnappyBetterBlockAsm10B // reached the search limit
  14551. CMPQ AX, (SP)
  14552. JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
  14553. MOVQ $0x00000000, ret+48(FP) // reached the dst limit: return 0
  14554. RET
  14555. match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: // add positions near the start and end of the match to the tables
  14556. MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
  14557. MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
  14558. INCL DI // DI = match start + 1
  14559. MOVQ (DX)(DI*1), R9 // 8 bytes at match start + 1
  14560. MOVQ R9, R10
  14561. MOVQ R9, R11
  14562. MOVQ R9, R12
  14563. SHRQ $0x08, R11 // bytes from match start + 2
  14564. MOVQ R11, R13
  14565. SHRQ $0x10, R12 // bytes from match start + 3
  14566. LEAL 1(DI), R14 // R14 = match start + 2
  14567. LEAL 2(DI), R15 // R15 = match start + 3
  14568. MOVQ -2(DX)(CX*1), R9 // 8 bytes at CX - 2, hashed in the second half below
  14569. SHLQ $0x10, R10
  14570. IMULQ SI, R10
  14571. SHRQ $0x34, R10
  14572. SHLQ $0x10, R13
  14573. IMULQ SI, R13
  14574. SHRQ $0x34, R13
  14575. SHLQ $0x20, R11
  14576. IMULQ R8, R11
  14577. SHRQ $0x36, R11
  14578. SHLQ $0x20, R12
  14579. IMULQ R8, R12
  14580. SHRQ $0x36, R12
  14581. MOVL DI, 24(SP)(R10*4) // long table: match start + 1
  14582. MOVL R14, 24(SP)(R13*4) // long table: match start + 2
  14583. MOVL R14, 16408(SP)(R11*4) // short table: match start + 2
  14584. MOVL R15, 16408(SP)(R12*4) // short table: match start + 3
  14585. MOVQ R9, R10
  14586. MOVQ R9, R11
  14587. SHRQ $0x08, R11 // bytes from CX - 1
  14588. MOVQ R11, R13
  14589. LEAL -2(CX), R9 // R9 = CX - 2
  14590. LEAL -1(CX), DI // DI = CX - 1
  14591. SHLQ $0x10, R10
  14592. IMULQ SI, R10
  14593. SHRQ $0x34, R10
  14594. SHLQ $0x20, R11
  14595. IMULQ R8, R11
  14596. SHRQ $0x36, R11
  14597. SHLQ $0x10, R13
  14598. IMULQ SI, R13
  14599. SHRQ $0x34, R13
  14600. MOVL R9, 24(SP)(R10*4) // long table: CX - 2
  14601. MOVL DI, 16408(SP)(R11*4) // short table: CX - 1
  14602. MOVL DI, 24(SP)(R13*4) // long table: CX - 1
  14603. JMP search_loop_encodeSnappyBetterBlockAsm10B
  14604. emit_remainder_encodeSnappyBetterBlockAsm10B: // emit everything from 12(SP) to the end of src as one literal
  14605. MOVQ src_len+32(FP), CX
  14606. SUBL 12(SP), CX // CX = number of remaining literal bytes
  14607. LEAQ 3(AX)(CX*1), CX
  14608. CMPQ CX, (SP)
  14609. JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B
  14610. MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
  14611. RET
  14612. emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
  14613. MOVQ src_len+32(FP), CX
  14614. MOVL 12(SP), BX
  14615. CMPL BX, CX
  14616. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B // nothing left to emit
  14617. MOVL CX, SI
  14618. MOVL CX, 12(SP)
  14619. LEAQ (DX)(BX*1), CX // CX = pointer to the remaining literals
  14620. SUBL BX, SI // SI = literal length
  14621. LEAL -1(SI), DX // DX = length - 1; the src base pointer is no longer needed
  14622. CMPL DX, $0x3c
  14623. JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B // length-1 < 60: fits in the tag byte itself
  14624. CMPL DX, $0x00000100
  14625. JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B // length-1 < 256: tag plus one length byte
  14626. MOVB $0xf4, (AX) // tag 0xf4 (61<<2) followed by a 16-bit length-1
  14627. MOVW DX, 1(AX)
  14628. ADDQ $0x03, AX
  14629. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
  14630. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
  14631. MOVB $0xf0, (AX) // tag 0xf0 (60<<2) followed by an 8-bit length-1
  14632. MOVB DL, 1(AX)
  14633. ADDQ $0x02, AX
  14634. CMPL DX, $0x40
  14635. JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B // at most 64 bytes: short copy
  14636. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
  14637. one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
  14638. SHLB $0x02, DL // tag byte = (length-1) << 2
  14639. MOVB DL, (AX)
  14640. ADDQ $0x01, AX
  14641. memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
  14642. LEAQ (AX)(SI*1), DX // DX = dst pointer after the literals
  14643. MOVL SI, BX // BX = literal length
  14644. // genMemMoveShort: copy exactly BX (at most 64) bytes from CX to AX, dispatching on size
  14645. CMPQ BX, $0x03
  14646. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
  14647. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
  14648. CMPQ BX, $0x08
  14649. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
  14650. CMPQ BX, $0x10
  14651. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
  14652. CMPQ BX, $0x20
  14653. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
  14654. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
  14655. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: // first and last byte (the same byte when BX == 1)
  14656. MOVB (CX), SI
  14657. MOVB -1(CX)(BX*1), CL
  14658. MOVB SI, (AX)
  14659. MOVB CL, -1(AX)(BX*1)
  14660. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  14661. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
  14662. MOVW (CX), SI
  14663. MOVB 2(CX), CL
  14664. MOVW SI, (AX)
  14665. MOVB CL, 2(AX)
  14666. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  14667. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: // overlapping first and last 4 bytes
  14668. MOVL (CX), SI
  14669. MOVL -4(CX)(BX*1), CX
  14670. MOVL SI, (AX)
  14671. MOVL CX, -4(AX)(BX*1)
  14672. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  14673. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: // overlapping first and last 8 bytes
  14674. MOVQ (CX), SI
  14675. MOVQ -8(CX)(BX*1), CX
  14676. MOVQ SI, (AX)
  14677. MOVQ CX, -8(AX)(BX*1)
  14678. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  14679. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: // overlapping first and last 16 bytes
  14680. MOVOU (CX), X0
  14681. MOVOU -16(CX)(BX*1), X1
  14682. MOVOU X0, (AX)
  14683. MOVOU X1, -16(AX)(BX*1)
  14684. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
  14685. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: // overlapping first and last 32 bytes
  14686. MOVOU (CX), X0
  14687. MOVOU 16(CX), X1
  14688. MOVOU -32(CX)(BX*1), X2
  14689. MOVOU -16(CX)(BX*1), X3
  14690. MOVOU X0, (AX)
  14691. MOVOU X1, 16(AX)
  14692. MOVOU X2, -32(AX)(BX*1)
  14693. MOVOU X3, -16(AX)(BX*1)
  14694. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
  14695. MOVQ DX, AX // advance the dst pointer past the literals
  14696. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
  14697. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
  14698. LEAQ (AX)(SI*1), DX // DX = dst pointer after the literals
  14699. MOVL SI, BX // BX = literal length
  14700. // genMemMoveLong: copy BX bytes from CX to AX; the first and last 32 bytes are loaded up front and stored last
  14701. MOVOU (CX), X0
  14702. MOVOU 16(CX), X1
  14703. MOVOU -32(CX)(BX*1), X2
  14704. MOVOU -16(CX)(BX*1), X3
  14705. MOVQ BX, DI
  14706. SHRQ $0x05, DI // DI = length / 32
  14707. MOVQ AX, SI
  14708. ANDL $0x0000001f, SI
  14709. MOVQ $0x00000040, R8
  14710. SUBQ SI, R8 // R8 = 64 - (AX & 31), so AX + R8 - 32 is 32-byte aligned for the MOVOA stores
  14711. DECQ DI
  14712. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 // DECQ leaves CF as cleared by the SUBQ, so this branches whenever length/32 - 1 != 0
  14713. LEAQ -32(CX)(R8*1), SI
  14714. LEAQ -32(AX)(R8*1), R9
  14715. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
  14716. MOVOU (SI), X4
  14717. MOVOU 16(SI), X5
  14718. MOVOA X4, (R9)
  14719. MOVOA X5, 16(R9)
  14720. ADDQ $0x20, R9
  14721. ADDQ $0x20, SI
  14722. ADDQ $0x20, R8
  14723. DECQ DI
  14724. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
  14725. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: // 32 bytes per iteration, aligned stores
  14726. MOVOU -32(CX)(R8*1), X4
  14727. MOVOU -16(CX)(R8*1), X5
  14728. MOVOA X4, -32(AX)(R8*1)
  14729. MOVOA X5, -16(AX)(R8*1)
  14730. ADDQ $0x20, R8
  14731. CMPQ BX, R8
  14732. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
  14733. MOVOU X0, (AX) // finally store the saved head and tail (unaligned)
  14734. MOVOU X1, 16(AX)
  14735. MOVOU X2, -32(AX)(BX*1)
  14736. MOVOU X3, -16(AX)(BX*1)
  14737. MOVQ DX, AX
  14738. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: // return the number of bytes written
  14739. MOVQ dst_base+0(FP), CX
  14740. SUBQ CX, AX // AX = dst write pointer - dst base
  14741. MOVQ AX, ret+48(FP)
  14742. RET
  14743. // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
  14744. // Requires: BMI, SSE2
  14745. TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
  14746. MOVQ dst_base+0(FP), AX
  14747. MOVQ $0x00000028, CX
  14748. LEAQ 24(SP), DX
  14749. PXOR X0, X0
  14750. zero_loop_encodeSnappyBetterBlockAsm8B:
  14751. MOVOU X0, (DX)
  14752. MOVOU X0, 16(DX)
  14753. MOVOU X0, 32(DX)
  14754. MOVOU X0, 48(DX)
  14755. MOVOU X0, 64(DX)
  14756. MOVOU X0, 80(DX)
  14757. MOVOU X0, 96(DX)
  14758. MOVOU X0, 112(DX)
  14759. ADDQ $0x80, DX
  14760. DECQ CX
  14761. JNZ zero_loop_encodeSnappyBetterBlockAsm8B
  14762. MOVL $0x00000000, 12(SP)
  14763. MOVQ src_len+32(FP), CX
  14764. LEAQ -9(CX), DX
  14765. LEAQ -8(CX), SI
  14766. MOVL SI, 8(SP)
  14767. SHRQ $0x05, CX
  14768. SUBL CX, DX
  14769. LEAQ (AX)(DX*1), DX
  14770. MOVQ DX, (SP)
  14771. MOVL $0x00000001, CX
  14772. MOVL $0x00000000, 16(SP)
  14773. MOVQ src_base+24(FP), DX
  14774. search_loop_encodeSnappyBetterBlockAsm8B:
  14775. MOVL CX, SI
  14776. SUBL 12(SP), SI
  14777. SHRL $0x04, SI
  14778. LEAL 1(CX)(SI*1), SI
  14779. CMPL SI, 8(SP)
  14780. JGE emit_remainder_encodeSnappyBetterBlockAsm8B
  14781. MOVQ (DX)(CX*1), DI
  14782. MOVL SI, 20(SP)
  14783. MOVQ $0x0000cf1bbcdcbf9b, R9
  14784. MOVQ $0x9e3779b1, SI
  14785. MOVQ DI, R10
  14786. MOVQ DI, R11
  14787. SHLQ $0x10, R10
  14788. IMULQ R9, R10
  14789. SHRQ $0x36, R10
  14790. SHLQ $0x20, R11
  14791. IMULQ SI, R11
  14792. SHRQ $0x38, R11
  14793. MOVL 24(SP)(R10*4), SI
  14794. MOVL 4120(SP)(R11*4), R8
  14795. MOVL CX, 24(SP)(R10*4)
  14796. MOVL CX, 4120(SP)(R11*4)
  14797. CMPL (DX)(SI*1), DI
  14798. JEQ candidate_match_encodeSnappyBetterBlockAsm8B
  14799. CMPL (DX)(R8*1), DI
  14800. JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
  14801. MOVL 20(SP), CX
  14802. JMP search_loop_encodeSnappyBetterBlockAsm8B
  14803. candidateS_match_encodeSnappyBetterBlockAsm8B:
  14804. SHRQ $0x08, DI
  14805. MOVQ DI, R10
  14806. SHLQ $0x10, R10
  14807. IMULQ R9, R10
  14808. SHRQ $0x36, R10
  14809. MOVL 24(SP)(R10*4), SI
  14810. INCL CX
  14811. MOVL CX, 24(SP)(R10*4)
  14812. CMPL (DX)(SI*1), DI
  14813. JEQ candidate_match_encodeSnappyBetterBlockAsm8B
  14814. DECL CX
  14815. MOVL R8, SI
  14816. candidate_match_encodeSnappyBetterBlockAsm8B:
  14817. MOVL 12(SP), DI
  14818. TESTL SI, SI
  14819. JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
  14820. match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
  14821. CMPL CX, DI
  14822. JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B
  14823. MOVB -1(DX)(SI*1), BL
  14824. MOVB -1(DX)(CX*1), R8
  14825. CMPB BL, R8
  14826. JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
  14827. LEAL -1(CX), CX
  14828. DECL SI
  14829. JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
  14830. JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
  14831. match_extend_back_end_encodeSnappyBetterBlockAsm8B:
  14832. MOVL CX, DI
  14833. SUBL 12(SP), DI
  14834. LEAQ 3(AX)(DI*1), DI
  14835. CMPQ DI, (SP)
  14836. JL match_dst_size_check_encodeSnappyBetterBlockAsm8B
  14837. MOVQ $0x00000000, ret+48(FP)
  14838. RET
  14839. match_dst_size_check_encodeSnappyBetterBlockAsm8B:
  14840. MOVL CX, DI
  14841. ADDL $0x04, CX
  14842. ADDL $0x04, SI
  14843. MOVQ src_len+32(FP), R8
  14844. SUBL CX, R8
  14845. LEAQ (DX)(CX*1), R9
  14846. LEAQ (DX)(SI*1), R10
  14847. // matchLen
  14848. XORL R12, R12
  14849. CMPL R8, $0x08
  14850. JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
  14851. matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B:
  14852. MOVQ (R9)(R12*1), R11
  14853. XORQ (R10)(R12*1), R11
  14854. TESTQ R11, R11
  14855. JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B
  14856. #ifdef GOAMD64_v3
  14857. TZCNTQ R11, R11
  14858. #else
  14859. BSFQ R11, R11
  14860. #endif
  14861. SARQ $0x03, R11
  14862. LEAL (R12)(R11*1), R12
  14863. JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
  14864. matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B:
  14865. LEAL -8(R8), R8
  14866. LEAL 8(R12), R12
  14867. CMPL R8, $0x08
  14868. JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B
  14869. JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
  14870. matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
  14871. CMPL R8, $0x04
  14872. JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
  14873. MOVL (R9)(R12*1), R11
  14874. CMPL (R10)(R12*1), R11
  14875. JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
  14876. SUBL $0x04, R8
  14877. LEAL 4(R12), R12
  14878. matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
  14879. CMPL R8, $0x02
  14880. JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
  14881. MOVW (R9)(R12*1), R11
  14882. CMPW (R10)(R12*1), R11
  14883. JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
  14884. SUBL $0x02, R8
  14885. LEAL 2(R12), R12
  14886. matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
  14887. CMPL R8, $0x01
  14888. JL match_nolit_end_encodeSnappyBetterBlockAsm8B
  14889. MOVB (R9)(R12*1), R11
  14890. CMPB (R10)(R12*1), R11
  14891. JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
  14892. LEAL 1(R12), R12
  14893. match_nolit_end_encodeSnappyBetterBlockAsm8B:
  14894. MOVL CX, R8
  14895. SUBL SI, R8
  14896. // Check if repeat
  14897. MOVL R8, 16(SP)
  14898. MOVL 12(SP), SI
  14899. CMPL SI, DI
  14900. JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
  14901. MOVL DI, R9
  14902. MOVL DI, 12(SP)
  14903. LEAQ (DX)(SI*1), R10
  14904. SUBL SI, R9
  14905. LEAL -1(R9), SI
  14906. CMPL SI, $0x3c
  14907. JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B
  14908. CMPL SI, $0x00000100
  14909. JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
  14910. MOVB $0xf4, (AX)
  14911. MOVW SI, 1(AX)
  14912. ADDQ $0x03, AX
  14913. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
  14914. two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
  14915. MOVB $0xf0, (AX)
  14916. MOVB SI, 1(AX)
  14917. ADDQ $0x02, AX
  14918. CMPL SI, $0x40
  14919. JL memmove_match_emit_encodeSnappyBetterBlockAsm8B
  14920. JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
  14921. one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
  14922. SHLB $0x02, SI
  14923. MOVB SI, (AX)
  14924. ADDQ $0x01, AX
  14925. memmove_match_emit_encodeSnappyBetterBlockAsm8B:
  14926. LEAQ (AX)(R9*1), SI
  14927. // genMemMoveShort
  14928. CMPQ R9, $0x08
  14929. JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
  14930. CMPQ R9, $0x10
  14931. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
  14932. CMPQ R9, $0x20
  14933. JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
  14934. JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
  14935. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
  14936. MOVQ (R10), R11
  14937. MOVQ R11, (AX)
  14938. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  14939. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
  14940. MOVQ (R10), R11
  14941. MOVQ -8(R10)(R9*1), R10
  14942. MOVQ R11, (AX)
  14943. MOVQ R10, -8(AX)(R9*1)
  14944. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  14945. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
  14946. MOVOU (R10), X0
  14947. MOVOU -16(R10)(R9*1), X1
  14948. MOVOU X0, (AX)
  14949. MOVOU X1, -16(AX)(R9*1)
  14950. JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
  14951. emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
  14952. MOVOU (R10), X0
  14953. MOVOU 16(R10), X1
  14954. MOVOU -32(R10)(R9*1), X2
  14955. MOVOU -16(R10)(R9*1), X3
  14956. MOVOU X0, (AX)
  14957. MOVOU X1, 16(AX)
  14958. MOVOU X2, -32(AX)(R9*1)
  14959. MOVOU X3, -16(AX)(R9*1)
  14960. memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
  14961. MOVQ SI, AX
  14962. JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
  14963. memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
  14964. LEAQ (AX)(R9*1), SI
  14965. // genMemMoveLong
  14966. MOVOU (R10), X0
  14967. MOVOU 16(R10), X1
  14968. MOVOU -32(R10)(R9*1), X2
  14969. MOVOU -16(R10)(R9*1), X3
  14970. MOVQ R9, R13
  14971. SHRQ $0x05, R13
  14972. MOVQ AX, R11
  14973. ANDL $0x0000001f, R11
  14974. MOVQ $0x00000040, R14
  14975. SUBQ R11, R14
  14976. DECQ R13
  14977. JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
  14978. LEAQ -32(R10)(R14*1), R11
  14979. LEAQ -32(AX)(R14*1), R15
  14980. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
  14981. MOVOU (R11), X4
  14982. MOVOU 16(R11), X5
  14983. MOVOA X4, (R15)
  14984. MOVOA X5, 16(R15)
  14985. ADDQ $0x20, R15
  14986. ADDQ $0x20, R11
  14987. ADDQ $0x20, R14
  14988. DECQ R13
  14989. JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
  14990. emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
  14991. MOVOU -32(R10)(R14*1), X4
  14992. MOVOU -16(R10)(R14*1), X5
  14993. MOVOA X4, -32(AX)(R14*1)
  14994. MOVOA X5, -16(AX)(R14*1)
  14995. ADDQ $0x20, R14
  14996. CMPQ R9, R14
  14997. JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
  14998. MOVOU X0, (AX)
  14999. MOVOU X1, 16(AX)
  15000. MOVOU X2, -32(AX)(R9*1)
  15001. MOVOU X3, -16(AX)(R9*1)
  15002. MOVQ SI, AX
  15003. emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
  15004. ADDL R12, CX
  15005. ADDL $0x04, R12
  15006. MOVL CX, 12(SP)
  15007. // emitCopy
  15008. two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
  15009. CMPL R12, $0x40
  15010. JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
  15011. MOVB $0xee, (AX)
  15012. MOVW R8, 1(AX)
  15013. LEAL -60(R12), R12
  15014. ADDQ $0x03, AX
  15015. JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
  15016. two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
  15017. CMPL R12, $0x0c
  15018. JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
  15019. MOVB $0x01, BL
  15020. LEAL -16(BX)(R12*4), R12
  15021. MOVB R8, 1(AX)
  15022. SHRL $0x08, R8
  15023. SHLL $0x05, R8
  15024. ORL R8, R12
  15025. MOVB R12, (AX)
  15026. ADDQ $0x02, AX
  15027. JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
  15028. emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
  15029. MOVB $0x02, BL
  15030. LEAL -4(BX)(R12*4), R12
  15031. MOVB R12, (AX)
  15032. MOVW R8, 1(AX)
  15033. ADDQ $0x03, AX
  15034. match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
  15035. CMPL CX, 8(SP)
  15036. JGE emit_remainder_encodeSnappyBetterBlockAsm8B
  15037. CMPQ AX, (SP)
  15038. JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
  15039. MOVQ $0x00000000, ret+48(FP)
  15040. RET
  15041. match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
  15042. MOVQ $0x0000cf1bbcdcbf9b, SI
  15043. MOVQ $0x9e3779b1, R8
  15044. INCL DI
  15045. MOVQ (DX)(DI*1), R9
  15046. MOVQ R9, R10
  15047. MOVQ R9, R11
  15048. MOVQ R9, R12
  15049. SHRQ $0x08, R11
  15050. MOVQ R11, R13
  15051. SHRQ $0x10, R12
  15052. LEAL 1(DI), R14
  15053. LEAL 2(DI), R15
  15054. MOVQ -2(DX)(CX*1), R9
  15055. SHLQ $0x10, R10
  15056. IMULQ SI, R10
  15057. SHRQ $0x36, R10
  15058. SHLQ $0x10, R13
  15059. IMULQ SI, R13
  15060. SHRQ $0x36, R13
  15061. SHLQ $0x20, R11
  15062. IMULQ R8, R11
  15063. SHRQ $0x38, R11
  15064. SHLQ $0x20, R12
  15065. IMULQ R8, R12
  15066. SHRQ $0x38, R12
  15067. MOVL DI, 24(SP)(R10*4)
  15068. MOVL R14, 24(SP)(R13*4)
  15069. MOVL R14, 4120(SP)(R11*4)
  15070. MOVL R15, 4120(SP)(R12*4)
  15071. MOVQ R9, R10
  15072. MOVQ R9, R11
  15073. SHRQ $0x08, R11
  15074. MOVQ R11, R13
  15075. LEAL -2(CX), R9
  15076. LEAL -1(CX), DI
  15077. SHLQ $0x10, R10
  15078. IMULQ SI, R10
  15079. SHRQ $0x36, R10
  15080. SHLQ $0x20, R11
  15081. IMULQ R8, R11
  15082. SHRQ $0x38, R11
  15083. SHLQ $0x10, R13
  15084. IMULQ SI, R13
  15085. SHRQ $0x36, R13
  15086. MOVL R9, 24(SP)(R10*4)
  15087. MOVL DI, 4120(SP)(R11*4)
  15088. MOVL DI, 24(SP)(R13*4)
  15089. JMP search_loop_encodeSnappyBetterBlockAsm8B
  15090. emit_remainder_encodeSnappyBetterBlockAsm8B:
  15091. MOVQ src_len+32(FP), CX
  15092. SUBL 12(SP), CX
  15093. LEAQ 3(AX)(CX*1), CX
  15094. CMPQ CX, (SP)
  15095. JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B
  15096. MOVQ $0x00000000, ret+48(FP)
  15097. RET
  15098. emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
  15099. MOVQ src_len+32(FP), CX
  15100. MOVL 12(SP), BX
  15101. CMPL BX, CX
  15102. JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
  15103. MOVL CX, SI
  15104. MOVL CX, 12(SP)
  15105. LEAQ (DX)(BX*1), CX
  15106. SUBL BX, SI
  15107. LEAL -1(SI), DX
  15108. CMPL DX, $0x3c
  15109. JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
  15110. CMPL DX, $0x00000100
  15111. JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
  15112. MOVB $0xf4, (AX)
  15113. MOVW DX, 1(AX)
  15114. ADDQ $0x03, AX
  15115. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
  15116. two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15117. MOVB $0xf0, (AX)
  15118. MOVB DL, 1(AX)
  15119. ADDQ $0x02, AX
  15120. CMPL DX, $0x40
  15121. JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
  15122. JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
  15123. one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15124. SHLB $0x02, DL
  15125. MOVB DL, (AX)
  15126. ADDQ $0x01, AX
  15127. memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15128. LEAQ (AX)(SI*1), DX
  15129. MOVL SI, BX
  15130. // genMemMoveShort
  15131. CMPQ BX, $0x03
  15132. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
  15133. JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
  15134. CMPQ BX, $0x08
  15135. JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
  15136. CMPQ BX, $0x10
  15137. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
  15138. CMPQ BX, $0x20
  15139. JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
  15140. JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
  15141. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
  15142. MOVB (CX), SI
  15143. MOVB -1(CX)(BX*1), CL
  15144. MOVB SI, (AX)
  15145. MOVB CL, -1(AX)(BX*1)
  15146. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15147. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
  15148. MOVW (CX), SI
  15149. MOVB 2(CX), CL
  15150. MOVW SI, (AX)
  15151. MOVB CL, 2(AX)
  15152. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15153. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
  15154. MOVL (CX), SI
  15155. MOVL -4(CX)(BX*1), CX
  15156. MOVL SI, (AX)
  15157. MOVL CX, -4(AX)(BX*1)
  15158. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15159. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
  15160. MOVQ (CX), SI
  15161. MOVQ -8(CX)(BX*1), CX
  15162. MOVQ SI, (AX)
  15163. MOVQ CX, -8(AX)(BX*1)
  15164. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15165. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
  15166. MOVOU (CX), X0
  15167. MOVOU -16(CX)(BX*1), X1
  15168. MOVOU X0, (AX)
  15169. MOVOU X1, -16(AX)(BX*1)
  15170. JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
  15171. emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
  15172. MOVOU (CX), X0
  15173. MOVOU 16(CX), X1
  15174. MOVOU -32(CX)(BX*1), X2
  15175. MOVOU -16(CX)(BX*1), X3
  15176. MOVOU X0, (AX)
  15177. MOVOU X1, 16(AX)
  15178. MOVOU X2, -32(AX)(BX*1)
  15179. MOVOU X3, -16(AX)(BX*1)
  15180. memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15181. MOVQ DX, AX
  15182. JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
  15183. memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15184. LEAQ (AX)(SI*1), DX
  15185. MOVL SI, BX
  15186. // genMemMoveLong
  15187. MOVOU (CX), X0
  15188. MOVOU 16(CX), X1
  15189. MOVOU -32(CX)(BX*1), X2
  15190. MOVOU -16(CX)(BX*1), X3
  15191. MOVQ BX, DI
  15192. SHRQ $0x05, DI
  15193. MOVQ AX, SI
  15194. ANDL $0x0000001f, SI
  15195. MOVQ $0x00000040, R8
  15196. SUBQ SI, R8
  15197. DECQ DI
  15198. JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
  15199. LEAQ -32(CX)(R8*1), SI
  15200. LEAQ -32(AX)(R8*1), R9
  15201. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
  15202. MOVOU (SI), X4
  15203. MOVOU 16(SI), X5
  15204. MOVOA X4, (R9)
  15205. MOVOA X5, 16(R9)
  15206. ADDQ $0x20, R9
  15207. ADDQ $0x20, SI
  15208. ADDQ $0x20, R8
  15209. DECQ DI
  15210. JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
  15211. emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
  15212. MOVOU -32(CX)(R8*1), X4
  15213. MOVOU -16(CX)(R8*1), X5
  15214. MOVOA X4, -32(AX)(R8*1)
  15215. MOVOA X5, -16(AX)(R8*1)
  15216. ADDQ $0x20, R8
  15217. CMPQ BX, R8
  15218. JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
  15219. MOVOU X0, (AX)
  15220. MOVOU X1, 16(AX)
  15221. MOVOU X2, -32(AX)(BX*1)
  15222. MOVOU X3, -16(AX)(BX*1)
  15223. MOVQ DX, AX
  15224. emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
  15225. MOVQ dst_base+0(FP), CX
  15226. SUBQ CX, AX
  15227. MOVQ AX, ret+48(FP)
  15228. RET
  15229. // func emitLiteral(dst []byte, lit []byte) int
  //
  // emitLiteral writes a length header for lit at dst, copies the bytes of
  // lit directly after it, and returns header size + len(lit).
  // An empty lit writes nothing and returns 0.
  //
  // Only dst_base, lit_base and lit_len are read; dst_len/dst_cap are never
  // consulted, so no bounds check is performed here.
  // NOTE(review): presumably the caller guarantees dst is large enough - confirm.
  //
  // Register roles: AX = dst write pointer, CX = lit read pointer,
  // DX = len(lit), BX = return value, SI = len(lit)-1 while the header is built.
  15230. // Requires: SSE2
  15231. TEXT ·emitLiteral(SB), NOSPLIT, $0-56
  15232. MOVQ lit_len+32(FP), DX // DX = len(lit)
  15233. MOVQ dst_base+0(FP), AX // AX = dst write pointer
  15234. MOVQ lit_base+24(FP), CX // CX = lit read pointer
  15235. TESTQ DX, DX
  15236. JZ emit_literal_end_standalone_skip // empty literal: return 0 without writing
  15237. MOVL DX, BX // BX = return value; starts at len(lit), header size is added per branch
  15238. LEAL -1(DX), SI // SI = len(lit)-1, the value stored in the header
  // Pick the header width from the magnitude of len(lit)-1.
  15239. CMPL SI, $0x3c
  15240. JLT one_byte_standalone // len-1 < 60: value fits in the tag byte itself
  15241. CMPL SI, $0x00000100
  15242. JLT two_bytes_standalone // len-1 < 2^8: tag + 1 length byte
  15243. CMPL SI, $0x00010000
  15244. JLT three_bytes_standalone // len-1 < 2^16: tag + 2 length bytes
  15245. CMPL SI, $0x01000000
  15246. JLT four_bytes_standalone // len-1 < 2^24: tag + 3 length bytes
  // 5-byte header: tag 0xfc followed by len-1 as a 32-bit little-endian value.
  15247. MOVB $0xfc, (AX)
  15248. MOVL SI, 1(AX)
  15249. ADDQ $0x05, BX
  15250. ADDQ $0x05, AX
  15251. JMP memmove_long_standalone
  15252. four_bytes_standalone: // 4-byte header: tag 0xf8 + low 24 bits of len-1
  15253. MOVL SI, DI
  15254. SHRL $0x10, DI // DI = bits 16..23 of len-1
  15255. MOVB $0xf8, (AX)
  15256. MOVW SI, 1(AX)
  15257. MOVB DI, 3(AX)
  15258. ADDQ $0x04, BX
  15259. ADDQ $0x04, AX
  15260. JMP memmove_long_standalone
  15261. three_bytes_standalone: // 3-byte header: tag 0xf4 + 16-bit len-1
  15262. MOVB $0xf4, (AX)
  15263. MOVW SI, 1(AX)
  15264. ADDQ $0x03, BX
  15265. ADDQ $0x03, AX
  15266. JMP memmove_long_standalone
  15267. two_bytes_standalone: // 2-byte header: tag 0xf0 + 8-bit len-1
  15268. MOVB $0xf0, (AX)
  15269. MOVB SI, 1(AX)
  15270. ADDQ $0x02, BX
  15271. ADDQ $0x02, AX
  15272. CMPL SI, $0x40
  15273. JL memmove_standalone // len-1 < 64, i.e. len <= 64: the short copy below is enough
  15274. JMP memmove_long_standalone
  15275. one_byte_standalone: // 1-byte header: (len-1) << 2
  15276. SHLB $0x02, SI
  15277. MOVB SI, (AX)
  15278. ADDQ $0x01, BX
  15279. ADDQ $0x01, AX
  15280. memmove_standalone:
  15281. // genMemMoveShort
  // Copies DX (1..64) bytes from CX to AX. Each size class loads everything
  // into registers first and then stores a head and a tail piece that may
  // overlap, so no per-byte loop is needed. AX is not advanced afterwards;
  // BX already holds the final return value.
  15282. CMPQ DX, $0x03
  15283. JB emit_lit_memmove_standalone_memmove_move_1or2
  15284. JE emit_lit_memmove_standalone_memmove_move_3
  15285. CMPQ DX, $0x08
  15286. JB emit_lit_memmove_standalone_memmove_move_4through7
  15287. CMPQ DX, $0x10
  15288. JBE emit_lit_memmove_standalone_memmove_move_8through16
  15289. CMPQ DX, $0x20
  15290. JBE emit_lit_memmove_standalone_memmove_move_17through32
  15291. JMP emit_lit_memmove_standalone_memmove_move_33through64
  15292. emit_lit_memmove_standalone_memmove_move_1or2: // first byte + last byte (same byte when DX == 1)
  15293. MOVB (CX), SI
  15294. MOVB -1(CX)(DX*1), CL // CL (low byte of the source pointer) is reused as scratch; CX is not needed afterwards
  15295. MOVB SI, (AX)
  15296. MOVB CL, -1(AX)(DX*1)
  15297. JMP emit_literal_end_standalone
  15298. emit_lit_memmove_standalone_memmove_move_3: // 2 bytes + 1 byte
  15299. MOVW (CX), SI
  15300. MOVB 2(CX), CL
  15301. MOVW SI, (AX)
  15302. MOVB CL, 2(AX)
  15303. JMP emit_literal_end_standalone
  15304. emit_lit_memmove_standalone_memmove_move_4through7: // first 4 bytes + last 4 bytes
  15305. MOVL (CX), SI
  15306. MOVL -4(CX)(DX*1), CX
  15307. MOVL SI, (AX)
  15308. MOVL CX, -4(AX)(DX*1)
  15309. JMP emit_literal_end_standalone
  15310. emit_lit_memmove_standalone_memmove_move_8through16: // first 8 bytes + last 8 bytes
  15311. MOVQ (CX), SI
  15312. MOVQ -8(CX)(DX*1), CX
  15313. MOVQ SI, (AX)
  15314. MOVQ CX, -8(AX)(DX*1)
  15315. JMP emit_literal_end_standalone
  15316. emit_lit_memmove_standalone_memmove_move_17through32: // first 16 bytes + last 16 bytes
  15317. MOVOU (CX), X0
  15318. MOVOU -16(CX)(DX*1), X1
  15319. MOVOU X0, (AX)
  15320. MOVOU X1, -16(AX)(DX*1)
  15321. JMP emit_literal_end_standalone
  15322. emit_lit_memmove_standalone_memmove_move_33through64: // first 32 bytes + last 32 bytes
  15323. MOVOU (CX), X0
  15324. MOVOU 16(CX), X1
  15325. MOVOU -32(CX)(DX*1), X2
  15326. MOVOU -16(CX)(DX*1), X3
  15327. MOVOU X0, (AX)
  15328. MOVOU X1, 16(AX)
  15329. MOVOU X2, -32(AX)(DX*1)
  15330. MOVOU X3, -16(AX)(DX*1)
  15331. JMP emit_literal_end_standalone
  15332. JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP
  15333. memmove_long_standalone:
  15334. // genMemMoveLong
  // Copies DX bytes from CX to AX. Every jump to this label follows a
  // len-1 >= 64 comparison, so DX >= 65 here. The first and last 32 bytes
  // are kept in X0..X3 and stored unaligned at the very end; the middle is
  // copied 32 bytes at a time with aligned stores.
  15335. MOVOU (CX), X0
  15336. MOVOU 16(CX), X1
  15337. MOVOU -32(CX)(DX*1), X2
  15338. MOVOU -16(CX)(DX*1), X3
  15339. MOVQ DX, DI
  15340. SHRQ $0x05, DI // DI = number of whole 32-byte chunks in the length
  15341. MOVQ AX, SI
  15342. ANDL $0x0000001f, SI // SI = misalignment of dst within a 32-byte line
  15343. MOVQ $0x00000040, R8
  15344. SUBQ SI, R8 // R8 = 64 - (AX & 31), so -32(AX)(R8*1) is 32-byte aligned
  15345. DECQ DI
  // NOTE(review): DECQ leaves CF as set by the SUBQ above (no borrow) and
  // DI >= 2 before the decrement, so this JA appears to be always taken and
  // the big_loop_back block below looks unreachable from here - confirm
  // against the generator before relying on it.
  15346. JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
  15347. LEAQ -32(CX)(R8*1), SI
  15348. LEAQ -32(AX)(R8*1), R9
  15349. emit_lit_memmove_long_standalonelarge_big_loop_back:
  15350. MOVOU (SI), X4
  15351. MOVOU 16(SI), X5
  15352. MOVOA X4, (R9)
  15353. MOVOA X5, 16(R9)
  15354. ADDQ $0x20, R9
  15355. ADDQ $0x20, SI
  15356. ADDQ $0x20, R8
  15357. DECQ DI
  15358. JNA emit_lit_memmove_long_standalonelarge_big_loop_back
  15359. emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: // copy 32 bytes ending at offset R8, aligned stores
  15360. MOVOU -32(CX)(R8*1), X4
  15361. MOVOU -16(CX)(R8*1), X5
  15362. MOVOA X4, -32(AX)(R8*1)
  15363. MOVOA X5, -16(AX)(R8*1)
  15364. ADDQ $0x20, R8
  15365. CMPQ DX, R8
  15366. JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // continue while the next chunk still ends inside the literal
  // Store the saved head and tail; they cover whatever the aligned loop skipped.
  15367. MOVOU X0, (AX)
  15368. MOVOU X1, 16(AX)
  15369. MOVOU X2, -32(AX)(DX*1)
  15370. MOVOU X3, -16(AX)(DX*1)
  15371. JMP emit_literal_end_standalone
  15372. JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP
  15373. emit_literal_end_standalone_skip:
  15374. XORQ BX, BX // nothing written: return 0
  15375. emit_literal_end_standalone:
  15376. MOVQ BX, ret+48(FP)
  15377. RET
  15378. // func emitRepeat(dst []byte, offset int, length int) int
  //
  // emitRepeat writes the encoding chosen below for a repeat of `length`
  // bytes at dst and returns the number of bytes written (2 to 5 per code;
  // very long lengths emit several 5-byte codes).
  //
  // offset is only used by the 2-byte form that embeds the high offset bits
  // (taken when 8 < length < 12 and offset < 2048).
  // dst_len/dst_cap are never read, so no bounds check is performed here.
  //
  // Register roles: AX = dst write pointer, BX = bytes written,
  // CX = offset (reused as scratch in the 5-byte form), DX = length, then length-4.
  15379. TEXT ·emitRepeat(SB), NOSPLIT, $0-48
  15380. XORQ BX, BX // BX = bytes written so far
  15381. MOVQ dst_base+0(FP), AX
  15382. MOVQ offset+24(FP), CX
  15383. MOVQ length+32(FP), DX
  15384. // emitRepeat
  15385. emit_repeat_again_standalone:
  15386. MOVL DX, SI // SI = length as passed in (or remaining after a maximal code)
  15387. LEAL -4(DX), DX // DX = length-4; LEAL leaves the flags untouched
  15388. CMPL SI, $0x08
  15389. JLE repeat_two_standalone // length <= 8: 2-byte form without offset bits
  15390. CMPL SI, $0x0c
  15391. JGE cant_repeat_two_offset_standalone // length >= 12 cannot use the offset-carrying 2-byte form
  15392. CMPL CX, $0x00000800
  15393. JLT repeat_two_offset_standalone // 8 < length < 12 and offset < 2048
  15394. cant_repeat_two_offset_standalone:
  15395. CMPL DX, $0x00000104
  15396. JLT repeat_three_standalone // length-4 < 260
  15397. CMPL DX, $0x00010100
  15398. JLT repeat_four_standalone // length-4 < 65792
  15399. CMPL DX, $0x0100ffff
  15400. JLT repeat_five_standalone // length-4 < 0x0100ffff
  // Length too large for a single code: write the bytes 1d 00 fb ff ff,
  // take 16842747 (0x0100fffb) off the remaining count and go round again.
  15401. LEAL -16842747(DX), DX
  15402. MOVW $0x001d, (AX)
  15403. MOVW $0xfffb, 2(AX)
  15404. MOVB $0xff, 4(AX)
  15405. ADDQ $0x05, AX
  15406. ADDQ $0x05, BX
  15407. JMP emit_repeat_again_standalone
  15408. repeat_five_standalone: // 5 bytes: 1d 00 + 24-bit (length-4-65536)
  15409. LEAL -65536(DX), DX
  15410. MOVL DX, CX // CX (offset) is no longer needed; reuse it for the top byte
  15411. MOVW $0x001d, (AX)
  15412. MOVW DX, 2(AX)
  15413. SARL $0x10, CX
  15414. MOVB CL, 4(AX)
  15415. ADDQ $0x05, BX
  15416. ADDQ $0x05, AX
  15417. JMP gen_emit_repeat_end
  15418. repeat_four_standalone: // 4 bytes: 19 00 + 16-bit (length-4-256)
  15419. LEAL -256(DX), DX
  15420. MOVW $0x0019, (AX)
  15421. MOVW DX, 2(AX)
  15422. ADDQ $0x04, BX
  15423. ADDQ $0x04, AX
  15424. JMP gen_emit_repeat_end
  15425. repeat_three_standalone: // 3 bytes: 15 00 + 8-bit (length-8)
  15426. LEAL -4(DX), DX
  15427. MOVW $0x0015, (AX)
  15428. MOVB DL, 2(AX)
  15429. ADDQ $0x03, BX
  15430. ADDQ $0x03, AX
  15431. JMP gen_emit_repeat_end
  15432. repeat_two_standalone: // 2 bytes: 16-bit value ((length-4) << 2) | 1
  15433. SHLL $0x02, DX
  15434. ORL $0x01, DX
  15435. MOVW DX, (AX)
  15436. ADDQ $0x02, BX
  15437. ADDQ $0x02, AX
  15438. JMP gen_emit_repeat_end
  15439. repeat_two_offset_standalone: // 2 bytes carrying the offset
  15440. XORQ SI, SI
  15441. LEAL 1(SI)(DX*4), DX // DX = 1 + 4*(length-4)
  15442. MOVB CL, 1(AX) // second byte = low 8 bits of offset
  15443. SARL $0x08, CX
  15444. SHLL $0x05, CX // (offset >> 8) moved into bits 5 and up
  15445. ORL CX, DX
  15446. MOVB DL, (AX) // first byte = 1 | (length-4)<<2 | (offset>>8)<<5
  15447. ADDQ $0x02, BX
  15448. ADDQ $0x02, AX
  15449. gen_emit_repeat_end:
  15450. MOVQ BX, ret+40(FP) // return number of bytes written
  15451. RET
  15452. // func emitCopy(dst []byte, offset int, length int) int
  //
  // emitCopy writes an encoding of a copy of `length` bytes at distance
  // `offset` to dst and returns the number of bytes written.
  //
  // Overview of the paths below:
  //   offset >= 65536: 5-byte codes (tag + 32-bit offset). If length > 64, one
  //     code accounting for 64 bytes is written; a remainder of 4 or more goes
  //     through an inlined copy of the repeat encoder, a smaller remainder
  //     (1..3) is written as another 5-byte code.
  //   offset <  65536 and length > 64: one leading copy code (2 bytes covering
  //     8 bytes of length when offset < 2048, otherwise 3 bytes covering 60),
  //     followed by the inlined repeat encoder for the remainder.
  //   offset <  65536 and length <= 64: a single 2-byte code when length < 12
  //     and offset < 2048, otherwise a single 3-byte code.
  //
  // dst_len/dst_cap are never read, so no bounds check is performed here.
  // Register roles: AX = dst write pointer, BX = bytes written,
  // CX = offset, DX = remaining length, SI/DI = scratch.
  15453. TEXT ·emitCopy(SB), NOSPLIT, $0-48
  15454. XORQ BX, BX // BX = bytes written so far
  15455. MOVQ dst_base+0(FP), AX
  15456. MOVQ offset+24(FP), CX
  15457. MOVQ length+32(FP), DX
  15458. // emitCopy
  15459. CMPL CX, $0x00010000
  15460. JL two_byte_offset_standalone // offset < 65536: 16-bit (or shorter) offset forms
  15461. four_bytes_loop_back_standalone:
  15462. CMPL DX, $0x40
  15463. JLE four_bytes_remain_standalone // length <= 64 fits one 5-byte code
  // length > 64: tag 0xff (= 3 | 63<<2) + 32-bit offset, accounting for 64 bytes.
  15464. MOVB $0xff, (AX)
  15465. MOVL CX, 1(AX)
  15466. LEAL -64(DX), DX
  15467. ADDQ $0x05, BX
  15468. ADDQ $0x05, AX
  15469. CMPL DX, $0x04
  15470. JL four_bytes_remain_standalone // fewer than 4 bytes left: finish with another 5-byte code
  15471. // emitRepeat
  // Inlined repeat encoder for the remainder; every terminal branch jumps to gen_emit_copy_end.
  15472. emit_repeat_again_standalone_emit_copy:
  15473. MOVL DX, SI // SI = remaining length
  15474. LEAL -4(DX), DX // DX = remaining-4; LEAL leaves the flags untouched
  15475. CMPL SI, $0x08
  15476. JLE repeat_two_standalone_emit_copy
  15477. CMPL SI, $0x0c
  15478. JGE cant_repeat_two_offset_standalone_emit_copy
  15479. CMPL CX, $0x00000800
  15480. JLT repeat_two_offset_standalone_emit_copy
  15481. cant_repeat_two_offset_standalone_emit_copy:
  15482. CMPL DX, $0x00000104
  15483. JLT repeat_three_standalone_emit_copy
  15484. CMPL DX, $0x00010100
  15485. JLT repeat_four_standalone_emit_copy
  15486. CMPL DX, $0x0100ffff
  15487. JLT repeat_five_standalone_emit_copy
  // Too long for one code: write 1d 00 fb ff ff, subtract 16842747 and loop.
  15488. LEAL -16842747(DX), DX
  15489. MOVW $0x001d, (AX)
  15490. MOVW $0xfffb, 2(AX)
  15491. MOVB $0xff, 4(AX)
  15492. ADDQ $0x05, AX
  15493. ADDQ $0x05, BX
  15494. JMP emit_repeat_again_standalone_emit_copy
  15495. repeat_five_standalone_emit_copy: // 5 bytes: 1d 00 + 24-bit value
  15496. LEAL -65536(DX), DX
  15497. MOVL DX, CX // offset no longer needed; CX becomes scratch
  15498. MOVW $0x001d, (AX)
  15499. MOVW DX, 2(AX)
  15500. SARL $0x10, CX
  15501. MOVB CL, 4(AX)
  15502. ADDQ $0x05, BX
  15503. ADDQ $0x05, AX
  15504. JMP gen_emit_copy_end
  15505. repeat_four_standalone_emit_copy: // 4 bytes: 19 00 + 16-bit value
  15506. LEAL -256(DX), DX
  15507. MOVW $0x0019, (AX)
  15508. MOVW DX, 2(AX)
  15509. ADDQ $0x04, BX
  15510. ADDQ $0x04, AX
  15511. JMP gen_emit_copy_end
  15512. repeat_three_standalone_emit_copy: // 3 bytes: 15 00 + 8-bit value
  15513. LEAL -4(DX), DX
  15514. MOVW $0x0015, (AX)
  15515. MOVB DL, 2(AX)
  15516. ADDQ $0x03, BX
  15517. ADDQ $0x03, AX
  15518. JMP gen_emit_copy_end
  15519. repeat_two_standalone_emit_copy: // 2 bytes: ((remaining-4) << 2) | 1 as a 16-bit value
  15520. SHLL $0x02, DX
  15521. ORL $0x01, DX
  15522. MOVW DX, (AX)
  15523. ADDQ $0x02, BX
  15524. ADDQ $0x02, AX
  15525. JMP gen_emit_copy_end
  15526. repeat_two_offset_standalone_emit_copy: // 2 bytes carrying the offset
  15527. XORQ SI, SI
  15528. LEAL 1(SI)(DX*4), DX // DX = 1 + 4*(remaining-4)
  15529. MOVB CL, 1(AX) // second byte = low 8 bits of offset
  15530. SARL $0x08, CX
  15531. SHLL $0x05, CX
  15532. ORL CX, DX
  15533. MOVB DL, (AX)
  15534. ADDQ $0x02, BX
  15535. ADDQ $0x02, AX
  15536. JMP gen_emit_copy_end
  15537. JMP four_bytes_loop_back_standalone // unreachable: directly follows an unconditional JMP
  15538. four_bytes_remain_standalone:
  15539. TESTL DX, DX
  15540. JZ gen_emit_copy_end // nothing left to encode
  // 5-byte code: tag = 3 | (length-1)<<2, then the 32-bit offset.
  // MOVB only sets the low byte of SI, which is sufficient because only DL is stored.
  15541. MOVB $0x03, SI
  15542. LEAL -4(SI)(DX*4), DX
  15543. MOVB DL, (AX)
  15544. MOVL CX, 1(AX)
  15545. ADDQ $0x05, BX
  15546. ADDQ $0x05, AX
  15547. JMP gen_emit_copy_end
  15548. two_byte_offset_standalone:
  15549. CMPL DX, $0x40
  15550. JLE two_byte_offset_short_standalone // length <= 64: a single code suffices
  15551. CMPL CX, $0x00000800
  15552. JAE long_offset_short_standalone // offset >= 2048 needs the 16-bit offset form
  // length > 64 and offset < 2048: 2-byte code with tag 17 (= 1 | (8-4)<<2)
  // plus the offset bits, accounting for 8 bytes; the rest goes to the repeat encoder.
  15553. MOVL $0x00000001, SI
  15554. LEAL 16(SI), SI // SI = 17
  15555. MOVB CL, 1(AX) // second byte = low 8 bits of offset
  15556. MOVL CX, DI
  15557. SHRL $0x08, DI
  15558. SHLL $0x05, DI // (offset >> 8) moved into bits 5 and up
  15559. ORL DI, SI
  15560. MOVB SI, (AX)
  15561. ADDQ $0x02, BX
  15562. ADDQ $0x02, AX
  15563. SUBL $0x08, DX
  15564. // emitRepeat
  // Enter the repeat encoder past its two short forms: DX was > 64 before the
  // SUBL, so the remainder is > 56 and neither short form could apply.
  15565. LEAL -4(DX), DX
  15566. JMP cant_repeat_two_offset_standalone_emit_copy_short_2b
  15567. emit_repeat_again_standalone_emit_copy_short_2b: // only reached from the maximal-code loop below
  15568. MOVL DX, SI
  15569. LEAL -4(DX), DX
  15570. CMPL SI, $0x08
  15571. JLE repeat_two_standalone_emit_copy_short_2b
  15572. CMPL SI, $0x0c
  15573. JGE cant_repeat_two_offset_standalone_emit_copy_short_2b
  15574. CMPL CX, $0x00000800
  15575. JLT repeat_two_offset_standalone_emit_copy_short_2b
  15576. cant_repeat_two_offset_standalone_emit_copy_short_2b:
  15577. CMPL DX, $0x00000104
  15578. JLT repeat_three_standalone_emit_copy_short_2b
  15579. CMPL DX, $0x00010100
  15580. JLT repeat_four_standalone_emit_copy_short_2b
  15581. CMPL DX, $0x0100ffff
  15582. JLT repeat_five_standalone_emit_copy_short_2b
  // Too long for one code: write 1d 00 fb ff ff, subtract 16842747 and loop.
  15583. LEAL -16842747(DX), DX
  15584. MOVW $0x001d, (AX)
  15585. MOVW $0xfffb, 2(AX)
  15586. MOVB $0xff, 4(AX)
  15587. ADDQ $0x05, AX
  15588. ADDQ $0x05, BX
  15589. JMP emit_repeat_again_standalone_emit_copy_short_2b
  15590. repeat_five_standalone_emit_copy_short_2b: // 5 bytes: 1d 00 + 24-bit value
  15591. LEAL -65536(DX), DX
  15592. MOVL DX, CX // offset no longer needed; CX becomes scratch
  15593. MOVW $0x001d, (AX)
  15594. MOVW DX, 2(AX)
  15595. SARL $0x10, CX
  15596. MOVB CL, 4(AX)
  15597. ADDQ $0x05, BX
  15598. ADDQ $0x05, AX
  15599. JMP gen_emit_copy_end
  15600. repeat_four_standalone_emit_copy_short_2b: // 4 bytes: 19 00 + 16-bit value
  15601. LEAL -256(DX), DX
  15602. MOVW $0x0019, (AX)
  15603. MOVW DX, 2(AX)
  15604. ADDQ $0x04, BX
  15605. ADDQ $0x04, AX
  15606. JMP gen_emit_copy_end
  15607. repeat_three_standalone_emit_copy_short_2b: // 3 bytes: 15 00 + 8-bit value
  15608. LEAL -4(DX), DX
  15609. MOVW $0x0015, (AX)
  15610. MOVB DL, 2(AX)
  15611. ADDQ $0x03, BX
  15612. ADDQ $0x03, AX
  15613. JMP gen_emit_copy_end
  15614. repeat_two_standalone_emit_copy_short_2b: // 2 bytes: ((remaining-4) << 2) | 1 as a 16-bit value
  15615. SHLL $0x02, DX
  15616. ORL $0x01, DX
  15617. MOVW DX, (AX)
  15618. ADDQ $0x02, BX
  15619. ADDQ $0x02, AX
  15620. JMP gen_emit_copy_end
  15621. repeat_two_offset_standalone_emit_copy_short_2b: // 2 bytes carrying the offset
  15622. XORQ SI, SI
  15623. LEAL 1(SI)(DX*4), DX
  15624. MOVB CL, 1(AX)
  15625. SARL $0x08, CX
  15626. SHLL $0x05, CX
  15627. ORL CX, DX
  15628. MOVB DL, (AX)
  15629. ADDQ $0x02, BX
  15630. ADDQ $0x02, AX
  15631. JMP gen_emit_copy_end
  15632. long_offset_short_standalone:
  // length > 64 and 2048 <= offset < 65536: tag 0xee (= 2 | 59<<2) + 16-bit
  // offset, accounting for 60 bytes; the rest goes to the repeat encoder.
  15633. MOVB $0xee, (AX)
  15634. MOVW CX, 1(AX)
  15635. LEAL -60(DX), DX
  15636. ADDQ $0x03, AX
  15637. ADDQ $0x03, BX
  15638. // emitRepeat
  15639. emit_repeat_again_standalone_emit_copy_short:
  15640. MOVL DX, SI // SI = remaining length
  15641. LEAL -4(DX), DX // DX = remaining-4; LEAL leaves the flags untouched
  15642. CMPL SI, $0x08
  15643. JLE repeat_two_standalone_emit_copy_short
  15644. CMPL SI, $0x0c
  15645. JGE cant_repeat_two_offset_standalone_emit_copy_short
  15646. CMPL CX, $0x00000800
  15647. JLT repeat_two_offset_standalone_emit_copy_short
  15648. cant_repeat_two_offset_standalone_emit_copy_short:
  15649. CMPL DX, $0x00000104
  15650. JLT repeat_three_standalone_emit_copy_short
  15651. CMPL DX, $0x00010100
  15652. JLT repeat_four_standalone_emit_copy_short
  15653. CMPL DX, $0x0100ffff
  15654. JLT repeat_five_standalone_emit_copy_short
  // Too long for one code: write 1d 00 fb ff ff, subtract 16842747 and loop.
  15655. LEAL -16842747(DX), DX
  15656. MOVW $0x001d, (AX)
  15657. MOVW $0xfffb, 2(AX)
  15658. MOVB $0xff, 4(AX)
  15659. ADDQ $0x05, AX
  15660. ADDQ $0x05, BX
  15661. JMP emit_repeat_again_standalone_emit_copy_short
  15662. repeat_five_standalone_emit_copy_short: // 5 bytes: 1d 00 + 24-bit value
  15663. LEAL -65536(DX), DX
  15664. MOVL DX, CX // offset no longer needed; CX becomes scratch
  15665. MOVW $0x001d, (AX)
  15666. MOVW DX, 2(AX)
  15667. SARL $0x10, CX
  15668. MOVB CL, 4(AX)
  15669. ADDQ $0x05, BX
  15670. ADDQ $0x05, AX
  15671. JMP gen_emit_copy_end
  15672. repeat_four_standalone_emit_copy_short: // 4 bytes: 19 00 + 16-bit value
  15673. LEAL -256(DX), DX
  15674. MOVW $0x0019, (AX)
  15675. MOVW DX, 2(AX)
  15676. ADDQ $0x04, BX
  15677. ADDQ $0x04, AX
  15678. JMP gen_emit_copy_end
  15679. repeat_three_standalone_emit_copy_short: // 3 bytes: 15 00 + 8-bit value
  15680. LEAL -4(DX), DX
  15681. MOVW $0x0015, (AX)
  15682. MOVB DL, 2(AX)
  15683. ADDQ $0x03, BX
  15684. ADDQ $0x03, AX
  15685. JMP gen_emit_copy_end
  15686. repeat_two_standalone_emit_copy_short: // 2 bytes: ((remaining-4) << 2) | 1 as a 16-bit value
  15687. SHLL $0x02, DX
  15688. ORL $0x01, DX
  15689. MOVW DX, (AX)
  15690. ADDQ $0x02, BX
  15691. ADDQ $0x02, AX
  15692. JMP gen_emit_copy_end
  15693. repeat_two_offset_standalone_emit_copy_short: // 2 bytes carrying the offset
  15694. XORQ SI, SI
  15695. LEAL 1(SI)(DX*4), DX
  15696. MOVB CL, 1(AX)
  15697. SARL $0x08, CX
  15698. SHLL $0x05, CX
  15699. ORL CX, DX
  15700. MOVB DL, (AX)
  15701. ADDQ $0x02, BX
  15702. ADDQ $0x02, AX
  15703. JMP gen_emit_copy_end
  15704. JMP two_byte_offset_standalone // unreachable: directly follows an unconditional JMP
  15705. two_byte_offset_short_standalone:
  15706. CMPL DX, $0x0c
  15707. JGE emit_copy_three_standalone // length >= 12 needs the 3-byte form
  15708. CMPL CX, $0x00000800
  15709. JGE emit_copy_three_standalone // offset >= 2048 needs the 3-byte form
  // 2-byte code: first byte = 1 | (length-4)<<2 | (offset>>8)<<5, second = offset low byte.
  // MOVB only sets the low byte of SI, which is sufficient because only DL is stored.
  15710. MOVB $0x01, SI
  15711. LEAL -16(SI)(DX*4), DX
  15712. MOVB CL, 1(AX)
  15713. SHRL $0x08, CX
  15714. SHLL $0x05, CX
  15715. ORL CX, DX
  15716. MOVB DL, (AX)
  15717. ADDQ $0x02, BX
  15718. ADDQ $0x02, AX
  15719. JMP gen_emit_copy_end
  15720. emit_copy_three_standalone:
  // 3-byte code: first byte = 2 | (length-1)<<2, followed by the 16-bit offset.
  15721. MOVB $0x02, SI
  15722. LEAL -4(SI)(DX*4), DX
  15723. MOVB DL, (AX)
  15724. MOVW CX, 1(AX)
  15725. ADDQ $0x03, BX
  15726. ADDQ $0x03, AX
  15727. gen_emit_copy_end:
  15728. MOVQ BX, ret+40(FP) // return number of bytes written
  15729. RET
  15730. // func emitCopyNoRepeat(dst []byte, offset int, length int) int
  //
  // emitCopyNoRepeat writes an encoding of a copy of `length` bytes at
  // distance `offset` to dst using copy codes only (no repeat codes are
  // emitted on any path) and returns the number of bytes written.
  //
  //   offset >= 65536: 5-byte codes (tag + 32-bit offset), each accounting for
  //     up to 64 bytes of length.
  //   offset <  65536: while length > 64, 3-byte codes accounting for 60 bytes
  //     each; then one 2-byte code (length < 12 and offset < 2048) or one
  //     3-byte code for what is left.
  //
  // dst_len/dst_cap are never read, so no bounds check is performed here.
  // Register roles: AX = dst write pointer, BX = bytes written,
  // CX = offset, DX = remaining length, SI = scratch.
  15731. TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
  15732. XORQ BX, BX // BX = bytes written so far
  15733. MOVQ dst_base+0(FP), AX
  15734. MOVQ offset+24(FP), CX
  15735. MOVQ length+32(FP), DX
  15736. // emitCopy
  15737. CMPL CX, $0x00010000
  15738. JL two_byte_offset_standalone_snappy // offset < 65536
  15739. four_bytes_loop_back_standalone_snappy:
  15740. CMPL DX, $0x40
  15741. JLE four_bytes_remain_standalone_snappy // length <= 64 fits one 5-byte code
  // length > 64: tag 0xff (= 3 | 63<<2) + 32-bit offset, accounting for 64 bytes.
  15742. MOVB $0xff, (AX)
  15743. MOVL CX, 1(AX)
  15744. LEAL -64(DX), DX
  15745. ADDQ $0x05, BX
  15746. ADDQ $0x05, AX
  15747. CMPL DX, $0x04
  15748. JL four_bytes_remain_standalone_snappy // fewer than 4 bytes left
  15749. JMP four_bytes_loop_back_standalone_snappy
  15750. four_bytes_remain_standalone_snappy:
  15751. TESTL DX, DX
  15752. JZ gen_emit_copy_end_snappy // nothing left to encode
  // 5-byte code: tag = 3 | (length-1)<<2, then the 32-bit offset.
  // MOVB only sets the low byte of SI, which is sufficient because only DL is stored.
  15753. MOVB $0x03, SI
  15754. LEAL -4(SI)(DX*4), DX
  15755. MOVB DL, (AX)
  15756. MOVL CX, 1(AX)
  15757. ADDQ $0x05, BX
  15758. ADDQ $0x05, AX
  15759. JMP gen_emit_copy_end_snappy
  15760. two_byte_offset_standalone_snappy:
  15761. CMPL DX, $0x40
  15762. JLE two_byte_offset_short_standalone_snappy // length <= 64: finish with a single code
  // length > 64: tag 0xee (= 2 | 59<<2) + 16-bit offset, accounting for 60 bytes, then re-check.
  15763. MOVB $0xee, (AX)
  15764. MOVW CX, 1(AX)
  15765. LEAL -60(DX), DX
  15766. ADDQ $0x03, AX
  15767. ADDQ $0x03, BX
  15768. JMP two_byte_offset_standalone_snappy
  15769. two_byte_offset_short_standalone_snappy:
  15770. CMPL DX, $0x0c
  15771. JGE emit_copy_three_standalone_snappy // length >= 12 needs the 3-byte form
  15772. CMPL CX, $0x00000800
  15773. JGE emit_copy_three_standalone_snappy // offset >= 2048 needs the 3-byte form
  // 2-byte code: first byte = 1 | (length-4)<<2 | (offset>>8)<<5, second = offset low byte.
  15774. MOVB $0x01, SI
  15775. LEAL -16(SI)(DX*4), DX
  15776. MOVB CL, 1(AX)
  15777. SHRL $0x08, CX
  15778. SHLL $0x05, CX
  15779. ORL CX, DX
  15780. MOVB DL, (AX)
  15781. ADDQ $0x02, BX
  15782. ADDQ $0x02, AX
  15783. JMP gen_emit_copy_end_snappy
  15784. emit_copy_three_standalone_snappy:
  // 3-byte code: first byte = 2 | (length-1)<<2, followed by the 16-bit offset.
  15785. MOVB $0x02, SI
  15786. LEAL -4(SI)(DX*4), DX
  15787. MOVB DL, (AX)
  15788. MOVW CX, 1(AX)
  15789. ADDQ $0x03, BX
  15790. ADDQ $0x03, AX
  15791. gen_emit_copy_end_snappy:
  15792. MOVQ BX, ret+40(FP) // return number of bytes written
  15793. RET
  15794. // func matchLen(a []byte, b []byte) int
  //
  // matchLen returns the number of leading bytes that are equal in a and b,
  // examining at most len(a) bytes.
  //
  // Only a_base, b_base and a_len are read; b_len is never consulted.
  // NOTE(review): presumably the caller guarantees len(b) >= len(a) - confirm.
  //
  // Register roles: AX = a base, CX = b base, DX = bytes left to compare,
  // SI = bytes matched so far (the result), BX = scratch.
  15795. // Requires: BMI
  15796. TEXT ·matchLen(SB), NOSPLIT, $0-56
  15797. MOVQ a_base+0(FP), AX
  15798. MOVQ b_base+24(FP), CX
  15799. MOVQ a_len+8(FP), DX
  15800. // matchLen
  15801. XORL SI, SI // SI = 0 bytes matched
  15802. CMPL DX, $0x08
  15803. JL matchlen_match4_standalone // fewer than 8 bytes: go straight to the 4/2/1 tail
  15804. matchlen_loopback_standalone: // compare 8 bytes per iteration
  15805. MOVQ (AX)(SI*1), BX
  15806. XORQ (CX)(SI*1), BX // BX has a set bit wherever the two 8-byte words differ
  15807. TESTQ BX, BX
  15808. JZ matchlen_loop_standalone // all 8 bytes equal
  // Mismatch inside this word: the index of the lowest set bit, divided by 8,
  // is the number of equal bytes before the first difference (little-endian load).
  15809. #ifdef GOAMD64_v3
  15810. TZCNTQ BX, BX
  15811. #else
  15812. BSFQ BX, BX
  15813. #endif
  15814. SARQ $0x03, BX
  15815. LEAL (SI)(BX*1), SI
  15816. JMP gen_match_len_end
  15817. matchlen_loop_standalone:
  15818. LEAL -8(DX), DX // 8 fewer bytes left
  15819. LEAL 8(SI), SI // 8 more bytes matched
  15820. CMPL DX, $0x08
  15821. JGE matchlen_loopback_standalone
  // NOTE(review): the flags here still come from CMPL DX, $8, so ZF is set
  // only when DX == 8, a case the JGE above has already taken; this JZ looks
  // unreachable. DX == 0 is still handled correctly by the tail checks below.
  15822. JZ gen_match_len_end
  15823. matchlen_match4_standalone: // try 4 more bytes
  15824. CMPL DX, $0x04
  15825. JL matchlen_match2_standalone
  15826. MOVL (AX)(SI*1), BX
  15827. CMPL (CX)(SI*1), BX
  15828. JNE matchlen_match2_standalone // mismatch somewhere in these 4 bytes: narrow down below
  15829. SUBL $0x04, DX
  15830. LEAL 4(SI), SI
  15831. matchlen_match2_standalone: // try 2 more bytes
  15832. CMPL DX, $0x02
  15833. JL matchlen_match1_standalone
  15834. MOVW (AX)(SI*1), BX
  15835. CMPW (CX)(SI*1), BX
  15836. JNE matchlen_match1_standalone
  15837. SUBL $0x02, DX
  15838. LEAL 2(SI), SI
  15839. matchlen_match1_standalone: // try 1 more byte
  15840. CMPL DX, $0x01
  15841. JL gen_match_len_end
  15842. MOVB (AX)(SI*1), BL
  15843. CMPB (CX)(SI*1), BL
  15844. JNE gen_match_len_end
  15845. LEAL 1(SI), SI
  15846. gen_match_len_end:
  15847. MOVQ SI, ret+48(FP) // return the number of matching bytes
  15848. RET