You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

4101 rivejä
80 KiB

  1. // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. // +build !appengine,!noasm,gc,!noasm
  4. // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  5. // Requires: CMOV
  //
  // Sequence decode loop. Each iteration writes one 24-byte record at the
  // pointer loaded from ctx+104 (literal length at +0, match length at +8,
  // offset at +16) and then advances that pointer by 24. The counter at ctx+96
  // is decremented once per iteration and the loop repeats while the result is
  // non-negative.
  //
  // Result stored at ret+24(FP):
  //   0  success; offset history and bit reader fields are written back
  //   1  offset is zero but match length is non-zero
  //   2  match length is above 0x20002
  //   4  the value at ctx+128 went negative after subtracting a literal length
  // The stores of 3 and 5 near the end follow a RET and have no label, so no
  // branch in this function reaches them. The error returns do not write the
  // registers back to s or br.
  //
  // Register use:
  //   DX             bit container (br+32)
  //   BX             count of bits already consumed from DX (br+40, one byte)
  //   SI             offset of the read pointer from the base pointer at br+0 (br+24)
  //   (SP)           read pointer = base pointer + SI
  //   DI, R8, R9     literal-length, match-length and offset state words (ctx+72/80/88)
  //   R10            output record pointer (ctx+104)
  //   R11, R12, R13  offset history, most recent first (s+144/152/160)
  //   AX, CX, R14, R15, BP  scratch
  //
  // State word layout as consumed below: bits 0-7 = bit count for the
  // next-state read, bits 8-15 = bit count of the extra value bits,
  // bits 16-31 = next-state base index, bits 32-63 = base value.
  6. TEXT ·sequenceDecs_decode_amd64(SB), $8-32
  7. MOVQ br+8(FP), AX
  8. MOVQ 32(AX), DX
  9. MOVBQZX 40(AX), BX
  10. MOVQ 24(AX), SI
  11. MOVQ (AX), AX
  12. ADDQ SI, AX
  13. MOVQ AX, (SP) // stack slot holds the read pointer between iterations
  14. MOVQ ctx+16(FP), AX
  15. MOVQ 72(AX), DI
  16. MOVQ 80(AX), R8
  17. MOVQ 88(AX), R9
  18. MOVQ 104(AX), R10
  19. MOVQ s+0(FP), AX
  20. MOVQ 144(AX), R11
  21. MOVQ 152(AX), R12
  22. MOVQ 160(AX), R13
  23. sequenceDecs_decode_amd64_main_loop:
  24. MOVQ (SP), R14 // R14 = read pointer for this iteration
  25. // Fill bitreader to have enough for the offset and match length.
  // Fast path (SI >= 8, signed compare): step the pointer back by BX/8 whole
  // bytes, reload 8 bytes and keep only the sub-byte part of the bit counter.
  26. CMPQ SI, $0x08
  27. JL sequenceDecs_decode_amd64_fill_byte_by_byte
  28. MOVQ BX, AX
  29. SHRQ $0x03, AX
  30. SUBQ AX, R14
  31. MOVQ (R14), DX
  32. SUBQ AX, SI
  33. ANDQ $0x07, BX
  34. JMP sequenceDecs_decode_amd64_fill_end
  // Slow path: while SI > 0 and BX > 7, shift one more byte into the low end
  // of DX and give 8 bits back to the counter.
  35. sequenceDecs_decode_amd64_fill_byte_by_byte:
  36. CMPQ SI, $0x00
  37. JLE sequenceDecs_decode_amd64_fill_end
  38. CMPQ BX, $0x07
  39. JLE sequenceDecs_decode_amd64_fill_end
  40. SHLQ $0x08, DX
  41. SUBQ $0x01, R14
  42. SUBQ $0x01, SI
  43. SUBQ $0x08, BX
  44. MOVBQZX (R14), AX
  45. ORQ AX, DX
  46. JMP sequenceDecs_decode_amd64_fill_byte_by_byte
  47. sequenceDecs_decode_amd64_fill_end:
  48. // Update offset: value = base value + extra bits read from the container
  49. MOVQ R9, AX
  50. MOVQ BX, CX
  51. MOVQ DX, R15
  52. SHLQ CL, R15 // discard the bits already consumed
  53. MOVB AH, CL // CL = extra-bit count (byte 1 of the state word)
  54. SHRQ $0x20, AX // AX = base value (upper 32 bits of the state word)
  55. TESTQ CX, CX
  56. JZ sequenceDecs_decode_amd64_of_update_zero // CX == 0: store the base value unchanged
  57. ADDQ CX, BX // count the bits being read
  58. CMPQ BX, $0x40
  59. JA sequenceDecs_decode_amd64_of_update_zero // counter above 64: nothing is added
  60. CMPQ CX, $0x40
  61. JAE sequenceDecs_decode_amd64_of_update_zero
  62. NEGQ CX
  63. SHRQ CL, R15 // shift by -n (64-n after count masking) leaves the top n bits
  64. ADDQ R15, AX
  65. sequenceDecs_decode_amd64_of_update_zero:
  66. MOVQ AX, 16(R10)
  67. // Update match length: same procedure using the state word in R8
  68. MOVQ R8, AX
  69. MOVQ BX, CX
  70. MOVQ DX, R15
  71. SHLQ CL, R15
  72. MOVB AH, CL
  73. SHRQ $0x20, AX
  74. TESTQ CX, CX
  75. JZ sequenceDecs_decode_amd64_ml_update_zero
  76. ADDQ CX, BX
  77. CMPQ BX, $0x40
  78. JA sequenceDecs_decode_amd64_ml_update_zero
  79. CMPQ CX, $0x40
  80. JAE sequenceDecs_decode_amd64_ml_update_zero
  81. NEGQ CX
  82. SHRQ CL, R15
  83. ADDQ R15, AX
  84. sequenceDecs_decode_amd64_ml_update_zero:
  85. MOVQ AX, 8(R10)
  86. // Fill bitreader to have enough for the remaining
  // reads (same two paths as the first fill).
  87. CMPQ SI, $0x08
  88. JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
  89. MOVQ BX, AX
  90. SHRQ $0x03, AX
  91. SUBQ AX, R14
  92. MOVQ (R14), DX
  93. SUBQ AX, SI
  94. ANDQ $0x07, BX
  95. JMP sequenceDecs_decode_amd64_fill_2_end
  96. sequenceDecs_decode_amd64_fill_2_byte_by_byte:
  97. CMPQ SI, $0x00
  98. JLE sequenceDecs_decode_amd64_fill_2_end
  99. CMPQ BX, $0x07
  100. JLE sequenceDecs_decode_amd64_fill_2_end
  101. SHLQ $0x08, DX
  102. SUBQ $0x01, R14
  103. SUBQ $0x01, SI
  104. SUBQ $0x08, BX
  105. MOVBQZX (R14), AX
  106. ORQ AX, DX
  107. JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
  108. sequenceDecs_decode_amd64_fill_2_end:
  109. // Update literal length: same procedure using the state word in DI
  110. MOVQ DI, AX
  111. MOVQ BX, CX
  112. MOVQ DX, R15
  113. SHLQ CL, R15
  114. MOVB AH, CL
  115. SHRQ $0x20, AX
  116. TESTQ CX, CX
  117. JZ sequenceDecs_decode_amd64_ll_update_zero
  118. ADDQ CX, BX
  119. CMPQ BX, $0x40
  120. JA sequenceDecs_decode_amd64_ll_update_zero
  121. CMPQ CX, $0x40
  122. JAE sequenceDecs_decode_amd64_ll_update_zero
  123. NEGQ CX
  124. SHRQ CL, R15
  125. ADDQ R15, AX
  126. sequenceDecs_decode_amd64_ll_update_zero:
  127. MOVQ AX, (R10)
  128. // Store the read pointer back to the stack slot (no fill happens here), then
  // set AX to the offset state's extra-bit count (byte 1 of R9) for the offset
  // adjustment below. The state updates are skipped when ctx+96 is zero.
  129. MOVQ R14, (SP)
  130. MOVQ R9, AX
  131. SHRQ $0x08, AX
  132. MOVBQZX AL, AX
  133. MOVQ ctx+16(FP), CX
  134. CMPQ 96(CX), $0x00
  135. JZ sequenceDecs_decode_amd64_skip_update
  136. // Update Literal Length State
  // index = bits 16-31 of the state word + the next R14 bits of the container
  137. MOVBQZX DI, R14 // R14 = bit count (byte 0 of the state word)
  138. SHRQ $0x10, DI
  139. MOVWQZX DI, DI // DI = next-state base index
  140. LEAQ (BX)(R14*1), CX
  141. MOVQ DX, R15
  142. MOVQ CX, BX // bit counter advances by R14
  143. ROLQ CL, R15 // rotate so the bits being read land in the low end
  144. MOVL $0x00000001, BP
  145. MOVB R14, CL
  146. SHLL CL, BP
  147. DECL BP // BP = (1 << bit count) - 1
  148. ANDQ BP, R15
  149. ADDQ R15, DI
  150. // Load ctx.llTable
  // (table pointer at ctx+0, 8-byte entries) and fetch the new state word.
  151. MOVQ ctx+16(FP), CX
  152. MOVQ (CX), CX
  153. MOVQ (CX)(DI*8), DI
  154. // Update Match Length State
  155. MOVBQZX R8, R14
  156. SHRQ $0x10, R8
  157. MOVWQZX R8, R8
  158. LEAQ (BX)(R14*1), CX
  159. MOVQ DX, R15
  160. MOVQ CX, BX
  161. ROLQ CL, R15
  162. MOVL $0x00000001, BP
  163. MOVB R14, CL
  164. SHLL CL, BP
  165. DECL BP
  166. ANDQ BP, R15
  167. ADDQ R15, R8
  168. // Load ctx.mlTable
  // (table pointer at ctx+24)
  169. MOVQ ctx+16(FP), CX
  170. MOVQ 24(CX), CX
  171. MOVQ (CX)(R8*8), R8
  172. // Update Offset State
  173. MOVBQZX R9, R14
  174. SHRQ $0x10, R9
  175. MOVWQZX R9, R9
  176. LEAQ (BX)(R14*1), CX
  177. MOVQ DX, R15
  178. MOVQ CX, BX
  179. ROLQ CL, R15
  180. MOVL $0x00000001, BP
  181. MOVB R14, CL
  182. SHLL CL, BP
  183. DECL BP
  184. ANDQ BP, R15
  185. ADDQ R15, R9
  186. // Load ctx.ofTable
  // (table pointer at ctx+48)
  187. MOVQ ctx+16(FP), CX
  188. MOVQ 48(CX), CX
  189. MOVQ (CX)(R9*8), R9
  190. sequenceDecs_decode_amd64_skip_update:
  191. // Adjust offset
  // CX = decoded offset, AX = extra-bit count of the offset state used for it.
  // More than one extra bit: CX is taken as-is and pushed onto the history.
  192. MOVQ 16(R10), CX
  193. CMPQ AX, $0x01
  194. JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
  195. MOVQ R12, R13
  196. MOVQ R11, R12
  197. MOVQ CX, R11
  198. JMP sequenceDecs_decode_amd64_after_adjust
  // Zero or one extra bit: CX selects from the history. A zero literal length
  // bumps the selector by one first.
  199. sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
  200. CMPQ (R10), $0x00000000
  201. JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
  202. INCQ CX
  203. JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
  204. sequenceDecs_decode_amd64_adjust_offset_maybezero:
  205. TESTQ CX, CX
  206. JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
  207. MOVQ R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
  208. JMP sequenceDecs_decode_amd64_after_adjust
  // Dispatch on CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  209. sequenceDecs_decode_amd64_adjust_offset_nonzero:
  210. CMPQ CX, $0x01
  211. JB sequenceDecs_decode_amd64_adjust_zero
  212. JEQ sequenceDecs_decode_amd64_adjust_one
  213. CMPQ CX, $0x02
  214. JA sequenceDecs_decode_amd64_adjust_three
  215. JMP sequenceDecs_decode_amd64_adjust_two
  216. sequenceDecs_decode_amd64_adjust_zero:
  217. MOVQ R11, AX
  218. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  219. sequenceDecs_decode_amd64_adjust_one:
  220. MOVQ R12, AX
  221. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  222. sequenceDecs_decode_amd64_adjust_two:
  223. MOVQ R13, AX
  224. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  225. sequenceDecs_decode_amd64_adjust_three:
  226. LEAQ -1(R11), AX
  227. sequenceDecs_decode_amd64_adjust_test_temp_valid:
  228. TESTQ AX, AX
  229. JNZ sequenceDecs_decode_amd64_adjust_temp_valid
  230. MOVQ $0x00000001, AX // a selected value of zero is replaced by 1
  231. sequenceDecs_decode_amd64_adjust_temp_valid:
  232. CMPQ CX, $0x01
  233. CMOVQNE R12, R13 // the oldest slot is overwritten unless the selector was 1
  234. MOVQ R11, R12
  235. MOVQ AX, R11
  236. MOVQ AX, CX
  237. sequenceDecs_decode_amd64_after_adjust:
  238. MOVQ CX, 16(R10)
  239. // Check values
  // AX = match length, R14 = literal length. Their sum is added to s+256 and
  // the literal length is subtracted from ctx+128 before any check is made.
  240. MOVQ 8(R10), AX
  241. MOVQ (R10), R14
  242. LEAQ (AX)(R14*1), R15
  243. MOVQ s+0(FP), BP
  244. ADDQ R15, 256(BP)
  245. MOVQ ctx+16(FP), R15
  246. SUBQ R14, 128(R15)
  247. JS error_not_enough_literals // ctx+128 became negative
  248. CMPQ AX, $0x00020002
  249. JA sequenceDecs_decode_amd64_error_match_len_too_big // unsigned compare
  250. TESTQ CX, CX
  251. JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
  252. TESTQ AX, AX
  253. JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // zero offset needs zero match length
  254. sequenceDecs_decode_amd64_match_len_ofs_ok:
  255. ADDQ $0x18, R10 // next 24-byte record
  256. MOVQ ctx+16(FP), AX
  257. DECQ 96(AX)
  258. JNS sequenceDecs_decode_amd64_main_loop // loop while the counter stays non-negative
  // Loop finished: write the offset history and bit reader fields back.
  259. MOVQ s+0(FP), AX
  260. MOVQ R11, 144(AX)
  261. MOVQ R12, 152(AX)
  262. MOVQ R13, 160(AX)
  263. MOVQ br+8(FP), AX
  264. MOVQ DX, 32(AX)
  265. MOVB BL, 40(AX)
  266. MOVQ SI, 24(AX)
  267. // Return success
  268. MOVQ $0x00000000, ret+24(FP)
  269. RET
  270. // Return with match length error
  271. sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
  272. MOVQ $0x00000001, ret+24(FP)
  273. RET
  274. // Return with match too long error
  275. sequenceDecs_decode_amd64_error_match_len_too_big:
  276. MOVQ $0x00000002, ret+24(FP)
  277. RET
  278. // Return with match offset too long error
  // (no label here and the previous instruction is RET: not reached in this function)
  279. MOVQ $0x00000003, ret+24(FP)
  280. RET
  281. // Return with not enough literals error
  282. error_not_enough_literals:
  283. MOVQ $0x00000004, ret+24(FP)
  284. RET
  285. // Return with not enough output space error
  // (no label here and the previous instruction is RET: not reached in this function)
  286. MOVQ $0x00000005, ret+24(FP)
  287. RET
  288. // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  289. // Requires: CMOV
  //
  // Sequence decode loop that refills the bit container only once per
  // iteration: the offset, match length and literal length are all read after
  // that single fill. Each iteration writes one 24-byte record at the pointer
  // loaded from ctx+104 (literal length at +0, match length at +8, offset at
  // +16) and advances the pointer by 24. The counter at ctx+96 is decremented
  // once per iteration and the loop repeats while the result is non-negative.
  //
  // Result stored at ret+24(FP):
  //   0  success; offset history and bit reader fields are written back
  //   1  offset is zero but match length is non-zero
  //   2  match length is above 0x20002
  //   4  the value at ctx+128 went negative after subtracting a literal length
  // The stores of 3 and 5 near the end follow a RET and have no label, so no
  // branch in this function reaches them. The error returns do not write the
  // registers back to s or br.
  //
  // Register use:
  //   DX             bit container (br+32)
  //   BX             count of bits already consumed from DX (br+40, one byte)
  //   SI             offset of the read pointer from the base pointer at br+0 (br+24)
  //   (SP)           read pointer = base pointer + SI
  //   DI, R8, R9     literal-length, match-length and offset state words (ctx+72/80/88)
  //   R10            output record pointer (ctx+104)
  //   R11, R12, R13  offset history, most recent first (s+144/152/160)
  //   AX, CX, R14, R15, BP  scratch
  //
  // State word layout as consumed below: bits 0-7 = bit count for the
  // next-state read, bits 8-15 = bit count of the extra value bits,
  // bits 16-31 = next-state base index, bits 32-63 = base value.
  290. TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
  291. MOVQ br+8(FP), AX
  292. MOVQ 32(AX), DX
  293. MOVBQZX 40(AX), BX
  294. MOVQ 24(AX), SI
  295. MOVQ (AX), AX
  296. ADDQ SI, AX
  297. MOVQ AX, (SP) // stack slot holds the read pointer between iterations
  298. MOVQ ctx+16(FP), AX
  299. MOVQ 72(AX), DI
  300. MOVQ 80(AX), R8
  301. MOVQ 88(AX), R9
  302. MOVQ 104(AX), R10
  303. MOVQ s+0(FP), AX
  304. MOVQ 144(AX), R11
  305. MOVQ 152(AX), R12
  306. MOVQ 160(AX), R13
  307. sequenceDecs_decode_56_amd64_main_loop:
  308. MOVQ (SP), R14 // R14 = read pointer for this iteration
  309. // Fill bitreader to have enough for the offset and match length.
  // This is the only fill in the iteration, so it also covers the literal
  // length read. Fast path (SI >= 8, signed compare): step the pointer back
  // by BX/8 whole bytes, reload 8 bytes and keep the sub-byte bit count.
  310. CMPQ SI, $0x08
  311. JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
  312. MOVQ BX, AX
  313. SHRQ $0x03, AX
  314. SUBQ AX, R14
  315. MOVQ (R14), DX
  316. SUBQ AX, SI
  317. ANDQ $0x07, BX
  318. JMP sequenceDecs_decode_56_amd64_fill_end
  // Slow path: while SI > 0 and BX > 7, shift one more byte into the low end
  // of DX and give 8 bits back to the counter.
  319. sequenceDecs_decode_56_amd64_fill_byte_by_byte:
  320. CMPQ SI, $0x00
  321. JLE sequenceDecs_decode_56_amd64_fill_end
  322. CMPQ BX, $0x07
  323. JLE sequenceDecs_decode_56_amd64_fill_end
  324. SHLQ $0x08, DX
  325. SUBQ $0x01, R14
  326. SUBQ $0x01, SI
  327. SUBQ $0x08, BX
  328. MOVBQZX (R14), AX
  329. ORQ AX, DX
  330. JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
  331. sequenceDecs_decode_56_amd64_fill_end:
  332. // Update offset: value = base value + extra bits read from the container
  333. MOVQ R9, AX
  334. MOVQ BX, CX
  335. MOVQ DX, R15
  336. SHLQ CL, R15 // discard the bits already consumed
  337. MOVB AH, CL // CL = extra-bit count (byte 1 of the state word)
  338. SHRQ $0x20, AX // AX = base value (upper 32 bits of the state word)
  339. TESTQ CX, CX
  340. JZ sequenceDecs_decode_56_amd64_of_update_zero // CX == 0: store the base value unchanged
  341. ADDQ CX, BX // count the bits being read
  342. CMPQ BX, $0x40
  343. JA sequenceDecs_decode_56_amd64_of_update_zero // counter above 64: nothing is added
  344. CMPQ CX, $0x40
  345. JAE sequenceDecs_decode_56_amd64_of_update_zero
  346. NEGQ CX
  347. SHRQ CL, R15 // shift by -n (64-n after count masking) leaves the top n bits
  348. ADDQ R15, AX
  349. sequenceDecs_decode_56_amd64_of_update_zero:
  350. MOVQ AX, 16(R10)
  351. // Update match length: same procedure using the state word in R8
  352. MOVQ R8, AX
  353. MOVQ BX, CX
  354. MOVQ DX, R15
  355. SHLQ CL, R15
  356. MOVB AH, CL
  357. SHRQ $0x20, AX
  358. TESTQ CX, CX
  359. JZ sequenceDecs_decode_56_amd64_ml_update_zero
  360. ADDQ CX, BX
  361. CMPQ BX, $0x40
  362. JA sequenceDecs_decode_56_amd64_ml_update_zero
  363. CMPQ CX, $0x40
  364. JAE sequenceDecs_decode_56_amd64_ml_update_zero
  365. NEGQ CX
  366. SHRQ CL, R15
  367. ADDQ R15, AX
  368. sequenceDecs_decode_56_amd64_ml_update_zero:
  369. MOVQ AX, 8(R10)
  370. // Update literal length: same procedure using the state word in DI (no refill before it)
  371. MOVQ DI, AX
  372. MOVQ BX, CX
  373. MOVQ DX, R15
  374. SHLQ CL, R15
  375. MOVB AH, CL
  376. SHRQ $0x20, AX
  377. TESTQ CX, CX
  378. JZ sequenceDecs_decode_56_amd64_ll_update_zero
  379. ADDQ CX, BX
  380. CMPQ BX, $0x40
  381. JA sequenceDecs_decode_56_amd64_ll_update_zero
  382. CMPQ CX, $0x40
  383. JAE sequenceDecs_decode_56_amd64_ll_update_zero
  384. NEGQ CX
  385. SHRQ CL, R15
  386. ADDQ R15, AX
  387. sequenceDecs_decode_56_amd64_ll_update_zero:
  388. MOVQ AX, (R10)
  389. // Store the read pointer back to the stack slot (no fill happens here), then
  // set AX to the offset state's extra-bit count (byte 1 of R9) for the offset
  // adjustment below. The state updates are skipped when ctx+96 is zero.
  390. MOVQ R14, (SP)
  391. MOVQ R9, AX
  392. SHRQ $0x08, AX
  393. MOVBQZX AL, AX
  394. MOVQ ctx+16(FP), CX
  395. CMPQ 96(CX), $0x00
  396. JZ sequenceDecs_decode_56_amd64_skip_update
  397. // Update Literal Length State
  // index = bits 16-31 of the state word + the next R14 bits of the container
  398. MOVBQZX DI, R14 // R14 = bit count (byte 0 of the state word)
  399. SHRQ $0x10, DI
  400. MOVWQZX DI, DI // DI = next-state base index
  401. LEAQ (BX)(R14*1), CX
  402. MOVQ DX, R15
  403. MOVQ CX, BX // bit counter advances by R14
  404. ROLQ CL, R15 // rotate so the bits being read land in the low end
  405. MOVL $0x00000001, BP
  406. MOVB R14, CL
  407. SHLL CL, BP
  408. DECL BP // BP = (1 << bit count) - 1
  409. ANDQ BP, R15
  410. ADDQ R15, DI
  411. // Load ctx.llTable
  // (table pointer at ctx+0, 8-byte entries) and fetch the new state word.
  412. MOVQ ctx+16(FP), CX
  413. MOVQ (CX), CX
  414. MOVQ (CX)(DI*8), DI
  415. // Update Match Length State
  416. MOVBQZX R8, R14
  417. SHRQ $0x10, R8
  418. MOVWQZX R8, R8
  419. LEAQ (BX)(R14*1), CX
  420. MOVQ DX, R15
  421. MOVQ CX, BX
  422. ROLQ CL, R15
  423. MOVL $0x00000001, BP
  424. MOVB R14, CL
  425. SHLL CL, BP
  426. DECL BP
  427. ANDQ BP, R15
  428. ADDQ R15, R8
  429. // Load ctx.mlTable
  // (table pointer at ctx+24)
  430. MOVQ ctx+16(FP), CX
  431. MOVQ 24(CX), CX
  432. MOVQ (CX)(R8*8), R8
  433. // Update Offset State
  434. MOVBQZX R9, R14
  435. SHRQ $0x10, R9
  436. MOVWQZX R9, R9
  437. LEAQ (BX)(R14*1), CX
  438. MOVQ DX, R15
  439. MOVQ CX, BX
  440. ROLQ CL, R15
  441. MOVL $0x00000001, BP
  442. MOVB R14, CL
  443. SHLL CL, BP
  444. DECL BP
  445. ANDQ BP, R15
  446. ADDQ R15, R9
  447. // Load ctx.ofTable
  // (table pointer at ctx+48)
  448. MOVQ ctx+16(FP), CX
  449. MOVQ 48(CX), CX
  450. MOVQ (CX)(R9*8), R9
  451. sequenceDecs_decode_56_amd64_skip_update:
  452. // Adjust offset
  // CX = decoded offset, AX = extra-bit count of the offset state used for it.
  // More than one extra bit: CX is taken as-is and pushed onto the history.
  453. MOVQ 16(R10), CX
  454. CMPQ AX, $0x01
  455. JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
  456. MOVQ R12, R13
  457. MOVQ R11, R12
  458. MOVQ CX, R11
  459. JMP sequenceDecs_decode_56_amd64_after_adjust
  // Zero or one extra bit: CX selects from the history. A zero literal length
  // bumps the selector by one first.
  460. sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
  461. CMPQ (R10), $0x00000000
  462. JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
  463. INCQ CX
  464. JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  465. sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
  466. TESTQ CX, CX
  467. JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  468. MOVQ R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
  469. JMP sequenceDecs_decode_56_amd64_after_adjust
  // Dispatch on CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  470. sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
  471. CMPQ CX, $0x01
  472. JB sequenceDecs_decode_56_amd64_adjust_zero
  473. JEQ sequenceDecs_decode_56_amd64_adjust_one
  474. CMPQ CX, $0x02
  475. JA sequenceDecs_decode_56_amd64_adjust_three
  476. JMP sequenceDecs_decode_56_amd64_adjust_two
  477. sequenceDecs_decode_56_amd64_adjust_zero:
  478. MOVQ R11, AX
  479. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  480. sequenceDecs_decode_56_amd64_adjust_one:
  481. MOVQ R12, AX
  482. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  483. sequenceDecs_decode_56_amd64_adjust_two:
  484. MOVQ R13, AX
  485. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  486. sequenceDecs_decode_56_amd64_adjust_three:
  487. LEAQ -1(R11), AX
  488. sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
  489. TESTQ AX, AX
  490. JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
  491. MOVQ $0x00000001, AX // a selected value of zero is replaced by 1
  492. sequenceDecs_decode_56_amd64_adjust_temp_valid:
  493. CMPQ CX, $0x01
  494. CMOVQNE R12, R13 // the oldest slot is overwritten unless the selector was 1
  495. MOVQ R11, R12
  496. MOVQ AX, R11
  497. MOVQ AX, CX
  498. sequenceDecs_decode_56_amd64_after_adjust:
  499. MOVQ CX, 16(R10)
  500. // Check values
  // AX = match length, R14 = literal length. Their sum is added to s+256 and
  // the literal length is subtracted from ctx+128 before any check is made.
  501. MOVQ 8(R10), AX
  502. MOVQ (R10), R14
  503. LEAQ (AX)(R14*1), R15
  504. MOVQ s+0(FP), BP
  505. ADDQ R15, 256(BP)
  506. MOVQ ctx+16(FP), R15
  507. SUBQ R14, 128(R15)
  508. JS error_not_enough_literals // ctx+128 became negative
  509. CMPQ AX, $0x00020002
  510. JA sequenceDecs_decode_56_amd64_error_match_len_too_big // unsigned compare
  511. TESTQ CX, CX
  512. JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
  513. TESTQ AX, AX
  514. JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // zero offset needs zero match length
  515. sequenceDecs_decode_56_amd64_match_len_ofs_ok:
  516. ADDQ $0x18, R10 // next 24-byte record
  517. MOVQ ctx+16(FP), AX
  518. DECQ 96(AX)
  519. JNS sequenceDecs_decode_56_amd64_main_loop // loop while the counter stays non-negative
  // Loop finished: write the offset history and bit reader fields back.
  520. MOVQ s+0(FP), AX
  521. MOVQ R11, 144(AX)
  522. MOVQ R12, 152(AX)
  523. MOVQ R13, 160(AX)
  524. MOVQ br+8(FP), AX
  525. MOVQ DX, 32(AX)
  526. MOVB BL, 40(AX)
  527. MOVQ SI, 24(AX)
  528. // Return success
  529. MOVQ $0x00000000, ret+24(FP)
  530. RET
  531. // Return with match length error
  532. sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
  533. MOVQ $0x00000001, ret+24(FP)
  534. RET
  535. // Return with match too long error
  536. sequenceDecs_decode_56_amd64_error_match_len_too_big:
  537. MOVQ $0x00000002, ret+24(FP)
  538. RET
  539. // Return with match offset too long error
  // (no label here and the previous instruction is RET: not reached in this function)
  540. MOVQ $0x00000003, ret+24(FP)
  541. RET
  542. // Return with not enough literals error
  543. error_not_enough_literals:
  544. MOVQ $0x00000004, ret+24(FP)
  545. RET
  546. // Return with not enough output space error
  // (no label here and the previous instruction is RET: not reached in this function)
  547. MOVQ $0x00000005, ret+24(FP)
  548. RET
  549. // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  550. // Requires: BMI, BMI2, CMOV
  //
  // Sequence decode loop using BEXTR/BZHI/SHRX for the bit-field work. Each
  // iteration writes one 24-byte record at the pointer loaded from ctx+104
  // (literal length at +0, match length at +8, offset at +16) and advances the
  // pointer by 24. The counter at ctx+96 is decremented once per iteration and
  // the loop repeats while the result is non-negative.
  //
  // Result stored at ret+24(FP):
  //   0  success; offset history and bit reader fields are written back
  //   1  offset is zero but match length is non-zero
  //   2  match length is above 0x20002
  //   4  the value at ctx+128 went negative after subtracting a literal length
  // The stores of 3 and 5 near the end follow a RET and have no label, so no
  // branch in this function reaches them. The error returns do not write the
  // registers back to s or br.
  //
  // Register use:
  //   AX             bit container (br+32)
  //   DX             count of bits already consumed from AX (br+40, one byte)
  //   BX             offset of the read pointer from the base pointer at br+0 (br+24)
  //   (SP)           read pointer = base pointer + BX
  //   SI, DI, R8     literal-length, match-length and offset state words (ctx+72/80/88)
  //   R9             output record pointer (ctx+104)
  //   R10, R11, R12  offset history, most recent first (s+144/152/160)
  //   R13            read pointer during the fills, then offset extra-bit count / temporaries
  //   CX, R14, R15, BP  scratch
  //
  // State word layout as consumed below: bits 0-7 = bit count for the
  // next-state read, bits 8-15 = bit count of the extra value bits,
  // bits 16-31 = next-state base index, bits 32-63 = base value.
  551. TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
  552. MOVQ br+8(FP), CX
  553. MOVQ 32(CX), AX
  554. MOVBQZX 40(CX), DX
  555. MOVQ 24(CX), BX
  556. MOVQ (CX), CX
  557. ADDQ BX, CX
  558. MOVQ CX, (SP) // stack slot holds the read pointer between iterations
  559. MOVQ ctx+16(FP), CX
  560. MOVQ 72(CX), SI
  561. MOVQ 80(CX), DI
  562. MOVQ 88(CX), R8
  563. MOVQ 104(CX), R9
  564. MOVQ s+0(FP), CX
  565. MOVQ 144(CX), R10
  566. MOVQ 152(CX), R11
  567. MOVQ 160(CX), R12
  568. sequenceDecs_decode_bmi2_main_loop:
  569. MOVQ (SP), R13 // R13 = read pointer for this iteration
  570. // Fill bitreader to have enough for the offset and match length.
  // Fast path (BX >= 8, signed compare): step the pointer back by DX/8 whole
  // bytes, reload 8 bytes and keep only the sub-byte part of the bit counter.
  571. CMPQ BX, $0x08
  572. JL sequenceDecs_decode_bmi2_fill_byte_by_byte
  573. MOVQ DX, CX
  574. SHRQ $0x03, CX
  575. SUBQ CX, R13
  576. MOVQ (R13), AX
  577. SUBQ CX, BX
  578. ANDQ $0x07, DX
  579. JMP sequenceDecs_decode_bmi2_fill_end
  // Slow path: while BX > 0 and DX > 7, shift one more byte into the low end
  // of AX and give 8 bits back to the counter.
  580. sequenceDecs_decode_bmi2_fill_byte_by_byte:
  581. CMPQ BX, $0x00
  582. JLE sequenceDecs_decode_bmi2_fill_end
  583. CMPQ DX, $0x07
  584. JLE sequenceDecs_decode_bmi2_fill_end
  585. SHLQ $0x08, AX
  586. SUBQ $0x01, R13
  587. SUBQ $0x01, BX
  588. SUBQ $0x08, DX
  589. MOVBQZX (R13), CX
  590. ORQ CX, AX
  591. JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
  592. sequenceDecs_decode_bmi2_fill_end:
  593. // Update offset: value = base value + extra bits read from the container
  594. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  595. BEXTRQ CX, R8, R14 // R14 = extra-bit count (byte 1 of the state word)
  596. MOVQ AX, R15
  597. LEAQ (DX)(R14*1), CX
  598. ROLQ CL, R15 // rotate so the bits being read land in the low end
  599. BZHIQ R14, R15, R15 // keep only the low R14 bits
  600. MOVQ CX, DX // bit counter advances by R14
  601. MOVQ R8, CX
  602. SHRQ $0x20, CX // base value (upper 32 bits of the state word)
  603. ADDQ R15, CX
  604. MOVQ CX, 16(R9)
  605. // Update match length: same procedure using the state word in DI
  606. MOVQ $0x00000808, CX
  607. BEXTRQ CX, DI, R14
  608. MOVQ AX, R15
  609. LEAQ (DX)(R14*1), CX
  610. ROLQ CL, R15
  611. BZHIQ R14, R15, R15
  612. MOVQ CX, DX
  613. MOVQ DI, CX
  614. SHRQ $0x20, CX
  615. ADDQ R15, CX
  616. MOVQ CX, 8(R9)
  617. // Fill bitreader to have enough for the remaining
  // reads (same two paths as the first fill).
  618. CMPQ BX, $0x08
  619. JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  620. MOVQ DX, CX
  621. SHRQ $0x03, CX
  622. SUBQ CX, R13
  623. MOVQ (R13), AX
  624. SUBQ CX, BX
  625. ANDQ $0x07, DX
  626. JMP sequenceDecs_decode_bmi2_fill_2_end
  627. sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
  628. CMPQ BX, $0x00
  629. JLE sequenceDecs_decode_bmi2_fill_2_end
  630. CMPQ DX, $0x07
  631. JLE sequenceDecs_decode_bmi2_fill_2_end
  632. SHLQ $0x08, AX
  633. SUBQ $0x01, R13
  634. SUBQ $0x01, BX
  635. SUBQ $0x08, DX
  636. MOVBQZX (R13), CX
  637. ORQ CX, AX
  638. JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  639. sequenceDecs_decode_bmi2_fill_2_end:
  640. // Update literal length: same procedure using the state word in SI
  641. MOVQ $0x00000808, CX
  642. BEXTRQ CX, SI, R14
  643. MOVQ AX, R15
  644. LEAQ (DX)(R14*1), CX
  645. ROLQ CL, R15
  646. BZHIQ R14, R15, R15
  647. MOVQ CX, DX
  648. MOVQ SI, CX
  649. SHRQ $0x20, CX
  650. ADDQ R15, CX
  651. MOVQ CX, (R9)
  652. // Store the read pointer back to the stack slot (no fill happens here), then
  // set R13 to the offset state's extra-bit count (byte 1 of R8) for the offset
  // adjustment below. The state updates are skipped when ctx+96 is zero.
  653. MOVQ R13, (SP)
  654. MOVQ $0x00000808, CX
  655. BEXTRQ CX, R8, R13
  656. MOVQ ctx+16(FP), CX
  657. CMPQ 96(CX), $0x00
  658. JZ sequenceDecs_decode_bmi2_skip_update
  // Read the bits for all three state updates in one go: R14 = low byte of
  // (SI + DI + R8), R15 = that many bits taken from the container.
  659. LEAQ (SI)(DI*1), R14
  660. ADDQ R8, R14
  661. MOVBQZX R14, R14
  662. LEAQ (DX)(R14*1), CX
  663. MOVQ AX, R15
  664. MOVQ CX, DX
  665. ROLQ CL, R15
  666. BZHIQ R14, R15, R15
  667. // Update Offset State
  // index = bits 16-31 of the state word + the low bits of R15; R15 is then
  // shifted right so the next state's bits become the low bits.
  668. BZHIQ R8, R15, CX
  669. SHRXQ R8, R15, R15
  670. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  671. BEXTRQ R14, R8, R8
  672. ADDQ CX, R8
  673. // Load ctx.ofTable
  // (table pointer at ctx+48, 8-byte entries) and fetch the new state word.
  674. MOVQ ctx+16(FP), CX
  675. MOVQ 48(CX), CX
  676. MOVQ (CX)(R8*8), R8
  677. // Update Match Length State
  678. BZHIQ DI, R15, CX
  679. SHRXQ DI, R15, R15
  680. MOVQ $0x00001010, R14
  681. BEXTRQ R14, DI, DI
  682. ADDQ CX, DI
  683. // Load ctx.mlTable
  // (table pointer at ctx+24)
  684. MOVQ ctx+16(FP), CX
  685. MOVQ 24(CX), CX
  686. MOVQ (CX)(DI*8), DI
  687. // Update Literal Length State
  // (last consumer of R15, so no shift follows)
  688. BZHIQ SI, R15, CX
  689. MOVQ $0x00001010, R14
  690. BEXTRQ R14, SI, SI
  691. ADDQ CX, SI
  692. // Load ctx.llTable
  // (table pointer at ctx+0)
  693. MOVQ ctx+16(FP), CX
  694. MOVQ (CX), CX
  695. MOVQ (CX)(SI*8), SI
  696. sequenceDecs_decode_bmi2_skip_update:
  697. // Adjust offset
  // CX = decoded offset, R13 = extra-bit count of the offset state used for it.
  // More than one extra bit: CX is taken as-is and pushed onto the history.
  698. MOVQ 16(R9), CX
  699. CMPQ R13, $0x01
  700. JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
  701. MOVQ R11, R12
  702. MOVQ R10, R11
  703. MOVQ CX, R10
  704. JMP sequenceDecs_decode_bmi2_after_adjust
  // Zero or one extra bit: CX selects from the history. A zero literal length
  // bumps the selector by one first.
  705. sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
  706. CMPQ (R9), $0x00000000
  707. JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
  708. INCQ CX
  709. JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
  710. sequenceDecs_decode_bmi2_adjust_offset_maybezero:
  711. TESTQ CX, CX
  712. JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
  713. MOVQ R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
  714. JMP sequenceDecs_decode_bmi2_after_adjust
  // Dispatch on CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10 - 1.
  715. sequenceDecs_decode_bmi2_adjust_offset_nonzero:
  716. CMPQ CX, $0x01
  717. JB sequenceDecs_decode_bmi2_adjust_zero
  718. JEQ sequenceDecs_decode_bmi2_adjust_one
  719. CMPQ CX, $0x02
  720. JA sequenceDecs_decode_bmi2_adjust_three
  721. JMP sequenceDecs_decode_bmi2_adjust_two
  722. sequenceDecs_decode_bmi2_adjust_zero:
  723. MOVQ R10, R13
  724. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  725. sequenceDecs_decode_bmi2_adjust_one:
  726. MOVQ R11, R13
  727. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  728. sequenceDecs_decode_bmi2_adjust_two:
  729. MOVQ R12, R13
  730. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  731. sequenceDecs_decode_bmi2_adjust_three:
  732. LEAQ -1(R10), R13
  733. sequenceDecs_decode_bmi2_adjust_test_temp_valid:
  734. TESTQ R13, R13
  735. JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
  736. MOVQ $0x00000001, R13 // a selected value of zero is replaced by 1
  737. sequenceDecs_decode_bmi2_adjust_temp_valid:
  738. CMPQ CX, $0x01
  739. CMOVQNE R11, R12 // the oldest slot is overwritten unless the selector was 1
  740. MOVQ R10, R11
  741. MOVQ R13, R10
  742. MOVQ R13, CX
  743. sequenceDecs_decode_bmi2_after_adjust:
  744. MOVQ CX, 16(R9)
  745. // Check values
  // R13 = match length, R14 = literal length. Their sum is added to s+256 and
  // the literal length is subtracted from ctx+128 before any check is made.
  746. MOVQ 8(R9), R13
  747. MOVQ (R9), R14
  748. LEAQ (R13)(R14*1), R15
  749. MOVQ s+0(FP), BP
  750. ADDQ R15, 256(BP)
  751. MOVQ ctx+16(FP), R15
  752. SUBQ R14, 128(R15)
  753. JS error_not_enough_literals // ctx+128 became negative
  754. CMPQ R13, $0x00020002
  755. JA sequenceDecs_decode_bmi2_error_match_len_too_big // unsigned compare
  756. TESTQ CX, CX
  757. JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
  758. TESTQ R13, R13
  759. JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // zero offset needs zero match length
  760. sequenceDecs_decode_bmi2_match_len_ofs_ok:
  761. ADDQ $0x18, R9 // next 24-byte record
  762. MOVQ ctx+16(FP), CX
  763. DECQ 96(CX)
  764. JNS sequenceDecs_decode_bmi2_main_loop // loop while the counter stays non-negative
  // Loop finished: write the offset history and bit reader fields back.
  765. MOVQ s+0(FP), CX
  766. MOVQ R10, 144(CX)
  767. MOVQ R11, 152(CX)
  768. MOVQ R12, 160(CX)
  769. MOVQ br+8(FP), CX
  770. MOVQ AX, 32(CX)
  771. MOVB DL, 40(CX)
  772. MOVQ BX, 24(CX)
  773. // Return success
  774. MOVQ $0x00000000, ret+24(FP)
  775. RET
  776. // Return with match length error
  777. sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
  778. MOVQ $0x00000001, ret+24(FP)
  779. RET
  780. // Return with match too long error
  781. sequenceDecs_decode_bmi2_error_match_len_too_big:
  782. MOVQ $0x00000002, ret+24(FP)
  783. RET
  784. // Return with match offset too long error
  // (no label here and the previous instruction is RET: not reached in this function)
  785. MOVQ $0x00000003, ret+24(FP)
  786. RET
  787. // Return with not enough literals error
  788. error_not_enough_literals:
  789. MOVQ $0x00000004, ret+24(FP)
  790. RET
  791. // Return with not enough output space error
  // (no label here and the previous instruction is RET: not reached in this function)
  792. MOVQ $0x00000005, ret+24(FP)
  793. RET
  794. // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  795. // Requires: BMI, BMI2, CMOV
  //
  // Sequence decode loop using BEXTR/BZHI/SHRX that refills the bit container
  // only once per iteration: the offset, match length and literal length are
  // all read after that single fill. Each iteration writes one 24-byte record
  // at the pointer loaded from ctx+104 (literal length at +0, match length at
  // +8, offset at +16) and advances the pointer by 24. The counter at ctx+96
  // is decremented once per iteration and the loop repeats while the result is
  // non-negative.
  //
  // Result stored at ret+24(FP):
  //   0  success; offset history and bit reader fields are written back
  //   1  offset is zero but match length is non-zero
  //   2  match length is above 0x20002
  //   4  the value at ctx+128 went negative after subtracting a literal length
  // The stores of 3 and 5 near the end follow a RET and have no label, so no
  // branch in this function reaches them. The error returns do not write the
  // registers back to s or br.
  //
  // Register use:
  //   AX             bit container (br+32)
  //   DX             count of bits already consumed from AX (br+40, one byte)
  //   BX             offset of the read pointer from the base pointer at br+0 (br+24)
  //   (SP)           read pointer = base pointer + BX
  //   SI, DI, R8     literal-length, match-length and offset state words (ctx+72/80/88)
  //   R9             output record pointer (ctx+104)
  //   R10, R11, R12  offset history, most recent first (s+144/152/160)
  //   R13            read pointer during the fill, then offset extra-bit count / temporaries
  //   CX, R14, R15, BP  scratch
  //
  // State word layout as consumed below: bits 0-7 = bit count for the
  // next-state read, bits 8-15 = bit count of the extra value bits,
  // bits 16-31 = next-state base index, bits 32-63 = base value.
  796. TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
  797. MOVQ br+8(FP), CX
  798. MOVQ 32(CX), AX
  799. MOVBQZX 40(CX), DX
  800. MOVQ 24(CX), BX
  801. MOVQ (CX), CX
  802. ADDQ BX, CX
  803. MOVQ CX, (SP) // stack slot holds the read pointer between iterations
  804. MOVQ ctx+16(FP), CX
  805. MOVQ 72(CX), SI
  806. MOVQ 80(CX), DI
  807. MOVQ 88(CX), R8
  808. MOVQ 104(CX), R9
  809. MOVQ s+0(FP), CX
  810. MOVQ 144(CX), R10
  811. MOVQ 152(CX), R11
  812. MOVQ 160(CX), R12
  813. sequenceDecs_decode_56_bmi2_main_loop:
  814. MOVQ (SP), R13 // R13 = read pointer for this iteration
  815. // Fill bitreader to have enough for the offset and match length.
  // This is the only fill in the iteration, so it also covers the literal
  // length read. Fast path (BX >= 8, signed compare): step the pointer back
  // by DX/8 whole bytes, reload 8 bytes and keep the sub-byte bit count.
  816. CMPQ BX, $0x08
  817. JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  818. MOVQ DX, CX
  819. SHRQ $0x03, CX
  820. SUBQ CX, R13
  821. MOVQ (R13), AX
  822. SUBQ CX, BX
  823. ANDQ $0x07, DX
  824. JMP sequenceDecs_decode_56_bmi2_fill_end
  // Slow path: while BX > 0 and DX > 7, shift one more byte into the low end
  // of AX and give 8 bits back to the counter.
  825. sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
  826. CMPQ BX, $0x00
  827. JLE sequenceDecs_decode_56_bmi2_fill_end
  828. CMPQ DX, $0x07
  829. JLE sequenceDecs_decode_56_bmi2_fill_end
  830. SHLQ $0x08, AX
  831. SUBQ $0x01, R13
  832. SUBQ $0x01, BX
  833. SUBQ $0x08, DX
  834. MOVBQZX (R13), CX
  835. ORQ CX, AX
  836. JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  837. sequenceDecs_decode_56_bmi2_fill_end:
  838. // Update offset: value = base value + extra bits read from the container
  839. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  840. BEXTRQ CX, R8, R14 // R14 = extra-bit count (byte 1 of the state word)
  841. MOVQ AX, R15
  842. LEAQ (DX)(R14*1), CX
  843. ROLQ CL, R15 // rotate so the bits being read land in the low end
  844. BZHIQ R14, R15, R15 // keep only the low R14 bits
  845. MOVQ CX, DX // bit counter advances by R14
  846. MOVQ R8, CX
  847. SHRQ $0x20, CX // base value (upper 32 bits of the state word)
  848. ADDQ R15, CX
  849. MOVQ CX, 16(R9)
  850. // Update match length: same procedure using the state word in DI
  851. MOVQ $0x00000808, CX
  852. BEXTRQ CX, DI, R14
  853. MOVQ AX, R15
  854. LEAQ (DX)(R14*1), CX
  855. ROLQ CL, R15
  856. BZHIQ R14, R15, R15
  857. MOVQ CX, DX
  858. MOVQ DI, CX
  859. SHRQ $0x20, CX
  860. ADDQ R15, CX
  861. MOVQ CX, 8(R9)
  862. // Update literal length: same procedure using the state word in SI (no refill before it)
  863. MOVQ $0x00000808, CX
  864. BEXTRQ CX, SI, R14
  865. MOVQ AX, R15
  866. LEAQ (DX)(R14*1), CX
  867. ROLQ CL, R15
  868. BZHIQ R14, R15, R15
  869. MOVQ CX, DX
  870. MOVQ SI, CX
  871. SHRQ $0x20, CX
  872. ADDQ R15, CX
  873. MOVQ CX, (R9)
  874. // Store the read pointer back to the stack slot (no fill happens here), then
  // set R13 to the offset state's extra-bit count (byte 1 of R8) for the offset
  // adjustment below. The state updates are skipped when ctx+96 is zero.
  875. MOVQ R13, (SP)
  876. MOVQ $0x00000808, CX
  877. BEXTRQ CX, R8, R13
  878. MOVQ ctx+16(FP), CX
  879. CMPQ 96(CX), $0x00
  880. JZ sequenceDecs_decode_56_bmi2_skip_update
  // Read the bits for all three state updates in one go: R14 = low byte of
  // (SI + DI + R8), R15 = that many bits taken from the container.
  881. LEAQ (SI)(DI*1), R14
  882. ADDQ R8, R14
  883. MOVBQZX R14, R14
  884. LEAQ (DX)(R14*1), CX
  885. MOVQ AX, R15
  886. MOVQ CX, DX
  887. ROLQ CL, R15
  888. BZHIQ R14, R15, R15
  889. // Update Offset State
  // index = bits 16-31 of the state word + the low bits of R15; R15 is then
  // shifted right so the next state's bits become the low bits.
  890. BZHIQ R8, R15, CX
  891. SHRXQ R8, R15, R15
  892. MOVQ $0x00001010, R14 // BEXTR control: start bit 16, length 16
  893. BEXTRQ R14, R8, R8
  894. ADDQ CX, R8
  895. // Load ctx.ofTable
  // (table pointer at ctx+48, 8-byte entries) and fetch the new state word.
  896. MOVQ ctx+16(FP), CX
  897. MOVQ 48(CX), CX
  898. MOVQ (CX)(R8*8), R8
  899. // Update Match Length State
  900. BZHIQ DI, R15, CX
  901. SHRXQ DI, R15, R15
  902. MOVQ $0x00001010, R14
  903. BEXTRQ R14, DI, DI
  904. ADDQ CX, DI
  905. // Load ctx.mlTable
  // (table pointer at ctx+24)
  906. MOVQ ctx+16(FP), CX
  907. MOVQ 24(CX), CX
  908. MOVQ (CX)(DI*8), DI
  909. // Update Literal Length State
  // (last consumer of R15, so no shift follows)
  910. BZHIQ SI, R15, CX
  911. MOVQ $0x00001010, R14
  912. BEXTRQ R14, SI, SI
  913. ADDQ CX, SI
  914. // Load ctx.llTable
  // (table pointer at ctx+0)
  915. MOVQ ctx+16(FP), CX
  916. MOVQ (CX), CX
  917. MOVQ (CX)(SI*8), SI
  918. sequenceDecs_decode_56_bmi2_skip_update:
  919. // Adjust offset
  // CX = decoded offset, R13 = extra-bit count of the offset state used for it.
  // More than one extra bit: CX is taken as-is and pushed onto the history.
  920. MOVQ 16(R9), CX
  921. CMPQ R13, $0x01
  922. JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
  923. MOVQ R11, R12
  924. MOVQ R10, R11
  925. MOVQ CX, R10
  926. JMP sequenceDecs_decode_56_bmi2_after_adjust
  // Zero or one extra bit: CX selects from the history. A zero literal length
  // bumps the selector by one first.
  927. sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  928. CMPQ (R9), $0x00000000
  929. JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  930. INCQ CX
  931. JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  932. sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  933. TESTQ CX, CX
  934. JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  935. MOVQ R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
  936. JMP sequenceDecs_decode_56_bmi2_after_adjust
  // Dispatch on CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10 - 1.
  937. sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  938. CMPQ CX, $0x01
  939. JB sequenceDecs_decode_56_bmi2_adjust_zero
  940. JEQ sequenceDecs_decode_56_bmi2_adjust_one
  941. CMPQ CX, $0x02
  942. JA sequenceDecs_decode_56_bmi2_adjust_three
  943. JMP sequenceDecs_decode_56_bmi2_adjust_two
  944. sequenceDecs_decode_56_bmi2_adjust_zero:
  945. MOVQ R10, R13
  946. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  947. sequenceDecs_decode_56_bmi2_adjust_one:
  948. MOVQ R11, R13
  949. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  950. sequenceDecs_decode_56_bmi2_adjust_two:
  951. MOVQ R12, R13
  952. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  953. sequenceDecs_decode_56_bmi2_adjust_three:
  954. LEAQ -1(R10), R13
  955. sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  956. TESTQ R13, R13
  957. JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
  958. MOVQ $0x00000001, R13 // a selected value of zero is replaced by 1
  959. sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  960. CMPQ CX, $0x01
  961. CMOVQNE R11, R12 // the oldest slot is overwritten unless the selector was 1
  962. MOVQ R10, R11
  963. MOVQ R13, R10
  964. MOVQ R13, CX
  965. sequenceDecs_decode_56_bmi2_after_adjust:
  966. MOVQ CX, 16(R9)
  967. // Check values
  // R13 = match length, R14 = literal length. Their sum is added to s+256 and
  // the literal length is subtracted from ctx+128 before any check is made.
  968. MOVQ 8(R9), R13
  969. MOVQ (R9), R14
  970. LEAQ (R13)(R14*1), R15
  971. MOVQ s+0(FP), BP
  972. ADDQ R15, 256(BP)
  973. MOVQ ctx+16(FP), R15
  974. SUBQ R14, 128(R15)
  975. JS error_not_enough_literals // ctx+128 became negative
  976. CMPQ R13, $0x00020002
  977. JA sequenceDecs_decode_56_bmi2_error_match_len_too_big // unsigned compare
  978. TESTQ CX, CX
  979. JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  980. TESTQ R13, R13
  981. JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // zero offset needs zero match length
  982. sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  983. ADDQ $0x18, R9 // next 24-byte record
  984. MOVQ ctx+16(FP), CX
  985. DECQ 96(CX)
  986. JNS sequenceDecs_decode_56_bmi2_main_loop // loop while the counter stays non-negative
  // Loop finished: write the offset history and bit reader fields back.
  987. MOVQ s+0(FP), CX
  988. MOVQ R10, 144(CX)
  989. MOVQ R11, 152(CX)
  990. MOVQ R12, 160(CX)
  991. MOVQ br+8(FP), CX
  992. MOVQ AX, 32(CX)
  993. MOVB DL, 40(CX)
  994. MOVQ BX, 24(CX)
  995. // Return success
  996. MOVQ $0x00000000, ret+24(FP)
  997. RET
  998. // Return with match length error
  999. sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1000. MOVQ $0x00000001, ret+24(FP)
  1001. RET
  1002. // Return with match too long error
  1003. sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1004. MOVQ $0x00000002, ret+24(FP)
  1005. RET
  1006. // Return with match offset too long error
  // (no label here and the previous instruction is RET: not reached in this function)
  1007. MOVQ $0x00000003, ret+24(FP)
  1008. RET
  1009. // Return with not enough literals error
  1010. error_not_enough_literals:
  1011. MOVQ $0x00000004, ret+24(FP)
  1012. RET
  1013. // Return with not enough output space error
  // (no label here and the previous instruction is RET: not reached in this function)
  1014. MOVQ $0x00000005, ret+24(FP)
  1015. RET
  1016. // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1017. // Requires: SSE
  //
  // Executes already-decoded sequences: for each 24-byte record it copies the
  // literal bytes and then the match bytes into the output buffer.
  // Record layout as read below: +0 literal length, +8 match length (ml),
  // +16 match offset (mo).
  //
  // Returns 1 when every sequence was processed (or the sequence count is 0) and
  // 0 when a match offset is larger than outPosition+len(hist) or larger than the
  // limit loaded from ctx+120. Except for the empty case, both exits write back
  // the sequence index (ctx+24), the output position (ctx+104) and the number of
  // literal bytes consumed (ctx+112).
  //
  // The literal copy and the non-overlapping match copy move whole 16-byte
  // chunks, so they may read and write up to 15 bytes past the requested length.
  // NOTE(review): presumably the caller guarantees that slack in both buffers;
  // sequenceDecs_executeSimple_safe_amd64 uses exact-size copies instead - confirm.
  //
  // BP is used as a scratch register in copy_all_from_history.
  // NOTE(review): presumably the non-zero frame size ($8) is there so that the
  // assembler-generated prologue preserves BP - confirm.
  //
  // Register roles inside the loop:
  //   AX  pointer to the current sequence record   DX  sequence index
  //   CX  sequence count                            BX  output write pointer
  //   SI  literals read pointer                     DI  output position
  //   R8  match offset limit                        R9  one past the end of history
  //   R10 history length                            R11/R12/R13 ll / mo / ml
  1018. TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1019. MOVQ ctx+0(FP), R10
  1020. MOVQ 8(R10), CX // CX = sequence count (loop bound for DX)
  1021. TESTQ CX, CX
  1022. JZ empty_seqs // nothing to do: return true without touching ctx
  1023. MOVQ (R10), AX // AX = base of the sequence records
  1024. MOVQ 24(R10), DX // DX = index of the next sequence
  1025. MOVQ 32(R10), BX // BX = output base
  1026. MOVQ 80(R10), SI // SI = literals read pointer
  1027. MOVQ 104(R10), DI // DI = output position
  1028. MOVQ 120(R10), R8 // R8 = upper limit for match offsets
  1029. MOVQ 56(R10), R9 // R9 = history base
  1030. MOVQ 64(R10), R10 // R10 = history length (the ctx pointer is dropped here)
  1031. ADDQ R10, R9 // R9 = one past the end of history
  1032. // seqsBase += 24 * seqIndex
  1033. LEAQ (DX)(DX*2), R11
  1034. SHLQ $0x03, R11
  1035. ADDQ R11, AX
  1036. // outBase += outPosition
  1037. ADDQ DI, BX
  1038. main_loop:
  1039. MOVQ (AX), R11 // R11 = literal length
  1040. MOVQ 16(AX), R12 // R12 = match offset (mo)
  1041. MOVQ 8(AX), R13 // R13 = match length (ml)
  1042. // Copy literals in 16-byte chunks; the last chunk may overshoot R11 by up to 15 bytes
  1043. TESTQ R11, R11
  1044. JZ check_offset
  1045. XORQ R14, R14 // R14 = bytes copied so far
  1046. copy_1:
  1047. MOVUPS (SI)(R14*1), X0
  1048. MOVUPS X0, (BX)(R14*1)
  1049. ADDQ $0x10, R14
  1050. CMPQ R14, R11
  1051. JB copy_1
  1052. ADDQ R11, SI // advance all cursors by exactly the literal length
  1053. ADDQ R11, BX
  1054. ADDQ R11, DI
  1055. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1056. check_offset:
  1057. LEAQ (DI)(R10*1), R11 // R11 = output position + history length
  1058. CMPQ R12, R11
  1059. JG error_match_off_too_big // signed compare
  1060. CMPQ R12, R8
  1061. JG error_match_off_too_big // signed compare
  1062. // Copy match from history
  1063. MOVQ R12, R11
  1064. SUBQ DI, R11 // R11 = mo - output position = bytes the match reaches back into history
  1065. JLS copy_match // mo <= output position: the match lies entirely in the output buffer
  1066. MOVQ R9, R14
  1067. SUBQ R11, R14 // R14 = source pointer inside history
  1068. CMPQ R13, R11
  1069. JG copy_all_from_history // ml > R11: the match continues into the output buffer
  // The whole match comes from history: exact-size copy of R13 bytes.
  // NOTE(review): there is no 1-or-2 byte case here, so this presumably relies
  // on ml >= 3 - confirm.
  1070. MOVQ R13, R11
  1071. SUBQ $0x10, R11
  1072. JB copy_4_small // ml < 16
  1073. copy_4_loop:
  1074. MOVUPS (R14), X0
  1075. MOVUPS X0, (BX)
  1076. ADDQ $0x10, R14
  1077. ADDQ $0x10, BX
  1078. SUBQ $0x10, R11
  1079. JAE copy_4_loop
  1080. LEAQ 16(R14)(R11*1), R14 // R11 is now remainder-16: move both pointers to the exact end
  1081. LEAQ 16(BX)(R11*1), BX
  1082. MOVUPS -16(R14), X0 // copy the final 16 bytes, overlapping what was already copied
  1083. MOVUPS X0, -16(BX)
  1084. JMP copy_4_end
  1085. copy_4_small:
  1086. CMPQ R13, $0x03
  1087. JE copy_4_move_3
  1088. CMPQ R13, $0x08
  1089. JB copy_4_move_4through7
  1090. JMP copy_4_move_8through16
  1091. copy_4_move_3:
  1092. MOVW (R14), R11 // R11 and R12 are free to use as scratch from here on
  1093. MOVB 2(R14), R12
  1094. MOVW R11, (BX)
  1095. MOVB R12, 2(BX)
  1096. ADDQ R13, R14
  1097. ADDQ R13, BX
  1098. JMP copy_4_end
  1099. copy_4_move_4through7:
  1100. MOVL (R14), R11 // first 4 bytes
  1101. MOVL -4(R14)(R13*1), R12 // last 4 bytes (the two moves may overlap)
  1102. MOVL R11, (BX)
  1103. MOVL R12, -4(BX)(R13*1)
  1104. ADDQ R13, R14
  1105. ADDQ R13, BX
  1106. JMP copy_4_end
  1107. copy_4_move_8through16:
  1108. MOVQ (R14), R11 // first 8 bytes
  1109. MOVQ -8(R14)(R13*1), R12 // last 8 bytes (the two moves may overlap)
  1110. MOVQ R11, (BX)
  1111. MOVQ R12, -8(BX)(R13*1)
  1112. ADDQ R13, R14
  1113. ADDQ R13, BX
  1114. copy_4_end:
  1115. ADDQ R13, DI // output position += ml
  1116. ADDQ $0x18, AX // next 24-byte sequence record
  1117. INCQ DX
  1118. CMPQ DX, CX
  1119. JB main_loop
  1120. JMP loop_finished
  // The match starts in history and continues in the output buffer:
  // exact-size copy of the R11 bytes that come from history.
  1121. copy_all_from_history:
  1122. MOVQ R11, R15
  1123. SUBQ $0x10, R15
  1124. JB copy_5_small // fewer than 16 bytes come from history
  1125. copy_5_loop:
  1126. MOVUPS (R14), X0
  1127. MOVUPS X0, (BX)
  1128. ADDQ $0x10, R14
  1129. ADDQ $0x10, BX
  1130. SUBQ $0x10, R15
  1131. JAE copy_5_loop
  1132. LEAQ 16(R14)(R15*1), R14 // move both pointers to the exact end
  1133. LEAQ 16(BX)(R15*1), BX
  1134. MOVUPS -16(R14), X0 // final 16 bytes, overlapping what was already copied
  1135. MOVUPS X0, -16(BX)
  1136. JMP copy_5_end
  1137. copy_5_small:
  1138. CMPQ R11, $0x03
  1139. JE copy_5_move_3
  1140. JB copy_5_move_1or2
  1141. CMPQ R11, $0x08
  1142. JB copy_5_move_4through7
  1143. JMP copy_5_move_8through16
  1144. copy_5_move_1or2:
  1145. MOVB (R14), R15 // first byte
  1146. MOVB -1(R14)(R11*1), BP // last byte (the same byte when R11 == 1); BP is scratch
  1147. MOVB R15, (BX)
  1148. MOVB BP, -1(BX)(R11*1)
  1149. ADDQ R11, R14
  1150. ADDQ R11, BX
  1151. JMP copy_5_end
  1152. copy_5_move_3:
  1153. MOVW (R14), R15
  1154. MOVB 2(R14), BP
  1155. MOVW R15, (BX)
  1156. MOVB BP, 2(BX)
  1157. ADDQ R11, R14
  1158. ADDQ R11, BX
  1159. JMP copy_5_end
  1160. copy_5_move_4through7:
  1161. MOVL (R14), R15
  1162. MOVL -4(R14)(R11*1), BP
  1163. MOVL R15, (BX)
  1164. MOVL BP, -4(BX)(R11*1)
  1165. ADDQ R11, R14
  1166. ADDQ R11, BX
  1167. JMP copy_5_end
  1168. copy_5_move_8through16:
  1169. MOVQ (R14), R15
  1170. MOVQ -8(R14)(R11*1), BP
  1171. MOVQ R15, (BX)
  1172. MOVQ BP, -8(BX)(R11*1)
  1173. ADDQ R11, R14
  1174. ADDQ R11, BX
  1175. copy_5_end:
  1176. ADDQ R11, DI // output position += bytes taken from history
  1177. SUBQ R11, R13 // R13 = match bytes still to copy from the output buffer
  1178. // Copy match from the current buffer
  1179. copy_match:
  1180. MOVQ BX, R11
  1181. SUBQ R12, R11 // R11 = source pointer = write pointer - mo
  1182. // ml <= mo
  1183. CMPQ R13, R12
  1184. JA copy_overlapping_match // ml > mo: source and destination overlap
  1185. // Copy non-overlapping match
  1186. ADDQ R13, DI
  1187. MOVQ BX, R12 // R12 = destination cursor; BX is advanced to its final value up front
  1188. ADDQ R13, BX
  1189. copy_2:
  1190. MOVUPS (R11), X0
  1191. MOVUPS X0, (R12)
  1192. ADDQ $0x10, R11
  1193. ADDQ $0x10, R12
  1194. SUBQ $0x10, R13
  1195. JHI copy_2 // loop while bytes remain; the last chunk may overshoot by up to 15 bytes
  1196. JMP handle_loop
  1197. // Copy overlapping match
  1198. copy_overlapping_match:
  1199. ADDQ R13, DI
  1200. copy_slow_3:
  1201. MOVB (R11), R12 // byte-by-byte so that bytes written earlier in this loop are re-read
  1202. MOVB R12, (BX)
  1203. INCQ R11
  1204. INCQ BX
  1205. DECQ R13
  1206. JNZ copy_slow_3
  1207. handle_loop:
  1208. ADDQ $0x18, AX // next 24-byte sequence record
  1209. INCQ DX
  1210. CMPQ DX, CX
  1211. JB main_loop
  1212. loop_finished:
  1213. // Return value
  1214. MOVB $0x01, ret+8(FP)
  1215. // Update the context
  1216. MOVQ ctx+0(FP), AX
  1217. MOVQ DX, 24(AX) // sequence index
  1218. MOVQ DI, 104(AX) // output position
  1219. MOVQ 80(AX), CX
  1220. SUBQ CX, SI // literal bytes consumed = current literals pointer - initial pointer
  1221. MOVQ SI, 112(AX)
  1222. RET
  1223. error_match_off_too_big:
  1224. // Return value
  1225. MOVB $0x00, ret+8(FP)
  1226. // Update the context
  1227. MOVQ ctx+0(FP), AX
  1228. MOVQ DX, 24(AX) // index of the offending sequence
  1229. MOVQ DI, 104(AX) // output position (literals of the offending sequence are already counted)
  1230. MOVQ 80(AX), CX
  1231. SUBQ CX, SI
  1232. MOVQ SI, 112(AX)
  1233. RET
  1234. empty_seqs:
  1235. // Return value
  1236. MOVB $0x01, ret+8(FP)
  1237. RET
  1238. // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1239. // Requires: SSE
  //
  // Executes already-decoded sequences: for each 24-byte record it copies the
  // literal bytes and then the match bytes into the output buffer.
  // Record layout as read below: +0 literal length, +8 match length (ml),
  // +16 match offset (mo).
  //
  // In this variant the literal copy, both history copies and the non-overlapping
  // match copy all use exact-size moves: a 16-byte loop finished by one overlapping
  // 16-byte move for lengths >= 16, and overlapping head/tail moves for shorter
  // lengths, so nothing beyond the requested length is read or written.
  //
  // Returns 1 when every sequence was processed (or the sequence count is 0) and
  // 0 when a match offset is larger than outPosition+len(hist) or larger than the
  // limit loaded from ctx+120. Except for the empty case, both exits write back
  // the sequence index (ctx+24), the output position (ctx+104) and the number of
  // literal bytes consumed (ctx+112).
  //
  // BP is used as a scratch register in copy_all_from_history.
  // NOTE(review): presumably the non-zero frame size ($8) is there so that the
  // assembler-generated prologue preserves BP - confirm.
  //
  // Register roles inside the loop:
  //   AX  pointer to the current sequence record   DX  sequence index
  //   CX  sequence count                            BX  output write pointer
  //   SI  literals read pointer                     DI  output position
  //   R8  match offset limit                        R9  one past the end of history
  //   R10 history length                            R11/R12/R13 ll / mo / ml
  1240. TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1241. MOVQ ctx+0(FP), R10
  1242. MOVQ 8(R10), CX // CX = sequence count (loop bound for DX)
  1243. TESTQ CX, CX
  1244. JZ empty_seqs // nothing to do: return true without touching ctx
  1245. MOVQ (R10), AX // AX = base of the sequence records
  1246. MOVQ 24(R10), DX // DX = index of the next sequence
  1247. MOVQ 32(R10), BX // BX = output base
  1248. MOVQ 80(R10), SI // SI = literals read pointer
  1249. MOVQ 104(R10), DI // DI = output position
  1250. MOVQ 120(R10), R8 // R8 = upper limit for match offsets
  1251. MOVQ 56(R10), R9 // R9 = history base
  1252. MOVQ 64(R10), R10 // R10 = history length (the ctx pointer is dropped here)
  1253. ADDQ R10, R9 // R9 = one past the end of history
  1254. // seqsBase += 24 * seqIndex
  1255. LEAQ (DX)(DX*2), R11
  1256. SHLQ $0x03, R11
  1257. ADDQ R11, AX
  1258. // outBase += outPosition
  1259. ADDQ DI, BX
  1260. main_loop:
  1261. MOVQ (AX), R11 // R11 = literal length
  1262. MOVQ 16(AX), R12 // R12 = match offset (mo)
  1263. MOVQ 8(AX), R13 // R13 = match length (ml)
  1264. // Copy literals (exact size: SI and BX are advanced by the copy itself)
  1265. TESTQ R11, R11
  1266. JZ check_offset
  1267. MOVQ R11, R14
  1268. SUBQ $0x10, R14
  1269. JB copy_1_small // fewer than 16 literal bytes
  1270. copy_1_loop:
  1271. MOVUPS (SI), X0
  1272. MOVUPS X0, (BX)
  1273. ADDQ $0x10, SI
  1274. ADDQ $0x10, BX
  1275. SUBQ $0x10, R14
  1276. JAE copy_1_loop
  1277. LEAQ 16(SI)(R14*1), SI // R14 is now remainder-16: move both pointers to the exact end
  1278. LEAQ 16(BX)(R14*1), BX
  1279. MOVUPS -16(SI), X0 // copy the final 16 bytes, overlapping what was already copied
  1280. MOVUPS X0, -16(BX)
  1281. JMP copy_1_end
  1282. copy_1_small:
  1283. CMPQ R11, $0x03
  1284. JE copy_1_move_3
  1285. JB copy_1_move_1or2
  1286. CMPQ R11, $0x08
  1287. JB copy_1_move_4through7
  1288. JMP copy_1_move_8through16
  1289. copy_1_move_1or2:
  1290. MOVB (SI), R14 // first byte
  1291. MOVB -1(SI)(R11*1), R15 // last byte (the same byte when R11 == 1)
  1292. MOVB R14, (BX)
  1293. MOVB R15, -1(BX)(R11*1)
  1294. ADDQ R11, SI
  1295. ADDQ R11, BX
  1296. JMP copy_1_end
  1297. copy_1_move_3:
  1298. MOVW (SI), R14
  1299. MOVB 2(SI), R15
  1300. MOVW R14, (BX)
  1301. MOVB R15, 2(BX)
  1302. ADDQ R11, SI
  1303. ADDQ R11, BX
  1304. JMP copy_1_end
  1305. copy_1_move_4through7:
  1306. MOVL (SI), R14 // first 4 bytes
  1307. MOVL -4(SI)(R11*1), R15 // last 4 bytes (the two moves may overlap)
  1308. MOVL R14, (BX)
  1309. MOVL R15, -4(BX)(R11*1)
  1310. ADDQ R11, SI
  1311. ADDQ R11, BX
  1312. JMP copy_1_end
  1313. copy_1_move_8through16:
  1314. MOVQ (SI), R14 // first 8 bytes
  1315. MOVQ -8(SI)(R11*1), R15 // last 8 bytes (the two moves may overlap)
  1316. MOVQ R14, (BX)
  1317. MOVQ R15, -8(BX)(R11*1)
  1318. ADDQ R11, SI
  1319. ADDQ R11, BX
  1320. copy_1_end:
  1321. ADDQ R11, DI // output position += literal length
  1322. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1323. check_offset:
  1324. LEAQ (DI)(R10*1), R11 // R11 = output position + history length
  1325. CMPQ R12, R11
  1326. JG error_match_off_too_big // signed compare
  1327. CMPQ R12, R8
  1328. JG error_match_off_too_big // signed compare
  1329. // Copy match from history
  1330. MOVQ R12, R11
  1331. SUBQ DI, R11 // R11 = mo - output position = bytes the match reaches back into history
  1332. JLS copy_match // mo <= output position: the match lies entirely in the output buffer
  1333. MOVQ R9, R14
  1334. SUBQ R11, R14 // R14 = source pointer inside history
  1335. CMPQ R13, R11
  1336. JG copy_all_from_history // ml > R11: the match continues into the output buffer
  // The whole match comes from history: exact-size copy of R13 bytes.
  // NOTE(review): there is no 1-or-2 byte case here, so this presumably relies
  // on ml >= 3 - confirm.
  1337. MOVQ R13, R11
  1338. SUBQ $0x10, R11
  1339. JB copy_4_small // ml < 16
  1340. copy_4_loop:
  1341. MOVUPS (R14), X0
  1342. MOVUPS X0, (BX)
  1343. ADDQ $0x10, R14
  1344. ADDQ $0x10, BX
  1345. SUBQ $0x10, R11
  1346. JAE copy_4_loop
  1347. LEAQ 16(R14)(R11*1), R14 // move both pointers to the exact end
  1348. LEAQ 16(BX)(R11*1), BX
  1349. MOVUPS -16(R14), X0 // final 16 bytes, overlapping what was already copied
  1350. MOVUPS X0, -16(BX)
  1351. JMP copy_4_end
  1352. copy_4_small:
  1353. CMPQ R13, $0x03
  1354. JE copy_4_move_3
  1355. CMPQ R13, $0x08
  1356. JB copy_4_move_4through7
  1357. JMP copy_4_move_8through16
  1358. copy_4_move_3:
  1359. MOVW (R14), R11 // R11 and R12 are free to use as scratch from here on
  1360. MOVB 2(R14), R12
  1361. MOVW R11, (BX)
  1362. MOVB R12, 2(BX)
  1363. ADDQ R13, R14
  1364. ADDQ R13, BX
  1365. JMP copy_4_end
  1366. copy_4_move_4through7:
  1367. MOVL (R14), R11
  1368. MOVL -4(R14)(R13*1), R12
  1369. MOVL R11, (BX)
  1370. MOVL R12, -4(BX)(R13*1)
  1371. ADDQ R13, R14
  1372. ADDQ R13, BX
  1373. JMP copy_4_end
  1374. copy_4_move_8through16:
  1375. MOVQ (R14), R11
  1376. MOVQ -8(R14)(R13*1), R12
  1377. MOVQ R11, (BX)
  1378. MOVQ R12, -8(BX)(R13*1)
  1379. ADDQ R13, R14
  1380. ADDQ R13, BX
  1381. copy_4_end:
  1382. ADDQ R13, DI // output position += ml
  1383. ADDQ $0x18, AX // next 24-byte sequence record
  1384. INCQ DX
  1385. CMPQ DX, CX
  1386. JB main_loop
  1387. JMP loop_finished
  // The match starts in history and continues in the output buffer:
  // exact-size copy of the R11 bytes that come from history.
  1388. copy_all_from_history:
  1389. MOVQ R11, R15
  1390. SUBQ $0x10, R15
  1391. JB copy_5_small // fewer than 16 bytes come from history
  1392. copy_5_loop:
  1393. MOVUPS (R14), X0
  1394. MOVUPS X0, (BX)
  1395. ADDQ $0x10, R14
  1396. ADDQ $0x10, BX
  1397. SUBQ $0x10, R15
  1398. JAE copy_5_loop
  1399. LEAQ 16(R14)(R15*1), R14 // move both pointers to the exact end
  1400. LEAQ 16(BX)(R15*1), BX
  1401. MOVUPS -16(R14), X0 // final 16 bytes, overlapping what was already copied
  1402. MOVUPS X0, -16(BX)
  1403. JMP copy_5_end
  1404. copy_5_small:
  1405. CMPQ R11, $0x03
  1406. JE copy_5_move_3
  1407. JB copy_5_move_1or2
  1408. CMPQ R11, $0x08
  1409. JB copy_5_move_4through7
  1410. JMP copy_5_move_8through16
  1411. copy_5_move_1or2:
  1412. MOVB (R14), R15 // first byte
  1413. MOVB -1(R14)(R11*1), BP // last byte (the same byte when R11 == 1); BP is scratch
  1414. MOVB R15, (BX)
  1415. MOVB BP, -1(BX)(R11*1)
  1416. ADDQ R11, R14
  1417. ADDQ R11, BX
  1418. JMP copy_5_end
  1419. copy_5_move_3:
  1420. MOVW (R14), R15
  1421. MOVB 2(R14), BP
  1422. MOVW R15, (BX)
  1423. MOVB BP, 2(BX)
  1424. ADDQ R11, R14
  1425. ADDQ R11, BX
  1426. JMP copy_5_end
  1427. copy_5_move_4through7:
  1428. MOVL (R14), R15
  1429. MOVL -4(R14)(R11*1), BP
  1430. MOVL R15, (BX)
  1431. MOVL BP, -4(BX)(R11*1)
  1432. ADDQ R11, R14
  1433. ADDQ R11, BX
  1434. JMP copy_5_end
  1435. copy_5_move_8through16:
  1436. MOVQ (R14), R15
  1437. MOVQ -8(R14)(R11*1), BP
  1438. MOVQ R15, (BX)
  1439. MOVQ BP, -8(BX)(R11*1)
  1440. ADDQ R11, R14
  1441. ADDQ R11, BX
  1442. copy_5_end:
  1443. ADDQ R11, DI // output position += bytes taken from history
  1444. SUBQ R11, R13 // R13 = match bytes still to copy from the output buffer
  1445. // Copy match from the current buffer
  1446. copy_match:
  1447. MOVQ BX, R11
  1448. SUBQ R12, R11 // R11 = source pointer = write pointer - mo
  1449. // ml <= mo
  1450. CMPQ R13, R12
  1451. JA copy_overlapping_match // ml > mo: source and destination overlap
  1452. // Copy non-overlapping match (exact size)
  1453. ADDQ R13, DI
  1454. MOVQ R13, R12 // mo is no longer needed; R12 becomes the remaining-bytes counter
  1455. SUBQ $0x10, R12
  1456. JB copy_2_small // fewer than 16 bytes to copy
  1457. copy_2_loop:
  1458. MOVUPS (R11), X0
  1459. MOVUPS X0, (BX)
  1460. ADDQ $0x10, R11
  1461. ADDQ $0x10, BX
  1462. SUBQ $0x10, R12
  1463. JAE copy_2_loop
  1464. LEAQ 16(R11)(R12*1), R11 // move both pointers to the exact end
  1465. LEAQ 16(BX)(R12*1), BX
  1466. MOVUPS -16(R11), X0 // final 16 bytes, overlapping what was already copied
  1467. MOVUPS X0, -16(BX)
  1468. JMP copy_2_end
  1469. copy_2_small:
  1470. CMPQ R13, $0x03
  1471. JE copy_2_move_3
  1472. JB copy_2_move_1or2
  1473. CMPQ R13, $0x08
  1474. JB copy_2_move_4through7
  1475. JMP copy_2_move_8through16
  1476. copy_2_move_1or2:
  1477. MOVB (R11), R12
  1478. MOVB -1(R11)(R13*1), R14
  1479. MOVB R12, (BX)
  1480. MOVB R14, -1(BX)(R13*1)
  1481. ADDQ R13, R11
  1482. ADDQ R13, BX
  1483. JMP copy_2_end
  1484. copy_2_move_3:
  1485. MOVW (R11), R12
  1486. MOVB 2(R11), R14
  1487. MOVW R12, (BX)
  1488. MOVB R14, 2(BX)
  1489. ADDQ R13, R11
  1490. ADDQ R13, BX
  1491. JMP copy_2_end
  1492. copy_2_move_4through7:
  1493. MOVL (R11), R12
  1494. MOVL -4(R11)(R13*1), R14
  1495. MOVL R12, (BX)
  1496. MOVL R14, -4(BX)(R13*1)
  1497. ADDQ R13, R11
  1498. ADDQ R13, BX
  1499. JMP copy_2_end
  1500. copy_2_move_8through16:
  1501. MOVQ (R11), R12
  1502. MOVQ -8(R11)(R13*1), R14
  1503. MOVQ R12, (BX)
  1504. MOVQ R14, -8(BX)(R13*1)
  1505. ADDQ R13, R11
  1506. ADDQ R13, BX
  1507. copy_2_end:
  1508. JMP handle_loop
  1509. // Copy overlapping match
  1510. copy_overlapping_match:
  1511. ADDQ R13, DI
  1512. copy_slow_3:
  1513. MOVB (R11), R12 // byte-by-byte so that bytes written earlier in this loop are re-read
  1514. MOVB R12, (BX)
  1515. INCQ R11
  1516. INCQ BX
  1517. DECQ R13
  1518. JNZ copy_slow_3
  1519. handle_loop:
  1520. ADDQ $0x18, AX // next 24-byte sequence record
  1521. INCQ DX
  1522. CMPQ DX, CX
  1523. JB main_loop
  1524. loop_finished:
  1525. // Return value
  1526. MOVB $0x01, ret+8(FP)
  1527. // Update the context
  1528. MOVQ ctx+0(FP), AX
  1529. MOVQ DX, 24(AX) // sequence index
  1530. MOVQ DI, 104(AX) // output position
  1531. MOVQ 80(AX), CX
  1532. SUBQ CX, SI // literal bytes consumed = current literals pointer - initial pointer
  1533. MOVQ SI, 112(AX)
  1534. RET
  1535. error_match_off_too_big:
  1536. // Return value
  1537. MOVB $0x00, ret+8(FP)
  1538. // Update the context
  1539. MOVQ ctx+0(FP), AX
  1540. MOVQ DX, 24(AX) // index of the offending sequence
  1541. MOVQ DI, 104(AX) // output position (literals of the offending sequence are already counted)
  1542. MOVQ 80(AX), CX
  1543. SUBQ CX, SI
  1544. MOVQ SI, 112(AX)
  1545. RET
  1546. empty_seqs:
  1547. // Return value
  1548. MOVB $0x01, ret+8(FP)
  1549. RET
  1550. // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1551. // Requires: CMOV, SSE
  //
  // Decodes sequences from the bit reader and executes each one right away
  // (literal copy followed by match copy into the output buffer).
  //
  // Return codes stored to ret+24(FP):
  //   0 success
  //   1 non-zero match length together with a zero match offset
  //   2 match length above 0x20002
  //   3 match offset larger than outPosition+len(hist) or than the limit from ctx+200
  //   4 not enough literals left (the counter at ctx+104 went negative)
  //   5 not enough room in the output buffer
  // The bit-reader fields are written back only on the success path.
  //
  // Stack slots:
  //   (SP)   input read pointer (moves towards lower addresses)
  //   8(SP)  match offset (mo)      16(SP) match length (ml)     24(SP) literal length (ll)
  //   32(SP) past-end pointer of the output buffer
  //   40(SP) history length         48(SP) one past the end of history
  //   56(SP) match offset limit
  //
  // Registers held across the loop:
  //   DX  bit container             BX  bits already consumed from DX
  //   SI  input bytes remaining     DI/R8/R9 literal-length / match-length / offset state
  //   R10 output write pointer      R11 literals read pointer    R12 output position
  //
  // Each state word is unpacked below as: bits 0-7 bit count for the state update,
  // bits 8-15 bit count of the extra bits added to the value, bits 16-31 base of
  // the next state, bits 32-63 base value.
  //
  // The literal copy and the non-overlapping match copy move whole 16-byte chunks
  // and may touch up to 15 bytes past the requested length.
  // NOTE(review): presumably the caller guarantees that slack - confirm.
  // BP is used as a scratch register in copy_all_from_history.
  1552. TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1553. MOVQ br+8(FP), AX
  1554. MOVQ 32(AX), DX // DX = bit container
  1555. MOVBQZX 40(AX), BX // BX = bits already consumed
  1556. MOVQ 24(AX), SI // SI = input bytes remaining
  1557. MOVQ (AX), AX // AX = input base pointer
  1558. ADDQ SI, AX
  1559. MOVQ AX, (SP) // (SP) = current input read pointer
  1560. MOVQ ctx+16(FP), AX
  1561. MOVQ 72(AX), DI // literal length state
  1562. MOVQ 80(AX), R8 // match length state
  1563. MOVQ 88(AX), R9 // offset state
  1564. XORQ CX, CX
  1565. MOVQ CX, 8(SP) // mo = 0
  1566. MOVQ CX, 16(SP) // ml = 0
  1567. MOVQ CX, 24(SP) // ll = 0
  1568. MOVQ 112(AX), R10 // R10 = output base
  1569. MOVQ 128(AX), CX
  1570. MOVQ CX, 32(SP) // output capacity, turned into a past-end pointer below
  1571. MOVQ 144(AX), R11 // R11 = literals read pointer
  1572. MOVQ 136(AX), R12 // R12 = output position
  1573. MOVQ 200(AX), CX
  1574. MOVQ CX, 56(SP) // match offset limit
  1575. MOVQ 176(AX), CX
  1576. MOVQ CX, 48(SP) // history base
  1577. MOVQ 184(AX), AX
  1578. MOVQ AX, 40(SP) // history length
  1579. MOVQ 40(SP), AX
  1580. ADDQ AX, 48(SP) // 48(SP) = one past the end of history
  1581. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1582. ADDQ R10, 32(SP)
  1583. // outBase += outPosition
  1584. ADDQ R12, R10
  1585. sequenceDecs_decodeSync_amd64_main_loop:
  1586. MOVQ (SP), R13 // R13 = input read pointer for this iteration
  1587. // Fill bitreader to have enough for the offset and match length.
  1588. CMPQ SI, $0x08
  1589. JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1590. MOVQ BX, AX
  1591. SHRQ $0x03, AX // AX = whole bytes already consumed
  1592. SUBQ AX, R13 // step the read pointer back by that many bytes
  1593. MOVQ (R13), DX // reload 8 bytes
  1594. SUBQ AX, SI
  1595. ANDQ $0x07, BX // keep only the sub-byte bit position
  1596. JMP sequenceDecs_decodeSync_amd64_fill_end
  1597. sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
  1598. CMPQ SI, $0x00
  1599. JLE sequenceDecs_decodeSync_amd64_fill_end // no input left
  1600. CMPQ BX, $0x07
  1601. JLE sequenceDecs_decodeSync_amd64_fill_end // less than a whole byte consumed
  1602. SHLQ $0x08, DX
  1603. SUBQ $0x01, R13
  1604. SUBQ $0x01, SI
  1605. SUBQ $0x08, BX
  1606. MOVBQZX (R13), AX
  1607. ORQ AX, DX // shift one new byte into the low end of the container
  1608. JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1609. sequenceDecs_decodeSync_amd64_fill_end:
  1610. // Update offset
  1611. MOVQ R9, AX
  1612. MOVQ BX, CX
  1613. MOVQ DX, R14
  1614. SHLQ CL, R14 // drop the bits that were already consumed
  1615. MOVB AH, CL // CL = number of extra bits (bits 8-15 of the state)
  1616. SHRQ $0x20, AX // AX = base value (bits 32-63 of the state)
  1617. TESTQ CX, CX
  1618. JZ sequenceDecs_decodeSync_amd64_of_update_zero // no extra bits
  1619. ADDQ CX, BX
  1620. CMPQ BX, $0x40
  1621. JA sequenceDecs_decodeSync_amd64_of_update_zero
  1622. CMPQ CX, $0x40
  1623. JAE sequenceDecs_decodeSync_amd64_of_update_zero
  1624. NEGQ CX
  1625. SHRQ CL, R14 // shift right by 64-n: the top n bits become the value
  1626. ADDQ R14, AX
  1627. sequenceDecs_decodeSync_amd64_of_update_zero:
  1628. MOVQ AX, 8(SP)
  1629. // Update match length
  1630. MOVQ R8, AX
  1631. MOVQ BX, CX
  1632. MOVQ DX, R14
  1633. SHLQ CL, R14
  1634. MOVB AH, CL
  1635. SHRQ $0x20, AX
  1636. TESTQ CX, CX
  1637. JZ sequenceDecs_decodeSync_amd64_ml_update_zero
  1638. ADDQ CX, BX
  1639. CMPQ BX, $0x40
  1640. JA sequenceDecs_decodeSync_amd64_ml_update_zero
  1641. CMPQ CX, $0x40
  1642. JAE sequenceDecs_decodeSync_amd64_ml_update_zero
  1643. NEGQ CX
  1644. SHRQ CL, R14
  1645. ADDQ R14, AX
  1646. sequenceDecs_decodeSync_amd64_ml_update_zero:
  1647. MOVQ AX, 16(SP)
  1648. // Fill bitreader to have enough for the remaining
  1649. CMPQ SI, $0x08
  1650. JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1651. MOVQ BX, AX
  1652. SHRQ $0x03, AX
  1653. SUBQ AX, R13
  1654. MOVQ (R13), DX
  1655. SUBQ AX, SI
  1656. ANDQ $0x07, BX
  1657. JMP sequenceDecs_decodeSync_amd64_fill_2_end
  1658. sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1659. CMPQ SI, $0x00
  1660. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1661. CMPQ BX, $0x07
  1662. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1663. SHLQ $0x08, DX
  1664. SUBQ $0x01, R13
  1665. SUBQ $0x01, SI
  1666. SUBQ $0x08, BX
  1667. MOVBQZX (R13), AX
  1668. ORQ AX, DX
  1669. JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1670. sequenceDecs_decodeSync_amd64_fill_2_end:
  1671. // Update literal length
  1672. MOVQ DI, AX
  1673. MOVQ BX, CX
  1674. MOVQ DX, R14
  1675. SHLQ CL, R14
  1676. MOVB AH, CL
  1677. SHRQ $0x20, AX
  1678. TESTQ CX, CX
  1679. JZ sequenceDecs_decodeSync_amd64_ll_update_zero
  1680. ADDQ CX, BX
  1681. CMPQ BX, $0x40
  1682. JA sequenceDecs_decodeSync_amd64_ll_update_zero
  1683. CMPQ CX, $0x40
  1684. JAE sequenceDecs_decodeSync_amd64_ll_update_zero
  1685. NEGQ CX
  1686. SHRQ CL, R14
  1687. ADDQ R14, AX
  1688. sequenceDecs_decodeSync_amd64_ll_update_zero:
  1689. MOVQ AX, 24(SP)
  1690. // Fill bitreader for state updates
  1691. MOVQ R13, (SP) // save the input read pointer for the next iteration
  1692. MOVQ R9, AX
  1693. SHRQ $0x08, AX
  1694. MOVBQZX AL, AX // AX = extra-bit count of the offset state, tested in "Adjust offset" below
  1695. MOVQ ctx+16(FP), CX
  1696. CMPQ 96(CX), $0x00
  1697. JZ sequenceDecs_decodeSync_amd64_skip_update // counter at ctx+96 is zero: last sequence, states are not advanced
  1698. // Update Literal Length State
  1699. MOVBQZX DI, R13 // R13 = number of bits to read for the new state
  1700. SHRQ $0x10, DI
  1701. MOVWQZX DI, DI // DI = base of the new state (bits 16-31)
  1702. LEAQ (BX)(R13*1), CX
  1703. MOVQ DX, R14
  1704. MOVQ CX, BX // consume the bits
  1705. ROLQ CL, R14 // rotate so that the bits just consumed land in the low end
  1706. MOVL $0x00000001, R15
  1707. MOVB R13, CL
  1708. SHLL CL, R15
  1709. DECL R15 // R15 = (1 << R13) - 1
  1710. ANDQ R15, R14
  1711. ADDQ R14, DI // DI = index of the new state
  1712. // Load ctx.llTable
  1713. MOVQ ctx+16(FP), CX
  1714. MOVQ (CX), CX
  1715. MOVQ (CX)(DI*8), DI
  1716. // Update Match Length State
  1717. MOVBQZX R8, R13
  1718. SHRQ $0x10, R8
  1719. MOVWQZX R8, R8
  1720. LEAQ (BX)(R13*1), CX
  1721. MOVQ DX, R14
  1722. MOVQ CX, BX
  1723. ROLQ CL, R14
  1724. MOVL $0x00000001, R15
  1725. MOVB R13, CL
  1726. SHLL CL, R15
  1727. DECL R15
  1728. ANDQ R15, R14
  1729. ADDQ R14, R8
  1730. // Load ctx.mlTable
  1731. MOVQ ctx+16(FP), CX
  1732. MOVQ 24(CX), CX
  1733. MOVQ (CX)(R8*8), R8
  1734. // Update Offset State
  1735. MOVBQZX R9, R13
  1736. SHRQ $0x10, R9
  1737. MOVWQZX R9, R9
  1738. LEAQ (BX)(R13*1), CX
  1739. MOVQ DX, R14
  1740. MOVQ CX, BX
  1741. ROLQ CL, R14
  1742. MOVL $0x00000001, R15
  1743. MOVB R13, CL
  1744. SHLL CL, R15
  1745. DECL R15
  1746. ANDQ R15, R14
  1747. ADDQ R14, R9
  1748. // Load ctx.ofTable
  1749. MOVQ ctx+16(FP), CX
  1750. MOVQ 48(CX), CX
  1751. MOVQ (CX)(R9*8), R9
  1752. sequenceDecs_decodeSync_amd64_skip_update:
  1753. // Adjust offset
  // The three 8-byte slots at 144(s), 152(s) and 160(s) hold the most recent
  // offsets, newest first.
  1754. MOVQ s+0(FP), CX
  1755. MOVQ 8(SP), R13 // R13 = decoded offset value
  1756. CMPQ AX, $0x01
  1757. JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  // More than one extra bit: a new offset. Push it in front of the previous ones.
  1758. MOVUPS 144(CX), X0 // X0 = the two most recent offsets
  1759. MOVQ R13, 144(CX)
  1760. MOVUPS X0, 152(CX) // shift them down by one slot
  1761. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1762. sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
  1763. CMPQ 24(SP), $0x00000000
  1764. JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  1765. INCQ R13 // literal length is zero: the repeat index is shifted by one
  1766. JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1767. sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  1768. TESTQ R13, R13
  1769. JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1770. MOVQ 144(CX), R13 // index 0: reuse the most recent offset, slots unchanged
  1771. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1772. sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
  1773. MOVQ R13, AX
  1774. XORQ R14, R14
  1775. MOVQ $-1, R15
  1776. CMPQ R13, $0x03
  1777. CMOVQEQ R14, AX // index 3: use slot 0 ...
  1778. CMOVQEQ R15, R14 // ... minus one
  1779. ADDQ 144(CX)(AX*8), R14 // R14 = selected previous offset (minus one for index 3)
  1780. JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
  1781. MOVQ $0x00000001, R14 // a resulting offset of zero is replaced by 1
  1782. sequenceDecs_decodeSync_amd64_adjust_temp_valid:
  1783. CMPQ R13, $0x01
  1784. JZ sequenceDecs_decodeSync_amd64_adjust_skip // index 1: the third slot stays as it is
  1785. MOVQ 152(CX), AX
  1786. MOVQ AX, 160(CX)
  1787. sequenceDecs_decodeSync_amd64_adjust_skip:
  1788. MOVQ 144(CX), AX
  1789. MOVQ AX, 152(CX)
  1790. MOVQ R14, 144(CX) // the selected offset becomes the most recent one
  1791. MOVQ R14, R13
  1792. sequenceDecs_decodeSync_amd64_after_adjust:
  1793. MOVQ R13, 8(SP) // final match offset
  1794. // Check values
  1795. MOVQ 16(SP), AX // AX = ml
  1796. MOVQ 24(SP), CX // CX = ll
  1797. LEAQ (AX)(CX*1), R14
  1798. MOVQ s+0(FP), R15
  1799. ADDQ R14, 256(R15) // add ll+ml to the running total kept at 256(s)
  1800. MOVQ ctx+16(FP), R14
  1801. SUBQ CX, 104(R14) // take ll from the remaining-literals counter at ctx+104
  1802. JS error_not_enough_literals
  1803. CMPQ AX, $0x00020002
  1804. JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
  1805. TESTQ R13, R13
  1806. JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  1807. TESTQ AX, AX
  1808. JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch // mo == 0 but ml != 0
  1809. sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
  1810. MOVQ 24(SP), AX // AX = ll
  1811. MOVQ 8(SP), CX // CX = mo
  1812. MOVQ 16(SP), R13 // R13 = ml
  1813. // Check if we have enough space in s.out
  1814. LEAQ (AX)(R13*1), R14
  1815. ADDQ R10, R14 // R14 = write pointer after this sequence
  1816. CMPQ R14, 32(SP)
  1817. JA error_not_enough_space
  1818. // Copy literals in 16-byte chunks; the last chunk may overshoot AX by up to 15 bytes
  1819. TESTQ AX, AX
  1820. JZ check_offset
  1821. XORQ R14, R14
  1822. copy_1:
  1823. MOVUPS (R11)(R14*1), X0
  1824. MOVUPS X0, (R10)(R14*1)
  1825. ADDQ $0x10, R14
  1826. CMPQ R14, AX
  1827. JB copy_1
  1828. ADDQ AX, R11 // advance all cursors by exactly the literal length
  1829. ADDQ AX, R10
  1830. ADDQ AX, R12
  1831. // Malformed input if (seq.mo > t+len(hist) || seq.mo > s.windowSize)
  1832. check_offset:
  1833. MOVQ R12, AX
  1834. ADDQ 40(SP), AX // AX = output position + history length
  1835. CMPQ CX, AX
  1836. JG error_match_off_too_big // signed compare
  1837. CMPQ CX, 56(SP)
  1838. JG error_match_off_too_big // signed compare
  1839. // Copy match from history
  1840. MOVQ CX, AX
  1841. SUBQ R12, AX // AX = mo - output position = bytes the match reaches back into history
  1842. JLS copy_match // mo <= output position: the match lies entirely in the output buffer
  1843. MOVQ 48(SP), R14
  1844. SUBQ AX, R14 // R14 = source pointer inside history
  1845. CMPQ R13, AX
  1846. JG copy_all_from_history // ml > AX: the match continues into the output buffer
  // The whole match comes from history: exact-size copy of R13 bytes.
  // NOTE(review): there is no 1-or-2 byte case here, so this presumably relies
  // on ml >= 3 - confirm.
  1847. MOVQ R13, AX
  1848. SUBQ $0x10, AX
  1849. JB copy_4_small // ml < 16
  1850. copy_4_loop:
  1851. MOVUPS (R14), X0
  1852. MOVUPS X0, (R10)
  1853. ADDQ $0x10, R14
  1854. ADDQ $0x10, R10
  1855. SUBQ $0x10, AX
  1856. JAE copy_4_loop
  1857. LEAQ 16(R14)(AX*1), R14 // AX is now remainder-16: move both pointers to the exact end
  1858. LEAQ 16(R10)(AX*1), R10
  1859. MOVUPS -16(R14), X0 // final 16 bytes, overlapping what was already copied
  1860. MOVUPS X0, -16(R10)
  1861. JMP copy_4_end
  1862. copy_4_small:
  1863. CMPQ R13, $0x03
  1864. JE copy_4_move_3
  1865. CMPQ R13, $0x08
  1866. JB copy_4_move_4through7
  1867. JMP copy_4_move_8through16
  1868. copy_4_move_3:
  1869. MOVW (R14), AX // AX and CX are free to use as scratch from here on
  1870. MOVB 2(R14), CL
  1871. MOVW AX, (R10)
  1872. MOVB CL, 2(R10)
  1873. ADDQ R13, R14
  1874. ADDQ R13, R10
  1875. JMP copy_4_end
  1876. copy_4_move_4through7:
  1877. MOVL (R14), AX // first 4 bytes
  1878. MOVL -4(R14)(R13*1), CX // last 4 bytes (the two moves may overlap)
  1879. MOVL AX, (R10)
  1880. MOVL CX, -4(R10)(R13*1)
  1881. ADDQ R13, R14
  1882. ADDQ R13, R10
  1883. JMP copy_4_end
  1884. copy_4_move_8through16:
  1885. MOVQ (R14), AX // first 8 bytes
  1886. MOVQ -8(R14)(R13*1), CX // last 8 bytes (the two moves may overlap)
  1887. MOVQ AX, (R10)
  1888. MOVQ CX, -8(R10)(R13*1)
  1889. ADDQ R13, R14
  1890. ADDQ R13, R10
  1891. copy_4_end:
  1892. ADDQ R13, R12 // output position += ml
  1893. JMP handle_loop
  1894. JMP loop_finished // not reachable: it follows an unconditional JMP and has no label
  // The match starts in history and continues in the output buffer:
  // exact-size copy of the AX bytes that come from history.
  1895. copy_all_from_history:
  1896. MOVQ AX, R15
  1897. SUBQ $0x10, R15
  1898. JB copy_5_small // fewer than 16 bytes come from history
  1899. copy_5_loop:
  1900. MOVUPS (R14), X0
  1901. MOVUPS X0, (R10)
  1902. ADDQ $0x10, R14
  1903. ADDQ $0x10, R10
  1904. SUBQ $0x10, R15
  1905. JAE copy_5_loop
  1906. LEAQ 16(R14)(R15*1), R14 // move both pointers to the exact end
  1907. LEAQ 16(R10)(R15*1), R10
  1908. MOVUPS -16(R14), X0 // final 16 bytes, overlapping what was already copied
  1909. MOVUPS X0, -16(R10)
  1910. JMP copy_5_end
  1911. copy_5_small:
  1912. CMPQ AX, $0x03
  1913. JE copy_5_move_3
  1914. JB copy_5_move_1or2
  1915. CMPQ AX, $0x08
  1916. JB copy_5_move_4through7
  1917. JMP copy_5_move_8through16
  1918. copy_5_move_1or2:
  1919. MOVB (R14), R15 // first byte
  1920. MOVB -1(R14)(AX*1), BP // last byte (the same byte when AX == 1); BP is scratch
  1921. MOVB R15, (R10)
  1922. MOVB BP, -1(R10)(AX*1)
  1923. ADDQ AX, R14
  1924. ADDQ AX, R10
  1925. JMP copy_5_end
  1926. copy_5_move_3:
  1927. MOVW (R14), R15
  1928. MOVB 2(R14), BP
  1929. MOVW R15, (R10)
  1930. MOVB BP, 2(R10)
  1931. ADDQ AX, R14
  1932. ADDQ AX, R10
  1933. JMP copy_5_end
  1934. copy_5_move_4through7:
  1935. MOVL (R14), R15
  1936. MOVL -4(R14)(AX*1), BP
  1937. MOVL R15, (R10)
  1938. MOVL BP, -4(R10)(AX*1)
  1939. ADDQ AX, R14
  1940. ADDQ AX, R10
  1941. JMP copy_5_end
  1942. copy_5_move_8through16:
  1943. MOVQ (R14), R15
  1944. MOVQ -8(R14)(AX*1), BP
  1945. MOVQ R15, (R10)
  1946. MOVQ BP, -8(R10)(AX*1)
  1947. ADDQ AX, R14
  1948. ADDQ AX, R10
  1949. copy_5_end:
  1950. ADDQ AX, R12 // output position += bytes taken from history
  1951. SUBQ AX, R13 // R13 = match bytes still to copy from the output buffer
  1952. // Copy match from the current buffer
  1953. copy_match:
  1954. MOVQ R10, AX
  1955. SUBQ CX, AX // AX = source pointer = write pointer - mo
  1956. // ml <= mo
  1957. CMPQ R13, CX
  1958. JA copy_overlapping_match // ml > mo: source and destination overlap
  1959. // Copy non-overlapping match
  1960. ADDQ R13, R12
  1961. MOVQ R10, CX // CX = destination cursor; R10 is advanced to its final value up front
  1962. ADDQ R13, R10
  1963. copy_2:
  1964. MOVUPS (AX), X0
  1965. MOVUPS X0, (CX)
  1966. ADDQ $0x10, AX
  1967. ADDQ $0x10, CX
  1968. SUBQ $0x10, R13
  1969. JHI copy_2 // loop while bytes remain; the last chunk may overshoot by up to 15 bytes
  1970. JMP handle_loop
  1971. // Copy overlapping match
  1972. copy_overlapping_match:
  1973. ADDQ R13, R12
  1974. copy_slow_3:
  1975. MOVB (AX), CL // byte-by-byte so that bytes written earlier in this loop are re-read
  1976. MOVB CL, (R10)
  1977. INCQ AX
  1978. INCQ R10
  1979. DECQ R13
  1980. JNZ copy_slow_3
  1981. handle_loop:
  1982. MOVQ ctx+16(FP), AX
  1983. DECQ 96(AX) // count down the counter at ctx+96
  1984. JNS sequenceDecs_decodeSync_amd64_main_loop // continue while it is still non-negative
  1985. loop_finished:
  // Write the bit-reader state back
  1986. MOVQ br+8(FP), AX
  1987. MOVQ DX, 32(AX)
  1988. MOVB BL, 40(AX)
  1989. MOVQ SI, 24(AX)
  1990. // Update the context
  1991. MOVQ ctx+16(FP), AX
  1992. MOVQ R12, 136(AX) // output position
  1993. MOVQ 144(AX), CX
  1994. SUBQ CX, R11 // literal bytes consumed = current literals pointer - initial pointer
  1995. MOVQ R11, 168(AX)
  1996. // Return success
  1997. MOVQ $0x00000000, ret+24(FP)
  1998. RET
  1999. // Return with match length error
  2000. sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  2001. MOVQ 16(SP), AX
  2002. MOVQ ctx+16(FP), CX
  2003. MOVQ AX, 216(CX) // report ml
  2004. MOVQ $0x00000001, ret+24(FP)
  2005. RET
  2006. // Return with match too long error
  2007. sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  2008. MOVQ ctx+16(FP), AX
  2009. MOVQ 16(SP), CX
  2010. MOVQ CX, 216(AX) // report ml
  2011. MOVQ $0x00000002, ret+24(FP)
  2012. RET
  2013. // Return with match offset too long error
  2014. error_match_off_too_big:
  2015. MOVQ ctx+16(FP), AX
  2016. MOVQ 8(SP), CX
  2017. MOVQ CX, 224(AX) // report mo
  2018. MOVQ R12, 136(AX) // output position
  2019. MOVQ $0x00000003, ret+24(FP)
  2020. RET
  2021. // Return with not enough literals error
  2022. error_not_enough_literals:
  2023. MOVQ ctx+16(FP), AX
  2024. MOVQ 24(SP), CX
  2025. MOVQ CX, 208(AX) // report ll
  2026. MOVQ $0x00000004, ret+24(FP)
  2027. RET
  2028. // Return with not enough output space error
  2029. error_not_enough_space:
  2030. MOVQ ctx+16(FP), AX
  2031. MOVQ 24(SP), CX
  2032. MOVQ CX, 208(AX) // report ll
  2033. MOVQ 16(SP), CX
  2034. MOVQ CX, 216(AX) // report ml
  2035. MOVQ R12, 136(AX) // output position
  2036. MOVQ $0x00000005, ret+24(FP)
  2037. RET
  2038. // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2039. // Requires: BMI, BMI2, CMOV, SSE
  2040. TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
  2041. MOVQ br+8(FP), CX
  2042. MOVQ 32(CX), AX
  2043. MOVBQZX 40(CX), DX
  2044. MOVQ 24(CX), BX
  2045. MOVQ (CX), CX
  2046. ADDQ BX, CX
  2047. MOVQ CX, (SP)
  2048. MOVQ ctx+16(FP), CX
  2049. MOVQ 72(CX), SI
  2050. MOVQ 80(CX), DI
  2051. MOVQ 88(CX), R8
  2052. XORQ R9, R9
  2053. MOVQ R9, 8(SP)
  2054. MOVQ R9, 16(SP)
  2055. MOVQ R9, 24(SP)
  2056. MOVQ 112(CX), R9
  2057. MOVQ 128(CX), R10
  2058. MOVQ R10, 32(SP)
  2059. MOVQ 144(CX), R10
  2060. MOVQ 136(CX), R11
  2061. MOVQ 200(CX), R12
  2062. MOVQ R12, 56(SP)
  2063. MOVQ 176(CX), R12
  2064. MOVQ R12, 48(SP)
  2065. MOVQ 184(CX), CX
  2066. MOVQ CX, 40(SP)
  2067. MOVQ 40(SP), CX
  2068. ADDQ CX, 48(SP)
  2069. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2070. ADDQ R9, 32(SP)
  2071. // outBase += outPosition
  2072. ADDQ R11, R9
  2073. sequenceDecs_decodeSync_bmi2_main_loop:
  2074. MOVQ (SP), R12
  2075. // Fill bitreader to have enough for the offset and match length.
  2076. CMPQ BX, $0x08
  2077. JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2078. MOVQ DX, CX
  2079. SHRQ $0x03, CX
  2080. SUBQ CX, R12
  2081. MOVQ (R12), AX
  2082. SUBQ CX, BX
  2083. ANDQ $0x07, DX
  2084. JMP sequenceDecs_decodeSync_bmi2_fill_end
  2085. sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2086. CMPQ BX, $0x00
  2087. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2088. CMPQ DX, $0x07
  2089. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2090. SHLQ $0x08, AX
  2091. SUBQ $0x01, R12
  2092. SUBQ $0x01, BX
  2093. SUBQ $0x08, DX
  2094. MOVBQZX (R12), CX
  2095. ORQ CX, AX
  2096. JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2097. sequenceDecs_decodeSync_bmi2_fill_end:
  2098. // Update offset
  2099. MOVQ $0x00000808, CX
  2100. BEXTRQ CX, R8, R13
  2101. MOVQ AX, R14
  2102. LEAQ (DX)(R13*1), CX
  2103. ROLQ CL, R14
  2104. BZHIQ R13, R14, R14
  2105. MOVQ CX, DX
  2106. MOVQ R8, CX
  2107. SHRQ $0x20, CX
  2108. ADDQ R14, CX
  2109. MOVQ CX, 8(SP)
  2110. // Update match length
  2111. MOVQ $0x00000808, CX
  2112. BEXTRQ CX, DI, R13
  2113. MOVQ AX, R14
  2114. LEAQ (DX)(R13*1), CX
  2115. ROLQ CL, R14
  2116. BZHIQ R13, R14, R14
  2117. MOVQ CX, DX
  2118. MOVQ DI, CX
  2119. SHRQ $0x20, CX
  2120. ADDQ R14, CX
  2121. MOVQ CX, 16(SP)
  2122. // Fill bitreader to have enough for the remaining
  2123. CMPQ BX, $0x08
  2124. JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2125. MOVQ DX, CX
  2126. SHRQ $0x03, CX
  2127. SUBQ CX, R12
  2128. MOVQ (R12), AX
  2129. SUBQ CX, BX
  2130. ANDQ $0x07, DX
  2131. JMP sequenceDecs_decodeSync_bmi2_fill_2_end
  2132. sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2133. CMPQ BX, $0x00
  2134. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2135. CMPQ DX, $0x07
  2136. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2137. SHLQ $0x08, AX
  2138. SUBQ $0x01, R12
  2139. SUBQ $0x01, BX
  2140. SUBQ $0x08, DX
  2141. MOVBQZX (R12), CX
  2142. ORQ CX, AX
  2143. JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2144. sequenceDecs_decodeSync_bmi2_fill_2_end:
  2145. // Update literal length
  2146. MOVQ $0x00000808, CX
  2147. BEXTRQ CX, SI, R13
  2148. MOVQ AX, R14
  2149. LEAQ (DX)(R13*1), CX
  2150. ROLQ CL, R14
  2151. BZHIQ R13, R14, R14
  2152. MOVQ CX, DX
  2153. MOVQ SI, CX
  2154. SHRQ $0x20, CX
  2155. ADDQ R14, CX
  2156. MOVQ CX, 24(SP)
  2157. // Fill bitreader for state updates
  2158. MOVQ R12, (SP)
  2159. MOVQ $0x00000808, CX
  2160. BEXTRQ CX, R8, R12
  2161. MOVQ ctx+16(FP), CX
  2162. CMPQ 96(CX), $0x00
  2163. JZ sequenceDecs_decodeSync_bmi2_skip_update
  2164. LEAQ (SI)(DI*1), R13
  2165. ADDQ R8, R13
  2166. MOVBQZX R13, R13
  2167. LEAQ (DX)(R13*1), CX
  2168. MOVQ AX, R14
  2169. MOVQ CX, DX
  2170. ROLQ CL, R14
  2171. BZHIQ R13, R14, R14
  2172. // Update Offset State
  2173. BZHIQ R8, R14, CX
  2174. SHRXQ R8, R14, R14
  2175. MOVQ $0x00001010, R13
  2176. BEXTRQ R13, R8, R8
  2177. ADDQ CX, R8
  2178. // Load ctx.ofTable
  2179. MOVQ ctx+16(FP), CX
  2180. MOVQ 48(CX), CX
  2181. MOVQ (CX)(R8*8), R8
  2182. // Update Match Length State
  2183. BZHIQ DI, R14, CX
  2184. SHRXQ DI, R14, R14
  2185. MOVQ $0x00001010, R13
  2186. BEXTRQ R13, DI, DI
  2187. ADDQ CX, DI
  2188. // Load ctx.mlTable
  2189. MOVQ ctx+16(FP), CX
  2190. MOVQ 24(CX), CX
  2191. MOVQ (CX)(DI*8), DI
  2192. // Update Literal Length State
  2193. BZHIQ SI, R14, CX
  2194. MOVQ $0x00001010, R13
  2195. BEXTRQ R13, SI, SI
  2196. ADDQ CX, SI
  2197. // Load ctx.llTable
  2198. MOVQ ctx+16(FP), CX
  2199. MOVQ (CX), CX
  2200. MOVQ (CX)(SI*8), SI
  2201. sequenceDecs_decodeSync_bmi2_skip_update:
  2202. // Adjust offset
  2203. MOVQ s+0(FP), CX
  2204. MOVQ 8(SP), R13
  2205. CMPQ R12, $0x01
  2206. JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2207. MOVUPS 144(CX), X0
  2208. MOVQ R13, 144(CX)
  2209. MOVUPS X0, 152(CX)
  2210. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2211. sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2212. CMPQ 24(SP), $0x00000000
  2213. JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2214. INCQ R13
  2215. JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2216. sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2217. TESTQ R13, R13
  2218. JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2219. MOVQ 144(CX), R13
  2220. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2221. sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2222. MOVQ R13, R12
  2223. XORQ R14, R14
  2224. MOVQ $-1, R15
  2225. CMPQ R13, $0x03
  2226. CMOVQEQ R14, R12
  2227. CMOVQEQ R15, R14
  2228. ADDQ 144(CX)(R12*8), R14
  2229. JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2230. MOVQ $0x00000001, R14
  2231. sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2232. CMPQ R13, $0x01
  2233. JZ sequenceDecs_decodeSync_bmi2_adjust_skip
  2234. MOVQ 152(CX), R12
  2235. MOVQ R12, 160(CX)
  2236. sequenceDecs_decodeSync_bmi2_adjust_skip:
  2237. MOVQ 144(CX), R12
  2238. MOVQ R12, 152(CX)
  2239. MOVQ R14, 144(CX)
  2240. MOVQ R14, R13
  2241. sequenceDecs_decodeSync_bmi2_after_adjust:
  2242. MOVQ R13, 8(SP)
  2243. // Check values
  2244. MOVQ 16(SP), CX
  2245. MOVQ 24(SP), R12
  2246. LEAQ (CX)(R12*1), R14
  2247. MOVQ s+0(FP), R15
  2248. ADDQ R14, 256(R15)
  2249. MOVQ ctx+16(FP), R14
  2250. SUBQ R12, 104(R14)
  2251. JS error_not_enough_literals
  2252. CMPQ CX, $0x00020002
  2253. JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2254. TESTQ R13, R13
  2255. JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2256. TESTQ CX, CX
  2257. JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
  2258. sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2259. MOVQ 24(SP), CX
  2260. MOVQ 8(SP), R12
  2261. MOVQ 16(SP), R13
  2262. // Check if we have enough space in s.out
  2263. LEAQ (CX)(R13*1), R14
  2264. ADDQ R9, R14
  2265. CMPQ R14, 32(SP)
  2266. JA error_not_enough_space
  2267. // Copy literals
  2268. TESTQ CX, CX
  2269. JZ check_offset
  2270. XORQ R14, R14
  2271. copy_1:
  2272. MOVUPS (R10)(R14*1), X0
  2273. MOVUPS X0, (R9)(R14*1)
  2274. ADDQ $0x10, R14
  2275. CMPQ R14, CX
  2276. JB copy_1
  2277. ADDQ CX, R10
  2278. ADDQ CX, R9
  2279. ADDQ CX, R11
  2280. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2281. check_offset:
  2282. MOVQ R11, CX
  2283. ADDQ 40(SP), CX
  2284. CMPQ R12, CX
  2285. JG error_match_off_too_big
  2286. CMPQ R12, 56(SP)
  2287. JG error_match_off_too_big
  2288. // Copy match from history
  2289. MOVQ R12, CX
  2290. SUBQ R11, CX
  2291. JLS copy_match
  2292. MOVQ 48(SP), R14
  2293. SUBQ CX, R14
  2294. CMPQ R13, CX
  2295. JG copy_all_from_history
  2296. MOVQ R13, CX
  2297. SUBQ $0x10, CX
  2298. JB copy_4_small
  2299. copy_4_loop:
  2300. MOVUPS (R14), X0
  2301. MOVUPS X0, (R9)
  2302. ADDQ $0x10, R14
  2303. ADDQ $0x10, R9
  2304. SUBQ $0x10, CX
  2305. JAE copy_4_loop
  2306. LEAQ 16(R14)(CX*1), R14
  2307. LEAQ 16(R9)(CX*1), R9
  2308. MOVUPS -16(R14), X0
  2309. MOVUPS X0, -16(R9)
  2310. JMP copy_4_end
  2311. copy_4_small:
  2312. CMPQ R13, $0x03
  2313. JE copy_4_move_3
  2314. CMPQ R13, $0x08
  2315. JB copy_4_move_4through7
  2316. JMP copy_4_move_8through16
  2317. copy_4_move_3:
  2318. MOVW (R14), CX
  2319. MOVB 2(R14), R12
  2320. MOVW CX, (R9)
  2321. MOVB R12, 2(R9)
  2322. ADDQ R13, R14
  2323. ADDQ R13, R9
  2324. JMP copy_4_end
  2325. copy_4_move_4through7:
  2326. MOVL (R14), CX
  2327. MOVL -4(R14)(R13*1), R12
  2328. MOVL CX, (R9)
  2329. MOVL R12, -4(R9)(R13*1)
  2330. ADDQ R13, R14
  2331. ADDQ R13, R9
  2332. JMP copy_4_end
  2333. copy_4_move_8through16:
  2334. MOVQ (R14), CX
  2335. MOVQ -8(R14)(R13*1), R12
  2336. MOVQ CX, (R9)
  2337. MOVQ R12, -8(R9)(R13*1)
  2338. ADDQ R13, R14
  2339. ADDQ R13, R9
  2340. copy_4_end:
  2341. ADDQ R13, R11
  2342. JMP handle_loop
  2343. JMP loop_finished
  2344. copy_all_from_history:
  2345. MOVQ CX, R15
  2346. SUBQ $0x10, R15
  2347. JB copy_5_small
  2348. copy_5_loop:
  2349. MOVUPS (R14), X0
  2350. MOVUPS X0, (R9)
  2351. ADDQ $0x10, R14
  2352. ADDQ $0x10, R9
  2353. SUBQ $0x10, R15
  2354. JAE copy_5_loop
  2355. LEAQ 16(R14)(R15*1), R14
  2356. LEAQ 16(R9)(R15*1), R9
  2357. MOVUPS -16(R14), X0
  2358. MOVUPS X0, -16(R9)
  2359. JMP copy_5_end
  2360. copy_5_small:
  2361. CMPQ CX, $0x03
  2362. JE copy_5_move_3
  2363. JB copy_5_move_1or2
  2364. CMPQ CX, $0x08
  2365. JB copy_5_move_4through7
  2366. JMP copy_5_move_8through16
  2367. copy_5_move_1or2:
  2368. MOVB (R14), R15
  2369. MOVB -1(R14)(CX*1), BP
  2370. MOVB R15, (R9)
  2371. MOVB BP, -1(R9)(CX*1)
  2372. ADDQ CX, R14
  2373. ADDQ CX, R9
  2374. JMP copy_5_end
  2375. copy_5_move_3:
  2376. MOVW (R14), R15
  2377. MOVB 2(R14), BP
  2378. MOVW R15, (R9)
  2379. MOVB BP, 2(R9)
  2380. ADDQ CX, R14
  2381. ADDQ CX, R9
  2382. JMP copy_5_end
  2383. copy_5_move_4through7:
  2384. MOVL (R14), R15
  2385. MOVL -4(R14)(CX*1), BP
  2386. MOVL R15, (R9)
  2387. MOVL BP, -4(R9)(CX*1)
  2388. ADDQ CX, R14
  2389. ADDQ CX, R9
  2390. JMP copy_5_end
  2391. copy_5_move_8through16:
  2392. MOVQ (R14), R15
  2393. MOVQ -8(R14)(CX*1), BP
  2394. MOVQ R15, (R9)
  2395. MOVQ BP, -8(R9)(CX*1)
  2396. ADDQ CX, R14
  2397. ADDQ CX, R9
  2398. copy_5_end:
  2399. ADDQ CX, R11
  2400. SUBQ CX, R13
  2401. // Copy match from the current buffer
  2402. copy_match:
  2403. MOVQ R9, CX
  2404. SUBQ R12, CX
  2405. // ml <= mo
  2406. CMPQ R13, R12
  2407. JA copy_overlapping_match
  2408. // Copy non-overlapping match
  2409. ADDQ R13, R11
  2410. MOVQ R9, R12
  2411. ADDQ R13, R9
  2412. copy_2:
  2413. MOVUPS (CX), X0
  2414. MOVUPS X0, (R12)
  2415. ADDQ $0x10, CX
  2416. ADDQ $0x10, R12
  2417. SUBQ $0x10, R13
  2418. JHI copy_2
  2419. JMP handle_loop
  2420. // Copy overlapping match
  2421. copy_overlapping_match:
  2422. ADDQ R13, R11
  2423. copy_slow_3:
  2424. MOVB (CX), R12
  2425. MOVB R12, (R9)
  2426. INCQ CX
  2427. INCQ R9
  2428. DECQ R13
  2429. JNZ copy_slow_3
  2430. handle_loop:
  2431. MOVQ ctx+16(FP), CX
  2432. DECQ 96(CX)
  2433. JNS sequenceDecs_decodeSync_bmi2_main_loop
  2434. loop_finished:
  2435. MOVQ br+8(FP), CX
  2436. MOVQ AX, 32(CX)
  2437. MOVB DL, 40(CX)
  2438. MOVQ BX, 24(CX)
  2439. // Update the context
  2440. MOVQ ctx+16(FP), AX
  2441. MOVQ R11, 136(AX)
  2442. MOVQ 144(AX), CX
  2443. SUBQ CX, R10
  2444. MOVQ R10, 168(AX)
  2445. // Return success
  2446. MOVQ $0x00000000, ret+24(FP)
  2447. RET
  2448. // Return with match length error
  2449. sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2450. MOVQ 16(SP), AX
  2451. MOVQ ctx+16(FP), CX
  2452. MOVQ AX, 216(CX)
  2453. MOVQ $0x00000001, ret+24(FP)
  2454. RET
  2455. // Return with match too long error
  2456. sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2457. MOVQ ctx+16(FP), AX
  2458. MOVQ 16(SP), CX
  2459. MOVQ CX, 216(AX)
  2460. MOVQ $0x00000002, ret+24(FP)
  2461. RET
  2462. // Return with match offset too long error
  2463. error_match_off_too_big:
  2464. MOVQ ctx+16(FP), AX
  2465. MOVQ 8(SP), CX
  2466. MOVQ CX, 224(AX)
  2467. MOVQ R11, 136(AX)
  2468. MOVQ $0x00000003, ret+24(FP)
  2469. RET
  2470. // Return with not enough literals error
  2471. error_not_enough_literals:
  2472. MOVQ ctx+16(FP), AX
  2473. MOVQ 24(SP), CX
  2474. MOVQ CX, 208(AX)
  2475. MOVQ $0x00000004, ret+24(FP)
  2476. RET
  2477. // Return with not enough output space error
  2478. error_not_enough_space:
  2479. MOVQ ctx+16(FP), AX
  2480. MOVQ 24(SP), CX
  2481. MOVQ CX, 208(AX)
  2482. MOVQ 16(SP), CX
  2483. MOVQ CX, 216(AX)
  2484. MOVQ R11, 136(AX)
  2485. MOVQ $0x00000005, ret+24(FP)
  2486. RET
  2487. // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2488. // Requires: CMOV, SSE
  //
  // Decodes sequences in a loop and executes each one right away: the literals
  // are copied from the literal buffer to the output, then the match is copied
  // from the history buffer and/or from the output written so far.
  //
  // Register roles used below:
  //   DX  = 64-bit bit reader value (br+32)
  //   BX  = number of bits of DX already consumed (br+40)
  //   SI  = offset of the input cursor from the input base, counts down (br+24)
  //   DI, R8, R9 = literal length, match length and offset state entries (ctx+72/80/88)
  //   R10 = output write pointer, R11 = literal read pointer
  //   R12 = output position (ctx+136)
  // State entry layout as read by this code:
  //   bits 0-7 = bit count for the next state, bits 8-15 = extra bits of the value,
  //   bits 16-31 = base of the next state, bits 32-63 = base of the value
  // Stack slots:
  //   (SP)   = input pointer (input base + SI)
  //   8(SP)  = match offset, 16(SP) = match length, 24(SP) = literal length
  //   32(SP) = past-end output pointer, 40(SP) = history length
  //   48(SP) = ctx+176 plus ctx+184, used as the end of the history buffer
  //   56(SP) = window size
  // Returns 0 on success, or 1 to 5 from the error exits at the bottom.
  2489. TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2490. MOVQ br+8(FP), AX
  2491. MOVQ 32(AX), DX // DX = bit reader value
  2492. MOVBQZX 40(AX), BX // BX = bits consumed so far
  2493. MOVQ 24(AX), SI // SI = input offset
  2494. MOVQ (AX), AX
  2495. ADDQ SI, AX
  2496. MOVQ AX, (SP) // (SP) = input base + input offset
  2497. MOVQ ctx+16(FP), AX
  2498. MOVQ 72(AX), DI // literal length state entry
  2499. MOVQ 80(AX), R8 // match length state entry
  2500. MOVQ 88(AX), R9 // offset state entry
  2501. XORQ CX, CX
  2502. MOVQ CX, 8(SP) // match offset = 0
  2503. MOVQ CX, 16(SP) // match length = 0
  2504. MOVQ CX, 24(SP) // literal length = 0
  2505. MOVQ 112(AX), R10 // output base pointer
  2506. MOVQ 128(AX), CX
  2507. MOVQ CX, 32(SP) // output capacity, turned into a pointer below
  2508. MOVQ 144(AX), R11 // literal read pointer
  2509. MOVQ 136(AX), R12 // output position
  2510. MOVQ 200(AX), CX
  2511. MOVQ CX, 56(SP) // window size
  2512. MOVQ 176(AX), CX
  2513. MOVQ CX, 48(SP)
  2514. MOVQ 184(AX), AX
  2515. MOVQ AX, 40(SP) // history length
  2516. MOVQ 40(SP), AX
  2517. ADDQ AX, 48(SP) // 48(SP) = ctx+176 + history length
  2518. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2519. ADDQ R10, 32(SP)
  2520. // outBase += outPosition
  2521. ADDQ R12, R10
  2522. sequenceDecs_decodeSync_safe_amd64_main_loop:
  2523. MOVQ (SP), R13 // R13 = input pointer for this iteration
  2524. // Fill bitreader to have enough for the offset and match length.
  2525. CMPQ SI, $0x08
  2526. JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2527. MOVQ BX, AX
  2528. SHRQ $0x03, AX // whole bytes already consumed
  2529. SUBQ AX, R13 // the input is read backwards
  2530. MOVQ (R13), DX // reload 8 bytes at the new position
  2531. SUBQ AX, SI
  2532. ANDQ $0x07, BX // keep only the sub-byte bit position
  2533. JMP sequenceDecs_decodeSync_safe_amd64_fill_end
  2534. sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2535. CMPQ SI, $0x00
  2536. JLE sequenceDecs_decodeSync_safe_amd64_fill_end // no input left
  2537. CMPQ BX, $0x07
  2538. JLE sequenceDecs_decodeSync_safe_amd64_fill_end // less than one whole byte consumed
  2539. SHLQ $0x08, DX
  2540. SUBQ $0x01, R13
  2541. SUBQ $0x01, SI
  2542. SUBQ $0x08, BX
  2543. MOVBQZX (R13), AX
  2544. ORQ AX, DX // shift one more input byte into the low end
  2545. JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2546. sequenceDecs_decodeSync_safe_amd64_fill_end:
  2547. // Update offset: 8(SP) = base value of the offset state + extra bits from the bit reader
  2548. MOVQ R9, AX
  2549. MOVQ BX, CX
  2550. MOVQ DX, R14
  2551. SHLQ CL, R14 // drop the bits already consumed
  2552. MOVB AH, CL // CL = bits 8-15 of the entry: number of extra bits to read
  2553. SHRQ $0x20, AX // AX = upper 32 bits of the entry: the base value
  2554. TESTQ CX, CX
  2555. JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero // no extra bits, the base is the value
  2556. ADDQ CX, BX // count the bits as consumed
  2557. CMPQ BX, $0x40
  2558. JA sequenceDecs_decodeSync_safe_amd64_of_update_zero // more than 64 bits consumed: add nothing
  2559. CMPQ CX, $0x40
  2560. JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2561. NEGQ CX
  2562. SHRQ CL, R14 // the shift count is taken mod 64, so this keeps the top n bits
  2563. ADDQ R14, AX
  2564. sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2565. MOVQ AX, 8(SP)
  2566. // Update match length: same steps as for the offset, using the match length state in R8
  2567. MOVQ R8, AX
  2568. MOVQ BX, CX
  2569. MOVQ DX, R14
  2570. SHLQ CL, R14
  2571. MOVB AH, CL
  2572. SHRQ $0x20, AX
  2573. TESTQ CX, CX
  2574. JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2575. ADDQ CX, BX
  2576. CMPQ BX, $0x40
  2577. JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2578. CMPQ CX, $0x40
  2579. JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2580. NEGQ CX
  2581. SHRQ CL, R14
  2582. ADDQ R14, AX
  2583. sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2584. MOVQ AX, 16(SP)
  2585. // Fill bitreader to have enough for the remaining
  2586. CMPQ SI, $0x08
  2587. JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2588. MOVQ BX, AX
  2589. SHRQ $0x03, AX
  2590. SUBQ AX, R13
  2591. MOVQ (R13), DX
  2592. SUBQ AX, SI
  2593. ANDQ $0x07, BX
  2594. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2595. sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2596. CMPQ SI, $0x00
  2597. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2598. CMPQ BX, $0x07
  2599. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2600. SHLQ $0x08, DX
  2601. SUBQ $0x01, R13
  2602. SUBQ $0x01, SI
  2603. SUBQ $0x08, BX
  2604. MOVBQZX (R13), AX
  2605. ORQ AX, DX
  2606. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2607. sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  2608. // Update literal length: same steps again, using the literal length state in DI
  2609. MOVQ DI, AX
  2610. MOVQ BX, CX
  2611. MOVQ DX, R14
  2612. SHLQ CL, R14
  2613. MOVB AH, CL
  2614. SHRQ $0x20, AX
  2615. TESTQ CX, CX
  2616. JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2617. ADDQ CX, BX
  2618. CMPQ BX, $0x40
  2619. JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2620. CMPQ CX, $0x40
  2621. JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2622. NEGQ CX
  2623. SHRQ CL, R14
  2624. ADDQ R14, AX
  2625. sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  2626. MOVQ AX, 24(SP)
  2627. // Fill bitreader for state updates
  2628. MOVQ R13, (SP) // save the input pointer, R13 is scratch from here on
  2629. MOVQ R9, AX
  2630. SHRQ $0x08, AX
  2631. MOVBQZX AL, AX // AX = extra-bit count of the offset state, tested in the offset adjustment
  2632. MOVQ ctx+16(FP), CX
  2633. CMPQ 96(CX), $0x00
  2634. JZ sequenceDecs_decodeSync_safe_amd64_skip_update // counter at ctx+96 is zero: last pass, no next state needed
  2635. // Update Literal Length State
  2636. MOVBQZX DI, R13 // R13 = low byte of the entry: bits to read for the next state
  2637. SHRQ $0x10, DI
  2638. MOVWQZX DI, DI // DI = bits 16-31 of the entry: base of the next state
  2639. LEAQ (BX)(R13*1), CX
  2640. MOVQ DX, R14
  2641. MOVQ CX, BX // bits consumed += R13
  2642. ROLQ CL, R14 // rotate the newly consumed bits into the low end
  2643. MOVL $0x00000001, R15
  2644. MOVB R13, CL
  2645. SHLL CL, R15
  2646. DECL R15 // R15 = (1 << R13) - 1
  2647. ANDQ R15, R14
  2648. ADDQ R14, DI // index of the next state
  2649. // Load ctx.llTable
  2650. MOVQ ctx+16(FP), CX
  2651. MOVQ (CX), CX
  2652. MOVQ (CX)(DI*8), DI // table entries are 8 bytes each
  2653. // Update Match Length State
  2654. MOVBQZX R8, R13
  2655. SHRQ $0x10, R8
  2656. MOVWQZX R8, R8
  2657. LEAQ (BX)(R13*1), CX
  2658. MOVQ DX, R14
  2659. MOVQ CX, BX
  2660. ROLQ CL, R14
  2661. MOVL $0x00000001, R15
  2662. MOVB R13, CL
  2663. SHLL CL, R15
  2664. DECL R15
  2665. ANDQ R15, R14
  2666. ADDQ R14, R8
  2667. // Load ctx.mlTable
  2668. MOVQ ctx+16(FP), CX
  2669. MOVQ 24(CX), CX
  2670. MOVQ (CX)(R8*8), R8
  2671. // Update Offset State
  2672. MOVBQZX R9, R13
  2673. SHRQ $0x10, R9
  2674. MOVWQZX R9, R9
  2675. LEAQ (BX)(R13*1), CX
  2676. MOVQ DX, R14
  2677. MOVQ CX, BX
  2678. ROLQ CL, R14
  2679. MOVL $0x00000001, R15
  2680. MOVB R13, CL
  2681. SHLL CL, R15
  2682. DECL R15
  2683. ANDQ R15, R14
  2684. ADDQ R14, R9
  2685. // Load ctx.ofTable
  2686. MOVQ ctx+16(FP), CX
  2687. MOVQ 48(CX), CX
  2688. MOVQ (CX)(R9*8), R9
  2689. sequenceDecs_decodeSync_safe_amd64_skip_update:
  2690. // Adjust offset using the three 8-byte slots at s+144, s+152 and s+160
  2691. // NOTE(review): presumably these slots are the recent-offset history of s, confirm against the Go struct
  2692. MOVQ s+0(FP), CX
  2693. MOVQ 8(SP), R13 // R13 = decoded match offset
  2694. CMPQ AX, $0x01
  2695. JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
  2696. MOVUPS 144(CX), X0 // more than one extra bit: push the new offset to the front
  2697. MOVQ R13, 144(CX)
  2698. MOVUPS X0, 152(CX) // the old first two slots move down by one
  2699. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2700. sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  2701. CMPQ 24(SP), $0x00000000
  2702. JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  2703. INCQ R13 // literal length is zero: the offset code is shifted by one
  2704. JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2705. sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  2706. TESTQ R13, R13
  2707. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2708. MOVQ 144(CX), R13 // offset code zero: reuse the first slot unchanged
  2709. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2710. sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  2711. MOVQ R13, AX
  2712. XORQ R14, R14
  2713. MOVQ $-1, R15
  2714. CMPQ R13, $0x03
  2715. CMOVQEQ R14, AX // code 3: use slot 0 ...
  2716. CMOVQEQ R15, R14 // ... minus one
  2717. ADDQ 144(CX)(AX*8), R14 // R14 = slot[AX] plus the 0 or -1 correction
  2718. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  2719. MOVQ $0x00000001, R14 // a zero result is replaced by 1
  2720. sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  2721. CMPQ R13, $0x01
  2722. JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip // code 1 leaves the third slot alone
  2723. MOVQ 152(CX), AX
  2724. MOVQ AX, 160(CX)
  2725. sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  2726. MOVQ 144(CX), AX
  2727. MOVQ AX, 152(CX)
  2728. MOVQ R14, 144(CX)
  2729. MOVQ R14, R13
  2730. sequenceDecs_decodeSync_safe_amd64_after_adjust:
  2731. MOVQ R13, 8(SP) // store the final match offset
  2732. // Check values
  2733. MOVQ 16(SP), AX // AX = match length
  2734. MOVQ 24(SP), CX // CX = literal length
  2735. LEAQ (AX)(CX*1), R14
  2736. MOVQ s+0(FP), R15
  2737. ADDQ R14, 256(R15) // s+256 accumulates literal length + match length, presumably s.seqSize (TODO confirm)
  2738. MOVQ ctx+16(FP), R14
  2739. SUBQ CX, 104(R14) // ctx+104 is reduced by the literal length
  2740. JS error_not_enough_literals // a negative result is reported as error 4
  2741. CMPQ AX, $0x00020002
  2742. JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
  2743. TESTQ R13, R13
  2744. JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  2745. TESTQ AX, AX
  2746. JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch // offset zero with a nonzero match length
  2747. sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  2748. MOVQ 24(SP), AX // AX = literal length
  2749. MOVQ 8(SP), CX // CX = match offset
  2750. MOVQ 16(SP), R13 // R13 = match length
  2751. // Check if we have enough space in s.out
  2752. LEAQ (AX)(R13*1), R14
  2753. ADDQ R10, R14 // R14 = write pointer after this sequence
  2754. CMPQ R14, 32(SP)
  2755. JA error_not_enough_space
  2756. // Copy literals
  2757. TESTQ AX, AX
  2758. JZ check_offset
  2759. MOVQ AX, R14
  2760. SUBQ $0x10, R14
  2761. JB copy_1_small // fewer than 16 bytes
  2762. copy_1_loop:
  2763. MOVUPS (R11), X0
  2764. MOVUPS X0, (R10)
  2765. ADDQ $0x10, R11
  2766. ADDQ $0x10, R10
  2767. SUBQ $0x10, R14
  2768. JAE copy_1_loop // loop while at least 16 more bytes remain
  2769. LEAQ 16(R11)(R14*1), R11 // R14 is now in -16..-1, so this lands on the end of the source
  2770. LEAQ 16(R10)(R14*1), R10
  2771. MOVUPS -16(R11), X0 // final 16 bytes, possibly overlapping the previous store
  2772. MOVUPS X0, -16(R10)
  2773. JMP copy_1_end
  2774. copy_1_small:
  2775. CMPQ AX, $0x03
  2776. JE copy_1_move_3
  2777. JB copy_1_move_1or2
  2778. CMPQ AX, $0x08
  2779. JB copy_1_move_4through7
  2780. JMP copy_1_move_8through16
  2781. copy_1_move_1or2:
  2782. MOVB (R11), R14 // first byte
  2783. MOVB -1(R11)(AX*1), R15 // last byte, the same byte when the length is 1
  2784. MOVB R14, (R10)
  2785. MOVB R15, -1(R10)(AX*1)
  2786. ADDQ AX, R11
  2787. ADDQ AX, R10
  2788. JMP copy_1_end
  2789. copy_1_move_3:
  2790. MOVW (R11), R14
  2791. MOVB 2(R11), R15
  2792. MOVW R14, (R10)
  2793. MOVB R15, 2(R10)
  2794. ADDQ AX, R11
  2795. ADDQ AX, R10
  2796. JMP copy_1_end
  2797. copy_1_move_4through7:
  2798. MOVL (R11), R14 // first 4 bytes
  2799. MOVL -4(R11)(AX*1), R15 // last 4 bytes, overlapping the first load
  2800. MOVL R14, (R10)
  2801. MOVL R15, -4(R10)(AX*1)
  2802. ADDQ AX, R11
  2803. ADDQ AX, R10
  2804. JMP copy_1_end
  2805. copy_1_move_8through16:
  2806. MOVQ (R11), R14 // first 8 bytes
  2807. MOVQ -8(R11)(AX*1), R15 // last 8 bytes, overlapping the first load
  2808. MOVQ R14, (R10)
  2809. MOVQ R15, -8(R10)(AX*1)
  2810. ADDQ AX, R11
  2811. ADDQ AX, R10
  2812. copy_1_end:
  2813. ADDQ AX, R12 // output position += literal length
  2814. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2815. check_offset:
  2816. MOVQ R12, AX
  2817. ADDQ 40(SP), AX
  2818. CMPQ CX, AX
  2819. JG error_match_off_too_big
  2820. CMPQ CX, 56(SP)
  2821. JG error_match_off_too_big
  2822. // Copy match from history
  2823. MOVQ CX, AX
  2824. SUBQ R12, AX // AX = how far the offset reaches before the start of the output
  2825. JLS copy_match // offset <= output position: the match lies entirely in the output
  2826. MOVQ 48(SP), R14
  2827. SUBQ AX, R14 // R14 = source pointer, AX bytes before the end of the history
  2828. CMPQ R13, AX
  2829. JG copy_all_from_history // match is longer than what the history provides
  2830. MOVQ R13, AX
  2831. SUBQ $0x10, AX
  2832. JB copy_4_small
  2833. copy_4_loop:
  2834. MOVUPS (R14), X0
  2835. MOVUPS X0, (R10)
  2836. ADDQ $0x10, R14
  2837. ADDQ $0x10, R10
  2838. SUBQ $0x10, AX
  2839. JAE copy_4_loop
  2840. LEAQ 16(R14)(AX*1), R14
  2841. LEAQ 16(R10)(AX*1), R10
  2842. MOVUPS -16(R14), X0
  2843. MOVUPS X0, -16(R10)
  2844. JMP copy_4_end
  2845. copy_4_small:
  2846. // NOTE(review): there is no 1-2 byte case on this path, presumably match lengths are at least 3 here, confirm
  2847. CMPQ R13, $0x03
  2848. JE copy_4_move_3
  2849. CMPQ R13, $0x08
  2850. JB copy_4_move_4through7
  2851. JMP copy_4_move_8through16
  2852. copy_4_move_3:
  2853. MOVW (R14), AX
  2854. MOVB 2(R14), CL
  2855. MOVW AX, (R10)
  2856. MOVB CL, 2(R10)
  2857. ADDQ R13, R14
  2858. ADDQ R13, R10
  2859. JMP copy_4_end
  2860. copy_4_move_4through7:
  2861. MOVL (R14), AX
  2862. MOVL -4(R14)(R13*1), CX
  2863. MOVL AX, (R10)
  2864. MOVL CX, -4(R10)(R13*1)
  2865. ADDQ R13, R14
  2866. ADDQ R13, R10
  2867. JMP copy_4_end
  2868. copy_4_move_8through16:
  2869. MOVQ (R14), AX
  2870. MOVQ -8(R14)(R13*1), CX
  2871. MOVQ AX, (R10)
  2872. MOVQ CX, -8(R10)(R13*1)
  2873. ADDQ R13, R14
  2874. ADDQ R13, R10
  2875. copy_4_end:
  2876. ADDQ R13, R12 // output position += match length
  2877. JMP handle_loop
  2878. JMP loop_finished // never reached: it follows an unconditional jump and has no label
  2879. copy_all_from_history:
  2880. MOVQ AX, R15 // copy the AX bytes available in the history first
  2881. SUBQ $0x10, R15
  2882. JB copy_5_small
  2883. copy_5_loop:
  2884. MOVUPS (R14), X0
  2885. MOVUPS X0, (R10)
  2886. ADDQ $0x10, R14
  2887. ADDQ $0x10, R10
  2888. SUBQ $0x10, R15
  2889. JAE copy_5_loop
  2890. LEAQ 16(R14)(R15*1), R14
  2891. LEAQ 16(R10)(R15*1), R10
  2892. MOVUPS -16(R14), X0
  2893. MOVUPS X0, -16(R10)
  2894. JMP copy_5_end
  2895. copy_5_small:
  2896. CMPQ AX, $0x03
  2897. JE copy_5_move_3
  2898. JB copy_5_move_1or2
  2899. CMPQ AX, $0x08
  2900. JB copy_5_move_4through7
  2901. JMP copy_5_move_8through16
  2902. copy_5_move_1or2:
  2903. MOVB (R14), R15
  2904. MOVB -1(R14)(AX*1), BP // BP is used as a plain scratch register in the copy_5 paths
  2905. MOVB R15, (R10)
  2906. MOVB BP, -1(R10)(AX*1)
  2907. ADDQ AX, R14
  2908. ADDQ AX, R10
  2909. JMP copy_5_end
  2910. copy_5_move_3:
  2911. MOVW (R14), R15
  2912. MOVB 2(R14), BP
  2913. MOVW R15, (R10)
  2914. MOVB BP, 2(R10)
  2915. ADDQ AX, R14
  2916. ADDQ AX, R10
  2917. JMP copy_5_end
  2918. copy_5_move_4through7:
  2919. MOVL (R14), R15
  2920. MOVL -4(R14)(AX*1), BP
  2921. MOVL R15, (R10)
  2922. MOVL BP, -4(R10)(AX*1)
  2923. ADDQ AX, R14
  2924. ADDQ AX, R10
  2925. JMP copy_5_end
  2926. copy_5_move_8through16:
  2927. MOVQ (R14), R15
  2928. MOVQ -8(R14)(AX*1), BP
  2929. MOVQ R15, (R10)
  2930. MOVQ BP, -8(R10)(AX*1)
  2931. ADDQ AX, R14
  2932. ADDQ AX, R10
  2933. copy_5_end:
  2934. ADDQ AX, R12 // output position += bytes taken from the history
  2935. SUBQ AX, R13 // the rest of the match comes from the output, below
  2936. // Copy match from the current buffer
  2937. copy_match:
  2938. MOVQ R10, AX
  2939. SUBQ CX, AX // AX = source pointer: write pointer minus match offset
  2940. // ml <= mo
  2941. CMPQ R13, CX
  2942. JA copy_overlapping_match // longer than the offset: source and destination overlap
  2943. // Copy non-overlapping match
  2944. ADDQ R13, R12 // output position += match length
  2945. MOVQ R13, CX
  2946. SUBQ $0x10, CX
  2947. JB copy_2_small
  2948. copy_2_loop:
  2949. MOVUPS (AX), X0
  2950. MOVUPS X0, (R10)
  2951. ADDQ $0x10, AX
  2952. ADDQ $0x10, R10
  2953. SUBQ $0x10, CX
  2954. JAE copy_2_loop
  2955. LEAQ 16(AX)(CX*1), AX
  2956. LEAQ 16(R10)(CX*1), R10
  2957. MOVUPS -16(AX), X0
  2958. MOVUPS X0, -16(R10)
  2959. JMP copy_2_end
  2960. copy_2_small:
  2961. CMPQ R13, $0x03
  2962. JE copy_2_move_3
  2963. JB copy_2_move_1or2
  2964. CMPQ R13, $0x08
  2965. JB copy_2_move_4through7
  2966. JMP copy_2_move_8through16
  2967. copy_2_move_1or2:
  2968. MOVB (AX), CL
  2969. MOVB -1(AX)(R13*1), R14
  2970. MOVB CL, (R10)
  2971. MOVB R14, -1(R10)(R13*1)
  2972. ADDQ R13, AX
  2973. ADDQ R13, R10
  2974. JMP copy_2_end
  2975. copy_2_move_3:
  2976. MOVW (AX), CX
  2977. MOVB 2(AX), R14
  2978. MOVW CX, (R10)
  2979. MOVB R14, 2(R10)
  2980. ADDQ R13, AX
  2981. ADDQ R13, R10
  2982. JMP copy_2_end
  2983. copy_2_move_4through7:
  2984. MOVL (AX), CX
  2985. MOVL -4(AX)(R13*1), R14
  2986. MOVL CX, (R10)
  2987. MOVL R14, -4(R10)(R13*1)
  2988. ADDQ R13, AX
  2989. ADDQ R13, R10
  2990. JMP copy_2_end
  2991. copy_2_move_8through16:
  2992. MOVQ (AX), CX
  2993. MOVQ -8(AX)(R13*1), R14
  2994. MOVQ CX, (R10)
  2995. MOVQ R14, -8(R10)(R13*1)
  2996. ADDQ R13, AX
  2997. ADDQ R13, R10
  2998. copy_2_end:
  2999. JMP handle_loop
  3000. // Copy overlapping match
  3001. copy_overlapping_match:
  3002. ADDQ R13, R12 // output position += match length
  3003. copy_slow_3:
  3004. MOVB (AX), CL // one byte at a time, so bytes written earlier in this loop can be read back
  3005. MOVB CL, (R10)
  3006. INCQ AX
  3007. INCQ R10
  3008. DECQ R13
  3009. JNZ copy_slow_3
  3010. handle_loop:
  3011. MOVQ ctx+16(FP), AX
  3012. DECQ 96(AX) // count down the counter at ctx+96
  3013. JNS sequenceDecs_decodeSync_safe_amd64_main_loop // continue until it goes negative
  3014. loop_finished:
  3015. MOVQ br+8(FP), AX // write the bit reader registers back to br
  3016. MOVQ DX, 32(AX)
  3017. MOVB BL, 40(AX)
  3018. MOVQ SI, 24(AX)
  3019. // Update the context
  3020. MOVQ ctx+16(FP), AX
  3021. MOVQ R12, 136(AX) // output position
  3022. MOVQ 144(AX), CX
  3023. SUBQ CX, R11
  3024. MOVQ R11, 168(AX) // ctx+168 = literal bytes consumed (literal pointer minus its start at ctx+144)
  3025. // Return success
  3026. MOVQ $0x00000000, ret+24(FP)
  3027. RET
  3028. // Return with match length error (code 1), match length stored at ctx+216
  3029. sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3030. MOVQ 16(SP), AX
  3031. MOVQ ctx+16(FP), CX
  3032. MOVQ AX, 216(CX)
  3033. MOVQ $0x00000001, ret+24(FP)
  3034. RET
  3035. // Return with match too long error (code 2), match length stored at ctx+216
  3036. sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3037. MOVQ ctx+16(FP), AX
  3038. MOVQ 16(SP), CX
  3039. MOVQ CX, 216(AX)
  3040. MOVQ $0x00000002, ret+24(FP)
  3041. RET
  3042. // Return with match offset too long error (code 3), offset at ctx+224 and output position at ctx+136
  3043. error_match_off_too_big:
  3044. MOVQ ctx+16(FP), AX
  3045. MOVQ 8(SP), CX
  3046. MOVQ CX, 224(AX)
  3047. MOVQ R12, 136(AX)
  3048. MOVQ $0x00000003, ret+24(FP)
  3049. RET
  3050. // Return with not enough literals error (code 4), literal length stored at ctx+208
  3051. error_not_enough_literals:
  3052. MOVQ ctx+16(FP), AX
  3053. MOVQ 24(SP), CX
  3054. MOVQ CX, 208(AX)
  3055. MOVQ $0x00000004, ret+24(FP)
  3056. RET
  3057. // Return with not enough output space error (code 5), lengths at ctx+208/216 and output position at ctx+136
  3058. error_not_enough_space:
  3059. MOVQ ctx+16(FP), AX
  3060. MOVQ 24(SP), CX
  3061. MOVQ CX, 208(AX)
  3062. MOVQ 16(SP), CX
  3063. MOVQ CX, 216(AX)
  3064. MOVQ R12, 136(AX)
  3065. MOVQ $0x00000005, ret+24(FP)
  3066. RET
  3065. // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3066. // Requires: BMI, BMI2, CMOV, SSE
  3067. TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  3068. MOVQ br+8(FP), CX
  3069. MOVQ 32(CX), AX
  3070. MOVBQZX 40(CX), DX
  3071. MOVQ 24(CX), BX
  3072. MOVQ (CX), CX
  3073. ADDQ BX, CX
  3074. MOVQ CX, (SP)
  3075. MOVQ ctx+16(FP), CX
  3076. MOVQ 72(CX), SI
  3077. MOVQ 80(CX), DI
  3078. MOVQ 88(CX), R8
  3079. XORQ R9, R9
  3080. MOVQ R9, 8(SP)
  3081. MOVQ R9, 16(SP)
  3082. MOVQ R9, 24(SP)
  3083. MOVQ 112(CX), R9
  3084. MOVQ 128(CX), R10
  3085. MOVQ R10, 32(SP)
  3086. MOVQ 144(CX), R10
  3087. MOVQ 136(CX), R11
  3088. MOVQ 200(CX), R12
  3089. MOVQ R12, 56(SP)
  3090. MOVQ 176(CX), R12
  3091. MOVQ R12, 48(SP)
  3092. MOVQ 184(CX), CX
  3093. MOVQ CX, 40(SP)
  3094. MOVQ 40(SP), CX
  3095. ADDQ CX, 48(SP)
  3096. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3097. ADDQ R9, 32(SP)
  3098. // outBase += outPosition
  3099. ADDQ R11, R9
  3100. sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3101. MOVQ (SP), R12
  3102. // Fill bitreader to have enough for the offset and match length.
  3103. CMPQ BX, $0x08
  3104. JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3105. MOVQ DX, CX
  3106. SHRQ $0x03, CX
  3107. SUBQ CX, R12
  3108. MOVQ (R12), AX
  3109. SUBQ CX, BX
  3110. ANDQ $0x07, DX
  3111. JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
  3112. sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3113. CMPQ BX, $0x00
  3114. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3115. CMPQ DX, $0x07
  3116. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3117. SHLQ $0x08, AX
  3118. SUBQ $0x01, R12
  3119. SUBQ $0x01, BX
  3120. SUBQ $0x08, DX
  3121. MOVBQZX (R12), CX
  3122. ORQ CX, AX
  3123. JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3124. sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3125. // Update offset
  3126. MOVQ $0x00000808, CX
  3127. BEXTRQ CX, R8, R13
  3128. MOVQ AX, R14
  3129. LEAQ (DX)(R13*1), CX
  3130. ROLQ CL, R14
  3131. BZHIQ R13, R14, R14
  3132. MOVQ CX, DX
  3133. MOVQ R8, CX
  3134. SHRQ $0x20, CX
  3135. ADDQ R14, CX
  3136. MOVQ CX, 8(SP)
  3137. // Update match length
  3138. MOVQ $0x00000808, CX
  3139. BEXTRQ CX, DI, R13
  3140. MOVQ AX, R14
  3141. LEAQ (DX)(R13*1), CX
  3142. ROLQ CL, R14
  3143. BZHIQ R13, R14, R14
  3144. MOVQ CX, DX
  3145. MOVQ DI, CX
  3146. SHRQ $0x20, CX
  3147. ADDQ R14, CX
  3148. MOVQ CX, 16(SP)
  3149. // Fill bitreader to have enough for the remaining
  3150. CMPQ BX, $0x08
  3151. JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3152. MOVQ DX, CX
  3153. SHRQ $0x03, CX
  3154. SUBQ CX, R12
  3155. MOVQ (R12), AX
  3156. SUBQ CX, BX
  3157. ANDQ $0x07, DX
  3158. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3159. sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3160. CMPQ BX, $0x00
  3161. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3162. CMPQ DX, $0x07
  3163. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3164. SHLQ $0x08, AX
  3165. SUBQ $0x01, R12
  3166. SUBQ $0x01, BX
  3167. SUBQ $0x08, DX
  3168. MOVBQZX (R12), CX
  3169. ORQ CX, AX
  3170. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3171. sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3172. // Update literal length
  3173. MOVQ $0x00000808, CX
  3174. BEXTRQ CX, SI, R13
  3175. MOVQ AX, R14
  3176. LEAQ (DX)(R13*1), CX
  3177. ROLQ CL, R14
  3178. BZHIQ R13, R14, R14
  3179. MOVQ CX, DX
  3180. MOVQ SI, CX
  3181. SHRQ $0x20, CX
  3182. ADDQ R14, CX
  3183. MOVQ CX, 24(SP)
  3184. // Fill bitreader for state updates
  3185. MOVQ R12, (SP)
  3186. MOVQ $0x00000808, CX
  3187. BEXTRQ CX, R8, R12
  3188. MOVQ ctx+16(FP), CX
  3189. CMPQ 96(CX), $0x00
  3190. JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
  3191. LEAQ (SI)(DI*1), R13
  3192. ADDQ R8, R13
  3193. MOVBQZX R13, R13
  3194. LEAQ (DX)(R13*1), CX
  3195. MOVQ AX, R14
  3196. MOVQ CX, DX
  3197. ROLQ CL, R14
  3198. BZHIQ R13, R14, R14
  3199. // Update Offset State
  3200. BZHIQ R8, R14, CX
  3201. SHRXQ R8, R14, R14
  3202. MOVQ $0x00001010, R13
  3203. BEXTRQ R13, R8, R8
  3204. ADDQ CX, R8
  3205. // Load ctx.ofTable
  3206. MOVQ ctx+16(FP), CX
  3207. MOVQ 48(CX), CX
  3208. MOVQ (CX)(R8*8), R8
  3209. // Update Match Length State
  3210. BZHIQ DI, R14, CX
  3211. SHRXQ DI, R14, R14
  3212. MOVQ $0x00001010, R13
  3213. BEXTRQ R13, DI, DI
  3214. ADDQ CX, DI
  3215. // Load ctx.mlTable
  3216. MOVQ ctx+16(FP), CX
  3217. MOVQ 24(CX), CX
  3218. MOVQ (CX)(DI*8), DI
  3219. // Update Literal Length State
  3220. BZHIQ SI, R14, CX
  3221. MOVQ $0x00001010, R13
  3222. BEXTRQ R13, SI, SI
  3223. ADDQ CX, SI
  3224. // Load ctx.llTable
  3225. MOVQ ctx+16(FP), CX
  3226. MOVQ (CX), CX
  3227. MOVQ (CX)(SI*8), SI
  3228. sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3229. // Adjust offset
  3230. MOVQ s+0(FP), CX
  3231. MOVQ 8(SP), R13
  3232. CMPQ R12, $0x01
  3233. JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3234. MOVUPS 144(CX), X0
  3235. MOVQ R13, 144(CX)
  3236. MOVUPS X0, 152(CX)
  3237. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3238. sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3239. CMPQ 24(SP), $0x00000000
  3240. JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3241. INCQ R13
  3242. JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3243. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3244. TESTQ R13, R13
  3245. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3246. MOVQ 144(CX), R13
  3247. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3248. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3249. MOVQ R13, R12
  3250. XORQ R14, R14
  3251. MOVQ $-1, R15
  3252. CMPQ R13, $0x03
  3253. CMOVQEQ R14, R12
  3254. CMOVQEQ R15, R14
  3255. ADDQ 144(CX)(R12*8), R14
  3256. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3257. MOVQ $0x00000001, R14
  3258. sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3259. CMPQ R13, $0x01
  3260. JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3261. MOVQ 152(CX), R12
  3262. MOVQ R12, 160(CX)
  3263. sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3264. MOVQ 144(CX), R12
  3265. MOVQ R12, 152(CX)
  3266. MOVQ R14, 144(CX)
  3267. MOVQ R14, R13
  3268. sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3269. MOVQ R13, 8(SP)
  3270. // Check values
  3271. MOVQ 16(SP), CX
  3272. MOVQ 24(SP), R12
  3273. LEAQ (CX)(R12*1), R14
  3274. MOVQ s+0(FP), R15
  3275. ADDQ R14, 256(R15)
  3276. MOVQ ctx+16(FP), R14
  3277. SUBQ R12, 104(R14)
  3278. JS error_not_enough_literals
  3279. CMPQ CX, $0x00020002
  3280. JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3281. TESTQ R13, R13
  3282. JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3283. TESTQ CX, CX
  3284. JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3285. sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3286. MOVQ 24(SP), CX
  3287. MOVQ 8(SP), R12
  3288. MOVQ 16(SP), R13
  3289. // Check if we have enough space in s.out
  3290. LEAQ (CX)(R13*1), R14
  3291. ADDQ R9, R14
  3292. CMPQ R14, 32(SP)
  3293. JA error_not_enough_space
  3294. // Copy literals
  3295. TESTQ CX, CX
  3296. JZ check_offset
  3297. MOVQ CX, R14
  3298. SUBQ $0x10, R14
  3299. JB copy_1_small
  3300. copy_1_loop:
  3301. MOVUPS (R10), X0
  3302. MOVUPS X0, (R9)
  3303. ADDQ $0x10, R10
  3304. ADDQ $0x10, R9
  3305. SUBQ $0x10, R14
  3306. JAE copy_1_loop
  3307. LEAQ 16(R10)(R14*1), R10
  3308. LEAQ 16(R9)(R14*1), R9
  3309. MOVUPS -16(R10), X0
  3310. MOVUPS X0, -16(R9)
  3311. JMP copy_1_end
  3312. copy_1_small:
  3313. CMPQ CX, $0x03
  3314. JE copy_1_move_3
  3315. JB copy_1_move_1or2
  3316. CMPQ CX, $0x08
  3317. JB copy_1_move_4through7
  3318. JMP copy_1_move_8through16
  3319. copy_1_move_1or2:
  3320. MOVB (R10), R14
  3321. MOVB -1(R10)(CX*1), R15
  3322. MOVB R14, (R9)
  3323. MOVB R15, -1(R9)(CX*1)
  3324. ADDQ CX, R10
  3325. ADDQ CX, R9
  3326. JMP copy_1_end
  3327. copy_1_move_3:
  3328. MOVW (R10), R14
  3329. MOVB 2(R10), R15
  3330. MOVW R14, (R9)
  3331. MOVB R15, 2(R9)
  3332. ADDQ CX, R10
  3333. ADDQ CX, R9
  3334. JMP copy_1_end
  3335. copy_1_move_4through7:
  3336. MOVL (R10), R14
  3337. MOVL -4(R10)(CX*1), R15
  3338. MOVL R14, (R9)
  3339. MOVL R15, -4(R9)(CX*1)
  3340. ADDQ CX, R10
  3341. ADDQ CX, R9
  3342. JMP copy_1_end
  3343. copy_1_move_8through16:
  3344. MOVQ (R10), R14
  3345. MOVQ -8(R10)(CX*1), R15
  3346. MOVQ R14, (R9)
  3347. MOVQ R15, -8(R9)(CX*1)
  3348. ADDQ CX, R10
  3349. ADDQ CX, R9
  3350. copy_1_end:
  3351. ADDQ CX, R11
  3352. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  3353. check_offset:
  3354. MOVQ R11, CX
  3355. ADDQ 40(SP), CX
  3356. CMPQ R12, CX
  3357. JG error_match_off_too_big
  3358. CMPQ R12, 56(SP)
  3359. JG error_match_off_too_big
  3360. // Copy match from history
  3361. MOVQ R12, CX
  3362. SUBQ R11, CX
  3363. JLS copy_match
  3364. MOVQ 48(SP), R14
  3365. SUBQ CX, R14
  3366. CMPQ R13, CX
  3367. JG copy_all_from_history
  3368. MOVQ R13, CX
  3369. SUBQ $0x10, CX
  3370. JB copy_4_small
  3371. copy_4_loop:
  3372. MOVUPS (R14), X0
  3373. MOVUPS X0, (R9)
  3374. ADDQ $0x10, R14
  3375. ADDQ $0x10, R9
  3376. SUBQ $0x10, CX
  3377. JAE copy_4_loop
  3378. LEAQ 16(R14)(CX*1), R14
  3379. LEAQ 16(R9)(CX*1), R9
  3380. MOVUPS -16(R14), X0
  3381. MOVUPS X0, -16(R9)
  3382. JMP copy_4_end
  3383. copy_4_small:
  3384. CMPQ R13, $0x03
  3385. JE copy_4_move_3
  3386. CMPQ R13, $0x08
  3387. JB copy_4_move_4through7
  3388. JMP copy_4_move_8through16
  3389. copy_4_move_3:
  3390. MOVW (R14), CX
  3391. MOVB 2(R14), R12
  3392. MOVW CX, (R9)
  3393. MOVB R12, 2(R9)
  3394. ADDQ R13, R14
  3395. ADDQ R13, R9
  3396. JMP copy_4_end
  3397. copy_4_move_4through7:
  3398. MOVL (R14), CX
  3399. MOVL -4(R14)(R13*1), R12
  3400. MOVL CX, (R9)
  3401. MOVL R12, -4(R9)(R13*1)
  3402. ADDQ R13, R14
  3403. ADDQ R13, R9
  3404. JMP copy_4_end
  3405. copy_4_move_8through16:
  3406. MOVQ (R14), CX
  3407. MOVQ -8(R14)(R13*1), R12
  3408. MOVQ CX, (R9)
  3409. MOVQ R12, -8(R9)(R13*1)
  3410. ADDQ R13, R14
  3411. ADDQ R13, R9
  3412. copy_4_end:
  3413. ADDQ R13, R11
  3414. JMP handle_loop
  3415. JMP loop_finished
  3416. copy_all_from_history:
  3417. MOVQ CX, R15
  3418. SUBQ $0x10, R15
  3419. JB copy_5_small
  3420. copy_5_loop:
  3421. MOVUPS (R14), X0
  3422. MOVUPS X0, (R9)
  3423. ADDQ $0x10, R14
  3424. ADDQ $0x10, R9
  3425. SUBQ $0x10, R15
  3426. JAE copy_5_loop
  3427. LEAQ 16(R14)(R15*1), R14
  3428. LEAQ 16(R9)(R15*1), R9
  3429. MOVUPS -16(R14), X0
  3430. MOVUPS X0, -16(R9)
  3431. JMP copy_5_end
  3432. copy_5_small:
  3433. CMPQ CX, $0x03
  3434. JE copy_5_move_3
  3435. JB copy_5_move_1or2
  3436. CMPQ CX, $0x08
  3437. JB copy_5_move_4through7
  3438. JMP copy_5_move_8through16
  3439. copy_5_move_1or2:
  3440. MOVB (R14), R15
  3441. MOVB -1(R14)(CX*1), BP
  3442. MOVB R15, (R9)
  3443. MOVB BP, -1(R9)(CX*1)
  3444. ADDQ CX, R14
  3445. ADDQ CX, R9
  3446. JMP copy_5_end
  3447. copy_5_move_3:
  3448. MOVW (R14), R15
  3449. MOVB 2(R14), BP
  3450. MOVW R15, (R9)
  3451. MOVB BP, 2(R9)
  3452. ADDQ CX, R14
  3453. ADDQ CX, R9
  3454. JMP copy_5_end
  3455. copy_5_move_4through7:
  3456. MOVL (R14), R15
  3457. MOVL -4(R14)(CX*1), BP
  3458. MOVL R15, (R9)
  3459. MOVL BP, -4(R9)(CX*1)
  3460. ADDQ CX, R14
  3461. ADDQ CX, R9
  3462. JMP copy_5_end
  3463. copy_5_move_8through16:
  3464. MOVQ (R14), R15
  3465. MOVQ -8(R14)(CX*1), BP
  3466. MOVQ R15, (R9)
  3467. MOVQ BP, -8(R9)(CX*1)
  3468. ADDQ CX, R14
  3469. ADDQ CX, R9
  3470. copy_5_end:
  3471. ADDQ CX, R11
  3472. SUBQ CX, R13
  3473. // Copy match from the current buffer
  3474. copy_match:
  3475. MOVQ R9, CX
  3476. SUBQ R12, CX
  3477. // ml <= mo
  3478. CMPQ R13, R12
  3479. JA copy_overlapping_match
  3480. // Copy non-overlapping match
  3481. ADDQ R13, R11
  3482. MOVQ R13, R12
  3483. SUBQ $0x10, R12
  3484. JB copy_2_small
  3485. copy_2_loop:
  3486. MOVUPS (CX), X0
  3487. MOVUPS X0, (R9)
  3488. ADDQ $0x10, CX
  3489. ADDQ $0x10, R9
  3490. SUBQ $0x10, R12
  3491. JAE copy_2_loop
  3492. LEAQ 16(CX)(R12*1), CX
  3493. LEAQ 16(R9)(R12*1), R9
  3494. MOVUPS -16(CX), X0
  3495. MOVUPS X0, -16(R9)
  3496. JMP copy_2_end
  3497. copy_2_small:
  3498. CMPQ R13, $0x03
  3499. JE copy_2_move_3
  3500. JB copy_2_move_1or2
  3501. CMPQ R13, $0x08
  3502. JB copy_2_move_4through7
  3503. JMP copy_2_move_8through16
  3504. copy_2_move_1or2:
  3505. MOVB (CX), R12
  3506. MOVB -1(CX)(R13*1), R14
  3507. MOVB R12, (R9)
  3508. MOVB R14, -1(R9)(R13*1)
  3509. ADDQ R13, CX
  3510. ADDQ R13, R9
  3511. JMP copy_2_end
  3512. copy_2_move_3:
  3513. MOVW (CX), R12
  3514. MOVB 2(CX), R14
  3515. MOVW R12, (R9)
  3516. MOVB R14, 2(R9)
  3517. ADDQ R13, CX
  3518. ADDQ R13, R9
  3519. JMP copy_2_end
  3520. copy_2_move_4through7:
  3521. MOVL (CX), R12
  3522. MOVL -4(CX)(R13*1), R14
  3523. MOVL R12, (R9)
  3524. MOVL R14, -4(R9)(R13*1)
  3525. ADDQ R13, CX
  3526. ADDQ R13, R9
  3527. JMP copy_2_end
  3528. copy_2_move_8through16:
  3529. MOVQ (CX), R12
  3530. MOVQ -8(CX)(R13*1), R14
  3531. MOVQ R12, (R9)
  3532. MOVQ R14, -8(R9)(R13*1)
  3533. ADDQ R13, CX
  3534. ADDQ R13, R9
  3535. copy_2_end:
  3536. JMP handle_loop
  3537. // Copy overlapping match
  3538. copy_overlapping_match:
  3539. ADDQ R13, R11
  3540. copy_slow_3:
  3541. MOVB (CX), R12
  3542. MOVB R12, (R9)
  3543. INCQ CX
  3544. INCQ R9
  3545. DECQ R13
  3546. JNZ copy_slow_3
  3547. handle_loop:
  3548. MOVQ ctx+16(FP), CX
  3549. DECQ 96(CX)
  3550. JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
  3551. loop_finished:
  3552. MOVQ br+8(FP), CX
  3553. MOVQ AX, 32(CX)
  3554. MOVB DL, 40(CX)
  3555. MOVQ BX, 24(CX)
  3556. // Update the context
  3557. MOVQ ctx+16(FP), AX
  3558. MOVQ R11, 136(AX)
  3559. MOVQ 144(AX), CX
  3560. SUBQ CX, R10
  3561. MOVQ R10, 168(AX)
  3562. // Return success
  3563. MOVQ $0x00000000, ret+24(FP)
  3564. RET
  3565. // Return with match length error
  3566. sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  3567. MOVQ 16(SP), AX
  3568. MOVQ ctx+16(FP), CX
  3569. MOVQ AX, 216(CX)
  3570. MOVQ $0x00000001, ret+24(FP)
  3571. RET
  3572. // Return with match too long error
  3573. sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  3574. MOVQ ctx+16(FP), AX
  3575. MOVQ 16(SP), CX
  3576. MOVQ CX, 216(AX)
  3577. MOVQ $0x00000002, ret+24(FP)
  3578. RET
  3579. // Return with match offset too long error
  3580. error_match_off_too_big:
  3581. MOVQ ctx+16(FP), AX
  3582. MOVQ 8(SP), CX
  3583. MOVQ CX, 224(AX)
  3584. MOVQ R11, 136(AX)
  3585. MOVQ $0x00000003, ret+24(FP)
  3586. RET
  3587. // Return with not enough literals error
  3588. error_not_enough_literals:
  3589. MOVQ ctx+16(FP), AX
  3590. MOVQ 24(SP), CX
  3591. MOVQ CX, 208(AX)
  3592. MOVQ $0x00000004, ret+24(FP)
  3593. RET
  3594. // Return with not enough output space error
  3595. error_not_enough_space:
  3596. MOVQ ctx+16(FP), AX
  3597. MOVQ 24(SP), CX
  3598. MOVQ CX, 208(AX)
  3599. MOVQ 16(SP), CX
  3600. MOVQ CX, 216(AX)
  3601. MOVQ R11, 136(AX)
  3602. MOVQ $0x00000005, ret+24(FP)
  3603. RET