You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

848 line
15 KiB

  1. // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
  2. //go:build amd64 && !appengine && !noasm && gc
  3. // +build amd64,!appengine,!noasm,gc
  4. // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
  //
  // Decodes four interleaved bit streams, writing 2 bytes per stream per
  // iteration, until the output position reaches the limit or one of the
  // bit readers has fewer than 4 input bytes left after a refill.
  //
  // Register roles (derived from the loads below; the Go field names of ctx
  // are not visible in this file):
  //   DL  = "exhausted" flag; the loop repeats only while it is zero
  //   DI  = peekBits shift count, byte at 8(ctx)
  //   SI  = stream 0 output position, initially 16(ctx)
  //   BX  = output limit, 48(ctx)
  //   R9  = distance between the streams' output positions (dstEvery), 24(ctx)
  //   R10 = base of the table of 16-bit entries, 32(ctx)
  //   R11 = base of four consecutive 48-byte bit-reader records, 0(ctx)
  //   R8  = output position of the stream currently being decoded
  //   R12 = value and R13 = bitsRead of the current bit reader
  //   AL/AH = the two decoded bytes, CX = shift count / table entry scratch
  // Bit-reader record fields used here: +0 input base, +24 off, +32 value,
  // +40 bitsRead (one byte).
  // Table entry: low byte = number of bits to consume, high byte = output byte.
  // On return 40(ctx) = 4 * (bytes written per stream).
  5. TEXT ·decompress4x_main_loop_amd64(SB), $0-8
  // Clear DX so DL starts out as "not exhausted".
  6. XORQ DX, DX
  7. // Preload values
  8. MOVQ ctx+0(FP), AX
  9. MOVBQZX 8(AX), DI
  10. MOVQ 16(AX), SI
  11. MOVQ 48(AX), BX
  12. MOVQ 24(AX), R9
  13. MOVQ 32(AX), R10
  14. MOVQ (AX), R11
  15. // Main loop
  16. main_loop:
  // DL = 1 when the stream 0 output position has reached the limit (signed >=).
  17. MOVQ SI, R8
  18. CMPQ R8, BX
  19. SETGE DL
  20. // br0.fillFast32()
  21. MOVQ 32(R11), R12
  22. MOVBQZX 40(R11), R13
  // Skip the refill when bitsRead <= 32 (unsigned compare).
  23. CMPQ R13, $0x20
  24. JBE skip_fill0
  // Refill: off -= 4, bitsRead -= 32, then load 32 bits from input base + off.
  25. MOVQ 24(R11), AX
  26. SUBQ $0x20, R13
  27. SUBQ $0x04, AX
  28. MOVQ (R11), R14
  29. // b.value |= uint64(low) << (b.bitsRead & 63)
  30. MOVL (AX)(R14*1), R14
  31. MOVQ R13, CX
  32. SHLQ CL, R14
  33. MOVQ AX, 24(R11)
  34. ORQ R14, R12
  35. // exhausted = exhausted || (br0.off < 4)
  36. CMPQ AX, $0x04
  37. SETLT AL
  38. ORB AL, DL
  39. skip_fill0:
  40. // val0 := br0.peekTopBits(peekBits)
  41. MOVQ R12, R14
  42. MOVQ DI, CX
  43. SHRQ CL, R14
  44. // v0 := table[val0&mask]
  45. MOVW (R10)(R14*2), CX
  46. // br0.advance(uint8(v0.entry))
  // CH (output byte) goes to AL; CL (bit count) shifts value and is added to bitsRead.
  47. MOVB CH, AL
  48. SHLQ CL, R12
  49. ADDB CL, R13
  50. // val1 := br0.peekTopBits(peekBits)
  51. MOVQ DI, CX
  52. MOVQ R12, R14
  53. SHRQ CL, R14
  54. // v1 := table[val1&mask]
  55. MOVW (R10)(R14*2), CX
  56. // br0.advance(uint8(v1.entry))
  57. MOVB CH, AH
  58. SHLQ CL, R12
  59. ADDB CL, R13
  60. // these two writes get coalesced
  61. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  62. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  63. MOVW AX, (R8)
  64. // update the bitreader structure
  65. MOVQ R12, 32(R11)
  66. MOVB R13, 40(R11)
  // Advance R8 to the next stream's output position.
  67. ADDQ R9, R8
  // Stream 1: same sequence as stream 0, using the record at R11+48.
  68. // br1.fillFast32()
  69. MOVQ 80(R11), R12
  70. MOVBQZX 88(R11), R13
  71. CMPQ R13, $0x20
  72. JBE skip_fill1
  73. MOVQ 72(R11), AX
  74. SUBQ $0x20, R13
  75. SUBQ $0x04, AX
  76. MOVQ 48(R11), R14
  77. // b.value |= uint64(low) << (b.bitsRead & 63)
  78. MOVL (AX)(R14*1), R14
  79. MOVQ R13, CX
  80. SHLQ CL, R14
  81. MOVQ AX, 72(R11)
  82. ORQ R14, R12
  83. // exhausted = exhausted || (br1.off < 4)
  84. CMPQ AX, $0x04
  85. SETLT AL
  86. ORB AL, DL
  87. skip_fill1:
  88. // val0 := br1.peekTopBits(peekBits)
  89. MOVQ R12, R14
  90. MOVQ DI, CX
  91. SHRQ CL, R14
  92. // v0 := table[val0&mask]
  93. MOVW (R10)(R14*2), CX
  94. // br1.advance(uint8(v0.entry))
  95. MOVB CH, AL
  96. SHLQ CL, R12
  97. ADDB CL, R13
  98. // val1 := br1.peekTopBits(peekBits)
  99. MOVQ DI, CX
  100. MOVQ R12, R14
  101. SHRQ CL, R14
  102. // v1 := table[val1&mask]
  103. MOVW (R10)(R14*2), CX
  104. // br1.advance(uint8(v1.entry))
  105. MOVB CH, AH
  106. SHLQ CL, R12
  107. ADDB CL, R13
  108. // these two writes get coalesced
  109. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  110. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  111. MOVW AX, (R8)
  112. // update the bitreader structure
  113. MOVQ R12, 80(R11)
  114. MOVB R13, 88(R11)
  115. ADDQ R9, R8
  // Stream 2: same sequence, using the record at R11+96.
  116. // br2.fillFast32()
  117. MOVQ 128(R11), R12
  118. MOVBQZX 136(R11), R13
  119. CMPQ R13, $0x20
  120. JBE skip_fill2
  121. MOVQ 120(R11), AX
  122. SUBQ $0x20, R13
  123. SUBQ $0x04, AX
  124. MOVQ 96(R11), R14
  125. // b.value |= uint64(low) << (b.bitsRead & 63)
  126. MOVL (AX)(R14*1), R14
  127. MOVQ R13, CX
  128. SHLQ CL, R14
  129. MOVQ AX, 120(R11)
  130. ORQ R14, R12
  131. // exhausted = exhausted || (br2.off < 4)
  132. CMPQ AX, $0x04
  133. SETLT AL
  134. ORB AL, DL
  135. skip_fill2:
  136. // val0 := br2.peekTopBits(peekBits)
  137. MOVQ R12, R14
  138. MOVQ DI, CX
  139. SHRQ CL, R14
  140. // v0 := table[val0&mask]
  141. MOVW (R10)(R14*2), CX
  142. // br2.advance(uint8(v0.entry))
  143. MOVB CH, AL
  144. SHLQ CL, R12
  145. ADDB CL, R13
  146. // val1 := br2.peekTopBits(peekBits)
  147. MOVQ DI, CX
  148. MOVQ R12, R14
  149. SHRQ CL, R14
  150. // v1 := table[val1&mask]
  151. MOVW (R10)(R14*2), CX
  152. // br2.advance(uint8(v1.entry))
  153. MOVB CH, AH
  154. SHLQ CL, R12
  155. ADDB CL, R13
  156. // these two writes get coalesced
  157. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  158. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  159. MOVW AX, (R8)
  160. // update the bitreader structure
  161. MOVQ R12, 128(R11)
  162. MOVB R13, 136(R11)
  163. ADDQ R9, R8
  // Stream 3: same sequence, using the record at R11+144.
  164. // br3.fillFast32()
  165. MOVQ 176(R11), R12
  166. MOVBQZX 184(R11), R13
  167. CMPQ R13, $0x20
  168. JBE skip_fill3
  169. MOVQ 168(R11), AX
  170. SUBQ $0x20, R13
  171. SUBQ $0x04, AX
  172. MOVQ 144(R11), R14
  173. // b.value |= uint64(low) << (b.bitsRead & 63)
  174. MOVL (AX)(R14*1), R14
  175. MOVQ R13, CX
  176. SHLQ CL, R14
  177. MOVQ AX, 168(R11)
  178. ORQ R14, R12
  179. // exhausted = exhausted || (br3.off < 4)
  180. CMPQ AX, $0x04
  181. SETLT AL
  182. ORB AL, DL
  183. skip_fill3:
  184. // val0 := br3.peekTopBits(peekBits)
  185. MOVQ R12, R14
  186. MOVQ DI, CX
  187. SHRQ CL, R14
  188. // v0 := table[val0&mask]
  189. MOVW (R10)(R14*2), CX
  190. // br3.advance(uint8(v0.entry))
  191. MOVB CH, AL
  192. SHLQ CL, R12
  193. ADDB CL, R13
  194. // val1 := br3.peekTopBits(peekBits)
  195. MOVQ DI, CX
  196. MOVQ R12, R14
  197. SHRQ CL, R14
  198. // v1 := table[val1&mask]
  199. MOVW (R10)(R14*2), CX
  200. // br3.advance(uint8(v1.entry))
  201. MOVB CH, AH
  202. SHLQ CL, R12
  203. ADDB CL, R13
  204. // these two writes get coalesced
  205. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  206. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  207. MOVW AX, (R8)
  208. // update the bitreader structure
  209. MOVQ R12, 176(R11)
  210. MOVB R13, 184(R11)
  // Each stream produced 2 bytes; loop again unless the exhausted flag was set.
  211. ADDQ $0x02, SI
  212. TESTB DL, DL
  213. JZ main_loop
  // 40(ctx) = (SI - initial output position) * 4, i.e. the total over all four streams.
  214. MOVQ ctx+0(FP), AX
  215. SUBQ 16(AX), SI
  216. SHLQ $0x02, SI
  217. MOVQ SI, 40(AX)
  218. RET
  219. // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
  //
  // Variant of the four-stream main loop that decodes 4 bytes per stream per
  // iteration and refills each bit reader only once per four symbols.
  //
  // Register roles (derived from the loads below; the Go field names of ctx
  // are not visible in this file):
  //   DL  = "exhausted" flag; the loop repeats only while it is zero
  //   DI  = peekBits shift count, byte at 8(ctx)
  //   BX  = stream 0 output position, initially 16(ctx)
  //   SI  = output limit, 48(ctx)
  //   R9  = distance between the streams' output positions (dstEvery), 24(ctx)
  //   R10 = base of the table of 16-bit entries, 32(ctx)
  //   R11 = base of four consecutive 48-byte bit-reader records, 0(ctx)
  //   R8  = output position of the stream currently being decoded
  //   R12 = value and R13 = bitsRead of the current bit reader
  //   R14/R15 = refill scratch (off and loaded input bits)
  //   EAX = the four decoded bytes, packed through AL/AH and BSWAPL
  // Bit-reader record fields used here: +0 input base, +24 off, +32 value,
  // +40 bitsRead (one byte).
  // Table entry: low byte = number of bits to consume, high byte = output byte.
  // On return 40(ctx) = 4 * (bytes written per stream).
  220. TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
  // Clear DX so DL starts out as "not exhausted".
  221. XORQ DX, DX
  222. // Preload values
  223. MOVQ ctx+0(FP), CX
  224. MOVBQZX 8(CX), DI
  225. MOVQ 16(CX), BX
  226. MOVQ 48(CX), SI
  227. MOVQ 24(CX), R9
  228. MOVQ 32(CX), R10
  229. MOVQ (CX), R11
  230. // Main loop
  231. main_loop:
  // DL = 1 when the stream 0 output position has reached the limit (signed >=).
  232. MOVQ BX, R8
  233. CMPQ R8, SI
  234. SETGE DL
  235. // br0.fillFast32()
  236. MOVQ 32(R11), R12
  237. MOVBQZX 40(R11), R13
  // Skip the refill when bitsRead <= 32 (unsigned compare).
  238. CMPQ R13, $0x20
  239. JBE skip_fill0
  // Refill: off -= 4, bitsRead -= 32, then load 32 bits from input base + off.
  240. MOVQ 24(R11), R14
  241. SUBQ $0x20, R13
  242. SUBQ $0x04, R14
  243. MOVQ (R11), R15
  244. // b.value |= uint64(low) << (b.bitsRead & 63)
  245. MOVL (R14)(R15*1), R15
  246. MOVQ R13, CX
  247. SHLQ CL, R15
  248. MOVQ R14, 24(R11)
  249. ORQ R15, R12
  250. // exhausted = exhausted || (br0.off < 4)
  251. CMPQ R14, $0x04
  252. SETLT AL
  253. ORB AL, DL
  254. skip_fill0:
  255. // val0 := br0.peekTopBits(peekBits)
  256. MOVQ R12, R14
  257. MOVQ DI, CX
  258. SHRQ CL, R14
  259. // v0 := table[val0&mask]
  260. MOVW (R10)(R14*2), CX
  261. // br0.advance(uint8(v0.entry))
  // CH (output byte) goes to AL; CL (bit count) shifts value and is added to bitsRead.
  262. MOVB CH, AL
  263. SHLQ CL, R12
  264. ADDB CL, R13
  265. // val1 := br0.peekTopBits(peekBits)
  266. MOVQ R12, R14
  267. MOVQ DI, CX
  268. SHRQ CL, R14
  269. // v1 := table[val1&mask]
  270. MOVW (R10)(R14*2), CX
  271. // br0.advance(uint8(v1.entry))
  272. MOVB CH, AH
  273. SHLQ CL, R12
  274. ADDB CL, R13
  // Move v0/v1 into the top two bytes of EAX so AH/AL can receive v2/v3.
  275. BSWAPL AX
  276. // val2 := br0.peekTopBits(peekBits)
  277. MOVQ R12, R14
  278. MOVQ DI, CX
  279. SHRQ CL, R14
  280. // v2 := table[val2&mask]
  281. MOVW (R10)(R14*2), CX
  282. // br0.advance(uint8(v2.entry))
  283. MOVB CH, AH
  284. SHLQ CL, R12
  285. ADDB CL, R13
  286. // val3 := br0.peekTopBits(peekBits)
  287. MOVQ R12, R14
  288. MOVQ DI, CX
  289. SHRQ CL, R14
  290. // v3 := table[val3&mask]
  291. MOVW (R10)(R14*2), CX
  292. // br0.advance(uint8(v3.entry))
  293. MOVB CH, AL
  294. SHLQ CL, R12
  295. ADDB CL, R13
  // Swap back: EAX now holds v0, v1, v2, v3 from lowest to highest byte.
  296. BSWAPL AX
  297. // these four writes get coalesced
  298. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  299. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  300. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  301. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  302. MOVL AX, (R8)
  303. // update the bitreader structure
  304. MOVQ R12, 32(R11)
  305. MOVB R13, 40(R11)
  // Advance R8 to the next stream's output position.
  306. ADDQ R9, R8
  // Stream 1: same sequence as stream 0, using the record at R11+48.
  307. // br1.fillFast32()
  308. MOVQ 80(R11), R12
  309. MOVBQZX 88(R11), R13
  310. CMPQ R13, $0x20
  311. JBE skip_fill1
  312. MOVQ 72(R11), R14
  313. SUBQ $0x20, R13
  314. SUBQ $0x04, R14
  315. MOVQ 48(R11), R15
  316. // b.value |= uint64(low) << (b.bitsRead & 63)
  317. MOVL (R14)(R15*1), R15
  318. MOVQ R13, CX
  319. SHLQ CL, R15
  320. MOVQ R14, 72(R11)
  321. ORQ R15, R12
  322. // exhausted = exhausted || (br1.off < 4)
  323. CMPQ R14, $0x04
  324. SETLT AL
  325. ORB AL, DL
  326. skip_fill1:
  327. // val0 := br1.peekTopBits(peekBits)
  328. MOVQ R12, R14
  329. MOVQ DI, CX
  330. SHRQ CL, R14
  331. // v0 := table[val0&mask]
  332. MOVW (R10)(R14*2), CX
  333. // br1.advance(uint8(v0.entry))
  334. MOVB CH, AL
  335. SHLQ CL, R12
  336. ADDB CL, R13
  337. // val1 := br1.peekTopBits(peekBits)
  338. MOVQ R12, R14
  339. MOVQ DI, CX
  340. SHRQ CL, R14
  341. // v1 := table[val1&mask]
  342. MOVW (R10)(R14*2), CX
  343. // br1.advance(uint8(v1.entry))
  344. MOVB CH, AH
  345. SHLQ CL, R12
  346. ADDB CL, R13
  347. BSWAPL AX
  348. // val2 := br1.peekTopBits(peekBits)
  349. MOVQ R12, R14
  350. MOVQ DI, CX
  351. SHRQ CL, R14
  352. // v2 := table[val2&mask]
  353. MOVW (R10)(R14*2), CX
  354. // br1.advance(uint8(v2.entry))
  355. MOVB CH, AH
  356. SHLQ CL, R12
  357. ADDB CL, R13
  358. // val3 := br1.peekTopBits(peekBits)
  359. MOVQ R12, R14
  360. MOVQ DI, CX
  361. SHRQ CL, R14
  362. // v3 := table[val3&mask]
  363. MOVW (R10)(R14*2), CX
  364. // br1.advance(uint8(v3.entry))
  365. MOVB CH, AL
  366. SHLQ CL, R12
  367. ADDB CL, R13
  368. BSWAPL AX
  369. // these four writes get coalesced
  370. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  371. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  372. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  373. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  374. MOVL AX, (R8)
  375. // update the bitreader structure
  376. MOVQ R12, 80(R11)
  377. MOVB R13, 88(R11)
  378. ADDQ R9, R8
  // Stream 2: same sequence, using the record at R11+96.
  379. // br2.fillFast32()
  380. MOVQ 128(R11), R12
  381. MOVBQZX 136(R11), R13
  382. CMPQ R13, $0x20
  383. JBE skip_fill2
  384. MOVQ 120(R11), R14
  385. SUBQ $0x20, R13
  386. SUBQ $0x04, R14
  387. MOVQ 96(R11), R15
  388. // b.value |= uint64(low) << (b.bitsRead & 63)
  389. MOVL (R14)(R15*1), R15
  390. MOVQ R13, CX
  391. SHLQ CL, R15
  392. MOVQ R14, 120(R11)
  393. ORQ R15, R12
  394. // exhausted = exhausted || (br2.off < 4)
  395. CMPQ R14, $0x04
  396. SETLT AL
  397. ORB AL, DL
  398. skip_fill2:
  399. // val0 := br2.peekTopBits(peekBits)
  400. MOVQ R12, R14
  401. MOVQ DI, CX
  402. SHRQ CL, R14
  403. // v0 := table[val0&mask]
  404. MOVW (R10)(R14*2), CX
  405. // br2.advance(uint8(v0.entry))
  406. MOVB CH, AL
  407. SHLQ CL, R12
  408. ADDB CL, R13
  409. // val1 := br2.peekTopBits(peekBits)
  410. MOVQ R12, R14
  411. MOVQ DI, CX
  412. SHRQ CL, R14
  413. // v1 := table[val1&mask]
  414. MOVW (R10)(R14*2), CX
  415. // br2.advance(uint8(v1.entry))
  416. MOVB CH, AH
  417. SHLQ CL, R12
  418. ADDB CL, R13
  419. BSWAPL AX
  420. // val2 := br2.peekTopBits(peekBits)
  421. MOVQ R12, R14
  422. MOVQ DI, CX
  423. SHRQ CL, R14
  424. // v2 := table[val2&mask]
  425. MOVW (R10)(R14*2), CX
  426. // br2.advance(uint8(v2.entry))
  427. MOVB CH, AH
  428. SHLQ CL, R12
  429. ADDB CL, R13
  430. // val3 := br2.peekTopBits(peekBits)
  431. MOVQ R12, R14
  432. MOVQ DI, CX
  433. SHRQ CL, R14
  434. // v3 := table[val3&mask]
  435. MOVW (R10)(R14*2), CX
  436. // br2.advance(uint8(v3.entry))
  437. MOVB CH, AL
  438. SHLQ CL, R12
  439. ADDB CL, R13
  440. BSWAPL AX
  441. // these four writes get coalesced
  442. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  443. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  444. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  445. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  446. MOVL AX, (R8)
  447. // update the bitreader structure
  448. MOVQ R12, 128(R11)
  449. MOVB R13, 136(R11)
  450. ADDQ R9, R8
  // Stream 3: same sequence, using the record at R11+144.
  451. // br3.fillFast32()
  452. MOVQ 176(R11), R12
  453. MOVBQZX 184(R11), R13
  454. CMPQ R13, $0x20
  455. JBE skip_fill3
  456. MOVQ 168(R11), R14
  457. SUBQ $0x20, R13
  458. SUBQ $0x04, R14
  459. MOVQ 144(R11), R15
  460. // b.value |= uint64(low) << (b.bitsRead & 63)
  461. MOVL (R14)(R15*1), R15
  462. MOVQ R13, CX
  463. SHLQ CL, R15
  464. MOVQ R14, 168(R11)
  465. ORQ R15, R12
  466. // exhausted = exhausted || (br3.off < 4)
  467. CMPQ R14, $0x04
  468. SETLT AL
  469. ORB AL, DL
  470. skip_fill3:
  471. // val0 := br3.peekTopBits(peekBits)
  472. MOVQ R12, R14
  473. MOVQ DI, CX
  474. SHRQ CL, R14
  475. // v0 := table[val0&mask]
  476. MOVW (R10)(R14*2), CX
  477. // br3.advance(uint8(v0.entry))
  478. MOVB CH, AL
  479. SHLQ CL, R12
  480. ADDB CL, R13
  481. // val1 := br3.peekTopBits(peekBits)
  482. MOVQ R12, R14
  483. MOVQ DI, CX
  484. SHRQ CL, R14
  485. // v1 := table[val1&mask]
  486. MOVW (R10)(R14*2), CX
  487. // br3.advance(uint8(v1.entry))
  488. MOVB CH, AH
  489. SHLQ CL, R12
  490. ADDB CL, R13
  491. BSWAPL AX
  492. // val2 := br3.peekTopBits(peekBits)
  493. MOVQ R12, R14
  494. MOVQ DI, CX
  495. SHRQ CL, R14
  496. // v2 := table[val2&mask]
  497. MOVW (R10)(R14*2), CX
  498. // br3.advance(uint8(v2.entry))
  499. MOVB CH, AH
  500. SHLQ CL, R12
  501. ADDB CL, R13
  502. // val3 := br3.peekTopBits(peekBits)
  503. MOVQ R12, R14
  504. MOVQ DI, CX
  505. SHRQ CL, R14
  506. // v3 := table[val3&mask]
  507. MOVW (R10)(R14*2), CX
  508. // br3.advance(uint8(v3.entry))
  509. MOVB CH, AL
  510. SHLQ CL, R12
  511. ADDB CL, R13
  512. BSWAPL AX
  513. // these four writes get coalesced
  514. // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
  515. // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
  516. // out[id * dstEvery + 2] = uint8(v2.entry >> 8)
  517. // out[id * dstEvery + 3] = uint8(v3.entry >> 8)
  518. MOVL AX, (R8)
  519. // update the bitreader structure
  520. MOVQ R12, 176(R11)
  521. MOVB R13, 184(R11)
  // Each stream produced 4 bytes; loop again unless the exhausted flag was set.
  522. ADDQ $0x04, BX
  523. TESTB DL, DL
  524. JZ main_loop
  // 40(ctx) = (BX - initial output position) * 4, i.e. the total over all four streams.
  525. MOVQ ctx+0(FP), AX
  526. SUBQ 16(AX), BX
  527. SHLQ $0x02, BX
  528. MOVQ BX, 40(AX)
  529. RET
  530. // func decompress1x_main_loop_amd64(ctx *decompress1xContext)
  //
  // Decodes a single bit stream, writing 4 bytes per iteration, for as long
  // as the bit reader's off is at least 8.
  //
  // Register roles (derived from the loads below; the Go field names of ctx
  // are not visible in this file):
  //   DX  = output write position, initially 16(ctx)
  //   BX  = end of output: 16(ctx) + 24(ctx)
  //   R8  = bit-reader input base, R9 = off, R10 = value, R11 = bitsRead,
  //         loaded from the record that 0(ctx) points to (offsets 0/24/32/40)
  //   SI  = bit-reader record pointer during setup, then the table base 32(ctx)
  //   DI  = peek shift count, byte at 8(ctx)
  //   EAX = the four decoded bytes, packed through AL/AH and BSWAPL
  // Table entry: low byte = number of bits to consume, high byte = output byte.
  // Normal exit: 40(ctx) = number of bytes written, and off/value/bitsRead are
  // stored back into the bit-reader record.
  // Error exit: 40(ctx) = -1, taken when 24(ctx) < 4 or when the write
  // position + 4 reaches the end of output; the bit-reader record is not
  // written back on this path.
  531. TEXT ·decompress1x_main_loop_amd64(SB), $0-8
  532. MOVQ ctx+0(FP), CX
  533. MOVQ 16(CX), DX
  534. MOVQ 24(CX), BX
  // Reject an output size below 4 (unsigned compare).
  535. CMPQ BX, $0x04
  536. JB error_max_decoded_size_exeeded
  // BX = output start + output size.
  537. LEAQ (DX)(BX*1), BX
  // Keep the bit-reader state in registers for the duration of the loop.
  538. MOVQ (CX), SI
  539. MOVQ (SI), R8
  540. MOVQ 24(SI), R9
  541. MOVQ 32(SI), R10
  542. MOVBQZX 40(SI), R11
  543. MOVQ 32(CX), SI
  544. MOVBQZX 8(CX), DI
  545. JMP loop_condition
  546. main_loop:
  547. // Check if we have room for 4 bytes in the output buffer
  // Errors out when DX + 4 >= BX (signed compare).
  548. LEAQ 4(DX), CX
  549. CMPQ CX, BX
  550. JGE error_max_decoded_size_exeeded
  551. // Decode 4 values
  // First refill: skipped when bitsRead < 32; otherwise bitsRead -= 32, off -= 4
  // and the 32 bits at input base + off are OR-ed into value, shifted left by bitsRead.
  552. CMPQ R11, $0x20
  553. JL bitReader_fillFast_1_end
  554. SUBQ $0x20, R11
  555. SUBQ $0x04, R9
  556. MOVL (R8)(R9*1), R12
  557. MOVQ R11, CX
  558. SHLQ CL, R12
  559. ORQ R12, R10
  560. bitReader_fillFast_1_end:
  // Symbol 0 -> AL: index the table with value >> DI, then consume CL bits.
  561. MOVQ DI, CX
  562. MOVQ R10, R12
  563. SHRQ CL, R12
  564. MOVW (SI)(R12*2), CX
  565. MOVB CH, AL
  566. MOVBQZX CL, CX
  567. ADDQ CX, R11
  568. SHLQ CL, R10
  // Symbol 1 -> AH.
  569. MOVQ DI, CX
  570. MOVQ R10, R12
  571. SHRQ CL, R12
  572. MOVW (SI)(R12*2), CX
  573. MOVB CH, AH
  574. MOVBQZX CL, CX
  575. ADDQ CX, R11
  576. SHLQ CL, R10
  // Move symbols 0/1 into the top two bytes of EAX so AH/AL can receive symbols 2/3.
  577. BSWAPL AX
  // Second refill, same steps as the first.
  578. CMPQ R11, $0x20
  579. JL bitReader_fillFast_2_end
  580. SUBQ $0x20, R11
  581. SUBQ $0x04, R9
  582. MOVL (R8)(R9*1), R12
  583. MOVQ R11, CX
  584. SHLQ CL, R12
  585. ORQ R12, R10
  586. bitReader_fillFast_2_end:
  // Symbol 2 -> AH.
  587. MOVQ DI, CX
  588. MOVQ R10, R12
  589. SHRQ CL, R12
  590. MOVW (SI)(R12*2), CX
  591. MOVB CH, AH
  592. MOVBQZX CL, CX
  593. ADDQ CX, R11
  594. SHLQ CL, R10
  // Symbol 3 -> AL.
  595. MOVQ DI, CX
  596. MOVQ R10, R12
  597. SHRQ CL, R12
  598. MOVW (SI)(R12*2), CX
  599. MOVB CH, AL
  600. MOVBQZX CL, CX
  601. ADDQ CX, R11
  602. SHLQ CL, R10
  // Swap back: EAX now holds symbols 0..3 from lowest to highest byte.
  603. BSWAPL AX
  604. // Store the decoded values
  605. MOVL AX, (DX)
  606. ADDQ $0x04, DX
  607. loop_condition:
  // Continue while off >= 8 (signed compare).
  608. CMPQ R9, $0x08
  609. JGE main_loop
  610. // Update ctx structure
  // 40(ctx) = bytes written; then store off, value and bitsRead back into the bit-reader record.
  611. MOVQ ctx+0(FP), AX
  612. SUBQ 16(AX), DX
  613. MOVQ DX, 40(AX)
  614. MOVQ (AX), AX
  615. MOVQ R9, 24(AX)
  616. MOVQ R10, 32(AX)
  617. MOVB R11, 40(AX)
  618. RET
  619. // Report error
  620. error_max_decoded_size_exeeded:
  // 40(ctx) = -1 signals the failure.
  621. MOVQ ctx+0(FP), AX
  622. MOVQ $-1, CX
  623. MOVQ CX, 40(AX)
  624. RET
  625. // func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
  626. // Requires: BMI2
  //
  // Single-stream decode loop using the BMI2 shifts SHLXQ/SHRXQ, which take the
  // shift count from any register, so CX does not have to be loaded with the
  // count before each shift. Writes 4 bytes per iteration for as long as the
  // bit reader's off is at least 8.
  //
  // Register roles (derived from the loads below; the Go field names of ctx
  // are not visible in this file):
  //   DX  = output write position, initially 16(ctx)
  //   BX  = end of output: 16(ctx) + 24(ctx)
  //   R8  = bit-reader input base, R9 = off, R10 = value, R11 = bitsRead,
  //         loaded from the record that 0(ctx) points to (offsets 0/24/32/40)
  //   SI  = bit-reader record pointer during setup, then the table base 32(ctx)
  //   DI  = peek shift count, byte at 8(ctx)
  //   CX  = scratch: refill bits, table index, table entry, bit count
  //   EAX = the four decoded bytes, packed through AL/AH and BSWAPL
  // Table entry: low byte = number of bits to consume, high byte = output byte.
  // Normal exit: 40(ctx) = number of bytes written, and off/value/bitsRead are
  // stored back into the bit-reader record.
  // Error exit: 40(ctx) = -1, taken when 24(ctx) < 4 or when the write
  // position + 4 reaches the end of output; the bit-reader record is not
  // written back on this path.
  627. TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
  628. MOVQ ctx+0(FP), CX
  629. MOVQ 16(CX), DX
  630. MOVQ 24(CX), BX
  // Reject an output size below 4 (unsigned compare).
  631. CMPQ BX, $0x04
  632. JB error_max_decoded_size_exeeded
  // BX = output start + output size.
  633. LEAQ (DX)(BX*1), BX
  // Keep the bit-reader state in registers for the duration of the loop.
  634. MOVQ (CX), SI
  635. MOVQ (SI), R8
  636. MOVQ 24(SI), R9
  637. MOVQ 32(SI), R10
  638. MOVBQZX 40(SI), R11
  639. MOVQ 32(CX), SI
  640. MOVBQZX 8(CX), DI
  641. JMP loop_condition
  642. main_loop:
  643. // Check if we have room for 4 bytes in the output buffer
  // Errors out when DX + 4 >= BX (signed compare).
  644. LEAQ 4(DX), CX
  645. CMPQ CX, BX
  646. JGE error_max_decoded_size_exeeded
  647. // Decode 4 values
  // First refill: skipped when bitsRead < 32; otherwise bitsRead -= 32, off -= 4
  // and the 32 bits at input base + off are OR-ed into value, shifted left by bitsRead.
  648. CMPQ R11, $0x20
  649. JL bitReader_fillFast_1_end
  650. SUBQ $0x20, R11
  651. SUBQ $0x04, R9
  652. MOVL (R8)(R9*1), CX
  653. SHLXQ R11, CX, CX
  654. ORQ CX, R10
  655. bitReader_fillFast_1_end:
  // Symbol 0 -> AL: CX = value >> DI indexes the table; then consume CL bits.
  656. SHRXQ DI, R10, CX
  657. MOVW (SI)(CX*2), CX
  658. MOVB CH, AL
  659. MOVBQZX CL, CX
  660. ADDQ CX, R11
  661. SHLXQ CX, R10, R10
  // Symbol 1 -> AH.
  662. SHRXQ DI, R10, CX
  663. MOVW (SI)(CX*2), CX
  664. MOVB CH, AH
  665. MOVBQZX CL, CX
  666. ADDQ CX, R11
  667. SHLXQ CX, R10, R10
  // Move symbols 0/1 into the top two bytes of EAX so AH/AL can receive symbols 2/3.
  668. BSWAPL AX
  // Second refill, same steps as the first.
  669. CMPQ R11, $0x20
  670. JL bitReader_fillFast_2_end
  671. SUBQ $0x20, R11
  672. SUBQ $0x04, R9
  673. MOVL (R8)(R9*1), CX
  674. SHLXQ R11, CX, CX
  675. ORQ CX, R10
  676. bitReader_fillFast_2_end:
  // Symbol 2 -> AH.
  677. SHRXQ DI, R10, CX
  678. MOVW (SI)(CX*2), CX
  679. MOVB CH, AH
  680. MOVBQZX CL, CX
  681. ADDQ CX, R11
  682. SHLXQ CX, R10, R10
  // Symbol 3 -> AL.
  683. SHRXQ DI, R10, CX
  684. MOVW (SI)(CX*2), CX
  685. MOVB CH, AL
  686. MOVBQZX CL, CX
  687. ADDQ CX, R11
  688. SHLXQ CX, R10, R10
  // Swap back: EAX now holds symbols 0..3 from lowest to highest byte.
  689. BSWAPL AX
  690. // Store the decoded values
  691. MOVL AX, (DX)
  692. ADDQ $0x04, DX
  693. loop_condition:
  // Continue while off >= 8 (signed compare).
  694. CMPQ R9, $0x08
  695. JGE main_loop
  696. // Update ctx structure
  // 40(ctx) = bytes written; then store off, value and bitsRead back into the bit-reader record.
  697. MOVQ ctx+0(FP), AX
  698. SUBQ 16(AX), DX
  699. MOVQ DX, 40(AX)
  700. MOVQ (AX), AX
  701. MOVQ R9, 24(AX)
  702. MOVQ R10, 32(AX)
  703. MOVB R11, 40(AX)
  704. RET
  705. // Report error
  706. error_max_decoded_size_exeeded:
  // 40(ctx) = -1 signals the failure.
  707. MOVQ ctx+0(FP), AX
  708. MOVQ $-1, CX
  709. MOVQ CX, 40(AX)
  710. RET