25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

223 lines
5.7 KiB

  1. //go:build amd64 && !appengine && !noasm && gc
  2. // +build amd64,!appengine,!noasm,gc
  3. // This file contains the specialisation of Decoder.Decompress4X
  4. // and Decoder.Decompress1X that use an asm implementation of their main loops.
  5. package huff0
  6. import (
  7. "errors"
  8. "fmt"
  9. "github.com/klauspost/compress/internal/cpuinfo"
  10. )
  11. // decompress4x_main_loop_amd64 is the amd64 assembler implementation
  12. // of the Decompress4X main loop, used when tablelog > 8.
  13. //go:noescape
  14. func decompress4x_main_loop_amd64(ctx *decompress4xContext)
  15. // decompress4x_8b_main_loop_amd64 is the amd64 assembler implementation
  16. // of the Decompress4X main loop, used when tablelog <= 8; it decodes 4 entries
  17. // per loop.
  18. //go:noescape
  19. func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
  20. // fallback8BitSize is the size where using Go version is faster.
  // Decompress4X hands the work to decompress4X8bit when the table log is <= 8
  // and cap(dst) is below this value.
  21. const fallback8BitSize = 800
  // decompress4xContext is the argument block Decompress4X passes to the
  // assembler main loops (decompress4x_main_loop_amd64 and
  // decompress4x_8b_main_loop_amd64).
  // NOTE(review): the assembler presumably addresses these fields by fixed
  // offset, so field order and types should not be changed without updating
  // the .s file — confirm.
  22. type decompress4xContext struct {
  23. pbr *[4]bitReaderShifted // the four per-stream bit readers
  24. peekBits uint8 // (64 - actualTableLog) & 63; see bitReaderShifted.peekBitsFast()
  25. out *byte // first byte of the destination buffer
  26. dstEvery int // (dstSize + 3) / 4: offset between the output regions of consecutive streams
  27. tbl *dEntrySingle // first entry of the single-symbol decoding table
  28. decoded int // read back by Decompress4X after the loop as the total bytes produced (decoded/4 per stream)
  29. limit *byte // &out[dstEvery-4]; decoding stops when the first stream's output gets here to avoid writing OOB on the last
  30. }
  31. // Decompress4X will decompress a 4X encoded stream.
  32. // The length of the supplied input must match the end of a block exactly.
  33. // The *capacity* of the dst slice must match the destination size of
  34. // the uncompressed data exactly.
  35. func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
  36. if len(d.dt.single) == 0 {
  37. return nil, errors.New("no table loaded")
  38. }
  39. if len(src) < 6+(4*1) {
  40. return nil, errors.New("input too small")
  41. }
  42. use8BitTables := d.actualTableLog <= 8
  43. if cap(dst) < fallback8BitSize && use8BitTables {
  44. return d.decompress4X8bit(dst, src)
  45. }
  46. var br [4]bitReaderShifted
  47. // Decode "jump table"
  48. start := 6
  49. for i := 0; i < 3; i++ {
  50. length := int(src[i*2]) | (int(src[i*2+1]) << 8)
  51. if start+length >= len(src) {
  52. return nil, errors.New("truncated input (or invalid offset)")
  53. }
  54. err := br[i].init(src[start : start+length])
  55. if err != nil {
  56. return nil, err
  57. }
  58. start += length
  59. }
  60. err := br[3].init(src[start:])
  61. if err != nil {
  62. return nil, err
  63. }
  64. // destination, offset to match first output
  65. dstSize := cap(dst)
  66. dst = dst[:dstSize]
  67. out := dst
  68. dstEvery := (dstSize + 3) / 4
  69. const tlSize = 1 << tableLogMax
  70. const tlMask = tlSize - 1
  71. single := d.dt.single[:tlSize]
  72. var decoded int
  73. if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
  74. ctx := decompress4xContext{
  75. pbr: &br,
  76. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  77. out: &out[0],
  78. dstEvery: dstEvery,
  79. tbl: &single[0],
  80. limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
  81. }
  82. if use8BitTables {
  83. decompress4x_8b_main_loop_amd64(&ctx)
  84. } else {
  85. decompress4x_main_loop_amd64(&ctx)
  86. }
  87. decoded = ctx.decoded
  88. out = out[decoded/4:]
  89. }
  90. // Decode remaining.
  91. remainBytes := dstEvery - (decoded / 4)
  92. for i := range br {
  93. offset := dstEvery * i
  94. endsAt := offset + remainBytes
  95. if endsAt > len(out) {
  96. endsAt = len(out)
  97. }
  98. br := &br[i]
  99. bitsLeft := br.remaining()
  100. for bitsLeft > 0 {
  101. br.fill()
  102. if offset >= endsAt {
  103. return nil, errors.New("corruption detected: stream overrun 4")
  104. }
  105. // Read value and increment offset.
  106. val := br.peekBitsFast(d.actualTableLog)
  107. v := single[val&tlMask].entry
  108. nBits := uint8(v)
  109. br.advance(nBits)
  110. bitsLeft -= uint(nBits)
  111. out[offset] = uint8(v >> 8)
  112. offset++
  113. }
  114. if offset != endsAt {
  115. return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
  116. }
  117. decoded += offset - dstEvery*i
  118. err = br.close()
  119. if err != nil {
  120. return nil, err
  121. }
  122. }
  123. if dstSize != decoded {
  124. return nil, errors.New("corruption detected: short output block")
  125. }
  126. return dst, nil
  127. }
  128. // decompress1x_main_loop_amd64 is the amd64 assembler implementation
  129. // of the Decompress1X main loop; Decompress1X uses it when cpuinfo.HasBMI2() reports false.
  // NOTE(review): this comment previously said "when tablelog > 8", but the
  // visible caller does not select on the table log — confirm.
  130. //go:noescape
  131. func decompress1x_main_loop_amd64(ctx *decompress1xContext)
  132. // decompress1x_main_loop_bmi2 is the amd64 with BMI2 assembler implementation
  133. // of the Decompress1X main loop; Decompress1X uses it when cpuinfo.HasBMI2() reports true.
  // NOTE(review): this comment previously said "when tablelog > 8", but the
  // visible caller does not select on the table log — confirm.
  134. //go:noescape
  135. func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
  // decompress1xContext is the argument block Decompress1X passes to the
  // assembler main loops (decompress1x_main_loop_amd64 and
  // decompress1x_main_loop_bmi2).
  // NOTE(review): the assembler presumably addresses these fields by fixed
  // offset, so field order and types should not be changed without updating
  // the .s file — confirm.
  136. type decompress1xContext struct {
  137. pbr *bitReaderShifted // the input bit reader
  138. peekBits uint8 // (64 - actualTableLog) & 63; see bitReaderShifted.peekBitsFast()
  139. out *byte // first byte of the destination buffer
  140. outCap int // capacity of the destination buffer (maximum decoded size)
  141. tbl *dEntrySingle // first entry of the single-symbol decoding table
  142. decoded int // read back by Decompress1X: bytes written, or error_max_decoded_size_exeeded
  143. }
  144. // error_max_decoded_size_exeeded is the error value reported by the asm implementations
  // in decompress1xContext.decoded; Decompress1X translates it into
  // ErrMaxDecodedSizeExceeded.
  145. const error_max_decoded_size_exeeded = -1
  146. // Decompress1X will decompress a 1X encoded stream.
  147. // The cap of the output buffer will be the maximum decompressed size.
  148. // The length of the supplied input must match the end of a block exactly.
  149. func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
  150. if len(d.dt.single) == 0 {
  151. return nil, errors.New("no table loaded")
  152. }
  153. var br bitReaderShifted
  154. err := br.init(src)
  155. if err != nil {
  156. return dst, err
  157. }
  158. maxDecodedSize := cap(dst)
  159. dst = dst[:maxDecodedSize]
  160. const tlSize = 1 << tableLogMax
  161. const tlMask = tlSize - 1
  162. if maxDecodedSize >= 4 {
  163. ctx := decompress1xContext{
  164. pbr: &br,
  165. out: &dst[0],
  166. outCap: maxDecodedSize,
  167. peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
  168. tbl: &d.dt.single[0],
  169. }
  170. if cpuinfo.HasBMI2() {
  171. decompress1x_main_loop_bmi2(&ctx)
  172. } else {
  173. decompress1x_main_loop_amd64(&ctx)
  174. }
  175. if ctx.decoded == error_max_decoded_size_exeeded {
  176. return nil, ErrMaxDecodedSizeExceeded
  177. }
  178. dst = dst[:ctx.decoded]
  179. }
  180. // br < 8, so uint8 is fine
  181. bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
  182. for bitsLeft > 0 {
  183. br.fill()
  184. if len(dst) >= maxDecodedSize {
  185. br.close()
  186. return nil, ErrMaxDecodedSizeExceeded
  187. }
  188. v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
  189. nBits := uint8(v.entry)
  190. br.advance(nBits)
  191. bitsLeft -= nBits
  192. dst = append(dst, uint8(v.entry>>8))
  193. }
  194. return dst, br.close()
  195. }