You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

190 lines
7.1 KiB

  1. # -*- coding: utf-8 -*-
  2. """Support code for writing BGZF files
  3. Shamelessly taken from Biopython
  4. """
  5. # Biopython License Agreement
  6. #
  7. # Permission to use, copy, modify, and distribute this software and its
  8. # documentation with or without modifications and for any purpose and
  9. # without fee is hereby granted, provided that any copyright notices
  10. # appear in all copies and that both those copyright notices and this
  11. # permission notice appear in supporting documentation, and that the
  12. # names of the contributors or copyright holders not be used in
  13. # advertising or publicity pertaining to distribution of the software
  14. # without specific prior permission.
  15. #
  16. # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
  17. # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
  18. # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
  19. # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
  20. # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
  21. # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  22. # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
  23. # OR PERFORMANCE OF THIS SOFTWARE.
  24. import codecs
  25. import struct
  26. import zlib
  # BGZF framing constants.  They are written as bytes literals so that the
  # same source works on Python 2 and Python 3.

  # First four bytes of every block: gzip magic (1f 8b), deflate method (08)
  # and the FEXTRA flag (04).
  _bgzf_magic = b"\x1f\x8b\x08\x04"

  # Fixed 16-byte block header: magic (4), modification time (4), extra
  # flags (1), OS (1), extra-field length of six (2), sub-field id "BC" (2)
  # and sub-field length of two (2).  The two-byte block size follows it.
  _bgzf_header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42" b"\x43\x02\x00"

  # 28-byte empty block written at the end of a file as the EOF marker.
  _bgzf_eof = (
      b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00"
      b"\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  )

  # Identifier of the BGZF extra sub-field.
  _bytes_BC = b"BC"
  def make_virtual_offset(block_start_offset, within_block_offset):
      """Compute a BGZF virtual offset from block start and within block offsets.

      The BAM indexing scheme records read positions using a 64 bit
      'virtual offset', comprising in C terms:

          block_start_offset << 16 | within_block_offset

      Here block_start_offset is the file offset of the BGZF block
      start (unsigned integer using up to 64-16 = 48 bits), and
      within_block_offset is the offset within the (decompressed) block
      (unsigned 16 bit integer).

      Raises ValueError if either argument is outside its range; the
      within_block_offset is checked first.

      >>> make_virtual_offset(0, 0)
      0
      >>> make_virtual_offset(0, 1)
      1
      >>> make_virtual_offset(0, 2**16 - 1)
      65535
      >>> make_virtual_offset(0, 2**16)
      Traceback (most recent call last):
          ...
      ValueError: Require 0 <= within_block_offset < 2**16, got 65536
      >>> 65536 == make_virtual_offset(1, 0)
      True
      >>> 65537 == make_virtual_offset(1, 1)
      True
      >>> 131071 == make_virtual_offset(1, 2**16 - 1)
      True
      >>> 6553600000 == make_virtual_offset(100000, 0)
      True
      >>> 6553600001 == make_virtual_offset(100000, 1)
      True
      >>> 6553600010 == make_virtual_offset(100000, 10)
      True
      >>> make_virtual_offset(2**48, 0)
      Traceback (most recent call last):
          ...
      ValueError: Require 0 <= block_start_offset < 2**48, got 281474976710656
      """
      # The low 16 bits hold the offset inside the decompressed block.
      if within_block_offset < 0 or within_block_offset >= 65536:
          raise ValueError(
              "Require 0 <= within_block_offset < 2**16, got %i" % within_block_offset
          )
      # The remaining 48 bits hold the file offset of the block start.
      if block_start_offset < 0 or block_start_offset >= 281474976710656:
          raise ValueError(
              "Require 0 <= block_start_offset < 2**48, got %i" % block_start_offset
          )
      return (block_start_offset << 16) | within_block_offset
  class BgzfWriter(object):
      """Write-only file-like object producing BGZF (blocked gzip) output.

      Data passed to write() is buffered and emitted as a series of
      independent gzip members ("blocks"), each carrying its own size in a
      "BC" extra sub-field.  close() appends the 28-byte empty EOF block.
      The object can be used as a context manager.
      """

      # Most uncompressed bytes write()/flush() put into a single block.
      # It is kept below the 65536-byte format limit so that data which does
      # not compress (and therefore grows slightly when deflated) still fits
      # into one block; cutting at 65536 made such data fail.
      _MAX_BLOCK_INPUT = 0xFF00

      # The block-size field is an unsigned 16-bit "total size minus one".
      # Total size is 16 (header) + 2 (size field) + payload + 8 (trailer),
      # so the deflated payload may be at most 65535 - 25 bytes long.
      _MAX_DEFLATED = 65535 - 25

      def __init__(self, filename=None, mode="w", fileobj=None, compresslevel=6):
          """Open ``filename`` for writing/appending, or wrap ``fileobj``.

          filename -- path opened in binary write ("w") or append ("a") mode.
          mode -- must contain "w" or "a" when a filename is used.
          fileobj -- an already open binary handle to write to instead of a
              filename; its mode is not checked.
          compresslevel -- zlib compression level used for every block.

          Raises ValueError if both filename and fileobj are given, or if
          mode requests neither writing nor appending.
          """
          if fileobj is not None:
              if filename is not None:
                  raise ValueError("Supply either filename or fileobj, not both")
              handle = fileobj
          else:
              lowered = mode.lower()
              if "w" not in lowered and "a" not in lowered:
                  raise ValueError("Must use write or append mode, not %r" % mode)
              handle = open(filename, "ab" if "a" in lowered else "wb")
          self._text = "b" not in mode.lower()
          self._handle = handle
          self._buffer = b""
          self.compresslevel = compresslevel

      def _write_block(self, block):
          """Compress ``block`` and write it to the handle as one BGZF block.

          Raises ValueError if ``block`` is longer than 65536 bytes and
          RuntimeError if its compressed form does not fit into one block.
          """
          if len(block) > 65536:
              raise ValueError("Block length %i > 65536" % len(block))
          # Giving a negative window bits means no gzip/zlib headers,
          # -15 used in samtools
          deflater = zlib.compressobj(
              self.compresslevel, zlib.DEFLATED, -15, zlib.DEF_MEM_LEVEL, 0
          )
          compressed = deflater.compress(block) + deflater.flush()
          del deflater
          if len(compressed) > self._MAX_DEFLATED:
              raise RuntimeError(
                  "Compressed block of %i bytes does not fit in a BGZF block, "
                  "try less data in this block" % len(compressed)
              )
          bsize = struct.pack("<H", len(compressed) + 25)  # total size minus one
          # Masking gives the same unsigned value whether crc32 returned a
          # signed or an unsigned integer.
          crc = struct.pack("<I", zlib.crc32(block) & 0xFFFFFFFF)
          uncompressed_length = struct.pack("<I", len(block))
          # Fixed 16 bytes,
          # gzip magic bytes (4) mod time (4),
          # gzip flag (1), os (1), extra length which is six (2),
          # sub field which is BC (2), sub field length of two (2),
          # Variable data,
          # 2 bytes: block length as BC sub field (2)
          # X bytes: the data
          # 8 bytes: crc (4), uncompressed data length (4)
          self._handle.write(
              _bgzf_header + bsize + compressed + crc + uncompressed_length
          )

      def write(self, data):
          """Buffer ``data`` and write out every block that has become full.

          Text (str) is encoded as latin-1; anything else is treated as bytes.
          """
          if isinstance(data, str):
              data = codecs.latin_1_encode(data)[0]
          limit = self._MAX_BLOCK_INPUT
          pending = self._buffer + data
          offset = 0
          try:
              # Walk through the data by offset instead of re-slicing the
              # whole buffer after every block.
              while len(pending) - offset >= limit:
                  self._write_block(pending[offset:offset + limit])
                  offset += limit
          finally:
              # Whatever has not been written (all of it on failure of the
              # first block) stays buffered.
              self._buffer = pending[offset:]

      def flush(self):
          """Write the buffered data as a block and flush the handle.

          A block is written even when the buffer is empty.
          """
          limit = self._MAX_BLOCK_INPUT
          # write() keeps the buffer below the limit; the loop only matters
          # if the buffer was filled some other way.
          while len(self._buffer) >= limit:
              self._write_block(self._buffer[:limit])
              self._buffer = self._buffer[limit:]
          self._write_block(self._buffer)
          self._buffer = b""
          self._handle.flush()

      def close(self):
          """Flush data, write 28 bytes BGZF EOF marker, and close BGZF file.

          samtools will look for a magic EOF marker, just a 28 byte empty BGZF
          block, and if it is missing warns the BAM file may be truncated. In
          addition to samtools writing this block, so too does bgzip - so this
          implementation does too.

          The handle is closed even if flushing or writing the marker fails.
          """
          try:
              if self._buffer:
                  self.flush()
              self._handle.write(_bgzf_eof)
              self._handle.flush()
          finally:
              self._handle.close()

      def tell(self):
          """Returns a BGZF 64-bit virtual offset."""
          return make_virtual_offset(self._handle.tell(), len(self._buffer))

      def seekable(self):
          """Return False: seeking is not supported (tell() is)."""
          return False

      @classmethod
      def isatty(klass):
          """Return False: the output is never treated as a terminal."""
          return False

      def fileno(self):
          """Return the file descriptor of the underlying handle."""
          return self._handle.fileno()

      def __enter__(self):
          """Enter a ``with`` block, returning the writer itself."""
          return self

      def __exit__(self, type_, value, traceback):
          """Close the writer when the ``with`` block ends."""
          self.close()