您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

728 行
22 KiB

  1. # cython: language_level=3
  2. # cython: embedsignature=True
  3. # cython: profile=True
  4. # adds doc-strings for sphinx
  5. ########################################################################
  6. ########################################################################
  7. ## Cython cimports
  8. ########################################################################
  9. from posix.unistd cimport dup
  10. from libc.errno cimport errno
  11. from libc.stdint cimport INT32_MAX
  12. from cpython cimport PyBytes_FromStringAndSize
  13. from pysam.libchtslib cimport *
  14. from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
  15. from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
  16. ########################################################################
  17. ########################################################################
  18. ## Python imports
  19. ########################################################################
  20. import os
  21. import io
  22. import re
  23. from warnings import warn
  24. ########################################################################
  25. ########################################################################
  26. ## Constants
  27. ########################################################################
  28. __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
  29. # maximum genomic coordinace
  30. cdef int MAX_POS = (1 << 31) - 1
  31. cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
  32. cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
  33. 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
  34. cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
  35. ########################################################################
  36. ########################################################################
  37. ## Verbosity functions
  38. ########################################################################
  39. cpdef set_verbosity(int verbosity):
  40. """Set htslib's hts_verbose global variable to the specified value."""
  41. return hts_set_verbosity(verbosity)
  42. cpdef get_verbosity():
  43. """Return the value of htslib's hts_verbose global variable."""
  44. return hts_get_verbosity()
  45. ########################################################################
  46. ########################################################################
  47. ## HFile wrapper class
  48. ########################################################################
  49. cdef class HFile(object):
  50. cdef hFILE *fp
  51. cdef readonly object name, mode
  52. def __init__(self, name, mode='r', closefd=True):
  53. self._open(name, mode, closefd=True)
  54. def __dealloc__(self):
  55. self.close()
  56. @property
  57. def closed(self):
  58. return self.fp == NULL
  59. cdef _open(self, name, mode, closefd=True):
  60. self.name = name
  61. self.mode = mode
  62. mode = force_bytes(mode)
  63. if isinstance(name, int):
  64. if self.fp != NULL:
  65. name = dup(name)
  66. self.fp = hdopen(name, mode)
  67. else:
  68. name = encode_filename(name)
  69. self.fp = hopen(name, mode)
  70. if not self.fp:
  71. raise IOError(errno, 'failed to open HFile', self.name)
  72. def close(self):
  73. if self.fp == NULL:
  74. return
  75. cdef hFILE *fp = self.fp
  76. self.fp = NULL
  77. if hclose(fp) != 0:
  78. raise IOError(errno, 'failed to close HFile', self.name)
  79. def fileno(self):
  80. if self.fp == NULL:
  81. raise IOError('operation on closed HFile')
  82. if isinstance(self.name, int):
  83. return self.name
  84. else:
  85. raise AttributeError('fileno not available')
  86. def __enter__(self):
  87. return self
  88. def __exit__(self, type, value, tb):
  89. self.close()
  90. def __iter__(self):
  91. return self
  92. def __next__(self):
  93. line = self.readline()
  94. if not line:
  95. raise StopIteration()
  96. return line
  97. def flush(self):
  98. if self.fp == NULL:
  99. raise IOError('operation on closed HFile')
  100. if hflush(self.fp) != 0:
  101. raise IOError(herrno(self.fp), 'failed to flush HFile', self.name)
  102. def isatty(self):
  103. if self.fp == NULL:
  104. raise IOError('operation on closed HFile')
  105. return False
  106. def readable(self):
  107. return self.fp != NULL and 'r' in self.mode
  108. def read(self, Py_ssize_t size=-1):
  109. if self.fp == NULL:
  110. raise IOError('operation on closed HFile')
  111. if size == 0:
  112. return b''
  113. cdef list parts = []
  114. cdef bytes part
  115. cdef Py_ssize_t chunk_size, ret, bytes_read = 0
  116. cdef char *cpart
  117. while size == -1 or bytes_read < size:
  118. chunk_size = 4096
  119. if size != -1:
  120. chunk_size = min(chunk_size, size - bytes_read)
  121. part = PyBytes_FromStringAndSize(NULL, chunk_size)
  122. cpart = <char *>part
  123. ret = hread(self.fp, <void *>cpart, chunk_size)
  124. if ret < 0:
  125. IOError(herrno(self.fp), 'failed to read HFile', self.name)
  126. elif not ret:
  127. break
  128. bytes_read += ret
  129. if ret < chunk_size:
  130. part = cpart[:ret]
  131. parts.append(part)
  132. return b''.join(parts)
  133. def readall(self):
  134. return self.read()
  135. def readinto(self, buf):
  136. if self.fp == NULL:
  137. raise IOError('operation on closed HFile')
  138. size = len(buf)
  139. if size == 0:
  140. return size
  141. mv = memoryview(buf)
  142. ret = hread(self.fp, <void *>mv, size)
  143. if ret < 0:
  144. IOError(herrno(self.fp), 'failed to read HFile', self.name)
  145. return ret
  146. def readline(self, Py_ssize_t size=-1):
  147. if self.fp == NULL:
  148. raise IOError('operation on closed HFile')
  149. if size == 0:
  150. return b''
  151. cdef list parts = []
  152. cdef bytes part
  153. cdef Py_ssize_t chunk_size, ret, bytes_read = 0
  154. cdef char *cpart
  155. while size == -1 or bytes_read < size:
  156. chunk_size = 4096
  157. if size != -1:
  158. chunk_size = min(chunk_size, size - bytes_read)
  159. part = PyBytes_FromStringAndSize(NULL, chunk_size)
  160. cpart = <char *>part
  161. # Python bytes objects allocate an extra byte for a null terminator
  162. ret = hgetln(cpart, chunk_size+1, self.fp)
  163. if ret < 0:
  164. IOError(herrno(self.fp), 'failed to read HFile', self.name)
  165. elif not ret:
  166. break
  167. bytes_read += ret
  168. if ret < chunk_size:
  169. part = cpart[:ret]
  170. cpart = <char *>part
  171. parts.append(part)
  172. if cpart[ret-1] == b'\n':
  173. break
  174. return b''.join(parts)
  175. def readlines(self):
  176. return list(self)
  177. def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
  178. if self.fp == NULL:
  179. raise IOError('operation on closed HFile')
  180. cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
  181. if off < 0:
  182. raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
  183. return off
  184. def tell(self):
  185. if self.fp == NULL:
  186. raise IOError('operation on closed HFile')
  187. ret = htell(self.fp)
  188. if ret < 0:
  189. raise IOError(herrno(self.fp), 'tell failed on HFile', self.name)
  190. return ret
  191. def seekable(self):
  192. return self.fp != NULL
  193. def truncate(self, size=None):
  194. raise NotImplementedError()
  195. def writable(self):
  196. return self.fp != NULL and 'w' in self.mode
  197. def write(self, bytes b):
  198. if self.fp == NULL:
  199. raise IOError('operation on closed HFile')
  200. got = hwrite(self.fp, <void *>b, len(b))
  201. if got < 0:
  202. raise IOError(herrno(self.fp), 'write failed on HFile', self.name)
  203. return got
  204. def writelines(self, lines):
  205. for line in lines:
  206. self.write(line)
  207. ########################################################################
  208. ########################################################################
  209. ## Helpers for backward compatibility to hide the difference between
  210. ## boolean properties and methods
  211. ########################################################################
  212. class CallableValue(object):
  213. def __init__(self, value):
  214. self.value = value
  215. def __call__(self):
  216. return self.value
  217. def __bool__(self):
  218. return self.value
  219. def __nonzero__(self):
  220. return self.value
  221. def __eq__(self, other):
  222. return self.value == other
  223. def __ne__(self, other):
  224. return self.value != other
  225. CTrue = CallableValue(True)
  226. CFalse = CallableValue(False)
  227. ########################################################################
  228. ########################################################################
  229. ## HTSFile wrapper class (base class for AlignmentFile and VariantFile)
  230. ########################################################################
  231. cdef class HTSFile(object):
  232. """
  233. Base class for HTS file types
  234. """
  235. def __cinit__(self, *args, **kwargs):
  236. self.htsfile = NULL
  237. self.threads = 1
  238. self.duplicate_filehandle = True
  239. def close(self):
  240. if self.htsfile:
  241. hts_close(self.htsfile)
  242. self.htsfile = NULL
  243. def __dealloc__(self):
  244. if self.htsfile:
  245. hts_close(self.htsfile)
  246. self.htsfile = NULL
  247. def flush(self):
  248. """Flush any buffered data to the underlying output stream."""
  249. if self.htsfile:
  250. if hts_flush(self.htsfile) < 0:
  251. raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename))
  252. def check_truncation(self, ignore_truncation=False):
  253. """Check if file is truncated."""
  254. if not self.htsfile:
  255. return
  256. if self.htsfile.format.compression != bgzf:
  257. return
  258. cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
  259. if not bgzfp:
  260. return
  261. cdef int ret = bgzf_check_EOF(bgzfp)
  262. if ret < 0:
  263. raise IOError(errno, 'error checking for EOF marker')
  264. elif ret == 0:
  265. msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
  266. if ignore_truncation:
  267. warn(msg)
  268. else:
  269. raise IOError(msg)
  270. def __enter__(self):
  271. return self
  272. def __exit__(self, exc_type, exc_value, traceback):
  273. self.close()
  274. return False
  275. @property
  276. def category(self):
  277. """General file format category. One of UNKNOWN, ALIGNMENTS,
  278. VARIANTS, INDEX, REGIONS"""
  279. if not self.htsfile:
  280. raise ValueError('metadata not available on closed file')
  281. return FORMAT_CATEGORIES[self.htsfile.format.category]
  282. @property
  283. def format(self):
  284. """File format.
  285. One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
  286. BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
  287. """
  288. if not self.htsfile:
  289. raise ValueError('metadata not available on closed file')
  290. return FORMATS[self.htsfile.format.format]
  291. @property
  292. def version(self):
  293. """Tuple of file format version numbers (major, minor)"""
  294. if not self.htsfile:
  295. raise ValueError('metadata not available on closed file')
  296. return self.htsfile.format.version.major, self.htsfile.format.version.minor
  297. @property
  298. def compression(self):
  299. """File compression.
  300. One of NONE, GZIP, BGZF, CUSTOM."""
  301. if not self.htsfile:
  302. raise ValueError('metadata not available on closed file')
  303. return COMPRESSION[self.htsfile.format.compression]
  304. @property
  305. def description(self):
  306. """Vaguely human readable description of the file format"""
  307. if not self.htsfile:
  308. raise ValueError('metadata not available on closed file')
  309. cdef char *desc = hts_format_description(&self.htsfile.format)
  310. try:
  311. return charptr_to_str(desc)
  312. finally:
  313. free(desc)
  314. @property
  315. def is_open(self):
  316. """return True if HTSFile is open and in a valid state."""
  317. return CTrue if self.htsfile != NULL else CFalse
  318. @property
  319. def is_closed(self):
  320. """return True if HTSFile is closed."""
  321. return self.htsfile == NULL
  322. @property
  323. def closed(self):
  324. """return True if HTSFile is closed."""
  325. return self.htsfile == NULL
  326. @property
  327. def is_write(self):
  328. """return True if HTSFile is open for writing"""
  329. return self.htsfile != NULL and self.htsfile.is_write != 0
  330. @property
  331. def is_read(self):
  332. """return True if HTSFile is open for reading"""
  333. return self.htsfile != NULL and self.htsfile.is_write == 0
  334. @property
  335. def is_sam(self):
  336. """return True if HTSFile is reading or writing a SAM alignment file"""
  337. return self.htsfile != NULL and self.htsfile.format.format == sam
  338. @property
  339. def is_bam(self):
  340. """return True if HTSFile is reading or writing a BAM alignment file"""
  341. return self.htsfile != NULL and self.htsfile.format.format == bam
  342. @property
  343. def is_cram(self):
  344. """return True if HTSFile is reading or writing a BAM alignment file"""
  345. return self.htsfile != NULL and self.htsfile.format.format == cram
  346. @property
  347. def is_vcf(self):
  348. """return True if HTSFile is reading or writing a VCF variant file"""
  349. return self.htsfile != NULL and self.htsfile.format.format == vcf
  350. @property
  351. def is_bcf(self):
  352. """return True if HTSFile is reading or writing a BCF variant file"""
  353. return self.htsfile != NULL and self.htsfile.format.format == bcf
  354. def reset(self):
  355. """reset file position to beginning of file just after the header.
  356. Returns
  357. -------
  358. The file position after moving the file pointer. : pointer
  359. """
  360. return self.seek(self.start_offset)
  361. def seek(self, uint64_t offset, int whence=io.SEEK_SET):
  362. """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
  363. if not self.is_open:
  364. raise ValueError('I/O operation on closed file')
  365. if self.is_stream:
  366. raise IOError('seek not available in streams')
  367. whence = libc_whence_from_io(whence)
  368. cdef int64_t ret
  369. if self.htsfile.format.compression == bgzf:
  370. with nogil:
  371. ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
  372. elif self.htsfile.format.compression == no_compression:
  373. ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
  374. else:
  375. raise NotImplementedError("seek not implemented in files compressed by method {}".format(
  376. self.htsfile.format.compression))
  377. return ret
  378. def tell(self):
  379. """return current file position, see :meth:`pysam.HTSFile.seek`."""
  380. if not self.is_open:
  381. raise ValueError('I/O operation on closed file')
  382. if self.is_stream:
  383. raise IOError('tell not available in streams')
  384. cdef int64_t ret
  385. if self.htsfile.format.compression == bgzf:
  386. with nogil:
  387. ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
  388. elif self.htsfile.format.compression == no_compression:
  389. ret = htell(self.htsfile.fp.hfile)
  390. elif self.htsfile.format.format == cram:
  391. with nogil:
  392. ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
  393. else:
  394. raise NotImplementedError("seek not implemented in files compressed by method {}".format(
  395. self.htsfile.format.compression))
  396. return ret
  397. cdef htsFile *_open_htsfile(self) except? NULL:
  398. cdef char *cfilename
  399. cdef char *cmode = self.mode
  400. cdef int fd, dup_fd, threads
  401. threads = self.threads - 1
  402. if isinstance(self.filename, bytes):
  403. cfilename = self.filename
  404. with nogil:
  405. htsfile = hts_open(cfilename, cmode)
  406. if htsfile != NULL:
  407. hts_set_threads(htsfile, threads)
  408. return htsfile
  409. else:
  410. if isinstance(self.filename, int):
  411. fd = self.filename
  412. else:
  413. fd = self.filename.fileno()
  414. if self.duplicate_filehandle:
  415. dup_fd = dup(fd)
  416. else:
  417. dup_fd = fd
  418. # Replicate mode normalization done in hts_open_format
  419. smode = self.mode.replace(b'b', b'').replace(b'c', b'')
  420. if b'b' in self.mode:
  421. smode += b'b'
  422. elif b'c' in self.mode:
  423. smode += b'c'
  424. cmode = smode
  425. hfile = hdopen(dup_fd, cmode)
  426. if hfile == NULL:
  427. raise IOError('Cannot create hfile')
  428. try:
  429. # filename.name can be an int
  430. filename = str(self.filename.name)
  431. except AttributeError:
  432. filename = '<fd:{}>'.format(fd)
  433. filename = encode_filename(filename)
  434. cfilename = filename
  435. with nogil:
  436. htsfile = hts_hopen(hfile, cfilename, cmode)
  437. if htsfile != NULL:
  438. hts_set_threads(htsfile, threads)
  439. return htsfile
  440. def add_hts_options(self, format_options=None):
  441. """Given a list of key=value format option strings, add them to an open htsFile
  442. """
  443. cdef int rval
  444. cdef hts_opt *opts = NULL
  445. if format_options:
  446. for format_option in format_options:
  447. rval = hts_opt_add(&opts, format_option)
  448. if rval != 0:
  449. if opts != NULL:
  450. hts_opt_free(opts)
  451. raise RuntimeError('Invalid format option ({}) specified'.format(format_option))
  452. if opts != NULL:
  453. rval = hts_opt_apply(self.htsfile, opts)
  454. if rval != 0:
  455. hts_opt_free(opts)
  456. raise RuntimeError('An error occurred while applying the requested format options')
  457. hts_opt_free(opts)
  458. def parse_region(self, contig=None, start=None, stop=None,
  459. region=None, tid=None,
  460. reference=None, end=None):
  461. """parse alternative ways to specify a genomic region. A region can
  462. either be specified by :term:`contig`, `start` and
  463. `stop`. `start` and `stop` denote 0-based, half-open
  464. intervals. :term:`reference` and `end` are also accepted for
  465. backward compatibility as synonyms for :term:`contig` and
  466. `stop`, respectively.
  467. Alternatively, a samtools :term:`region` string can be
  468. supplied.
  469. If any of the coordinates are missing they will be replaced by
  470. the minimum (`start`) or maximum (`stop`) coordinate.
  471. Note that region strings are 1-based inclusive, while `start`
  472. and `stop` denote an interval in 0-based, half-open
  473. coordinates (like BED files and Python slices).
  474. If `contig` or `region` or are ``*``, unmapped reads at the end
  475. of a BAM file will be returned. Setting either to ``.`` will
  476. iterate from the beginning of the file.
  477. Returns
  478. -------
  479. tuple :
  480. a tuple of `flag`, :term:`tid`, `start` and
  481. `stop`. The flag indicates whether no coordinates were
  482. supplied and the genomic region is the complete genomic space.
  483. Raises
  484. ------
  485. ValueError
  486. for invalid or out of bounds regions.
  487. """
  488. cdef int rtid
  489. cdef int32_t rstart
  490. cdef int32_t rstop
  491. if reference is not None:
  492. if contig is not None:
  493. raise ValueError('contig and reference should not both be specified')
  494. contig = reference
  495. if end is not None:
  496. if stop is not None:
  497. raise ValueError('stop and end should not both be specified')
  498. stop = end
  499. if contig is None and tid is None and region is None:
  500. return 0, 0, 0, MAX_POS
  501. rtid = -1
  502. rstart = 0
  503. rstop = MAX_POS
  504. if start is not None:
  505. try:
  506. rstart = start
  507. except OverflowError:
  508. raise ValueError('start out of range (%i)' % start)
  509. if stop is not None:
  510. try:
  511. rstop = stop
  512. except OverflowError:
  513. raise ValueError('stop out of range (%i)' % stop)
  514. if region:
  515. region = force_str(region)
  516. if ":" in region:
  517. contig, coord = region.split(":")
  518. parts = coord.split("-")
  519. rstart = int(parts[0]) - 1
  520. if len(parts) >= 1:
  521. rstop = int(parts[1])
  522. else:
  523. contig = region
  524. if tid is not None:
  525. if not self.is_valid_tid(tid):
  526. raise IndexError('invalid tid')
  527. rtid = tid
  528. else:
  529. if contig == "*":
  530. rtid = HTS_IDX_NOCOOR
  531. elif contig == ".":
  532. rtid = HTS_IDX_START
  533. else:
  534. rtid = self.get_tid(contig)
  535. if rtid < 0:
  536. raise ValueError('invalid contig `%s`' % contig)
  537. if rstart > rstop:
  538. raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
  539. if not 0 <= rstart < MAX_POS:
  540. raise ValueError('start out of range (%i)' % rstart)
  541. if not 0 <= rstop <= MAX_POS:
  542. raise ValueError('stop out of range (%i)' % rstop)
  543. return 1, rtid, rstart, rstop
  544. def is_valid_tid(self, tid):
  545. """
  546. return True if the numerical :term:`tid` is valid; False otherwise.
  547. returns -1 if contig is not known.
  548. """
  549. raise NotImplementedError()
  550. def is_valid_reference_name(self, contig):
  551. """
  552. return True if the contig name :term:`contig` is valid; False otherwise.
  553. """
  554. return self.get_tid(contig) != -1
  555. def get_tid(self, contig):
  556. """
  557. return the numerical :term:`tid` corresponding to
  558. :term:`contig`
  559. returns -1 if contig is not known.
  560. """
  561. raise NotImplementedError()
  562. def get_reference_name(self, tid):
  563. """
  564. return :term:`contig` name corresponding to numerical :term:`tid`
  565. """
  566. raise NotImplementedError()