# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
###############################################################################
# Cython wrapper for access to tabix indexed files in bgzf format
###############################################################################
# The principal classes and functions defined in this module are:
#
# class TabixFile   class wrapping tabix indexed files in bgzf format
#
# class asTuple     Parser class for tuples
# class asGTF       Parser class for GTF formatted rows
# class asGFF3      Parser class for GFF3 formatted rows
# class asBed       Parser class for Bed formatted rows
# class asVCF       Parser class for VCF formatted rows
#
# class tabix_generic_iterator  Streamed iterator of bgzf formatted files
#
# This module additionally defines several classes that are part
# of the internal API. These are:
#
# class Parser              base class for parsers of tab-separated rows
# class tabix_file_iterator
# class TabixIterator       iterator class over rows in bgzf file
# class EmptyIterator
#
# For backwards compatibility, the following classes are also defined:
#
# class Tabixfile   equivalent to TabixFile
#
###############################################################################
#
# The MIT License
#
# Copyright (c) 2015 Andreas Heger
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
###############################################################################
import os
import sys

from libc.stdio cimport printf, fprintf, stderr
from libc.stdlib cimport malloc, free
from libc.string cimport strerror
from libc.errno cimport errno
from posix.fcntl cimport open
from posix.unistd cimport dup, read, close

from cpython cimport PyErr_SetString, PyBytes_Check, \
    PyUnicode_Check, PyBytes_FromStringAndSize, \
    PyObject_AsFileDescriptor

cimport pysam.libctabixproxies as ctabixproxies

from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
    BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
    tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
    tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
    tbx_destroy, hisremote, region_list, hts_getline, \
    TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
    no_compression, bcf, bcf_index_build2

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size


cdef class Parser:

    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def set_encoding(self, encoding):
        self.encoding = encoding

    def get_encoding(self):
        return self.encoding

    cdef parse(self, char * buffer, int length):
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        return self.parse(buffer, length)


cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        r.copy(buffer, len)
        return r


cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of blocks                          |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    fields need to be defined as well.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        r.copy(buffer, len)
        return r


cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        r.copy(buffer, len)
        return r


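# A minimal usage sketch for the parser classes above, combined with the
# TabixFile class defined below (the filename "example.bed.gz" and its
# "example.bed.gz.tbi" index are illustrative and assumed to exist):
#
#     tbx = TabixFile("example.bed.gz", parser=asBed())
#     for row in tbx.fetch("chr1", 1000, 2000):
#         print(row.contig, row.start, row.end)
#     tbx.close()
#
# With parser=asTuple() each row is instead returned as a tuple-like
# proxy that is accessed by numeric index (row[0], row[1], ...).

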
cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).

    Parameters
    ----------

    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``.

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.

    parser : :class:`pysam.Parser`
        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string
        The encoding passed to the parser

    threads : integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)

    Raises
    ------

    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        self._open(filename, mode, index, *args, **kwargs)
        self.encoding = encoding

    def _open(self,
              filename,
              mode='r',
              index=None,
              threads=1,
              ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads = threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)

        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.

        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched.

        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.

        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file,
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.
        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0

                if start < 0:
                    raise ValueError("start out of range (%i)" % start)
                elif start > end:
                    raise ValueError(
                        'start (%i) >= end (%i)' % (start, end))
                elif start == end:
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    raise ValueError("start out of range (%i)" % start)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior to tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)

        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None:
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.

        .. note::
            The header is returned as a list of lines without the
            newline character.
        '''
        def __get__(self):

            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index

            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL

            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []

            with nogil:
                fp = hts_open(cfilename, 'r')
            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)
            if tbx == NULL:
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            with nogil:
                hts_close(fp)
            free(buffer.s)
            # release the index handle loaded above
            tbx_destroy(tbx)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences

            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))

            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result

    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__(self):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)


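# A usage sketch for TabixFile itself (the filename "example.gtf.gz" and
# its "example.gtf.gz.tbi" index are illustrative and assumed to exist):
#
#     tbx = TabixFile("example.gtf.gz", parser=asGTF())
#     print(tbx.contigs)              # chromosome names from the index
#     print(tbx.header)               # list of '#'-prefixed header lines
#     for gtf in tbx.fetch("chr1", 10000, 20000):
#         print(gtf.feature, gtf.start, gtf.end)
#     tbx.close()

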
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.
    """
    def __init__(self, encoding="ascii"):
        self.encoding = encoding

    def __iter__(self):
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)
            if retval < 0:
                break

            if self.buffer.s[0] != b'#':
                break

        return retval

    def _itr_error(self, err: int):
        if err == -5:
            return IOError("iteration on closed file")
        else:
            return ValueError(f"iteration failed (error code {err})")

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration if retval == -1 else self._itr_error(retval)
        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)


class EmptyIterator:
    '''empty iterator'''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration()


cdef class TabixIteratorParsed(TabixIterator):
    """iterates over rows in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """
    def __init__(self, Parser parser):
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration if retval == -1 else self._itr_error(retval)
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break

            return dret
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)


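# A usage sketch for GZIterator (the filename "data.txt.gz" is
# illustrative; any gzip- or bgzip-compressed text file works):
#
#     for line in GZIterator("data.txt.gz"):
#         print(line)                 # decoded line without the newline

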
cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            return self.buffer.s
        else:
            raise StopIteration


cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning parsed rows.
    '''

    def __init__(self, parser):
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)


def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''
    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY
    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    if fd_src < 0:
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
        if c > 0:
            is_empty = False
        r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))


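# A usage sketch for tabix_compress (the filenames are illustrative;
# "example.bed" is assumed to be a plain-text file):
#
#     tabix_compress("example.bed", "example.bed.gz", force=True)

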
def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    "psltbl", "pileup".

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` (or ``.csi``) to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data
    '''

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col, start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))
    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    if fmt.compression == no_compression:
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
    }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1
        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals. When using a preset, conversion is
        # automatically taken care of. Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1,
                     ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0:
            min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename


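# A usage sketch for tabix_index (the filename "example.bed" is
# illustrative and assumed to be sorted by contig and position). The
# plain-text file is bgzf-compressed first, so the returned name carries
# the ".gz" suffix:
#
#     fn = tabix_index("example.bed", preset="bed", force=True)
#     # fn == "example.bed.gz"; the index is "example.bed.gz.tbi"

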
# #########################################################
# cdef class tabix_file_iterator_old:
#     '''iterate over ``infile``.
#
#     This iterator is not safe. If the :meth:`__next__()` method is called
#     after ``infile`` is closed, the result is undefined (see ``fclose()``).
#
#     The iterator might either raise a StopIteration or segfault.
#     '''
#
#     def __cinit__(self,
#                   infile,
#                   Parser parser,
#                   int buffer_size = 65536 ):
#
#         cdef int fd = PyObject_AsFileDescriptor( infile )
#         if fd == -1: raise ValueError( "I/O operation on closed file." )
#         self.infile = fdopen( fd, 'r')
#
#         if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
#
#         self.buffer = <char*>malloc( buffer_size )
#         self.size = buffer_size
#         self.parser = parser
#
#     def __iter__(self):
#         return self
#
#     cdef __cnext__(self):
#
#         cdef char * b
#         cdef size_t nbytes
#         b = self.buffer
#
#         while not feof( self.infile ):
#             nbytes = getline( &b, &self.size, self.infile)
#
#             # stop at first error or eof
#             if (nbytes == -1): break
#             # skip comments
#             if (b[0] == '#'): continue
#
#             # skip empty lines
#             if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
#
#             # make sure that entry is complete
#             if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
#                 result = b
#                 raise ValueError( "incomplete line at %s" % result )
#
#             # make sure that this goes fully through C
#             # otherwise buffer is copied to/from a
#             # Python object causing segfaults as
#             # the wrong memory is freed
#             return self.parser.parse( b, nbytes )
#
#         raise StopIteration
#
#     def __dealloc__(self):
#         free(self.buffer)
#
#     def __next__(self):
#         return self.__cnext__()


#########################################################
#########################################################
#########################################################
## Iterators for parsing through unindexed files.
#########################################################

# cdef buildGzipError(void *gzfp):
#     cdef int errnum = 0
#     cdef char *s = gzerror(gzfp, &errnum)
#     return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)


cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile
        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without
        # decompression. When reading, this will be detected automatically
        # by looking for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError("tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()


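# A usage sketch for tabix_file_iterator (the filename is illustrative).
# The iterator reads from the underlying file descriptor, so the file
# should be opened in binary mode; both plain and bgzf/gzip-compressed
# input are accepted:
#
#     with open("example.gtf.gz", "rb") as f:
#         for gtf in tabix_file_iterator(f, asGTF()):
#             print(gtf.feature)

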
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file;
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            nbytes = len(line)
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration


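# A usage sketch for tabix_generic_iterator with a file-like object from
# the gzip module (the filename "example.bed.gz" is illustrative):
#
#     import gzip
#     with gzip.open("example.bed.gz", "rt") as f:
#         for row in tabix_generic_iterator(f, asBed()):
#             print(row.contig, row.start, row.end)

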
def tabix_iterator(infile, parser):
    """return an iterator over all entries in a file.

    Results are returned parsed as specified by the *parser*. If
    *parser* is None, the results are returned as an unparsed string.
    Otherwise, *parser* is assumed to be a functor that will return
    parsed data (see for example :class:`~pysam.asTuple` and
    :class:`~pysam.asGTF`).
    """
    return tabix_generic_iterator(infile, parser)


cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead."""
    pass


__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]