|
- # cython: language_level=3
- # cython: embedsignature=True
- # cython: profile=True
- ###############################################################################
- ###############################################################################
- # Cython wrapper for access to tabix indexed files in bgzf format
- ###############################################################################
- # The principal classes and functions defined in this module are:
- #
- # class TabixFile class wrapping tabix indexed files in bgzf format
- #
- # class asTuple Parser class for tuples
- # class asGTF Parser class for GTF formatted rows
- # class asGFF3 Parser class for GFF3 formatted rows
- # class asBed Parser class for Bed formatted rows
- # class asVCF Parser class for VCF formatted rows
- #
- # class tabix_generic_iterator Streamed iterator of bgzf formatted files
- #
- # Additionally this module defines several additional classes that are part
- # of the internal API. These are:
- #
- # class Parser base class for parsers of tab-separated rows
- # class tabix_file_iterator
- # class TabixIterator iterator class over rows in bgzf file
- # class EmptyIterator
- #
- # For backwards compatibility, the following classes are also defined:
- #
- # class Tabixfile equivalent to TabixFile
- #
- ###############################################################################
- #
- # The MIT License
- #
- # Copyright (c) 2015 Andreas Heger
- #
- # Permission is hereby granted, free of charge, to any person obtaining a
- # copy of this software and associated documentation files (the "Software"),
- # to deal in the Software without restriction, including without limitation
- # the rights to use, copy, modify, merge, publish, distribute, sublicense,
- # and/or sell copies of the Software, and to permit persons to whom the
- # Software is furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in
- # all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- # DEALINGS IN THE SOFTWARE.
- #
- ###############################################################################
- import os
- import sys
-
- from libc.stdio cimport printf, fprintf, stderr
- from libc.string cimport strerror
- from libc.errno cimport errno
- from posix.unistd cimport dup
-
- from cpython cimport PyErr_SetString, PyBytes_Check, \
- PyUnicode_Check, PyBytes_FromStringAndSize, \
- PyObject_AsFileDescriptor
-
- cimport pysam.libctabixproxies as ctabixproxies
-
- from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
- BGZF, bgzf_open, bgzf_dopen, bgzf_close, bgzf_write, \
- tbx_index_build2, tbx_index_load2, tbx_itr_queryi, tbx_itr_querys, \
- tbx_conf_t, tbx_seqnames, tbx_itr_next, tbx_itr_destroy, \
- tbx_destroy, hisremote, region_list, hts_getline, \
- TBX_GENERIC, TBX_SAM, TBX_VCF, TBX_UCSC, hts_get_format, htsFormat, \
- no_compression, bcf, bcf_index_build2
-
- from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
- from pysam.libcutils cimport encode_filename, from_string_and_size
-
cdef class Parser:
    """Abstract base class for row parsers.

    A parser converts a raw, NUL-terminated tab-separated row
    (C ``char *`` buffer plus length) into a parsed Python object.
    Subclasses implement :meth:`parse`; instances are also callable,
    which simply forwards to :meth:`parse`.
    """

    def __init__(self, encoding="ascii"):
        # Encoding used by subclasses when decoding fields to str.
        self.encoding = encoding

    def set_encoding(self, encoding):
        """Set the encoding used when decoding row fields."""
        self.encoding = encoding

    def get_encoding(self):
        """Return the encoding used when decoding row fields."""
        return self.encoding

    cdef parse(self, char * buffer, int length):
        # Abstract method: subclasses must override and return a row proxy.
        raise NotImplementedError(
            'parse method of %s not implemented' % str(self))

    def __call__(self, char * buffer, int length):
        # Callable interface so a Parser instance can be used as a functor.
        return self.parse(buffer, length)
-
-
cdef class asTuple(Parser):
    '''converts a :term:`tabix row` into a python tuple.

    A field in a row is accessed by numeric index.
    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.TupleProxy r
        r = ctabixproxies.TupleProxy(self.encoding)
        # need to copy - there were some
        # persistence issues with "present"
        # (the caller's buffer is reused between rows, so the proxy
        # must own its own copy of the data)
        r.copy(buffer, len)
        return r
-
-
cdef class asGFF3(Parser):
    '''converts a :term:`tabix row` into a GFF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    .. note:: NOTE(review): the GFF3 specification defines column 2 as
       *source* and column 3 as *feature* (type); the table above lists
       them in the opposite order — verify against the
       ``GFF3Proxy`` field mapping in ``ctabixproxies``.

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GFF3Proxy r
        r = ctabixproxies.GFF3Proxy(self.encoding)
        # copy: the caller's row buffer is reused between iterations
        r.copy(buffer, len)
        return r
-
-
cdef class asGTF(Parser):
    '''converts a :term:`tabix row` into a GTF record with the following
    fields:

    +----------+----------+-------------------------------+
    |*Column*  |*Name*    |*Content*                      |
    +----------+----------+-------------------------------+
    |1         |contig    |the chromosome name            |
    +----------+----------+-------------------------------+
    |2         |feature   |The feature type               |
    +----------+----------+-------------------------------+
    |3         |source    |The feature source             |
    +----------+----------+-------------------------------+
    |4         |start     |genomic start coordinate       |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |5         |end       |genomic end coordinate         |
    |          |          |(0-based)                      |
    +----------+----------+-------------------------------+
    |6         |score     |feature score                  |
    +----------+----------+-------------------------------+
    |7         |strand    |strand                         |
    +----------+----------+-------------------------------+
    |8         |frame     |frame                          |
    +----------+----------+-------------------------------+
    |9         |attributes|the attribute field            |
    +----------+----------+-------------------------------+

    GTF formatted entries also define the following fields that
    are derived from the attributes field:

    +--------------------+------------------------------+
    |*Name*              |*Content*                     |
    +--------------------+------------------------------+
    |gene_id             |the gene identifier           |
    +--------------------+------------------------------+
    |transcript_id       |the transcript identifier     |
    +--------------------+------------------------------+

    .. note:: NOTE(review): the GTF specification defines column 2 as
       *source* and column 3 as *feature*; the table above lists them in
       the opposite order — verify against the ``GTFProxy`` field mapping
       in ``ctabixproxies``.

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.GTFProxy r
        r = ctabixproxies.GTFProxy(self.encoding)
        # copy: the caller's row buffer is reused between iterations
        r.copy(buffer, len)
        return r
-
-
cdef class asBed(Parser):
    '''converts a :term:`tabix row` into a bed record
    with the following fields:

    +-----------+-----------+------------------------------------------+
    |*Column*   |*Field*    |*Contents*                                |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |1          |contig     |contig                                    |
    |           |           |                                          |
    +-----------+-----------+------------------------------------------+
    |2          |start      |genomic start coordinate (zero-based)     |
    +-----------+-----------+------------------------------------------+
    |3          |end        |genomic end coordinate plus one           |
    |           |           |(zero-based)                              |
    +-----------+-----------+------------------------------------------+
    |4          |name       |name of feature.                          |
    +-----------+-----------+------------------------------------------+
    |5          |score      |score of feature                          |
    +-----------+-----------+------------------------------------------+
    |6          |strand     |strand of feature                         |
    +-----------+-----------+------------------------------------------+
    |7          |thickStart |thickStart                                |
    +-----------+-----------+------------------------------------------+
    |8          |thickEnd   |thickEnd                                  |
    +-----------+-----------+------------------------------------------+
    |9          |itemRGB    |itemRGB                                   |
    +-----------+-----------+------------------------------------------+
    |10         |blockCount |number of blocks                          |
    +-----------+-----------+------------------------------------------+
    |11         |blockSizes |',' separated string of block sizes       |
    +-----------+-----------+------------------------------------------+
    |12         |blockStarts|',' separated string of block genomic     |
    |           |           |start positions                           |
    +-----------+-----------+------------------------------------------+

    Only the first three fields are required. Additional
    fields are optional, but if one is defined, all the preceding
    need to be defined as well.

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.BedProxy r
        r = ctabixproxies.BedProxy(self.encoding)
        # copy: the caller's row buffer is reused between iterations
        r.copy(buffer, len)
        return r
-
-
cdef class asVCF(Parser):
    '''converts a :term:`tabix row` into a VCF record with
    the following fields:

    +----------+---------+------------------------------------+
    |*Column*  |*Field*  |*Contents*                          |
    |          |         |                                    |
    +----------+---------+------------------------------------+
    |1         |contig   |chromosome                          |
    +----------+---------+------------------------------------+
    |2         |pos      |chromosomal position, zero-based    |
    +----------+---------+------------------------------------+
    |3         |id       |id                                  |
    +----------+---------+------------------------------------+
    |4         |ref      |reference allele                    |
    +----------+---------+------------------------------------+
    |5         |alt      |alternate alleles                   |
    +----------+---------+------------------------------------+
    |6         |qual     |quality                             |
    +----------+---------+------------------------------------+
    |7         |filter   |filter                              |
    +----------+---------+------------------------------------+
    |8         |info     |info                                |
    +----------+---------+------------------------------------+
    |9         |format   |format specifier.                   |
    +----------+---------+------------------------------------+

    Access to genotypes is via index::

        contig = vcf.contig
        first_sample_genotype = vcf[0]
        second_sample_genotype = vcf[1]

    '''
    cdef parse(self, char * buffer, int len):
        cdef ctabixproxies.VCFProxy r
        r = ctabixproxies.VCFProxy(self.encoding)
        # copy: the caller's row buffer is reused between iterations
        r.copy(buffer, len)
        return r
-
-
cdef class TabixFile:
    """Random access to bgzf formatted files that
    have been indexed by :term:`tabix`.

    The file is automatically opened. The index file of file
    ``<filename>`` is expected to be called ``<filename>.tbi``
    by default (see parameter `index`).

    Parameters
    ----------

    filename : string
        Filename of bgzf file to be opened.

    index : string
        The filename of the index. If not set, the default is to
        assume that the index is called ``filename.tbi``

    mode : char
        The file opening mode. Currently, only ``r`` is permitted.

    parser : :class:`pysam.Parser`

        sets the default parser for this tabix file. If `parser`
        is None, the results are returned as an unparsed string.
        Otherwise, `parser` is assumed to be a functor that will return
        parsed data (see for example :class:`~pysam.asTuple` and
        :class:`~pysam.asGTF`).

    encoding : string

        The encoding passed to the parser

    threads: integer
        Number of threads to use for decompressing Tabix files.
        (Default=1)


    Raises
    ------

    ValueError
        if index file is missing.

    IOError
        if file could not be opened
    """
    def __cinit__(self,
                  filename,
                  mode='r',
                  parser=None,
                  index=None,
                  encoding="ascii",
                  threads=1,
                  *args,
                  **kwargs ):

        self.htsfile = NULL
        self.is_remote = False
        self.is_stream = False
        self.parser = parser
        self.threads = threads
        # BUG FIX: forward `threads` to _open(); previously it was omitted,
        # so _open() silently reset self.threads to its default of 1 and
        # the user-supplied value was lost.
        self._open(filename, mode, index, threads, *args, **kwargs)
        self.encoding = encoding

    def is_open(self):
        '''return True if the file is open.'''
        # BUG FIX: fetch() calls self.is_open(), but no such method was
        # defined on this class, which has no base class to inherit it
        # from; without it fetch() raised AttributeError.
        return self.htsfile != NULL

    def _open( self,
               filename,
               mode='r',
               index=None,
               threads=1,
               ):
        '''open a :term:`tabix file` for reading.'''

        if mode != 'r':
            raise ValueError("invalid file opening mode `%s`" % mode)

        # re-opening: release any previously held handles first
        if self.htsfile != NULL:
            self.close()
        self.htsfile = NULL
        self.threads=threads

        filename_index = index or (filename + ".tbi")
        # encode all the strings to pass to tabix
        self.filename = encode_filename(filename)
        self.filename_index = encode_filename(filename_index)

        self.is_stream = self.filename == b'-'
        self.is_remote = hisremote(self.filename)

        # existence checks only make sense for local paths
        if not self.is_remote:
            if not os.path.exists(filename):
                raise IOError("file `%s` not found" % filename)

            if not os.path.exists(filename_index):
                raise IOError("index `%s` not found" % filename_index)

        # open file
        cdef char *cfilename = self.filename
        cdef char *cfilename_index = self.filename_index
        with nogil:
            self.htsfile = hts_open(cfilename, 'r')

        if self.htsfile == NULL:
            raise IOError("could not open file `%s`" % filename)

        #if self.htsfile.format.category != region_list:
        #    raise ValueError("file does not contain region data")

        with nogil:
            self.index = tbx_index_load2(cfilename, cfilename_index)

        if self.index == NULL:
            raise IOError("could not open index for `%s`" % filename)

        if not self.is_stream:
            self.start_offset = self.tell()

    def _dup(self):
        '''return a copy of this tabix file.

        The file is being re-opened.
        '''
        return TabixFile(self.filename,
                         mode="r",
                         threads=self.threads,
                         parser=self.parser,
                         index=self.filename_index,
                         encoding=self.encoding)

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None,
              parser=None,
              multiple_iterators=False):
        '''fetch one or more rows in a :term:`region` using 0-based
        indexing. The region is specified by :term:`reference`,
        *start* and *end*. Alternatively, a samtools :term:`region`
        string can be supplied.

        Without *reference* or *region* all entries will be fetched.

        If only *reference* is set, all reads matching on *reference*
        will be fetched.

        If *parser* is None, the default parser will be used for
        parsing.

        Set *multiple_iterators* to true if you will be using multiple
        iterators on the same file at the same time. The iterator
        returned will receive its own copy of a filehandle to the file
        effectively re-opening the file. Re-opening a file creates
        some overhead, so beware.

        '''
        if not self.is_open():
            raise ValueError("I/O operation on closed file")

        # convert coordinates to region string, which is one-based
        if reference:
            if end is not None:
                if end < 0:
                    raise ValueError("end out of range (%i)" % end)
                if start is None:
                    start = 0

                if start < 0:
                    # BUG FIX: report the offending start value (was `end`)
                    raise ValueError("start out of range (%i)" % start)
                elif start > end:
                    # BUG FIX: message said ">=", but equality is handled
                    # separately below and yields an empty iterator
                    raise ValueError(
                        'start (%i) > end (%i)' % (start, end))
                elif start == end:
                    return EmptyIterator()
                else:
                    region = '%s:%i-%i' % (reference, start + 1, end)
            elif start is not None:
                if start < 0:
                    # BUG FIX: was `% end`, which is None in this branch and
                    # raised TypeError instead of the intended ValueError
                    raise ValueError("start out of range (%i)" % start)
                region = '%s:%i' % (reference, start + 1)
            else:
                region = reference

        # get iterator
        cdef hts_itr_t * itr
        cdef char *cstr
        cdef TabixFile fileobj

        # reopen the same file if necessary
        if multiple_iterators:
            fileobj = self._dup()
        else:
            fileobj = self

        if region is None:
            # without region or reference - iterate from start
            with nogil:
                itr = tbx_itr_queryi(fileobj.index,
                                     HTS_IDX_START,
                                     0,
                                     0)
        else:
            s = force_bytes(region, encoding=fileobj.encoding)
            cstr = s
            with nogil:
                itr = tbx_itr_querys(fileobj.index, cstr)

        if itr == NULL:
            if region is None:
                if len(self.contigs) > 0:
                    # when accessing a tabix file created prior tabix 1.0
                    # the full-file iterator is empty.
                    raise ValueError(
                        "could not create iterator, possible "
                        "tabix version mismatch")
                else:
                    # possible reason is that the file is empty -
                    # return an empty iterator
                    return EmptyIterator()
            else:
                raise ValueError(
                    "could not create iterator for region '%s'" %
                    region)

        # use default parser if no parser is specified
        if parser is None:
            parser = fileobj.parser

        cdef TabixIterator a
        if parser is None:
            a = TabixIterator(encoding=fileobj.encoding)
        else:
            parser.set_encoding(fileobj.encoding)
            a = TabixIteratorParsed(parser)

        # the iterator keeps a reference to the (possibly duplicated)
        # file object so it stays alive while iterating
        a.tabixfile = fileobj
        a.iterator = itr

        return a

    ###############################################################
    ###############################################################
    ###############################################################
    ## properties
    ###############################################################
    property header:
        '''the file header.

        The file header consists of the lines at the beginning of a
        file that are prefixed by the comment character ``#``.

        .. note::
            The header is returned as an iterator presenting lines
            without the newline character.
        '''

        def __get__(self):

            cdef char *cfilename = self.filename
            cdef char *cfilename_index = self.filename_index

            cdef kstring_t buffer
            buffer.l = buffer.m = 0
            buffer.s = NULL

            cdef htsFile * fp = NULL
            cdef int KS_SEP_LINE = 2
            cdef tbx_t * tbx = NULL
            lines = []
            # re-open the file independently of self.htsfile so that
            # reading the header does not disturb ongoing iteration
            with nogil:
                fp = hts_open(cfilename, 'r')

            if fp == NULL:
                raise OSError("could not open {} for reading header".format(self.filename))

            with nogil:
                tbx = tbx_index_load2(cfilename, cfilename_index)

            if tbx == NULL:
                raise OSError("could not load .tbi/.csi index of {}".format(self.filename))

            # collect leading lines starting with the configured meta
            # character; stop at the first non-meta line
            while hts_getline(fp, KS_SEP_LINE, &buffer) >= 0:
                if not buffer.l or buffer.s[0] != tbx.conf.meta_char:
                    break
                lines.append(force_str(buffer.s, self.encoding))

            with nogil:
                hts_close(fp)
                free(buffer.s)

            return lines

    property contigs:
        '''list of chromosome names'''
        def __get__(self):
            cdef const char ** sequences
            cdef int nsequences

            with nogil:
                sequences = tbx_seqnames(self.index, &nsequences)
            cdef int x
            result = []
            for x from 0 <= x < nsequences:
                result.append(force_str(sequences[x]))

            # htslib instructions:
            # only free container, not the sequences themselves
            free(sequences)

            return result

    def close(self):
        '''
        closes the :class:`pysam.TabixFile`.'''
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
            self.index = NULL

    def __dealloc__( self ):
        # remember: dealloc cannot call other python methods
        # note: no doc string
        # note: __del__ is not called.
        if self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
        if self.index != NULL:
            tbx_destroy(self.index)
-
-
cdef class TabixIterator:
    """iterates over rows in *tabixfile* in region
    given by *tid*, *start* and *end*.

    Rows are returned as unparsed strings, decoded with *encoding*.
    """

    def __init__(self, encoding="ascii"):
        # Encoding used when converting the raw row buffer to str.
        self.encoding = encoding

    def __iter__(self):
        # (Re-)initialize the kstring buffer that receives each row;
        # tbx_itr_next() will allocate/grow it as needed.
        self.buffer.s = NULL
        self.buffer.l = 0
        self.buffer.m = 0

        return self

    cdef int __cnext__(self):
        '''iterate to next element.

        Return -5 if file has been closed when this function
        was called.
        '''
        if self.tabixfile.htsfile == NULL:
            return -5

        cdef int retval

        while 1:
            with nogil:
                retval = tbx_itr_next(
                    self.tabixfile.htsfile,
                    self.tabixfile.index,
                    self.iterator,
                    &self.buffer)

            # negative return value: end of iteration or error
            if retval < 0:
                break

            # skip lines starting with '#' (header/comment lines)
            if self.buffer.s[0] != b'#':
                break

        return retval

    def _itr_error(self, err: int):
        # Map a negative __cnext__() return code to an exception
        # instance (the caller raises it).
        if err == -5:
            return IOError("iteration on closed file")
        else:
            return ValueError(f"iteration failed (error code {err})")

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval < 0:
            # -1 signals normal end-of-iteration; anything else is an error
            raise StopIteration if retval == -1 else self._itr_error(retval)

        return charptr_to_str(self.buffer.s, self.encoding)

    def __dealloc__(self):
        # free the htslib iterator and the row buffer
        if <void*>self.iterator != NULL:
            tbx_itr_destroy(self.iterator)
        if self.buffer.s != NULL:
            free(self.buffer.s)
-
-
class EmptyIterator:
    '''an iterator over an empty sequence: it yields nothing.'''

    def __iter__(self):
        # an iterator is its own iterable
        return self

    def __next__(self):
        # there is never a next element
        raise StopIteration
-
-
cdef class TabixIteratorParsed(TabixIterator):
    """iterates over mapped reads in a region.

    The *parser* determines the encoding.

    Returns parsed data.
    """

    def __init__(self, Parser parser):
        # base __init__ sets the default encoding; the parser carries
        # its own encoding for field decoding
        super().__init__()
        self.parser = parser

    def __next__(self):
        """python version of next().

        pyrex uses this non-standard name instead of next()
        """

        cdef int retval = self.__cnext__()
        if retval < 0:
            # -1 signals normal end-of-iteration; anything else is an error
            raise StopIteration if retval == -1 else self._itr_error(retval)

        # hand the raw row buffer to the parser, which copies it
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
-
-
cdef class GZIterator:
    def __init__(self, filename, int buffer_size=65536, encoding="ascii"):
        '''iterate line-by-line through gzip (or bgzip)
        compressed file.

        Lines are decoded with *encoding* and returned without
        trailing newline.
        '''
        if not os.path.exists(filename):
            raise IOError("No such file or directory: %s" % filename)

        filename = encode_filename(filename)
        cdef char *cfilename = filename
        with nogil:
            self.gzipfile = bgzf_open(cfilename, "r")
        self._filename = filename
        self.kstream = ks_init(self.gzipfile)
        self.encoding = encoding

        # NOTE(review): buffer.m stays 0 although buffer_size bytes are
        # allocated; ks_getuntil uses m as the capacity, so it will
        # reallocate on first use — verify the initial malloc is needed.
        self.buffer.l = 0
        self.buffer.m = 0
        self.buffer.s = <char*>malloc(buffer_size)

    def __dealloc__(self):
        '''close file.'''
        if self.gzipfile != NULL:
            bgzf_close(self.gzipfile)
            self.gzipfile = NULL
        if self.buffer.s != NULL:
            free(self.buffer.s)
        if self.kstream != NULL:
            ks_destroy(self.kstream)

    def __iter__(self):
        return self

    cdef int __cnext__(self):
        # Read one line into self.buffer.
        # Returns the delimiter code (>= 0) on success, -1 at EOF/error.
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            if retval < 0:
                break

            return dret
        # only reached via the break above, i.e. on EOF or read error
        return -1

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        return force_str(self.buffer.s, self.encoding)
-
-
cdef class GZIteratorHead(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.

    Iteration stops at the first line that does not start with ``#``.
    '''

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration
        if self.buffer.s[0] == b'#':
            # NOTE(review): returns the raw char* (coerced to bytes),
            # unlike the parent class which decodes to str with
            # self.encoding — verify this asymmetry is intended.
            return self.buffer.s
        else:
            raise StopIteration
-
-
cdef class GZIteratorParsed(GZIterator):
    '''iterate line-by-line through gzip (or bgzip)
    compressed file returning comments at top of file.

    Each line is passed through *parser*.
    '''

    def __init__(self, parser):
        # NOTE(review): GZIterator.__init__ (which opens the file and
        # sets up kstream/buffer) is never called here, so instances
        # constructed only with a parser have no open file — verify how
        # this class is meant to be instantiated.
        self.parser = parser

    def __next__(self):
        """python version of next().
        """
        cdef int retval = self.__cnext__()
        if retval < 0:
            raise StopIteration

        # hand the raw line buffer to the parser, which copies it
        return self.parser.parse(self.buffer.s,
                                 self.buffer.l)
-
-
def tabix_compress(filename_in,
                   filename_out,
                   force=False):
    '''compress *filename_in* writing the output to *filename_out*.

    The input is copied in 64kb chunks into a bgzf-compressed output
    file.

    Raise an IOError if *filename_out* already exists, unless *force*
    is set.
    '''

    if not force and os.path.exists(filename_out):
        raise IOError(
            "Filename '%s' already exists, use *force* to "
            "overwrite" % filename_out)

    cdef int WINDOW_SIZE
    cdef int c, r
    cdef void * buffer
    cdef BGZF * fp
    cdef int fd_src
    cdef bint is_empty = True
    cdef int O_RDONLY
    O_RDONLY = os.O_RDONLY

    # copy chunk size
    WINDOW_SIZE = 64 * 1024

    fn = encode_filename(filename_out)
    cdef char *cfn = fn
    with nogil:
        fp = bgzf_open(cfn, "w")
    if fp == NULL:
        raise IOError("could not open '%s' for writing" % filename_out)

    fn = encode_filename(filename_in)
    fd_src = open(fn, O_RDONLY)
    # BUG FIX: POSIX open() returns -1 on failure (0 is a valid file
    # descriptor); the previous check `fd_src == 0` never detected errors.
    if fd_src < 0:
        raise IOError("could not open '%s' for reading" % filename_in)

    buffer = malloc(WINDOW_SIZE)
    c = 1

    # read/write loop: read() returns 0 at EOF, < 0 on error
    while c > 0:
        with nogil:
            c = read(fd_src, buffer, WINDOW_SIZE)
            if c > 0:
                is_empty = False
            r = bgzf_write(fp, buffer, c)
        if r < 0:
            free(buffer)
            raise IOError("writing failed")

    free(buffer)
    r = bgzf_close(fp)
    if r < 0:
        raise IOError("error %i when writing to file %s" % (r, filename_out))

    r = close(fd_src)
    # an empty file will return with -1, thus ignore this.
    if r < 0:
        if not (r == -1 and is_empty):
            raise IOError("error %i when closing file %s" % (r, filename_in))
-
-
def tabix_index(filename,
                force=False,
                seq_col=None,
                start_col=None,
                end_col=None,
                preset=None,
                meta_char="#",
                int line_skip=0,
                zerobased=False,
                int min_shift=-1,
                index=None,
                keep_original=False,
                csi=False,
                ):
    '''index tab-separated *filename* using tabix.

    An existing index will not be overwritten unless *force* is set.

    The index will be built from coordinates in columns *seq_col*,
    *start_col* and *end_col*.

    The contents of *filename* have to be sorted by contig and
    position - the method does not check if the file is sorted.

    Column indices are 0-based. Note that this is different from the
    tabix command line utility where column indices start at 1.

    Coordinates in the file are assumed to be 1-based unless
    *zerobased* is set.

    If *preset* is provided, the column coordinates are taken from a
    preset. Valid values for preset are "gff", "bed", "sam", "vcf",
    psltbl", "pileup".

    Lines beginning with *meta_char* and the first *line_skip* lines
    will be skipped.

    If *filename* is not detected as a gzip file it will be automatically
    compressed. The original file will be removed and only the compressed
    file will be retained.

    By default or when *min_shift* is 0, creates a TBI index. If *min_shift*
    is greater than zero and/or *csi* is True, creates a CSI index with a
    minimal interval size of 1<<*min_shift* (1<<14 if only *csi* is set).

    *index* controls the filename which should be used for creating the index.
    If not set, the default is to append ``.tbi`` to *filename*.

    When automatically compressing files, if *keep_original* is set the
    uncompressed file will not be deleted.

    returns the filename of the compressed data

    '''

    if preset is None and \
       (seq_col is None or start_col is None or end_col is None):
        raise ValueError(
            "neither preset nor seq_col,start_col and end_col given")

    fn = encode_filename(filename)
    cdef char *cfn = fn

    # probe the file format/compression state
    cdef htsFile *fp = hts_open(cfn, "r")
    if fp == NULL:
        raise IOError("Could not open file '%s': %s" % (filename, force_str(strerror(errno))))

    cdef htsFormat fmt = hts_get_format(fp)[0]
    hts_close(fp)

    # compress on the fly if the input is plain text
    if fmt.compression == no_compression:
        tabix_compress(filename, filename + ".gz", force=force)
        if not keep_original:
            os.unlink(filename)
        filename += ".gz"
        fn = encode_filename(filename)
        cfn = fn

    # columns (1-based):
    #   preset-code, contig, start, end, metachar for
    #   comments, lines to ignore at beginning
    # 0 is a missing column
    preset2conf = {
        'gff' : (TBX_GENERIC, 1, 4, 5, ord('#'), 0),
        'bed' : (TBX_UCSC, 1, 2, 3, ord('#'), 0),
        'psltbl' : (TBX_UCSC, 15, 17, 18, ord('#'), 0),
        'sam' : (TBX_SAM, 3, 4, 0, ord('@'), 0),
        'vcf' : (TBX_VCF, 1, 2, 0, ord('#'), 0),
        }

    conf_data = None
    if preset == "bcf" or fmt.format == bcf:
        csi = True
    elif preset:
        try:
            conf_data = preset2conf[preset]
        except KeyError:
            raise KeyError(
                "unknown preset '%s', valid presets are '%s'" %
                (preset, ",".join(preset2conf.keys())))
    else:
        if end_col is None:
            end_col = -1

        preset = 0
        # tabix internally works with 0-based coordinates and
        # open/closed intervals. When using a preset, conversion is
        # automatically taken care of. Otherwise, the coordinates are
        # assumed to be 1-based closed intervals and -1 is subtracted
        # from the start coordinate. To avoid doing this, set the
        # TI_FLAG_UCSC=0x10000 flag:
        if zerobased:
            preset = preset | TBX_UCSC

        conf_data = (preset, seq_col + 1, start_col + 1, end_col + 1, ord(meta_char), line_skip)

    cdef tbx_conf_t conf
    if conf_data:
        conf.preset, conf.sc, conf.bc, conf.ec, conf.meta_char, conf.line_skip = conf_data

    # choose index flavour and suffix
    if csi or min_shift > 0:
        suffix = ".csi"
        if min_shift <= 0: min_shift = 14
    else:
        suffix = ".tbi"
        min_shift = 0

    index = index or filename + suffix
    fn_index = encode_filename(index)

    if not force and os.path.exists(index):
        raise IOError(
            "filename '%s' already exists, use *force* to overwrite" % index)

    cdef char *fnidx = fn_index
    cdef int retval = 0

    if csi and fmt.format == bcf:
        with nogil:
            retval = bcf_index_build2(cfn, fnidx, min_shift)
    else:
        # BUG FIX guard: previously preset="bcf" on a non-BCF file fell
        # through to this branch with the `conf` struct uninitialized
        # (undefined behaviour inside tbx_index_build2).
        if conf_data is None:
            raise ValueError(
                "preset 'bcf' requires a BCF formatted file, "
                "cannot build tabix index for '%s'" % filename)
        with nogil:
            retval = tbx_index_build2(cfn, fnidx, min_shift, &conf)

    if retval != 0:
        raise OSError("building of index for {} failed".format(filename))

    return filename
-
- # #########################################################
- # cdef class tabix_file_iterator_old:
- # '''iterate over ``infile``.
-
- # This iterator is not safe. If the :meth:`__next__()` method is called
- # after ``infile`` is closed, the result is undefined (see ``fclose()``).
-
- # The iterator might either raise a StopIteration or segfault.
- # '''
-
-
- # def __cinit__(self,
- # infile,
- # Parser parser,
- # int buffer_size = 65536 ):
-
- # cdef int fd = PyObject_AsFileDescriptor( infile )
- # if fd == -1: raise ValueError( "I/O operation on closed file." )
- # self.infile = fdopen( fd, 'r')
-
- # if self.infile == NULL: raise ValueError( "I/O operation on closed file." )
-
- # self.buffer = <char*>malloc( buffer_size )
- # self.size = buffer_size
- # self.parser = parser
-
- # def __iter__(self):
- # return self
-
- # cdef __cnext__(self):
-
- # cdef char * b
- # cdef size_t nbytes
- # b = self.buffer
-
- # while not feof( self.infile ):
- # nbytes = getline( &b, &self.size, self.infile)
-
- # # stop at first error or eof
- # if (nbytes == -1): break
- # # skip comments
- # if (b[0] == '#'): continue
-
- # # skip empty lines
- # if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': continue
-
- # # make sure that entry is complete
- # if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
- # result = b
- # raise ValueError( "incomplete line at %s" % result )
-
- # # make sure that this goes fully through C
- # # otherwise buffer is copied to/from a
- # # Python object causing segfaults as
- # # the wrong memory is freed
- # return self.parser.parse( b, nbytes )
-
- # raise StopIteration
-
- # def __dealloc__(self):
- # free(self.buffer)
-
- # def __next__(self):
- # return self.__cnext__()
-
- #########################################################
- #########################################################
- #########################################################
- ## Iterators for parsing through unindexed files.
- #########################################################
- # cdef buildGzipError(void *gzfp):
- # cdef int errnum = 0
- # cdef char *s = gzerror(gzfp, &errnum)
- # return "error (%d): %s (%d: %s)" % (errno, strerror(errno), errnum, s)
-
-
cdef class tabix_file_iterator:
    '''iterate over a compressed or uncompressed ``infile``.

    Each line (comments and empty lines excepted) is passed through
    *parser*. The file descriptor of *infile* is duplicated, so the
    iterator reads independently of the Python-level file object.
    '''

    def __cinit__(self,
                  infile,
                  Parser parser,
                  int buffer_size=65536):

        if infile.closed:
            raise ValueError("I/O operation on closed file.")

        self.infile = infile

        cdef int fd = PyObject_AsFileDescriptor(infile)
        if fd == -1:
            raise ValueError("I/O operation on closed file.")

        # work on a duplicated descriptor so bgzf_close() below does
        # not close the caller's file
        self.duplicated_fd = dup(fd)

        # From the manual:
        # gzopen can be used to read a file which is not in gzip format;
        # in this case gzread will directly read from the file without decompression.
        # When reading, this will be detected automatically by looking
        # for the magic two-byte gzip header.
        self.fh = bgzf_dopen(self.duplicated_fd, 'r')

        if self.fh == NULL:
            raise IOError('%s' % strerror(errno))

        self.kstream = ks_init(self.fh)

        self.buffer.s = <char*>malloc(buffer_size)
        #if self.buffer == NULL:
        #    raise MemoryError( "tabix_file_iterator: could not allocate %i bytes" % buffer_size)
        #self.size = buffer_size
        self.parser = parser

    def __iter__(self):
        return self

    cdef __cnext__(self):

        cdef char * b
        cdef int dret = 0
        cdef int retval = 0
        while 1:
            with nogil:
                retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)

            # negative return value signals EOF or read error
            if retval < 0:
                break
                #raise IOError('gzip error: %s' % buildGzipError( self.fh ))

            b = self.buffer.s

            # skip comments
            if (b[0] == b'#'):
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # gzgets terminates at \n, no need to test

            # parser creates a copy
            return self.parser.parse(b, self.buffer.l)

        raise StopIteration

    def __dealloc__(self):
        # release line buffer, kstream and the duplicated descriptor
        # (bgzf_close closes self.duplicated_fd)
        free(self.buffer.s)
        ks_destroy(self.kstream)
        bgzf_close(self.fh)

    def __next__(self):
        return self.__cnext__()
-
-
class tabix_generic_iterator:
    '''iterate over ``infile``.

    Permits the use of file-like objects for example from the gzip module.

    Lines are read via ``infile.readline()``, skipping comments and
    empty lines, and passed through *parser*.
    '''
    def __init__(self, infile, parser):

        self.infile = infile
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")
        self.parser = parser

    def __iter__(self):
        return self

    # cython version - required for python 3
    def __next__(self):

        cdef char * b
        cdef char * cpy
        cdef size_t nbytes

        encoding = self.parser.get_encoding()

        # note that GzipFile.close() does not close the file
        # reading is still possible.
        if self.infile.closed:
            raise ValueError("I/O operation on closed file.")

        while 1:

            line = self.infile.readline()
            if not line:
                break

            s = force_bytes(line, encoding)
            b = s
            # NOTE(review): nbytes is the length of the *original* line
            # object, not of the encoded bytes `s`; for encodings where
            # these differ the checks below would be off — verify.
            nbytes = len(line)
            # relies on CPython bytes objects being NUL-terminated
            assert b[nbytes] == b'\0'

            # skip comments
            if b[0] == b'#':
                continue

            # skip empty lines
            if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
                continue

            # make sure that entry is complete
            if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
                raise ValueError("incomplete line at %s" % line)

            # keep a Python reference so the buffer handed to the
            # parser stays alive during the call
            bytes_cpy = <bytes> b
            cpy = <char *> bytes_cpy

            return self.parser(cpy, nbytes)

        raise StopIteration
-
-
def tabix_iterator(infile, parser):
    """Iterate over every entry in *infile*.

    Entries are returned parsed as specified by *parser*. If *parser*
    is None, each entry is returned as an unparsed string; otherwise
    *parser* must be a functor producing parsed data (see for example
    :class:`~pysam.asTuple` and :class:`~pysam.asGTF`).
    """
    # delegate to the generic file-object based implementation
    return tabix_generic_iterator(infile, parser)
-
-
cdef class Tabixfile(TabixFile):
    """Tabixfile is deprecated: use TabixFile instead"""
    # Backwards-compatibility alias kept as a subclass so that
    # isinstance(x, Tabixfile) checks in legacy code continue to work.
    pass
-
-
# Public API of this module; names not listed here are internal.
__all__ = [
    "tabix_index",
    "tabix_compress",
    "TabixFile",
    "Tabixfile",
    "asTuple",
    "asGTF",
    "asGFF3",
    "asVCF",
    "asBed",
    "GZIterator",
    "GZIteratorHead",
    "tabix_iterator",
    "tabix_generic_iterator",
    "tabix_file_iterator",
]
|