|
- # cython: language_level=3
- # cython: embedsignature=True
- # cython: profile=True
- # adds doc-strings for sphinx
-
- ########################################################################
- ########################################################################
- ## Cython cimports
- ########################################################################
-
- from posix.unistd cimport dup
- from libc.errno cimport errno
- from libc.stdint cimport INT32_MAX
- from cpython cimport PyBytes_FromStringAndSize
- from pysam.libchtslib cimport *
- from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
- from pysam.libcutils cimport encode_filename, from_string_and_size, libc_whence_from_io
-
-
- ########################################################################
- ########################################################################
- ## Python imports
- ########################################################################
-
- import os
- import io
- import re
- from warnings import warn
-
-
- ########################################################################
- ########################################################################
- ## Constants
- ########################################################################
-
- __all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
-
- # maximum genomic coordinace
- cdef int MAX_POS = (1 << 31) - 1
-
- cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
- cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
- 'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
- cdef tuple COMPRESSION = ('NONE', 'GZIP', 'BGZF', 'CUSTOM')
-
-
- ########################################################################
- ########################################################################
- ## Verbosity functions
- ########################################################################
-
-
- cpdef set_verbosity(int verbosity):
- """Set htslib's hts_verbose global variable to the specified value."""
- return hts_set_verbosity(verbosity)
-
- cpdef get_verbosity():
- """Return the value of htslib's hts_verbose global variable."""
- return hts_get_verbosity()
-
-
- ########################################################################
- ########################################################################
- ## HFile wrapper class
- ########################################################################
-
- cdef class HFile(object):
- cdef hFILE *fp
- cdef readonly object name, mode
-
- def __init__(self, name, mode='r', closefd=True):
- self._open(name, mode, closefd=True)
-
- def __dealloc__(self):
- self.close()
-
- @property
- def closed(self):
- return self.fp == NULL
-
- cdef _open(self, name, mode, closefd=True):
- self.name = name
- self.mode = mode
-
- mode = force_bytes(mode)
-
- if isinstance(name, int):
- if self.fp != NULL:
- name = dup(name)
- self.fp = hdopen(name, mode)
- else:
- name = encode_filename(name)
- self.fp = hopen(name, mode)
-
- if not self.fp:
- raise IOError(errno, 'failed to open HFile', self.name)
-
- def close(self):
- if self.fp == NULL:
- return
-
- cdef hFILE *fp = self.fp
- self.fp = NULL
-
- if hclose(fp) != 0:
- raise IOError(errno, 'failed to close HFile', self.name)
-
- def fileno(self):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
- if isinstance(self.name, int):
- return self.name
- else:
- raise AttributeError('fileno not available')
-
- def __enter__(self):
- return self
-
- def __exit__(self, type, value, tb):
- self.close()
-
- def __iter__(self):
- return self
-
- def __next__(self):
- line = self.readline()
- if not line:
- raise StopIteration()
- return line
-
- def flush(self):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
- if hflush(self.fp) != 0:
- raise IOError(herrno(self.fp), 'failed to flush HFile', self.name)
-
- def isatty(self):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
- return False
-
- def readable(self):
- return self.fp != NULL and 'r' in self.mode
-
- def read(self, Py_ssize_t size=-1):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- if size == 0:
- return b''
-
- cdef list parts = []
- cdef bytes part
- cdef Py_ssize_t chunk_size, ret, bytes_read = 0
- cdef char *cpart
-
- while size == -1 or bytes_read < size:
- chunk_size = 4096
- if size != -1:
- chunk_size = min(chunk_size, size - bytes_read)
-
- part = PyBytes_FromStringAndSize(NULL, chunk_size)
- cpart = <char *>part
- ret = hread(self.fp, <void *>cpart, chunk_size)
-
- if ret < 0:
- IOError(herrno(self.fp), 'failed to read HFile', self.name)
- elif not ret:
- break
-
- bytes_read += ret
-
- if ret < chunk_size:
- part = cpart[:ret]
-
- parts.append(part)
-
- return b''.join(parts)
-
- def readall(self):
- return self.read()
-
- def readinto(self, buf):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- size = len(buf)
-
- if size == 0:
- return size
-
- mv = memoryview(buf)
- ret = hread(self.fp, <void *>mv, size)
-
- if ret < 0:
- IOError(herrno(self.fp), 'failed to read HFile', self.name)
-
- return ret
-
- def readline(self, Py_ssize_t size=-1):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- if size == 0:
- return b''
-
- cdef list parts = []
- cdef bytes part
- cdef Py_ssize_t chunk_size, ret, bytes_read = 0
- cdef char *cpart
-
- while size == -1 or bytes_read < size:
- chunk_size = 4096
- if size != -1:
- chunk_size = min(chunk_size, size - bytes_read)
-
- part = PyBytes_FromStringAndSize(NULL, chunk_size)
- cpart = <char *>part
-
- # Python bytes objects allocate an extra byte for a null terminator
- ret = hgetln(cpart, chunk_size+1, self.fp)
-
- if ret < 0:
- IOError(herrno(self.fp), 'failed to read HFile', self.name)
- elif not ret:
- break
-
- bytes_read += ret
-
- if ret < chunk_size:
- part = cpart[:ret]
- cpart = <char *>part
-
- parts.append(part)
-
- if cpart[ret-1] == b'\n':
- break
-
- return b''.join(parts)
-
- def readlines(self):
- return list(self)
-
- def seek(self, Py_ssize_t offset, int whence=io.SEEK_SET):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- cdef Py_ssize_t off = hseek(self.fp, offset, libc_whence_from_io(whence))
-
- if off < 0:
- raise IOError(herrno(self.fp), 'seek failed on HFile', self.name)
-
- return off
-
- def tell(self):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- ret = htell(self.fp)
-
- if ret < 0:
- raise IOError(herrno(self.fp), 'tell failed on HFile', self.name)
-
- return ret
-
- def seekable(self):
- return self.fp != NULL
-
- def truncate(self, size=None):
- raise NotImplementedError()
-
- def writable(self):
- return self.fp != NULL and 'w' in self.mode
-
- def write(self, bytes b):
- if self.fp == NULL:
- raise IOError('operation on closed HFile')
-
- got = hwrite(self.fp, <void *>b, len(b))
-
- if got < 0:
- raise IOError(herrno(self.fp), 'write failed on HFile', self.name)
-
- return got
-
- def writelines(self, lines):
- for line in lines:
- self.write(line)
-
-
- ########################################################################
- ########################################################################
- ## Helpers for backward compatibility to hide the difference between
- ## boolean properties and methods
- ########################################################################
-
- class CallableValue(object):
- def __init__(self, value):
- self.value = value
- def __call__(self):
- return self.value
- def __bool__(self):
- return self.value
- def __nonzero__(self):
- return self.value
- def __eq__(self, other):
- return self.value == other
- def __ne__(self, other):
- return self.value != other
-
-
- CTrue = CallableValue(True)
- CFalse = CallableValue(False)
-
-
- ########################################################################
- ########################################################################
- ## HTSFile wrapper class (base class for AlignmentFile and VariantFile)
- ########################################################################
-
- cdef class HTSFile(object):
- """
- Base class for HTS file types
- """
-
- def __cinit__(self, *args, **kwargs):
- self.htsfile = NULL
- self.threads = 1
- self.duplicate_filehandle = True
-
- def close(self):
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
-
- def __dealloc__(self):
- if self.htsfile:
- hts_close(self.htsfile)
- self.htsfile = NULL
-
- def flush(self):
- """Flush any buffered data to the underlying output stream."""
- if self.htsfile:
- if hts_flush(self.htsfile) < 0:
- raise OSError(errno, f'Flushing {type(self).__name__} failed', force_str(self.filename))
-
- def check_truncation(self, ignore_truncation=False):
- """Check if file is truncated."""
- if not self.htsfile:
- return
-
- if self.htsfile.format.compression != bgzf:
- return
-
- cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
- if not bgzfp:
- return
-
- cdef int ret = bgzf_check_EOF(bgzfp)
- if ret < 0:
- raise IOError(errno, 'error checking for EOF marker')
- elif ret == 0:
- msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
- if ignore_truncation:
- warn(msg)
- else:
- raise IOError(msg)
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_value, traceback):
- self.close()
- return False
-
- @property
- def category(self):
- """General file format category. One of UNKNOWN, ALIGNMENTS,
- VARIANTS, INDEX, REGIONS"""
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMAT_CATEGORIES[self.htsfile.format.category]
-
- @property
- def format(self):
- """File format.
-
- One of UNKNOWN, BINARY_FORMAT, TEXT_FORMAT, SAM, BAM,
- BAI, CRAM, CRAI, VCF, BCF, CSI, GZI, TBI, BED.
- """
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return FORMATS[self.htsfile.format.format]
-
- @property
- def version(self):
- """Tuple of file format version numbers (major, minor)"""
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return self.htsfile.format.version.major, self.htsfile.format.version.minor
-
- @property
- def compression(self):
- """File compression.
-
- One of NONE, GZIP, BGZF, CUSTOM."""
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- return COMPRESSION[self.htsfile.format.compression]
-
- @property
- def description(self):
- """Vaguely human readable description of the file format"""
- if not self.htsfile:
- raise ValueError('metadata not available on closed file')
- cdef char *desc = hts_format_description(&self.htsfile.format)
- try:
- return charptr_to_str(desc)
- finally:
- free(desc)
-
- @property
- def is_open(self):
- """return True if HTSFile is open and in a valid state."""
- return CTrue if self.htsfile != NULL else CFalse
-
- @property
- def is_closed(self):
- """return True if HTSFile is closed."""
- return self.htsfile == NULL
-
- @property
- def closed(self):
- """return True if HTSFile is closed."""
- return self.htsfile == NULL
-
- @property
- def is_write(self):
- """return True if HTSFile is open for writing"""
- return self.htsfile != NULL and self.htsfile.is_write != 0
-
- @property
- def is_read(self):
- """return True if HTSFile is open for reading"""
- return self.htsfile != NULL and self.htsfile.is_write == 0
-
- @property
- def is_sam(self):
- """return True if HTSFile is reading or writing a SAM alignment file"""
- return self.htsfile != NULL and self.htsfile.format.format == sam
-
- @property
- def is_bam(self):
- """return True if HTSFile is reading or writing a BAM alignment file"""
- return self.htsfile != NULL and self.htsfile.format.format == bam
-
- @property
- def is_cram(self):
- """return True if HTSFile is reading or writing a BAM alignment file"""
- return self.htsfile != NULL and self.htsfile.format.format == cram
-
- @property
- def is_vcf(self):
- """return True if HTSFile is reading or writing a VCF variant file"""
- return self.htsfile != NULL and self.htsfile.format.format == vcf
-
- @property
- def is_bcf(self):
- """return True if HTSFile is reading or writing a BCF variant file"""
- return self.htsfile != NULL and self.htsfile.format.format == bcf
-
- def reset(self):
- """reset file position to beginning of file just after the header.
-
- Returns
- -------
-
- The file position after moving the file pointer. : pointer
-
- """
- return self.seek(self.start_offset)
-
- def seek(self, uint64_t offset, int whence=io.SEEK_SET):
- """move file pointer to position *offset*, see :meth:`pysam.HTSFile.tell`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise IOError('seek not available in streams')
-
- whence = libc_whence_from_io(whence)
-
- cdef int64_t ret
- if self.htsfile.format.compression == bgzf:
- with nogil:
- ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, whence)
- elif self.htsfile.format.compression == no_compression:
- ret = 0 if (hseek(self.htsfile.fp.hfile, offset, whence) >= 0) else -1
- else:
- raise NotImplementedError("seek not implemented in files compressed by method {}".format(
- self.htsfile.format.compression))
- return ret
-
- def tell(self):
- """return current file position, see :meth:`pysam.HTSFile.seek`."""
- if not self.is_open:
- raise ValueError('I/O operation on closed file')
- if self.is_stream:
- raise IOError('tell not available in streams')
-
- cdef int64_t ret
- if self.htsfile.format.compression == bgzf:
- with nogil:
- ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- elif self.htsfile.format.compression == no_compression:
- ret = htell(self.htsfile.fp.hfile)
- elif self.htsfile.format.format == cram:
- with nogil:
- ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
- else:
- raise NotImplementedError("seek not implemented in files compressed by method {}".format(
- self.htsfile.format.compression))
-
- return ret
-
- cdef htsFile *_open_htsfile(self) except? NULL:
- cdef char *cfilename
- cdef char *cmode = self.mode
- cdef int fd, dup_fd, threads
-
- threads = self.threads - 1
- if isinstance(self.filename, bytes):
- cfilename = self.filename
- with nogil:
- htsfile = hts_open(cfilename, cmode)
- if htsfile != NULL:
- hts_set_threads(htsfile, threads)
- return htsfile
- else:
- if isinstance(self.filename, int):
- fd = self.filename
- else:
- fd = self.filename.fileno()
-
- if self.duplicate_filehandle:
- dup_fd = dup(fd)
- else:
- dup_fd = fd
-
- # Replicate mode normalization done in hts_open_format
- smode = self.mode.replace(b'b', b'').replace(b'c', b'')
- if b'b' in self.mode:
- smode += b'b'
- elif b'c' in self.mode:
- smode += b'c'
- cmode = smode
-
- hfile = hdopen(dup_fd, cmode)
- if hfile == NULL:
- raise IOError('Cannot create hfile')
-
- try:
- # filename.name can be an int
- filename = str(self.filename.name)
- except AttributeError:
- filename = '<fd:{}>'.format(fd)
-
- filename = encode_filename(filename)
- cfilename = filename
- with nogil:
- htsfile = hts_hopen(hfile, cfilename, cmode)
- if htsfile != NULL:
- hts_set_threads(htsfile, threads)
- return htsfile
-
- def add_hts_options(self, format_options=None):
- """Given a list of key=value format option strings, add them to an open htsFile
- """
- cdef int rval
- cdef hts_opt *opts = NULL
-
- if format_options:
- for format_option in format_options:
- rval = hts_opt_add(&opts, format_option)
- if rval != 0:
- if opts != NULL:
- hts_opt_free(opts)
- raise RuntimeError('Invalid format option ({}) specified'.format(format_option))
- if opts != NULL:
- rval = hts_opt_apply(self.htsfile, opts)
- if rval != 0:
- hts_opt_free(opts)
- raise RuntimeError('An error occurred while applying the requested format options')
- hts_opt_free(opts)
-
- def parse_region(self, contig=None, start=None, stop=None,
- region=None, tid=None,
- reference=None, end=None):
- """parse alternative ways to specify a genomic region. A region can
- either be specified by :term:`contig`, `start` and
- `stop`. `start` and `stop` denote 0-based, half-open
- intervals. :term:`reference` and `end` are also accepted for
- backward compatibility as synonyms for :term:`contig` and
- `stop`, respectively.
-
- Alternatively, a samtools :term:`region` string can be
- supplied.
-
- If any of the coordinates are missing they will be replaced by
- the minimum (`start`) or maximum (`stop`) coordinate.
-
- Note that region strings are 1-based inclusive, while `start`
- and `stop` denote an interval in 0-based, half-open
- coordinates (like BED files and Python slices).
-
- If `contig` or `region` or are ``*``, unmapped reads at the end
- of a BAM file will be returned. Setting either to ``.`` will
- iterate from the beginning of the file.
-
- Returns
- -------
-
- tuple :
- a tuple of `flag`, :term:`tid`, `start` and
- `stop`. The flag indicates whether no coordinates were
- supplied and the genomic region is the complete genomic space.
-
- Raises
- ------
-
- ValueError
- for invalid or out of bounds regions.
-
- """
- cdef int rtid
- cdef int32_t rstart
- cdef int32_t rstop
-
- if reference is not None:
- if contig is not None:
- raise ValueError('contig and reference should not both be specified')
- contig = reference
-
- if end is not None:
- if stop is not None:
- raise ValueError('stop and end should not both be specified')
- stop = end
-
- if contig is None and tid is None and region is None:
- return 0, 0, 0, MAX_POS
-
- rtid = -1
- rstart = 0
- rstop = MAX_POS
- if start is not None:
- try:
- rstart = start
- except OverflowError:
- raise ValueError('start out of range (%i)' % start)
-
- if stop is not None:
- try:
- rstop = stop
- except OverflowError:
- raise ValueError('stop out of range (%i)' % stop)
-
- if region:
- region = force_str(region)
- if ":" in region:
- contig, coord = region.split(":")
- parts = coord.split("-")
- rstart = int(parts[0]) - 1
- if len(parts) >= 1:
- rstop = int(parts[1])
- else:
- contig = region
-
- if tid is not None:
- if not self.is_valid_tid(tid):
- raise IndexError('invalid tid')
- rtid = tid
- else:
- if contig == "*":
- rtid = HTS_IDX_NOCOOR
- elif contig == ".":
- rtid = HTS_IDX_START
- else:
- rtid = self.get_tid(contig)
- if rtid < 0:
- raise ValueError('invalid contig `%s`' % contig)
-
- if rstart > rstop:
- raise ValueError('invalid coordinates: start (%i) > stop (%i)' % (rstart, rstop))
- if not 0 <= rstart < MAX_POS:
- raise ValueError('start out of range (%i)' % rstart)
- if not 0 <= rstop <= MAX_POS:
- raise ValueError('stop out of range (%i)' % rstop)
-
- return 1, rtid, rstart, rstop
-
- def is_valid_tid(self, tid):
- """
- return True if the numerical :term:`tid` is valid; False otherwise.
-
- returns -1 if contig is not known.
- """
- raise NotImplementedError()
-
- def is_valid_reference_name(self, contig):
- """
- return True if the contig name :term:`contig` is valid; False otherwise.
- """
- return self.get_tid(contig) != -1
-
- def get_tid(self, contig):
- """
- return the numerical :term:`tid` corresponding to
- :term:`contig`
-
- returns -1 if contig is not known.
- """
- raise NotImplementedError()
-
- def get_reference_name(self, tid):
- """
- return :term:`contig` name corresponding to numerical :term:`tid`
- """
- raise NotImplementedError()
|