|
- # cython: embedsignature=True
- # cython: profile=True
- ########################################################
- ########################################################
- # Cython wrapper for SAM/BAM/CRAM files based on htslib
- ########################################################
- # The principal classes defined in this module are:
- #
- # class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
- #
- # class AlignmentHeader manage SAM/BAM/CRAM header data
- #
- # class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
- # the original sort order intact
- #
- # Additionally this module defines numerous additional classes that
- # are part of the internal API. These are:
- #
- # Various iterator classes to iterate over alignments in sequential
- # (IteratorRow) or in a stacked fashion (IteratorColumn):
- #
- # class IteratorRow
- # class IteratorRowRegion
- # class IteratorRowHead
- # class IteratorRowAll
- # class IteratorRowAllRefs
- # class IteratorRowSelection
- # class IteratorColumn
- # class IteratorColumnRegion
- # class IteratorColumnAll
- # class IteratorColumnAllRefs
- #
- ########################################################
- #
- # The MIT License
- #
- # Copyright (c) 2015 Andreas Heger
- #
- # Permission is hereby granted, free of charge, to any person obtaining a
- # copy of this software and associated documentation files (the "Software"),
- # to deal in the Software without restriction, including without limitation
- # the rights to use, copy, modify, merge, publish, distribute, sublicense,
- # and/or sell copies of the Software, and to permit persons to whom the
- # Software is furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in
- # all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- # DEALINGS IN THE SOFTWARE.
- #
- ########################################################
- import os
- import collections
- try:
- from collections.abc import Sequence, Mapping # noqa
- except ImportError:
- from collections import Sequence, Mapping # noqa
- import re
- import warnings
- import array
- from libc.errno cimport errno, EPIPE
- from libc.string cimport strcmp, strpbrk
- from libc.stdint cimport INT32_MAX
-
- from cpython cimport array as c_array
-
- from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
- from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size
- from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
- from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
- HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
-
- from io import StringIO
-
- cimport cython
-
-
# names exported by a star-import of this module
__all__ = ["AlignmentFile", "AlignmentHeader",
           "IteratorRow", "IteratorColumn",
           "IndexedReads"]
-
# lightweight record with the fields contig, mapped, unmapped and total
# (presumably per-contig read counts taken from the index -- confirm at
# the point of use)
IndexStats = collections.namedtuple(
    "IndexStats",
    ["contig", "mapped", "unmapped", "total"])
-
########################################################
## global variables
# maximum genomic coordinate: (1 << 31) - 1
# NOTE(review): the original remark here read "for some reason, using
# 'int' causes overflow", which contradicts the ``cdef int`` declaration
# below -- confirm which C type is actually intended.
cdef int MAX_POS = (1 << 31) - 1
-
# container type expected for each SAM header record: @HD is a mapping,
# all other record types are sequences
VALID_HEADER_TYPES = {
    "HD": Mapping,
    "SQ": Sequence,
    "RG": Sequence,
    "PG": Sequence,
    "CO": Sequence,
}

# order of records within SAM headers
VALID_HEADERS = ("HD", "SQ", "RG", "PG", "CO")

# default type conversions within SAM header records, keyed by record
# type and then by field tag
KNOWN_HEADER_FIELDS = {
    "HD": {
        "VN": str, "SO": str, "GO": str, "SS": str,
    },
    "SQ": {
        "SN": str, "LN": int, "AS": str, "M5": str, "SP": str,
        "UR": str, "AH": str, "TP": str, "DS": str, "AN": str,
    },
    "RG": {
        "ID": str, "CN": str, "DS": str, "DT": str, "FO": str,
        "KS": str, "LB": str, "PG": str, "PI": str, "PL": str,
        "PM": str, "PU": str, "SM": str, "BC": str,
    },
    "PG": {
        "ID": str, "PN": str, "CL": str, "PP": str, "DS": str,
        "VN": str,
    },
}

# output order of fields within records. CL has to stay last: when a
# header is parsed, CL swallows everything that follows it on the line.
VALID_HEADER_ORDER = {
    "HD": ("VN", "SO", "SS", "GO"),
    "SQ": ("SN", "LN", "AS", "M5", "UR",
           "SP", "AH", "TP", "DS", "AN"),
    "RG": ("ID", "CN", "SM", "LB", "PU", "PI", "DT",
           "DS", "PL", "FO", "KS", "PG", "PM", "BC"),
    "PG": ("PN", "ID", "VN", "PP", "DS", "CL"),
}
-
-
def build_header_line(fields, record):
    '''Return one tab-separated SAM header line for `record`.

    `record` is the record type without the leading ``@``. For ``CO``
    records `fields` is appended as-is; for every other record it is
    looked up by tag and rendered as ``tag:value`` pairs.
    '''

    # TODO: add checking for field and sort order
    parts = ["@%s" % record]

    if record == "CO":
        # comment
        parts.append(fields)
    elif record.islower():
        # user tags: every field, in alphabetical order
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in sorted(fields))
    else:
        # defined tags: first the fields of the specification, in the
        # order given by VALID_HEADER_ORDER ...
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in VALID_HEADER_ORDER[record]
                     if tag in fields)
        # ... then user fields, i.e. any tag that is not upper case
        parts.extend("%s:%s" % (tag, str(fields[tag]))
                     for tag in fields
                     if not tag.isupper())

    return "\t".join(parts)
-
-
cdef AlignmentHeader makeAlignmentHeader(bam_hdr_t *hdr):
    # C-level constructor: wrap the existing header struct `hdr` in an
    # AlignmentHeader. The pointer is stored as-is, not duplicated.
    cdef AlignmentHeader wrapper

    if hdr == NULL:
        raise ValueError('cannot create AlignmentHeader, received NULL pointer')

    # Instantiate through __new__ rather than by calling the class.
    # NOTE(review): per Cython semantics this is expected to run
    # __cinit__ but not __init__ (which would allocate a second header
    # via bam_hdr_init) -- confirm against the Cython documentation.
    wrapper = AlignmentHeader.__new__(AlignmentHeader)
    wrapper.ptr = hdr
    return wrapper
-
def read_failure_reason(code):
    """Return a human-readable message for the read status `code`.

    A `code` of -2 is reported as a truncated file; any other value is
    embedded verbatim in a generic error message.
    """
    if code != -2:
        return "error {} while reading file".format(code)
    return 'truncated file'
-
-
# the following should be a class-method of AlignmentHeader (the original
# remark named VariantHeader, presumably a copy-paste slip), but cdef
# @classmethods are not implemented in cython.
# NOTE(review): as visible here the body consists of the docstring only,
# so `dest` is never filled in -- confirm whether the implementation was
# dropped or whether this helper is obsolete.
cdef int fill_AlignmentHeader_from_list(bam_hdr_t *dest,
                                        reference_names,
                                        reference_lengths,
                                        add_sq_text=True,
                                        text=None) except -1:
    """build header from list of reference names and lengths.
    """
-
cdef class AlignmentHeader(object):
    """header information for a :class:`AlignmentFile` object

    Calling ``AlignmentHeader()`` without arguments allocates an empty
    header. Populated headers are created through the alternative
    constructors :meth:`from_dict`, :meth:`from_text` and
    :meth:`from_references`, whose arguments are:

    header_dict : dict
        build header from a multi-level dictionary. The
        first level are the four types ('HD', 'SQ', ...). The second
        level are a list of lines, with each line being a list of
        tag-value pairs. The header is constructed first from all the
        defined fields, followed by user tags in alphabetical
        order.

    text : string
        use the string provided as the header

    reference_names : list
        see reference_lengths

    reference_lengths : list
        build header from list of chromosome names and lengths. By
        default, one '@SQ' line with 'SN' and 'LN' tags per reference
        will be added to the header text. This can be turned off by
        unsetting the flag `add_sq_text`.

    add_sq_text : bool
        if True (the default) and no `text` is given, generate the
        '@SQ' header lines from the reference names and lengths. Set
        to False to leave the header text empty.

    The object offers read-only, dictionary-like access to the parsed
    header (see :meth:`to_dict`).
    """

    # See makeAlignmentHeader for C constructor
    def __cinit__(self):
        self.ptr = NULL

    # Python constructor
    def __init__(self):
        self.ptr = bam_hdr_init()
        if self.ptr is NULL:
            raise MemoryError("could not create header")

    @classmethod
    def _from_text_and_lengths(cls, text, reference_names, reference_lengths):
        """build a header from header `text` and the reference lists.

        `text`, unless None, is copied into the text section of the
        header. If both `reference_names` and `reference_lengths` are
        non-empty, they are copied into the target name and target
        length arrays.

        Raises
        ------
        MemoryError
            if one of the C allocations fails.
        """
        cdef AlignmentHeader header = AlignmentHeader()
        cdef char *ctext
        cdef int l_text
        cdef int n, x

        if text is not None:
            btext = force_bytes(text)
            ctext = btext
            l_text = len(btext)
            header.ptr.text = <char*>calloc(l_text + 1, sizeof(char))
            if header.ptr.text == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    (l_text + 1) * sizeof(char)))
            header.ptr.l_text = l_text
            # l_text + 1: copy the terminating NUL of the bytes object too
            memcpy(header.ptr.text, ctext, l_text + 1)

        if reference_names and reference_lengths:
            reference_names = [force_bytes(ref) for ref in reference_names]

            header.ptr.n_targets = len(reference_names)

            # one slot more than there are references
            n = len(reference_names) + 1
            header.ptr.target_name = <char**>calloc(n, sizeof(char*))
            if header.ptr.target_name == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(char *)))

            header.ptr.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
            if header.ptr.target_len == NULL:
                raise MemoryError("could not allocate {} bytes".format(
                    n * sizeof(uint32_t)))

            for x in range(header.ptr.n_targets):
                header.ptr.target_len[x] = reference_lengths[x]
                name = reference_names[x]
                header.ptr.target_name[x] = <char*>calloc(len(name) + 1, sizeof(char))
                if header.ptr.target_name[x] == NULL:
                    raise MemoryError("could not allocate {} bytes".format(
                        (len(name) + 1) * sizeof(char)))
                # the buffer is zero-initialised by calloc and one byte
                # longer than the name, so the copy stays NUL-terminated
                strncpy(header.ptr.target_name[x], name, len(name))

        return header

    @classmethod
    def from_text(cls, text):
        """build a header from the SAM header `text`.

        Reference names and lengths are taken from the ``SN`` and
        ``LN`` fields of the ``@SQ`` lines within `text`.

        Raises
        ------
        KeyError
            if an ``@SQ`` line lacks the ``SN`` or ``LN`` field.
        ValueError
            if an ``LN`` field is not an integer.
        """
        reference_names, reference_lengths = [], []
        for line in text.splitlines():
            if line.startswith("@SQ"):
                fields = dict([x.split(":", 1) for x in line.split("\t")[1:]])
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(int(fields["LN"]))
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))
                except ValueError:
                    raise ValueError("wrong sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_dict(cls, header_dict):
        """build a header from the multi-level dictionary `header_dict`.

        Records listed in VALID_HEADERS are written first, in that
        order, followed by all remaining records sorted by record name.

        Raises
        ------
        ValueError
            if the value of a defined record is not of the container
            type given in VALID_HEADER_TYPES.
        KeyError
            if an ``SQ`` entry lacks the ``SN`` or ``LN`` field.
        """
        cdef list lines = []
        # first: defined tags
        for record in VALID_HEADERS:
            if record in header_dict:
                data = header_dict[record]
                if not isinstance(data, VALID_HEADER_TYPES[record]):
                    raise ValueError(
                        "invalid type for record {}: {}, expected {}".format(
                            record, type(data), VALID_HEADER_TYPES[record]))
                if isinstance(data, Mapping):
                    lines.append(build_header_line(data, record))
                else:
                    for fields in data:
                        lines.append(build_header_line(fields, record))

        # then: user tags (lower case), sorted alphabetically
        for record, data in sorted(header_dict.items()):
            if record in VALID_HEADERS:
                continue
            if isinstance(data, Mapping):
                lines.append(build_header_line(data, record))
            else:
                for fields in data:
                    lines.append(build_header_line(fields, record))

        text = "\n".join(lines) + "\n"

        reference_names, reference_lengths = [], []
        if "SQ" in header_dict:
            for fields in header_dict["SQ"]:
                try:
                    reference_names.append(fields["SN"])
                    reference_lengths.append(fields["LN"])
                except KeyError:
                    raise KeyError("incomplete sequence information in '%s'" % str(fields))

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    @classmethod
    def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=True):
        """build a header from lists of reference names and lengths.

        If `text` is None and `add_sq_text` is set, the header text is
        generated as one ``@SQ`` line per reference.

        Raises
        ------
        ValueError
            if the two lists differ in length.
        """
        if len(reference_names) != len(reference_lengths):
            raise ValueError("number of reference names and lengths do not match")

        # optionally, if there is no text, add a SAM compatible header to output file.
        if text is None and add_sq_text:
            text = "".join(["@SQ\tSN:{}\tLN:{}\n".format(x, y) for x, y in zip(
                reference_names, reference_lengths)])

        return cls._from_text_and_lengths(text, reference_names, reference_lengths)

    def __dealloc__(self):
        bam_hdr_destroy(self.ptr)
        self.ptr = NULL

    def __bool__(self):
        return self.ptr != NULL

    def copy(self):
        """return a new :class:`AlignmentHeader` wrapping a duplicate
        (``bam_hdr_dup``) of the underlying header."""
        return makeAlignmentHeader(bam_hdr_dup(self.ptr))

    property nreferences:
        """int with the number of :term:`reference` sequences in the file.

        This is a read-only attribute."""
        def __get__(self):
            return self.ptr.n_targets

    property references:
        """tuple with the names of :term:`reference` sequences. This is a
        read-only attribute"""
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(charptr_to_str(self.ptr.target_name[x]))
            return tuple(t)

    property lengths:
        """tuple of the lengths of the :term:`reference` sequences. This is a
        read-only attribute. The lengths are in the same order as
        :attr:`pysam.AlignmentFile.references`
        """
        def __get__(self):
            t = []
            cdef int x
            for x in range(self.ptr.n_targets):
                t.append(self.ptr.target_len[x])
            return tuple(t)

    def _build_sequence_section(self):
        """return sequence section of header.

        The sequence section is built from the list of reference names and
        lengths stored in the BAM-file and not from any @SQ entries that
        are part of the header's text section.
        """

        cdef int x
        text = []
        for x in range(self.ptr.n_targets):
            text.append("@SQ\tSN:{}\tLN:{}\n".format(
                force_str(self.ptr.target_name[x]),
                self.ptr.target_len[x]))
        return "".join(text)

    def to_dict(self):
        """return two-level dictionary with header information from the file.

        The first level contains the record (``HD``, ``SQ``, etc) and
        the second level contains the fields (``VN``, ``LN``, etc).

        The parser is validating and will raise an AssertionError if
        it encounters a header line that does not start with ``@`` or
        whose record type is not part of the SAM specification. A
        ValueError is raised for fields without a ``:`` separator and
        for repeated ``HD`` lines. Use the
        :attr:`pysam.AlignmentFile.text` attribute to get the unparsed
        header.

        The parsing follows the SAM format specification with the
        exception of the ``CL`` field. This option will consume the
        rest of a header line irrespective of any additional fields.
        This behaviour has been added to accommodate command line
        options that contain characters that are not valid field
        separators.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.
        """
        result = collections.OrderedDict()

        # convert to python string
        t = self.__str__()
        for line in t.split("\n"):
            line = line.strip(' \0')
            if not line:
                continue
            assert line.startswith("@"), \
                "header line without '@': '%s'" % line
            fields = line[1:].split("\t")
            record = fields[0]
            assert record in VALID_HEADER_TYPES, \
                "header line with invalid type '%s': '%s'" % (record, line)

            # treat comments
            if record == "CO":
                result.setdefault(record, []).append("\t".join(fields[1:]))
                continue

            x = {}
            for idx, field in enumerate(fields[1:]):
                if ":" not in field:
                    raise ValueError("malformatted header: no ':' in field" )
                key, value = field.split(":", 1)
                # interpret type of known header record tags, default to
                # str. The lookup uses .get() in both branches so that a
                # CL tag on a record that does not define it (anything
                # but PG) no longer fails with a KeyError.
                converter = KNOWN_HEADER_FIELDS[record].get(key, str)
                if key == "CL":
                    # special treatment for command line
                    # statements (CL). These might contain
                    # characters that are non-conformant with
                    # the valid field separators in the SAM
                    # header. Thus, in contravention to the
                    # SAM API, consume the rest of the line.
                    # idx counts from fields[1], so fields[idx + 1] is
                    # the current CL field itself.
                    _, value = "\t".join(fields[idx + 1:]).split(":", 1)
                    x[key] = converter(value)
                    break

                x[key] = converter(value)

            if VALID_HEADER_TYPES[record] == Mapping:
                if record in result:
                    raise ValueError(
                        "multiple '%s' lines are not permitted" % record)

                result[record] = x
            elif VALID_HEADER_TYPES[record] == Sequence:
                result.setdefault(record, []).append(x)

        # if there are no SQ lines in the header, add the
        # reference names from the information in the bam
        # file.
        #
        # Background: c-samtools keeps the textual part of the
        # header separate from the list of reference names and
        # lengths. Thus, if a header contains only SQ lines,
        # the SQ information is not part of the textual header
        # and thus are missing from the output. See issue 84.
        if "SQ" not in result:
            sq = []
            for ref, length in zip(self.references, self.lengths):
                sq.append({'LN': length, 'SN': ref })
            result["SQ"] = sq

        return result

    def as_dict(self):
        """deprecated, use :meth:`to_dict()` instead"""
        return self.to_dict()

    def get_reference_name(self, tid):
        """return the :term:`reference` name corresponding to the
        numerical :term:`tid`.

        Returns None for the unmapped tid code (-1).

        Raises
        ------
        ValueError
            if `tid` is neither -1 nor a valid index into the list of
            references.
        """
        if tid == -1:
            return None
        if not 0 <= tid < self.ptr.n_targets:
            raise ValueError("reference_id %i out of range 0<=tid<%i" %
                             (tid, self.ptr.n_targets))
        return charptr_to_str(self.ptr.target_name[tid])

    def get_reference_length(self, reference):
        """return the length of the :term:`reference` sequence named
        `reference`.

        Raises
        ------
        KeyError
            if `reference` is not known.
        """
        cdef int tid = self.get_tid(reference)
        if tid < 0:
            raise KeyError("unknown reference {}".format(reference))
        else:
            return self.ptr.target_len[tid]

    def is_valid_tid(self, int tid):
        """
        return True if the numerical :term:`tid` is valid; False otherwise.

        Note that the unmapped tid code (-1) counts as an invalid.
        """
        return 0 <= tid < self.ptr.n_targets

    def get_tid(self, reference):
        """
        return the numerical :term:`tid` corresponding to
        :term:`reference`

        returns -1 if reference is not known.

        Raises
        ------
        ValueError
            if ``bam_name2id`` reports a value below -1.
        """
        reference = force_bytes(reference)
        tid = bam_name2id(self.ptr, reference)
        if tid < -1:
            raise ValueError('could not parse header')
        return tid

    def __str__(self):
        '''string with the full contents of the :term:`sam file` header as a
        string.

        If no @SQ entries are within the text section of the header,
        this will be automatically added from the reference names and
        lengths stored in the binary part of the header.

        See :attr:`pysam.AlignmentFile.header.to_dict()` to get a parsed
        representation of the header.
        '''
        text = from_string_and_size(self.ptr.text, self.ptr.l_text)
        if "@SQ" not in text:
            text += "\n" + self._build_sequence_section()
        return text

    # dictionary access methods, for backwards compatibility.
    # Each of them parses the header afresh through to_dict().
    def __setitem__(self, key, value):
        raise TypeError("AlignmentHeader does not support item assignment (use header.to_dict())")

    def __getitem__(self, key):
        return self.to_dict().__getitem__(key)

    def items(self):
        return self.to_dict().items()

    # PY2 compatibility
    def iteritems(self):
        return self.to_dict().items()

    def keys(self):
        return self.to_dict().keys()

    def values(self):
        return self.to_dict().values()

    def get(self, *args):
        return self.to_dict().get(*args)

    def __len__(self):
        return self.to_dict().__len__()

    def __contains__(self, key):
        return self.to_dict().__contains__(key)
-
-
- cdef class AlignmentFile(HTSFile):
- """AlignmentFile(filepath_or_object, mode=None, template=None,
- reference_names=None, reference_lengths=None, text=None,
- header=None, add_sq_text=True, check_header=True, check_sq=True,
- reference_filename=None, filename=None, index_filename=None,
- filepath_index=None, require_index=False, duplicate_filehandle=True,
- ignore_truncation=False, threads=1)
-
- A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
-
- If `filepath_or_object` is a string, the file is automatically
- opened. If `filepath_or_object` is a python File object, the
- already opened file will be used.
-
- If the file is opened for reading and an index exists (if file is BAM, a
- .bai file or if CRAM a .crai file), it will be opened automatically.
- `index_filename` may be specified explicitly if the index is not named
- in the standard manner, not located in the same directory as the
- BAM/CRAM file, or is remote. Without an index, random access via
- :meth:`~pysam.AlignmentFile.fetch` and :meth:`~pysam.AlignmentFile.pileup`
- is disabled.
-
- For writing, the header of a :term:`SAM` file/:term:`BAM` file can
- be constituted from several sources (see also the samtools format
- specification):
-
- 1. If `template` is given, the header is copied from another
- `AlignmentFile` (`template` must be a
- :class:`~pysam.AlignmentFile`).
-
- 2. If `header` is given, the header is built from a
- multi-level dictionary.
-
- 3. If `text` is given, new header text is copied from raw
- text.
-
- 4. The names (`reference_names`) and lengths
- (`reference_lengths`) are supplied directly as lists.
-
- When reading or writing a CRAM file, the filename of a FASTA-formatted
- reference can be specified with `reference_filename`.
-
- By default, if a file is opened in mode 'r', it is checked
- for a valid header (`check_header` = True) and a definition of
- chromosome names (`check_sq` = True).
-
- Parameters
- ----------
- mode : string
- `mode` should be ``r`` for reading or ``w`` for writing. The
- default is text mode (:term:`SAM`). For binary (:term:`BAM`)
- I/O you should append ``b`` for compressed or ``u`` for
- uncompressed :term:`BAM` output. Use ``h`` to output header
- information in text (:term:`TAM`) mode. Use ``c`` for
- :term:`CRAM` formatted files.
-
- If ``b`` is present, it must immediately follow ``r`` or
- ``w``. Valid modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``,
- ``wbu``, ``wb0``, ``rc`` and ``wc``. For instance, to open a
- :term:`BAM` formatted file for reading, type::
-
- f = pysam.AlignmentFile('ex1.bam','rb')
-
- If mode is not specified, the method will try to auto-detect
- in the order 'rb', 'r', thus both the following should work::
-
- f1 = pysam.AlignmentFile('ex1.bam')
- f2 = pysam.AlignmentFile('ex1.sam')
-
- template : AlignmentFile
- when writing, copy header from file `template`.
-
- header : dict or AlignmentHeader
- when writing, build header from a multi-level dictionary. The
- first level are the four types ('HD', 'SQ', ...). The second
- level are a list of lines, with each line being a list of
- tag-value pairs. The header is constructed first from all the
- defined fields, followed by user tags in alphabetical
- order. Alternatively, an :class:`~pysam.AlignmentHeader`
- object can be passed directly.
-
- text : string
- when writing, use the string provided as the header
-
- reference_names : list
- see reference_lengths
-
- reference_lengths : list
- when writing or opening a SAM file without header build header
- from list of chromosome names and lengths. By default, 'SQ'
- and 'LN' tags will be added to the header text. This option
- can be changed by unsetting the flag `add_sq_text`.
-
- add_sq_text : bool
- if True (the default), add '@SQ' lines with 'SN' and 'LN' tags to the
- header text when no `text` is given. Set to False to permit
- construction of :term:`SAM` formatted files without a header.
-
- add_sam_header : bool
- when outputting SAM the default is to output a header. This is
- equivalent to opening the file in 'wh' mode. If this option is
- set to False, no header will be output. To read such a file,
- set `check_header=False`.
-
- check_header : bool
- obsolete: when reading a SAM file, check if header is present
- (default=True)
-
- check_sq : bool
- when reading, check if SQ entries are present in header
- (default=True)
-
- reference_filename : string
- Path to a FASTA-formatted reference file. Valid only for CRAM files.
- When reading a CRAM file, this overrides both ``$REF_PATH`` and the URL
- specified in the header (``UR`` tag), which are normally used to find
- the reference.
-
- index_filename : string
- Explicit path to the index file. Only needed if the index is not
- named in the standard manner, not located in the same directory as
- the BAM/CRAM file, or is remote. An IOError is raised if the index
- cannot be found or is invalid.
-
- filepath_index : string
- Alias for `index_filename`.
-
- require_index : bool
- When reading, require that an index file is present and is valid or
- raise an IOError. (default=False)
-
- filename : string
- Alternative to filepath_or_object. Filename of the file
- to be opened.
-
- duplicate_filehandle: bool
- By default, file handles passed either directly or through
- File-like objects will be duplicated before passing them to
- htslib. The duplication prevents issues where the same stream
- will be closed by htslib and through destruction of the
- high-level python object. Set to False to turn off
- duplication.
-
- ignore_truncation: bool
- Issue a warning, instead of raising an error if the current file
- appears to be truncated due to a missing EOF marker. Only applies
- to bgzipped formats. (Default=False)
-
- format_options: list
- A list of key=value strings, as accepted by --input-fmt-option and
- --output-fmt-option in samtools.
- threads: integer
- Number of threads to use for compressing/decompressing BAM/CRAM files.
- Setting threads to > 1 cannot be combined with `ignore_truncation`.
- (Default=1)
- """
-
def __cinit__(self, *args, **kwargs):
    # start from a well-defined "nothing opened yet" state
    self.index = NULL
    self.htsfile = NULL
    self.filename = None
    self.mode = None
    self.threads = 1
    self.is_remote = False
    self.is_stream = False

    # a ``filename`` keyword replaces whatever was passed positionally
    if "filename" in kwargs:
        args = [kwargs.pop("filename")]

    self._open(*args, **kwargs)

    # allocate memory for iterator
    self.b = <bam1_t*>calloc(1, sizeof(bam1_t))
    if self.b == NULL:
        raise MemoryError("could not allocate memory of size {}".format(sizeof(bam1_t)))
-
def has_index(self):
    """report whether an index is currently loaded for this file.

    Returns True if ``self.index`` is not NULL, False otherwise.
    """
    if self.index == NULL:
        return False
    return True
-
def check_index(self):
    """verify that an index is loaded; return True if so.

    Unlike :meth:`has_index` this method never returns False but
    raises instead.

    Raises
    ------

    ValueError
        if the file is closed, or if no index is loaded.

    AttributeError
        if the file is neither BAM nor CRAM formatted.
    """

    if not self.is_open:
        raise ValueError("I/O operation on closed file")

    if not (self.is_bam or self.is_cram):
        raise AttributeError(
            "AlignmentFile.mapped only available in bam files")

    if self.index == NULL:
        raise ValueError(
            "mapping information not recorded in index "
            "or index not available")

    return True
-
def _open(self,
          filepath_or_object,
          mode=None,
          AlignmentFile template=None,
          reference_names=None,
          reference_lengths=None,
          reference_filename=None,
          text=None,
          header=None,
          port=None,
          add_sq_text=True,
          add_sam_header=True,
          check_header=True,
          check_sq=True,
          index_filename=None,
          filepath_index=None,
          require_index=False,
          referencenames=None,
          referencelengths=None,
          duplicate_filehandle=True,
          ignore_truncation=False,
          format_options=None,
          threads=1):
    '''open a sam, bam or cram formatted file.

    If _open is called on an existing file, the current file
    will be closed and a new file will be opened.

    `filepath_or_object` may be an integer file descriptor, an
    object with a ``fileno`` attribute, or a filename. The remaining
    arguments are described in the class docstring;
    `referencenames` and `referencelengths` are backwards-compatible
    aliases of `reference_names` and `reference_lengths`.

    Raises
    ------
    ValueError
        if `threads` > 1 is combined with `ignore_truncation`, if a
        file-like object is already closed, if no header source is
        given when writing, if the file cannot be opened (and errno
        is not set), does not contain alignment data, has no valid
        header, or has no references while `check_sq` is set.
    NotImplementedError
        if `filepath_or_object` is a StringIO object.
    AssertionError
        if `mode` is not one of the supported modes.
    IOError
        if an index is required or explicitly named but could not
        be loaded. Errors built by ``OSError_from_errno`` are raised
        when opening the file or index, or writing the header, fails
        with errno set.
    '''
    cdef char *cfilename = NULL
    cdef char *creference_filename = NULL
    cdef char *cindexname = NULL
    cdef bam_hdr_t * hdr = NULL
    cdef int ret

    if threads > 1 and ignore_truncation:
        # This won't raise errors if reaching a truncated alignment,
        # because bgzf_mt_reader in htslib does not deal with
        # bgzf_mt_read_block returning non-zero values, contrary
        # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
        # Better to avoid this (for now) than to produce seemingly correct results.
        raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
    self.threads = threads

    # for backwards compatibility:
    if referencenames is not None:
        reference_names = referencenames
    if referencelengths is not None:
        reference_lengths = referencelengths

    # close a previously opened file
    if self.is_open:
        self.close()

    # autodetection for read
    if mode is None:
        mode = "r"

    # plain SAM output gets a header unless explicitly disabled
    if add_sam_header and mode == "w":
        mode = "wh"

    assert mode in ("r", "w", "rb", "wb", "wh",
                    "wbu", "rU", "wb0",
                    "rc", "wc"), \
        "invalid file opening mode `%s`" % mode

    self.duplicate_filehandle = duplicate_filehandle

    # StringIO not supported
    if isinstance(filepath_or_object, StringIO):
        raise NotImplementedError(
            "access from StringIO objects not supported")
    # reading from a file descriptor
    elif isinstance(filepath_or_object, int):
        self.filename = filepath_or_object
        filename = None
        self.is_remote = False
        self.is_stream = True
    # reading from a File object or other object with fileno
    elif hasattr(filepath_or_object, "fileno"):
        if filepath_or_object.closed:
            raise ValueError('I/O operation on closed file')
        self.filename = filepath_or_object
        # .name can be TextIOWrapper
        try:
            filename = encode_filename(str(filepath_or_object.name))
            cfilename = filename
        except AttributeError:
            filename = None
        self.is_remote = False
        self.is_stream = True
    # what remains is a filename
    else:
        self.filename = filename = encode_filename(filepath_or_object)
        cfilename = filename
        self.is_remote = hisremote(cfilename)
        self.is_stream = self.filename == b'-'

    # for htslib, wbu seems to not work
    if mode == "wbu":
        mode = "wb0"

    self.mode = force_bytes(mode)
    self.reference_filename = reference_filename = encode_filename(
        reference_filename)

    if mode[0] == 'w':
        # open file for writing

        if not (template or header or text or (reference_names and reference_lengths)):
            raise ValueError(
                "either supply options `template`, `header`, `text` or both `reference_names` "
                "and `reference_lengths` for writing")

        if template:
            # header is copied, though at the moment not strictly
            # necessary as AlignmentHeader is immutable.
            self.header = template.header.copy()
        elif isinstance(header, AlignmentHeader):
            self.header = header.copy()
        elif isinstance(header, Mapping):
            self.header = AlignmentHeader.from_dict(header)
        elif reference_names and reference_lengths:
            self.header = AlignmentHeader.from_references(
                reference_names,
                reference_lengths,
                add_sq_text=add_sq_text,
                text=text)
        elif text:
            self.header = AlignmentHeader.from_text(text)
        else:
            raise ValueError("not enough information to construct header. Please provide template, "
                             "header, text or reference_names/reference_lengths")
        self.htsfile = self._open_htsfile()

        if self.htsfile == NULL:
            if errno:
                raise OSError_from_errno("Could not open alignment file", filename)
            else:
                raise ValueError("could not open alignment file `{}`".format(force_str(filename)))
        if format_options and len(format_options):
            self.add_hts_options(format_options)
        # set filename with reference sequences. If no filename
        # is given, the CRAM reference arrays will be built from
        # the @SQ header in the header
        if "c" in mode and reference_filename:
            if (hts_set_fai_filename(self.htsfile, self.reference_filename) != 0):
                raise ValueError("failure when setting reference filename")

        # write header to htsfile
        if "b" in mode or "c" in mode or "h" in mode:
            hdr = self.header.ptr
            with nogil:
                ret = sam_hdr_write(self.htsfile, hdr)
            if ret < 0:
                raise OSError_from_errno("Could not write headers", filename)

    elif mode[0] == "r":
        # open file for reading
        self.htsfile = self._open_htsfile()

        if self.htsfile == NULL:
            if errno:
                raise OSError_from_errno("Could not open alignment file", filename)
            else:
                raise ValueError("could not open alignment file `{}`".format(force_str(filename)))

        if self.htsfile.format.category != sequence_data:
            raise ValueError("file does not contain alignment data")

        if format_options and len(format_options):
            self.add_hts_options(format_options)

        self.check_truncation(ignore_truncation)

        # bam/cram files require a valid header
        if self.is_bam or self.is_cram:
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise ValueError(
                    "file does not have a valid header (mode='%s') "
                    "- is it BAM/CRAM format?" % mode)
            self.header = makeAlignmentHeader(hdr)
        else:
            # in sam files a header is optional. If not given,
            # user may provide reference names and lengths to build
            # an on-the-fly header.
            if reference_names and reference_lengths:
                # build header from a target names and lengths
                self.header = AlignmentHeader.from_references(
                    reference_names=reference_names,
                    reference_lengths=reference_lengths,
                    add_sq_text=add_sq_text,
                    text=text)
            else:
                with nogil:
                    hdr = sam_hdr_read(self.htsfile)
                if hdr == NULL:
                    # the message used to be raised with its '%s'
                    # placeholder left unformatted
                    raise ValueError(
                        "SAM? file does not have a valid header (mode='%s'), "
                        "please provide reference_names and reference_lengths" % mode)
                self.header = makeAlignmentHeader(hdr)

        # set filename with reference sequences
        if self.is_cram and reference_filename:
            creference_filename = self.reference_filename
            hts_set_opt(self.htsfile,
                        CRAM_OPT_REFERENCE,
                        creference_filename)

        if check_sq and self.header.nreferences == 0:
            raise ValueError(
                ("file has no sequences defined (mode='%s') - "
                 "is it SAM/BAM format? Consider opening with "
                 "check_sq=False") % mode)

        if self.is_bam or self.is_cram:
            self.index_filename = index_filename or filepath_index
            if self.index_filename:
                # bfile_name is bound in addition to cindexname,
                # presumably to keep the encoded bytes object alive
                # while the C pointer is in use
                cindexname = bfile_name = encode_filename(self.index_filename)

            if cfilename or cindexname:
                with nogil:
                    self.index = sam_index_load3(self.htsfile, cfilename, cindexname,
                                                 HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL)

                # a missing index is an error only if it was named
                # explicitly or is required
                if not self.index and (cindexname or require_index):
                    if errno:
                        raise OSError_from_errno("Could not open index file", self.index_filename)
                    else:
                        raise IOError('unable to open index file `%s`' % self.index_filename)

            elif require_index:
                raise IOError('unable to open index file')

        # save start of data section
        # NOTE(review): indentation was lost in this copy of the file;
        # this step is placed inside the read branch -- confirm against
        # the pristine source.
        if not self.is_stream:
            self.start_offset = self.tell()
-
def fetch(self,
          contig=None,
          start=None,
          stop=None,
          region=None,
          tid=None,
          until_eof=False,
          multiple_iterators=False,
          reference=None,
          end=None):
    """fetch reads aligned in a :term:`region`.

    Regions are interpreted by :meth:`~pysam.HTSFile.parse_region`;
    see there for the accepted ways of specifying them.
    :term:`reference` and `end` are backward compatible synonyms
    for :term:`contig` and `stop`, respectively.

    If neither `contig` nor `region` is given, all mapped reads in
    the file are fetched. They are returned ordered by reference
    sequence, which is not necessarily their order within the
    file. This mode still requires an index; without an index use
    `until_eof=True`.

    If only `contig` is set, all reads aligned to `contig` are
    fetched.

    A :term:`SAM` file does not allow random access. If `region`
    or `contig` are given for such a file, an exception is raised.

    Parameters
    ----------

    contig, start, stop, region, tid :

        description of the region of interest; passed on to
        :meth:`~pysam.HTSFile.parse_region`.

    until_eof : bool

        If `until_eof` is True, all reads from the current file
        position will be returned in order as they are within the
        file. Using this option will also fetch unmapped reads.

    multiple_iterators : bool

        If `multiple_iterators` is True, multiple iterators on the
        same file can be used at the same time. The iterator
        returned will receive its own copy of a filehandle to the
        file, effectively re-opening the file. Re-opening a file
        creates some overhead, so beware. The option is switched
        off when the file is a stream.

    reference, end :

        backward compatible synonyms for `contig` and `stop`.

    Returns
    -------

    An iterator over a collection of reads. : IteratorRow

    Raises
    ------

    ValueError
        if the file is closed, if the genomic coordinates are out
        of range or invalid, or if the file does not permit random
        access to genomic coordinates.

    """
    cdef int rtid, rstart, rstop, has_coord

    if not self.is_open:
        raise ValueError( "I/O operation on closed file" )

    has_coord, rtid, rstart, rstop = self.parse_region(
        contig, start, stop, region, tid,
        end=end, reference=reference)

    # Re-opening is switched off if htsfile is a stream
    if self.is_stream:
        multiple_iterators = False

    if not (self.is_bam or self.is_cram):
        # plain SAM: sequential access only
        if has_coord:
            raise ValueError(
                "fetching by region is not available for SAM files")

        if multiple_iterators == True:
            raise ValueError(
                "multiple iterators not implemented for SAM files")

        return IteratorRowAll(self,
                              multiple_iterators=multiple_iterators)

    # BAM/CRAM: a local file read by region or by reference needs
    # an index
    if not (until_eof or self.is_remote) and not self.has_index():
        raise ValueError(
            "fetch called on bamfile without index")

    if has_coord:
        return IteratorRowRegion(
            self, rtid, rstart, rstop,
            multiple_iterators=multiple_iterators)

    if until_eof:
        return IteratorRowAll(
            self,
            multiple_iterators=multiple_iterators)

    # AH: check - reason why no multiple_iterators for
    # AllRefs?
    return IteratorRowAllRefs(
        self,
        multiple_iterators=multiple_iterators)
-
def head(self, n, multiple_iterators=True):
    '''return an iterator over the first `n` alignments.

    This iterator is useful for inspecting the bam-file.

    Parameters
    ----------

    n : int

        maximum number of alignments the iterator will yield.

    multiple_iterators : bool

        is set to True by default in order to
        avoid changing the current file position.

    Returns
    -------

    an iterator over a collection of reads : IteratorRowHead

    '''
    row_iterator = IteratorRowHead(
        self, n, multiple_iterators=multiple_iterators)
    return row_iterator
-
def mate(self, AlignedSegment read):
    '''return the mate of :class:`pysam.AlignedSegment` `read`.

    The mate is looked up by fetching the reads overlapping the
    mate position recorded in `read` and returning the first one
    that has the same query name and the complementary
    read1/read2 flag.

    .. note::

        The lookup is performed on a separate iterator that
        re-opens the file (``multiple_iterators=True``) so that
        the position of the original file is preserved.

    .. note::

        This method is too slow for high-throughput processing.
        If a read needs to be processed with its mate, work
        from a read name sorted file or, better, cache reads.

    Parameters
    ----------

    read : AlignedSegment

        the read whose mate is requested.

    Returns
    -------

    the mate : AlignedSegment

    Raises
    ------

    ValueError
        if the read is unpaired, the mate is unmapped or the
        mate could not be found.

    '''
    cdef uint32_t flag = read._delegate.core.flag

    if (flag & BAM_FPAIRED) == 0:
        raise ValueError("read %s: is unpaired" %
                         (read.query_name))
    if (flag & BAM_FMUNMAP) != 0:
        raise ValueError("mate %s: is unmapped" %
                         (read.query_name))

    # xor flags to get the other mate
    cdef int x = BAM_FREAD1 + BAM_FREAD2
    flag = (flag ^ x) & x

    # The coordinates must be passed by keyword: the first two
    # positional parameters of fetch() are `contig` and `start`,
    # so passing the mate position positionally would bind it to
    # `contig` and turn `mpos + 1` into the *start* of an
    # open-ended region.
    #
    # The following code is not using the C API and
    # could thus be made much quicker, for example
    # by using tell and seek.
    for mate in self.fetch(
            start=read._delegate.core.mpos,
            stop=read._delegate.core.mpos + 1,
            tid=read._delegate.core.mtid,
            multiple_iterators=True):
        if (mate.flag & flag) != 0 and \
           mate.query_name == read.query_name:
            break
    else:
        raise ValueError("mate not found")

    return mate
-
def pileup(self,
           contig=None,
           start=None,
           stop=None,
           region=None,
           reference=None,
           end=None,
           **kwargs):
    """perform a :term:`pileup` within a :term:`region`.

    The region is specified by :term:`contig`, `start` and `stop`
    (using 0-based indexing). :term:`reference` and `end` are also
    accepted for backward compatibility as synonyms for
    :term:`contig` and `stop`, respectively. Alternatively, a
    samtools 'region' string can be supplied.

    Without 'contig' or 'region' all reads will be used for the
    pileup. The reads will be returned ordered by
    :term:`contig` sequence, which will not necessarily be the
    order within the file.

    Note that :term:`SAM` formatted files do not allow random
    access. In these files, if a 'region' or 'contig' are
    given an exception is raised.

    .. note::

        'all' reads which overlap the region are returned. The
        first base returned will be the first base of the first
        read 'not' necessarily the first base of the region used
        in the query.

    All keyword arguments listed below are handed unchanged to the
    column iterator that is created.

    Parameters
    ----------

    truncate : bool

       By default, the samtools pileup engine outputs all reads
       overlapping a region. If truncate is True and a region is
       given, only columns in the exact region specified are
       returned.

    max_depth : int
       Maximum read depth permitted. The default limit is '8000'.

    stepper : string
       The stepper controls how the iterator advances.
       Possible options for the stepper are

       ``all``
          skip reads in which any of the following flags are set:
          BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL, BAM_FDUP

       ``nofilter``
          uses every single read turning off any filtering.

       ``samtools``
          same filter and read processing as in samtools
          pileup. For full compatibility, this requires a
          'fastafile' to be given. The following options all pertain
          to filtering of the ``samtools`` stepper.

    fastafile : :class:`~pysam.FastaFile` object.

       This is required for some of the steppers.

    ignore_overlaps: bool

       If set to True, detect if read pairs overlap and only take
       the higher quality base. This is the default.

    flag_filter : int

       ignore reads where any of the bits in the flag are set. The default is
       BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

    flag_require : int

       only use reads where certain flags are set. The default is 0.

    ignore_orphans: bool

        ignore orphans (paired reads that are not in a proper pair).
        The default is to ignore orphans.

    min_base_quality: int

       Minimum base quality. Bases below the minimum quality will
       not be output. The default is 13.

    adjust_capq_threshold: int

       adjust mapping quality. The default is 0 for no
       adjustment. The recommended value for adjustment is 50.

    min_mapping_quality : int

       only use reads above a minimum mapping quality. The default is 0.

    compute_baq: bool

       re-alignment computing per-Base Alignment Qualities (BAQ). The
       default is to do re-alignment. Realignment requires a reference
       sequence. If none is present, no realignment will be performed.

    redo_baq: bool

       recompute per-Base Alignment Quality on the fly ignoring
       existing base qualities. The default is False (use existing
       base qualities).

    Returns
    -------

    an iterator over genomic positions. : IteratorColumn

    Raises
    ------

    ValueError
        if the file is closed or if a region is requested and no
        index is available.

    """
    cdef int rtid, has_coord
    cdef int32_t rstart, rstop

    if not self.is_open:
        raise ValueError("I/O operation on closed file")

    has_coord, rtid, rstart, rstop = self.parse_region(
        contig, start, stop, region, reference=reference, end=end)

    if not has_coord:
        # whole-file pileup: go reference by reference when an
        # index is present, sequentially otherwise
        if self.has_index():
            return IteratorColumnAllRefs(self, **kwargs)
        return IteratorColumnAll(self, **kwargs)

    # a region was requested: random access requires an index
    if not self.has_index():
        raise ValueError("no index available for pileup")

    return IteratorColumnRegion(self,
                                tid=rtid,
                                start=rstart,
                                stop=rstop,
                                **kwargs)
-
def count(self,
          contig=None,
          start=None,
          stop=None,
          region=None,
          until_eof=False,
          read_callback="nofilter",
          reference=None,
          end=None):
    '''count the number of reads in :term:`region`

    The region is specified by :term:`contig`, `start` and `stop`.
    :term:`reference` and `end` are also accepted for backward
    compatibility as synonyms for :term:`contig` and `stop`,
    respectively. Alternatively, a `samtools`_ :term:`region`
    string can be supplied.

    A :term:`SAM` file does not allow random access and if
    `region` or `contig` are given, an exception is raised.

    The reads are obtained through :meth:`fetch`, to which the
    region arguments and `until_eof` are forwarded.

    Parameters
    ----------

    contig : string
        reference_name of the genomic region (chromosome)

    start : int
        start of the genomic region (0-based inclusive)

    stop : int
        end of the genomic region (0-based exclusive)

    region : string
        a region string in samtools format.

    until_eof : bool
        count until the end of the file, possibly including
        unmapped reads as well.

    read_callback: string or function

        select a call-back to ignore reads when counting. It can
        be either a string with the following values:

        ``all``
            skip reads in which any of the following
            flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
            BAM_FDUP

        ``nofilter``
            uses every single read

        Alternatively, `read_callback` can be a function
        ``check_read(read)`` that should return True only for
        those reads that shall be included in the counting.

    reference : string
        backward compatible synonym for `contig`

    end : int
        backward compatible synonym for `stop`

    Returns
    -------

    the number of reads that passed the filter : int

    Raises
    ------

    ValueError
        if the file is closed or if the genomic coordinates are
        out of range or invalid.

    '''
    cdef AlignedSegment read
    cdef long counter = 0
    # 1: "all" filter, 2: "nofilter", 0: user supplied callable
    cdef int filter_method = 0

    if not self.is_open:
        raise ValueError("I/O operation on closed file")

    if read_callback == "all":
        filter_method = 1
    elif read_callback == "nofilter":
        filter_method = 2

    for read in self.fetch(contig=contig,
                           start=start,
                           stop=stop,
                           reference=reference,
                           end=end,
                           region=region,
                           until_eof=until_eof):
        if filter_method == 1:
            # "all": drop reads with any of the four flag bits set
            keep = not (read.flag & (0x4 | 0x100 | 0x200 | 0x400))
        elif filter_method == 2:
            # "nofilter": every read counts
            keep = True
        else:
            keep = read_callback(read)

        if keep:
            counter += 1

    return counter
-
@cython.boundscheck(False)  # we do manual bounds checking
def count_coverage(self,
                   contig,
                   start=None,
                   stop=None,
                   region=None,
                   quality_threshold=15,
                   read_callback='all',
                   reference=None,
                   end=None):
    """count the coverage of genomic positions by reads in :term:`region`.

    The region is specified by :term:`contig`, `start` and `stop`.
    :term:`reference` and `end` are also accepted for backward
    compatibility as synonyms for :term:`contig` and `stop`,
    respectively. Alternatively, a `samtools`_ :term:`region`
    string can be supplied. The coverage is computed per-base [ACGT].

    The size of the returned arrays is derived from `start` and
    `stop` (or their synonyms) and the length of the contig; a
    `stop` beyond the end of the contig is clipped to the contig
    length. All region arguments are forwarded unchanged to
    :meth:`fetch` to obtain the reads.

    Parameters
    ----------

    contig : string
        reference_name of the genomic region (chromosome)

    start : int
        start of the genomic region (0-based inclusive). If not
        given, count from the start of the chromosome.

    stop : int
        end of the genomic region (0-based exclusive). If not given,
        count to the end of the chromosome.

    region : string
        a region string.

    quality_threshold : int
        quality_threshold is the minimum quality score (in phred) a
        base has to reach to be counted. A value of 0 or None
        disables the quality check.

    read_callback: string or function

        select a call-back to ignore reads when counting. It can
        be either a string with the following values:

        ``all``
            skip reads in which any of the following
            flags are set: BAM_FUNMAP, BAM_FSECONDARY, BAM_FQCFAIL,
            BAM_FDUP

        ``nofilter``
            uses every single read

        Alternatively, `read_callback` can be a function
        ``check_read(read)`` that should return True only for
        those reads that shall be included in the counting.

    reference : string
        backward compatible synonym for `contig`

    end : int
        backward compatible synonym for `stop`

    Raises
    ------

    ValueError
        if the genomic coordinates are out of range or invalid,
        or if the interval is empty.

    Returns
    -------

    four array.arrays of the same length in order A C G T : tuple

    """
    # Honour the documented synonyms when sizing the interval.
    # Previously only `contig` and `stop` were consulted here, so
    # the arrays did not match the region that was fetched when
    # `reference` or `end` were used.
    ref_name = contig if contig is not None else reference
    stop_arg = stop if stop is not None else end

    cdef uint32_t contig_length = self.get_reference_length(ref_name)
    cdef int _start = start if start is not None else 0
    cdef int _stop = stop_arg if stop_arg is not None else contig_length
    # clip the interval to the end of the contig
    _stop = _stop if _stop < contig_length else contig_length

    if _stop == _start:
        raise ValueError("interval of size 0")
    if _stop < _start:
        raise ValueError("interval of size less than 0")

    cdef int length = _stop - _start
    # one zero-initialised unsigned long counter per position and base
    cdef c_array.array int_array_template = array.array('L', [])
    cdef c_array.array count_a
    cdef c_array.array count_c
    cdef c_array.array count_g
    cdef c_array.array count_t
    count_a = c_array.clone(int_array_template, length, zero=True)
    count_c = c_array.clone(int_array_template, length, zero=True)
    count_g = c_array.clone(int_array_template, length, zero=True)
    count_t = c_array.clone(int_array_template, length, zero=True)

    cdef AlignedSegment read
    cdef cython.str seq
    cdef c_array.array quality
    cdef int qpos
    cdef int refpos
    # 1: "all" filter, 2: "nofilter", 0: user supplied callable
    cdef int filter_method = 0

    if read_callback == "all":
        filter_method = 1
    elif read_callback == "nofilter":
        filter_method = 2

    cdef int _threshold = quality_threshold or 0
    for read in self.fetch(contig=contig,
                           reference=reference,
                           start=start,
                           stop=stop,
                           end=end,
                           region=region):
        # apply filter
        if filter_method == 1:
            # filter = "all"
            if (read.flag & (0x4 | 0x100 | 0x200 | 0x400)):
                continue
        elif filter_method == 2:
            # filter = "nofilter"
            pass
        else:
            if not read_callback(read):
                continue

        # count
        seq = read.seq
        if seq is None:
            continue
        quality = read.query_qualities

        for qpos, refpos in read.get_aligned_pairs(True):
            if qpos is not None and refpos is not None and \
               _start <= refpos < _stop:

                # only check base quality if _threshold > 0
                if (_threshold and quality and quality[qpos] >= _threshold) or not _threshold:
                    # a base matches at most one of the four
                    # letters, hence the elif chain
                    if seq[qpos] == 'A':
                        count_a.data.as_ulongs[refpos - _start] += 1
                    elif seq[qpos] == 'C':
                        count_c.data.as_ulongs[refpos - _start] += 1
                    elif seq[qpos] == 'G':
                        count_g.data.as_ulongs[refpos - _start] += 1
                    elif seq[qpos] == 'T':
                        count_t.data.as_ulongs[refpos - _start] += 1

    return count_a, count_c, count_g, count_t
-
def find_introns_slow(self, read_iterator):
    """Return a dictionary {(start, stop): count}
    Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
    and their support ( = number of reads ).

    read_iterator can be the result of a .fetch(...) call.
    Or it can be a generator filtering such reads. Example
    samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse))

    Reads without a cigar string or without an 'N' in it are
    skipped. For the remaining reads the aligned pairs are walked
    and every run of pairs without a read position that follows a
    pair with a read position is reported as ``(start, stop)``,
    where `start` is the reference position of the first pair of
    the run and `stop` the reference position of the first pair
    after it.
    """
    res = collections.Counter()
    for r in read_iterator:
        cigarstring = r.cigarstring
        # reads without alignment have no cigar string at all
        if cigarstring is None or 'N' not in cigarstring:
            continue

        gap_start = None        # reference position where the open gap began
        prev_was_read = False   # did the previous pair carry a read position?
        for read_loc, genome_loc in r.get_aligned_pairs():
            if read_loc is None:
                # Transition read -> gap. Compare against None
                # explicitly: read position 0 is a valid position
                # and must not be mistaken for "no position".
                if prev_was_read:
                    gap_start = genome_loc
                prev_was_read = False
            else:
                # Transition gap -> read closes the gap; we are right
                # exclusive, so this position is the correct stop.
                # A gap that was never opened (pairs starting
                # without a read position) is ignored.
                if not prev_was_read and gap_start is not None:
                    res[(gap_start, genome_loc)] += 1
                    gap_start = None
                prev_was_read = True
    return res
-
def find_introns(self, read_iterator):
    """Return a dictionary {(start, stop): count}
    Listing the intronic sites in the reads (identified by 'N' in the cigar strings),
    and their support ( = number of reads ).

    read_iterator can be the result of a .fetch(...) call.
    Or it can be a generator filtering such reads. Example
    samfile.find_introns((read for read in samfile.fetch(...) if read.is_reverse))

    Reads without cigar are skipped. For all other reads the
    cigar operations are walked starting at the read position;
    each reference skip contributes one ``(start, stop)`` entry
    with `stop` being exclusive.
    """
    cdef:
        uint32_t base_position, junc_start, nt
        int op
        AlignedSegment r
        int BAM_CREF_SKIP = 3 #BAM_CREF_SKIP

    res = collections.Counter()

    match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
    for r in read_iterator:
        # Check for a missing cigar *before* touching the position:
        # base_position is unsigned, so converting a negative
        # position (presumably what reads without alignment carry)
        # would raise instead of skipping the read.
        cigar = r.cigartuples
        if cigar is None:
            continue

        base_position = r.pos

        for op, nt in cigar:
            if op in match_or_deletion:
                base_position += nt
            elif op == BAM_CREF_SKIP:
                junc_start = base_position
                base_position += nt
                res[(junc_start, base_position)] += 1
    return res
-
-
def close(self):
    '''closes the :class:`pysam.AlignmentFile`.

    Releases the index (if any) and the file handle and drops the
    header. Calling this method on a file without handle is a
    no-op.

    Raises
    ------

    OSError
        if closing the file handle failed for a reason other
        than a broken pipe.
    '''
    cdef int ret

    # nothing to do if there is no handle (any more)
    if self.htsfile == NULL:
        return

    if self.index != NULL:
        hts_idx_destroy(self.index)
        self.index = NULL

    ret = hts_close(self.htsfile)
    self.htsfile = NULL

    self.header = None

    # success, or a broken pipe which is tolerated
    if ret >= 0 or errno == EPIPE:
        return

    if isinstance(self.filename, (str, bytes)):
        raise OSError_from_errno("Closing failed", self.filename)
    raise OSError_from_errno("Closing failed")
-
def __dealloc__(self):
    # Release every C level resource still held by the object:
    # the index, the file handle and the alignment buffer.
    cdef int ret = 0

    if self.index != NULL:
        hts_idx_destroy(self.index)
        self.index = NULL

    if self.htsfile != NULL:
        ret = hts_close(self.htsfile)
        self.htsfile = NULL

    self.header = None

    if self.b != NULL:
        bam_destroy1(self.b)
        self.b = NULL

    # a failed close is reported unless it was a broken pipe
    if ret >= 0 or errno == EPIPE:
        return

    if isinstance(self.filename, (str, bytes)):
        raise OSError_from_errno("Closing failed", self.filename)
    raise OSError_from_errno("Closing failed")
-
cpdef int write(self, AlignedSegment read) except -1:
    '''
    write a single :class:`pysam.AlignedSegment` to disk.

    Raises:
        ValueError
            if the read refers to a reference number that is not
            smaller than the number of references in the header
        IOError
            if the writing failed

    Returns:
        int :
            the number of bytes written. If the file is closed,
            this will be 0.
    '''
    cdef int ret
    cdef bam_hdr_t * hdr

    if not self.is_open:
        return 0

    hdr = self.header.ptr
    if read._delegate.core.tid >= hdr.n_targets:
        raise ValueError(
            "AlignedSegment refers to reference number {} that "
            "is larger than the number of references ({}) in the header".format(
                read._delegate.core.tid, hdr.n_targets))

    with nogil:
        ret = sam_write1(self.htsfile,
                         hdr,
                         read._delegate)

    # kbj: Still need to raise an exception with except -1. Otherwise
    #      when ret == -1 we get a "SystemError: error return without
    #      exception set".
    if ret < 0:
        raise IOError("sam_write1 failed with error code {}".format(ret))

    return ret
-
- # context manager interface
def __enter__(self):
    """enter the runtime context; the file object itself is the
    context value."""
    context_value = self
    return context_value
-
def __exit__(self, exc_type, exc_value, traceback):
    """leave the runtime context by closing the file."""
    self.close()
    # False: an exception raised inside the ``with`` block is
    # not suppressed
    suppress_exception = False
    return suppress_exception
-
- ###############################################################
- ###############################################################
- ###############################################################
- ## properties
- ###############################################################
property mapped:
    """int with total number of mapped alignments according to the
    statistics recorded in the index. This is a read-only
    attribute.
    (This will be 0 for a CRAM file indexed by a .crai index, as that
    index format does not record these statistics.)
    """
    def __get__(self):
        self.check_index()
        cdef int tid
        cdef uint64_t total = 0
        cdef uint64_t mapped, unmapped
        for tid in range(self.header.nreferences):
            # The return value of hts_idx_get_stat is not checked,
            # so make sure a call that fails without writing its
            # outputs contributes 0 rather than whatever the
            # variables held before.
            mapped = 0
            unmapped = 0
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            total += mapped
        return total
-
property unmapped:
    """int with total number of unmapped reads according to the statistics
    recorded in the index. This number of reads includes the number of reads
    without coordinates. This is a read-only attribute.
    (This will be 0 for a CRAM file indexed by a .crai index, as that
    index format does not record these statistics.)
    """
    def __get__(self):
        self.check_index()
        cdef int tid
        # start with the reads that have no coordinates at all
        cdef uint64_t total = hts_idx_get_n_no_coor(self.index)
        cdef uint64_t mapped, unmapped
        for tid in range(self.header.nreferences):
            # The return value of hts_idx_get_stat is not checked,
            # so make sure a call that fails without writing its
            # outputs contributes 0 rather than whatever the
            # variables held before.
            mapped = 0
            unmapped = 0
            with nogil:
                hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
            total += unmapped
        return total
-
property nocoordinate:
    """int with total number of reads without coordinates according to the
    statistics recorded in the index, i.e., the statistic printed for "*"
    by the ``samtools idxstats`` command. This is a read-only attribute.
    (This will be 0 for a CRAM file indexed by a .crai index, as that
    index format does not record these statistics.)
    """
    def __get__(self):
        cdef uint64_t n_no_coor

        self.check_index()
        with nogil:
            n_no_coor = hts_idx_get_n_no_coor(self.index)
        return n_no_coor
-
def get_index_statistics(self):
    """return statistics about mapped/unmapped reads per chromosome as
    they are stored in the index, similarly to the statistics printed
    by the ``samtools idxstats`` command.

    CRAI indexes do not record these statistics, so for a CRAM file
    with a .crai index the returned statistics will all be 0.

    Returns:
        list :
            a list of records for each chromosome. Each record has the
            attributes 'contig', 'mapped', 'unmapped' and 'total'.
    """

    self.check_index()
    cdef int tid
    cdef uint64_t mapped, unmapped
    results = []
    # TODO: use header
    for tid in range(self.nreferences):
        # The return value of hts_idx_get_stat is not checked, so
        # make sure a call that fails without writing its outputs
        # is reported as 0 rather than as whatever the variables
        # held before (e.g. the values of the previous reference).
        mapped = 0
        unmapped = 0
        with nogil:
            hts_idx_get_stat(self.index, tid, &mapped, &unmapped)
        results.append(
            IndexStats._make((
                self.get_reference_name(tid),
                mapped,
                unmapped,
                mapped + unmapped)))

    return results
-
- ###############################################################
- ## file-object like iterator access
- ## note: concurrent access will cause errors (see IteratorRow
- ## and multiple_iterators)
- ## Possible solutions: deprecate or open new file handle
def __iter__(self):
    """return the file itself as iterator over its alignments.

    Raises ValueError on a closed file and NotImplementedError
    for a non-BAM file whose header defines no references.
    """
    if not self.is_open:
        raise ValueError("I/O operation on closed file")

    missing_header = (not self.is_bam) and self.header.nreferences == 0
    if missing_header:
        raise NotImplementedError(
            "can not iterate over samfile without header")

    return self
-
cdef bam1_t * getCurrent(self):
    '''return the alignment buffer that :meth:`cnext` reads into.'''
    cdef bam1_t * current = self.b
    return current
-
cdef int cnext(self):
    '''
    cversion of iterator. Used by :class:`pysam.AlignmentFile.IteratorColumn`.

    Reads the next record from the file into the internal
    buffer and returns the status reported by ``sam_read1``.
    '''
    cdef bam_hdr_t * hdr = self.header.ptr
    cdef int status
    with nogil:
        status = sam_read1(self.htsfile, hdr, self.b)
    return status
-
def __next__(self):
    """return the next alignment in the file.

    A status of -1 from :meth:`cnext` ends the iteration, any
    other negative status is reported as IOError.
    """
    cdef int ret = self.cnext()

    if ret == -1:
        raise StopIteration
    if ret < 0:
        raise IOError(read_failure_reason(ret))

    return makeAlignedSegment(self.b, self.header)
-
- ###########################################
- # methods/properties referencing the header
def is_valid_tid(self, int tid):
    """
    return True if the numerical :term:`tid` is valid; False otherwise.

    Note that the unmapped tid code (-1) counts as an invalid.

    The question is answered by the header; a ValueError is
    raised if there is no header (closed file).
    """
    if self.header is not None:
        return self.header.is_valid_tid(tid)
    raise ValueError("header not available in closed files")
-
def get_tid(self, reference):
    """
    return the numerical :term:`tid` corresponding to
    :term:`reference`

    returns -1 if reference is not known.

    The lookup is done by the header; a ValueError is raised if
    there is no header (closed file).
    """
    if self.header is not None:
        return self.header.get_tid(reference)
    raise ValueError("header not available in closed files")
-
def get_reference_name(self, tid):
    """
    return :term:`reference` name corresponding to numerical :term:`tid`

    The lookup is done by the header; a ValueError is raised if
    there is no header (closed file).
    """
    if self.header is not None:
        return self.header.get_reference_name(tid)
    raise ValueError("header not available in closed files")
-
def get_reference_length(self, reference):
    """
    return the length of the sequence identified by
    :term:`reference`

    The lookup is done by the header; a ValueError is raised if
    there is no header (closed file).
    """
    if self.header is not None:
        return self.header.get_reference_length(reference)
    raise ValueError("header not available in closed files")
-
property nreferences:
    """int with the number of :term:`reference` sequences in the file.
    This is a read-only attribute. Accessing it without a header
    (closed file) raises a ValueError."""
    def __get__(self):
        if not self.header:
            raise ValueError("header not available in closed files")
        return self.header.nreferences
-
property references:
    """tuple with the names of :term:`reference` sequences. This is a
    read-only attribute. Accessing it without a header (closed
    file) raises a ValueError."""
    def __get__(self):
        if not self.header:
            raise ValueError("header not available in closed files")
        return self.header.references
-
property lengths:
    """tuple of the lengths of the :term:`reference` sequences. This is a
    read-only attribute. The lengths are in the same order as
    :attr:`pysam.AlignmentFile.references`

    Accessing it without a header (closed file) raises a
    ValueError.
    """
    def __get__(self):
        if not self.header:
            raise ValueError("header not available in closed files")
        return self.header.lengths
-
- # Compatibility functions for pysam < 0.14
property text:
    """deprecated, use :attr:`references` and :attr:`lengths` instead

    Returns the string representation of the header. Accessing
    it without a header (closed file) raises a ValueError.
    """
    def __get__(self):
        if not self.header:
            raise ValueError("header not available in closed files")
        return self.header.__str__()
-
- # Compatibility functions for pysam < 0.8.3
def gettid(self, reference):
    """deprecated, use :meth:`get_tid` instead"""
    # plain forwarder kept for pysam < 0.8.3 callers
    tid = self.get_tid(reference)
    return tid
-
def getrname(self, tid):
    """deprecated, use :meth:`get_reference_name` instead"""
    # plain forwarder kept for pysam < 0.8.3 callers
    name = self.get_reference_name(tid)
    return name
-
-
cdef class IteratorRow:
    '''abstract base class for iterators over mapped reads.

    Various iterators implement different behaviours for wrapping around
    contig boundaries. Examples include:

    :class:`pysam.IteratorRowRegion`
        iterate within a single contig and a defined region.

    :class:`pysam.IteratorRowAll`
        iterate until EOF. This iterator will also include unmapped reads.

    :class:`pysam.IteratorRowAllRefs`
        iterate over all reads in all reference sequences.

    The method :meth:`AlignmentFile.fetch` returns an IteratorRow.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    '''

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        '''set up file handle, index, header and alignment buffer.

        With `multiple_iterators` the file of `samfile` is opened a
        second time and the iterator owns the new handle and index;
        otherwise handle, index and header of `samfile` are shared.

        Raises ValueError if `samfile` is closed and IOError if the
        file can not be re-opened or its header can not be read.
        '''
        cdef char *cfilename
        cdef char *creference_filename
        cdef char *cindexname = NULL

        if not samfile.is_open:
            raise ValueError("I/O operation on closed file")

        # makes sure that samfile stays alive as long as the
        # iterator is alive
        self.samfile = samfile

        # reopen the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:

            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            # an explicit check instead of an assert, which may be
            # compiled out
            if self.htsfile == NULL:
                raise IOError("unable to re-open file")

            # Take ownership as soon as the handle exists so that
            # __dealloc__ releases it (and the index) even if one of
            # the following steps raises.
            self.owns_samfile = True

            if samfile.has_index():
                if samfile.index_filename:
                    # bindex_filename keeps the encoded name alive
                    # while cindexname points into it
                    cindexname = bindex_filename = encode_filename(samfile.index_filename)
                with nogil:
                    self.index = sam_index_load2(self.htsfile, cfilename, cindexname)
            else:
                self.index = NULL

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr is NULL:
                raise IOError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)

            # options specific to CRAM files
            if samfile.is_cram and samfile.reference_filename:
                creference_filename = samfile.reference_filename
                hts_set_opt(self.htsfile,
                            CRAM_OPT_REFERENCE,
                            creference_filename)

        else:
            self.htsfile = samfile.htsfile
            self.index = samfile.index
            self.owns_samfile = False
            self.header = samfile.header

        self.retval = 0

        self.b = bam_init1()

    def __dealloc__(self):
        # the alignment buffer always belongs to the iterator;
        # handle and index only if the file was re-opened
        bam_destroy1(self.b)
        if self.owns_samfile:
            hts_idx_destroy(self.index)
            hts_close(self.htsfile)
-
-
cdef class IteratorRowRegion(IteratorRow):
    """*(AlignmentFile samfile, int tid, int beg, int stop,
    int multiple_iterators=False)*

    iterate over mapped reads in a region.

    The region is queried through the index, hence `samfile`
    needs to have one.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile,
                 int tid, int beg, int stop,
                 int multiple_iterators=False):

        if not samfile.has_index():
            raise ValueError("no index available for iteration")

        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # set up the index query for the requested interval
        with nogil:
            self.iter = sam_itr_queryi(self.index, tid, beg, stop)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        '''return the alignment buffer that :meth:`cnext` reads into.'''
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn

        The status of the read is stored in ``self.retval``.
        '''
        with nogil:
            self.retval = hts_itr_next(
                hts_get_bgzfp(self.htsfile),
                self.iter,
                self.b,
                self.htsfile)

    def __next__(self):
        self.cnext()

        if self.retval >= 0:
            return makeAlignedSegment(self.b, self.header)

        if self.retval == -1:
            raise StopIteration

        if self.retval == -2:
            # Note: it is currently not the case that hts_iter_next
            # returns -2 for a truncated file.
            # See https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
            raise IOError('truncated file')

        raise IOError("error while reading file {}: {}".format(self.samfile.filename, self.retval))

    def __dealloc__(self):
        # release the index query
        hts_itr_destroy(self.iter)
-
-
cdef class IteratorRowHead(IteratorRow):
    """*(AlignmentFile samfile, n, int multiple_iterators=False)*

    iterate over first n reads in `samfile`

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.head`.

    """

    def __init__(self,
                 AlignmentFile samfile,
                 int n,
                 int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # no row has been returned so far
        self.current_row = 0
        self.max_rows = n

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        '''return the alignment buffer that :meth:`cnext` reads into.'''
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn

        Returns the status reported by ``sam_read1``.
        '''
        cdef bam_hdr_t * hdr = self.header.ptr
        cdef int status
        with nogil:
            status = sam_read1(self.htsfile, hdr, self.b)
        return status

    def __next__(self):
        cdef int ret

        # stop once the requested number of rows has been returned
        if self.current_row >= self.max_rows:
            raise StopIteration

        ret = self.cnext()
        if ret == -1:
            raise StopIteration
        if ret < 0:
            raise IOError(read_failure_reason(ret))

        self.current_row += 1
        return makeAlignedSegment(self.b, self.header)
-
-
cdef class IteratorRowAll(IteratorRow):
    """*(AlignmentFile samfile, int multiple_iterators=False)*

    iterate over all reads in `samfile`

    Records are read sequentially from the current position of
    the file handle.

    .. note::

        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        '''return the alignment buffer that :meth:`cnext` reads into.'''
        return self.b

    cdef int cnext(self):
        '''cversion of iterator. Used by IteratorColumn

        Returns the status reported by ``sam_read1``.
        '''
        cdef bam_hdr_t * hdr = self.header.ptr
        cdef int status
        with nogil:
            status = sam_read1(self.htsfile, hdr, self.b)
        return status

    def __next__(self):
        cdef int ret = self.cnext()

        if ret == -1:
            raise StopIteration
        if ret < 0:
            raise IOError(read_failure_reason(ret))

        return makeAlignedSegment(self.b, self.header)
-
-
cdef class IteratorRowAllRefs(IteratorRow):
    """iterates over all mapped reads by chaining iterators over each
    reference

    .. note::
        It is usually not necessary to create an object of this class
        explicitly. It is returned as a result of call to a
        :meth:`AlignmentFile.fetch`.

    Raises
    ------

    ValueError
        on construction, if `samfile` has no index.

    IOError
        during iteration, if the underlying region iterator reports a
        read error (a status below -1).

    """

    def __init__(self, AlignmentFile samfile, multiple_iterators=False):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        if not samfile.has_index():
            raise ValueError("no index available for fetch")

        # -1 marks "no per-reference iterator created yet"
        self.tid = -1

    def nextiter(self):
        '''create the region iterator for the reference ``self.tid``.'''
        # get a new iterator for a chromosome. The file
        # will not be re-opened.
        self.rowiter = IteratorRowRegion(self.samfile,
                                         self.tid,
                                         0,
                                         MAX_POS)
        # set htsfile and header of the rowiter
        # to the values in this iterator to reflect multiple_iterators
        self.rowiter.htsfile = self.htsfile
        self.rowiter.header = self.header

        # make sure the iterator understand that IteratorRowAllRefs
        # has ownership
        self.rowiter.owns_samfile = False

    def __iter__(self):
        return self

    def __next__(self):
        cdef int retval

        # Create an initial iterator
        if self.tid == -1:
            if not self.samfile.nreferences:
                raise StopIteration
            self.tid = 0
            self.nextiter()

        while 1:
            self.rowiter.cnext()
            retval = self.rowiter.retval

            # Any non-negative status is a successfully read record. This
            # matches the check used by the other row iterators; testing
            # for "> 0" would drop records reported with status 0.
            if retval >= 0:
                return makeAlignedSegment(self.rowiter.b, self.header)

            # Only -1 means "this reference is exhausted". Anything lower
            # is a read failure and must not be mistaken for the end of
            # the reference, or reads would silently go missing.
            if retval < -1:
                raise IOError(
                    "error while reading file {}: {}".format(
                        self.samfile.filename, retval))

            # Proceed to next reference or stop
            self.tid += 1
            if self.tid < self.samfile.nreferences:
                self.nextiter()
            else:
                raise StopIteration
-
-
cdef class IteratorRowSelection(IteratorRow):
    """*(AlignmentFile samfile)*

    Iterator over the reads of `samfile` stored at a given list of file
    positions.

    .. note::
        Instances are normally obtained as the result of a call to
        :meth:`AlignmentFile.fetch` rather than being created directly.
    """

    def __init__(self, AlignmentFile samfile, positions, int multiple_iterators=True):
        super().__init__(samfile, multiple_iterators=multiple_iterators)

        # index into `positions` of the next offset to visit
        self.current_pos = 0
        self.positions = positions

    def __iter__(self):
        return self

    cdef bam1_t * getCurrent(self):
        return self.b

    cdef int cnext(self):
        '''C-level step.

        Seeks to the next stored file position and reads one record into
        ``self.b``. Returns -1 once all positions have been visited,
        otherwise the status code of ``sam_read1``.
        '''
        cdef int status
        cdef uint64_t offset
        cdef bam_hdr_t * header_ptr

        # every stored position has been consumed
        if self.current_pos >= len(self.positions):
            return -1

        offset = self.positions[self.current_pos]
        with nogil:
            # third argument 0: seek relative to the start of the file
            status = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, 0)
        if status < 0:
            raise OSError_from_errno("Can't seek", self.samfile.filename)
        self.current_pos += 1

        header_ptr = self.header.ptr
        with nogil:
            status = sam_read1(self.htsfile, header_ptr, self.b)
        return status

    def __next__(self):
        cdef int status = self.cnext()

        if status == -1:
            # -1 terminates the iteration without an error
            raise StopIteration
        if status < 0:
            raise IOError(read_failure_reason(status))

        return makeAlignedSegment(self.b, self.header)
-
-
cdef int __advance_nofilter(void *data, bam1_t *b):
    '''pileup read callback: take the next read from the region
    iterator, accepting every read.

    `data` points to the `__iterdata` of the column iterator; the status
    code of ``sam_itr_next`` is returned unchanged.
    '''
    cdef __iterdata * state = <__iterdata*>data
    cdef int status
    with nogil:
        status = sam_itr_next(state.htsfile, state.iter, b)
    return status
-
-
cdef int __advance_raw_nofilter(void *data, bam1_t *b):
    '''pileup read callback: take the next read straight from the file
    (no region iterator), accepting every read.

    `data` points to the `__iterdata` of the column iterator; the status
    code of ``sam_read1`` is returned unchanged.
    '''
    cdef __iterdata * state = <__iterdata*>data
    cdef int status
    with nogil:
        status = sam_read1(state.htsfile, state.header, b)
    return status
-
-
cdef int __advance_all(void *data, bam1_t *b):
    '''only use reads for pileup passing the flag filter.

    Reads are taken from the region iterator until one is found whose
    flag shares no bit with ``flag_filter`` of the iterator data. That
    filter is whatever the column iterator was configured with; its
    default there is
    BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

    Returns the status code of ``sam_itr_next`` for the accepted read,
    or the first negative status encountered.
    '''

    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b)
        if ret < 0:
            # end of data or read error: hand the status to the caller
            break
        if b.core.flag & d.flag_filter:
            # read carries a filtered flag, try the next one
            continue
        break
    return ret
-
-
cdef int __advance_raw_all(void *data, bam1_t *b):
    '''only use reads for pileup passing the flag filter, reading
    straight from the file (no region iterator).

    A read is skipped when its flag shares a bit with ``flag_filter`` of
    the iterator data. Returns the status code of ``sam_read1`` for the
    accepted read, or the first negative status encountered.
    '''
    cdef __iterdata * state = <__iterdata*>data
    cdef int status

    while 1:
        with nogil:
            status = sam_read1(state.htsfile, state.header, b)
        # stop on a negative status, or on the first read that does not
        # carry any of the filtered flags
        if status < 0 or not (b.core.flag & state.flag_filter):
            return status
-
-
cdef int __advance_samtools(void * data, bam1_t * b):
    '''advance using same filter and read processing as in
    the samtools pileup.

    Reads are taken from the region iterator when one is set in the
    iterator data, otherwise straight from the file. A read is skipped
    when

    * its flag shares a bit with ``flag_filter``,
    * ``flag_require`` is set and the flag shares no bit with it,
    * mapping-quality capping reports a negative value,
    * its mapping quality is below ``min_mapping_quality``, or
    * ``ignore_orphans`` is set and the read is paired but not flagged
      as properly paired.

    If a fasta file is attached, the reference sequence of the read's
    target is (re)loaded whenever the target changes, and is used for
    BAQ realignment and mapping-quality capping.

    Returns the status code of the read call for the accepted read, or
    the first negative status encountered.
    '''
    cdef __iterdata * d = <__iterdata*>data
    cdef int ret
    cdef int q
    cdef bint on_loaded_ref

    while 1:
        with nogil:
            ret = sam_itr_next(d.htsfile, d.iter, b) if d.iter else sam_read1(d.htsfile, d.header, b)
        if ret < 0:
            break
        if b.core.flag & d.flag_filter:
            continue
        if d.flag_require and not (b.core.flag & d.flag_require):
            continue

        # reload sequence
        # A read without a target (tid < 0) must not trigger a reload:
        # target_name[] would be indexed with a negative value.
        if d.fastafile != NULL and b.core.tid >= 0 and b.core.tid != d.tid:
            if d.seq != NULL:
                free(d.seq)
                d.seq = NULL
            d.tid = b.core.tid
            with nogil:
                d.seq = faidx_fetch_seq(
                    d.fastafile,
                    d.header.target_name[d.tid],
                    0, MAX_POS,
                    &d.seq_len)

            if d.seq == NULL:
                # NOTE(review): this function is handed to htslib as a C
                # callback; whether the exception reaches Python depends
                # on the exception declaration of this function - confirm.
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        charptr_to_str(d.header.target_name[d.tid]), d.tid))

        # The cached sequence only applies to reads on the target it was
        # loaded for; never realign against another target's sequence.
        on_loaded_ref = d.seq != NULL and b.core.tid == d.tid

        # realign read - changes base qualities
        if on_loaded_ref and d.compute_baq:
            # 4th option to realign is flag:
            # apply_baq = flag&1, extend_baq = flag&2, redo_baq = flag&4
            if d.redo_baq:
                sam_prob_realn(b, d.seq, d.seq_len, 7)
            else:
                sam_prob_realn(b, d.seq, d.seq_len, 3)

        if on_loaded_ref and d.adjust_capq_threshold > 10:
            q = sam_cap_mapq(b, d.seq, d.seq_len, d.adjust_capq_threshold)
            if q < 0:
                continue
            elif b.core.qual > q:
                b.core.qual = q

        if b.core.qual < d.min_mapping_quality:
            continue
        if d.ignore_orphans and b.core.flag & BAM_FPAIRED and not (b.core.flag & BAM_FPROPER_PAIR):
            continue

        break

    return ret
-
-
cdef class IteratorColumn:
    '''abstract base class for iterators over columns.

    IteratorColumn objects wrap the pileup functionality of samtools.

    For reasons of efficiency, the iterator points to the current
    pileup buffer. The pileup buffer is updated at every iteration.
    This might cause some unexpected behaviour. For example,
    consider the conversion to a list::

       f = AlignmentFile("file.bam", "rb")
       result = list(f.pileup())

    Here, ``result`` will contain ``n`` objects of type
    :class:`~pysam.PileupColumn` for ``n`` columns, but each object in
    ``result`` will contain the same information.

    The desired behaviour can be achieved by list comprehension::

       result = [x.pileups() for x in f.pileup()]

    ``result`` will be a list of ``n`` lists of objects of type
    :class:`~pysam.PileupRead`.

    If the iterator is associated with a :class:`~pysam.Fastafile`
    using the :meth:`add_reference` method, then the iterator will
    export the current sequence via the method :meth:`get_sequence`
    and the property :attr:`seq_len`.

    See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
    '''

    def __cinit__( self, AlignmentFile samfile, **kwargs):
        self.samfile = samfile
        self.fastafile = kwargs.get("fastafile", None)
        self.stepper = kwargs.get("stepper", "samtools")
        self.max_depth = kwargs.get("max_depth", 8000)
        self.ignore_overlaps = kwargs.get("ignore_overlaps", True)
        self.min_base_quality = kwargs.get("min_base_quality", 13)

        # settings read by the __advance_* callbacks
        self.iterdata.seq = NULL
        self.iterdata.min_mapping_quality = kwargs.get("min_mapping_quality", 0)
        self.iterdata.flag_require = kwargs.get("flag_require", 0)
        self.iterdata.flag_filter = kwargs.get("flag_filter", BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
        self.iterdata.adjust_capq_threshold = kwargs.get("adjust_capq_threshold", 0)
        self.iterdata.compute_baq = kwargs.get("compute_baq", True)
        self.iterdata.redo_baq = kwargs.get("redo_baq", False)
        self.iterdata.ignore_orphans = kwargs.get("ignore_orphans", True)

        # state of the current pileup column
        self.tid = 0
        self.pos = 0
        self.n_plp = 0
        self.plp = NULL
        self.pileup_iter = <bam_mplp_t>NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.

        Updates ``tid``, ``pos``, ``n_plp`` and ``plp`` through
        ``bam_mplp_auto`` and returns its status code.
        '''
        # do not release gil here because of call-backs
        cdef int ret = bam_mplp_auto(self.pileup_iter,
                                     &self.tid,
                                     &self.pos,
                                     &self.n_plp,
                                     &self.plp)
        return ret

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.
        '''
        return self.iterdata.seq

    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.iterdata.seq_len

    def add_reference(self, FastaFile fastafile):
        '''
        add reference sequences in `fastafile` to iterator.'''
        self.fastafile = fastafile
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            # Clear the pointer: the advance callback and __dealloc__
            # both free a non-NULL seq, so leaving the stale address
            # here would lead to a double free.
            self.iterdata.seq = NULL
        # tid of -1 matches no read, so the sequence gets reloaded
        self.iterdata.tid = -1
        self.iterdata.fastafile = self.fastafile.fastafile

    def has_reference(self):
        '''
        return true if iterator is associated with a reference

        The attached fasta file object itself (or None) is returned,
        not a bool.'''
        return self.fastafile

    cdef _setup_iterator(self,
                         int tid,
                         int start,
                         int stop,
                         int multiple_iterators=0):
        '''setup the iterator structure

        Creates the region iterator for `tid`:`start`-`stop`, resets
        the cached reference sequence and builds a new pileup iterator
        for the configured stepper.

        Raises ValueError if the stepper option is unknown.
        '''

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators)
        # NOTE(review): reads are pulled through self.samfile.htsfile,
        # not through a handle owned by self.iter - confirm this is
        # intended when multiple_iterators is set.
        self.iterdata.htsfile = self.samfile.htsfile
        self.iterdata.iter = self.iter.iter

        # This method is called again for every reference by
        # IteratorColumnAllRefs; free the sequence cached for the
        # previous set-up instead of dropping the pointer.
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef _setup_raw_rest_iterator(self):
        '''set up an "iterator" that just uses sam_read1(), similar to HTS_IDX_REST

        Raises ValueError if the stepper option is unknown.
        '''

        # no region iterator: the callbacks read sequentially instead
        self.iter = None
        self.iterdata.iter = NULL
        self.iterdata.htsfile = self.samfile.htsfile

        # free a previously cached reference sequence instead of
        # dropping the pointer
        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
        self.iterdata.seq = NULL
        self.iterdata.tid = -1
        self.iterdata.header = self.samfile.header.ptr

        if self.fastafile is not None:
            self.iterdata.fastafile = self.fastafile.fastafile
        else:
            self.iterdata.fastafile = NULL

        # Free any previously allocated memory before reassigning
        # pileup_iter
        self._free_pileup_iter()

        cdef void * data[1]
        data[0] = <void*>&self.iterdata

        if self.stepper is None or self.stepper == "all":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_all,
                                                 data)
        elif self.stepper == "nofilter":
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_raw_nofilter,
                                                 data)
        elif self.stepper == "samtools":
            # __advance_samtools falls back to sam_read1() by itself
            # when no region iterator is set
            with nogil:
                self.pileup_iter = bam_mplp_init(1,
                                                 <bam_plp_auto_f>&__advance_samtools,
                                                 data)
        else:
            raise ValueError(
                "unknown stepper option `%s` in IteratorColumn" % self.stepper)

        if self.max_depth:
            with nogil:
                bam_mplp_set_maxcnt(self.pileup_iter, self.max_depth)

        if self.ignore_overlaps:
            with nogil:
                bam_mplp_init_overlaps(self.pileup_iter)

    cdef reset(self, tid, start, stop):
        '''reset iterator position.

        This permits using the iterator multiple times without
        having to incur the full set-up costs.

        Raises TypeError if the iterator was set up without a region.
        '''
        if self.iter is None:
            raise TypeError("Raw iterator set up without region cannot be reset")

        self.iter = IteratorRowRegion(self.samfile, tid, start, stop, multiple_iterators=0)
        self.iterdata.iter = self.iter.iter

        # invalidate sequence if different tid
        if self.tid != tid:
            if self.iterdata.seq != NULL:
                free(self.iterdata.seq)
            self.iterdata.seq = NULL
            self.iterdata.tid = -1

        # the existing pileup iterator is reused, only its state is reset
        with nogil:
            bam_mplp_reset(self.pileup_iter)

    cdef _free_pileup_iter(self):
        '''free the memory alloc'd by bam_plp_init.

        This is needed before setup_iterator allocates another
        pileup_iter, or else memory will be lost.  '''
        if self.pileup_iter != <bam_mplp_t>NULL:
            with nogil:
                bam_mplp_reset(self.pileup_iter)
                bam_mplp_destroy(self.pileup_iter)
                self.pileup_iter = <bam_mplp_t>NULL

    def __dealloc__(self):
        # reset in order to avoid memory leak messages for iterators
        # that have not been fully consumed
        self._free_pileup_iter()
        self.plp = <const bam_pileup1_t*>NULL

        if self.iterdata.seq != NULL:
            free(self.iterdata.seq)
            self.iterdata.seq = NULL

    # backwards compatibility

    def hasReference(self):
        return self.has_reference()

    cdef char * getSequence(self):
        return self.get_sequence()

    def addReference(self, FastaFile fastafile):
        return self.add_reference(fastafile)
-
-
cdef class IteratorColumnRegion(IteratorColumn):
    '''column iterator restricted to a single region.

    With `truncate` set, only columns with ``start <= pos < stop`` are
    returned.
    '''
    def __cinit__(self,
                  AlignmentFile samfile,
                  int tid = 0,
                  int start = 0,
                  int stop = MAX_POS,
                  int truncate = False,
                  int multiple_iterators = True,
                  **kwargs ):

        # Multiple iterators are not available for CRAM: warn and fall
        # back to a single iterator.
        if samfile.is_cram and multiple_iterators:
            warnings.warn("multiple_iterators not implemented for CRAM")
            multiple_iterators = False

        self._setup_iterator(tid, start, stop, multiple_iterators)

        # region bounds, consulted by __next__ when truncating
        self.start = start
        self.stop = stop
        self.truncate = truncate

    def __next__(self):
        cdef int status

        while 1:
            status = self.cnext()
            if status < 0:
                raise ValueError("error during iteration" )
            if status == 0:
                # no further columns
                raise StopIteration

            if self.truncate:
                # column lies before the region: keep advancing
                if self.pos < self.start:
                    continue
                # column lies at or beyond the end of the region: done
                if self.pos >= self.stop:
                    raise StopIteration

            return makePileupColumn(&self.plp,
                                    self.tid,
                                    self.pos,
                                    self.n_plp,
                                    self.min_base_quality,
                                    self.iterdata.seq,
                                    self.samfile.header)
-
-
cdef class IteratorColumnAllRefs(IteratorColumn):
    """column iterator over the whole file, built by chaining one
    region iterator per reference
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        # a file without references offers nothing to iterate over
        if not samfile.nreferences:
            raise StopIteration

        # start with the full span of the reference in self.tid
        self._setup_iterator(self.tid, 0, MAX_POS, 1)

    def __next__(self):
        cdef int status

        while 1:
            status = self.cnext()
            if status < 0:
                raise ValueError("error during iteration")

            if status > 0:
                # a column on the current reference
                return makePileupColumn(&self.plp,
                                        self.tid,
                                        self.pos,
                                        self.n_plp,
                                        self.min_base_quality,
                                        self.iterdata.seq,
                                        self.samfile.header)

            # status == 0: this reference is done, move on to the next
            # one or stop after the last
            self.tid += 1
            if self.tid >= self.samfile.nreferences:
                raise StopIteration
            self._setup_iterator(self.tid, 0, MAX_POS, 0)
-
-
cdef class IteratorColumnAll(IteratorColumn):
    """column iterator over the whole file that reads sequentially
    and does not use an index
    """

    def __cinit__(self,
                  AlignmentFile samfile,
                  **kwargs):

        self._setup_raw_rest_iterator()

    def __next__(self):
        cdef int status = self.cnext()

        if status < 0:
            raise ValueError("error during iteration")
        if status == 0:
            # no further columns
            raise StopIteration

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.iterdata.seq,
                                self.samfile.header)
-
-
cdef class SNPCall:
    '''the results of a SNP call.

    All values are exposed through read-only properties; ``str()``
    gives them as one tab-separated line.
    '''
    cdef int _tid
    cdef int _pos
    cdef char _reference_base
    cdef char _genotype
    cdef int _consensus_quality
    cdef int _snp_quality
    cdef int _rms_mapping_quality
    cdef int _coverage

    property tid:
        '''the chromosome ID as is defined in the header'''
        def __get__(self):
            return self._tid

    property pos:
        '''nucleotide position of SNP.'''
        def __get__(self):
            return self._pos

    property reference_base:
        '''reference base at pos. ``N`` if no reference sequence supplied.'''
        def __get__(self):
            # the base is stored as a single char
            return from_string_and_size(&self._reference_base, 1)

    property genotype:
        '''the genotype called.'''
        def __get__(self):
            # the genotype is stored as a single char
            return from_string_and_size(&self._genotype, 1)

    property consensus_quality:
        '''the genotype quality (Phred-scaled).'''
        def __get__(self):
            return self._consensus_quality

    property snp_quality:
        '''the snp quality (Phred scaled) - probability of consensus being
        identical to reference sequence.'''
        def __get__(self):
            return self._snp_quality

    property mapping_quality:
        '''the root mean square (rms) of the mapping quality of all reads
        involved in the call.'''
        def __get__(self):
            return self._rms_mapping_quality

    property coverage:
        '''coverage or read depth - the number of reads involved in the call.'''
        def __get__(self):
            return self._coverage

    def __str__(self):
        # column order of the tab-separated representation
        columns = (
            self.tid,
            self.pos,
            self.reference_base,
            self.genotype,
            self.consensus_quality,
            self.snp_quality,
            self.mapping_quality,
            self.coverage,
        )
        return "\t".join([str(value) for value in columns])
-
-
cdef class IndexedReads:
    """Index a Sam/BAM-file by query name while keeping the
    original sort order intact.

    The index is kept in memory and can be substantial.

    By default, the file is re-opened to avoid conflicts if multiple
    operators work on the same file. Set `multiple_iterators` = False
    to not re-open `samfile`.

    Parameters
    ----------

    samfile : AlignmentFile
        File to be indexed.

    multiple_iterators : bool
        Flag indicating whether the file should be reopened. Reopening prevents
        existing iterators being affected by the indexing.

    Raises
    ------

    AssertionError
        if `samfile` is not a BAM file.

    OSError
        if the file cannot be reopened or its header cannot be read.

    """

    def __init__(self, AlignmentFile samfile, int multiple_iterators=True):
        cdef char *cfilename
        cdef bam_hdr_t * hdr = NULL

        # makes sure that samfile stays alive as long as this
        # object is alive.
        self.samfile = samfile
        assert samfile.is_bam, "can only apply IndexReads on bam files"

        # Re-opening the file - note that this makes the iterator
        # slow and causes pileup to slow down significantly.
        if multiple_iterators:
            cfilename = samfile.filename
            with nogil:
                self.htsfile = hts_open(cfilename, 'r')
            if self.htsfile == NULL:
                raise OSError_from_errno("Unable to reopen file", cfilename)

            # Record ownership as soon as the handle exists, so that
            # __dealloc__ closes it even if reading the header fails.
            self.owns_samfile = True

            # need to advance in newly opened file to position after header
            # better: use seek/tell?
            with nogil:
                hdr = sam_hdr_read(self.htsfile)
            if hdr == NULL:
                raise OSError("unable to read header information")
            self.header = makeAlignmentHeader(hdr)
        else:
            # share handle and header with samfile; nothing to close here
            self.htsfile = self.samfile.htsfile
            self.header = samfile.header
            self.owns_samfile = False

    def build(self):
        '''build the index.

        Indexing starts from the current file position. For every read,
        the file offset at which it starts is appended to the list
        stored under its query name.

        Raises
        ------

        MemoryError
            if the read buffer cannot be allocated.

        IOError
            if reading stops because of an error rather than at the end
            of the file. Reads seen before the error remain in the index.

        '''
        cdef int ret = 1
        cdef uint64_t pos
        cdef bam_hdr_t * hdr
        cdef bam1_t * b

        self.index = collections.defaultdict(list)

        b = <bam1_t*>calloc(1, sizeof( bam1_t))
        if b == NULL:
            raise MemoryError("could not allocate {} bytes".format(sizeof(bam1_t)))

        hdr = self.header.ptr

        try:
            while 1:
                with nogil:
                    # remember where the record starts before reading it
                    pos = bgzf_tell(hts_get_bgzfp(self.htsfile))
                    ret = sam_read1(self.htsfile,
                                    hdr,
                                    b)

                # any non-negative status is a successfully read record
                if ret < 0:
                    break

                qname = charptr_to_str(pysam_bam_get_qname(b))
                self.index[qname].append(pos)
        finally:
            # release the read buffer on every exit path
            bam_destroy1(b)

        # -1 is the regular end of the file; anything lower is a failure
        # that would otherwise leave a silently incomplete index
        if ret < -1:
            raise IOError(read_failure_reason(ret))

    def find(self, query_name):
        '''find `query_name` in index.

        Returns
        -------

        IteratorRowSelection
            Returns an iterator over all reads with query_name.

        Raises
        ------

        KeyError
            if the `query_name` is not in the index.

        '''
        # Test membership first: the index is a defaultdict, so a plain
        # lookup of an unknown name would insert an empty entry instead
        # of raising KeyError.
        if query_name in self.index:
            return IteratorRowSelection(
                self.samfile,
                self.index[query_name],
                multiple_iterators = False)
        else:
            raise KeyError("read %s not found" % query_name)

    def __dealloc__(self):
        # only close a handle that was opened by this object
        if self.owns_samfile and self.htsfile != NULL:
            hts_close(self.htsfile)
            self.htsfile = NULL
|