25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

180 lines
6.5 KiB

  1. # -*- coding: utf-8 -*-
  2. """Parsing of VCF files from ``file``-like objects
  3. """
  4. import gzip
  5. import os
  6. import pysam
  7. from . import parser
  8. __author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"
  9. class Reader:
  10. """Class for parsing of files from ``file``-like objects
  11. Instead of using the constructor, use the class methods
  12. :py:meth:`~Reader.from_stream` and
  13. :py:meth:`~Reader.from_path`.
  14. On construction, the header will be read from the file which can cause
  15. problems. After construction, :py:class:`~Reader` can be used as
  16. an iterable of :py:class:`~vcfpy.record.Record`.
  17. :raises: :py:class:`~vcfpy.exceptions.InvalidHeaderException` in the case
  18. of problems reading the header
  19. .. note::
  20. It is important to note that the ``header`` member is used during
  21. the parsing of the file. **If you need a modified version then
  22. create a copy, e.g., using :py:method:`~vcfpy.header.Header.copy`**.
  23. .. note::
  24. If you use the ``parsed_samples`` feature and you write out
  25. records then you must not change the ``FORMAT`` of the record.
  26. """
  27. @classmethod
  28. def from_stream(
  29. klass, stream, path=None, tabix_path=None, record_checks=None, parsed_samples=None
  30. ):
  31. """Create new :py:class:`Reader` from file
  32. .. note::
  33. If you use the ``parsed_samples`` feature and you write out
  34. records then you must not change the ``FORMAT`` of the record.
  35. :param stream: ``file``-like object to read from
  36. :param path: optional string with path to store (for display only)
  37. :param list record_checks: record checks to perform, can contain
  38. 'INFO' and 'FORMAT'
  39. :param list parsed_samples: ``list`` of ``str`` values with names of
  40. samples to parse call information for (for speedup); leave to
  41. ``None`` for ignoring
  42. """
  43. record_checks = record_checks or []
  44. if tabix_path and not path:
  45. raise ValueError("Must give path if tabix_path is given")
  46. return Reader(
  47. stream=stream,
  48. path=path,
  49. tabix_path=tabix_path,
  50. record_checks=record_checks,
  51. parsed_samples=parsed_samples,
  52. )
  53. @classmethod
  54. def from_path(klass, path, tabix_path=None, record_checks=None, parsed_samples=None):
  55. """Create new :py:class:`Reader` from path
  56. .. note::
  57. If you use the ``parsed_samples`` feature and you write out
  58. records then you must not change the ``FORMAT`` of the record.
  59. :param path: the path to load from (converted to ``str`` for
  60. compatibility with ``path.py``)
  61. :param tabix_path: optional string with path to TBI index,
  62. automatic inferral from ``path`` will be tried on the fly
  63. if not given
  64. :param list record_checks: record checks to perform, can contain
  65. 'INFO' and 'FORMAT'
  66. """
  67. record_checks = record_checks or []
  68. path = str(path)
  69. if path.endswith(".gz"):
  70. f = gzip.open(path, "rt")
  71. if not tabix_path:
  72. tabix_path = path + ".tbi"
  73. if not os.path.exists(tabix_path):
  74. tabix_path = None # guessing path failed
  75. else:
  76. f = open(path, "rt")
  77. return klass.from_stream(
  78. stream=f,
  79. path=path,
  80. tabix_path=tabix_path,
  81. record_checks=record_checks,
  82. parsed_samples=parsed_samples,
  83. )
  84. def __init__(self, stream, path=None, tabix_path=None, record_checks=None, parsed_samples=None):
  85. #: stream (``file``-like object) to read from
  86. self.stream = stream
  87. #: optional ``str`` with the path to the stream
  88. self.path = path
  89. #: optional ``str`` with path to tabix file
  90. self.tabix_path = tabix_path
  91. #: checks to perform on records, can contain 'FORMAT' and 'INFO'
  92. self.record_checks = tuple(record_checks or [])
  93. #: if set, list of samples to parse for
  94. self.parsed_samples = parsed_samples
  95. #: the ``pysam.TabixFile`` used for reading from index bgzip-ed VCF;
  96. #: constructed on the fly
  97. self.tabix_file = None
  98. # the iterator through the Tabix file to use
  99. self.tabix_iter = None
  100. #: the parser to use
  101. self.parser = parser.Parser(stream, self.path, self.record_checks)
  102. #: the Header
  103. self.header = self.parser.parse_header(parsed_samples)
  104. def fetch(self, chrom_or_region, begin=None, end=None):
  105. """Jump to the start position of the given chromosomal position
  106. and limit iteration to the end position
  107. :param str chrom_or_region: name of the chromosome to jump to if
  108. begin and end are given and a samtools region string otherwise
  109. (e.g. "chr1:123,456-123,900").
  110. :param int begin: 0-based begin position (inclusive)
  111. :param int end: 0-based end position (exclusive)
  112. """
  113. if begin is not None and end is None:
  114. raise ValueError("begin and end must both be None or neither")
  115. # close tabix file if any and is open
  116. if self.tabix_file and not self.tabix_file.closed:
  117. self.tabix_file.close()
  118. # open tabix file if not yet open
  119. if not self.tabix_file or self.tabix_file.closed:
  120. self.tabix_file = pysam.TabixFile(filename=self.path, index=self.tabix_path)
  121. # jump to the next position
  122. if begin is None:
  123. self.tabix_iter = self.tabix_file.fetch(region=chrom_or_region)
  124. else:
  125. self.tabix_iter = self.tabix_file.fetch(reference=chrom_or_region, start=begin, end=end)
  126. return self
  127. def close(self):
  128. """Close underlying stream"""
  129. if self.tabix_file and not self.tabix_file.closed:
  130. self.tabix_file.close()
  131. if self.stream:
  132. self.stream.close()
  133. def __enter__(self):
  134. return self
  135. def __exit__(self, type_, value, traceback):
  136. self.close()
  137. def __iter__(self):
  138. return self
  139. def __next__(self):
  140. """Return next object from file
  141. :returns:
  142. :raises: ``vcfpy.exceptions.InvalidRecordException`` in the case of
  143. problems reading the record
  144. :raises: ``StopException`` if at end
  145. """
  146. if self.tabix_iter:
  147. return self.parser.parse_line(str(next(self.tabix_iter)))
  148. else:
  149. result = self.parser.parse_next_record()
  150. if result is None:
  151. raise StopIteration()
  152. else:
  153. return result