Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 
 

807 řádky
28 KiB

  1. # -*- coding: utf-8 -*-
  2. """Parsing of VCF files from ``str``
  3. """
  4. import ast
  5. import functools
  6. import math
  7. import re
  8. import warnings
  9. from . import header
  10. from . import record
  11. from . import exceptions
  12. from .exceptions import (
  13. CannotConvertValue,
  14. LeadingTrailingSpaceInKey,
  15. UnknownFilter,
  16. UnknownVCFVersion,
  17. SpaceInChromLine,
  18. )
  19. from .compat import OrderedDict
  20. __author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"
  21. # expected "#CHROM" header prefix when there are samples
  22. REQUIRE_SAMPLE_HEADER = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
  23. # expected "#CHROM" header prefix when there are no samples
  24. REQUIRE_NO_SAMPLE_HEADER = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
  25. #: Supported VCF versions, a warning will be issued otherwise
  26. SUPPORTED_VCF_VERSIONS = ("VCFv4.0", "VCFv4.1", "VCFv4.2", "VCFv4.3")
  27. class QuotedStringSplitter:
  28. """Helper class for splitting quoted strings
  29. Has support for interpreting quoting strings but also brackets. Meant
  30. for splitting the VCF header line dicts
  31. """
  32. #: state constant for normal
  33. NORMAL = 0
  34. #: state constant for quoted
  35. QUOTED = 1
  36. #: state constant for delimiter
  37. ESCAPED = 2
  38. #: state constant for array
  39. ARRAY = 3
  40. #: state constant for delimiter
  41. DELIM = 4
  42. def __init__(self, delim=",", quote='"', brackets="[]"):
  43. #: string delimiter
  44. self.delim = delim
  45. #: quote character
  46. self.quote = quote
  47. #: two-character string with opening and closing brackets
  48. assert len(brackets) == 2
  49. self.brackets = brackets
  50. def run(self, s):
  51. """Split string ``s`` at delimiter, correctly interpreting quotes
  52. Further, interprets arrays wrapped in one level of ``[]``. No
  53. recursive brackets are interpreted (as this would make the grammar
  54. non-regular and currently this complexity is not needed). Currently,
  55. quoting inside of braces is not supported either. This is just to
  56. support the example from VCF v4.3.
  57. """
  58. begins, ends = [0], []
  59. # transition table
  60. DISPATCH = {
  61. self.NORMAL: self._handle_normal,
  62. self.QUOTED: self._handle_quoted,
  63. self.ARRAY: self._handle_array,
  64. self.DELIM: self._handle_delim,
  65. self.ESCAPED: self._handle_escaped,
  66. }
  67. # run state automaton
  68. state = self.NORMAL
  69. for pos, c in enumerate(s):
  70. state = DISPATCH[state](c, pos, begins, ends)
  71. ends.append(len(s))
  72. assert len(begins) == len(ends)
  73. # Build resulting list
  74. return [s[start:end] for start, end in zip(begins, ends)]
  75. def _handle_normal(self, c, pos, begins, ends): # pylint: disable=W0613
  76. if c == self.delim:
  77. ends.append(pos)
  78. return self.DELIM
  79. elif c == self.quote:
  80. return self.QUOTED
  81. elif c == self.brackets[0]:
  82. return self.ARRAY
  83. else:
  84. return self.NORMAL
  85. def _handle_quoted(self, c, pos, begins, ends): # pylint: disable=W0613
  86. if c == "\\":
  87. return self.ESCAPED
  88. elif c == self.quote:
  89. return self.NORMAL
  90. else:
  91. return self.QUOTED
  92. def _handle_array(self, c, pos, begins, ends): # pylint: disable=W0613
  93. if c == self.brackets[1]:
  94. return self.NORMAL
  95. else:
  96. return self.ARRAY
  97. def _handle_delim(self, c, pos, begins, ends): # pylint: disable=W0613
  98. begins.append(pos)
  99. return self.NORMAL
  100. def _handle_escaped(self, c, pos, begins, ends): # pylint: disable=W0613
  101. return self.QUOTED
  102. def split_quoted_string(s, delim=",", quote='"', brackets="[]"):
  103. return QuotedStringSplitter(delim, quote, brackets).run(s)
  104. def split_mapping(pair_str):
  105. """Split the ``str`` in ``pair_str`` at ``'='``
  106. Warn if key needs to be stripped
  107. """
  108. orig_key, value = pair_str.split("=", 1)
  109. key = orig_key.strip()
  110. if key != orig_key:
  111. warnings.warn(
  112. "Mapping key {} has leading or trailing space".format(repr(orig_key)),
  113. LeadingTrailingSpaceInKey,
  114. )
  115. return key, value
  116. def parse_mapping(value):
  117. """Parse the given VCF header line mapping
  118. Such a mapping consists of "key=value" pairs, separated by commas and
  119. wrapped into angular brackets ("<...>"). Strings are usually quoted,
  120. for certain known keys, exceptions are made, depending on the tag key.
  121. this, however, only gets important when serializing.
  122. :raises: :py:class:`vcfpy.exceptions.InvalidHeaderException` if
  123. there was a problem parsing the file
  124. """
  125. if not value.startswith("<") or not value.endswith(">"):
  126. raise exceptions.InvalidHeaderException(
  127. "Header mapping value was not wrapped in angular brackets"
  128. )
  129. # split the comma-separated list into pairs, ignoring commas in quotes
  130. pairs = split_quoted_string(value[1:-1], delim=",", quote='"')
  131. # split these pairs into key/value pairs, converting flags to mappings
  132. # to True
  133. key_values = []
  134. for pair in pairs:
  135. if "=" in pair:
  136. key, value = split_mapping(pair)
  137. if value.startswith('"') and value.endswith('"'):
  138. value = ast.literal_eval(value)
  139. elif value.startswith("[") and value.endswith("]"):
  140. value = [v.strip() for v in value[1:-1].split(",")]
  141. else:
  142. key, value = pair, True
  143. key_values.append((key, value))
  144. # return completely parsed mapping as OrderedDict
  145. return OrderedDict(key_values)
  146. class HeaderLineParserBase:
  147. """Parse into appropriate HeaderLine"""
  148. def parse_key_value(self, key, value):
  149. """Parse the key/value pair
  150. :param str key: the key to use in parsing
  151. :param str value: the value to parse
  152. :returns: :py:class:`vcfpy.header.HeaderLine` object
  153. """
  154. raise NotImplementedError("Must be overridden")
  155. class StupidHeaderLineParser(HeaderLineParserBase):
  156. """Parse into HeaderLine (no particular structure)"""
  157. def parse_key_value(self, key, value):
  158. return header.HeaderLine(key, value)
  159. class MappingHeaderLineParser(HeaderLineParserBase):
  160. """Parse into HeaderLine (no particular structure)"""
  161. def __init__(self, line_class):
  162. """Initialize the parser"""
  163. #: the class to use for the VCF header line
  164. self.line_class = line_class
  165. def parse_key_value(self, key, value):
  166. return self.line_class(key, value, parse_mapping(value))
  167. def build_header_parsers():
  168. """Return mapping for parsers to use for each VCF header type
  169. Inject the WarningHelper into the parsers.
  170. """
  171. result = {
  172. "ALT": MappingHeaderLineParser(header.AltAlleleHeaderLine),
  173. "contig": MappingHeaderLineParser(header.ContigHeaderLine),
  174. "FILTER": MappingHeaderLineParser(header.FilterHeaderLine),
  175. "FORMAT": MappingHeaderLineParser(header.FormatHeaderLine),
  176. "INFO": MappingHeaderLineParser(header.InfoHeaderLine),
  177. "META": MappingHeaderLineParser(header.MetaHeaderLine),
  178. "PEDIGREE": MappingHeaderLineParser(header.PedigreeHeaderLine),
  179. "SAMPLE": MappingHeaderLineParser(header.SampleHeaderLine),
  180. "__default__": StupidHeaderLineParser(), # fallback
  181. }
  182. return result
  183. # Field value converters
  184. _CONVERTERS = {
  185. "Integer": int,
  186. "Float": float,
  187. "Flag": lambda x: True,
  188. "Character": str,
  189. "String": str,
  190. }
  191. def convert_field_value(type_, value):
  192. """Convert atomic field value according to the type"""
  193. if value == ".":
  194. return None
  195. elif type_ in ("Character", "String"):
  196. if "%" in value:
  197. for k, v in record.UNESCAPE_MAPPING:
  198. value = value.replace(k, v)
  199. return value
  200. else:
  201. try:
  202. return _CONVERTERS[type_](value)
  203. except ValueError:
  204. warnings.warn(
  205. ("{} cannot be converted to {}, keeping as " "string.").format(value, type_),
  206. CannotConvertValue,
  207. )
  208. return value
  209. def parse_field_value(field_info, value):
  210. """Parse ``value`` according to ``field_info``
  211. """
  212. if field_info.id == "FT":
  213. return [x for x in value.split(";") if x != "."]
  214. elif field_info.type == "Flag":
  215. return True
  216. elif field_info.number == 1:
  217. return convert_field_value(field_info.type, value)
  218. else:
  219. if value == ".":
  220. return []
  221. else:
  222. return [convert_field_value(field_info.type, x) for x in value.split(",")]
  223. # Regular expression for break-end
  224. BREAKEND_PATTERN = re.compile("[\\[\\]]")
  225. def parse_breakend(alt_str):
  226. """Parse breakend and return tuple with results, parameters for BreakEnd
  227. constructor
  228. """
  229. arr = BREAKEND_PATTERN.split(alt_str)
  230. mate_chrom, mate_pos = arr[1].split(":", 1)
  231. mate_pos = int(mate_pos)
  232. if mate_chrom[0] == "<":
  233. mate_chrom = mate_chrom[1:-1]
  234. within_main_assembly = False
  235. else:
  236. within_main_assembly = True
  237. FWD_REV = {True: record.FORWARD, False: record.REVERSE}
  238. orientation = FWD_REV[alt_str[0] == "[" or alt_str[0] == "]"]
  239. mate_orientation = FWD_REV["[" in alt_str]
  240. if orientation == record.FORWARD:
  241. sequence = arr[2]
  242. else:
  243. sequence = arr[0]
  244. return (mate_chrom, mate_pos, orientation, mate_orientation, sequence, within_main_assembly)
  245. def process_sub_grow(ref, alt_str):
  246. """Process substution where the string grows"""
  247. if len(alt_str) == 0:
  248. raise exceptions.InvalidRecordException("Invalid VCF, empty ALT")
  249. elif len(alt_str) == 1:
  250. if ref[0] == alt_str[0]:
  251. return record.Substitution(record.DEL, alt_str)
  252. else:
  253. return record.Substitution(record.INDEL, alt_str)
  254. else:
  255. return record.Substitution(record.INDEL, alt_str)
  256. def process_sub_shrink(ref, alt_str):
  257. """Process substution where the string shrink"""
  258. if len(ref) == 0:
  259. raise exceptions.InvalidRecordException("Invalid VCF, empty REF")
  260. elif len(ref) == 1:
  261. if ref[0] == alt_str[0]:
  262. return record.Substitution(record.INS, alt_str)
  263. else:
  264. return record.Substitution(record.INDEL, alt_str)
  265. else:
  266. return record.Substitution(record.INDEL, alt_str)
  267. def process_sub(ref, alt_str):
  268. """Process substitution"""
  269. if len(ref) == len(alt_str):
  270. if len(ref) == 1:
  271. return record.Substitution(record.SNV, alt_str)
  272. else:
  273. return record.Substitution(record.MNV, alt_str)
  274. elif len(ref) > len(alt_str):
  275. return process_sub_grow(ref, alt_str)
  276. else: # len(ref) < len(alt_str):
  277. return process_sub_shrink(ref, alt_str)
  278. def process_alt(header, ref, alt_str): # pylint: disable=W0613
  279. """Process alternative value using Header in ``header``"""
  280. # By its nature, this function contains a large number of case distinctions
  281. if "]" in alt_str or "[" in alt_str:
  282. return record.BreakEnd(*parse_breakend(alt_str))
  283. elif alt_str[0] == "." and len(alt_str) > 0:
  284. return record.SingleBreakEnd(record.FORWARD, alt_str[1:])
  285. elif alt_str[-1] == "." and len(alt_str) > 0:
  286. return record.SingleBreakEnd(record.REVERSE, alt_str[:-1])
  287. elif alt_str[0] == "<" and alt_str[-1] == ">":
  288. inner = alt_str[1:-1]
  289. return record.SymbolicAllele(inner)
  290. else: # substitution
  291. return process_sub(ref, alt_str)
  292. class HeaderParser:
  293. """Helper class for parsing a VCF header
  294. """
  295. def __init__(self):
  296. #: Sub parsers to use for parsing the header lines
  297. self.sub_parsers = build_header_parsers()
  298. def parse_line(self, line):
  299. """Parse VCF header ``line`` (trailing '\r\n' or '\n' is ignored)
  300. :param str line: ``str`` with line to parse
  301. :param dict sub_parsers: ``dict`` mapping header line types to
  302. appropriate parser objects
  303. :returns: appropriate :py:class:`HeaderLine` parsed from ``line``
  304. :raises: :py:class:`vcfpy.exceptions.InvalidHeaderException` if
  305. there was a problem parsing the file
  306. """
  307. if not line or not line.startswith("##"):
  308. raise exceptions.InvalidHeaderException(
  309. 'Invalid VCF header line (must start with "##") {}'.format(line)
  310. )
  311. if "=" not in line:
  312. raise exceptions.InvalidHeaderException(
  313. 'Invalid VCF header line (must contain "=") {}'.format(line)
  314. )
  315. line = line[len("##") :].rstrip() # trim '^##' and trailing whitespace
  316. # split key/value pair at "="
  317. key, value = split_mapping(line)
  318. sub_parser = self.sub_parsers.get(key, self.sub_parsers["__default__"])
  319. return sub_parser.parse_key_value(key, value)
  320. class RecordParser:
  321. """Helper class for parsing VCF records"""
  322. def __init__(self, header, samples, record_checks=None):
  323. #: Header with the meta information
  324. self.header = header
  325. #: SamplesInfos with sample information
  326. self.samples = samples
  327. #: The checks to perform, can contain 'INFO' and 'FORMAT'
  328. self.record_checks = tuple(record_checks or [])
  329. # Expected number of fields
  330. if self.samples.names:
  331. self.expected_fields = 9 + len(self.samples.names)
  332. else:
  333. self.expected_fields = 8
  334. # Cache of FieldInfo objects by FORMAT string
  335. self._format_cache = {}
  336. # Cache of FILTER entries, also applied to FORMAT/FT
  337. self._filter_ids = set(self.header.filter_ids())
  338. # Helper for checking INFO fields
  339. if "INFO" in self.record_checks:
  340. self._info_checker = InfoChecker(self.header)
  341. else:
  342. self._info_checker = NoopInfoChecker()
  343. # Helper for checking FORMAT fields
  344. if "FORMAT" in self.record_checks:
  345. self._format_checker = FormatChecker(self.header)
  346. else:
  347. self._format_checker = NoopFormatChecker()
  348. def parse_line(self, line_str):
  349. """Parse line from file (including trailing line break) and return
  350. resulting Record
  351. """
  352. line_str = line_str.rstrip()
  353. if not line_str:
  354. return None # empty line, EOF
  355. arr = self._split_line(line_str)
  356. # CHROM
  357. chrom = arr[0]
  358. # POS
  359. pos = int(arr[1])
  360. # IDS
  361. if arr[2] == ".":
  362. ids = []
  363. else:
  364. ids = arr[2].split(";")
  365. # REF
  366. ref = arr[3]
  367. # ALT
  368. alts = []
  369. if arr[4] != ".":
  370. for alt in arr[4].split(","):
  371. alts.append(process_alt(self.header, ref, alt))
  372. # QUAL
  373. if arr[5] == ".":
  374. qual = None
  375. else:
  376. try:
  377. qual = int(arr[5])
  378. except ValueError: # try as float
  379. qual = float(arr[5])
  380. # FILTER
  381. if arr[6] == ".":
  382. filt = []
  383. else:
  384. filt = arr[6].split(";")
  385. self._check_filters(filt, "FILTER")
  386. # INFO
  387. info = self._parse_info(arr[7], len(alts))
  388. if len(arr) == 9:
  389. raise exceptions.IncorrectVCFFormat("Expected 8 or 10+ columns, got 9!")
  390. elif len(arr) == 8:
  391. format_ = None
  392. calls = None
  393. else:
  394. # FORMAT
  395. format_ = arr[8].split(":")
  396. # sample/call columns
  397. calls = self._handle_calls(alts, format_, arr[8], arr)
  398. return record.Record(chrom, pos, ids, ref, alts, qual, filt, info, format_, calls)
  399. def _handle_calls(self, alts, format_, format_str, arr):
  400. """Handle FORMAT and calls columns, factored out of parse_line"""
  401. if format_str not in self._format_cache:
  402. self._format_cache[format_str] = list(map(self.header.get_format_field_info, format_))
  403. # per-sample calls
  404. calls = []
  405. for sample, raw_data in zip(self.samples.names, arr[9:]):
  406. if self.samples.is_parsed(sample):
  407. data = self._parse_calls_data(format_, self._format_cache[format_str], raw_data)
  408. call = record.Call(sample, data)
  409. self._format_checker.run(call, len(alts))
  410. self._check_filters(call.data.get("FT"), "FORMAT/FT", call.sample)
  411. calls.append(call)
  412. else:
  413. calls.append(record.UnparsedCall(sample, raw_data))
  414. return calls
  415. def _check_filters(self, filt, source, sample=None):
  416. if not filt:
  417. return
  418. for f in filt:
  419. self._check_filter(f, source, sample)
  420. def _check_filter(self, f, source, sample):
  421. if f == "PASS":
  422. pass # the PASS filter is implicitely defined
  423. elif f not in self._filter_ids:
  424. if source == "FILTER":
  425. warnings.warn(
  426. ("Filter not found in header: {}; problem in FILTER " "column").format(f),
  427. UnknownFilter,
  428. )
  429. else:
  430. assert source == "FORMAT/FT" and sample
  431. warnings.warn(
  432. (
  433. "Filter not found in header: {}; problem in "
  434. "FORMAT/FT column of sample {}"
  435. ).format(f, sample),
  436. UnknownFilter,
  437. )
  438. def _split_line(self, line_str):
  439. """Split line and check number of columns"""
  440. arr = line_str.rstrip().split("\t")
  441. if len(arr) != self.expected_fields:
  442. raise exceptions.InvalidRecordException(
  443. (
  444. "The line contains an invalid number of fields. Was "
  445. "{} but expected {}\n{}".format(len(arr), 9 + len(self.samples.names), line_str)
  446. )
  447. )
  448. return arr
  449. def _parse_info(self, info_str, num_alts):
  450. """Parse INFO column from string"""
  451. result = OrderedDict()
  452. if info_str == ".":
  453. return result
  454. # The standard is very nice to parsers, we can simply split at
  455. # semicolon characters, although I (Manuel) don't know how strict
  456. # programs follow this
  457. for entry in info_str.split(";"):
  458. if "=" not in entry: # flag
  459. key = entry
  460. result[key] = parse_field_value(self.header.get_info_field_info(key), True)
  461. else:
  462. key, value = split_mapping(entry)
  463. result[key] = parse_field_value(self.header.get_info_field_info(key), value)
  464. self._info_checker.run(key, result[key], num_alts)
  465. return result
  466. @classmethod
  467. def _parse_calls_data(klass, format_, infos, gt_str):
  468. """Parse genotype call information from arrays using format array
  469. :param list format: List of strings with format names
  470. :param gt_str arr: string with genotype information values
  471. """
  472. data = OrderedDict()
  473. # The standard is very nice to parsers, we can simply split at
  474. # colon characters, although I (Manuel) don't know how strict
  475. # programs follow this
  476. for key, info, value in zip(format_, infos, gt_str.split(":")):
  477. data[key] = parse_field_value(info, value)
  478. return data
  479. class HeaderChecker:
  480. """Helper class for checking a VCF header
  481. """
  482. def run(self, header):
  483. """Check the header
  484. Warnings will be printed using ``warnings`` while errors will raise
  485. an exception.
  486. :raises: ``vcfpy.exceptions.InvalidHeaderException`` in the case of
  487. severe errors reading the header
  488. """
  489. self._check_header_lines(header.lines)
  490. def _check_header_lines(self, header_lines):
  491. """Check header lines, in particular for starting file "##fileformat"
  492. """
  493. if not header_lines:
  494. raise exceptions.InvalidHeaderException(
  495. "The VCF file did not contain any header lines!"
  496. )
  497. first = header_lines[0]
  498. if first.key != "fileformat":
  499. raise exceptions.InvalidHeaderException("The VCF file did not start with ##fileformat")
  500. if first.value not in SUPPORTED_VCF_VERSIONS:
  501. warnings.warn("Unknown VCF version {}".format(first.value), UnknownVCFVersion)
  502. @functools.lru_cache(maxsize=32)
  503. def binomial(n, k):
  504. try:
  505. res = math.factorial(n) // math.factorial(k) // math.factorial(n - k)
  506. except ValueError:
  507. res = 0
  508. return res
  509. class NoopInfoChecker:
  510. """Helper class that performs no checks"""
  511. def __init__(self):
  512. pass
  513. def run(self, key, value, num_alts):
  514. pass
  515. class InfoChecker:
  516. """Helper class for checking an INFO field"""
  517. def __init__(self, header):
  518. #: VCFHeader to use for checking
  519. self.header = header
  520. def run(self, key, value, num_alts):
  521. """Check value in INFO[key] of record
  522. Currently, only checks for consistent counts are implemented
  523. :param str key: key of INFO entry to check
  524. :param value: value to check
  525. :param int alts: list of alternative alleles, for length
  526. """
  527. field_info = self.header.get_info_field_info(key)
  528. if not isinstance(value, list):
  529. return
  530. TABLE = {
  531. ".": len(value),
  532. "A": num_alts,
  533. "R": num_alts + 1,
  534. "G": binomial(num_alts + 1, 2), # diploid only at the moment
  535. }
  536. expected = TABLE.get(field_info.number, field_info.number)
  537. if len(value) != expected:
  538. tpl = "Number of elements for INFO field {} is {} instead of {}"
  539. warnings.warn(
  540. tpl.format(key, len(value), field_info.number), exceptions.IncorrectListLength
  541. )
  542. class NoopFormatChecker:
  543. """Helper class that performs no checks"""
  544. def __init__(self):
  545. pass
  546. def run(self, call, num_alts):
  547. pass
  548. class FormatChecker:
  549. """Helper class for checking a FORMAT field"""
  550. def __init__(self, header):
  551. #: VCFHeader to use for checking
  552. self.header = header
  553. def run(self, call, num_alts):
  554. """Check ``FORMAT`` of a record.Call
  555. Currently, only checks for consistent counts are implemented
  556. """
  557. for key, value in call.data.items():
  558. self._check_count(call, key, value, num_alts)
  559. def _check_count(self, call, key, value, num_alts):
  560. field_info = self.header.get_format_field_info(key)
  561. if isinstance(value, list):
  562. return
  563. num_alleles = len(call.gt_alleles or [])
  564. TABLE = {
  565. ".": len(value),
  566. "A": num_alts,
  567. "R": num_alts + 1,
  568. "G": binomial(num_alts + num_alleles, num_alleles),
  569. }
  570. expected = TABLE.get(field_info.number, field_info.number)
  571. if len(value) != expected:
  572. tpl = (
  573. "Number of elements for FORMAT field {} is {} instead "
  574. "of {} (number specifier {})"
  575. )
  576. warnings.warn(
  577. tpl.format(key, len(value), expected, field_info.number),
  578. exceptions.IncorrectListLength,
  579. )
  580. class Parser:
  581. """Class for line-wise parsing of VCF files
  582. In most cases, you want to use :py:class:`vcfpy.reader.Reader` instead.
  583. :param stream: ``file``-like object to read from
  584. :param str path: path the VCF is parsed from, for display purposes
  585. only, optional
  586. """
  587. def __init__(self, stream, path=None, record_checks=None):
  588. self.stream = stream
  589. self.path = path
  590. #: checks to perform, can contain 'INFO' and 'FORMAT'
  591. self.record_checks = tuple(record_checks or [])
  592. #: header, once it has been read
  593. self.header = None
  594. # the currently read line
  595. self._line = stream.readline() # trailing '\n'
  596. #: :py:class:`vcfpy.header.SamplesInfos` with sample information;
  597. #: set on parsing the header
  598. self.samples = None
  599. # helper for parsing the records
  600. self._record_parser = None
  601. # helper for checking the header
  602. self._header_checker = HeaderChecker()
  603. def _read_next_line(self):
  604. """Read next line store in self._line and return old one"""
  605. prev_line = self._line
  606. self._line = self.stream.readline()
  607. return prev_line
  608. def parse_header(self, parsed_samples=None):
  609. """Read and parse :py:class:`vcfpy.header.Header` from file, set
  610. into ``self.header`` and return it
  611. :param list parsed_samples: ``list`` of ``str`` for subsetting the
  612. samples to parse
  613. :returns: ``vcfpy.header.Header``
  614. :raises: ``vcfpy.exceptions.InvalidHeaderException`` in the case of
  615. problems reading the header
  616. """
  617. # parse header lines
  618. sub_parser = HeaderParser()
  619. header_lines = []
  620. while self._line and self._line.startswith("##"):
  621. header_lines.append(sub_parser.parse_line(self._line))
  622. self._read_next_line()
  623. # parse sample info line
  624. self.samples = self._handle_sample_line(parsed_samples)
  625. # construct Header object
  626. self.header = header.Header(header_lines, self.samples)
  627. # check header for consistency
  628. self._header_checker.run(self.header)
  629. # construct record parser
  630. self._record_parser = RecordParser(self.header, self.samples, self.record_checks)
  631. # read next line, must not be header
  632. self._read_next_line()
  633. if self._line and self._line.startswith("#"):
  634. raise exceptions.IncorrectVCFFormat(
  635. 'Expecting non-header line or EOF after "#CHROM" line'
  636. )
  637. return self.header
  638. def _handle_sample_line(self, parsed_samples=None):
  639. """"Check and interpret the "##CHROM" line and return samples"""
  640. if not self._line or not self._line.startswith("#CHROM"):
  641. raise exceptions.IncorrectVCFFormat('Missing line starting with "#CHROM"')
  642. # check for space before INFO
  643. line = self._line.rstrip()
  644. pos = line.find("FORMAT") if ("FORMAT" in line) else line.find("INFO")
  645. if pos == -1:
  646. raise exceptions.IncorrectVCFFormat('Ill-formatted line starting with "#CHROM"')
  647. if " " in line[:pos]:
  648. warnings.warn(
  649. (
  650. "Found space in #CHROM line, splitting at whitespace "
  651. "instead of tab; this VCF file is ill-formatted"
  652. ),
  653. SpaceInChromLine,
  654. )
  655. arr = self._line.rstrip().split()
  656. else:
  657. arr = self._line.rstrip().split("\t")
  658. self._check_samples_line(arr)
  659. return header.SamplesInfos(arr[len(REQUIRE_SAMPLE_HEADER) :], parsed_samples)
  660. @classmethod
  661. def _check_samples_line(klass, arr):
  662. """Peform additional check on samples line"""
  663. if len(arr) <= len(REQUIRE_NO_SAMPLE_HEADER):
  664. if tuple(arr) != REQUIRE_NO_SAMPLE_HEADER:
  665. raise exceptions.IncorrectVCFFormat(
  666. "Sample header line indicates no sample but does not "
  667. "equal required prefix {}".format("\t".join(REQUIRE_NO_SAMPLE_HEADER))
  668. )
  669. elif tuple(arr[: len(REQUIRE_SAMPLE_HEADER)]) != REQUIRE_SAMPLE_HEADER:
  670. raise exceptions.IncorrectVCFFormat(
  671. 'Sample header line (starting with "#CHROM") does not '
  672. "start with required prefix {}".format("\t".join(REQUIRE_SAMPLE_HEADER))
  673. )
  674. def parse_line(self, line):
  675. """Pare the given line without reading another one from the stream"""
  676. return self._record_parser.parse_line(line)
  677. def parse_next_record(self):
  678. """Read, parse and return next :py:class:`vcfpy.record.Record`
  679. :returns: next VCF record or ``None`` if at end
  680. :raises: ``vcfpy.exceptions.InvalidRecordException`` in the case of
  681. problems reading the record
  682. """
  683. return self.parse_line(self._read_next_line())
  684. def print_warn_summary(self):
  685. """If there were any warnings, print summary with warnings"""
  686. # TODO: remove?