25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.
 
 
 
 

803 satır
28 KiB

  1. # -*- coding: utf-8 -*-
  2. """Parsing of VCF files from ``str``
  3. """
  4. import ast
  5. import functools
  6. import math
  7. import re
  8. import warnings
  9. from . import header
  10. from . import record
  11. from . import exceptions
  12. from .exceptions import (
  13. CannotConvertValue,
  14. LeadingTrailingSpaceInKey,
  15. UnknownFilter,
  16. UnknownVCFVersion,
  17. SpaceInChromLine,
  18. )
  19. from .compat import OrderedDict
  20. __author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"
# Expected columns of the "#CHROM" header line when there are samples
# (fixed columns up to and including FORMAT; sample names follow).
REQUIRE_SAMPLE_HEADER = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT")
# Expected columns of the "#CHROM" header line when there are no samples
# (fixed columns up to and including INFO).
REQUIRE_NO_SAMPLE_HEADER = ("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
#: Supported VCF versions, a warning will be issued otherwise
SUPPORTED_VCF_VERSIONS = ("VCFv4.0", "VCFv4.1", "VCFv4.2", "VCFv4.3")
  27. class QuotedStringSplitter:
  28. """Helper class for splitting quoted strings
  29. Has support for interpreting quoting strings but also brackets. Meant
  30. for splitting the VCF header line dicts
  31. """
  32. #: state constant for normal
  33. NORMAL = 0
  34. #: state constant for quoted
  35. QUOTED = 1
  36. #: state constant for delimiter
  37. ESCAPED = 2
  38. #: state constant for array
  39. ARRAY = 3
  40. #: state constant for delimiter
  41. DELIM = 4
  42. def __init__(self, delim=",", quote='"', brackets="[]"):
  43. #: string delimiter
  44. self.delim = delim
  45. #: quote character
  46. self.quote = quote
  47. #: two-character string with opening and closing brackets
  48. assert len(brackets) == 2
  49. self.brackets = brackets
  50. def run(self, s):
  51. """Split string ``s`` at delimiter, correctly interpreting quotes
  52. Further, interprets arrays wrapped in one level of ``[]``. No
  53. recursive brackets are interpreted (as this would make the grammar
  54. non-regular and currently this complexity is not needed). Currently,
  55. quoting inside of braces is not supported either. This is just to
  56. support the example from VCF v4.3.
  57. """
  58. begins, ends = [0], []
  59. # transition table
  60. DISPATCH = {
  61. self.NORMAL: self._handle_normal,
  62. self.QUOTED: self._handle_quoted,
  63. self.ARRAY: self._handle_array,
  64. self.DELIM: self._handle_delim,
  65. self.ESCAPED: self._handle_escaped,
  66. }
  67. # run state automaton
  68. state = self.NORMAL
  69. for pos, c in enumerate(s):
  70. state = DISPATCH[state](c, pos, begins, ends)
  71. ends.append(len(s))
  72. assert len(begins) == len(ends)
  73. # Build resulting list
  74. return [s[start:end] for start, end in zip(begins, ends)]
  75. def _handle_normal(self, c, pos, begins, ends): # pylint: disable=W0613
  76. if c == self.delim:
  77. ends.append(pos)
  78. return self.DELIM
  79. elif c == self.quote:
  80. return self.QUOTED
  81. elif c == self.brackets[0]:
  82. return self.ARRAY
  83. else:
  84. return self.NORMAL
  85. def _handle_quoted(self, c, pos, begins, ends): # pylint: disable=W0613
  86. if c == "\\":
  87. return self.ESCAPED
  88. elif c == self.quote:
  89. return self.NORMAL
  90. else:
  91. return self.QUOTED
  92. def _handle_array(self, c, pos, begins, ends): # pylint: disable=W0613
  93. if c == self.brackets[1]:
  94. return self.NORMAL
  95. else:
  96. return self.ARRAY
  97. def _handle_delim(self, c, pos, begins, ends): # pylint: disable=W0613
  98. begins.append(pos)
  99. return self.NORMAL
  100. def _handle_escaped(self, c, pos, begins, ends): # pylint: disable=W0613
  101. return self.QUOTED
  102. def split_quoted_string(s, delim=",", quote='"', brackets="[]"):
  103. return QuotedStringSplitter(delim, quote, brackets).run(s)
  104. def split_mapping(pair_str):
  105. """Split the ``str`` in ``pair_str`` at ``'='``
  106. Warn if key needs to be stripped
  107. """
  108. orig_key, value = pair_str.split("=", 1)
  109. key = orig_key.strip()
  110. if key != orig_key:
  111. warnings.warn(
  112. "Mapping key {} has leading or trailing space".format(repr(orig_key)),
  113. LeadingTrailingSpaceInKey,
  114. )
  115. return key, value
  116. def parse_mapping(value):
  117. """Parse the given VCF header line mapping
  118. Such a mapping consists of "key=value" pairs, separated by commas and
  119. wrapped into angular brackets ("<...>"). Strings are usually quoted,
  120. for certain known keys, exceptions are made, depending on the tag key.
  121. this, however, only gets important when serializing.
  122. :raises: :py:class:`vcfpy.exceptions.InvalidHeaderException` if
  123. there was a problem parsing the file
  124. """
  125. if not value.startswith("<") or not value.endswith(">"):
  126. raise exceptions.InvalidHeaderException(
  127. "Header mapping value was not wrapped in angular brackets"
  128. )
  129. # split the comma-separated list into pairs, ignoring commas in quotes
  130. pairs = split_quoted_string(value[1:-1], delim=",", quote='"')
  131. # split these pairs into key/value pairs, converting flags to mappings
  132. # to True
  133. key_values = []
  134. for pair in pairs:
  135. if "=" in pair:
  136. key, value = split_mapping(pair)
  137. if value.startswith('"') and value.endswith('"'):
  138. value = ast.literal_eval(value)
  139. elif value.startswith("[") and value.endswith("]"):
  140. value = [v.strip() for v in value[1:-1].split(",")]
  141. else:
  142. key, value = pair, True
  143. key_values.append((key, value))
  144. # return completely parsed mapping as OrderedDict
  145. return OrderedDict(key_values)
  146. class HeaderLineParserBase:
  147. """Parse into appropriate HeaderLine"""
  148. def parse_key_value(self, key, value):
  149. """Parse the key/value pair
  150. :param str key: the key to use in parsing
  151. :param str value: the value to parse
  152. :returns: :py:class:`vcfpy.header.HeaderLine` object
  153. """
  154. raise NotImplementedError("Must be overridden")
  155. class StupidHeaderLineParser(HeaderLineParserBase):
  156. """Parse into HeaderLine (no particular structure)"""
  157. def parse_key_value(self, key, value):
  158. return header.HeaderLine(key, value)
  159. class MappingHeaderLineParser(HeaderLineParserBase):
  160. """Parse into HeaderLine (no particular structure)"""
  161. def __init__(self, line_class):
  162. """Initialize the parser"""
  163. #: the class to use for the VCF header line
  164. self.line_class = line_class
  165. def parse_key_value(self, key, value):
  166. return self.line_class(key, value, parse_mapping(value))
  167. def build_header_parsers():
  168. """Return mapping for parsers to use for each VCF header type
  169. Inject the WarningHelper into the parsers.
  170. """
  171. result = {
  172. "ALT": MappingHeaderLineParser(header.AltAlleleHeaderLine),
  173. "contig": MappingHeaderLineParser(header.ContigHeaderLine),
  174. "FILTER": MappingHeaderLineParser(header.FilterHeaderLine),
  175. "FORMAT": MappingHeaderLineParser(header.FormatHeaderLine),
  176. "INFO": MappingHeaderLineParser(header.InfoHeaderLine),
  177. "META": MappingHeaderLineParser(header.MetaHeaderLine),
  178. "PEDIGREE": MappingHeaderLineParser(header.PedigreeHeaderLine),
  179. "SAMPLE": MappingHeaderLineParser(header.SampleHeaderLine),
  180. "__default__": StupidHeaderLineParser(), # fallback
  181. }
  182. return result
  183. # Field value converters
  184. _CONVERTERS = {
  185. "Integer": int,
  186. "Float": float,
  187. "Flag": lambda x: True,
  188. "Character": str,
  189. "String": str,
  190. }
  191. def convert_field_value(type_, value):
  192. """Convert atomic field value according to the type"""
  193. if value == ".":
  194. return None
  195. elif type_ in ("Character", "String"):
  196. if "%" in value:
  197. for k, v in record.UNESCAPE_MAPPING:
  198. value = value.replace(k, v)
  199. return value
  200. else:
  201. try:
  202. return _CONVERTERS[type_](value)
  203. except ValueError:
  204. warnings.warn(
  205. ("{} cannot be converted to {}, keeping as " "string.").format(value, type_),
  206. CannotConvertValue,
  207. )
  208. return value
  209. def parse_field_value(field_info, value):
  210. """Parse ``value`` according to ``field_info``"""
  211. if field_info.id == "FT":
  212. return [x for x in value.split(";") if x != "."]
  213. elif isinstance(value, bool) or field_info.type == "Flag":
  214. return True
  215. elif field_info.number == 1:
  216. return convert_field_value(field_info.type, value)
  217. else:
  218. if value == ".":
  219. return []
  220. else:
  221. return [convert_field_value(field_info.type, x) for x in value.split(",")]
  222. # Regular expression for break-end
  223. BREAKEND_PATTERN = re.compile("[\\[\\]]")
  224. def parse_breakend(alt_str):
  225. """Parse breakend and return tuple with results, parameters for BreakEnd
  226. constructor
  227. """
  228. arr = BREAKEND_PATTERN.split(alt_str)
  229. mate_chrom, mate_pos = arr[1].split(":", 1)
  230. mate_pos = int(mate_pos)
  231. if mate_chrom[0] == "<":
  232. mate_chrom = mate_chrom[1:-1]
  233. within_main_assembly = False
  234. else:
  235. within_main_assembly = True
  236. FWD_REV = {True: record.FORWARD, False: record.REVERSE}
  237. orientation = FWD_REV[alt_str[0] == "[" or alt_str[0] == "]"]
  238. mate_orientation = FWD_REV["[" in alt_str]
  239. if orientation == record.FORWARD:
  240. sequence = arr[2]
  241. else:
  242. sequence = arr[0]
  243. return (mate_chrom, mate_pos, orientation, mate_orientation, sequence, within_main_assembly)
  244. def process_sub_grow(ref, alt_str):
  245. """Process substution where the string grows"""
  246. if len(alt_str) == 0:
  247. raise exceptions.InvalidRecordException("Invalid VCF, empty ALT")
  248. elif len(alt_str) == 1:
  249. if ref[0] == alt_str[0]:
  250. return record.Substitution(record.DEL, alt_str)
  251. else:
  252. return record.Substitution(record.INDEL, alt_str)
  253. else:
  254. return record.Substitution(record.INDEL, alt_str)
  255. def process_sub_shrink(ref, alt_str):
  256. """Process substution where the string shrink"""
  257. if len(ref) == 0:
  258. raise exceptions.InvalidRecordException("Invalid VCF, empty REF")
  259. elif len(ref) == 1:
  260. if ref[0] == alt_str[0]:
  261. return record.Substitution(record.INS, alt_str)
  262. else:
  263. return record.Substitution(record.INDEL, alt_str)
  264. else:
  265. return record.Substitution(record.INDEL, alt_str)
  266. def process_sub(ref, alt_str):
  267. """Process substitution"""
  268. if len(ref) == len(alt_str):
  269. if len(ref) == 1:
  270. return record.Substitution(record.SNV, alt_str)
  271. else:
  272. return record.Substitution(record.MNV, alt_str)
  273. elif len(ref) > len(alt_str):
  274. return process_sub_grow(ref, alt_str)
  275. else: # len(ref) < len(alt_str):
  276. return process_sub_shrink(ref, alt_str)
  277. def process_alt(header, ref, alt_str): # pylint: disable=W0613
  278. """Process alternative value using Header in ``header``"""
  279. # By its nature, this function contains a large number of case distinctions
  280. if "]" in alt_str or "[" in alt_str:
  281. return record.BreakEnd(*parse_breakend(alt_str))
  282. elif alt_str[0] == "." and len(alt_str) > 0:
  283. return record.SingleBreakEnd(record.FORWARD, alt_str[1:])
  284. elif alt_str[-1] == "." and len(alt_str) > 0:
  285. return record.SingleBreakEnd(record.REVERSE, alt_str[:-1])
  286. elif alt_str[0] == "<" and alt_str[-1] == ">":
  287. inner = alt_str[1:-1]
  288. return record.SymbolicAllele(inner)
  289. else: # substitution
  290. return process_sub(ref, alt_str)
  291. class HeaderParser:
  292. """Helper class for parsing a VCF header"""
  293. def __init__(self):
  294. #: Sub parsers to use for parsing the header lines
  295. self.sub_parsers = build_header_parsers()
  296. def parse_line(self, line):
  297. """Parse VCF header ``line`` (trailing '\r\n' or '\n' is ignored)
  298. :param str line: ``str`` with line to parse
  299. :param dict sub_parsers: ``dict`` mapping header line types to
  300. appropriate parser objects
  301. :returns: appropriate :py:class:`HeaderLine` parsed from ``line``
  302. :raises: :py:class:`vcfpy.exceptions.InvalidHeaderException` if
  303. there was a problem parsing the file
  304. """
  305. if not line or not line.startswith("##"):
  306. raise exceptions.InvalidHeaderException(
  307. 'Invalid VCF header line (must start with "##") {}'.format(line)
  308. )
  309. if "=" not in line:
  310. raise exceptions.InvalidHeaderException(
  311. 'Invalid VCF header line (must contain "=") {}'.format(line)
  312. )
  313. line = line[len("##") :].rstrip() # trim '^##' and trailing whitespace
  314. # split key/value pair at "="
  315. key, value = split_mapping(line)
  316. sub_parser = self.sub_parsers.get(key, self.sub_parsers["__default__"])
  317. return sub_parser.parse_key_value(key, value)
  318. class RecordParser:
  319. """Helper class for parsing VCF records"""
  320. def __init__(self, header, samples, record_checks=None):
  321. #: Header with the meta information
  322. self.header = header
  323. #: SamplesInfos with sample information
  324. self.samples = samples
  325. #: The checks to perform, can contain 'INFO' and 'FORMAT'
  326. self.record_checks = tuple(record_checks or [])
  327. # Expected number of fields
  328. if self.samples.names:
  329. self.expected_fields = 9 + len(self.samples.names)
  330. else:
  331. self.expected_fields = 8
  332. # Cache of FieldInfo objects by FORMAT string
  333. self._format_cache = {}
  334. # Cache of FILTER entries, also applied to FORMAT/FT
  335. self._filter_ids = set(self.header.filter_ids())
  336. # Helper for checking INFO fields
  337. if "INFO" in self.record_checks:
  338. self._info_checker = InfoChecker(self.header)
  339. else:
  340. self._info_checker = NoopInfoChecker()
  341. # Helper for checking FORMAT fields
  342. if "FORMAT" in self.record_checks:
  343. self._format_checker = FormatChecker(self.header)
  344. else:
  345. self._format_checker = NoopFormatChecker()
  346. def parse_line(self, line_str):
  347. """Parse line from file (including trailing line break) and return
  348. resulting Record
  349. """
  350. line_str = line_str.rstrip()
  351. if not line_str:
  352. return None # empty line, EOF
  353. arr = self._split_line(line_str)
  354. # CHROM
  355. chrom = arr[0]
  356. # POS
  357. pos = int(arr[1])
  358. # IDS
  359. if arr[2] == ".":
  360. ids = []
  361. else:
  362. ids = arr[2].split(";")
  363. # REF
  364. ref = arr[3]
  365. # ALT
  366. alts = []
  367. if arr[4] != ".":
  368. for alt in arr[4].split(","):
  369. alts.append(process_alt(self.header, ref, alt))
  370. # QUAL
  371. if arr[5] == ".":
  372. qual = None
  373. else:
  374. try:
  375. qual = int(arr[5])
  376. except ValueError: # try as float
  377. qual = float(arr[5])
  378. # FILTER
  379. if arr[6] == ".":
  380. filt = []
  381. else:
  382. filt = arr[6].split(";")
  383. self._check_filters(filt, "FILTER")
  384. # INFO
  385. info = self._parse_info(arr[7], len(alts))
  386. if len(arr) == 9:
  387. raise exceptions.IncorrectVCFFormat("Expected 8 or 10+ columns, got 9!")
  388. elif len(arr) == 8:
  389. format_ = None
  390. calls = None
  391. else:
  392. # FORMAT
  393. format_ = arr[8].split(":")
  394. # sample/call columns
  395. calls = self._handle_calls(alts, format_, arr[8], arr)
  396. return record.Record(chrom, pos, ids, ref, alts, qual, filt, info, format_, calls)
  397. def _handle_calls(self, alts, format_, format_str, arr):
  398. """Handle FORMAT and calls columns, factored out of parse_line"""
  399. if format_str not in self._format_cache:
  400. self._format_cache[format_str] = list(map(self.header.get_format_field_info, format_))
  401. # per-sample calls
  402. calls = []
  403. for sample, raw_data in zip(self.samples.names, arr[9:]):
  404. if self.samples.is_parsed(sample):
  405. data = self._parse_calls_data(format_, self._format_cache[format_str], raw_data)
  406. call = record.Call(sample, data)
  407. self._format_checker.run(call, len(alts))
  408. self._check_filters(call.data.get("FT"), "FORMAT/FT", call.sample)
  409. calls.append(call)
  410. else:
  411. calls.append(record.UnparsedCall(sample, raw_data))
  412. return calls
  413. def _check_filters(self, filt, source, sample=None):
  414. if not filt:
  415. return
  416. for f in filt:
  417. self._check_filter(f, source, sample)
  418. def _check_filter(self, f, source, sample):
  419. if f == "PASS":
  420. pass # the PASS filter is implicitely defined
  421. elif f not in self._filter_ids:
  422. if source == "FILTER":
  423. warnings.warn(
  424. ("Filter not found in header: {}; problem in FILTER " "column").format(f),
  425. UnknownFilter,
  426. )
  427. else:
  428. assert source == "FORMAT/FT" and sample
  429. warnings.warn(
  430. (
  431. "Filter not found in header: {}; problem in "
  432. "FORMAT/FT column of sample {}"
  433. ).format(f, sample),
  434. UnknownFilter,
  435. )
  436. def _split_line(self, line_str):
  437. """Split line and check number of columns"""
  438. arr = line_str.rstrip().split("\t")
  439. if len(arr) != self.expected_fields:
  440. raise exceptions.InvalidRecordException(
  441. (
  442. "The line contains an invalid number of fields. Was "
  443. "{} but expected {}\n{}".format(len(arr), 9 + len(self.samples.names), line_str)
  444. )
  445. )
  446. return arr
  447. def _parse_info(self, info_str, num_alts):
  448. """Parse INFO column from string"""
  449. result = OrderedDict()
  450. if info_str == ".":
  451. return result
  452. # The standard is very nice to parsers, we can simply split at
  453. # semicolon characters, although I (Manuel) don't know how strict
  454. # programs follow this
  455. for entry in info_str.split(";"):
  456. if "=" not in entry: # flag
  457. key = entry
  458. result[key] = parse_field_value(self.header.get_info_field_info(key), True)
  459. else:
  460. key, value = split_mapping(entry)
  461. result[key] = parse_field_value(self.header.get_info_field_info(key), value)
  462. self._info_checker.run(key, result[key], num_alts)
  463. return result
  464. @classmethod
  465. def _parse_calls_data(klass, format_, infos, gt_str):
  466. """Parse genotype call information from arrays using format array
  467. :param list format: List of strings with format names
  468. :param gt_str arr: string with genotype information values
  469. """
  470. data = OrderedDict()
  471. # The standard is very nice to parsers, we can simply split at
  472. # colon characters, although I (Manuel) don't know how strict
  473. # programs follow this
  474. for key, info, value in zip(format_, infos, gt_str.split(":")):
  475. data[key] = parse_field_value(info, value)
  476. return data
  477. class HeaderChecker:
  478. """Helper class for checking a VCF header"""
  479. def run(self, header):
  480. """Check the header
  481. Warnings will be printed using ``warnings`` while errors will raise
  482. an exception.
  483. :raises: ``vcfpy.exceptions.InvalidHeaderException`` in the case of
  484. severe errors reading the header
  485. """
  486. self._check_header_lines(header.lines)
  487. def _check_header_lines(self, header_lines):
  488. """Check header lines, in particular for starting file "##fileformat" """
  489. if not header_lines:
  490. raise exceptions.InvalidHeaderException(
  491. "The VCF file did not contain any header lines!"
  492. )
  493. first = header_lines[0]
  494. if first.key != "fileformat":
  495. raise exceptions.InvalidHeaderException("The VCF file did not start with ##fileformat")
  496. if first.value not in SUPPORTED_VCF_VERSIONS:
  497. warnings.warn("Unknown VCF version {}".format(first.value), UnknownVCFVersion)
  498. @functools.lru_cache(maxsize=32)
  499. def binomial(n, k):
  500. try:
  501. res = math.factorial(n) // math.factorial(k) // math.factorial(n - k)
  502. except ValueError:
  503. res = 0
  504. return res
  505. class NoopInfoChecker:
  506. """Helper class that performs no checks"""
  507. def __init__(self):
  508. pass
  509. def run(self, key, value, num_alts):
  510. pass
  511. class InfoChecker:
  512. """Helper class for checking an INFO field"""
  513. def __init__(self, header):
  514. #: VCFHeader to use for checking
  515. self.header = header
  516. def run(self, key, value, num_alts):
  517. """Check value in INFO[key] of record
  518. Currently, only checks for consistent counts are implemented
  519. :param str key: key of INFO entry to check
  520. :param value: value to check
  521. :param int alts: list of alternative alleles, for length
  522. """
  523. field_info = self.header.get_info_field_info(key)
  524. if not isinstance(value, list):
  525. return
  526. TABLE = {
  527. ".": len(value),
  528. "A": num_alts,
  529. "R": num_alts + 1,
  530. "G": binomial(num_alts + 1, 2), # diploid only at the moment
  531. }
  532. expected = TABLE.get(field_info.number, field_info.number)
  533. if len(value) != expected:
  534. tpl = "Number of elements for INFO field {} is {} instead of {}"
  535. warnings.warn(
  536. tpl.format(key, len(value), field_info.number), exceptions.IncorrectListLength
  537. )
  538. class NoopFormatChecker:
  539. """Helper class that performs no checks"""
  540. def __init__(self):
  541. pass
  542. def run(self, call, num_alts):
  543. pass
  544. class FormatChecker:
  545. """Helper class for checking a FORMAT field"""
  546. def __init__(self, header):
  547. #: VCFHeader to use for checking
  548. self.header = header
  549. def run(self, call, num_alts):
  550. """Check ``FORMAT`` of a record.Call
  551. Currently, only checks for consistent counts are implemented
  552. """
  553. for key, value in call.data.items():
  554. self._check_count(call, key, value, num_alts)
  555. def _check_count(self, call, key, value, num_alts):
  556. field_info = self.header.get_format_field_info(key)
  557. if isinstance(value, list):
  558. return
  559. num_alleles = len(call.gt_alleles or [])
  560. TABLE = {
  561. ".": len(value),
  562. "A": num_alts,
  563. "R": num_alts + 1,
  564. "G": binomial(num_alts + num_alleles, num_alleles),
  565. }
  566. expected = TABLE.get(field_info.number, field_info.number)
  567. if len(value) != expected:
  568. tpl = (
  569. "Number of elements for FORMAT field {} is {} instead "
  570. "of {} (number specifier {})"
  571. )
  572. warnings.warn(
  573. tpl.format(key, len(value), expected, field_info.number),
  574. exceptions.IncorrectListLength,
  575. )
  576. class Parser:
  577. """Class for line-wise parsing of VCF files
  578. In most cases, you want to use :py:class:`vcfpy.reader.Reader` instead.
  579. :param stream: ``file``-like object to read from
  580. :param str path: path the VCF is parsed from, for display purposes
  581. only, optional
  582. """
  583. def __init__(self, stream, path=None, record_checks=None):
  584. self.stream = stream
  585. self.path = path
  586. #: checks to perform, can contain 'INFO' and 'FORMAT'
  587. self.record_checks = tuple(record_checks or [])
  588. #: header, once it has been read
  589. self.header = None
  590. # the currently read line
  591. self._line = stream.readline() # trailing '\n'
  592. #: :py:class:`vcfpy.header.SamplesInfos` with sample information;
  593. #: set on parsing the header
  594. self.samples = None
  595. # helper for parsing the records
  596. self._record_parser = None
  597. # helper for checking the header
  598. self._header_checker = HeaderChecker()
  599. def _read_next_line(self):
  600. """Read next line store in self._line and return old one"""
  601. prev_line = self._line
  602. self._line = self.stream.readline()
  603. return prev_line
  604. def parse_header(self, parsed_samples=None):
  605. """Read and parse :py:class:`vcfpy.header.Header` from file, set
  606. into ``self.header`` and return it
  607. :param list parsed_samples: ``list`` of ``str`` for subsetting the
  608. samples to parse
  609. :returns: ``vcfpy.header.Header``
  610. :raises: ``vcfpy.exceptions.InvalidHeaderException`` in the case of
  611. problems reading the header
  612. """
  613. # parse header lines
  614. sub_parser = HeaderParser()
  615. header_lines = []
  616. while self._line and self._line.startswith("##"):
  617. header_lines.append(sub_parser.parse_line(self._line))
  618. self._read_next_line()
  619. # parse sample info line
  620. self.samples = self._handle_sample_line(parsed_samples)
  621. # construct Header object
  622. self.header = header.Header(header_lines, self.samples)
  623. # check header for consistency
  624. self._header_checker.run(self.header)
  625. # construct record parser
  626. self._record_parser = RecordParser(self.header, self.samples, self.record_checks)
  627. # read next line, must not be header
  628. self._read_next_line()
  629. if self._line and self._line.startswith("#"):
  630. raise exceptions.IncorrectVCFFormat(
  631. 'Expecting non-header line or EOF after "#CHROM" line'
  632. )
  633. return self.header
  634. def _handle_sample_line(self, parsed_samples=None):
  635. """ "Check and interpret the "##CHROM" line and return samples"""
  636. if not self._line or not self._line.startswith("#CHROM"):
  637. raise exceptions.IncorrectVCFFormat('Missing line starting with "#CHROM"')
  638. # check for space before INFO
  639. line = self._line.rstrip()
  640. pos = line.find("FORMAT") if ("FORMAT" in line) else line.find("INFO")
  641. if pos == -1:
  642. raise exceptions.IncorrectVCFFormat('Ill-formatted line starting with "#CHROM"')
  643. if " " in line[:pos]:
  644. warnings.warn(
  645. (
  646. "Found space in #CHROM line, splitting at whitespace "
  647. "instead of tab; this VCF file is ill-formatted"
  648. ),
  649. SpaceInChromLine,
  650. )
  651. arr = self._line.rstrip().split()
  652. else:
  653. arr = self._line.rstrip().split("\t")
  654. self._check_samples_line(arr)
  655. return header.SamplesInfos(arr[len(REQUIRE_SAMPLE_HEADER) :], parsed_samples)
  656. @classmethod
  657. def _check_samples_line(klass, arr):
  658. """Peform additional check on samples line"""
  659. if len(arr) <= len(REQUIRE_NO_SAMPLE_HEADER):
  660. if tuple(arr) != REQUIRE_NO_SAMPLE_HEADER:
  661. raise exceptions.IncorrectVCFFormat(
  662. "Sample header line indicates no sample but does not "
  663. "equal required prefix {}".format("\t".join(REQUIRE_NO_SAMPLE_HEADER))
  664. )
  665. elif tuple(arr[: len(REQUIRE_SAMPLE_HEADER)]) != REQUIRE_SAMPLE_HEADER:
  666. raise exceptions.IncorrectVCFFormat(
  667. 'Sample header line (starting with "#CHROM") does not '
  668. "start with required prefix {}".format("\t".join(REQUIRE_SAMPLE_HEADER))
  669. )
  670. def parse_line(self, line):
  671. """Pare the given line without reading another one from the stream"""
  672. return self._record_parser.parse_line(line)
  673. def parse_next_record(self):
  674. """Read, parse and return next :py:class:`vcfpy.record.Record`
  675. :returns: next VCF record or ``None`` if at end
  676. :raises: ``vcfpy.exceptions.InvalidRecordException`` in the case of
  677. problems reading the record
  678. """
  679. return self.parse_line(self._read_next_line())
  680. def print_warn_summary(self):
  681. """If there were any warnings, print summary with warnings"""
  682. # TODO: remove?