您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

936 行
34 KiB

  1. # -*- coding: utf-8 -*-
  2. """Code for representing the VCF header part
  3. The VCF header class structure is modeled after HTSJDK
  4. """
  5. import json
  6. import pprint
  7. import warnings
  8. from . import exceptions
  9. from .compat import OrderedDict
  10. from .exceptions import (
  11. DuplicateHeaderLineWarning,
  12. FieldInfoNotFound,
  13. FieldMissingNumber,
  14. FieldInvalidNumber,
  15. HeaderInvalidType,
  16. HeaderMissingDescription,
  17. )
  18. __author__ = "Manuel Holtgrewe <manuel.holtgrewe@bihealth.de>"
  19. # Tuples of valid entries -----------------------------------------------------
  20. #
  21. #: valid INFO value types
  22. INFO_TYPES = ("Integer", "Float", "Flag", "Character", "String")
  23. #: valid FORMAT value types
  24. FORMAT_TYPES = ("Integer", "Float", "Character", "String")
  25. #: valid values for "Number" entries, except for integers
  26. VALID_NUMBERS = ("A", "R", "G", ".")
  27. #: header lines that contain an "ID" entry
  28. LINES_WITH_ID = ("ALT", "contig", "FILTER", "FORMAT", "INFO", "META", "PEDIGREE", "SAMPLE")
  29. # Constants for "Number" entries ----------------------------------------------
  30. #
  31. #: number of alleles excluding reference
  32. HEADER_NUMBER_ALLELES = "A"
  33. #: number of alleles including reference
  34. HEADER_NUMBER_REF = "R"
  35. #: number of genotypes
  36. HEADER_NUMBER_GENOTYPES = "G"
  37. #: unbounded number of values
  38. HEADER_NUMBER_UNBOUNDED = "."
  39. class FieldInfo:
  40. """Core information for describing field type and number"""
  41. # TODO: always put in id?
  42. def __init__(self, type_, number, description=None, id_=None):
  43. #: The type, one of INFO_TYPES or FORMAT_TYPES
  44. self.type = type_
  45. #: Number description, either an int or constant
  46. self.number = number
  47. #: Description for the header field, optional
  48. self.description = description
  49. #: The id of the field, optional.
  50. self.id = id_
  51. def __eq__(self, other):
  52. if isinstance(other, self.__class__):
  53. return self.__dict__ == other.__dict__
  54. return NotImplemented
  55. def __ne__(self, other):
  56. if isinstance(other, self.__class__):
  57. return not self.__eq__(other)
  58. return NotImplemented
  59. def __hash__(self):
  60. return hash(tuple(sorted(self.__dict__.items())))
  61. def __str__(self):
  62. return "FieldInfo({}, {}, {}, {})".format(
  63. *map(repr, [self.type, self.number, self.description, self.id])
  64. )
  65. def __repr__(self):
  66. return str(self)
  67. # Reserved INFO keys ----------------------------------------------------------
  68. #: Reserved fields for INFO from VCF v4.3
  69. RESERVED_INFO = {
  70. # VCF v4.3, Section 1.6.1
  71. "AA": FieldInfo("String", 1, "Ancestral Allele"),
  72. "AC": FieldInfo(
  73. "Integer",
  74. "A",
  75. "Allele count in genotypes, for each ALT allele, in the " "same order as listed",
  76. ),
  77. "AD": FieldInfo("Integer", "R", "Total read depth for each allele"),
  78. "ADF": FieldInfo("Integer", "R", "Forward read depth for each allele"),
  79. "ADR": FieldInfo("Integer", "R", "Reverse read depth for each allele"),
  80. "AF": FieldInfo(
  81. "Float",
  82. "A",
  83. "Allele frequency for each ALT allele in the same order "
  84. "as listed: used for estimating from primary data not "
  85. "called genotypes",
  86. ),
  87. "AN": FieldInfo("Integer", 1, "Total number of alleles in called genotypes"),
  88. "BQ": FieldInfo("Float", 1, "RMS base quality at this position"),
  89. "CIGAR": FieldInfo(
  90. "String",
  91. "A",
  92. "CIGAR string describing how to align each ALT allele " "to the reference allele",
  93. ),
  94. "DB": FieldInfo("Flag", 0, "dbSNP membership"),
  95. "DP": FieldInfo(
  96. "Integer",
  97. 1,
  98. "Combined depth across samples for small variants and "
  99. "Read Depth of segment containing breakend for SVs",
  100. ),
  101. "H2": FieldInfo("Flag", 0, "Membership in HapMap 2"),
  102. "H3": FieldInfo("Flag", 0, "Membership in HapMap 3"),
  103. "MQ": FieldInfo("Integer", 1, "RMS mapping quality"),
  104. "MQ0": FieldInfo("Integer", 1, "Number of MAPQ == 0 reads covering this record"),
  105. "NS": FieldInfo("Integer", 1, "Number of samples with data"),
  106. "SB": FieldInfo("Integer", 4, "Strand bias at this position"),
  107. "SOMATIC": FieldInfo(
  108. "Flag", 0, "Indicates that the record is a somatic mutation, " "for cancer genomics"
  109. ),
  110. "VALIDATED": FieldInfo("Flag", 0, "Validated by follow-up experiment"),
  111. "1000G": FieldInfo("Flag", 0, "Membership in 1000 Genomes"),
  112. # VCF v4.3, Section 3
  113. "IMPRECISE": FieldInfo("Flag", 0, "Imprecise structural variation"),
  114. "NOVEL": FieldInfo("Flag", 0, "Indicates a novel structural variation"),
  115. "END": FieldInfo(
  116. "Integer",
  117. 1,
  118. "End position of the variant described in this record " "(for symbolic alleles)",
  119. ),
  120. "SVTYPE": FieldInfo("String", 1, "Type of structural variant"),
  121. "SVLEN": FieldInfo("Integer", 1, "Difference in length between REF and ALT alleles"),
  122. "CIPOS": FieldInfo("Integer", 2, "Confidence interval around POS for imprecise " "variants"),
  123. "CIEND": FieldInfo("Integer", 2, "Confidence interval around END for imprecise " "variants"),
  124. "HOMLEN": FieldInfo(
  125. "Integer", ".", "Length of base pair identical micro-homology at " "event breakpoints"
  126. ),
  127. "HOMSEQ": FieldInfo(
  128. "String", ".", "Sequence of base pair identical micro-homology at " "event breakpoints"
  129. ),
  130. "BKPTID": FieldInfo(
  131. "String", ".", "ID of the assembled alternate allele in the " "assembly file"
  132. ),
  133. "MEINFO": FieldInfo("String", 4, "Mobile element info of the form " "NAME,START,END,POLARITY"),
  134. "METRANS": FieldInfo(
  135. "String", 4, "Mobile element transduction info of the form " "CHR,START,END,POLARITY"
  136. ),
  137. "DGVID": FieldInfo("String", 1, "ID of this element in Database of Genomic Variation"),
  138. "DBVARID": FieldInfo("String", 1, "ID of this element in DBVAR"),
  139. "DBRIPID": FieldInfo("String", 1, "ID of this element in DBRIP"),
  140. "MATEID": FieldInfo("String", ".", "ID of mate breakends"),
  141. "PARID": FieldInfo("String", 1, "ID of partner breakend"),
  142. "EVENT": FieldInfo("String", 1, "ID of event associated to breakend"),
  143. "CILEN": FieldInfo(
  144. "Integer", 2, "Confidence interval around the inserted material " "between breakends"
  145. ),
  146. "DPADJ": FieldInfo("Integer", ".", "Read Depth of adjacency"),
  147. "CN": FieldInfo("Integer", 1, "Copy number of segment containing breakend"),
  148. "CNADJ": FieldInfo("Integer", ".", "Copy number of adjacency"),
  149. "CICN": FieldInfo("Integer", 2, "Confidence interval around copy number for the " "segment"),
  150. "CICNADJ": FieldInfo(
  151. "Integer", ".", "Confidence interval around copy number for the " "adjacency"
  152. ),
  153. }
  154. # Reserved FORMAT keys --------------------------------------------------------
  155. RESERVED_FORMAT = {
  156. # VCF v 4.3, Section 1.6.2
  157. "AD": FieldInfo("Integer", "R", "Total, per-sample read depth"),
  158. "ADF": FieldInfo("Integer", "R", "Forward-strand, per-sample read depth"),
  159. "ADR": FieldInfo("Integer", "R", "Reverse-strand, per-sample read depth"),
  160. "DP": FieldInfo("Integer", 1, "Read depth at this position for this sample"),
  161. "EC": FieldInfo(
  162. "Integer", "A", "Expected alternate allele counts for each alternate " "allele"
  163. ),
  164. "FT": FieldInfo("String", "1", "Filters applied for this sample", "FORMAT/FT"),
  165. "GQ": FieldInfo("Integer", "G", "Phred-scale, conditional genotype quality"),
  166. "GP": FieldInfo("Float", "G", "Genotype posterior probabilities"),
  167. "GT": FieldInfo("String", 1, "Genotype call"),
  168. "GL": FieldInfo("Float", "G", "Log10-scaled likelihoods for genotypes"),
  169. "HQ": FieldInfo("Integer", 2, "Haplotype qualities"),
  170. "MQ": FieldInfo("Integer", 1, "RMS mapping quality"),
  171. "PL": FieldInfo("Integer", "G", "Phred-scaled genotype likelihoods, rounded to integers"),
  172. "PQ": FieldInfo("Integer", 1, "Phasing quality"),
  173. "PS": FieldInfo(
  174. "Integer",
  175. 1,
  176. "Non-negative 32 bit integer giving phasing set " "for this sample and this chromosome",
  177. ),
  178. # VCF v4.3, Section 4
  179. "CN": FieldInfo("Integer", 1, "Copy number genotype for imprecise events"),
  180. "CNQ": FieldInfo("Float", 1, "Copy number genotype quality for imprecise events"),
  181. "CNL": FieldInfo("Float", "G", "Copy number genotype likelihood for imprecise events"),
  182. "CNP": FieldInfo("Float", "G", "Copy number posterior probabilities"),
  183. "NQ": FieldInfo("Integer", 1, "Phred style probability score that the variant is novel"),
  184. "HAP": FieldInfo("Integer", 1, "Unique haplotype identifier"),
  185. "AHAP": FieldInfo("Integer", 1, "Unique identifier of ancestral haplotype"),
  186. }
  187. # header files to enforce double-quoting for
  188. QUOTE_FIELDS = ("Description", "Source", "Version")
  189. def serialize_for_header(key, value):
  190. """Serialize value for the given mapping key for a VCF header line"""
  191. if key in QUOTE_FIELDS:
  192. return json.dumps(value)
  193. elif isinstance(value, str):
  194. if " " in value or "\t" in value:
  195. return json.dumps(value)
  196. else:
  197. return value
  198. elif isinstance(value, list):
  199. return "[{}]".format(", ".join(value))
  200. else:
  201. return str(value)
  202. def header_without_lines(header, remove):
  203. """Return :py:class:`Header` without lines given in ``remove``
  204. ``remove`` is an iterable of pairs ``key``/``ID`` with the VCF header key
  205. and ``ID`` of entry to remove. In the case that a line does not have
  206. a ``mapping`` entry, you can give the full value to remove.
  207. .. code-block:: python
  208. # header is a vcfpy.Header, e.g., as read earlier from file
  209. new_header = vcfpy.without_header_lines(
  210. header, [('assembly', None), ('FILTER', 'PASS')])
  211. # now, the header lines starting with "##assembly=" and the "PASS"
  212. # filter line will be missing from new_header
  213. """
  214. remove = set(remove)
  215. # Copy over lines that are not removed
  216. lines = []
  217. for line in header.lines:
  218. if hasattr(line, "mapping"):
  219. if (line.key, line.mapping.get("ID", None)) in remove:
  220. continue # filter out
  221. else:
  222. if (line.key, line.value) in remove:
  223. continue # filter out
  224. lines.append(line)
  225. return Header(lines, header.samples)
  226. class Header:
  227. """Represent header of VCF file
  228. While this class allows mutating records, it should not be changed once it
  229. has been assigned to a writer. Use :py:method:`~Header.copy` to create
  230. a copy that can be modified without problems.
  231. This class provides function for adding lines to a header and updating the
  232. supporting index data structures. There is no explicit API for removing
  233. header lines, the best way is to reconstruct a new ``Header`` instance with
  234. a filtered list of header lines.
  235. """
  236. def __init__(self, lines=None, samples=None):
  237. #: ``list`` of :py:HeaderLine objects
  238. self.lines = lines or []
  239. #: :py:class:`SamplesInfo` object
  240. self.samples = samples
  241. # build indices for the different field types
  242. self._indices = self._build_indices()
  243. def _build_indices(self):
  244. """Build indices for the different field types"""
  245. result = {key: OrderedDict() for key in LINES_WITH_ID}
  246. for line in self.lines:
  247. if line.key in LINES_WITH_ID:
  248. result.setdefault(line.key, OrderedDict())
  249. if line.mapping["ID"] in result[line.key]:
  250. warnings.warn(
  251. ("Seen {} header more than once: {}, using first" "occurence").format(
  252. line.key, line.mapping["ID"]
  253. ),
  254. DuplicateHeaderLineWarning,
  255. )
  256. else:
  257. result[line.key][line.mapping["ID"]] = line
  258. else:
  259. result.setdefault(line.key, [])
  260. result[line.key].append(line)
  261. return result
  262. def copy(self):
  263. """Return a copy of this header"""
  264. return Header([line.copy() for line in self.lines], self.samples.copy())
  265. def add_filter_line(self, mapping):
  266. """Add FILTER header line constructed from the given mapping
  267. :param mapping: ``OrderedDict`` with mapping to add. It is
  268. recommended to use ``OrderedDict`` over ``dict`` as this makes
  269. the result reproducible
  270. :return: ``False`` on conflicting line and ``True`` otherwise
  271. """
  272. return self.add_line(FilterHeaderLine.from_mapping(mapping))
  273. def add_contig_line(self, mapping):
  274. """Add "contig" header line constructed from the given mapping
  275. :param mapping: ``OrderedDict`` with mapping to add. It is
  276. recommended to use ``OrderedDict`` over ``dict`` as this makes
  277. the result reproducible
  278. :return: ``False`` on conflicting line and ``True`` otherwise
  279. """
  280. return self.add_line(ContigHeaderLine.from_mapping(mapping))
  281. def add_info_line(self, mapping):
  282. """Add INFO header line constructed from the given mapping
  283. :param mapping: ``OrderedDict`` with mapping to add. It is
  284. recommended to use ``OrderedDict`` over ``dict`` as this makes
  285. the result reproducible
  286. :return: ``False`` on conflicting line and ``True`` otherwise
  287. """
  288. return self.add_line(InfoHeaderLine.from_mapping(mapping))
  289. def add_format_line(self, mapping):
  290. """Add FORMAT header line constructed from the given mapping
  291. :param mapping: ``OrderedDict`` with mapping to add. It is
  292. recommended to use ``OrderedDict`` over ``dict`` as this makes
  293. the result reproducible
  294. :return: ``False`` on conflicting line and ``True`` otherwise
  295. """
  296. return self.add_line(FormatHeaderLine.from_mapping(mapping))
  297. def format_ids(self):
  298. """Return list of all format IDs"""
  299. return list(self._indices["FORMAT"].keys())
  300. def filter_ids(self):
  301. """Return list of all filter IDs"""
  302. return list(self._indices["FILTER"].keys())
  303. def info_ids(self):
  304. """Return list of all info IDs"""
  305. return list(self._indices["INFO"].keys())
  306. def get_lines(self, key):
  307. """Return header lines having the given ``key`` as their type"""
  308. if key in self._indices:
  309. return self._indices[key].values()
  310. else:
  311. return []
  312. def has_header_line(self, key, id_):
  313. """Return whether there is a header line with the given ID of the
  314. type given by ``key``
  315. :param key: The VCF header key/line type.
  316. :param id_: The ID value to compare fore
  317. :return: ``True`` if there is a header line starting with ``##${key}=``
  318. in the VCF file having the mapping entry ``ID`` set to ``id_``.
  319. """
  320. if key not in self._indices:
  321. return False
  322. else:
  323. return id_ in self._indices[key]
  324. def add_line(self, header_line):
  325. """Add header line, updating any necessary support indices
  326. :return: ``False`` on conflicting line and ``True`` otherwise
  327. """
  328. self.lines.append(header_line)
  329. self._indices.setdefault(header_line.key, OrderedDict())
  330. if not hasattr(header_line, "mapping"):
  331. return False # no registration required
  332. if self.has_header_line(header_line.key, header_line.mapping["ID"]):
  333. warnings.warn(
  334. (
  335. "Detected duplicate header line with type {} and ID {}. "
  336. "Ignoring this and subsequent one"
  337. ).format(header_line.key, header_line.mapping["ID"]),
  338. DuplicateHeaderLineWarning,
  339. )
  340. return False
  341. else:
  342. self._indices[header_line.key][header_line.mapping["ID"]] = header_line
  343. return True
  344. def get_info_field_info(self, key):
  345. """Return :py:class:`FieldInfo` for the given INFO field"""
  346. return self._get_field_info("INFO", key)
  347. def get_format_field_info(self, key):
  348. """Return :py:class:`FieldInfo` for the given INFO field"""
  349. return self._get_field_info("FORMAT", key)
  350. def _get_field_info(self, type_, key):
  351. result = self._indices[type_].get(key)
  352. if result:
  353. return result
  354. if key in RESERVED_INFO:
  355. res = FieldInfo(RESERVED_INFO[key].type, RESERVED_INFO[key].number)
  356. else:
  357. res = FieldInfo("String", HEADER_NUMBER_UNBOUNDED)
  358. warnings.warn(
  359. "{} {} not found using {}/{} instead".format(type_, key, res.type, repr(res.number)),
  360. FieldInfoNotFound,
  361. )
  362. return res
  363. def __eq__(self, other):
  364. if isinstance(other, self.__class__):
  365. return (self.lines, self.samples) == (other.lines, other.samples)
  366. return NotImplemented
  367. def __ne__(self, other):
  368. if isinstance(other, self.__class__):
  369. return (self.lines, self.samples) != (other.lines, other.samples)
  370. return NotImplemented
  371. def __hash__(self):
  372. raise TypeError("Unhashable type: Header")
  373. def __str__(self):
  374. tpl = "Header(lines={}, samples={})"
  375. return tpl.format(*map(repr, (self.lines, self.samples)))
  376. def __repr__(self):
  377. return str(self)
  378. class HeaderLine:
  379. """Base class for VCF header lines"""
  380. def __init__(self, key, value):
  381. #: ``str`` with key of header line
  382. self.key = key
  383. # ``str`` with raw value of header line
  384. self._value = value
  385. def copy(self):
  386. """Return a copy"""
  387. return self.__class__(self.key, self.value)
  388. @property
  389. def value(self):
  390. return self._value
  391. def serialize(self):
  392. """Return VCF-serialized version of this header line"""
  393. return "".join(("##", self.key, "=", self.value))
  394. def __eq__(self, other):
  395. if isinstance(other, self.__class__):
  396. return (self.key, self.value) == (other.key, other.value)
  397. return NotImplemented
  398. def __ne__(self, other):
  399. if isinstance(other, self.__class__):
  400. return (self.key, self.value) != (other.key, other.value)
  401. return NotImplemented
  402. def __hash__(self):
  403. raise TypeError("Unhashable type: HeaderLine")
  404. def __str__(self):
  405. return "HeaderLine({}, {})".format(*map(repr, (self.key, self.value)))
  406. def __repr__(self):
  407. return str(self)
  408. def mapping_to_str(mapping):
  409. """Convert mapping to string"""
  410. result = ["<"]
  411. for i, (key, value) in enumerate(mapping.items()):
  412. if i > 0:
  413. result.append(",")
  414. result += [key, "=", serialize_for_header(key, value)]
  415. result += [">"]
  416. return "".join(result)
  417. class SimpleHeaderLine(HeaderLine):
  418. """Base class for simple header lines, currently contig and filter
  419. header lines
  420. Don't use this class directly but rather the sub classes.
  421. :raises: :py:class:`vcfpy.exceptions.InvalidHeaderException` in
  422. the case of missing key ``"ID"``
  423. """
  424. def __init__(self, key, value, mapping):
  425. super().__init__(key, value)
  426. # check existence of key "ID"
  427. if "ID" not in mapping:
  428. raise exceptions.InvalidHeaderException(
  429. 'Missing key "ID" in header line "{}={}"'.format(key, value)
  430. )
  431. #: ``collections.OrderedDict`` with key/value mapping of the attributes
  432. self.mapping = OrderedDict(mapping.items())
  433. def copy(self):
  434. """Return a copy"""
  435. mapping = OrderedDict(self.mapping.items())
  436. return self.__class__(self.key, self.value, mapping)
  437. @property
  438. def value(self):
  439. return mapping_to_str(self.mapping)
  440. def serialize(self):
  441. return "".join(map(str, ["##", self.key, "=", self.value]))
  442. def __str__(self):
  443. return "SimpleHeaderLine({}, {}, {})".format(
  444. *map(repr, (self.key, self.value, self.mapping))
  445. )
  446. def __eq__(self, other):
  447. if isinstance(other, self.__class__):
  448. return (self.key, self.value, self.mapping) == (other.key, other.value, other.mapping)
  449. return NotImplemented
  450. def __ne__(self, other):
  451. if isinstance(other, self.__class__):
  452. return (self.key, self.value, self.mapping) != (other.key, other.value, other.mapping)
  453. return NotImplemented
  454. class AltAlleleHeaderLine(SimpleHeaderLine):
  455. """Alternative allele header line
  456. Mostly used for defining symbolic alleles for structural variants and
  457. IUPAC ambiguity codes
  458. """
  459. @classmethod
  460. def from_mapping(klass, mapping):
  461. """Construct from mapping, not requiring the string value"""
  462. return AltAlleleHeaderLine("ALT", mapping_to_str(mapping), mapping)
  463. def __init__(self, key, value, mapping):
  464. super().__init__(key, value, mapping)
  465. #: name of the alternative allele
  466. self.id = self.mapping["ID"]
  467. def __hash__(self):
  468. raise TypeError("Unhashable type: AltAlleleHeaderLine")
  469. def __str__(self):
  470. return "AltAlleleHeaderLine({}, {}, {})".format(
  471. *map(repr, (self.key, self.value, self.mapping))
  472. )
  473. class ContigHeaderLine(SimpleHeaderLine):
  474. """Contig header line
  475. Most importantly, parses the ``'length'`` key into an integer
  476. """
  477. @classmethod
  478. def from_mapping(klass, mapping):
  479. """Construct from mapping, not requiring the string value"""
  480. return ContigHeaderLine("contig", mapping_to_str(mapping), mapping)
  481. def __init__(self, key, value, mapping):
  482. super().__init__(key, value, mapping)
  483. # convert 'length' entry to integer if possible
  484. if "length" in self.mapping:
  485. mapping["length"] = int(mapping["length"])
  486. else:
  487. warnings.warn(
  488. 'Field "length" not found in header line {}={}'.format(key, value),
  489. FieldInfoNotFound,
  490. )
  491. #: name of the contig
  492. self.id = self.mapping["ID"]
  493. #: length of the contig, ``None`` if missing
  494. self.length = self.mapping.get("length")
  495. def __hash__(self):
  496. raise TypeError("Unhashable type: ContigHeaderLine")
  497. def __str__(self):
  498. return "ContigHeaderLine({}, {}, {})".format(
  499. *map(repr, (self.key, self.value, self.mapping))
  500. )
  501. class FilterHeaderLine(SimpleHeaderLine):
  502. """FILTER header line"""
  503. @classmethod
  504. def from_mapping(klass, mapping):
  505. """Construct from mapping, not requiring the string value"""
  506. return FilterHeaderLine("FILTER", mapping_to_str(mapping), mapping)
  507. def __init__(self, key, value, mapping):
  508. super().__init__(key, value, mapping)
  509. # check for "Description" key
  510. if "Description" not in self.mapping:
  511. warnings.warn(
  512. 'Field "Description" not found in header line {}={}'.format(key, value),
  513. FieldInfoNotFound,
  514. )
  515. #: token for the filter
  516. self.id = self.mapping["ID"]
  517. #: description for the filter, ``None`` if missing
  518. self.description = self.mapping.get("Description")
  519. def __hash__(self):
  520. raise TypeError("Unhashable type: FilterHeaderLine")
  521. def __str__(self):
  522. return "FilterHeaderLine({}, {}, {})".format(
  523. *map(repr, (self.key, self.value, self.mapping))
  524. )
  525. class MetaHeaderLine(SimpleHeaderLine):
  526. """Alternative allele header line
  527. Used for defining set of valid values for samples keys
  528. """
  529. @classmethod
  530. def from_mapping(klass, mapping):
  531. """Construct from mapping, not requiring the string value"""
  532. return MetaHeaderLine("META", mapping_to_str(mapping), mapping)
  533. def __init__(self, key, value, mapping):
  534. super().__init__(key, value, mapping)
  535. #: name of the alternative allele
  536. self.id = self.mapping["ID"]
  537. def __hash__(self):
  538. raise TypeError("Unhashable type: MetaHeaderLine")
  539. def __str__(self):
  540. return "MetaHeaderLine({}, {}, {})".format(*map(repr, (self.key, self.value, self.mapping)))
  541. class PedigreeHeaderLine(SimpleHeaderLine):
  542. """Header line for defining a pedigree entry"""
  543. @classmethod
  544. def from_mapping(klass, mapping):
  545. """Construct from mapping, not requiring the string value"""
  546. return PedigreeHeaderLine("PEDIGREE", mapping_to_str(mapping), mapping)
  547. def __init__(self, key, value, mapping):
  548. super().__init__(key, value, mapping)
  549. #: name of the alternative allele
  550. self.id = self.mapping["ID"]
  551. def __hash__(self):
  552. raise TypeError("Unhashable type: PedigreeHeaderLine")
  553. def __str__(self):
  554. return "PedigreeHeaderLine({}, {}, {})".format(
  555. *map(repr, (self.key, self.value, self.mapping))
  556. )
  557. class SampleHeaderLine(SimpleHeaderLine):
  558. """Header line for defining a SAMPLE entry"""
  559. @classmethod
  560. def from_mapping(klass, mapping):
  561. """Construct from mapping, not requiring the string value"""
  562. return SampleHeaderLine("SAMPLE", mapping_to_str(mapping), mapping)
  563. def __init__(self, key, value, mapping):
  564. super().__init__(key, value, mapping)
  565. #: name of the alternative allele
  566. self.id = self.mapping["ID"]
  567. def __eq__(self, other):
  568. if isinstance(other, self.__class__):
  569. return (self.key, self.value, self.mapping) == (other.key, other.value, other.mapping)
  570. return NotImplemented
  571. def __ne__(self, other):
  572. if isinstance(other, self.__class__):
  573. return (self.key, self.value, self.mapping) != (other.key, other.value, other.mapping)
  574. return NotImplemented
  575. def __hash__(self):
  576. raise TypeError("Unhashable type: SampleHeaderLine")
  577. def __str__(self):
  578. return "SampleHeaderLine({}, {}, {})".format(
  579. *map(repr, (self.key, self.value, self.mapping))
  580. )
  581. class CompoundHeaderLine(HeaderLine):
  582. """Base class for compound header lines, currently format and header lines
  583. Compound header lines describe fields that can have more than one entry.
  584. Don't use this class directly but rather the sub classes.
  585. """
  586. def __init__(self, key, value, mapping):
  587. super().__init__(key, value)
  588. #: OrderedDict with key/value mapping
  589. self.mapping = OrderedDict(mapping.items())
  590. # check that 'Number' is given and use "." otherwise
  591. if "Number" not in self.mapping:
  592. warnings.warn(
  593. '[vcfpy] WARNING: missing number, using unbounded/"." instead', FieldMissingNumber
  594. )
  595. self.mapping["Number"] = "."
  596. try:
  597. self.mapping["Number"] = self._parse_number(self.mapping["Number"])
  598. except ValueError:
  599. warnings.warn(
  600. ("[vcfpy] WARNING: invalid number {}, using " 'unbounded/"." instead').format(
  601. self.mapping["Number"]
  602. ),
  603. FieldInvalidNumber,
  604. )
  605. self.mapping["Number"] = "."
  606. def copy(self):
  607. """Return a copy"""
  608. mapping = OrderedDict(self.mapping.items())
  609. return self.__class__(self.key, self.value, mapping)
  610. @classmethod
  611. def _parse_number(klass, number):
  612. """Parse ``number`` into an ``int`` or return ``number`` if a valid
  613. expression for a INFO/FORMAT "Number".
  614. :param str number: ``str`` to parse and check
  615. """
  616. try:
  617. return int(number)
  618. except ValueError as e:
  619. if number in VALID_NUMBERS:
  620. return number
  621. else:
  622. raise e
  623. @property
  624. def value(self):
  625. return mapping_to_str(self.mapping)
  626. def serialize(self):
  627. return "".join(map(str, ["##", self.key, "=", self.value]))
  628. def __str__(self):
  629. return "CompoundHeaderLine({}, {}, {})".format(
  630. *map(repr, (self.key, self.value, self.mapping))
  631. )
  632. class InfoHeaderLine(CompoundHeaderLine):
  633. """Header line for INFO fields
  634. Note that the ``Number`` field will be parsed into an ``int`` if
  635. possible. Otherwise, the constants ``HEADER_NUMBER_*`` will be used.
  636. """
  637. @classmethod
  638. def from_mapping(klass, mapping):
  639. """Construct from mapping, not requiring the string value"""
  640. return InfoHeaderLine("INFO", mapping_to_str(mapping), mapping)
  641. def __init__(self, key, value, mapping):
  642. super().__init__(key, value, mapping)
  643. #: key in the INFO field
  644. self.id = self.mapping["ID"]
  645. # check for "Number" field
  646. self.number = self.mapping["Number"]
  647. # check for "Type" field
  648. type_ = self.mapping.get("Type")
  649. if "Type" not in self.mapping:
  650. warnings.warn(
  651. ('Field "Type" not found in header line, using String ' "instead {}={}").format(
  652. key, value
  653. ),
  654. HeaderInvalidType,
  655. )
  656. type_ = "String"
  657. if "Type" in self.mapping and type_ not in INFO_TYPES:
  658. warnings.warn(
  659. (
  660. "Invalid INFO value type {} in header line, using String " "instead, {}={}"
  661. ).format(self.mapping["Type"], key, value),
  662. HeaderInvalidType,
  663. )
  664. type_ = "String"
  665. #: value type
  666. self.type = type_
  667. # check for "Description" key
  668. if "Description" not in self.mapping:
  669. warnings.warn(
  670. 'Field "Description" not found in header line {}={}'.format(key, value),
  671. HeaderMissingDescription,
  672. )
  673. #: description, should be given, ``None`` if not given
  674. self.description = self.mapping.get("Description")
  675. #: source of INFO field, ``None`` if not given
  676. self.source = self.mapping.get("Source")
  677. #: version of INFO field, ``None`` if not given
  678. self.version = self.mapping.get("Version")
  679. def __hash__(self):
  680. raise TypeError("Unhashable type: InfoHeaderLine")
  681. def __str__(self):
  682. return "InfoHeaderLine({}, {}, {})".format(*map(repr, (self.key, self.value, self.mapping)))
  683. class FormatHeaderLine(CompoundHeaderLine):
  684. """Header line for FORMAT fields"""
  685. @classmethod
  686. def from_mapping(klass, mapping):
  687. """Construct from mapping, not requiring the string value"""
  688. return FormatHeaderLine("FORMAT", mapping_to_str(mapping), mapping)
  689. def __init__(self, key, value, mapping):
  690. super().__init__(key, value, mapping)
  691. #: key in the INFO field
  692. self.id = self.mapping["ID"]
  693. # check for "Number" field
  694. self.number = self.mapping["Number"]
  695. # check for "Type" field
  696. type_ = self.mapping.get("Type")
  697. if "Type" not in self.mapping:
  698. warnings.warn(
  699. ('Field "Type" not found in header line, using String ' "instead {}={}").format(
  700. key, value
  701. ),
  702. HeaderInvalidType,
  703. )
  704. type_ = "String"
  705. if "Type" in self.mapping and type_ not in FORMAT_TYPES:
  706. warnings.warn(
  707. (
  708. "Invalid FORMAT value type {} in header line, using String " "instead, {}={}"
  709. ).format(self.mapping["Type"], key, value),
  710. HeaderInvalidType,
  711. )
  712. type_ = "String"
  713. #: value type
  714. self.type = type_
  715. # check for "Description" key
  716. if "Description" not in self.mapping:
  717. warnings.warn(
  718. 'Field "Description" not found in header line {}={}'.format(key, value),
  719. HeaderMissingDescription,
  720. )
  721. #: description, should be given, ``None`` if not given
  722. self.description = self.mapping.get("Description")
  723. #: source of INFO field, ``None`` if not given
  724. self.source = self.mapping.get("Source")
  725. #: version of INFO field, ``None`` if not given
  726. self.version = self.mapping.get("Version")
  727. def __hash__(self):
  728. raise TypeError("Unhashable type: FormatHeaderLine")
  729. def __str__(self):
  730. return "FormatHeaderLine({}, {}, {})".format(
  731. *map(repr, (self.key, self.value, self.mapping))
  732. )
  733. class SamplesInfos:
  734. """Helper class for handling the samples in VCF files
  735. The purpose of this class is to decouple the sample name list somewhat
  736. from :py:class:`Header`. This encapsulates subsetting samples for which
  737. the genotype should be parsed and reordering samples into output files.
  738. Note that when subsetting is used and the records are to be written out
  739. again then the ``FORMAT`` field must not be touched.
  740. """
  741. def __init__(self, sample_names, parsed_samples=None):
  742. #: list of sample that are read from/written to the VCF file at
  743. #: hand in the given order
  744. self.names = list(sample_names)
  745. #: ``set`` with the samples for which the genotype call fields should
  746. #: be read; can be used for partial parsing (speedup) and defaults
  747. #: to the full list of samples, None if all are parsed
  748. self.parsed_samples = parsed_samples
  749. if self.parsed_samples:
  750. self.parsed_samples = set(self.parsed_samples)
  751. assert self.parsed_samples <= set(self.names), "Must be subset!"
  752. #: mapping from sample name to index
  753. self.name_to_idx = dict([(name, idx) for idx, name in enumerate(self.names)])
  754. def copy(self):
  755. """Return a copy of the object"""
  756. return SamplesInfos(self.names)
  757. def is_parsed(self, name):
  758. """Return whether the sample name is parsed"""
  759. return (not self.parsed_samples) or name in self.parsed_samples
  760. def __hash__(self):
  761. raise TypeError("Unhashable type: SamplesInfos")
  762. def __str__(self):
  763. tpl = "SamplesInfos(names={}, name_to_idx={})"
  764. return tpl.format(self.names, pprint.pformat(self.name_to_idx, width=10**10))
  765. def __repr__(self):
  766. return str(self)
  767. def __eq__(self, other):
  768. if isinstance(other, self.__class__):
  769. return self.names == other.names
  770. return NotImplemented
  771. def __ne__(self, other):
  772. if isinstance(other, self.__class__):
  773. return self.names != other.names
  774. return NotImplemented