Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 

4543 linhas
143 KiB

  1. # cython: language_level=3
  2. # cython: embedsignature=True
  3. # cython: profile=True
  4. ###############################################################################
  5. ###############################################################################
  6. ## Cython wrapper for htslib VCF/BCF reader/writer
  7. ###############################################################################
  8. #
  9. # NOTICE: This code is incomplete and preliminary. It offers a nearly
  10. # complete Pythonic interface to VCF/BCF metadata and data with
  11. # reading and writing capability. Documentation and a unit test suite
  12. # are in the works. The code is best tested under Python 2, but
  13. # should also work with Python 3. Please report any remaining
  14. # str/bytes issues on the github site when using Python 3 and I'll
  15. # fix them promptly.
  16. #
  17. # Here is a minimal example of how to use the API:
  18. #
  19. # $ cat bcfview.py
  20. # import sys
  21. # from pysam import VariantFile
  22. #
  23. # bcf_in = VariantFile(sys.argv[1]) # auto-detect input format
  24. # bcf_out = VariantFile('-', 'w', header=bcf_in.header)
  25. #
  26. # for rec in bcf_in:
  27. # bcf_out.write(rec)
  28. #
  29. # Performance is fairly close to that of bcftools view. Here is an example
  30. # using some 1k Genomes data:
  31. #
  32. # $ time python bcfview.py ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
  33. # 1103799
  34. #
  35. # real 0m56.114s
  36. # user 1m4.489s
  37. # sys 0m3.102s
  38. #
  39. # $ time bcftools view ALL.chr22.phase3_shapeit2_mvncall_integrated_v5.20130502.genotypes.bcf |wc -l
  40. # 1103800 # bcftools adds an extra header
  41. #
  42. # real 0m55.126s
  43. # user 1m3.502s
  44. # sys 0m3.459s
  45. #
  46. ###############################################################################
  47. #
  48. # TODO list:
  49. #
  50. # * more genotype methods
  51. # * unit test suite (perhaps py.test based)
  52. # * documentation
  53. # * pickle support
  54. # * left/right locus normalization
  55. # * fix reopen to re-use fd
  56. #
  57. ###############################################################################
  58. #
  59. # The MIT License
  60. #
  61. # Copyright (c) 2015,2016 Kevin Jacobs (jacobs@bioinformed.com)
  62. #
  63. # Permission is hereby granted, free of charge, to any person obtaining a
  64. # copy of this software and associated documentation files (the "Software"),
  65. # to deal in the Software without restriction, including without limitation
  66. # the rights to use, copy, modify, merge, publish, distribute, sublicense,
  67. # and/or sell copies of the Software, and to permit persons to whom the
  68. # Software is furnished to do so, subject to the following conditions:
  69. #
  70. # The above copyright notice and this permission notice shall be included in
  71. # all copies or substantial portions of the Software.
  72. #
  73. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  74. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  75. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  76. # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  77. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  78. # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  79. # DEALINGS IN THE SOFTWARE.
  80. #
  81. ###############################################################################
  82. from __future__ import division, print_function
  83. import os
  84. import sys
  85. from libc.errno cimport errno, EPIPE
  86. from libc.string cimport strcmp, strpbrk, strerror
  87. from libc.stdint cimport INT8_MAX, INT16_MAX, INT32_MAX
  88. cimport cython
  89. from cpython.object cimport PyObject
  90. from cpython.ref cimport Py_INCREF
  91. from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString
  92. from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
  93. from cpython.bytes cimport PyBytes_FromStringAndSize
  94. from cpython.unicode cimport PyUnicode_DecodeUTF8
  95. from pysam.libchtslib cimport HTSFile, hisremote
  96. from pysam.utils import unquoted_str
  97. __all__ = ['VariantFile',
  98. 'VariantHeader',
  99. 'VariantHeaderRecord',
  100. 'VariantHeaderRecords',
  101. 'VariantMetadata',
  102. 'VariantHeaderMetadata',
  103. 'VariantContig',
  104. 'VariantHeaderContigs',
  105. 'VariantHeaderSamples',
  106. 'VariantRecordFilter',
  107. 'VariantRecordFormat',
  108. 'VariantRecordInfo',
  109. 'VariantRecordSamples',
  110. 'VariantRecord',
  111. 'VariantRecordSample',
  112. 'BaseIndex',
  113. 'BCFIndex',
  114. 'TabixIndex',
  115. 'BaseIterator',
  116. 'BCFIterator',
  117. 'TabixIterator',
  118. 'VariantRecord']
  119. ########################################################################
  120. ########################################################################
  121. ## Constants
  122. ########################################################################
  123. cdef int MAX_POS = (1 << 31) - 1
  124. cdef tuple VALUE_TYPES = ('Flag', 'Integer', 'Float', 'String')
  125. cdef tuple METADATA_TYPES = ('FILTER', 'INFO', 'FORMAT', 'CONTIG', 'STRUCTURED', 'GENERIC')
  126. cdef tuple METADATA_LENGTHS = ('FIXED', 'VARIABLE', 'A', 'G', 'R')
  127. ########################################################################
  128. ########################################################################
  129. ## Python 3 compatibility functions
  130. ########################################################################
  131. from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
  132. from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size, decode_bytes
  133. ########################################################################
  134. ########################################################################
  135. ## Sentinel object
  136. ########################################################################
  137. cdef object _nothing = object()
  138. ########################################################################
  139. ########################################################################
  140. ## VCF/BCF string intern system
  141. ########################################################################
  142. cdef dict bcf_str_cache = {}
  143. cdef inline bcf_str_cache_get_charptr(const char* s):
  144. if s == NULL:
  145. return None
  146. cdef PyObject *pystr = PyDict_GetItemString(bcf_str_cache, s)
  147. if pystr:
  148. return <object>pystr
  149. val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
  150. PyDict_SetItemString(bcf_str_cache, s, val)
  151. return val
  152. ########################################################################
  153. ########################################################################
  154. ## Genotype math
  155. ########################################################################
  156. cdef int comb(int n, int k) except -1:
  157. """Return binomial coefficient: n choose k
  158. >>> comb(5, 1)
  159. 5
  160. >>> comb(5, 2)
  161. 10
  162. >>> comb(2, 2)
  163. 1
  164. >>> comb(100, 2)
  165. 4950
  166. """
  167. if k > n:
  168. return 0
  169. elif k == n:
  170. return 1
  171. elif k > n // 2:
  172. k = n - k
  173. cdef d, result
  174. d = result = n - k + 1
  175. for i in range(2, k + 1):
  176. d += 1
  177. result *= d
  178. result //= i
  179. return result
  180. cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
  181. """Return the count of genotypes expected for the given ploidy and number of alleles.
  182. >>> bcf_geno_combinations(1, 2)
  183. 2
  184. >>> bcf_geno_combinations(2, 2)
  185. 3
  186. >>> bcf_geno_combinations(2, 3)
  187. 6
  188. >>> bcf_geno_combinations(3, 2)
  189. 4
  190. """
  191. return comb(alleles + ploidy - 1, ploidy)
  192. ########################################################################
  193. ########################################################################
  194. ## Low level type conversion helpers
  195. ########################################################################
  196. cdef inline bint check_header_id(bcf_hdr_t *hdr, int hl_type, int id):
  197. return id >= 0 and id < hdr.n[BCF_DT_ID] and bcf_hdr_idinfo_exists(hdr, hl_type, id)
  198. cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
  199. return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
  200. cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
  201. if sample < 0:
  202. raise ValueError('genotype is only valid as a format field')
  203. cdef int32_t *gt_arr = NULL
  204. cdef int ngt = 0
  205. ngt = bcf_get_genotypes(hdr, rec, &gt_arr, &ngt)
  206. if ngt <= 0 or not gt_arr:
  207. return 0
  208. assert ngt % rec.n_sample == 0
  209. cdef int max_ploidy = ngt // rec.n_sample
  210. cdef int32_t *gt = gt_arr + sample * max_ploidy
  211. cdef int ploidy = 0
  212. while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
  213. gt += 1
  214. ploidy += 1
  215. free(<void*>gt_arr)
  216. return bcf_geno_combinations(ploidy, rec.n_allele)
  217. cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
  218. if not a:
  219. return None
  220. try:
  221. return tuple(charptr_to_str(a[i]) for i in range(n))
  222. finally:
  223. if free_after and a:
  224. free(a)
  225. cdef bcf_array_to_object(void *data, int type, ssize_t n, ssize_t count, int scalar):
  226. cdef char *datac
  227. cdef int8_t *data8
  228. cdef int16_t *data16
  229. cdef int32_t *data32
  230. cdef float *dataf
  231. cdef int i
  232. cdef bytes b
  233. if not data or n <= 0:
  234. return None
  235. if type == BCF_BT_CHAR:
  236. datac = <char *>data
  237. if not n:
  238. value = ()
  239. else:
  240. # Check if at least one null terminator is present
  241. if datac[n-1] == bcf_str_vector_end:
  242. # If so, create a string up to the first null terminator
  243. b = datac
  244. else:
  245. # Otherwise, copy the entire block
  246. b = datac[:n]
  247. value = tuple(decode_bytes(v, 'utf-8') if v and v != bcf_str_missing else None for v in b.split(b','))
  248. else:
  249. value = []
  250. if type == BCF_BT_INT8:
  251. data8 = <int8_t *>data
  252. for i in range(n):
  253. if data8[i] == bcf_int8_vector_end:
  254. break
  255. value.append(data8[i] if data8[i] != bcf_int8_missing else None)
  256. elif type == BCF_BT_INT16:
  257. data16 = <int16_t *>data
  258. for i in range(n):
  259. if data16[i] == bcf_int16_vector_end:
  260. break
  261. value.append(data16[i] if data16[i] != bcf_int16_missing else None)
  262. elif type == BCF_BT_INT32:
  263. data32 = <int32_t *>data
  264. for i in range(n):
  265. if data32[i] == bcf_int32_vector_end:
  266. break
  267. value.append(data32[i] if data32[i] != bcf_int32_missing else None)
  268. elif type == BCF_BT_FLOAT:
  269. dataf = <float *>data
  270. for i in range(n):
  271. if bcf_float_is_vector_end(dataf[i]):
  272. break
  273. value.append(dataf[i] if not bcf_float_is_missing(dataf[i]) else None)
  274. else:
  275. raise TypeError('unsupported info type code')
  276. # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
  277. if not value:
  278. if scalar:
  279. value = None
  280. elif count <= 0:
  281. value = ()
  282. else:
  283. value = (None,)*count
  284. elif scalar and len(value) == 1:
  285. value = value[0]
  286. else:
  287. value = tuple(value)
  288. return value
  289. cdef bcf_object_to_array(values, void *data, int bt_type, ssize_t n, int vlen):
  290. cdef char *datac
  291. cdef int8_t *data8
  292. cdef int16_t *data16
  293. cdef int32_t *data32
  294. cdef float *dataf
  295. cdef ssize_t i, value_count = len(values)
  296. assert value_count <= n
  297. if bt_type == BCF_BT_CHAR:
  298. if not isinstance(values, (str, bytes)):
  299. values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
  300. value_count = len(values)
  301. assert value_count <= n
  302. datac = <char *>data
  303. memcpy(datac, <char *>values, value_count)
  304. for i in range(value_count, n):
  305. datac[i] = 0
  306. elif bt_type == BCF_BT_INT8:
  307. datai8 = <int8_t *>data
  308. for i in range(value_count):
  309. val = values[i]
  310. datai8[i] = val if val is not None else bcf_int8_missing
  311. for i in range(value_count, n):
  312. datai8[i] = bcf_int8_vector_end
  313. elif bt_type == BCF_BT_INT16:
  314. datai16 = <int16_t *>data
  315. for i in range(value_count):
  316. val = values[i]
  317. datai16[i] = val if val is not None else bcf_int16_missing
  318. for i in range(value_count, n):
  319. datai16[i] = bcf_int16_vector_end
  320. elif bt_type == BCF_BT_INT32:
  321. datai32 = <int32_t *>data
  322. for i in range(value_count):
  323. val = values[i]
  324. datai32[i] = val if val is not None else bcf_int32_missing
  325. for i in range(value_count, n):
  326. datai32[i] = bcf_int32_vector_end
  327. elif bt_type == BCF_BT_FLOAT:
  328. dataf = <float *>data
  329. for i in range(value_count):
  330. val = values[i]
  331. if val is None:
  332. bcf_float_set(dataf + i, bcf_float_missing)
  333. else:
  334. dataf[i] = val
  335. for i in range(value_count, n):
  336. bcf_float_set(dataf + i, bcf_float_vector_end)
  337. else:
  338. raise TypeError('unsupported type')
  339. cdef bcf_empty_array(int type, ssize_t n, int vlen):
  340. cdef char *datac
  341. cdef int32_t *data32
  342. cdef float *dataf
  343. cdef int i
  344. if n <= 0:
  345. raise ValueError('Cannot create empty array')
  346. if type == BCF_HT_STR:
  347. value = PyBytes_FromStringAndSize(NULL, sizeof(char)*n)
  348. datac = <char *>value
  349. for i in range(n):
  350. datac[i] = bcf_str_missing if not vlen else bcf_str_vector_end
  351. elif type == BCF_HT_INT:
  352. value = PyBytes_FromStringAndSize(NULL, sizeof(int32_t)*n)
  353. data32 = <int32_t *><char *>value
  354. for i in range(n):
  355. data32[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
  356. elif type == BCF_HT_REAL:
  357. value = PyBytes_FromStringAndSize(NULL, sizeof(float)*n)
  358. dataf = <float *><char *>value
  359. for i in range(n):
  360. bcf_float_set(dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
  361. else:
  362. raise TypeError('unsupported header type code')
  363. return value
  364. cdef bcf_copy_expand_array(void *src_data, int src_type, size_t src_values,
  365. void *dst_data, int dst_type, size_t dst_values,
  366. int vlen):
  367. """copy data from src to dest where the size of the elements (src_type/dst_type) differ
  368. as well as the number of elements (src_values/dst_values).
  369. """
  370. cdef char *src_datac
  371. cdef char *dst_datac
  372. cdef int8_t *src_datai8
  373. cdef int16_t *src_datai16
  374. cdef int32_t *src_datai32
  375. cdef int32_t *dst_datai
  376. cdef float *src_dataf
  377. cdef float *dst_dataf
  378. cdef ssize_t src_size, dst_size, i, j
  379. cdef int val
  380. if src_values > dst_values:
  381. raise ValueError('Cannot copy arrays with src_values={} > dst_values={}'.format(src_values, dst_values))
  382. if src_type == dst_type == BCF_BT_CHAR:
  383. src_datac = <char *>src_data
  384. dst_datac = <char *>dst_data
  385. memcpy(dst_datac, src_datac, src_values)
  386. for i in range(src_values, dst_values):
  387. dst_datac[i] = 0
  388. elif src_type == BCF_BT_INT8 and dst_type == BCF_BT_INT32:
  389. src_datai8 = <int8_t *>src_data
  390. dst_datai = <int32_t *>dst_data
  391. for i in range(src_values):
  392. val = src_datai8[i]
  393. if val == bcf_int8_missing:
  394. val = bcf_int32_missing
  395. elif val == bcf_int8_vector_end:
  396. val = bcf_int32_vector_end
  397. dst_datai[i] = val
  398. for i in range(src_values, dst_values):
  399. dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
  400. elif src_type == BCF_BT_INT16 and dst_type == BCF_BT_INT32:
  401. src_datai16 = <int16_t *>src_data
  402. dst_datai = <int32_t *>dst_data
  403. for i in range(src_values):
  404. val = src_datai16[i]
  405. if val == bcf_int16_missing:
  406. val = bcf_int32_missing
  407. elif val == bcf_int16_vector_end:
  408. val = bcf_int32_vector_end
  409. dst_datai[i] = val
  410. for i in range(src_values, dst_values):
  411. dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
  412. elif src_type == BCF_BT_INT32 and dst_type == BCF_BT_INT32:
  413. src_datai32 = <int32_t *>src_data
  414. dst_datai = <int32_t *>dst_data
  415. for i in range(src_values):
  416. dst_datai[i] = src_datai32[i]
  417. for i in range(src_values, dst_values):
  418. dst_datai[i] = bcf_int32_missing if not vlen else bcf_int32_vector_end
  419. elif src_type == BCF_BT_FLOAT and dst_type == BCF_BT_FLOAT:
  420. src_dataf = <float *>src_data
  421. dst_dataf = <float *>dst_data
  422. for i in range(src_values):
  423. dst_dataf[i] = src_dataf[i]
  424. for i in range(src_values, dst_values):
  425. bcf_float_set(dst_dataf + i, bcf_float_missing if not vlen else bcf_float_vector_end)
  426. else:
  427. raise TypeError('unsupported types')
  428. cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
  429. if record is None:
  430. raise ValueError('record must not be None')
  431. cdef bcf_hdr_t *hdr = record.header.ptr
  432. cdef bcf1_t *r = record.ptr
  433. if not check_header_id(hdr, hl_type, id):
  434. raise ValueError('Invalid header')
  435. cdef int length = bcf_hdr_id2length(hdr, hl_type, id)
  436. cdef int number = bcf_hdr_id2number(hdr, hl_type, id)
  437. scalar[0] = 0
  438. if hl_type == BCF_HL_FMT and is_gt_fmt(hdr, id):
  439. count[0] = number
  440. elif length == BCF_VL_FIXED:
  441. if number == 1:
  442. scalar[0] = 1
  443. count[0] = number
  444. elif length == BCF_VL_R:
  445. count[0] = r.n_allele
  446. elif length == BCF_VL_A:
  447. count[0] = r.n_allele - 1
  448. elif length == BCF_VL_G:
  449. count[0] = bcf_genotype_count(hdr, r, sample)
  450. elif length == BCF_VL_VAR:
  451. count[0] = -1
  452. else:
  453. raise ValueError('Unknown format length')
  454. cdef object bcf_info_get_value(VariantRecord record, const bcf_info_t *z):
  455. if record is None:
  456. raise ValueError('record must not be None')
  457. cdef bcf_hdr_t *hdr = record.header.ptr
  458. cdef char *s
  459. cdef ssize_t count
  460. cdef int scalar
  461. bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
  462. if z.len == 0:
  463. if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
  464. value = True
  465. elif scalar:
  466. value = None
  467. else:
  468. value = ()
  469. elif z.len == 1:
  470. if z.type == BCF_BT_INT8:
  471. value = z.v1.i if z.v1.i != bcf_int8_missing else None
  472. elif z.type == BCF_BT_INT16:
  473. value = z.v1.i if z.v1.i != bcf_int16_missing else None
  474. elif z.type == BCF_BT_INT32:
  475. value = z.v1.i if z.v1.i != bcf_int32_missing else None
  476. elif z.type == BCF_BT_FLOAT:
  477. value = z.v1.f if not bcf_float_is_missing(z.v1.f) else None
  478. elif z.type == BCF_BT_CHAR:
  479. value = force_str(chr(z.v1.i))
  480. else:
  481. raise TypeError('unsupported info type code')
  482. if not scalar and value != ():
  483. value = (value,)
  484. else:
  485. value = bcf_array_to_object(z.vptr, z.type, z.len, count, scalar)
  486. return value
  487. cdef object bcf_check_values(VariantRecord record, value, int sample,
  488. int hl_type, int ht_type,
  489. int id, int bt_type, ssize_t bt_len,
  490. ssize_t *value_count, int *scalar, int *realloc):
  491. if record is None:
  492. raise ValueError('record must not be None')
  493. bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
  494. # Validate values now that we know the type and size
  495. values = (value,) if not isinstance(value, (list, tuple)) else value
  496. # Validate values now that we know the type and size
  497. if ht_type == BCF_HT_FLAG:
  498. value_count[0] = 1
  499. elif hl_type == BCF_HL_FMT and is_gt_fmt(record.header.ptr, id):
  500. # KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
  501. value_count[0] = -1
  502. cdef int given = len(values)
  503. if value_count[0] != -1 and value_count[0] != given:
  504. if scalar[0]:
  505. raise TypeError('value expected to be scalar, given len={}'.format(given))
  506. else:
  507. raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
  508. if ht_type == BCF_HT_REAL:
  509. for v in values:
  510. if not(v is None or isinstance(v, (float, int))):
  511. raise TypeError('invalid value for Float format')
  512. elif ht_type == BCF_HT_INT:
  513. for v in values:
  514. if not(v is None or (isinstance(v, (float, int)) and int(v) == v)):
  515. raise TypeError('invalid value for Integer format')
  516. for v in values:
  517. if not(v is None or bcf_int32_missing < v <= INT32_MAX):
  518. raise ValueError('Integer value too small/large to store in VCF/BCF')
  519. elif ht_type == BCF_HT_STR:
  520. values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
  521. elif ht_type == BCF_HT_FLAG:
  522. if values[0] not in (True, False, None, 1, 0):
  523. raise ValueError('Flag values must be: True, False, None, 1, 0')
  524. else:
  525. raise TypeError('unsupported type')
  526. realloc[0] = 0
  527. if len(values) <= 1 and hl_type == BCF_HL_INFO:
  528. realloc[0] = 0
  529. elif len(values) > bt_len:
  530. realloc[0] = 1
  531. elif bt_type == BCF_BT_INT8:
  532. for v in values:
  533. if v is not None and not(bcf_int8_missing < v <= INT8_MAX):
  534. realloc[0] = 1
  535. break
  536. elif bt_type == BCF_BT_INT16:
  537. for v in values:
  538. if v is not None and not(bcf_int16_missing < v <= INT16_MAX):
  539. realloc[0] = 1
  540. break
  541. return values
  542. cdef bcf_encode_alleles(VariantRecord record, values):
  543. if record is None:
  544. raise ValueError('record must not be None')
  545. cdef bcf1_t *r = record.ptr
  546. cdef int32_t nalleles = r.n_allele
  547. cdef list gt_values = []
  548. cdef char *s
  549. cdef int i
  550. if values is None:
  551. return ()
  552. if not isinstance(values, (list, tuple)):
  553. values = (values,)
  554. for value in values:
  555. if value is None:
  556. gt_values.append(bcf_gt_missing)
  557. elif isinstance(value, (str, bytes)):
  558. bvalue = force_bytes(value)
  559. s = bvalue
  560. for i in range(r.n_allele):
  561. if strcmp(r.d.allele[i], s) != 0:
  562. gt_values.append(bcf_gt_unphased(i))
  563. break
  564. else:
  565. raise ValueError('Unknown allele')
  566. else:
  567. i = value
  568. if not (0 <= i < nalleles):
  569. raise ValueError('Invalid allele index')
  570. gt_values.append(bcf_gt_unphased(i))
  571. return gt_values
  572. cdef bcf_info_set_value(VariantRecord record, key, value):
  573. if record is None:
  574. raise ValueError('record must not be None')
  575. cdef bcf_hdr_t *hdr = record.header.ptr
  576. cdef bcf1_t *r = record.ptr
  577. cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
  578. cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
  579. if bcf_unpack(r, BCF_UN_INFO) < 0:
  580. raise ValueError('Error unpacking VariantRecord')
  581. cdef bytes bkey = force_bytes(key)
  582. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  583. if info:
  584. info_id = info.key
  585. else:
  586. info_id = bcf_header_get_info_id(hdr, bkey)
  587. if info_id < 0:
  588. raise KeyError('unknown INFO: {}'.format(key))
  589. if not check_header_id(hdr, BCF_HL_INFO, info_id):
  590. raise ValueError('Invalid header')
  591. info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
  592. values = bcf_check_values(record, value, -1,
  593. BCF_HL_INFO, info_type, info_id,
  594. info.type if info else -1,
  595. info.len if info else -1,
  596. &value_count, &scalar, &realloc)
  597. if info_type == BCF_HT_FLAG:
  598. if bcf_update_info(hdr, r, bkey, NULL, bool(values[0]), info_type) < 0:
  599. raise ValueError('Unable to update INFO values')
  600. return
  601. vlen = value_count < 0
  602. value_count = len(values)
  603. # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
  604. # If we can, write updated values to existing allocated storage
  605. if 0 and info and not realloc:
  606. r.d.shared_dirty |= BCF1_DIRTY_INF
  607. if value_count == 0:
  608. info.len = 0
  609. if not info.vptr:
  610. info.vptr = <uint8_t *>&info.v1.i
  611. elif value_count == 1:
  612. # FIXME: Check if need to free vptr if info.len > 0?
  613. if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
  614. bcf_object_to_array(values, &info.v1.i, BCF_BT_INT32, 1, vlen)
  615. elif info.type == BCF_BT_FLOAT:
  616. bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
  617. else:
  618. raise TypeError('unsupported info type code')
  619. info.len = 1
  620. if not info.vptr:
  621. info.vptr = <uint8_t *>&info.v1.i
  622. else:
  623. bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
  624. return
  625. alloc_len = max(1, value_count)
  626. if info and info.len > alloc_len:
  627. alloc_len = info.len
  628. new_values = bcf_empty_array(info_type, alloc_len, vlen)
  629. cdef char *valp = <char *>new_values
  630. if info_type == BCF_HT_INT:
  631. dst_type = BCF_BT_INT32
  632. elif info_type == BCF_HT_REAL:
  633. dst_type = BCF_BT_FLOAT
  634. elif info_type == BCF_HT_STR:
  635. dst_type = BCF_BT_CHAR
  636. else:
  637. raise ValueError('Unsupported INFO type')
  638. bcf_object_to_array(values, valp, dst_type, alloc_len, vlen)
  639. if bcf_update_info(hdr, r, bkey, valp, <int>alloc_len, info_type) < 0:
  640. raise ValueError('Unable to update INFO values')
  641. cdef bcf_info_del_value(VariantRecord record, key):
  642. if record is None:
  643. raise ValueError('record must not be None')
  644. cdef bcf_hdr_t *hdr = record.header.ptr
  645. cdef bcf1_t *r = record.ptr
  646. cdef ssize_t value_count
  647. cdef int scalar
  648. if bcf_unpack(r, BCF_UN_INFO) < 0:
  649. raise ValueError('Error unpacking VariantRecord')
  650. cdef bytes bkey = force_bytes(key)
  651. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  652. if not info:
  653. raise KeyError(key)
  654. bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
  655. if value_count <= 0:
  656. null_value = ()
  657. elif scalar:
  658. null_value = None
  659. else:
  660. null_value = (None,)*value_count
  661. bcf_info_set_value(record, bkey, null_value)
  662. cdef bcf_format_get_value(VariantRecordSample sample, key):
  663. if sample is None:
  664. raise ValueError('sample must not be None')
  665. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  666. cdef bcf1_t *r = sample.record.ptr
  667. cdef ssize_t count
  668. cdef int scalar
  669. if bcf_unpack(r, BCF_UN_ALL) < 0:
  670. raise ValueError('Error unpacking VariantRecord')
  671. cdef bytes bkey = force_bytes(key)
  672. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  673. if not fmt or not fmt.p:
  674. raise KeyError('invalid FORMAT: {}'.format(key))
  675. if is_gt_fmt(hdr, fmt.id):
  676. return bcf_format_get_allele_indices(sample)
  677. bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
  678. if fmt.p and fmt.n and fmt.size:
  679. return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
  680. elif scalar:
  681. return None
  682. elif count <= 0:
  683. return ()
  684. else:
  685. return (None,)*count
  686. cdef bcf_format_set_value(VariantRecordSample sample, key, value):
  687. if sample is None:
  688. raise ValueError('sample must not be None')
  689. if key == 'phased':
  690. sample.phased = bool(value)
  691. return
  692. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  693. cdef bcf1_t *r = sample.record.ptr
  694. cdef int fmt_id
  695. cdef vdict_t *d
  696. cdef khiter_t k
  697. cdef int fmt_type, scalar, realloc, dst_type, vlen = 0
  698. cdef ssize_t i, nsamples, value_count, alloc_size, alloc_len, dst_size
  699. if bcf_unpack(r, BCF_UN_ALL) < 0:
  700. raise ValueError('Error unpacking VariantRecord')
  701. cdef bytes bkey = force_bytes(key)
  702. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  703. if fmt:
  704. fmt_id = fmt.id
  705. else:
  706. d = <vdict_t *>hdr.dict[BCF_DT_ID]
  707. k = kh_get_vdict(d, bkey)
  708. if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
  709. raise KeyError('unknown format: {}'.format(key))
  710. fmt_id = kh_val_vdict(d, k).id
  711. if not check_header_id(hdr, BCF_HL_FMT, fmt_id):
  712. raise ValueError('Invalid header')
  713. fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, fmt_id)
  714. if fmt_type == BCF_HT_FLAG:
  715. raise ValueError('Flag types are not allowed on FORMATs')
  716. if is_gt_fmt(hdr, fmt_id):
  717. value = bcf_encode_alleles(sample.record, value)
  718. # KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
  719. fmt_type = BCF_HT_INT
  720. values = bcf_check_values(sample.record, value, sample.index,
  721. BCF_HL_FMT, fmt_type, fmt_id,
  722. fmt.type if fmt else -1,
  723. fmt.n if fmt else -1,
  724. &value_count, &scalar, &realloc)
  725. vlen = value_count < 0
  726. value_count = len(values)
  727. # If we can, write updated values to existing allocated storage.
  728. if fmt and not realloc:
  729. r.d.indiv_dirty = 1
  730. bcf_object_to_array(values, fmt.p + sample.index * fmt.size, fmt.type, fmt.n, vlen)
  731. return
  732. alloc_len = max(1, value_count)
  733. if fmt and fmt.n > alloc_len:
  734. alloc_len = fmt.n
  735. nsamples = r.n_sample
  736. new_values = bcf_empty_array(fmt_type, nsamples * alloc_len, vlen)
  737. cdef char *new_values_p = <char *>new_values
  738. if fmt_type == BCF_HT_INT:
  739. dst_type = BCF_BT_INT32
  740. dst_size = sizeof(int32_t) * alloc_len
  741. elif fmt_type == BCF_HT_REAL:
  742. dst_type = BCF_BT_FLOAT
  743. dst_size = sizeof(float) * alloc_len
  744. elif fmt_type == BCF_HT_STR:
  745. dst_type = BCF_BT_CHAR
  746. dst_size = sizeof(char) * alloc_len
  747. else:
  748. raise ValueError('Unsupported FORMAT type')
  749. if fmt and nsamples > 1:
  750. for i in range(nsamples):
  751. bcf_copy_expand_array(fmt.p + i * fmt.size, fmt.type, fmt.n,
  752. new_values_p + i * dst_size, dst_type, alloc_len,
  753. vlen)
  754. bcf_object_to_array(values, new_values_p + sample.index * dst_size, dst_type, alloc_len, vlen)
  755. if bcf_update_format(hdr, r, bkey, new_values_p, <int>(nsamples * alloc_len), fmt_type) < 0:
  756. raise ValueError('Unable to update format values')
  757. cdef bcf_format_del_value(VariantRecordSample sample, key):
  758. if sample is None:
  759. raise ValueError('sample must not be None')
  760. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  761. cdef bcf1_t *r = sample.record.ptr
  762. cdef ssize_t value_count
  763. cdef int scalar
  764. if bcf_unpack(r, BCF_UN_ALL) < 0:
  765. raise ValueError('Error unpacking VariantRecord')
  766. cdef bytes bkey = force_bytes(key)
  767. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  768. if not fmt or not fmt.p:
  769. raise KeyError(key)
  770. bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
  771. if value_count <= 0:
  772. null_value = ()
  773. elif scalar:
  774. null_value = None
  775. else:
  776. null_value = (None,)*value_count
  777. bcf_format_set_value(sample, bkey, null_value)
  778. cdef bcf_format_get_allele_indices(VariantRecordSample sample):
  779. if sample is None:
  780. raise ValueError('sample must not be None')
  781. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  782. cdef bcf1_t *r = sample.record.ptr
  783. cdef int32_t n = r.n_sample
  784. if bcf_unpack(r, BCF_UN_ALL) < 0:
  785. raise ValueError('Error unpacking VariantRecord')
  786. if sample.index < 0 or sample.index >= n or not r.n_fmt:
  787. return ()
  788. cdef bcf_fmt_t *fmt0 = r.d.fmt
  789. cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
  790. if not gt0 or not fmt0.n:
  791. return ()
  792. cdef int8_t *data8
  793. cdef int16_t *data16
  794. cdef int32_t *data32
  795. cdef int32_t a, nalleles = r.n_allele
  796. cdef list alleles = []
  797. if fmt0.type == BCF_BT_INT8:
  798. data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
  799. for i in range(fmt0.n):
  800. if data8[i] == bcf_int8_vector_end:
  801. break
  802. elif data8[i] == bcf_gt_missing:
  803. a = -1
  804. else:
  805. a = bcf_gt_allele(data8[i])
  806. alleles.append(a if 0 <= a < nalleles else None)
  807. elif fmt0.type == BCF_BT_INT16:
  808. data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
  809. for i in range(fmt0.n):
  810. if data16[i] == bcf_int16_vector_end:
  811. break
  812. elif data16[i] == bcf_gt_missing:
  813. a = -1
  814. else:
  815. a = bcf_gt_allele(data16[i])
  816. alleles.append(a if 0 <= a < nalleles else None)
  817. elif fmt0.type == BCF_BT_INT32:
  818. data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
  819. for i in range(fmt0.n):
  820. if data32[i] == bcf_int32_vector_end:
  821. break
  822. elif data32[i] == bcf_gt_missing:
  823. a = -1
  824. else:
  825. a = bcf_gt_allele(data32[i])
  826. alleles.append(a if 0 <= a < nalleles else None)
  827. return tuple(alleles)
  828. cdef bcf_format_get_alleles(VariantRecordSample sample):
  829. if sample is None:
  830. raise ValueError('sample must not be None')
  831. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  832. cdef bcf1_t *r = sample.record.ptr
  833. cdef int32_t nsamples = r.n_sample
  834. if bcf_unpack(r, BCF_UN_ALL) < 0:
  835. raise ValueError('Error unpacking VariantRecord')
  836. cdef int32_t nalleles = r.n_allele
  837. if sample.index < 0 or sample.index >= nsamples or not r.n_fmt:
  838. return ()
  839. cdef bcf_fmt_t *fmt0 = r.d.fmt
  840. cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
  841. if not gt0 or not fmt0.n:
  842. return ()
  843. cdef int32_t a
  844. cdef int8_t *data8
  845. cdef int16_t *data16
  846. cdef int32_t *data32
  847. alleles = []
  848. if fmt0.type == BCF_BT_INT8:
  849. data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
  850. for i in range(fmt0.n):
  851. if data8[i] == bcf_int8_vector_end:
  852. break
  853. a = bcf_gt_allele(data8[i])
  854. alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
  855. elif fmt0.type == BCF_BT_INT16:
  856. data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
  857. for i in range(fmt0.n):
  858. if data16[i] == bcf_int16_vector_end:
  859. break
  860. a = bcf_gt_allele(data16[i])
  861. alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
  862. elif fmt0.type == BCF_BT_INT32:
  863. data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
  864. for i in range(fmt0.n):
  865. if data32[i] == bcf_int32_vector_end:
  866. break
  867. a = bcf_gt_allele(data32[i])
  868. alleles.append(charptr_to_str(r.d.allele[a]) if 0 <= a < nalleles else None)
  869. return tuple(alleles)
  870. cdef bint bcf_sample_get_phased(VariantRecordSample sample):
  871. if sample is None:
  872. raise ValueError('sample must not be None')
  873. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  874. cdef bcf1_t *r = sample.record.ptr
  875. cdef int32_t n = r.n_sample
  876. if bcf_unpack(r, BCF_UN_ALL) < 0:
  877. raise ValueError('Error unpacking VariantRecord')
  878. if sample.index < 0 or sample.index >= n or not r.n_fmt:
  879. return False
  880. cdef bcf_fmt_t *fmt0 = r.d.fmt
  881. cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
  882. if not gt0 or not fmt0.n:
  883. return False
  884. cdef int8_t *data8
  885. cdef int16_t *data16
  886. cdef int32_t *data32
  887. cdef bint phased = False
  888. if fmt0.type == BCF_BT_INT8:
  889. data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
  890. for i in range(fmt0.n):
  891. if data8[i] == bcf_int8_vector_end:
  892. break
  893. elif data8[i] == bcf_int8_missing:
  894. continue
  895. elif i and not bcf_gt_is_phased(data8[i]):
  896. return False
  897. else:
  898. phased = True
  899. elif fmt0.type == BCF_BT_INT16:
  900. data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
  901. for i in range(fmt0.n):
  902. if data16[i] == bcf_int16_vector_end:
  903. break
  904. elif data16[i] == bcf_int16_missing:
  905. continue
  906. elif i and not bcf_gt_is_phased(data16[i]):
  907. return False
  908. else:
  909. phased = True
  910. elif fmt0.type == BCF_BT_INT32:
  911. data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
  912. for i in range(fmt0.n):
  913. if data32[i] == bcf_int32_vector_end:
  914. break
  915. elif data32[i] == bcf_int32_missing:
  916. continue
  917. elif i and not bcf_gt_is_phased(data32[i]):
  918. return False
  919. else:
  920. phased = True
  921. return phased
  922. cdef bcf_sample_set_phased(VariantRecordSample sample, bint phased):
  923. if sample is None:
  924. raise ValueError('sample must not be None')
  925. cdef bcf_hdr_t *hdr = sample.record.header.ptr
  926. cdef bcf1_t *r = sample.record.ptr
  927. cdef int32_t n = r.n_sample
  928. if bcf_unpack(r, BCF_UN_ALL) < 0:
  929. raise ValueError('Error unpacking VariantRecord')
  930. if sample.index < 0 or sample.index >= n or not r.n_fmt:
  931. return
  932. cdef bcf_fmt_t *fmt0 = r.d.fmt
  933. cdef int gt0 = is_gt_fmt(hdr, fmt0.id)
  934. if not gt0 or not fmt0.n:
  935. raise ValueError('Cannot set phased before genotype is set')
  936. cdef int8_t *data8
  937. cdef int16_t *data16
  938. cdef int32_t *data32
  939. if fmt0.type == BCF_BT_INT8:
  940. data8 = <int8_t *>(fmt0.p + sample.index * fmt0.size)
  941. for i in range(fmt0.n):
  942. if data8[i] == bcf_int8_vector_end:
  943. break
  944. elif data8[i] == bcf_int8_missing:
  945. continue
  946. elif i:
  947. data8[i] = (data8[i] & 0xFE) | phased
  948. elif fmt0.type == BCF_BT_INT16:
  949. data16 = <int16_t *>(fmt0.p + sample.index * fmt0.size)
  950. for i in range(fmt0.n):
  951. if data16[i] == bcf_int16_vector_end:
  952. break
  953. elif data16[i] == bcf_int16_missing:
  954. continue
  955. elif i:
  956. data16[i] = (data16[i] & 0xFFFE) | phased
  957. elif fmt0.type == BCF_BT_INT32:
  958. data32 = <int32_t *>(fmt0.p + sample.index * fmt0.size)
  959. for i in range(fmt0.n):
  960. if data32[i] == bcf_int32_vector_end:
  961. break
  962. elif data32[i] == bcf_int32_missing:
  963. continue
  964. elif i:
  965. data32[i] = (data32[i] & 0xFFFFFFFE) | phased
  966. cdef inline bcf_sync_end(VariantRecord record):
  967. cdef bcf_hdr_t *hdr = record.header.ptr
  968. cdef bcf_info_t *info
  969. cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
  970. cdef int ref_len
  971. # allow missing ref when instantiating a new record
  972. if record.ref is not None:
  973. ref_len = len(record.ref)
  974. else:
  975. ref_len = 0
  976. # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
  977. # Always keep END for symbolic alleles
  978. if not has_symbolic_allele(record) and (not record.ptr.n_allele or record.ptr.rlen == ref_len):
  979. # If INFO/END is not defined in the header, it doesn't exist in the record
  980. if end_id >= 0:
  981. info = bcf_get_info(hdr, record.ptr, b'END')
  982. if info and info.vptr:
  983. if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
  984. raise ValueError('Unable to delete END')
  985. else:
  986. # Create END header, if not present
  987. if end_id < 0:
  988. record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
  989. # Update to reflect stop position
  990. bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
  991. cdef inline int has_symbolic_allele(VariantRecord record):
  992. """Return index of first symbolic allele. 0 if no symbolic alleles."""
  993. for i in range(1, record.ptr.n_allele):
  994. alt = record.ptr.d.allele[i]
  995. if alt[0] == b'<' and alt[len(alt) - 1] == b'>':
  996. return i
  997. return 0
  998. ########################################################################
  999. ########################################################################
  1000. ## Variant Header objects
  1001. ########################################################################
  1002. cdef bcf_header_remove_hrec(VariantHeader header, int i):
  1003. if header is None:
  1004. raise ValueError('header must not be None')
  1005. cdef bcf_hdr_t *hdr = header.ptr
  1006. if i < 0 or i >= hdr.nhrec:
  1007. raise ValueError('Invalid header record index')
  1008. cdef bcf_hrec_t *hrec = hdr.hrec[i]
  1009. hdr.nhrec -= 1
  1010. if i < hdr.nhrec:
  1011. memmove(&hdr.hrec[i], &hdr.hrec[i+1], (hdr.nhrec-i)*sizeof(bcf_hrec_t*))
  1012. bcf_hrec_destroy(hrec)
  1013. hdr.hrec[hdr.nhrec] = NULL
  1014. hdr.dirty = 1
  1015. #FIXME: implement a full mapping interface
  1016. #FIXME: passing bcf_hrec_t* is not safe, since we cannot control the
  1017. # object lifetime.
  1018. cdef class VariantHeaderRecord(object):
  1019. """header record from a :class:`VariantHeader` object"""
  1020. def __init__(self, *args, **kwargs):
  1021. raise TypeError('this class cannot be instantiated from Python')
  1022. @property
  1023. def type(self):
  1024. """header type: FILTER, INFO, FORMAT, CONTIG, STRUCTURED, or GENERIC"""
  1025. cdef bcf_hrec_t *r = self.ptr
  1026. if not r:
  1027. return None
  1028. return METADATA_TYPES[r.type]
  1029. @property
  1030. def key(self):
  1031. """header key (the part before '=', in FILTER/INFO/FORMAT/contig/fileformat etc.)"""
  1032. cdef bcf_hrec_t *r = self.ptr
  1033. return bcf_str_cache_get_charptr(r.key) if r and r.key else None
  1034. @property
  1035. def value(self):
  1036. """header value. Set only for generic lines, None for FILTER/INFO, etc."""
  1037. cdef bcf_hrec_t *r = self.ptr
  1038. return charptr_to_str(r.value) if r and r.value else None
  1039. @property
  1040. def attrs(self):
  1041. """sequence of additional header attributes"""
  1042. cdef bcf_hrec_t *r = self.ptr
  1043. if not r:
  1044. return ()
  1045. cdef int i
  1046. return tuple((bcf_str_cache_get_charptr(r.keys[i]) if r.keys[i] else None,
  1047. charptr_to_str(r.vals[i]) if r.vals[i] else None)
  1048. for i in range(r.nkeys))
  1049. def __len__(self):
  1050. cdef bcf_hrec_t *r = self.ptr
  1051. return r.nkeys if r else 0
  1052. def __bool__(self):
  1053. cdef bcf_hrec_t *r = self.ptr
  1054. return r != NULL and r.nkeys != 0
  1055. def __getitem__(self, key):
  1056. """get attribute value"""
  1057. cdef bcf_hrec_t *r = self.ptr
  1058. cdef int i
  1059. if r:
  1060. bkey = force_bytes(key)
  1061. for i in range(r.nkeys):
  1062. if r.keys[i] and r.keys[i] == bkey:
  1063. return charptr_to_str(r.vals[i]) if r.vals[i] else None
  1064. raise KeyError('cannot find metadata key')
  1065. def __iter__(self):
  1066. cdef bcf_hrec_t *r = self.ptr
  1067. if not r:
  1068. return
  1069. cdef int i
  1070. for i in range(r.nkeys):
  1071. if r.keys[i]:
  1072. yield bcf_str_cache_get_charptr(r.keys[i])
  1073. def get(self, key, default=None):
  1074. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  1075. try:
  1076. return self[key]
  1077. except KeyError:
  1078. return default
  1079. def __contains__(self, key):
  1080. try:
  1081. self[key]
  1082. except KeyError:
  1083. return False
  1084. else:
  1085. return True
  1086. def iterkeys(self):
  1087. """D.iterkeys() -> an iterator over the keys of D"""
  1088. return iter(self)
  1089. def itervalues(self):
  1090. """D.itervalues() -> an iterator over the values of D"""
  1091. cdef bcf_hrec_t *r = self.ptr
  1092. if not r:
  1093. return
  1094. cdef int i
  1095. for i in range(r.nkeys):
  1096. if r.keys[i]:
  1097. yield charptr_to_str(r.vals[i]) if r.vals[i] else None
  1098. def iteritems(self):
  1099. """D.iteritems() -> an iterator over the (key, value) items of D"""
  1100. cdef bcf_hrec_t *r = self.ptr
  1101. if not r:
  1102. return
  1103. cdef int i
  1104. for i in range(r.nkeys):
  1105. if r.keys[i]:
  1106. yield (bcf_str_cache_get_charptr(r.keys[i]), charptr_to_str(r.vals[i]) if r.vals[i] else None)
  1107. def keys(self):
  1108. """D.keys() -> list of D's keys"""
  1109. return list(self)
  1110. def items(self):
  1111. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  1112. return list(self.iteritems())
  1113. def values(self):
  1114. """D.values() -> list of D's values"""
  1115. return list(self.itervalues())
  1116. def update(self, items=None, **kwargs):
  1117. """D.update([E, ]**F) -> None.
  1118. Update D from dict/iterable E and F.
  1119. """
  1120. for k, v in items.items():
  1121. self[k] = v
  1122. if kwargs:
  1123. for k, v in kwargs.items():
  1124. self[k] = v
  1125. def pop(self, key, default=_nothing):
  1126. try:
  1127. value = self[key]
  1128. del self[key]
  1129. return value
  1130. except KeyError:
  1131. if default is not _nothing:
  1132. return default
  1133. raise
  1134. # Mappings are not hashable by default, but subclasses can change this
  1135. __hash__ = None
  1136. #TODO: implement __richcmp__
  1137. def __str__(self):
  1138. cdef bcf_hrec_t *r = self.ptr
  1139. if not r:
  1140. raise ValueError('cannot convert deleted record to str')
  1141. cdef kstring_t hrec_str
  1142. hrec_str.l = hrec_str.m = 0
  1143. hrec_str.s = NULL
  1144. bcf_hrec_format(r, &hrec_str)
  1145. ret = charptr_to_str_w_len(hrec_str.s, hrec_str.l)
  1146. if hrec_str.m:
  1147. free(hrec_str.s)
  1148. return ret
  1149. # FIXME: Not safe -- causes trivial segfaults at the moment
  1150. def remove(self):
  1151. cdef bcf_hdr_t *hdr = self.header.ptr
  1152. cdef bcf_hrec_t *r = self.ptr
  1153. if not r:
  1154. return
  1155. assert r.key
  1156. cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
  1157. bcf_hdr_remove(hdr, r.type, key)
  1158. self.ptr = NULL
  1159. cdef VariantHeaderRecord makeVariantHeaderRecord(VariantHeader header, bcf_hrec_t *hdr):
  1160. if not header:
  1161. raise ValueError('invalid VariantHeader')
  1162. if not hdr:
  1163. return None
  1164. cdef VariantHeaderRecord record = VariantHeaderRecord.__new__(VariantHeaderRecord)
  1165. record.header = header
  1166. record.ptr = hdr
  1167. return record
  1168. cdef class VariantHeaderRecords(object):
  1169. """sequence of :class:`VariantHeaderRecord` object from a :class:`VariantHeader` object"""
  1170. def __init__(self, *args, **kwargs):
  1171. raise TypeError('this class cannot be instantiated from Python')
  1172. def __len__(self):
  1173. return self.header.ptr.nhrec
  1174. def __bool__(self):
  1175. return self.header.ptr.nhrec != 0
  1176. def __getitem__(self, index):
  1177. cdef int32_t i = index
  1178. if i < 0 or i >= self.header.ptr.nhrec:
  1179. raise IndexError('invalid header record index')
  1180. return makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
  1181. def __iter__(self):
  1182. cdef int32_t i
  1183. for i in range(self.header.ptr.nhrec):
  1184. yield makeVariantHeaderRecord(self.header, self.header.ptr.hrec[i])
  1185. __hash__ = None
  1186. cdef VariantHeaderRecords makeVariantHeaderRecords(VariantHeader header):
  1187. if not header:
  1188. raise ValueError('invalid VariantHeader')
  1189. cdef VariantHeaderRecords records = VariantHeaderRecords.__new__(VariantHeaderRecords)
  1190. records.header = header
  1191. return records
  1192. cdef class VariantMetadata(object):
  1193. """filter, info or format metadata record from a :class:`VariantHeader` object"""
  1194. def __init__(self, *args, **kwargs):
  1195. raise TypeError('this class cannot be instantiated from Python')
  1196. @property
  1197. def name(self):
  1198. """metadata name"""
  1199. cdef bcf_hdr_t *hdr = self.header.ptr
  1200. return bcf_str_cache_get_charptr(hdr.id[BCF_DT_ID][self.id].key)
  1201. # Q: Should this be exposed?
  1202. @property
  1203. def id(self):
  1204. """metadata internal header id number"""
  1205. return self.id
  1206. @property
  1207. def number(self):
  1208. """metadata number (i.e. cardinality)"""
  1209. cdef bcf_hdr_t *hdr = self.header.ptr
  1210. if not check_header_id(hdr, self.type, self.id):
  1211. raise ValueError('Invalid header id')
  1212. if self.type == BCF_HL_FLT:
  1213. return None
  1214. cdef int l = bcf_hdr_id2length(hdr, self.type, self.id)
  1215. if l == BCF_VL_FIXED:
  1216. return bcf_hdr_id2number(hdr, self.type, self.id)
  1217. elif l == BCF_VL_VAR:
  1218. return '.'
  1219. else:
  1220. return METADATA_LENGTHS[l]
  1221. @property
  1222. def type(self):
  1223. """metadata value type"""
  1224. cdef bcf_hdr_t *hdr = self.header.ptr
  1225. if not check_header_id(hdr, self.type, self.id):
  1226. raise ValueError('Invalid header id')
  1227. if self.type == BCF_HL_FLT:
  1228. return None
  1229. return VALUE_TYPES[bcf_hdr_id2type(hdr, self.type, self.id)]
  1230. @property
  1231. def description(self):
  1232. """metadata description (or None if not set)"""
  1233. descr = self.record.get('Description')
  1234. if descr:
  1235. descr = descr.strip('"')
  1236. return force_str(descr)
  1237. @property
  1238. def record(self):
  1239. """:class:`VariantHeaderRecord` associated with this :class:`VariantMetadata` object"""
  1240. cdef bcf_hdr_t *hdr = self.header.ptr
  1241. if not check_header_id(hdr, self.type, self.id):
  1242. raise ValueError('Invalid header id')
  1243. cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_ID][self.id].val.hrec[self.type]
  1244. if not hrec:
  1245. return None
  1246. return makeVariantHeaderRecord(self.header, hrec)
  1247. def remove_header(self):
  1248. cdef bcf_hdr_t *hdr = self.header.ptr
  1249. cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
  1250. bcf_hdr_remove(hdr, self.type, key)
  1251. cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
  1252. if not header:
  1253. raise ValueError('invalid VariantHeader')
  1254. if type != BCF_HL_FLT and type != BCF_HL_INFO and type != BCF_HL_FMT:
  1255. raise ValueError('invalid metadata type')
  1256. if id < 0 or id >= header.ptr.n[BCF_DT_ID]:
  1257. raise ValueError('invalid metadata id')
  1258. cdef VariantMetadata meta = VariantMetadata.__new__(VariantMetadata)
  1259. meta.header = header
  1260. meta.type = type
  1261. meta.id = id
  1262. return meta
  1263. cdef class VariantHeaderMetadata(object):
  1264. """mapping from filter, info or format name to :class:`VariantMetadata` object"""
  1265. def __init__(self, *args, **kwargs):
  1266. raise TypeError('this class cannot be instantiated from Python')
  1267. def add(self, id, number, type, description, **kwargs):
  1268. """Add a new filter, info or format record"""
  1269. if id in self:
  1270. raise ValueError('Header already exists for id={}'.format(id))
  1271. if self.type == BCF_HL_FLT:
  1272. if number is not None:
  1273. raise ValueError('Number must be None when adding a filter')
  1274. if type is not None:
  1275. raise ValueError('Type must be None when adding a filter')
  1276. items = [('ID', unquoted_str(id)), ('Description', description)]
  1277. else:
  1278. if type not in VALUE_TYPES:
  1279. raise ValueError('unknown type specified: {}'.format(type))
  1280. if number is None:
  1281. number = '.'
  1282. items = [('ID', unquoted_str(id)),
  1283. ('Number', unquoted_str(number)),
  1284. ('Type', unquoted_str(type)),
  1285. ('Description', description)]
  1286. items += kwargs.items()
  1287. self.header.add_meta(METADATA_TYPES[self.type], items=items)
  1288. def __len__(self):
  1289. cdef bcf_hdr_t *hdr = self.header.ptr
  1290. cdef bcf_idpair_t *idpair
  1291. cdef int32_t i, n = 0
  1292. for i in range(hdr.n[BCF_DT_ID]):
  1293. idpair = hdr.id[BCF_DT_ID] + i
  1294. if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
  1295. n += 1
  1296. return n
  1297. def __bool__(self):
  1298. cdef bcf_hdr_t *hdr = self.header.ptr
  1299. cdef bcf_idpair_t *idpair
  1300. cdef int32_t i
  1301. for i in range(hdr.n[BCF_DT_ID]):
  1302. idpair = hdr.id[BCF_DT_ID] + i
  1303. if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
  1304. return True
  1305. return False
  1306. def __getitem__(self, key):
  1307. cdef bcf_hdr_t *hdr = self.header.ptr
  1308. cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
  1309. cdef bytes bkey = force_bytes(key)
  1310. cdef khiter_t k = kh_get_vdict(d, bkey)
  1311. if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
  1312. raise KeyError('invalid key: {}'.format(key))
  1313. return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
  1314. def remove_header(self, key):
  1315. cdef bcf_hdr_t *hdr = self.header.ptr
  1316. cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
  1317. cdef bytes bkey = force_bytes(key)
  1318. cdef khiter_t k = kh_get_vdict(d, bkey)
  1319. if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
  1320. raise KeyError('invalid key: {}'.format(key))
  1321. bcf_hdr_remove(hdr, self.type, bkey)
  1322. #bcf_hdr_sync(hdr)
  1323. def clear_header(self):
  1324. cdef bcf_hdr_t *hdr = self.header.ptr
  1325. bcf_hdr_remove(hdr, self.type, NULL)
  1326. #bcf_hdr_sync(hdr)
  1327. def __iter__(self):
  1328. cdef bcf_hdr_t *hdr = self.header.ptr
  1329. cdef bcf_idpair_t *idpair
  1330. cdef int32_t i
  1331. for i in range(hdr.n[BCF_DT_ID]):
  1332. idpair = hdr.id[BCF_DT_ID] + i
  1333. if idpair.key and idpair.val and idpair.val.info[self.type] & 0xF != 0xF:
  1334. yield bcf_str_cache_get_charptr(idpair.key)
  1335. def get(self, key, default=None):
  1336. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  1337. try:
  1338. return self[key]
  1339. except KeyError:
  1340. return default
  1341. def __contains__(self, key):
  1342. try:
  1343. self[key]
  1344. except KeyError:
  1345. return False
  1346. else:
  1347. return True
  1348. def iterkeys(self):
  1349. """D.iterkeys() -> an iterator over the keys of D"""
  1350. return iter(self)
  1351. def itervalues(self):
  1352. """D.itervalues() -> an iterator over the values of D"""
  1353. for key in self:
  1354. yield self[key]
  1355. def iteritems(self):
  1356. """D.iteritems() -> an iterator over the (key, value) items of D"""
  1357. for key in self:
  1358. yield (key, self[key])
  1359. def keys(self):
  1360. """D.keys() -> list of D's keys"""
  1361. return list(self)
  1362. def items(self):
  1363. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  1364. return list(self.iteritems())
  1365. def values(self):
  1366. """D.values() -> list of D's values"""
  1367. return list(self.itervalues())
  1368. # Mappings are not hashable by default, but subclasses can change this
  1369. __hash__ = None
  1370. #TODO: implement __richcmp__
  1371. cdef VariantHeaderMetadata makeVariantHeaderMetadata(VariantHeader header, int32_t type):
  1372. if not header:
  1373. raise ValueError('invalid VariantHeader')
  1374. cdef VariantHeaderMetadata meta = VariantHeaderMetadata.__new__(VariantHeaderMetadata)
  1375. meta.header = header
  1376. meta.type = type
  1377. return meta
  1378. cdef class VariantContig(object):
  1379. """contig metadata from a :class:`VariantHeader`"""
  1380. def __init__(self, *args, **kwargs):
  1381. raise TypeError('this class cannot be instantiated from Python')
  1382. @property
  1383. def name(self):
  1384. """contig name"""
  1385. cdef bcf_hdr_t *hdr = self.header.ptr
  1386. return bcf_str_cache_get_charptr(hdr.id[BCF_DT_CTG][self.id].key)
  1387. @property
  1388. def id(self):
  1389. """contig internal id number"""
  1390. return self.id
  1391. @property
  1392. def length(self):
  1393. """contig length or None if not available"""
  1394. cdef bcf_hdr_t *hdr = self.header.ptr
  1395. cdef uint32_t length = hdr.id[BCF_DT_CTG][self.id].val.info[0]
  1396. return length if length else None
  1397. @property
  1398. def header_record(self):
  1399. """:class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
  1400. cdef bcf_hdr_t *hdr = self.header.ptr
  1401. cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
  1402. return makeVariantHeaderRecord(self.header, hrec)
  1403. def remove_header(self):
  1404. cdef bcf_hdr_t *hdr = self.header.ptr
  1405. cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
  1406. bcf_hdr_remove(hdr, BCF_HL_CTG, key)
  1407. cdef VariantContig makeVariantContig(VariantHeader header, int id):
  1408. if not header:
  1409. raise ValueError('invalid VariantHeader')
  1410. if id < 0 or id >= header.ptr.n[BCF_DT_CTG]:
  1411. raise ValueError('invalid contig id')
  1412. cdef VariantContig contig = VariantContig.__new__(VariantContig)
  1413. contig.header = header
  1414. contig.id = id
  1415. return contig
  1416. cdef class VariantHeaderContigs(object):
  1417. """mapping from contig name or index to :class:`VariantContig` object."""
  1418. def __init__(self, *args, **kwargs):
  1419. raise TypeError('this class cannot be instantiated from Python')
  1420. def __len__(self):
  1421. cdef bcf_hdr_t *hdr = self.header.ptr
  1422. assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
  1423. return hdr.n[BCF_DT_CTG]
  1424. def __bool__(self):
  1425. cdef bcf_hdr_t *hdr = self.header.ptr
  1426. assert kh_size(<vdict_t *>hdr.dict[BCF_DT_CTG]) == hdr.n[BCF_DT_CTG]
  1427. return hdr.n[BCF_DT_CTG] != 0
  1428. def __getitem__(self, key):
  1429. cdef bcf_hdr_t *hdr = self.header.ptr
  1430. cdef int index
  1431. if isinstance(key, int):
  1432. index = key
  1433. if index < 0 or index >= hdr.n[BCF_DT_CTG]:
  1434. raise IndexError('invalid contig index')
  1435. return makeVariantContig(self.header, index)
  1436. cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
  1437. cdef bytes bkey = force_bytes(key)
  1438. cdef khiter_t k = kh_get_vdict(d, bkey)
  1439. if k == kh_end(d):
  1440. raise KeyError('invalid contig: {}'.format(key))
  1441. cdef int id = kh_val_vdict(d, k).id
  1442. return makeVariantContig(self.header, id)
  1443. def remove_header(self, key):
  1444. cdef bcf_hdr_t *hdr = self.header.ptr
  1445. cdef int index
  1446. cdef const char *ckey
  1447. cdef vdict_t *d
  1448. cdef khiter_t k
  1449. if isinstance(key, int):
  1450. index = key
  1451. if index < 0 or index >= hdr.n[BCF_DT_CTG]:
  1452. raise IndexError('invalid contig index')
  1453. ckey = hdr.id[BCF_DT_CTG][self.id].key
  1454. else:
  1455. d = <vdict_t *>hdr.dict[BCF_DT_CTG]
  1456. key = force_bytes(key)
  1457. if kh_get_vdict(d, key) == kh_end(d):
  1458. raise KeyError('invalid contig: {}'.format(key))
  1459. ckey = key
  1460. bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
  1461. def clear_header(self):
  1462. cdef bcf_hdr_t *hdr = self.header.ptr
  1463. bcf_hdr_remove(hdr, BCF_HL_CTG, NULL)
  1464. #bcf_hdr_sync(hdr)
  1465. def __iter__(self):
  1466. cdef bcf_hdr_t *hdr = self.header.ptr
  1467. cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
  1468. cdef uint32_t n = kh_size(d)
  1469. assert n == hdr.n[BCF_DT_CTG]
  1470. for i in range(n):
  1471. yield bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, i))
  1472. def get(self, key, default=None):
  1473. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  1474. try:
  1475. return self[key]
  1476. except KeyError:
  1477. return default
  1478. def __contains__(self, key):
  1479. try:
  1480. self[key]
  1481. except KeyError:
  1482. return False
  1483. else:
  1484. return True
  1485. def iterkeys(self):
  1486. """D.iterkeys() -> an iterator over the keys of D"""
  1487. return iter(self)
  1488. def itervalues(self):
  1489. """D.itervalues() -> an iterator over the values of D"""
  1490. for key in self:
  1491. yield self[key]
  1492. def iteritems(self):
  1493. """D.iteritems() -> an iterator over the (key, value) items of D"""
  1494. for key in self:
  1495. yield (key, self[key])
  1496. def keys(self):
  1497. """D.keys() -> list of D's keys"""
  1498. return list(self)
  1499. def items(self):
  1500. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  1501. return list(self.iteritems())
  1502. def values(self):
  1503. """D.values() -> list of D's values"""
  1504. return list(self.itervalues())
  1505. # Mappings are not hashable by default, but subclasses can change this
  1506. __hash__ = None
  1507. #TODO: implement __richcmp__
  1508. def add(self, id, length=None, **kwargs):
  1509. """Add a new contig record"""
  1510. if id in self:
  1511. raise ValueError('Header already exists for contig {}'.format(id))
  1512. items = [('ID', unquoted_str(id))]
  1513. if length is not None:
  1514. items.append(("length", unquoted_str(length)))
  1515. items += kwargs.items()
  1516. self.header.add_meta('contig', items=items)
  1517. cdef VariantHeaderContigs makeVariantHeaderContigs(VariantHeader header):
  1518. if not header:
  1519. raise ValueError('invalid VariantHeader')
  1520. cdef VariantHeaderContigs contigs = VariantHeaderContigs.__new__(VariantHeaderContigs)
  1521. contigs.header = header
  1522. return contigs
  1523. cdef class VariantHeaderSamples(object):
  1524. """sequence of sample names from a :class:`VariantHeader` object"""
  1525. def __init__(self, *args, **kwargs):
  1526. raise TypeError('this class cannot be instantiated from Python')
  1527. def __len__(self):
  1528. return bcf_hdr_nsamples(self.header.ptr)
  1529. def __bool__(self):
  1530. return bcf_hdr_nsamples(self.header.ptr) != 0
  1531. def __getitem__(self, index):
  1532. cdef bcf_hdr_t *hdr = self.header.ptr
  1533. cdef int32_t n = bcf_hdr_nsamples(hdr)
  1534. cdef int32_t i = index
  1535. if i < 0 or i >= n:
  1536. raise IndexError('invalid sample index')
  1537. return charptr_to_str(hdr.samples[i])
  1538. def __iter__(self):
  1539. cdef bcf_hdr_t *hdr = self.header.ptr
  1540. cdef int32_t i, n = bcf_hdr_nsamples(hdr)
  1541. for i in range(n):
  1542. yield charptr_to_str(hdr.samples[i])
  1543. def __contains__(self, key):
  1544. cdef bcf_hdr_t *hdr = self.header.ptr
  1545. cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
  1546. cdef bytes bkey = force_bytes(key)
  1547. cdef khiter_t k = kh_get_vdict(d, bkey)
  1548. return k != kh_end(d)
  1549. # Mappings are not hashable by default, but subclasses can change this
  1550. __hash__ = None
  1551. #TODO: implement __richcmp__
  1552. def add(self, name):
  1553. """Add a new sample"""
  1554. self.header.add_sample(name)
  1555. cdef VariantHeaderSamples makeVariantHeaderSamples(VariantHeader header):
  1556. if not header:
  1557. raise ValueError('invalid VariantHeader')
  1558. cdef VariantHeaderSamples samples = VariantHeaderSamples.__new__(VariantHeaderSamples)
  1559. samples.header = header
  1560. return samples
  1561. cdef class VariantHeader(object):
  1562. """header information for a :class:`VariantFile` object"""
  1563. #FIXME: Add structured proxy
  1564. #FIXME: Add generic proxy
  1565. #FIXME: Add mutable methods
  1566. # See makeVariantHeader for C constructor
  1567. def __cinit__(self):
  1568. self.ptr = NULL
  1569. # Python constructor
  1570. def __init__(self):
  1571. self.ptr = bcf_hdr_init(b'w')
  1572. if not self.ptr:
  1573. raise ValueError('cannot create VariantHeader')
  1574. def __dealloc__(self):
  1575. if self.ptr:
  1576. bcf_hdr_destroy(self.ptr)
  1577. self.ptr = NULL
  1578. def __bool__(self):
  1579. return self.ptr != NULL
  1580. def copy(self):
  1581. return makeVariantHeader(bcf_hdr_dup(self.ptr))
  1582. def merge(self, VariantHeader header):
  1583. if header is None:
  1584. raise ValueError('header must not be None')
  1585. bcf_hdr_merge(self.ptr, header.ptr)
  1586. @property
  1587. def version(self):
  1588. """VCF version"""
  1589. return force_str(bcf_hdr_get_version(self.ptr))
  1590. @property
  1591. def samples(self):
  1592. """samples (:class:`VariantHeaderSamples`)"""
  1593. return makeVariantHeaderSamples(self)
  1594. @property
  1595. def records(self):
  1596. """header records (:class:`VariantHeaderRecords`)"""
  1597. return makeVariantHeaderRecords(self)
  1598. @property
  1599. def contigs(self):
  1600. """contig information (:class:`VariantHeaderContigs`)"""
  1601. return makeVariantHeaderContigs(self)
  1602. @property
  1603. def filters(self):
  1604. """filter metadata (:class:`VariantHeaderMetadata`)"""
  1605. return makeVariantHeaderMetadata(self, BCF_HL_FLT)
  1606. @property
  1607. def info(self):
  1608. """info metadata (:class:`VariantHeaderMetadata`)"""
  1609. return makeVariantHeaderMetadata(self, BCF_HL_INFO)
  1610. @property
  1611. def formats(self):
  1612. """format metadata (:class:`VariantHeaderMetadata`)"""
  1613. return makeVariantHeaderMetadata(self, BCF_HL_FMT)
  1614. @property
  1615. def alts(self):
  1616. """alt metadata (:class:`dict` ID->record).
  1617. The data returned just a snapshot of alt records, is created
  1618. every time the property is requested, and modifications will
  1619. not be reflected in the header metadata and vice versa.
  1620. i.e. it is just a dict that reflects the state of alt records
  1621. at the time it is created.
  1622. """
  1623. return {record['ID']:record for record in self.records
  1624. if record.key.upper() == 'ALT' }
  1625. # only safe to do when opening an htsfile
  1626. cdef _subset_samples(self, include_samples):
  1627. keep_samples = set(self.samples)
  1628. include_samples = set(include_samples)
  1629. missing_samples = include_samples - keep_samples
  1630. keep_samples &= include_samples
  1631. if missing_samples:
  1632. # FIXME: add specialized exception with payload
  1633. raise ValueError(
  1634. 'missing {:d} requested samples'.format(
  1635. len(missing_samples)))
  1636. keep_samples = force_bytes(','.join(keep_samples))
  1637. cdef char *keep = <char *>keep_samples if keep_samples else NULL
  1638. cdef ret = bcf_hdr_set_samples(self.ptr, keep, 0)
  1639. if ret != 0:
  1640. raise ValueError(
  1641. 'bcf_hdr_set_samples failed: ret = {}'.format(ret))
  1642. def __str__(self):
  1643. cdef int hlen
  1644. cdef kstring_t line
  1645. line.l = line.m = 0
  1646. line.s = NULL
  1647. if bcf_hdr_format(self.ptr, 0, &line) < 0:
  1648. if line.m:
  1649. free(line.s)
  1650. raise ValueError('bcf_hdr_format failed')
  1651. ret = charptr_to_str_w_len(line.s, line.l)
  1652. if line.m:
  1653. free(line.s)
  1654. return ret
  1655. def new_record(self, contig=None, start=0, stop=0, alleles=None,
  1656. id=None, qual=None, filter=None, info=None, samples=None,
  1657. **kwargs):
  1658. """Create a new empty VariantRecord.
  1659. Arguments are currently experimental. Use with caution and expect
  1660. changes in upcoming releases.
  1661. """
  1662. rec = makeVariantRecord(self, bcf_init())
  1663. if not rec:
  1664. raise MemoryError('unable to allocate BCF record')
  1665. rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
  1666. if contig is not None:
  1667. rec.contig = contig
  1668. rec.start = start
  1669. rec.stop = stop
  1670. rec.id = id
  1671. rec.qual = qual
  1672. if alleles is not None:
  1673. rec.alleles = alleles
  1674. if filter is not None:
  1675. if isinstance(filter, (list, tuple, VariantRecordFilter)):
  1676. for f in filter:
  1677. rec.filter.add(f)
  1678. else:
  1679. rec.filter.add(filter)
  1680. if info:
  1681. rec.info.update(info)
  1682. if kwargs:
  1683. rec.samples[0].update(kwargs)
  1684. if samples:
  1685. for i, sample in enumerate(samples):
  1686. rec.samples[i].update(sample)
  1687. return rec
  1688. def add_record(self, VariantHeaderRecord record):
  1689. """Add an existing :class:`VariantHeaderRecord` to this header"""
  1690. if record is None:
  1691. raise ValueError('record must not be None')
  1692. cdef bcf_hrec_t *hrec = bcf_hrec_dup(record.ptr)
  1693. bcf_hdr_add_hrec(self.ptr, hrec)
  1694. self._hdr_sync()
  1695. def add_line(self, line):
  1696. """Add a metadata line to this header"""
  1697. bline = force_bytes(line)
  1698. if bcf_hdr_append(self.ptr, bline) < 0:
  1699. raise ValueError('invalid header line')
  1700. self._hdr_sync()
  1701. def add_meta(self, key, value=None, items=None):
  1702. """Add metadata to this header"""
  1703. if not ((value is not None) ^ (items is not None)):
  1704. raise ValueError('either value or items must be specified')
  1705. cdef bcf_hrec_t *hrec = <bcf_hrec_t*>calloc(1, sizeof(bcf_hrec_t))
  1706. cdef int quoted
  1707. try:
  1708. key = force_bytes(key)
  1709. hrec.key = strdup(key)
  1710. if value is not None:
  1711. hrec.value = strdup(force_bytes(value))
  1712. else:
  1713. for key, value in items:
  1714. quoted = not isinstance(value, unquoted_str) and key not in ("ID", "Number", "Type")
  1715. key = force_bytes(key)
  1716. if bcf_hrec_add_key(hrec, key, <int>len(key)) < 0:
  1717. raise MemoryError("Could not allocate VCF header record")
  1718. value = force_bytes(str(value))
  1719. if bcf_hrec_set_val(hrec, hrec.nkeys-1, value, <int>len(value), quoted) < 0:
  1720. raise MemoryError("Could not allocate VCF header record")
  1721. except:
  1722. bcf_hrec_destroy(hrec)
  1723. raise
  1724. bcf_hdr_add_hrec(self.ptr, hrec)
  1725. self._hdr_sync()
  1726. cdef _add_sample(self, name):
  1727. bname = force_bytes(name)
  1728. if bcf_hdr_add_sample(self.ptr, bname) < 0:
  1729. raise ValueError('Duplicated sample name: {}'.format(name))
  1730. cdef _hdr_sync(self):
  1731. cdef bcf_hdr_t *hdr = self.ptr
  1732. if hdr.dirty:
  1733. if bcf_hdr_sync(hdr) < 0:
  1734. raise MemoryError('unable to reallocate VariantHeader')
  1735. def add_sample(self, name):
  1736. """Add a new sample to this header"""
  1737. self._add_sample(name)
  1738. self._hdr_sync()
  1739. def add_samples(self, *args):
  1740. """Add several new samples to this header.
  1741. This function takes multiple arguments, each of which may
  1742. be either a sample name or an iterable returning sample names
  1743. (e.g., a list of sample names).
  1744. """
  1745. for arg in args:
  1746. if isinstance(arg, str):
  1747. self._add_sample(arg)
  1748. else:
  1749. for name in arg:
  1750. self._add_sample(name)
  1751. self._hdr_sync()
  1752. cdef VariantHeader makeVariantHeader(bcf_hdr_t *hdr):
  1753. if not hdr:
  1754. raise ValueError('cannot create VariantHeader')
  1755. cdef VariantHeader header = VariantHeader.__new__(VariantHeader)
  1756. header.ptr = hdr
  1757. return header
  1758. cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
  1759. cdef vdict_t *d
  1760. cdef khiter_t k
  1761. cdef int info_id
  1762. if isinstance(key, str):
  1763. key = force_bytes(key)
  1764. d = <vdict_t *>hdr.dict[BCF_DT_ID]
  1765. k = kh_get_vdict(d, key)
  1766. if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
  1767. return -1
  1768. return kh_val_vdict(d, k).id
  1769. ########################################################################
  1770. ########################################################################
  1771. ## Variant Record objects
  1772. ########################################################################
  1773. cdef class VariantRecordFilter(object):
  1774. """Filters set on a :class:`VariantRecord` object, presented as a mapping from
  1775. filter index or name to :class:`VariantMetadata` object"""
  1776. def __init__(self, *args, **kwargs):
  1777. raise TypeError('this class cannot be instantiated from Python')
  1778. def __len__(self):
  1779. return self.record.ptr.d.n_flt
  1780. def __bool__(self):
  1781. return self.record.ptr.d.n_flt != 0
  1782. def __getitem__(self, key):
  1783. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1784. cdef bcf1_t *r = self.record.ptr
  1785. cdef int index, id
  1786. cdef int n = r.d.n_flt
  1787. if isinstance(key, int):
  1788. index = key
  1789. if index < 0 or index >= n:
  1790. raise IndexError('invalid filter index')
  1791. id = r.d.flt[index]
  1792. else:
  1793. if key == '.':
  1794. key = 'PASS'
  1795. bkey = force_bytes(key)
  1796. id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
  1797. if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
  1798. raise KeyError('Invalid filter: {}'.format(key))
  1799. return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
  1800. def add(self, key):
  1801. """Add a new filter"""
  1802. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1803. cdef bcf1_t *r = self.record.ptr
  1804. cdef int id
  1805. if key == '.':
  1806. key = 'PASS'
  1807. cdef bytes bkey = force_bytes(key)
  1808. id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
  1809. if not check_header_id(hdr, BCF_HL_FLT, id):
  1810. raise KeyError('Invalid filter: {}'.format(key))
  1811. bcf_add_filter(hdr, r, id)
  1812. def __delitem__(self, key):
  1813. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1814. cdef bcf1_t *r = self.record.ptr
  1815. cdef int index, id
  1816. cdef int n = r.d.n_flt
  1817. if isinstance(key, int):
  1818. index = key
  1819. if index < 0 or index >= n:
  1820. raise IndexError('invalid filter index')
  1821. id = r.d.flt[index]
  1822. else:
  1823. if key == '.':
  1824. key = 'PASS'
  1825. bkey = force_bytes(key)
  1826. id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
  1827. if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
  1828. raise KeyError('Invalid filter: {}'.format(key))
  1829. bcf_remove_filter(hdr, r, id, 0)
  1830. def clear(self):
  1831. """Clear all filters"""
  1832. cdef bcf1_t *r = self.record.ptr
  1833. r.d.shared_dirty |= BCF1_DIRTY_FLT
  1834. r.d.n_flt = 0
  1835. def __iter__(self):
  1836. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1837. cdef bcf1_t *r = self.record.ptr
  1838. cdef int i
  1839. for i in range(r.d.n_flt):
  1840. yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, r.d.flt[i]))
  1841. def get(self, key, default=None):
  1842. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  1843. try:
  1844. return self[key]
  1845. except KeyError:
  1846. return default
  1847. def __contains__(self, key):
  1848. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1849. cdef bcf1_t *r = self.record.ptr
  1850. cdef bytes bkey = force_bytes(key)
  1851. return bcf_has_filter(hdr, r, bkey) == 1
  1852. def iterkeys(self):
  1853. """D.iterkeys() -> an iterator over the keys of D"""
  1854. return iter(self)
  1855. def itervalues(self):
  1856. """D.itervalues() -> an iterator over the values of D"""
  1857. for key in self:
  1858. yield self[key]
  1859. def iteritems(self):
  1860. """D.iteritems() -> an iterator over the (key, value) items of D"""
  1861. for key in self:
  1862. yield (key, self[key])
  1863. def keys(self):
  1864. """D.keys() -> list of D's keys"""
  1865. return list(self)
  1866. def items(self):
  1867. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  1868. return list(self.iteritems())
  1869. def values(self):
  1870. """D.values() -> list of D's values"""
  1871. return list(self.itervalues())
  1872. def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
  1873. if op != 2 and op != 3:
  1874. return NotImplemented
  1875. cdef bcf1_t *s = self.record.ptr
  1876. cdef bcf1_t *o = other.record.ptr
  1877. cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
  1878. if op == 3:
  1879. cmp = not cmp
  1880. return cmp
  1881. # Mappings are not hashable by default, but subclasses can change this
  1882. __hash__ = None
  1883. #TODO: implement __richcmp__
  1884. cdef VariantRecordFilter makeVariantRecordFilter(VariantRecord record):
  1885. if not record:
  1886. raise ValueError('invalid VariantRecord')
  1887. cdef VariantRecordFilter filter = VariantRecordFilter.__new__(VariantRecordFilter)
  1888. filter.record = record
  1889. return filter
  1890. cdef class VariantRecordFormat(object):
  1891. """Format data present for each sample in a :class:`VariantRecord` object,
  1892. presented as mapping from format name to :class:`VariantMetadata` object."""
  1893. def __init__(self, *args, **kwargs):
  1894. raise TypeError('this class cannot be instantiated from Python')
  1895. def __len__(self):
  1896. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1897. cdef bcf1_t *r = self.record.ptr
  1898. cdef int i, n = 0
  1899. for i in range(r.n_fmt):
  1900. if r.d.fmt[i].p:
  1901. n += 1
  1902. return n
  1903. def __bool__(self):
  1904. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1905. cdef bcf1_t *r = self.record.ptr
  1906. cdef int i
  1907. for i in range(r.n_fmt):
  1908. if r.d.fmt[i].p:
  1909. return True
  1910. return False
  1911. def __getitem__(self, key):
  1912. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1913. cdef bcf1_t *r = self.record.ptr
  1914. cdef bytes bkey = force_bytes(key)
  1915. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  1916. if not fmt or not fmt.p:
  1917. raise KeyError('unknown format: {}'.format(key))
  1918. return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
  1919. def __delitem__(self, key):
  1920. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1921. cdef bcf1_t *r = self.record.ptr
  1922. cdef bytes bkey = force_bytes(key)
  1923. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  1924. if not fmt or not fmt.p:
  1925. raise KeyError('unknown format: {}'.format(key))
  1926. if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
  1927. raise ValueError('Unable to delete FORMAT')
  1928. def clear(self):
  1929. """Clear all formats for all samples within the associated
  1930. :class:`VariantRecord` instance"""
  1931. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1932. cdef bcf1_t *r = self.record.ptr
  1933. cdef bcf_fmt_t *fmt
  1934. cdef const char *key
  1935. cdef int i
  1936. for i in reversed(range(r.n_fmt)):
  1937. fmt = &r.d.fmt[i]
  1938. if fmt.p:
  1939. key = bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id)
  1940. if bcf_update_format(hdr, r, key, fmt.p, 0, fmt.type) < 0:
  1941. raise ValueError('Unable to delete FORMAT')
  1942. def __iter__(self):
  1943. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1944. cdef bcf1_t *r = self.record.ptr
  1945. cdef bcf_fmt_t *fmt
  1946. cdef int i
  1947. for i in range(r.n_fmt):
  1948. fmt = &r.d.fmt[i]
  1949. if fmt.p:
  1950. yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
  1951. def get(self, key, default=None):
  1952. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  1953. try:
  1954. return self[key]
  1955. except KeyError:
  1956. return default
  1957. def __contains__(self, key):
  1958. cdef bcf_hdr_t *hdr = self.record.header.ptr
  1959. cdef bcf1_t *r = self.record.ptr
  1960. cdef bytes bkey = force_bytes(key)
  1961. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  1962. return fmt != NULL and fmt.p != NULL
  1963. def iterkeys(self):
  1964. """D.iterkeys() -> an iterator over the keys of D"""
  1965. return iter(self)
  1966. def itervalues(self):
  1967. """D.itervalues() -> an iterator over the values of D"""
  1968. for key in self:
  1969. yield self[key]
  1970. def iteritems(self):
  1971. """D.iteritems() -> an iterator over the (key, value) items of D"""
  1972. for key in self:
  1973. yield (key, self[key])
  1974. def keys(self):
  1975. """D.keys() -> list of D's keys"""
  1976. return list(self)
  1977. def items(self):
  1978. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  1979. return list(self.iteritems())
  1980. def values(self):
  1981. """D.values() -> list of D's values"""
  1982. return list(self.itervalues())
  1983. # Mappings are not hashable by default, but subclasses can change this
  1984. __hash__ = None
  1985. #TODO: implement __richcmp__
  1986. cdef VariantRecordFormat makeVariantRecordFormat(VariantRecord record):
  1987. if not record:
  1988. raise ValueError('invalid VariantRecord')
  1989. cdef VariantRecordFormat format = VariantRecordFormat.__new__(VariantRecordFormat)
  1990. format.record = record
  1991. return format
  1992. #TODO: Add a getmeta method to return the corresponding VariantMetadata?
  1993. cdef class VariantRecordInfo(object):
  1994. """Info data stored in a :class:`VariantRecord` object, presented as a
  1995. mapping from info metadata name to value."""
  1996. def __init__(self, *args, **kwargs):
  1997. raise TypeError('this class cannot be instantiated from Python')
  1998. def __len__(self):
  1999. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2000. cdef bcf1_t *r = self.record.ptr
  2001. cdef bcf_info_t *info
  2002. cdef const char *key
  2003. cdef int i, count = 0
  2004. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2005. raise ValueError('Error unpacking VariantRecord')
  2006. for i in range(r.n_info):
  2007. info = &r.d.info[i]
  2008. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2009. if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
  2010. count += 1
  2011. return count
  2012. def __bool__(self):
  2013. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2014. cdef bcf1_t *r = self.record.ptr
  2015. cdef bcf_info_t *info
  2016. cdef const char *key
  2017. cdef int i
  2018. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2019. raise ValueError('Error unpacking VariantRecord')
  2020. for i in range(r.n_info):
  2021. info = &r.d.info[i]
  2022. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2023. if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
  2024. return True
  2025. return False
  2026. def __getitem__(self, key):
  2027. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2028. cdef bcf1_t *r = self.record.ptr
  2029. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2030. raise ValueError('Error unpacking VariantRecord')
  2031. cdef bytes bkey = force_bytes(key)
  2032. if strcmp(bkey, b'END') == 0:
  2033. raise KeyError('END is a reserved attribute; access is via record.stop')
  2034. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  2035. # Cannot stop here if info == NULL, since flags must return False
  2036. cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
  2037. if info_id < 0:
  2038. raise KeyError('Unknown INFO field: {}'.format(key))
  2039. if not check_header_id(hdr, BCF_HL_INFO, info_id):
  2040. raise ValueError('Invalid header')
  2041. # Handle type=Flag values
  2042. if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
  2043. return info != NULL and info.vptr != NULL
  2044. if not info or not info.vptr:
  2045. raise KeyError('Invalid INFO field: {}'.format(key))
  2046. return bcf_info_get_value(self.record, info)
  2047. def __setitem__(self, key, value):
  2048. cdef bytes bkey = force_bytes(key)
  2049. if strcmp(bkey, b'END') == 0:
  2050. raise KeyError('END is a reserved attribute; access is via record.stop')
  2051. if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
  2052. raise ValueError('Error unpacking VariantRecord')
  2053. bcf_info_set_value(self.record, key, value)
  2054. def __delitem__(self, key):
  2055. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2056. cdef bcf1_t *r = self.record.ptr
  2057. cdef bytes bkey = force_bytes(key)
  2058. if strcmp(bkey, b'END') == 0:
  2059. raise KeyError('END is a reserved attribute; access is via record.stop')
  2060. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2061. raise ValueError('Error unpacking VariantRecord')
  2062. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  2063. # Cannot stop here if info == NULL, since flags must return False
  2064. cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
  2065. if info_id < 0:
  2066. raise KeyError('Unknown INFO field: {}'.format(key))
  2067. if not check_header_id(hdr, BCF_HL_INFO, info_id):
  2068. raise ValueError('Invalid header')
  2069. # Handle flags
  2070. if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
  2071. return
  2072. if not info or not info.vptr:
  2073. raise KeyError('Unknown INFO field: {}'.format(key))
  2074. if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
  2075. raise ValueError('Unable to delete INFO')
  2076. def clear(self):
  2077. """Clear all info data"""
  2078. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2079. cdef bcf1_t *r = self.record.ptr
  2080. cdef bcf_info_t *info
  2081. cdef const char *key
  2082. cdef int i
  2083. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2084. raise ValueError('Error unpacking VariantRecord')
  2085. for i in range(r.n_info):
  2086. info = &r.d.info[i]
  2087. if info and info.vptr:
  2088. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2089. if strcmp(key, b'END') == 0:
  2090. continue
  2091. if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
  2092. raise ValueError('Unable to delete INFO')
  2093. def __iter__(self):
  2094. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2095. cdef bcf1_t *r = self.record.ptr
  2096. cdef bcf_info_t *info
  2097. cdef const char *key
  2098. cdef int i
  2099. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2100. raise ValueError('Error unpacking VariantRecord')
  2101. for i in range(r.n_info):
  2102. info = &r.d.info[i]
  2103. if info and info.vptr:
  2104. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2105. if strcmp(key, b'END') != 0:
  2106. yield bcf_str_cache_get_charptr(key)
  2107. def get(self, key, default=None):
  2108. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  2109. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2110. cdef bcf1_t *r = self.record.ptr
  2111. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2112. raise ValueError('Error unpacking VariantRecord')
  2113. cdef bytes bkey = force_bytes(key)
  2114. if strcmp(bkey, b'END') == 0:
  2115. return default
  2116. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  2117. # Cannot stop here if info == NULL, since flags must return False
  2118. cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
  2119. if not check_header_id(hdr, BCF_HL_INFO, info_id):
  2120. raise ValueError('Invalid header')
  2121. # Handle flags
  2122. if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
  2123. return info != NULL and info.vptr != NULL
  2124. if not info or not info.vptr:
  2125. return default
  2126. return bcf_info_get_value(self.record, info)
  2127. def __contains__(self, key):
  2128. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2129. cdef bcf1_t *r = self.record.ptr
  2130. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2131. raise ValueError('Error unpacking VariantRecord')
  2132. cdef bytes bkey = force_bytes(key)
  2133. if strcmp(bkey, b'END') == 0:
  2134. return False
  2135. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  2136. return info != NULL and info.vptr != NULL
  2137. def iterkeys(self):
  2138. """D.iterkeys() -> an iterator over the keys of D"""
  2139. return iter(self)
  2140. def itervalues(self):
  2141. """D.itervalues() -> an iterator over the values of D"""
  2142. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2143. cdef bcf1_t *r = self.record.ptr
  2144. cdef bcf_info_t *info
  2145. cdef const char *key
  2146. cdef int i
  2147. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2148. raise ValueError('Error unpacking VariantRecord')
  2149. for i in range(r.n_info):
  2150. info = &r.d.info[i]
  2151. if info and info.vptr:
  2152. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2153. if strcmp(key, b'END') != 0:
  2154. yield bcf_info_get_value(self.record, info)
  2155. def iteritems(self):
  2156. """D.iteritems() -> an iterator over the (key, value) items of D"""
  2157. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2158. cdef bcf1_t *r = self.record.ptr
  2159. cdef bcf_info_t *info
  2160. cdef const char *key
  2161. cdef int i
  2162. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2163. raise ValueError('Error unpacking VariantRecord')
  2164. for i in range(r.n_info):
  2165. info = &r.d.info[i]
  2166. if info and info.vptr:
  2167. key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
  2168. if strcmp(key, b'END') != 0:
  2169. value = bcf_info_get_value(self.record, info)
  2170. yield bcf_str_cache_get_charptr(key), value
  2171. def keys(self):
  2172. """D.keys() -> list of D's keys"""
  2173. return list(self)
  2174. def items(self):
  2175. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  2176. return list(self.iteritems())
  2177. def values(self):
  2178. """D.values() -> list of D's values"""
  2179. return list(self.itervalues())
  2180. def update(self, items=None, **kwargs):
  2181. """D.update([E, ]**F) -> None.
  2182. Update D from dict/iterable E and F.
  2183. """
  2184. for k, v in items.items():
  2185. if k != 'END':
  2186. self[k] = v
  2187. if kwargs:
  2188. kwargs.pop('END', None)
  2189. for k, v in kwargs.items():
  2190. self[k] = v
  2191. def pop(self, key, default=_nothing):
  2192. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2193. cdef bcf1_t *r = self.record.ptr
  2194. if bcf_unpack(r, BCF_UN_INFO) < 0:
  2195. raise ValueError('Error unpacking VariantRecord')
  2196. cdef bytes bkey = force_bytes(key)
  2197. cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
  2198. # Cannot stop here if info == NULL, since flags must return False
  2199. cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
  2200. if info_id < 0:
  2201. if default is _nothing:
  2202. raise KeyError('Unknown INFO field: {}'.format(key))
  2203. return default
  2204. if not check_header_id(hdr, BCF_HL_INFO, info_id):
  2205. raise ValueError('Invalid header')
  2206. # Handle flags
  2207. if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
  2208. return
  2209. if not info or not info.vptr:
  2210. if default is _nothing:
  2211. raise KeyError('Unknown INFO field: {}'.format(key))
  2212. return default
  2213. value = bcf_info_get_value(self.record, info)
  2214. if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
  2215. raise ValueError('Unable to delete INFO')
  2216. return value
  2217. def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
  2218. if op != 2 and op != 3:
  2219. return NotImplemented
  2220. cdef bcf1_t *s = self.record.ptr
  2221. cdef bcf1_t *o = other.record.ptr
  2222. # Cannot use n_info as shortcut logic, since null values may remain
  2223. cdef bint cmp = dict(self) == dict(other)
  2224. if op == 3:
  2225. cmp = not cmp
  2226. return cmp
  2227. # Mappings are not hashable by default, but subclasses can change this
  2228. __hash__ = None
  2229. cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
  2230. if not record:
  2231. raise ValueError('invalid VariantRecord')
  2232. cdef VariantRecordInfo info = VariantRecordInfo.__new__(VariantRecordInfo)
  2233. info.record = record
  2234. return info
  2235. cdef class VariantRecordSamples(object):
  2236. """mapping from sample index or name to :class:`VariantRecordSample` object."""
  2237. def __init__(self, *args, **kwargs):
  2238. raise TypeError('this class cannot be instantiated from Python')
  2239. def __len__(self):
  2240. return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
  2241. def __bool__(self):
  2242. return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
  2243. def __getitem__(self, key):
  2244. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2245. cdef bcf1_t *r = self.record.ptr
  2246. cdef int n = self.record.ptr.n_sample
  2247. cdef int sample_index
  2248. cdef vdict_t *d
  2249. cdef khiter_t k
  2250. if isinstance(key, int):
  2251. sample_index = key
  2252. else:
  2253. bkey = force_bytes(key)
  2254. sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
  2255. if sample_index < 0:
  2256. raise KeyError('invalid sample name: {}'.format(key))
  2257. if sample_index < 0 or sample_index >= n:
  2258. raise IndexError('invalid sample index')
  2259. return makeVariantRecordSample(self.record, sample_index)
  2260. def __iter__(self):
  2261. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2262. cdef bcf1_t *r = self.record.ptr
  2263. cdef int32_t i, n = self.record.ptr.n_sample
  2264. for i in range(n):
  2265. yield charptr_to_str(hdr.samples[i])
  2266. def get(self, key, default=None):
  2267. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  2268. try:
  2269. return self[key]
  2270. except KeyError:
  2271. return default
  2272. def __contains__(self, key):
  2273. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2274. cdef bcf1_t *r = self.record.ptr
  2275. cdef int n = self.record.ptr.n_sample
  2276. cdef int sample_index
  2277. cdef vdict_t *d
  2278. cdef khiter_t k
  2279. if isinstance(key, int):
  2280. sample_index = key
  2281. else:
  2282. bkey = force_bytes(key)
  2283. sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
  2284. if sample_index < 0:
  2285. raise KeyError('invalid sample name: {}'.format(key))
  2286. return 0 <= sample_index < n
  2287. def iterkeys(self):
  2288. """D.iterkeys() -> an iterator over the keys of D"""
  2289. return iter(self)
  2290. def itervalues(self):
  2291. """D.itervalues() -> an iterator over the values of D"""
  2292. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2293. cdef bcf1_t *r = self.record.ptr
  2294. cdef int32_t i, n = self.record.ptr.n_sample
  2295. for i in range(n):
  2296. yield makeVariantRecordSample(self.record, i)
  2297. def iteritems(self):
  2298. """D.iteritems() -> an iterator over the (key, value) items of D"""
  2299. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2300. cdef bcf1_t *r = self.record.ptr
  2301. cdef int32_t i, n = self.record.ptr.n_sample
  2302. for i in range(n):
  2303. yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
  2304. def keys(self):
  2305. """D.keys() -> list of D's keys"""
  2306. return list(self)
  2307. def items(self):
  2308. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  2309. return list(self.iteritems())
  2310. def values(self):
  2311. """D.values() -> list of D's values"""
  2312. return list(self.itervalues())
  2313. def update(self, items=None, **kwargs):
  2314. """D.update([E, ]**F) -> None.
  2315. Update D from dict/iterable E and F.
  2316. """
  2317. for k, v in items.items():
  2318. self[k] = v
  2319. if kwargs:
  2320. for k, v in kwargs.items():
  2321. self[k] = v
  2322. def pop(self, key, default=_nothing):
  2323. try:
  2324. value = self[key]
  2325. del self[key]
  2326. return value
  2327. except KeyError:
  2328. if default is not _nothing:
  2329. return default
  2330. raise
  2331. def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
  2332. if op != 2 and op != 3:
  2333. return NotImplemented
  2334. cdef bcf1_t *s = self.record.ptr
  2335. cdef bcf1_t *o = other.record.ptr
  2336. cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
  2337. if op == 3:
  2338. cmp = not cmp
  2339. return cmp
  2340. # Mappings are not hashable by default, but subclasses can change this
  2341. __hash__ = None
  2342. cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
  2343. if not record:
  2344. raise ValueError('invalid VariantRecord')
  2345. cdef VariantRecordSamples samples = VariantRecordSamples.__new__(
  2346. VariantRecordSamples)
  2347. samples.record = record
  2348. return samples
  2349. cdef class VariantRecord(object):
  2350. """Variant record"""
  2351. def __init__(self, *args, **kwargs):
  2352. raise TypeError('this class cannot be instantiated from Python')
  2353. def __dealloc__(self):
  2354. if self.ptr:
  2355. bcf_destroy1(self.ptr)
  2356. self.ptr = NULL
  2357. def copy(self):
  2358. """return a copy of this VariantRecord object"""
  2359. return makeVariantRecord(self.header, bcf_dup(self.ptr))
  2360. def translate(self, VariantHeader dst_header):
  2361. if dst_header is None:
  2362. raise ValueError('dst_header must not be None')
  2363. cdef bcf_hdr_t *src_hdr = self.header.ptr
  2364. cdef bcf_hdr_t *dst_hdr = dst_header.ptr
  2365. if src_hdr != dst_hdr:
  2366. if self.ptr.n_sample != bcf_hdr_nsamples(dst_hdr):
  2367. msg = 'Cannot translate record. Number of samples does not match header ({} vs {})'
  2368. raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
  2369. bcf_translate(dst_hdr, src_hdr, self.ptr)
  2370. self.header = dst_header
  2371. @property
  2372. def rid(self):
  2373. """internal reference id number"""
  2374. return self.ptr.rid
  2375. @rid.setter
  2376. def rid(self, value):
  2377. cdef bcf_hdr_t *hdr = self.header.ptr
  2378. cdef int r = value
  2379. if r < 0 or r >= hdr.n[BCF_DT_CTG] or not hdr.id[BCF_DT_CTG][r].val:
  2380. raise ValueError('invalid reference id')
  2381. self.ptr.rid = r
  2382. @property
  2383. def chrom(self):
  2384. """chromosome/contig name"""
  2385. cdef bcf_hdr_t *hdr = self.header.ptr
  2386. cdef int rid = self.ptr.rid
  2387. if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
  2388. raise ValueError('Invalid header')
  2389. return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
  2390. @chrom.setter
  2391. def chrom(self, value):
  2392. cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
  2393. bchrom = force_bytes(value)
  2394. cdef khint_t k = kh_get_vdict(d, bchrom)
  2395. if k == kh_end(d):
  2396. raise ValueError('Invalid chromosome/contig')
  2397. self.ptr.rid = kh_val_vdict(d, k).id
  2398. @property
  2399. def contig(self):
  2400. """chromosome/contig name"""
  2401. cdef bcf_hdr_t *hdr = self.header.ptr
  2402. cdef int rid = self.ptr.rid
  2403. if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
  2404. raise ValueError('Invalid header')
  2405. return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
  2406. @contig.setter
  2407. def contig(self, value):
  2408. cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
  2409. bchrom = force_bytes(value)
  2410. cdef khint_t k = kh_get_vdict(d, bchrom)
  2411. if k == kh_end(d):
  2412. raise ValueError('Invalid chromosome/contig')
  2413. self.ptr.rid = kh_val_vdict(d, k).id
  2414. @property
  2415. def pos(self):
  2416. """record start position on chrom/contig (1-based inclusive)"""
  2417. return self.ptr.pos + 1
  2418. @pos.setter
  2419. def pos(self, value):
  2420. cdef int p = value
  2421. if p < 1:
  2422. raise ValueError('Position must be positive')
  2423. self.ptr.pos = p - 1
  2424. bcf_sync_end(self)
  2425. @property
  2426. def start(self):
  2427. """record start position on chrom/contig (0-based inclusive)"""
  2428. return self.ptr.pos
  2429. @start.setter
  2430. def start(self, value):
  2431. cdef int s = value
  2432. if s < 0:
  2433. raise ValueError('Start coordinate must be non-negative')
  2434. self.ptr.pos = s
  2435. bcf_sync_end(self)
  2436. @property
  2437. def stop(self):
  2438. """record stop position on chrom/contig (0-based exclusive)"""
  2439. return self.ptr.pos + self.ptr.rlen
  2440. @stop.setter
  2441. def stop(self, value):
  2442. cdef int s = value
  2443. if s < 0:
  2444. raise ValueError('Stop coordinate must be non-negative')
  2445. self.ptr.rlen = s - self.ptr.pos
  2446. bcf_sync_end(self)
  2447. @property
  2448. def rlen(self):
  2449. """record length on chrom/contig (aka rec.stop - rec.start)"""
  2450. return self.ptr.rlen
  2451. @rlen.setter
  2452. def rlen(self, value):
  2453. cdef int r = value
  2454. self.ptr.rlen = r
  2455. bcf_sync_end(self)
  2456. @property
  2457. def qual(self):
  2458. """phred scaled quality score or None if not available"""
  2459. return self.ptr.qual if not bcf_float_is_missing(self.ptr.qual) else None
  2460. @qual.setter
  2461. def qual(self, value):
  2462. if value is not None:
  2463. self.ptr.qual = value
  2464. else:
  2465. bcf_float_set(&self.ptr.qual, bcf_float_missing)
  2466. # @property
  2467. # def n_allele(self):
  2468. # return self.ptr.n_allele
  2469. # @property
  2470. # def n_sample(self):
  2471. # return self.ptr.n_sample
  2472. @property
  2473. def id(self):
  2474. """record identifier or None if not available"""
  2475. cdef bcf1_t *r = self.ptr
  2476. if bcf_unpack(r, BCF_UN_STR) < 0:
  2477. raise ValueError('Error unpacking VariantRecord')
  2478. # causes a memory leak https://github.com/pysam-developers/pysam/issues/773
  2479. # return bcf_str_cache_get_charptr(r.d.id) if r.d.id != b'.' else None
  2480. if (r.d.m_id == 0):
  2481. raise ValueError('Error extracting ID')
  2482. return charptr_to_str(r.d.id) if r.d.id != b'.' else None
  2483. @id.setter
  2484. def id(self, value):
  2485. cdef bcf1_t *r = self.ptr
  2486. if bcf_unpack(r, BCF_UN_STR) < 0:
  2487. raise ValueError('Error unpacking VariantRecord')
  2488. cdef char *idstr = NULL
  2489. if value is not None:
  2490. bid = force_bytes(value)
  2491. idstr = bid
  2492. if bcf_update_id(self.header.ptr, self.ptr, idstr) < 0:
  2493. raise ValueError('Error updating id')
  2494. @property
  2495. def ref(self):
  2496. """reference allele"""
  2497. cdef bcf1_t *r = self.ptr
  2498. if bcf_unpack(r, BCF_UN_STR) < 0:
  2499. raise ValueError('Error unpacking VariantRecord')
  2500. return charptr_to_str(r.d.allele[0]) if r.d.allele else None
  2501. @ref.setter
  2502. def ref(self, value):
  2503. cdef bcf1_t *r = self.ptr
  2504. if bcf_unpack(r, BCF_UN_STR) < 0:
  2505. raise ValueError('Error unpacking VariantRecord')
  2506. #FIXME: Set alleles directly -- this is stupid
  2507. if not value:
  2508. raise ValueError('ref allele must not be null')
  2509. value = force_bytes(value)
  2510. if r.d.allele and r.n_allele:
  2511. alleles = [r.d.allele[i] for i in range(r.n_allele)]
  2512. alleles[0] = value
  2513. else:
  2514. alleles = [value, '<NON_REF>']
  2515. self.alleles = alleles
  2516. bcf_sync_end(self)
  2517. @property
  2518. def alleles(self):
  2519. """tuple of reference allele followed by alt alleles"""
  2520. cdef bcf1_t *r = self.ptr
  2521. if bcf_unpack(r, BCF_UN_STR) < 0:
  2522. raise ValueError('Error unpacking VariantRecord')
  2523. if not r.d.allele:
  2524. return None
  2525. cdef tuple res = PyTuple_New(r.n_allele)
  2526. for i in range(r.n_allele):
  2527. a = charptr_to_str(r.d.allele[i])
  2528. PyTuple_SET_ITEM(res, i, a)
  2529. Py_INCREF(a)
  2530. return res
  2531. @alleles.setter
  2532. def alleles(self, values):
  2533. cdef bcf1_t *r = self.ptr
  2534. # Cache rlen of symbolic alleles before call to bcf_update_alleles_str
  2535. cdef int rlen = r.rlen
  2536. if bcf_unpack(r, BCF_UN_STR) < 0:
  2537. raise ValueError('Error unpacking VariantRecord')
  2538. values = [force_bytes(v) for v in values]
  2539. if len(values) < 2:
  2540. raise ValueError('must set at least 2 alleles')
  2541. if b'' in values:
  2542. raise ValueError('cannot set null allele')
  2543. value = b','.join(values)
  2544. if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
  2545. raise ValueError('Error updating alleles')
  2546. # Reset rlen if alternate allele isn't symbolic, otherwise used cached
  2547. if has_symbolic_allele(self):
  2548. self.ptr.rlen = rlen
  2549. else:
  2550. self.ptr.rlen = len(values[0])
  2551. r.d.var_type = -1
  2552. bcf_sync_end(self)
  2553. @property
  2554. def alts(self):
  2555. """tuple of alt alleles"""
  2556. cdef bcf1_t *r = self.ptr
  2557. if bcf_unpack(r, BCF_UN_STR) < 0:
  2558. raise ValueError('Error unpacking VariantRecord')
  2559. if r.n_allele < 2 or not r.d.allele:
  2560. return None
  2561. cdef tuple res = PyTuple_New(r.n_allele - 1)
  2562. for i in range(1, r.n_allele):
  2563. a = charptr_to_str(r.d.allele[i])
  2564. PyTuple_SET_ITEM(res, i - 1, a)
  2565. Py_INCREF(a)
  2566. return res
  2567. @alts.setter
  2568. def alts(self, value):
  2569. #FIXME: Set alleles directly -- this is stupid
  2570. cdef bcf1_t *r = self.ptr
  2571. if bcf_unpack(r, BCF_UN_STR) < 0:
  2572. raise ValueError('Error unpacking VariantRecord')
  2573. value = [force_bytes(v) for v in value]
  2574. if b'' in value:
  2575. raise ValueError('cannot set null alt allele')
  2576. ref = [r.d.allele[0] if r.d.allele and r.n_allele else b'.']
  2577. self.alleles = ref + value
  2578. r.d.var_type = -1
  2579. @property
  2580. def filter(self):
  2581. """filter information (see :class:`VariantRecordFilter`)"""
  2582. if bcf_unpack(self.ptr, BCF_UN_FLT) < 0:
  2583. raise ValueError('Error unpacking VariantRecord')
  2584. return makeVariantRecordFilter(self)
  2585. @property
  2586. def info(self):
  2587. """info data (see :class:`VariantRecordInfo`)"""
  2588. if bcf_unpack(self.ptr, BCF_UN_INFO) < 0:
  2589. raise ValueError('Error unpacking VariantRecord')
  2590. return makeVariantRecordInfo(self)
  2591. @property
  2592. def format(self):
  2593. """sample format metadata (see :class:`VariantRecordFormat`)"""
  2594. if bcf_unpack(self.ptr, BCF_UN_FMT) < 0:
  2595. raise ValueError('Error unpacking VariantRecord')
  2596. return makeVariantRecordFormat(self)
  2597. @property
  2598. def samples(self):
  2599. """sample data (see :class:`VariantRecordSamples`)"""
  2600. if bcf_unpack(self.ptr, BCF_UN_ALL) < 0:
  2601. raise ValueError('Error unpacking VariantRecord')
  2602. return makeVariantRecordSamples(self)
  2603. property alleles_variant_types:
  2604. def __get__(self):
  2605. cdef bcf1_t *r = self.ptr
  2606. cdef tuple result = PyTuple_New(r.n_allele)
  2607. for i in range(r.n_allele):
  2608. tp = bcf_get_variant_type(r, i)
  2609. if tp == VCF_REF:
  2610. v_type = "REF"
  2611. elif tp == VCF_SNP:
  2612. v_type = "SNP"
  2613. elif tp == VCF_MNP:
  2614. v_type = "MNP"
  2615. elif tp == VCF_INDEL:
  2616. v_type = "INDEL"
  2617. elif tp == VCF_BND:
  2618. v_type = "BND"
  2619. elif tp == VCF_OVERLAP:
  2620. v_type = "OVERLAP"
  2621. else:
  2622. v_type = "OTHER"
  2623. PyTuple_SET_ITEM(result, i, v_type)
  2624. Py_INCREF(v_type)
  2625. return result
  2626. def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
  2627. if op != 2 and op != 3:
  2628. return NotImplemented
  2629. cdef bcf1_t *s = self.ptr
  2630. cdef bcf1_t *o = other.ptr
  2631. cdef bint cmp = self is other or (
  2632. s.pos == o.pos
  2633. and s.rlen == o.rlen
  2634. and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
  2635. or s.qual == o.qual)
  2636. and s.n_sample == o.n_sample
  2637. and s.n_allele == o.n_allele
  2638. and self.contig == other.contig
  2639. and self.alleles == other.alleles
  2640. and self.id == other.id
  2641. and self.info == other.info
  2642. and self.filter == other.filter
  2643. and self.samples == other.samples)
  2644. if op == 3:
  2645. cmp = not cmp
  2646. return cmp
  2647. def __str__(self):
  2648. cdef kstring_t line
  2649. cdef char c
  2650. line.l = line.m = 0
  2651. line.s = NULL
  2652. if vcf_format(self.header.ptr, self.ptr, &line) < 0:
  2653. if line.m:
  2654. free(line.s)
  2655. raise ValueError('vcf_format failed')
  2656. # Strip CR/LF?
  2657. #while line.l:
  2658. # c = line.s[line.l - 1]
  2659. # if c != b'\n' and c != b'\r':
  2660. # break
  2661. # line.l -= 1
  2662. ret = charptr_to_str_w_len(line.s, line.l)
  2663. if line.m:
  2664. free(line.s)
  2665. return ret
  2666. cdef VariantRecord makeVariantRecord(VariantHeader header, bcf1_t *r):
  2667. if not header:
  2668. raise ValueError('invalid VariantHeader')
  2669. if not r:
  2670. raise ValueError('cannot create VariantRecord')
  2671. if r.errcode:
  2672. msg = []
  2673. #if r.errcode & BCF_ERR_CTG_UNDEF:
  2674. # msg.append('undefined contig')
  2675. #if r.errcode & BCF_ERR_TAG_UNDEF:
  2676. # msg.append('undefined tag')
  2677. if r.errcode & BCF_ERR_NCOLS:
  2678. msg.append('invalid number of columns')
  2679. if r.errcode & BCF_ERR_LIMITS:
  2680. msg.append('limits violated')
  2681. if r.errcode & BCF_ERR_CHAR:
  2682. msg.append('invalid character found')
  2683. if r.errcode & BCF_ERR_CTG_INVALID:
  2684. msg.append('invalid contig')
  2685. if r.errcode & BCF_ERR_TAG_INVALID:
  2686. msg.append('invalid tag')
  2687. if msg:
  2688. msg = ', '.join(msg)
  2689. raise ValueError('Error(s) reading record: {}'.format(msg))
  2690. cdef VariantRecord record = VariantRecord.__new__(VariantRecord)
  2691. record.header = header
  2692. record.ptr = r
  2693. return record
  2694. ########################################################################
  2695. ########################################################################
  2696. ## Variant Sampletype object
  2697. ########################################################################
  2698. cdef class VariantRecordSample(object):
  2699. """Data for a single sample from a :class:`VariantRecord` object.
  2700. Provides data accessors for genotypes and a mapping interface
  2701. from format name to values.
  2702. """
  2703. def __init__(self, *args, **kwargs):
  2704. raise TypeError('this class cannot be instantiated from Python')
  2705. @property
  2706. def name(self):
  2707. """sample name"""
  2708. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2709. cdef bcf1_t *r = self.record.ptr
  2710. cdef int32_t n = r.n_sample
  2711. if self.index < 0 or self.index >= n:
  2712. raise ValueError('invalid sample index')
  2713. return charptr_to_str(hdr.samples[self.index])
  2714. @property
  2715. def allele_indices(self):
  2716. """allele indices for called genotype, if present. Otherwise None"""
  2717. return bcf_format_get_allele_indices(self)
  2718. @allele_indices.setter
  2719. def allele_indices(self, value):
  2720. self['GT'] = value
  2721. @allele_indices.deleter
  2722. def allele_indices(self):
  2723. self['GT'] = ()
  2724. @property
  2725. def alleles(self):
  2726. """alleles for called genotype, if present. Otherwise None"""
  2727. return bcf_format_get_alleles(self)
  2728. @alleles.setter
  2729. def alleles(self, value):
  2730. # Sets the genotype, supply a tuple of alleles to set.
  2731. # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
  2732. # The genotype is reset when an empty tuple, None or (None,) is supplied
  2733. if value==(None,) or value==tuple() or value is None:
  2734. self['GT'] = ()
  2735. return
  2736. if any((type(x) == int for x in value)):
  2737. raise ValueError('Use .allele_indices to set integer allele indices')
  2738. # determine and set allele indices:
  2739. try:
  2740. self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
  2741. except ValueError:
  2742. raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
  2743. "First set the .alleles of this record to define the alleles")
  2744. @alleles.deleter
  2745. def alleles(self):
  2746. self['GT'] = ()
  2747. @property
  2748. def phased(self):
  2749. """False if genotype is missing or any allele is unphased. Otherwise True."""
  2750. return bcf_sample_get_phased(self)
  2751. @phased.setter
  2752. def phased(self, value):
  2753. bcf_sample_set_phased(self, value)
  2754. def __len__(self):
  2755. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2756. cdef bcf1_t *r = self.record.ptr
  2757. cdef int i, n = 0
  2758. if bcf_unpack(r, BCF_UN_FMT) < 0:
  2759. raise ValueError('Error unpacking VariantRecord')
  2760. for i in range(r.n_fmt):
  2761. if r.d.fmt[i].p:
  2762. n += 1
  2763. return n
  2764. def __bool__(self):
  2765. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2766. cdef bcf1_t *r = self.record.ptr
  2767. cdef int i
  2768. if bcf_unpack(r, BCF_UN_FMT) < 0:
  2769. raise ValueError('Error unpacking VariantRecord')
  2770. for i in range(r.n_fmt):
  2771. if r.d.fmt[i].p:
  2772. return True
  2773. return False
  2774. def __getitem__(self, key):
  2775. return bcf_format_get_value(self, key)
  2776. def __setitem__(self, key, value):
  2777. bcf_format_set_value(self, key, value)
  2778. def __delitem__(self, key):
  2779. bcf_format_del_value(self, key)
  2780. def clear(self):
  2781. """Clear all format data (including genotype) for this sample"""
  2782. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2783. cdef bcf1_t *r = self.record.ptr
  2784. cdef bcf_fmt_t *fmt
  2785. cdef int i
  2786. for i in range(r.n_fmt):
  2787. fmt = &r.d.fmt[i]
  2788. if fmt.p:
  2789. bcf_format_del_value(self, bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
  2790. def __iter__(self):
  2791. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2792. cdef bcf1_t *r = self.record.ptr
  2793. cdef bcf_fmt_t *fmt
  2794. cdef int i
  2795. for i in range(r.n_fmt):
  2796. fmt = &r.d.fmt[i]
  2797. if r.d.fmt[i].p:
  2798. yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt.id))
  2799. def get(self, key, default=None):
  2800. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  2801. try:
  2802. return self[key]
  2803. except KeyError:
  2804. return default
  2805. def __contains__(self, key):
  2806. cdef bcf_hdr_t *hdr = self.record.header.ptr
  2807. cdef bcf1_t *r = self.record.ptr
  2808. cdef bytes bkey = force_bytes(key)
  2809. cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
  2810. return fmt != NULL and fmt.p != NULL
  2811. def iterkeys(self):
  2812. """D.iterkeys() -> an iterator over the keys of D"""
  2813. return iter(self)
  2814. def itervalues(self):
  2815. """D.itervalues() -> an iterator over the values of D"""
  2816. for key in self:
  2817. yield self[key]
  2818. def iteritems(self):
  2819. """D.iteritems() -> an iterator over the (key, value) items of D"""
  2820. for key in self:
  2821. yield (key, self[key])
  2822. def keys(self):
  2823. """D.keys() -> list of D's keys"""
  2824. return list(self)
  2825. def items(self):
  2826. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  2827. return list(self.iteritems())
  2828. def values(self):
  2829. """D.values() -> list of D's values"""
  2830. return list(self.itervalues())
  2831. def update(self, items=None, **kwargs):
  2832. """D.update([E, ]**F) -> None.
  2833. Update D from dict/iterable E and F.
  2834. """
  2835. for k, v in items.items():
  2836. self[k] = v
  2837. if kwargs:
  2838. for k, v in kwargs.items():
  2839. self[k] = v
  2840. def pop(self, key, default=_nothing):
  2841. try:
  2842. value = self[key]
  2843. del self[key]
  2844. return value
  2845. except KeyError:
  2846. if default is not _nothing:
  2847. return default
  2848. raise
  2849. def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
  2850. if op != 2 and op != 3:
  2851. return NotImplemented
  2852. cdef bint cmp = dict(self) == dict(other)
  2853. if op == 3:
  2854. cmp = not cmp
  2855. return cmp
  2856. # Mappings are not hashable by default, but subclasses can change this
  2857. __hash__ = None
  2858. cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
  2859. if not record or sample_index < 0:
  2860. raise ValueError('cannot create VariantRecordSample')
  2861. cdef VariantRecordSample sample = VariantRecordSample.__new__(VariantRecordSample)
  2862. sample.record = record
  2863. sample.index = sample_index
  2864. return sample
  2865. ########################################################################
  2866. ########################################################################
  2867. ## Index objects
  2868. ########################################################################
  2869. cdef class BaseIndex(object):
  2870. def __init__(self):
  2871. self.refs = ()
  2872. self.remap = {}
  2873. def __len__(self):
  2874. return len(self.refs)
  2875. def __bool__(self):
  2876. return len(self.refs) != 0
  2877. def __getitem__(self, key):
  2878. if isinstance(key, int):
  2879. return self.refs[key]
  2880. else:
  2881. return self.refmap[key]
  2882. def __iter__(self):
  2883. return iter(self.refs)
  2884. def get(self, key, default=None):
  2885. """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
  2886. try:
  2887. return self[key]
  2888. except KeyError:
  2889. return default
  2890. def __contains__(self, key):
  2891. try:
  2892. self[key]
  2893. except KeyError:
  2894. return False
  2895. else:
  2896. return True
  2897. def iterkeys(self):
  2898. """D.iterkeys() -> an iterator over the keys of D"""
  2899. return iter(self)
  2900. def itervalues(self):
  2901. """D.itervalues() -> an iterator over the values of D"""
  2902. for key in self:
  2903. yield self[key]
  2904. def iteritems(self):
  2905. """D.iteritems() -> an iterator over the (key, value) items of D"""
  2906. for key in self:
  2907. yield (key, self[key])
  2908. def keys(self):
  2909. """D.keys() -> list of D's keys"""
  2910. return list(self)
  2911. def items(self):
  2912. """D.items() -> list of D's (key, value) pairs, as 2-tuples"""
  2913. return list(self.iteritems())
  2914. def values(self):
  2915. """D.values() -> list of D's values"""
  2916. return list(self.itervalues())
  2917. def update(self, items=None, **kwargs):
  2918. """D.update([E, ]**F) -> None.
  2919. Update D from dict/iterable E and F.
  2920. """
  2921. for k, v in items.items():
  2922. self[k] = v
  2923. if kwargs:
  2924. for k, v in kwargs.items():
  2925. self[k] = v
  2926. def pop(self, key, default=_nothing):
  2927. try:
  2928. value = self[key]
  2929. del self[key]
  2930. return value
  2931. except KeyError:
  2932. if default is not _nothing:
  2933. return default
  2934. raise
  2935. # Mappings are not hashable by default, but subclasses can change this
  2936. __hash__ = None
  2937. #TODO: implement __richcmp__
  2938. cdef class BCFIndex(object):
  2939. """CSI index data structure for BCF files"""
  2940. def __init__(self):
  2941. self.refs = ()
  2942. self.refmap = {}
  2943. if not self.ptr:
  2944. raise ValueError('Invalid index object')
  2945. cdef int n
  2946. cdef const char **refs = bcf_index_seqnames(self.ptr, self.header.ptr, &n)
  2947. self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
  2948. self.refmap = { r:i for i,r in enumerate(self.refs) }
  2949. def __dealloc__(self):
  2950. if self.ptr:
  2951. hts_idx_destroy(self.ptr)
  2952. self.ptr = NULL
  2953. def fetch(self, bcf, contig, start, stop, reopen):
  2954. return BCFIterator(bcf, contig, start, stop, reopen)
  2955. cdef BCFIndex makeBCFIndex(VariantHeader header, hts_idx_t *idx):
  2956. if not idx:
  2957. return None
  2958. if not header:
  2959. raise ValueError('invalid VariantHeader')
  2960. cdef BCFIndex index = BCFIndex.__new__(BCFIndex)
  2961. index.header = header
  2962. index.ptr = idx
  2963. index.__init__()
  2964. return index
  2965. cdef class TabixIndex(BaseIndex):
  2966. """Tabix index data structure for VCF files"""
  2967. def __init__(self):
  2968. self.refs = ()
  2969. self.refmap = {}
  2970. if not self.ptr:
  2971. raise ValueError('Invalid index object')
  2972. cdef int n
  2973. cdef const char **refs = tbx_seqnames(self.ptr, &n)
  2974. self.refs = char_array_to_tuple(refs, n, free_after=1) if refs else ()
  2975. self.refmap = { r:i for i,r in enumerate(self.refs) }
  2976. def __dealloc__(self):
  2977. if self.ptr:
  2978. tbx_destroy(self.ptr)
  2979. self.ptr = NULL
  2980. def fetch(self, bcf, contig, start, stop, reopen):
  2981. return TabixIterator(bcf, contig, start, stop, reopen)
  2982. cdef TabixIndex makeTabixIndex(tbx_t *idx):
  2983. if not idx:
  2984. return None
  2985. cdef TabixIndex index = TabixIndex.__new__(TabixIndex)
  2986. index.ptr = idx
  2987. index.__init__()
  2988. return index
  2989. ########################################################################
  2990. ########################################################################
  2991. ## Iterators
  2992. ########################################################################
  2993. cdef class BaseIterator(object):
  2994. pass
  2995. # Internal function to clean up after iteration stop or failure.
  2996. # This would be a nested function if it weren't a cdef function.
  2997. cdef void _stop_BCFIterator(BCFIterator self, bcf1_t *record):
  2998. bcf_destroy1(record)
  2999. # destroy iter so future calls to __next__ raise StopIteration
  3000. bcf_itr_destroy(self.iter)
  3001. self.iter = NULL
  3002. cdef class BCFIterator(BaseIterator):
  3003. def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
  3004. if bcf is None:
  3005. raise ValueError('bcf must not be None')
  3006. if contig is None:
  3007. raise ValueError('contig must be specified')
  3008. if not isinstance(bcf.index, BCFIndex):
  3009. raise ValueError('bcf index required')
  3010. cdef BCFIndex index = bcf.index
  3011. self.bcf = bcf
  3012. self.index = index
  3013. cdef int rid, cstart, cstop
  3014. try:
  3015. rid = index.refmap[contig]
  3016. except KeyError:
  3017. # A query for a non-existent contig yields an empty iterator, does not raise an error
  3018. self.iter = NULL
  3019. return
  3020. if reopen:
  3021. self.bcf = self.bcf.copy()
  3022. cstart = start if start is not None else 0
  3023. cstop = stop if stop is not None else MAX_POS
  3024. with nogil:
  3025. self.iter = bcf_itr_queryi(index.ptr, rid, cstart, cstop)
  3026. if not self.iter:
  3027. if errno:
  3028. raise IOError(errno, strerror(errno))
  3029. else:
  3030. raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
  3031. def __dealloc__(self):
  3032. if self.iter:
  3033. bcf_itr_destroy(self.iter)
  3034. self.iter = NULL
  3035. def __iter__(self):
  3036. return self
  3037. def __next__(self):
  3038. if not self.iter:
  3039. raise StopIteration
  3040. cdef bcf1_t *record = bcf_init1()
  3041. if not record:
  3042. raise MemoryError('unable to allocate BCF record')
  3043. record.pos = -1
  3044. if self.bcf.drop_samples:
  3045. record.max_unpack = BCF_UN_SHR
  3046. cdef int ret
  3047. with nogil:
  3048. ret = bcf_itr_next(self.bcf.htsfile, self.iter, record)
  3049. if ret < 0:
  3050. _stop_BCFIterator(self, record)
  3051. if ret == -1:
  3052. raise StopIteration
  3053. elif ret == -2:
  3054. raise IOError('truncated file')
  3055. elif errno:
  3056. raise IOError(errno, strerror(errno))
  3057. else:
  3058. raise IOError('unable to fetch next record')
  3059. ret = bcf_subset_format(self.bcf.header.ptr, record)
  3060. if ret < 0:
  3061. _stop_BCFIterator(self, record)
  3062. raise ValueError('error in bcf_subset_format')
  3063. return makeVariantRecord(self.bcf.header, record)
  3064. cdef class TabixIterator(BaseIterator):
  3065. def __cinit__(self, *args, **kwargs):
  3066. self.line_buffer.l = 0
  3067. self.line_buffer.m = 0
  3068. self.line_buffer.s = NULL
  3069. def __init__(self, VariantFile bcf, contig=None, start=None, stop=None, reopen=True):
  3070. if bcf is None:
  3071. raise ValueError('bcf must not be None')
  3072. if not isinstance(bcf.index, TabixIndex):
  3073. raise ValueError('tabix index required')
  3074. cdef TabixIndex index = bcf.index
  3075. self.bcf = bcf
  3076. self.index = index
  3077. cdef int rid, cstart, cstop
  3078. try:
  3079. rid = index.refmap[contig]
  3080. except KeyError:
  3081. # A query for a non-existent contig yields an empty iterator, does not raise an error
  3082. self.iter = NULL
  3083. return
  3084. if reopen:
  3085. self.bcf = self.bcf.copy()
  3086. cstart = start if start is not None else 0
  3087. cstop = stop if stop is not None else MAX_POS
  3088. self.iter = tbx_itr_queryi(index.ptr, rid, start, stop)
  3089. if not self.iter:
  3090. if errno:
  3091. raise IOError(errno, strerror(errno))
  3092. else:
  3093. raise IOError('unable to fetch {}:{}-{}'.format(contig, start+1, stop))
  3094. def __dealloc__(self):
  3095. if self.iter:
  3096. tbx_itr_destroy(self.iter)
  3097. self.iter = NULL
  3098. if self.line_buffer.m:
  3099. free(self.line_buffer.s)
  3100. self.line_buffer.l = 0
  3101. self.line_buffer.m = 0
  3102. self.line_buffer.s = NULL
  3103. def __iter__(self):
  3104. return self
  3105. def __next__(self):
  3106. if not self.iter:
  3107. raise StopIteration
  3108. cdef int ret
  3109. with nogil:
  3110. ret = tbx_itr_next(self.bcf.htsfile, self.index.ptr, self.iter, &self.line_buffer)
  3111. if ret < 0:
  3112. tbx_itr_destroy(self.iter)
  3113. self.iter = NULL
  3114. if ret == -1:
  3115. raise StopIteration
  3116. elif ret == -2:
  3117. raise IOError('truncated file')
  3118. elif errno:
  3119. raise IOError(errno, strerror(errno))
  3120. else:
  3121. raise IOError('unable to fetch next record')
  3122. cdef bcf1_t *record = bcf_init1()
  3123. if not record:
  3124. raise MemoryError('unable to allocate BCF record')
  3125. record.pos = -1
  3126. if self.bcf.drop_samples:
  3127. record.max_unpack = BCF_UN_SHR
  3128. ret = vcf_parse1(&self.line_buffer, self.bcf.header.ptr, record)
  3129. # FIXME: stop iteration on parse failure?
  3130. if ret < 0:
  3131. bcf_destroy1(record)
  3132. raise ValueError('error in vcf_parse')
  3133. return makeVariantRecord(self.bcf.header, record)
  3134. ########################################################################
  3135. ########################################################################
  3136. ## Variant File
  3137. ########################################################################
  3138. cdef class VariantFile(HTSFile):
  3139. """*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
  3140. duplicate_filehandle=True, ignore_truncation=False, threads=1)*
  3141. A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
  3142. opened.
  3143. If an index for a variant file exists (.csi or .tbi), it will be
  3144. opened automatically. Without an index random access to records
  3145. via :meth:`fetch` is disabled.
  3146. For writing, a :class:`VariantHeader` object must be provided,
  3147. typically obtained from another :term:`VCF` file/:term:`BCF`
  3148. file.
  3149. Parameters
  3150. ----------
  3151. mode : string
  3152. *mode* should be ``r`` for reading or ``w`` for writing. The default is
  3153. text mode (:term:`VCF`). For binary (:term:`BCF`) I/O you should append
  3154. ``b`` for compressed or ``u`` for uncompressed :term:`BCF` output.
  3155. If ``b`` is present, it must immediately follow ``r`` or ``w``. Valid
  3156. modes are ``r``, ``w``, ``wh``, ``rb``, ``wb``, ``wbu`` and ``wb0``.
  3157. For instance, to open a :term:`BCF` formatted file for reading, type::
  3158. f = pysam.VariantFile('ex1.bcf','r')
  3159. If mode is not specified, we will try to auto-detect the file type. All
  3160. of the following should work::
  3161. f1 = pysam.VariantFile('ex1.bcf')
  3162. f2 = pysam.VariantFile('ex1.vcf')
  3163. f3 = pysam.VariantFile('ex1.vcf.gz')
  3164. index_filename : string
  3165. Explicit path to an index file.
  3166. header : VariantHeader
  3167. :class:`VariantHeader` object required for writing.
  3168. drop_samples: bool
  3169. Ignore sample information when reading.
  3170. duplicate_filehandle: bool
  3171. By default, file handles passed either directly or through
  3172. File-like objects will be duplicated before passing them to
  3173. htslib. The duplication prevents issues where the same stream
  3174. will be closed by htslib and through destruction of the
  3175. high-level python object. Set to False to turn off
  3176. duplication.
  3177. ignore_truncation: bool
  3178. Issue a warning, instead of raising an error if the current file
  3179. appears to be truncated due to a missing EOF marker. Only applies
  3180. to bgzipped formats. (Default=False)
  3181. threads: integer
  3182. Number of threads to use for compressing/decompressing VCF/BCF files.
  3183. Setting threads to > 1 cannot be combined with `ignore_truncation`.
  3184. (Default=1)
  3185. """
  3186. def __cinit__(self, *args, **kwargs):
  3187. self.htsfile = NULL
  3188. def __init__(self, *args, **kwargs):
  3189. self.header = None
  3190. self.index = None
  3191. self.filename = None
  3192. self.mode = None
  3193. self.threads = 1
  3194. self.index_filename = None
  3195. self.is_stream = False
  3196. self.is_remote = False
  3197. self.is_reading = False
  3198. self.drop_samples = False
  3199. self.header_written = False
  3200. self.start_offset = -1
  3201. self.open(*args, **kwargs)
  3202. def __dealloc__(self):
  3203. if not self.htsfile or not self.header:
  3204. return
  3205. cdef int ret
  3206. # Write header if no records were written
  3207. if self.htsfile.is_write and not self.header_written:
  3208. with nogil:
  3209. ret = bcf_hdr_write(self.htsfile, self.header.ptr)
  3210. if ret < 0 and errno != EPIPE:
  3211. raise OSError_from_errno("Can't write headers", self.filename)
  3212. ret = hts_close(self.htsfile)
  3213. self.htsfile = NULL
  3214. self.header = self.index = None
  3215. if ret < 0 and errno != EPIPE:
  3216. raise OSError_from_errno("Closing failed", self.filename)
  3217. def close(self):
  3218. """closes the :class:`pysam.VariantFile`."""
  3219. if not self.htsfile:
  3220. return
  3221. cdef int ret
  3222. # Write header if no records were written
  3223. if self.htsfile.is_write and not self.header_written:
  3224. with nogil:
  3225. ret = bcf_hdr_write(self.htsfile, self.header.ptr)
  3226. if ret < 0 and errno != EPIPE:
  3227. raise OSError_from_errno("Can't write headers", self.filename)
  3228. ret = hts_close(self.htsfile)
  3229. self.htsfile = NULL
  3230. self.header = self.index = None
  3231. if ret < 0 and errno != EPIPE:
  3232. raise OSError_from_errno("Closing failed", self.filename)
  3233. def __iter__(self):
  3234. if not self.is_open:
  3235. raise ValueError('I/O operation on closed file')
  3236. if self.htsfile.is_write:
  3237. raise ValueError('cannot iterate over Variantfile opened for writing')
  3238. self.is_reading = 1
  3239. return self
  3240. def __next__(self):
  3241. cdef int ret
  3242. cdef int errcode
  3243. cdef bcf1_t *record = bcf_init1()
  3244. if not record:
  3245. raise MemoryError('unable to allocate BCF record')
  3246. record.pos = -1
  3247. if self.drop_samples:
  3248. record.max_unpack = BCF_UN_SHR
  3249. with nogil:
  3250. ret = bcf_read1(self.htsfile, self.header.ptr, record)
  3251. if ret < 0:
  3252. errcode = record.errcode
  3253. bcf_destroy1(record)
  3254. if errcode:
  3255. raise IOError('unable to parse next record')
  3256. if ret == -1:
  3257. raise StopIteration
  3258. elif ret == -2:
  3259. raise IOError('truncated file')
  3260. elif errno:
  3261. raise OSError_from_errno("Unable to fetch next record", self.filename)
  3262. else:
  3263. raise IOError('unable to fetch next record')
  3264. return makeVariantRecord(self.header, record)
  3265. def copy(self):
  3266. if not self.is_open:
  3267. raise ValueError
  3268. cdef VariantFile vars = VariantFile.__new__(VariantFile)
  3269. cdef bcf_hdr_t *hdr
  3270. # FIXME: re-open using fd or else header and index could be invalid
  3271. vars.htsfile = self._open_htsfile()
  3272. if not vars.htsfile:
  3273. raise ValueError('Cannot re-open htsfile')
  3274. # minimize overhead by re-using header and index. This approach is
  3275. # currently risky, but see above for how this can be mitigated.
  3276. vars.header = self.header
  3277. vars.index = self.index
  3278. vars.filename = self.filename
  3279. vars.mode = self.mode
  3280. vars.threads = self.threads
  3281. vars.index_filename = self.index_filename
  3282. vars.drop_samples = self.drop_samples
  3283. vars.is_stream = self.is_stream
  3284. vars.is_remote = self.is_remote
  3285. vars.is_reading = self.is_reading
  3286. vars.start_offset = self.start_offset
  3287. vars.header_written = self.header_written
  3288. if self.htsfile.is_bin:
  3289. vars.seek(self.tell())
  3290. else:
  3291. with nogil:
  3292. hdr = bcf_hdr_read(vars.htsfile)
  3293. makeVariantHeader(hdr)
  3294. return vars
  3295. def open(self, filename, mode='r',
  3296. index_filename=None,
  3297. VariantHeader header=None,
  3298. drop_samples=False,
  3299. duplicate_filehandle=True,
  3300. ignore_truncation=False,
  3301. threads=1):
  3302. """open a vcf/bcf file.
  3303. If open is called on an existing VariantFile, the current file will be
  3304. closed and a new file will be opened.
  3305. """
  3306. cdef bcf_hdr_t *hdr
  3307. cdef BGZF *bgzfp
  3308. cdef hts_idx_t *idx
  3309. cdef tbx_t *tidx
  3310. cdef char *cfilename
  3311. cdef char *cindex_filename = NULL
  3312. cdef char *cmode
  3313. if threads > 1 and ignore_truncation:
  3314. # This won't raise errors if reaching a truncated alignment,
  3315. # because bgzf_mt_reader in htslib does not deal with
  3316. # bgzf_mt_read_block returning non-zero values, contrary
  3317. # to bgzf_read (https://github.com/samtools/htslib/blob/1.7/bgzf.c#L888)
  3318. # Better to avoid this (for now) than to produce seemingly correct results.
  3319. raise ValueError('Cannot add extra threads when "ignore_truncation" is True')
  3320. self.threads = threads
  3321. # close a previously opened file
  3322. if self.is_open:
  3323. self.close()
  3324. if not mode or mode[0] not in 'rwa':
  3325. raise ValueError('mode must begin with r, w or a')
  3326. self.duplicate_filehandle = duplicate_filehandle
  3327. format_modes = [m for m in mode[1:] if m in 'bcguz']
  3328. if len(format_modes) > 1:
  3329. raise ValueError('mode contains conflicting format specifiers: {}'.format(''.join(format_modes)))
  3330. invalid_modes = [m for m in mode[1:] if m not in 'bcguz0123456789ex']
  3331. if invalid_modes:
  3332. raise ValueError('invalid mode options: {}'.format(''.join(invalid_modes)))
  3333. # Autodetect mode from filename
  3334. if mode == 'w' and isinstance(filename, str):
  3335. if filename.endswith('.gz'):
  3336. mode = 'wz'
  3337. elif filename.endswith('.bcf'):
  3338. mode = 'wb'
  3339. # for htslib, wbu seems to not work
  3340. if mode == 'wbu':
  3341. mode = 'wb0'
  3342. self.mode = mode = force_bytes(mode)
  3343. try:
  3344. filename = encode_filename(filename)
  3345. self.is_remote = hisremote(filename)
  3346. self.is_stream = filename == b'-'
  3347. except TypeError:
  3348. filename = filename
  3349. self.is_remote = False
  3350. self.is_stream = True
  3351. self.filename = filename
  3352. if index_filename is not None:
  3353. self.index_filename = index_filename = encode_filename(index_filename)
  3354. else:
  3355. self.index_filename = None
  3356. self.drop_samples = bool(drop_samples)
  3357. self.header = None
  3358. self.header_written = False
  3359. if mode.startswith(b'w'):
  3360. # open file for writing
  3361. if index_filename is not None:
  3362. raise ValueError('Cannot specify an index filename when writing a VCF/BCF file')
  3363. # header structure (used for writing)
  3364. if header:
  3365. self.header = header.copy()
  3366. else:
  3367. self.header = VariantHeader()
  3368. #raise ValueError('a VariantHeader must be specified')
  3369. # Header is not written until the first write or on close
  3370. self.htsfile = self._open_htsfile()
  3371. if not self.htsfile:
  3372. raise ValueError("could not open file `{}` (mode='{}')".format(filename, mode))
  3373. elif mode.startswith(b'r'):
  3374. # open file for reading
  3375. self.htsfile = self._open_htsfile()
  3376. if not self.htsfile:
  3377. if errno:
  3378. raise OSError_from_errno("Could not open variant file", filename)
  3379. else:
  3380. raise ValueError('could not open variant file `{}`'.format(filename))
  3381. if self.htsfile.format.format not in (bcf, vcf):
  3382. raise ValueError('invalid file `{}` (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
  3383. self.check_truncation(ignore_truncation)
  3384. with nogil:
  3385. hdr = bcf_hdr_read(self.htsfile)
  3386. try:
  3387. self.header = makeVariantHeader(hdr)
  3388. except ValueError:
  3389. raise ValueError('file `{}` does not have valid header (mode=`{}`) - is it VCF/BCF format?'.format(filename, mode))
  3390. if isinstance(self.filename, bytes):
  3391. cfilename = self.filename
  3392. else:
  3393. cfilename = NULL
  3394. # check for index and open if present
  3395. if self.htsfile.format.format == bcf and cfilename:
  3396. if index_filename is not None:
  3397. cindex_filename = index_filename
  3398. with nogil:
  3399. idx = bcf_index_load2(cfilename, cindex_filename)
  3400. self.index = makeBCFIndex(self.header, idx)
  3401. elif self.htsfile.format.compression == bgzf and cfilename:
  3402. if index_filename is not None:
  3403. cindex_filename = index_filename
  3404. with nogil:
  3405. tidx = tbx_index_load2(cfilename, cindex_filename)
  3406. self.index = makeTabixIndex(tidx)
  3407. if not self.is_stream:
  3408. self.start_offset = self.tell()
  3409. else:
  3410. raise ValueError('unknown mode {}'.format(mode))
  3411. def reset(self):
  3412. """reset file position to beginning of file just after the header."""
  3413. return self.seek(self.start_offset)
  3414. def is_valid_tid(self, tid):
  3415. """
  3416. return True if the numerical :term:`tid` is valid; False otherwise.
  3417. returns -1 if reference is not known.
  3418. """
  3419. if not self.is_open:
  3420. raise ValueError('I/O operation on closed file')
  3421. cdef bcf_hdr_t *hdr = self.header.ptr
  3422. cdef int rid = tid
  3423. return 0 <= rid < hdr.n[BCF_DT_CTG]
  3424. def get_tid(self, reference):
  3425. """
  3426. return the numerical :term:`tid` corresponding to
  3427. :term:`reference`
  3428. returns -1 if reference is not known.
  3429. """
  3430. if not self.is_open:
  3431. raise ValueError('I/O operation on closed file')
  3432. cdef vdict_t *d = <vdict_t*>self.header.ptr.dict[BCF_DT_CTG]
  3433. reference = force_bytes(reference)
  3434. cdef khint_t k = kh_get_vdict(d, reference)
  3435. return kh_val_vdict(d, k).id if k != kh_end(d) else -1
  3436. def get_reference_name(self, tid):
  3437. """
  3438. return :term:`reference` name corresponding to numerical :term:`tid`
  3439. """
  3440. if not self.is_open:
  3441. raise ValueError('I/O operation on closed file')
  3442. cdef bcf_hdr_t *hdr = self.header.ptr
  3443. cdef int rid = tid
  3444. if rid < 0 or rid >= hdr.n[BCF_DT_CTG]:
  3445. raise ValueError('Invalid tid')
  3446. return bcf_str_cache_get_charptr(bcf_hdr_id2name(hdr, rid))
  3447. def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False, end=None, reference=None):
  3448. """fetch records in a :term:`region`, specified either by
  3449. :term:`contig`, *start*, and *end* (which are 0-based, half-open);
  3450. or alternatively by a samtools :term:`region` string (which is
  3451. 1-based inclusive).
  3452. Without *contig* or *region* all mapped records will be fetched. The
  3453. records will be returned ordered by contig, which will not necessarily
  3454. be the order within the file.
  3455. Set *reopen* to true if you will be using multiple iterators on the
  3456. same file at the same time. The iterator returned will receive its
  3457. own copy of a filehandle to the file effectively re-opening the
  3458. file. Re-opening a file incurrs some overhead, so use with care.
  3459. If only *contig* is set, all records on *contig* will be fetched.
  3460. If both *region* and *contig* are given, an exception is raised.
  3461. Note that a bgzipped :term:`VCF`.gz file without a tabix/CSI index
  3462. (.tbi/.csi) or a :term:`BCF` file without a CSI index can only be
  3463. read sequentially.
  3464. """
  3465. if not self.is_open:
  3466. raise ValueError('I/O operation on closed file')
  3467. if self.htsfile.is_write:
  3468. raise ValueError('cannot fetch from Variantfile opened for writing')
  3469. if contig is None and region is None:
  3470. self.is_reading = 1
  3471. bcf = self.copy() if reopen else self
  3472. bcf.seek(self.start_offset)
  3473. return iter(bcf)
  3474. if self.index is None:
  3475. raise ValueError('fetch requires an index')
  3476. _, tid, start, stop = self.parse_region(contig, start, stop, region,
  3477. None, end=end, reference=reference)
  3478. if contig is None:
  3479. contig = self.get_reference_name(tid)
  3480. self.is_reading = 1
  3481. return self.index.fetch(self, contig, start, stop, reopen)
  3482. def new_record(self, *args, **kwargs):
  3483. """Create a new empty :class:`VariantRecord`.
  3484. See :meth:`VariantHeader.new_record`
  3485. """
  3486. return self.header.new_record(*args, **kwargs)
  3487. cpdef int write(self, VariantRecord record) except -1:
  3488. """
  3489. write a single :class:`pysam.VariantRecord` to disk.
  3490. returns the number of bytes written.
  3491. """
  3492. if record is None:
  3493. raise ValueError('record must not be None')
  3494. if not self.is_open:
  3495. raise ValueError('I/O operation on closed file')
  3496. if not self.htsfile.is_write:
  3497. raise ValueError('cannot write to a Variantfile opened for reading')
  3498. cdef int ret
  3499. if not self.header_written:
  3500. self.header_written = True
  3501. with nogil:
  3502. ret = bcf_hdr_write(self.htsfile, self.header.ptr)
  3503. if ret < 0:
  3504. raise OSError_from_errno("Can't write headers", self.filename)
  3505. #if record.header is not self.header:
  3506. # record.translate(self.header)
  3507. # raise ValueError('Writing records from a different VariantFile is not yet supported')
  3508. if record.ptr.n_sample != bcf_hdr_nsamples(self.header.ptr):
  3509. msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
  3510. raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
  3511. # Sync END annotation before writing
  3512. bcf_sync_end(record)
  3513. with nogil:
  3514. ret = bcf_write1(self.htsfile, self.header.ptr, record.ptr)
  3515. if ret < 0:
  3516. raise OSError_from_errno("Can't write record", self.filename)
  3517. return ret
  3518. def subset_samples(self, include_samples):
  3519. """
  3520. Read only a subset of samples to reduce processing time and memory.
  3521. Must be called prior to retrieving records.
  3522. """
  3523. if not self.is_open:
  3524. raise ValueError('I/O operation on closed file')
  3525. if self.htsfile.is_write:
  3526. raise ValueError('cannot subset samples from Variantfile opened for writing')
  3527. if self.is_reading:
  3528. raise ValueError('cannot subset samples after fetching records')
  3529. self.header._subset_samples(include_samples)
  3530. # potentially unnecessary optimization that also sets max_unpack
  3531. if not include_samples:
  3532. self.drop_samples = True