You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

2720 lines
107 KiB

  1. # cython: language_level=3
  2. from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
  3. from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
  4. from libc.stdlib cimport malloc, calloc, realloc, free
  5. from libc.string cimport memcpy, memcmp, strncpy, strlen, strdup
  6. from libc.stdio cimport FILE, printf
  7. from posix.types cimport off_t
  8. cdef extern from "Python.h":
  9. FILE* PyFile_AsFile(object)
  10. # cython does not wrap stdarg
  11. cdef extern from "stdarg.h":
  12. ctypedef struct va_list:
  13. pass
  14. cdef extern from "htslib/kstring.h" nogil:
  15. ctypedef struct kstring_t:
  16. size_t l, m
  17. char *s
  18. int kputc(int c, kstring_t *s)
  19. int kputw(int c, kstring_t *s)
  20. int kputl(long c, kstring_t *s)
  21. int ksprintf(kstring_t *s, const char *fmt, ...)
  22. cdef extern from "htslib_util.h" nogil:
  23. int hts_set_verbosity(int verbosity)
  24. int hts_get_verbosity()
  25. ctypedef uint32_t khint32_t
  26. ctypedef uint32_t khint_t
  27. ctypedef khint_t khiter_t
  28. # Used to manage BCF Header info
  29. ctypedef struct vdict_t:
  30. khint_t n_buckets, size, n_occupied, upper_bound
  31. khint32_t *flags
  32. const char *keys
  33. bcf_idinfo_t *vals
  34. # Used to manage indexed contigs in Tabix
  35. ctypedef struct s2i_t:
  36. khint_t n_buckets, size, n_occupied, upper_bound
  37. khint32_t *flags
  38. const char *keys
  39. int64_t *vals
  40. # Generic khash methods
  41. khint_t kh_size(void *d)
  42. khint_t kh_begin(void *d)
  43. khint_t kh_end(void *d)
  44. int kh_exist(void *d, khiter_t i)
  45. # Specialized khash methods for vdict
  46. khint_t kh_get_vdict(vdict_t *d, const char *key)
  47. const char *kh_key_vdict "kh_key" (vdict_t *d, khint_t i)
  48. bcf_idinfo_t kh_val_vdict "kh_val" (vdict_t *d, khint_t i)
  49. cdef extern from "htslib/hfile.h" nogil:
  50. ctypedef struct hFILE
  51. # @abstract Open the named file or URL as a stream
  52. # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
  53. hFILE *hopen(const char *filename, const char *mode, ...)
  54. # @abstract Associate a stream with an existing open file descriptor
  55. # @return An hFILE pointer, or NULL (with errno set) if an error occurred.
  56. # @notes For socket descriptors (on Windows), mode should contain 's'.
  57. hFILE *hdopen(int fd, const char *mode)
  58. # @abstract Report whether the file name or URL denotes remote storage
  59. # @return 0 if local, 1 if remote.
  60. # @notes "Remote" means involving e.g. explicit network access, with the
  61. # implication that callers may wish to cache such files' contents locally.
  62. int hisremote(const char *filename)
  63. # @abstract Flush (for output streams) and close the stream
  64. # @return 0 if successful, or EOF (with errno set) if an error occurred.
  65. int hclose(hFILE *fp)
  66. # @abstract Close the stream, without flushing or propagating errors
  67. # @notes For use while cleaning up after an error only. Preserves errno.
  68. void hclose_abruptly(hFILE *fp)
  69. # @abstract Return the stream's error indicator
  70. # @return Non-zero (in fact, an errno value) if an error has occurred.
  71. # @notes This would be called herror() and return true/false to parallel
  72. # ferror(3), but a networking-related herror(3) function already exists.
  73. int herrno(hFILE *fp)
  74. # @abstract Clear the stream's error indicator
  75. void hclearerr(hFILE *fp)
  76. # @abstract Reposition the read/write stream offset
  77. # @return The resulting offset within the stream (as per lseek(2)),
  78. # or negative if an error occurred.
  79. off_t hseek(hFILE *fp, off_t offset, int whence)
  80. # @abstract Report the current stream offset
  81. # @return The offset within the stream, starting from zero.
  82. off_t htell(hFILE *fp)
  83. # @abstract Read one character from the stream
  84. # @return The character read, or EOF on end-of-file or error
  85. int hgetc(hFILE *fp)
  86. # Read from the stream until the delimiter, up to a maximum length
  87. # @param buffer The buffer into which bytes will be written
  88. # @param size The size of the buffer
  89. # @param delim The delimiter (interpreted as an `unsigned char`)
  90. # @param fp The file stream
  91. # @return The number of bytes read, or negative on error.
  92. # @since 1.4
  93. #
  94. # Bytes will be read into the buffer up to and including a delimiter, until
  95. # EOF is reached, or _size-1_ bytes have been written, whichever comes first.
  96. # The string will then be terminated with a NUL byte (`\0`).
  97. ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
  98. # Read a line from the stream, up to a maximum length
  99. # @param buffer The buffer into which bytes will be written
  100. # @param size The size of the buffer
  101. # @param fp The file stream
  102. # @return The number of bytes read, or negative on error.
  103. # @since 1.4
  104. #
  105. # Specialization of hgetdelim() for a `\n` delimiter.
  106. ssize_t hgetln(char *buffer, size_t size, hFILE *fp)
  107. # Read a line from the stream, up to a maximum length
  108. # @param buffer The buffer into which bytes will be written
  109. # @param size The size of the buffer (must be > 1 to be useful)
  110. # @param fp The file stream
  111. # @return _buffer_ on success, or `NULL` if an error occurred.
  112. # @since 1.4
  113. #
  114. # This function can be used as a replacement for `fgets(3)`, or together with
  115. # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_.
  116. char *hgets(char *buffer, int size, hFILE *fp)
  117. # @abstract Peek at characters to be read without removing them from buffers
  118. # @param fp The file stream
  119. # @param buffer The buffer to which the peeked bytes will be written
  120. # @param nbytes The number of bytes to peek at; limited by the size of the
  121. # internal buffer, which could be as small as 4K.
  122. # @return The number of bytes peeked, which may be less than nbytes if EOF
  123. # is encountered; or negative, if there was an I/O error.
  124. # @notes The characters peeked at remain in the stream's internal buffer,
  125. # and will be returned by later hread() etc calls.
  126. ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes)
  127. # @abstract Read a block of characters from the file
  128. # @return The number of bytes read, or negative if an error occurred.
  129. # @notes The full nbytes requested will be returned, except as limited
  130. # by EOF or I/O errors.
  131. ssize_t hread(hFILE *fp, void *buffer, size_t nbytes)
  132. # @abstract Write a character to the stream
  133. # @return The character written, or EOF if an error occurred.
  134. int hputc(int c, hFILE *fp)
  135. # @abstract Write a string to the stream
  136. # @return 0 if successful, or EOF if an error occurred.
  137. int hputs(const char *text, hFILE *fp)
  138. # @abstract Write a block of characters to the file
  139. # @return Either nbytes, or negative if an error occurred.
  140. # @notes In the absence of I/O errors, the full nbytes will be written.
  141. ssize_t hwrite(hFILE *fp, const void *buffer, size_t nbytes)
  142. # @abstract For writing streams, flush buffered output to the underlying stream
  143. # @return 0 if successful, or EOF if an error occurred.
  144. int hflush(hFILE *fp)
  145. cdef extern from "htslib/bgzf.h" nogil:
  146. ctypedef struct bgzf_mtaux_t
  147. ctypedef struct bgzidx_t
  148. ctypedef struct z_stream
  149. ctypedef struct BGZF:
  150. unsigned errcode
  151. unsigned is_write
  152. int is_be
  153. int compress_level
  154. int is_compressed
  155. int is_gzip
  156. int cache_size
  157. int64_t block_address
  158. int64_t uncompressed_address
  159. void *uncompressed_block
  160. void *compressed_block
  161. void *cache
  162. hFILE *fp
  163. bgzf_mtaux_t *mt
  164. bgzidx_t *idx
  165. int idx_build_otf
  166. z_stream *gz_stream
  167. #*****************
  168. # Basic routines *
  169. # *****************/
  170. # Open an existing file descriptor for reading or writing.
  171. #
  172. # @param fd file descriptor
  173. # @param mode mode matching /[rwag][u0-9]+/: 'r' for reading, 'w' for
  174. # writing, 'a' for appending, 'g' for gzip rather than BGZF
  175. # compression (with 'w' only), and digit specifies the zlib
  176. # compression level.
  177. # Note that there is a distinction between 'u' and '0': the
  178. # first yields plain uncompressed output whereas the latter
  179. # outputs uncompressed data wrapped in the zlib format.
  180. # @return BGZF file handler; 0 on error
  181. BGZF* bgzf_dopen(int fd, const char *mode)
  182. BGZF* bgzf_fdopen(int fd, const char *mode) # for backward compatibility
  183. # Open the specified file for reading or writing.
  184. BGZF* bgzf_open(const char* path, const char *mode)
  185. # Open an existing hFILE stream for reading or writing.
  186. BGZF* bgzf_hopen(hFILE *fp, const char *mode)
  187. # Close the BGZF and free all associated resources.
  188. #
  189. # @param fp BGZF file handler
  190. # @return 0 on success and -1 on error
  191. int bgzf_close(BGZF *fp)
  192. # Read up to _length_ bytes from the file storing into _data_.
  193. #
  194. # @param fp BGZF file handler
  195. # @param data data array to read into
  196. # @param length size of data to read
  197. # @return number of bytes actually read; 0 on end-of-file and -1 on error
  198. ssize_t bgzf_read(BGZF *fp, void *data, size_t length)
  199. # Write _length_ bytes from _data_ to the file. If no I/O errors occur,
  200. # the complete _length_ bytes will be written (or queued for writing).
  201. #
  202. # @param fp BGZF file handler
  203. # @param data data array to write
  204. # @param length size of data to write
  205. # @return number of bytes written (i.e., _length_); negative on error
  206. ssize_t bgzf_write(BGZF *fp, const void *data, size_t length)
  207. # Read up to _length_ bytes directly from the underlying stream without
  208. # decompressing. Bypasses BGZF blocking, so must be used with care in
  209. # specialised circumstances only.
  210. #
  211. # @param fp BGZF file handler
  212. # @param data data array to read into
  213. # @param length number of raw bytes to read
  214. # @return number of bytes actually read; 0 on end-of-file and -1 on error
  215. ssize_t bgzf_raw_read(BGZF *fp, void *data, size_t length)
  216. # Write _length_ bytes directly to the underlying stream without
  217. # compressing. Bypasses BGZF blocking, so must be used with care
  218. # in specialised circumstances only.
  219. #
  220. # @param fp BGZF file handler
  221. # @param data data array to write
  222. # @param length number of raw bytes to write
  223. # @return number of bytes actually written; -1 on error
  224. ssize_t bgzf_raw_write(BGZF *fp, const void *data, size_t length)
  225. # Write the data in the buffer to the file.
  226. int bgzf_flush(BGZF *fp)
  227. # Return a virtual file pointer to the current location in the file.
  228. # No interpretation of the value should be made, other than a subsequent
  229. # call to bgzf_seek can be used to position the file at the same point.
  230. # Return value is non-negative on success.
  231. int64_t bgzf_tell(BGZF *fp)
  232. # Set the file to read from the location specified by _pos_.
  233. #
  234. # @param fp BGZF file handler
  235. # @param pos virtual file offset returned by bgzf_tell()
  236. # @param whence must be SEEK_SET (cimported from libc.stdio / posix.unistd)
  237. # @return 0 on success and -1 on error
  238. #
  239. int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence)
  240. # Check if the BGZF end-of-file (EOF) marker is present
  241. #
  242. # @param fp BGZF file handler opened for reading
  243. # @return 1 if the EOF marker is present and correct
  244. # 2 if it can't be checked, e.g., because fp isn't seekable
  245. # 0 if the EOF marker is absent
  246. # -1 (with errno set) on error
  247. int bgzf_check_EOF(BGZF *fp)
  248. # Check if a file is in the BGZF format
  249. #
  250. # @param fn file name
  251. # @return 1 if _fn_ is BGZF; 0 if not or on I/O error
  252. int bgzf_is_bgzf(const char *fn)
  253. #*********************
  254. # Advanced routines *
  255. #*********************
  256. # Set the cache size. Only effective when compiled with -DBGZF_CACHE.
  257. #
  258. # @param fp BGZF file handler
  259. # @param size size of cache in bytes; 0 to disable caching (default)
  260. void bgzf_set_cache_size(BGZF *fp, int size)
  261. # Flush the file if the remaining buffer size is smaller than _size_
  262. # @return 0 if flushing succeeded or was not needed; negative on error
  263. int bgzf_flush_try(BGZF *fp, ssize_t size)
  264. # Read one byte from a BGZF file. It is faster than bgzf_read()
  265. # @param fp BGZF file handler
  266. # @return byte read; -1 on end-of-file or error
  267. int bgzf_getc(BGZF *fp)
  268. # Read one line from a BGZF file. It is faster than bgzf_getc()
  269. #
  270. # @param fp BGZF file handler
  271. # @param delim delimiter
  272. # @param str string to write to; must be initialized
  273. # @return length of the string; 0 on end-of-file; negative on error
  274. int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
  275. # Read the next BGZF block.
  276. int bgzf_read_block(BGZF *fp)
  277. # Enable multi-threading (only effective on writing and when the
  278. # library was compiled with -DBGZF_MT)
  279. #
  280. # @param fp BGZF file handler; must be opened for writing
  281. # @param n_threads #threads used for writing
  282. # @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended
  283. int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks)
  284. # Compress a single BGZF block.
  285. #
  286. # @param dst output buffer (must have size >= BGZF_MAX_BLOCK_SIZE)
  287. # @param dlen size of output buffer; updated on return to the number
  288. # of bytes actually written to dst
  289. # @param src buffer to be compressed
  290. # @param slen size of data to compress (must be <= BGZF_BLOCK_SIZE)
  291. # @param level compression level
  292. # @return 0 on success and negative on error
  293. #
  294. int bgzf_compress(void *dst, size_t *dlen, const void *src, size_t slen, int level)
  295. #*******************
  296. # bgzidx routines *
  297. # Position BGZF at the uncompressed offset
  298. #
  299. # @param fp BGZF file handler; must be opened for reading
  300. # @param uoffset file offset in the uncompressed data
  301. # @param where only SEEK_SET (cimported from libc.stdio) is currently supported
  302. #
  303. # Returns 0 on success and -1 on error.
  304. int bgzf_useek(BGZF *fp, long uoffset, int where)
  305. # Position in uncompressed BGZF
  306. #
  307. # @param fp BGZF file handler; must be opened for reading
  308. #
  309. # Returns the current offset on success and -1 on error.
  310. long bgzf_utell(BGZF *fp)
  311. # Tell BGZF to build index while compressing.
  312. #
  313. # @param fp BGZF file handler; can be opened for reading or writing.
  314. #
  315. # Returns 0 on success and -1 on error.
  316. int bgzf_index_build_init(BGZF *fp)
  317. # Load BGZF index
  318. #
  319. # @param fp BGZF file handler
  320. # @param bname base name
  321. # @param suffix suffix to add to bname (can be NULL)
  322. #
  323. # Returns 0 on success and -1 on error.
  324. int bgzf_index_load(BGZF *fp, const char *bname, const char *suffix)
  325. # Save BGZF index
  326. #
  327. # @param fp BGZF file handler
  328. # @param bname base name
  329. # @param suffix suffix to add to bname (can be NULL)
  330. #
  331. # Returns 0 on success and -1 on error.
  332. int bgzf_index_dump(BGZF *fp, const char *bname, const char *suffix)
  333. cdef extern from "htslib/hts.h" nogil:
  334. uint32_t kroundup32(uint32_t x)
  335. ctypedef struct cram_fd
  336. union FilePointerUnion:
  337. BGZF *bgzf
  338. cram_fd *cram
  339. hFILE *hfile
  340. void *voidp
  341. enum htsFormatCategory:
  342. unknown_category
  343. sequence_data # Sequence data -- SAM, BAM, CRAM, etc
  344. variant_data # Variant calling data -- VCF, BCF, etc
  345. index_file # Index file associated with some data file
  346. region_list # Coordinate intervals or regions -- BED, etc
  347. category_maximum
  348. enum htsExactFormat:
  349. unknown_format
  350. binary_format
  351. text_format
  352. sam, bam, bai, cram, crai, vcf, bcf, csi, gzi, tbi, bed
  353. format_maximum
  354. enum htsCompression:
  355. no_compression, gzip, bgzf, custom
  356. compression_maximum
  357. cdef enum hts_fmt_option:
  358. CRAM_OPT_DECODE_MD,
  359. CRAM_OPT_PREFIX,
  360. CRAM_OPT_VERBOSITY,
  361. CRAM_OPT_SEQS_PER_SLICE,
  362. CRAM_OPT_SLICES_PER_CONTAINER,
  363. CRAM_OPT_RANGE,
  364. CRAM_OPT_VERSION,
  365. CRAM_OPT_EMBED_REF,
  366. CRAM_OPT_IGNORE_MD5,
  367. CRAM_OPT_REFERENCE,
  368. CRAM_OPT_MULTI_SEQ_PER_SLICE,
  369. CRAM_OPT_NO_REF,
  370. CRAM_OPT_USE_BZIP2,
  371. CRAM_OPT_SHARED_REF,
  372. CRAM_OPT_NTHREADS,
  373. CRAM_OPT_THREAD_POOL,
  374. CRAM_OPT_USE_LZMA,
  375. CRAM_OPT_USE_RANS,
  376. CRAM_OPT_REQUIRED_FIELDS,
  377. HTS_OPT_COMPRESSION_LEVEL,
  378. HTS_OPT_NTHREADS,
  379. ctypedef struct htsVersion:
  380. short major, minor
  381. ctypedef struct htsFormat:
  382. htsFormatCategory category
  383. htsExactFormat format
  384. htsVersion version
  385. htsCompression compression
  386. short compression_level
  387. void *specific
  388. ctypedef struct htsFile:
  389. uint8_t is_bin
  390. uint8_t is_write
  391. uint8_t is_be
  392. uint8_t is_cram
  393. int64_t lineno
  394. kstring_t line
  395. char *fn
  396. char *fn_aux
  397. FilePointerUnion fp
  398. htsFormat format
  399. int hts_verbose
  400. cdef union hts_opt_val_union:
  401. int i
  402. char *s
  403. ctypedef struct hts_opt:
  404. char *arg
  405. hts_fmt_option opt
  406. hts_opt_val_union val
  407. void *next
  408. # @abstract Parses arg and appends it to the option list.
  409. # @return 0 on success and -1 on failure
  410. int hts_opt_add(hts_opt **opts, const char *c_arg)
  411. # @abstract Applies an hts_opt option list to a given htsFile.
  412. # @return 0 on success and -1 on failure
  413. int hts_opt_apply(htsFile *fp, hts_opt *opts)
  414. # @abstract Frees an hts_opt list.
  415. void hts_opt_free(hts_opt *opts)
  416. # @abstract Table for converting a nucleotide character to 4-bit encoding.
  417. # The input character may be either an IUPAC ambiguity code, '=' for 0, or
  418. # '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8
  419. # for A/C/G/T or combinations of these bits for ambiguous bases.
  420. const unsigned char *seq_nt16_table
  421. # @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
  422. # ambiguity code letter (or '=' when given 0).
  423. const char *seq_nt16_str
  424. # @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
  425. # Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
  426. const int *seq_nt16_int
  427. # @abstract Get the htslib version number
  428. # @return For released versions, a string like "N.N[.N]"; or git describe
  429. # output if using a library built within a Git repository.
  430. const char *hts_version()
  431. # @abstract Determine format by peeking at the start of a file
  432. # @param fp File opened for reading, positioned at the beginning
  433. # @param fmt Format structure that will be filled out on return
  434. # @return 0 for success, or negative if an error occurred.
  435. int hts_detect_format(hFILE *fp, htsFormat *fmt)
  436. # @abstract Get a human-readable description of the file format
  437. # @return Description string, to be freed by the caller after use.
  438. char *hts_format_description(const htsFormat *format)
  439. # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
  440. # @param fn The file name or "-" for stdin/stdout
  441. # @param mode Mode matching / [rwa][bceguxz0-9]* /
  442. # @discussion
  443. # With 'r' opens for reading; any further format mode letters are ignored
  444. # as the format is detected by checking the first few bytes or BGZF blocks
  445. # of the file. With 'w' or 'a' opens for writing or appending, with format
  446. # specifier letters:
  447. # b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
  448. # c CRAM format
  449. # g gzip compressed
  450. # u uncompressed
  451. # z bgzf compressed
  452. # [0-9] zlib compression level
  453. # and with non-format option letters (for any of 'r'/'w'/'a'):
  454. # e close the file on exec(2) (opens with O_CLOEXEC, where supported)
  455. # x create the file exclusively (opens with O_EXCL, where supported)
  456. # Note that there is a distinction between 'u' and '0': the first yields
  457. # plain uncompressed output whereas the latter outputs uncompressed data
  458. # wrapped in the zlib format.
  459. # @example
  460. # [rw]b .. compressed BCF, BAM, FAI
  461. # [rw]bu .. uncompressed BCF
  462. # [rw]z .. compressed VCF
  463. # [rw] .. uncompressed VCF
  464. htsFile *hts_open(const char *fn, const char *mode)
  465. # @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file
  466. # @param fn The file name or "-" for stdin/stdout
  467. # @param mode Open mode, as per hts_open()
  468. # @param fmt Optional format specific parameters
  469. # @discussion
  470. # See hts_open() for description of fn and mode.
  471. # // TODO Update documentation for s/opts/fmt/
  472. # Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
  473. # if defined, override mode. Opts also contains a linked list of hts_opt
  474. # structures to apply to the open file handle. These can contain things
  475. # like pointers to the reference or information on compression levels,
  476. # block sizes, etc.
  477. htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt)
  478. # @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
  479. # @param fp The already-open file handle
  480. # @param fn The file name or "-" for stdin/stdout
  481. # @param mode Open mode, as per hts_open()
  482. htsFile *hts_hopen(hFILE *fp, const char *fn, const char *mode)
  483. # @abstract For output streams, flush any buffered data
  484. # @param fp The file handle to be flushed
  485. # @return 0 for success, or negative if an error occurred.
  486. # @since 1.14
  487. int hts_flush(htsFile *fp)
  488. # @abstract Close a file handle, flushing buffered data for output streams
  489. # @param fp The file handle to be closed
  490. # @return 0 for success, or negative if an error occurred.
  491. int hts_close(htsFile *fp)
  492. # @abstract Returns the file's format information
  493. # @param fp The file handle
  494. # @return Read-only pointer to the file's htsFormat.
  495. const htsFormat *hts_get_format(htsFile *fp)
  496. # @abstract Returns a string containing the file format extension.
  497. # @param format Format structure containing the file type.
  498. # @return A string ("sam", "bam", etc) or "?" for unknown formats.
  499. const char *hts_format_file_extension(const htsFormat *format)
  500. # @abstract Sets a specified CRAM option on the open file handle.
  501. # @param fp The file handle of the open file.
  502. # @param opt The CRAM_OPT_* option.
  503. # @param ... Optional arguments, dependent on the option used.
  504. # @return 0 for success, or negative if an error occurred.
  505. int hts_set_opt(htsFile *fp, hts_fmt_option opt, ...)
  506. int hts_getline(htsFile *fp, int delimiter, kstring_t *str)
  507. char **hts_readlines(const char *fn, int *_n)
  508. # @abstract Parse comma-separated list or read list from a file
  509. # @param list File name or comma-separated list
  510. # @param is_file
  511. # @param _n Size of the output array (number of items read)
  512. # @return NULL on failure or pointer to newly allocated array of
  513. # strings
  514. char **hts_readlist(const char *fn, int is_file, int *_n)
  515. # @abstract Create extra threads to aid compress/decompression for this file
  516. # @param fp The file handle
  517. # @param n The number of worker threads to create
  518. # @return 0 for success, or negative if an error occurred.
  519. # @notes THIS THREADING API IS LIKELY TO CHANGE IN FUTURE.
  520. int hts_set_threads(htsFile *fp, int n)
  521. # @abstract Set .fai filename for a file opened for reading
  522. # @return 0 for success, negative on failure
  523. # @discussion
  524. # Called before *_hdr_read(), this provides the name of a .fai file
  525. # used to provide a reference list if the htsFile contains no @SQ headers.
  526. int hts_set_fai_filename(htsFile *fp, const char *fn_aux)
  527. int8_t HTS_IDX_NOCOOR
  528. int8_t HTS_IDX_START
  529. int8_t HTS_IDX_REST
  530. int8_t HTS_IDX_NONE
  531. int8_t HTS_FMT_CSI
  532. int8_t HTS_FMT_BAI
  533. int8_t HTS_FMT_TBI
  534. int8_t HTS_FMT_CRAI
  535. BGZF *hts_get_bgzfp(htsFile *fp)
  536. ctypedef struct hts_idx_t
  537. ctypedef struct hts_pair64_t:
  538. uint64_t u, v
  539. ctypedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end)
  540. ctypedef struct hts_bins_t:
  541. int n, m
  542. int *a
  # Iterator state handed back by hts_itr_query()/hts_itr_querys() and
  # advanced by hts_itr_next() (all declared further down in this extern
  # block).
  #
  # Cython copies extern struct member names verbatim into the generated C,
  # so every name here has to match the C header. The region start was
  # previously misspelled `bed`; it is `beg`, matching curr_beg/curr_end
  # and the (tid, beg, end) arguments of hts_itr_query().
  ctypedef struct hts_itr_t:
      uint32_t read_rest
      uint32_t finished
      # tid/beg/end: the queried region; n_off, i: presumably the length of
      # and cursor into `off` -- NOTE(review): confirm against htslib/hts.h
      int tid, beg, end, n_off, i
      int curr_tid, curr_beg, curr_end
      uint64_t curr_off
      hts_pair64_t *off
      hts_readrec_func *readfunc
      hts_bins_t bins
  552. hts_idx_t *hts_idx_init(int n, int fmt, uint64_t offset0, int min_shift, int n_lvls)
  553. void hts_idx_destroy(hts_idx_t *idx)
  554. int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped)
  555. void hts_idx_finish(hts_idx_t *idx, uint64_t final_offset)
  556. #### Save an index to a file
  557. # @param idx Index to be written
  558. # @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
  559. # @param fmt One of the HTS_FMT_* index formats
  560. # @return 0 if successful, or negative if an error occurred.
  561. int hts_idx_save(const hts_idx_t *idx, const char *fn, int fmt)
  562. #### Save an index to a specific file
  563. # @param idx Index to be written
  564. # @param fn Input BAM/BCF/etc filename
  565. # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
  566. # @param fmt One of the HTS_FMT_* index formats
  567. # @return 0 if successful, or negative if an error occurred.
  568. int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int fmt)
  569. #### Load an index file
  570. # @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
  571. # the extension substituted, to search for an existing index file
  572. # @param fmt One of the HTS_FMT_* index formats
  573. # @return The index, or NULL if an error occurred.
  574. hts_idx_t *hts_idx_load(const char *fn, int fmt)
  575. #### Load a specific index file
  576. # @param fn Input BAM/BCF/etc filename
  577. # @param fnidx The input index filename
  578. # @return The index, or NULL if an error occurred.
  579. hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
  580. #### Load a specific index file
  581. # @param fn Input BAM/BCF/etc filename
  582. # @param fnidx The input index filename
  583. # @param fmt One of the HTS_FMT_* index formats
  584. # @param flags Flags to alter behaviour (see description)
  585. # @return The index, or NULL if an error occurred.
  586. hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags)
  587. int HTS_IDX_SAVE_REMOTE
  588. int HTS_IDX_SILENT_FAIL
  589. uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
  590. void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
  591. int hts_idx_get_stat(const hts_idx_t* idx, int tid,
  592. uint64_t* mapped, uint64_t* unmapped)
  593. uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
  594. int HTS_PARSE_THOUSANDS_SEP # Ignore ',' separators within numbers
  595. # Parse a numeric string
  596. # The number may be expressed in scientific notation, and optionally may
  597. # contain commas in the integer part (before any decimal point or E notation).
  598. # @param str String to be parsed
  599. # @param strend If non-NULL, set on return to point to the first character
  600. # in @a str after those forming the parsed number
  601. # @param flags Or'ed-together combination of HTS_PARSE_* flags
  602. # @return Converted value of the parsed number.
  603. #
  604. # When @a strend is NULL, a warning will be printed (if hts_verbose is 2
  605. # or more) if there are any trailing characters after the number.
  606. long long hts_parse_decimal(const char *str, char **strend, int flags)
  607. # Parse a "CHR:START-END"-style region string
  608. # @param str String to be parsed
  609. # @param beg Set on return to the 0-based start of the region
  610. # @param end Set on return to the 1-based end of the region
  611. # @return Pointer to the colon or '\0' after the reference sequence name,
  612. # or NULL if @a str could not be parsed.
  613. const char *hts_parse_reg(const char *str, int *beg, int *end)
  614. hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec)
  615. void hts_itr_destroy(hts_itr_t *iter)
  616. ctypedef int (*hts_name2id_f)(void*, const char*)
  617. ctypedef const char *(*hts_id2name_f)(void*, int)
  618. ctypedef hts_itr_t *hts_itr_query_func(
  619. const hts_idx_t *idx,
  620. int tid,
  621. int beg,
  622. int end,
  623. hts_readrec_func *readrec)
  624. hts_itr_t *hts_itr_querys(
  625. const hts_idx_t *idx,
  626. const char *reg,
  627. hts_name2id_f getid,
  628. void *hdr,
  629. hts_itr_query_func *itr_query,
  630. hts_readrec_func *readrec)
  631. int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data)
  632. const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) # free only the array, not the values
  633. # hts_file_type() - Convenience function to determine file type
  634. # @fname: the file name
  635. #
  636. # Returns one of the FT_* defines.
  637. #
  638. # DEPRECATED: This function has been replaced by hts_detect_format().
  639. # It and these FT_* macros will be removed in a future HTSlib release.
  640. int FT_UNKN
  641. int FT_GZ
  642. int FT_VCF
  643. int FT_VCF_GZ
  644. int FT_BCF
  645. int FT_BCF_GZ
  646. int FT_STDIN
  647. int hts_file_type(const char *fname)
  648. # /***************************
  649. # * Revised MAQ error model *
  650. # ***************************/
  651. ctypedef struct errmod_t
  652. errmod_t *errmod_init(double depcorr)
  653. void errmod_destroy(errmod_t *em)
  654. # /*
  655. # n: number of bases
  656. # m: maximum base
  657. # bases[i]: qual:6, strand:1, base:4
  658. # q[i*m+j]: phred-scaled likelihood of (i,j)
  659. # */
  660. int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic)
  661. # /*****************************************
  662. # * Probabilistic banded glocal alignment *
  663. # *****************************************/
  664. ctypedef struct probaln_par_t:
  665. float d, e
  666. int bw
  667. int probaln_glocal(const uint8_t *ref,
  668. int l_ref,
  669. const uint8_t *query,
  670. int l_query, const uint8_t *iqual,
  671. const probaln_par_t *c,
  672. int *state, uint8_t *q)
  673. # /**********************
  674. # * MD5 implementation *
  675. # **********************/
  676. ctypedef struct hts_md5_context
  677. # /*! @abstract Initialises an MD5 context.
  678. # * @discussion
  679. # * The expected use is to allocate an hts_md5_context using
  680. # * hts_md5_init(). This pointer is then passed into one or more calls
  681. # * of hts_md5_update() to compute successive internal portions of the
  682. # * MD5 sum, which can then be externalised as a full 16-byte MD5sum
  683. # * calculation by calling hts_md5_final(). This can then be turned
  684. # * into ASCII via hts_md5_hex().
  685. # *
  686. # * To dealloate any resources created by hts_md5_init() call the
  687. # * hts_md5_destroy() function.
  688. # *
  689. # * @return hts_md5_context pointer on success, NULL otherwise.
  690. # */
  691. hts_md5_context *hts_md5_init()
  692. # /*! @abstract Updates the context with the MD5 of the data. */
  693. void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
  694. # /*! @abstract Computes the final 128-bit MD5 hash from the given context */
  695. void hts_md5_final(unsigned char *digest, hts_md5_context *ctx)
  696. # /*! @abstract Resets an md5_context to the initial state, as returned
  697. # * by hts_md5_init().
  698. # */
  699. void hts_md5_reset(hts_md5_context *ctx)
  700. # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
  701. # * hex string.
  702. # */
  703. void hts_md5_hex(char *hex, const unsigned char *digest)
  704. # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
  705. void hts_md5_destroy(hts_md5_context *ctx)
  706. int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
  707. int hts_bin_bot(int bin, int n_lvls)
  708. # * Endianness *
  709. int ed_is_big()
  710. uint16_t ed_swap_2(uint16_t v)
  711. void *ed_swap_2p(void *x)
  712. uint32_t ed_swap_4(uint32_t v)
  713. void *ed_swap_4p(void *x)
  714. uint64_t ed_swap_8(uint64_t v)
  715. void *ed_swap_8p(void *x)
  716. cdef extern from "htslib/sam.h" nogil:
  717. #**********************
  718. #*** SAM/BAM header ***
  719. #**********************
  720. # @abstract Structure for the alignment header.
  721. # @field n_targets number of reference sequences
  722. # @field l_text length of the plain text in the header
  723. # @field target_len lengths of the reference sequences
  724. # @field target_name names of the reference sequences
  725. # @field text plain text
  726. # @field sdict header dictionary
  727. ctypedef struct bam_hdr_t:
  728. int32_t n_targets, ignore_sam_err
  729. uint32_t l_text
  730. uint32_t *target_len
  731. uint8_t *cigar_tab
  732. char **target_name
  733. char *text
  734. void *sdict
  735. #****************************
  736. #*** CIGAR related macros ***
  737. #****************************
  738. int BAM_CMATCH
  739. int BAM_CINS
  740. int BAM_CDEL
  741. int BAM_CREF_SKIP
  742. int BAM_CSOFT_CLIP
  743. int BAM_CHARD_CLIP
  744. int BAM_CPAD
  745. int BAM_CEQUAL
  746. int BAM_CDIFF
  747. int BAM_CBACK
  748. char *BAM_CIGAR_STR
  749. int BAM_CIGAR_SHIFT
  750. uint32_t BAM_CIGAR_MASK
  751. uint32_t BAM_CIGAR_TYPE
  752. char bam_cigar_op(uint32_t c)
  753. uint32_t bam_cigar_oplen(uint32_t c)
  754. char bam_cigar_opchr(uint32_t)
  755. uint32_t bam_cigar_gen(char, uint32_t)
  756. int bam_cigar_type(char o)
  757. # @abstract the read is paired in sequencing, no matter whether it is mapped in a pair
  758. int BAM_FPAIRED
  759. # @abstract the read is mapped in a proper pair
  760. int BAM_FPROPER_PAIR
  761. # @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
  762. int BAM_FUNMAP
  763. # @abstract the mate is unmapped
  764. int BAM_FMUNMAP
  765. # @abstract the read is mapped to the reverse strand
  766. int BAM_FREVERSE
  767. # @abstract the mate is mapped to the reverse strand
  768. int BAM_FMREVERSE
  769. # @abstract this is read1
  770. int BAM_FREAD1
  771. # @abstract this is read2
  772. int BAM_FREAD2
  773. # @abstract not primary alignment
  774. int BAM_FSECONDARY
  775. # @abstract QC failure
  776. int BAM_FQCFAIL
  777. # @abstract optical or PCR duplicate
  778. int BAM_FDUP
  779. # @abstract supplementary alignment
  780. int BAM_FSUPPLEMENTARY
  781. #*************************
  782. #*** Alignment records ***
  783. #*************************
  784. # @abstract Structure for core alignment information.
  785. # @field tid chromosome ID, defined by bam_hdr_t
  786. # @field pos 0-based leftmost coordinate
  787. # @field bin bin calculated by bam_reg2bin()
  788. # @field qual mapping quality
  789. # @field l_qname length of the query name
  790. # @field flag bitwise flag
  791. # @field n_cigar number of CIGAR operations
  792. # @field l_qseq length of the query sequence (read)
  793. # @field mtid chromosome ID of next read in template, defined by bam_hdr_t
  794. # @field mpos 0-based leftmost coordinate of next read in template
  795. ctypedef struct bam1_core_t:
  796. int32_t tid
  797. int32_t pos
  798. uint16_t bin
  799. uint8_t qual
  800. uint8_t l_qname
  801. uint16_t flag
  802. uint8_t unused1
  803. uint8_t l_extranul
  804. uint32_t n_cigar
  805. int32_t l_qseq
  806. int32_t mtid
  807. int32_t mpos
  808. int32_t isize
  809. # @abstract Structure for one alignment.
  810. # @field core core information about the alignment
  811. # @field l_data current length of bam1_t::data
  812. # @field m_data maximum length of bam1_t::data
  813. # @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
  814. #
  815. # @discussion Notes:
  816. #
  817. # 1. qname is zero tailing and core.l_qname includes the tailing '\0'.
  818. # 2. l_qseq is calculated from the total length of an alignment block
  819. # on reading or from CIGAR.
  820. # 3. cigar data is encoded 4 bytes per CIGAR operation.
  821. # 4. seq is nybble-encoded according to seq_nt16_table.
  822. ctypedef struct bam1_t:
  823. bam1_core_t core
  824. int l_data
  825. uint32_t m_data
  826. uint8_t *data
  827. uint64_t id
  828. # @abstract Get whether the query is on the reverse strand
  829. # @param b pointer to an alignment
  830. # @return boolean true if query is on the reverse strand
  831. int bam_is_rev(bam1_t *b)
  832. # @abstract Get whether the query's mate is on the reverse strand
  833. # @param b pointer to an alignment
  834. # @return boolean true if query's mate on the reverse strand
  835. int bam_is_mrev(bam1_t *b)
  836. # @abstract Get the name of the query
  837. # @param b pointer to an alignment
  838. # @return pointer to the name string, null terminated
  839. char *bam_get_qname(bam1_t *b)
  840. # @abstract Get the CIGAR array
  841. # @param b pointer to an alignment
  842. # @return pointer to the CIGAR array
  843. #
  844. # @discussion In the CIGAR array, each element is a 32-bit integer. The
  845. # lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
  846. # length of a CIGAR.
  847. uint32_t *bam_get_cigar(bam1_t *b)
  848. # @abstract Get query sequence
  849. # @param b pointer to an alignment
  850. # @return pointer to sequence
  851. #
  852. # @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
  853. # 8 for T and 15 for N. Two bases are packed in one byte with the base
  854. # at the higher 4 bits having smaller coordinate on the read. It is
  855. # recommended to use bam_seqi() macro to get the base.
  856. char *bam_get_seq(bam1_t *b)
  857. # @abstract Get query quality
  858. # @param b pointer to an alignment
  859. # @return pointer to quality string
  860. uint8_t *bam_get_qual(bam1_t *b)
  861. # @abstract Get auxiliary data
  862. # @param b pointer to an alignment
  863. # @return pointer to the concatenated auxiliary data
  864. uint8_t *bam_get_aux(bam1_t *b)
  865. # @abstract Get length of auxiliary data
  866. # @param b pointer to an alignment
  867. # @return length of the concatenated auxiliary data
  868. int bam_get_l_aux(bam1_t *b)
  869. # @abstract Get a base on read
  870. # @param s Query sequence returned by bam1_seq()
  871. # @param i The i-th position, 0-based
  872. # @return 4-bit integer representing the base.
  873. char bam_seqi(char *s, int i)
  874. #**************************
  875. #*** Exported functions ***
  876. #**************************
  877. #***************
  878. #*** BAM I/O ***
  879. #***************
  880. bam_hdr_t *bam_hdr_init()
  881. bam_hdr_t *bam_hdr_read(BGZF *fp)
  882. int bam_hdr_write(BGZF *fp, const bam_hdr_t *h)
  883. void bam_hdr_destroy(bam_hdr_t *h)
  884. int bam_name2id(bam_hdr_t *h, const char *ref)
  885. bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0)
  886. bam1_t *bam_init1()
  887. void bam_destroy1(bam1_t *b)
  888. int bam_read1(BGZF *fp, bam1_t *b)
  889. int bam_write1(BGZF *fp, const bam1_t *b)
  890. bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
  891. bam1_t *bam_dup1(const bam1_t *bsrc)
  892. int bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
  893. int bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
  894. # @abstract Calculate the rightmost base position of an alignment on the
  895. # reference genome.
  896. # @param b pointer to an alignment
  897. # @return the coordinate of the first base after the alignment, 0-based
  898. # @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen.
  899. # For an unmapped read (either according to its flags or if it has no cigar
  900. # string), we return b->core.pos + 1 by convention.
  901. int32_t bam_endpos(const bam1_t *b)
  902. int bam_str2flag(const char *str) # returns negative value on error
  903. char *bam_flag2str(int flag) # The string must be freed by the user
  904. #*************************
  905. #*** BAM/CRAM indexing ***
  906. #*************************
  907. # These BAM iterator functions work only on BAM files. To work with either
  908. # BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.
  909. void bam_itr_destroy(hts_itr_t *iter)
  910. hts_itr_t *bam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
  911. hts_itr_t *bam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
  912. int bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
  913. # Load/build .csi or .bai BAM index file. Does not work with CRAM.
  914. # It is recommended to use the sam_index_* functions below instead.
  915. hts_idx_t *bam_index_load(const char *fn)
  916. int bam_index_build(const char *fn, int min_shift)
  917. # Load a BAM (.csi or .bai) or CRAM (.crai) index file
  918. # @param fp File handle of the data file whose index is being opened
  919. # @param fn BAM/CRAM/etc filename to search alongside for the index file
  920. # @return The index, or NULL if an error occurred.
  921. hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
  922. # Load a specific BAM (.csi or .bai) or CRAM (.crai) index file
  923. # @param fp File handle of the data file whose index is being opened
  924. # @param fn BAM/CRAM/etc data file filename
  925. # @param fnidx Index filename, or NULL to search alongside @a fn
  926. # @return The index, or NULL if an error occurred.
  927. hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
  928. # Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file
  929. # @param fp File handle of the data file whose index is being opened
  930. # @param fn BAM/CRAM/etc data file filename
  931. # @param fnidx Index filename, or NULL to search alongside @a fn
  932. # @param flags Flags to alter behaviour
  933. # @return The index, or NULL if an error occurred.
  934. hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
  935. # Generate and save an index file
  936. # @param fn Input BAM/etc filename, to which .csi/etc will be added
  937. # @param min_shift Positive to generate CSI, or 0 to generate BAI
  938. # @return 0 if successful, or negative if an error occurred (usually -1; or
  939. # -2: opening fn failed; -3: format not indexable)
  940. int sam_index_build(const char *fn, int min_shift)
  941. # Generate and save an index to a specific file
  942. # @param fn Input BAM/CRAM/etc filename
  943. # @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn
  944. # @param min_shift Positive to generate CSI, or 0 to generate BAI
  945. # @return 0 if successful, or negative if an error occurred.
  946. int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
  947. void sam_itr_destroy(hts_itr_t *iter)
  948. hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
  949. hts_itr_t *sam_itr_querys(const hts_idx_t *idx, bam_hdr_t *hdr, const char *region)
  950. int sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r)
  951. #***************
  952. #*** SAM I/O ***
  953. #***************
  954. htsFile *sam_open(const char *fn, const char *mode)
  955. htsFile *sam_open_format(const char *fn, const char *mode, const htsFormat *fmt)
  956. int sam_close(htsFile *fp)
  957. int sam_open_mode(char *mode, const char *fn, const char *format)
  958. # A version of sam_open_mode that can handle ,key=value options.
  959. # The format string is allocated and returned, to be freed by the caller.
  960. # Prefix should be "r" or "w",
  961. char *sam_open_mode_opts(const char *fn, const char *mode, const char *format)
  962. bam_hdr_t *sam_hdr_parse(int l_text, const char *text)
  963. bam_hdr_t *sam_hdr_read(htsFile *fp)
  964. int sam_hdr_write(htsFile *fp, const bam_hdr_t *h)
  965. int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b)
  966. int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
  967. int sam_read1(htsFile *fp, bam_hdr_t *h, bam1_t *b)
  968. int sam_write1(htsFile *fp, const bam_hdr_t *h, const bam1_t *b)
  969. #*************************************
  970. #*** Manipulating auxiliary fields ***
  971. #*************************************
  972. uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
  973. int64_t bam_aux2i(const uint8_t *s)
  974. double bam_aux2f(const uint8_t *s)
  975. char bam_aux2A(const uint8_t *s)
  976. char *bam_aux2Z(const uint8_t *s)
  977. void bam_aux_append(bam1_t *b, const char *tag, char type, int len, uint8_t *data)
  978. int bam_aux_del(bam1_t *b, uint8_t *s)
  979. #**************************
  980. #*** Pileup and Mpileup ***
  981. #**************************
  982. # @abstract Generic pileup 'client data'.
  983. # @discussion The pileup iterator allows setting a constructor and
  984. # destructor function, which will be called every time a sequence is
  985. # fetched and discarded. This permits caching of per-sequence data in
  986. # a tidy manner during the pileup process. This union is the cached
  987. # data to be manipulated by the "client" (the caller of pileup).
  988. #
  989. union bam_pileup_cd:
  990. void *p
  991. int64_t i
  992. double f
  993. # @abstract Structure for one alignment covering the pileup position.
  994. # @field b pointer to the alignment
  995. # @field qpos position of the read base at the pileup site, 0-based
  996. # @field indel indel length; 0 for no indel, positive for ins and negative for del
  997. # @field level the level of the read in the "viewer" mode
  998. # @field is_del 1 iff the base on the padded read is a deletion
  999. # @field is_head ???
  1000. # @field is_tail ???
  1001. # @field is_refskip ???
  1002. # @field aux ???
  1003. #
  1004. # @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
  1005. # difference between the two functions is that the former does not
  1006. # set bam_pileup1_t::level, while the later does. Level helps the
  1007. # implementation of alignment viewers, but calculating this has some
  1008. # overhead.
  1009. #
  1010. # is_del, is_head, etc are a bit field, declaring as below should
  1011. # work as expected, see
  1012. # https://groups.google.com/forum/#!msg/cython-users/24tD1kwRY7A/pmoPuSmanM0J
  1013. ctypedef struct bam_pileup1_t:
  1014. bam1_t *b
  1015. int32_t qpos
  1016. int indel, level
  1017. uint32_t is_del
  1018. uint32_t is_head
  1019. uint32_t is_tail
  1020. uint32_t is_refskip
  1021. uint32_t aux
  1022. bam_pileup_cd cd
  1023. ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
  1024. ctypedef int (*bam_test_f)()
  1025. ctypedef struct __bam_plp_t
  1026. ctypedef __bam_plp_t *bam_plp_t
  1027. ctypedef struct __bam_mplp_t
  1028. ctypedef __bam_mplp_t *bam_mplp_t
  1029. # bam_plp_init() - sets an iterator over multiple
  1030. # @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return
  1031. # status: 0 on success, -1 on end, < -1 on non-recoverable errors
  1032. # @data: user data to pass to @func
  1033. bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
  1034. void bam_plp_destroy(bam_plp_t iter)
  1035. int bam_plp_push(bam_plp_t iter, const bam1_t *b)
  1036. const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
  1037. const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
  1038. void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
  1039. void bam_plp_reset(bam_plp_t iter)
  1040. bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
  1041. # bam_mplp_init_overlaps() - if called, mpileup will detect overlapping
  1042. # read pairs and for each base pair set the base quality of the
  1043. # lower-quality base to zero, thus effectively discarding it from
  1044. # calling. If the two bases are identical, the quality of the other base
  1045. # is increased to the sum of their qualities (capped at 200), otherwise
  1046. # it is multiplied by 0.8.
  1047. void bam_mplp_init_overlaps(bam_mplp_t iter)
  1048. void bam_mplp_destroy(bam_mplp_t iter)
  1049. void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
  1050. int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
  1051. void bam_mplp_reset(bam_mplp_t iter)
  1052. void bam_mplp_constructor(bam_mplp_t iter,
  1053. int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
  1054. void bam_mplp_destructor(bam_mplp_t iter,
  1055. int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd))
  1056. # Added by AH
  1057. # ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
  1058. # // ---------------------------
  1059. # // Base modification retrieval
  1060. # /*! @typedef
  1061. # @abstract Holds a single base modification.
  1062. # @field modified_base The short base code (m, h, etc) or -ChEBI (negative)
  1063. # @field canonical_base The canonical base referred to in the MM tag.
  1064. # One of A, C, G, T or N. Note this may not be the
  1065. # explicit base recorded in the SEQ column (esp. if N).
  1066. # @field strand 0 or 1, indicating + or - strand from MM tag.
  1067. # @field qual Quality code (256*probability), or -1 if unknown
  1068. # @discussion
  1069. # Note this doesn't hold any location data or information on which other
  1070. # modifications may be possible at this site.
  1071. ctypedef struct hts_base_mod:
  1072. int modified_base
  1073. int canonical_base
  1074. int strand
  1075. int qual
  1076. # /// Allocates an hts_base_mode_state.
  1077. # /**
  1078. # * @return An hts_base_mode_state pointer on success,
  1079. # * NULL on failure.
  1080. # *
  1081. # * This just allocates the memory. The initialisation of the contents is
  1082. # * done using bam_parse_basemod. Successive calls may be made to that
  1083. # * without the need to free and allocate a new state.
  1084. # *
  1085. # * The state be destroyed using the hts_base_mode_state_free function.
  1086. # */
  1087. ctypedef struct hts_base_mod_state
  1088. hts_base_mod_state *hts_base_mod_state_alloc()
  1089. # /// Destroys an hts_base_mode_state.
  1090. # /**
  1091. # * @param state The base modification state pointer.
  1092. # *
  1093. # * The should have previously been created by hts_base_mode_state_alloc.
  1094. # */
  1095. void hts_base_mod_state_free(hts_base_mod_state *state)
  1096. # /// Parses the Mm and Ml tags out of a bam record.
  1097. # /**
  1098. # * @param b BAM alignment record
  1099. # * @param state The base modification state pointer.
  1100. # * @return 0 on success,
  1101. # * -1 on failure.
  1102. # *
  1103. # * This fills out the contents of the modification state, resetting the
  1104. # * iterator location to the first sequence base.
  1105. # */
  1106. int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state)
  1107. # /// Finds the next location containing base modifications and returns them
  1108. # /**
  1109. # * @param b BAM alignment record
  1110. # * @param state The base modification state pointer.
  1111. # * @param mods A supplied array for returning base modifications
  1112. # * @param n_mods The size of the mods array
  1113. # * @return The number of modifications found on success,
  1114. # * 0 if no more modifications are present,
  1115. # * -1 on failure.
  1116. # *
  1117. # * Unlike bam_mods_at_next_pos this skips ahead to the next site
  1118. # * with modifications.
  1119. # *
  1120. # * If more than n_mods modifications are found, the total found is returned.
  1121. # * Note this means the caller needs to check whether this is higher than
  1122. # * n_mods.
  1123. # */
  1124. int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,hts_base_mod *mods, int n_mods, int *pos)
  1125. # ***********************************
  1126. # * BAQ calculation and realignment *
  1127. # ***********************************/
  1128. int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
  1129. int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
  1130. cdef extern from "htslib/faidx.h" nogil:
  1131. ctypedef struct faidx_t:
  1132. pass
  1133. # /// Build index for a FASTA or bgzip-compressed FASTA file.
  1134. # /** @param fn FASTA file name
  1135. # @param fnfai Name of .fai file to build.
  1136. # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed).
  1137. # @return 0 on success; or -1 on failure
  1138. # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
  1139. # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI
  1140. # file will only be built if fn is bgzip-compressed.
  1141. # */
  1142. int fai_build3(const char *fn,
  1143. const char *fnfai,
  1144. const char *fngzi)
  1145. # /// Build index for a FASTA or bgzip-compressed FASTA file.
  1146. # /** @param fn FASTA file name
  1147. # @return 0 on success; or -1 on failure
  1148. #
  1149. # File "fn.fai" will be generated. This function is equivalent to
  1150. # fai_build3(fn, NULL, NULL);
  1151. # */
  1152. int fai_build(char *fn)
  1153. # /// Destroy a faidx_t struct
  1154. void fai_destroy(faidx_t *fai)
  1155. # /// Load FASTA indexes.
  1156. # /** @param fn File name of the FASTA file (can be compressed with bgzip).
  1157. # @param fnfai File name of the FASTA index.
  1158. # @param fngzi File name of the bgzip index.
  1159. # @param flags Option flags to control index file caching and creation.
  1160. # @return Pointer to a faidx_t struct on success, NULL on failure.
  1161. # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
  1162. # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
  1163. # The bgzip index is only needed if fn is compressed.
  1164. # If (flags & FAI_CREATE) is true, the index files will be built using
  1165. # fai_build3() if they are not already present.
  1166. # */
  1167. faidx_t *fai_load3(const char *fn,
  1168. const char *fnfai,
  1169. const char *fngzi,
  1170. int flags)
  1171. # /// Load index from "fn.fai".
  1172. # /** @param fn File name of the FASTA file
  1173. # @return Pointer to a faidx_t struct on success, NULL on failure.
  1174. # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
  1175. # */
  1176. faidx_t *fai_load(char *fn)
  1177. # /// Fetch the sequence in a region
  1178. # /** @param fai Pointer to the faidx_t struct
  1179. # @param reg Region in the format "chr2:20,000-30,000"
  1180. # @param len Length of the region; -2 if seq not present, -1 general error
  1181. # @return Pointer to the sequence; `NULL` on failure
  1182. # The returned sequence is allocated by `malloc()` family and should be destroyed
  1183. # by end users by calling `free()` on it.
  1184. # */
  1185. char *fai_fetch(faidx_t *fai,
  1186. char *reg,
  1187. int *len)
  1188. # /// Fetch the sequence in a region
  1189. # /** @param fai Pointer to the faidx_t struct
  1190. # @param c_name Region name
  1191. # @param p_beg_i Beginning position number (zero-based)
  1192. # @param p_end_i End position number (zero-based)
  1193. # @param len Length of the region; -2 if c_name not present, -1 general error
  1194. # @return Pointer to the sequence; null on failure
  1195. # The returned sequence is allocated by `malloc()` family and should be destroyed
  1196. # by end users by calling `free()` on it.
  1197. # */
  1198. char *faidx_fetch_seq(faidx_t *fai,
  1199. char *c_name,
  1200. int p_beg_i,
  1201. int p_end_i,
  1202. int *len)
  1203. # /// Query if sequence is present
  1204. # /** @param fai Pointer to the faidx_t struct
  1205. # @param seq Sequence name
  1206. # @return 1 if present or 0 if absent
  1207. # */
  1208. int faidx_has_seq(faidx_t *fai, const char *seq)
  1209. # /// Fetch the number of sequences
  1210. # /** @param fai Pointer to the faidx_t struct
  1211. # @return The number of sequences
  1212. # */
  1213. int faidx_nseq(const faidx_t *fai)
  1214. # /// Return name of i-th sequence
  1215. const char *faidx_iseq(const faidx_t *fai, int i)
  1216. # /// Return sequence length, -1 if not present
  1217. int faidx_seq_len(faidx_t *fai, const char *seq)
  1218. # tabix support
  1219. cdef extern from "htslib/tbx.h" nogil:
  1220. # tbx.h definitions
  1221. int8_t TBX_MAX_SHIFT
  1222. int32_t TBX_GENERIC
  1223. int32_t TBX_SAM
  1224. int32_t TBX_VCF
  1225. int32_t TBX_UCSC
  1226. ctypedef struct tbx_conf_t:
  1227. int32_t preset
  1228. int32_t sc, bc, ec # seq col., beg col. and end col.
  1229. int32_t meta_char, line_skip
  1230. ctypedef struct tbx_t:
  1231. tbx_conf_t conf
  1232. hts_idx_t *idx
  1233. void * dict
  1234. tbx_conf_t tbx_conf_gff
  1235. tbx_conf_t tbx_conf_bed
  1236. tbx_conf_t tbx_conf_psltbl
  1237. tbx_conf_t tbx_conf_sam
  1238. tbx_conf_t tbx_conf_vcf
  1239. void tbx_itr_destroy(hts_itr_t * iter)
  1240. hts_itr_t * tbx_itr_queryi(tbx_t * t, int tid, int bed, int end)
  1241. hts_itr_t * tbx_itr_querys(tbx_t * t, char * s)
  1242. int tbx_itr_next(htsFile * fp, tbx_t * t, hts_itr_t * iter, void * data)
  1243. int tbx_name2id(tbx_t *tbx, char *ss)
  1244. int tbx_index_build(char *fn, int min_shift, tbx_conf_t *conf)
  1245. int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf)
  1246. tbx_t * tbx_index_load(char *fn)
  1247. tbx_t *tbx_index_load2(const char *fn, const char *fnidx)
  1248. tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags)
  1249. # free the array but not the values
  1250. char **tbx_seqnames(tbx_t *tbx, int *n)
  1251. void tbx_destroy(tbx_t *tbx)
  1252. # VCF/BCF API
  1253. cdef extern from "htslib/vcf.h" nogil:
  1254. # Header struct
  1255. uint8_t BCF_HL_FLT # header line
  1256. uint8_t BCF_HL_INFO
  1257. uint8_t BCF_HL_FMT
  1258. uint8_t BCF_HL_CTG
  1259. uint8_t BCF_HL_STR # structured header line TAG=<A=..,B=..>
  1260. uint8_t BCF_HL_GEN # generic header line
  1261. uint8_t BCF_HT_FLAG # header type
  1262. uint8_t BCF_HT_INT
  1263. uint8_t BCF_HT_REAL
  1264. uint8_t BCF_HT_STR
  1265. uint8_t BCF_VL_FIXED # variable length
  1266. uint8_t BCF_VL_VAR
  1267. uint8_t BCF_VL_A
  1268. uint8_t BCF_VL_G
  1269. uint8_t BCF_VL_R
  1270. # === Dictionary ===
  1271. #
  1272. # The header keeps three dictionaries. The first keeps IDs in the
  1273. # "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
  1274. # in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
  1275. # is the actual hash table, which is opaque to the end users. In the hash
  1276. # table, the key is the ID or sample name as a C string and the value is a
  1277. # bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
  1278. # table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
  1279. # size of the hash table or, equivalently, the length of the id[] arrays.
  1280. uint8_t BCF_DT_ID # dictionary type
  1281. uint8_t BCF_DT_CTG
  1282. uint8_t BCF_DT_SAMPLE
  1283. # Complete textual representation of a header line
  1284. ctypedef struct bcf_hrec_t:
  1285. int type # One of the BCF_HL_* type
  1286. char *key # The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
  1287. char *value # Set only for generic lines, NULL for FILTER/INFO, etc.
  1288. int nkeys # Number of structured fields
  1289. char **keys # The key=value pairs
  1290. char **vals
  1291. ctypedef struct bcf_idinfo_t:
  1292. uint32_t info[3] # stores Number:20, var:4, Type:4, ColType:4 in info[0..2]
  1293. bcf_hrec_t *hrec[3] # for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG
  1294. int id
  1295. ctypedef struct bcf_idpair_t:
  1296. const char *key
  1297. const bcf_idinfo_t *val
  1298. ctypedef struct bcf_hdr_t:
  1299. int32_t n[3] # n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI)
  1300. bcf_idpair_t *id[3]
  1301. void *dict[3] # ID dictionary, contig dict and sample dict
  1302. char **samples
  1303. bcf_hrec_t **hrec
  1304. int nhrec, dirty
  1305. int ntransl
  1306. int *transl[2] # for bcf_translate()
  1307. int nsamples_ori # for bcf_hdr_set_samples()
  1308. uint8_t *keep_samples
  1309. kstring_t mem
  1310. int32_t m[3] # m: allocated size of the dictionary block in use (see n above)
  1311. uint8_t bcf_type_shift[]
  1312. # * VCF record *
  1313. uint8_t BCF_BT_NULL
  1314. uint8_t BCF_BT_INT8
  1315. uint8_t BCF_BT_INT16
  1316. uint8_t BCF_BT_INT32
  1317. uint8_t BCF_BT_FLOAT
  1318. uint8_t BCF_BT_CHAR
  1319. uint8_t VCF_REF
  1320. uint8_t VCF_SNP
  1321. uint8_t VCF_MNP
  1322. uint8_t VCF_INDEL
  1323. uint8_t VCF_OTHER
  1324. uint8_t VCF_BND
  1325. uint8_t VCF_OVERLAP
  1326. ctypedef struct variant_t:
  1327. int type, n # variant type and the number of bases affected, negative for deletions
  1328. ctypedef struct bcf_fmt_t:
  1329. int id # id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
  1330. int n, size, type # n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
  1331. uint8_t *p # same as vptr and vptr_* in bcf_info_t below
  1332. uint32_t p_len
  1333. uint32_t p_off
  1334. uint8_t p_free
  1335. union bcf_info_union_t:
  1336. int32_t i # integer value
  1337. float f # float value
  1338. ctypedef struct bcf_info_t:
  1339. int key # key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
  1340. int type, len # type: one of BCF_BT_* types; len: vector length, 1 for scalars
  1341. # v1 union only set if $len==1; for easier access
  1342. bcf_info_union_t v1
  1343. uint8_t *vptr # pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
  1344. uint32_t vptr_len # length of the vptr block or, when set, of the vptr_mod block, excluding offset
  1345. uint32_t vptr_off # vptr offset, i.e., the size of the INFO key plus size+type bytes
  1346. uint8_t vptr_free # indicates that vptr-vptr_off must be freed; set only when modified and the new
  1347. # data block is bigger than the original
  1348. uint8_t BCF1_DIRTY_ID
  1349. uint8_t BCF1_DIRTY_ALS
  1350. uint8_t BCF1_DIRTY_FLT
  1351. uint8_t BCF1_DIRTY_INF
  1352. ctypedef struct bcf_dec_t: # decoded record fields; stored as bcf1_t.d and filled by bcf_unpack()
  1353. int m_fmt, m_info, m_id, m_als, m_allele, m_flt # allocated size (high-water mark); do not change
  1354. int n_flt # Number of FILTER fields
  1355. int *flt # FILTER keys in the dictionary
  1356. char *id # ID
  1357. char *als # REF+ALT block (\0-separated)
  1358. char **allele # allele[0] is the REF (allele[] pointers to the als block); all null terminated
  1359. bcf_info_t *info # INFO
  1360. bcf_fmt_t *fmt # FORMAT and individual sample
  1361. variant_t *var # $var and $var_type set only when set_variant_types called
  1362. int n_var, var_type # n_var: presumably the number of entries in $var - TODO confirm; var_type: see $var above
  1363. int shared_dirty # if set, shared.s must be recreated on BCF output
  1364. int indiv_dirty # if set, indiv.s must be recreated on BCF output
  1365. uint8_t BCF_ERR_CTG_UNDEF
  1366. uint8_t BCF_ERR_TAG_UNDEF
  1367. uint8_t BCF_ERR_NCOLS
  1368. uint8_t BCF_ERR_LIMITS
  1369. uint8_t BCF_ERR_CHAR
  1370. uint8_t BCF_ERR_CTG_INVALID
  1371. uint8_t BCF_ERR_TAG_INVALID
  1372. # The bcf1_t structure corresponds to one VCF/BCF line. Reading from a VCF file
  1373. # is slower because the string must first be parsed and packed into a BCF line
  1374. # (done in vcf_parse), then unpacked into the internal bcf1_t structure. If it
  1375. # is known in advance that some of the fields will not be required (notably
  1376. # the sample columns), parsing of these can be skipped by setting max_unpack
  1377. # appropriately.
  1378. # Similarly, it is fast to output a BCF line because the columns (kept in
  1379. # shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
  1380. # line must be formatted in vcf_format.
  1381. ctypedef struct bcf1_t:
  1382. int32_t rid # CHROM, as a numeric id (bcf_hdr_id2name() translates it to the sequence name)
  1383. int32_t pos # POS
  1384. int32_t rlen # length of REF
  1385. float qual # QUAL
  1386. uint32_t n_info, n_allele # presumably the number of INFO fields and of alleles - TODO confirm against htslib/vcf.h
  1387. uint32_t n_fmt, n_sample # presumably the number of FORMAT fields and of samples - TODO confirm against htslib/vcf.h
  1388. kstring_t shared, indiv # packed column data (shared.s, indiv.s) that bcf_write outputs directly
  1389. bcf_dec_t d # lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
  1390. int max_unpack # Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
  1391. int unpacked # remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
  1392. int unpack_size[3] # the original block size of ID, REF+ALT and FILTER
  1393. int errcode # one of BCF_ERR_* codes
  1394. ####### API #######
  1395. # BCF and VCF I/O
  1396. #
  1397. # A note about naming conventions: htslib internally represents VCF
  1398. # records as bcf1_t data structures, therefore most functions are
  1399. # prefixed with bcf_. There are a few exceptions where the functions must
  1400. # be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
  1401. # these cases, functions prefixed with bcf_ are more general and work
  1402. # with both BCF and VCF.
  1403. # bcf_hdr_init() - create an empty BCF header.
  1404. # @param mode "r" or "w"
  1405. #
  1406. # When opened for writing, the mandatory fileFormat and
  1407. # FILTER=PASS lines are added automatically.
  1408. bcf_hdr_t *bcf_hdr_init(const char *mode)
  1409. # Destroy a BCF header struct
  1410. void bcf_hdr_destroy(bcf_hdr_t *h)
  1411. # Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t))
  1412. bcf1_t *bcf_init()
  1413. # Deallocate a bcf1_t object
  1414. void bcf_destroy(bcf1_t *v)
  1415. # Same as bcf_destroy() but frees only the memory allocated by bcf1_t,
  1416. # not the bcf1_t object itself.
  1417. void bcf_empty(bcf1_t *v)
  1418. # Make the bcf1_t object ready for next read. Intended mostly for
  1419. # internal use, the user should rarely need to call this function
  1420. # directly.
  1421. void bcf_clear(bcf1_t *v)
  1422. # Reads VCF or BCF header
  1423. bcf_hdr_t *bcf_hdr_read(htsFile *fp)
  1424. # bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed
  1425. # @samples: samples to include or exclude from file or as a comma-separated string.
  1426. # LIST|FILE .. select samples in list/file
  1427. # ^LIST|FILE .. exclude samples from list/file
  1428. # - .. include all samples
  1429. # NULL .. exclude all samples
  1430. # @is_file: @samples is a file (1) or a comma-separated list (0)
  1431. #
  1432. # The bottleneck of VCF reading is parsing of genotype fields. If the
  1433. # reader knows in advance that only a subset of samples is needed (possibly
  1434. # no samples at all), the performance of bcf_read() can be significantly
  1435. # improved by calling bcf_hdr_set_samples after bcf_hdr_read().
  1436. # The function bcf_read() will subset the VCF/BCF records automatically
  1437. # with the notable exception when reading records via bcf_itr_next().
  1438. # In this case, bcf_subset_format() must be called explicitly, because
  1439. # bcf_readrec() does not see the header.
  1440. #
  1441. # Returns 0 on success, -1 on error or a positive integer if the list
  1442. # contains samples not present in the VCF header. In such a case, the
  1443. # return value is the index of the offending sample.
  1444. #
  1445. int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
  1446. int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
  1447. # Writes VCF or BCF header
  1448. int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
  1449. # Parse VCF line contained in kstring and populate the bcf1_t struct
  1450. int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
  1451. # The opposite of vcf_parse. It should rarely be called directly, see vcf_write
  1452. int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
  1453. # bcf_read() - read next VCF or BCF record
  1454. #
  1455. # Returns -1 on critical errors, 0 otherwise. On errors which are not
  1456. # critical for reading, such as missing header definitions, v->errcode is
  1457. # set to one of the BCF_ERR_* codes and must be checked before calling
  1458. # vcf_write().
  1459. int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1460. # bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field)
  1461. #
  1462. # Note that bcf_unpack() must be called even when reading VCF. It is safe
  1463. # to call the function repeatedly; it will not unpack the same field
  1464. # twice.
  1465. uint8_t BCF_UN_STR # up to ALT inclusive
  1466. uint8_t BCF_UN_FLT # up to FILTER
  1467. uint8_t BCF_UN_INFO # up to INFO
  1468. uint8_t BCF_UN_SHR # all shared information
  1469. uint8_t BCF_UN_FMT # unpack format and each sample
  1470. uint8_t BCF_UN_IND # a synonym of BCF_UN_FMT
  1471. uint8_t BCF_UN_ALL # everything
  1472. int bcf_unpack(bcf1_t *b, int which) # which: presumably one of the BCF_UN_* values above - TODO confirm
  1473. # bcf_dup() - create a copy of BCF record.
  1474. #
  1475. # Note that bcf_unpack() must be called on the returned copy as if it was
  1476. # obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
  1477. # internally to reflect any changes made by bcf_update_* functions.
  1478. bcf1_t *bcf_dup(bcf1_t *src)
  1479. bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src)
  1480. # bcf_write() - write one VCF or BCF record. The type is determined at the open() call.
  1481. int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v)
  1482. # The following functions work only with VCFs and should rarely be called
  1483. # directly. Usually one wants to use their bcf_* alternatives, which work
  1484. # transparently with both VCFs and BCFs.
  1485. bcf_hdr_t *vcf_hdr_read(htsFile *fp)
  1486. int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
  1487. int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1488. int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1489. #************************************************************************
  1490. # Header querying and manipulation routines
  1491. #************************************************************************
  1492. # Create a new header using the supplied template
  1493. bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr)
  1494. # Copy header lines from src to dst if not already present in dst. See also bcf_translate().
  1495. # Returns 0 on success or sets a bit on error:
  1496. # 1 .. conflicting definitions of tag length
  1497. # # todo
  1498. int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
  1499. # bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate()
  1500. # @param dst: the destination header to be merged into, NULL on the first pass
  1501. # @param src: the source header
  1502. #
  1503. # Notes:
  1504. # - use as:
  1505. # bcf_hdr_t *dst = NULL;
  1506. # for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]);
  1507. #
  1508. # - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when
  1509. # combining multiple BCF headers. The current bcf_hdr_combine()
  1510. # does not have this problem, but became slow when used for many files.
  1511. bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src)
  1512. # bcf_hdr_add_sample() - add a new sample.
  1513. # @param sample: sample name to be added
  1514. int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
  1515. # Read VCF header from a file and update the header
  1516. int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
  1517. # bcf_hdr_format() - appends formatted header text to @str
  1518. # @hdr: the header to format
  1519. # @is_bcf: if zero, `IDX` fields are discarded
  1520. # @str: destination kstring; returns 0 if successful, or negative if an error occurred (since htslib 1.4)
  1521. int bcf_hdr_format(const bcf_hdr_t *hdr, int is_bcf, kstring_t *str)
  1522. # Returns formatted header (newly allocated string) and its length,
  1523. # excluding the terminating \0. If is_bcf parameter is unset, IDX
  1524. # fields are discarded.
  1525. char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
  1526. # Append new VCF header line, returns 0 on success
  1527. int bcf_hdr_append(bcf_hdr_t *h, const char *line)
  1528. int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...)
  1529. # VCF version, e.g. VCFv4.2
  1530. const char *bcf_hdr_get_version(const bcf_hdr_t *hdr)
  1531. void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
  1532. # bcf_hdr_remove() - remove VCF header tag
  1533. # @param type: one of BCF_HL_*
  1534. # @param key: tag name or NULL to remove all tags of the given type
  1535. void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
  1536. # bcf_hdr_subset() - creates a new copy of the header removing unwanted samples
  1537. # @param n: number of samples to keep
  1538. # @param samples: names of the samples to keep
  1539. # @param imap: mapping from index in @samples to the sample index in the original file
  1540. #
  1541. # Sample names not present in h0 are ignored. The number of unmatched samples can be checked
  1542. # by comparing n and bcf_hdr_nsamples(out_hdr).
  1543. # This function can be used to reorder samples.
  1544. # See also bcf_subset() which subsets individual records.
  1545. #
  1546. bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap)
  1547. # Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names)
  1548. const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
  1549. # Get number of samples
  1550. int32_t bcf_hdr_nsamples(const bcf_hdr_t *h)
  1551. # The following functions are for internal use and should rarely be called directly
  1552. int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
  1553. int bcf_hdr_sync(bcf_hdr_t *h)
  1554. bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
  1555. void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
  1556. int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
  1557. # bcf_hdr_get_hrec() - get header line info
  1558. # @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN
  1559. # @param key: the header key for generic lines (e.g. "fileformat"), any field
  1560. # for structured lines, typically "ID".
  1561. # @param value: the value which pairs with key. Can be NULL for BCF_HL_GEN
  1562. # @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL
  1563. #
  1564. bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
  1565. bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec)
  1566. int bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, size_t len)
  1567. int bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, size_t len, int is_quoted)
  1568. int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
  1569. int hrec_add_idx(bcf_hrec_t *hrec, int idx)
  1570. void bcf_hrec_destroy(bcf_hrec_t *hrec)
  1571. #************************************************************************
  1572. # Individual record querying and manipulation routines
  1573. #************************************************************************
  1574. # See the description of bcf_hdr_subset()
  1575. int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
  1576. # bcf_translate() - translate tags ids to be consistent with different header. This function
  1577. # is useful when lines from multiple VCF need to be combined.
  1578. # @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine()
  1579. # @src_hdr: the source header, used in bcf_read()
  1580. # @src_line: line obtained by bcf_read()
  1581. int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
  1582. # bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc
  1583. int bcf_get_variant_types(bcf1_t *rec)
  1584. int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
  1585. int bcf_is_snp(bcf1_t *v)
  1586. # bcf_update_filter() - sets the FILTER column
  1587. # @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
  1588. # @n: Number of filters. If n==0, all filters are removed
  1589. int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
  1590. # bcf_add_filter() - adds to the FILTER column
  1591. # @flt_id: The filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
  1592. #
  1593. # If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed.
  1594. int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
  1595. # bcf_remove_filter() - removes from the FILTER column
  1596. # @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS")
  1597. # @set_pass: when set to 1 and no filters are present, set to PASS
  1598. int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int set_pass)
  1599. # Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably.
  1600. int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
  1601. # bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALT column
  1602. # @alleles: Array of alleles
  1603. # @nals: Number of alleles
  1604. # @alleles_string: Comma-separated alleles, starting with the REF allele
  1605. int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
  1606. int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
  1607. # bcf_update_id() - sets new ID string
  1608. # bcf_add_id() - adds to the ID string checking for duplicates
  1609. int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
  1610. int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
  1611. # bcf_update_info_*() - functions for updating INFO fields
  1612. # @hdr: the BCF header
  1613. # @line: VCF line to be edited
  1614. # @key: the INFO tag to be updated
  1615. # @values: pointer to the array of values. Pass NULL to remove the tag.
  1616. # @n: number of values in the array. When set to 0, the INFO tag is removed
  1617. #
  1618. # The @string in bcf_update_info_flag() is optional, @n indicates whether
  1619. # the flag is set or removed.
  1620. #
  1621. # Returns 0 on success or negative value on error.
  1622. #
  1623. int bcf_update_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
  1624. int bcf_update_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
  1625. int bcf_update_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
  1626. int bcf_update_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
  1627. int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
  1628. # bcf_update_format_*() - functions for updating FORMAT fields
  1629. # @values: pointer to the array of values, the same number of elements
  1630. # is expected for each sample. Missing values must be padded
  1631. # with bcf_*_missing or bcf_*_vector_end values.
  1632. # @n: number of values in the array. If n==0, existing tag is removed.
  1633. #
  1634. # The function bcf_update_format_string() is a higher-level (slower) variant of
  1635. # bcf_update_format_char(). The former accepts array of \0-terminated strings
  1636. # whereas the latter requires that the strings are collapsed into a single array
  1637. # of fixed-length strings. In case of strings with variable length, shorter strings
  1638. # can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
  1639. # are not \0-terminated.
  1640. #
  1641. # Returns 0 on success or negative value on error.
  1642. #
  1643. int bcf_update_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const int32_t *values, int n)
  1644. int bcf_update_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const float *values, int n)
  1645. int bcf_update_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char *values, int n)
  1646. int bcf_update_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, const int32_t *values, int n)
  1647. int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
  1648. int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
  1649. # Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
  1650. # to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
  1651. # from bcf_get_genotypes() below.
  1652. uint32_t bcf_gt_phased(uint32_t idx)
  1653. uint32_t bcf_gt_unphased(uint32_t idx)
  1654. uint32_t bcf_gt_missing
  1655. uint32_t bcf_gt_is_missing(uint32_t val)
  1656. uint32_t bcf_gt_is_phased(uint32_t idx)
  1657. uint32_t bcf_gt_allele(uint32_t val)
  1658. # Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based)
  1659. uint32_t bcf_alleles2gt(uint32_t a, uint32_t b)
  1660. void bcf_gt2alleles(int igt, int *a, int *b)
  1661. # bcf_get_fmt() - returns pointer to FORMAT's field data
  1662. # @hdr: for access to BCF_DT_ID dictionary
  1663. # @line: VCF line obtained from vcf_parse1
  1664. # @key: one of GT,PL,...
  1665. #
  1666. # Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field
  1667. # is not available.
  1668. #
  1669. bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
  1670. bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
  1671. # bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID
  1672. # @line: VCF line obtained from vcf_parse1
  1673. # @id: The header index for the tag, obtained from bcf_hdr_id2int()
  1674. #
  1675. # Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid
  1676. # as their goal is to avoid the header lookup.
  1677. #
  1678. bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id)
  1679. bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id)
  1680. # bcf_get_info_*() - get INFO values, integers or floats
  1681. # @hdr: BCF header
  1682. # @line: BCF record
  1683. # @tag: INFO tag to retrieve
  1684. # @dst: *dst is pointer to a memory location, can point to NULL
  1685. # @ndst: pointer to the size of allocated memory
  1686. #
  1687. # Returns negative value on error or the number of written values on
  1688. # success. bcf_get_info_string() returns on success the number of
  1689. # characters written excluding the null-terminating byte. bcf_get_info_flag()
  1690. # returns 1 when flag is set or 0 if not.
  1691. #
  1692. # List of return codes:
  1693. # -1 .. no such INFO tag defined in the header
  1694. # -2 .. clash between types defined in the header and encountered in the VCF record
  1695. # -3 .. tag is not present in the VCF record
  1696. #
  1697. int bcf_get_info_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
  1698. int bcf_get_info_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
  1699. int bcf_get_info_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
  1700. int bcf_get_info_flag(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int **dst, int *ndst)
  1701. int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
  1702. # bcf_get_format_*() - same as bcf_get_info*() above
  1703. #
  1704. # The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char().
  1705. # see the description of bcf_update_format_string() and bcf_update_format_char() above.
  1706. # Unlike other bcf_get_format_*() functions, bcf_get_format_string() allocates two arrays:
  1707. # a single block of \0-terminated strings collapsed into a single array and an array of pointers
  1708. # to these strings. Both arrays must be cleaned by the user.
  1709. #
  1710. # Returns negative value on error or the number of written values on success.
  1711. #
  1712. # Example:
  1713. # int ndst = 0; char **dst = NULL
  1714. # if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 )
  1715. # for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i])
  1716. # free(dst[0]); free(dst)
  1717. #
  1718. # Example:
  1719. # int ngt, *gt_arr = NULL, ngt_arr = 0
  1720. # ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr)
  1721. #
  1722. int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
  1723. int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
  1724. int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
  1725. int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst)
  1726. int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
  1727. int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
  1728. #************************************************************************
  1729. # Helper functions
  1730. #************************************************************************
  1731. #
  1732. # bcf_hdr_id2int() - Translates string into numeric ID
  1733. # bcf_hdr_int2id() - Translates numeric ID into string
  1734. # @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE
  1735. # @id: tag name, such as: PL, DP, GT, etc.
  1736. #
  1737. # Returns -1 if string is not in dictionary, otherwise numeric ID which identifies
  1738. # fields in BCF records.
  1739. #
  1740. int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
  1741. const char *bcf_hdr_int2id(const bcf_hdr_t *hdr, int type, int int_id)
  1742. # bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID
  1743. # bcf_hdr_id2name() - Translates numeric ID to sequence name
  1744. #
  1745. int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id)
  1746. const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid)
  1747. const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec)
  1748. #
  1749. # bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t
  1750. # @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT
  1751. # @int_id: return value of bcf_hdr_id2int, must be >=0
  1752. #
  1753. # The returned values are:
  1754. # bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_*
  1755. # bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields
  1756. # bcf_hdr_id2type .. the field type, one of BCF_HT_*
  1757. # bcf_hdr_id2coltype .. the column type, one of BCF_HL_*
  1758. #
  1759. # Notes: Prior to using the macros, the presence of the info should be
  1760. # tested with bcf_hdr_idinfo_exists().
  1761. #
  1762. int bcf_hdr_id2length(const bcf_hdr_t *hdr, int type, int int_id)
  1763. int bcf_hdr_id2number(const bcf_hdr_t *hdr, int type, int int_id)
  1764. int bcf_hdr_id2type(const bcf_hdr_t *hdr, int type, int int_id)
  1765. int bcf_hdr_id2coltype(const bcf_hdr_t *hdr, int type, int int_id)
  1766. int bcf_hdr_idinfo_exists(const bcf_hdr_t *hdr, int type, int int_id)
  1767. bcf_hrec_t *bcf_hdr_id2hrec(const bcf_hdr_t *hdr, int type, int col_type, int int_id)
  1768. void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
  1769. uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
  1770. void bcf_enc_vchar(kstring_t *s, int l, const char *a)
  1771. void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
  1772. void bcf_enc_vfloat(kstring_t *s, int n, float *a)
  1773. #************************************************************************
  1774. # BCF index
  1775. #
  1776. # Note that these functions work with BCFs only. See synced_bcf_reader.h
  1777. # which provides (amongst other things) an API to work transparently with
  1778. # both indexed BCFs and VCFs.
  1779. #************************************************************************
  1780. hts_idx_t *bcf_index_load2(const char *fn, const char *fnidx)
  1781. hts_idx_t *bcf_index_load3(const char *fn, const char *fnidx, int flags)
  1782. int bcf_index_build(const char *fn, int min_shift)
  1783. int bcf_index_build2(const char *fn, const char *fnidx, int min_shift)
  1784. #*******************
  1785. # Typed value I/O *
  1786. #******************
  1787. # Note that in contrast with BCFv2.1 specification, HTSlib implementation
  1788. # allows missing values in vectors. For integer types, the values 0x80,
  1789. # 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
  1790. # 0x80000001 as end-of-vector indicators. Similarly for floats, the value of
  1791. # 0x7F800001 is interpreted as a missing value and 0x7F800002 as an
  1792. # end-of-vector indicator.
  1793. # Note that the end-of-vector byte is not part of the vector.
  1794. # This trial BCF version (v2.2) is compatible with the VCF specification and
  1795. # makes it possible to correctly handle vectors with different ploidy in the
  1796. # presence of missing values.
  1797. int32_t bcf_int8_vector_end
  1798. int32_t bcf_int16_vector_end
  1799. int32_t bcf_int32_vector_end
  1800. int32_t bcf_str_vector_end
  1801. int32_t bcf_int8_missing
  1802. int32_t bcf_int16_missing
  1803. int32_t bcf_int32_missing
  1804. int32_t bcf_str_missing
  1805. uint32_t bcf_float_vector_end
  1806. uint32_t bcf_float_missing
  1807. void bcf_float_set(float *ptr, uint32_t value)
  1808. void bcf_float_set_vector_end(float *x)
  1809. void bcf_float_set_missing(float *x)
  1810. int bcf_float_is_missing(float f)
  1811. int bcf_float_is_vector_end(float f)
  1812. void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
  1813. void bcf_enc_size(kstring_t *s, int size, int type)
  1814. int bcf_enc_inttype(long x)
  1815. void bcf_enc_int1(kstring_t *s, int32_t x)
  1816. int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
  1817. int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
  1818. int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
  1819. # These trivial wrappers are defined only for consistency with other parts of htslib
  1820. bcf1_t *bcf_init1()
  1821. int bcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1822. int vcf_read1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1823. int bcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1824. int vcf_write1(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
  1825. void bcf_destroy1(bcf1_t *v)
  1826. void bcf_empty1(bcf1_t *v)
  1827. int vcf_parse1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
  1828. void bcf_clear1(bcf1_t *v)
  1829. int vcf_format1(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
  1830. # Other nice wrappers
  1831. void bcf_itr_destroy(hts_itr_t *iter)
  1832. hts_itr_t *bcf_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end)
  1833. hts_itr_t *bcf_itr_querys(const hts_idx_t *idx, const bcf_hdr_t *hdr, char *s)
  1834. int bcf_itr_next(htsFile *fp, hts_itr_t *iter, void *r)
  1835. hts_idx_t *bcf_index_load(const char *fn)
  1836. const char **bcf_index_seqnames(const hts_idx_t *idx, const bcf_hdr_t *hdr, int *nptr)
  1837. # VCF/BCF utility functions
  1838. cdef extern from "htslib/vcfutils.h" nogil:
  1839. struct kbitset_t # declared without fields; only used via pointer in bcf_remove_allele_set() below
  1840. # bcf_trim_alleles() - remove ALT alleles unused in genotype fields
  1841. # @header: for access to BCF_DT_ID dictionary
  1842. # @line: VCF line obtained from vcf_parse1
  1843. #
  1844. # Returns the number of removed alleles on success or negative
  1845. # on error:
  1846. # -1 .. some allele index is out of bounds
  1847. int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line)
  1848. # bcf_remove_alleles() - remove ALT alleles according to bitmask @mask
  1849. # @header: for access to BCF_DT_ID dictionary
  1850. # @line: VCF line obtained from vcf_parse1
  1851. # @mask: alleles to remove
  1852. #
  1853. # If you have more than 31 alleles, then the integer bit mask will
  1854. # overflow, so use bcf_remove_allele_set instead
  1855. void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int mask)
  1856. # bcf_remove_allele_set() - remove ALT alleles according to bitset @rm_set
  1857. # @header: for access to BCF_DT_ID dictionary
  1858. # @line: VCF line obtained from vcf_parse1
  1859. # @rm_set: pointer to kbitset_t object with bits set for allele
  1860. # indexes to remove
  1861. #
  1862. # Number=A,R,G INFO and FORMAT fields will be updated accordingly.
  1863. void bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, kbitset_t *rm_set)
  1864. # bcf_calc_ac() - calculate the number of REF and ALT alleles
  1865. # @header: for access to BCF_DT_ID dictionary
  1866. # @line: VCF line obtained from vcf_parse1
  1867. # @ac: array of length line->n_allele
  1868. # @which: determines whether INFO/AN,AC and indv fields may be used
  1869. #
  1870. # Returns 1 if the call succeeded, or 0 if the value could not
  1871. # be determined.
  1872. #
  1873. # The value of @which determines if existing INFO/AC,AN can be
  1874. # used (BCF_UN_INFO) and if indv fields can be split (BCF_UN_FMT).
  1875. int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
  1876. # bcf_gt_type() - determines type of the genotype
  1877. # @fmt_ptr: the GT format field as set for example by set_fmt_ptr
  1878. # @isample: sample index (starting from 0)
  1879. # @ial: index of the 1st non-reference allele (starting from 1)
  1880. # @jal: index of the 2nd non-reference allele (starting from 1)
  1881. #
  1882. # Returns the type of the genotype (one of GT_HOM_RR, GT_HET_RA,
  1883. # GT_HOM_AA, GT_HET_AA, GT_HAPL_R, GT_HAPL_A or GT_UNKN). If $ial
  1884. # is not NULL and the genotype has one or more non-reference
  1885. # alleles, $ial will be set. In case of GT_HET_AA, $ial is the
  1886. # position of the allele which appeared first in ALT. If $jal is
  1887. # not NULL and the genotype is GT_HET_AA, $jal will be set and is
  1888. # the position of the second allele in ALT.
  1889. uint8_t GT_HOM_RR # note: the actual value of GT_* matters, used in dosage r2 calculation
  1890. uint8_t GT_HOM_AA
  1891. uint8_t GT_HET_RA
  1892. uint8_t GT_HET_AA
  1893. uint8_t GT_HAPL_R
  1894. uint8_t GT_HAPL_A
  1895. uint8_t GT_UNKN
  1896. int bcf_gt_type(bcf_fmt_t *fmt_ptr, int isample, int *ial, int *jal)
  1897. int bcf_acgt2int(char c) # presumably maps a nucleotide character to an integer code - TODO confirm in htslib/vcfutils.h
  1898. char bcf_int2acgt(int i) # presumably the inverse of bcf_acgt2int() - TODO confirm in htslib/vcfutils.h
  1899. # bcf_ij2G() - common task: allele indexes to Number=G index (diploid)
  1900. # @i,j: allele indexes, 0-based, i<=j
  1901. # Returns index to the Number=G diploid array
  1902. uint32_t bcf_ij2G(uint32_t i, uint32_t j)
  1903. cdef extern from "htslib/cram.h" nogil:
  1904. enum cram_block_method:
  1905. ERROR
  1906. RAW
  1907. GZIP
  1908. BZIP2
  1909. LZMA
  1910. RANS
  1911. RANS0
  1912. RANS1
  1913. GZIP_RLE
  1914. enum cram_content_type:
  1915. CT_ERROR
  1916. FILE_HEADER
  1917. COMPRESSION_HEADER
  1918. MAPPED_SLICE
  1919. UNMAPPED_SLICE
  1920. EXTERNAL
  1921. CORE
  1922. # Opaque data types, see cram_structs for the fully fledged versions.
  1923. ctypedef struct SAM_hdr
  1924. ctypedef struct cram_file_def
  1925. ctypedef struct cram_fd
  1926. ctypedef struct cram_container
  1927. ctypedef struct cram_block
  1928. ctypedef struct cram_slice
  1929. ctypedef struct cram_metrics
  1930. ctypedef struct cram_block_slice_hdr
  1931. ctypedef struct cram_block_compression_hdr
  1932. ctypedef struct refs_t
  1933. # Accessor functions
  1934. #
  1935. #-----------------------------------------------------------------------------
  1936. # cram_fd
  1937. #
  1938. SAM_hdr *cram_fd_get_header(cram_fd *fd)
  1939. void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
  1940. int cram_fd_get_version(cram_fd *fd)
  1941. void cram_fd_set_version(cram_fd *fd, int vers)
  1942. int cram_major_vers(cram_fd *fd)
  1943. int cram_minor_vers(cram_fd *fd)
  1944. hFILE *cram_fd_get_fp(cram_fd *fd)
  1945. void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
  1946. #
  1947. #-----------------------------------------------------------------------------
  1948. # cram_container
  1949. #
  1950. int32_t cram_container_get_length(cram_container *c)
  1951. void cram_container_set_length(cram_container *c, int32_t length)
  1952. int32_t cram_container_get_num_blocks(cram_container *c)
  1953. void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
  1954. int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
  1955. void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
  1956. int32_t *landmarks)
  1957. # Returns true if the container is empty (EOF marker).
  1958. int cram_container_is_empty(cram_fd *fd)
  1959. #
  1960. #-----------------------------------------------------------------------------
  1961. # cram_block
  1962. #
  1963. int32_t cram_block_get_content_id(cram_block *b)
  1964. int32_t cram_block_get_comp_size(cram_block *b)
  1965. int32_t cram_block_get_uncomp_size(cram_block *b)
  1966. int32_t cram_block_get_crc32(cram_block *b)
  1967. void * cram_block_get_data(cram_block *b)
  1968. cram_content_type cram_block_get_content_type(cram_block *b)
  1969. void cram_block_set_content_id(cram_block *b, int32_t id)
  1970. void cram_block_set_comp_size(cram_block *b, int32_t size)
  1971. void cram_block_set_uncomp_size(cram_block *b, int32_t size)
  1972. void cram_block_set_crc32(cram_block *b, int32_t crc)
  1973. void cram_block_set_data(cram_block *b, void *data)
  1974. int cram_block_append(cram_block *b, void *data, int size)
  1975. void cram_block_update_size(cram_block *b)
  1976. # Offset is known as "size" internally, but it can be confusing.
  1977. size_t cram_block_get_offset(cram_block *b)
  1978. void cram_block_set_offset(cram_block *b, size_t offset)
  1979. #
  1980. # Computes the size of a cram block, including the block
  1981. # header itself.
  1982. #
  1983. uint32_t cram_block_size(cram_block *b)
  1984. #
  1985. # Renumbers RG numbers in a cram compression header.
  1986. #
  1987. # CRAM stores RG as the Nth number in the header, rather than a
  1988. # string holding the ID: tag. This is smaller in space, but means
  1989. # "samtools cat" to join files together that contain single but
  1990. # different RG lines needs a way of renumbering them.
  1991. #
  1992. # The file descriptor is expected to be immediately after the
  1993. # cram_container structure (ie before the cram compression header).
  1994. # Due to the nature of the CRAM format, this needs to read and write
  1995. # the blocks itself. Note that there may be multiple slices within
  1996. # the container, meaning multiple compression headers to manipulate.
  1997. # Changing RG may change the size of the compression header and
  1998. # therefore the length field in the container. Hence we rewrite all
  1999. # blocks just in case and also emit the adjusted container.
  2000. #
  2001. # The current implementation can only cope with renumbering a single
  2002. # RG (and only then if it is using HUFFMAN or BETA codecs). In
  2003. # theory it *may* be possible to renumber multiple RGs if they use
  2004. # HUFFMAN to the CORE block or use an external block unshared by any
  2005. # other data series. So we have an API that can be upgraded to
  2006. # support this, but do not implement it for now. An example
  2007. # implementation of RG as an EXTERNAL block would be to find that
  2008. # block and rewrite it, returning the number of blocks consumed.
  2009. #
  2010. # Returns 0 on success;
  2011. # -1 if unable to edit;
  2012. # -2 on other errors (eg I/O).
  2013. #
  2014. int cram_transcode_rg(cram_fd *input, cram_fd *output,
  2015. cram_container *c,
  2016. int nrg, int *in_rg, int *out_rg)
  2017. #
  2018. # Copies the blocks representing the next num_slice slices from a
  2019. # container from 'in' to 'out'. It is expected that the file pointer
  2020. # is just after the read of the cram_container and cram compression
  2021. # header.
  2022. #
  2023. # Returns 0 on success
  2024. # -1 on failure
  2025. #
  2026. int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice)
  2027. #
  2028. #-----------------------------------------------------------------------------
  2029. # SAM_hdr
  2030. #
  2031. # Tokenises a SAM header into a hash table.
  2032. #
  2033. # Also extracts a few bits on specific data types, such as @RG lines.
  2034. #
  2035. # @return
  2036. # Returns a SAM_hdr struct on success (free with sam_hdr_free())
  2037. # NULL on failure
  2038. #
  2039. SAM_hdr *sam_hdr_parse_(const char *hdr, int len)
  2040. #
  2041. #-----------------------------------------------------------------------------
  2042. # cram_io basics
  2043. #
  2044. # CRAM blocks - the dynamically growable data block. We have code to
  2045. # create, update, (un)compress and read/write.
  2046. #
  2047. # These are derived from the deflate_interlaced.c blocks, but with the
  2048. # CRAM extension of content types and IDs.
  2049. #
  2050. # Allocates a new cram_block structure with a specified content_type and
  2051. # id.
  2052. #
  2053. # @return
  2054. # Returns block pointer on success;
  2055. # NULL on failure
  2056. #
  2057. cram_block *cram_new_block(cram_content_type content_type,
  2058. int content_id)
  2059. # Reads a block from a cram file.
  2060. #
  2061. # @return
  2062. # Returns cram_block pointer on success;
  2063. # NULL on failure
  2064. #
  2065. cram_block *cram_read_block(cram_fd *fd)
  2066. # Writes a CRAM block.
  2067. #
  2068. # @return
  2069. # Returns 0 on success;
  2070. # -1 on failure
  2071. #
  2072. int cram_write_block(cram_fd *fd, cram_block *b)
  2073. # Frees a CRAM block, deallocating internal data too.
  2074. #
  2075. void cram_free_block(cram_block *b)
  2076. # Uncompresses a CRAM block, if compressed.
  2077. #
  2078. # @return
  2079. # Returns 0 on success;
  2080. # -1 on failure
  2081. #
  2082. int cram_uncompress_block(cram_block *b)
  2083. # Compresses a block.
  2084. #
  2085. # Compresses a block using one of two different zlib strategies. If we only
  2086. # want one choice set strat2 to be -1. NOTE(review): the prototype below declares no strat2 argument (only method and level); this sentence looks stale - confirm against htslib.
  2087. #
  2088. # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
  2089. # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
  2090. # significantly faster.
  2091. #
  2092. # @return
  2093. # Returns 0 on success;
  2094. # -1 on failure
  2095. #
  2096. int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
  2097. int method, int level)
  2098. # Containers
  2099. #
  2100. # Creates a new container, specifying the maximum number of slices
  2101. # and records permitted.
  2102. #
  2103. # @return
  2104. # Returns cram_container ptr on success;
  2105. # NULL on failure
  2106. #
  2107. cram_container *cram_new_container(int nrec, int nslice)
  2108. void cram_free_container(cram_container *c)
  2109. # Reads a container header.
  2110. #
  2111. # @return
  2112. # Returns cram_container on success;
  2113. # NULL on failure or no container left (fd->err == 0).
  2114. #
  2115. cram_container *cram_read_container(cram_fd *fd)
  2116. # Writes a container structure.
  2117. #
  2118. # @return
  2119. # Returns 0 on success;
  2120. # -1 on failure
  2121. #
  2122. int cram_write_container(cram_fd *fd, cram_container *h)
  2123. #
  2124. # Stores the container structure in dat and returns *size as the
  2125. # number of bytes written to dat[]. The input size of dat is also
  2126. # held in *size and should be initialised to cram_container_size(c).
  2127. #
  2128. # Returns 0 on success;
  2129. # -1 on failure
  2130. #
  2131. int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
  2132. int cram_container_size(cram_container *c)
  2133. # The top-level cram opening, closing and option handling
  2134. #
  2135. # Opens a CRAM file for read (mode "rb") or write ("wb").
  2136. #
  2137. # The filename may be "-" to indicate stdin or stdout.
  2138. #
  2139. # @return
  2140. # Returns file handle on success;
  2141. # NULL on failure.
  2142. #
  2143. cram_fd *cram_open(const char *filename, const char *mode)
  2144. # Opens an existing stream for reading or writing.
  2145. #
  2146. # @return
  2147. # Returns file handle on success;
  2148. # NULL on failure.
  2149. #
  2150. cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode)
  2151. # Closes a CRAM file.
  2152. #
  2153. # @return
  2154. # Returns 0 on success;
  2155. # -1 on failure
  2156. #
  2157. int cram_close(cram_fd *fd)
  2158. #
  2159. # Seek within a CRAM file.
  2160. #
  2161. # Returns 0 on success
  2162. # -1 on failure
  2163. #
  2164. int cram_seek(cram_fd *fd, off_t offset, int whence)
  2165. #
  2166. # Flushes a CRAM file.
  2167. # Useful for when writing to stdout without wishing to close the stream.
  2168. #
  2169. # Returns 0 on success
  2170. # -1 on failure
  2171. #
  2172. int cram_flush(cram_fd *fd)
  2173. # Checks for end of file on a cram_fd stream.
  2174. #
  2175. # @return
  2176. # Returns 0 if not at end of file
  2177. # 1 if we hit an expected EOF (end of range or EOF block)
  2178. # 2 for other EOF (end of stream without EOF block)
  2179. #
  2180. int cram_eof(cram_fd *fd)
  2181. # Sets options on the cram_fd.
  2182. #
  2183. # See CRAM_OPT_* definitions in hts.h.
  2184. # Use this immediately after opening.
  2185. #
  2186. # @return
  2187. # Returns 0 on success;
  2188. # -1 on failure
  2189. #
  2190. int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...)
  2191. # Sets options on the cram_fd, taking the option arguments as a va_list instead of a variadic argument list.
  2192. #
  2193. # See CRAM_OPT_* definitions in hts.h.
  2194. # Use this immediately after opening.
  2195. #
  2196. # @return
  2197. # Returns 0 on success;
  2198. # -1 on failure
  2199. #
  2200. int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args)
  2201. #
  2202. # Attaches a header to a cram_fd.
  2203. #
  2204. # This should be used when creating a new cram_fd for writing where
  2205. # we have an SAM_hdr already constructed (eg from a file we've read
  2206. # in).
  2207. #
  2208. # @return
  2209. # Returns 0 on success;
  2210. # -1 on failure
  2211. #
  2212. int cram_set_header(cram_fd *fd, SAM_hdr *hdr)
  2213. # Check if this file has a proper EOF block
  2214. #
  2215. # @return
  2216. # Returns 3 if the file is a version of CRAM that does not contain EOF blocks
  2217. # 2 if the file is a stream and thus unseekable
  2218. # 1 if the file contains an EOF block
  2219. # 0 if the file does not contain an EOF block
  2220. # -1 if an error occurred whilst reading the file or we could not seek back to where we were
  2221. #
  2222. #
  2223. int cram_check_EOF(cram_fd *fd)
  2224. # As int32_decode/encode, but from/to blocks instead of cram_fd
  2225. int int32_put_blk(cram_block *b, int32_t val)
  2226. # Deallocates all storage used by a SAM_hdr struct.
  2227. #
  2228. # This also decrements the header reference count. If after decrementing
  2229. # it is still non-zero then the header is assumed to be in use by another
  2230. # caller and the free is not done.
  2231. #
  2232. # This is a synonym for sam_hdr_dec_ref().
  2233. #
  2234. void sam_hdr_free(SAM_hdr *hdr)
  2235. # Returns the current length of the SAM_hdr in text form.
  2236. #
  2237. # Call sam_hdr_rebuild() first if editing has taken place.
  2238. #
  2239. int sam_hdr_length(SAM_hdr *hdr)
  2240. # Returns the string form of the SAM_hdr.
  2241. #
  2242. # Call sam_hdr_rebuild() first if editing has taken place.
  2243. #
  2244. char *sam_hdr_str(SAM_hdr *hdr)
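  2245. # Appends a formatted line to an existing SAM header. NOTE(review): no declaration follows this comment block; upstream htslib attaches this text to sam_hdr_add_lines() - confirm whether that prototype should be declared here.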
  2246. #
  2247. # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
  2248. # optional new-line. If it contains more than 1 line then multiple lines
  2249. # will be added in order.
  2250. #
  2251. # Len is the length of the text data, or 0 if unknown (in which case
  2252. # it should be null terminated).
  2253. #
  2254. # @return
  2255. # Returns 0 on success;
  2256. # -1 on failure
  2257. #
  2258. # Add an @PG line.
  2259. #
  2260. # If we wish complete control over this use sam_hdr_add() directly. This
  2261. # function uses that, but attempts to do a lot of tedious house work for
  2262. # you too.
  2263. #
  2264. # - It will generate a suitable ID if the supplied one clashes.
  2265. # - It will generate multiple @PG records if we have multiple PG chains.
  2266. #
  2267. # Call it as per sam_hdr_add() with a series of key,value pairs ending
  2268. # in NULL.
  2269. #
  2270. # @return
  2271. # Returns 0 on success;
  2272. # -1 on failure
  2273. #
  2274. int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...)
  2275. #
  2276. # A function to help with construction of CL tags in @PG records.
  2277. # Takes an argc, argv pair and returns a single space-separated string.
  2278. # This string should be deallocated by the calling function.
  2279. #
  2280. # @return
  2281. # Returns malloced char * on success;
  2282. # NULL on failure
  2283. #
  2284. char *stringify_argv(int argc, char *argv[])
  2285. #
  2286. # Returns the refs_t structure used by a cram file handle.
  2287. #
  2288. # This may be used in conjunction with option CRAM_OPT_SHARED_REF to
  2289. # share reference memory between multiple file handles.
  2290. #
  2291. # @return
  2292. # Returns NULL if none exists or the file handle is not a CRAM file.
  2293. #
  2294. refs_t *cram_get_refs(htsFile *fd)
  2295. cdef class HTSFile(object):
  2296. cdef htsFile *htsfile # pointer to htsFile structure
  2297. cdef int64_t start_offset # BGZF offset of first record
  2298. cdef readonly object filename # filename as supplied by user
  2299. cdef readonly object mode # file opening mode
  2300. cdef readonly object threads # number of threads to use
  2301. cdef readonly object index_filename # filename of index, if supplied by user
  2302. cdef readonly bint is_stream # Is htsfile a non-seekable stream
  2303. cdef readonly bint is_remote # Is htsfile a remote stream
  2304. cdef readonly bint duplicate_filehandle # Duplicate filehandle when opening via fh
  2305. cdef htsFile *_open_htsfile(self) except? NULL # 'except? NULL': a NULL return may signal a raised Python exception; Cython checks for one when NULL is returned