Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 
 

320 řádky
10 KiB

  1. /*
  2. Copyright (c) 2013-2019 Genome Research Ltd.
  3. Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are met:
  6. 1. Redistributions of source code must retain the above copyright notice,
  7. this list of conditions and the following disclaimer.
  8. 2. Redistributions in binary form must reproduce the above copyright notice,
  9. this list of conditions and the following disclaimer in the documentation
  10. and/or other materials provided with the distribution.
  11. 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
  12. Institute nor the names of its contributors may be used to endorse or promote
  13. products derived from this software without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
  15. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  16. WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  17. DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
  18. FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  20. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  21. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  22. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. /*! \file
  26. * SAM header parsing.
  27. *
  28. * These functions can be shared between SAM, BAM and CRAM file
  29. * formats as all three internally use the same string encoding for
  30. * header fields.
  31. */
  32. #ifndef HEADER_H_
  33. #define HEADER_H_
  34. #include <stdarg.h>
  35. #include "cram/string_alloc.h"
  36. #include "cram/pooled_alloc.h"
  37. #include "htslib/khash.h"
  38. #include "htslib/kstring.h"
  39. #include "htslib/sam.h"
  40. #ifdef __cplusplus
  41. extern "C" {
  42. #endif
  43. /*! Make a single integer out of a two-letter type code */
  44. static inline khint32_t TYPEKEY(const char *type) {
  45. unsigned int u0 = (unsigned char) type[0];
  46. unsigned int u1 = (unsigned char) type[1];
  47. return (u0 << 8) | u1;
  48. }
/*
 * Proposed new SAM header parsing
 *
 * 1 @SQ ID:foo LN:100
 * 2 @SQ ID:bar LN:200
 * 3 @SQ ID:ram LN:300 UR:xyz
 * 4 @RG ID:r ...
 * 5 @RG ID:s ...
 *
 * Hash table for 2-char @keys without dup entries.
 * If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
 *
 * HASH("SQ")--\
 *             |
 *     (3) <-> 1 <-> 2 <-> 3 <-> (1)
 *
 * HASH("RG")--\
 *             |
 *     (5) <-> 4 <-> 5 <-> (4)
 *
 * Items stored in the hash values also form their own linked lists:
 * Ie SQ->ID(foo)->LN(100)
 *    SQ->ID(bar)->LN(200)
 *    SQ->ID(ram)->LN(300)->UR(xyz)
 *    RG->ID(r)
 */
  70. /*! A single key:value pair on a header line
  71. *
  72. * These form a linked list and hold strings. The strings are
  73. * allocated from a string_alloc_t pool referenced in the master
  74. * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate
  75. * these strings directly.
  76. */
  77. typedef struct sam_hrec_tag_s {
  78. struct sam_hrec_tag_s *next;
  79. const char *str;
  80. int len;
  81. } sam_hrec_tag_t;
  82. /*! The parsed version of the SAM header string.
  83. *
  84. * Each header type (SQ, RG, HD, etc) points to its own sam_hdr_type
  85. * struct via the main hash table h in the sam_hrecs_t struct.
  86. *
  87. * These in turn consist of circular bi-directional linked lists (ie
  88. * rings) to hold the multiple instances of the same header type
  89. * code. For example if we have 5 \@SQ lines the primary hash table
  90. * will key on \@SQ pointing to the first sam_hdr_type and that in turn
  91. * will be part of a ring of 5 elements.
  92. *
  93. * For each sam_hdr_type structure we also point to a sam_hdr_tag
  94. * structure which holds the tokenised attributes; the tab separated
  95. * key:value pairs per line.
  96. */
  97. typedef struct sam_hrec_type_s {
  98. struct sam_hrec_type_s *next; // circular list of this type
  99. struct sam_hrec_type_s *prev; // circular list of this type
  100. struct sam_hrec_type_s *global_next; // circular list of all lines
  101. struct sam_hrec_type_s *global_prev; // circular list of all lines
  102. sam_hrec_tag_t *tag; // first tag
  103. khint32_t type; // Two-letter type code as an int
  104. } sam_hrec_type_t;
  105. /*! Parsed \@SQ lines */
  106. typedef struct {
  107. const char *name;
  108. hts_pos_t len;
  109. sam_hrec_type_t *ty;
  110. } sam_hrec_sq_t;
  111. /*! Parsed \@RG lines */
  112. typedef struct {
  113. const char *name;
  114. sam_hrec_type_t *ty;
  115. int name_len;
  116. int id; // numerical ID
  117. } sam_hrec_rg_t;
  118. /*! Parsed \@PG lines */
  119. typedef struct {
  120. const char *name;
  121. sam_hrec_type_t *ty;
  122. int name_len;
  123. int id; // numerical ID
  124. int prev_id; // -1 if none
  125. } sam_hrec_pg_t;
  126. /*! Sort order parsed from @HD line */
  127. enum sam_sort_order {
  128. ORDER_UNKNOWN =-1,
  129. ORDER_UNSORTED = 0,
  130. ORDER_NAME = 1,
  131. ORDER_COORD = 2
  132. //ORDER_COLLATE = 3 // maybe one day!
  133. };
  134. enum sam_group_order {
  135. ORDER_NONE =-1,
  136. ORDER_QUERY = 0,
  137. ORDER_REFERENCE = 1
  138. };
  139. KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*)
  140. KHASH_MAP_INIT_STR(m_s2i, int)
  141. /*! Primary structure for header manipulation
  142. *
  143. * The initial header text is held in the text kstring_t, but is also
  144. * parsed out into SQ, RG and PG arrays. These have a hash table
  145. * associated with each to allow lookup by ID or SN fields instead of
  146. * their numeric array indices. Additionally PG has an array to hold
  147. * the linked list start points (the last in a PP chain).
  148. *
  149. * Use the appropriate sam_hdr_* functions to edit the header, and
  150. * call sam_hdr_rebuild() any time the textual form needs to be
  151. * updated again.
  152. */
  153. struct sam_hrecs_t {
  154. khash_t(sam_hrecs_t) *h;
  155. sam_hrec_type_t *first_line; //!< First line (usually @HD)
  156. string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings
  157. pool_alloc_t *type_pool;//!< Pool of sam_hdr_type structs
  158. pool_alloc_t *tag_pool; //!< Pool of sam_hdr_tag structs
  159. // @SQ lines / references
  160. int nref; //!< Number of \@SQ lines
  161. int ref_sz; //!< Number of entries available in ref[]
  162. sam_hrec_sq_t *ref; //!< Array of parsed \@SQ lines
  163. khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
  164. // @RG lines / read-groups
  165. int nrg; //!< Number of \@RG lines
  166. int rg_sz; //!< number of entries available in rg[]
  167. sam_hrec_rg_t *rg; //!< Array of parsed \@RG lines
  168. khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
  169. // @PG lines / programs
  170. int npg; //!< Number of \@PG lines
  171. int pg_sz; //!< Number of entries available in pg[]
  172. int npg_end; //!< Number of terminating \@PG lines
  173. int npg_end_alloc; //!< Size of pg_end field
  174. sam_hrec_pg_t *pg; //!< Array of parsed \@PG lines
  175. khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
  176. int *pg_end; //!< \@PG chain termination IDs
  177. // @cond internal
  178. char *ID_buf; // temporary buffer for sam_hdr_pg_id
  179. uint32_t ID_buf_sz;
  180. int ID_cnt;
  181. // @endcond
  182. int dirty; // marks the header as modified, so it can be rebuilt
  183. int refs_changed; // Index of first changed ref (-1 if unchanged)
  184. int pgs_changed; // New PG line added
  185. int type_count;
  186. char (*type_order)[3];
  187. };
  188. /*!
  189. * Method for parsing the header text and populating the
  190. * internal hash tables. After calling this method, the
  191. * parsed representation becomes the single source of truth.
  192. *
  193. * @param bh Header structure, previously initialised by a
  194. * sam_hdr_init call
  195. * @return 0 on success, -1 on failure
  196. */
  197. int sam_hdr_fill_hrecs(sam_hdr_t *bh);
  198. /*!
  199. * Reconstructs the text representation of the header from
  200. * the hash table data after a change has been performed on
  201. * the header.
  202. *
  203. * @return 0 on success, -1 on failure
  204. */
  205. int sam_hdr_rebuild(sam_hdr_t *bh);
  206. /*! Creates an empty SAM header, ready to be populated.
  207. *
  208. * @return
  209. * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
  210. * NULL on failure
  211. */
  212. sam_hrecs_t *sam_hrecs_new(void);
  213. /*! Produces a duplicate copy of hrecs and returns it.
  214. * @return
  215. * Returns NULL on failure
  216. */
  217. sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
  218. /*! Update sam_hdr_t target_name and target_len arrays
  219. *
  220. * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
  221. * can use it to construct target arrays from the source header.
  222. *
  223. * @return 0 on success; -1 on failure
  224. */
  225. int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
  226. int refs_changed);
  227. /*! Reconstructs a kstring from the header hash table.
  228. *
  229. * @return
  230. * Returns 0 on success
  231. * -1 on failure
  232. */
  233. int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
  234. /*! Deallocates all storage used by a sam_hrecs_t struct.
  235. *
  236. * This also decrements the header reference count. If after decrementing
  237. * it is still non-zero then the header is assumed to be in use by another
  238. * caller and the free is not done.
  239. */
  240. void sam_hrecs_free(sam_hrecs_t *hrecs);
  241. /*!
  242. * @return
  243. * Returns the first header item matching 'type'. If ID is non-NULL it checks
  244. * for the tag ID: and compares against the specified ID.
  245. *
  246. * Returns NULL if no type/ID is found
  247. */
  248. sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
  249. const char *ID_key, const char *ID_value);
  250. sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
  251. const char *key,
  252. sam_hrec_tag_t **prev);
  253. int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
  254. sam_hrec_type_t *type,
  255. const char *key);
  256. /*! Looks up a read-group by name and returns a pointer to the start of the
  257. * associated tag list.
  258. *
  259. * @return
  260. * Returns NULL on failure
  261. */
  262. sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
  263. /*! Returns the sort order from the @HD SO: field */
  264. enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
  265. /*! Returns the group order from the @HD SO: field */
  266. enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
  267. #ifdef __cplusplus
  268. }
  269. #endif
  270. #endif /* HEADER_H_ */