Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
 
 
 
 

167 строки
6.1 KiB

  /* bam2bcf.h -- variant calling.

     Copyright (C) 2010-2012 Broad Institute.
     Copyright (C) 2012-2021 Genome Research Ltd.

     Author: Heng Li <lh3@sanger.ac.uk>

  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
  in the Software without restriction, including without limitation the rights
  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  copies of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be included in
  all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  DEALINGS IN THE SOFTWARE.  */
  20. #ifndef BAM2BCF_H
  21. #define BAM2BCF_H
  22. #include <stdint.h>
  23. #include <htslib/hts.h>
  24. #include <htslib/vcf.h>
  /**
   * Compile-time switch for the Mann-Whitney U (MWU) bias annotations.
   *
   * A simplified version of the Mann-Whitney U-test is calculated by
   * default (no CDF) because it is faster and seems to work better in
   * machine-learning filtering.  When enabled by setting CDF_MWU_TESTS
   * to a non-zero value, additional annotations will appear in mpileup's
   * output (RPB2 in addition to RPB, etc.).
   *
   * The #ifndef guard keeps any value already defined before this point
   * (for example on the compiler command line) and only supplies the
   * default of 0 otherwise.
   */
  #ifndef CDF_MWU_TESTS
  #define CDF_MWU_TESTS 0
  #endif
  /*
   * Indel sentinel value.
   * NOTE(review): its exact use is not visible in this header; presumably it
   * marks "no indel" in indel-type tables -- confirm against the callers.
   */
  #define B2B_INDEL_NULL 10000

  /*
   * Annotation selection flags.
   *
   * Every flag is a distinct single bit, so any combination can be OR-ed
   * into one int (presumably bcf_callaux_t.fmt_flag and the fmt_flag
   * argument of bcf_call2bcf() -- confirm against callers).  The FMT_ and
   * INFO_ prefixes presumably distinguish per-sample from per-site tags.
   * Bits 18..29 are not assigned.
   */
  #define B2B_FMT_DP      (1<<0)
  #define B2B_FMT_SP      (1<<1)
  #define B2B_FMT_DV      (1<<2)
  #define B2B_FMT_DP4     (1<<3)
  #define B2B_FMT_DPR     (1<<4)
  #define B2B_INFO_DPR    (1<<5)
  #define B2B_FMT_AD      (1<<6)
  #define B2B_FMT_ADF     (1<<7)
  #define B2B_FMT_ADR     (1<<8)
  #define B2B_INFO_AD     (1<<9)
  #define B2B_INFO_ADF    (1<<10)
  #define B2B_INFO_ADR    (1<<11)
  #define B2B_INFO_SCR    (1<<12)
  #define B2B_FMT_SCR     (1<<13)
  #define B2B_INFO_VDB    (1<<14)
  #define B2B_INFO_RPB    (1<<15)
  #define B2B_FMT_QS      (1<<16)
  #define B2B_INFO_SCB    (1<<17)
  #define B2B_INFO_ZSCORE (1<<30)   // MWU as-is or Z-normalised

  /* Number of elements in the per-allele bcf_call_t.qsum[] array. */
  #define B2B_MAX_ALLELES 5

  /*
   * Three-way selector.
   * NOTE(review): presumably the possible values of
   * bcf_callaux_t.ambig_reads -- confirm against the implementation.
   */
  #define B2B_DROP    0
  #define B2B_INC_AD  1
  #define B2B_INC_AD0 2
  /*
   * Helpers for an integer that packs three pieces of per-read state:
   *
   *   bit 0      soft-clip flag
   *   bit 1      indel flag
   *   bits 2..   sample id
   *
   * The PLP_HAS_* macros return the masked bit itself (0 or 1 for the
   * soft-clip flag, 0 or 2 for the indel flag), so use them as truth values
   * rather than comparing against 1.
   *
   * The PLP_SET_* macros modify their first argument in place, so it must
   * be a modifiable lvalue.  PLP_SET_SAMPLE_ID ORs the id into the upper
   * bits without clearing them first: it is only correct on a value whose
   * sample-id bits are still zero.
   */
  #define PLP_HAS_SOFT_CLIP(i)    ((i) & 1)
  #define PLP_HAS_INDEL(i)        ((i) & 2)
  #define PLP_SAMPLE_ID(i)        ((i) >> 2)

  #define PLP_SET_SOFT_CLIP(i)    ((i) |= 1)
  #define PLP_SET_INDEL(i)        ((i) |= 2)
  #define PLP_SET_SAMPLE_ID(i,n)  ((i) |= ((n) << 2))
  65. typedef struct __bcf_callaux_t {
  66. int fmt_flag, ambig_reads;
  67. int capQ, min_baseQ, max_baseQ, delta_baseQ;
  68. int openQ, extQ, tandemQ; // for indels
  69. uint32_t min_support, max_support; // for collecting indel candidates
  70. double min_frac; // for collecting indel candidates
  71. float max_frac; // for collecting indel candidates
  72. int per_sample_flt; // indel filtering strategy
  73. int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
  74. int *iref_pos, *ialt_pos, *iref_mq, *ialt_mq; // for indels
  75. int ref_scl[100], alt_scl[100]; // soft-clip length bias; SNP
  76. int iref_scl[100], ialt_scl[100]; // soft-clip length bias; INDEL
  77. // for internal uses
  78. int max_bases;
  79. int indel_types[4]; // indel lengths
  80. int indel_win_size;
  81. int maxins, indelreg;
  82. int read_len;
  83. char *inscns;
  84. uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
  85. errmod_t *e;
  86. void *rghash;
  87. float indel_bias; // adjusts indel score threshold; lower => call more.
  88. } bcf_callaux_t;
  /*
   * Per-sample values.
   *
   * Member order and types are part of the binary layout and must not be
   * rearranged.
   */
  typedef struct {
      uint32_t ori_depth;     // ori_depth = anno[0..3] but before --min-BQ is applied
      unsigned int mq0;

      // FMT/QS.  Note that SCR is a plain int32_t value, while ADF, ADR and
      // QS are pointers.
      int32_t *ADF;
      int32_t *ADR;
      int32_t SCR;
      int32_t *QS;

      // The fields of anno[] are:
      //      depth fwd   .. ref (0)  and non-ref (2)
      //      depth rev   .. ref (1)  and non-ref (3)
      //      baseQ       .. ref (4)  and non-ref (6)
      //      baseQ^2     .. ref (5)  and non-ref (7)
      //      mapQ        .. ref (8)  and non-ref (10)
      //      mapQ^2      .. ref (9)  and non-ref (11)
      //      minDist     .. ref (12) and non-ref (14)
      //      minDist^2   .. ref (13) and non-ref (15)
      // Note that this probably needs a more thorough fix: int types in
      // bcf_call_t do overflow with high-coverage data, such as exomes, and
      // BCFv2 supports only floats which may not suffice.
      double anno[16];

      float p[25];            // phred-scaled likelihood of each genotype
  } bcf_callret1_t;
  109. // values for all samples
  110. typedef struct {
  111. int tid, pos;
  112. bcf_hdr_t *bcf_hdr;
  113. int a[5]; // alleles: ref, alt, alt2, alt3
  114. float qsum[B2B_MAX_ALLELES]; // INFO/QS tag
  115. int n, n_alleles, shift, ori_ref, unseen;
  116. int n_supp; // number of supporting non-reference reads
  117. double anno[16];
  118. unsigned int depth, ori_depth, mq0;
  119. int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
  120. uint8_t *fmt_arr;
  121. float vdb; // variant distance bias
  122. float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
  123. #if CDF_MWU_TESTS
  124. float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
  125. #endif
  126. float seg_bias;
  127. float strand_bias; // phred-scaled fisher-exact test
  128. kstring_t tmp;
  129. } bcf_call_t;
  130. #ifdef __cplusplus
  131. extern "C" {
  132. #endif
  133. bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ,
  134. int delta_baseQ);
  135. void bcf_call_destroy(bcf_callaux_t *bca);
  136. int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
  137. int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
  138. int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
  139. const bcf_callaux_t *bca, const char *ref);
  140. int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
  141. void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
  142. #ifdef __cplusplus
  143. }
  144. #endif
  145. #endif