Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 

283 lignes
8.8 KiB

  1. '''Tools for working with files in the samtools pileup -c format.'''
  2. import collections
  3. import pysam
  4. PileupSubstitution = collections.namedtuple("PileupSubstitution",
  5. " ".join((
  6. "chromosome",
  7. "pos",
  8. "reference_base",
  9. "genotype",
  10. "consensus_quality",
  11. "snp_quality",
  12. "mapping_quality",
  13. "coverage",
  14. "read_bases",
  15. "base_qualities")))
  16. PileupIndel = collections.namedtuple("PileupIndel",
  17. " ".join((
  18. "chromosome",
  19. "pos",
  20. "reference_base",
  21. "genotype",
  22. "consensus_quality",
  23. "snp_quality",
  24. "mapping_quality",
  25. "coverage",
  26. "first_allele",
  27. "second_allele",
  28. "reads_first",
  29. "reads_second",
  30. "reads_diff")))
  31. def iterate(infile):
  32. '''iterate over ``samtools pileup -c`` formatted file.
  33. *infile* can be any iterator over a lines.
  34. The function yields named tuples of the type :class:`pysam.Pileup.PileupSubstitution`
  35. or :class:`pysam.Pileup.PileupIndel`.
  36. .. note::
  37. The parser converts to 0-based coordinates
  38. '''
  39. conv_subst = (str, lambda x: int(x) - 1, str,
  40. str, int, int, int, int, str, str)
  41. conv_indel = (str, lambda x: int(x) - 1, str, str, int,
  42. int, int, int, str, str, int, int, int)
  43. for line in infile:
  44. d = line[:-1].split()
  45. if d[2] == "*":
  46. try:
  47. yield PileupIndel(*[x(y) for x, y in zip(conv_indel, d)])
  48. except TypeError:
  49. raise pysam.SamtoolsError("parsing error in line: `%s`" % line)
  50. else:
  51. try:
  52. yield PileupSubstitution(*[x(y) for x, y in zip(conv_subst, d)])
  53. except TypeError:
  54. raise pysam.SamtoolsError("parsing error in line: `%s`" % line)
  55. ENCODE_GENOTYPE = {
  56. 'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
  57. 'AA': 'A', 'CC': 'C', 'GG': 'G', 'TT': 'T', 'UU': 'U',
  58. 'AG': 'r', 'GA': 'R',
  59. 'CT': 'y', 'TC': 'Y',
  60. 'AC': 'm', 'CA': 'M',
  61. 'GT': 'k', 'TG': 'K',
  62. 'CG': 's', 'GC': 'S',
  63. 'AT': 'w', 'TA': 'W',
  64. }
  65. DECODE_GENOTYPE = {
  66. 'A': 'AA',
  67. 'C': 'CC',
  68. 'G': 'GG',
  69. 'T': 'TT',
  70. 'r': 'AG', 'R': 'AG',
  71. 'y': 'CT', 'Y': 'CT',
  72. 'm': 'AC', 'M': 'AC',
  73. 'k': 'GT', 'K': 'GT',
  74. 's': 'CG', 'S': 'CG',
  75. 'w': 'AT', 'W': 'AT',
  76. }
  77. # ------------------------------------------------------------
  78. def encodeGenotype(code):
  79. '''encode genotypes like GG, GA into a one-letter code.
  80. The returned code is lower case if code[0] < code[1], otherwise
  81. it is uppercase.
  82. '''
  83. return ENCODE_GENOTYPE[code.upper()]
  84. def decodeGenotype(code):
  85. '''decode single letter genotypes like m, M into two letters.
  86. This is the reverse operation to :meth:`encodeGenotype`.
  87. '''
  88. return DECODE_GENOTYPE[code]
  89. def translateIndelGenotypeFromVCF(vcf_genotypes, ref):
  90. '''translate indel from vcf to pileup format.'''
  91. # indels
  92. def getPrefix(s1, s2):
  93. '''get common prefix of strings s1 and s2.'''
  94. n = min(len(s1), len(s2))
  95. for x in range(n):
  96. if s1[x] != s2[x]:
  97. return s1[:x]
  98. return s1[:n]
  99. def getSuffix(s1, s2):
  100. '''get common sufix of strings s1 and s2.'''
  101. n = min(len(s1), len(s2))
  102. if s1[-1] != s2[-1]:
  103. return ""
  104. for x in range(-2, -n - 1, -1):
  105. if s1[x] != s2[x]:
  106. return s1[x + 1:]
  107. return s1[-n:]
  108. def getGenotype(variant, ref):
  109. if variant == ref:
  110. return "*", 0
  111. if len(ref) > len(variant):
  112. # is a deletion
  113. if ref.startswith(variant):
  114. return "-%s" % ref[len(variant):], len(variant) - 1
  115. elif ref.endswith(variant):
  116. return "-%s" % ref[:-len(variant)], -1
  117. else:
  118. prefix = getPrefix(ref, variant)
  119. suffix = getSuffix(ref, variant)
  120. shared = len(prefix) + len(suffix) - len(variant)
  121. # print "-", prefix, suffix, ref, variant, shared, len(prefix), len(suffix), len(ref)
  122. if shared < 0:
  123. raise ValueError()
  124. return "-%s" % ref[len(prefix):-(len(suffix) - shared)], len(prefix) - 1
  125. elif len(ref) < len(variant):
  126. # is an insertion
  127. if variant.startswith(ref):
  128. return "+%s" % variant[len(ref):], len(ref) - 1
  129. elif variant.endswith(ref):
  130. return "+%s" % variant[:len(ref)], 0
  131. else:
  132. prefix = getPrefix(ref, variant)
  133. suffix = getSuffix(ref, variant)
  134. shared = len(prefix) + len(suffix) - len(ref)
  135. if shared < 0:
  136. raise ValueError()
  137. return "+%s" % variant[len(prefix):-(len(suffix) - shared)], len(prefix)
  138. else:
  139. assert 0, "snp?"
  140. # in pileup, the position refers to the base
  141. # after the coordinate, hence subtract 1
  142. # pos -= 1
  143. genotypes, offsets = [], []
  144. is_error = True
  145. for variant in vcf_genotypes:
  146. try:
  147. g, offset = getGenotype(variant, ref)
  148. except ValueError:
  149. break
  150. genotypes.append(g)
  151. if g != "*":
  152. offsets.append(offset)
  153. else:
  154. is_error = False
  155. if is_error:
  156. raise ValueError()
  157. assert len(set(offsets)) == 1, "multiple offsets for indel"
  158. offset = offsets[0]
  159. genotypes = "/".join(genotypes)
  160. return genotypes, offset
  161. def vcf2pileup(vcf, sample):
  162. '''convert vcf record to pileup record.'''
  163. chromosome = vcf.contig
  164. pos = vcf.pos
  165. reference = vcf.ref
  166. allelles = [reference] + vcf.alt
  167. data = vcf[sample]
  168. # get genotype
  169. genotypes = data["GT"]
  170. if len(genotypes) > 1:
  171. raise ValueError("only single genotype per position, %s" % (str(vcf)))
  172. genotypes = genotypes[0]
  173. # not a variant
  174. if genotypes[0] == ".":
  175. return None
  176. genotypes = [allelles[int(x)] for x in genotypes if x != "/"]
  177. # snp_quality is "genotype quality"
  178. snp_quality = consensus_quality = data.get("GQ", [0])[0]
  179. mapping_quality = vcf.info.get("MQ", [0])[0]
  180. coverage = data.get("DP", 0)
  181. if len(reference) > 1 or max([len(x) for x in vcf.alt]) > 1:
  182. # indel
  183. genotype, offset = translateIndelGenotypeFromVCF(genotypes, reference)
  184. return PileupIndel(chromosome,
  185. pos + offset,
  186. "*",
  187. genotype,
  188. consensus_quality,
  189. snp_quality,
  190. mapping_quality,
  191. coverage,
  192. genotype,
  193. "<" * len(genotype),
  194. 0,
  195. 0,
  196. 0)
  197. else:
  198. genotype = encodeGenotype("".join(genotypes))
  199. read_bases = ""
  200. base_qualities = ""
  201. return PileupSubstitution(chromosome, pos, reference,
  202. genotype, consensus_quality,
  203. snp_quality, mapping_quality,
  204. coverage, read_bases,
  205. base_qualities)
  206. def iterate_from_vcf(infile, sample):
  207. '''iterate over a vcf-formatted file.
  208. *infile* can be any iterator over a lines.
  209. The function yields named tuples of the type
  210. :class:`pysam.Pileup.PileupSubstitution` or
  211. :class:`pysam.Pileup.PileupIndel`.
  212. Positions without a snp will be skipped.
  213. This method is wasteful and written to support same legacy code
  214. that expects samtools pileup output.
  215. Better use the vcf parser directly.
  216. '''
  217. vcf = pysam.VCF()
  218. vcf.connect(infile)
  219. if sample not in vcf.getsamples():
  220. raise KeyError("sample %s not vcf file")
  221. for row in vcf.fetch():
  222. result = vcf2pileup(row, sample)
  223. if result:
  224. yield result